Mercurial > lbo > hg > scrapeprice
changeset 15:99031188b089
Restructure code
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Mon, 21 Sep 2020 17:16:34 +0200 |
parents | 29415ea96e5f |
children | 364f0ae83e7d |
files | README.md src/implem.rs src/implem/audiophil.rs src/implem/mod.rs src/main.rs src/util.rs |
diffstat | 6 files changed, 150 insertions(+), 104 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README.md Mon Sep 21 17:16:34 2020 +0200
@@ -0,0 +1,37 @@
+# `scrapeprice`
+
+is a small one-binary Rust framework for fetching and analyzing web pages. It
+honors robots.txt and is based on tokio. The logic for specific websites is
+implemented in Rust -- alternatively, a configuration-based system can be
+implemented.
+
+To implement your own scraper, you need to implement three traits according to
+your needs; the implementations are stored in a `driver::DriverLogic` object.
+
+- `driver::Explorer` tells the driver which URLs to visit. It returns a list of
+URLs to initially visit and extracts new URLs from fetched documents.
+- `driver::Extractor<T>` extracts items of type `T` from a fetched web page.
+- `driver::Storage<T>` stores the items returned by an `Extractor`, whether in a
+log file, CSV or other text format, or into a database.
+
+Once you have implemented those traits, check out the `main.rs` file as example.
+Not much more is needed now:
+
+```rust
+    let logic = driver::DriverLogic {
+        explore: Box::new(implem::YourExplorer::new()),
+        store: Box::new(implem::YourStorage {}),
+        extract: Box::new(implem::YourItemPriceExtractor {}),
+    };
+    let mut driver = driver::Driver::new(logic, None);
+
+    let mut ival = tokio::time::interval(tokio::time::Duration::from_millis(2000));
+
+    loop {
+        ival.tick().await;
+        if let Err(e) = driver.drive().await {
+            warn!("Error from driver: {}", e);
+        }
+    }
+```
+
--- a/src/implem.rs Mon Sep 21 16:59:06 2020 +0200
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,101 +0,0 @@
-//! Implementations of common traits that are useful to plug together a Driver.
-
-use std::iter;
-use std::iter::FromIterator;
-
-use crate::driver;
-use crate::extract;
-use crate::err::HTTPError;
-
-use hyper::Uri;
-use log::{info,warn,error};
-use rex_regex as rex;
-
-pub struct AudiophilItemPriceExtractor {
-}
-
-fn substring(s: String, (start, len): (usize, usize)) -> String {
-    String::from_iter(s.chars().skip(start).take(len))
-}
-
-impl driver::Extractor<ScrapedPrice> for AudiophilItemPriceExtractor {
-    fn extract(&mut self, uri: &Uri, doc: &extract::Document) -> Vec<ScrapedPrice> {
-        info!("Extracting info from {}", uri);
-        let mut data = doc.get_contents(&[".bez.neu", ".preis strong"]).unwrap();
-        let prices = data.pop().unwrap();
-        let descs = data.pop().unwrap();
-
-        let onlytext = rex::compile("^([a-zA-Z0-9€\\.,+/ -]+)").unwrap();
-
-        let zipped = descs
-            .into_iter()
-            .zip(prices)
-            .map(|(desc, price)| (desc.trim().to_string(), price.trim().to_string()))
-            .map(move |(desc, price)| {
-                let desc2;
-                let price2;
-                let (ok, descmatch) = rex::match_re(&onlytext, &desc);
-                if ok {
-                    desc2 = substring(desc, descmatch[0]);
-                } else {
-                    desc2 = desc;
-                }
-                let (ok, pricematch) = rex::match_re(&onlytext, &price);
-                if ok {
-                    price2 = substring(price, pricematch[0]);
-                } else {
-                    price2 = price;
-                }
-
-                ScrapedPrice { item: desc2, price: price2, note: 44 }
-            })
-            .collect();
-        info!("Extracted {:?}", zipped);
-        zipped
-    }
-}
-
-pub struct AudiophilExplorer {
-    known: Vec<hyper::Uri>,
-}
-
-impl AudiophilExplorer {
-    pub fn new() -> AudiophilExplorer {
-        let want = vec![
-            "https://audiophil-foto.de/de/shop/kameras/sony/",
-            "https://audiophil-foto.de/de/shop/kameras/pentax-ricoh/",
-            "https://audiophil-foto.de/de/shop/kameras/leica/",
-            "https://audiophil-foto.de/de/shop/objektive/sony/",
-            "https://audiophil-foto.de/de/shop/objektive/zeiss/",
-            "https://audiophil-foto.de/de/shop/objektive/sigma/",
-        ].into_iter().map(|s| s.parse::<Uri>().unwrap()).collect();
-        AudiophilExplorer { known: want }
-    }
-}
-
-impl driver::Explorer for AudiophilExplorer {
-    fn idle(&mut self) -> Vec<Uri> {
-        self.known.drain(..).collect()
-    }
-    fn next(&mut self, _: &Uri, _: &extract::Document) -> Vec<Uri> {
-        vec![]
-    }
-}
-
-pub struct DebuggingStorage { }
-
-#[derive(Debug)]
-pub struct ScrapedPrice {
-    item: String,
-    price: String,
-    note: i32,
-}
-
-#[async_trait::async_trait]
-impl driver::Storage<ScrapedPrice> for DebuggingStorage {
-    async fn store(&mut self, all: Box<dyn Iterator<Item=ScrapedPrice> + Send>) -> Result<(), HTTPError> {
-        info!("STORAGE: Received {:?}", all.collect::<Vec<ScrapedPrice>>());
-        Ok(())
-    }
-}
-
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/implem/audiophil.rs Mon Sep 21 17:16:34 2020 +0200
@@ -0,0 +1,82 @@
+use crate::util::{ScrapedPrice};
+
+use std::iter::FromIterator;
+
+use crate::driver;
+use crate::extract;
+
+use hyper::Uri;
+use log::{info};
+use rex_regex as rex;
+
+pub struct AudiophilItemPriceExtractor {
+}
+
+fn substring(s: String, (start, len): (usize, usize)) -> String {
+    String::from_iter(s.chars().skip(start).take(len))
+}
+
+impl driver::Extractor<ScrapedPrice> for AudiophilItemPriceExtractor {
+    fn extract(&mut self, uri: &Uri, doc: &extract::Document) -> Vec<ScrapedPrice> {
+        info!("Extracting info from {}", uri);
+        let mut data = doc.get_contents(&[".bez.neu", ".preis strong"]).unwrap();
+        let prices = data.pop().unwrap();
+        let descs = data.pop().unwrap();
+
+        let onlytext = rex::compile("^([a-zA-Z0-9€\\.,+/ -]+)").unwrap();
+
+        let zipped = descs
+            .into_iter()
+            .zip(prices)
+            .map(|(desc, price)| (desc.trim().to_string(), price.trim().to_string()))
+            .map(move |(desc, price)| {
+                let desc2;
+                let price2;
+                let (ok, descmatch) = rex::match_re(&onlytext, &desc);
+                if ok {
+                    desc2 = substring(desc, descmatch[0]);
+                } else {
+                    desc2 = desc;
+                }
+                let (ok, pricematch) = rex::match_re(&onlytext, &price);
+                if ok {
+                    price2 = substring(price, pricematch[0]);
+                } else {
+                    price2 = price;
+                }
+
+                ScrapedPrice { item: desc2, price: price2, note: 44 }
+            })
+            .collect();
+        info!("Extracted {:?}", zipped);
+        zipped
+    }
+}
+
+pub struct AudiophilExplorer {
+    known: Vec<hyper::Uri>,
+}
+
+impl AudiophilExplorer {
+    pub fn new() -> AudiophilExplorer {
+        let want = vec![
+            "https://audiophil-foto.de/de/shop/kameras/sony/",
+            "https://audiophil-foto.de/de/shop/kameras/pentax-ricoh/",
+            "https://audiophil-foto.de/de/shop/kameras/leica/",
+            "https://audiophil-foto.de/de/shop/objektive/sony/",
+            "https://audiophil-foto.de/de/shop/objektive/zeiss/",
+            "https://audiophil-foto.de/de/shop/objektive/sigma/",
+        ].into_iter().map(|s| s.parse::<Uri>().unwrap()).collect();
+        AudiophilExplorer { known: want }
+    }
+}
+
+impl driver::Explorer for AudiophilExplorer {
+    fn idle(&mut self) -> Vec<Uri> {
+        self.known.drain(..).collect()
+    }
+    fn next(&mut self, _: &Uri, _: &extract::Document) -> Vec<Uri> {
+        vec![]
+    }
+}
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/implem/mod.rs Mon Sep 21 17:16:34 2020 +0200
@@ -0,0 +1,1 @@
+pub mod audiophil;
--- a/src/main.rs Mon Sep 21 16:59:06 2020 +0200 +++ b/src/main.rs Mon Sep 21 17:16:34 2020 +0200 @@ -3,6 +3,9 @@ mod extract; mod http; mod implem; +mod util; + +use implem::audiophil as audiophil; use log::{info, warn}; use env_logger; @@ -16,9 +19,9 @@ //test_fetch_page().await.unwrap(); let logic = driver::DriverLogic { - explore: Box::new(implem::AudiophilExplorer::new()), - store: Box::new(implem::DebuggingStorage {}), - extract: Box::new(implem::AudiophilItemPriceExtractor {}), + explore: Box::new(audiophil::AudiophilExplorer::new()), + store: Box::new(util::DebuggingStorage {}), + extract: Box::new(audiophil::AudiophilItemPriceExtractor {}), }; let mut driver = driver::Driver::new(logic, None);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/util.rs Mon Sep 21 17:16:34 2020 +0200
@@ -0,0 +1,24 @@
+//! Implementations of common traits that are useful to plug together a Driver.
+
+use crate::driver;
+use crate::err::HTTPError;
+
+use log::{info};
+
+pub struct DebuggingStorage { }
+
+#[derive(Debug)]
+pub struct ScrapedPrice {
+    pub item: String,
+    pub price: String,
+    pub note: i32,
+}
+
+#[async_trait::async_trait]
+impl driver::Storage<ScrapedPrice> for DebuggingStorage {
+    async fn store(&mut self, all: Box<dyn Iterator<Item=ScrapedPrice> + Send>) -> Result<(), HTTPError> {
+        info!("STORAGE: Received {:?}", all.collect::<Vec<ScrapedPrice>>());
+        Ok(())
+    }
+}
+