Mercurial > lbo > hg > scrapeprice
changeset 6:e2526accc58f
Get it to a working scraper
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Sun, 22 Mar 2020 14:23:19 +0100 |
parents | cc875ec12026 |
children | 8dee877af779 |
files | Cargo.lock Cargo.toml src/driver.rs src/extract.rs src/implem.rs src/main.rs |
diffstat | 6 files changed, 131 insertions(+), 9 deletions(-) [+] |
line wrap: on
line diff
--- a/Cargo.lock Sun Mar 22 13:20:43 2020 +0100
+++ b/Cargo.lock Sun Mar 22 14:23:19 2020 +0100
@@ -881,6 +881,12 @@
 checksum = "7fe5bd57d1d7414c6b5ed48563a2c855d995ff777729dcd91c369ec7fea395ae"
 
 [[package]]
+name = "rex-regex"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8961e9d637060caccd8306ca91fdef77a67ea9657ac2e137c09eb7f4c6789c8c"
+
+[[package]]
 name = "ring"
 version = "0.16.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -955,6 +961,7 @@
  "hyper",
  "hyper-rustls",
  "log",
+ "rex-regex",
  "robots_txt",
  "scraper",
  "tokio",
--- a/Cargo.toml Sun Mar 22 13:20:43 2020 +0100
+++ b/Cargo.toml Sun Mar 22 14:23:19 2020 +0100
@@ -16,3 +16,4 @@
 log = "0.4"
 env_logger = "0.7"
 async-trait = "0.1"
+rex-regex = "0.1"
--- a/src/driver.rs Sun Mar 22 13:20:43 2020 +0100 +++ b/src/driver.rs Sun Mar 22 14:23:19 2020 +0100 @@ -14,7 +14,7 @@ /// Store fetched results, which come as key/value pairs, somewhere. #[async_trait::async_trait] pub trait Storage { - async fn store(&mut self, iter: &mut dyn iter::Iterator<Item=(String,String)>) -> Result<(), err::HTTPError>; + async fn store(&mut self, iter: Box<dyn iter::Iterator<Item=(String,String)>+Send>) ->Result<(), err::HTTPError>; } /// Return Uris to explore, both as initial set and for every fetched page. @@ -61,7 +61,7 @@ let doc = extract::parse_response(resp)?; if let Some(ref mut extracted) = self.logic.extract.extract(&doc) { info!("Stored extracted information"); - self.logic.store.store(extracted.all().as_mut()); + self.logic.store.store(extracted.all()); } let next = self.logic.explore.next(&doc); info!("Appended URIs after fetch: {:?}", next);
--- a/src/extract.rs Sun Mar 22 13:20:43 2020 +0100 +++ b/src/extract.rs Sun Mar 22 14:23:19 2020 +0100 @@ -35,7 +35,6 @@ let mut values = vec![]; for e in selected { - println!("selected: {}", e.inner_html()); values.push(e.inner_html()); } r.push(values); @@ -49,13 +48,13 @@ } pub trait Extracted { - fn all(&mut self) -> Box<dyn iter::Iterator<Item=(String,String)>> { + fn all(&mut self) -> Box<dyn iter::Iterator<Item = (String, String)> + Send> { Box::new(iter::empty()) } } pub trait Extractor { - fn extract(&mut self, doc: &Document) -> Option<&mut dyn Extracted> { + fn extract(&mut self, doc: &Document) -> Option<Box<dyn Extracted>> { None } } @@ -73,9 +72,11 @@ let mut data = ex.get_fields(&[".bez.neu", ".preis strong"]).unwrap(); let prices = data.pop().unwrap(); let descs = data.pop().unwrap(); - let zipped: Vec<(String, String)> = descs.into_iter().zip(prices).map(|(desc, price)| { - (desc.trim().to_string(), price.trim().to_string()) - }).collect(); + let zipped: Vec<(String, String)> = descs + .into_iter() + .zip(prices) + .map(|(desc, price)| (desc.trim().to_string(), price.trim().to_string())) + .collect(); println!("{:?}", zipped); } }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/implem.rs Sun Mar 22 14:23:19 2020 +0100 @@ -0,0 +1,95 @@ +//! Implementations of common traits that are useful to plug together a Driver. + +use std::iter; +use std::iter::FromIterator; + +use crate::driver; +use crate::extract; +use crate::http; +use crate::err::HTTPError; + +use hyper::Uri; +use log::{info,warn,error}; +use rex_regex as rex; + +pub struct AudiophilItemPriceExtractor { +} + +fn substring(s: String, (start, len): (usize, usize)) -> String { + String::from_iter(s.chars().skip(start).take(len)) +} + +impl extract::Extractor for AudiophilItemPriceExtractor { + fn extract(&mut self, doc: &extract::Document) -> Option<Box<dyn extract::Extracted>> { + let mut data = doc.get_fields(&[".bez.neu", ".preis strong"]).unwrap(); + let prices = data.pop().unwrap(); + let descs = data.pop().unwrap(); + + let onlytext = rex::compile("^[a-zA-Z0-9\\.+/ -]+").unwrap(); + + let zipped: Vec<(String, String)> = descs + .into_iter() + .zip(prices) + .map(|(desc, price)| (desc.trim().to_string(), price.trim().to_string())) + .map(move |(desc, price)| { + let desc2; + let price2; + let (ok, descmatch) = rex::match_re(&onlytext, &desc); + if ok { + desc2 = substring(desc, descmatch[0]); + } else { + desc2 = desc; + } + let (ok, pricematch) = rex::match_re(&onlytext, &price); + if ok { + price2 = substring(price, pricematch[0]); + } else { + price2 = price; + } + + (desc2, price2) + }) + .collect(); + println!("{:?}", zipped); + None + } +} + +pub struct AudiophilExplorer { + known: Vec<hyper::Uri>, +} + +impl AudiophilExplorer { + pub fn new() -> AudiophilExplorer { + let want = vec![ + "https://audiophil-foto.de/de/shop/kameras/sony/", + "https://audiophil-foto.de/de/shop/kameras/pentax-ricoh/", + "https://audiophil-foto.de/de/shop/kameras/leica/", + "https://audiophil-foto.de/de/shop/objektive/sony/", + "https://audiophil-foto.de/de/shop/objektive/zeiss/", + "https://audiophil-foto.de/de/shop/objektive/sigma/", + 
].into_iter().map(|s| s.parse::<Uri>().unwrap()).collect(); + AudiophilExplorer { known: want } + } +} + +impl driver::Explorer for AudiophilExplorer { + fn idle(&mut self) -> Vec<Uri> { + self.known.drain(..).collect() + } + fn next(&mut self, _: &extract::Document) -> Vec<Uri> { + vec![] + } +} + +pub struct DebuggingStorage { } + +#[async_trait::async_trait] +impl driver::Storage for DebuggingStorage { + async fn store(&mut self, iter: Box<dyn iter::Iterator<Item=(String,String)>+Send>) -> Result<(), HTTPError> { + let all = iter.collect::<Vec<(String,String)>>(); + info!("STORAGE: Received {:?}", all); + Ok(()) + } +} +
--- a/src/main.rs Sun Mar 22 13:20:43 2020 +0100 +++ b/src/main.rs Sun Mar 22 14:23:19 2020 +0100 @@ -2,9 +2,11 @@ mod err; mod extract; mod http; +mod implem; use log::{info, warn}; use env_logger; +use tokio; async fn test_fetch_page() -> hyper::Result<()> { let mut cl = http::HTTPS::new(); @@ -21,5 +23,21 @@ env_logger::Builder::from_default_env().filter(None, log::LevelFilter::Info).init(); info!("scrapeprice: init"); - test_fetch_page().await.unwrap(); + //test_fetch_page().await.unwrap(); + + let logic = driver::DriverLogic { + explore: Box::new(implem::AudiophilExplorer::new()), + store: Box::new(implem::DebuggingStorage {}), + extract: Box::new(implem::AudiophilItemPriceExtractor {}), + }; + let mut driver = driver::Driver::new(logic, None); + + let mut ival = tokio::time::interval(tokio::time::Duration::from_millis(2000)); + + loop { + ival.tick().await; + if let Err(e) = driver.drive().await { + warn!("Error from driver: {}", e); + } + } }