Mercurial > lbo > hg > scrapeprice
view example/audiophil/src/audiophil.rs @ 22:0e7b6f3050d0 default tip
Use new traits system in example
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Tue, 22 Sep 2020 19:32:51 +0200 |
parents | b16039ffcb17 |
children |
line wrap: on
line source
//! Example scraper for the audiophil-foto.de shop: an [`driver::Extractor`]
//! producing [`ScrapedPrice`] records and a simple in-memory [`driver::Queue`]
//! seeded with a fixed set of category pages.

use scrapeprice::err::HTTPError;
use scrapeprice::util::ScrapedPrice;

use std::collections::HashSet;
use std::collections::LinkedList;
use std::iter::FromIterator;

use scrapeprice::{driver, extract};

use hyper::Uri;
use log::info;
use rex_regex as rex;

/// Category pages the queue is seeded with by [`AudiophilQueue::new`].
const START_URLS: [&str; 6] = [
    "https://audiophil-foto.de/de/shop/kameras/sony/",
    "https://audiophil-foto.de/de/shop/kameras/pentax-ricoh/",
    "https://audiophil-foto.de/de/shop/kameras/leica/",
    "https://audiophil-foto.de/de/shop/objektive/sony/",
    "https://audiophil-foto.de/de/shop/objektive/zeiss/",
    "https://audiophil-foto.de/de/shop/objektive/sigma/",
];

/// Extractor that turns a shop listing page into item/price pairs.
#[derive(Debug, Default)]
pub struct AudiophilItemPriceExtractor {}

/// Returns the `len` characters of `s` that follow the first `start`
/// characters.
///
/// Offsets count `char`s, not bytes; a range reaching past the end of `s` is
/// silently truncated.
fn substring(s: String, (start, len): (usize, usize)) -> String {
    s.chars().skip(start).take(len).collect()
}

impl driver::Extractor<ScrapedPrice> for AudiophilItemPriceExtractor {
    /// Never follows links: always returns an empty list.
    fn next_sites(&mut self, _uri: &Uri, _doc: &extract::Document) -> Vec<Uri> {
        Vec::new()
    }

    /// Extracts one [`ScrapedPrice`] per description/price pair found in `doc`.
    ///
    /// Descriptions are read from the `.bez.neu` selector and prices from
    /// `.preis strong`; the two lists are paired up positionally. Each text is
    /// trimmed and then cut down to the leading run of characters accepted by
    /// the whitelist regex; if the regex does not match, the trimmed text is
    /// kept as is.
    ///
    /// # Panics
    ///
    /// Panics if `get_contents` fails, if it yields fewer than two result
    /// lists, or if the regex cannot be compiled.
    fn extract(&mut self, uri: &Uri, doc: &extract::Document) -> Vec<ScrapedPrice> {
        info!("Extracting info from {}", uri);

        let mut data = doc
            .get_contents(&[".bez.neu", ".preis strong"])
            .expect("document must provide contents for both selectors");
        // Results are popped back to front: the last selector (prices) first.
        let prices = data.pop().expect("missing result list for price selector");
        let descs = data
            .pop()
            .expect("missing result list for description selector");

        let onlytext =
            rex::compile("^([a-zA-Z0-9€\\.,+/ -]+)").expect("static regex must compile");

        // Keeps only the part of `text` covered by the whole-regex match.
        // NOTE(review): element 0 of the match list is passed to `substring`
        // as (start, length) in characters -- confirm against rex_regex docs.
        let keep_plain = move |text: String| -> String {
            let (ok, spans) = rex::match_re(&onlytext, &text);
            if ok {
                substring(text, spans[0])
            } else {
                text
            }
        };

        let zipped = descs
            .into_iter()
            .zip(prices)
            .map(|(desc, price)| ScrapedPrice {
                item: keep_plain(desc.trim().to_string()),
                price: keep_plain(price.trim().to_string()),
                // NOTE(review): the meaning of this constant is not visible
                // in this file.
                note: 44,
            })
            .collect();

        info!("Extracted {:?}", zipped);
        zipped
    }
}

/// In-memory FIFO crawl queue that remembers which URIs were visited.
#[derive(Debug)]
pub struct AudiophilQueue {
    /// URIs waiting to be fetched, oldest first.
    q: LinkedList<Uri>,
    /// URIs that were reported through `visited`.
    visited: HashSet<Uri>,
}

impl AudiophilQueue {
    /// Creates a queue pre-filled with the shop's start pages and an empty
    /// visited set.
    ///
    /// # Panics
    ///
    /// Panics if one of the built-in start URLs is not a valid [`Uri`].
    pub fn new() -> AudiophilQueue {
        let start_uris = START_URLS
            .iter()
            .map(|s| s.parse::<Uri>().expect("built-in start URL must be a valid URI"));

        AudiophilQueue {
            q: LinkedList::from_iter(start_uris),
            visited: HashSet::new(),
        }
    }
}

impl Default for AudiophilQueue {
    /// Equivalent to [`AudiophilQueue::new`].
    fn default() -> Self {
        Self::new()
    }
}

#[async_trait::async_trait]
impl driver::Queue for AudiophilQueue {
    /// Appends every URI from `uris` that is not in the visited set.
    ///
    /// URIs that are already waiting in the queue are not deduplicated.
    async fn add(&mut self, uris: &[Uri]) -> Result<(), HTTPError> {
        for u in uris {
            if !self.visited.contains(u) {
                self.q.push_back(u.clone());
            }
        }
        Ok(())
    }

    /// Removes and returns the oldest queued URI, or `None` when the queue is
    /// empty.
    async fn next(&mut self) -> Result<Option<Uri>, HTTPError> {
        // `pop_front` already yields `None` on an empty list.
        Ok(self.q.pop_front())
    }

    /// Records `uri` as visited so later `add` calls skip it.
    async fn visited(&mut self, uri: &Uri) -> Result<(), HTTPError> {
        self.visited.insert(uri.clone());
        Ok(())
    }
}