Mercurial > lbo > hg > scrapeprice
changeset 22:0e7b6f3050d0 default tip
Use new traits system in example
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Tue, 22 Sep 2020 19:32:51 +0200 |
parents | e4c4a7c00fbd |
children | |
files | example/audiophil/src/audiophil.rs example/audiophil/src/main.rs src/driver.rs |
diffstat | 3 files changed, 48 insertions(+), 19 deletions(-) [+] |
line wrap: on
line diff
--- a/example/audiophil/src/audiophil.rs Tue Sep 22 19:18:20 2020 +0200 +++ b/example/audiophil/src/audiophil.rs Tue Sep 22 19:32:51 2020 +0200 @@ -1,21 +1,26 @@ -use crate::util::{ScrapedPrice}; +use scrapeprice::err::HTTPError; +use scrapeprice::util::ScrapedPrice; +use std::collections::HashSet; +use std::collections::LinkedList; use std::iter::FromIterator; use scrapeprice::{driver, extract}; use hyper::Uri; -use log::{info}; +use log::info; use rex_regex as rex; -pub struct AudiophilItemPriceExtractor { -} +pub struct AudiophilItemPriceExtractor {} fn substring(s: String, (start, len): (usize, usize)) -> String { String::from_iter(s.chars().skip(start).take(len)) } impl driver::Extractor<ScrapedPrice> for AudiophilItemPriceExtractor { + fn next_sites(&mut self, _uri: &Uri, _doc: &extract::Document) -> Vec<Uri> { + vec![] + } fn extract(&mut self, uri: &Uri, doc: &extract::Document) -> Vec<ScrapedPrice> { info!("Extracting info from {}", uri); let mut data = doc.get_contents(&[".bez.neu", ".preis strong"]).unwrap(); @@ -44,7 +49,11 @@ price2 = price; } - ScrapedPrice { item: desc2, price: price2, note: 44 } + ScrapedPrice { + item: desc2, + price: price2, + note: 44, + } }) .collect(); info!("Extracted {:?}", zipped); @@ -52,31 +61,49 @@ } } -pub struct AudiophilExplorer { - known: Vec<hyper::Uri>, +pub struct AudiophilQueue { + q: LinkedList<Uri>, + visited: HashSet<Uri>, } -impl AudiophilExplorer { - pub fn new() -> AudiophilExplorer { - let want = vec![ +impl AudiophilQueue { + pub fn new() -> AudiophilQueue { + let initial: Vec<Uri> = vec![ "https://audiophil-foto.de/de/shop/kameras/sony/", "https://audiophil-foto.de/de/shop/kameras/pentax-ricoh/", "https://audiophil-foto.de/de/shop/kameras/leica/", "https://audiophil-foto.de/de/shop/objektive/sony/", "https://audiophil-foto.de/de/shop/objektive/zeiss/", "https://audiophil-foto.de/de/shop/objektive/sigma/", - ].into_iter().map(|s| s.parse::<Uri>().unwrap()).collect(); - AudiophilExplorer { known: want } + ] + .into_iter() + .map(|s| s.parse::<Uri>().unwrap()) + .collect(); + AudiophilQueue { + q: LinkedList::from_iter(initial.into_iter()), + visited: HashSet::new(), + } } } #[async_trait::async_trait] -impl driver::Explorer for AudiophilExplorer { - async fn idle(&mut self) -> Vec<Uri> { - self.known.drain(..).collect() +impl driver::Queue for AudiophilQueue { + async fn add(&mut self, uris: &[Uri]) -> Result<(), HTTPError> { + for u in uris { + if !self.visited.contains(u) { + self.q.push_back(u.clone()); + } + } + Ok(()) } - fn next(&mut self, _: &Uri, _: &extract::Document) -> Vec<Uri> { - vec![] + async fn next(&mut self) -> Result<Option<Uri>, HTTPError> { + if !self.q.is_empty() { + return Ok(self.q.pop_front()); + } + Ok(None) + } + async fn visited(&mut self, uri: &Uri) -> Result<(), HTTPError> { + self.visited.insert(uri.clone()); + Ok(()) } } -
--- a/example/audiophil/src/main.rs Tue Sep 22 19:18:20 2020 +0200 +++ b/example/audiophil/src/main.rs Tue Sep 22 19:32:51 2020 +0200 @@ -14,9 +14,9 @@ //test_fetch_page().await.unwrap(); let logic = driver::DriverLogic { - explore: Box::new(audiophil::AudiophilExplorer::new()), store: Box::new(util::DebuggingStorage {}), extract: Box::new(audiophil::AudiophilItemPriceExtractor {}), + queue: Box::new(audiophil::AudiophilQueue::new()), }; let mut driver = driver::Driver::new(logic, None);
--- a/src/driver.rs Tue Sep 22 19:18:20 2020 +0200 +++ b/src/driver.rs Tue Sep 22 19:32:51 2020 +0200 @@ -33,6 +33,8 @@ async fn add(&mut self, uris: &[Uri]) -> Result<(), err::HTTPError>; /// Returns a site to scrape next. async fn next(&mut self) -> Result<Option<Uri>, err::HTTPError>; + /// Confirm that an URL has been visited successfully + async fn visited(&mut self, uri: &Uri) -> Result<(), err::HTTPError>; } /// DriverLogic holds the driven implementation. The members tell the driver what to fetch, and