Mercurial > lbo > hg > scrapeprice
changeset 11:b7600da13c32
Simplify Extractor
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Mon, 23 Mar 2020 14:11:43 +0100 |
parents | 600d6afee88a |
children | 1c464fb19d9f |
files | src/driver.rs src/implem.rs |
diffstat | 2 files changed, 19 insertions(+), 29 deletions(-) [+] |
line wrap: on
line diff
--- a/src/driver.rs Sun Mar 22 23:15:08 2020 +0100 +++ b/src/driver.rs Mon Mar 23 14:11:43 2020 +0100 @@ -13,8 +13,8 @@ /// Store fetched results, which come as key/value pairs, somewhere. #[async_trait::async_trait] -pub trait Storage { - async fn store(&mut self, iter: Box<dyn iter::Iterator<Item=(String,String)>+Send>) ->Result<(), err::HTTPError>; +pub trait Storage<T> { + async fn store(&mut self, d: Vec<T>) ->Result<(), err::HTTPError>; } /// Return Uris to explore, both as initial set and for every fetched page. @@ -26,39 +26,32 @@ fn next(&mut self, uri: &Uri, doc: &extract::Document) -> Vec<Uri>; } -/// Extracted information can be presented as sequence of key/value pairs. -pub trait Extracted { - fn all(&mut self) -> Box<dyn iter::Iterator<Item = (String, String)> + Send> { - Box::new(iter::empty()) - } -} - /// An Extractor retrieves information from a Document. -pub trait Extractor { - fn extract(&mut self, uri: &Uri, doc: &extract::Document) -> Option<Box<dyn Extracted>> { - None +pub trait Extractor<T> { + fn extract(&mut self, uri: &Uri, doc: &extract::Document) -> Vec<T> { + vec![] } } /// DriverLogic holds the driven implementation. The members tell the driver what to fetch, and /// what and how to store it. -pub struct DriverLogic { +pub struct DriverLogic<T> { pub explore: Box<dyn Explorer>, - pub store: Box<dyn Storage>, - pub extract: Box<dyn Extractor>, + pub store: Box<dyn Storage<T>>, + pub extract: Box<dyn Extractor<T>>, } -pub struct Driver { +pub struct Driver<T> { https: http::HTTPS, - logic: DriverLogic, + logic: DriverLogic<T>, // This could be made into a more elaborate scheduler. queue: Vec<Uri>, } -impl Driver { +impl<T> Driver<T> { /// Create a new Driver instance. - pub fn new(logic: DriverLogic, https: Option<http::HTTPS>) -> Driver { + pub fn new(logic: DriverLogic<T>, https: Option<http::HTTPS>) -> Driver<T> { Driver { https: https.unwrap_or(http::HTTPS::new()), logic: logic, queue: Vec::with_capacity(64) } } @@ -73,10 +66,8 @@ info!("Starting fetch of {}", uri); let resp = self.https.get(&uri).await?; let doc = extract::parse_response(resp)?; - if let Some(ref mut extracted) = self.logic.extract.extract(&uri, &doc) { - info!("Stored extracted information"); - self.logic.store.store(extracted.all()); - } + let extracted = self.logic.extract.extract(&uri, &doc); + self.logic.store.store(extracted); let next = self.logic.explore.next(&uri, &doc); info!("Appended URIs after fetch: {:?}", next); self.queue.extend(next);
--- a/src/implem.rs Sun Mar 22 23:15:08 2020 +0100 +++ b/src/implem.rs Mon Mar 23 14:11:43 2020 +0100 @@ -18,8 +18,8 @@ String::from_iter(s.chars().skip(start).take(len)) } -impl driver::Extractor for AudiophilItemPriceExtractor { - fn extract(&mut self, uri: &Uri, doc: &extract::Document) -> Option<Box<dyn driver::Extracted>> { +impl driver::Extractor<(String,String)> for AudiophilItemPriceExtractor { + fn extract(&mut self, uri: &Uri, doc: &extract::Document) -> Vec<(String,String)> { info!("Extracting info from {}", uri); let mut data = doc.get_contents(&[".bez.neu", ".preis strong"]).unwrap(); let prices = data.pop().unwrap(); @@ -51,7 +51,7 @@ }) .collect(); info!("Extracted {:?}", zipped); - None + zipped } } @@ -85,9 +85,8 @@ pub struct DebuggingStorage { } #[async_trait::async_trait] -impl driver::Storage for DebuggingStorage { - async fn store(&mut self, iter: Box<dyn iter::Iterator<Item=(String,String)>+Send>) -> Result<(), HTTPError> { - let all = iter.collect::<Vec<(String,String)>>(); +impl driver::Storage<(String,String)> for DebuggingStorage { + async fn store(&mut self, all: Vec<(String,String)>) -> Result<(), HTTPError> { info!("STORAGE: Received {:?}", all); Ok(()) }