Mercurial > lbo > hg > scrapeprice
changeset 9:e13f77dac798
Give Uri info to extractor
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Sun, 22 Mar 2020 23:14:21 +0100 |
parents | 6027d11cb86d |
children | 600d6afee88a |
files | src/driver.rs src/http.rs src/implem.rs src/main.rs |
diffstat | 4 files changed, 10 insertions(+), 20 deletions(-) [+] |
line wrap: on
line diff
--- a/src/driver.rs Sun Mar 22 23:10:06 2020 +0100 +++ b/src/driver.rs Sun Mar 22 23:14:21 2020 +0100 @@ -35,7 +35,7 @@ /// An Extractor retrieves information from a Document. pub trait Extractor { - fn extract(&mut self, doc: &extract::Document) -> Option<Box<dyn Extracted>> { + fn extract(&mut self, uri: &Uri, doc: &extract::Document) -> Option<Box<dyn Extracted>> { None } } @@ -71,9 +71,9 @@ if let Some(uri) = self.queue.pop() { info!("Starting fetch of {}", uri); - let resp = self.https.get(uri).await?; + let resp = self.https.get(&uri).await?; let doc = extract::parse_response(resp)?; - if let Some(ref mut extracted) = self.logic.extract.extract(&doc) { + if let Some(ref mut extracted) = self.logic.extract.extract(&uri, &doc) { info!("Stored extracted information"); self.logic.store.store(extracted.all()); }
--- a/src/http.rs Sun Mar 22 23:10:06 2020 +0100 +++ b/src/http.rs Sun Mar 22 23:14:21 2020 +0100 @@ -51,16 +51,16 @@ } } - pub async fn get(&mut self, uri: hyper::Uri) -> Result<GetResponse, HTTPError> { + pub async fn get(&mut self, uri: &hyper::Uri) -> Result<GetResponse, HTTPError> { if let Ok(true) = self.robots_ok(&uri).await { return self.get_nocheck(uri).await; } unimplemented!() } - pub async fn get_nocheck(&self, uri: hyper::Uri) -> Result<GetResponse, HTTPError> { + pub async fn get_nocheck(&self, uri: &hyper::Uri) -> Result<GetResponse, HTTPError> { let max_redirect: i32 = 10; - let mut uri = uri; + let mut uri = uri.clone(); let host = uri.host().unwrap().to_string(); for i in 0..max_redirect { @@ -125,7 +125,7 @@ .path_and_query("/robots.txt") .build() .unwrap(); - let resp = self.get_nocheck(robots_uri).await?; + let resp = self.get_nocheck(&robots_uri).await?; let robots = bytes_to_str(resp.body).unwrap(); let is_ok = robots_ok(&robots, uri); self.robots_txt_cache.insert(host.to_string(), robots);
--- a/src/implem.rs Sun Mar 22 23:10:06 2020 +0100 +++ b/src/implem.rs Sun Mar 22 23:14:21 2020 +0100 @@ -5,7 +5,6 @@ use crate::driver; use crate::extract; -use crate::http; use crate::err::HTTPError; use hyper::Uri; @@ -20,7 +19,8 @@ } impl driver::Extractor for AudiophilItemPriceExtractor { - fn extract(&mut self, doc: &extract::Document) -> Option<Box<dyn driver::Extracted>> { + fn extract(&mut self, uri: &Uri, doc: &extract::Document) -> Option<Box<dyn driver::Extracted>> { + info!("Extracting info from {}", uri); let mut data = doc.get_contents(&[".bez.neu", ".preis strong"]).unwrap(); let prices = data.pop().unwrap(); let descs = data.pop().unwrap(); @@ -50,7 +50,7 @@ (desc2, price2) }) .collect(); - println!("{:?}", zipped); + info!("Extracted {:?}", zipped); None } }
--- a/src/main.rs Sun Mar 22 23:10:06 2020 +0100 +++ b/src/main.rs Sun Mar 22 23:14:21 2020 +0100 @@ -8,16 +8,6 @@ use env_logger; use tokio; -async fn test_fetch_page() -> hyper::Result<()> { - let mut cl = http::HTTPS::new(); - let res = cl.get("https://audiophil-foto.de/de/shop/kameras/sony/".parse::<hyper::Uri>().unwrap()).await.unwrap(); - info!("Fetch 1 was {}", res.status); - let res = cl.get("https://audiophil-foto.de/de/shop/kameras/nikon/".parse::<hyper::Uri>().unwrap()).await.unwrap(); - info!("Fetch 2 was {}", res.status); - - Ok(()) -} - #[tokio::main] async fn main() { env_logger::Builder::from_default_env().filter(None, log::LevelFilter::Info).init();