changeset 9:e13f77dac798

Give Uri info to extractor
author Lewin Bormann <lbo@spheniscida.de>
date Sun, 22 Mar 2020 23:14:21 +0100
parents 6027d11cb86d
children 600d6afee88a
files src/driver.rs src/http.rs src/implem.rs src/main.rs
diffstat 4 files changed, 10 insertions(+), 20 deletions(-) [+]
line wrap: on
line diff
--- a/src/driver.rs	Sun Mar 22 23:10:06 2020 +0100
+++ b/src/driver.rs	Sun Mar 22 23:14:21 2020 +0100
@@ -35,7 +35,7 @@
 
 /// An Extractor retrieves information from a Document.
 pub trait Extractor {
-    fn extract(&mut self, doc: &extract::Document) -> Option<Box<dyn Extracted>> {
+    fn extract(&mut self, uri: &Uri, doc: &extract::Document) -> Option<Box<dyn Extracted>> {
         None
     }
 }
@@ -71,9 +71,9 @@
 
         if let Some(uri) = self.queue.pop() {
             info!("Starting fetch of {}", uri);
-            let resp = self.https.get(uri).await?;
+            let resp = self.https.get(&uri).await?;
             let doc = extract::parse_response(resp)?;
-            if let Some(ref mut extracted) = self.logic.extract.extract(&doc) {
+            if let Some(ref mut extracted) = self.logic.extract.extract(&uri, &doc) {
                 info!("Stored extracted information");
                 self.logic.store.store(extracted.all());
             }
--- a/src/http.rs	Sun Mar 22 23:10:06 2020 +0100
+++ b/src/http.rs	Sun Mar 22 23:14:21 2020 +0100
@@ -51,16 +51,16 @@
         }
     }
 
-    pub async fn get(&mut self, uri: hyper::Uri) -> Result<GetResponse, HTTPError> {
+    pub async fn get(&mut self, uri: &hyper::Uri) -> Result<GetResponse, HTTPError> {
         if let Ok(true) = self.robots_ok(&uri).await {
             return self.get_nocheck(uri).await;
         }
         unimplemented!()
     }
 
-    pub async fn get_nocheck(&self, uri: hyper::Uri) -> Result<GetResponse, HTTPError> {
+    pub async fn get_nocheck(&self, uri: &hyper::Uri) -> Result<GetResponse, HTTPError> {
         let max_redirect: i32 = 10;
-        let mut uri = uri;
+        let mut uri = uri.clone();
         let host = uri.host().unwrap().to_string();
 
         for i in 0..max_redirect {
@@ -125,7 +125,7 @@
                     .path_and_query("/robots.txt")
                     .build()
                     .unwrap();
-                let resp = self.get_nocheck(robots_uri).await?;
+                let resp = self.get_nocheck(&robots_uri).await?;
                 let robots = bytes_to_str(resp.body).unwrap();
                 let is_ok = robots_ok(&robots, uri);
                 self.robots_txt_cache.insert(host.to_string(), robots);
--- a/src/implem.rs	Sun Mar 22 23:10:06 2020 +0100
+++ b/src/implem.rs	Sun Mar 22 23:14:21 2020 +0100
@@ -5,7 +5,6 @@
 
 use crate::driver;
 use crate::extract;
-use crate::http;
 use crate::err::HTTPError;
 
 use hyper::Uri;
@@ -20,7 +19,8 @@
 }
 
 impl driver::Extractor for AudiophilItemPriceExtractor {
-    fn extract(&mut self, doc: &extract::Document) -> Option<Box<dyn driver::Extracted>> {
+    fn extract(&mut self, uri: &Uri, doc: &extract::Document) -> Option<Box<dyn driver::Extracted>> {
+        info!("Extracting info from {}", uri);
         let mut data = doc.get_contents(&[".bez.neu", ".preis strong"]).unwrap();
         let prices = data.pop().unwrap();
         let descs = data.pop().unwrap();
@@ -50,7 +50,7 @@
                 (desc2, price2)
             })
             .collect();
-        println!("{:?}", zipped);
+        info!("Extracted {:?}", zipped);
         None
     }
 }
--- a/src/main.rs	Sun Mar 22 23:10:06 2020 +0100
+++ b/src/main.rs	Sun Mar 22 23:14:21 2020 +0100
@@ -8,16 +8,6 @@
 use env_logger;
 use tokio;
 
-async fn test_fetch_page() -> hyper::Result<()> {
-    let mut cl = http::HTTPS::new();
-    let res = cl.get("https://audiophil-foto.de/de/shop/kameras/sony/".parse::<hyper::Uri>().unwrap()).await.unwrap();
-    info!("Fetch 1 was {}", res.status);
-    let res = cl.get("https://audiophil-foto.de/de/shop/kameras/nikon/".parse::<hyper::Uri>().unwrap()).await.unwrap();
-    info!("Fetch 2 was {}", res.status);
-
-    Ok(())
-}
-
 #[tokio::main]
 async fn main() {
     env_logger::Builder::from_default_env().filter(None, log::LevelFilter::Info).init();