changeset 6:e2526accc58f

Get it to a working scraper
author Lewin Bormann <lbo@spheniscida.de>
date Sun, 22 Mar 2020 14:23:19 +0100
parents cc875ec12026
children 8dee877af779
files Cargo.lock Cargo.toml src/driver.rs src/extract.rs src/implem.rs src/main.rs
diffstat 6 files changed, 131 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- a/Cargo.lock	Sun Mar 22 13:20:43 2020 +0100
+++ b/Cargo.lock	Sun Mar 22 14:23:19 2020 +0100
@@ -881,6 +881,12 @@
 checksum = "7fe5bd57d1d7414c6b5ed48563a2c855d995ff777729dcd91c369ec7fea395ae"
 
 [[package]]
+name = "rex-regex"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8961e9d637060caccd8306ca91fdef77a67ea9657ac2e137c09eb7f4c6789c8c"
+
+[[package]]
 name = "ring"
 version = "0.16.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -955,6 +961,7 @@
  "hyper",
  "hyper-rustls",
  "log",
+ "rex-regex",
  "robots_txt",
  "scraper",
  "tokio",
--- a/Cargo.toml	Sun Mar 22 13:20:43 2020 +0100
+++ b/Cargo.toml	Sun Mar 22 14:23:19 2020 +0100
@@ -16,3 +16,4 @@
 log = "0.4"
 env_logger = "0.7"
 async-trait = "0.1"
+rex-regex = "0.1"
--- a/src/driver.rs	Sun Mar 22 13:20:43 2020 +0100
+++ b/src/driver.rs	Sun Mar 22 14:23:19 2020 +0100
@@ -14,7 +14,7 @@
 /// Store fetched results, which come as key/value pairs, somewhere.
 #[async_trait::async_trait]
 pub trait Storage {
-    async fn store(&mut self, iter: &mut dyn iter::Iterator<Item=(String,String)>) -> Result<(), err::HTTPError>;
+    async fn store(&mut self, iter: Box<dyn iter::Iterator<Item=(String,String)>+Send>) ->Result<(), err::HTTPError>;
 }
 
 /// Return Uris to explore, both as initial set and for every fetched page.
@@ -61,7 +61,7 @@
             let doc = extract::parse_response(resp)?;
             if let Some(ref mut extracted) = self.logic.extract.extract(&doc) {
                 info!("Stored extracted information");
-                self.logic.store.store(extracted.all().as_mut());
+                self.logic.store.store(extracted.all());
             }
             let next = self.logic.explore.next(&doc);
             info!("Appended URIs after fetch: {:?}", next);
--- a/src/extract.rs	Sun Mar 22 13:20:43 2020 +0100
+++ b/src/extract.rs	Sun Mar 22 14:23:19 2020 +0100
@@ -35,7 +35,6 @@
 
             let mut values = vec![];
             for e in selected {
-                println!("selected: {}", e.inner_html());
                 values.push(e.inner_html());
             }
             r.push(values);
@@ -49,13 +48,13 @@
 }
 
 pub trait Extracted {
-    fn all(&mut self) -> Box<dyn iter::Iterator<Item=(String,String)>> {
+    fn all(&mut self) -> Box<dyn iter::Iterator<Item = (String, String)> + Send> {
         Box::new(iter::empty())
     }
 }
 
 pub trait Extractor {
-    fn extract(&mut self, doc: &Document) -> Option<&mut dyn Extracted> {
+    fn extract(&mut self, doc: &Document) -> Option<Box<dyn Extracted>> {
         None
     }
 }
@@ -73,9 +72,11 @@
         let mut data = ex.get_fields(&[".bez.neu", ".preis strong"]).unwrap();
         let prices = data.pop().unwrap();
         let descs = data.pop().unwrap();
-        let zipped: Vec<(String, String)> = descs.into_iter().zip(prices).map(|(desc, price)| {
-            (desc.trim().to_string(), price.trim().to_string())
-        }).collect();
+        let zipped: Vec<(String, String)> = descs
+            .into_iter()
+            .zip(prices)
+            .map(|(desc, price)| (desc.trim().to_string(), price.trim().to_string()))
+            .collect();
         println!("{:?}", zipped);
     }
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/implem.rs	Sun Mar 22 14:23:19 2020 +0100
@@ -0,0 +1,95 @@
+//! Implementations of common traits that are useful to plug together a Driver.
+
+use std::iter;
+use std::iter::FromIterator;
+
+use crate::driver;
+use crate::extract;
+use crate::http;
+use crate::err::HTTPError;
+
+use hyper::Uri;
+use log::{info,warn,error};
+use rex_regex as rex;
+
+pub struct AudiophilItemPriceExtractor { // Field-less type; it carries no state of its own.
+}
+
+fn substring(s: String, (start, len): (usize, usize)) -> String { // Returns a new String holding up to `len` chars of `s`, beginning at char index `start`.
+    String::from_iter(s.chars().skip(start).take(len)) // Counts chars, not bytes; skip/take never panic, an out-of-range window just yields a shorter or empty result.
+}
+
+impl extract::Extractor for AudiophilItemPriceExtractor { // Pairs scraped item descriptions with prices and prints them.
+    fn extract(&mut self, doc: &extract::Document) -> Option<Box<dyn extract::Extracted>> { // Always returns None at present (see the last expression of the body).
+        let mut data = doc.get_fields(&[".bez.neu", ".preis strong"]).unwrap(); // unwrap panics if get_fields does not yield a value; presumably one Vec<String> per selector, in selector order - confirm against extract.rs.
+        let prices = data.pop().unwrap(); // Last entry of data; presumably the ".preis strong" results. Panics if data is empty.
+        let descs = data.pop().unwrap(); // Next-to-last entry; presumably the ".bez.neu" results. Panics if data had fewer than two entries.
+
+        let onlytext = rex::compile("^[a-zA-Z0-9\\.+/ -]+").unwrap(); // Start-anchored; read conventionally, the class admits ASCII letters, digits, '.', '+', '/', space and '-' (confirm rex-regex parses it that way). Recompiled on every call.
+
+        let zipped: Vec<(String, String)> = descs
+            .into_iter()
+            .zip(prices) // zip stops at the shorter of the two lists, so unpaired trailing entries are dropped.
+            .map(|(desc, price)| (desc.trim().to_string(), price.trim().to_string())) // Strip surrounding whitespace from both fields.
+            .map(move |(desc, price)| { // Cut each field down to its matched span; a field that does not match passes through unchanged.
+                let desc2;
+                let price2;
+                let (ok, descmatch) = rex::match_re(&onlytext, &desc);
+                if ok {
+                    desc2 = substring(desc, descmatch[0]); // NOTE(review): assumes the first tuple is (start, length) in chars, as substring expects - confirm against rex-regex docs.
+                } else {
+                    desc2 = desc;
+                }
+                let (ok, pricematch) = rex::match_re(&onlytext, &price); // Same pattern is applied to the price text.
+                if ok {
+                    price2 = substring(price, pricematch[0]);
+                } else {
+                    price2 = price;
+                }
+
+                (desc2, price2)
+            })
+            .collect();
+        println!("{:?}", zipped); // Debug dump to stdout; this is the only place the pairs are used.
+        None // NOTE(review): zipped is dropped here, so a caller that only acts on Some(..) receives nothing from this extractor.
+    }
+}
+
+pub struct AudiophilExplorer { // Holds the seed URIs that have not been handed out yet.
+    known: Vec<hyper::Uri>, // Filled by new(); emptied by the Explorer::idle implementation in this file.
+}
+
+impl AudiophilExplorer {
+    pub fn new() -> AudiophilExplorer { // Builds an explorer preloaded with the six fixed shop URLs listed below.
+        let want = vec![
+            "https://audiophil-foto.de/de/shop/kameras/sony/",
+            "https://audiophil-foto.de/de/shop/kameras/pentax-ricoh/",
+            "https://audiophil-foto.de/de/shop/kameras/leica/",
+            "https://audiophil-foto.de/de/shop/objektive/sony/",
+            "https://audiophil-foto.de/de/shop/objektive/zeiss/",
+            "https://audiophil-foto.de/de/shop/objektive/sigma/",
+        ].into_iter().map(|s| s.parse::<Uri>().unwrap()).collect(); // unwrap would panic only if one of the literals above failed to parse as a Uri.
+        AudiophilExplorer { known: want }
+    }
+}
+
+impl driver::Explorer for AudiophilExplorer { // Supplies only the stored URI list; fetched pages contribute no further URIs.
+    fn idle(&mut self) -> Vec<Uri> { // Hands out every remaining URI in one go.
+        self.known.drain(..).collect() // drain empties known, so subsequent calls return an empty Vec.
+    }
+    fn next(&mut self, _: &extract::Document) -> Vec<Uri> { // The fetched document is ignored.
+        vec![] // Never proposes follow-up URIs.
+    }
+}
+
+pub struct DebuggingStorage { } // Storage implementation that only logs what it receives.
+
+#[async_trait::async_trait]
+impl driver::Storage for DebuggingStorage {
+    async fn store(&mut self, iter: Box<dyn iter::Iterator<Item=(String,String)>+Send>) -> Result<(), HTTPError> { // Never fails: the only return value is Ok(()).
+        let all = iter.collect::<Vec<(String,String)>>(); // Consumes the whole iterator into a Vec.
+        info!("STORAGE: Received {:?}", all); // Emitted at info level; nothing is persisted.
+        Ok(())
+    }
+}
+
--- a/src/main.rs	Sun Mar 22 13:20:43 2020 +0100
+++ b/src/main.rs	Sun Mar 22 14:23:19 2020 +0100
@@ -2,9 +2,11 @@
 mod err;
 mod extract;
 mod http;
+mod implem;
 
 use log::{info, warn};
 use env_logger;
+use tokio;
 
 async fn test_fetch_page() -> hyper::Result<()> {
     let mut cl = http::HTTPS::new();
@@ -21,5 +23,21 @@
     env_logger::Builder::from_default_env().filter(None, log::LevelFilter::Info).init();
 
     info!("scrapeprice: init");
-    test_fetch_page().await.unwrap();
+    //test_fetch_page().await.unwrap();
+
+    let logic = driver::DriverLogic {
+        explore: Box::new(implem::AudiophilExplorer::new()),
+        store: Box::new(implem::DebuggingStorage {}),
+        extract: Box::new(implem::AudiophilItemPriceExtractor {}),
+    };
+    let mut driver = driver::Driver::new(logic, None);
+
+    let mut ival = tokio::time::interval(tokio::time::Duration::from_millis(2000));
+
+    loop {
+        ival.tick().await;
+        if let Err(e) = driver.drive().await {
+            warn!("Error from driver:  {}", e);
+        }
+    }
 }