changeset 15:99031188b089

Restructure code
author Lewin Bormann <lbo@spheniscida.de>
date Mon, 21 Sep 2020 17:16:34 +0200
parents 29415ea96e5f
children 364f0ae83e7d
files README.md src/implem.rs src/implem/audiophil.rs src/implem/mod.rs src/main.rs src/util.rs
diffstat 6 files changed, 150 insertions(+), 104 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.md	Mon Sep 21 17:16:34 2020 +0200
@@ -0,0 +1,37 @@
+# `scrapeprice`
+
+is a small one-binary Rust framework for fetching and analyzing web pages. It
+honors robots.txt and is based on tokio. The logic for specific websites is
+implemented in Rust -- alternatively, a configuration-based system can be
+implemented.
+
+To implement your own scraper, you need to implement three traits according to
+your needs; the implementations are stored in a `driver::DriverLogic` object.
+
+- `driver::Explorer` tells the driver which URLs to visit. It returns a list of
+URLs to initially visit and extracts new URLs from fetched documents.
+- `driver::Extractor<T>` extracts items of type `T` from a fetched web page.
+- `driver::Storage<T>` stores the items returned by an `Extractor`, whether in a
+log file, CSV or other text format, or into a database.
+
+Once you have implemented those traits, check out the `main.rs` file as an example.
+Not much more is needed now:
+
+```rust
+    let logic = driver::DriverLogic {    
+        explore: Box::new(implem::YourExplorer::new()),    
+        store: Box::new(implem::YourStorage {}),    
+        extract: Box::new(implem::YourItemPriceExtractor {}),    
+    };    
+    let mut driver = driver::Driver::new(logic, None);    
+     
+    let mut ival = tokio::time::interval(tokio::time::Duration::from_millis(2000));    
+    
+    loop {    
+        ival.tick().await;     
+        if let Err(e) = driver.drive().await {    
+            warn!("Error from driver:  {}", e);    
+        }    
+    } 
+```
+
--- a/src/implem.rs	Mon Sep 21 16:59:06 2020 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,101 +0,0 @@
-//! Implementations of common traits that are useful to plug together a Driver.
-
-use std::iter;
-use std::iter::FromIterator;
-
-use crate::driver;
-use crate::extract;
-use crate::err::HTTPError;
-
-use hyper::Uri;
-use log::{info,warn,error};
-use rex_regex as rex;
-
-pub struct AudiophilItemPriceExtractor {
-}
-
-fn substring(s: String, (start, len): (usize, usize)) -> String {
-    String::from_iter(s.chars().skip(start).take(len))
-}
-
-impl driver::Extractor<ScrapedPrice> for AudiophilItemPriceExtractor {
-    fn extract(&mut self, uri: &Uri, doc: &extract::Document) -> Vec<ScrapedPrice> {
-        info!("Extracting info from {}", uri);
-        let mut data = doc.get_contents(&[".bez.neu", ".preis strong"]).unwrap();
-        let prices = data.pop().unwrap();
-        let descs = data.pop().unwrap();
-
-        let onlytext = rex::compile("^([a-zA-Z0-9€\\.,+/ -]+)").unwrap();
-
-        let zipped = descs
-            .into_iter()
-            .zip(prices)
-            .map(|(desc, price)| (desc.trim().to_string(), price.trim().to_string()))
-            .map(move |(desc, price)| {
-                let desc2;
-                let price2;
-                let (ok, descmatch) = rex::match_re(&onlytext, &desc);
-                if ok {
-                    desc2 = substring(desc, descmatch[0]);
-                } else {
-                    desc2 = desc;
-                }
-                let (ok, pricematch) = rex::match_re(&onlytext, &price);
-                if ok {
-                    price2 = substring(price, pricematch[0]);
-                } else {
-                    price2 = price;
-                }
-
-                ScrapedPrice { item: desc2, price: price2, note: 44 }
-            })
-            .collect();
-        info!("Extracted {:?}", zipped);
-        zipped
-    }
-}
-
-pub struct AudiophilExplorer {
-    known: Vec<hyper::Uri>,
-}
-
-impl AudiophilExplorer {
-    pub fn new() -> AudiophilExplorer {
-        let want = vec![
-            "https://audiophil-foto.de/de/shop/kameras/sony/",
-            "https://audiophil-foto.de/de/shop/kameras/pentax-ricoh/",
-            "https://audiophil-foto.de/de/shop/kameras/leica/",
-            "https://audiophil-foto.de/de/shop/objektive/sony/",
-            "https://audiophil-foto.de/de/shop/objektive/zeiss/",
-            "https://audiophil-foto.de/de/shop/objektive/sigma/",
-        ].into_iter().map(|s| s.parse::<Uri>().unwrap()).collect();
-        AudiophilExplorer { known: want }
-    }
-}
-
-impl driver::Explorer for AudiophilExplorer {
-    fn idle(&mut self) -> Vec<Uri> {
-        self.known.drain(..).collect()
-    }
-    fn next(&mut self, _: &Uri, _: &extract::Document) -> Vec<Uri> {
-        vec![]
-    }
-}
-
-pub struct DebuggingStorage { }
-
-#[derive(Debug)]
-pub struct ScrapedPrice {
-    item: String,
-    price: String,
-    note: i32,
-}
-
-#[async_trait::async_trait]
-impl driver::Storage<ScrapedPrice> for DebuggingStorage {
-    async fn store(&mut self, all: Box<dyn Iterator<Item=ScrapedPrice> + Send>) -> Result<(), HTTPError> {
-        info!("STORAGE: Received {:?}", all.collect::<Vec<ScrapedPrice>>());
-        Ok(())
-    }
-}
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/implem/audiophil.rs	Mon Sep 21 17:16:34 2020 +0200
@@ -0,0 +1,82 @@
+use crate::util::{ScrapedPrice};
+
+use std::iter::FromIterator;
+
+use crate::driver;
+use crate::extract;
+
+use hyper::Uri;
+use log::{info};
+use rex_regex as rex;
+
+pub struct AudiophilItemPriceExtractor {
+}
+
+fn substring(s: String, (start, len): (usize, usize)) -> String {
+    String::from_iter(s.chars().skip(start).take(len))
+}
+
+impl driver::Extractor<ScrapedPrice> for AudiophilItemPriceExtractor {
+    fn extract(&mut self, uri: &Uri, doc: &extract::Document) -> Vec<ScrapedPrice> {
+        info!("Extracting info from {}", uri);
+        let mut data = doc.get_contents(&[".bez.neu", ".preis strong"]).unwrap();
+        let prices = data.pop().unwrap();
+        let descs = data.pop().unwrap();
+
+        let onlytext = rex::compile("^([a-zA-Z0-9€\\.,+/ -]+)").unwrap();
+
+        let zipped = descs
+            .into_iter()
+            .zip(prices)
+            .map(|(desc, price)| (desc.trim().to_string(), price.trim().to_string()))
+            .map(move |(desc, price)| {
+                let desc2;
+                let price2;
+                let (ok, descmatch) = rex::match_re(&onlytext, &desc);
+                if ok {
+                    desc2 = substring(desc, descmatch[0]);
+                } else {
+                    desc2 = desc;
+                }
+                let (ok, pricematch) = rex::match_re(&onlytext, &price);
+                if ok {
+                    price2 = substring(price, pricematch[0]);
+                } else {
+                    price2 = price;
+                }
+
+                ScrapedPrice { item: desc2, price: price2, note: 44 }
+            })
+            .collect();
+        info!("Extracted {:?}", zipped);
+        zipped
+    }
+}
+
+pub struct AudiophilExplorer {
+    known: Vec<hyper::Uri>,
+}
+
+impl AudiophilExplorer {
+    pub fn new() -> AudiophilExplorer {
+        let want = vec![
+            "https://audiophil-foto.de/de/shop/kameras/sony/",
+            "https://audiophil-foto.de/de/shop/kameras/pentax-ricoh/",
+            "https://audiophil-foto.de/de/shop/kameras/leica/",
+            "https://audiophil-foto.de/de/shop/objektive/sony/",
+            "https://audiophil-foto.de/de/shop/objektive/zeiss/",
+            "https://audiophil-foto.de/de/shop/objektive/sigma/",
+        ].into_iter().map(|s| s.parse::<Uri>().unwrap()).collect();
+        AudiophilExplorer { known: want }
+    }
+}
+
+impl driver::Explorer for AudiophilExplorer {
+    fn idle(&mut self) -> Vec<Uri> {
+        self.known.drain(..).collect()
+    }
+    fn next(&mut self, _: &Uri, _: &extract::Document) -> Vec<Uri> {
+        vec![]
+    }
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/implem/mod.rs	Mon Sep 21 17:16:34 2020 +0200
@@ -0,0 +1,1 @@
+pub mod audiophil;
--- a/src/main.rs	Mon Sep 21 16:59:06 2020 +0200
+++ b/src/main.rs	Mon Sep 21 17:16:34 2020 +0200
@@ -3,6 +3,9 @@
 mod extract;
 mod http;
 mod implem;
+mod util;
+
+use implem::audiophil as audiophil;
 
 use log::{info, warn};
 use env_logger;
@@ -16,9 +19,9 @@
     //test_fetch_page().await.unwrap();
 
     let logic = driver::DriverLogic {
-        explore: Box::new(implem::AudiophilExplorer::new()),
-        store: Box::new(implem::DebuggingStorage {}),
-        extract: Box::new(implem::AudiophilItemPriceExtractor {}),
+        explore: Box::new(audiophil::AudiophilExplorer::new()),
+        store: Box::new(util::DebuggingStorage {}),
+        extract: Box::new(audiophil::AudiophilItemPriceExtractor {}),
     };
     let mut driver = driver::Driver::new(logic, None);
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/util.rs	Mon Sep 21 17:16:34 2020 +0200
@@ -0,0 +1,24 @@
+//! Implementations of common traits that are useful to plug together a Driver.
+
+use crate::driver;
+use crate::err::HTTPError;
+
+use log::{info};
+
+pub struct DebuggingStorage { }
+
+#[derive(Debug)]
+pub struct ScrapedPrice {
+    pub item: String,
+    pub price: String,
+    pub note: i32,
+}
+
+#[async_trait::async_trait]
+impl driver::Storage<ScrapedPrice> for DebuggingStorage {
+    async fn store(&mut self, all: Box<dyn Iterator<Item=ScrapedPrice> + Send>) -> Result<(), HTTPError> {
+        info!("STORAGE: Received {:?}", all.collect::<Vec<ScrapedPrice>>());
+        Ok(())
+    }
+}
+