changeset 22:0e7b6f3050d0 default tip

Use new traits system in example
author Lewin Bormann <lbo@spheniscida.de>
date Tue, 22 Sep 2020 19:32:51 +0200
parents e4c4a7c00fbd
children
files example/audiophil/src/audiophil.rs example/audiophil/src/main.rs src/driver.rs
diffstat 3 files changed, 48 insertions(+), 19 deletions(-) [+]
line wrap: on
line diff
--- a/example/audiophil/src/audiophil.rs	Tue Sep 22 19:18:20 2020 +0200
+++ b/example/audiophil/src/audiophil.rs	Tue Sep 22 19:32:51 2020 +0200
@@ -1,21 +1,26 @@
-use crate::util::{ScrapedPrice};
+use scrapeprice::err::HTTPError;
+use scrapeprice::util::ScrapedPrice;
 
+use std::collections::HashSet;
+use std::collections::LinkedList;
 use std::iter::FromIterator;
 
 use scrapeprice::{driver, extract};
 
 use hyper::Uri;
-use log::{info};
+use log::info;
 use rex_regex as rex;
 
-pub struct AudiophilItemPriceExtractor {
-}
+pub struct AudiophilItemPriceExtractor {}
 
 fn substring(s: String, (start, len): (usize, usize)) -> String {
     String::from_iter(s.chars().skip(start).take(len))
 }
 
 impl driver::Extractor<ScrapedPrice> for AudiophilItemPriceExtractor {
+    fn next_sites(&mut self, _uri: &Uri, _doc: &extract::Document) -> Vec<Uri> {
+        vec![]
+    }
     fn extract(&mut self, uri: &Uri, doc: &extract::Document) -> Vec<ScrapedPrice> {
         info!("Extracting info from {}", uri);
         let mut data = doc.get_contents(&[".bez.neu", ".preis strong"]).unwrap();
@@ -44,7 +49,11 @@
                     price2 = price;
                 }
 
-                ScrapedPrice { item: desc2, price: price2, note: 44 }
+                ScrapedPrice {
+                    item: desc2,
+                    price: price2,
+                    note: 44,
+                }
             })
             .collect();
         info!("Extracted {:?}", zipped);
@@ -52,31 +61,49 @@
     }
 }
 
-pub struct AudiophilExplorer {
-    known: Vec<hyper::Uri>,
+pub struct AudiophilQueue {
+    q: LinkedList<Uri>,
+    visited: HashSet<Uri>,
 }
 
-impl AudiophilExplorer {
-    pub fn new() -> AudiophilExplorer {
-        let want = vec![
+impl AudiophilQueue {
+    pub fn new() -> AudiophilQueue {
+        let initial: Vec<Uri> = vec![
             "https://audiophil-foto.de/de/shop/kameras/sony/",
             "https://audiophil-foto.de/de/shop/kameras/pentax-ricoh/",
             "https://audiophil-foto.de/de/shop/kameras/leica/",
             "https://audiophil-foto.de/de/shop/objektive/sony/",
             "https://audiophil-foto.de/de/shop/objektive/zeiss/",
             "https://audiophil-foto.de/de/shop/objektive/sigma/",
-        ].into_iter().map(|s| s.parse::<Uri>().unwrap()).collect();
-        AudiophilExplorer { known: want }
+        ]
+        .into_iter()
+        .map(|s| s.parse::<Uri>().unwrap())
+        .collect();
+        AudiophilQueue {
+            q: LinkedList::from_iter(initial.into_iter()),
+            visited: HashSet::new(),
+        }
     }
 }
 
 #[async_trait::async_trait]
-impl driver::Explorer for AudiophilExplorer {
-    async fn idle(&mut self) -> Vec<Uri> {
-        self.known.drain(..).collect()
+impl driver::Queue for AudiophilQueue {
+    async fn add(&mut self, uris: &[Uri]) -> Result<(), HTTPError> {
+        for u in uris {
+            if !self.visited.contains(u) {
+                self.q.push_back(u.clone());
+            }
+        }
+        Ok(())
     }
-    fn next(&mut self, _: &Uri, _: &extract::Document) -> Vec<Uri> {
-        vec![]
+    async fn next(&mut self) -> Result<Option<Uri>, HTTPError> {
+        if !self.q.is_empty() {
+            return Ok(self.q.pop_front());
+        }
+        Ok(None)
+    }
+    async fn visited(&mut self, uri: &Uri) -> Result<(), HTTPError> {
+        self.visited.insert(uri.clone());
+        Ok(())
     }
 }
-
--- a/example/audiophil/src/main.rs	Tue Sep 22 19:18:20 2020 +0200
+++ b/example/audiophil/src/main.rs	Tue Sep 22 19:32:51 2020 +0200
@@ -14,9 +14,9 @@
     //test_fetch_page().await.unwrap();
 
     let logic = driver::DriverLogic {
-        explore: Box::new(audiophil::AudiophilExplorer::new()),
         store: Box::new(util::DebuggingStorage {}),
         extract: Box::new(audiophil::AudiophilItemPriceExtractor {}),
+        queue: Box::new(audiophil::AudiophilQueue::new()),
     };
     let mut driver = driver::Driver::new(logic, None);
 
--- a/src/driver.rs	Tue Sep 22 19:18:20 2020 +0200
+++ b/src/driver.rs	Tue Sep 22 19:32:51 2020 +0200
@@ -33,6 +33,8 @@
     async fn add(&mut self, uris: &[Uri]) -> Result<(), err::HTTPError>;
     /// Returns a site to scrape next.
     async fn next(&mut self) -> Result<Option<Uri>, err::HTTPError>;
+    /// Confirms that a URL has been visited successfully.
+    async fn visited(&mut self, uri: &Uri) -> Result<(), err::HTTPError>;
 }
 
 /// DriverLogic holds the driven implementation. The members tell the driver what to fetch, and