changeset 5:cc875ec12026

Add driver logic
author Lewin Bormann <lbo@spheniscida.de>
date Sun, 22 Mar 2020 13:20:43 +0100
parents 768efcbf56a3
children e2526accc58f
files Cargo.lock Cargo.toml src/driver.rs src/err.rs src/extract.rs src/http.rs src/main.rs
diffstat 7 files changed, 129 insertions(+), 15 deletions(-)
--- a/Cargo.lock	Sat Mar 21 17:20:50 2020 +0100
+++ b/Cargo.lock	Sun Mar 22 13:20:43 2020 +0100
@@ -10,6 +10,17 @@
 ]
 
 [[package]]
+name = "async-trait"
+version = "0.1.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "750b1c38a1dfadd108da0f01c08f4cdc7ff1bb39b325f9c82cc972361780a6e1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
 name = "atty"
 version = "0.2.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -938,6 +949,7 @@
 name = "scrapeprice"
 version = "0.1.0"
 dependencies = [
+ "async-trait",
  "env_logger",
  "http",
  "hyper",
--- a/Cargo.toml	Sat Mar 21 17:20:50 2020 +0100
+++ b/Cargo.toml	Sun Mar 22 13:20:43 2020 +0100
@@ -15,3 +15,4 @@
 robots_txt = "0.6"
 log = "0.4"
 env_logger = "0.7"
+async-trait = "0.1"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/driver.rs	Sun Mar 22 13:20:43 2020 +0100
@@ -0,0 +1,75 @@
+#![allow(unused)]
+
+//! Drive the scraping process.
+
+use std::iter;
+
+use crate::err;
+use crate::extract;
+use crate::http;
+
+use hyper::Uri;
+use log::{info,warn,error};
+
+/// Store fetched results, which come as key/value pairs, somewhere.
+#[async_trait::async_trait]
+pub trait Storage {
+    async fn store(&mut self, iter: &mut (dyn iter::Iterator<Item = (String, String)> + Send)) -> Result<(), err::HTTPError>;
+}
+
+/// Return Uris to explore, both as initial set and for every fetched page.
+pub trait Explorer {
+    /// Return pages to fetch unconditionally, e.g. on a schedule. Called on every iteration
+    /// of the driver.
+    fn idle(&mut self) -> Vec<Uri>;
+    /// Return pages to fetch based on a fetched document.
+    fn next(&mut self, doc: &extract::Document) -> Vec<Uri>;
+}
+
+/// DriverLogic holds the pluggable implementation driven by the Driver. Its members tell the
+/// driver what to fetch, what to extract, and how to store it.
+pub struct DriverLogic {
+    pub explore: Box<dyn Explorer>,
+    pub store: Box<dyn Storage>,
+    pub extract: Box<dyn extract::Extractor>,
+}
+
+pub struct Driver {
+    https: http::HTTPS,
+    logic: DriverLogic,
+
+    // This could be made into a more elaborate scheduler.
+    queue: Vec<Uri>,
+}
+
+impl Driver {
+    /// Create a new Driver instance.
+    pub fn new(logic: DriverLogic, https: Option<http::HTTPS>) -> Driver {
+        Driver { https: https.unwrap_or_else(http::HTTPS::new), logic, queue: Vec::with_capacity(64) }
+    }
+
+    /// Run the driver for a single step: explore first, then fetch and process one page.
+    /// Returns true if a page was processed.
+    pub async fn drive(&mut self) -> Result<bool, err::HTTPError> {
+        let new = self.logic.explore.idle();
+        info!("Appended URIs to queue: {:?}", new);
+        self.queue.extend(new);
+
+        if let Some(uri) = self.queue.pop() {
+            info!("Starting fetch of {}", uri);
+            let resp = self.https.get(uri).await?;
+            let doc = extract::parse_response(resp)?;
+            if let Some(extracted) = self.logic.extract.extract(&doc) {
+                self.logic.store.store(extracted.all().as_mut()).await?;
+                info!("Stored extracted information");
+            }
+            let next = self.logic.explore.next(&doc);
+            info!("Appended URIs after fetch: {:?}", next);
+            self.queue.extend(next);
+            Ok(true)
+        } else {
+            Ok(false)
+        }
+    }
+}
+
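For illustration, here is a minimal sketch of how these two traits could be implemented, e.g. inside src/driver.rs. OneShotExplorer, LogStorage and the seed field are hypothetical names, not part of this changeset; only the trait signatures come from it. Note the + Send bound on the iterator argument, which async-trait's default requirement of Send futures makes necessary.

    // Hypothetical explorer: yields a single seed URI once and follows no links.
    struct OneShotExplorer {
        seed: Option<Uri>,
    }

    impl Explorer for OneShotExplorer {
        fn idle(&mut self) -> Vec<Uri> {
            // take() empties the Option, so later iterations yield nothing.
            self.seed.take().into_iter().collect()
        }
        fn next(&mut self, _doc: &extract::Document) -> Vec<Uri> {
            Vec::new()
        }
    }

    // Hypothetical storage that logs each key/value pair instead of persisting it.
    struct LogStorage;

    #[async_trait::async_trait]
    impl Storage for LogStorage {
        async fn store(&mut self, iter: &mut (dyn std::iter::Iterator<Item = (String, String)> + Send)) -> Result<(), err::HTTPError> {
            for (key, value) in iter {
                log::info!("{} => {}", key, value);
            }
            Ok(())
        }
    }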
--- a/src/err.rs	Sat Mar 21 17:20:50 2020 +0100
+++ b/src/err.rs	Sun Mar 22 13:20:43 2020 +0100
@@ -1,3 +1,5 @@
+
+#![allow(unused)]
 
 use std::fmt;
 use std::error::Error;
@@ -9,20 +11,22 @@
 
 #[derive(Debug)]
 pub enum HTTPError {
+    EncodingError(String),
+    HttpError(http::Error),
     HyperError(hyper::Error),
     LogicError(String),
     StatusError(hyper::StatusCode),
-    HttpError(http::Error),
 }
 
 impl fmt::Display for HTTPError {
     fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
-        let e;
+        let e: String;
         match self {
+            HTTPError::EncodingError(he) => e = he.clone(),
+            HTTPError::HttpError(he) => e = format!("{}", he),
             HTTPError::HyperError(he) => e = format!("{}", he),
             HTTPError::LogicError(s) => e = s.clone(),
             HTTPError::StatusError(sc) => e = format!("{}", sc),
-            HTTPError::HttpError(he) => e = format!("{}", he),
         }
         write!(f, "HTTPError({})", e)?;
         Ok(())
--- a/src/extract.rs	Sat Mar 21 17:20:50 2020 +0100
+++ b/src/extract.rs	Sun Mar 22 13:20:43 2020 +0100
@@ -1,22 +1,28 @@
+#![allow(unused)]
+
 use crate::err::{logic_err, HTTPError};
 use crate::http;
 
+use std::iter;
+
 use log::info;
 use scraper::Html;
 
-pub struct Extract {
+/// A fetched document is handed to the Extractor, which pulls information out of it and
+/// returns the data to be stored.
+pub struct Document {
     html: Html,
 }
 
-pub fn parse_response(r: http::GetResponse) -> Extract {
-    let content = http::bytes_to_str(r.body).unwrap();
+pub fn parse_response(r: http::GetResponse) -> Result<Document, HTTPError> {
+    let content = http::bytes_to_str(r.body)?;
     let doc = Html::parse_document(content.as_str());
-    Extract { html: doc }
+    Ok(Document { html: doc })
 }
 
-impl Extract {
-    fn new(content: &str) -> Extract {
-        Extract {
+impl Document {
+    fn new(content: &str) -> Document {
+        Document {
             html: Html::parse_document(content),
         }
     }
@@ -42,16 +48,28 @@
     }
 }
 
+pub trait Extracted {
+    fn all(&mut self) -> Box<dyn iter::Iterator<Item = (String, String)> + Send> {
+        Box::new(iter::empty())
+    }
+}
+
+pub trait Extractor {
+    fn extract(&mut self, doc: &Document) -> Option<&mut dyn Extracted> {
+        None
+    }
+}
+
 #[cfg(test)]
 mod tests {
-    use super::Extract;
+    use super::Document;
 
     use std::iter;
 
     #[test]
-    fn test_extract() {
+    fn test_document() {
         let content = String::from_utf8(std::fs::read("audiophil_sony.html").unwrap()).unwrap();
-        let ex = Extract::new(&content);
+        let ex = Document::new(&content);
         let mut data = ex.get_fields(&[".bez.neu", ".preis strong"]).unwrap();
         let prices = data.pop().unwrap();
         let descs = data.pop().unwrap();
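For illustration, a matching sketch of the extractor side, placed in src/extract.rs and mirroring the selectors used by the test above. PriceExtractor and PairList are hypothetical names, and the sketch assumes get_fields keeps the Result<Vec<Vec<String>>, _> shape the test suggests.

    // Hypothetical container for extracted (description, price) pairs.
    struct PairList {
        pairs: Vec<(String, String)>,
    }

    impl Extracted for PairList {
        fn all(&mut self) -> Box<dyn iter::Iterator<Item = (String, String)> + Send> {
            // Hand the pairs to the caller, leaving the container empty.
            Box::new(std::mem::take(&mut self.pairs).into_iter())
        }
    }

    // Hypothetical extractor pairing product descriptions with prices.
    struct PriceExtractor {
        last: PairList,
    }

    impl Extractor for PriceExtractor {
        fn extract(&mut self, doc: &Document) -> Option<&mut dyn Extracted> {
            let mut fields = doc.get_fields(&[".bez.neu", ".preis strong"]).ok()?;
            let prices = fields.pop()?;
            let descs = fields.pop()?;
            self.last = PairList { pairs: descs.into_iter().zip(prices).collect() };
            Some(&mut self.last)
        }
    }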
--- a/src/http.rs	Sat Mar 21 17:20:50 2020 +0100
+++ b/src/http.rs	Sun Mar 22 13:20:43 2020 +0100
@@ -1,3 +1,6 @@
+
+#![allow(unused)]
+
 use crate::err::HTTPError;
 
 use std::collections::HashMap;
@@ -35,8 +38,8 @@
     pub body: hyper::body::Bytes,
 }
 
-pub fn bytes_to_str(b: hyper::body::Bytes) -> Result<String, std::string::FromUtf8Error> {
-    String::from_utf8(b.as_ref().to_vec())
+pub fn bytes_to_str(b: hyper::body::Bytes) -> Result<String, HTTPError> {
+    String::from_utf8(b.as_ref().to_vec()).map_err(|e| HTTPError::EncodingError(format!("{}", e)))
 }
 
 impl HTTPS {
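With bytes_to_str now returning HTTPError instead of FromUtf8Error, callers can propagate decoding failures with the ? operator; a hypothetical sketch (body_text is not part of this changeset):

    // Hypothetical helper: a non-UTF-8 body now surfaces as
    // HTTPError::EncodingError instead of panicking through unwrap().
    fn body_text(resp: GetResponse) -> Result<String, HTTPError> {
        let text = bytes_to_str(resp.body)?;
        Ok(text)
    }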
--- a/src/main.rs	Sat Mar 21 17:20:50 2020 +0100
+++ b/src/main.rs	Sun Mar 22 13:20:43 2020 +0100
@@ -1,3 +1,4 @@
+mod driver;
 mod err;
 mod extract;
 mod http;
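To see how the pieces fit together, a hypothetical wiring of the new driver module. It assumes a tokio runtime (hyper already requires one) and that the sketch types from above (OneShotExplorer, LogStorage, PriceExtractor, PairList) are public and in scope; none of this is part of the changeset.

    #[tokio::main]
    async fn main() -> Result<(), err::HTTPError> {
        env_logger::init();
        let logic = driver::DriverLogic {
            explore: Box::new(OneShotExplorer { seed: Some("https://example.com/".parse().unwrap()) }),
            store: Box::new(LogStorage),
            extract: Box::new(PriceExtractor { last: PairList { pairs: Vec::new() } }),
        };
        let mut d = driver::Driver::new(logic, None);
        // Step the driver until the queue is drained.
        while d.drive().await? {}
        Ok(())
    }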