Mercurial > lbo > hg > scrapeprice
changeset 5:cc875ec12026
Add driver logic
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Sun, 22 Mar 2020 13:20:43 +0100 |
parents | 768efcbf56a3 |
children | e2526accc58f |
files | Cargo.lock Cargo.toml src/driver.rs src/err.rs src/extract.rs src/http.rs src/main.rs |
diffstat | 7 files changed, 129 insertions(+), 15 deletions(-) [+] |
line wrap: on
line diff
--- a/Cargo.lock Sat Mar 21 17:20:50 2020 +0100 +++ b/Cargo.lock Sun Mar 22 13:20:43 2020 +0100 @@ -10,6 +10,17 @@ ] [[package]] +name = "async-trait" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "750b1c38a1dfadd108da0f01c08f4cdc7ff1bb39b325f9c82cc972361780a6e1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] name = "atty" version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -938,6 +949,7 @@ name = "scrapeprice" version = "0.1.0" dependencies = [ + "async-trait", "env_logger", "http", "hyper",
--- a/Cargo.toml Sat Mar 21 17:20:50 2020 +0100 +++ b/Cargo.toml Sun Mar 22 13:20:43 2020 +0100 @@ -15,3 +15,4 @@ robots_txt = "0.6" log = "0.4" env_logger = "0.7" +async-trait = "0.1"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/driver.rs Sun Mar 22 13:20:43 2020 +0100 @@ -0,0 +1,75 @@ +#![allow(unused)] + +//! Drive the scraping process. + +use std::iter; + +use crate::err; +use crate::extract; +use crate::http; + +use hyper::Uri; +use log::{info,warn,error}; + +/// Store fetched results, which come as key/value pairs, somewhere. +#[async_trait::async_trait] +pub trait Storage { + async fn store(&mut self, iter: &mut dyn iter::Iterator<Item=(String,String)>) -> Result<(), err::HTTPError>; +} + +/// Return Uris to explore, both as initial set and for every fetched page. +pub trait Explorer { + /// Return pages to fetch in any case, e.g. time-based. Called on every iteration of the + /// driver. + fn idle(&mut self) -> Vec<Uri>; + /// Return pages to fetch based on a fetched document. + fn next(&mut self, doc: &extract::Document) -> Vec<Uri>; +} + +/// DriverLogic holds the driven implementation. The members tell the driver what to fetch, and +/// what and how to store it. +pub struct DriverLogic { + pub explore: Box<dyn Explorer>, + pub store: Box<dyn Storage>, + pub extract: Box<dyn extract::Extractor>, +} + +pub struct Driver { + https: http::HTTPS, + logic: DriverLogic, + + // This could be made into a more elaborate scheduler. + queue: Vec<Uri>, +} + +impl Driver { + /// Create a new Driver instance. + pub fn new(logic: DriverLogic, https: Option<http::HTTPS>) -> Driver { + Driver { https: https.unwrap_or(http::HTTPS::new()), logic: logic, queue: Vec::with_capacity(64) } + } + + /// Run Driver a single step, i.e. first explore, then process one page. Returns true if a page + /// was processed. + pub async fn drive(&mut self) -> Result<bool, err::HTTPError> { + let new = self.logic.explore.idle(); + info!("Appended URIs to queue: {:?}", new); + self.queue.extend(new.into_iter()); + + if let Some(uri) = self.queue.pop() { + info!("Starting fetch of {}", uri); + let resp = self.https.get(uri).await?; + let doc = extract::parse_response(resp)?; + if let Some(ref mut extracted) = self.logic.extract.extract(&doc) { + info!("Stored extracted information"); + self.logic.store.store(extracted.all().as_mut()); + } + let next = self.logic.explore.next(&doc); + info!("Appended URIs after fetch: {:?}", next); + self.queue.extend(next); + return Ok(true); + } else { + Ok(false) + } + } +} +
--- a/src/err.rs Sat Mar 21 17:20:50 2020 +0100 +++ b/src/err.rs Sun Mar 22 13:20:43 2020 +0100 @@ -1,3 +1,5 @@ + +#![allow(unused)] use std::fmt; use std::error::Error; @@ -9,20 +11,22 @@ #[derive(Debug)] pub enum HTTPError { + EncodingError(String), + HttpError(http::Error), HyperError(hyper::Error), LogicError(String), StatusError(hyper::StatusCode), - HttpError(http::Error), } impl fmt::Display for HTTPError { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { - let e; + let e: String; match self { + HTTPError::EncodingError(he) => e = he.clone(), + HTTPError::HttpError(he) => e = format!("{}", he), HTTPError::HyperError(he) => e = format!("{}", he), HTTPError::LogicError(s) => e = s.clone(), HTTPError::StatusError(sc) => e = format!("{}", sc), - HTTPError::HttpError(he) => e = format!("{}", he), } write!(f, "HTTPError({})", e)?; Ok(())
--- a/src/extract.rs Sat Mar 21 17:20:50 2020 +0100 +++ b/src/extract.rs Sun Mar 22 13:20:43 2020 +0100 @@ -1,22 +1,28 @@ +#![allow(unused)] + use crate::err::{logic_err, HTTPError}; use crate::http; +use std::iter; + use log::info; use scraper::Html; -pub struct Extract { +/// A fetched document is given to the Extractor which gets information from it and returns the +/// storable data. +pub struct Document { html: Html, } -pub fn parse_response(r: http::GetResponse) -> Extract { - let content = http::bytes_to_str(r.body).unwrap(); +pub fn parse_response(r: http::GetResponse) -> Result<Document, HTTPError> { + let content = http::bytes_to_str(r.body)?; let doc = Html::parse_document(content.as_str()); - Extract { html: doc } + Ok(Document { html: doc }) } -impl Extract { - fn new(content: &str) -> Extract { - Extract { +impl Document { + fn new(content: &str) -> Document { + Document { html: Html::parse_document(content), } } @@ -42,16 +48,28 @@ } } +pub trait Extracted { + fn all(&mut self) -> Box<dyn iter::Iterator<Item=(String,String)>> { + Box::new(iter::empty()) + } +} + +pub trait Extractor { + fn extract(&mut self, doc: &Document) -> Option<&mut dyn Extracted> { + None + } +} + #[cfg(test)] mod tests { - use super::Extract; + use super::Document; use std::iter; #[test] - fn test_extract() { + fn test_document() { let content = String::from_utf8(std::fs::read("audiophil_sony.html").unwrap()).unwrap(); - let ex = Extract::new(&content); + let ex = Document::new(&content); let mut data = ex.get_fields(&[".bez.neu", ".preis strong"]).unwrap(); let prices = data.pop().unwrap(); let descs = data.pop().unwrap();
--- a/src/http.rs Sat Mar 21 17:20:50 2020 +0100 +++ b/src/http.rs Sun Mar 22 13:20:43 2020 +0100 @@ -1,3 +1,6 @@ + +#![allow(unused)] + use crate::err::HTTPError; use std::collections::HashMap; @@ -35,8 +38,8 @@ pub body: hyper::body::Bytes, } -pub fn bytes_to_str(b: hyper::body::Bytes) -> Result<String, std::string::FromUtf8Error> { - String::from_utf8(b.as_ref().to_vec()) +pub fn bytes_to_str(b: hyper::body::Bytes) -> Result<String, HTTPError> { + String::from_utf8(b.as_ref().to_vec()).map_err(|e| HTTPError::EncodingError(format!("{}", e))) } impl HTTPS {