view src/driver.rs @ 9:e13f77dac798

Give Uri info to extractor
author Lewin Bormann <lbo@spheniscida.de>
date Sun, 22 Mar 2020 23:14:21 +0100
parents 6027d11cb86d
children 600d6afee88a
line wrap: on
line source

#![allow(unused)]

//! Drive the scraping process.

use std::iter;

use crate::err;
use crate::extract;
use crate::http;

use hyper::Uri;
use log::{info,warn,error};

/// Store fetched results, which come as key/value pairs, somewhere.
#[async_trait::async_trait]
pub trait Storage {
    async fn store(&mut self, iter: Box<dyn iter::Iterator<Item=(String,String)>+Send>) ->Result<(), err::HTTPError>;
}

/// Return Uris to explore, both as initial set and for every fetched page.
///
/// The driver appends everything returned by these methods to its fetch queue.
pub trait Explorer {
    /// Return pages to fetch in any case, e.g. time-based. Called on every iteration of the
    /// driver, before a page is taken from the queue. An empty vector means there is nothing
    /// to add.
    fn idle(&mut self) -> Vec<Uri>;
    /// Return pages to fetch based on a fetched document. Called once for each page the driver
    /// has fetched and parsed successfully.
    fn next(&mut self, doc: &extract::Document) -> Vec<Uri>;
}

/// Information pulled out of a document, exposed as a sequence of key/value pairs.
pub trait Extracted {
    /// Yield all extracted `(key, value)` pairs.
    ///
    /// The default implementation yields nothing; implementors override it to expose their
    /// data.
    fn all(&mut self) -> Box<dyn Iterator<Item = (String, String)> + Send> {
        let nothing = iter::empty::<(String, String)>();
        Box::new(nothing)
    }
}

/// An Extractor retrieves information from a Document.
pub trait Extractor {
    /// Extract information from `doc`, which was fetched from `uri`.
    ///
    /// Returns `None` if there is nothing to store for this page. The default implementation
    /// ignores both arguments and always returns `None`.
    fn extract(&mut self, uri: &Uri, doc: &extract::Document) -> Option<Box<dyn Extracted>> {
        None
    }
}

/// DriverLogic holds the driven implementation. The members tell the driver what to fetch, and
/// what and how to store it.
pub struct DriverLogic {
    /// Supplies the URIs to fetch: unconditionally on every step, and based on fetched pages.
    pub explore: Box<dyn Explorer>,
    /// Receives the key/value pairs produced by the extractor.
    pub store: Box<dyn Storage>,
    /// Turns a fetched document into extracted information, if there is any.
    pub extract: Box<dyn Extractor>,
}

/// Driver runs the scraping loop: it keeps a queue of URIs, fetches them one at a time and hands
/// the results to the user-supplied `DriverLogic`.
pub struct Driver {
    // Client used to fetch pages.
    https: http::HTTPS,
    // User-supplied explorer, extractor and storage.
    logic: DriverLogic,

    // URIs waiting to be fetched. The driver takes the next URI from the end of the vector.
    // This could be made into a more elaborate scheduler.
    queue: Vec<Uri>,
}

impl Driver {
    /// Create a new Driver instance.
    pub fn new(logic: DriverLogic, https: Option<http::HTTPS>) -> Driver {
        Driver { https: https.unwrap_or(http::HTTPS::new()), logic: logic, queue: Vec::with_capacity(64) }
    }

    /// Run Driver a single step, i.e. first explore, then process one page. Returns true if a page
    /// was processed.
    pub async fn drive(&mut self) -> Result<bool, err::HTTPError> {
        let new = self.logic.explore.idle();
        info!("Appended URIs to queue: {:?}", new);
        self.queue.extend(new.into_iter());

        if let Some(uri) = self.queue.pop() {
            info!("Starting fetch of {}", uri);
            let resp = self.https.get(&uri).await?;
            let doc = extract::parse_response(resp)?;
            if let Some(ref mut extracted) = self.logic.extract.extract(&uri, &doc) {
                info!("Stored extracted information");
                self.logic.store.store(extracted.all());
            }
            let next = self.logic.explore.next(&doc);
            info!("Appended URIs after fetch: {:?}", next);
            self.queue.extend(next);
            return Ok(true);
        } else {
            Ok(false)
        }
    }
}