view src/driver.rs @ 22:0e7b6f3050d0 default tip

Use new traits system in example
author Lewin Bormann <lbo@spheniscida.de>
date Tue, 22 Sep 2020 19:32:51 +0200
parents e4c4a7c00fbd
children
line wrap: on
line source

#![allow(unused)]

//! Drive the scraping process.

use std::iter;

use crate::err;
use crate::extract;
use crate::http;

use hyper::Uri;
use log::{info,warn,error};

/// Sink for the items an [`Extractor`] pulled out of a fetched document.
///
/// Implementors decide where and how the items are persisted.
#[async_trait::async_trait]
pub trait Storage<T: Send> {
    /// Persist every item yielded by `d`.
    ///
    /// Any failure is reported as an `err::HTTPError`.
    async fn store(
        &mut self,
        d: Box<dyn Iterator<Item = T> + Send>,
    ) -> Result<(), err::HTTPError>;
}

/// An Extractor pulls information, and follow-up links, out of a parsed Document.
pub trait Extractor<T: Send> {
    /// Extract items of type `T` from `doc`, which was fetched from `uri`.
    ///
    /// The provided default extracts nothing and returns an empty vector.
    fn extract(&mut self, uri: &Uri, doc: &extract::Document) -> Vec<T> {
        Vec::new()
    }

    /// Return the pages that should be fetched next, based on the document `doc` fetched from
    /// `uri`.
    fn next_sites(&mut self, uri: &Uri, doc: &extract::Document) -> Vec<Uri>;
}

/// A Queue decides which sites are fetched, in which order, and how many of them.
#[async_trait::async_trait]
pub trait Queue {
    /// Enqueue all of the given URIs for fetching.
    async fn add(&mut self, uris: &[Uri]) -> Result<(), err::HTTPError>;

    /// Hand out the next URI to scrape, or `None` if the queue has nothing to offer.
    async fn next(&mut self) -> Result<Option<Uri>, err::HTTPError>;

    /// Confirm that `uri` has been visited successfully.
    async fn visited(&mut self, uri: &Uri) -> Result<(), err::HTTPError>;
}

/// DriverLogic holds the driven implementation. The members tell the driver what to fetch, and
/// what and how to store it.
///
/// Each member is a boxed trait object, so the concrete implementations are chosen by whoever
/// constructs this struct.
pub struct DriverLogic<T> {
    /// Receives the items produced by `extract`.
    pub store: Box<dyn Storage<T>>,
    /// Produces items of type `T` and follow-up URIs from a fetched document.
    pub extract: Box<dyn Extractor<T>>,
    /// Supplies the next URI to fetch and accepts newly discovered URIs.
    pub queue: Box<dyn Queue>,
}

/// Driver ties an HTTPS client to a `DriverLogic` and runs the scraping process one page at a
/// time.
pub struct Driver<T> {
    // Client used to fetch pages.
    https: http::HTTPS,
    // Storage, extractor and queue implementations being driven.
    logic: DriverLogic<T>,
}

impl<T: 'static + Send> Driver<T> {
    /// Create a new Driver instance.
    ///
    /// `logic` supplies the storage, extractor and queue implementations to drive. If `https` is
    /// `None`, a client is created with `http::HTTPS::new()`.
    pub fn new(logic: DriverLogic<T>, https: Option<http::HTTPS>) -> Driver<T> {
        Driver {
            // Build the default client lazily, i.e. only if the caller did not supply one.
            https: https.unwrap_or_else(http::HTTPS::new),
            logic,
        }
    }

    /// Run Driver a single step, i.e. first explore, then process one page.
    ///
    /// A step takes the next URI from the queue, fetches and parses it, stores the extracted
    /// items, and finally enqueues the follow-up URIs reported by the extractor.
    ///
    /// Returns `Ok(true)` if a page was processed and `Ok(false)` if the queue had no URI to
    /// offer.
    ///
    /// # Errors
    ///
    /// Returns the first error raised by the queue, the HTTP fetch, response parsing, or the
    /// storage. Later stages of the step are skipped once an error occurs.
    pub async fn drive(&mut self) -> Result<bool, err::HTTPError> {
        let next = self.logic.queue.next().await?;
        info!("Next URL: {:?}", next);

        let uri = match next {
            Some(uri) => uri,
            None => return Ok(false),
        };

        info!("Starting fetch of {}", uri);
        let resp = self.https.get(&uri).await?;
        let doc = extract::parse_response(resp)?;

        let extracted = self.logic.extract.extract(&uri, &doc);
        // Futures are lazy: without `.await` nothing would be stored and storage errors would be
        // silently dropped.
        self.logic
            .store
            .store(Box::new(extracted.into_iter()))
            .await?;

        let next_urls = self.logic.extract.next_sites(&uri, &doc);
        info!("Appended URIs after fetch: {:?}", next_urls);
        // Must be awaited as well, otherwise the discovered URIs never reach the queue.
        self.logic.queue.add(&next_urls).await?;

        // NOTE(review): `Queue::visited` is never called by the driver; confirm whether a
        // successful step should report `uri` as visited here.
        Ok(true)
    }
}