Mercurial > lbo > hg > scrapeprice
view src/extract.rs @ 6:e2526accc58f
Get it to a working scraper
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Sun, 22 Mar 2020 14:23:19 +0100 |
parents | cc875ec12026 |
children | 8dee877af779 |
line wrap: on
line source
#![allow(unused)] use crate::err::{logic_err, HTTPError}; use crate::http; use std::iter; use log::info; use scraper::Html; /// A fetched document is given to the Extractor which gets information from it and returns the /// storable data. pub struct Document { html: Html, } pub fn parse_response(r: http::GetResponse) -> Result<Document, HTTPError> { let content = http::bytes_to_str(r.body)?; let doc = Html::parse_document(content.as_str()); Ok(Document { html: doc }) } impl Document { fn new(content: &str) -> Document { Document { html: Html::parse_document(content), } } pub fn get_fields(&self, selectors: &[&str]) -> Result<Vec<Vec<String>>, HTTPError> { let mut r = Vec::with_capacity(selectors.len()); for sel in selectors { let selector = scraper::Selector::parse(sel) .map_err(|_| HTTPError::LogicError(format!("failed to parse selector {}", sel)))?; let selected = self.html.select(&selector); let mut values = vec![]; for e in selected { values.push(e.inner_html()); } r.push(values); } Ok(r) } pub fn get_field(&self, selector: &str) -> Result<Vec<String>, HTTPError> { let v = self.get_fields(&[selector])?; Ok(v[0].clone()) } } pub trait Extracted { fn all(&mut self) -> Box<dyn iter::Iterator<Item = (String, String)> + Send> { Box::new(iter::empty()) } } pub trait Extractor { fn extract(&mut self, doc: &Document) -> Option<Box<dyn Extracted>> { None } } #[cfg(test)] mod tests { use super::Document; use std::iter; #[test] fn test_document() { let content = String::from_utf8(std::fs::read("audiophil_sony.html").unwrap()).unwrap(); let ex = Document::new(&content); let mut data = ex.get_fields(&[".bez.neu", ".preis strong"]).unwrap(); let prices = data.pop().unwrap(); let descs = data.pop().unwrap(); let zipped: Vec<(String, String)> = descs .into_iter() .zip(prices) .map(|(desc, price)| (desc.trim().to_string(), price.trim().to_string())) .collect(); println!("{:?}", zipped); } }