Mercurial > lbo > hg > scrapeprice
view src/extract.rs @ 18:233c28d6d968
Remove example_main and add some docs
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Tue, 22 Sep 2020 13:29:39 +0200 |
parents | 6027d11cb86d |
children |
line wrap: on
line source
#![allow(unused)] use crate::err::{logic_err, HTTPError}; use crate::http; use std::iter; use log::info; use scraper::Html; /// A fetched document is given to the Extractor which gets information from it and returns the /// storable data. The underlying logic is implemented by the `scraper` crate. pub struct Document { html: Html, } pub fn parse_response(r: http::GetResponse) -> Result<Document, HTTPError> { let content = http::bytes_to_str(r.body)?; let doc = Html::parse_document(content.as_str()); Ok(Document { html: doc }) } impl Document { fn new(content: &str) -> Document { Document { html: Html::parse_document(content), } } /// For every CSS selector in `selectors`, return a vec of contents in that selector. pub fn get_contents(&self, selectors: &[&str]) -> Result<Vec<Vec<String>>, HTTPError> { let mut r = Vec::with_capacity(selectors.len()); for sel in selectors { let selector = parse_selector(sel)?; let selected = self.html.select(&selector); let mut values = vec![]; for e in selected { values.push(e.inner_html()); } r.push(values); } Ok(r) } /// For a selector, return a vec of contents for the selector. pub fn get_content(&self, selector: &str) -> Result<Vec<String>, HTTPError> { let v = self.get_contents(&[selector])?; Ok(v[0].clone()) } /// For the elements described by selector, return the attributes pub fn get_attr(&self, selector: &str, attr: &str) -> Result<Vec<String>, HTTPError> { let selector = parse_selector(selector)?; let sel = self.html.select(&selector); let mut fetched = vec![]; for item in sel { fetched.push(item.value().attr(attr).unwrap_or("").to_string()); } Ok(fetched) } } fn parse_selector(sel: &str) -> Result<scraper::Selector, HTTPError> { scraper::Selector::parse(sel) .map_err(|_| HTTPError::LogicError(format!("failed to parse selector {}", sel))) } #[cfg(test)] mod tests { use super::Document; use std::iter; #[test] fn test_document() { let content = String::from_utf8(std::fs::read("audiophil_sony.html").unwrap()).unwrap(); let ex = Document::new(&content); let mut data = ex.get_contents(&[".bez.neu", ".preis strong"]).unwrap(); let prices = data.pop().unwrap(); let descs = data.pop().unwrap(); let zipped: Vec<(String, String)> = descs .into_iter() .zip(prices) .map(|(desc, price)| (desc.trim().to_string(), price.trim().to_string())) .collect(); println!("{:?}", zipped); let links = ex.get_attr("a", "href").unwrap(); println!("All links: {:?}", links); } }