Mercurial > lbo > hg > scrapeprice
changeset 4:768efcbf56a3
Move error elsewhere and enhance Extract interface
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Sat, 21 Mar 2020 17:20:50 +0100 |
parents | 6f4e48cd69b4 |
children | cc875ec12026 |
files | src/err.rs src/extract.rs src/http.rs src/main.rs |
diffstat | 4 files changed, 72 insertions(+), 41 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/err.rs Sat Mar 21 17:20:50 2020 +0100 @@ -0,0 +1,41 @@ + +use std::fmt; +use std::error::Error; + +pub fn logic_err(e: &dyn Error) -> HTTPError { + let s = format!("{}", e); + HTTPError::LogicError(s) +} + +#[derive(Debug)] +pub enum HTTPError { + HyperError(hyper::Error), + LogicError(String), + StatusError(hyper::StatusCode), + HttpError(http::Error), +} + +impl fmt::Display for HTTPError { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + let e; + match self { + HTTPError::HyperError(he) => e = format!("{}", he), + HTTPError::LogicError(s) => e = s.clone(), + HTTPError::StatusError(sc) => e = format!("{}", sc), + HTTPError::HttpError(he) => e = format!("{}", he), + } + write!(f, "HTTPError({})", e)?; + Ok(()) + } +} + +impl Error for HTTPError { + fn source(&self) -> Option<&(dyn Error + 'static)> { + match self { + &HTTPError::HyperError(ref e) => Some(e), + &HTTPError::HttpError(ref e) => Some(e), + _ => None, + } + } +} +
--- a/src/extract.rs Sat Mar 21 17:00:04 2020 +0100 +++ b/src/extract.rs Sat Mar 21 17:20:50 2020 +0100 @@ -1,3 +1,4 @@ +use crate::err::{logic_err, HTTPError}; use crate::http; use log::info; @@ -19,12 +20,25 @@ html: Html::parse_document(content), } } - pub fn get_field(&self, selector: &str) { - let selector = scraper::Selector::parse(selector).unwrap(); - let selected = self.html.select(&selector); - for e in selected { - println!("selected: {}", e.inner_html()); + pub fn get_fields(&self, selectors: &[&str]) -> Result<Vec<Vec<String>>, HTTPError> { + let mut r = Vec::with_capacity(selectors.len()); + for sel in selectors { + let selector = scraper::Selector::parse(sel) + .map_err(|_| HTTPError::LogicError(format!("failed to parse selector {}", sel)))?; + let selected = self.html.select(&selector); + + let mut values = vec![]; + for e in selected { + println!("selected: {}", e.inner_html()); + values.push(e.inner_html()); + } + r.push(values); } + Ok(r) + } + pub fn get_field(&self, selector: &str) -> Result<Vec<String>, HTTPError> { + let v = self.get_fields(&[selector])?; + Ok(v[0].clone()) } } @@ -32,11 +46,18 @@ mod tests { use super::Extract; + use std::iter; + #[test] fn test_extract() { let content = String::from_utf8(std::fs::read("audiophil_sony.html").unwrap()).unwrap(); let ex = Extract::new(&content); - ex.get_field(".bez.neu"); - ex.get_field(".preis strong"); + let mut data = ex.get_fields(&[".bez.neu", ".preis strong"]).unwrap(); + let prices = data.pop().unwrap(); + let descs = data.pop().unwrap(); + let zipped: Vec<(String, String)> = descs.into_iter().zip(prices).map(|(desc, price)| { + (desc.trim().to_string(), price.trim().to_string()) + }).collect(); + println!("{:?}", zipped); } }
--- a/src/http.rs Sat Mar 21 17:00:04 2020 +0100 +++ b/src/http.rs Sat Mar 21 17:20:50 2020 +0100 @@ -1,7 +1,7 @@ +use crate::err::HTTPError; + use std::collections::HashMap; use std::convert::{Into, TryFrom}; -use std::error::Error; -use std::fmt; use http; use hyper; @@ -23,38 +23,6 @@ m.check_path(uri.path()) && m2.check_path(uri.path()) } -#[derive(Debug)] -pub enum HTTPError { - HyperError(hyper::Error), - LogicError(String), - StatusError(hyper::StatusCode), - HttpError(http::Error), -} - -impl fmt::Display for HTTPError { - fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { - let e; - match self { - HTTPError::HyperError(he) => e = format!("{}", he), - HTTPError::LogicError(s) => e = s.clone(), - HTTPError::StatusError(sc) => e = format!("{}", sc), - HTTPError::HttpError(he) => e = format!("{}", he), - } - write!(f, "HTTPError({})", e)?; - Ok(()) - } -} - -impl Error for HTTPError { - fn source(&self) -> Option<&(dyn Error + 'static)> { - match self { - &HTTPError::HyperError(ref e) => Some(e), - &HTTPError::HttpError(ref e) => Some(e), - _ => None, - } - } -} - pub struct HTTPS { client: HyperHTTPS, agent: String,