Mercurial > lbo > hg > scrapeprice
changeset 7:8dee877af779
Expand Document: Allow fetching attributes
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Sun, 22 Mar 2020 23:08:08 +0100 |
parents | e2526accc58f |
children | 6027d11cb86d |
files | src/extract.rs src/implem.rs |
diffstat | 2 files changed, 24 insertions(+), 8 deletions(-) [+] |
line wrap: on
line diff
--- a/src/extract.rs Sun Mar 22 14:23:19 2020 +0100
+++ b/src/extract.rs Sun Mar 22 23:08:08 2020 +0100
@@ -26,11 +26,10 @@
             html: Html::parse_document(content),
         }
     }
-    pub fn get_fields(&self, selectors: &[&str]) -> Result<Vec<Vec<String>>, HTTPError> {
+    pub fn get_contents(&self, selectors: &[&str]) -> Result<Vec<Vec<String>>, HTTPError> {
         let mut r = Vec::with_capacity(selectors.len());
         for sel in selectors {
-            let selector = scraper::Selector::parse(sel)
-                .map_err(|_| HTTPError::LogicError(format!("failed to parse selector {}", sel)))?;
+            let selector = parse_selector(sel)?;
             let selected = self.html.select(&selector);
             let mut values = vec![];
 
@@ -41,10 +40,24 @@
         }
         Ok(r)
     }
-    pub fn get_field(&self, selector: &str) -> Result<Vec<String>, HTTPError> {
-        let v = self.get_fields(&[selector])?;
+    pub fn get_content(&self, selector: &str) -> Result<Vec<String>, HTTPError> {
+        let v = self.get_contents(&[selector])?;
         Ok(v[0].clone())
     }
+    pub fn get_attr(&self, selector: &str, attr: &str) -> Result<Vec<String>, HTTPError> {
+        let selector = parse_selector(selector)?;
+        let sel = self.html.select(&selector);
+        let mut fetched = vec![];
+        for item in sel {
+            fetched.push(item.value().attr(attr).unwrap_or("").to_string());
+        }
+        Ok(fetched)
+    }
+}
+
+fn parse_selector(sel: &str) -> Result<scraper::Selector, HTTPError> {
+    scraper::Selector::parse(sel)
+        .map_err(|_| HTTPError::LogicError(format!("failed to parse selector {}", sel)))
 }
 
 pub trait Extracted {
@@ -69,7 +82,7 @@
     fn test_document() {
         let content = String::from_utf8(std::fs::read("audiophil_sony.html").unwrap()).unwrap();
         let ex = Document::new(&content);
-        let mut data = ex.get_fields(&[".bez.neu", ".preis strong"]).unwrap();
+        let mut data = ex.get_contents(&[".bez.neu", ".preis strong"]).unwrap();
         let prices = data.pop().unwrap();
         let descs = data.pop().unwrap();
         let zipped: Vec<(String, String)> = descs
@@ -78,5 +91,8 @@
             .map(|(desc, price)| (desc.trim().to_string(), price.trim().to_string()))
             .collect();
         println!("{:?}", zipped);
+
+        let links = ex.get_attr("a", "href").unwrap();
+        println!("All links: {:?}", links);
     }
 }
--- a/src/implem.rs Sun Mar 22 14:23:19 2020 +0100
+++ b/src/implem.rs Sun Mar 22 23:08:08 2020 +0100
@@ -21,11 +21,11 @@
 
 impl extract::Extractor for AudiophilItemPriceExtractor {
     fn extract(&mut self, doc: &extract::Document) -> Option<Box<dyn extract::Extracted>> {
-        let mut data = doc.get_fields(&[".bez.neu", ".preis strong"]).unwrap();
+        let mut data = doc.get_contents(&[".bez.neu", ".preis strong"]).unwrap();
         let prices = data.pop().unwrap();
         let descs = data.pop().unwrap();
 
-        let onlytext = rex::compile("^[a-zA-Z0-9\\.+/ -]+").unwrap();
+        let onlytext = rex::compile("^[a-zA-Z0-9\\.,+/ -]+").unwrap();
 
         let zipped: Vec<(String, String)> = descs
             .into_iter()