changeset 7:8dee877af779

Expand Document: Allow fetching attributes
author Lewin Bormann <lbo@spheniscida.de>
date Sun, 22 Mar 2020 23:08:08 +0100
parents e2526accc58f
children 6027d11cb86d
files src/extract.rs src/implem.rs
diffstat 2 files changed, 24 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/src/extract.rs	Sun Mar 22 14:23:19 2020 +0100
+++ b/src/extract.rs	Sun Mar 22 23:08:08 2020 +0100
@@ -26,11 +26,10 @@
             html: Html::parse_document(content),
         }
     }
-    pub fn get_fields(&self, selectors: &[&str]) -> Result<Vec<Vec<String>>, HTTPError> {
+    pub fn get_contents(&self, selectors: &[&str]) -> Result<Vec<Vec<String>>, HTTPError> {
         let mut r = Vec::with_capacity(selectors.len());
         for sel in selectors {
-            let selector = scraper::Selector::parse(sel)
-                .map_err(|_| HTTPError::LogicError(format!("failed to parse selector {}", sel)))?;
+            let selector = parse_selector(sel)?;
             let selected = self.html.select(&selector);
 
             let mut values = vec![];
@@ -41,10 +40,24 @@
         }
         Ok(r)
     }
-    pub fn get_field(&self, selector: &str) -> Result<Vec<String>, HTTPError> {
-        let v = self.get_fields(&[selector])?;
+    pub fn get_content(&self, selector: &str) -> Result<Vec<String>, HTTPError> {
+        let v = self.get_contents(&[selector])?;
         Ok(v[0].clone())
     }
+    pub fn get_attr(&self, selector: &str, attr: &str) -> Result<Vec<String>, HTTPError> {
+        let selector = parse_selector(selector)?;
+        let sel = self.html.select(&selector);
+        let mut fetched = vec![];
+        for item in sel {
+            fetched.push(item.value().attr(attr).unwrap_or("").to_string());
+        }
+        Ok(fetched)
+    }
+}
+
+fn parse_selector(sel: &str) -> Result<scraper::Selector, HTTPError> {
+    scraper::Selector::parse(sel)
+        .map_err(|_| HTTPError::LogicError(format!("failed to parse selector {}", sel)))
 }
 
 pub trait Extracted {
@@ -69,7 +82,7 @@
     fn test_document() {
         let content = String::from_utf8(std::fs::read("audiophil_sony.html").unwrap()).unwrap();
         let ex = Document::new(&content);
-        let mut data = ex.get_fields(&[".bez.neu", ".preis strong"]).unwrap();
+        let mut data = ex.get_contents(&[".bez.neu", ".preis strong"]).unwrap();
         let prices = data.pop().unwrap();
         let descs = data.pop().unwrap();
         let zipped: Vec<(String, String)> = descs
@@ -78,5 +91,8 @@
             .map(|(desc, price)| (desc.trim().to_string(), price.trim().to_string()))
             .collect();
         println!("{:?}", zipped);
+
+        let links = ex.get_attr("a", "href").unwrap();
+        println!("All links: {:?}", links);
     }
 }
--- a/src/implem.rs	Sun Mar 22 14:23:19 2020 +0100
+++ b/src/implem.rs	Sun Mar 22 23:08:08 2020 +0100
@@ -21,11 +21,11 @@
 
 impl extract::Extractor for AudiophilItemPriceExtractor {
     fn extract(&mut self, doc: &extract::Document) -> Option<Box<dyn extract::Extracted>> {
-        let mut data = doc.get_fields(&[".bez.neu", ".preis strong"]).unwrap();
+        let mut data = doc.get_contents(&[".bez.neu", ".preis strong"]).unwrap();
         let prices = data.pop().unwrap();
         let descs = data.pop().unwrap();
 
-        let onlytext = rex::compile("^[a-zA-Z0-9\\.+/ -]+").unwrap();
+        let onlytext = rex::compile("^[a-zA-Z0-9\\.,+/ -]+").unwrap();
 
         let zipped: Vec<(String, String)> = descs
             .into_iter()