changeset 3:6f4e48cd69b4

Start work on extract module
author Lewin Bormann <lbo@spheniscida.de>
date Sat, 21 Mar 2020 17:00:04 +0100
parents 5b14b84fc45c
children 768efcbf56a3
files src/extract.rs src/main.rs
diffstat 2 files changed, 43 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/extract.rs	Sat Mar 21 17:00:04 2020 +0100
@@ -0,0 +1,42 @@
+use crate::http;
+
+use log::info;
+use scraper::Html;
+
+/// A parsed HTML document from which elements can be selected with CSS selectors.
+#[derive(Debug)]
+pub struct Extract {
+    /// The parsed document that selectors are run against.
+    html: Html,
+}
+
+/// Builds an [`Extract`] from a fetched HTTP response.
+///
+/// The body of `r` is converted to text with `http::bytes_to_str`, and that text is parsed as
+/// an HTML document.
+///
+/// # Panics
+///
+/// Panics if `http::bytes_to_str` does not yield a value for the body, because its result is
+/// unwrapped.
+pub fn parse_response(r: http::GetResponse) -> Extract {
+    let text = http::bytes_to_str(r.body).unwrap();
+    Extract::new(text.as_str())
+}
+
+impl Extract {
+    /// Parses `content` as a complete HTML document and wraps the result.
+    fn new(content: &str) -> Extract {
+        let html = Html::parse_document(content);
+        Self { html }
+    }
+
+    /// Prints the inner HTML of every element matching the CSS `selector`.
+    ///
+    /// Each match is written to stdout on its own line, prefixed with `selected: `.
+    /// Nothing is returned.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `selector` cannot be parsed as a selector.
+    pub fn get_field(&self, selector: &str) {
+        let compiled = scraper::Selector::parse(selector).unwrap();
+        self.html
+            .select(&compiled)
+            .map(|element| element.inner_html())
+            .for_each(|inner| println!("selected: {}", inner));
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::Extract;
+
+    /// Smoke test: parses the saved fixture page and runs two selectors against it.
+    #[test]
+    fn test_extract() {
+        // The fixture path is resolved relative to the working directory of the test run.
+        let content = std::fs::read_to_string("audiophil_sony.html")
+            .expect("fixture audiophil_sony.html should exist and be valid UTF-8");
+        let ex = Extract::new(&content);
+        // `get_field` only prints its matches, so there is nothing to assert on yet.
+        ex.get_field(".bez.neu");
+        ex.get_field(".preis strong");
+    }
+}
--- a/src/main.rs	Fri Mar 20 23:00:59 2020 +0100
+++ b/src/main.rs	Sat Mar 21 17:00:04 2020 +0100
@@ -1,3 +1,4 @@
+mod extract;
 mod http;
 
 use log::{info, warn};