Mercurial > lbo > hg > scrapeprice
changeset 3:6f4e48cd69b4
Start work on extract module
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Sat, 21 Mar 2020 17:00:04 +0100 |
parents | 5b14b84fc45c |
children | 768efcbf56a3 |
files | src/extract.rs src/main.rs |
diffstat | 2 files changed, 43 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/extract.rs Sat Mar 21 17:00:04 2020 +0100 @@ -0,0 +1,42 @@ +use crate::http; + +use log::info; +use scraper::Html; + +pub struct Extract { + html: Html, +} + +pub fn parse_response(r: http::GetResponse) -> Extract { + let content = http::bytes_to_str(r.body).unwrap(); + let doc = Html::parse_document(content.as_str()); + Extract { html: doc } +} + +impl Extract { + fn new(content: &str) -> Extract { + Extract { + html: Html::parse_document(content), + } + } + pub fn get_field(&self, selector: &str) { + let selector = scraper::Selector::parse(selector).unwrap(); + let selected = self.html.select(&selector); + for e in selected { + println!("selected: {}", e.inner_html()); + } + } +} + +#[cfg(test)] +mod tests { + use super::Extract; + + #[test] + fn test_extract() { + let content = String::from_utf8(std::fs::read("audiophil_sony.html").unwrap()).unwrap(); + let ex = Extract::new(&content); + ex.get_field(".bez.neu"); + ex.get_field(".preis strong"); + } +}