Mercurial > lbo > hg > scrapeprice
changeset 2:5b14b84fc45c
Refine fetch logic and add logging
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Fri, 20 Mar 2020 23:00:59 +0100 |
parents | 5f9a2f190f6c |
children | 6f4e48cd69b4 |
files | src/http.rs src/main.rs |
diffstat | 2 files changed, 105 insertions(+), 36 deletions(-) [+] |
line wrap: on
line diff
--- a/src/http.rs Fri Mar 20 23:00:48 2020 +0100 +++ b/src/http.rs Fri Mar 20 23:00:59 2020 +0100 @@ -1,9 +1,11 @@ use std::collections::HashMap; use std::convert::{Into, TryFrom}; use std::error::Error; +use std::fmt; use http; use hyper; +use log::{info, warn, error}; use robots_txt::{matcher, parts::robots}; type HyperHTTPS = @@ -14,10 +16,6 @@ b.build(hyper_rustls::HttpsConnector::new()) } -pub fn bytes_to_str(b: hyper::body::Bytes) -> Result<String, std::string::FromUtf8Error> { - String::from_utf8(b.as_ref().to_vec()) -} - fn robots_ok(robots_txt: &str, uri: &hyper::Uri) -> bool { let r = robots::Robots::from_str(robots_txt); let m = matcher::SimpleMatcher::new(&r.choose_section("*").rules); @@ -25,6 +23,38 @@ m.check_path(uri.path()) && m2.check_path(uri.path()) } +#[derive(Debug)] +pub enum HTTPError { + HyperError(hyper::Error), + LogicError(String), + StatusError(hyper::StatusCode), + HttpError(http::Error), +} + +impl fmt::Display for HTTPError { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + let e; + match self { + HTTPError::HyperError(he) => e = format!("{}", he), + HTTPError::LogicError(s) => e = s.clone(), + HTTPError::StatusError(sc) => e = format!("{}", sc), + HTTPError::HttpError(he) => e = format!("{}", he), + } + write!(f, "HTTPError({})", e)?; + Ok(()) + } +} + +impl Error for HTTPError { + fn source(&self) -> Option<&(dyn Error + 'static)> { + match self { + &HTTPError::HyperError(ref e) => Some(e), + &HTTPError::HttpError(ref e) => Some(e), + _ => None, + } + } +} + pub struct HTTPS { client: HyperHTTPS, agent: String, @@ -37,6 +67,10 @@ pub body: hyper::body::Bytes, } +pub fn bytes_to_str(b: hyper::body::Bytes) -> Result<String, std::string::FromUtf8Error> { + String::from_utf8(b.as_ref().to_vec()) +} + impl HTTPS { pub fn new() -> HTTPS { HTTPS { @@ -46,51 +80,85 @@ } } - pub async fn get(&mut self, uri: hyper::Uri) -> Result<GetResponse, Box<dyn Error>> { + pub async fn get(&mut self, uri: hyper::Uri) -> Result<GetResponse, HTTPError> { if let Ok(true) = self.robots_ok(&uri).await { - return self - .get_nocheck(uri) - .await - .map_err(|e| Box::new(e) as Box<dyn Error>); + return self.get_nocheck(uri).await; } unimplemented!() } - pub async fn get_nocheck(&self, uri: hyper::Uri) -> hyper::Result<GetResponse> { - use follow_redirects::ClientExt; + pub async fn get_nocheck(&self, uri: hyper::Uri) -> Result<GetResponse, HTTPError> { + let max_redirect: i32 = 10; + let mut uri = uri; + let host = uri.host().unwrap().to_string(); + + for i in 0..max_redirect { + let rq = self.make_request(&uri); + let resp = self + .client + .request(rq) + .await + .map_err(HTTPError::HyperError)?; - let rq = self.make_request(&uri); - let cl = self.client.follow_redirects(); - let body = cl.request(rq).await?; - let status = body.status(); - let bytes = hyper::body::to_bytes(body).await?; - println!("GET {:?} => {}", uri, status); - Ok(GetResponse { - status: status, - body: bytes, - }) + info!("({}) GET {:?} => {}", i, uri, resp.status()); + match resp.status().as_u16() { + 200 => { + let status = resp.status(); + let bytes = hyper::body::to_bytes(resp) + .await + .map_err(HTTPError::HyperError)?; + return Ok(GetResponse { + status: status, + body: bytes, + }); + } + 301 | 302 | 303 | 307 | 308 => { + let loc = resp.headers().get("location").or(resp.headers().get("Location")); + if let Some(location) = loc { + uri = hyper::Uri::builder() + .authority(host.as_str()) + .scheme(uri.scheme_str().unwrap_or("https")) + .path_and_query(location.to_str().unwrap()) + .build() + .map_err(HTTPError::HttpError).unwrap(); + info!("({}) GET 302 Redirect to: {}", i, uri); + continue; + } else { + warn!("redirect without location: {:?}", resp.headers()); + return Err(HTTPError::LogicError(format!("redirect without location: {:?}", resp.headers()))) + } + } + 404 => return Err(HTTPError::StatusError(resp.status())), + _ => {} + } + } + Err(HTTPError::LogicError(format!( + "exhausted redirects on {}", + uri + ))) } - async fn robots_ok(&mut self, uri: &hyper::Uri) -> hyper::Result<bool> { + async fn robots_ok(&mut self, uri: &hyper::Uri) -> Result<bool, HTTPError> { let host = uri.host().unwrap_or("_"); - let parts = host.to_string().split(".").collect::<Vec<&str>>(); - println!("checking robots.txt for {}", host); + info!("checking robots.txt for {}", host); match self.robots_txt_cache.get(host) { - Some(e) => Ok(robots_ok(e, uri)), + Some(e) => { + let is_ok = robots_ok(e, uri); + info!("cached robots.txt for {} ok? {}", host, is_ok); + Ok(is_ok) + } _ => { - let mut robots_uri = hyper::Uri::builder() + let robots_uri = hyper::Uri::builder() .authority(host) .scheme(uri.scheme_str().unwrap_or("http")) .path_and_query("/robots.txt") .build() .unwrap(); let resp = self.get_nocheck(robots_uri).await?; - println!("{:?}", resp.body); let robots = bytes_to_str(resp.body).unwrap(); - println!("{}", robots); let is_ok = robots_ok(&robots, uri); self.robots_txt_cache.insert(host.to_string(), robots); - + info!("robots.txt for {} ok? {}", host, is_ok); Ok(is_ok) } }
--- a/src/main.rs Fri Mar 20 23:00:48 2020 +0100 +++ b/src/main.rs Fri Mar 20 23:00:59 2020 +0100 @@ -1,21 +1,22 @@ mod http; -use std::io; -use hyper; -use hyper_rustls; - -use tokio::prelude::*; +use log::{info, warn}; +use env_logger; async fn test_fetch_page() -> hyper::Result<()> { - println!("testing!"); let mut cl = http::HTTPS::new(); let res = cl.get("https://audiophil-foto.de/de/shop/kameras/sony/".parse::<hyper::Uri>().unwrap()).await.unwrap(); - println!("{}\n{}", res.status, http::bytes_to_str(res.body).unwrap()); + info!("Fetch 1 was {}", res.status); + let res = cl.get("https://audiophil-foto.de/de/shop/kameras/nikon/".parse::<hyper::Uri>().unwrap()).await.unwrap(); + info!("Fetch 2 was {}", res.status); Ok(()) } #[tokio::main] async fn main() { + env_logger::Builder::from_default_env().filter(None, log::LevelFilter::Info).init(); + + info!("scrapeprice: init"); test_fetch_page().await.unwrap(); }