diff --git a/quickpeep/src/raking.rs b/quickpeep/src/raking.rs index 5f60de5..d8735d1 100644 --- a/quickpeep/src/raking.rs +++ b/quickpeep/src/raking.rs @@ -153,11 +153,11 @@ impl Raker { ) -> anyhow::Result { let response = client.get(url.clone()).send().await?; - if let Some(remote_addr) = response.remote_addr() { - eprintln!("rA {:?}", remote_addr); - let is_cf = self.antifeature_ip_set.contains(remote_addr.ip()); - eprintln!("CF? {:?}", is_cf); - } + let is_cf = if let Some(remote_addr) = response.remote_addr() { + self.antifeature_ip_set.contains(remote_addr.ip()) + } else { + false + }; let http_code = response.status().as_u16(); @@ -213,9 +213,9 @@ impl Raker { if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) { - match self.rake_html_page(&content, url) { + match self.rake_html_page(&content, url, is_cf) { Ok(page_rake) => { - return Ok(RakeOutcome::RakedPage(page_rake)); + return Ok(page_rake); } Err(error) => { debug!("Failed to rake HTML page: {:?}", error); @@ -254,13 +254,33 @@ impl Raker { })); } - pub fn rake_html_page(&self, content: &[u8], url: &Url) -> anyhow::Result { + pub fn rake_html_page(&self, content: &[u8], url: &Url, is_cf: bool) -> anyhow::Result { let content_str = std::str::from_utf8(content)?; let root_node: NodeRef = kuchiki::parse_html().one(content_str); + // See whether this page is at the canonical URL for the page. + // If it's not, then we redirect the raker to the canonical URL. + if let Ok(canonical_link_node) = root_node.select_first("head link[rel=canonical]") { + if let Some(canonical_href) = canonical_link_node.attributes.borrow().get("href") { + let canonical_url = url.join(canonical_href) + .context("Failed to resolve or parse canonical URL")?; + + if &canonical_url != url { + return Ok(RakeOutcome::Redirect { + reason: RedirectReason::NotCanonical, + new_url: canonical_url + }); + } + } + } + let mut antifeature_flags = AnalysisAntifeatures::empty(); + if is_cf { + antifeature_flags |= AnalysisAntifeatures::CLOUDFLARE; + } + for (engine_antifeature_flag, adblock_engine) in &self.adblock_engines { match analyse_with_ad_block_cosmetic_filter( &root_node, @@ -297,9 +317,9 @@ impl Raker { let bare_size = serde_bare::to_vec(&dense_doc)?.len(); eprintln!("CS {:?} → {:?}", content.len(), bare_size); - Ok(RakedPage { + Ok(RakeOutcome::RakedPage(RakedPage { // TODO - }) + })) } }