Support canonical URLs and pass down is_cf

This commit is contained in:
Olivier 'reivilibre' 2022-03-14 19:46:57 +00:00
parent 7a0cd15018
commit f1ce8b2c62
1 changed file with 30 additions and 10 deletions

View File

@@ -153,11 +153,11 @@ impl Raker {
) -> anyhow::Result<RakeOutcome> { ) -> anyhow::Result<RakeOutcome> {
let response = client.get(url.clone()).send().await?; let response = client.get(url.clone()).send().await?;
if let Some(remote_addr) = response.remote_addr() { let is_cf = if let Some(remote_addr) = response.remote_addr() {
eprintln!("rA {:?}", remote_addr); self.antifeature_ip_set.contains(remote_addr.ip())
let is_cf = self.antifeature_ip_set.contains(remote_addr.ip()); } else {
eprintln!("CF? {:?}", is_cf); false
} };
let http_code = response.status().as_u16(); let http_code = response.status().as_u16();
@@ -213,9 +213,9 @@ impl Raker {
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page)
{ {
match self.rake_html_page(&content, url) { match self.rake_html_page(&content, url, is_cf) {
Ok(page_rake) => { Ok(page_rake) => {
return Ok(RakeOutcome::RakedPage(page_rake)); return Ok(page_rake);
} }
Err(error) => { Err(error) => {
debug!("Failed to rake HTML page: {:?}", error); debug!("Failed to rake HTML page: {:?}", error);
@@ -254,13 +254,33 @@ impl Raker {
})); }));
} }
pub fn rake_html_page(&self, content: &[u8], url: &Url) -> anyhow::Result<RakedPage> { pub fn rake_html_page(&self, content: &[u8], url: &Url, is_cf: bool) -> anyhow::Result<RakeOutcome> {
let content_str = std::str::from_utf8(content)?; let content_str = std::str::from_utf8(content)?;
let root_node: NodeRef = kuchiki::parse_html().one(content_str); let root_node: NodeRef = kuchiki::parse_html().one(content_str);
// See whether this page is at the canonical URL for the page.
// If it's not, then we redirect the raker to the canonical URL.
if let Ok(canonical_link_node) = root_node.select_first("head link[rel=canonical]") {
if let Some(canonical_href) = canonical_link_node.attributes.borrow().get("href") {
let canonical_url = url.join(canonical_href)
.context("Failed to resolve or parse canonical URL")?;
if &canonical_url != url {
return Ok(RakeOutcome::Redirect {
reason: RedirectReason::NotCanonical,
new_url: canonical_url
});
}
}
}
let mut antifeature_flags = AnalysisAntifeatures::empty(); let mut antifeature_flags = AnalysisAntifeatures::empty();
if is_cf {
antifeature_flags |= AnalysisAntifeatures::CLOUDFLARE;
}
for (engine_antifeature_flag, adblock_engine) in &self.adblock_engines { for (engine_antifeature_flag, adblock_engine) in &self.adblock_engines {
match analyse_with_ad_block_cosmetic_filter( match analyse_with_ad_block_cosmetic_filter(
&root_node, &root_node,
@@ -297,9 +317,9 @@ impl Raker {
let bare_size = serde_bare::to_vec(&dense_doc)?.len(); let bare_size = serde_bare::to_vec(&dense_doc)?.len();
eprintln!("CS {:?}{:?}", content.len(), bare_size); eprintln!("CS {:?}{:?}", content.len(), bare_size);
Ok(RakedPage { Ok(RakeOutcome::RakedPage(RakedPage {
// TODO // TODO
}) }))
} }
} }