Support canonical URLs and pass down is_cf
This commit is contained in:
parent
7a0cd15018
commit
f1ce8b2c62
|
@ -153,11 +153,11 @@ impl Raker {
|
||||||
) -> anyhow::Result<RakeOutcome> {
|
) -> anyhow::Result<RakeOutcome> {
|
||||||
let response = client.get(url.clone()).send().await?;
|
let response = client.get(url.clone()).send().await?;
|
||||||
|
|
||||||
if let Some(remote_addr) = response.remote_addr() {
|
let is_cf = if let Some(remote_addr) = response.remote_addr() {
|
||||||
eprintln!("rA {:?}", remote_addr);
|
self.antifeature_ip_set.contains(remote_addr.ip())
|
||||||
let is_cf = self.antifeature_ip_set.contains(remote_addr.ip());
|
} else {
|
||||||
eprintln!("CF? {:?}", is_cf);
|
false
|
||||||
}
|
};
|
||||||
|
|
||||||
let http_code = response.status().as_u16();
|
let http_code = response.status().as_u16();
|
||||||
|
|
||||||
|
@ -213,9 +213,9 @@ impl Raker {
|
||||||
|
|
||||||
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page)
|
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page)
|
||||||
{
|
{
|
||||||
match self.rake_html_page(&content, url) {
|
match self.rake_html_page(&content, url, is_cf) {
|
||||||
Ok(page_rake) => {
|
Ok(page_rake) => {
|
||||||
return Ok(RakeOutcome::RakedPage(page_rake));
|
return Ok(page_rake);
|
||||||
}
|
}
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
debug!("Failed to rake HTML page: {:?}", error);
|
debug!("Failed to rake HTML page: {:?}", error);
|
||||||
|
@ -254,13 +254,33 @@ impl Raker {
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn rake_html_page(&self, content: &[u8], url: &Url) -> anyhow::Result<RakedPage> {
|
pub fn rake_html_page(&self, content: &[u8], url: &Url, is_cf: bool) -> anyhow::Result<RakeOutcome> {
|
||||||
let content_str = std::str::from_utf8(content)?;
|
let content_str = std::str::from_utf8(content)?;
|
||||||
|
|
||||||
let root_node: NodeRef = kuchiki::parse_html().one(content_str);
|
let root_node: NodeRef = kuchiki::parse_html().one(content_str);
|
||||||
|
|
||||||
|
// See whether this page is at the canonical URL for the page.
|
||||||
|
// If it's not, then we redirect the raker to the canonical URL.
|
||||||
|
if let Ok(canonical_link_node) = root_node.select_first("head link[rel=canonical]") {
|
||||||
|
if let Some(canonical_href) = canonical_link_node.attributes.borrow().get("href") {
|
||||||
|
let canonical_url = url.join(canonical_href)
|
||||||
|
.context("Failed to resolve or parse canonical URL")?;
|
||||||
|
|
||||||
|
if &canonical_url != url {
|
||||||
|
return Ok(RakeOutcome::Redirect {
|
||||||
|
reason: RedirectReason::NotCanonical,
|
||||||
|
new_url: canonical_url
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let mut antifeature_flags = AnalysisAntifeatures::empty();
|
let mut antifeature_flags = AnalysisAntifeatures::empty();
|
||||||
|
|
||||||
|
if is_cf {
|
||||||
|
antifeature_flags |= AnalysisAntifeatures::CLOUDFLARE;
|
||||||
|
}
|
||||||
|
|
||||||
for (engine_antifeature_flag, adblock_engine) in &self.adblock_engines {
|
for (engine_antifeature_flag, adblock_engine) in &self.adblock_engines {
|
||||||
match analyse_with_ad_block_cosmetic_filter(
|
match analyse_with_ad_block_cosmetic_filter(
|
||||||
&root_node,
|
&root_node,
|
||||||
|
@ -297,9 +317,9 @@ impl Raker {
|
||||||
let bare_size = serde_bare::to_vec(&dense_doc)?.len();
|
let bare_size = serde_bare::to_vec(&dense_doc)?.len();
|
||||||
eprintln!("CS {:?} → {:?}", content.len(), bare_size);
|
eprintln!("CS {:?} → {:?}", content.len(), bare_size);
|
||||||
|
|
||||||
Ok(RakedPage {
|
Ok(RakeOutcome::RakedPage(RakedPage {
|
||||||
// TODO
|
// TODO
|
||||||
})
|
}))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue