This commit is contained in:
Olivier 'reivilibre' 2022-03-14 19:47:03 +00:00
parent f1ce8b2c62
commit df50f3607d
2 changed files with 28 additions and 17 deletions

View File

@ -43,7 +43,7 @@ pub enum RedirectReason {
/// The page redirected somewhere else. /// The page redirected somewhere else.
Redirected { Redirected {
/// HTTP Status Code of the redirect /// HTTP Status Code of the redirect
http_code: u16 http_code: u16,
}, },
/// The page was not canonical, and should not be indexed. /// The page was not canonical, and should not be indexed.
NotCanonical, NotCanonical,
@ -163,33 +163,38 @@ impl Raker {
if response.status().is_redirection() { if response.status().is_redirection() {
if let Some(redirect_target) = response.headers().get("location") { if let Some(redirect_target) = response.headers().get("location") {
let new_url = url.join(redirect_target.to_str() let new_url = url
.context("Failed to convert Location header to str")?) .join(
redirect_target
.to_str()
.context("Failed to convert Location header to str")?,
)
.context("Failed to resolve Location header target")?; .context("Failed to resolve Location header target")?;
return Ok(RakeOutcome::Redirect { return Ok(RakeOutcome::Redirect {
reason: RedirectReason::Redirected { reason: RedirectReason::Redirected { http_code },
http_code new_url,
},
new_url
}); });
} else { } else {
bail!("Redirection {:?} received, but no Location header.", response.status()); bail!(
"Redirection {:?} received, but no Location header.",
response.status()
);
} }
} }
if response.status().is_client_error() { if response.status().is_client_error() {
return Ok(RakeOutcome::PermanentFailure(PermanentFailure { return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
reason: PermanentFailureReason::ResourceDenied(http_code) reason: PermanentFailureReason::ResourceDenied(http_code),
})) }));
} }
if response.status().is_server_error() { if response.status().is_server_error() {
return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure { return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure {
reason: TemporaryFailureReason::ServerError(http_code), reason: TemporaryFailureReason::ServerError(http_code),
// Try again tomorrow. Maybe the server is overloaded? // Try again tomorrow. Maybe the server is overloaded?
backoff_sec: 86400 backoff_sec: 86400,
})) }));
} }
if !response.status().is_success() { if !response.status().is_success() {
@ -254,7 +259,12 @@ impl Raker {
})); }));
} }
pub fn rake_html_page(&self, content: &[u8], url: &Url, is_cf: bool) -> anyhow::Result<RakeOutcome> { pub fn rake_html_page(
&self,
content: &[u8],
url: &Url,
is_cf: bool,
) -> anyhow::Result<RakeOutcome> {
let content_str = std::str::from_utf8(content)?; let content_str = std::str::from_utf8(content)?;
let root_node: NodeRef = kuchiki::parse_html().one(content_str); let root_node: NodeRef = kuchiki::parse_html().one(content_str);
@ -263,13 +273,14 @@ impl Raker {
// If it's not, then we redirect the raker to the canonical URL. // If it's not, then we redirect the raker to the canonical URL.
if let Ok(canonical_link_node) = root_node.select_first("head link[rel=canonical]") { if let Ok(canonical_link_node) = root_node.select_first("head link[rel=canonical]") {
if let Some(canonical_href) = canonical_link_node.attributes.borrow().get("href") { if let Some(canonical_href) = canonical_link_node.attributes.borrow().get("href") {
let canonical_url = url.join(canonical_href) let canonical_url = url
.join(canonical_href)
.context("Failed to resolve or parse canonical URL")?; .context("Failed to resolve or parse canonical URL")?;
if &canonical_url != url { if &canonical_url != url {
return Ok(RakeOutcome::Redirect { return Ok(RakeOutcome::Redirect {
reason: RedirectReason::NotCanonical, reason: RedirectReason::NotCanonical,
new_url: canonical_url new_url: canonical_url,
}); });
} }
} }
@ -310,7 +321,7 @@ impl Raker {
eprintln!("{:#?}", readability.metadata); eprintln!("{:#?}", readability.metadata);
if let Some(node) = readability.article_node { if let Some(_node) = readability.article_node {
//eprintln!("{}", node.to_string()); //eprintln!("{}", node.to_string());
} }

View File

@ -12,7 +12,7 @@ pub struct DenseDocument {
} }
impl DenseDocument { impl DenseDocument {
pub fn from_document(root_node: NodeRef) { pub fn from_document(_root_node: NodeRef) {
todo!() todo!()
} }
} }