From 7a0cd15018114a422d7daac369acb6a43d87f22f Mon Sep 17 00:00:00 2001
From: Olivier 'reivilibre
Date: Mon, 14 Mar 2022 19:44:53 +0000
Subject: [PATCH] Minor clean ups, plus redirect support

---
 quickpeep/src/bin/{qp-rake.rs => qp-rake1.rs} |  0
 quickpeep/src/raking.rs                       | 54 ++++++++++++++++---
 quickpeep_densedoc/src/lib.rs                 |  5 +-
 3 files changed, 50 insertions(+), 9 deletions(-)
 rename quickpeep/src/bin/{qp-rake.rs => qp-rake1.rs} (100%)

diff --git a/quickpeep/src/bin/qp-rake.rs b/quickpeep/src/bin/qp-rake1.rs
similarity index 100%
rename from quickpeep/src/bin/qp-rake.rs
rename to quickpeep/src/bin/qp-rake1.rs
diff --git a/quickpeep/src/raking.rs b/quickpeep/src/raking.rs
index 6d37eee..5f60de5 100644
--- a/quickpeep/src/raking.rs
+++ b/quickpeep/src/raking.rs
@@ -31,16 +31,24 @@ pub enum RakeOutcome {
     RakedPage(RakedPage),
     RakedFeed(Vec),
     RakedSitemap(Vec),
-    /// The page was not canonical, and should not be indexed.
-    /// However here is the URL of the canonical page.
-    // TODO call this a Redirect and also use for 3xx redirects?
-    NotCanonical {
+    Redirect {
+        reason: RedirectReason,
         new_url: Url,
     },
     TemporaryFailure(TemporaryFailure),
     PermanentFailure(PermanentFailure),
 }
 
+pub enum RedirectReason {
+    /// The page redirected somewhere else.
+    Redirected {
+        /// HTTP Status Code of the redirect
+        http_code: u16
+    },
+    /// The page was not canonical, and should not be indexed.
+    NotCanonical,
+}
+
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub struct UrlRaked {
     pub url: Url,
@@ -66,10 +74,11 @@ pub struct PermanentFailure {
 
 pub enum TemporaryFailureReason {
     MissingInformation(String),
+    ServerError(u16),
 }
 
 pub enum PermanentFailureReason {
-    ResourceDenied(u32),
+    ResourceDenied(u16),
     WrongLanguage(String),
     UnknownContentType(String),
 }
@@ -150,8 +159,41 @@ impl Raker {
             eprintln!("CF? {:?}", is_cf);
         }
 
+        let http_code = response.status().as_u16();
+
+        if response.status().is_redirection() {
+            if let Some(redirect_target) = response.headers().get("location") {
+                let new_url = url.join(redirect_target.to_str()
+                    .context("Failed to convert Location header to str")?)
+                    .context("Failed to resolve Location header target")?;
+
+                return Ok(RakeOutcome::Redirect {
+                    reason: RedirectReason::Redirected {
+                        http_code
+                    },
+                    new_url
+                });
+            } else {
+                bail!("Redirection {:?} received, but no Location header.", response.status());
+            }
+        }
+
+        if response.status().is_client_error() {
+            return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
+                reason: PermanentFailureReason::ResourceDenied(http_code)
+            }))
+        }
+
+        if response.status().is_server_error() {
+            return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure {
+                reason: TemporaryFailureReason::ServerError(http_code),
+                // Try again tomorrow. Maybe the server is overloaded?
+                backoff_sec: 86400
+            }))
+        }
+
         if !response.status().is_success() {
-            bail!("Not successful: {:?}", response.status().as_u16());
+            bail!("Unknown failure code: {:?}", response.status());
         }
 
         let content_type = if let Some(content_type) = response.headers().get("content-type") {
diff --git a/quickpeep_densedoc/src/lib.rs b/quickpeep_densedoc/src/lib.rs
index ad233a4..f92f4e2 100644
--- a/quickpeep_densedoc/src/lib.rs
+++ b/quickpeep_densedoc/src/lib.rs
@@ -21,9 +21,8 @@ impl DenseDocument {
 pub struct DenseHead {
     title: String,
     feed_urls: Vec,
-    // TODO how best to expose this?? We actually don't care about storing it though ...
-    // Probably move to the raker.
-    canonical: (), // TODO I'm sure we'd benefit by digging up some metadata, but that's possibly for later :)
+    /// URL to icon of the page. May be empty if none were discovered.
+    icon: String,
 }
 
 #[derive(Serialize, Deserialize, Clone, Debug)]