From bff48f35f446ad9e598b61633080f21aa803b9ce Mon Sep 17 00:00:00 2001
From: Olivier
Date: Mon, 28 Nov 2022 23:15:37 +0000
Subject: [PATCH] Make the raker attempt HTTPS upgrades

Not only does this improve security for searchers later on, it also enables us
to cut down on the number of duplicates quite easily.
---
 quickpeep_raker/src/raking.rs | 45 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/quickpeep_raker/src/raking.rs b/quickpeep_raker/src/raking.rs
index 49c1ece..a9e2f6e 100644
--- a/quickpeep_raker/src/raking.rs
+++ b/quickpeep_raker/src/raking.rs
@@ -10,12 +10,13 @@ use image::imageops::FilterType;
 use image::{GenericImageView, ImageFormat};
 use itertools::Itertools;
 use lazy_static::lazy_static;
-use log::debug;
+use log::{debug, info, warn};
 use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind};
 use reqwest::header::HeaderMap;
 use reqwest::{Client, Response, Url};
 use serde::{Deserialize, Serialize};
 use sitemap::reader::SiteMapEntity;
+use std::cmp::Ordering;
 use std::collections::{HashMap, HashSet};
 use std::error::Error;
 use std::fmt::{Debug, Display, Formatter};
@@ -59,6 +60,8 @@ pub enum RedirectReason {
     },
     /// The page was not canonical, and should not be indexed.
     NotCanonical,
+    /// Upgrade from a HTTP to HTTPS URL (or equivalent).
+    SecureUpgrade,
 }
 
 #[derive(Clone, Debug, PartialEq, Eq)]
@@ -295,6 +298,30 @@ pub struct Raker {
 }
 
 impl Raker {
+    /// Figure out whether we can upgrade a URL to HTTPS.
+    pub async fn try_upgrade_to_https(
+        &self,
+        url: &Url,
+        client: &Client,
+    ) -> anyhow::Result<Option<Url>> {
+        if url.scheme().compare_no_case("http") == Ordering::Equal {
+            // Try to upgrade to HTTPS if we can.
+            let mut https_url = url.clone();
+            https_url.set_scheme("https");
+            client
+                .head(&https_url)
+                .timeout(Duration::from_secs(10))
+                .send()
+                .await
+                .context("failed to make HEAD request")?
+                .error_for_status()
+                .context("bad response for HEAD request")?;
+            Ok(Some(https_url))
+        } else {
+            Ok(None)
+        }
+    }
+
     /// Rakes a resource by URL.
     ///
     /// `intent` specifies the kind of resource we're expecting. This matters in a few circumstances,
@@ -305,6 +332,22 @@ impl Raker {
         intent: RakeIntent,
         client: &Client,
     ) -> anyhow::Result<RakeOutcome> {
+        match self.try_upgrade_to_https(url, client).await {
+            Ok(Some(upgraded)) => {
+                return Ok(RakeOutcome::Redirect {
+                    reason: RedirectReason::SecureUpgrade,
+                    new_url: upgraded,
+                });
+            }
+            Ok(None) => {
+                // continue
+            }
+            Err(err) => {
+                info!("can't upgrade {url} to HTTPS: {err:?}");
+                // continue
+            }
+        }
+
         let response = client.get(url.clone()).send().await?;
 
         let is_cf = if let Some(remote_addr) = response.remote_addr() {