Make the raker attempt HTTPS upgrades
ci/woodpecker/push/check Pipeline failed Details
ci/woodpecker/push/manual Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details

Not only does this improve security for searchers later on,
it also enables us to cut down on the number of duplicates quite easily.
This commit is contained in:
Olivier 'reivilibre' 2022-11-28 23:15:37 +00:00
parent 34a05f84ff
commit bff48f35f4
1 changed files with 44 additions and 1 deletions

View File

@ -10,12 +10,13 @@ use image::imageops::FilterType;
use image::{GenericImageView, ImageFormat}; use image::{GenericImageView, ImageFormat};
use itertools::Itertools; use itertools::Itertools;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use log::debug; use log::{debug, info, warn};
use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind}; use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind};
use reqwest::header::HeaderMap; use reqwest::header::HeaderMap;
use reqwest::{Client, Response, Url}; use reqwest::{Client, Response, Url};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use sitemap::reader::SiteMapEntity; use sitemap::reader::SiteMapEntity;
use std::cmp::Ordering;
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use std::error::Error; use std::error::Error;
use std::fmt::{Debug, Display, Formatter}; use std::fmt::{Debug, Display, Formatter};
@ -59,6 +60,8 @@ pub enum RedirectReason {
}, },
/// The page was not canonical, and should not be indexed. /// The page was not canonical, and should not be indexed.
NotCanonical, NotCanonical,
/// Upgrade from a HTTP to HTTPS URL (or equivalent).
SecureUpgrade,
} }
#[derive(Clone, Debug, PartialEq, Eq)] #[derive(Clone, Debug, PartialEq, Eq)]
@ -295,6 +298,30 @@ pub struct Raker {
} }
impl Raker { impl Raker {
/// Figure out whether we can upgrade a URL to HTTPS.
///
/// For an `http` URL, a `HEAD` request (with a 10-second timeout) is sent to the
/// same URL with its scheme swapped to `https`.
///
/// # Returns
///
/// - `Ok(Some(https_url))` if the `HEAD` request to the HTTPS URL completed and the
///   response did not carry an error status.
/// - `Ok(None)` if `url` does not use the `http` scheme, so there is nothing to upgrade.
///
/// # Errors
///
/// Returns an error if the scheme of the URL cannot be changed to `https`, if the
/// `HEAD` request fails (including hitting the timeout), or if the response status
/// is rejected by `error_for_status`.
pub async fn try_upgrade_to_https(
    &self,
    url: &Url,
    client: &Client,
) -> anyhow::Result<Option<Url>> {
    // Only plain-HTTP URLs are candidates for an upgrade.
    if !url.scheme().eq_ignore_ascii_case("http") {
        return Ok(None);
    }

    let mut https_url = url.clone();
    // `set_scheme` reports failure as `Err(())`; surface it rather than silently
    // probing (and later returning) a URL whose scheme was never changed.
    https_url
        .set_scheme("https")
        .map_err(|()| anyhow::anyhow!("failed to change the scheme of {url} to https"))?;

    // The request builder takes the URL by value, but we still need `https_url`
    // afterwards to hand back to the caller, hence the clone.
    client
        .head(https_url.clone())
        .timeout(Duration::from_secs(10))
        .send()
        .await
        .context("failed to make HEAD request")?
        .error_for_status()
        .context("bad response for HEAD request")?;

    Ok(Some(https_url))
}
/// Rakes a resource by URL. /// Rakes a resource by URL.
/// ///
/// `intent` specifies the kind of resource we're expecting. This matters in a few circumstances, /// `intent` specifies the kind of resource we're expecting. This matters in a few circumstances,
@ -305,6 +332,22 @@ impl Raker {
intent: RakeIntent, intent: RakeIntent,
client: &Client, client: &Client,
) -> anyhow::Result<RakeOutcome> { ) -> anyhow::Result<RakeOutcome> {
match self.try_upgrade_to_https(url, client).await {
Ok(Some(upgraded)) => {
return Ok(RakeOutcome::Redirect {
reason: RedirectReason::SecureUpgrade,
new_url: upgraded,
});
}
Ok(None) => {
// continue
}
Err(err) => {
info!("can't upgrade {url} to HTTPS: {err:?}");
// continue
}
}
let response = client.get(url.clone()).send().await?; let response = client.get(url.clone()).send().await?;
let is_cf = if let Some(remote_addr) = response.remote_addr() { let is_cf = if let Some(remote_addr) = response.remote_addr() {