Make the raker attempt HTTPS upgrades
ci/woodpecker/push/check Pipeline failed Details
ci/woodpecker/push/manual Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details

Not only does this improve security for searchers later on,

it also enables us to cut down on the number of duplicates quite easily.
rei/rakerstore_postgres_overhaul
Olivier 'reivilibre' 2022-11-28 23:15:37 +00:00
parent 34a05f84ff
commit bff48f35f4
1 changed files with 44 additions and 1 deletions

View File

@ -10,12 +10,13 @@ use image::imageops::FilterType;
use image::{GenericImageView, ImageFormat};
use itertools::Itertools;
use lazy_static::lazy_static;
use log::debug;
use log::{debug, info, warn};
use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind};
use reqwest::header::HeaderMap;
use reqwest::{Client, Response, Url};
use serde::{Deserialize, Serialize};
use sitemap::reader::SiteMapEntity;
use std::cmp::Ordering;
use std::collections::{HashMap, HashSet};
use std::error::Error;
use std::fmt::{Debug, Display, Formatter};
@ -59,6 +60,8 @@ pub enum RedirectReason {
},
/// The page was not canonical, and should not be indexed.
NotCanonical,
/// Upgrade from an HTTP to an HTTPS URL (or equivalent).
SecureUpgrade,
}
#[derive(Clone, Debug, PartialEq, Eq)]
@ -295,6 +298,30 @@ pub struct Raker {
}
impl Raker {
/// Figure out whether we can upgrade a URL to HTTPS.
///
/// For a URL whose scheme is `http`, this sends a `HEAD` request (with a 10 second
/// timeout) to the same URL with its scheme swapped to `https`.
///
/// # Returns
///
/// - `Ok(Some(https_url))` if the `HEAD` request completed without an error status.
/// - `Ok(None)` if `url` does not use the `http` scheme, so there is nothing to upgrade.
///
/// # Errors
///
/// Returns an error if the scheme could not be changed to `https`, if the `HEAD`
/// request could not be made, or if the response carried an error status.
pub async fn try_upgrade_to_https(
    &self,
    url: &Url,
    client: &Client,
) -> anyhow::Result<Option<Url>> {
    // Only plain-HTTP URLs are candidates for an upgrade.
    if !url.scheme().eq_ignore_ascii_case("http") {
        return Ok(None);
    }

    let mut https_url = url.clone();
    // `set_scheme` reports failure through its return value; surface it rather than
    // going on to probe (and hand back) a URL that is still plain HTTP.
    https_url
        .set_scheme("https")
        .map_err(|()| anyhow::anyhow!("failed to change scheme of {url} to https"))?;

    // A HEAD request is enough to learn whether the HTTPS variant answers successfully;
    // the response itself is not needed.
    client
        .head(https_url.clone())
        .timeout(Duration::from_secs(10))
        .send()
        .await
        .context("failed to make HEAD request")?
        .error_for_status()
        .context("bad response for HEAD request")?;

    Ok(Some(https_url))
}
/// Rakes a resource by URL.
///
/// `intent` specifies the kind of resource we're expecting. This matters in a few circumstances,
@ -305,6 +332,22 @@ impl Raker {
intent: RakeIntent,
client: &Client,
) -> anyhow::Result<RakeOutcome> {
match self.try_upgrade_to_https(url, client).await {
Ok(Some(upgraded)) => {
return Ok(RakeOutcome::Redirect {
reason: RedirectReason::SecureUpgrade,
new_url: upgraded,
});
}
Ok(None) => {
// continue
}
Err(err) => {
info!("can't upgrade {url} to HTTPS: {err:?}");
// continue
}
}
let response = client.get(url.clone()).send().await?;
let is_cf = if let Some(remote_addr) = response.remote_addr() {