Make the raker attempt HTTPS upgrades
Not only does this improve security for searchers later on, it also enables us to cut down on the number of duplicates quite easily.
rei/rakerstore_postgres_overhaul
parent
34a05f84ff
commit
bff48f35f4
|
@ -10,12 +10,13 @@ use image::imageops::FilterType;
|
|||
use image::{GenericImageView, ImageFormat};
|
||||
use itertools::Itertools;
|
||||
use lazy_static::lazy_static;
|
||||
use log::debug;
|
||||
use log::{debug, info, warn};
|
||||
use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind};
|
||||
use reqwest::header::HeaderMap;
|
||||
use reqwest::{Client, Response, Url};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sitemap::reader::SiteMapEntity;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::error::Error;
|
||||
use std::fmt::{Debug, Display, Formatter};
|
||||
|
@ -59,6 +60,8 @@ pub enum RedirectReason {
|
|||
},
|
||||
/// The page was not canonical, and should not be indexed.
|
||||
NotCanonical,
|
||||
/// Upgrade from an HTTP to an HTTPS URL (or equivalent).
|
||||
SecureUpgrade,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
|
@ -295,6 +298,30 @@ pub struct Raker {
|
|||
}
|
||||
|
||||
impl Raker {
|
||||
/// Figure out whether we can upgrade a URL to HTTPS.
|
||||
pub async fn try_upgrade_to_https(
|
||||
&self,
|
||||
url: &Url,
|
||||
client: &Client,
|
||||
) -> anyhow::Result<Option<Url>> {
|
||||
if url.scheme().compare_no_case("http") == Ordering::Equal {
|
||||
// Try to upgrade to HTTPS if we can.
|
||||
let mut https_url = url.clone();
|
||||
https_url.set_scheme("https");
|
||||
client
|
||||
.head(&https_url)
|
||||
.timeout(Duration::from_secs(10))
|
||||
.send()
|
||||
.await
|
||||
.context("failed to make HEAD request")?
|
||||
.error_for_status()
|
||||
.context("bad response for HEAD requesst")?;
|
||||
Ok(Some(https_url))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
/// Rakes a resource by URL.
|
||||
///
|
||||
/// `intent` specifies the kind of resource we're expecting. This matters in a few circumstances,
|
||||
|
@ -305,6 +332,22 @@ impl Raker {
|
|||
intent: RakeIntent,
|
||||
client: &Client,
|
||||
) -> anyhow::Result<RakeOutcome> {
|
||||
match self.try_upgrade_to_https(url, client).await {
|
||||
Ok(Some(upgraded)) => {
|
||||
return Ok(RakeOutcome::Redirect {
|
||||
reason: RedirectReason::SecureUpgrade,
|
||||
new_url: upgraded,
|
||||
});
|
||||
}
|
||||
Ok(None) => {
|
||||
// continue
|
||||
}
|
||||
Err(err) => {
|
||||
info!("can't upgrade {url} to HTTPS: {err:?}");
|
||||
// continue
|
||||
}
|
||||
}
|
||||
|
||||
let response = client.get(url.clone()).send().await?;
|
||||
|
||||
let is_cf = if let Some(remote_addr) = response.remote_addr() {
|
||||
|
|
Loading…
Reference in New Issue