From 0bebfc0025756ff04df5fe3dbc15e4da73f63d35 Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Sat, 3 Dec 2022 15:13:06 +0000 Subject: [PATCH] Fix unfinished work around SecureUpgrade --- quickpeep_raker/src/raking.rs | 32 ++++++++++++++------------- quickpeep_raker/src/raking/task.rs | 1 + quickpeep_structs/src/rake_entries.rs | 2 ++ 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/quickpeep_raker/src/raking.rs b/quickpeep_raker/src/raking.rs index a9e2f6e..38c3937 100644 --- a/quickpeep_raker/src/raking.rs +++ b/quickpeep_raker/src/raking.rs @@ -1,5 +1,10 @@ -use crate::raking::analysis::IpSet; -use crate::raking::page_extraction::{ExtractedPage, PageExtractionService}; +use std::collections::{HashMap, HashSet}; +use std::error::Error; +use std::fmt::{Debug, Display, Formatter}; +use std::io::Cursor; +use std::str::FromStr; +use std::time::Duration; + use ::metrics::increment_counter; use anyhow::{anyhow, bail, Context}; use chrono::{DateTime, FixedOffset, Utc}; @@ -10,21 +15,18 @@ use image::imageops::FilterType; use image::{GenericImageView, ImageFormat}; use itertools::Itertools; use lazy_static::lazy_static; -use log::{debug, info, warn}; -use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind}; +use log::{debug, info}; use reqwest::header::HeaderMap; use reqwest::{Client, Response, Url}; use serde::{Deserialize, Serialize}; use sitemap::reader::SiteMapEntity; -use std::cmp::Ordering; -use std::collections::{HashMap, HashSet}; -use std::error::Error; -use std::fmt::{Debug, Display, Formatter}; -use std::io::Cursor; -use std::str::FromStr; -use std::time::Duration; use tokio::time::Instant; +use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind}; + +use crate::raking::analysis::IpSet; +use crate::raking::page_extraction::{ExtractedPage, PageExtractionService}; + pub mod analysis; pub mod page_extraction; pub mod rakemetrics; @@ -152,7 +154,7 @@ impl FromStr for RakeIntent 
{ impl From<ReferenceKind> for RakeIntent { fn from(kind: ReferenceKind) -> Self { match kind { - ReferenceKind::CanonicalUrl => { + ReferenceKind::CanonicalUrl | ReferenceKind::SecureUpgrade => { // FIXME We don't know what this is a canonical URL for. Suppose it doesn't matter... RakeIntent::Any } @@ -304,12 +306,12 @@ impl Raker { url: &Url, client: &Client, ) -> anyhow::Result> { - if url.scheme().compare_no_case("http") == Ordering::Equal { + if url.scheme().eq_ignore_ascii_case("http") { // Try to upgrade to HTTPS if we can. let mut https_url = url.clone(); - https_url.set_scheme("https"); + https_url.set_scheme("https").unwrap(); client - .head(&https_url) + .head(https_url.clone()) .timeout(Duration::from_secs(10)) .send() .await diff --git a/quickpeep_raker/src/raking/task.rs b/quickpeep_raker/src/raking/task.rs index 3d2fe3e..accb206 100644 --- a/quickpeep_raker/src/raking/task.rs +++ b/quickpeep_raker/src/raking/task.rs @@ -429,6 +429,7 @@ impl TaskContext { kind: match reason { RedirectReason::Redirected { .. } => ReferenceKind::Redirect, RedirectReason::NotCanonical { .. } => ReferenceKind::CanonicalUrl, + RedirectReason::SecureUpgrade => ReferenceKind::SecureUpgrade, }, last_mod: None, }] diff --git a/quickpeep_structs/src/rake_entries.rs b/quickpeep_structs/src/rake_entries.rs index 619557c..8945e81 100644 --- a/quickpeep_structs/src/rake_entries.rs +++ b/quickpeep_structs/src/rake_entries.rs @@ -60,6 +60,8 @@ pub struct RakedReference { pub enum ReferenceKind { /// Canonical URL for the same document, as declared in the page. CanonicalUrl, + /// HTTP -> HTTPS upgrade, automatically caused by QuickPeep + SecureUpgrade, /// HTTP-level redirect. Redirect, /// Link in a page (). Could be to another page or to a feed.