Fix unfinished work around SecureUpgrade
This commit is contained in:
parent
99fcbf77f6
commit
0bebfc0025
|
@ -1,5 +1,10 @@
|
|||
use crate::raking::analysis::IpSet;
|
||||
use crate::raking::page_extraction::{ExtractedPage, PageExtractionService};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::error::Error;
|
||||
use std::fmt::{Debug, Display, Formatter};
|
||||
use std::io::Cursor;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
use ::metrics::increment_counter;
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use chrono::{DateTime, FixedOffset, Utc};
|
||||
|
@ -10,21 +15,18 @@ use image::imageops::FilterType;
|
|||
use image::{GenericImageView, ImageFormat};
|
||||
use itertools::Itertools;
|
||||
use lazy_static::lazy_static;
|
||||
use log::{debug, info, warn};
|
||||
use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind};
|
||||
use log::{debug, info};
|
||||
use reqwest::header::HeaderMap;
|
||||
use reqwest::{Client, Response, Url};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sitemap::reader::SiteMapEntity;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::error::Error;
|
||||
use std::fmt::{Debug, Display, Formatter};
|
||||
use std::io::Cursor;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
use tokio::time::Instant;
|
||||
|
||||
use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind};
|
||||
|
||||
use crate::raking::analysis::IpSet;
|
||||
use crate::raking::page_extraction::{ExtractedPage, PageExtractionService};
|
||||
|
||||
pub mod analysis;
|
||||
pub mod page_extraction;
|
||||
pub mod rakemetrics;
|
||||
|
@ -152,7 +154,7 @@ impl FromStr for RakeIntent {
|
|||
impl From<ReferenceKind> for RakeIntent {
|
||||
fn from(kind: ReferenceKind) -> Self {
|
||||
match kind {
|
||||
ReferenceKind::CanonicalUrl => {
|
||||
ReferenceKind::CanonicalUrl | ReferenceKind::SecureUpgrade => {
|
||||
// FIXME: We don't know what this is a canonical URL (or secure upgrade) for. I suppose it doesn't matter...
|
||||
RakeIntent::Any
|
||||
}
|
||||
|
@ -304,12 +306,12 @@ impl Raker {
|
|||
url: &Url,
|
||||
client: &Client,
|
||||
) -> anyhow::Result<Option<Url>> {
|
||||
if url.scheme().compare_no_case("http") == Ordering::Equal {
|
||||
if url.scheme().eq_ignore_ascii_case("http") {
|
||||
// Try to upgrade to HTTPS if we can.
|
||||
let mut https_url = url.clone();
|
||||
https_url.set_scheme("https");
|
||||
https_url.set_scheme("https").unwrap();
|
||||
client
|
||||
.head(&https_url)
|
||||
.head(https_url.clone())
|
||||
.timeout(Duration::from_secs(10))
|
||||
.send()
|
||||
.await
|
||||
|
|
|
@ -429,6 +429,7 @@ impl TaskContext {
|
|||
kind: match reason {
|
||||
RedirectReason::Redirected { .. } => ReferenceKind::Redirect,
|
||||
RedirectReason::NotCanonical { .. } => ReferenceKind::CanonicalUrl,
|
||||
RedirectReason::SecureUpgrade => ReferenceKind::SecureUpgrade,
|
||||
},
|
||||
last_mod: None,
|
||||
}]
|
||||
|
|
|
@ -60,6 +60,8 @@ pub struct RakedReference {
|
|||
pub enum ReferenceKind {
|
||||
/// Canonical URL for the same document, as declared in the page.
|
||||
CanonicalUrl,
|
||||
/// HTTP -> HTTPS upgrade, performed automatically by QuickPeep.
|
||||
SecureUpgrade,
|
||||
/// HTTP-level redirect.
|
||||
Redirect,
|
||||
/// Link in a page (<a>). Could be to another page or to a feed.
|
||||
|
|
Loading…
Reference in New Issue