Fix unfinished work around SecureUpgrade
This commit is contained in:
parent
99fcbf77f6
commit
0bebfc0025
|
@ -1,5 +1,10 @@
|
||||||
use crate::raking::analysis::IpSet;
|
use std::collections::{HashMap, HashSet};
|
||||||
use crate::raking::page_extraction::{ExtractedPage, PageExtractionService};
|
use std::error::Error;
|
||||||
|
use std::fmt::{Debug, Display, Formatter};
|
||||||
|
use std::io::Cursor;
|
||||||
|
use std::str::FromStr;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
use ::metrics::increment_counter;
|
use ::metrics::increment_counter;
|
||||||
use anyhow::{anyhow, bail, Context};
|
use anyhow::{anyhow, bail, Context};
|
||||||
use chrono::{DateTime, FixedOffset, Utc};
|
use chrono::{DateTime, FixedOffset, Utc};
|
||||||
|
@ -10,21 +15,18 @@ use image::imageops::FilterType;
|
||||||
use image::{GenericImageView, ImageFormat};
|
use image::{GenericImageView, ImageFormat};
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use log::{debug, info, warn};
|
use log::{debug, info};
|
||||||
use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind};
|
|
||||||
use reqwest::header::HeaderMap;
|
use reqwest::header::HeaderMap;
|
||||||
use reqwest::{Client, Response, Url};
|
use reqwest::{Client, Response, Url};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use sitemap::reader::SiteMapEntity;
|
use sitemap::reader::SiteMapEntity;
|
||||||
use std::cmp::Ordering;
|
|
||||||
use std::collections::{HashMap, HashSet};
|
|
||||||
use std::error::Error;
|
|
||||||
use std::fmt::{Debug, Display, Formatter};
|
|
||||||
use std::io::Cursor;
|
|
||||||
use std::str::FromStr;
|
|
||||||
use std::time::Duration;
|
|
||||||
use tokio::time::Instant;
|
use tokio::time::Instant;
|
||||||
|
|
||||||
|
use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind};
|
||||||
|
|
||||||
|
use crate::raking::analysis::IpSet;
|
||||||
|
use crate::raking::page_extraction::{ExtractedPage, PageExtractionService};
|
||||||
|
|
||||||
pub mod analysis;
|
pub mod analysis;
|
||||||
pub mod page_extraction;
|
pub mod page_extraction;
|
||||||
pub mod rakemetrics;
|
pub mod rakemetrics;
|
||||||
|
@ -152,7 +154,7 @@ impl FromStr for RakeIntent {
|
||||||
impl From<ReferenceKind> for RakeIntent {
|
impl From<ReferenceKind> for RakeIntent {
|
||||||
fn from(kind: ReferenceKind) -> Self {
|
fn from(kind: ReferenceKind) -> Self {
|
||||||
match kind {
|
match kind {
|
||||||
ReferenceKind::CanonicalUrl => {
|
ReferenceKind::CanonicalUrl | ReferenceKind::SecureUpgrade => {
|
||||||
// FIXME We don't know what this is a canonical URL for. Suppose it doesn't matter...
|
// FIXME We don't know what this is a canonical URL (or secure upgrade) for. Suppose it doesn't matter...
|
||||||
RakeIntent::Any
|
RakeIntent::Any
|
||||||
}
|
}
|
||||||
|
@ -304,12 +306,12 @@ impl Raker {
|
||||||
url: &Url,
|
url: &Url,
|
||||||
client: &Client,
|
client: &Client,
|
||||||
) -> anyhow::Result<Option<Url>> {
|
) -> anyhow::Result<Option<Url>> {
|
||||||
if url.scheme().compare_no_case("http") == Ordering::Equal {
|
if url.scheme().eq_ignore_ascii_case("http") {
|
||||||
// Try to upgrade to HTTPS if we can.
|
// Try to upgrade to HTTPS if we can.
|
||||||
let mut https_url = url.clone();
|
let mut https_url = url.clone();
|
||||||
https_url.set_scheme("https");
|
https_url.set_scheme("https").unwrap();
|
||||||
client
|
client
|
||||||
.head(&https_url)
|
.head(https_url.clone())
|
||||||
.timeout(Duration::from_secs(10))
|
.timeout(Duration::from_secs(10))
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
|
|
|
@ -429,6 +429,7 @@ impl TaskContext {
|
||||||
kind: match reason {
|
kind: match reason {
|
||||||
RedirectReason::Redirected { .. } => ReferenceKind::Redirect,
|
RedirectReason::Redirected { .. } => ReferenceKind::Redirect,
|
||||||
RedirectReason::NotCanonical { .. } => ReferenceKind::CanonicalUrl,
|
RedirectReason::NotCanonical { .. } => ReferenceKind::CanonicalUrl,
|
||||||
|
RedirectReason::SecureUpgrade => ReferenceKind::SecureUpgrade,
|
||||||
},
|
},
|
||||||
last_mod: None,
|
last_mod: None,
|
||||||
}]
|
}]
|
||||||
|
|
|
@ -60,6 +60,8 @@ pub struct RakedReference {
|
||||||
pub enum ReferenceKind {
|
pub enum ReferenceKind {
|
||||||
/// Canonical URL for the same document, as declared in the page.
|
/// Canonical URL for the same document, as declared in the page.
|
||||||
CanonicalUrl,
|
CanonicalUrl,
|
||||||
|
/// HTTP -> HTTPS upgrade, performed automatically by QuickPeep.
|
||||||
|
SecureUpgrade,
|
||||||
/// HTTP-level redirect.
|
/// HTTP-level redirect.
|
||||||
Redirect,
|
Redirect,
|
||||||
/// Link in a page (<a>). Could be to another page or to a feed.
|
/// Link in a page (<a>). Could be to another page or to a feed.
|
||||||
|
|
Loading…
Reference in New Issue