Fix unfinished work around SecureUpgrade
ci/woodpecker/push/check Pipeline was successful Details
ci/woodpecker/push/manual Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details

rei/rakerstore_postgres_overhaul
Olivier 'reivilibre' 2022-12-03 15:13:06 +00:00
parent 99fcbf77f6
commit 0bebfc0025
3 changed files with 20 additions and 15 deletions

View File

@ -1,5 +1,10 @@
use crate::raking::analysis::IpSet;
use crate::raking::page_extraction::{ExtractedPage, PageExtractionService};
use std::collections::{HashMap, HashSet};
use std::error::Error;
use std::fmt::{Debug, Display, Formatter};
use std::io::Cursor;
use std::str::FromStr;
use std::time::Duration;
use ::metrics::increment_counter;
use anyhow::{anyhow, bail, Context};
use chrono::{DateTime, FixedOffset, Utc};
@ -10,21 +15,18 @@ use image::imageops::FilterType;
use image::{GenericImageView, ImageFormat};
use itertools::Itertools;
use lazy_static::lazy_static;
use log::{debug, info, warn};
use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind};
use log::{debug, info};
use reqwest::header::HeaderMap;
use reqwest::{Client, Response, Url};
use serde::{Deserialize, Serialize};
use sitemap::reader::SiteMapEntity;
use std::cmp::Ordering;
use std::collections::{HashMap, HashSet};
use std::error::Error;
use std::fmt::{Debug, Display, Formatter};
use std::io::Cursor;
use std::str::FromStr;
use std::time::Duration;
use tokio::time::Instant;
use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind};
use crate::raking::analysis::IpSet;
use crate::raking::page_extraction::{ExtractedPage, PageExtractionService};
pub mod analysis;
pub mod page_extraction;
pub mod rakemetrics;
@ -152,7 +154,7 @@ impl FromStr for RakeIntent {
impl From<ReferenceKind> for RakeIntent {
fn from(kind: ReferenceKind) -> Self {
match kind {
ReferenceKind::CanonicalUrl => {
ReferenceKind::CanonicalUrl | ReferenceKind::SecureUpgrade => {
// FIXME We don't know what this is a canonical URL for. Suppose it doesn't matter...
RakeIntent::Any
}
@ -304,12 +306,12 @@ impl Raker {
url: &Url,
client: &Client,
) -> anyhow::Result<Option<Url>> {
if url.scheme().compare_no_case("http") == Ordering::Equal {
if url.scheme().eq_ignore_ascii_case("http") {
// Try to upgrade to HTTPS if we can.
let mut https_url = url.clone();
https_url.set_scheme("https");
https_url.set_scheme("https").unwrap();
client
.head(&https_url)
.head(https_url.clone())
.timeout(Duration::from_secs(10))
.send()
.await

View File

@ -429,6 +429,7 @@ impl TaskContext {
kind: match reason {
RedirectReason::Redirected { .. } => ReferenceKind::Redirect,
RedirectReason::NotCanonical { .. } => ReferenceKind::CanonicalUrl,
RedirectReason::SecureUpgrade => ReferenceKind::SecureUpgrade,
},
last_mod: None,
}]

View File

@ -60,6 +60,8 @@ pub struct RakedReference {
pub enum ReferenceKind {
/// Canonical URL for the same document, as declared in the page.
CanonicalUrl,
/// HTTP -> HTTPS upgrade, automatically caused by QuickPeep
SecureUpgrade,
/// HTTP-level redirect.
Redirect,
/// Link in a page (<a>). Could be to another page or to a feed.