Fix unfinished work around SecureUpgrade
ci/woodpecker/push/check Pipeline was successful Details
ci/woodpecker/push/manual Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details

rei/rakerstore_postgres_overhaul
Olivier 'reivilibre' 2022-12-03 15:13:06 +00:00
parent 99fcbf77f6
commit 0bebfc0025
3 changed files with 20 additions and 15 deletions

View File

@@ -1,5 +1,10 @@
use crate::raking::analysis::IpSet; use std::collections::{HashMap, HashSet};
use crate::raking::page_extraction::{ExtractedPage, PageExtractionService}; use std::error::Error;
use std::fmt::{Debug, Display, Formatter};
use std::io::Cursor;
use std::str::FromStr;
use std::time::Duration;
use ::metrics::increment_counter; use ::metrics::increment_counter;
use anyhow::{anyhow, bail, Context}; use anyhow::{anyhow, bail, Context};
use chrono::{DateTime, FixedOffset, Utc}; use chrono::{DateTime, FixedOffset, Utc};
@@ -10,21 +15,18 @@ use image::imageops::FilterType;
use image::{GenericImageView, ImageFormat}; use image::{GenericImageView, ImageFormat};
use itertools::Itertools; use itertools::Itertools;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use log::{debug, info, warn}; use log::{debug, info};
use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind};
use reqwest::header::HeaderMap; use reqwest::header::HeaderMap;
use reqwest::{Client, Response, Url}; use reqwest::{Client, Response, Url};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use sitemap::reader::SiteMapEntity; use sitemap::reader::SiteMapEntity;
use std::cmp::Ordering;
use std::collections::{HashMap, HashSet};
use std::error::Error;
use std::fmt::{Debug, Display, Formatter};
use std::io::Cursor;
use std::str::FromStr;
use std::time::Duration;
use tokio::time::Instant; use tokio::time::Instant;
use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind};
use crate::raking::analysis::IpSet;
use crate::raking::page_extraction::{ExtractedPage, PageExtractionService};
pub mod analysis; pub mod analysis;
pub mod page_extraction; pub mod page_extraction;
pub mod rakemetrics; pub mod rakemetrics;
@@ -152,7 +154,7 @@ impl FromStr for RakeIntent {
impl From<ReferenceKind> for RakeIntent { impl From<ReferenceKind> for RakeIntent {
fn from(kind: ReferenceKind) -> Self { fn from(kind: ReferenceKind) -> Self {
match kind { match kind {
ReferenceKind::CanonicalUrl => { ReferenceKind::CanonicalUrl | ReferenceKind::SecureUpgrade => {
// FIXME We don't know what this is a canonical URL for. Suppose it doesn't matter... // FIXME We don't know what this is a canonical URL for. Suppose it doesn't matter...
RakeIntent::Any RakeIntent::Any
} }
@@ -304,12 +306,12 @@ impl Raker {
url: &Url, url: &Url,
client: &Client, client: &Client,
) -> anyhow::Result<Option<Url>> { ) -> anyhow::Result<Option<Url>> {
if url.scheme().compare_no_case("http") == Ordering::Equal { if url.scheme().eq_ignore_ascii_case("http") {
// Try to upgrade to HTTPS if we can. // Try to upgrade to HTTPS if we can.
let mut https_url = url.clone(); let mut https_url = url.clone();
https_url.set_scheme("https"); https_url.set_scheme("https").unwrap();
client client
.head(&https_url) .head(https_url.clone())
.timeout(Duration::from_secs(10)) .timeout(Duration::from_secs(10))
.send() .send()
.await .await

View File

@@ -429,6 +429,7 @@ impl TaskContext {
kind: match reason { kind: match reason {
RedirectReason::Redirected { .. } => ReferenceKind::Redirect, RedirectReason::Redirected { .. } => ReferenceKind::Redirect,
RedirectReason::NotCanonical { .. } => ReferenceKind::CanonicalUrl, RedirectReason::NotCanonical { .. } => ReferenceKind::CanonicalUrl,
RedirectReason::SecureUpgrade => ReferenceKind::SecureUpgrade,
}, },
last_mod: None, last_mod: None,
}] }]

View File

@@ -60,6 +60,8 @@ pub struct RakedReference {
pub enum ReferenceKind { pub enum ReferenceKind {
/// Canonical URL for the same document, as declared in the page. /// Canonical URL for the same document, as declared in the page.
CanonicalUrl, CanonicalUrl,
/// HTTP -> HTTPS upgrade, automatically caused by QuickPeep
SecureUpgrade,
/// HTTP-level redirect. /// HTTP-level redirect.
Redirect, Redirect,
/// Link in a page (<a>). Could be to another page or to a feed. /// Link in a page (<a>). Could be to another page or to a feed.