From 403cc2a994b961da83e38069fbdd739b4308077b Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Mon, 14 Mar 2022 23:11:02 +0000 Subject: [PATCH] Scrub URLs more effectively --- quickpeep/src/bin/qp-rake1.rs | 3 +- quickpeep/src/raking/references.rs | 49 +++++++++++++++++++++------ quickpeep_structs/src/rake_entries.rs | 11 ++++-- 3 files changed, 48 insertions(+), 15 deletions(-) diff --git a/quickpeep/src/bin/qp-rake1.rs b/quickpeep/src/bin/qp-rake1.rs index 363f791..6930b03 100644 --- a/quickpeep/src/bin/qp-rake1.rs +++ b/quickpeep/src/bin/qp-rake1.rs @@ -12,6 +12,7 @@ use quickpeep_structs::rake_entries::{AnalysisAntifeatures, RakedReference, Refe use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT}; use reqwest::redirect::Policy; use reqwest::Url; +use std::collections::BTreeSet; use std::path::PathBuf; use tokio::fs::File; @@ -121,7 +122,7 @@ pub async fn main() -> anyhow::Result<()> { Ok(()) } -fn print_references(refs: &Vec<RakedReference>) { +fn print_references(refs: &BTreeSet<RakedReference>) { println!("{} References", refs.len()); for reference in refs { println!("\t{:?} → {}", reference.kind, reference.target); diff --git a/quickpeep/src/raking/references.rs b/quickpeep/src/raking/references.rs index 7e7d72d..40b32ac 100644 --- a/quickpeep/src/raking/references.rs +++ b/quickpeep/src/raking/references.rs @@ -1,17 +1,17 @@ use crate::raking::UrlRaked; -use itertools::Itertools; use quickpeep_densedoc::{DenseDocument, DenseTree}; use quickpeep_structs::rake_entries::{RakedReference, ReferenceKind}; use reqwest::Url; +use std::collections::BTreeSet; pub fn find_references( doc: &DenseDocument, feeds: &Vec<Url>, page_url: &Url, -) -> Vec<RakedReference> { - let mut refs = Vec::new(); +) -> BTreeSet<RakedReference> { + let mut refs = BTreeSet::new(); - fn add_link_refs(tree: &Vec<DenseTree>, refs: &mut Vec<RakedReference>, page_url: &Url) { + fn add_link_refs(tree: &Vec<DenseTree>, refs: &mut BTreeSet<RakedReference>, page_url: &Url) { for node in tree { match node { DenseTree::Heading1(children) => { @@ -39,10 +39,10 @@ pub fn find_references( } => { if !nofollow { 
if let Ok(full_url) = page_url.join(&href) { - refs.push(RakedReference { - target: full_url.to_string(), + refs.insert(RakedReference { + target: clean_url(&full_url).to_string(), kind: ReferenceKind::Link, - }) + }); } } add_link_refs(children, refs, page_url); @@ -57,8 +57,8 @@ pub fn find_references( add_link_refs(&doc.body_remainder, &mut refs, &page_url); for feed in feeds { - refs.push(RakedReference { - target: feed.as_str().to_owned(), + refs.insert(RakedReference { + target: clean_url(feed).as_str().to_owned(), kind: ReferenceKind::HeaderLinkedFeed, }); } @@ -69,12 +69,39 @@ pub fn find_references( pub fn references_from_urlrakes( input: &Vec<UrlRaked>, ref_kind: ReferenceKind, -) -> Vec<RakedReference> { +) -> BTreeSet<RakedReference> { input .iter() .map(|url_raked| RakedReference { target: url_raked.url.to_string(), kind: ref_kind, }) - .collect_vec() + .collect() +} + +pub fn clean_url(url: &Url) -> Url { + let mut url = url.clone(); + url.set_fragment(None); + + url +} + +#[cfg(test)] +mod test { + use crate::raking::references::clean_url; + use reqwest::Url; + use std::str::FromStr; + + #[test] + pub fn test_clean_url() { + assert_eq!( + clean_url(&Url::from_str("https://example.org:443/blah#hahah").unwrap()).as_str(), + "https://example.org/blah", + ); + + assert_eq!( + clean_url(&Url::from_str("https://example.org").unwrap()).as_str(), + "https://example.org/", + ); + } } diff --git a/quickpeep_structs/src/rake_entries.rs b/quickpeep_structs/src/rake_entries.rs index c847963..d605381 100644 --- a/quickpeep_structs/src/rake_entries.rs +++ b/quickpeep_structs/src/rake_entries.rs @@ -2,6 +2,7 @@ use bitflags::bitflags; use bitflags_serde_shim::impl_serde_for_bitflags; use quickpeep_densedoc::DenseDocument; use serde::{Deserialize, Serialize}; +use std::collections::BTreeSet; bitflags! 
{ pub struct AnalysisAntifeatures: u8 { @@ -30,20 +31,24 @@ pub struct RakedPageEntry { #[derive(Serialize, Deserialize, Debug, Clone)] pub struct RakedReferrerEntry { - pub references: Vec<RakedReference>, + pub references: BTreeSet<RakedReference>, } -#[derive(Serialize, Deserialize, Debug, Clone)] +#[derive(Serialize, Deserialize, Debug, Clone, Ord, PartialOrd, Eq, PartialEq)] pub struct RakedReference { pub target: String, pub kind: ReferenceKind, } -#[derive(Serialize, Deserialize, Debug, Copy, Clone)] +#[derive(Serialize, Deserialize, Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] pub enum ReferenceKind { + /// Canonical URL for the same document, as declared in the page. CanonicalUrl, + /// HTTP-level redirect. Redirect, + /// Link in a page (<a>). Could be to another page or to a feed. Link, + /// <link> to a feed HeaderLinkedFeed, FeedEntry, SitemapEntry,