Scrub URLs more effectively

This commit is contained in:
Olivier 'reivilibre' 2022-03-14 23:11:02 +00:00
parent 4b296a1d1e
commit 403cc2a994
3 changed files with 48 additions and 15 deletions

View File

@ -12,6 +12,7 @@ use quickpeep_structs::rake_entries::{AnalysisAntifeatures, RakedReference, Refe
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT}; use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use reqwest::redirect::Policy; use reqwest::redirect::Policy;
use reqwest::Url; use reqwest::Url;
use std::collections::BTreeSet;
use std::path::PathBuf; use std::path::PathBuf;
use tokio::fs::File; use tokio::fs::File;
@ -121,7 +122,7 @@ pub async fn main() -> anyhow::Result<()> {
Ok(()) Ok(())
} }
fn print_references(refs: &Vec<RakedReference>) { fn print_references(refs: &BTreeSet<RakedReference>) {
println!("{} References", refs.len()); println!("{} References", refs.len());
for reference in refs { for reference in refs {
println!("\t{:?}{}", reference.kind, reference.target); println!("\t{:?}{}", reference.kind, reference.target);

View File

@ -1,17 +1,17 @@
use crate::raking::UrlRaked; use crate::raking::UrlRaked;
use itertools::Itertools;
use quickpeep_densedoc::{DenseDocument, DenseTree}; use quickpeep_densedoc::{DenseDocument, DenseTree};
use quickpeep_structs::rake_entries::{RakedReference, ReferenceKind}; use quickpeep_structs::rake_entries::{RakedReference, ReferenceKind};
use reqwest::Url; use reqwest::Url;
use std::collections::BTreeSet;
pub fn find_references( pub fn find_references(
doc: &DenseDocument, doc: &DenseDocument,
feeds: &Vec<Url>, feeds: &Vec<Url>,
page_url: &Url, page_url: &Url,
) -> Vec<RakedReference> { ) -> BTreeSet<RakedReference> {
let mut refs = Vec::new(); let mut refs = BTreeSet::new();
fn add_link_refs(tree: &Vec<DenseTree>, refs: &mut Vec<RakedReference>, page_url: &Url) { fn add_link_refs(tree: &Vec<DenseTree>, refs: &mut BTreeSet<RakedReference>, page_url: &Url) {
for node in tree { for node in tree {
match node { match node {
DenseTree::Heading1(children) => { DenseTree::Heading1(children) => {
@ -39,10 +39,10 @@ pub fn find_references(
} => { } => {
if !nofollow { if !nofollow {
if let Ok(full_url) = page_url.join(&href) { if let Ok(full_url) = page_url.join(&href) {
refs.push(RakedReference { refs.insert(RakedReference {
target: full_url.to_string(), target: clean_url(&full_url).to_string(),
kind: ReferenceKind::Link, kind: ReferenceKind::Link,
}) });
} }
} }
add_link_refs(children, refs, page_url); add_link_refs(children, refs, page_url);
@ -57,8 +57,8 @@ pub fn find_references(
add_link_refs(&doc.body_remainder, &mut refs, &page_url); add_link_refs(&doc.body_remainder, &mut refs, &page_url);
for feed in feeds { for feed in feeds {
refs.push(RakedReference { refs.insert(RakedReference {
target: feed.as_str().to_owned(), target: clean_url(feed).as_str().to_owned(),
kind: ReferenceKind::HeaderLinkedFeed, kind: ReferenceKind::HeaderLinkedFeed,
}); });
} }
@ -69,12 +69,39 @@ pub fn find_references(
pub fn references_from_urlrakes( pub fn references_from_urlrakes(
input: &Vec<UrlRaked>, input: &Vec<UrlRaked>,
ref_kind: ReferenceKind, ref_kind: ReferenceKind,
) -> Vec<RakedReference> { ) -> BTreeSet<RakedReference> {
input input
.iter() .iter()
.map(|url_raked| RakedReference { .map(|url_raked| RakedReference {
target: url_raked.url.to_string(), target: url_raked.url.to_string(),
kind: ref_kind, kind: ref_kind,
}) })
.collect_vec() .collect()
}
pub fn clean_url(url: &Url) -> Url {
let mut url = url.clone();
url.set_fragment(None);
url
}
// Unit tests for `clean_url`.
#[cfg(test)]
mod test {
use crate::raking::references::clean_url;
use reqwest::Url;
use std::str::FromStr;
/// Checks the string form of URLs after they have passed through `clean_url`.
#[test]
pub fn test_clean_url() {
// A URL with an explicit port 443 and a fragment is expected to come out with neither.
assert_eq!(
clean_url(&Url::from_str("https://example.org:443/blah#hahah").unwrap()).as_str(),
"https://example.org/blah",
);
// A bare origin without a fragment is expected to serialise with a trailing `/` path.
assert_eq!(
clean_url(&Url::from_str("https://example.org").unwrap()).as_str(),
"https://example.org/",
);
}
} }

View File

@ -2,6 +2,7 @@ use bitflags::bitflags;
use bitflags_serde_shim::impl_serde_for_bitflags; use bitflags_serde_shim::impl_serde_for_bitflags;
use quickpeep_densedoc::DenseDocument; use quickpeep_densedoc::DenseDocument;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::collections::BTreeSet;
bitflags! { bitflags! {
pub struct AnalysisAntifeatures: u8 { pub struct AnalysisAntifeatures: u8 {
@ -30,20 +31,24 @@ pub struct RakedPageEntry {
#[derive(Serialize, Deserialize, Debug, Clone)] #[derive(Serialize, Deserialize, Debug, Clone)]
pub struct RakedReferrerEntry { pub struct RakedReferrerEntry {
pub references: Vec<RakedReference>, pub references: BTreeSet<RakedReference>,
} }
#[derive(Serialize, Deserialize, Debug, Clone)] #[derive(Serialize, Deserialize, Debug, Clone, Ord, PartialOrd, Eq, PartialEq)]
pub struct RakedReference { pub struct RakedReference {
pub target: String, pub target: String,
pub kind: ReferenceKind, pub kind: ReferenceKind,
} }
#[derive(Serialize, Deserialize, Debug, Copy, Clone)] #[derive(Serialize, Deserialize, Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
pub enum ReferenceKind { pub enum ReferenceKind {
/// Canonical URL for the same document, as declared in the page.
CanonicalUrl, CanonicalUrl,
/// HTTP-level redirect.
Redirect, Redirect,
/// Link in a page (<a>). Could be to another page or to a feed.
Link, Link,
/// <link> to a feed
HeaderLinkedFeed, HeaderLinkedFeed,
FeedEntry, FeedEntry,
SitemapEntry, SitemapEntry,