Scrub URLs more effectively

This commit is contained in:
Olivier 'reivilibre' 2022-03-14 23:11:02 +00:00
parent 4b296a1d1e
commit 403cc2a994
3 changed files with 48 additions and 15 deletions

View File

@@ -12,6 +12,7 @@ use quickpeep_structs::rake_entries::{AnalysisAntifeatures, RakedReference, Refe
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use reqwest::redirect::Policy;
use reqwest::Url;
use std::collections::BTreeSet;
use std::path::PathBuf;
use tokio::fs::File;
@@ -121,7 +122,7 @@ pub async fn main() -> anyhow::Result<()> {
Ok(())
}
fn print_references(refs: &Vec<RakedReference>) {
fn print_references(refs: &BTreeSet<RakedReference>) {
println!("{} References", refs.len());
for reference in refs {
println!("\t{:?}{}", reference.kind, reference.target);

View File

@@ -1,17 +1,17 @@
use crate::raking::UrlRaked;
use itertools::Itertools;
use quickpeep_densedoc::{DenseDocument, DenseTree};
use quickpeep_structs::rake_entries::{RakedReference, ReferenceKind};
use reqwest::Url;
use std::collections::BTreeSet;
pub fn find_references(
doc: &DenseDocument,
feeds: &Vec<Url>,
page_url: &Url,
) -> Vec<RakedReference> {
let mut refs = Vec::new();
) -> BTreeSet<RakedReference> {
let mut refs = BTreeSet::new();
fn add_link_refs(tree: &Vec<DenseTree>, refs: &mut Vec<RakedReference>, page_url: &Url) {
fn add_link_refs(tree: &Vec<DenseTree>, refs: &mut BTreeSet<RakedReference>, page_url: &Url) {
for node in tree {
match node {
DenseTree::Heading1(children) => {
@@ -39,10 +39,10 @@ pub fn find_references(
} => {
if !nofollow {
if let Ok(full_url) = page_url.join(&href) {
refs.push(RakedReference {
target: full_url.to_string(),
refs.insert(RakedReference {
target: clean_url(&full_url).to_string(),
kind: ReferenceKind::Link,
})
});
}
}
add_link_refs(children, refs, page_url);
@@ -57,8 +57,8 @@ pub fn find_references(
add_link_refs(&doc.body_remainder, &mut refs, &page_url);
for feed in feeds {
refs.push(RakedReference {
target: feed.as_str().to_owned(),
refs.insert(RakedReference {
target: clean_url(feed).as_str().to_owned(),
kind: ReferenceKind::HeaderLinkedFeed,
});
}
@@ -69,12 +69,39 @@ pub fn find_references(
pub fn references_from_urlrakes(
input: &Vec<UrlRaked>,
ref_kind: ReferenceKind,
) -> Vec<RakedReference> {
) -> BTreeSet<RakedReference> {
input
.iter()
.map(|url_raked| RakedReference {
target: url_raked.url.to_string(),
kind: ref_kind,
})
.collect_vec()
.collect()
}
/// Returns a scrubbed copy of `url`, with components that should not
/// distinguish one document from another removed.
///
/// Currently this only strips the fragment (`#...`), which is purely
/// client-side and never changes the resource the server returns.
/// (Default ports such as `:443` for HTTPS are already normalised away
/// by the URL parser itself.)
pub fn clean_url(url: &Url) -> Url {
    let mut cleaned = url.clone();
    cleaned.set_fragment(None);
    cleaned
}
#[cfg(test)]
mod test {
    use crate::raking::references::clean_url;
    use reqwest::Url;
    use std::str::FromStr;

    /// `clean_url` must drop the fragment and otherwise leave the URL
    /// as the parser normalised it (default port removed, trailing `/`
    /// added to a bare authority).
    #[test]
    pub fn test_clean_url() {
        let cases = [
            ("https://example.org:443/blah#hahah", "https://example.org/blah"),
            ("https://example.org", "https://example.org/"),
        ];
        for (input, expected) in cases {
            let parsed = Url::from_str(input).unwrap();
            assert_eq!(clean_url(&parsed).as_str(), expected);
        }
    }
}

View File

@@ -2,6 +2,7 @@ use bitflags::bitflags;
use bitflags_serde_shim::impl_serde_for_bitflags;
use quickpeep_densedoc::DenseDocument;
use serde::{Deserialize, Serialize};
use std::collections::BTreeSet;
bitflags! {
pub struct AnalysisAntifeatures: u8 {
@@ -30,20 +31,24 @@ pub struct RakedPageEntry {
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct RakedReferrerEntry {
pub references: Vec<RakedReference>,
pub references: BTreeSet<RakedReference>,
}
#[derive(Serialize, Deserialize, Debug, Clone)]
#[derive(Serialize, Deserialize, Debug, Clone, Ord, PartialOrd, Eq, PartialEq)]
pub struct RakedReference {
pub target: String,
pub kind: ReferenceKind,
}
#[derive(Serialize, Deserialize, Debug, Copy, Clone)]
#[derive(Serialize, Deserialize, Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
pub enum ReferenceKind {
/// Canonical URL for the same document, as declared in the page.
CanonicalUrl,
/// HTTP-level redirect.
Redirect,
/// Link in a page (<a>). Could be to another page or to a feed.
Link,
/// <link> to a feed
HeaderLinkedFeed,
FeedEntry,
SitemapEntry,