Scrub URLs more effectively
This commit is contained in:
parent
4b296a1d1e
commit
403cc2a994
|
@ -12,6 +12,7 @@ use quickpeep_structs::rake_entries::{AnalysisAntifeatures, RakedReference, Refe
|
||||||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
||||||
use reqwest::redirect::Policy;
|
use reqwest::redirect::Policy;
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
|
use std::collections::BTreeSet;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use tokio::fs::File;
|
use tokio::fs::File;
|
||||||
|
|
||||||
|
@ -121,7 +122,7 @@ pub async fn main() -> anyhow::Result<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn print_references(refs: &Vec<RakedReference>) {
|
fn print_references(refs: &BTreeSet<RakedReference>) {
|
||||||
println!("{} References", refs.len());
|
println!("{} References", refs.len());
|
||||||
for reference in refs {
|
for reference in refs {
|
||||||
println!("\t{:?} → {}", reference.kind, reference.target);
|
println!("\t{:?} → {}", reference.kind, reference.target);
|
||||||
|
|
|
@ -1,17 +1,17 @@
|
||||||
use crate::raking::UrlRaked;
|
use crate::raking::UrlRaked;
|
||||||
use itertools::Itertools;
|
|
||||||
use quickpeep_densedoc::{DenseDocument, DenseTree};
|
use quickpeep_densedoc::{DenseDocument, DenseTree};
|
||||||
use quickpeep_structs::rake_entries::{RakedReference, ReferenceKind};
|
use quickpeep_structs::rake_entries::{RakedReference, ReferenceKind};
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
|
use std::collections::BTreeSet;
|
||||||
|
|
||||||
pub fn find_references(
|
pub fn find_references(
|
||||||
doc: &DenseDocument,
|
doc: &DenseDocument,
|
||||||
feeds: &Vec<Url>,
|
feeds: &Vec<Url>,
|
||||||
page_url: &Url,
|
page_url: &Url,
|
||||||
) -> Vec<RakedReference> {
|
) -> BTreeSet<RakedReference> {
|
||||||
let mut refs = Vec::new();
|
let mut refs = BTreeSet::new();
|
||||||
|
|
||||||
fn add_link_refs(tree: &Vec<DenseTree>, refs: &mut Vec<RakedReference>, page_url: &Url) {
|
fn add_link_refs(tree: &Vec<DenseTree>, refs: &mut BTreeSet<RakedReference>, page_url: &Url) {
|
||||||
for node in tree {
|
for node in tree {
|
||||||
match node {
|
match node {
|
||||||
DenseTree::Heading1(children) => {
|
DenseTree::Heading1(children) => {
|
||||||
|
@ -39,10 +39,10 @@ pub fn find_references(
|
||||||
} => {
|
} => {
|
||||||
if !nofollow {
|
if !nofollow {
|
||||||
if let Ok(full_url) = page_url.join(&href) {
|
if let Ok(full_url) = page_url.join(&href) {
|
||||||
refs.push(RakedReference {
|
refs.insert(RakedReference {
|
||||||
target: full_url.to_string(),
|
target: clean_url(&full_url).to_string(),
|
||||||
kind: ReferenceKind::Link,
|
kind: ReferenceKind::Link,
|
||||||
})
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
add_link_refs(children, refs, page_url);
|
add_link_refs(children, refs, page_url);
|
||||||
|
@ -57,8 +57,8 @@ pub fn find_references(
|
||||||
add_link_refs(&doc.body_remainder, &mut refs, &page_url);
|
add_link_refs(&doc.body_remainder, &mut refs, &page_url);
|
||||||
|
|
||||||
for feed in feeds {
|
for feed in feeds {
|
||||||
refs.push(RakedReference {
|
refs.insert(RakedReference {
|
||||||
target: feed.as_str().to_owned(),
|
target: clean_url(feed).as_str().to_owned(),
|
||||||
kind: ReferenceKind::HeaderLinkedFeed,
|
kind: ReferenceKind::HeaderLinkedFeed,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -69,12 +69,39 @@ pub fn find_references(
|
||||||
pub fn references_from_urlrakes(
|
pub fn references_from_urlrakes(
|
||||||
input: &Vec<UrlRaked>,
|
input: &Vec<UrlRaked>,
|
||||||
ref_kind: ReferenceKind,
|
ref_kind: ReferenceKind,
|
||||||
) -> Vec<RakedReference> {
|
) -> BTreeSet<RakedReference> {
|
||||||
input
|
input
|
||||||
.iter()
|
.iter()
|
||||||
.map(|url_raked| RakedReference {
|
.map(|url_raked| RakedReference {
|
||||||
target: url_raked.url.to_string(),
|
target: url_raked.url.to_string(),
|
||||||
kind: ref_kind,
|
kind: ref_kind,
|
||||||
})
|
})
|
||||||
.collect_vec()
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns a cleaned copy of `url` with its fragment (the `#...` part) removed.
///
/// The input is only borrowed: the function works on a clone and hands that
/// clone back, so the caller's `Url` is left untouched. Clearing the fragment
/// is the only change made here.
pub fn clean_url(url: &Url) -> Url {
|
||||||
|
// Shadow the borrowed parameter with an owned, mutable copy.
let mut url = url.clone();
|
||||||
|
// Drop the fragment, so references that differ only by fragment get the same target string.
url.set_fragment(None);
|
||||||
|
|
||||||
|
url
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod test {
|
||||||
|
use crate::raking::references::clean_url;
|
||||||
|
use reqwest::Url;
|
||||||
|
use std::str::FromStr;
|
||||||
|
|
||||||
|
/// Checks the string form of `clean_url`'s output for two sample URLs.
#[test]
|
||||||
|
pub fn test_clean_url() {
|
||||||
|
// The fragment `#hahah` must be gone from the result.
// NOTE(review): the expected string also lacks the explicit `:443`. `clean_url`
// only clears the fragment, so this presumably relies on `Url` parsing
// dropping the scheme's default port — confirm.
assert_eq!(
|
||||||
|
clean_url(&Url::from_str("https://example.org:443/blah#hahah").unwrap()).as_str(),
|
||||||
|
"https://example.org/blah",
|
||||||
|
);
|
||||||
|
|
||||||
|
// A URL without a fragment: the expected output differs from the input only by a trailing `/`.
assert_eq!(
|
||||||
|
clean_url(&Url::from_str("https://example.org").unwrap()).as_str(),
|
||||||
|
"https://example.org/",
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@ use bitflags::bitflags;
|
||||||
use bitflags_serde_shim::impl_serde_for_bitflags;
|
use bitflags_serde_shim::impl_serde_for_bitflags;
|
||||||
use quickpeep_densedoc::DenseDocument;
|
use quickpeep_densedoc::DenseDocument;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::BTreeSet;
|
||||||
|
|
||||||
bitflags! {
|
bitflags! {
|
||||||
pub struct AnalysisAntifeatures: u8 {
|
pub struct AnalysisAntifeatures: u8 {
|
||||||
|
@ -30,20 +31,24 @@ pub struct RakedPageEntry {
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct RakedReferrerEntry {
|
pub struct RakedReferrerEntry {
|
||||||
pub references: Vec<RakedReference>,
|
pub references: BTreeSet<RakedReference>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
#[derive(Serialize, Deserialize, Debug, Clone, Ord, PartialOrd, Eq, PartialEq)]
|
||||||
pub struct RakedReference {
|
pub struct RakedReference {
|
||||||
pub target: String,
|
pub target: String,
|
||||||
pub kind: ReferenceKind,
|
pub kind: ReferenceKind,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug, Copy, Clone)]
|
#[derive(Serialize, Deserialize, Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
|
||||||
pub enum ReferenceKind {
|
pub enum ReferenceKind {
|
||||||
|
/// Canonical URL for the same document, as declared in the page.
|
||||||
CanonicalUrl,
|
CanonicalUrl,
|
||||||
|
/// HTTP-level redirect.
|
||||||
Redirect,
|
Redirect,
|
||||||
|
/// Link in a page (<a>). Could be to another page or to a feed.
|
||||||
Link,
|
Link,
|
||||||
|
/// <link> to a feed
|
||||||
HeaderLinkedFeed,
|
HeaderLinkedFeed,
|
||||||
FeedEntry,
|
FeedEntry,
|
||||||
SitemapEntry,
|
SitemapEntry,
|
||||||
|
|
Loading…
Reference in New Issue