Scrub URLs more effectively
This commit is contained in:
parent
4b296a1d1e
commit
403cc2a994
|
@ -12,6 +12,7 @@ use quickpeep_structs::rake_entries::{AnalysisAntifeatures, RakedReference, Refe
|
|||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
||||
use reqwest::redirect::Policy;
|
||||
use reqwest::Url;
|
||||
use std::collections::BTreeSet;
|
||||
use std::path::PathBuf;
|
||||
use tokio::fs::File;
|
||||
|
||||
|
@ -121,7 +122,7 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn print_references(refs: &Vec<RakedReference>) {
|
||||
fn print_references(refs: &BTreeSet<RakedReference>) {
|
||||
println!("{} References", refs.len());
|
||||
for reference in refs {
|
||||
println!("\t{:?} → {}", reference.kind, reference.target);
|
||||
|
|
|
@ -1,17 +1,17 @@
|
|||
use crate::raking::UrlRaked;
|
||||
use itertools::Itertools;
|
||||
use quickpeep_densedoc::{DenseDocument, DenseTree};
|
||||
use quickpeep_structs::rake_entries::{RakedReference, ReferenceKind};
|
||||
use reqwest::Url;
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
pub fn find_references(
|
||||
doc: &DenseDocument,
|
||||
feeds: &Vec<Url>,
|
||||
page_url: &Url,
|
||||
) -> Vec<RakedReference> {
|
||||
let mut refs = Vec::new();
|
||||
) -> BTreeSet<RakedReference> {
|
||||
let mut refs = BTreeSet::new();
|
||||
|
||||
fn add_link_refs(tree: &Vec<DenseTree>, refs: &mut Vec<RakedReference>, page_url: &Url) {
|
||||
fn add_link_refs(tree: &Vec<DenseTree>, refs: &mut BTreeSet<RakedReference>, page_url: &Url) {
|
||||
for node in tree {
|
||||
match node {
|
||||
DenseTree::Heading1(children) => {
|
||||
|
@ -39,10 +39,10 @@ pub fn find_references(
|
|||
} => {
|
||||
if !nofollow {
|
||||
if let Ok(full_url) = page_url.join(&href) {
|
||||
refs.push(RakedReference {
|
||||
target: full_url.to_string(),
|
||||
refs.insert(RakedReference {
|
||||
target: clean_url(&full_url).to_string(),
|
||||
kind: ReferenceKind::Link,
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
add_link_refs(children, refs, page_url);
|
||||
|
@ -57,8 +57,8 @@ pub fn find_references(
|
|||
add_link_refs(&doc.body_remainder, &mut refs, &page_url);
|
||||
|
||||
for feed in feeds {
|
||||
refs.push(RakedReference {
|
||||
target: feed.as_str().to_owned(),
|
||||
refs.insert(RakedReference {
|
||||
target: clean_url(feed).as_str().to_owned(),
|
||||
kind: ReferenceKind::HeaderLinkedFeed,
|
||||
});
|
||||
}
|
||||
|
@ -69,12 +69,39 @@ pub fn find_references(
|
|||
/// Builds one `RakedReference` per entry of `input`.
///
/// Each reference takes its `target` from the entry's `url` rendered with
/// `to_string()` and its `kind` from `ref_kind`.
///
/// NOTE(review): this text is a rendered commit diff, so the replaced
/// `Vec<RakedReference>` return type and `.collect_vec()` call appear next to
/// the `BTreeSet<RakedReference>` / `.collect()` lines that supersede them.
/// Confirm against the real file.
///
/// NOTE(review): unlike the link and feed references built in
/// `find_references`, the target here is not passed through `clean_url`.
/// Confirm whether that is intentional.
pub fn references_from_urlrakes(
|
||||
input: &Vec<UrlRaked>,
|
||||
ref_kind: ReferenceKind,
|
||||
) -> Vec<RakedReference> {
|
||||
) -> BTreeSet<RakedReference> {
|
||||
input
|
||||
.iter()
|
||||
// Every reference produced by this call shares the same `ref_kind`.
.map(|url_raked| RakedReference {
|
||||
target: url_raked.url.to_string(),
|
||||
kind: ref_kind,
|
||||
})
|
||||
.collect_vec()
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Returns a copy of `url` with its fragment (the `#...` suffix) removed.
///
/// The input is only borrowed; the change is made on a clone, which is then
/// returned.
pub fn clean_url(url: &Url) -> Url {
|
||||
// Work on a clone so the caller's `Url` is left untouched.
let mut url = url.clone();
|
||||
// `None` clears the fragment.
url.set_fragment(None);
|
||||
|
||||
url
|
||||
}
|
||||
|
||||
// Unit tests for `clean_url`.
#[cfg(test)]
|
||||
mod test {
|
||||
use crate::raking::references::clean_url;
|
||||
use reqwest::Url;
|
||||
use std::str::FromStr;
|
||||
|
||||
// Checks the string form of cleaned URLs against fixed expected values.
#[test]
|
||||
pub fn test_clean_url() {
|
||||
// An input with an explicit `:443` port and a `#hahah` fragment is
// expected to come out with neither.
assert_eq!(
|
||||
clean_url(&Url::from_str("https://example.org:443/blah#hahah").unwrap()).as_str(),
|
||||
"https://example.org/blah",
|
||||
);
|
||||
|
||||
// An input with no path is expected to come out with a trailing `/`.
assert_eq!(
|
||||
clean_url(&Url::from_str("https://example.org").unwrap()).as_str(),
|
||||
"https://example.org/",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,6 +2,7 @@ use bitflags::bitflags;
|
|||
use bitflags_serde_shim::impl_serde_for_bitflags;
|
||||
use quickpeep_densedoc::DenseDocument;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
bitflags! {
|
||||
pub struct AnalysisAntifeatures: u8 {
|
||||
|
@ -30,20 +31,24 @@ pub struct RakedPageEntry {
|
|||
|
||||
/// Serializable container for a collection of `RakedReference`s.
///
/// NOTE(review): this text is a rendered commit diff, so the replaced
/// `Vec<RakedReference>` field appears next to the `BTreeSet<RakedReference>`
/// field that supersedes it. Confirm against the real file.
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
pub struct RakedReferrerEntry {
|
||||
pub references: Vec<RakedReference>,
|
||||
// A `BTreeSet` stores each distinct reference once, ordered by its `Ord` impl.
pub references: BTreeSet<RakedReference>,
|
||||
}
|
||||
|
||||
/// A single reference: a `target` string plus the `ReferenceKind` describing it.
///
/// NOTE(review): this text is a rendered commit diff, so the replaced derive
/// line appears next to the one that supersedes it. The added
/// `Ord`/`PartialOrd`/`Eq`/`PartialEq` derives are what allow values of this
/// type to be stored in a `BTreeSet`. Confirm against the real file.
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Ord, PartialOrd, Eq, PartialEq)]
|
||||
pub struct RakedReference {
|
||||
// Where the reference points, stored as a string; the constructions
// visible in this diff fill it from a `Url`.
pub target: String,
|
||||
// The kind of reference this is.
pub kind: ReferenceKind,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Copy, Clone)]
|
||||
#[derive(Serialize, Deserialize, Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
|
||||
pub enum ReferenceKind {
|
||||
/// Canonical URL for the same document, as declared in the page.
|
||||
CanonicalUrl,
|
||||
/// HTTP-level redirect.
|
||||
Redirect,
|
||||
/// Link in a page (<a>). Could be to another page or to a feed.
|
||||
Link,
|
||||
/// <link> to a feed
|
||||
HeaderLinkedFeed,
|
||||
FeedEntry,
|
||||
SitemapEntry,
|
||||
|
|
Loading…
Reference in New Issue