From 4b296a1d1e65860d5fab43602fa63ac57ecb238e Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Mon, 14 Mar 2022 23:06:44 +0000 Subject: [PATCH] Show references in qp-rake1 output --- quickpeep/src/bin/qp-rake1.rs | 30 +++++++++- quickpeep/src/raking.rs | 73 ++---------------------- quickpeep/src/raking/references.rs | 80 +++++++++++++++++++++++++++ quickpeep_structs/src/rake_entries.rs | 2 +- 4 files changed, 114 insertions(+), 71 deletions(-) create mode 100644 quickpeep/src/raking/references.rs diff --git a/quickpeep/src/bin/qp-rake1.rs b/quickpeep/src/bin/qp-rake1.rs index 1139d27..363f791 100644 --- a/quickpeep/src/bin/qp-rake1.rs +++ b/quickpeep/src/bin/qp-rake1.rs @@ -5,9 +5,10 @@ use colour::{blue_ln, green_ln, red_ln, yellow_ln}; use env_logger::Env; use log::warn; use quickpeep::raking::analysis::{load_adblock_engine, IpSet}; +use quickpeep::raking::references::references_from_urlrakes; use quickpeep::raking::{RakeIntent, RakeOutcome}; use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT}; -use quickpeep_structs::rake_entries::AnalysisAntifeatures; +use quickpeep_structs::rake_entries::{AnalysisAntifeatures, RakedReference, ReferenceKind}; use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT}; use reqwest::redirect::Policy; use reqwest::Url; @@ -77,15 +78,33 @@ pub async fn main() -> anyhow::Result<()> { RakeOutcome::RakedPage(page) => { let content_size = serde_bare::to_vec(&page)?.len(); green_ln!("Page ({} bytes)", content_size); - // TODO + + println!(); + let head = &page.page_entry.document.head; + println!("Title: {}", head.title); + println!("Language: {}", head.language); + if !head.icon.is_empty() { + println!("Icon: {}", head.icon); + } + + println!(); + print_references(&page.referrer_entry.references); } RakeOutcome::RakedFeed(feed) => { green_ln!("Feed"); // TODO + + println!(); + let refs = references_from_urlrakes(&feed, ReferenceKind::FeedEntry); + print_references(&refs); } RakeOutcome::RakedSitemap(sitemap) => { green_ln!("Sitemap"); // TODO + + println!(); + let refs = references_from_urlrakes(&sitemap, ReferenceKind::SitemapEntry); + print_references(&refs); } RakeOutcome::Redirect { reason, new_url } => { blue_ln!("Redirect ({:?})", reason); @@ -101,3 +120,10 @@ pub async fn main() -> anyhow::Result<()> { Ok(()) } + +fn print_references(refs: &Vec) { + println!("{} References", refs.len()); + for reference in refs { + println!("\t{:?} → {}", reference.kind, reference.target); + } +} diff --git a/quickpeep/src/raking.rs b/quickpeep/src/raking.rs index 1f39b95..9ea65c8 100644 --- a/quickpeep/src/raking.rs +++ b/quickpeep/src/raking.rs @@ -13,9 +13,7 @@ use kuchiki::NodeRef; use lazy_static::lazy_static; use log::debug; use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree}; -use quickpeep_structs::rake_entries::{ - AnalysisAntifeatures, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind, -}; +use quickpeep_structs::rake_entries::{AnalysisAntifeatures, RakedPageEntry, RakedReferrerEntry}; use quickpeep_utils::Lazy; use reqwest::header::HeaderMap; use reqwest::{Client, Response, Url}; @@ -26,6 +24,7 @@ use std::time::Duration; use tokio::time::Instant; pub mod analysis; +pub mod references; /// 4 MiB ought to be enough for anybody. pub const SIZE_LIMIT: usize = 4 * 1024 * 1024; @@ -66,8 +65,8 @@ pub struct UrlRaked { #[derive(Serialize)] pub struct RakedPage { - page_entry: RakedPageEntry, - referrer_entry: RakedReferrerEntry, + pub page_entry: RakedPageEntry, + pub referrer_entry: RakedReferrerEntry, } pub struct RobotsTxt { @@ -444,7 +443,7 @@ impl Raker { let bare_size = serde_bare::to_vec(&dense_doc)?.len(); eprintln!("CS {:?} → {:?}", content.len(), bare_size); - let references = find_references(&document, &feeds, url); + let references = references::find_references(&document, &feeds, url); Ok(RakeOutcome::RakedPage(RakedPage { page_entry: RakedPageEntry { analysed_antifeatures: antifeature_flags, @@ -455,68 +454,6 @@ impl Raker { } } -pub fn find_references( - doc: &DenseDocument, - feeds: &Vec, - page_url: &Url, -) -> Vec { - let mut refs = Vec::new(); - - fn add_link_refs(tree: &Vec, refs: &mut Vec, page_url: &Url) { - for node in tree { - match node { - DenseTree::Heading1(children) => { - add_link_refs(children, refs, page_url); - } - DenseTree::Heading2(children) => { - add_link_refs(children, refs, page_url); - } - DenseTree::Heading3(children) => { - add_link_refs(children, refs, page_url); - } - DenseTree::Heading4(children) => { - add_link_refs(children, refs, page_url); - } - DenseTree::Heading5(children) => { - add_link_refs(children, refs, page_url); - } - DenseTree::Heading6(children) => { - add_link_refs(children, refs, page_url); - } - DenseTree::Link { - children, - href, - nofollow, - } => { - if !nofollow { - if let Ok(full_url) = page_url.join(&href) { - refs.push(RakedReference { - target: full_url.to_string(), - kind: ReferenceKind::CanonicalUrl, - }) - } - } - add_link_refs(children, refs, page_url); - } - DenseTree::Image { .. } => {} - DenseTree::Text(_) => {} - } - } - } - - add_link_refs(&doc.body_content, &mut refs, &page_url); - add_link_refs(&doc.body_remainder, &mut refs, &page_url); - - for feed in feeds { - refs.push(RakedReference { - target: feed.as_str().to_owned(), - kind: ReferenceKind::HeaderLinkedFeed, - }); - } - - refs -} - pub fn normalise_language(lang_string: &mut String) { *lang_string = lang_string.to_lowercase(); let mut pieces = lang_string diff --git a/quickpeep/src/raking/references.rs b/quickpeep/src/raking/references.rs new file mode 100644 index 0000000..7e7d72d --- /dev/null +++ b/quickpeep/src/raking/references.rs @@ -0,0 +1,80 @@ +use crate::raking::UrlRaked; +use itertools::Itertools; +use quickpeep_densedoc::{DenseDocument, DenseTree}; +use quickpeep_structs::rake_entries::{RakedReference, ReferenceKind}; +use reqwest::Url; + +pub fn find_references( + doc: &DenseDocument, + feeds: &Vec, + page_url: &Url, +) -> Vec { + let mut refs = Vec::new(); + + fn add_link_refs(tree: &Vec, refs: &mut Vec, page_url: &Url) { + for node in tree { + match node { + DenseTree::Heading1(children) => { + add_link_refs(children, refs, page_url); + } + DenseTree::Heading2(children) => { + add_link_refs(children, refs, page_url); + } + DenseTree::Heading3(children) => { + add_link_refs(children, refs, page_url); + } + DenseTree::Heading4(children) => { + add_link_refs(children, refs, page_url); + } + DenseTree::Heading5(children) => { + add_link_refs(children, refs, page_url); + } + DenseTree::Heading6(children) => { + add_link_refs(children, refs, page_url); + } + DenseTree::Link { + children, + href, + nofollow, + } => { + if !nofollow { + if let Ok(full_url) = page_url.join(&href) { + refs.push(RakedReference { + target: full_url.to_string(), + kind: ReferenceKind::Link, + }) + } + } + add_link_refs(children, refs, page_url); + } + DenseTree::Image { .. } => {} + DenseTree::Text(_) => {} + } + } + } + + add_link_refs(&doc.body_content, &mut refs, &page_url); + add_link_refs(&doc.body_remainder, &mut refs, &page_url); + + for feed in feeds { + refs.push(RakedReference { + target: feed.as_str().to_owned(), + kind: ReferenceKind::HeaderLinkedFeed, + }); + } + + refs +} + +pub fn references_from_urlrakes( + input: &Vec, + ref_kind: ReferenceKind, +) -> Vec { + input + .iter() + .map(|url_raked| RakedReference { + target: url_raked.url.to_string(), + kind: ref_kind, + }) + .collect_vec() +} diff --git a/quickpeep_structs/src/rake_entries.rs b/quickpeep_structs/src/rake_entries.rs index 47d90f2..c847963 100644 --- a/quickpeep_structs/src/rake_entries.rs +++ b/quickpeep_structs/src/rake_entries.rs @@ -39,7 +39,7 @@ pub struct RakedReference { pub kind: ReferenceKind, } -#[derive(Serialize, Deserialize, Debug, Clone)] +#[derive(Serialize, Deserialize, Debug, Copy, Clone)] pub enum ReferenceKind { CanonicalUrl, Redirect,