Show references in qp-rake1 output

This commit is contained in:
Olivier 'reivilibre' 2022-03-14 23:06:44 +00:00
parent 601ec553b5
commit 4b296a1d1e
4 changed files with 114 additions and 71 deletions

View File

@ -5,9 +5,10 @@ use colour::{blue_ln, green_ln, red_ln, yellow_ln};
use env_logger::Env; use env_logger::Env;
use log::warn; use log::warn;
use quickpeep::raking::analysis::{load_adblock_engine, IpSet}; use quickpeep::raking::analysis::{load_adblock_engine, IpSet};
use quickpeep::raking::references::references_from_urlrakes;
use quickpeep::raking::{RakeIntent, RakeOutcome}; use quickpeep::raking::{RakeIntent, RakeOutcome};
use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT}; use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
use quickpeep_structs::rake_entries::AnalysisAntifeatures; use quickpeep_structs::rake_entries::{AnalysisAntifeatures, RakedReference, ReferenceKind};
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT}; use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use reqwest::redirect::Policy; use reqwest::redirect::Policy;
use reqwest::Url; use reqwest::Url;
@ -77,15 +78,33 @@ pub async fn main() -> anyhow::Result<()> {
RakeOutcome::RakedPage(page) => { RakeOutcome::RakedPage(page) => {
let content_size = serde_bare::to_vec(&page)?.len(); let content_size = serde_bare::to_vec(&page)?.len();
green_ln!("Page ({} bytes)", content_size); green_ln!("Page ({} bytes)", content_size);
// TODO
println!();
let head = &page.page_entry.document.head;
println!("Title: {}", head.title);
println!("Language: {}", head.language);
if !head.icon.is_empty() {
println!("Icon: {}", head.icon);
}
println!();
print_references(&page.referrer_entry.references);
} }
RakeOutcome::RakedFeed(feed) => { RakeOutcome::RakedFeed(feed) => {
green_ln!("Feed"); green_ln!("Feed");
// TODO // TODO
println!();
let refs = references_from_urlrakes(&feed, ReferenceKind::FeedEntry);
print_references(&refs);
} }
RakeOutcome::RakedSitemap(sitemap) => { RakeOutcome::RakedSitemap(sitemap) => {
green_ln!("Sitemap"); green_ln!("Sitemap");
// TODO // TODO
println!();
let refs = references_from_urlrakes(&sitemap, ReferenceKind::SitemapEntry);
print_references(&refs);
} }
RakeOutcome::Redirect { reason, new_url } => { RakeOutcome::Redirect { reason, new_url } => {
blue_ln!("Redirect ({:?})", reason); blue_ln!("Redirect ({:?})", reason);
@ -101,3 +120,10 @@ pub async fn main() -> anyhow::Result<()> {
Ok(()) Ok(())
} }
/// Prints a summary of `refs` to standard output.
///
/// The first line is the number of references; each following line is
/// tab-indented and shows the reference's kind (in `Debug` form) and its target.
fn print_references(refs: &[RakedReference]) {
    println!("{} References", refs.len());
    for reference in refs {
        // Keep a space between kind and target; without it the two run
        // together and the line is hard to read (e.g. `Linkhttps://…`).
        println!("\t{:?} {}", reference.kind, reference.target);
    }
}

View File

@ -13,9 +13,7 @@ use kuchiki::NodeRef;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use log::debug; use log::debug;
use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree}; use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
use quickpeep_structs::rake_entries::{ use quickpeep_structs::rake_entries::{AnalysisAntifeatures, RakedPageEntry, RakedReferrerEntry};
AnalysisAntifeatures, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind,
};
use quickpeep_utils::Lazy; use quickpeep_utils::Lazy;
use reqwest::header::HeaderMap; use reqwest::header::HeaderMap;
use reqwest::{Client, Response, Url}; use reqwest::{Client, Response, Url};
@ -26,6 +24,7 @@ use std::time::Duration;
use tokio::time::Instant; use tokio::time::Instant;
pub mod analysis; pub mod analysis;
pub mod references;
/// 4 MiB ought to be enough for anybody. /// 4 MiB ought to be enough for anybody.
pub const SIZE_LIMIT: usize = 4 * 1024 * 1024; pub const SIZE_LIMIT: usize = 4 * 1024 * 1024;
@ -66,8 +65,8 @@ pub struct UrlRaked {
#[derive(Serialize)] #[derive(Serialize)]
pub struct RakedPage { pub struct RakedPage {
page_entry: RakedPageEntry, pub page_entry: RakedPageEntry,
referrer_entry: RakedReferrerEntry, pub referrer_entry: RakedReferrerEntry,
} }
pub struct RobotsTxt { pub struct RobotsTxt {
@ -444,7 +443,7 @@ impl Raker {
let bare_size = serde_bare::to_vec(&dense_doc)?.len(); let bare_size = serde_bare::to_vec(&dense_doc)?.len();
eprintln!("CS {:?}{:?}", content.len(), bare_size); eprintln!("CS {:?}{:?}", content.len(), bare_size);
let references = find_references(&document, &feeds, url); let references = references::find_references(&document, &feeds, url);
Ok(RakeOutcome::RakedPage(RakedPage { Ok(RakeOutcome::RakedPage(RakedPage {
page_entry: RakedPageEntry { page_entry: RakedPageEntry {
analysed_antifeatures: antifeature_flags, analysed_antifeatures: antifeature_flags,
@ -455,68 +454,6 @@ impl Raker {
} }
} }
/// Collects the outgoing references of a raked page.
///
/// Walks `doc.body_content` and then `doc.body_remainder`. Every link that is
/// not marked `nofollow` and whose `href` can be joined onto `page_url` yields
/// a [`ReferenceKind::Link`] reference; links whose `href` fails to join are
/// skipped. Afterwards one [`ReferenceKind::HeaderLinkedFeed`] reference is
/// appended for each entry of `feeds`.
pub fn find_references(
    doc: &DenseDocument,
    feeds: &[Url],
    page_url: &Url,
) -> Vec<RakedReference> {
    // Recursively walks `tree`, pushing a reference for each followable link.
    fn add_link_refs(tree: &[DenseTree], refs: &mut Vec<RakedReference>, page_url: &Url) {
        for node in tree {
            match node {
                // All heading levels are plain containers: just descend.
                DenseTree::Heading1(children)
                | DenseTree::Heading2(children)
                | DenseTree::Heading3(children)
                | DenseTree::Heading4(children)
                | DenseTree::Heading5(children)
                | DenseTree::Heading6(children) => {
                    add_link_refs(children, refs, page_url);
                }
                DenseTree::Link {
                    children,
                    href,
                    nofollow,
                } => {
                    if !nofollow {
                        if let Ok(full_url) = page_url.join(href) {
                            refs.push(RakedReference {
                                target: full_url.to_string(),
                                // An ordinary hyperlink is a `Link`, not a
                                // canonical-URL declaration.
                                kind: ReferenceKind::Link,
                            });
                        }
                    }
                    // The link's children are walked even when it is `nofollow`.
                    add_link_refs(children, refs, page_url);
                }
                // No references are taken from images or text nodes.
                DenseTree::Image { .. } | DenseTree::Text(_) => {}
            }
        }
    }

    let mut refs = Vec::new();
    add_link_refs(&doc.body_content, &mut refs, page_url);
    add_link_refs(&doc.body_remainder, &mut refs, page_url);

    refs.extend(feeds.iter().map(|feed| RakedReference {
        target: feed.as_str().to_owned(),
        kind: ReferenceKind::HeaderLinkedFeed,
    }));

    refs
}
pub fn normalise_language(lang_string: &mut String) { pub fn normalise_language(lang_string: &mut String) {
*lang_string = lang_string.to_lowercase(); *lang_string = lang_string.to_lowercase();
let mut pieces = lang_string let mut pieces = lang_string

View File

@ -0,0 +1,80 @@
use crate::raking::UrlRaked;
use itertools::Itertools;
use quickpeep_densedoc::{DenseDocument, DenseTree};
use quickpeep_structs::rake_entries::{RakedReference, ReferenceKind};
use reqwest::Url;
/// Collects the outgoing references of a raked page.
///
/// Walks `doc.body_content` and then `doc.body_remainder`. Every link that is
/// not marked `nofollow` and whose `href` can be joined onto `page_url` yields
/// a [`ReferenceKind::Link`] reference; links whose `href` fails to join are
/// skipped. Afterwards one [`ReferenceKind::HeaderLinkedFeed`] reference is
/// appended for each entry of `feeds`.
pub fn find_references(
    doc: &DenseDocument,
    feeds: &[Url],
    page_url: &Url,
) -> Vec<RakedReference> {
    // Recursively walks `tree`, pushing a reference for each followable link.
    fn add_link_refs(tree: &[DenseTree], refs: &mut Vec<RakedReference>, page_url: &Url) {
        for node in tree {
            match node {
                // All heading levels are plain containers: just descend.
                DenseTree::Heading1(children)
                | DenseTree::Heading2(children)
                | DenseTree::Heading3(children)
                | DenseTree::Heading4(children)
                | DenseTree::Heading5(children)
                | DenseTree::Heading6(children) => {
                    add_link_refs(children, refs, page_url);
                }
                DenseTree::Link {
                    children,
                    href,
                    nofollow,
                } => {
                    if !nofollow {
                        if let Ok(full_url) = page_url.join(href) {
                            refs.push(RakedReference {
                                target: full_url.to_string(),
                                kind: ReferenceKind::Link,
                            });
                        }
                    }
                    // The link's children are walked even when it is `nofollow`.
                    add_link_refs(children, refs, page_url);
                }
                // No references are taken from images or text nodes.
                DenseTree::Image { .. } | DenseTree::Text(_) => {}
            }
        }
    }

    let mut refs = Vec::new();
    add_link_refs(&doc.body_content, &mut refs, page_url);
    add_link_refs(&doc.body_remainder, &mut refs, page_url);

    refs.extend(feeds.iter().map(|feed| RakedReference {
        target: feed.as_str().to_owned(),
        kind: ReferenceKind::HeaderLinkedFeed,
    }));

    refs
}
/// Converts raked URL entries into references that all share `ref_kind`.
///
/// Each returned reference's target is the string form of the corresponding
/// entry's `url`; the output preserves the order of `input`.
pub fn references_from_urlrakes(
    input: &[UrlRaked],
    ref_kind: ReferenceKind,
) -> Vec<RakedReference> {
    input
        .iter()
        .map(|url_raked| RakedReference {
            target: url_raked.url.to_string(),
            kind: ref_kind,
        })
        .collect_vec()
}

View File

@ -39,7 +39,7 @@ pub struct RakedReference {
pub kind: ReferenceKind, pub kind: ReferenceKind,
} }
#[derive(Serialize, Deserialize, Debug, Clone)] #[derive(Serialize, Deserialize, Debug, Copy, Clone)]
pub enum ReferenceKind { pub enum ReferenceKind {
CanonicalUrl, CanonicalUrl,
Redirect, Redirect,