Show references in qp-rake1 output
This commit is contained in:
parent
601ec553b5
commit
4b296a1d1e
|
@ -5,9 +5,10 @@ use colour::{blue_ln, green_ln, red_ln, yellow_ln};
|
|||
use env_logger::Env;
|
||||
use log::warn;
|
||||
use quickpeep::raking::analysis::{load_adblock_engine, IpSet};
|
||||
use quickpeep::raking::references::references_from_urlrakes;
|
||||
use quickpeep::raking::{RakeIntent, RakeOutcome};
|
||||
use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
|
||||
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
|
||||
use quickpeep_structs::rake_entries::{AnalysisAntifeatures, RakedReference, ReferenceKind};
|
||||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
||||
use reqwest::redirect::Policy;
|
||||
use reqwest::Url;
|
||||
|
@ -77,15 +78,33 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
RakeOutcome::RakedPage(page) => {
|
||||
let content_size = serde_bare::to_vec(&page)?.len();
|
||||
green_ln!("Page ({} bytes)", content_size);
|
||||
// TODO
|
||||
|
||||
println!();
|
||||
let head = &page.page_entry.document.head;
|
||||
println!("Title: {}", head.title);
|
||||
println!("Language: {}", head.language);
|
||||
if !head.icon.is_empty() {
|
||||
println!("Icon: {}", head.icon);
|
||||
}
|
||||
|
||||
println!();
|
||||
print_references(&page.referrer_entry.references);
|
||||
}
|
||||
RakeOutcome::RakedFeed(feed) => {
|
||||
green_ln!("Feed");
|
||||
// TODO
|
||||
|
||||
println!();
|
||||
let refs = references_from_urlrakes(&feed, ReferenceKind::FeedEntry);
|
||||
print_references(&refs);
|
||||
}
|
||||
RakeOutcome::RakedSitemap(sitemap) => {
|
||||
green_ln!("Sitemap");
|
||||
// TODO
|
||||
|
||||
println!();
|
||||
let refs = references_from_urlrakes(&sitemap, ReferenceKind::SitemapEntry);
|
||||
print_references(&refs);
|
||||
}
|
||||
RakeOutcome::Redirect { reason, new_url } => {
|
||||
blue_ln!("Redirect ({:?})", reason);
|
||||
|
@ -101,3 +120,10 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn print_references(refs: &Vec<RakedReference>) {
|
||||
println!("{} References", refs.len());
|
||||
for reference in refs {
|
||||
println!("\t{:?} → {}", reference.kind, reference.target);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -13,9 +13,7 @@ use kuchiki::NodeRef;
|
|||
use lazy_static::lazy_static;
|
||||
use log::debug;
|
||||
use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
|
||||
use quickpeep_structs::rake_entries::{
|
||||
AnalysisAntifeatures, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind,
|
||||
};
|
||||
use quickpeep_structs::rake_entries::{AnalysisAntifeatures, RakedPageEntry, RakedReferrerEntry};
|
||||
use quickpeep_utils::Lazy;
|
||||
use reqwest::header::HeaderMap;
|
||||
use reqwest::{Client, Response, Url};
|
||||
|
@ -26,6 +24,7 @@ use std::time::Duration;
|
|||
use tokio::time::Instant;
|
||||
|
||||
pub mod analysis;
|
||||
pub mod references;
|
||||
|
||||
/// 4 MiB ought to be enough for anybody.
|
||||
pub const SIZE_LIMIT: usize = 4 * 1024 * 1024;
|
||||
|
@ -66,8 +65,8 @@ pub struct UrlRaked {
|
|||
|
||||
/// Payload carried by `RakeOutcome::RakedPage`: the page entry together with
/// the referrer entry produced for the same page.
///
/// This diff hunk shows both versions of the field list: the bare field lines
/// are the previous (private) declarations and the `pub` lines are their
/// replacements, which let code outside this module read the fields.
#[derive(Serialize)]
|
||||
pub struct RakedPage {
|
||||
// Previous declaration (module-private).
page_entry: RakedPageEntry,
|
||||
// Previous declaration (module-private).
referrer_entry: RakedReferrerEntry,
|
||||
/// Entry describing the raked page itself.
pub page_entry: RakedPageEntry,
|
||||
/// Entry holding the references found on the page.
pub referrer_entry: RakedReferrerEntry,
|
||||
}
|
||||
|
||||
pub struct RobotsTxt {
|
||||
|
@ -444,7 +443,7 @@ impl Raker {
|
|||
let bare_size = serde_bare::to_vec(&dense_doc)?.len();
|
||||
eprintln!("CS {:?} → {:?}", content.len(), bare_size);
|
||||
|
||||
let references = find_references(&document, &feeds, url);
|
||||
let references = references::find_references(&document, &feeds, url);
|
||||
Ok(RakeOutcome::RakedPage(RakedPage {
|
||||
page_entry: RakedPageEntry {
|
||||
analysed_antifeatures: antifeature_flags,
|
||||
|
@ -455,68 +454,6 @@ impl Raker {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn find_references(
|
||||
doc: &DenseDocument,
|
||||
feeds: &Vec<Url>,
|
||||
page_url: &Url,
|
||||
) -> Vec<RakedReference> {
|
||||
let mut refs = Vec::new();
|
||||
|
||||
fn add_link_refs(tree: &Vec<DenseTree>, refs: &mut Vec<RakedReference>, page_url: &Url) {
|
||||
for node in tree {
|
||||
match node {
|
||||
DenseTree::Heading1(children) => {
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Heading2(children) => {
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Heading3(children) => {
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Heading4(children) => {
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Heading5(children) => {
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Heading6(children) => {
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Link {
|
||||
children,
|
||||
href,
|
||||
nofollow,
|
||||
} => {
|
||||
if !nofollow {
|
||||
if let Ok(full_url) = page_url.join(&href) {
|
||||
refs.push(RakedReference {
|
||||
target: full_url.to_string(),
|
||||
kind: ReferenceKind::CanonicalUrl,
|
||||
})
|
||||
}
|
||||
}
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Image { .. } => {}
|
||||
DenseTree::Text(_) => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
add_link_refs(&doc.body_content, &mut refs, &page_url);
|
||||
add_link_refs(&doc.body_remainder, &mut refs, &page_url);
|
||||
|
||||
for feed in feeds {
|
||||
refs.push(RakedReference {
|
||||
target: feed.as_str().to_owned(),
|
||||
kind: ReferenceKind::HeaderLinkedFeed,
|
||||
});
|
||||
}
|
||||
|
||||
refs
|
||||
}
|
||||
|
||||
pub fn normalise_language(lang_string: &mut String) {
|
||||
*lang_string = lang_string.to_lowercase();
|
||||
let mut pieces = lang_string
|
||||
|
|
|
@ -0,0 +1,80 @@
|
|||
use crate::raking::UrlRaked;
|
||||
use itertools::Itertools;
|
||||
use quickpeep_densedoc::{DenseDocument, DenseTree};
|
||||
use quickpeep_structs::rake_entries::{RakedReference, ReferenceKind};
|
||||
use reqwest::Url;
|
||||
|
||||
pub fn find_references(
|
||||
doc: &DenseDocument,
|
||||
feeds: &Vec<Url>,
|
||||
page_url: &Url,
|
||||
) -> Vec<RakedReference> {
|
||||
let mut refs = Vec::new();
|
||||
|
||||
fn add_link_refs(tree: &Vec<DenseTree>, refs: &mut Vec<RakedReference>, page_url: &Url) {
|
||||
for node in tree {
|
||||
match node {
|
||||
DenseTree::Heading1(children) => {
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Heading2(children) => {
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Heading3(children) => {
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Heading4(children) => {
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Heading5(children) => {
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Heading6(children) => {
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Link {
|
||||
children,
|
||||
href,
|
||||
nofollow,
|
||||
} => {
|
||||
if !nofollow {
|
||||
if let Ok(full_url) = page_url.join(&href) {
|
||||
refs.push(RakedReference {
|
||||
target: full_url.to_string(),
|
||||
kind: ReferenceKind::Link,
|
||||
})
|
||||
}
|
||||
}
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Image { .. } => {}
|
||||
DenseTree::Text(_) => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
add_link_refs(&doc.body_content, &mut refs, &page_url);
|
||||
add_link_refs(&doc.body_remainder, &mut refs, &page_url);
|
||||
|
||||
for feed in feeds {
|
||||
refs.push(RakedReference {
|
||||
target: feed.as_str().to_owned(),
|
||||
kind: ReferenceKind::HeaderLinkedFeed,
|
||||
});
|
||||
}
|
||||
|
||||
refs
|
||||
}
|
||||
|
||||
pub fn references_from_urlrakes(
|
||||
input: &Vec<UrlRaked>,
|
||||
ref_kind: ReferenceKind,
|
||||
) -> Vec<RakedReference> {
|
||||
input
|
||||
.iter()
|
||||
.map(|url_raked| RakedReference {
|
||||
target: url_raked.url.to_string(),
|
||||
kind: ref_kind,
|
||||
})
|
||||
.collect_vec()
|
||||
}
|
|
@ -39,7 +39,7 @@ pub struct RakedReference {
|
|||
pub kind: ReferenceKind,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
#[derive(Serialize, Deserialize, Debug, Copy, Clone)]
|
||||
pub enum ReferenceKind {
|
||||
CanonicalUrl,
|
||||
Redirect,
|
||||
|
|
Loading…
Reference in New Issue