Show references in qp-rake1 output
This commit is contained in:
parent
601ec553b5
commit
4b296a1d1e
|
@ -5,9 +5,10 @@ use colour::{blue_ln, green_ln, red_ln, yellow_ln};
|
||||||
use env_logger::Env;
|
use env_logger::Env;
|
||||||
use log::warn;
|
use log::warn;
|
||||||
use quickpeep::raking::analysis::{load_adblock_engine, IpSet};
|
use quickpeep::raking::analysis::{load_adblock_engine, IpSet};
|
||||||
|
use quickpeep::raking::references::references_from_urlrakes;
|
||||||
use quickpeep::raking::{RakeIntent, RakeOutcome};
|
use quickpeep::raking::{RakeIntent, RakeOutcome};
|
||||||
use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
|
use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
|
||||||
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
|
use quickpeep_structs::rake_entries::{AnalysisAntifeatures, RakedReference, ReferenceKind};
|
||||||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
||||||
use reqwest::redirect::Policy;
|
use reqwest::redirect::Policy;
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
|
@ -77,15 +78,33 @@ pub async fn main() -> anyhow::Result<()> {
|
||||||
RakeOutcome::RakedPage(page) => {
|
RakeOutcome::RakedPage(page) => {
|
||||||
let content_size = serde_bare::to_vec(&page)?.len();
|
let content_size = serde_bare::to_vec(&page)?.len();
|
||||||
green_ln!("Page ({} bytes)", content_size);
|
green_ln!("Page ({} bytes)", content_size);
|
||||||
// TODO
|
|
||||||
|
println!();
|
||||||
|
let head = &page.page_entry.document.head;
|
||||||
|
println!("Title: {}", head.title);
|
||||||
|
println!("Language: {}", head.language);
|
||||||
|
if !head.icon.is_empty() {
|
||||||
|
println!("Icon: {}", head.icon);
|
||||||
|
}
|
||||||
|
|
||||||
|
println!();
|
||||||
|
print_references(&page.referrer_entry.references);
|
||||||
}
|
}
|
||||||
RakeOutcome::RakedFeed(feed) => {
|
RakeOutcome::RakedFeed(feed) => {
|
||||||
green_ln!("Feed");
|
green_ln!("Feed");
|
||||||
// TODO
|
// TODO
|
||||||
|
|
||||||
|
println!();
|
||||||
|
let refs = references_from_urlrakes(&feed, ReferenceKind::FeedEntry);
|
||||||
|
print_references(&refs);
|
||||||
}
|
}
|
||||||
RakeOutcome::RakedSitemap(sitemap) => {
|
RakeOutcome::RakedSitemap(sitemap) => {
|
||||||
green_ln!("Sitemap");
|
green_ln!("Sitemap");
|
||||||
// TODO
|
// TODO
|
||||||
|
|
||||||
|
println!();
|
||||||
|
let refs = references_from_urlrakes(&sitemap, ReferenceKind::SitemapEntry);
|
||||||
|
print_references(&refs);
|
||||||
}
|
}
|
||||||
RakeOutcome::Redirect { reason, new_url } => {
|
RakeOutcome::Redirect { reason, new_url } => {
|
||||||
blue_ln!("Redirect ({:?})", reason);
|
blue_ln!("Redirect ({:?})", reason);
|
||||||
|
@ -101,3 +120,10 @@ pub async fn main() -> anyhow::Result<()> {
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn print_references(refs: &Vec<RakedReference>) {
|
||||||
|
println!("{} References", refs.len());
|
||||||
|
for reference in refs {
|
||||||
|
println!("\t{:?} → {}", reference.kind, reference.target);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -13,9 +13,7 @@ use kuchiki::NodeRef;
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use log::debug;
|
use log::debug;
|
||||||
use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
|
use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
|
||||||
use quickpeep_structs::rake_entries::{
|
use quickpeep_structs::rake_entries::{AnalysisAntifeatures, RakedPageEntry, RakedReferrerEntry};
|
||||||
AnalysisAntifeatures, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind,
|
|
||||||
};
|
|
||||||
use quickpeep_utils::Lazy;
|
use quickpeep_utils::Lazy;
|
||||||
use reqwest::header::HeaderMap;
|
use reqwest::header::HeaderMap;
|
||||||
use reqwest::{Client, Response, Url};
|
use reqwest::{Client, Response, Url};
|
||||||
|
@ -26,6 +24,7 @@ use std::time::Duration;
|
||||||
use tokio::time::Instant;
|
use tokio::time::Instant;
|
||||||
|
|
||||||
pub mod analysis;
|
pub mod analysis;
|
||||||
|
pub mod references;
|
||||||
|
|
||||||
/// 4 MiB ought to be enough for anybody.
|
/// 4 MiB ought to be enough for anybody.
|
||||||
pub const SIZE_LIMIT: usize = 4 * 1024 * 1024;
|
pub const SIZE_LIMIT: usize = 4 * 1024 * 1024;
|
||||||
|
@ -66,8 +65,8 @@ pub struct UrlRaked {
|
||||||
|
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
pub struct RakedPage {
|
pub struct RakedPage {
|
||||||
page_entry: RakedPageEntry,
|
pub page_entry: RakedPageEntry,
|
||||||
referrer_entry: RakedReferrerEntry,
|
pub referrer_entry: RakedReferrerEntry,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct RobotsTxt {
|
pub struct RobotsTxt {
|
||||||
|
@ -444,7 +443,7 @@ impl Raker {
|
||||||
let bare_size = serde_bare::to_vec(&dense_doc)?.len();
|
let bare_size = serde_bare::to_vec(&dense_doc)?.len();
|
||||||
eprintln!("CS {:?} → {:?}", content.len(), bare_size);
|
eprintln!("CS {:?} → {:?}", content.len(), bare_size);
|
||||||
|
|
||||||
let references = find_references(&document, &feeds, url);
|
let references = references::find_references(&document, &feeds, url);
|
||||||
Ok(RakeOutcome::RakedPage(RakedPage {
|
Ok(RakeOutcome::RakedPage(RakedPage {
|
||||||
page_entry: RakedPageEntry {
|
page_entry: RakedPageEntry {
|
||||||
analysed_antifeatures: antifeature_flags,
|
analysed_antifeatures: antifeature_flags,
|
||||||
|
@ -455,68 +454,6 @@ impl Raker {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn find_references(
|
|
||||||
doc: &DenseDocument,
|
|
||||||
feeds: &Vec<Url>,
|
|
||||||
page_url: &Url,
|
|
||||||
) -> Vec<RakedReference> {
|
|
||||||
let mut refs = Vec::new();
|
|
||||||
|
|
||||||
fn add_link_refs(tree: &Vec<DenseTree>, refs: &mut Vec<RakedReference>, page_url: &Url) {
|
|
||||||
for node in tree {
|
|
||||||
match node {
|
|
||||||
DenseTree::Heading1(children) => {
|
|
||||||
add_link_refs(children, refs, page_url);
|
|
||||||
}
|
|
||||||
DenseTree::Heading2(children) => {
|
|
||||||
add_link_refs(children, refs, page_url);
|
|
||||||
}
|
|
||||||
DenseTree::Heading3(children) => {
|
|
||||||
add_link_refs(children, refs, page_url);
|
|
||||||
}
|
|
||||||
DenseTree::Heading4(children) => {
|
|
||||||
add_link_refs(children, refs, page_url);
|
|
||||||
}
|
|
||||||
DenseTree::Heading5(children) => {
|
|
||||||
add_link_refs(children, refs, page_url);
|
|
||||||
}
|
|
||||||
DenseTree::Heading6(children) => {
|
|
||||||
add_link_refs(children, refs, page_url);
|
|
||||||
}
|
|
||||||
DenseTree::Link {
|
|
||||||
children,
|
|
||||||
href,
|
|
||||||
nofollow,
|
|
||||||
} => {
|
|
||||||
if !nofollow {
|
|
||||||
if let Ok(full_url) = page_url.join(&href) {
|
|
||||||
refs.push(RakedReference {
|
|
||||||
target: full_url.to_string(),
|
|
||||||
kind: ReferenceKind::CanonicalUrl,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
add_link_refs(children, refs, page_url);
|
|
||||||
}
|
|
||||||
DenseTree::Image { .. } => {}
|
|
||||||
DenseTree::Text(_) => {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
add_link_refs(&doc.body_content, &mut refs, &page_url);
|
|
||||||
add_link_refs(&doc.body_remainder, &mut refs, &page_url);
|
|
||||||
|
|
||||||
for feed in feeds {
|
|
||||||
refs.push(RakedReference {
|
|
||||||
target: feed.as_str().to_owned(),
|
|
||||||
kind: ReferenceKind::HeaderLinkedFeed,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
refs
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn normalise_language(lang_string: &mut String) {
|
pub fn normalise_language(lang_string: &mut String) {
|
||||||
*lang_string = lang_string.to_lowercase();
|
*lang_string = lang_string.to_lowercase();
|
||||||
let mut pieces = lang_string
|
let mut pieces = lang_string
|
||||||
|
|
|
@ -0,0 +1,80 @@
|
||||||
|
use crate::raking::UrlRaked;
|
||||||
|
use itertools::Itertools;
|
||||||
|
use quickpeep_densedoc::{DenseDocument, DenseTree};
|
||||||
|
use quickpeep_structs::rake_entries::{RakedReference, ReferenceKind};
|
||||||
|
use reqwest::Url;
|
||||||
|
|
||||||
|
pub fn find_references(
|
||||||
|
doc: &DenseDocument,
|
||||||
|
feeds: &Vec<Url>,
|
||||||
|
page_url: &Url,
|
||||||
|
) -> Vec<RakedReference> {
|
||||||
|
let mut refs = Vec::new();
|
||||||
|
|
||||||
|
fn add_link_refs(tree: &Vec<DenseTree>, refs: &mut Vec<RakedReference>, page_url: &Url) {
|
||||||
|
for node in tree {
|
||||||
|
match node {
|
||||||
|
DenseTree::Heading1(children) => {
|
||||||
|
add_link_refs(children, refs, page_url);
|
||||||
|
}
|
||||||
|
DenseTree::Heading2(children) => {
|
||||||
|
add_link_refs(children, refs, page_url);
|
||||||
|
}
|
||||||
|
DenseTree::Heading3(children) => {
|
||||||
|
add_link_refs(children, refs, page_url);
|
||||||
|
}
|
||||||
|
DenseTree::Heading4(children) => {
|
||||||
|
add_link_refs(children, refs, page_url);
|
||||||
|
}
|
||||||
|
DenseTree::Heading5(children) => {
|
||||||
|
add_link_refs(children, refs, page_url);
|
||||||
|
}
|
||||||
|
DenseTree::Heading6(children) => {
|
||||||
|
add_link_refs(children, refs, page_url);
|
||||||
|
}
|
||||||
|
DenseTree::Link {
|
||||||
|
children,
|
||||||
|
href,
|
||||||
|
nofollow,
|
||||||
|
} => {
|
||||||
|
if !nofollow {
|
||||||
|
if let Ok(full_url) = page_url.join(&href) {
|
||||||
|
refs.push(RakedReference {
|
||||||
|
target: full_url.to_string(),
|
||||||
|
kind: ReferenceKind::Link,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
add_link_refs(children, refs, page_url);
|
||||||
|
}
|
||||||
|
DenseTree::Image { .. } => {}
|
||||||
|
DenseTree::Text(_) => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
add_link_refs(&doc.body_content, &mut refs, &page_url);
|
||||||
|
add_link_refs(&doc.body_remainder, &mut refs, &page_url);
|
||||||
|
|
||||||
|
for feed in feeds {
|
||||||
|
refs.push(RakedReference {
|
||||||
|
target: feed.as_str().to_owned(),
|
||||||
|
kind: ReferenceKind::HeaderLinkedFeed,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
refs
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn references_from_urlrakes(
|
||||||
|
input: &Vec<UrlRaked>,
|
||||||
|
ref_kind: ReferenceKind,
|
||||||
|
) -> Vec<RakedReference> {
|
||||||
|
input
|
||||||
|
.iter()
|
||||||
|
.map(|url_raked| RakedReference {
|
||||||
|
target: url_raked.url.to_string(),
|
||||||
|
kind: ref_kind,
|
||||||
|
})
|
||||||
|
.collect_vec()
|
||||||
|
}
|
|
@ -39,7 +39,7 @@ pub struct RakedReference {
|
||||||
pub kind: ReferenceKind,
|
pub kind: ReferenceKind,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
#[derive(Serialize, Deserialize, Debug, Copy, Clone)]
|
||||||
pub enum ReferenceKind {
|
pub enum ReferenceKind {
|
||||||
CanonicalUrl,
|
CanonicalUrl,
|
||||||
Redirect,
|
Redirect,
|
||||||
|
|
Loading…
Reference in New Issue