Clean-ups and support pulling out references

rei/minimum
Olivier 'reivilibre' 2022-03-14 23:01:19 +00:00
parent 5a94c825d7
commit 601ec553b5
8 changed files with 432 additions and 93 deletions

163
Cargo.lock generated
View File

@ -86,6 +86,16 @@ version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bitflags_serde_shim"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25c3d626f0280ec39b33a6fc5c6c1067432b4c41e94aee40ded197a6649bf025"
dependencies = [
"bitflags",
"serde",
]
[[package]]
name = "bumpalo"
version = "3.9.1"
@ -159,6 +169,45 @@ dependencies = [
"chrono",
]
[[package]]
name = "clap"
version = "3.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8c93436c21e4698bacadf42917db28b23017027a4deccb35dbe47a7e7840123"
dependencies = [
"atty",
"bitflags",
"clap_derive",
"indexmap",
"lazy_static",
"os_str_bytes",
"strsim",
"termcolor",
"textwrap",
]
[[package]]
name = "clap_derive"
version = "3.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da95d038ede1a964ce99f49cbe27a7fb538d1da595e4b4f70b8c8f338d17bf16"
dependencies = [
"heck 0.4.0",
"proc-macro-error",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "colour"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a27e4532f26f510c24bb8477d963c0c3ef27e293c3b2c507cccb0536d493201a"
dependencies = [
"crossterm",
]
[[package]]
name = "convert_case"
version = "0.4.0"
@ -234,6 +283,31 @@ dependencies = [
"lazy_static",
]
[[package]]
name = "crossterm"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c36c10130df424b2f3552fcc2ddcd9b28a27b1e54b358b45874f88d1ca6888c"
dependencies = [
"bitflags",
"crossterm_winapi",
"lazy_static",
"libc",
"mio 0.7.14",
"parking_lot 0.11.2",
"signal-hook",
"winapi",
]
[[package]]
name = "crossterm_winapi"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0da8964ace4d3e4a044fd027919b2237000b24315a37c916f61809f1ff2140b9"
dependencies = [
"winapi",
]
[[package]]
name = "cssparser"
version = "0.27.2"
@ -560,6 +634,12 @@ dependencies = [
"unicode-segmentation",
]
[[package]]
name = "heck"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9"
[[package]]
name = "hermit-abi"
version = "0.1.19"
@ -1635,6 +1715,19 @@ dependencies = [
"autocfg",
]
[[package]]
name = "mio"
version = "0.7.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8067b404fe97c70829f082dec8bcf4f71225d7eaea1d8645349cb76fa06205cc"
dependencies = [
"libc",
"log",
"miow",
"ntapi",
"winapi",
]
[[package]]
name = "mio"
version = "0.8.1"
@ -1823,6 +1916,15 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "os_str_bytes"
version = "6.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64"
dependencies = [
"memchr",
]
[[package]]
name = "parking_lot"
version = "0.11.2"
@ -1970,6 +2072,30 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "proc-macro-error"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
dependencies = [
"proc-macro-error-attr",
"proc-macro2",
"quote",
"syn",
"version_check",
]
[[package]]
name = "proc-macro-error-attr"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
dependencies = [
"proc-macro2",
"quote",
"version_check",
]
[[package]]
name = "proc-macro-hack"
version = "0.5.19"
@ -2018,6 +2144,8 @@ dependencies = [
"anyhow",
"bytes",
"chrono",
"clap",
"colour",
"cylon",
"env_logger",
"feed-rs",
@ -2071,7 +2199,9 @@ name = "quickpeep_structs"
version = "0.1.0"
dependencies = [
"bitflags",
"bitflags_serde_shim",
"quickpeep_densedoc",
"serde",
]
[[package]]
@ -2459,6 +2589,17 @@ dependencies = [
"stable_deref_trait",
]
[[package]]
name = "signal-hook"
version = "0.1.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e31d442c16f047a671b5a71e2161d6e68814012b7f5379d269ebd915fac2729"
dependencies = [
"libc",
"mio 0.7.14",
"signal-hook-registry",
]
[[package]]
name = "signal-hook-registry"
version = "1.4.0"
@ -2546,6 +2687,12 @@ dependencies = [
"quote",
]
[[package]]
name = "strsim"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
[[package]]
name = "strum"
version = "0.23.0"
@ -2558,7 +2705,7 @@ version = "0.23.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38"
dependencies = [
"heck",
"heck 0.3.3",
"proc-macro2",
"quote",
"rustversion",
@ -2610,6 +2757,12 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "textwrap"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb"
[[package]]
name = "thin-slice"
version = "0.1.1"
@ -2670,7 +2823,7 @@ dependencies = [
"bytes",
"libc",
"memchr",
"mio",
"mio 0.8.1",
"num_cpus",
"once_cell",
"parking_lot 0.12.0",
@ -2841,6 +2994,12 @@ version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "version_check"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "want"
version = "0.3.0"

View File

@ -6,36 +6,35 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
### Subcrates
quickpeep_moz_readability = { path = "../quickpeep_moz_readability" }
quickpeep_densedoc = { path = "../quickpeep_densedoc" }
quickpeep_structs = { path = "../quickpeep_structs" }
quickpeep_utils = { path = "../quickpeep_utils" }
### CLI Helpers
clap = { version = "3.1.6", features = ["derive"] }
colour = "0.6.0"
### Document Parsing
kuchiki = "0.8.1"
html5ever = "0.25.1"
serde = { version = "1.0.136", features = ["derive"] }
serde_bare = "0.5.0"
### Dates
chrono = "0.4.19"
### Utils
lazy_static = "1.4.0"
bytes = "1.1.0"
itertools = "0.10.3"
ipnetwork = "0.18.0"
futures-util = "0.3.21"
tokio = { version = "1.17.0", features = ["full"] }
anyhow = "1.0.55"
log = "0.4.14"
env_logger = "0.9.0"
quickpeep_moz_readability = { path = "../quickpeep_moz_readability" }
quickpeep_densedoc = { path = "../quickpeep_densedoc" }
# TODO: why do we need these here?
kuchiki = "0.8.1"
html5ever = "0.25.1"
serde = { version = "1.0.136", features = ["derive"] }
serde_bare = "0.5.0"
chrono = "0.4.19"
lazy_static = "1.4.0"
bytes = "1.1.0"
itertools = "0.10.3"
quickpeep_structs = { path = "../quickpeep_structs" }
ipnetwork = "0.18.0"
futures-util = "0.3.21"
lingua = "1.3.3"
quickpeep_utils = { path = "../quickpeep_utils" }
### Raking helpers
# HTTP Requests
@ -53,3 +52,5 @@ sitemap = "0.4.1"
### Filtering helpers
# AdBlock
adblock = "0.5.0"
# Language detection
lingua = "1.3.3"

View File

@ -1,15 +1,17 @@
use adblock::lists::RuleTypes;
use anyhow::Context;
use clap::Parser;
use colour::{blue_ln, green_ln, red_ln, yellow_ln};
use env_logger::Env;
use log::warn;
use quickpeep::raking::analysis::{load_adblock_engine, IpSet};
use quickpeep::raking::RakeIntent;
use quickpeep::raking::{RakeIntent, RakeOutcome};
use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use reqwest::redirect::Policy;
use reqwest::Url;
use std::path::PathBuf;
use std::str::FromStr;
use tokio::fs::File;
pub const ADBLOCK_FILTER_PATHS: [(AnalysisAntifeatures, &'static str); 4] = [
@ -19,8 +21,18 @@ pub const ADBLOCK_FILTER_PATHS: [(AnalysisAntifeatures, &'static str); 4] = [
(AnalysisAntifeatures::ADVERTS, "adverts"),
];
/// Rakes one URL and prints out the description of it.
// NOTE: with clap's derive, `///` comments become user-visible help text, so the
// explanatory notes below deliberately use plain `//` comments.
#[derive(Clone, Debug, Parser)]
pub struct Opts {
// The URL to rake; `main` passes it straight to `Raker::rake`.
url: Url,
}
#[tokio::main]
pub async fn main() -> anyhow::Result<()> {
env_logger::Builder::from_env(Env::default().default_filter_or("info,quickpeep=debug")).init();
let opts: Opts = Opts::parse();
let mut header_map = HeaderMap::new();
header_map.insert(USER_AGENT, HeaderValue::from_static(RAKER_USER_AGENT));
@ -59,35 +71,33 @@ pub async fn main() -> anyhow::Result<()> {
antifeature_ip_set,
};
// raker.rake(
// &Url::from_str("http://nothings.org/gamedev/ssao/")?,
// RakeIntent::Page,
// &client,
// )
// .await?;
//
// raker.rake(
// &Url::from_str("https://github.com/kuchiki-rs/kuchiki")?,
// RakeIntent::Page,
// &client,
// )
// .await?;
let outcome = raker.rake(&opts.url, RakeIntent::Any, &client).await?;
raker
.rake(
&Url::from_str("https://www.thesprucepets.com/")?,
RakeIntent::Page,
&client,
)
.await?;
raker
.rake(
&Url::from_str("https://matrix.org/")?,
RakeIntent::Page,
&client,
)
.await?;
match outcome {
RakeOutcome::RakedPage(page) => {
let content_size = serde_bare::to_vec(&page)?.len();
green_ln!("Page ({} bytes)", content_size);
// TODO
}
RakeOutcome::RakedFeed(feed) => {
green_ln!("Feed");
// TODO
}
RakeOutcome::RakedSitemap(sitemap) => {
green_ln!("Sitemap");
// TODO
}
RakeOutcome::Redirect { reason, new_url } => {
blue_ln!("Redirect ({:?})", reason);
println!("{}", new_url.as_str());
}
RakeOutcome::TemporaryFailure(fail) => {
yellow_ln!("Temporary Failure\n\t{:?}", &fail.reason);
}
RakeOutcome::PermanentFailure(fail) => {
red_ln!("Permanent Failure\n\t{:?}", &fail.reason)
}
}
Ok(())
}

View File

@ -12,8 +12,10 @@ use kuchiki::traits::TendrilSink;
use kuchiki::NodeRef;
use lazy_static::lazy_static;
use log::debug;
use quickpeep_densedoc::DenseTree;
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
use quickpeep_structs::rake_entries::{
AnalysisAntifeatures, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind,
};
use quickpeep_utils::Lazy;
use reqwest::header::HeaderMap;
use reqwest::{Client, Response, Url};
@ -44,6 +46,7 @@ pub enum RakeOutcome {
PermanentFailure(PermanentFailure),
}
#[derive(Debug)]
pub enum RedirectReason {
/// The page redirected somewhere else.
Redirected {
@ -61,27 +64,35 @@ pub struct UrlRaked {
pub intent: RakeIntent,
}
pub struct RakedPage {}
/// The result of successfully raking an HTML page.
///
/// Carried by `RakeOutcome::RakedPage`. `Serialize` is derived so the whole value
/// can be encoded (for example with `serde_bare`).
#[derive(Serialize)]
pub struct RakedPage {
/// Antifeature flags and the dense document extracted from the page.
page_entry: RakedPageEntry,
/// References (body links and header-linked feeds) found on the page.
referrer_entry: RakedReferrerEntry,
}
/// Sitemap locations and crawl rules associated with a site's robots.txt.
///
/// NOTE(review): the code that builds this value is not visible here — confirm
/// the field descriptions below against it.
pub struct RobotsTxt {
/// Sitemaps to rake — presumably those advertised by robots.txt; confirm.
pub sitemaps: Vec<UrlRaked>,
/// `Cylon` rule matcher — presumably compiled from the robots.txt directives; confirm.
pub rules: Cylon,
}
/// A rake failure that is classed as temporary.
///
/// Carried by `RakeOutcome::TemporaryFailure`. `Debug` is derived so that callers
/// can print the failure reason with `{:?}`.
#[derive(Debug)]
pub struct TemporaryFailure {
/// Why the rake failed.
pub reason: TemporaryFailureReason,
/// Back-off period — presumably the number of seconds to wait before retrying; confirm.
pub backoff_sec: u32,
}
/// A rake failure that is classed as permanent.
///
/// Carried by `RakeOutcome::PermanentFailure`. `Debug` is derived so that callers
/// can print the failure reason with `{:?}`.
#[derive(Debug)]
pub struct PermanentFailure {
/// Why the rake failed.
pub reason: PermanentFailureReason,
}
/// The reasons for which a rake can fail temporarily.
#[derive(Debug)]
pub enum TemporaryFailureReason {
/// Some required information was missing; the string presumably describes
/// what was missing — confirm against the code that constructs it.
MissingInformation(String),
/// The server reported an error; the `u16` is presumably the HTTP status
/// code — confirm against the code that constructs it.
ServerError(u16),
}
#[derive(Debug)]
pub enum PermanentFailureReason {
ResourceDenied(u16),
WrongLanguage(String),
@ -99,6 +110,8 @@ pub enum RakeIntent {
lazy_static! {
static ref SITEMAP_MIME_TYPES: HashSet<&'static str> =
HashSet::from_iter(vec!["text/xml", "application/xml",]);
/// MIME types we might expect in content-type headers
static ref FEED_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
"text/xml",
"application/xml",
@ -108,6 +121,14 @@ lazy_static! {
"application/json",
"application/feed+json"
]);
/// MIME types we might expect in <link> tags
static ref FEED_LINK_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
"application/atom+xml",
"application/rss+xml",
"application/rdf+xml",
"application/feed+json"
]);
}
async fn response_to_bytes_limited(
@ -224,14 +245,10 @@ impl Raker {
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page)
{
match self.rake_html_page(&content, url, is_cf, &headers) {
Ok(page_rake) => {
return Ok(page_rake);
}
Err(error) => {
debug!("Failed to rake HTML page: {:?}", error);
}
}
// We don't try any fallbacks for an HTML page
return Ok(self
.rake_html_page(&content, url, is_cf, &headers)
.context("Raking HTML page")?);
}
if FEED_MIME_TYPES.contains(content_type.as_str())
@ -344,6 +361,7 @@ impl Raker {
let dense_doc = DenseTree::from_body(root_node.clone());
let dense_doc_text = Lazy::new(Box::new(|| DenseTree::generate_textual_format(&dense_doc)));
//eprintln!("^^^^^\n{}\n^^^^^", *dense_doc_text);
if language.is_none() {
// Final fallback: guess the language
@ -356,29 +374,149 @@ impl Raker {
normalise_language(language);
}
eprintln!("~~~~~\n{}\n~~~~~", *dense_doc_text);
eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc);
let mut title = "".to_owned();
let mut readability = quickpeep_moz_readability::Readability::new_from_node(root_node);
readability
.parse(url.as_str())
.context("failed to analyse readability")?;
if let Ok(title_node) = root_node.select_first("head title") {
title = title_node.text_contents();
}
let mut feeds = Vec::new();
let mut icon = None;
for link_node in root_node.select("head link").into_iter().flatten() {
if let Some(rel) = link_node.attributes.borrow().get("rel") {
let rels = rel.split_whitespace().collect_vec();
if rels.contains(&"icon") {
// This is an icon
if let Some(href) = link_node.attributes.borrow().get("href") {
let icon_url = url
.join(href)
.context("Failed to resolve or parse canonical URL to icon")?;
icon = Some(icon_url);
}
} else if rels.contains(&"alternate") {
if let Some(rel_type) = link_node.attributes.borrow().get("type") {
if FEED_LINK_MIME_TYPES.contains(rel_type) {
if let Some(href) = link_node.attributes.borrow().get("href") {
let feed_url = url
.join(href)
.context("Failed to resolve or parse canonical URL to feed")?;
feeds.push(feed_url);
}
}
}
}
}
}
let mut readability =
quickpeep_moz_readability::Readability::new_from_node(root_node.clone());
if let Err(err) = readability.parse(url.as_str()) {
debug!("Failed to analyse readability: {:?}", err);
}
eprintln!("{:#?}", readability.metadata);
if let Some(_node) = readability.article_node {
//eprintln!("{}", node.to_string());
if title.is_empty() && !readability.metadata.title().is_empty() {
// Fall back to the readability-derived page title
title = readability.metadata.title().to_owned();
}
let mut document = DenseDocument {
head: DenseHead {
title,
language: language.unwrap_or(String::with_capacity(0)),
icon: icon
.map(|url| url.as_str().to_owned())
.unwrap_or(String::with_capacity(0)),
},
body_content: Vec::with_capacity(0),
body_remainder: Vec::with_capacity(0),
};
if let Some(article_node) = readability.article_node {
document.body_remainder = DenseTree::from_body(root_node.clone());
document.body_content = DenseTree::from_body(article_node);
}
let bare_size = serde_bare::to_vec(&dense_doc)?.len();
eprintln!("CS {:?}{:?}", content.len(), bare_size);
let references = find_references(&document, &feeds, url);
Ok(RakeOutcome::RakedPage(RakedPage {
// TODO
page_entry: RakedPageEntry {
analysed_antifeatures: antifeature_flags,
document,
},
referrer_entry: RakedReferrerEntry { references },
}))
}
}
/// Collects the outgoing references of a raked page.
///
/// Both halves of the document body are walked in order (`body_content` first,
/// then `body_remainder`) and every followable hyperlink is recorded as a
/// [`ReferenceKind::Link`]. The feeds advertised in the page head are then
/// appended as [`ReferenceKind::HeaderLinkedFeed`].
///
/// # Parameters
/// * `doc` - the dense document whose body trees are scanned for links.
/// * `feeds` - already-resolved feed URLs discovered in the page head.
/// * `page_url` - base URL against which each link's `href` is resolved.
///
/// # Returns
/// The references in discovery order. Duplicates are *not* removed.
///
/// Links flagged `nofollow`, and links whose `href` cannot be joined onto
/// `page_url`, are silently skipped.
pub fn find_references(
    doc: &DenseDocument,
    feeds: &[Url],
    page_url: &Url,
) -> Vec<RakedReference> {
    /// Recursively appends every followable link found in `tree` to `refs`.
    fn add_link_refs(tree: &[DenseTree], refs: &mut Vec<RakedReference>, page_url: &Url) {
        for node in tree {
            match node {
                // Headings carry no reference themselves; only their children may.
                DenseTree::Heading1(children)
                | DenseTree::Heading2(children)
                | DenseTree::Heading3(children)
                | DenseTree::Heading4(children)
                | DenseTree::Heading5(children)
                | DenseTree::Heading6(children) => {
                    add_link_refs(children, refs, page_url);
                }
                DenseTree::Link {
                    children,
                    href,
                    nofollow,
                } => {
                    if !nofollow {
                        // An href that cannot be resolved is ignored rather than
                        // failing the whole page.
                        if let Ok(full_url) = page_url.join(href) {
                            refs.push(RakedReference {
                                target: full_url.to_string(),
                                // A hyperlink in the body is a plain link, not a
                                // canonical-URL declaration.
                                kind: ReferenceKind::Link,
                            });
                        }
                    }
                    // Descend regardless of `nofollow`, exactly as for headings.
                    add_link_refs(children, refs, page_url);
                }
                // Leaf nodes: nothing to collect. Listed explicitly (no `_` arm) so
                // that adding a new `DenseTree` variant is a compile error here.
                DenseTree::Image { .. } | DenseTree::Text(_) => {}
            }
        }
    }

    let mut refs = Vec::new();

    add_link_refs(&doc.body_content, &mut refs, page_url);
    add_link_refs(&doc.body_remainder, &mut refs, page_url);

    refs.extend(feeds.iter().map(|feed| RakedReference {
        target: feed.as_str().to_owned(),
        kind: ReferenceKind::HeaderLinkedFeed,
    }));

    refs
}
pub fn normalise_language(lang_string: &mut String) {
*lang_string = lang_string.to_lowercase();
let mut pieces = lang_string

View File

@ -7,24 +7,18 @@ use std::ops::Deref;
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct DenseDocument {
head: DenseHead,
body: Vec<DenseTree>,
}
impl DenseDocument {
pub fn from_document(_root_node: NodeRef) {
todo!()
}
pub head: DenseHead,
pub body_content: Vec<DenseTree>,
pub body_remainder: Vec<DenseTree>,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct DenseHead {
title: String,
feed_urls: Vec<String>,
pub title: String,
/// Language of the page. May be empty if not discovered.
language: String,
pub language: String,
/// URL to icon of the page. May be empty if none were discovered.
icon: String,
pub icon: String,
}
#[derive(Serialize, Deserialize, Clone, Debug)]

View File

@ -7,5 +7,7 @@ edition = "2021"
[dependencies]
bitflags = "1.3.2"
bitflags_serde_shim = "0.2.2"
#arc-interner = "0.7.0"
quickpeep_densedoc = { path = "../quickpeep_densedoc" }
quickpeep_densedoc = { path = "../quickpeep_densedoc" }
serde = { version = "1.0.136", features = ["derive"] }

View File

@ -1,4 +1,7 @@
use bitflags::bitflags;
use bitflags_serde_shim::impl_serde_for_bitflags;
use quickpeep_densedoc::DenseDocument;
use serde::{Deserialize, Serialize};
bitflags! {
pub struct AnalysisAntifeatures: u8 {
@ -17,8 +20,31 @@ bitflags! {
}
}
impl_serde_for_bitflags!(AnalysisAntifeatures);
/// Page-level result of a rake: the antifeatures detected on the page together
/// with its dense document.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct RakedPageEntry {
/// Antifeature flags determined while analysing the page.
pub analysed_antifeatures: AnalysisAntifeatures,
//pub article: Option<DenseTree>,
//pub non_article: Option<DenseTree>,
/// Dense representation of the page.
pub document: DenseDocument,
}
/// The set of references that a raked resource points at.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct RakedReferrerEntry {
/// Every reference found, each tagged with how it was discovered.
pub references: Vec<RakedReference>,
}
/// A single reference from a raked resource to another URL.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct RakedReference {
/// The referenced URL, stored as a string.
pub target: String,
/// How the reference was discovered.
pub kind: ReferenceKind,
}
/// The way in which a [`RakedReference`] was discovered.
///
/// NOTE(review): apart from `HeaderLinkedFeed`, the per-variant notes below are
/// inferred from the variant names only — confirm against the code that emits them.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub enum ReferenceKind {
/// Presumably a URL declared as the canonical location of the resource; confirm.
CanonicalUrl,
/// Presumably the target of a redirect; confirm.
Redirect,
/// Presumably an ordinary hyperlink; confirm.
Link,
/// A feed advertised by a `<link rel="alternate">` element in the page head.
HeaderLinkedFeed,
/// Presumably a URL listed as an entry of a feed; confirm.
FeedEntry,
/// Presumably a URL listed in a sitemap; confirm.
SitemapEntry,
}

9
scripts/get_psl.sh Executable file
View File

@ -0,0 +1,9 @@
#!/bin/sh
# Downloads the Public Suffix List into ../data (relative to this script),
# creating that directory if necessary.
#
# The list is fetched into a temporary file next to the destination and only
# renamed into place once wget has succeeded, so a failed or interrupted
# download never leaves an empty or truncated list behind (plain `wget -O`
# creates/truncates its output file even when the transfer fails).
set -eu

dir_path="$(dirname "$0")"
data_dir="$dir_path/../data"
dest="$data_dir/public_suffices.list"
tmp="$dest.part"

mkdir -p "$data_dir"

# Remove any partial download on exit; harmless after a successful rename.
trap 'rm -f "$tmp"' EXIT

wget -O "$tmp" https://publicsuffix.org/list/public_suffix_list.dat
mv "$tmp" "$dest"