Clean-ups and support pulling out references
This commit is contained in:
parent
5a94c825d7
commit
601ec553b5
163
Cargo.lock
generated
163
Cargo.lock
generated
@ -86,6 +86,16 @@ version = "1.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags_serde_shim"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "25c3d626f0280ec39b33a6fc5c6c1067432b4c41e94aee40ded197a6649bf025"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bumpalo"
|
||||
version = "3.9.1"
|
||||
@ -159,6 +169,45 @@ dependencies = [
|
||||
"chrono",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "3.1.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d8c93436c21e4698bacadf42917db28b23017027a4deccb35dbe47a7e7840123"
|
||||
dependencies = [
|
||||
"atty",
|
||||
"bitflags",
|
||||
"clap_derive",
|
||||
"indexmap",
|
||||
"lazy_static",
|
||||
"os_str_bytes",
|
||||
"strsim",
|
||||
"termcolor",
|
||||
"textwrap",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_derive"
|
||||
version = "3.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da95d038ede1a964ce99f49cbe27a7fb538d1da595e4b4f70b8c8f338d17bf16"
|
||||
dependencies = [
|
||||
"heck 0.4.0",
|
||||
"proc-macro-error",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "colour"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a27e4532f26f510c24bb8477d963c0c3ef27e293c3b2c507cccb0536d493201a"
|
||||
dependencies = [
|
||||
"crossterm",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "convert_case"
|
||||
version = "0.4.0"
|
||||
@ -234,6 +283,31 @@ dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossterm"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c36c10130df424b2f3552fcc2ddcd9b28a27b1e54b358b45874f88d1ca6888c"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"crossterm_winapi",
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"mio 0.7.14",
|
||||
"parking_lot 0.11.2",
|
||||
"signal-hook",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossterm_winapi"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0da8964ace4d3e4a044fd027919b2237000b24315a37c916f61809f1ff2140b9"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cssparser"
|
||||
version = "0.27.2"
|
||||
@ -560,6 +634,12 @@ dependencies = [
|
||||
"unicode-segmentation",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9"
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.1.19"
|
||||
@ -1635,6 +1715,19 @@ dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mio"
|
||||
version = "0.7.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8067b404fe97c70829f082dec8bcf4f71225d7eaea1d8645349cb76fa06205cc"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"log",
|
||||
"miow",
|
||||
"ntapi",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mio"
|
||||
version = "0.8.1"
|
||||
@ -1823,6 +1916,15 @@ dependencies = [
|
||||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "os_str_bytes"
|
||||
version = "6.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.11.2"
|
||||
@ -1970,6 +2072,30 @@ version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-error"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
|
||||
dependencies = [
|
||||
"proc-macro-error-attr",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-error-attr"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-hack"
|
||||
version = "0.5.19"
|
||||
@ -2018,6 +2144,8 @@ dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap",
|
||||
"colour",
|
||||
"cylon",
|
||||
"env_logger",
|
||||
"feed-rs",
|
||||
@ -2071,7 +2199,9 @@ name = "quickpeep_structs"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bitflags_serde_shim",
|
||||
"quickpeep_densedoc",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -2459,6 +2589,17 @@ dependencies = [
|
||||
"stable_deref_trait",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook"
|
||||
version = "0.1.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7e31d442c16f047a671b5a71e2161d6e68814012b7f5379d269ebd915fac2729"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"mio 0.7.14",
|
||||
"signal-hook-registry",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook-registry"
|
||||
version = "1.4.0"
|
||||
@ -2546,6 +2687,12 @@ dependencies = [
|
||||
"quote",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
|
||||
|
||||
[[package]]
|
||||
name = "strum"
|
||||
version = "0.23.0"
|
||||
@ -2558,7 +2705,7 @@ version = "0.23.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"heck 0.3.3",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rustversion",
|
||||
@ -2610,6 +2757,12 @@ dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "textwrap"
|
||||
version = "0.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb"
|
||||
|
||||
[[package]]
|
||||
name = "thin-slice"
|
||||
version = "0.1.1"
|
||||
@ -2670,7 +2823,7 @@ dependencies = [
|
||||
"bytes",
|
||||
"libc",
|
||||
"memchr",
|
||||
"mio",
|
||||
"mio 0.8.1",
|
||||
"num_cpus",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.0",
|
||||
@ -2841,6 +2994,12 @@ version = "0.2.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
|
||||
|
||||
[[package]]
|
||||
name = "want"
|
||||
version = "0.3.0"
|
||||
|
@ -6,36 +6,35 @@ edition = "2021"
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
### Subcrates
|
||||
quickpeep_moz_readability = { path = "../quickpeep_moz_readability" }
|
||||
quickpeep_densedoc = { path = "../quickpeep_densedoc" }
|
||||
quickpeep_structs = { path = "../quickpeep_structs" }
|
||||
quickpeep_utils = { path = "../quickpeep_utils" }
|
||||
|
||||
### CLI Helpers
|
||||
clap = { version = "3.1.6", features = ["derive"] }
|
||||
colour = "0.6.0"
|
||||
|
||||
### Document Parsing
|
||||
kuchiki = "0.8.1"
|
||||
html5ever = "0.25.1"
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
serde_bare = "0.5.0"
|
||||
|
||||
### Dates
|
||||
chrono = "0.4.19"
|
||||
|
||||
### Utils
|
||||
lazy_static = "1.4.0"
|
||||
bytes = "1.1.0"
|
||||
itertools = "0.10.3"
|
||||
ipnetwork = "0.18.0"
|
||||
futures-util = "0.3.21"
|
||||
tokio = { version = "1.17.0", features = ["full"] }
|
||||
anyhow = "1.0.55"
|
||||
log = "0.4.14"
|
||||
env_logger = "0.9.0"
|
||||
quickpeep_moz_readability = { path = "../quickpeep_moz_readability" }
|
||||
quickpeep_densedoc = { path = "../quickpeep_densedoc" }
|
||||
|
||||
# TODO: why do we need these here?
|
||||
kuchiki = "0.8.1"
|
||||
html5ever = "0.25.1"
|
||||
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
serde_bare = "0.5.0"
|
||||
|
||||
chrono = "0.4.19"
|
||||
|
||||
lazy_static = "1.4.0"
|
||||
|
||||
bytes = "1.1.0"
|
||||
|
||||
itertools = "0.10.3"
|
||||
|
||||
quickpeep_structs = { path = "../quickpeep_structs" }
|
||||
ipnetwork = "0.18.0"
|
||||
|
||||
futures-util = "0.3.21"
|
||||
|
||||
lingua = "1.3.3"
|
||||
|
||||
quickpeep_utils = { path = "../quickpeep_utils" }
|
||||
|
||||
### Raking helpers
|
||||
# HTTP Requests
|
||||
@ -53,3 +52,5 @@ sitemap = "0.4.1"
|
||||
### Filtering helpers
|
||||
# AdBlock
|
||||
adblock = "0.5.0"
|
||||
# Language detection
|
||||
lingua = "1.3.3"
|
@ -1,15 +1,17 @@
|
||||
use adblock::lists::RuleTypes;
|
||||
use anyhow::Context;
|
||||
use clap::Parser;
|
||||
use colour::{blue_ln, green_ln, red_ln, yellow_ln};
|
||||
use env_logger::Env;
|
||||
use log::warn;
|
||||
use quickpeep::raking::analysis::{load_adblock_engine, IpSet};
|
||||
use quickpeep::raking::RakeIntent;
|
||||
use quickpeep::raking::{RakeIntent, RakeOutcome};
|
||||
use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
|
||||
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
|
||||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
||||
use reqwest::redirect::Policy;
|
||||
use reqwest::Url;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use tokio::fs::File;
|
||||
|
||||
pub const ADBLOCK_FILTER_PATHS: [(AnalysisAntifeatures, &'static str); 4] = [
|
||||
@ -19,8 +21,18 @@ pub const ADBLOCK_FILTER_PATHS: [(AnalysisAntifeatures, &'static str); 4] = [
|
||||
(AnalysisAntifeatures::ADVERTS, "adverts"),
|
||||
];
|
||||
|
||||
/// Rakes one URL and prints out the description of it.
|
||||
#[derive(Clone, Debug, Parser)]
|
||||
pub struct Opts {
|
||||
url: Url,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
pub async fn main() -> anyhow::Result<()> {
|
||||
env_logger::Builder::from_env(Env::default().default_filter_or("info,quickpeep=debug")).init();
|
||||
|
||||
let opts: Opts = Opts::parse();
|
||||
|
||||
let mut header_map = HeaderMap::new();
|
||||
header_map.insert(USER_AGENT, HeaderValue::from_static(RAKER_USER_AGENT));
|
||||
|
||||
@ -59,35 +71,33 @@ pub async fn main() -> anyhow::Result<()> {
|
||||
antifeature_ip_set,
|
||||
};
|
||||
|
||||
// raker.rake(
|
||||
// &Url::from_str("http://nothings.org/gamedev/ssao/")?,
|
||||
// RakeIntent::Page,
|
||||
// &client,
|
||||
// )
|
||||
// .await?;
|
||||
//
|
||||
// raker.rake(
|
||||
// &Url::from_str("https://github.com/kuchiki-rs/kuchiki")?,
|
||||
// RakeIntent::Page,
|
||||
// &client,
|
||||
// )
|
||||
// .await?;
|
||||
let outcome = raker.rake(&opts.url, RakeIntent::Any, &client).await?;
|
||||
|
||||
raker
|
||||
.rake(
|
||||
&Url::from_str("https://www.thesprucepets.com/")?,
|
||||
RakeIntent::Page,
|
||||
&client,
|
||||
)
|
||||
.await?;
|
||||
|
||||
raker
|
||||
.rake(
|
||||
&Url::from_str("https://matrix.org/")?,
|
||||
RakeIntent::Page,
|
||||
&client,
|
||||
)
|
||||
.await?;
|
||||
match outcome {
|
||||
RakeOutcome::RakedPage(page) => {
|
||||
let content_size = serde_bare::to_vec(&page)?.len();
|
||||
green_ln!("Page ({} bytes)", content_size);
|
||||
// TODO
|
||||
}
|
||||
RakeOutcome::RakedFeed(feed) => {
|
||||
green_ln!("Feed");
|
||||
// TODO
|
||||
}
|
||||
RakeOutcome::RakedSitemap(sitemap) => {
|
||||
green_ln!("Sitemap");
|
||||
// TODO
|
||||
}
|
||||
RakeOutcome::Redirect { reason, new_url } => {
|
||||
blue_ln!("Redirect ({:?})", reason);
|
||||
println!(" → {}", new_url.as_str());
|
||||
}
|
||||
RakeOutcome::TemporaryFailure(fail) => {
|
||||
yellow_ln!("Temporary Failure\n\t{:?}", &fail.reason);
|
||||
}
|
||||
RakeOutcome::PermanentFailure(fail) => {
|
||||
red_ln!("Permanent Failure\n\t{:?}", &fail.reason)
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -12,8 +12,10 @@ use kuchiki::traits::TendrilSink;
|
||||
use kuchiki::NodeRef;
|
||||
use lazy_static::lazy_static;
|
||||
use log::debug;
|
||||
use quickpeep_densedoc::DenseTree;
|
||||
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
|
||||
use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
|
||||
use quickpeep_structs::rake_entries::{
|
||||
AnalysisAntifeatures, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind,
|
||||
};
|
||||
use quickpeep_utils::Lazy;
|
||||
use reqwest::header::HeaderMap;
|
||||
use reqwest::{Client, Response, Url};
|
||||
@ -44,6 +46,7 @@ pub enum RakeOutcome {
|
||||
PermanentFailure(PermanentFailure),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum RedirectReason {
|
||||
/// The page redirected somewhere else.
|
||||
Redirected {
|
||||
@ -61,27 +64,35 @@ pub struct UrlRaked {
|
||||
pub intent: RakeIntent,
|
||||
}
|
||||
|
||||
pub struct RakedPage {}
|
||||
#[derive(Serialize)]
|
||||
pub struct RakedPage {
|
||||
page_entry: RakedPageEntry,
|
||||
referrer_entry: RakedReferrerEntry,
|
||||
}
|
||||
|
||||
pub struct RobotsTxt {
|
||||
pub sitemaps: Vec<UrlRaked>,
|
||||
pub rules: Cylon,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct TemporaryFailure {
|
||||
pub reason: TemporaryFailureReason,
|
||||
pub backoff_sec: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PermanentFailure {
|
||||
pub reason: PermanentFailureReason,
|
||||
}
|
||||
|
||||
/// The cause attached to a [`TemporaryFailure`].
#[derive(Debug)]
pub enum TemporaryFailureReason {
    /// Some required information was missing; the string describes what.
    MissingInformation(String),

    /// The server answered with an error.
    // NOTE(review): the `u16` is presumably the HTTP status code — confirm.
    ServerError(u16),
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum PermanentFailureReason {
|
||||
ResourceDenied(u16),
|
||||
WrongLanguage(String),
|
||||
@ -99,6 +110,8 @@ pub enum RakeIntent {
|
||||
lazy_static! {
|
||||
static ref SITEMAP_MIME_TYPES: HashSet<&'static str> =
|
||||
HashSet::from_iter(vec!["text/xml", "application/xml",]);
|
||||
|
||||
/// MIME types we might expect in content-type headers
|
||||
static ref FEED_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
|
||||
"text/xml",
|
||||
"application/xml",
|
||||
@ -108,6 +121,14 @@ lazy_static! {
|
||||
"application/json",
|
||||
"application/feed+json"
|
||||
]);
|
||||
|
||||
/// MIME types we might expect in <link> tags
|
||||
static ref FEED_LINK_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
|
||||
"application/atom+xml",
|
||||
"application/rss+xml",
|
||||
"application/rdf+xml",
|
||||
"application/feed+json"
|
||||
]);
|
||||
}
|
||||
|
||||
async fn response_to_bytes_limited(
|
||||
@ -224,14 +245,10 @@ impl Raker {
|
||||
|
||||
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page)
|
||||
{
|
||||
match self.rake_html_page(&content, url, is_cf, &headers) {
|
||||
Ok(page_rake) => {
|
||||
return Ok(page_rake);
|
||||
}
|
||||
Err(error) => {
|
||||
debug!("Failed to rake HTML page: {:?}", error);
|
||||
}
|
||||
}
|
||||
// We don't try any fallbacks for an HTML page
|
||||
return Ok(self
|
||||
.rake_html_page(&content, url, is_cf, &headers)
|
||||
.context("Raking HTML page")?);
|
||||
}
|
||||
|
||||
if FEED_MIME_TYPES.contains(content_type.as_str())
|
||||
@ -344,6 +361,7 @@ impl Raker {
|
||||
|
||||
let dense_doc = DenseTree::from_body(root_node.clone());
|
||||
let dense_doc_text = Lazy::new(Box::new(|| DenseTree::generate_textual_format(&dense_doc)));
|
||||
//eprintln!("^^^^^\n{}\n^^^^^", *dense_doc_text);
|
||||
|
||||
if language.is_none() {
|
||||
// Final fallback: guess the language
|
||||
@ -356,29 +374,149 @@ impl Raker {
|
||||
normalise_language(language);
|
||||
}
|
||||
|
||||
eprintln!("~~~~~\n{}\n~~~~~", *dense_doc_text);
|
||||
eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc);
|
||||
let mut title = "".to_owned();
|
||||
|
||||
let mut readability = quickpeep_moz_readability::Readability::new_from_node(root_node);
|
||||
readability
|
||||
.parse(url.as_str())
|
||||
.context("failed to analyse readability")?;
|
||||
if let Ok(title_node) = root_node.select_first("head title") {
|
||||
title = title_node.text_contents();
|
||||
}
|
||||
|
||||
let mut feeds = Vec::new();
|
||||
let mut icon = None;
|
||||
|
||||
for link_node in root_node.select("head link").into_iter().flatten() {
|
||||
if let Some(rel) = link_node.attributes.borrow().get("rel") {
|
||||
let rels = rel.split_whitespace().collect_vec();
|
||||
if rels.contains(&"icon") {
|
||||
// This is an icon
|
||||
if let Some(href) = link_node.attributes.borrow().get("href") {
|
||||
let icon_url = url
|
||||
.join(href)
|
||||
.context("Failed to resolve or parse canonical URL to icon")?;
|
||||
|
||||
icon = Some(icon_url);
|
||||
}
|
||||
} else if rels.contains(&"alternate") {
|
||||
if let Some(rel_type) = link_node.attributes.borrow().get("type") {
|
||||
if FEED_LINK_MIME_TYPES.contains(rel_type) {
|
||||
if let Some(href) = link_node.attributes.borrow().get("href") {
|
||||
let feed_url = url
|
||||
.join(href)
|
||||
.context("Failed to resolve or parse canonical URL to feed")?;
|
||||
|
||||
feeds.push(feed_url);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut readability =
|
||||
quickpeep_moz_readability::Readability::new_from_node(root_node.clone());
|
||||
if let Err(err) = readability.parse(url.as_str()) {
|
||||
debug!("Failed to analyse readability: {:?}", err);
|
||||
}
|
||||
|
||||
eprintln!("{:#?}", readability.metadata);
|
||||
|
||||
if let Some(_node) = readability.article_node {
|
||||
//eprintln!("{}", node.to_string());
|
||||
if title.is_empty() && !readability.metadata.title().is_empty() {
|
||||
// Fall back to the readability-derived page title
|
||||
title = readability.metadata.title().to_owned();
|
||||
}
|
||||
|
||||
let mut document = DenseDocument {
|
||||
head: DenseHead {
|
||||
title,
|
||||
language: language.unwrap_or(String::with_capacity(0)),
|
||||
icon: icon
|
||||
.map(|url| url.as_str().to_owned())
|
||||
.unwrap_or(String::with_capacity(0)),
|
||||
},
|
||||
body_content: Vec::with_capacity(0),
|
||||
body_remainder: Vec::with_capacity(0),
|
||||
};
|
||||
|
||||
if let Some(article_node) = readability.article_node {
|
||||
document.body_remainder = DenseTree::from_body(root_node.clone());
|
||||
document.body_content = DenseTree::from_body(article_node);
|
||||
}
|
||||
|
||||
let bare_size = serde_bare::to_vec(&dense_doc)?.len();
|
||||
eprintln!("CS {:?} → {:?}", content.len(), bare_size);
|
||||
|
||||
let references = find_references(&document, &feeds, url);
|
||||
Ok(RakeOutcome::RakedPage(RakedPage {
|
||||
// TODO
|
||||
page_entry: RakedPageEntry {
|
||||
analysed_antifeatures: antifeature_flags,
|
||||
document,
|
||||
},
|
||||
referrer_entry: RakedReferrerEntry { references },
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn find_references(
|
||||
doc: &DenseDocument,
|
||||
feeds: &Vec<Url>,
|
||||
page_url: &Url,
|
||||
) -> Vec<RakedReference> {
|
||||
let mut refs = Vec::new();
|
||||
|
||||
fn add_link_refs(tree: &Vec<DenseTree>, refs: &mut Vec<RakedReference>, page_url: &Url) {
|
||||
for node in tree {
|
||||
match node {
|
||||
DenseTree::Heading1(children) => {
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Heading2(children) => {
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Heading3(children) => {
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Heading4(children) => {
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Heading5(children) => {
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Heading6(children) => {
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Link {
|
||||
children,
|
||||
href,
|
||||
nofollow,
|
||||
} => {
|
||||
if !nofollow {
|
||||
if let Ok(full_url) = page_url.join(&href) {
|
||||
refs.push(RakedReference {
|
||||
target: full_url.to_string(),
|
||||
kind: ReferenceKind::CanonicalUrl,
|
||||
})
|
||||
}
|
||||
}
|
||||
add_link_refs(children, refs, page_url);
|
||||
}
|
||||
DenseTree::Image { .. } => {}
|
||||
DenseTree::Text(_) => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
add_link_refs(&doc.body_content, &mut refs, &page_url);
|
||||
add_link_refs(&doc.body_remainder, &mut refs, &page_url);
|
||||
|
||||
for feed in feeds {
|
||||
refs.push(RakedReference {
|
||||
target: feed.as_str().to_owned(),
|
||||
kind: ReferenceKind::HeaderLinkedFeed,
|
||||
});
|
||||
}
|
||||
|
||||
refs
|
||||
}
|
||||
|
||||
pub fn normalise_language(lang_string: &mut String) {
|
||||
*lang_string = lang_string.to_lowercase();
|
||||
let mut pieces = lang_string
|
||||
|
@ -7,24 +7,18 @@ use std::ops::Deref;
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct DenseDocument {
|
||||
head: DenseHead,
|
||||
body: Vec<DenseTree>,
|
||||
}
|
||||
|
||||
impl DenseDocument {
|
||||
pub fn from_document(_root_node: NodeRef) {
|
||||
todo!()
|
||||
}
|
||||
pub head: DenseHead,
|
||||
pub body_content: Vec<DenseTree>,
|
||||
pub body_remainder: Vec<DenseTree>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct DenseHead {
|
||||
title: String,
|
||||
feed_urls: Vec<String>,
|
||||
pub title: String,
|
||||
/// Language of the page. May be empty if not discovered.
|
||||
language: String,
|
||||
pub language: String,
|
||||
/// URL to icon of the page. May be empty if none were discovered.
|
||||
icon: String,
|
||||
pub icon: String,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
|
@ -7,5 +7,7 @@ edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
bitflags = "1.3.2"
|
||||
bitflags_serde_shim = "0.2.2"
|
||||
#arc-interner = "0.7.0"
|
||||
quickpeep_densedoc = { path = "../quickpeep_densedoc" }
|
||||
quickpeep_densedoc = { path = "../quickpeep_densedoc" }
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
|
@ -1,4 +1,7 @@
|
||||
use bitflags::bitflags;
|
||||
use bitflags_serde_shim::impl_serde_for_bitflags;
|
||||
use quickpeep_densedoc::DenseDocument;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
bitflags! {
|
||||
pub struct AnalysisAntifeatures: u8 {
|
||||
@ -17,8 +20,31 @@ bitflags! {
|
||||
}
|
||||
}
|
||||
|
||||
impl_serde_for_bitflags!(AnalysisAntifeatures);
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
pub struct RakedPageEntry {
|
||||
pub analysed_antifeatures: AnalysisAntifeatures,
|
||||
//pub article: Option<DenseTree>,
|
||||
//pub non_article: Option<DenseTree>,
|
||||
pub document: DenseDocument,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
pub struct RakedReferrerEntry {
|
||||
pub references: Vec<RakedReference>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
pub struct RakedReference {
|
||||
pub target: String,
|
||||
pub kind: ReferenceKind,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
pub enum ReferenceKind {
|
||||
CanonicalUrl,
|
||||
Redirect,
|
||||
Link,
|
||||
HeaderLinkedFeed,
|
||||
FeedEntry,
|
||||
SitemapEntry,
|
||||
}
|
||||
|
9
scripts/get_psl.sh
Executable file
9
scripts/get_psl.sh
Executable file
@ -0,0 +1,9 @@
|
||||
#!/bin/sh
# Downloads the Public Suffix List into the `data` directory that sits next to
# this script's directory, as `public_suffices.list`.
#
# The list is first written to a temporary sibling file and only renamed into
# place once the download has succeeded, so a failed or interrupted download
# never leaves a truncated list behind or clobbers an existing good copy.

set -eu

dir_path="$(dirname "$0")"
data_dir="$dir_path/../data"
target="$data_dir/public_suffices.list"
partial="$target.part"

mkdir -p "$data_dir"

# Remove the partial file on any exit; after a successful `mv` it no longer
# exists and `rm -f` is a no-op.
trap 'rm -f "$partial"' EXIT

wget -O "$partial" https://publicsuffix.org/list/public_suffix_list.dat
mv -f "$partial" "$target"
|
||||
|
Loading…
Reference in New Issue
Block a user