Clean-ups and support pulling out references
This commit is contained in:
parent
5a94c825d7
commit
601ec553b5
163
Cargo.lock
generated
163
Cargo.lock
generated
@ -86,6 +86,16 @@ version = "1.3.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bitflags_serde_shim"
|
||||||
|
version = "0.2.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "25c3d626f0280ec39b33a6fc5c6c1067432b4c41e94aee40ded197a6649bf025"
|
||||||
|
dependencies = [
|
||||||
|
"bitflags",
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bumpalo"
|
name = "bumpalo"
|
||||||
version = "3.9.1"
|
version = "3.9.1"
|
||||||
@ -159,6 +169,45 @@ dependencies = [
|
|||||||
"chrono",
|
"chrono",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap"
|
||||||
|
version = "3.1.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d8c93436c21e4698bacadf42917db28b23017027a4deccb35dbe47a7e7840123"
|
||||||
|
dependencies = [
|
||||||
|
"atty",
|
||||||
|
"bitflags",
|
||||||
|
"clap_derive",
|
||||||
|
"indexmap",
|
||||||
|
"lazy_static",
|
||||||
|
"os_str_bytes",
|
||||||
|
"strsim",
|
||||||
|
"termcolor",
|
||||||
|
"textwrap",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap_derive"
|
||||||
|
version = "3.1.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "da95d038ede1a964ce99f49cbe27a7fb538d1da595e4b4f70b8c8f338d17bf16"
|
||||||
|
dependencies = [
|
||||||
|
"heck 0.4.0",
|
||||||
|
"proc-macro-error",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "colour"
|
||||||
|
version = "0.6.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a27e4532f26f510c24bb8477d963c0c3ef27e293c3b2c507cccb0536d493201a"
|
||||||
|
dependencies = [
|
||||||
|
"crossterm",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "convert_case"
|
name = "convert_case"
|
||||||
version = "0.4.0"
|
version = "0.4.0"
|
||||||
@ -234,6 +283,31 @@ dependencies = [
|
|||||||
"lazy_static",
|
"lazy_static",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "crossterm"
|
||||||
|
version = "0.19.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7c36c10130df424b2f3552fcc2ddcd9b28a27b1e54b358b45874f88d1ca6888c"
|
||||||
|
dependencies = [
|
||||||
|
"bitflags",
|
||||||
|
"crossterm_winapi",
|
||||||
|
"lazy_static",
|
||||||
|
"libc",
|
||||||
|
"mio 0.7.14",
|
||||||
|
"parking_lot 0.11.2",
|
||||||
|
"signal-hook",
|
||||||
|
"winapi",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "crossterm_winapi"
|
||||||
|
version = "0.7.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0da8964ace4d3e4a044fd027919b2237000b24315a37c916f61809f1ff2140b9"
|
||||||
|
dependencies = [
|
||||||
|
"winapi",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cssparser"
|
name = "cssparser"
|
||||||
version = "0.27.2"
|
version = "0.27.2"
|
||||||
@ -560,6 +634,12 @@ dependencies = [
|
|||||||
"unicode-segmentation",
|
"unicode-segmentation",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "heck"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "hermit-abi"
|
name = "hermit-abi"
|
||||||
version = "0.1.19"
|
version = "0.1.19"
|
||||||
@ -1635,6 +1715,19 @@ dependencies = [
|
|||||||
"autocfg",
|
"autocfg",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "mio"
|
||||||
|
version = "0.7.14"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8067b404fe97c70829f082dec8bcf4f71225d7eaea1d8645349cb76fa06205cc"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"log",
|
||||||
|
"miow",
|
||||||
|
"ntapi",
|
||||||
|
"winapi",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mio"
|
name = "mio"
|
||||||
version = "0.8.1"
|
version = "0.8.1"
|
||||||
@ -1823,6 +1916,15 @@ dependencies = [
|
|||||||
"vcpkg",
|
"vcpkg",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "os_str_bytes"
|
||||||
|
version = "6.0.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "parking_lot"
|
name = "parking_lot"
|
||||||
version = "0.11.2"
|
version = "0.11.2"
|
||||||
@ -1970,6 +2072,30 @@ version = "0.1.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
|
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "proc-macro-error"
|
||||||
|
version = "1.0.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro-error-attr",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
"version_check",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "proc-macro-error-attr"
|
||||||
|
version = "1.0.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"version_check",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "proc-macro-hack"
|
name = "proc-macro-hack"
|
||||||
version = "0.5.19"
|
version = "0.5.19"
|
||||||
@ -2018,6 +2144,8 @@ dependencies = [
|
|||||||
"anyhow",
|
"anyhow",
|
||||||
"bytes",
|
"bytes",
|
||||||
"chrono",
|
"chrono",
|
||||||
|
"clap",
|
||||||
|
"colour",
|
||||||
"cylon",
|
"cylon",
|
||||||
"env_logger",
|
"env_logger",
|
||||||
"feed-rs",
|
"feed-rs",
|
||||||
@ -2071,7 +2199,9 @@ name = "quickpeep_structs"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bitflags",
|
"bitflags",
|
||||||
|
"bitflags_serde_shim",
|
||||||
"quickpeep_densedoc",
|
"quickpeep_densedoc",
|
||||||
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -2459,6 +2589,17 @@ dependencies = [
|
|||||||
"stable_deref_trait",
|
"stable_deref_trait",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "signal-hook"
|
||||||
|
version = "0.1.17"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7e31d442c16f047a671b5a71e2161d6e68814012b7f5379d269ebd915fac2729"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"mio 0.7.14",
|
||||||
|
"signal-hook-registry",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "signal-hook-registry"
|
name = "signal-hook-registry"
|
||||||
version = "1.4.0"
|
version = "1.4.0"
|
||||||
@ -2546,6 +2687,12 @@ dependencies = [
|
|||||||
"quote",
|
"quote",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "strsim"
|
||||||
|
version = "0.10.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "strum"
|
name = "strum"
|
||||||
version = "0.23.0"
|
version = "0.23.0"
|
||||||
@ -2558,7 +2705,7 @@ version = "0.23.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38"
|
checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"heck",
|
"heck 0.3.3",
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"rustversion",
|
"rustversion",
|
||||||
@ -2610,6 +2757,12 @@ dependencies = [
|
|||||||
"winapi-util",
|
"winapi-util",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "textwrap"
|
||||||
|
version = "0.15.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "thin-slice"
|
name = "thin-slice"
|
||||||
version = "0.1.1"
|
version = "0.1.1"
|
||||||
@ -2670,7 +2823,7 @@ dependencies = [
|
|||||||
"bytes",
|
"bytes",
|
||||||
"libc",
|
"libc",
|
||||||
"memchr",
|
"memchr",
|
||||||
"mio",
|
"mio 0.8.1",
|
||||||
"num_cpus",
|
"num_cpus",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"parking_lot 0.12.0",
|
"parking_lot 0.12.0",
|
||||||
@ -2841,6 +2994,12 @@ version = "0.2.15"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "version_check"
|
||||||
|
version = "0.9.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "want"
|
name = "want"
|
||||||
version = "0.3.0"
|
version = "0.3.0"
|
||||||
|
@ -6,36 +6,35 @@ edition = "2021"
|
|||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
### Subcrates
|
||||||
|
quickpeep_moz_readability = { path = "../quickpeep_moz_readability" }
|
||||||
|
quickpeep_densedoc = { path = "../quickpeep_densedoc" }
|
||||||
|
quickpeep_structs = { path = "../quickpeep_structs" }
|
||||||
|
quickpeep_utils = { path = "../quickpeep_utils" }
|
||||||
|
|
||||||
|
### CLI Helpers
|
||||||
|
clap = { version = "3.1.6", features = ["derive"] }
|
||||||
|
colour = "0.6.0"
|
||||||
|
|
||||||
|
### Document Parsing
|
||||||
|
kuchiki = "0.8.1"
|
||||||
|
html5ever = "0.25.1"
|
||||||
|
serde = { version = "1.0.136", features = ["derive"] }
|
||||||
|
serde_bare = "0.5.0"
|
||||||
|
|
||||||
|
### Dates
|
||||||
|
chrono = "0.4.19"
|
||||||
|
|
||||||
|
### Utils
|
||||||
|
lazy_static = "1.4.0"
|
||||||
|
bytes = "1.1.0"
|
||||||
|
itertools = "0.10.3"
|
||||||
|
ipnetwork = "0.18.0"
|
||||||
|
futures-util = "0.3.21"
|
||||||
tokio = { version = "1.17.0", features = ["full"] }
|
tokio = { version = "1.17.0", features = ["full"] }
|
||||||
anyhow = "1.0.55"
|
anyhow = "1.0.55"
|
||||||
log = "0.4.14"
|
log = "0.4.14"
|
||||||
env_logger = "0.9.0"
|
env_logger = "0.9.0"
|
||||||
quickpeep_moz_readability = { path = "../quickpeep_moz_readability" }
|
|
||||||
quickpeep_densedoc = { path = "../quickpeep_densedoc" }
|
|
||||||
|
|
||||||
# TODO: why do we need these here?
|
|
||||||
kuchiki = "0.8.1"
|
|
||||||
html5ever = "0.25.1"
|
|
||||||
|
|
||||||
serde = { version = "1.0.136", features = ["derive"] }
|
|
||||||
serde_bare = "0.5.0"
|
|
||||||
|
|
||||||
chrono = "0.4.19"
|
|
||||||
|
|
||||||
lazy_static = "1.4.0"
|
|
||||||
|
|
||||||
bytes = "1.1.0"
|
|
||||||
|
|
||||||
itertools = "0.10.3"
|
|
||||||
|
|
||||||
quickpeep_structs = { path = "../quickpeep_structs" }
|
|
||||||
ipnetwork = "0.18.0"
|
|
||||||
|
|
||||||
futures-util = "0.3.21"
|
|
||||||
|
|
||||||
lingua = "1.3.3"
|
|
||||||
|
|
||||||
quickpeep_utils = { path = "../quickpeep_utils" }
|
|
||||||
|
|
||||||
### Raking helpers
|
### Raking helpers
|
||||||
# HTTP Requests
|
# HTTP Requests
|
||||||
@ -53,3 +52,5 @@ sitemap = "0.4.1"
|
|||||||
### Filtering helpers
|
### Filtering helpers
|
||||||
# AdBlock
|
# AdBlock
|
||||||
adblock = "0.5.0"
|
adblock = "0.5.0"
|
||||||
|
# Language detection
|
||||||
|
lingua = "1.3.3"
|
@ -1,15 +1,17 @@
|
|||||||
use adblock::lists::RuleTypes;
|
use adblock::lists::RuleTypes;
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
|
use clap::Parser;
|
||||||
|
use colour::{blue_ln, green_ln, red_ln, yellow_ln};
|
||||||
|
use env_logger::Env;
|
||||||
use log::warn;
|
use log::warn;
|
||||||
use quickpeep::raking::analysis::{load_adblock_engine, IpSet};
|
use quickpeep::raking::analysis::{load_adblock_engine, IpSet};
|
||||||
use quickpeep::raking::RakeIntent;
|
use quickpeep::raking::{RakeIntent, RakeOutcome};
|
||||||
use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
|
use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
|
||||||
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
|
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
|
||||||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
||||||
use reqwest::redirect::Policy;
|
use reqwest::redirect::Policy;
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::str::FromStr;
|
|
||||||
use tokio::fs::File;
|
use tokio::fs::File;
|
||||||
|
|
||||||
pub const ADBLOCK_FILTER_PATHS: [(AnalysisAntifeatures, &'static str); 4] = [
|
pub const ADBLOCK_FILTER_PATHS: [(AnalysisAntifeatures, &'static str); 4] = [
|
||||||
@ -19,8 +21,18 @@ pub const ADBLOCK_FILTER_PATHS: [(AnalysisAntifeatures, &'static str); 4] = [
|
|||||||
(AnalysisAntifeatures::ADVERTS, "adverts"),
|
(AnalysisAntifeatures::ADVERTS, "adverts"),
|
||||||
];
|
];
|
||||||
|
|
||||||
|
/// Rakes one URL and prints out the description of it.
|
||||||
|
#[derive(Clone, Debug, Parser)]
|
||||||
|
pub struct Opts {
|
||||||
|
url: Url,
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
pub async fn main() -> anyhow::Result<()> {
|
pub async fn main() -> anyhow::Result<()> {
|
||||||
|
env_logger::Builder::from_env(Env::default().default_filter_or("info,quickpeep=debug")).init();
|
||||||
|
|
||||||
|
let opts: Opts = Opts::parse();
|
||||||
|
|
||||||
let mut header_map = HeaderMap::new();
|
let mut header_map = HeaderMap::new();
|
||||||
header_map.insert(USER_AGENT, HeaderValue::from_static(RAKER_USER_AGENT));
|
header_map.insert(USER_AGENT, HeaderValue::from_static(RAKER_USER_AGENT));
|
||||||
|
|
||||||
@ -59,35 +71,33 @@ pub async fn main() -> anyhow::Result<()> {
|
|||||||
antifeature_ip_set,
|
antifeature_ip_set,
|
||||||
};
|
};
|
||||||
|
|
||||||
// raker.rake(
|
let outcome = raker.rake(&opts.url, RakeIntent::Any, &client).await?;
|
||||||
// &Url::from_str("http://nothings.org/gamedev/ssao/")?,
|
|
||||||
// RakeIntent::Page,
|
|
||||||
// &client,
|
|
||||||
// )
|
|
||||||
// .await?;
|
|
||||||
//
|
|
||||||
// raker.rake(
|
|
||||||
// &Url::from_str("https://github.com/kuchiki-rs/kuchiki")?,
|
|
||||||
// RakeIntent::Page,
|
|
||||||
// &client,
|
|
||||||
// )
|
|
||||||
// .await?;
|
|
||||||
|
|
||||||
raker
|
match outcome {
|
||||||
.rake(
|
RakeOutcome::RakedPage(page) => {
|
||||||
&Url::from_str("https://www.thesprucepets.com/")?,
|
let content_size = serde_bare::to_vec(&page)?.len();
|
||||||
RakeIntent::Page,
|
green_ln!("Page ({} bytes)", content_size);
|
||||||
&client,
|
// TODO
|
||||||
)
|
}
|
||||||
.await?;
|
RakeOutcome::RakedFeed(feed) => {
|
||||||
|
green_ln!("Feed");
|
||||||
raker
|
// TODO
|
||||||
.rake(
|
}
|
||||||
&Url::from_str("https://matrix.org/")?,
|
RakeOutcome::RakedSitemap(sitemap) => {
|
||||||
RakeIntent::Page,
|
green_ln!("Sitemap");
|
||||||
&client,
|
// TODO
|
||||||
)
|
}
|
||||||
.await?;
|
RakeOutcome::Redirect { reason, new_url } => {
|
||||||
|
blue_ln!("Redirect ({:?})", reason);
|
||||||
|
println!(" → {}", new_url.as_str());
|
||||||
|
}
|
||||||
|
RakeOutcome::TemporaryFailure(fail) => {
|
||||||
|
yellow_ln!("Temporary Failure\n\t{:?}", &fail.reason);
|
||||||
|
}
|
||||||
|
RakeOutcome::PermanentFailure(fail) => {
|
||||||
|
red_ln!("Permanent Failure\n\t{:?}", &fail.reason)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -12,8 +12,10 @@ use kuchiki::traits::TendrilSink;
|
|||||||
use kuchiki::NodeRef;
|
use kuchiki::NodeRef;
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use log::debug;
|
use log::debug;
|
||||||
use quickpeep_densedoc::DenseTree;
|
use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
|
||||||
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
|
use quickpeep_structs::rake_entries::{
|
||||||
|
AnalysisAntifeatures, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind,
|
||||||
|
};
|
||||||
use quickpeep_utils::Lazy;
|
use quickpeep_utils::Lazy;
|
||||||
use reqwest::header::HeaderMap;
|
use reqwest::header::HeaderMap;
|
||||||
use reqwest::{Client, Response, Url};
|
use reqwest::{Client, Response, Url};
|
||||||
@ -44,6 +46,7 @@ pub enum RakeOutcome {
|
|||||||
PermanentFailure(PermanentFailure),
|
PermanentFailure(PermanentFailure),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
pub enum RedirectReason {
|
pub enum RedirectReason {
|
||||||
/// The page redirected somewhere else.
|
/// The page redirected somewhere else.
|
||||||
Redirected {
|
Redirected {
|
||||||
@ -61,27 +64,35 @@ pub struct UrlRaked {
|
|||||||
pub intent: RakeIntent,
|
pub intent: RakeIntent,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct RakedPage {}
|
#[derive(Serialize)]
|
||||||
|
pub struct RakedPage {
|
||||||
|
page_entry: RakedPageEntry,
|
||||||
|
referrer_entry: RakedReferrerEntry,
|
||||||
|
}
|
||||||
|
|
||||||
pub struct RobotsTxt {
|
pub struct RobotsTxt {
|
||||||
pub sitemaps: Vec<UrlRaked>,
|
pub sitemaps: Vec<UrlRaked>,
|
||||||
pub rules: Cylon,
|
pub rules: Cylon,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
pub struct TemporaryFailure {
|
pub struct TemporaryFailure {
|
||||||
pub reason: TemporaryFailureReason,
|
pub reason: TemporaryFailureReason,
|
||||||
pub backoff_sec: u32,
|
pub backoff_sec: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
pub struct PermanentFailure {
|
pub struct PermanentFailure {
|
||||||
pub reason: PermanentFailureReason,
|
pub reason: PermanentFailureReason,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
pub enum TemporaryFailureReason {
|
pub enum TemporaryFailureReason {
|
||||||
MissingInformation(String),
|
MissingInformation(String),
|
||||||
ServerError(u16),
|
ServerError(u16),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
pub enum PermanentFailureReason {
|
pub enum PermanentFailureReason {
|
||||||
ResourceDenied(u16),
|
ResourceDenied(u16),
|
||||||
WrongLanguage(String),
|
WrongLanguage(String),
|
||||||
@ -99,6 +110,8 @@ pub enum RakeIntent {
|
|||||||
lazy_static! {
|
lazy_static! {
|
||||||
static ref SITEMAP_MIME_TYPES: HashSet<&'static str> =
|
static ref SITEMAP_MIME_TYPES: HashSet<&'static str> =
|
||||||
HashSet::from_iter(vec!["text/xml", "application/xml",]);
|
HashSet::from_iter(vec!["text/xml", "application/xml",]);
|
||||||
|
|
||||||
|
/// MIME types we might expect in content-type headers
|
||||||
static ref FEED_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
|
static ref FEED_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
|
||||||
"text/xml",
|
"text/xml",
|
||||||
"application/xml",
|
"application/xml",
|
||||||
@ -108,6 +121,14 @@ lazy_static! {
|
|||||||
"application/json",
|
"application/json",
|
||||||
"application/feed+json"
|
"application/feed+json"
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
/// MIME types we might expect in <link> tags
|
||||||
|
static ref FEED_LINK_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
|
||||||
|
"application/atom+xml",
|
||||||
|
"application/rss+xml",
|
||||||
|
"application/rdf+xml",
|
||||||
|
"application/feed+json"
|
||||||
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn response_to_bytes_limited(
|
async fn response_to_bytes_limited(
|
||||||
@ -224,14 +245,10 @@ impl Raker {
|
|||||||
|
|
||||||
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page)
|
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page)
|
||||||
{
|
{
|
||||||
match self.rake_html_page(&content, url, is_cf, &headers) {
|
// We don't try any fallbacks for an HTML page
|
||||||
Ok(page_rake) => {
|
return Ok(self
|
||||||
return Ok(page_rake);
|
.rake_html_page(&content, url, is_cf, &headers)
|
||||||
}
|
.context("Raking HTML page")?);
|
||||||
Err(error) => {
|
|
||||||
debug!("Failed to rake HTML page: {:?}", error);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if FEED_MIME_TYPES.contains(content_type.as_str())
|
if FEED_MIME_TYPES.contains(content_type.as_str())
|
||||||
@ -344,6 +361,7 @@ impl Raker {
|
|||||||
|
|
||||||
let dense_doc = DenseTree::from_body(root_node.clone());
|
let dense_doc = DenseTree::from_body(root_node.clone());
|
||||||
let dense_doc_text = Lazy::new(Box::new(|| DenseTree::generate_textual_format(&dense_doc)));
|
let dense_doc_text = Lazy::new(Box::new(|| DenseTree::generate_textual_format(&dense_doc)));
|
||||||
|
//eprintln!("^^^^^\n{}\n^^^^^", *dense_doc_text);
|
||||||
|
|
||||||
if language.is_none() {
|
if language.is_none() {
|
||||||
// Final fallback: guess the language
|
// Final fallback: guess the language
|
||||||
@ -356,29 +374,149 @@ impl Raker {
|
|||||||
normalise_language(language);
|
normalise_language(language);
|
||||||
}
|
}
|
||||||
|
|
||||||
eprintln!("~~~~~\n{}\n~~~~~", *dense_doc_text);
|
let mut title = "".to_owned();
|
||||||
eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc);
|
|
||||||
|
|
||||||
let mut readability = quickpeep_moz_readability::Readability::new_from_node(root_node);
|
if let Ok(title_node) = root_node.select_first("head title") {
|
||||||
readability
|
title = title_node.text_contents();
|
||||||
.parse(url.as_str())
|
}
|
||||||
.context("failed to analyse readability")?;
|
|
||||||
|
let mut feeds = Vec::new();
|
||||||
|
let mut icon = None;
|
||||||
|
|
||||||
|
for link_node in root_node.select("head link").into_iter().flatten() {
|
||||||
|
if let Some(rel) = link_node.attributes.borrow().get("rel") {
|
||||||
|
let rels = rel.split_whitespace().collect_vec();
|
||||||
|
if rels.contains(&"icon") {
|
||||||
|
// This is an icon
|
||||||
|
if let Some(href) = link_node.attributes.borrow().get("href") {
|
||||||
|
let icon_url = url
|
||||||
|
.join(href)
|
||||||
|
.context("Failed to resolve or parse canonical URL to icon")?;
|
||||||
|
|
||||||
|
icon = Some(icon_url);
|
||||||
|
}
|
||||||
|
} else if rels.contains(&"alternate") {
|
||||||
|
if let Some(rel_type) = link_node.attributes.borrow().get("type") {
|
||||||
|
if FEED_LINK_MIME_TYPES.contains(rel_type) {
|
||||||
|
if let Some(href) = link_node.attributes.borrow().get("href") {
|
||||||
|
let feed_url = url
|
||||||
|
.join(href)
|
||||||
|
.context("Failed to resolve or parse canonical URL to feed")?;
|
||||||
|
|
||||||
|
feeds.push(feed_url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut readability =
|
||||||
|
quickpeep_moz_readability::Readability::new_from_node(root_node.clone());
|
||||||
|
if let Err(err) = readability.parse(url.as_str()) {
|
||||||
|
debug!("Failed to analyse readability: {:?}", err);
|
||||||
|
}
|
||||||
|
|
||||||
eprintln!("{:#?}", readability.metadata);
|
eprintln!("{:#?}", readability.metadata);
|
||||||
|
|
||||||
if let Some(_node) = readability.article_node {
|
if title.is_empty() && !readability.metadata.title().is_empty() {
|
||||||
//eprintln!("{}", node.to_string());
|
// Fall back to the readability-derived page title
|
||||||
|
title = readability.metadata.title().to_owned();
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut document = DenseDocument {
|
||||||
|
head: DenseHead {
|
||||||
|
title,
|
||||||
|
language: language.unwrap_or(String::with_capacity(0)),
|
||||||
|
icon: icon
|
||||||
|
.map(|url| url.as_str().to_owned())
|
||||||
|
.unwrap_or(String::with_capacity(0)),
|
||||||
|
},
|
||||||
|
body_content: Vec::with_capacity(0),
|
||||||
|
body_remainder: Vec::with_capacity(0),
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Some(article_node) = readability.article_node {
|
||||||
|
document.body_remainder = DenseTree::from_body(root_node.clone());
|
||||||
|
document.body_content = DenseTree::from_body(article_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
let bare_size = serde_bare::to_vec(&dense_doc)?.len();
|
let bare_size = serde_bare::to_vec(&dense_doc)?.len();
|
||||||
eprintln!("CS {:?} → {:?}", content.len(), bare_size);
|
eprintln!("CS {:?} → {:?}", content.len(), bare_size);
|
||||||
|
|
||||||
|
let references = find_references(&document, &feeds, url);
|
||||||
Ok(RakeOutcome::RakedPage(RakedPage {
|
Ok(RakeOutcome::RakedPage(RakedPage {
|
||||||
// TODO
|
page_entry: RakedPageEntry {
|
||||||
|
analysed_antifeatures: antifeature_flags,
|
||||||
|
document,
|
||||||
|
},
|
||||||
|
referrer_entry: RakedReferrerEntry { references },
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn find_references(
|
||||||
|
doc: &DenseDocument,
|
||||||
|
feeds: &Vec<Url>,
|
||||||
|
page_url: &Url,
|
||||||
|
) -> Vec<RakedReference> {
|
||||||
|
let mut refs = Vec::new();
|
||||||
|
|
||||||
|
fn add_link_refs(tree: &Vec<DenseTree>, refs: &mut Vec<RakedReference>, page_url: &Url) {
|
||||||
|
for node in tree {
|
||||||
|
match node {
|
||||||
|
DenseTree::Heading1(children) => {
|
||||||
|
add_link_refs(children, refs, page_url);
|
||||||
|
}
|
||||||
|
DenseTree::Heading2(children) => {
|
||||||
|
add_link_refs(children, refs, page_url);
|
||||||
|
}
|
||||||
|
DenseTree::Heading3(children) => {
|
||||||
|
add_link_refs(children, refs, page_url);
|
||||||
|
}
|
||||||
|
DenseTree::Heading4(children) => {
|
||||||
|
add_link_refs(children, refs, page_url);
|
||||||
|
}
|
||||||
|
DenseTree::Heading5(children) => {
|
||||||
|
add_link_refs(children, refs, page_url);
|
||||||
|
}
|
||||||
|
DenseTree::Heading6(children) => {
|
||||||
|
add_link_refs(children, refs, page_url);
|
||||||
|
}
|
||||||
|
DenseTree::Link {
|
||||||
|
children,
|
||||||
|
href,
|
||||||
|
nofollow,
|
||||||
|
} => {
|
||||||
|
if !nofollow {
|
||||||
|
if let Ok(full_url) = page_url.join(&href) {
|
||||||
|
refs.push(RakedReference {
|
||||||
|
target: full_url.to_string(),
|
||||||
|
kind: ReferenceKind::CanonicalUrl,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
add_link_refs(children, refs, page_url);
|
||||||
|
}
|
||||||
|
DenseTree::Image { .. } => {}
|
||||||
|
DenseTree::Text(_) => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
add_link_refs(&doc.body_content, &mut refs, &page_url);
|
||||||
|
add_link_refs(&doc.body_remainder, &mut refs, &page_url);
|
||||||
|
|
||||||
|
for feed in feeds {
|
||||||
|
refs.push(RakedReference {
|
||||||
|
target: feed.as_str().to_owned(),
|
||||||
|
kind: ReferenceKind::HeaderLinkedFeed,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
refs
|
||||||
|
}
|
||||||
|
|
||||||
pub fn normalise_language(lang_string: &mut String) {
|
pub fn normalise_language(lang_string: &mut String) {
|
||||||
*lang_string = lang_string.to_lowercase();
|
*lang_string = lang_string.to_lowercase();
|
||||||
let mut pieces = lang_string
|
let mut pieces = lang_string
|
||||||
|
@ -7,24 +7,18 @@ use std::ops::Deref;
|
|||||||
|
|
||||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||||
pub struct DenseDocument {
|
pub struct DenseDocument {
|
||||||
head: DenseHead,
|
pub head: DenseHead,
|
||||||
body: Vec<DenseTree>,
|
pub body_content: Vec<DenseTree>,
|
||||||
}
|
pub body_remainder: Vec<DenseTree>,
|
||||||
|
|
||||||
impl DenseDocument {
|
|
||||||
pub fn from_document(_root_node: NodeRef) {
|
|
||||||
todo!()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||||
pub struct DenseHead {
|
pub struct DenseHead {
|
||||||
title: String,
|
pub title: String,
|
||||||
feed_urls: Vec<String>,
|
|
||||||
/// Language of the page. May be empty if not discovered.
|
/// Language of the page. May be empty if not discovered.
|
||||||
language: String,
|
pub language: String,
|
||||||
/// URL to icon of the page. May be empty if none were discovered.
|
/// URL to icon of the page. May be empty if none were discovered.
|
||||||
icon: String,
|
pub icon: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||||
|
@ -7,5 +7,7 @@ edition = "2021"
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
bitflags = "1.3.2"
|
bitflags = "1.3.2"
|
||||||
|
bitflags_serde_shim = "0.2.2"
|
||||||
#arc-interner = "0.7.0"
|
#arc-interner = "0.7.0"
|
||||||
quickpeep_densedoc = { path = "../quickpeep_densedoc" }
|
quickpeep_densedoc = { path = "../quickpeep_densedoc" }
|
||||||
|
serde = { version = "1.0.136", features = ["derive"] }
|
||||||
|
@ -1,4 +1,7 @@
|
|||||||
use bitflags::bitflags;
|
use bitflags::bitflags;
|
||||||
|
use bitflags_serde_shim::impl_serde_for_bitflags;
|
||||||
|
use quickpeep_densedoc::DenseDocument;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
bitflags! {
|
bitflags! {
|
||||||
pub struct AnalysisAntifeatures: u8 {
|
pub struct AnalysisAntifeatures: u8 {
|
||||||
@ -17,8 +20,31 @@ bitflags! {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl_serde_for_bitflags!(AnalysisAntifeatures);
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct RakedPageEntry {
|
pub struct RakedPageEntry {
|
||||||
pub analysed_antifeatures: AnalysisAntifeatures,
|
pub analysed_antifeatures: AnalysisAntifeatures,
|
||||||
//pub article: Option<DenseTree>,
|
pub document: DenseDocument,
|
||||||
//pub non_article: Option<DenseTree>,
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
|
pub struct RakedReferrerEntry {
|
||||||
|
pub references: Vec<RakedReference>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
|
pub struct RakedReference {
|
||||||
|
pub target: String,
|
||||||
|
pub kind: ReferenceKind,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
|
pub enum ReferenceKind {
|
||||||
|
CanonicalUrl,
|
||||||
|
Redirect,
|
||||||
|
Link,
|
||||||
|
HeaderLinkedFeed,
|
||||||
|
FeedEntry,
|
||||||
|
SitemapEntry,
|
||||||
}
|
}
|
||||||
|
9
scripts/get_psl.sh
Executable file
9
scripts/get_psl.sh
Executable file
@ -0,0 +1,9 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
dir_path="$(dirname "$0")"
|
||||||
|
|
||||||
|
mkdir -p "$dir_path/../data"
|
||||||
|
wget -O "$dir_path/../data/public_suffices.list" https://publicsuffix.org/list/public_suffix_list.dat
|
||||||
|
|
Loading…
Reference in New Issue
Block a user