diff --git a/.gitignore b/.gitignore index 44dddc2..c75c6f9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ -.idea \ No newline at end of file +.idea +data/cf_ips.txt \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 834bf86..ffb415d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -612,6 +612,15 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35e70ee094dc02fd9c13fdad4940090f22dbd6ac7c9e7094a46cf0232a50bc7c" +[[package]] +name = "ipnetwork" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4088d739b183546b239688ddbc79891831df421773df95e236daf7867866d355" +dependencies = [ + "serde", +] + [[package]] name = "itertools" version = "0.10.3" @@ -1066,18 +1075,36 @@ dependencies = [ "cylon", "env_logger", "feed-rs", + "futures-util", "gemini-fetch", "html5ever", + "ipnetwork", + "itertools", "kuchiki", "lazy_static", "log", + "quickpeep_densedoc", "quickpeep_moz_readability", + "quickpeep_structs", "reqwest", "serde", + "serde_bare", "sitemap", "tokio", ] +[[package]] +name = "quickpeep_densedoc" +version = "0.1.0" +dependencies = [ + "anyhow", + "html5ever", + "kuchiki", + "lazy_static", + "regex", + "serde", +] + [[package]] name = "quickpeep_moz_readability" version = "0.1.0" @@ -1091,6 +1118,14 @@ dependencies = [ "url", ] +[[package]] +name = "quickpeep_structs" +version = "0.1.0" +dependencies = [ + "bitflags", + "quickpeep_densedoc", +] + [[package]] name = "quote" version = "1.0.15" @@ -1388,6 +1423,15 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde_bare" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51c55386eed0f1ae957b091dc2ca8122f287b60c79c774cbe3d5f2b69fded660" +dependencies = [ + "serde", +] + [[package]] name = "serde_derive" version = "1.0.136" diff --git a/Cargo.toml b/Cargo.toml index 65c4ce4..0487b5d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,9 @@ [workspace] members = [ "quickpeep", - "quickpeep_moz_readability" + "quickpeep_densedoc", + "quickpeep_moz_readability", + "quickpeep_structs" ] diff --git a/quickpeep/Cargo.toml b/quickpeep/Cargo.toml index a5d374e..4db5359 100644 --- a/quickpeep/Cargo.toml +++ b/quickpeep/Cargo.toml @@ -11,12 +11,14 @@ anyhow = "1.0.55" log = "0.4.14" env_logger = "0.9.0" quickpeep_moz_readability = { path = "../quickpeep_moz_readability" } +quickpeep_densedoc = { path = "../quickpeep_densedoc" } # TODO: why do we need these here? kuchiki = "0.8.1" html5ever = "0.25.1" serde = { version = "1.0.136", features = ["derive"] } +serde_bare = "0.5.0" chrono = "0.4.19" @@ -24,9 +26,12 @@ lazy_static = "1.4.0" bytes = "1.1.0" -# TODO: rkyv and memmap2 should be an efficient way to load index packs into processes. 
-# rkyv = "0.7.35" -# memmap2 = "0.5.3" +itertools = "0.10.3" + +quickpeep_structs = { path = "../quickpeep_structs" } +ipnetwork = "0.18.0" + +futures-util = "0.3.21" ### Raking helpers # HTTP Requests diff --git a/quickpeep/src/bin/qp-rake.rs b/quickpeep/src/bin/qp-rake.rs index 67b35a5..ff9ed82 100644 --- a/quickpeep/src/bin/qp-rake.rs +++ b/quickpeep/src/bin/qp-rake.rs @@ -1,25 +1,77 @@ -use quickpeep::raking::rake; +use adblock::lists::RuleTypes; +use anyhow::Context; +use quickpeep::raking::analysis::{load_adblock_engine, IpSet}; use quickpeep::raking::RakeIntent; +use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT}; +use quickpeep_structs::rake_entries::AnalysisAntifeatures; +use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT}; +use reqwest::redirect::Policy; use reqwest::Url; use std::str::FromStr; +use tokio::fs::File; #[tokio::main] pub async fn main() -> anyhow::Result<()> { - let client = reqwest::Client::new(); - // TODO max timeout, max body size - rake( - &Url::from_str("http://nothings.org/gamedev/ssao/")?, - RakeIntent::Page, - &client, - ) - .await?; + let mut header_map = HeaderMap::new(); + header_map.insert(USER_AGENT, HeaderValue::from_static(RAKER_USER_AGENT)); - rake( - &Url::from_str("https://github.com/kuchiki-rs/kuchiki")?, - RakeIntent::Page, - &client, - ) - .await?; + let client = reqwest::ClientBuilder::new() + .timeout(TIME_LIMIT) + .default_headers(header_map) + // TODO We want to handle redirects ourselves so we can track them... + .redirect(Policy::none()) + .build()?; + + // TODO Don't hardcode these paths in quite as bad a way... + let adblock_file = File::open("./cosmetic_filters.adblock") + .await + .context("Failed to open cosmetic filters file")?; + let adblock_engines = vec![( + AnalysisAntifeatures::ANNOYANCE, + load_adblock_engine(adblock_file, RuleTypes::CosmeticOnly).await?, + )]; + + let mut antifeature_ip_set = IpSet::new(); + + let ips_file = File::open("./data/cf_ips.txt") + .await + .context("Failed to open CF IPs file")?; + antifeature_ip_set.add_all_from_file(ips_file).await?; + + let raker = Raker { + adblock_engines, + antifeature_ip_set, + }; + + // raker.rake( + // &Url::from_str("http://nothings.org/gamedev/ssao/")?, + // RakeIntent::Page, + // &client, + // ) + // .await?; + // + // raker.rake( + // &Url::from_str("https://github.com/kuchiki-rs/kuchiki")?, + // RakeIntent::Page, + // &client, + // ) + // .await?; + + raker + .rake( + &Url::from_str("https://www.thesprucepets.com/")?, + RakeIntent::Page, + &client, + ) + .await?; + + raker + .rake( + &Url::from_str("https://matrix.org/")?, + RakeIntent::Page, + &client, + ) + .await?; Ok(()) } diff --git a/quickpeep/src/raking.rs b/quickpeep/src/raking.rs index 7844c19..83c5304 100644 --- a/quickpeep/src/raking.rs +++ b/quickpeep/src/raking.rs @@ -1,17 +1,33 @@ +use crate::raking::analysis::{analyse_with_ad_block_cosmetic_filter, IpSet}; +use adblock::engine::Engine; use anyhow::{bail, Context}; +use bytes::Bytes; use chrono::{DateTime, FixedOffset, Utc}; use cylon::Cylon; +use futures_util::stream::StreamExt; use html5ever::tendril::fmt::Slice; +use html5ever::QualName; +use kuchiki::traits::TendrilSink; +use kuchiki::NodeRef; use lazy_static::lazy_static; use log::debug; -use reqwest::{Client, Url}; +use quickpeep_densedoc::DenseTree; +use quickpeep_structs::rake_entries::AnalysisAntifeatures; +use reqwest::{Client, Response, Url}; use serde::{Deserialize, Serialize}; use sitemap::reader::SiteMapEntity; use std::collections::HashSet; +use std::time::Duration; +use 
tokio::time::Instant; -mod analysis; +pub mod analysis; -pub const USER_AGENT: &'static str = "QuickPeepBot"; +/// 4 MiB ought to be enough for anybody. +pub const SIZE_LIMIT: usize = 4 * 1024 * 1024; +/// If it's not loaded in ten seconds, that's pretty severe. +/// 10 seconds is almost too generous (assuming that the best of things can run slowly sometimes). +pub const TIME_LIMIT: Duration = Duration::from_secs(10); +pub const RAKER_USER_AGENT: &'static str = "QuickPeepBot"; pub enum RakeOutcome { RakedPage(RakedPage), @@ -81,85 +97,169 @@ lazy_static! { ]); } -pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Result { - let response = client.get(url.clone()).send().await?; +async fn response_to_bytes_limited( + mut response: Response, + size_limit: usize, + time_limit: Duration, +) -> anyhow::Result> { + let deadline = Instant::now() + time_limit; + let mut buffer = Vec::new(); + let mut bytestream = response.bytes_stream(); - if !response.status().is_success() { - bail!("Not successful: {:?}", response.status().as_u16()); - } - - let content_type = if let Some(content_type) = response.headers().get("content-type") { - let content_type = content_type - .to_str() - .context("Can't convert content-type to str")?; - eprintln!("CT {:?}", content_type); - content_type.split(";").next().unwrap().trim().to_owned() - } else { - return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure { - reason: TemporaryFailureReason::MissingInformation("content-type".to_owned()), - backoff_sec: 86400 * 7, - })); - }; - - let content = response.bytes().await?; - - if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) { - match rake_html_page(&content, url) { - Ok(page_rake) => { - return Ok(RakeOutcome::RakedPage(page_rake)); - } - Err(error) => { - debug!("Failed to rake HTML page: {:?}", error); + loop { + tokio::select! { + next_chunk = bytestream.next() => { + match next_chunk { + Some(next_chunk) => { + buffer.extend_from_slice(next_chunk?.as_bytes()); + if buffer.len() > size_limit { + bail!("Exceeds size limit"); + } + }, + None => { + // Finished! 
:) + break; + } + } + }, + _ = tokio::time::sleep_until(deadline) => { + bail!("Exceeded time limit"); } } } - if FEED_MIME_TYPES.contains(content_type.as_str()) - && (intent == RakeIntent::Any || intent == RakeIntent::Feed) - { - match rake_feed(&content, url) { - Ok(feed) => { - return Ok(RakeOutcome::RakedFeed(feed)); - } - Err(error) => { - debug!("Failed to rake as feed: {:?}", error); - } - } - } - - if SITEMAP_MIME_TYPES.contains(content_type.as_str()) - && (intent == RakeIntent::Any || intent == RakeIntent::SiteMap) - { - match rake_sitemap(&content) { - Ok(sitemap) => { - return Ok(RakeOutcome::RakedSitemap(sitemap)); - } - Err(error) => { - debug!("Failed to rake as sitemap: {:?}", error); - } - } - } - - return Ok(RakeOutcome::PermanentFailure(PermanentFailure { - reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()), - })); + Ok(buffer) } -pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result { - let content_str = std::str::from_utf8(content)?; +pub struct Raker { + pub adblock_engines: Vec<(AnalysisAntifeatures, Engine)>, + pub antifeature_ip_set: IpSet, +} - let mut readability = quickpeep_moz_readability::Readability::new(content_str); - readability - .parse(url.as_str()) - .context("failed to analyse readability")?; +impl Raker { + pub async fn rake( + &self, + url: &Url, + intent: RakeIntent, + client: &Client, + ) -> anyhow::Result { + let response = client.get(url.clone()).send().await?; - eprintln!("{:#?}", readability.metadata); + if let Some(remote_addr) = response.remote_addr() { + eprintln!("rA {:?}", remote_addr); + let is_cf = self.antifeature_ip_set.contains(remote_addr.ip()); + eprintln!("CF? {:?}", is_cf); + } - if let Some(node) = readability.article_node { - eprintln!("{}", node.to_string()); + if !response.status().is_success() { + bail!("Not successful: {:?}", response.status().as_u16()); + } + + let content_type = if let Some(content_type) = response.headers().get("content-type") { + let content_type = content_type + .to_str() + .context("Can't convert content-type to str")?; + eprintln!("CT {:?}", content_type); + content_type.split(";").next().unwrap().trim().to_owned() + } else { + return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure { + reason: TemporaryFailureReason::MissingInformation("content-type".to_owned()), + backoff_sec: 86400 * 7, + })); + }; + + let content = response_to_bytes_limited(response, SIZE_LIMIT, TIME_LIMIT).await?; + + if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) + { + match self.rake_html_page(&content, url) { + Ok(page_rake) => { + return Ok(RakeOutcome::RakedPage(page_rake)); + } + Err(error) => { + debug!("Failed to rake HTML page: {:?}", error); + } + } + } + + if FEED_MIME_TYPES.contains(content_type.as_str()) + && (intent == RakeIntent::Any || intent == RakeIntent::Feed) + { + match rake_feed(&content, url) { + Ok(feed) => { + return Ok(RakeOutcome::RakedFeed(feed)); + } + Err(error) => { + debug!("Failed to rake as feed: {:?}", error); + } + } + } + + if SITEMAP_MIME_TYPES.contains(content_type.as_str()) + && (intent == RakeIntent::Any || intent == RakeIntent::SiteMap) + { + match rake_sitemap(&content) { + Ok(sitemap) => { + return Ok(RakeOutcome::RakedSitemap(sitemap)); + } + Err(error) => { + debug!("Failed to rake as sitemap: {:?}", error); + } + } + } + + return Ok(RakeOutcome::PermanentFailure(PermanentFailure { + reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()), + })); } - Ok(todo!()) + pub fn 
rake_html_page(&self, content: &[u8], url: &Url) -> anyhow::Result { + let content_str = std::str::from_utf8(content)?; + + let root_node: NodeRef = kuchiki::parse_html().one(content_str); + + let mut antifeature_flags = AnalysisAntifeatures::empty(); + + for (engine_antifeature_flag, adblock_engine) in &self.adblock_engines { + match analyse_with_ad_block_cosmetic_filter( + &root_node, + adblock_engine, + url.as_str(), + true, + ) { + Ok(cosmetic_filters_tripped) => { + eprintln!("?cosmetic filters tripped: {}", cosmetic_filters_tripped); + antifeature_flags |= *engine_antifeature_flag; + } + Err(err) => { + eprintln!("Cosmetic Filter Err {:?}", err); + } + }; + } + + let dense_doc = DenseTree::from_body(root_node.clone()); + let dense_doc_text = DenseTree::generate_textual_format(&dense_doc); + eprintln!("~~~~~\n{}\n~~~~~", dense_doc_text); + eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc); + + let mut readability = quickpeep_moz_readability::Readability::new_from_node(root_node); + readability + .parse(url.as_str()) + .context("failed to analyse readability")?; + + eprintln!("{:#?}", readability.metadata); + + if let Some(node) = readability.article_node { + //eprintln!("{}", node.to_string()); + } + + let bare_size = serde_bare::to_vec(&dense_doc)?.len(); + eprintln!("CS {:?} → {:?}", content.len(), bare_size); + + Ok(RakedPage { + // TODO + }) + } } pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result> { @@ -286,7 +386,7 @@ pub async fn decode_robots_txt(bytes: &[u8]) -> anyhow::Result } } - let rules = cylon::Compiler::new(USER_AGENT) + let rules = cylon::Compiler::new(RAKER_USER_AGENT) .compile(bytes.as_bytes()) .await?; diff --git a/quickpeep/src/raking/analysis.rs b/quickpeep/src/raking/analysis.rs index f79ca86..2101da0 100644 --- a/quickpeep/src/raking/analysis.rs +++ b/quickpeep/src/raking/analysis.rs @@ -1,14 +1,16 @@ -use adblock::filters::cosmetic::CosmeticFilter; -use anyhow::anyhow; +use adblock::engine::Engine; +use adblock::lists::{ParseOptions, RuleTypes}; +use anyhow::Context; +use ipnetwork::IpNetwork; use kuchiki::NodeRef; -use log::debug; -use std::path::Path; -use tokio::fs::File; +use std::collections::{BTreeSet, HashSet}; +use std::net::IpAddr; use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader}; -pub async fn load_cosmetic_filters( +pub async fn load_adblock_engine( reader: R, -) -> anyhow::Result> { + rule_types: RuleTypes, +) -> anyhow::Result { let mut br = BufReader::new(reader); let mut rules = Vec::new(); let mut buf = String::new(); @@ -17,27 +19,172 @@ pub async fn load_cosmetic_filters( if br.read_line(&mut buf).await? 
== 0 { break; } - if let Ok(rule) = CosmeticFilter::parse(&buf, false) { - rules.push(rule); + rules.push(buf.trim().to_owned()); + } + Ok(Engine::from_rules( + &rules, + ParseOptions { + format: Default::default(), + include_redirect_urls: false, + rule_types, + }, + )) +} + +// Relevant: +// https://github.com/brave/adblock-rust/issues/152#issuecomment-771259069 + +pub struct ExtractedClassesAndIds { + classes: Vec, + ids: Vec, +} + +pub fn extract_classes_and_ids_from_page(root: &NodeRef) -> ExtractedClassesAndIds { + let mut class_set = HashSet::new(); + let mut id_set = HashSet::new(); + + for node in root.inclusive_descendants() { + if let Some(element) = node.0.as_element() { + let attrs = element.attributes.borrow(); + if let Some(id) = attrs.get("id") { + id_set.insert(id.to_owned()); + } + if let Some(classes) = attrs.get("class") { + for class in classes.trim().split_whitespace() { + class_set.insert(class.to_owned()); + } + } } } - Ok(rules) + ExtractedClassesAndIds { + classes: class_set.into_iter().collect(), + ids: id_set.into_iter().collect(), + } } pub fn analyse_with_ad_block_cosmetic_filter( - root: NodeRef, - filters: &Vec, + root: &NodeRef, + engine: &Engine, + url: &str, + remove: bool, ) -> anyhow::Result { let mut matches = 0; - for rule in filters { - for ele in root - .select(&rule.selector) - .map_err(|_| anyhow!("Failed to select(..)"))? - { - debug!("Cosmetic Filter {:?} Matches {:?}", rule, ele); - matches += 1; + + let url_resources = engine.url_cosmetic_resources(url); + let specialist_hide_selectors = if !url_resources.generichide { + let ExtractedClassesAndIds { classes, ids } = extract_classes_and_ids_from_page(root); + + //eprintln!("ID {:#?}", ids); + //eprintln!("CC {:#?}", classes); + + engine.hidden_class_id_selectors(&classes, &ids, &url_resources.exceptions) + } else { + Vec::with_capacity(0) + }; + + //eprintln!("UR {:#?}", url_resources); + //eprintln!("sHS {:#?}", specialist_hide_selectors); + //eprintln!("----"); + + for rule in itertools::chain(specialist_hide_selectors, url_resources.hide_selectors) { + if let Ok(result) = root.select(&rule) { + for ele in result { + eprintln!("Cosmetic Filter {:?} Matches {:?}", rule, ele); + matches += 1; + if remove { + ele.as_node().detach(); + } + } + } else { + //eprintln!("(fail)"); } } Ok(matches > 0) } + +// TODO this isn't particularly efficient. Probably want a trie if it's important... +pub struct IpSet { + ips: BTreeSet, +} + +impl IpSet { + pub fn new() -> IpSet { + IpSet { + ips: Default::default(), + } + } + + pub async fn add_all_from_file( + &mut self, + reader: R, + ) -> anyhow::Result<()> { + let mut br = BufReader::new(reader); + + let mut buf = String::new(); + loop { + buf.clear(); + if br.read_line(&mut buf).await? == 0 { + break; + } + + let trimmed = buf.trim(); + + if trimmed.is_empty() { + continue; + } + + let ip_net = trimmed + .parse::() + .context("Parsing CIDR IP range")?; + self.add(ip_net); + } + + Ok(()) + } + + pub fn add(&mut self, network: IpNetwork) { + // We jump through a couple of hoops to make sure we store the lowest address in the network, + // since we use that for sorting. 
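+        // (For example, 1.2.3.4/16 is stored as 1.2.0.0/16, so the range query in `contains`
+        // can find the covering network by its base address.)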
+ self.ips + .insert(IpNetwork::new(network.network(), network.prefix()).unwrap()); + } + + pub fn contains(&self, addr: IpAddr) -> bool { + let prefix = if addr.is_ipv4() { + 32 + } else { + assert!(addr.is_ipv6()); + 128 + }; + let addr_as_net = + IpNetwork::new(addr, prefix).expect("Conversion to IpNetwork should be correct"); + for ipnet in self.ips.range(..=addr_as_net).rev().next() { + if ipnet.contains(addr) { + return true; + } + } + + false + } +} + +#[cfg(test)] +mod test { + use crate::raking::analysis::IpSet; + use ipnetwork::IpNetwork; + use std::net::IpAddr; + use std::str::FromStr; + + #[test] + pub fn test_ipset_contains() { + let mut set = IpSet::new(); + set.add(IpNetwork::from_str("1.2.3.4/16").unwrap()); + set.add(IpNetwork::from_str("1.1.2.3/16").unwrap()); + set.add(IpNetwork::from_str("85.42.36.17/24").unwrap()); + + assert!(set.contains(IpAddr::from_str("1.2.42.42").unwrap())); + assert!(set.contains(IpAddr::from_str("85.42.36.14").unwrap())); + assert!(!set.contains(IpAddr::from_str("85.42.37.14").unwrap())); + } +} diff --git a/quickpeep_densedoc/Cargo.toml b/quickpeep_densedoc/Cargo.toml new file mode 100644 index 0000000..c07eca3 --- /dev/null +++ b/quickpeep_densedoc/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "quickpeep_densedoc" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow = "1.0.56" +serde = { version = "1.0.136", features = ["derive"] } +kuchiki = "0.8.1" +html5ever = "0.25.1" +regex = "1.5.5" +lazy_static = "1.4.0" \ No newline at end of file diff --git a/quickpeep_densedoc/src/lib.rs b/quickpeep_densedoc/src/lib.rs new file mode 100644 index 0000000..ad233a4 --- /dev/null +++ b/quickpeep_densedoc/src/lib.rs @@ -0,0 +1,403 @@ +use kuchiki::NodeRef; +use lazy_static::lazy_static; +use regex::Regex; +use serde::{Deserialize, Serialize}; +use std::borrow::Borrow; +use std::ops::Deref; + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct DenseDocument { + head: DenseHead, + body: Vec, +} + +impl DenseDocument { + pub fn from_document(root_node: NodeRef) { + todo!() + } +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct DenseHead { + title: String, + feed_urls: Vec, + // TODO how best to expose this?? We actually don't care about storing it though ... + // Probably move to the raker. + canonical: (), // TODO I'm sure we'd benefit by digging up some metadata, but that's possibly for later :) +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub enum DenseTree { + Heading1(Vec), + Heading2(Vec), + Heading3(Vec), + Heading4(Vec), + Heading5(Vec), + Heading6(Vec), + Link { + children: Vec, + href: String, + nofollow: bool, + }, + Image { + src: String, + alt: String, + // title? I don't know if it'd be very useful. 
+ }, + Text(String), +} + +impl DenseTree { + pub fn from_body(body_node: NodeRef) -> Vec { + let mut builder = DenseTreeBuilder::new(); + builder.add_children_of_node(body_node); + builder.into_tree() + } + + pub fn is_text(&self) -> bool { + match self { + DenseTree::Text(_) => true, + _ => false, + } + } + + pub fn generate_textual_format(nodes: &Vec) -> String { + let mut buf = String::new(); + for node in nodes { + node.append_in_textual_format(&mut buf); + } + simplify_newlines(&buf) + } + + fn append_in_textual_format(&self, string: &mut String) { + match self { + DenseTree::Heading1(children) => { + string.push_str("\n\n# "); + for child in children { + child.append_in_textual_format(string); + } + string.push_str("\n"); + } + DenseTree::Heading2(children) => { + string.push_str("\n\n## "); + for child in children { + child.append_in_textual_format(string); + } + string.push_str("\n"); + } + DenseTree::Heading3(children) => { + string.push_str("\n\n### "); + for child in children { + child.append_in_textual_format(string); + } + string.push_str("\n"); + } + DenseTree::Heading4(children) => { + string.push_str("\n\n#### "); + for child in children { + child.append_in_textual_format(string); + } + string.push_str("\n"); + } + DenseTree::Heading5(children) => { + string.push_str("\n\n##### "); + for child in children { + child.append_in_textual_format(string); + } + string.push_str("\n"); + } + DenseTree::Heading6(children) => { + string.push_str("\n\n###### "); + for child in children { + child.append_in_textual_format(string); + } + string.push_str("\n"); + } + DenseTree::Link { children, href, .. } => { + string.push('['); + for child in children { + child.append_in_textual_format(string); + } + string.push_str(&format!("]({})", href)); + } + DenseTree::Image { .. } => { + string.push_str("[IMG]"); + } + DenseTree::Text(text) => { + string.push_str(text); + } + } + } +} + +struct DenseTreeBuilder { + /// Siblings in the buffer. + nodes: Vec, + + /// Number of preceding newlines at the end of the buffer. + /// Used for generating text that preserves some vague structure. + preceding_newlines: u32, +} + +impl DenseTreeBuilder { + pub fn new() -> Self { + DenseTreeBuilder { + nodes: vec![], + preceding_newlines: 0, + } + } + + pub fn into_tree(mut self) -> Vec { + self.simplify(); + self.nodes + } + + /// Simplify the DenseTree nodes: coalesce Text nodes and + pub fn simplify(&mut self) { + // First coalesce all text nodes + // TODO(perf): Do it in a better way to reduce the cost. + let mut idx = 1; + while idx < self.nodes.len() { + if self.nodes[idx].is_text() && self.nodes[idx - 1].is_text() { + // Merge the two text nodes is a text node, consume it and merge it in. + match self.nodes.remove(idx) { + DenseTree::Text(append_text) => { + match &mut self.nodes[idx - 1] { + DenseTree::Text(string) => { + string.push_str(&append_text); + // Continue so we don't advance, as we just moved the list down a + // bit. + continue; + } + _ => { + panic!( + "Should be unreachable: checked to be text first. ({})", + idx - 1 + ); + } + } + } + _ => { + panic!("Should be unreachable: checked to be text first. ({})", idx); + } + } + } + + idx += 1; + } + + for node in &mut self.nodes { + match node { + DenseTree::Text(text) => { + // Coalesce newlines so there are never more than 2 in a row. 
+ *text = simplify_newlines(&simplify_whitespace(&text)); + } + _ => { /* nop */ } + } + } + + match self.nodes.get_mut(0) { + Some(DenseTree::Text(text)) => { + *text = text.trim_start().to_owned(); + } + _ => (), + } + + let num_nodes = self.nodes.len(); + if num_nodes > 1 { + match self.nodes.get_mut(num_nodes - 1) { + Some(DenseTree::Text(text)) => { + *text = text.trim_end().to_owned(); + } + _ => (), + } + } + } + + /// Convert a HTML node's children into DenseTree nodes. + pub fn add_children_of_node(&mut self, node: NodeRef) { + for child in node.children() { + if let Some(element) = child.as_element() { + match element.name.local.deref() { + "h1" => { + self.nodes + .push(DenseTree::Heading1(DenseTree::from_body(child))); + self.preceding_newlines = 2; + } + "h2" => { + self.nodes + .push(DenseTree::Heading2(DenseTree::from_body(child))); + self.preceding_newlines = 2; + } + "h3" => { + self.nodes + .push(DenseTree::Heading3(DenseTree::from_body(child))); + self.preceding_newlines = 2; + } + "h4" => { + self.nodes + .push(DenseTree::Heading4(DenseTree::from_body(child))); + self.preceding_newlines = 2; + } + "h5" => { + self.nodes + .push(DenseTree::Heading5(DenseTree::from_body(child))); + self.preceding_newlines = 2; + } + "h6" => { + self.nodes + .push(DenseTree::Heading6(DenseTree::from_body(child))); + self.preceding_newlines = 2; + } + "a" => { + let attrs = element.attributes.borrow(); + let href = attrs.get("href").unwrap_or("").to_owned(); + + if href.starts_with("javascript:") || href.starts_with("data:") { + // Skip this link. Just unwrap it. + self.add_children_of_node(child.clone()); + continue; + } + + let nofollow = attrs + .get("rel") + .map(|rel: &str| { + rel.split_whitespace() + .any(|rel_word: &str| rel_word.eq_ignore_ascii_case("nofollow")) + }) + .unwrap_or(false); + drop(attrs); + + self.nodes.push(DenseTree::Link { + children: DenseTree::from_body(child), + href, + nofollow, + }); + + self.preceding_newlines = 0; + } + "img" => { + // TODO Decide if this is worth the space... + let attrs = element.attributes.borrow(); + let src = attrs.get("src").unwrap_or("").to_owned(); + + if src.starts_with("javascript:") || src.starts_with("data:") { + // Skip this image. + continue; + } + + let alt = simplify_whitespace(attrs.get("alt").unwrap_or("").trim()); + + self.nodes.push(DenseTree::Image { src, alt }); + } + "p" | "pre" => { + // Paragraphs must have 2 preceding newlines. + if self.preceding_newlines < 2 { + self.nodes.push(DenseTree::Text( + match self.preceding_newlines { + 0 => "\n\n", + 1 => "\n", + _ => unreachable!(), + } + .to_owned(), + )); + self.preceding_newlines = 2; + } + + self.add_children_of_node(child); + + // Paragraphs must have 2 trailing newlines. + if self.preceding_newlines < 2 { + self.nodes.push(DenseTree::Text( + match self.preceding_newlines { + 0 => "\n\n", + 1 => "\n", + _ => unreachable!(), + } + .to_owned(), + )); + self.preceding_newlines = 2; + } + } + "br" => { + self.nodes.push(DenseTree::Text("\n".to_owned())); + self.preceding_newlines += 1; + } + "div" | "li" => { + // Divs must have 1 preceding newline. + if self.preceding_newlines < 1 { + self.nodes.push(DenseTree::Text("\n".to_owned())); + self.preceding_newlines = 1; + } + + self.add_children_of_node(child); + + // Divs must have 1 trailing newline. + if self.preceding_newlines < 1 { + self.nodes.push(DenseTree::Text("\n".to_owned())); + self.preceding_newlines = 1; + } + } + "script" | "style" | "svg" | "noscript" => { + // We just prune these, as we don't want them. 
+ // (noscript tends just to be noisy 'enable JS now!!' messages, so prune those too.) + continue; + } + _ => { + // Simply unwrap the unknown element. + self.add_children_of_node(child); + } + } + //element.name.local + } else if let Some(text) = child.as_text() { + let text_to_add = + simplify_whitespace(&simplify_newlines(&text.borrow().replace("\n", " "))); + self.preceding_newlines = + text_to_add.chars().rev().take_while(|c| *c == '\n').count() as u32; + self.nodes.push(DenseTree::Text(text_to_add)); + } + } + } +} + +lazy_static! { + static ref MANY_WHITESPACE: Regex = Regex::new(r"[ \t]+").unwrap(); + static ref THREE_OR_MORE_NEWLINES: Regex = Regex::new(r"\n+[ \t\n]+\n+").unwrap(); + static ref UNNECESSARY_LS_WHITESPACE: Regex = Regex::new(r"\n[ \s]+").unwrap(); + static ref UNNECESSARY_LE_WHITESPACE: Regex = Regex::new(r"[ \s]+\n").unwrap(); +} + +pub fn simplify_whitespace(input: &str) -> String { + let s = MANY_WHITESPACE.replace_all(input, " "); + let s = UNNECESSARY_LS_WHITESPACE.replace_all(s.borrow(), "\n"); + UNNECESSARY_LE_WHITESPACE + .replace_all(s.borrow(), "\n") + .into_owned() +} + +pub fn simplify_newlines(input: &str) -> String { + THREE_OR_MORE_NEWLINES + .replace_all(&input.replace("\r", ""), "\n\n") + .into_owned() +} + +#[cfg(test)] +mod test { + use crate::{simplify_newlines, simplify_whitespace}; + + #[test] + pub fn test_simplify_whitespace() { + assert_eq!( + simplify_whitespace("hello cat\tdog \t bat"), + "hello cat dog bat" + ); + } + + #[test] + pub fn test_simplify_newlines() { + assert_eq!( + simplify_newlines("hello\n\n\n\nare\n\n\nyou\n\n\n\n\n\n\t\n\n\nthere?"), + "hello\n\nare\n\nyou\n\nthere?" + ); + } +} diff --git a/quickpeep_moz_readability/src/lib.rs b/quickpeep_moz_readability/src/lib.rs index 85935cf..4b6aa1a 100644 --- a/quickpeep_moz_readability/src/lib.rs +++ b/quickpeep_moz_readability/src/lib.rs @@ -60,7 +60,8 @@ const DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [&str; 5] = ["table", "th", "td", "hr", " pub mod regexes; pub struct Readability { - root_node: NodeRef, + /// Left-over document. Note that readable article pieces are detached from the parent. 
+ pub root_node: NodeRef, byline: Option, article_title: String, pub article_node: Option, @@ -77,8 +78,12 @@ struct SizeInfo { impl Readability { pub fn new(html_str: &str) -> Self { + Self::new_from_node(kuchiki::parse_html().one(html_str)) + } + + pub fn new_from_node(root_node: NodeRef) -> Self { Self { - root_node: kuchiki::parse_html().one(html_str), + root_node, byline: None, article_title: "".into(), article_node: None, @@ -87,6 +92,7 @@ impl Readability { metadata: MetaData::new(), } } + pub fn parse(&mut self, url: &str) -> anyhow::Result<()> { self.unwrap_no_script_tags(); self.remove_scripts(); diff --git a/quickpeep_structs/Cargo.toml b/quickpeep_structs/Cargo.toml new file mode 100644 index 0000000..fee54ad --- /dev/null +++ b/quickpeep_structs/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "quickpeep_structs" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +bitflags = "1.3.2" +#arc-interner = "0.7.0" +quickpeep_densedoc = { path = "../quickpeep_densedoc" } \ No newline at end of file diff --git a/quickpeep_structs/src/lib.rs b/quickpeep_structs/src/lib.rs new file mode 100644 index 0000000..5ff9ef5 --- /dev/null +++ b/quickpeep_structs/src/lib.rs @@ -0,0 +1 @@ +pub mod rake_entries; diff --git a/quickpeep_structs/src/rake_entries.rs b/quickpeep_structs/src/rake_entries.rs new file mode 100644 index 0000000..7389b70 --- /dev/null +++ b/quickpeep_structs/src/rake_entries.rs @@ -0,0 +1,24 @@ +use bitflags::bitflags; + +bitflags! { + pub struct AnalysisAntifeatures: u8 { + /// Adverts are present on the page, according to a filter. + const ADVERTS = 0x01; + /// Some things are blocked due to privacy concerns, according to a filter. + const PRIVACY = 0x02; + /// Annoying cookie nags are present on this page, according to a cosmetic filter. + const COOKIE_NAG = 0x04; + /// Unspecified annoyances are present on this page, according to a cosmetic filter. + const ANNOYANCE = 0x08; + + /// The web page was served over CloudFlare at the time of indexing, which is not in the + /// spirit of decentralisation. + const CLOUDFLARE = 0x10; + } +} + +pub struct RakedPageEntry { + pub analysed_antifeatures: AnalysisAntifeatures, + //pub article: Option, + //pub non_article: Option, +} diff --git a/scripts/get_cf_ips.sh b/scripts/get_cf_ips.sh new file mode 100755 index 0000000..057287b --- /dev/null +++ b/scripts/get_cf_ips.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +set -eu + +dir_path="$(dirname "$0")" + +mkdir -p "$dir_path/../data" +wget -O "$dir_path/../data/cf_ips_v4.txt" https://www.cloudflare.com/ips-v4 +wget -O "$dir_path/../data/cf_ips_v6.txt" https://www.cloudflare.com/ips-v6 +echo "\n" >> "$dir_path/../data/cf_ips_v4.txt" +cat "$dir_path/../data/cf_ips_v4.txt" "$dir_path/../data/cf_ips_v6.txt" > "$dir_path/../data/cf_ips.txt" +rm "$dir_path/../data/cf_ips_v4.txt" "$dir_path/../data/cf_ips_v6.txt"
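
Note (illustrative, not part of the patch): a minimal sketch of the DenseTree → BARE round trip that the size comparison in rake_html_page hints at. It only uses APIs declared above (DenseTree::from_body, DenseTree::generate_textual_format, serde_bare::to_vec); serde_bare::from_slice is assumed to be the matching decode function in the serde_bare crate.

use kuchiki::traits::TendrilSink;
use quickpeep_densedoc::DenseTree;

fn main() {
    let html = r#"<html><body><h1>Hi</h1><p>Some <a href="/x">linked</a> text.</p></body></html>"#;
    let root = kuchiki::parse_html().one(html);

    // Flatten the DOM into the compact DenseTree representation.
    let dense = DenseTree::from_body(root);
    println!("{}", DenseTree::generate_textual_format(&dense));

    // BARE-encode the nodes and decode them again.
    let bytes = serde_bare::to_vec(&dense).expect("serialise");
    let decoded: Vec<DenseTree> = serde_bare::from_slice(&bytes).expect("deserialise");
    println!("{} bytes for {} top-level nodes", bytes.len(), decoded.len());
}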
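
A similarly hedged sketch of a downstream consumer of the new AnalysisAntifeatures flags and RakedPageEntry; only names declared in rake_entries.rs are used, and the particular flags accumulated here are illustrative.

use quickpeep_structs::rake_entries::{AnalysisAntifeatures, RakedPageEntry};

fn main() {
    // Accumulate flags as individual analyses trip.
    let mut flags = AnalysisAntifeatures::empty();
    flags |= AnalysisAntifeatures::COOKIE_NAG; // a cosmetic filter matched
    flags |= AnalysisAntifeatures::CLOUDFLARE; // the remote address was in the CF IP set

    let entry = RakedPageEntry {
        analysed_antifeatures: flags,
    };

    assert!(entry
        .analysed_antifeatures
        .contains(AnalysisAntifeatures::CLOUDFLARE));
    assert!(!entry
        .analysed_antifeatures
        .contains(AnalysisAntifeatures::ADVERTS));
}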