Add a lot more foundational work for raking

rei/minimum
Olivier 'reivilibre' 2022-03-13 21:33:03 +00:00
parent 210e8ef10a
commit a1097ef183
14 changed files with 934 additions and 112 deletions

3
.gitignore vendored
View File

@ -1,2 +1,3 @@
.idea
.idea
data/cf_ips.txt

44
Cargo.lock generated
View File

@ -612,6 +612,15 @@ version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35e70ee094dc02fd9c13fdad4940090f22dbd6ac7c9e7094a46cf0232a50bc7c"
[[package]]
name = "ipnetwork"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4088d739b183546b239688ddbc79891831df421773df95e236daf7867866d355"
dependencies = [
"serde",
]
[[package]]
name = "itertools"
version = "0.10.3"
@ -1066,18 +1075,36 @@ dependencies = [
"cylon",
"env_logger",
"feed-rs",
"futures-util",
"gemini-fetch",
"html5ever",
"ipnetwork",
"itertools",
"kuchiki",
"lazy_static",
"log",
"quickpeep_densedoc",
"quickpeep_moz_readability",
"quickpeep_structs",
"reqwest",
"serde",
"serde_bare",
"sitemap",
"tokio",
]
[[package]]
name = "quickpeep_densedoc"
version = "0.1.0"
dependencies = [
"anyhow",
"html5ever",
"kuchiki",
"lazy_static",
"regex",
"serde",
]
[[package]]
name = "quickpeep_moz_readability"
version = "0.1.0"
@ -1091,6 +1118,14 @@ dependencies = [
"url",
]
[[package]]
name = "quickpeep_structs"
version = "0.1.0"
dependencies = [
"bitflags",
"quickpeep_densedoc",
]
[[package]]
name = "quote"
version = "1.0.15"
@ -1388,6 +1423,15 @@ dependencies = [
"serde_derive",
]
[[package]]
name = "serde_bare"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51c55386eed0f1ae957b091dc2ca8122f287b60c79c774cbe3d5f2b69fded660"
dependencies = [
"serde",
]
[[package]]
name = "serde_derive"
version = "1.0.136"

View File

@ -1,7 +1,9 @@
[workspace]
members = [
"quickpeep",
"quickpeep_moz_readability"
"quickpeep_densedoc",
"quickpeep_moz_readability",
"quickpeep_structs"
]

View File

@ -11,12 +11,14 @@ anyhow = "1.0.55"
log = "0.4.14"
env_logger = "0.9.0"
quickpeep_moz_readability = { path = "../quickpeep_moz_readability" }
quickpeep_densedoc = { path = "../quickpeep_densedoc" }
# TODO: why do we need these here?
kuchiki = "0.8.1"
html5ever = "0.25.1"
serde = { version = "1.0.136", features = ["derive"] }
serde_bare = "0.5.0"
chrono = "0.4.19"
@ -24,9 +26,12 @@ lazy_static = "1.4.0"
bytes = "1.1.0"
# TODO: rkyv and memmap2 should be an efficient way to load index packs into processes.
# rkyv = "0.7.35"
# memmap2 = "0.5.3"
itertools = "0.10.3"
quickpeep_structs = { path = "../quickpeep_structs" }
ipnetwork = "0.18.0"
futures-util = "0.3.21"
### Raking helpers
# HTTP Requests

View File

@ -1,25 +1,77 @@
use quickpeep::raking::rake;
use adblock::lists::RuleTypes;
use anyhow::Context;
use quickpeep::raking::analysis::{load_adblock_engine, IpSet};
use quickpeep::raking::RakeIntent;
use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use reqwest::redirect::Policy;
use reqwest::Url;
use std::str::FromStr;
use tokio::fs::File;
#[tokio::main]
pub async fn main() -> anyhow::Result<()> {
let client = reqwest::Client::new();
// TODO max timeout, max body size
rake(
&Url::from_str("http://nothings.org/gamedev/ssao/")?,
RakeIntent::Page,
&client,
)
.await?;
let mut header_map = HeaderMap::new();
header_map.insert(USER_AGENT, HeaderValue::from_static(RAKER_USER_AGENT));
rake(
&Url::from_str("https://github.com/kuchiki-rs/kuchiki")?,
RakeIntent::Page,
&client,
)
.await?;
let client = reqwest::ClientBuilder::new()
.timeout(TIME_LIMIT)
.default_headers(header_map)
// TODO We want to handle redirects ourselves so we can track them...
.redirect(Policy::none())
.build()?;
// TODO Don't hardcode these paths in quite as bad a way...
let adblock_file = File::open("./cosmetic_filters.adblock")
.await
.context("Failed to open cosmetic filters file")?;
let adblock_engines = vec![(
AnalysisAntifeatures::ANNOYANCE,
load_adblock_engine(adblock_file, RuleTypes::CosmeticOnly).await?,
)];
let mut antifeature_ip_set = IpSet::new();
let ips_file = File::open("./data/cf_ips.txt")
.await
.context("Failed to open CF IPs file")?;
antifeature_ip_set.add_all_from_file(ips_file).await?;
let raker = Raker {
adblock_engines,
antifeature_ip_set,
};
// raker.rake(
// &Url::from_str("http://nothings.org/gamedev/ssao/")?,
// RakeIntent::Page,
// &client,
// )
// .await?;
//
// raker.rake(
// &Url::from_str("https://github.com/kuchiki-rs/kuchiki")?,
// RakeIntent::Page,
// &client,
// )
// .await?;
raker
.rake(
&Url::from_str("https://www.thesprucepets.com/")?,
RakeIntent::Page,
&client,
)
.await?;
raker
.rake(
&Url::from_str("https://matrix.org/")?,
RakeIntent::Page,
&client,
)
.await?;
Ok(())
}

View File

@ -1,17 +1,33 @@
use crate::raking::analysis::{analyse_with_ad_block_cosmetic_filter, IpSet};
use adblock::engine::Engine;
use anyhow::{bail, Context};
use bytes::Bytes;
use chrono::{DateTime, FixedOffset, Utc};
use cylon::Cylon;
use futures_util::stream::StreamExt;
use html5ever::tendril::fmt::Slice;
use html5ever::QualName;
use kuchiki::traits::TendrilSink;
use kuchiki::NodeRef;
use lazy_static::lazy_static;
use log::debug;
use reqwest::{Client, Url};
use quickpeep_densedoc::DenseTree;
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
use reqwest::{Client, Response, Url};
use serde::{Deserialize, Serialize};
use sitemap::reader::SiteMapEntity;
use std::collections::HashSet;
use std::time::Duration;
use tokio::time::Instant;
mod analysis;
pub mod analysis;
pub const USER_AGENT: &'static str = "QuickPeepBot";
/// 4 MiB ought to be enough for anybody.
pub const SIZE_LIMIT: usize = 4 * 1024 * 1024;
/// If it's not loaded in ten seconds, that's pretty severe.
/// 10 seconds is almost too generous (assuming that the best of things can run slowly sometimes).
pub const TIME_LIMIT: Duration = Duration::from_secs(10);
pub const RAKER_USER_AGENT: &'static str = "QuickPeepBot";
pub enum RakeOutcome {
RakedPage(RakedPage),
@ -81,85 +97,169 @@ lazy_static! {
]);
}
pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Result<RakeOutcome> {
let response = client.get(url.clone()).send().await?;
async fn response_to_bytes_limited(
mut response: Response,
size_limit: usize,
time_limit: Duration,
) -> anyhow::Result<Vec<u8>> {
let deadline = Instant::now() + time_limit;
let mut buffer = Vec::new();
let mut bytestream = response.bytes_stream();
if !response.status().is_success() {
bail!("Not successful: {:?}", response.status().as_u16());
}
let content_type = if let Some(content_type) = response.headers().get("content-type") {
let content_type = content_type
.to_str()
.context("Can't convert content-type to str")?;
eprintln!("CT {:?}", content_type);
content_type.split(";").next().unwrap().trim().to_owned()
} else {
return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure {
reason: TemporaryFailureReason::MissingInformation("content-type".to_owned()),
backoff_sec: 86400 * 7,
}));
};
let content = response.bytes().await?;
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) {
match rake_html_page(&content, url) {
Ok(page_rake) => {
return Ok(RakeOutcome::RakedPage(page_rake));
}
Err(error) => {
debug!("Failed to rake HTML page: {:?}", error);
loop {
tokio::select! {
next_chunk = bytestream.next() => {
match next_chunk {
Some(next_chunk) => {
buffer.extend_from_slice(next_chunk?.as_bytes());
if buffer.len() > size_limit {
bail!("Exceeds size limit");
}
},
None => {
// Finished! :)
break;
}
}
},
_ = tokio::time::sleep_until(deadline) => {
bail!("Exceeded time limit");
}
}
}
if FEED_MIME_TYPES.contains(content_type.as_str())
&& (intent == RakeIntent::Any || intent == RakeIntent::Feed)
{
match rake_feed(&content, url) {
Ok(feed) => {
return Ok(RakeOutcome::RakedFeed(feed));
}
Err(error) => {
debug!("Failed to rake as feed: {:?}", error);
}
}
}
if SITEMAP_MIME_TYPES.contains(content_type.as_str())
&& (intent == RakeIntent::Any || intent == RakeIntent::SiteMap)
{
match rake_sitemap(&content) {
Ok(sitemap) => {
return Ok(RakeOutcome::RakedSitemap(sitemap));
}
Err(error) => {
debug!("Failed to rake as sitemap: {:?}", error);
}
}
}
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()),
}));
Ok(buffer)
}
pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<RakedPage> {
let content_str = std::str::from_utf8(content)?;
/// State needed to rake pages: the filter engines and IP ranges used for antifeature analysis.
pub struct Raker {
/// Adblock engines, each paired with the antifeature flag it is associated with.
pub adblock_engines: Vec<(AnalysisAntifeatures, Engine)>,
/// IP networks that the responding server's address is tested against.
pub antifeature_ip_set: IpSet,
}
let mut readability = quickpeep_moz_readability::Readability::new(content_str);
readability
.parse(url.as_str())
.context("failed to analyse readability")?;
impl Raker {
pub async fn rake(
&self,
url: &Url,
intent: RakeIntent,
client: &Client,
) -> anyhow::Result<RakeOutcome> {
let response = client.get(url.clone()).send().await?;
eprintln!("{:#?}", readability.metadata);
if let Some(remote_addr) = response.remote_addr() {
eprintln!("rA {:?}", remote_addr);
let is_cf = self.antifeature_ip_set.contains(remote_addr.ip());
eprintln!("CF? {:?}", is_cf);
}
if let Some(node) = readability.article_node {
eprintln!("{}", node.to_string());
if !response.status().is_success() {
bail!("Not successful: {:?}", response.status().as_u16());
}
let content_type = if let Some(content_type) = response.headers().get("content-type") {
let content_type = content_type
.to_str()
.context("Can't convert content-type to str")?;
eprintln!("CT {:?}", content_type);
content_type.split(";").next().unwrap().trim().to_owned()
} else {
return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure {
reason: TemporaryFailureReason::MissingInformation("content-type".to_owned()),
backoff_sec: 86400 * 7,
}));
};
let content = response_to_bytes_limited(response, SIZE_LIMIT, TIME_LIMIT).await?;
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page)
{
match self.rake_html_page(&content, url) {
Ok(page_rake) => {
return Ok(RakeOutcome::RakedPage(page_rake));
}
Err(error) => {
debug!("Failed to rake HTML page: {:?}", error);
}
}
}
if FEED_MIME_TYPES.contains(content_type.as_str())
&& (intent == RakeIntent::Any || intent == RakeIntent::Feed)
{
match rake_feed(&content, url) {
Ok(feed) => {
return Ok(RakeOutcome::RakedFeed(feed));
}
Err(error) => {
debug!("Failed to rake as feed: {:?}", error);
}
}
}
if SITEMAP_MIME_TYPES.contains(content_type.as_str())
&& (intent == RakeIntent::Any || intent == RakeIntent::SiteMap)
{
match rake_sitemap(&content) {
Ok(sitemap) => {
return Ok(RakeOutcome::RakedSitemap(sitemap));
}
Err(error) => {
debug!("Failed to rake as sitemap: {:?}", error);
}
}
}
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()),
}));
}
Ok(todo!())
/// Rakes a single HTML page.
///
/// The page is parsed once; every adblock engine is then run over the tree with
/// removal enabled, so elements matched by a cosmetic filter are detached before
/// the dense document and the readability analysis are produced from the same tree.
///
/// # Errors
///
/// Fails if `content` is not valid UTF-8, if the readability analysis fails, or if
/// the dense document cannot be serialised. A failing cosmetic filter analysis is
/// only reported on stderr and does not abort the rake.
pub fn rake_html_page(&self, content: &[u8], url: &Url) -> anyhow::Result<RakedPage> {
    let content_str = std::str::from_utf8(content)?;

    let root_node: NodeRef = kuchiki::parse_html().one(content_str);

    let mut antifeature_flags = AnalysisAntifeatures::empty();

    for (engine_antifeature_flag, adblock_engine) in &self.adblock_engines {
        match analyse_with_ad_block_cosmetic_filter(
            &root_node,
            adblock_engine,
            url.as_str(),
            true,
        ) {
            Ok(cosmetic_filters_tripped) => {
                eprintln!("?cosmetic filters tripped: {}", cosmetic_filters_tripped);
                // Only record the antifeature when a filter actually matched; a
                // successful analysis with no matches must not flag the page.
                if cosmetic_filters_tripped {
                    antifeature_flags |= *engine_antifeature_flag;
                }
            }
            Err(err) => {
                eprintln!("Cosmetic Filter Err {:?}", err);
            }
        };
    }

    // TODO: store the flags in the RakedPage once it has fields; until then they
    // are only reported alongside the other debug output.
    eprintln!("AF {:?}", antifeature_flags);

    let dense_doc = DenseTree::from_body(root_node.clone());
    let dense_doc_text = DenseTree::generate_textual_format(&dense_doc);
    eprintln!("~~~~~\n{}\n~~~~~", dense_doc_text);
    eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc);

    let mut readability = quickpeep_moz_readability::Readability::new_from_node(root_node);
    readability
        .parse(url.as_str())
        .context("failed to analyse readability")?;

    eprintln!("{:#?}", readability.metadata);

    if let Some(_node) = readability.article_node {
        //eprintln!("{}", node.to_string());
    }

    let bare_size = serde_bare::to_vec(&dense_doc)?.len();
    eprintln!("CS {:?}{:?}", content.len(), bare_size);

    Ok(RakedPage {
        // TODO
    })
}
}
pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
@ -286,7 +386,7 @@ pub async fn decode_robots_txt(bytes: &[u8]) -> anyhow::Result<Option<RobotsTxt>
}
}
let rules = cylon::Compiler::new(USER_AGENT)
let rules = cylon::Compiler::new(RAKER_USER_AGENT)
.compile(bytes.as_bytes())
.await?;

View File

@ -1,14 +1,16 @@
use adblock::filters::cosmetic::CosmeticFilter;
use anyhow::anyhow;
use adblock::engine::Engine;
use adblock::lists::{ParseOptions, RuleTypes};
use anyhow::Context;
use ipnetwork::IpNetwork;
use kuchiki::NodeRef;
use log::debug;
use std::path::Path;
use tokio::fs::File;
use std::collections::{BTreeSet, HashSet};
use std::net::IpAddr;
use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
pub async fn load_cosmetic_filters<R: AsyncRead + Unpin>(
pub async fn load_adblock_engine<R: AsyncRead + Unpin>(
reader: R,
) -> anyhow::Result<Vec<CosmeticFilter>> {
rule_types: RuleTypes,
) -> anyhow::Result<Engine> {
let mut br = BufReader::new(reader);
let mut rules = Vec::new();
let mut buf = String::new();
@ -17,27 +19,172 @@ pub async fn load_cosmetic_filters<R: AsyncRead + Unpin>(
if br.read_line(&mut buf).await? == 0 {
break;
}
if let Ok(rule) = CosmeticFilter::parse(&buf, false) {
rules.push(rule);
rules.push(buf.trim().to_owned());
}
Ok(Engine::from_rules(
&rules,
ParseOptions {
format: Default::default(),
include_redirect_urls: false,
rule_types,
},
))
}
// Relevant:
// https://github.com/brave/adblock-rust/issues/152#issuecomment-771259069
/// The distinct `class` names and `id` values collected from the elements of a page.
pub struct ExtractedClassesAndIds {
// Each class name appears once; the order is unspecified because the values come out of a HashSet.
classes: Vec<String>,
// Each id appears once; the order is unspecified because the values come out of a HashSet.
ids: Vec<String>,
}
pub fn extract_classes_and_ids_from_page(root: &NodeRef) -> ExtractedClassesAndIds {
let mut class_set = HashSet::new();
let mut id_set = HashSet::new();
for node in root.inclusive_descendants() {
if let Some(element) = node.0.as_element() {
let attrs = element.attributes.borrow();
if let Some(id) = attrs.get("id") {
id_set.insert(id.to_owned());
}
if let Some(classes) = attrs.get("class") {
for class in classes.trim().split_whitespace() {
class_set.insert(class.to_owned());
}
}
}
}
Ok(rules)
ExtractedClassesAndIds {
classes: class_set.into_iter().collect(),
ids: id_set.into_iter().collect(),
}
}
pub fn analyse_with_ad_block_cosmetic_filter(
root: NodeRef,
filters: &Vec<CosmeticFilter>,
root: &NodeRef,
engine: &Engine,
url: &str,
remove: bool,
) -> anyhow::Result<bool> {
let mut matches = 0;
for rule in filters {
for ele in root
.select(&rule.selector)
.map_err(|_| anyhow!("Failed to select(..)"))?
{
debug!("Cosmetic Filter {:?} Matches {:?}", rule, ele);
matches += 1;
let url_resources = engine.url_cosmetic_resources(url);
let specialist_hide_selectors = if !url_resources.generichide {
let ExtractedClassesAndIds { classes, ids } = extract_classes_and_ids_from_page(root);
//eprintln!("ID {:#?}", ids);
//eprintln!("CC {:#?}", classes);
engine.hidden_class_id_selectors(&classes, &ids, &url_resources.exceptions)
} else {
Vec::with_capacity(0)
};
//eprintln!("UR {:#?}", url_resources);
//eprintln!("sHS {:#?}", specialist_hide_selectors);
//eprintln!("----");
for rule in itertools::chain(specialist_hide_selectors, url_resources.hide_selectors) {
if let Ok(result) = root.select(&rule) {
for ele in result {
eprintln!("Cosmetic Filter {:?} Matches {:?}", rule, ele);
matches += 1;
if remove {
ele.as_node().detach();
}
}
} else {
//eprintln!("(fail)");
}
}
Ok(matches > 0)
}
/// A set of IP networks (CIDR ranges) that single addresses can be tested against.
// TODO this isn't particularly efficient. Probably want a trie if it's important...
pub struct IpSet {
// Networks are inserted via `add`, which re-creates each one from its network (lowest) address
// and prefix so that the set's ordering is by that lowest address.
ips: BTreeSet<IpNetwork>,
}
impl IpSet {
pub fn new() -> IpSet {
IpSet {
ips: Default::default(),
}
}
pub async fn add_all_from_file<R: AsyncRead + Unpin>(
&mut self,
reader: R,
) -> anyhow::Result<()> {
let mut br = BufReader::new(reader);
let mut buf = String::new();
loop {
buf.clear();
if br.read_line(&mut buf).await? == 0 {
break;
}
let trimmed = buf.trim();
if trimmed.is_empty() {
continue;
}
let ip_net = trimmed
.parse::<IpNetwork>()
.context("Parsing CIDR IP range")?;
self.add(ip_net);
}
Ok(())
}
pub fn add(&mut self, network: IpNetwork) {
// We jump through a couple of hoops to make sure we store the lowest address in the network,
// since we use that for sorting.
self.ips
.insert(IpNetwork::new(network.network(), network.prefix()).unwrap());
}
pub fn contains(&self, addr: IpAddr) -> bool {
let prefix = if addr.is_ipv4() {
32
} else {
assert!(addr.is_ipv6());
128
};
let addr_as_net =
IpNetwork::new(addr, prefix).expect("Conversion to IpNetwork should be correct");
for ipnet in self.ips.range(..=addr_as_net).rev().next() {
if ipnet.contains(addr) {
return true;
}
}
false
}
}
#[cfg(test)]
mod test {
    use crate::raking::analysis::IpSet;
    use ipnetwork::IpNetwork;
    use std::net::IpAddr;
    use std::str::FromStr;

    /// Addresses inside a stored range are found; an address in the neighbouring /24 is not.
    #[test]
    pub fn test_ipset_contains() {
        let mut set = IpSet::new();
        for cidr in &["1.2.3.4/16", "1.1.2.3/16", "85.42.36.17/24"] {
            set.add(IpNetwork::from_str(cidr).unwrap());
        }

        let is_member = |text: &str| set.contains(IpAddr::from_str(text).unwrap());

        assert!(is_member("1.2.42.42"));
        assert!(is_member("85.42.36.14"));
        assert!(!is_member("85.42.37.14"));
    }
}

View File

@ -0,0 +1,14 @@
[package]
name = "quickpeep_densedoc"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = "1.0.56"
serde = { version = "1.0.136", features = ["derive"] }
kuchiki = "0.8.1"
html5ever = "0.25.1"
regex = "1.5.5"
lazy_static = "1.4.0"

View File

@ -0,0 +1,403 @@
use kuchiki::NodeRef;
use lazy_static::lazy_static;
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::borrow::Borrow;
use std::ops::Deref;
/// A document in dense form: head data plus the body as a list of `DenseTree` nodes.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct DenseDocument {
// Head-level data for the document.
head: DenseHead,
// Top-level nodes of the body, in order.
body: Vec<DenseTree>,
}
impl DenseDocument {
/// Placeholder for building a dense document from a parsed document root.
///
/// Not implemented yet: the body is `todo!()`, so calling this panics, and the signature
/// currently returns `()` rather than a `DenseDocument`.
pub fn from_document(root_node: NodeRef) {
todo!()
}
}
/// Head-level data stored in a `DenseDocument`.
// NOTE(review): nothing in this file populates these fields yet; the meanings below are taken
// from the field names only and should be confirmed once `from_document` is implemented.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct DenseHead {
// Presumably the document's title.
title: String,
// Presumably URLs of feeds advertised by the document.
feed_urls: Vec<String>,
// TODO how best to expose this?? We actually don't care about storing it though ...
// Probably move to the raker.
// Currently a unit placeholder that carries no data.
canonical: (), // TODO I'm sure we'd benefit by digging up some metadata, but that's possibly for later :)
}
/// A node of the dense document tree: headings, links, images and text only.
///
/// All other HTML structure is either unwrapped into its children or pruned when the tree is
/// built by `DenseTreeBuilder`.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub enum DenseTree {
// Headings built from `h1` to `h6` elements; each holds the heading's child nodes.
Heading1(Vec<DenseTree>),
Heading2(Vec<DenseTree>),
Heading3(Vec<DenseTree>),
Heading4(Vec<DenseTree>),
Heading5(Vec<DenseTree>),
Heading6(Vec<DenseTree>),
// A hyperlink built from an `a` element.
Link {
// The nodes inside the link.
children: Vec<DenseTree>,
// Value of the `href` attribute (empty if the attribute is absent).
href: String,
// True when the `rel` attribute contains the word `nofollow` (ASCII case-insensitive).
nofollow: bool,
},
// An image built from an `img` element.
Image {
// Value of the `src` attribute (empty if the attribute is absent).
src: String,
// Value of the `alt` attribute, trimmed and with whitespace simplified.
alt: String,
// title? I don't know if it'd be very useful.
},
// A run of text.
Text(String),
}
impl DenseTree {
    /// Converts the children of `body_node` into a simplified list of dense nodes.
    pub fn from_body(body_node: NodeRef) -> Vec<DenseTree> {
        let mut builder = DenseTreeBuilder::new();
        builder.add_children_of_node(body_node);
        builder.into_tree()
    }

    /// Returns `true` if this node is a [`DenseTree::Text`].
    pub fn is_text(&self) -> bool {
        matches!(self, DenseTree::Text(_))
    }

    /// Renders `nodes` as Markdown-like text.
    ///
    /// Headings become `#`-prefixed lines, links become `[text](href)` and images become
    /// `[IMG]`. Runs of blank lines in the result are collapsed by `simplify_newlines`.
    pub fn generate_textual_format(nodes: &[DenseTree]) -> String {
        let mut buf = String::new();
        for node in nodes {
            node.append_in_textual_format(&mut buf);
        }
        simplify_newlines(&buf)
    }

    /// Appends the textual rendering of this node (and its children) to `string`.
    fn append_in_textual_format(&self, string: &mut String) {
        match self {
            DenseTree::Heading1(children) => Self::append_heading(1, children, string),
            DenseTree::Heading2(children) => Self::append_heading(2, children, string),
            DenseTree::Heading3(children) => Self::append_heading(3, children, string),
            DenseTree::Heading4(children) => Self::append_heading(4, children, string),
            DenseTree::Heading5(children) => Self::append_heading(5, children, string),
            DenseTree::Heading6(children) => Self::append_heading(6, children, string),
            DenseTree::Link { children, href, .. } => {
                string.push('[');
                for child in children {
                    child.append_in_textual_format(string);
                }
                string.push_str("](");
                string.push_str(href);
                string.push(')');
            }
            DenseTree::Image { .. } => {
                string.push_str("[IMG]");
            }
            DenseTree::Text(text) => {
                string.push_str(text);
            }
        }
    }

    /// Appends a heading of the given `level`: a blank line, `level` hashes, a space, the
    /// rendered children and a final newline.
    fn append_heading(level: usize, children: &[DenseTree], string: &mut String) {
        string.push_str("\n\n");
        string.extend(std::iter::repeat('#').take(level));
        string.push(' ');
        for child in children {
            child.append_in_textual_format(string);
        }
        string.push('\n');
    }
}
/// Accumulates `DenseTree` nodes for one list of siblings while walking an HTML subtree.
struct DenseTreeBuilder {
/// Siblings in the buffer.
nodes: Vec<DenseTree>,
/// Number of preceding newlines at the end of the buffer.
/// Used for generating text that preserves some vague structure.
preceding_newlines: u32,
}
impl DenseTreeBuilder {
    /// Creates a builder with an empty buffer and no trailing newlines recorded.
    pub fn new() -> Self {
        DenseTreeBuilder {
            nodes: Vec::new(),
            preceding_newlines: 0,
        }
    }

    /// Finishes building: simplifies the buffered nodes and returns them.
    pub fn into_tree(mut self) -> Vec<DenseTree> {
        self.simplify();
        self.nodes
    }

    /// Simplify the DenseTree nodes: coalesce adjacent Text nodes, normalise the whitespace
    /// inside every Text node, and trim the outer edges of the buffer.
    pub fn simplify(&mut self) {
        // Coalesce adjacent text nodes in a single pass by rebuilding the buffer.
        let mut merged: Vec<DenseTree> = Vec::with_capacity(self.nodes.len());
        for node in std::mem::take(&mut self.nodes) {
            if let DenseTree::Text(extra) = &node {
                if let Some(DenseTree::Text(previous)) = merged.last_mut() {
                    previous.push_str(extra);
                    continue;
                }
            }
            merged.push(node);
        }
        self.nodes = merged;

        for node in &mut self.nodes {
            if let DenseTree::Text(text) = node {
                // Coalesce newlines so there are never more than 2 in a row.
                *text = simplify_newlines(&simplify_whitespace(text.as_str()));
            }
        }

        // Trim the outer edges of the buffer. Using first/last (rather than requiring more
        // than one node) means a buffer holding a single text node is trimmed on both sides.
        if let Some(DenseTree::Text(text)) = self.nodes.first_mut() {
            *text = text.trim_start().to_owned();
        }
        if let Some(DenseTree::Text(text)) = self.nodes.last_mut() {
            *text = text.trim_end().to_owned();
        }
    }

    /// Convert a HTML node's children into DenseTree nodes.
    ///
    /// Headings, links and images become their own nodes; `p`/`pre`, `div`/`li` and `br`
    /// contribute newlines around their content; `script`, `style`, `svg` and `noscript`
    /// are dropped; every other element is unwrapped into its children.
    pub fn add_children_of_node(&mut self, node: NodeRef) {
        for child in node.children() {
            if let Some(element) = child.as_element() {
                let tag: &str = element.name.local.deref();
                match tag {
                    "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
                        // Pick the variant constructor before `child` is moved into
                        // `from_body`, because `tag` borrows from `child`.
                        let make_heading: fn(Vec<DenseTree>) -> DenseTree = match tag {
                            "h1" => DenseTree::Heading1,
                            "h2" => DenseTree::Heading2,
                            "h3" => DenseTree::Heading3,
                            "h4" => DenseTree::Heading4,
                            "h5" => DenseTree::Heading5,
                            _ => DenseTree::Heading6,
                        };
                        self.nodes.push(make_heading(DenseTree::from_body(child)));
                        // A heading renders with a trailing line break of its own.
                        self.preceding_newlines = 2;
                    }
                    "a" => {
                        let attrs = element.attributes.borrow();
                        let href = attrs.get("href").unwrap_or("").to_owned();

                        if href.starts_with("javascript:") || href.starts_with("data:") {
                            // Skip this link. Just unwrap it.
                            self.add_children_of_node(child.clone());
                            continue;
                        }

                        let nofollow = attrs
                            .get("rel")
                            .map(|rel: &str| {
                                rel.split_whitespace()
                                    .any(|rel_word: &str| rel_word.eq_ignore_ascii_case("nofollow"))
                            })
                            .unwrap_or(false);
                        // Release the attribute borrow before handing `child` over.
                        drop(attrs);

                        self.nodes.push(DenseTree::Link {
                            children: DenseTree::from_body(child),
                            href,
                            nofollow,
                        });

                        self.preceding_newlines = 0;
                    }
                    "img" => {
                        // TODO Decide if this is worth the space...
                        let attrs = element.attributes.borrow();
                        let src = attrs.get("src").unwrap_or("").to_owned();

                        if src.starts_with("javascript:") || src.starts_with("data:") {
                            // Skip this image.
                            continue;
                        }

                        let alt = simplify_whitespace(attrs.get("alt").unwrap_or("").trim());

                        self.nodes.push(DenseTree::Image { src, alt });
                        // The buffer now ends with the image rather than with newlines.
                        self.preceding_newlines = 0;
                    }
                    "p" | "pre" => {
                        // Paragraphs must have 2 preceding newlines.
                        self.ensure_preceding_newlines(2);

                        self.add_children_of_node(child);

                        // Paragraphs must have 2 trailing newlines.
                        self.ensure_preceding_newlines(2);
                    }
                    "br" => {
                        self.nodes.push(DenseTree::Text("\n".to_owned()));
                        self.preceding_newlines += 1;
                    }
                    "div" | "li" => {
                        // Divs must have 1 preceding newline.
                        self.ensure_preceding_newlines(1);

                        self.add_children_of_node(child);

                        // Divs must have 1 trailing newline.
                        self.ensure_preceding_newlines(1);
                    }
                    "script" | "style" | "svg" | "noscript" => {
                        // We just prune these, as we don't want them.
                        // (noscript tends just to be noisy 'enable JS now!!' messages, so prune
                        // those too.)
                        continue;
                    }
                    _ => {
                        // Simply unwrap the unknown element.
                        self.add_children_of_node(child);
                    }
                }
            } else if let Some(text) = child.as_text() {
                // Line breaks inside HTML text are not significant, so they become spaces.
                let text_to_add =
                    simplify_whitespace(&simplify_newlines(&text.borrow().replace("\n", " ")));
                self.preceding_newlines =
                    text_to_add.chars().rev().take_while(|c| *c == '\n').count() as u32;
                self.nodes.push(DenseTree::Text(text_to_add));
            }
        }
    }

    /// Pads the buffer with newlines so that it ends with at least `wanted` of them,
    /// according to the `preceding_newlines` bookkeeping.
    fn ensure_preceding_newlines(&mut self, wanted: u32) {
        if self.preceding_newlines < wanted {
            let missing = (wanted - self.preceding_newlines) as usize;
            self.nodes.push(DenseTree::Text("\n".repeat(missing)));
            self.preceding_newlines = wanted;
        }
    }
}
lazy_static! {
    /// A run of spaces and/or tabs.
    static ref MANY_WHITESPACE: Regex = Regex::new(r"[ \t]+").unwrap();
    /// A newline, at least one more space/tab/newline, then a newline: i.e. three or more
    /// newlines, or newlines separated only by horizontal whitespace.
    static ref THREE_OR_MORE_NEWLINES: Regex = Regex::new(r"\n+[ \t\n]+\n+").unwrap();
    /// Horizontal whitespace (or stray carriage returns) at the start of a line.
    ///
    /// This deliberately does not use `\s`: that class also matches `\n`, which would fold
    /// consecutive newlines into one and erase paragraph breaks.
    static ref UNNECESSARY_LS_WHITESPACE: Regex = Regex::new(r"\n[ \t\r]+").unwrap();
    /// Horizontal whitespace (or stray carriage returns) at the end of a line.
    ///
    /// As above, `\n` is excluded from the class so that blank lines survive.
    static ref UNNECESSARY_LE_WHITESPACE: Regex = Regex::new(r"[ \t\r]+\n").unwrap();
}
/// Normalises whitespace in `input`.
///
/// Runs of spaces and tabs become a single space, then whitespace directly after a newline
/// and whitespace directly before a newline is removed.
pub fn simplify_whitespace(input: &str) -> String {
    let collapsed = MANY_WHITESPACE.replace_all(input, " ");
    let line_starts_cleaned = UNNECESSARY_LS_WHITESPACE.replace_all(collapsed.borrow(), "\n");
    let line_ends_cleaned =
        UNNECESSARY_LE_WHITESPACE.replace_all(line_starts_cleaned.borrow(), "\n");
    line_ends_cleaned.into_owned()
}
/// Removes carriage returns from `input` and replaces every match of
/// `THREE_OR_MORE_NEWLINES` with exactly two newlines.
pub fn simplify_newlines(input: &str) -> String {
    let without_carriage_returns = input.replace("\r", "");
    let collapsed = THREE_OR_MORE_NEWLINES.replace_all(&without_carriage_returns, "\n\n");
    collapsed.into_owned()
}
#[cfg(test)]
mod test {
    use crate::{simplify_newlines, simplify_whitespace};

    /// Mixed runs of spaces and tabs collapse to single spaces.
    #[test]
    pub fn test_simplify_whitespace() {
        let simplified = simplify_whitespace("hello cat\tdog \t bat");
        assert_eq!(simplified, "hello cat dog bat");
    }

    /// Long runs of newlines (optionally with a tab in between) collapse to one blank line.
    #[test]
    pub fn test_simplify_newlines() {
        let simplified =
            simplify_newlines("hello\n\n\n\nare\n\n\nyou\n\n\n\n\n\n\t\n\n\nthere?");
        assert_eq!(simplified, "hello\n\nare\n\nyou\n\nthere?");
    }
}

View File

@ -60,7 +60,8 @@ const DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [&str; 5] = ["table", "th", "td", "hr", "
pub mod regexes;
pub struct Readability {
root_node: NodeRef,
/// Left-over document. Note that readable article pieces are detached from the parent.
pub root_node: NodeRef,
byline: Option<String>,
article_title: String,
pub article_node: Option<NodeRef>,
@ -77,8 +78,12 @@ struct SizeInfo {
impl Readability {
pub fn new(html_str: &str) -> Self {
Self::new_from_node(kuchiki::parse_html().one(html_str))
}
pub fn new_from_node(root_node: NodeRef) -> Self {
Self {
root_node: kuchiki::parse_html().one(html_str),
root_node,
byline: None,
article_title: "".into(),
article_node: None,
@ -87,6 +92,7 @@ impl Readability {
metadata: MetaData::new(),
}
}
pub fn parse(&mut self, url: &str) -> anyhow::Result<()> {
self.unwrap_no_script_tags();
self.remove_scripts();

View File

@ -0,0 +1,11 @@
[package]
name = "quickpeep_structs"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
bitflags = "1.3.2"
#arc-interner = "0.7.0"
quickpeep_densedoc = { path = "../quickpeep_densedoc" }

View File

@ -0,0 +1 @@
pub mod rake_entries;

View File

@ -0,0 +1,24 @@
use bitflags::bitflags;
bitflags! {
/// Set of antifeatures found while analysing a page, stored as individual bits of a `u8`.
pub struct AnalysisAntifeatures: u8 {
/// Adverts are present on the page, according to a filter.
const ADVERTS = 0x01;
/// Some things are blocked due to privacy concerns, according to a filter.
const PRIVACY = 0x02;
/// Annoying cookie nags are present on this page, according to a cosmetic filter.
const COOKIE_NAG = 0x04;
/// Unspecified annoyances are present on this page, according to a cosmetic filter.
const ANNOYANCE = 0x08;
/// The web page was served over CloudFlare at the time of indexing, which is not in the
/// spirit of decentralisation.
const CLOUDFLARE = 0x10;
}
}
/// Entry describing a raked page; at present it carries only the antifeature flags.
pub struct RakedPageEntry {
/// Antifeatures recorded for the page.
pub analysed_antifeatures: AnalysisAntifeatures,
// Placeholders for the page content, not enabled yet.
//pub article: Option<DenseTree>,
//pub non_article: Option<DenseTree>,
}

12
scripts/get_cf_ips.sh Executable file
View File

@ -0,0 +1,12 @@
#!/bin/sh
# Downloads CloudFlare's published IPv4 and IPv6 ranges and combines them into
# data/cf_ips.txt (one CIDR range per line), relative to this script's directory.

set -eu

dir_path="$(dirname "$0")"

mkdir -p "$dir_path/../data"

wget -O "$dir_path/../data/cf_ips_v4.txt" https://www.cloudflare.com/ips-v4
wget -O "$dir_path/../data/cf_ips_v6.txt" https://www.cloudflare.com/ips-v6

# Make sure the v4 list ends with a line break before the v6 list is appended.
# printf is used because `echo "\n"` is not portable: shells whose echo does not
# interpret escapes would append a literal backslash-n line to the list.
printf '\n' >> "$dir_path/../data/cf_ips_v4.txt"

cat "$dir_path/../data/cf_ips_v4.txt" "$dir_path/../data/cf_ips_v6.txt" > "$dir_path/../data/cf_ips.txt"
rm "$dir_path/../data/cf_ips_v4.txt" "$dir_path/../data/cf_ips_v6.txt"