Add a lot more foundational work for raking

2022-03-13 21:33:03 +00:00 · 2022-03-13 21:33:03 +00:00 · a1097ef183
commit a1097ef183
parent 210e8ef10a
14 changed files with 934 additions and 112 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,3 @@

-.idea
+.idea
+data/cf_ips.txt
--- a/Cargo.lock
+++ b/Cargo.lock
@ -612,6 +612,15 @@ version = "2.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "35e70ee094dc02fd9c13fdad4940090f22dbd6ac7c9e7094a46cf0232a50bc7c"

+[[package]]
+name = "ipnetwork"
+version = "0.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4088d739b183546b239688ddbc79891831df421773df95e236daf7867866d355"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "itertools"
 version = "0.10.3"
@ -1066,18 +1075,36 @@ dependencies = [
 "cylon",
 "env_logger",
 "feed-rs",
+ "futures-util",
 "gemini-fetch",
 "html5ever",
+ "ipnetwork",
+ "itertools",
 "kuchiki",
 "lazy_static",
 "log",
+ "quickpeep_densedoc",
 "quickpeep_moz_readability",
+ "quickpeep_structs",
 "reqwest",
 "serde",
+ "serde_bare",
 "sitemap",
 "tokio",
 ]

+[[package]]
+name = "quickpeep_densedoc"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "html5ever",
+ "kuchiki",
+ "lazy_static",
+ "regex",
+ "serde",
+]
+
 [[package]]
 name = "quickpeep_moz_readability"
 version = "0.1.0"
@ -1091,6 +1118,14 @@ dependencies = [
 "url",
 ]

+[[package]]
+name = "quickpeep_structs"
+version = "0.1.0"
+dependencies = [
+ "bitflags",
+ "quickpeep_densedoc",
+]
+
 [[package]]
 name = "quote"
 version = "1.0.15"
@ -1388,6 +1423,15 @@ dependencies = [
 "serde_derive",
 ]

+[[package]]
+name = "serde_bare"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51c55386eed0f1ae957b091dc2ca8122f287b60c79c774cbe3d5f2b69fded660"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "serde_derive"
 version = "1.0.136"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,7 +1,9 @@
 [workspace]
 members = [
    "quickpeep",
-    "quickpeep_moz_readability"
+    "quickpeep_densedoc",
+    "quickpeep_moz_readability",
+    "quickpeep_structs"
 ]


--- a/quickpeep/Cargo.toml
+++ b/quickpeep/Cargo.toml
@ -11,12 +11,14 @@ anyhow = "1.0.55"
 log = "0.4.14"
 env_logger = "0.9.0"
 quickpeep_moz_readability = { path = "../quickpeep_moz_readability" }
+quickpeep_densedoc = { path = "../quickpeep_densedoc" }

 # TODO: why do we need these here?
 kuchiki = "0.8.1"
 html5ever = "0.25.1"

 serde = { version = "1.0.136", features = ["derive"] }
+serde_bare = "0.5.0"

 chrono = "0.4.19"

@ -24,9 +26,12 @@ lazy_static = "1.4.0"

 bytes = "1.1.0"

-# TODO: rkyv and memmap2 should be an efficient way to load index packs into processes.
-# rkyv = "0.7.35"
-# memmap2 = "0.5.3"
+itertools = "0.10.3"
+
+quickpeep_structs = { path = "../quickpeep_structs" }
+ipnetwork = "0.18.0"
+
+futures-util = "0.3.21"

 ### Raking helpers
 # HTTP Requests
--- a/quickpeep/src/bin/qp-rake.rs
+++ b/quickpeep/src/bin/qp-rake.rs
@ -1,25 +1,77 @@
-use quickpeep::raking::rake;
+use adblock::lists::RuleTypes;
+use anyhow::Context;
+use quickpeep::raking::analysis::{load_adblock_engine, IpSet};
 use quickpeep::raking::RakeIntent;
+use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
+use quickpeep_structs::rake_entries::AnalysisAntifeatures;
+use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
+use reqwest::redirect::Policy;
 use reqwest::Url;
 use std::str::FromStr;
+use tokio::fs::File;

 #[tokio::main]
 pub async fn main() -> anyhow::Result<()> {
-    let client = reqwest::Client::new();
-    // TODO max timeout, max body size
-    rake(
-        &Url::from_str("http://nothings.org/gamedev/ssao/")?,
-        RakeIntent::Page,
-        &client,
-    )
-    .await?;
+    let mut header_map = HeaderMap::new();
+    header_map.insert(USER_AGENT, HeaderValue::from_static(RAKER_USER_AGENT));

-    rake(
-        &Url::from_str("https://github.com/kuchiki-rs/kuchiki")?,
-        RakeIntent::Page,
-        &client,
-    )
-    .await?;
+    let client = reqwest::ClientBuilder::new()
+        .timeout(TIME_LIMIT)
+        .default_headers(header_map)
+        // TODO We want to handle redirects ourselves so we can track them...
+        .redirect(Policy::none())
+        .build()?;
+
+    // TODO Don't hardcode these paths in quite as bad a way...
+    let adblock_file = File::open("./cosmetic_filters.adblock")
+        .await
+        .context("Failed to open cosmetic filters file")?;
+    let adblock_engines = vec![(
+        AnalysisAntifeatures::ANNOYANCE,
+        load_adblock_engine(adblock_file, RuleTypes::CosmeticOnly).await?,
+    )];
+
+    let mut antifeature_ip_set = IpSet::new();
+
+    let ips_file = File::open("./data/cf_ips.txt")
+        .await
+        .context("Failed to open CF IPs file")?;
+    antifeature_ip_set.add_all_from_file(ips_file).await?;
+
+    let raker = Raker {
+        adblock_engines,
+        antifeature_ip_set,
+    };
+
+    // raker.rake(
+    //     &Url::from_str("http://nothings.org/gamedev/ssao/")?,
+    //     RakeIntent::Page,
+    //     &client,
+    // )
+    // .await?;
+    //
+    // raker.rake(
+    //     &Url::from_str("https://github.com/kuchiki-rs/kuchiki")?,
+    //     RakeIntent::Page,
+    //     &client,
+    // )
+    //     .await?;
+
+    raker
+        .rake(
+            &Url::from_str("https://www.thesprucepets.com/")?,
+            RakeIntent::Page,
+            &client,
+        )
+        .await?;
+
+    raker
+        .rake(
+            &Url::from_str("https://matrix.org/")?,
+            RakeIntent::Page,
+            &client,
+        )
+        .await?;

    Ok(())
 }
--- a/quickpeep/src/raking.rs
+++ b/quickpeep/src/raking.rs
@ -1,17 +1,33 @@
+use crate::raking::analysis::{analyse_with_ad_block_cosmetic_filter, IpSet};
+use adblock::engine::Engine;
 use anyhow::{bail, Context};
+use bytes::Bytes;
 use chrono::{DateTime, FixedOffset, Utc};
 use cylon::Cylon;
+use futures_util::stream::StreamExt;
 use html5ever::tendril::fmt::Slice;
+use html5ever::QualName;
+use kuchiki::traits::TendrilSink;
+use kuchiki::NodeRef;
 use lazy_static::lazy_static;
 use log::debug;
-use reqwest::{Client, Url};
+use quickpeep_densedoc::DenseTree;
+use quickpeep_structs::rake_entries::AnalysisAntifeatures;
+use reqwest::{Client, Response, Url};
 use serde::{Deserialize, Serialize};
 use sitemap::reader::SiteMapEntity;
 use std::collections::HashSet;
+use std::time::Duration;
+use tokio::time::Instant;

-mod analysis;
+pub mod analysis;

-pub const USER_AGENT: &'static str = "QuickPeepBot";
+/// 4 MiB ought to be enough for anybody.
+pub const SIZE_LIMIT: usize = 4 * 1024 * 1024;
+/// If it's not loaded in ten seconds, that's pretty severe.
+/// 10 seconds is almost too generous (assuming that the best of things can run slowly sometimes).
+pub const TIME_LIMIT: Duration = Duration::from_secs(10);
+pub const RAKER_USER_AGENT: &'static str = "QuickPeepBot";

 pub enum RakeOutcome {
    RakedPage(RakedPage),
@ -81,85 +97,169 @@ lazy_static! {
    ]);
 }

-pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Result<RakeOutcome> {
-    let response = client.get(url.clone()).send().await?;
+async fn response_to_bytes_limited(
+    mut response: Response,
+    size_limit: usize,
+    time_limit: Duration,
+) -> anyhow::Result<Vec<u8>> {
+    let deadline = Instant::now() + time_limit;
+    let mut buffer = Vec::new();
+    let mut bytestream = response.bytes_stream();

-    if !response.status().is_success() {
-        bail!("Not successful: {:?}", response.status().as_u16());
-    }
-
-    let content_type = if let Some(content_type) = response.headers().get("content-type") {
-        let content_type = content_type
-            .to_str()
-            .context("Can't convert content-type to str")?;
-        eprintln!("CT {:?}", content_type);
-        content_type.split(";").next().unwrap().trim().to_owned()
-    } else {
-        return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure {
-            reason: TemporaryFailureReason::MissingInformation("content-type".to_owned()),
-            backoff_sec: 86400 * 7,
-        }));
-    };
-
-    let content = response.bytes().await?;
-
-    if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) {
-        match rake_html_page(&content, url) {
-            Ok(page_rake) => {
-                return Ok(RakeOutcome::RakedPage(page_rake));
-            }
-            Err(error) => {
-                debug!("Failed to rake HTML page: {:?}", error);
+    loop {
+        tokio::select! {
+            next_chunk = bytestream.next() => {
+                match next_chunk {
+                    Some(next_chunk) => {
+                        buffer.extend_from_slice(next_chunk?.as_bytes());
+                        if buffer.len() > size_limit {
+                            bail!("Exceeds size limit");
+                        }
+                    },
+                    None => {
+                        // Finished! :)
+                        break;
+                    }
+                }
+            },
+            _ = tokio::time::sleep_until(deadline) => {
+                bail!("Exceeded time limit");
            }
        }
    }

-    if FEED_MIME_TYPES.contains(content_type.as_str())
-        && (intent == RakeIntent::Any || intent == RakeIntent::Feed)
-    {
-        match rake_feed(&content, url) {
-            Ok(feed) => {
-                return Ok(RakeOutcome::RakedFeed(feed));
-            }
-            Err(error) => {
-                debug!("Failed to rake as feed: {:?}", error);
-            }
-        }
-    }
-
-    if SITEMAP_MIME_TYPES.contains(content_type.as_str())
-        && (intent == RakeIntent::Any || intent == RakeIntent::SiteMap)
-    {
-        match rake_sitemap(&content) {
-            Ok(sitemap) => {
-                return Ok(RakeOutcome::RakedSitemap(sitemap));
-            }
-            Err(error) => {
-                debug!("Failed to rake as sitemap: {:?}", error);
-            }
-        }
-    }
-
-    return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
-        reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()),
-    }));
+    Ok(buffer)
 }

-pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<RakedPage> {
-    let content_str = std::str::from_utf8(content)?;
+/// Bundles the analysis state consulted by [`Raker::rake`] and [`Raker::rake_html_page`].
+pub struct Raker {
+    /// Adblock engines run against each HTML page, each paired with the antifeature flag
+    /// associated with that engine.
+    pub adblock_engines: Vec<(AnalysisAntifeatures, Engine)>,
+    /// IP networks that the remote address of each response is checked against.
+    pub antifeature_ip_set: IpSet,
+}

-    let mut readability = quickpeep_moz_readability::Readability::new(content_str);
-    readability
-        .parse(url.as_str())
-        .context("failed to analyse readability")?;
+impl Raker {
+    pub async fn rake(
+        &self,
+        url: &Url,
+        intent: RakeIntent,
+        client: &Client,
+    ) -> anyhow::Result<RakeOutcome> {
+        let response = client.get(url.clone()).send().await?;

-    eprintln!("{:#?}", readability.metadata);
+        if let Some(remote_addr) = response.remote_addr() {
+            eprintln!("rA {:?}", remote_addr);
+            let is_cf = self.antifeature_ip_set.contains(remote_addr.ip());
+            eprintln!("CF? {:?}", is_cf);
+        }

-    if let Some(node) = readability.article_node {
-        eprintln!("{}", node.to_string());
+        if !response.status().is_success() {
+            bail!("Not successful: {:?}", response.status().as_u16());
+        }
+
+        let content_type = if let Some(content_type) = response.headers().get("content-type") {
+            let content_type = content_type
+                .to_str()
+                .context("Can't convert content-type to str")?;
+            eprintln!("CT {:?}", content_type);
+            content_type.split(";").next().unwrap().trim().to_owned()
+        } else {
+            return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure {
+                reason: TemporaryFailureReason::MissingInformation("content-type".to_owned()),
+                backoff_sec: 86400 * 7,
+            }));
+        };
+
+        let content = response_to_bytes_limited(response, SIZE_LIMIT, TIME_LIMIT).await?;
+
+        if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page)
+        {
+            match self.rake_html_page(&content, url) {
+                Ok(page_rake) => {
+                    return Ok(RakeOutcome::RakedPage(page_rake));
+                }
+                Err(error) => {
+                    debug!("Failed to rake HTML page: {:?}", error);
+                }
+            }
+        }
+
+        if FEED_MIME_TYPES.contains(content_type.as_str())
+            && (intent == RakeIntent::Any || intent == RakeIntent::Feed)
+        {
+            match rake_feed(&content, url) {
+                Ok(feed) => {
+                    return Ok(RakeOutcome::RakedFeed(feed));
+                }
+                Err(error) => {
+                    debug!("Failed to rake as feed: {:?}", error);
+                }
+            }
+        }
+
+        if SITEMAP_MIME_TYPES.contains(content_type.as_str())
+            && (intent == RakeIntent::Any || intent == RakeIntent::SiteMap)
+        {
+            match rake_sitemap(&content) {
+                Ok(sitemap) => {
+                    return Ok(RakeOutcome::RakedSitemap(sitemap));
+                }
+                Err(error) => {
+                    debug!("Failed to rake as sitemap: {:?}", error);
+                }
+            }
+        }
+
+        return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
+            reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()),
+        }));
    }

-    Ok(todo!())
+    /// Analyses a fetched HTML page and produces its [`RakedPage`].
+    ///
+    /// `content` is the raw response body and must be valid UTF-8; `url` is the page's address,
+    /// which is handed to the adblock engines and to the readability analysis.
+    ///
+    /// Every configured adblock engine is run over the parsed document with removal enabled,
+    /// before the dense document and the readability analysis are built from `root_node`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if `content` is not valid UTF-8, if the readability analysis fails, or if the dense
+    /// document cannot be serialised. An adblock engine error is only printed, not propagated.
+    pub fn rake_html_page(&self, content: &[u8], url: &Url) -> anyhow::Result<RakedPage> {
+        let content_str = std::str::from_utf8(content)?;
+
+        let root_node: NodeRef = kuchiki::parse_html().one(content_str);
+
+        let mut antifeature_flags = AnalysisAntifeatures::empty();
+
+        for (engine_antifeature_flag, adblock_engine) in &self.adblock_engines {
+            match analyse_with_ad_block_cosmetic_filter(
+                &root_node,
+                adblock_engine,
+                url.as_str(),
+                true,
+            ) {
+                Ok(cosmetic_filters_tripped) => {
+                    eprintln!("?cosmetic filters tripped: {}", cosmetic_filters_tripped);
+                    // Only flag the page when this engine actually matched something: a clean
+                    // run must not mark the page with the engine's antifeature.
+                    if cosmetic_filters_tripped {
+                        antifeature_flags |= *engine_antifeature_flag;
+                    }
+                }
+                Err(err) => {
+                    eprintln!("Cosmetic Filter Err {:?}", err);
+                }
+            };
+        }
+
+        let dense_doc = DenseTree::from_body(root_node.clone());
+        let dense_doc_text = DenseTree::generate_textual_format(&dense_doc);
+        eprintln!("~~~~~\n{}\n~~~~~", dense_doc_text);
+        eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc);
+
+        let mut readability = quickpeep_moz_readability::Readability::new_from_node(root_node);
+        readability
+            .parse(url.as_str())
+            .context("failed to analyse readability")?;
+
+        eprintln!("{:#?}", readability.metadata);
+
+        if let Some(node) = readability.article_node {
+            //eprintln!("{}", node.to_string());
+        }
+
+        // Report how the serialised dense document compares in size with the raw page.
+        let bare_size = serde_bare::to_vec(&dense_doc)?.len();
+        eprintln!("CS {:?} → {:?}", content.len(), bare_size);
+
+        Ok(RakedPage {
+            // TODO
+        })
+    }
 }

 pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
@ -286,7 +386,7 @@ pub async fn decode_robots_txt(bytes: &[u8]) -> anyhow::Result<Option<RobotsTxt>
        }
    }

-    let rules = cylon::Compiler::new(USER_AGENT)
+    let rules = cylon::Compiler::new(RAKER_USER_AGENT)
        .compile(bytes.as_bytes())
        .await?;

--- a/quickpeep/src/raking/analysis.rs
+++ b/quickpeep/src/raking/analysis.rs
@ -1,14 +1,16 @@
-use adblock::filters::cosmetic::CosmeticFilter;
-use anyhow::anyhow;
+use adblock::engine::Engine;
+use adblock::lists::{ParseOptions, RuleTypes};
+use anyhow::Context;
+use ipnetwork::IpNetwork;
 use kuchiki::NodeRef;
-use log::debug;
-use std::path::Path;
-use tokio::fs::File;
+use std::collections::{BTreeSet, HashSet};
+use std::net::IpAddr;
 use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};

-pub async fn load_cosmetic_filters<R: AsyncRead + Unpin>(
+pub async fn load_adblock_engine<R: AsyncRead + Unpin>(
    reader: R,
-) -> anyhow::Result<Vec<CosmeticFilter>> {
+    rule_types: RuleTypes,
+) -> anyhow::Result<Engine> {
    let mut br = BufReader::new(reader);
    let mut rules = Vec::new();
    let mut buf = String::new();
@ -17,27 +19,172 @@ pub async fn load_cosmetic_filters<R: AsyncRead + Unpin>(
        if br.read_line(&mut buf).await? == 0 {
            break;
        }
-        if let Ok(rule) = CosmeticFilter::parse(&buf, false) {
-            rules.push(rule);
+        rules.push(buf.trim().to_owned());
+    }
+    Ok(Engine::from_rules(
+        &rules,
+        ParseOptions {
+            format: Default::default(),
+            include_redirect_urls: false,
+            rule_types,
+        },
+    ))
+}
+
+// Relevant:
+// https://github.com/brave/adblock-rust/issues/152#issuecomment-771259069
+
+/// The class names and element IDs collected from a page by
+/// [`extract_classes_and_ids_from_page`].
+pub struct ExtractedClassesAndIds {
+    /// Class names found in `class` attributes, without duplicates (in no particular order).
+    classes: Vec<String>,
+    /// Values of `id` attributes, without duplicates (in no particular order).
+    ids: Vec<String>,
+}
+
+pub fn extract_classes_and_ids_from_page(root: &NodeRef) -> ExtractedClassesAndIds {
+    let mut class_set = HashSet::new();
+    let mut id_set = HashSet::new();
+
+    for node in root.inclusive_descendants() {
+        if let Some(element) = node.0.as_element() {
+            let attrs = element.attributes.borrow();
+            if let Some(id) = attrs.get("id") {
+                id_set.insert(id.to_owned());
+            }
+            if let Some(classes) = attrs.get("class") {
+                for class in classes.trim().split_whitespace() {
+                    class_set.insert(class.to_owned());
+                }
+            }
        }
    }

-    Ok(rules)
+    ExtractedClassesAndIds {
+        classes: class_set.into_iter().collect(),
+        ids: id_set.into_iter().collect(),
+    }
 }

 pub fn analyse_with_ad_block_cosmetic_filter(
-    root: NodeRef,
-    filters: &Vec<CosmeticFilter>,
+    root: &NodeRef,
+    engine: &Engine,
+    url: &str,
+    remove: bool,
 ) -> anyhow::Result<bool> {
    let mut matches = 0;
-    for rule in filters {
-        for ele in root
-            .select(&rule.selector)
-            .map_err(|_| anyhow!("Failed to select(..)"))?
-        {
-            debug!("Cosmetic Filter {:?} Matches {:?}", rule, ele);
-            matches += 1;
+
+    let url_resources = engine.url_cosmetic_resources(url);
+    let specialist_hide_selectors = if !url_resources.generichide {
+        let ExtractedClassesAndIds { classes, ids } = extract_classes_and_ids_from_page(root);
+
+        //eprintln!("ID {:#?}", ids);
+        //eprintln!("CC {:#?}", classes);
+
+        engine.hidden_class_id_selectors(&classes, &ids, &url_resources.exceptions)
+    } else {
+        Vec::with_capacity(0)
+    };
+
+    //eprintln!("UR {:#?}", url_resources);
+    //eprintln!("sHS {:#?}", specialist_hide_selectors);
+    //eprintln!("----");
+
+    for rule in itertools::chain(specialist_hide_selectors, url_resources.hide_selectors) {
+        if let Ok(result) = root.select(&rule) {
+            for ele in result {
+                eprintln!("Cosmetic Filter {:?} Matches {:?}", rule, ele);
+                matches += 1;
+                if remove {
+                    ele.as_node().detach();
+                }
+            }
+        } else {
+            //eprintln!("(fail)");
        }
    }
    Ok(matches > 0)
 }
+
+// TODO this isn't particularly efficient. Probably want a trie if it's important...
+/// A set of IP networks (CIDR ranges) that individual addresses can be tested against.
+pub struct IpSet {
+    /// The stored networks, ordered by `IpNetwork`'s `Ord` implementation.
+    ips: BTreeSet<IpNetwork>,
+}
+
+impl IpSet {
+    /// Creates an empty set.
+    pub fn new() -> IpSet {
+        IpSet {
+            ips: Default::default(),
+        }
+    }
+
+    /// Reads one CIDR range per line from `reader` and adds each of them to the set.
+    ///
+    /// Surrounding whitespace is ignored and blank lines are skipped.
+    ///
+    /// # Errors
+    ///
+    /// Fails on a read error or on the first line that does not parse as an [`IpNetwork`].
+    pub async fn add_all_from_file<R: AsyncRead + Unpin>(
+        &mut self,
+        reader: R,
+    ) -> anyhow::Result<()> {
+        let mut br = BufReader::new(reader);
+
+        // One buffer reused for every line.
+        let mut buf = String::new();
+        loop {
+            buf.clear();
+            if br.read_line(&mut buf).await? == 0 {
+                break;
+            }
+
+            let trimmed = buf.trim();
+
+            if trimmed.is_empty() {
+                continue;
+            }
+
+            let ip_net = trimmed
+                .parse::<IpNetwork>()
+                .context("Parsing CIDR IP range")?;
+            self.add(ip_net);
+        }
+
+        Ok(())
+    }
+
+    /// Adds `network` to the set.
+    ///
+    /// The network is stored under its lowest (network) address, so the same range written
+    /// with different host bits collapses to a single entry and `contains` can find it with an
+    /// exact lookup.
+    pub fn add(&mut self, network: IpNetwork) {
+        let normalised = IpNetwork::new(network.network(), network.prefix())
+            .expect("a network's own prefix is valid for its own network address");
+        self.ips.insert(normalised);
+    }
+
+    /// Returns `true` if `addr` lies inside any network in the set.
+    ///
+    /// For every possible prefix length (most specific first) the network that would contain
+    /// `addr` at that length is computed and looked up exactly. Unlike inspecting only the
+    /// nearest lower entry, this stays correct when stored networks are nested or overlap
+    /// (e.g. `10.0.0.0/8` together with `10.1.0.0/24`), at the cost of at most 33 (IPv4) or
+    /// 129 (IPv6) set lookups.
+    pub fn contains(&self, addr: IpAddr) -> bool {
+        let max_prefix: u8 = if addr.is_ipv4() { 32 } else { 128 };
+
+        (0..=max_prefix).rev().any(|prefix| {
+            IpNetwork::new(addr, prefix)
+                // Normalise exactly as `add` does so the lookup key matches the stored form.
+                .and_then(|net| IpNetwork::new(net.network(), prefix))
+                .map(|candidate| self.ips.contains(&candidate))
+                .unwrap_or(false)
+        })
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::raking::analysis::IpSet;
+    use ipnetwork::IpNetwork;
+    use std::net::IpAddr;
+    use std::str::FromStr;
+
+    /// Checks membership lookups against a small set of IPv4 networks.
+    #[test]
+    pub fn test_ipset_contains() {
+        let mut set = IpSet::new();
+        // The ranges are written with host bits set; `IpSet::add` stores them under their
+        // network address.
+        set.add(IpNetwork::from_str("1.2.3.4/16").unwrap());
+        set.add(IpNetwork::from_str("1.1.2.3/16").unwrap());
+        set.add(IpNetwork::from_str("85.42.36.17/24").unwrap());
+
+        // Addresses inside 1.2.0.0/16 and 85.42.36.0/24 respectively.
+        assert!(set.contains(IpAddr::from_str("1.2.42.42").unwrap()));
+        assert!(set.contains(IpAddr::from_str("85.42.36.14").unwrap()));
+        // Just outside 85.42.36.0/24.
+        assert!(!set.contains(IpAddr::from_str("85.42.37.14").unwrap()));
+    }
+}
--- a/quickpeep_densedoc/Cargo.toml
+++ b/quickpeep_densedoc/Cargo.toml
@ -0,0 +1,14 @@
+[package]
+name = "quickpeep_densedoc"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+anyhow = "1.0.56"
+serde = { version = "1.0.136", features = ["derive"] }
+kuchiki = "0.8.1"
+html5ever = "0.25.1"
+regex = "1.5.5"
+lazy_static = "1.4.0"
--- a/quickpeep_densedoc/src/lib.rs
+++ b/quickpeep_densedoc/src/lib.rs
@ -0,0 +1,403 @@
+use kuchiki::NodeRef;
+use lazy_static::lazy_static;
+use regex::Regex;
+use serde::{Deserialize, Serialize};
+use std::borrow::Borrow;
+use std::ops::Deref;
+
+/// A serialisable document made up of a [`DenseHead`] and a list of body [`DenseTree`] nodes.
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct DenseDocument {
+    /// Head section of the document.
+    head: DenseHead,
+    /// Top-level nodes of the document body.
+    body: Vec<DenseTree>,
+}
+
+impl DenseDocument {
+    /// Not implemented yet: calling this currently panics via `todo!()`.
+    ///
+    /// NOTE(review): presumably meant to build a `DenseDocument` from a parsed HTML root node,
+    /// but the signature does not return one yet — confirm the intended return type.
+    pub fn from_document(root_node: NodeRef) {
+        todo!()
+    }
+}
+
+/// Head section of a [`DenseDocument`].
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct DenseHead {
+    /// Document title (presumably taken from the page's `<title>` — nothing populates it yet).
+    title: String,
+    /// URLs of feeds associated with the document (not populated by any visible code yet).
+    feed_urls: Vec<String>,
+    // TODO how best to expose this?? We actually don't care about storing it though ...
+    //      Probably move to the raker.
+    /// Placeholder for the canonical URL; currently a unit value that carries no data.
+    canonical: (), // TODO I'm sure we'd benefit by digging up some metadata, but that's possibly for later :)
+}
+
+/// A node of the dense document representation built from an HTML tree.
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub enum DenseTree {
+    /// Content of an `h1` element.
+    Heading1(Vec<DenseTree>),
+    /// Content of an `h2` element.
+    Heading2(Vec<DenseTree>),
+    /// Content of an `h3` element.
+    Heading3(Vec<DenseTree>),
+    /// Content of an `h4` element.
+    Heading4(Vec<DenseTree>),
+    /// Content of an `h5` element.
+    Heading5(Vec<DenseTree>),
+    /// Content of an `h6` element.
+    Heading6(Vec<DenseTree>),
+    /// An `a` element.
+    Link {
+        /// Content of the link.
+        children: Vec<DenseTree>,
+        /// Value of the `href` attribute (empty if the attribute is absent).
+        href: String,
+        /// Whether the `rel` attribute contains the word `nofollow` (ASCII case-insensitive).
+        nofollow: bool,
+    },
+    /// An `img` element.
+    Image {
+        /// Value of the `src` attribute (empty if the attribute is absent).
+        src: String,
+        /// Trimmed, whitespace-simplified value of the `alt` attribute.
+        alt: String,
+        // title? I don't know if it'd be very useful.
+    },
+    /// A run of text.
+    Text(String),
+}
+
+impl DenseTree {
+    /// Builds the dense nodes for the children of `body_node`.
+    pub fn from_body(body_node: NodeRef) -> Vec<DenseTree> {
+        let mut builder = DenseTreeBuilder::new();
+        builder.add_children_of_node(body_node);
+        builder.into_tree()
+    }
+
+    /// Returns `true` if this node is a [`DenseTree::Text`] node.
+    pub fn is_text(&self) -> bool {
+        matches!(self, DenseTree::Text(_))
+    }
+
+    /// Renders `nodes` as Markdown-like text and passes the result through
+    /// `simplify_newlines`.
+    ///
+    /// Accepts any slice of nodes; a `&Vec<DenseTree>` coerces to it, so existing callers are
+    /// unaffected.
+    pub fn generate_textual_format(nodes: &[DenseTree]) -> String {
+        let mut buf = String::new();
+        for node in nodes {
+            node.append_in_textual_format(&mut buf);
+        }
+        simplify_newlines(&buf)
+    }
+
+    /// Appends this node's textual rendering to `string`.
+    fn append_in_textual_format(&self, string: &mut String) {
+        match self {
+            DenseTree::Heading1(children) => Self::append_heading(1, children, string),
+            DenseTree::Heading2(children) => Self::append_heading(2, children, string),
+            DenseTree::Heading3(children) => Self::append_heading(3, children, string),
+            DenseTree::Heading4(children) => Self::append_heading(4, children, string),
+            DenseTree::Heading5(children) => Self::append_heading(5, children, string),
+            DenseTree::Heading6(children) => Self::append_heading(6, children, string),
+            DenseTree::Link { children, href, .. } => {
+                string.push('[');
+                for child in children {
+                    child.append_in_textual_format(string);
+                }
+                // Appended piecewise to avoid a temporary `format!` allocation per link.
+                string.push_str("](");
+                string.push_str(href);
+                string.push(')');
+            }
+            DenseTree::Image { .. } => {
+                string.push_str("[IMG]");
+            }
+            DenseTree::Text(text) => {
+                string.push_str(text);
+            }
+        }
+    }
+
+    /// Appends a heading: a blank line, `level` `#` characters and a space, the rendered
+    /// children, then a newline.
+    fn append_heading(level: usize, children: &[DenseTree], string: &mut String) {
+        string.push_str("\n\n");
+        for _ in 0..level {
+            string.push('#');
+        }
+        string.push(' ');
+        for child in children {
+            child.append_in_textual_format(string);
+        }
+        string.push_str("\n");
+    }
+}
+
+/// Accumulates [`DenseTree`] nodes while the children of an HTML node are converted.
+struct DenseTreeBuilder {
+    /// Siblings in the buffer.
+    nodes: Vec<DenseTree>,
+
+    /// Number of preceding newlines at the end of the buffer.
+    /// Used for generating text that preserves some vague structure.
+    preceding_newlines: u32,
+}
+
+impl DenseTreeBuilder {
+    /// Creates a builder with an empty node buffer and no preceding newlines recorded.
+    pub fn new() -> Self {
+        Self {
+            nodes: Vec::new(),
+            preceding_newlines: 0,
+        }
+    }
+
+    /// Consumes the builder, simplifying the buffered nodes and handing them back.
+    pub fn into_tree(mut self) -> Vec<DenseTree> {
+        self.simplify();
+        let DenseTreeBuilder { nodes, .. } = self;
+        nodes
+    }
+
+    /// Simplify the buffered DenseTree nodes: coalesce adjacent Text nodes, normalise the
+    /// whitespace and newlines inside every Text node, then trim leading whitespace from the
+    /// first node and trailing whitespace from the last node (when those are Text nodes).
+    pub fn simplify(&mut self) {
+        // First coalesce runs of adjacent text nodes into one, in a single pass.
+        let mut merged: Vec<DenseTree> = Vec::with_capacity(self.nodes.len());
+        for node in std::mem::take(&mut self.nodes) {
+            if let DenseTree::Text(append_text) = &node {
+                if let Some(DenseTree::Text(previous)) = merged.last_mut() {
+                    // The previous sibling is text too: fold this node into it.
+                    previous.push_str(append_text);
+                    continue;
+                }
+            }
+            merged.push(node);
+        }
+        self.nodes = merged;
+
+        for node in &mut self.nodes {
+            if let DenseTree::Text(text) = node {
+                // Coalesce newlines so there are never more than 2 in a row.
+                *text = simplify_newlines(&simplify_whitespace(&text));
+            }
+        }
+
+        if let Some(DenseTree::Text(text)) = self.nodes.first_mut() {
+            *text = text.trim_start().to_owned();
+        }
+
+        // The last node may also be the first (a lone text node); it still must not keep
+        // trailing whitespace.
+        if let Some(DenseTree::Text(text)) = self.nodes.last_mut() {
+            *text = text.trim_end().to_owned();
+        }
+    }
+
+    /// Convert a HTML node's children into DenseTree nodes.
+    pub fn add_children_of_node(&mut self, node: NodeRef) {
+        for child in node.children() {
+            if let Some(element) = child.as_element() {
+                match element.name.local.deref() {
+                    "h1" => {
+                        self.nodes
+                            .push(DenseTree::Heading1(DenseTree::from_body(child)));
+                        self.preceding_newlines = 2;
+                    }
+                    "h2" => {
+                        self.nodes
+                            .push(DenseTree::Heading2(DenseTree::from_body(child)));
+                        self.preceding_newlines = 2;
+                    }
+                    "h3" => {
+                        self.nodes
+                            .push(DenseTree::Heading3(DenseTree::from_body(child)));
+                        self.preceding_newlines = 2;
+                    }
+                    "h4" => {
+                        self.nodes
+                            .push(DenseTree::Heading4(DenseTree::from_body(child)));
+                        self.preceding_newlines = 2;
+                    }
+                    "h5" => {
+                        self.nodes
+                            .push(DenseTree::Heading5(DenseTree::from_body(child)));
+                        self.preceding_newlines = 2;
+                    }
+                    "h6" => {
+                        self.nodes
+                            .push(DenseTree::Heading6(DenseTree::from_body(child)));
+                        self.preceding_newlines = 2;
+                    }
+                    "a" => {
+                        let attrs = element.attributes.borrow();
+                        let href = attrs.get("href").unwrap_or("").to_owned();
+
+                        if href.starts_with("javascript:") || href.starts_with("data:") {
+                            // Skip this link. Just unwrap it.
+                            self.add_children_of_node(child.clone());
+                            continue;
+                        }
+
+                        let nofollow = attrs
+                            .get("rel")
+                            .map(|rel: &str| {
+                                rel.split_whitespace()
+                                    .any(|rel_word: &str| rel_word.eq_ignore_ascii_case("nofollow"))
+                            })
+                            .unwrap_or(false);
+                        drop(attrs);
+
+                        self.nodes.push(DenseTree::Link {
+                            children: DenseTree::from_body(child),
+                            href,
+                            nofollow,
+                        });
+
+                        self.preceding_newlines = 0;
+                    }
+                    "img" => {
+                        // TODO Decide if this is worth the space...
+                        let attrs = element.attributes.borrow();
+                        let src = attrs.get("src").unwrap_or("").to_owned();
+
+                        if src.starts_with("javascript:") || src.starts_with("data:") {
+                            // Skip this image.
+                            continue;
+                        }
+
+                        let alt = simplify_whitespace(attrs.get("alt").unwrap_or("").trim());
+
+                        self.nodes.push(DenseTree::Image { src, alt });
+                    }
+                    "p" | "pre" => {
+                        // Paragraphs must have 2 preceding newlines.
+                        if self.preceding_newlines < 2 {
+                            self.nodes.push(DenseTree::Text(
+                                match self.preceding_newlines {
+                                    0 => "\n\n",
+                                    1 => "\n",
+                                    _ => unreachable!(),
+                                }
+                                .to_owned(),
+                            ));
+                            self.preceding_newlines = 2;
+                        }
+
+                        self.add_children_of_node(child);
+
+                        // Paragraphs must have 2 trailing newlines.
+                        if self.preceding_newlines < 2 {
+                            self.nodes.push(DenseTree::Text(
+                                match self.preceding_newlines {
+                                    0 => "\n\n",
+                                    1 => "\n",
+                                    _ => unreachable!(),
+                                }
+                                .to_owned(),
+                            ));
+                            self.preceding_newlines = 2;
+                        }
+                    }
+                    "br" => {
+                        self.nodes.push(DenseTree::Text("\n".to_owned()));
+                        self.preceding_newlines += 1;
+                    }
+                    "div" | "li" => {
+                        // Divs must have 1 preceding newline.
+                        if self.preceding_newlines < 1 {
+                            self.nodes.push(DenseTree::Text("\n".to_owned()));
+                            self.preceding_newlines = 1;
+                        }
+
+                        self.add_children_of_node(child);
+
+                        // Divs must have 1 trailing newline.
+                        if self.preceding_newlines < 1 {
+                            self.nodes.push(DenseTree::Text("\n".to_owned()));
+                            self.preceding_newlines = 1;
+                        }
+                    }
+                    "script" | "style" | "svg" | "noscript" => {
+                        // We just prune these, as we don't want them.
+                        // (noscript tends just to be noisy 'enable JS now!!' messages, so prune those too.)
+                        continue;
+                    }
+                    _ => {
+                        // Simply unwrap the unknown element.
+                        self.add_children_of_node(child);
+                    }
+                }
+                //element.name.local
+            } else if let Some(text) = child.as_text() {
+                let text_to_add =
+                    simplify_whitespace(&simplify_newlines(&text.borrow().replace("\n", " ")));
+                self.preceding_newlines =
+                    text_to_add.chars().rev().take_while(|c| *c == '\n').count() as u32;
+                self.nodes.push(DenseTree::Text(text_to_add));
+            }
+        }
+    }
+}
+
+// Pre-compiled patterns used by `simplify_whitespace` and `simplify_newlines`.
+lazy_static! {
+    // One or more consecutive spaces/tabs.
+    static ref MANY_WHITESPACE: Regex = Regex::new(r"[ \t]+").unwrap();
+    // A newline, then at least one space/tab/newline, then another newline.
+    // Despite the name this also matches e.g. newline-space-newline.
+    static ref THREE_OR_MORE_NEWLINES: Regex = Regex::new(r"\n+[ \t\n]+\n+").unwrap();
+    // Whitespace directly after a newline (presumably "LS" = line start).
+    // NOTE(review): `\s` also matches `\n`, so this folds consecutive newlines into one;
+    // confirm that is intended and that `[ \t]` was not meant instead.
+    static ref UNNECESSARY_LS_WHITESPACE: Regex = Regex::new(r"\n[ \s]+").unwrap();
+    // Whitespace directly before a newline (presumably "LE" = line end).
+    // NOTE(review): the `\s`-matches-`\n` caveat applies here too.
+    static ref UNNECESSARY_LE_WHITESPACE: Regex = Regex::new(r"[ \s]+\n").unwrap();
+}
+
+/// Normalises whitespace in `input` and returns the result as a new `String`.
+///
+/// Three passes are applied in order:
+/// 1. every `MANY_WHITESPACE` match (a run of spaces/tabs) becomes a single space;
+/// 2. every `UNNECESSARY_LS_WHITESPACE` match (a newline plus following whitespace) becomes `"\n"`;
+/// 3. every `UNNECESSARY_LE_WHITESPACE` match (whitespace plus a following newline) becomes `"\n"`.
+///
+/// Because `\s` in the last two patterns matches newlines as well, consecutive
+/// newlines are reduced to a single one by this function.
+pub fn simplify_whitespace(input: &str) -> String {
+    let collapsed = MANY_WHITESPACE.replace_all(input, " ");
+    let line_starts_trimmed = UNNECESSARY_LS_WHITESPACE.replace_all(&collapsed, "\n");
+    let line_ends_trimmed = UNNECESSARY_LE_WHITESPACE.replace_all(&line_starts_trimmed, "\n");
+    line_ends_trimmed.into_owned()
+}
+
+/// Removes every carriage return from `input`, then replaces each
+/// `THREE_OR_MORE_NEWLINES` match with exactly two newlines (one blank line).
+///
+/// Returns the result as a new `String`.
+pub fn simplify_newlines(input: &str) -> String {
+    let without_cr = input.replace("\r", "");
+    let squeezed = THREE_OR_MORE_NEWLINES.replace_all(&without_cr, "\n\n");
+    squeezed.into_owned()
+}
+
+#[cfg(test)]
+mod test {
+    use crate::{simplify_newlines, simplify_whitespace};
+
+    /// Runs of spaces and tabs collapse into single spaces.
+    #[test]
+    pub fn test_simplify_whitespace() {
+        let messy = "hello    cat\tdog \t bat";
+        let tidy = simplify_whitespace(messy);
+        assert_eq!(tidy, "hello cat dog bat");
+    }
+
+    /// Runs of three or more newlines (even with a tab in between) collapse into one blank line.
+    #[test]
+    pub fn test_simplify_newlines() {
+        let messy = "hello\n\n\n\nare\n\n\nyou\n\n\n\n\n\n\t\n\n\nthere?";
+        let tidy = simplify_newlines(messy);
+        assert_eq!(tidy, "hello\n\nare\n\nyou\n\nthere?");
+    }
+}
--- a/quickpeep_moz_readability/src/lib.rs
+++ b/quickpeep_moz_readability/src/lib.rs
@ -60,7 +60,8 @@ const DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [&str; 5] = ["table", "th", "td", "hr", "
 pub mod regexes;

 pub struct Readability {
-    root_node: NodeRef,
+    /// Left-over document. Note that readable article pieces are detached from the parent.
+    pub root_node: NodeRef,
    byline: Option<String>,
    article_title: String,
    pub article_node: Option<NodeRef>,
@ -77,8 +78,12 @@ struct SizeInfo {

 impl Readability {
    pub fn new(html_str: &str) -> Self {
+        Self::new_from_node(kuchiki::parse_html().one(html_str))
+    }
+
+    pub fn new_from_node(root_node: NodeRef) -> Self {
        Self {
-            root_node: kuchiki::parse_html().one(html_str),
+            root_node,
            byline: None,
            article_title: "".into(),
            article_node: None,
@ -87,6 +92,7 @@ impl Readability {
            metadata: MetaData::new(),
        }
    }
+
    pub fn parse(&mut self, url: &str) -> anyhow::Result<()> {
        self.unwrap_no_script_tags();
        self.remove_scripts();
--- a/quickpeep_structs/Cargo.toml
+++ b/quickpeep_structs/Cargo.toml
@ -0,0 +1,11 @@
+[package]
+name = "quickpeep_structs"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+bitflags = "1.3.2"
+#arc-interner = "0.7.0"
+quickpeep_densedoc = { path = "../quickpeep_densedoc" }
--- a/quickpeep_structs/src/lib.rs
+++ b/quickpeep_structs/src/lib.rs
@ -0,0 +1 @@
+pub mod rake_entries;
--- a/quickpeep_structs/src/rake_entries.rs
+++ b/quickpeep_structs/src/rake_entries.rs
@ -0,0 +1,24 @@
+use bitflags::bitflags;
+
+bitflags! {
+    /// Bit set of antifeatures that analysis flagged on a page; one bit per antifeature.
+    pub struct AnalysisAntifeatures: u8 {
+        /// A filter matched adverts on the page.
+        const ADVERTS = 1 << 0;
+        /// A filter blocked some things on the page due to privacy concerns.
+        const PRIVACY = 1 << 1;
+        /// A cosmetic filter matched annoying cookie nags on the page.
+        const COOKIE_NAG = 1 << 2;
+        /// A cosmetic filter matched unspecified annoyances on the page.
+        const ANNOYANCE = 1 << 3;
+
+        /// The page was served over CloudFlare when it was indexed, which goes against
+        /// the spirit of decentralisation.
+        const CLOUDFLARE = 1 << 4;
+    }
+}
+
+/// Entry describing one raked page (presumably the output of the raking step; confirm with callers).
+///
+/// Only the antifeature flags are stored for now; the article fields below are still commented out.
+pub struct RakedPageEntry {
+    /// Antifeatures that analysis flagged for this page.
+    pub analysed_antifeatures: AnalysisAntifeatures,
+    //pub article: Option<DenseTree>,
+    //pub non_article: Option<DenseTree>,
+}
--- a/scripts/get_cf_ips.sh
+++ b/scripts/get_cf_ips.sh
@ -0,0 +1,12 @@
+#!/bin/sh
+
+set -eu
+
+dir_path="$(dirname "$0")"
+
+mkdir -p "$dir_path/../data"
+wget -O "$dir_path/../data/cf_ips_v4.txt" https://www.cloudflare.com/ips-v4
+wget -O "$dir_path/../data/cf_ips_v6.txt" https://www.cloudflare.com/ips-v6
+echo "\n" >> "$dir_path/../data/cf_ips_v4.txt"
+cat "$dir_path/../data/cf_ips_v4.txt" "$dir_path/../data/cf_ips_v6.txt" > "$dir_path/../data/cf_ips.txt"
+rm "$dir_path/../data/cf_ips_v4.txt" "$dir_path/../data/cf_ips_v6.txt"