Add a lot more foundational work for raking
This commit is contained in:
		
							parent
							
								
									210e8ef10a
								
							
						
					
					
						commit
						a1097ef183
					
				
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -1,2 +1,3 @@ | ||||
| 
 | ||||
| .idea | ||||
| data/cf_ips.txt | ||||
							
								
								
									
										44
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										44
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @ -612,6 +612,15 @@ version = "2.4.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "35e70ee094dc02fd9c13fdad4940090f22dbd6ac7c9e7094a46cf0232a50bc7c" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "ipnetwork" | ||||
| version = "0.18.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "4088d739b183546b239688ddbc79891831df421773df95e236daf7867866d355" | ||||
| dependencies = [ | ||||
|  "serde", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "itertools" | ||||
| version = "0.10.3" | ||||
| @ -1066,18 +1075,36 @@ dependencies = [ | ||||
|  "cylon", | ||||
|  "env_logger", | ||||
|  "feed-rs", | ||||
|  "futures-util", | ||||
|  "gemini-fetch", | ||||
|  "html5ever", | ||||
|  "ipnetwork", | ||||
|  "itertools", | ||||
|  "kuchiki", | ||||
|  "lazy_static", | ||||
|  "log", | ||||
|  "quickpeep_densedoc", | ||||
|  "quickpeep_moz_readability", | ||||
|  "quickpeep_structs", | ||||
|  "reqwest", | ||||
|  "serde", | ||||
|  "serde_bare", | ||||
|  "sitemap", | ||||
|  "tokio", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "quickpeep_densedoc" | ||||
| version = "0.1.0" | ||||
| dependencies = [ | ||||
|  "anyhow", | ||||
|  "html5ever", | ||||
|  "kuchiki", | ||||
|  "lazy_static", | ||||
|  "regex", | ||||
|  "serde", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "quickpeep_moz_readability" | ||||
| version = "0.1.0" | ||||
| @ -1091,6 +1118,14 @@ dependencies = [ | ||||
|  "url", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "quickpeep_structs" | ||||
| version = "0.1.0" | ||||
| dependencies = [ | ||||
|  "bitflags", | ||||
|  "quickpeep_densedoc", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "quote" | ||||
| version = "1.0.15" | ||||
| @ -1388,6 +1423,15 @@ dependencies = [ | ||||
|  "serde_derive", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "serde_bare" | ||||
| version = "0.5.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "51c55386eed0f1ae957b091dc2ca8122f287b60c79c774cbe3d5f2b69fded660" | ||||
| dependencies = [ | ||||
|  "serde", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "serde_derive" | ||||
| version = "1.0.136" | ||||
|  | ||||
| @ -1,7 +1,9 @@ | ||||
| [workspace] | ||||
| members = [ | ||||
|     "quickpeep", | ||||
|     "quickpeep_moz_readability" | ||||
|     "quickpeep_densedoc", | ||||
|     "quickpeep_moz_readability", | ||||
|     "quickpeep_structs" | ||||
| ] | ||||
| 
 | ||||
| 
 | ||||
|  | ||||
| @ -11,12 +11,14 @@ anyhow = "1.0.55" | ||||
| log = "0.4.14" | ||||
| env_logger = "0.9.0" | ||||
| quickpeep_moz_readability = { path = "../quickpeep_moz_readability" } | ||||
| quickpeep_densedoc = { path = "../quickpeep_densedoc" } | ||||
| 
 | ||||
| # TODO: why do we need these here? | ||||
| kuchiki = "0.8.1" | ||||
| html5ever = "0.25.1" | ||||
| 
 | ||||
| serde = { version = "1.0.136", features = ["derive"] } | ||||
| serde_bare = "0.5.0" | ||||
| 
 | ||||
| chrono = "0.4.19" | ||||
| 
 | ||||
| @ -24,9 +26,12 @@ lazy_static = "1.4.0" | ||||
| 
 | ||||
| bytes = "1.1.0" | ||||
| 
 | ||||
| # TODO: rkyv and memmap2 should be an efficient way to load index packs into processes. | ||||
| # rkyv = "0.7.35" | ||||
| # memmap2 = "0.5.3" | ||||
| itertools = "0.10.3" | ||||
| 
 | ||||
| quickpeep_structs = { path = "../quickpeep_structs" } | ||||
| ipnetwork = "0.18.0" | ||||
| 
 | ||||
| futures-util = "0.3.21" | ||||
| 
 | ||||
| ### Raking helpers | ||||
| # HTTP Requests | ||||
|  | ||||
| @ -1,21 +1,73 @@ | ||||
| use quickpeep::raking::rake; | ||||
| use adblock::lists::RuleTypes; | ||||
| use anyhow::Context; | ||||
| use quickpeep::raking::analysis::{load_adblock_engine, IpSet}; | ||||
| use quickpeep::raking::RakeIntent; | ||||
| use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT}; | ||||
| use quickpeep_structs::rake_entries::AnalysisAntifeatures; | ||||
| use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT}; | ||||
| use reqwest::redirect::Policy; | ||||
| use reqwest::Url; | ||||
| use std::str::FromStr; | ||||
| use tokio::fs::File; | ||||
| 
 | ||||
| #[tokio::main] | ||||
| pub async fn main() -> anyhow::Result<()> { | ||||
|     let client = reqwest::Client::new(); | ||||
|     // TODO max timeout, max body size
 | ||||
|     rake( | ||||
|         &Url::from_str("http://nothings.org/gamedev/ssao/")?, | ||||
|     let mut header_map = HeaderMap::new(); | ||||
|     header_map.insert(USER_AGENT, HeaderValue::from_static(RAKER_USER_AGENT)); | ||||
| 
 | ||||
|     let client = reqwest::ClientBuilder::new() | ||||
|         .timeout(TIME_LIMIT) | ||||
|         .default_headers(header_map) | ||||
|         // TODO We want to handle redirects ourselves so we can track them...
 | ||||
|         .redirect(Policy::none()) | ||||
|         .build()?; | ||||
| 
 | ||||
|     // TODO Don't hardcode these paths in quite as bad a way...
 | ||||
|     let adblock_file = File::open("./cosmetic_filters.adblock") | ||||
|         .await | ||||
|         .context("Failed to open cosmetic filters file")?; | ||||
|     let adblock_engines = vec![( | ||||
|         AnalysisAntifeatures::ANNOYANCE, | ||||
|         load_adblock_engine(adblock_file, RuleTypes::CosmeticOnly).await?, | ||||
|     )]; | ||||
| 
 | ||||
|     let mut antifeature_ip_set = IpSet::new(); | ||||
| 
 | ||||
|     let ips_file = File::open("./data/cf_ips.txt") | ||||
|         .await | ||||
|         .context("Failed to open CF IPs file")?; | ||||
|     antifeature_ip_set.add_all_from_file(ips_file).await?; | ||||
| 
 | ||||
|     let raker = Raker { | ||||
|         adblock_engines, | ||||
|         antifeature_ip_set, | ||||
|     }; | ||||
| 
 | ||||
|     // raker.rake(
 | ||||
|     //     &Url::from_str("http://nothings.org/gamedev/ssao/")?,
 | ||||
|     //     RakeIntent::Page,
 | ||||
|     //     &client,
 | ||||
|     // )
 | ||||
|     // .await?;
 | ||||
|     //
 | ||||
|     // raker.rake(
 | ||||
|     //     &Url::from_str("https://github.com/kuchiki-rs/kuchiki")?,
 | ||||
|     //     RakeIntent::Page,
 | ||||
|     //     &client,
 | ||||
|     // )
 | ||||
|     //     .await?;
 | ||||
| 
 | ||||
|     raker | ||||
|         .rake( | ||||
|             &Url::from_str("https://www.thesprucepets.com/")?, | ||||
|             RakeIntent::Page, | ||||
|             &client, | ||||
|         ) | ||||
|         .await?; | ||||
| 
 | ||||
|     rake( | ||||
|         &Url::from_str("https://github.com/kuchiki-rs/kuchiki")?, | ||||
|     raker | ||||
|         .rake( | ||||
|             &Url::from_str("https://matrix.org/")?, | ||||
|             RakeIntent::Page, | ||||
|             &client, | ||||
|         ) | ||||
|  | ||||
| @ -1,17 +1,33 @@ | ||||
| use crate::raking::analysis::{analyse_with_ad_block_cosmetic_filter, IpSet}; | ||||
| use adblock::engine::Engine; | ||||
| use anyhow::{bail, Context}; | ||||
| use bytes::Bytes; | ||||
| use chrono::{DateTime, FixedOffset, Utc}; | ||||
| use cylon::Cylon; | ||||
| use futures_util::stream::StreamExt; | ||||
| use html5ever::tendril::fmt::Slice; | ||||
| use html5ever::QualName; | ||||
| use kuchiki::traits::TendrilSink; | ||||
| use kuchiki::NodeRef; | ||||
| use lazy_static::lazy_static; | ||||
| use log::debug; | ||||
| use reqwest::{Client, Url}; | ||||
| use quickpeep_densedoc::DenseTree; | ||||
| use quickpeep_structs::rake_entries::AnalysisAntifeatures; | ||||
| use reqwest::{Client, Response, Url}; | ||||
| use serde::{Deserialize, Serialize}; | ||||
| use sitemap::reader::SiteMapEntity; | ||||
| use std::collections::HashSet; | ||||
| use std::time::Duration; | ||||
| use tokio::time::Instant; | ||||
| 
 | ||||
| mod analysis; | ||||
| pub mod analysis; | ||||
| 
 | ||||
| pub const USER_AGENT: &'static str = "QuickPeepBot"; | ||||
| /// 4 MiB ought to be enough for anybody.
 | ||||
| pub const SIZE_LIMIT: usize = 4 * 1024 * 1024; | ||||
| /// If it's not loaded in ten seconds, that's pretty severe.
 | ||||
| /// 10 seconds is almost too generous (assuming that the best of things can run slowly sometimes).
 | ||||
| pub const TIME_LIMIT: Duration = Duration::from_secs(10); | ||||
| pub const RAKER_USER_AGENT: &'static str = "QuickPeepBot"; | ||||
| 
 | ||||
| pub enum RakeOutcome { | ||||
|     RakedPage(RakedPage), | ||||
| @ -81,9 +97,60 @@ lazy_static! { | ||||
|     ]); | ||||
| } | ||||
| 
 | ||||
| pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Result<RakeOutcome> { | ||||
| async fn response_to_bytes_limited( | ||||
|     mut response: Response, | ||||
|     size_limit: usize, | ||||
|     time_limit: Duration, | ||||
| ) -> anyhow::Result<Vec<u8>> { | ||||
|     let deadline = Instant::now() + time_limit; | ||||
|     let mut buffer = Vec::new(); | ||||
|     let mut bytestream = response.bytes_stream(); | ||||
| 
 | ||||
|     loop { | ||||
|         tokio::select! { | ||||
|             next_chunk = bytestream.next() => { | ||||
|                 match next_chunk { | ||||
|                     Some(next_chunk) => { | ||||
|                         buffer.extend_from_slice(next_chunk?.as_bytes()); | ||||
|                         if buffer.len() > size_limit { | ||||
|                             bail!("Exceeds size limit"); | ||||
|                         } | ||||
|                     }, | ||||
|                     None => { | ||||
|                         // Finished! :)
 | ||||
|                         break; | ||||
|                     } | ||||
|                 } | ||||
|             }, | ||||
|             _ = tokio::time::sleep_until(deadline) => { | ||||
|                 bail!("Exceeded time limit"); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     Ok(buffer) | ||||
| } | ||||
| 
 | ||||
| pub struct Raker { | ||||
|     pub adblock_engines: Vec<(AnalysisAntifeatures, Engine)>, | ||||
|     pub antifeature_ip_set: IpSet, | ||||
| } | ||||
| 
 | ||||
| impl Raker { | ||||
|     pub async fn rake( | ||||
|         &self, | ||||
|         url: &Url, | ||||
|         intent: RakeIntent, | ||||
|         client: &Client, | ||||
|     ) -> anyhow::Result<RakeOutcome> { | ||||
|         let response = client.get(url.clone()).send().await?; | ||||
| 
 | ||||
|         if let Some(remote_addr) = response.remote_addr() { | ||||
|             eprintln!("rA {:?}", remote_addr); | ||||
|             let is_cf = self.antifeature_ip_set.contains(remote_addr.ip()); | ||||
|             eprintln!("CF? {:?}", is_cf); | ||||
|         } | ||||
| 
 | ||||
|         if !response.status().is_success() { | ||||
|             bail!("Not successful: {:?}", response.status().as_u16()); | ||||
|         } | ||||
| @ -101,10 +168,11 @@ pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Res | ||||
|             })); | ||||
|         }; | ||||
| 
 | ||||
|     let content = response.bytes().await?; | ||||
|         let content = response_to_bytes_limited(response, SIZE_LIMIT, TIME_LIMIT).await?; | ||||
| 
 | ||||
|     if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) { | ||||
|         match rake_html_page(&content, url) { | ||||
|         if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) | ||||
|         { | ||||
|             match self.rake_html_page(&content, url) { | ||||
|                 Ok(page_rake) => { | ||||
|                     return Ok(RakeOutcome::RakedPage(page_rake)); | ||||
|                 } | ||||
| @ -143,12 +211,38 @@ pub async fn rake(url: &Url, intent: RakeIntent, client: &Client) -> anyhow::Res | ||||
|         return Ok(RakeOutcome::PermanentFailure(PermanentFailure { | ||||
|             reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()), | ||||
|         })); | ||||
| } | ||||
|     } | ||||
| 
 | ||||
| pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<RakedPage> { | ||||
|     pub fn rake_html_page(&self, content: &[u8], url: &Url) -> anyhow::Result<RakedPage> { | ||||
|         let content_str = std::str::from_utf8(content)?; | ||||
| 
 | ||||
|     let mut readability = quickpeep_moz_readability::Readability::new(content_str); | ||||
|         let root_node: NodeRef = kuchiki::parse_html().one(content_str); | ||||
| 
 | ||||
|         let mut antifeature_flags = AnalysisAntifeatures::empty(); | ||||
| 
 | ||||
|         for (engine_antifeature_flag, adblock_engine) in &self.adblock_engines { | ||||
|             match analyse_with_ad_block_cosmetic_filter( | ||||
|                 &root_node, | ||||
|                 adblock_engine, | ||||
|                 url.as_str(), | ||||
|                 true, | ||||
|             ) { | ||||
|                 Ok(cosmetic_filters_tripped) => { | ||||
|                     eprintln!("?cosmetic filters tripped: {}", cosmetic_filters_tripped); | ||||
|                     antifeature_flags |= *engine_antifeature_flag; | ||||
|                 } | ||||
|                 Err(err) => { | ||||
|                     eprintln!("Cosmetic Filter Err {:?}", err); | ||||
|                 } | ||||
|             }; | ||||
|         } | ||||
| 
 | ||||
|         let dense_doc = DenseTree::from_body(root_node.clone()); | ||||
|         let dense_doc_text = DenseTree::generate_textual_format(&dense_doc); | ||||
|         eprintln!("~~~~~\n{}\n~~~~~", dense_doc_text); | ||||
|         eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc); | ||||
| 
 | ||||
|         let mut readability = quickpeep_moz_readability::Readability::new_from_node(root_node); | ||||
|         readability | ||||
|             .parse(url.as_str()) | ||||
|             .context("failed to analyse readability")?; | ||||
| @ -156,10 +250,16 @@ pub fn rake_html_page(content: &[u8], url: &Url) -> anyhow::Result<RakedPage> { | ||||
|         eprintln!("{:#?}", readability.metadata); | ||||
| 
 | ||||
|         if let Some(node) = readability.article_node { | ||||
|         eprintln!("{}", node.to_string()); | ||||
|             //eprintln!("{}", node.to_string());
 | ||||
|         } | ||||
| 
 | ||||
|     Ok(todo!()) | ||||
|         let bare_size = serde_bare::to_vec(&dense_doc)?.len(); | ||||
|         eprintln!("CS {:?} → {:?}", content.len(), bare_size); | ||||
| 
 | ||||
|         Ok(RakedPage { | ||||
|             // TODO
 | ||||
|         }) | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> { | ||||
| @ -286,7 +386,7 @@ pub async fn decode_robots_txt(bytes: &[u8]) -> anyhow::Result<Option<RobotsTxt> | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     let rules = cylon::Compiler::new(USER_AGENT) | ||||
|     let rules = cylon::Compiler::new(RAKER_USER_AGENT) | ||||
|         .compile(bytes.as_bytes()) | ||||
|         .await?; | ||||
| 
 | ||||
|  | ||||
| @ -1,14 +1,16 @@ | ||||
| use adblock::filters::cosmetic::CosmeticFilter; | ||||
| use anyhow::anyhow; | ||||
| use adblock::engine::Engine; | ||||
| use adblock::lists::{ParseOptions, RuleTypes}; | ||||
| use anyhow::Context; | ||||
| use ipnetwork::IpNetwork; | ||||
| use kuchiki::NodeRef; | ||||
| use log::debug; | ||||
| use std::path::Path; | ||||
| use tokio::fs::File; | ||||
| use std::collections::{BTreeSet, HashSet}; | ||||
| use std::net::IpAddr; | ||||
| use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader}; | ||||
| 
 | ||||
| pub async fn load_cosmetic_filters<R: AsyncRead + Unpin>( | ||||
| pub async fn load_adblock_engine<R: AsyncRead + Unpin>( | ||||
|     reader: R, | ||||
| ) -> anyhow::Result<Vec<CosmeticFilter>> { | ||||
|     rule_types: RuleTypes, | ||||
| ) -> anyhow::Result<Engine> { | ||||
|     let mut br = BufReader::new(reader); | ||||
|     let mut rules = Vec::new(); | ||||
|     let mut buf = String::new(); | ||||
| @ -17,27 +19,172 @@ pub async fn load_cosmetic_filters<R: AsyncRead + Unpin>( | ||||
|         if br.read_line(&mut buf).await? == 0 { | ||||
|             break; | ||||
|         } | ||||
|         if let Ok(rule) = CosmeticFilter::parse(&buf, false) { | ||||
|             rules.push(rule); | ||||
|         rules.push(buf.trim().to_owned()); | ||||
|     } | ||||
|     Ok(Engine::from_rules( | ||||
|         &rules, | ||||
|         ParseOptions { | ||||
|             format: Default::default(), | ||||
|             include_redirect_urls: false, | ||||
|             rule_types, | ||||
|         }, | ||||
|     )) | ||||
| } | ||||
| 
 | ||||
| // Relevant:
 | ||||
| // https://github.com/brave/adblock-rust/issues/152#issuecomment-771259069
 | ||||
| 
 | ||||
/// The distinct `class` names and `id` values collected from the elements of a page.
pub struct ExtractedClassesAndIds {
    /// Every distinct class name seen in a `class` attribute.
    classes: Vec<String>,
    /// Every distinct value seen in an `id` attribute.
    ids: Vec<String>,
}
| 
 | ||||
| pub fn extract_classes_and_ids_from_page(root: &NodeRef) -> ExtractedClassesAndIds { | ||||
|     let mut class_set = HashSet::new(); | ||||
|     let mut id_set = HashSet::new(); | ||||
| 
 | ||||
|     for node in root.inclusive_descendants() { | ||||
|         if let Some(element) = node.0.as_element() { | ||||
|             let attrs = element.attributes.borrow(); | ||||
|             if let Some(id) = attrs.get("id") { | ||||
|                 id_set.insert(id.to_owned()); | ||||
|             } | ||||
|             if let Some(classes) = attrs.get("class") { | ||||
|                 for class in classes.trim().split_whitespace() { | ||||
|                     class_set.insert(class.to_owned()); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     Ok(rules) | ||||
|     ExtractedClassesAndIds { | ||||
|         classes: class_set.into_iter().collect(), | ||||
|         ids: id_set.into_iter().collect(), | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| pub fn analyse_with_ad_block_cosmetic_filter( | ||||
|     root: NodeRef, | ||||
|     filters: &Vec<CosmeticFilter>, | ||||
|     root: &NodeRef, | ||||
|     engine: &Engine, | ||||
|     url: &str, | ||||
|     remove: bool, | ||||
| ) -> anyhow::Result<bool> { | ||||
|     let mut matches = 0; | ||||
|     for rule in filters { | ||||
|         for ele in root | ||||
|             .select(&rule.selector) | ||||
|             .map_err(|_| anyhow!("Failed to select(..)"))? | ||||
|         { | ||||
|             debug!("Cosmetic Filter {:?} Matches {:?}", rule, ele); | ||||
| 
 | ||||
|     let url_resources = engine.url_cosmetic_resources(url); | ||||
|     let specialist_hide_selectors = if !url_resources.generichide { | ||||
|         let ExtractedClassesAndIds { classes, ids } = extract_classes_and_ids_from_page(root); | ||||
| 
 | ||||
|         //eprintln!("ID {:#?}", ids);
 | ||||
|         //eprintln!("CC {:#?}", classes);
 | ||||
| 
 | ||||
|         engine.hidden_class_id_selectors(&classes, &ids, &url_resources.exceptions) | ||||
|     } else { | ||||
|         Vec::with_capacity(0) | ||||
|     }; | ||||
| 
 | ||||
|     //eprintln!("UR {:#?}", url_resources);
 | ||||
|     //eprintln!("sHS {:#?}", specialist_hide_selectors);
 | ||||
|     //eprintln!("----");
 | ||||
| 
 | ||||
|     for rule in itertools::chain(specialist_hide_selectors, url_resources.hide_selectors) { | ||||
|         if let Ok(result) = root.select(&rule) { | ||||
|             for ele in result { | ||||
|                 eprintln!("Cosmetic Filter {:?} Matches {:?}", rule, ele); | ||||
|                 matches += 1; | ||||
|                 if remove { | ||||
|                     ele.as_node().detach(); | ||||
|                 } | ||||
|             } | ||||
|         } else { | ||||
|             //eprintln!("(fail)");
 | ||||
|         } | ||||
|     } | ||||
|     Ok(matches > 0) | ||||
| } | ||||
| 
 | ||||
| // TODO this isn't particularly efficient. Probably want a trie if it's important...
 | ||||
| pub struct IpSet { | ||||
|     ips: BTreeSet<IpNetwork>, | ||||
| } | ||||
| 
 | ||||
| impl IpSet { | ||||
|     pub fn new() -> IpSet { | ||||
|         IpSet { | ||||
|             ips: Default::default(), | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     pub async fn add_all_from_file<R: AsyncRead + Unpin>( | ||||
|         &mut self, | ||||
|         reader: R, | ||||
|     ) -> anyhow::Result<()> { | ||||
|         let mut br = BufReader::new(reader); | ||||
| 
 | ||||
|         let mut buf = String::new(); | ||||
|         loop { | ||||
|             buf.clear(); | ||||
|             if br.read_line(&mut buf).await? == 0 { | ||||
|                 break; | ||||
|             } | ||||
| 
 | ||||
|             let trimmed = buf.trim(); | ||||
| 
 | ||||
|             if trimmed.is_empty() { | ||||
|                 continue; | ||||
|             } | ||||
| 
 | ||||
|             let ip_net = trimmed | ||||
|                 .parse::<IpNetwork>() | ||||
|                 .context("Parsing CIDR IP range")?; | ||||
|             self.add(ip_net); | ||||
|         } | ||||
| 
 | ||||
|         Ok(()) | ||||
|     } | ||||
| 
 | ||||
|     pub fn add(&mut self, network: IpNetwork) { | ||||
|         // We jump through a couple of hoops to make sure we store the lowest address in the network,
 | ||||
|         // since we use that for sorting.
 | ||||
|         self.ips | ||||
|             .insert(IpNetwork::new(network.network(), network.prefix()).unwrap()); | ||||
|     } | ||||
| 
 | ||||
|     pub fn contains(&self, addr: IpAddr) -> bool { | ||||
|         let prefix = if addr.is_ipv4() { | ||||
|             32 | ||||
|         } else { | ||||
|             assert!(addr.is_ipv6()); | ||||
|             128 | ||||
|         }; | ||||
|         let addr_as_net = | ||||
|             IpNetwork::new(addr, prefix).expect("Conversion to IpNetwork should be correct"); | ||||
|         for ipnet in self.ips.range(..=addr_as_net).rev().next() { | ||||
|             if ipnet.contains(addr) { | ||||
|                 return true; | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         false | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| #[cfg(test)] | ||||
| mod test { | ||||
|     use crate::raking::analysis::IpSet; | ||||
|     use ipnetwork::IpNetwork; | ||||
|     use std::net::IpAddr; | ||||
|     use std::str::FromStr; | ||||
| 
 | ||||
|     #[test] | ||||
|     pub fn test_ipset_contains() { | ||||
|         let mut set = IpSet::new(); | ||||
|         set.add(IpNetwork::from_str("1.2.3.4/16").unwrap()); | ||||
|         set.add(IpNetwork::from_str("1.1.2.3/16").unwrap()); | ||||
|         set.add(IpNetwork::from_str("85.42.36.17/24").unwrap()); | ||||
| 
 | ||||
|         assert!(set.contains(IpAddr::from_str("1.2.42.42").unwrap())); | ||||
|         assert!(set.contains(IpAddr::from_str("85.42.36.14").unwrap())); | ||||
|         assert!(!set.contains(IpAddr::from_str("85.42.37.14").unwrap())); | ||||
|     } | ||||
| } | ||||
|  | ||||
							
								
								
									
										14
									
								
								quickpeep_densedoc/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								quickpeep_densedoc/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,14 @@ | ||||
| [package] | ||||
| name = "quickpeep_densedoc" | ||||
| version = "0.1.0" | ||||
| edition = "2021" | ||||
| 
 | ||||
| # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html | ||||
| 
 | ||||
| [dependencies] | ||||
| anyhow = "1.0.56" | ||||
| serde = { version = "1.0.136", features = ["derive"] } | ||||
| kuchiki = "0.8.1" | ||||
| html5ever = "0.25.1" | ||||
| regex = "1.5.5" | ||||
| lazy_static = "1.4.0" | ||||
							
								
								
									
										403
									
								
								quickpeep_densedoc/src/lib.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										403
									
								
								quickpeep_densedoc/src/lib.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,403 @@ | ||||
| use kuchiki::NodeRef; | ||||
| use lazy_static::lazy_static; | ||||
| use regex::Regex; | ||||
| use serde::{Deserialize, Serialize}; | ||||
| use std::borrow::Borrow; | ||||
| use std::ops::Deref; | ||||
| 
 | ||||
| #[derive(Serialize, Deserialize, Clone, Debug)] | ||||
| pub struct DenseDocument { | ||||
|     head: DenseHead, | ||||
|     body: Vec<DenseTree>, | ||||
| } | ||||
| 
 | ||||
| impl DenseDocument { | ||||
|     pub fn from_document(root_node: NodeRef) { | ||||
|         todo!() | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| #[derive(Serialize, Deserialize, Clone, Debug)] | ||||
| pub struct DenseHead { | ||||
|     title: String, | ||||
|     feed_urls: Vec<String>, | ||||
|     // TODO how best to expose this?? We actually don't care about storing it though ...
 | ||||
|     //      Probably move to the raker.
 | ||||
|     canonical: (), // TODO I'm sure we'd benefit by digging up some metadata, but that's possibly for later :)
 | ||||
| } | ||||
| 
 | ||||
| #[derive(Serialize, Deserialize, Clone, Debug)] | ||||
| pub enum DenseTree { | ||||
|     Heading1(Vec<DenseTree>), | ||||
|     Heading2(Vec<DenseTree>), | ||||
|     Heading3(Vec<DenseTree>), | ||||
|     Heading4(Vec<DenseTree>), | ||||
|     Heading5(Vec<DenseTree>), | ||||
|     Heading6(Vec<DenseTree>), | ||||
|     Link { | ||||
|         children: Vec<DenseTree>, | ||||
|         href: String, | ||||
|         nofollow: bool, | ||||
|     }, | ||||
|     Image { | ||||
|         src: String, | ||||
|         alt: String, | ||||
|         // title? I don't know if it'd be very useful.
 | ||||
|     }, | ||||
|     Text(String), | ||||
| } | ||||
| 
 | ||||
| impl DenseTree { | ||||
|     pub fn from_body(body_node: NodeRef) -> Vec<DenseTree> { | ||||
|         let mut builder = DenseTreeBuilder::new(); | ||||
|         builder.add_children_of_node(body_node); | ||||
|         builder.into_tree() | ||||
|     } | ||||
| 
 | ||||
|     pub fn is_text(&self) -> bool { | ||||
|         match self { | ||||
|             DenseTree::Text(_) => true, | ||||
|             _ => false, | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     pub fn generate_textual_format(nodes: &Vec<DenseTree>) -> String { | ||||
|         let mut buf = String::new(); | ||||
|         for node in nodes { | ||||
|             node.append_in_textual_format(&mut buf); | ||||
|         } | ||||
|         simplify_newlines(&buf) | ||||
|     } | ||||
| 
 | ||||
|     fn append_in_textual_format(&self, string: &mut String) { | ||||
|         match self { | ||||
|             DenseTree::Heading1(children) => { | ||||
|                 string.push_str("\n\n# "); | ||||
|                 for child in children { | ||||
|                     child.append_in_textual_format(string); | ||||
|                 } | ||||
|                 string.push_str("\n"); | ||||
|             } | ||||
|             DenseTree::Heading2(children) => { | ||||
|                 string.push_str("\n\n## "); | ||||
|                 for child in children { | ||||
|                     child.append_in_textual_format(string); | ||||
|                 } | ||||
|                 string.push_str("\n"); | ||||
|             } | ||||
|             DenseTree::Heading3(children) => { | ||||
|                 string.push_str("\n\n### "); | ||||
|                 for child in children { | ||||
|                     child.append_in_textual_format(string); | ||||
|                 } | ||||
|                 string.push_str("\n"); | ||||
|             } | ||||
|             DenseTree::Heading4(children) => { | ||||
|                 string.push_str("\n\n#### "); | ||||
|                 for child in children { | ||||
|                     child.append_in_textual_format(string); | ||||
|                 } | ||||
|                 string.push_str("\n"); | ||||
|             } | ||||
|             DenseTree::Heading5(children) => { | ||||
|                 string.push_str("\n\n##### "); | ||||
|                 for child in children { | ||||
|                     child.append_in_textual_format(string); | ||||
|                 } | ||||
|                 string.push_str("\n"); | ||||
|             } | ||||
|             DenseTree::Heading6(children) => { | ||||
|                 string.push_str("\n\n###### "); | ||||
|                 for child in children { | ||||
|                     child.append_in_textual_format(string); | ||||
|                 } | ||||
|                 string.push_str("\n"); | ||||
|             } | ||||
|             DenseTree::Link { children, href, .. } => { | ||||
|                 string.push('['); | ||||
|                 for child in children { | ||||
|                     child.append_in_textual_format(string); | ||||
|                 } | ||||
|                 string.push_str(&format!("]({})", href)); | ||||
|             } | ||||
|             DenseTree::Image { .. } => { | ||||
|                 string.push_str("[IMG]"); | ||||
|             } | ||||
|             DenseTree::Text(text) => { | ||||
|                 string.push_str(text); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| struct DenseTreeBuilder { | ||||
|     /// Siblings in the buffer.
 | ||||
|     nodes: Vec<DenseTree>, | ||||
| 
 | ||||
|     /// Number of preceding newlines at the end of the buffer.
 | ||||
|     /// Used for generating text that preserves some vague structure.
 | ||||
|     preceding_newlines: u32, | ||||
| } | ||||
| 
 | ||||
| impl DenseTreeBuilder { | ||||
|     pub fn new() -> Self { | ||||
|         DenseTreeBuilder { | ||||
|             nodes: vec![], | ||||
|             preceding_newlines: 0, | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     pub fn into_tree(mut self) -> Vec<DenseTree> { | ||||
|         self.simplify(); | ||||
|         self.nodes | ||||
|     } | ||||
| 
 | ||||
|     /// Simplify the DenseTree nodes: coalesce Text nodes and
 | ||||
|     pub fn simplify(&mut self) { | ||||
|         // First coalesce all text nodes
 | ||||
|         // TODO(perf): Do it in a better way to reduce the cost.
 | ||||
|         let mut idx = 1; | ||||
|         while idx < self.nodes.len() { | ||||
|             if self.nodes[idx].is_text() && self.nodes[idx - 1].is_text() { | ||||
|                 // Merge the two text nodes is a text node, consume it and merge it in.
 | ||||
|                 match self.nodes.remove(idx) { | ||||
|                     DenseTree::Text(append_text) => { | ||||
|                         match &mut self.nodes[idx - 1] { | ||||
|                             DenseTree::Text(string) => { | ||||
|                                 string.push_str(&append_text); | ||||
|                                 // Continue so we don't advance, as we just moved the list down a
 | ||||
|                                 // bit.
 | ||||
|                                 continue; | ||||
|                             } | ||||
|                             _ => { | ||||
|                                 panic!( | ||||
|                                     "Should be unreachable: checked to be text first. ({})", | ||||
|                                     idx - 1 | ||||
|                                 ); | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
|                     _ => { | ||||
|                         panic!("Should be unreachable: checked to be text first. ({})", idx); | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
| 
 | ||||
|             idx += 1; | ||||
|         } | ||||
| 
 | ||||
|         for node in &mut self.nodes { | ||||
|             match node { | ||||
|                 DenseTree::Text(text) => { | ||||
|                     // Coalesce newlines so there are never more than 2 in a row.
 | ||||
|                     *text = simplify_newlines(&simplify_whitespace(&text)); | ||||
|                 } | ||||
|                 _ => { /* nop */ } | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         match self.nodes.get_mut(0) { | ||||
|             Some(DenseTree::Text(text)) => { | ||||
|                 *text = text.trim_start().to_owned(); | ||||
|             } | ||||
|             _ => (), | ||||
|         } | ||||
| 
 | ||||
|         let num_nodes = self.nodes.len(); | ||||
|         if num_nodes > 1 { | ||||
|             match self.nodes.get_mut(num_nodes - 1) { | ||||
|                 Some(DenseTree::Text(text)) => { | ||||
|                     *text = text.trim_end().to_owned(); | ||||
|                 } | ||||
|                 _ => (), | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     /// Convert a HTML node's children into DenseTree nodes.
 | ||||
|     pub fn add_children_of_node(&mut self, node: NodeRef) { | ||||
|         for child in node.children() { | ||||
|             if let Some(element) = child.as_element() { | ||||
|                 match element.name.local.deref() { | ||||
|                     "h1" => { | ||||
|                         self.nodes | ||||
|                             .push(DenseTree::Heading1(DenseTree::from_body(child))); | ||||
|                         self.preceding_newlines = 2; | ||||
|                     } | ||||
|                     "h2" => { | ||||
|                         self.nodes | ||||
|                             .push(DenseTree::Heading2(DenseTree::from_body(child))); | ||||
|                         self.preceding_newlines = 2; | ||||
|                     } | ||||
|                     "h3" => { | ||||
|                         self.nodes | ||||
|                             .push(DenseTree::Heading3(DenseTree::from_body(child))); | ||||
|                         self.preceding_newlines = 2; | ||||
|                     } | ||||
|                     "h4" => { | ||||
|                         self.nodes | ||||
|                             .push(DenseTree::Heading4(DenseTree::from_body(child))); | ||||
|                         self.preceding_newlines = 2; | ||||
|                     } | ||||
|                     "h5" => { | ||||
|                         self.nodes | ||||
|                             .push(DenseTree::Heading5(DenseTree::from_body(child))); | ||||
|                         self.preceding_newlines = 2; | ||||
|                     } | ||||
|                     "h6" => { | ||||
|                         self.nodes | ||||
|                             .push(DenseTree::Heading6(DenseTree::from_body(child))); | ||||
|                         self.preceding_newlines = 2; | ||||
|                     } | ||||
|                     "a" => { | ||||
|                         let attrs = element.attributes.borrow(); | ||||
|                         let href = attrs.get("href").unwrap_or("").to_owned(); | ||||
| 
 | ||||
|                         if href.starts_with("javascript:") || href.starts_with("data:") { | ||||
|                             // Skip this link. Just unwrap it.
 | ||||
|                             self.add_children_of_node(child.clone()); | ||||
|                             continue; | ||||
|                         } | ||||
| 
 | ||||
|                         let nofollow = attrs | ||||
|                             .get("rel") | ||||
|                             .map(|rel: &str| { | ||||
|                                 rel.split_whitespace() | ||||
|                                     .any(|rel_word: &str| rel_word.eq_ignore_ascii_case("nofollow")) | ||||
|                             }) | ||||
|                             .unwrap_or(false); | ||||
|                         drop(attrs); | ||||
| 
 | ||||
|                         self.nodes.push(DenseTree::Link { | ||||
|                             children: DenseTree::from_body(child), | ||||
|                             href, | ||||
|                             nofollow, | ||||
|                         }); | ||||
| 
 | ||||
|                         self.preceding_newlines = 0; | ||||
|                     } | ||||
|                     "img" => { | ||||
|                         // TODO Decide if this is worth the space...
 | ||||
|                         let attrs = element.attributes.borrow(); | ||||
|                         let src = attrs.get("src").unwrap_or("").to_owned(); | ||||
| 
 | ||||
|                         if src.starts_with("javascript:") || src.starts_with("data:") { | ||||
|                             // Skip this image.
 | ||||
|                             continue; | ||||
|                         } | ||||
| 
 | ||||
|                         let alt = simplify_whitespace(attrs.get("alt").unwrap_or("").trim()); | ||||
| 
 | ||||
|                         self.nodes.push(DenseTree::Image { src, alt }); | ||||
|                     } | ||||
|                     "p" | "pre" => { | ||||
|                         // Paragraphs must have 2 preceding newlines.
 | ||||
|                         if self.preceding_newlines < 2 { | ||||
|                             self.nodes.push(DenseTree::Text( | ||||
|                                 match self.preceding_newlines { | ||||
|                                     0 => "\n\n", | ||||
|                                     1 => "\n", | ||||
|                                     _ => unreachable!(), | ||||
|                                 } | ||||
|                                 .to_owned(), | ||||
|                             )); | ||||
|                             self.preceding_newlines = 2; | ||||
|                         } | ||||
| 
 | ||||
|                         self.add_children_of_node(child); | ||||
| 
 | ||||
|                         // Paragraphs must have 2 trailing newlines.
 | ||||
|                         if self.preceding_newlines < 2 { | ||||
|                             self.nodes.push(DenseTree::Text( | ||||
|                                 match self.preceding_newlines { | ||||
|                                     0 => "\n\n", | ||||
|                                     1 => "\n", | ||||
|                                     _ => unreachable!(), | ||||
|                                 } | ||||
|                                 .to_owned(), | ||||
|                             )); | ||||
|                             self.preceding_newlines = 2; | ||||
|                         } | ||||
|                     } | ||||
|                     "br" => { | ||||
|                         self.nodes.push(DenseTree::Text("\n".to_owned())); | ||||
|                         self.preceding_newlines += 1; | ||||
|                     } | ||||
|                     "div" | "li" => { | ||||
|                         // Divs must have 1 preceding newline.
 | ||||
|                         if self.preceding_newlines < 1 { | ||||
|                             self.nodes.push(DenseTree::Text("\n".to_owned())); | ||||
|                             self.preceding_newlines = 1; | ||||
|                         } | ||||
| 
 | ||||
|                         self.add_children_of_node(child); | ||||
| 
 | ||||
|                         // Divs must have 1 trailing newline.
 | ||||
|                         if self.preceding_newlines < 1 { | ||||
|                             self.nodes.push(DenseTree::Text("\n".to_owned())); | ||||
|                             self.preceding_newlines = 1; | ||||
|                         } | ||||
|                     } | ||||
|                     "script" | "style" | "svg" | "noscript" => { | ||||
|                         // We just prune these, as we don't want them.
 | ||||
|                         // (noscript tends just to be noisy 'enable JS now!!' messages, so prune those too.)
 | ||||
|                         continue; | ||||
|                     } | ||||
|                     _ => { | ||||
|                         // Simply unwrap the unknown element.
 | ||||
|                         self.add_children_of_node(child); | ||||
|                     } | ||||
|                 } | ||||
|                 //element.name.local
 | ||||
|             } else if let Some(text) = child.as_text() { | ||||
|                 let text_to_add = | ||||
|                     simplify_whitespace(&simplify_newlines(&text.borrow().replace("\n", " "))); | ||||
|                 self.preceding_newlines = | ||||
|                     text_to_add.chars().rev().take_while(|c| *c == '\n').count() as u32; | ||||
|                 self.nodes.push(DenseTree::Text(text_to_add)); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| lazy_static! { | ||||
|     static ref MANY_WHITESPACE: Regex = Regex::new(r"[ \t]+").unwrap(); | ||||
|     static ref THREE_OR_MORE_NEWLINES: Regex = Regex::new(r"\n+[ \t\n]+\n+").unwrap(); | ||||
|     static ref UNNECESSARY_LS_WHITESPACE: Regex = Regex::new(r"\n[ \s]+").unwrap(); | ||||
|     static ref UNNECESSARY_LE_WHITESPACE: Regex = Regex::new(r"[ \s]+\n").unwrap(); | ||||
| } | ||||
| 
 | ||||
| pub fn simplify_whitespace(input: &str) -> String { | ||||
|     let s = MANY_WHITESPACE.replace_all(input, " "); | ||||
|     let s = UNNECESSARY_LS_WHITESPACE.replace_all(s.borrow(), "\n"); | ||||
|     UNNECESSARY_LE_WHITESPACE | ||||
|         .replace_all(s.borrow(), "\n") | ||||
|         .into_owned() | ||||
| } | ||||
| 
 | ||||
| pub fn simplify_newlines(input: &str) -> String { | ||||
|     THREE_OR_MORE_NEWLINES | ||||
|         .replace_all(&input.replace("\r", ""), "\n\n") | ||||
|         .into_owned() | ||||
| } | ||||
| 
 | ||||
#[cfg(test)]
mod test {
    use crate::{simplify_newlines, simplify_whitespace};

    /// Runs of spaces and tabs collapse into single spaces.
    #[test]
    pub fn test_simplify_whitespace() {
        let input = "hello    cat\tdog \t bat";
        let expected = "hello cat dog bat";
        assert_eq!(simplify_whitespace(input), expected);
    }

    /// Long runs of newlines (even with a tab in the middle) shrink to two newlines.
    #[test]
    pub fn test_simplify_newlines() {
        let input = "hello\n\n\n\nare\n\n\nyou\n\n\n\n\n\n\t\n\n\nthere?";
        let expected = "hello\n\nare\n\nyou\n\nthere?";
        assert_eq!(simplify_newlines(input), expected);
    }
}
| @ -60,7 +60,8 @@ const DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [&str; 5] = ["table", "th", "td", "hr", " | ||||
| pub mod regexes; | ||||
| 
 | ||||
| pub struct Readability { | ||||
|     root_node: NodeRef, | ||||
|     /// Left-over document. Note that readable article pieces are detached from the parent.
 | ||||
|     pub root_node: NodeRef, | ||||
|     byline: Option<String>, | ||||
|     article_title: String, | ||||
|     pub article_node: Option<NodeRef>, | ||||
| @ -77,8 +78,12 @@ struct SizeInfo { | ||||
| 
 | ||||
| impl Readability { | ||||
|     pub fn new(html_str: &str) -> Self { | ||||
|         Self::new_from_node(kuchiki::parse_html().one(html_str)) | ||||
|     } | ||||
| 
 | ||||
|     pub fn new_from_node(root_node: NodeRef) -> Self { | ||||
|         Self { | ||||
|             root_node: kuchiki::parse_html().one(html_str), | ||||
|             root_node, | ||||
|             byline: None, | ||||
|             article_title: "".into(), | ||||
|             article_node: None, | ||||
| @ -87,6 +92,7 @@ impl Readability { | ||||
|             metadata: MetaData::new(), | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     pub fn parse(&mut self, url: &str) -> anyhow::Result<()> { | ||||
|         self.unwrap_no_script_tags(); | ||||
|         self.remove_scripts(); | ||||
|  | ||||
							
								
								
									
										11
									
								
								quickpeep_structs/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								quickpeep_structs/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,11 @@ | ||||
| [package] | ||||
| name = "quickpeep_structs" | ||||
| version = "0.1.0" | ||||
| edition = "2021" | ||||
| 
 | ||||
| # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html | ||||
| 
 | ||||
| [dependencies] | ||||
| bitflags = "1.3.2" | ||||
| #arc-interner = "0.7.0" | ||||
| quickpeep_densedoc = { path = "../quickpeep_densedoc" } | ||||
							
								
								
									
										1
									
								
								quickpeep_structs/src/lib.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								quickpeep_structs/src/lib.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1 @@ | ||||
| pub mod rake_entries; | ||||
							
								
								
									
										24
									
								
								quickpeep_structs/src/rake_entries.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								quickpeep_structs/src/rake_entries.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,24 @@ | ||||
| use bitflags::bitflags; | ||||
| 
 | ||||
| bitflags! { | ||||
|     pub struct AnalysisAntifeatures: u8 { | ||||
|         /// Adverts are present on the page, according to a filter.
 | ||||
|         const ADVERTS = 0x01; | ||||
|         /// Some things are blocked due to privacy concerns, according to a filter.
 | ||||
|         const PRIVACY = 0x02; | ||||
|         /// Annoying cookie nags are present on this page, according to a cosmetic filter.
 | ||||
|         const COOKIE_NAG = 0x04; | ||||
|         /// Unspecified annoyances are present on this page, according to a cosmetic filter.
 | ||||
|         const ANNOYANCE = 0x08; | ||||
| 
 | ||||
|         /// The web page was served over CloudFlare at the time of indexing, which is not in the
 | ||||
|         /// spirit of decentralisation.
 | ||||
|         const CLOUDFLARE = 0x10; | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| pub struct RakedPageEntry { | ||||
|     pub analysed_antifeatures: AnalysisAntifeatures, | ||||
|     //pub article: Option<DenseTree>,
 | ||||
|     //pub non_article: Option<DenseTree>,
 | ||||
| } | ||||
							
								
								
									
										12
									
								
								scripts/get_cf_ips.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										12
									
								
								scripts/get_cf_ips.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,12 @@ | ||||
#!/bin/sh
# Downloads Cloudflare's published IPv4 and IPv6 ranges and merges them into a
# single list at ../data/cf_ips.txt (relative to this script's directory).

set -eu

dir_path="$(dirname "$0")"
data_dir="$dir_path/../data"
v4_list="$data_dir/cf_ips_v4.txt"
v6_list="$data_dir/cf_ips_v6.txt"

mkdir -p "$data_dir"

# Remove the intermediate downloads however the script exits, so that a failed
# download does not leave partial files behind.
trap 'rm -f "$v4_list" "$v6_list"' EXIT

wget -O "$v4_list" https://www.cloudflare.com/ips-v4
wget -O "$v6_list" https://www.cloudflare.com/ips-v6

# Make sure the last IPv4 entry and the first IPv6 entry end up on separate lines.
# printf is used because `echo "\n"` is not portable: depending on which shell
# provides /bin/sh it writes either two newlines or a literal backslash-n line.
printf '\n' >> "$v4_list"
cat "$v4_list" "$v6_list" > "$data_dir/cf_ips.txt"
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user