diff --git a/Cargo.lock b/Cargo.lock
index 39d5233..98988f2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3781,6 +3781,7 @@ dependencies = [
  "lingua",
  "log",
  "lru",
+ "markup5ever",
  "mdbx-sys",
  "metrics 0.18.1",
  "metrics-exporter-prometheus",
diff --git a/quickpeep_raker/Cargo.toml b/quickpeep_raker/Cargo.toml
index fe5c9d4..49b4ed5 100644
--- a/quickpeep_raker/Cargo.toml
+++ b/quickpeep_raker/Cargo.toml
@@ -20,6 +20,7 @@ colour = "0.6.0"
 ### Document Parsing
 kuchiki = "0.8.1"
 html5ever = "0.25.1"
+markup5ever = "0.10.1"
 serde = { version = "1.0.136", features = ["derive"] }
 serde_bare = "0.5.0"
 serde_json = "1.0.79"
diff --git a/quickpeep_raker/src/raking/analysis.rs b/quickpeep_raker/src/raking/analysis.rs
index 8b75598..59b0460 100644
--- a/quickpeep_raker/src/raking/analysis.rs
+++ b/quickpeep_raker/src/raking/analysis.rs
@@ -1,11 +1,15 @@
 use adblock::engine::Engine;
 use adblock::lists::{ParseOptions, RuleTypes};
 use anyhow::Context;
+use html5ever::QualName;
 use ipnetwork::IpNetwork;
-use kuchiki::NodeRef;
+use kuchiki::{ElementData, NodeData, NodeRef};
 use lingua::Language;
-use std::collections::{BTreeSet, HashSet};
+use log::debug;
+use reqwest::Url;
+use std::collections::{BTreeSet, HashMap, HashSet};
 use std::net::IpAddr;
+use std::ops::Deref;
 use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
 
 pub struct PreloadedEngine {
@@ -117,6 +121,118 @@ pub fn analyse_with_ad_block_cosmetic_filter(
     Ok(matches > 0)
 }
 
+/// Renames a node by fully replacing it in the tree.
+///
+/// A new element is created with the namespace of `node`, a clone of its
+/// attributes and the local name `name` (the original local name is kept when
+/// `name` is `None`). It is inserted directly after `node`, the children of
+/// `node` are appended to it, and `node` itself is detached.
+///
+/// The new element is always built with `template_contents: None`.
+///
+/// Returns the new element.
+///
+/// # Panics
+///
+/// Panics if `node` is not an element.
+fn replace_node(node: &NodeRef, name: Option<&str>) -> NodeRef {
+    let ele = node
+        .as_element()
+        .expect("replace_node requires an element node");
+    let new_node = NodeRef::new(NodeData::Element(ElementData {
+        name: QualName::new(
+            None,
+            ele.name.ns.clone(),
+            name.unwrap_or(ele.name.local.deref()).into(),
+        ),
+        attributes: ele.attributes.clone(),
+        template_contents: None,
+    }));
+    node.insert_after(new_node.clone());
+    for child in node.children() {
+        new_node.append(child);
+    }
+    node.detach();
+    new_node
+}
+
+/// Applies the network filters of `engine` to the links, images and scripts
+/// found below `root`.
+///
+/// For every `<a href>`, `<img src>` and `<script src>` element, the attribute
+/// value is resolved against `url` and checked with
+/// `Engine::check_network_urls`, using `url` as the source URL and `document`,
+/// `image` or `script` respectively as the request type. Elements that match,
+/// and elements whose attribute cannot be resolved to a URL, are replaced by a
+/// `disabled-node` element that keeps their attributes and children.
+///
+/// Returns `Ok(true)` if at least one element was replaced.
+///
+/// # Errors
+///
+/// Returns an error if `url` cannot be parsed.
+pub fn analyse_with_ad_block_network_filter(
+    root: &NodeRef,
+    engine: &Engine,
+    url: &str,
+) -> anyhow::Result<bool> {
+    let this_url = Url::parse(url)?;
+
+    // Element type → (Attribute, Request type)
+    let processing_actions: HashMap<&'static str, (&'static str, &'static str)> = [
+        ("a", ("href", "document")),
+        ("img", ("src", "image")),
+        ("script", ("src", "script")),
+    ]
+    .into_iter()
+    .collect();
+
+    let mut matches = 0;
+    let mut to_visit = vec![root.clone()];
+
+    while let Some(visit_next) = to_visit.pop() {
+        // Queue the children before touching the node: replacing it below moves
+        // them to the new element, and they must still be visited.
+        to_visit.extend(visit_next.children());
+
+        let element = match visit_next.as_element() {
+            Some(element) => element,
+            None => continue,
+        };
+        let (attr_name, req_type) = match processing_actions.get(element.name.local.deref()) {
+            Some(action) => *action,
+            None => continue,
+        };
+
+        let should_remove = {
+            // The attributes are only read, so a shared borrow is enough. It ends
+            // with this scope, before `replace_node` touches the same `RefCell`.
+            let attrs = element.attributes.borrow();
+            match attrs.get(attr_name) {
+                Some(href) => match this_url.join(href) {
+                    Ok(target) => {
+                        engine
+                            .check_network_urls(target.as_str(), this_url.as_str(), req_type)
+                            .matched
+                    }
+                    Err(err) => {
+                        debug!("Removing erroneous {}: {:?} {:?}", attr_name, href, err);
+                        true
+                    }
+                },
+                None => false,
+            }
+        };
+
+        if should_remove {
+            matches += 1;
+            replace_node(&visit_next, Some("disabled-node"));
+        }
+    }
+
+    Ok(matches > 0)
+}
+
 pub fn guess_document_language(text: &str) -> Option {
     let detector = lingua::LanguageDetectorBuilder::from_all_languages().build();
     detector
diff --git 
a/quickpeep_raker/src/raking/page_extraction.rs b/quickpeep_raker/src/raking/page_extraction.rs
index 8c7a6ab..d9642bd 100644
--- a/quickpeep_raker/src/raking/page_extraction.rs
+++ b/quickpeep_raker/src/raking/page_extraction.rs
@@ -1,5 +1,6 @@
 use crate::raking::analysis::{
-    analyse_with_ad_block_cosmetic_filter, guess_document_language, PreloadedEngine,
+    analyse_with_ad_block_cosmetic_filter, analyse_with_ad_block_network_filter,
+    guess_document_language, PreloadedEngine,
 };
 use crate::raking::{normalise_language, RedirectReason, FEED_LINK_MIME_TYPES};
 use adblock::engine::Engine;
@@ -180,6 +181,20 @@ impl PageExtractionServiceInternal {
                     error!("Cosmetic Filter Err {:?}", err);
                 }
             };
+
+            // Network filters: may rewrite blocked links/images/scripts in `root_node`.
+            match analyse_with_ad_block_network_filter(&root_node, &adblock_engine, url.as_str()) {
+                Ok(network_filters_tripped) => {
+                    debug!("?network filters tripped: {}", network_filters_tripped);
+                    // Only raise the antifeature flag when a filter actually matched.
+                    if network_filters_tripped {
+                        antifeature_flags |= *engine_antifeature_flag;
+                    }
+                }
+                Err(err) => {
+                    error!("Network Filter Err {:?}", err);
+                }
+            };
         }
 
         let dense_doc = DenseTree::from_body(root_node.clone());