Support network filter checking

This commit is contained in:
Olivier 'reivilibre' 2022-03-28 23:43:01 +01:00
parent de610e5aab
commit 68b7c76d1e
4 changed files with 94 additions and 3 deletions

1
Cargo.lock generated
View File

@ -3781,6 +3781,7 @@ dependencies = [
"lingua",
"log",
"lru",
"markup5ever",
"mdbx-sys",
"metrics 0.18.1",
"metrics-exporter-prometheus",

View File

@ -20,6 +20,7 @@ colour = "0.6.0"
### Document Parsing
kuchiki = "0.8.1"
html5ever = "0.25.1"
markup5ever = "0.10.1"
serde = { version = "1.0.136", features = ["derive"] }
serde_bare = "0.5.0"
serde_json = "1.0.79"

View File

@ -1,11 +1,15 @@
use adblock::engine::Engine;
use adblock::lists::{ParseOptions, RuleTypes};
use anyhow::Context;
use html5ever::QualName;
use ipnetwork::IpNetwork;
use kuchiki::NodeRef;
use kuchiki::{ElementData, NodeData, NodeRef};
use lingua::Language;
use std::collections::{BTreeSet, HashSet};
use log::debug;
use reqwest::Url;
use std::collections::{BTreeSet, HashMap, HashSet};
use std::net::IpAddr;
use std::ops::Deref;
use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
pub struct PreloadedEngine {
@ -117,6 +121,80 @@ pub fn analyse_with_ad_block_cosmetic_filter(
Ok(matches > 0)
}
/// Renames a node by fully replacing it in the tree.
fn replace_node(node: &NodeRef, name: Option<&str>) -> NodeRef {
let ele = node.as_element().unwrap();
let new_node = NodeRef::new(NodeData::Element(ElementData {
name: QualName::new(
None,
ele.name.ns.clone(),
name.unwrap_or(ele.name.local.deref()).into(),
),
attributes: ele.attributes.clone(),
template_contents: None,
}));
node.insert_after(new_node.clone());
for child in node.children() {
new_node.append(child);
}
node.detach();
new_node
}
pub fn analyse_with_ad_block_network_filter(
root: &NodeRef,
engine: &Engine,
url: &str,
) -> anyhow::Result<bool> {
let mut matches = 0;
let mut to_visit = vec![root.clone()];
let this_url = Url::parse(url)?;
// Element type → (Attribute, Request type)
let processing_actions: HashMap<&'static str, (&'static str, &'static str)> = [
("a", ("href", "document")),
("img", ("src", "image")),
("script", ("src", "script")),
]
.into_iter()
.collect();
while let Some(visit_next) = to_visit.pop() {
for child in visit_next.children() {
to_visit.push(child);
}
if let Some(element) = visit_next.as_element() {
if let Some((attr_name, req_type)) = processing_actions.get(element.name.local.deref())
{
let attrs = element.attributes.borrow_mut();
if let Some(href) = attrs.get(*attr_name) {
let should_remove = match this_url.join(href) {
Ok(href) => {
//eprintln!("check {:?}/{:?}/{:?}", href.as_str(), this_url.as_str(), *req_type);
engine
.check_network_urls(href.as_str(), this_url.as_str(), *req_type)
.matched
}
Err(err) => {
debug!("Removing erroneous {}: {:?} {:?}", attr_name, href, err);
true
}
};
drop(attrs);
if should_remove {
matches += 1;
replace_node(&visit_next, Some("disabled-node"));
}
}
}
}
}
Ok(matches > 0)
}
pub fn guess_document_language(text: &str) -> Option<String> {
let detector = lingua::LanguageDetectorBuilder::from_all_languages().build();
detector

View File

@ -1,5 +1,6 @@
use crate::raking::analysis::{
analyse_with_ad_block_cosmetic_filter, guess_document_language, PreloadedEngine,
analyse_with_ad_block_cosmetic_filter, analyse_with_ad_block_network_filter,
guess_document_language, PreloadedEngine,
};
use crate::raking::{normalise_language, RedirectReason, FEED_LINK_MIME_TYPES};
use adblock::engine::Engine;
@ -180,6 +181,16 @@ impl PageExtractionServiceInternal {
error!("Cosmetic Filter Err {:?}", err);
}
};
match analyse_with_ad_block_network_filter(&root_node, &adblock_engine, url.as_str()) {
Ok(network_filters_tripped) => {
debug!("?network filters tripped: {}", network_filters_tripped);
antifeature_flags |= *engine_antifeature_flag;
}
Err(err) => {
error!("Network Filter Err {:?}", err);
}
};
}
let dense_doc = DenseTree::from_body(root_node.clone());