Support network filter checking
This commit is contained in:
parent
de610e5aab
commit
68b7c76d1e
|
@ -3781,6 +3781,7 @@ dependencies = [
|
|||
"lingua",
|
||||
"log",
|
||||
"lru",
|
||||
"markup5ever",
|
||||
"mdbx-sys",
|
||||
"metrics 0.18.1",
|
||||
"metrics-exporter-prometheus",
|
||||
|
|
|
@ -20,6 +20,7 @@ colour = "0.6.0"
|
|||
### Document Parsing
|
||||
kuchiki = "0.8.1"
|
||||
html5ever = "0.25.1"
|
||||
markup5ever = "0.10.1"
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
serde_bare = "0.5.0"
|
||||
serde_json = "1.0.79"
|
||||
|
|
|
@ -1,11 +1,15 @@
|
|||
use adblock::engine::Engine;
|
||||
use adblock::lists::{ParseOptions, RuleTypes};
|
||||
use anyhow::Context;
|
||||
use html5ever::QualName;
|
||||
use ipnetwork::IpNetwork;
|
||||
use kuchiki::NodeRef;
|
||||
use kuchiki::{ElementData, NodeData, NodeRef};
|
||||
use lingua::Language;
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use log::debug;
|
||||
use reqwest::Url;
|
||||
use std::collections::{BTreeSet, HashMap, HashSet};
|
||||
use std::net::IpAddr;
|
||||
use std::ops::Deref;
|
||||
use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
|
||||
|
||||
pub struct PreloadedEngine {
|
||||
|
@ -117,6 +121,80 @@ pub fn analyse_with_ad_block_cosmetic_filter(
|
|||
Ok(matches > 0)
|
||||
}
|
||||
|
||||
/// Renames a node by fully replacing it in the tree.
|
||||
fn replace_node(node: &NodeRef, name: Option<&str>) -> NodeRef {
|
||||
let ele = node.as_element().unwrap();
|
||||
let new_node = NodeRef::new(NodeData::Element(ElementData {
|
||||
name: QualName::new(
|
||||
None,
|
||||
ele.name.ns.clone(),
|
||||
name.unwrap_or(ele.name.local.deref()).into(),
|
||||
),
|
||||
attributes: ele.attributes.clone(),
|
||||
template_contents: None,
|
||||
}));
|
||||
node.insert_after(new_node.clone());
|
||||
for child in node.children() {
|
||||
new_node.append(child);
|
||||
}
|
||||
node.detach();
|
||||
new_node
|
||||
}
|
||||
|
||||
pub fn analyse_with_ad_block_network_filter(
|
||||
root: &NodeRef,
|
||||
engine: &Engine,
|
||||
url: &str,
|
||||
) -> anyhow::Result<bool> {
|
||||
let mut matches = 0;
|
||||
|
||||
let mut to_visit = vec![root.clone()];
|
||||
|
||||
let this_url = Url::parse(url)?;
|
||||
|
||||
// Element type → (Attribute, Request type)
|
||||
let processing_actions: HashMap<&'static str, (&'static str, &'static str)> = [
|
||||
("a", ("href", "document")),
|
||||
("img", ("src", "image")),
|
||||
("script", ("src", "script")),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
while let Some(visit_next) = to_visit.pop() {
|
||||
for child in visit_next.children() {
|
||||
to_visit.push(child);
|
||||
}
|
||||
if let Some(element) = visit_next.as_element() {
|
||||
if let Some((attr_name, req_type)) = processing_actions.get(element.name.local.deref())
|
||||
{
|
||||
let attrs = element.attributes.borrow_mut();
|
||||
if let Some(href) = attrs.get(*attr_name) {
|
||||
let should_remove = match this_url.join(href) {
|
||||
Ok(href) => {
|
||||
//eprintln!("check {:?}/{:?}/{:?}", href.as_str(), this_url.as_str(), *req_type);
|
||||
engine
|
||||
.check_network_urls(href.as_str(), this_url.as_str(), *req_type)
|
||||
.matched
|
||||
}
|
||||
Err(err) => {
|
||||
debug!("Removing erroneous {}: {:?} {:?}", attr_name, href, err);
|
||||
true
|
||||
}
|
||||
};
|
||||
drop(attrs);
|
||||
if should_remove {
|
||||
matches += 1;
|
||||
replace_node(&visit_next, Some("disabled-node"));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(matches > 0)
|
||||
}
|
||||
|
||||
pub fn guess_document_language(text: &str) -> Option<String> {
|
||||
let detector = lingua::LanguageDetectorBuilder::from_all_languages().build();
|
||||
detector
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
use crate::raking::analysis::{
|
||||
analyse_with_ad_block_cosmetic_filter, guess_document_language, PreloadedEngine,
|
||||
analyse_with_ad_block_cosmetic_filter, analyse_with_ad_block_network_filter,
|
||||
guess_document_language, PreloadedEngine,
|
||||
};
|
||||
use crate::raking::{normalise_language, RedirectReason, FEED_LINK_MIME_TYPES};
|
||||
use adblock::engine::Engine;
|
||||
|
@ -180,6 +181,16 @@ impl PageExtractionServiceInternal {
|
|||
error!("Cosmetic Filter Err {:?}", err);
|
||||
}
|
||||
};
|
||||
|
||||
match analyse_with_ad_block_network_filter(&root_node, &adblock_engine, url.as_str()) {
|
||||
Ok(network_filters_tripped) => {
|
||||
debug!("?network filters tripped: {}", network_filters_tripped);
|
||||
antifeature_flags |= *engine_antifeature_flag;
|
||||
}
|
||||
Err(err) => {
|
||||
error!("Network Filter Err {:?}", err);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
let dense_doc = DenseTree::from_body(root_node.clone());
|
||||
|
|
Loading…
Reference in New Issue