Support network filter checking
This commit is contained in:
parent
de610e5aab
commit
68b7c76d1e
|
@ -3781,6 +3781,7 @@ dependencies = [
|
||||||
"lingua",
|
"lingua",
|
||||||
"log",
|
"log",
|
||||||
"lru",
|
"lru",
|
||||||
|
"markup5ever",
|
||||||
"mdbx-sys",
|
"mdbx-sys",
|
||||||
"metrics 0.18.1",
|
"metrics 0.18.1",
|
||||||
"metrics-exporter-prometheus",
|
"metrics-exporter-prometheus",
|
||||||
|
|
|
@ -20,6 +20,7 @@ colour = "0.6.0"
|
||||||
### Document Parsing
|
### Document Parsing
|
||||||
kuchiki = "0.8.1"
|
kuchiki = "0.8.1"
|
||||||
html5ever = "0.25.1"
|
html5ever = "0.25.1"
|
||||||
|
markup5ever = "0.10.1"
|
||||||
serde = { version = "1.0.136", features = ["derive"] }
|
serde = { version = "1.0.136", features = ["derive"] }
|
||||||
serde_bare = "0.5.0"
|
serde_bare = "0.5.0"
|
||||||
serde_json = "1.0.79"
|
serde_json = "1.0.79"
|
||||||
|
|
|
@ -1,11 +1,15 @@
|
||||||
use adblock::engine::Engine;
|
use adblock::engine::Engine;
|
||||||
use adblock::lists::{ParseOptions, RuleTypes};
|
use adblock::lists::{ParseOptions, RuleTypes};
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
|
use html5ever::QualName;
|
||||||
use ipnetwork::IpNetwork;
|
use ipnetwork::IpNetwork;
|
||||||
use kuchiki::NodeRef;
|
use kuchiki::{ElementData, NodeData, NodeRef};
|
||||||
use lingua::Language;
|
use lingua::Language;
|
||||||
use std::collections::{BTreeSet, HashSet};
|
use log::debug;
|
||||||
|
use reqwest::Url;
|
||||||
|
use std::collections::{BTreeSet, HashMap, HashSet};
|
||||||
use std::net::IpAddr;
|
use std::net::IpAddr;
|
||||||
|
use std::ops::Deref;
|
||||||
use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
|
use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
|
||||||
|
|
||||||
pub struct PreloadedEngine {
|
pub struct PreloadedEngine {
|
||||||
|
@ -117,6 +121,80 @@ pub fn analyse_with_ad_block_cosmetic_filter(
|
||||||
Ok(matches > 0)
|
Ok(matches > 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Renames a node by fully replacing it in the tree.
|
||||||
|
fn replace_node(node: &NodeRef, name: Option<&str>) -> NodeRef {
|
||||||
|
let ele = node.as_element().unwrap();
|
||||||
|
let new_node = NodeRef::new(NodeData::Element(ElementData {
|
||||||
|
name: QualName::new(
|
||||||
|
None,
|
||||||
|
ele.name.ns.clone(),
|
||||||
|
name.unwrap_or(ele.name.local.deref()).into(),
|
||||||
|
),
|
||||||
|
attributes: ele.attributes.clone(),
|
||||||
|
template_contents: None,
|
||||||
|
}));
|
||||||
|
node.insert_after(new_node.clone());
|
||||||
|
for child in node.children() {
|
||||||
|
new_node.append(child);
|
||||||
|
}
|
||||||
|
node.detach();
|
||||||
|
new_node
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn analyse_with_ad_block_network_filter(
|
||||||
|
root: &NodeRef,
|
||||||
|
engine: &Engine,
|
||||||
|
url: &str,
|
||||||
|
) -> anyhow::Result<bool> {
|
||||||
|
let mut matches = 0;
|
||||||
|
|
||||||
|
let mut to_visit = vec![root.clone()];
|
||||||
|
|
||||||
|
let this_url = Url::parse(url)?;
|
||||||
|
|
||||||
|
// Element type → (Attribute, Request type)
|
||||||
|
let processing_actions: HashMap<&'static str, (&'static str, &'static str)> = [
|
||||||
|
("a", ("href", "document")),
|
||||||
|
("img", ("src", "image")),
|
||||||
|
("script", ("src", "script")),
|
||||||
|
]
|
||||||
|
.into_iter()
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
while let Some(visit_next) = to_visit.pop() {
|
||||||
|
for child in visit_next.children() {
|
||||||
|
to_visit.push(child);
|
||||||
|
}
|
||||||
|
if let Some(element) = visit_next.as_element() {
|
||||||
|
if let Some((attr_name, req_type)) = processing_actions.get(element.name.local.deref())
|
||||||
|
{
|
||||||
|
let attrs = element.attributes.borrow_mut();
|
||||||
|
if let Some(href) = attrs.get(*attr_name) {
|
||||||
|
let should_remove = match this_url.join(href) {
|
||||||
|
Ok(href) => {
|
||||||
|
//eprintln!("check {:?}/{:?}/{:?}", href.as_str(), this_url.as_str(), *req_type);
|
||||||
|
engine
|
||||||
|
.check_network_urls(href.as_str(), this_url.as_str(), *req_type)
|
||||||
|
.matched
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
debug!("Removing erroneous {}: {:?} {:?}", attr_name, href, err);
|
||||||
|
true
|
||||||
|
}
|
||||||
|
};
|
||||||
|
drop(attrs);
|
||||||
|
if should_remove {
|
||||||
|
matches += 1;
|
||||||
|
replace_node(&visit_next, Some("disabled-node"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(matches > 0)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn guess_document_language(text: &str) -> Option<String> {
|
pub fn guess_document_language(text: &str) -> Option<String> {
|
||||||
let detector = lingua::LanguageDetectorBuilder::from_all_languages().build();
|
let detector = lingua::LanguageDetectorBuilder::from_all_languages().build();
|
||||||
detector
|
detector
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
use crate::raking::analysis::{
|
use crate::raking::analysis::{
|
||||||
analyse_with_ad_block_cosmetic_filter, guess_document_language, PreloadedEngine,
|
analyse_with_ad_block_cosmetic_filter, analyse_with_ad_block_network_filter,
|
||||||
|
guess_document_language, PreloadedEngine,
|
||||||
};
|
};
|
||||||
use crate::raking::{normalise_language, RedirectReason, FEED_LINK_MIME_TYPES};
|
use crate::raking::{normalise_language, RedirectReason, FEED_LINK_MIME_TYPES};
|
||||||
use adblock::engine::Engine;
|
use adblock::engine::Engine;
|
||||||
|
@ -180,6 +181,16 @@ impl PageExtractionServiceInternal {
|
||||||
error!("Cosmetic Filter Err {:?}", err);
|
error!("Cosmetic Filter Err {:?}", err);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
match analyse_with_ad_block_network_filter(&root_node, &adblock_engine, url.as_str()) {
|
||||||
|
Ok(network_filters_tripped) => {
|
||||||
|
debug!("?network filters tripped: {}", network_filters_tripped);
|
||||||
|
antifeature_flags |= *engine_antifeature_flag;
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
error!("Network Filter Err {:?}", err);
|
||||||
|
}
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
let dense_doc = DenseTree::from_body(root_node.clone());
|
let dense_doc = DenseTree::from_body(root_node.clone());
|
||||||
|
|
Loading…
Reference in New Issue