Allow loading multiple adblock lists
This commit is contained in:
parent
a1097ef183
commit
6d3d7c5f47
|
@ -1,3 +1,4 @@
|
||||||
|
|
||||||
.idea
|
.idea
|
||||||
data/cf_ips.txt
|
data/cf_ips.txt
|
||||||
|
data
|
|
@ -1,5 +1,6 @@
|
||||||
use adblock::lists::RuleTypes;
|
use adblock::lists::RuleTypes;
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
|
use log::warn;
|
||||||
use quickpeep::raking::analysis::{load_adblock_engine, IpSet};
|
use quickpeep::raking::analysis::{load_adblock_engine, IpSet};
|
||||||
use quickpeep::raking::RakeIntent;
|
use quickpeep::raking::RakeIntent;
|
||||||
use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
|
use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
|
||||||
|
@ -7,9 +8,17 @@ use quickpeep_structs::rake_entries::AnalysisAntifeatures;
|
||||||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
||||||
use reqwest::redirect::Policy;
|
use reqwest::redirect::Policy;
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
|
use std::path::PathBuf;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use tokio::fs::File;
|
use tokio::fs::File;
|
||||||
|
|
||||||
|
/// Adblock rule lists to load: each entry pairs an antifeature flag with the
/// base name of its rules file. `main` reads `./data/<name>.adblock` for every
/// entry and skips, with a warning, any file that does not exist.
pub const ADBLOCK_FILTER_PATHS: [(AnalysisAntifeatures, &'static str); 4] = [
|
||||||
|
(AnalysisAntifeatures::COOKIE_NAG, "cookie_nag"),
|
||||||
|
(AnalysisAntifeatures::ANNOYANCE, "annoyance"),
|
||||||
|
(AnalysisAntifeatures::PRIVACY, "privacy"),
|
||||||
|
(AnalysisAntifeatures::ADVERTS, "adverts"),
|
||||||
|
];
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
pub async fn main() -> anyhow::Result<()> {
|
pub async fn main() -> anyhow::Result<()> {
|
||||||
let mut header_map = HeaderMap::new();
|
let mut header_map = HeaderMap::new();
|
||||||
|
@ -22,14 +31,21 @@ pub async fn main() -> anyhow::Result<()> {
|
||||||
.redirect(Policy::none())
|
.redirect(Policy::none())
|
||||||
.build()?;
|
.build()?;
|
||||||
|
|
||||||
// TODO Don't hardcode these paths in quite as bad a way...
|
let mut adblock_engines = Vec::new();
|
||||||
let adblock_file = File::open("./cosmetic_filters.adblock")
|
|
||||||
.await
|
for (antifeature, name) in &ADBLOCK_FILTER_PATHS {
|
||||||
.context("Failed to open cosmetic filters file")?;
|
// TODO Don't hardcode these paths in quite as bad a way...
|
||||||
let adblock_engines = vec![(
|
let path = PathBuf::from(format!("./data/{}.adblock", name));
|
||||||
AnalysisAntifeatures::ANNOYANCE,
|
if !path.exists() {
|
||||||
load_adblock_engine(adblock_file, RuleTypes::CosmeticOnly).await?,
|
warn!("Missing adblock rules: {:?}.", path);
|
||||||
)];
|
continue;
|
||||||
|
}
|
||||||
|
let file = File::open(&path).await?;
|
||||||
|
adblock_engines.push((
|
||||||
|
*antifeature,
|
||||||
|
load_adblock_engine(file, RuleTypes::All).await?,
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
let mut antifeature_ip_set = IpSet::new();
|
let mut antifeature_ip_set = IpSet::new();
|
||||||
|
|
||||||
|
|
|
@ -1,12 +1,10 @@
|
||||||
use crate::raking::analysis::{analyse_with_ad_block_cosmetic_filter, IpSet};
|
use crate::raking::analysis::{analyse_with_ad_block_cosmetic_filter, IpSet};
|
||||||
use adblock::engine::Engine;
|
use adblock::engine::Engine;
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use bytes::Bytes;
|
|
||||||
use chrono::{DateTime, FixedOffset, Utc};
|
use chrono::{DateTime, FixedOffset, Utc};
|
||||||
use cylon::Cylon;
|
use cylon::Cylon;
|
||||||
use futures_util::stream::StreamExt;
|
use futures_util::stream::StreamExt;
|
||||||
use html5ever::tendril::fmt::Slice;
|
use html5ever::tendril::fmt::Slice;
|
||||||
use html5ever::QualName;
|
|
||||||
use kuchiki::traits::TendrilSink;
|
use kuchiki::traits::TendrilSink;
|
||||||
use kuchiki::NodeRef;
|
use kuchiki::NodeRef;
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
|
@ -35,6 +33,7 @@ pub enum RakeOutcome {
|
||||||
RakedSitemap(Vec<UrlRaked>),
|
RakedSitemap(Vec<UrlRaked>),
|
||||||
/// The page was not canonical, and should not be indexed.
|
/// The page was not canonical, and should not be indexed.
|
||||||
/// However here is the URL of the canonical page.
|
/// However here is the URL of the canonical page.
|
||||||
|
// TODO call this a Redirect and also use for 3xx redirects?
|
||||||
NotCanonical {
|
NotCanonical {
|
||||||
new_url: Url,
|
new_url: Url,
|
||||||
},
|
},
|
||||||
|
@ -98,7 +97,7 @@ lazy_static! {
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn response_to_bytes_limited(
|
async fn response_to_bytes_limited(
|
||||||
mut response: Response,
|
response: Response,
|
||||||
size_limit: usize,
|
size_limit: usize,
|
||||||
time_limit: Duration,
|
time_limit: Duration,
|
||||||
) -> anyhow::Result<Vec<u8>> {
|
) -> anyhow::Result<Vec<u8>> {
|
||||||
|
|
|
@ -0,0 +1,13 @@
|
||||||
|
#!/bin/sh
# Downloads the adblock filter lists into the data/ directory that sits one
# level above the directory containing this script.
|
||||||
|
|
||||||
|
# Stop at the first failing command (-e) and on any use of an unset variable (-u).
set -eu
|
||||||
|
|
||||||
|
# Directory containing this script, so the output paths below do not depend on
# the caller's working directory.
dir_path="$(dirname "$0")"
|
||||||
|
|
||||||
|
# Create the target directory if it does not exist yet.
mkdir -p "$dir_path/../data"
|
||||||
|
# Each list is saved as <name>.adblock; the Rust loader in this commit opens
# ./data/<name>.adblock relative to its own working directory.
wget -O "$dir_path/../data/cookie_nag.adblock" https://secure.fanboy.co.nz/fanboy-cookiemonster.txt
|
||||||
|
# NOTE(review): ADBLOCK_FILTER_PATHS as shown in this commit has no "social"
# entry, so this list looks unused by the loader; confirm.
wget -O "$dir_path/../data/social.adblock" https://easylist.to/easylist/fanboy-social.txt
|
||||||
|
wget -O "$dir_path/../data/privacy.adblock" https://easylist.to/easylist/easyprivacy.txt
|
||||||
|
wget -O "$dir_path/../data/annoyance.adblock" https://secure.fanboy.co.nz/fanboy-annoyance.txt
|
||||||
|
wget -O "$dir_path/../data/adverts.adblock" https://easylist.to/easylist/easylist.txt
|
||||||
|
|
Loading…
Reference in New Issue