Allow loading multiple adblock lists

This commit is contained in:
Olivier 'reivilibre' 2022-03-13 21:57:46 +00:00
parent a1097ef183
commit 6d3d7c5f47
4 changed files with 41 additions and 12 deletions

3
.gitignore vendored
View File

@ -1,3 +1,4 @@
.idea
data/cf_ips.txt
data/cf_ips.txt
data

View File

@ -1,5 +1,6 @@
use adblock::lists::RuleTypes;
use anyhow::Context;
use log::warn;
use quickpeep::raking::analysis::{load_adblock_engine, IpSet};
use quickpeep::raking::RakeIntent;
use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
@ -7,9 +8,17 @@ use quickpeep_structs::rake_entries::AnalysisAntifeatures;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use reqwest::redirect::Policy;
use reqwest::Url;
use std::path::PathBuf;
use std::str::FromStr;
use tokio::fs::File;
/// Adblock rule lists to load, as `(antifeature, name)` pairs.
///
/// `name` is the base name of a rule file: `main` looks for each list at
/// `./data/{name}.adblock` and tags the engine built from it with the paired
/// antifeature flag. Lists whose file is missing are skipped with a warning.
///
/// The names match the files written by `scripts/get_adblock_filters.sh`.
/// NOTE(review): that script also downloads `social.adblock`, which has no
/// entry here — confirm whether that is intentional.
pub const ADBLOCK_FILTER_PATHS: [(AnalysisAntifeatures, &str); 4] = [
    (AnalysisAntifeatures::COOKIE_NAG, "cookie_nag"),
    (AnalysisAntifeatures::ANNOYANCE, "annoyance"),
    (AnalysisAntifeatures::PRIVACY, "privacy"),
    (AnalysisAntifeatures::ADVERTS, "adverts"),
];
#[tokio::main]
pub async fn main() -> anyhow::Result<()> {
let mut header_map = HeaderMap::new();
@ -22,14 +31,21 @@ pub async fn main() -> anyhow::Result<()> {
.redirect(Policy::none())
.build()?;
// TODO Don't hardcode these paths in quite as bad a way...
let adblock_file = File::open("./cosmetic_filters.adblock")
.await
.context("Failed to open cosmetic filters file")?;
let adblock_engines = vec![(
AnalysisAntifeatures::ANNOYANCE,
load_adblock_engine(adblock_file, RuleTypes::CosmeticOnly).await?,
)];
let mut adblock_engines = Vec::new();
for (antifeature, name) in &ADBLOCK_FILTER_PATHS {
// TODO Don't hardcode these paths in quite as bad a way...
let path = PathBuf::from(format!("./data/{}.adblock", name));
if !path.exists() {
warn!("Missing adblock rules: {:?}.", path);
continue;
}
let file = File::open(&path).await?;
adblock_engines.push((
*antifeature,
load_adblock_engine(file, RuleTypes::All).await?,
));
}
let mut antifeature_ip_set = IpSet::new();

View File

@ -1,12 +1,10 @@
use crate::raking::analysis::{analyse_with_ad_block_cosmetic_filter, IpSet};
use adblock::engine::Engine;
use anyhow::{bail, Context};
use bytes::Bytes;
use chrono::{DateTime, FixedOffset, Utc};
use cylon::Cylon;
use futures_util::stream::StreamExt;
use html5ever::tendril::fmt::Slice;
use html5ever::QualName;
use kuchiki::traits::TendrilSink;
use kuchiki::NodeRef;
use lazy_static::lazy_static;
@ -35,6 +33,7 @@ pub enum RakeOutcome {
RakedSitemap(Vec<UrlRaked>),
/// The page was not canonical, and should not be indexed.
/// However here is the URL of the canonical page.
// TODO call this a Redirect and also use for 3xx redirects?
NotCanonical {
new_url: Url,
},
@ -98,7 +97,7 @@ lazy_static! {
}
async fn response_to_bytes_limited(
mut response: Response,
response: Response,
size_limit: usize,
time_limit: Duration,
) -> anyhow::Result<Vec<u8>> {

13
scripts/get_adblock_filters.sh Executable file
View File

@ -0,0 +1,13 @@
#!/bin/sh
# Downloads the adblock filter lists used for antifeature analysis into the
# `data` directory that sits beside this script's directory, one
# `<name>.adblock` file per list.
set -eu

dir_path="$(dirname "$0")"
data_dir="$dir_path/../data"
mkdir -p "$data_dir"

# fetch NAME URL
#
# Downloads URL to "$data_dir/NAME.adblock".
#
# `wget -O` truncates its output file before any data arrives, so a failed
# transfer would leave an empty or partial rule list that still "exists" on
# disk. To avoid that, download to a temporary file and only move it into
# place once wget has succeeded; on failure, remove the temporary file and
# exit with wget's status.
fetch() {
    dest="$data_dir/$1.adblock"
    tmp="$dest.part"
    wget -O "$tmp" "$2" || {
        status=$?
        rm -f "$tmp"
        exit "$status"
    }
    mv -f "$tmp" "$dest"
}

fetch cookie_nag https://secure.fanboy.co.nz/fanboy-cookiemonster.txt
fetch social https://easylist.to/easylist/fanboy-social.txt
fetch privacy https://easylist.to/easylist/easyprivacy.txt
fetch annoyance https://secure.fanboy.co.nz/fanboy-annoyance.txt
fetch adverts https://easylist.to/easylist/easylist.txt