Allow loading multiple adblock lists

This commit is contained in:
Olivier 'reivilibre' 2022-03-13 21:57:46 +00:00
parent a1097ef183
commit 6d3d7c5f47
4 changed files with 41 additions and 12 deletions

1
.gitignore vendored
View File

@ -1,3 +1,4 @@
.idea .idea
data/cf_ips.txt data/cf_ips.txt
data

View File

@ -1,5 +1,6 @@
use adblock::lists::RuleTypes; use adblock::lists::RuleTypes;
use anyhow::Context; use anyhow::Context;
use log::warn;
use quickpeep::raking::analysis::{load_adblock_engine, IpSet}; use quickpeep::raking::analysis::{load_adblock_engine, IpSet};
use quickpeep::raking::RakeIntent; use quickpeep::raking::RakeIntent;
use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT}; use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
@ -7,9 +8,17 @@ use quickpeep_structs::rake_entries::AnalysisAntifeatures;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT}; use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use reqwest::redirect::Policy; use reqwest::redirect::Policy;
use reqwest::Url; use reqwest::Url;
use std::path::PathBuf;
use std::str::FromStr; use std::str::FromStr;
use tokio::fs::File; use tokio::fs::File;
/// Adblock filter lists to load, each paired with the antifeature it is
/// associated with.
///
/// The string is the list's base name, not a full path: `main` expands it to
/// `./data/<name>.adblock` and skips (with a warning) any list whose file is
/// missing.
pub const ADBLOCK_FILTER_PATHS: [(AnalysisAntifeatures, &str); 4] = [
    (AnalysisAntifeatures::COOKIE_NAG, "cookie_nag"),
    (AnalysisAntifeatures::ANNOYANCE, "annoyance"),
    (AnalysisAntifeatures::PRIVACY, "privacy"),
    (AnalysisAntifeatures::ADVERTS, "adverts"),
];
#[tokio::main] #[tokio::main]
pub async fn main() -> anyhow::Result<()> { pub async fn main() -> anyhow::Result<()> {
let mut header_map = HeaderMap::new(); let mut header_map = HeaderMap::new();
@ -22,14 +31,21 @@ pub async fn main() -> anyhow::Result<()> {
.redirect(Policy::none()) .redirect(Policy::none())
.build()?; .build()?;
// TODO Don't hardcode these paths in quite as bad a way... let mut adblock_engines = Vec::new();
let adblock_file = File::open("./cosmetic_filters.adblock")
.await for (antifeature, name) in &ADBLOCK_FILTER_PATHS {
.context("Failed to open cosmetic filters file")?; // TODO Don't hardcode these paths in quite as bad a way...
let adblock_engines = vec![( let path = PathBuf::from(format!("./data/{}.adblock", name));
AnalysisAntifeatures::ANNOYANCE, if !path.exists() {
load_adblock_engine(adblock_file, RuleTypes::CosmeticOnly).await?, warn!("Missing adblock rules: {:?}.", path);
)]; continue;
}
let file = File::open(&path).await?;
adblock_engines.push((
*antifeature,
load_adblock_engine(file, RuleTypes::All).await?,
));
}
let mut antifeature_ip_set = IpSet::new(); let mut antifeature_ip_set = IpSet::new();

View File

@ -1,12 +1,10 @@
use crate::raking::analysis::{analyse_with_ad_block_cosmetic_filter, IpSet}; use crate::raking::analysis::{analyse_with_ad_block_cosmetic_filter, IpSet};
use adblock::engine::Engine; use adblock::engine::Engine;
use anyhow::{bail, Context}; use anyhow::{bail, Context};
use bytes::Bytes;
use chrono::{DateTime, FixedOffset, Utc}; use chrono::{DateTime, FixedOffset, Utc};
use cylon::Cylon; use cylon::Cylon;
use futures_util::stream::StreamExt; use futures_util::stream::StreamExt;
use html5ever::tendril::fmt::Slice; use html5ever::tendril::fmt::Slice;
use html5ever::QualName;
use kuchiki::traits::TendrilSink; use kuchiki::traits::TendrilSink;
use kuchiki::NodeRef; use kuchiki::NodeRef;
use lazy_static::lazy_static; use lazy_static::lazy_static;
@ -35,6 +33,7 @@ pub enum RakeOutcome {
RakedSitemap(Vec<UrlRaked>), RakedSitemap(Vec<UrlRaked>),
/// The page was not canonical, and should not be indexed. /// The page was not canonical, and should not be indexed.
/// However here is the URL of the canonical page. /// However here is the URL of the canonical page.
// TODO call this a Redirect and also use for 3xx redirects?
NotCanonical { NotCanonical {
new_url: Url, new_url: Url,
}, },
@ -98,7 +97,7 @@ lazy_static! {
} }
async fn response_to_bytes_limited( async fn response_to_bytes_limited(
mut response: Response, response: Response,
size_limit: usize, size_limit: usize,
time_limit: Duration, time_limit: Duration,
) -> anyhow::Result<Vec<u8>> { ) -> anyhow::Result<Vec<u8>> {

13
scripts/get_adblock_filters.sh Executable file
View File

@ -0,0 +1,13 @@
#!/bin/sh
# Download the adblock filter lists into the `data` directory that sits next
# to this script's parent directory (i.e. "<script dir>/../data").
#
# Each list is stored as "<name>.adblock". The script aborts on the first
# failed download (set -e) and propagates wget's exit status.
set -eu

dir_path="$(dirname "$0")"
data_dir="$dir_path/../data"

mkdir -p "$data_dir"

# fetch_list NAME URL
# Download URL to "$data_dir/NAME.adblock".
# The download goes to a temporary ".part" file first and is only renamed into
# place on success: `wget -O` creates/truncates its output file even when the
# transfer fails, which would otherwise leave an empty or partial filter list
# behind (and clobber a previously downloaded good copy).
fetch_list() {
    target="$data_dir/$1.adblock"
    partial="$target.part"

    if wget -O "$partial" "$2"; then
        mv -f "$partial" "$target"
    else
        status=$?
        rm -f "$partial"
        return "$status"
    fi
}

fetch_list cookie_nag https://secure.fanboy.co.nz/fanboy-cookiemonster.txt
fetch_list social https://easylist.to/easylist/fanboy-social.txt
fetch_list privacy https://easylist.to/easylist/easyprivacy.txt
fetch_list annoyance https://secure.fanboy.co.nz/fanboy-annoyance.txt
fetch_list adverts https://easylist.to/easylist/easylist.txt