Allow loading multiple adblock lists
This commit is contained in:
parent
a1097ef183
commit
6d3d7c5f47
|
@ -1,3 +1,4 @@
|
|||
|
||||
.idea
|
||||
data/cf_ips.txt
|
||||
data/cf_ips.txt
|
||||
data
|
|
@ -1,5 +1,6 @@
|
|||
use adblock::lists::RuleTypes;
|
||||
use anyhow::Context;
|
||||
use log::warn;
|
||||
use quickpeep::raking::analysis::{load_adblock_engine, IpSet};
|
||||
use quickpeep::raking::RakeIntent;
|
||||
use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
|
||||
|
@ -7,9 +8,17 @@ use quickpeep_structs::rake_entries::AnalysisAntifeatures;
|
|||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
||||
use reqwest::redirect::Policy;
|
||||
use reqwest::Url;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use tokio::fs::File;
|
||||
|
||||
/// Adblock filter lists to load, as `(antifeature flag, list name)` pairs.
///
/// `main` resolves each name to `./data/<name>.adblock`; a list whose file does
/// not exist is skipped with a warning rather than treated as an error.
pub const ADBLOCK_FILTER_PATHS: [(AnalysisAntifeatures, &'static str); 4] = [
|
||||
(AnalysisAntifeatures::COOKIE_NAG, "cookie_nag"),
|
||||
(AnalysisAntifeatures::ANNOYANCE, "annoyance"),
|
||||
(AnalysisAntifeatures::PRIVACY, "privacy"),
|
||||
(AnalysisAntifeatures::ADVERTS, "adverts"),
|
||||
];
|
||||
|
||||
#[tokio::main]
|
||||
pub async fn main() -> anyhow::Result<()> {
|
||||
let mut header_map = HeaderMap::new();
|
||||
|
@ -22,14 +31,21 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
.redirect(Policy::none())
|
||||
.build()?;
|
||||
|
||||
// TODO Don't hardcode these paths in quite as bad a way...
|
||||
let adblock_file = File::open("./cosmetic_filters.adblock")
|
||||
.await
|
||||
.context("Failed to open cosmetic filters file")?;
|
||||
let adblock_engines = vec![(
|
||||
AnalysisAntifeatures::ANNOYANCE,
|
||||
load_adblock_engine(adblock_file, RuleTypes::CosmeticOnly).await?,
|
||||
)];
|
||||
let mut adblock_engines = Vec::new();
|
||||
|
||||
for (antifeature, name) in &ADBLOCK_FILTER_PATHS {
|
||||
// TODO Don't hardcode these paths in quite as bad a way...
|
||||
let path = PathBuf::from(format!("./data/{}.adblock", name));
|
||||
if !path.exists() {
|
||||
warn!("Missing adblock rules: {:?}.", path);
|
||||
continue;
|
||||
}
|
||||
let file = File::open(&path).await?;
|
||||
adblock_engines.push((
|
||||
*antifeature,
|
||||
load_adblock_engine(file, RuleTypes::All).await?,
|
||||
));
|
||||
}
|
||||
|
||||
let mut antifeature_ip_set = IpSet::new();
|
||||
|
||||
|
|
|
@ -1,12 +1,10 @@
|
|||
use crate::raking::analysis::{analyse_with_ad_block_cosmetic_filter, IpSet};
|
||||
use adblock::engine::Engine;
|
||||
use anyhow::{bail, Context};
|
||||
use bytes::Bytes;
|
||||
use chrono::{DateTime, FixedOffset, Utc};
|
||||
use cylon::Cylon;
|
||||
use futures_util::stream::StreamExt;
|
||||
use html5ever::tendril::fmt::Slice;
|
||||
use html5ever::QualName;
|
||||
use kuchiki::traits::TendrilSink;
|
||||
use kuchiki::NodeRef;
|
||||
use lazy_static::lazy_static;
|
||||
|
@ -35,6 +33,7 @@ pub enum RakeOutcome {
|
|||
RakedSitemap(Vec<UrlRaked>),
|
||||
/// The page was not canonical, and should not be indexed.
|
||||
/// However here is the URL of the canonical page.
|
||||
// TODO call this a Redirect and also use for 3xx redirects?
|
||||
NotCanonical {
|
||||
new_url: Url,
|
||||
},
|
||||
|
@ -98,7 +97,7 @@ lazy_static! {
|
|||
}
|
||||
|
||||
async fn response_to_bytes_limited(
|
||||
mut response: Response,
|
||||
response: Response,
|
||||
size_limit: usize,
|
||||
time_limit: Duration,
|
||||
) -> anyhow::Result<Vec<u8>> {
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
#!/bin/sh
|
||||
|
||||
# Downloads the adblock filter lists into the `data` directory that sits next
# to this script's own directory, one `<name>.adblock` file per list.
#
# -e: stop at the first failing command; -u: treat unset variables as errors.
set -eu
|
||||
|
||||
# Directory containing this script, so the output location does not depend on
# the caller's current working directory.
dir_path="$(dirname "$0")"
|
||||
|
||||
# -p: create the directory if needed and do not fail when it already exists.
mkdir -p "$dir_path/../data"
|
||||
# Each download overwrites any existing file of the same name (wget -O).
wget -O "$dir_path/../data/cookie_nag.adblock" https://secure.fanboy.co.nz/fanboy-cookiemonster.txt
|
||||
# NOTE(review): ADBLOCK_FILTER_PATHS has no "social" entry, so this list looks
# unused by the loader for now — confirm whether that is intended.
wget -O "$dir_path/../data/social.adblock" https://easylist.to/easylist/fanboy-social.txt
|
||||
wget -O "$dir_path/../data/privacy.adblock" https://easylist.to/easylist/easyprivacy.txt
|
||||
wget -O "$dir_path/../data/annoyance.adblock" https://secure.fanboy.co.nz/fanboy-annoyance.txt
|
||||
wget -O "$dir_path/../data/adverts.adblock" https://easylist.to/easylist/easylist.txt
|
||||
|
Loading…
Reference in New Issue