quickpeep/quickpeep/src/bin/qp-rake1.rs

104 lines
3.2 KiB
Rust

use adblock::lists::RuleTypes;
use anyhow::Context;
use clap::Parser;
use colour::{blue_ln, green_ln, red_ln, yellow_ln};
use env_logger::Env;
use log::warn;
use quickpeep::raking::analysis::{load_adblock_engine, IpSet};
use quickpeep::raking::{RakeIntent, RakeOutcome};
use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use reqwest::redirect::Policy;
use reqwest::Url;
use std::path::PathBuf;
use tokio::fs::File;
/// Pairs each antifeature flag with the base name of its adblock rule list.
///
/// `main` resolves each name to `./data/<name>.adblock` and builds one adblock engine per
/// entry, tagged with the corresponding flag.
pub const ADBLOCK_FILTER_PATHS: [(AnalysisAntifeatures, &str); 4] = [
    (AnalysisAntifeatures::COOKIE_NAG, "cookie_nag"),
    (AnalysisAntifeatures::ANNOYANCE, "annoyance"),
    (AnalysisAntifeatures::PRIVACY, "privacy"),
    (AnalysisAntifeatures::ADVERTS, "adverts"),
];
/// Rakes one URL and prints out the description of it.
// NOTE: because of `#[derive(Parser)]`, clap uses the `///` doc comments on this struct and its
// fields as `--help` text, so their wording is user-facing.
#[derive(Clone, Debug, Parser)]
pub struct Opts {
    /// The URL to rake.
    url: Url,
}
#[tokio::main]
pub async fn main() -> anyhow::Result<()> {
env_logger::Builder::from_env(Env::default().default_filter_or("info,quickpeep=debug")).init();
let opts: Opts = Opts::parse();
let mut header_map = HeaderMap::new();
header_map.insert(USER_AGENT, HeaderValue::from_static(RAKER_USER_AGENT));
let client = reqwest::ClientBuilder::new()
.timeout(TIME_LIMIT)
.default_headers(header_map)
// TODO We want to handle redirects ourselves so we can track them...
.redirect(Policy::none())
.build()?;
let mut adblock_engines = Vec::new();
for (antifeature, name) in &ADBLOCK_FILTER_PATHS {
// TODO Don't hardcode these paths in quite as bad a way...
let path = PathBuf::from(format!("./data/{}.adblock", name));
if !path.exists() {
warn!("Missing adblock rules: {:?}.", path);
continue;
}
let file = File::open(&path).await?;
adblock_engines.push((
*antifeature,
load_adblock_engine(file, RuleTypes::All).await?,
));
}
let mut antifeature_ip_set = IpSet::new();
let ips_file = File::open("./data/cf_ips.txt")
.await
.context("Failed to open CF IPs file")?;
antifeature_ip_set.add_all_from_file(ips_file).await?;
let raker = Raker {
adblock_engines,
antifeature_ip_set,
};
let outcome = raker.rake(&opts.url, RakeIntent::Any, &client).await?;
match outcome {
RakeOutcome::RakedPage(page) => {
let content_size = serde_bare::to_vec(&page)?.len();
green_ln!("Page ({} bytes)", content_size);
// TODO
}
RakeOutcome::RakedFeed(feed) => {
green_ln!("Feed");
// TODO
}
RakeOutcome::RakedSitemap(sitemap) => {
green_ln!("Sitemap");
// TODO
}
RakeOutcome::Redirect { reason, new_url } => {
blue_ln!("Redirect ({:?})", reason);
println!("{}", new_url.as_str());
}
RakeOutcome::TemporaryFailure(fail) => {
yellow_ln!("Temporary Failure\n\t{:?}", &fail.reason);
}
RakeOutcome::PermanentFailure(fail) => {
red_ln!("Permanent Failure\n\t{:?}", &fail.reason)
}
}
Ok(())
}