Support raking icons

This commit is contained in:
Olivier 'reivilibre' 2022-03-27 19:01:42 +01:00
parent 653acf68a9
commit a31566a89f
5 changed files with 296 additions and 9 deletions

216
Cargo.lock generated
View File

@ -48,6 +48,12 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "adler32"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234"
[[package]]
name = "ahash"
version = "0.3.8"
@ -319,6 +325,12 @@ dependencies = [
"shlex",
]
[[package]]
name = "bit_field"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcb6dd1c2376d2e096796e234a70e17e94cc2d5d54ff8ce42b28cef1d0d359a4"
[[package]]
name = "bitflags"
version = "1.3.2"
@ -386,6 +398,12 @@ version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7"
[[package]]
name = "bytemuck"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e851ca7c24871e7336801608a4797d7376545b6928a10d32d75685687141ead"
[[package]]
name = "byteorder"
version = "1.4.3"
@ -533,6 +551,12 @@ dependencies = [
"cc",
]
[[package]]
name = "color_quant"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b"
[[package]]
name = "colour"
version = "0.6.0"
@ -825,6 +849,15 @@ dependencies = [
"num_cpus",
]
[[package]]
name = "deflate"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c86f7e25f518f4b81808a2cf1c50996a61f5c2eb394b2393bd87f2a4780a432f"
dependencies = [
"adler32",
]
[[package]]
name = "derive_more"
version = "0.99.17"
@ -933,6 +966,22 @@ version = "2.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77f3309417938f28bf8228fcff79a4a37103981e3e186d2ccd19c74b38f4eb71"
[[package]]
name = "exr"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4badb9489a465cb2c555af1f00f0bfd8cecd6fc12ac11da9d5b40c5dd5f0200"
dependencies = [
"bit_field",
"deflate",
"flume",
"half",
"inflate",
"lebe",
"smallvec",
"threadpool",
]
[[package]]
name = "fail"
version = "0.5.0"
@ -1016,7 +1065,7 @@ dependencies = [
"cfg-if",
"crc32fast",
"libc",
"miniz_oxide",
"miniz_oxide 0.4.4",
]
[[package]]
@ -1027,6 +1076,7 @@ checksum = "843c03199d0c0ca54bc1ea90ac0d507274c28abcc4f691ae8b4eaa375087c76a"
dependencies = [
"futures-core",
"futures-sink",
"nanorand",
"pin-project",
"spin 0.9.2",
]
@ -1272,8 +1322,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d39cd93900197114fa1fcb7ae84ca742095eed9442088988ae74fa744e930e77"
dependencies = [
"cfg-if",
"js-sys",
"libc",
"wasi 0.10.0+wasi-snapshot-preview1",
"wasm-bindgen",
]
[[package]]
name = "gif"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3a7187e78088aead22ceedeee99779455b23fc231fe13ec443f99bb71694e5b"
dependencies = [
"color_quant",
"weezl",
]
[[package]]
@ -1301,6 +1363,12 @@ dependencies = [
"tracing",
]
[[package]]
name = "half"
version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
[[package]]
name = "hashbrown"
version = "0.11.2"
@ -1483,6 +1551,26 @@ dependencies = [
"unicode-normalization",
]
[[package]]
name = "image"
version = "0.24.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db207d030ae38f1eb6f240d5a1c1c88ff422aa005d10f8c6c6fc5e75286ab30e"
dependencies = [
"bytemuck",
"byteorder",
"color_quant",
"exr",
"gif",
"jpeg-decoder 0.2.2",
"num-iter",
"num-rational 0.4.0",
"num-traits",
"png",
"scoped_threadpool",
"tiff",
]
[[package]]
name = "include_dir"
version = "0.7.2"
@ -1512,6 +1600,15 @@ dependencies = [
"hashbrown",
]
[[package]]
name = "inflate"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1cdb29978cc5797bd8dcc8e5bf7de604891df2a8dc576973d71a281e916db2ff"
dependencies = [
"adler32",
]
[[package]]
name = "instant"
version = "0.1.12"
@ -1602,6 +1699,21 @@ dependencies = [
"libc",
]
[[package]]
name = "jpeg-decoder"
version = "0.1.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "229d53d58899083193af11e15917b5640cd40b29ff475a1fe4ef725deb02d0f2"
[[package]]
name = "jpeg-decoder"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "105fb082d64e2100074587f59a74231f771750c664af903f1f9f76c9dedfc6f1"
dependencies = [
"rayon",
]
[[package]]
name = "js-sys"
version = "0.3.56"
@ -1635,6 +1747,12 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "lebe"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7efd1d698db0759e6ef11a7cd44407407399a910c774dd804c64c032da7826ff"
[[package]]
name = "levenshtein_automata"
version = "0.2.1"
@ -1707,6 +1825,15 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "libwebp-sys"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439fd1885aa28937e7edcd68d2e793cb4a22f8733460d2519fbafd2b215672bf"
dependencies = [
"cc",
]
[[package]]
name = "libz-sys"
version = "1.1.5"
@ -2761,6 +2888,15 @@ dependencies = [
"autocfg",
]
[[package]]
name = "miniz_oxide"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2b29bd4bc3f33391105ebee3589c19197c4271e3e5a9ec9bfe8127eeff8f082"
dependencies = [
"adler",
]
[[package]]
name = "mio"
version = "0.7.14"
@ -2806,6 +2942,15 @@ dependencies = [
"byteorder",
]
[[package]]
name = "nanorand"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3"
dependencies = [
"getrandom 0.2.5",
]
[[package]]
name = "native-tls"
version = "0.2.8"
@ -2876,7 +3021,7 @@ dependencies = [
"num-complex",
"num-integer",
"num-iter",
"num-rational",
"num-rational 0.2.4",
"num-traits",
]
@ -2934,6 +3079,17 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-rational"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d41702bd167c2df5520b384281bc111a4b5efcf7fbc4c9c222c815b07e0a6a6a"
dependencies = [
"autocfg",
"num-integer",
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.14"
@ -3280,6 +3436,18 @@ version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "58893f751c9b0412871a09abd62ecd2a00298c6c83befa223ef98c52aef40cbe"
[[package]]
name = "png"
version = "0.17.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc38c0ad57efb786dd57b9864e5b18bae478c00c824dc55a38bbc9da95dde3ba"
dependencies = [
"bitflags",
"crc32fast",
"deflate",
"miniz_oxide 0.5.1",
]
[[package]]
name = "polling"
version = "2.2.0"
@ -3525,6 +3693,7 @@ dependencies = [
"futures-util",
"gemini-fetch",
"html5ever",
"image",
"ipnetwork",
"itertools",
"kuchiki",
@ -3553,6 +3722,7 @@ dependencies = [
"smartstring",
"tokio",
"toml",
"webp",
"zstd",
]
@ -3909,6 +4079,12 @@ dependencies = [
"winapi",
]
[[package]]
name = "scoped_threadpool"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d51f5df5af43ab3f1360b429fa5e0152ac5ce8c0bd6485cae490332e96846a8"
[[package]]
name = "scopeguard"
version = "1.1.0"
@ -4531,6 +4707,26 @@ dependencies = [
"syn",
]
[[package]]
name = "threadpool"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa"
dependencies = [
"num_cpus",
]
[[package]]
name = "tiff"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0247608e998cb6ce39dfc8f4a16c50361ce71e5b52e6d24ea1227ea8ea8ee0b2"
dependencies = [
"flate2",
"jpeg-decoder 0.1.22",
"weezl",
]
[[package]]
name = "time"
version = "0.1.44"
@ -5023,6 +5219,16 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "webp"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf022f821f166079a407d000ab57e84de020e66ffbbf4edde999bc7d6e371cae"
dependencies = [
"image",
"libwebp-sys",
]
[[package]]
name = "webpki"
version = "0.21.4"
@ -5042,6 +5248,12 @@ dependencies = [
"webpki",
]
[[package]]
name = "weezl"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8b77fdfd5a253be4ab714e4ffa3c49caf146b4de743e97510c0656cf90f1e8e"
[[package]]
name = "wepoll-ffi"
version = "0.1.2"

View File

@ -74,6 +74,10 @@ adblock = "0.5.0"
# Language detection
lingua = "1.3.3"
### Image processing
image = "0.24.1"
webp = { version = "0.2.2", features = ["img"] }
### Metrics
metrics = "0.18.1"
metrics-exporter-prometheus = { version = "0.9.0", default-features = false, features = ["http-listener"] }

View File

@ -1,7 +1,7 @@
use adblock::lists::RuleTypes;
use anyhow::Context;
use clap::Parser;
use colour::{blue_ln, green_ln, red_ln, yellow_ln};
use colour::{blue, blue_ln, cyan_ln, green_ln, red_ln, yellow_ln};
use env_logger::Env;
use log::warn;
use quickpeep_raker::raking::analysis::{preload_adblock_engine, IpSet};
@ -28,6 +28,8 @@ pub const ADBLOCK_FILTER_PATHS: [(AnalysisAntifeatures, &'static str); 4] = [
// Command-line options for this binary, parsed through clap's derive API.
// Plain `//` comments are used here on purpose: clap's derive turns `///` doc
// comments into help text, which would change the program's output.
#[derive(Clone, Debug, Parser)]
pub struct Opts {
// The URL supplied on the command line.
url: Url,
// Optional rake intent, supplied with the short flag `-i`.
// NOTE(review): presumably parsed via RakeIntent's FromStr impl and used to
// steer how the URL is raked — confirm against main().
#[clap(short = 'i')]
intent: Option<RakeIntent>,
}
#[tokio::main]
@ -114,6 +116,13 @@ pub async fn main() -> anyhow::Result<()> {
RakeOutcome::PermanentFailure(fail) => {
red_ln!("Permanent Failure\n\t{:?}", &fail.reason)
}
RakeOutcome::RakedIcon(icon) => {
green_ln!("Icon");
blue!("\tOriginal size: ");
cyan_ln!("{} bytes", icon.original_size_in_bytes);
blue!("\tPacked size: ");
cyan_ln!("{} bytes", icon.webp_bytes.len());
}
}
Ok(())

View File

@ -1,10 +1,11 @@
use crate::raking::analysis::IpSet;
use crate::raking::page_extraction::{ExtractedPage, PageExtractionService};
use anyhow::{bail, Context};
use anyhow::{anyhow, bail, Context};
use chrono::{DateTime, FixedOffset, Utc};
use cylon::Cylon;
use futures_util::stream::StreamExt;
use html5ever::tendril::fmt::Slice;
use image::ImageFormat;
use itertools::Itertools;
use lazy_static::lazy_static;
use log::debug;
@ -13,7 +14,9 @@ use reqwest::header::HeaderMap;
use reqwest::{Client, Response, Url};
use serde::{Deserialize, Serialize};
use sitemap::reader::SiteMapEntity;
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
use std::io::Cursor;
use std::str::FromStr;
use std::time::Duration;
use tokio::time::Instant;
@ -29,11 +32,29 @@ pub const SIZE_LIMIT: usize = 4 * 1024 * 1024;
pub const TIME_LIMIT: Duration = Duration::from_secs(10);
pub const RAKER_USER_AGENT: &'static str = "QuickPeepBot";
lazy_static! {
    /// Lookup table from a MIME type string to the `image` crate format used to decode it.
    ///
    /// Icons are listed under several MIME types, all of which map to [`ImageFormat::Ico`].
    pub static ref IMAGE_MIME_TYPES: HashMap<&'static str, ImageFormat> = HashMap::from([
        ("image/png", ImageFormat::Png),
        ("image/webp", ImageFormat::WebP),
        ("image/jpeg", ImageFormat::Jpeg),
        ("image/gif", ImageFormat::Gif),
        // The various spellings of the ICO MIME type.
        ("image/vnd.microsoft.icon", ImageFormat::Ico),
        ("image/x-icon", ImageFormat::Ico),
        ("image/icon", ImageFormat::Ico),
        ("image/ico", ImageFormat::Ico),
        ("application/ico", ImageFormat::Ico),
    ]);
}
pub enum RakeOutcome {
RakedPage(RakedPage),
RakedFeed(Vec<UrlRaked>),
RakedSitemap(Vec<UrlRaked>),
RakedIcon(()),
RakedIcon(RakedIcon),
Redirect {
reason: RedirectReason,
new_url: Url,
@ -66,6 +87,11 @@ pub struct RakedPage {
pub referrer_entry: RakedReferrerEntry,
}
/// The result of raking an icon: the image re-encoded as WebP, plus the size of the
/// data it was produced from.
pub struct RakedIcon {
/// Length in bytes of the original icon content, before re-encoding
/// (`rake_icon` sets this to `content.len()`).
pub original_size_in_bytes: usize,
/// The icon re-encoded as WebP, as produced by the `webp` encoder in `rake_icon`.
pub webp_bytes: Vec<u8>,
}
pub struct RobotsTxt {
pub sitemaps: Vec<UrlRaked>,
pub rules: Cylon,
@ -107,6 +133,23 @@ pub enum RakeIntent {
Icon,
}
/// Parses a [`RakeIntent`] from its textual name, ignoring letter case.
///
/// Accepted names are `any`, `page`, `feed`, `sitemap` and `icon`; anything else is an error.
impl FromStr for RakeIntent {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Lowercase once so that the comparison below is case-insensitive.
        let lowered = s.to_lowercase();
        match lowered.as_str() {
            "any" => Ok(RakeIntent::Any),
            "page" => Ok(RakeIntent::Page),
            "feed" => Ok(RakeIntent::Feed),
            "sitemap" => Ok(RakeIntent::SiteMap),
            "icon" => Ok(RakeIntent::Icon),
            unknown => bail!("Unrecognised intent: {:?}", unknown),
        }
    }
}
impl From<ReferenceKind> for RakeIntent {
fn from(kind: ReferenceKind) -> Self {
match kind {
@ -443,8 +486,27 @@ pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<Vec<UrlRaked>> {
Ok(urls)
}
pub fn rake_icon(content: &[u8], content_type: &str) -> anyhow::Result<()> {
todo!()
/// Decodes a fetched icon and re-encodes it as a lossy WebP image.
///
/// `content` is the raw image data and `content_type` is the MIME type reported for it.
/// The MIME type is looked up in [`IMAGE_MIME_TYPES`] after dropping any parameters
/// (everything from the first `;`), trimming whitespace and lowercasing ASCII letters,
/// because media types are case-insensitive and may carry parameters such as a charset.
///
/// # Errors
///
/// Returns an error if the MIME type is not a known image type, if the content cannot be
/// decoded as the indicated format, or if a WebP encoder cannot be created for the decoded
/// image.
pub fn rake_icon(content: &[u8], content_type: &str) -> anyhow::Result<RakedIcon> {
    // The webp crate expects quality on a 0.0–100.0 scale, so 60% is 60.0;
    // passing 0.6 would encode at close to the worst possible quality.
    const WEBP_QUALITY: f32 = 60.0;

    // Reduce e.g. "Image/PNG; charset=binary" to "image/png" before the lookup.
    let mime_type = content_type
        .split(';')
        .next()
        .unwrap_or(content_type)
        .trim()
        .to_ascii_lowercase();

    let format = match IMAGE_MIME_TYPES.get(mime_type.as_str()) {
        Some(format) => *format,
        None => {
            // Report the value exactly as received to make debugging easier.
            bail!("Unknown image format: {:?}", content_type);
        }
    };

    let orig_size = content.len();

    let image = image::load(Cursor::new(content), format).context("Failed to load image")?;

    let webp_encoder =
        webp::Encoder::from_image(&image).map_err(|err| anyhow!("webp fail: {}", err))?;
    let encoded = webp_encoder.encode(WEBP_QUALITY).to_vec();

    Ok(RakedIcon {
        original_size_in_bytes: orig_size,
        webp_bytes: encoded,
    })
}
pub fn robots_txt_url_for(url: &Url) -> anyhow::Result<Url> {

View File

@ -349,7 +349,7 @@ impl TaskContext {
Ok(NextAction::Continue)
}
RakeOutcome::RakedIcon(()) => {
RakeOutcome::RakedIcon(icon) => {
// Store icon to icon store
todo!();
}