From a31566a89fdbdef0db2603df7f974a55f9c5c706 Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Sun, 27 Mar 2022 19:01:42 +0100 Subject: [PATCH] Support raking icons --- Cargo.lock | 216 +++++++++++++++++++++++++++- quickpeep_raker/Cargo.toml | 4 + quickpeep_raker/src/bin/qp-rake1.rs | 11 +- quickpeep_raker/src/raking.rs | 72 +++++++++- quickpeep_raker/src/raking/task.rs | 2 +- 5 files changed, 296 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e8dd65a..6ee129a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -48,6 +48,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler32" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" + [[package]] name = "ahash" version = "0.3.8" @@ -319,6 +325,12 @@ dependencies = [ "shlex", ] +[[package]] +name = "bit_field" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb6dd1c2376d2e096796e234a70e17e94cc2d5d54ff8ce42b28cef1d0d359a4" + [[package]] name = "bitflags" version = "1.3.2" @@ -386,6 +398,12 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7" +[[package]] +name = "bytemuck" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e851ca7c24871e7336801608a4797d7376545b6928a10d32d75685687141ead" + [[package]] name = "byteorder" version = "1.4.3" @@ -533,6 +551,12 @@ dependencies = [ "cc", ] +[[package]] +name = "color_quant" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" + [[package]] name = "colour" version = "0.6.0" @@ -825,6 +849,15 @@ dependencies = [ "num_cpus", ] +[[package]] +name = "deflate" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c86f7e25f518f4b81808a2cf1c50996a61f5c2eb394b2393bd87f2a4780a432f" +dependencies = [ + "adler32", +] + [[package]] name = "derive_more" version = "0.99.17" @@ -933,6 +966,22 @@ version = "2.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77f3309417938f28bf8228fcff79a4a37103981e3e186d2ccd19c74b38f4eb71" +[[package]] +name = "exr" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4badb9489a465cb2c555af1f00f0bfd8cecd6fc12ac11da9d5b40c5dd5f0200" +dependencies = [ + "bit_field", + "deflate", + "flume", + "half", + "inflate", + "lebe", + "smallvec", + "threadpool", +] + [[package]] name = "fail" version = "0.5.0" @@ -1016,7 +1065,7 @@ dependencies = [ "cfg-if", "crc32fast", "libc", - "miniz_oxide", + "miniz_oxide 0.4.4", ] [[package]] @@ -1027,6 +1076,7 @@ checksum = "843c03199d0c0ca54bc1ea90ac0d507274c28abcc4f691ae8b4eaa375087c76a" dependencies = [ "futures-core", "futures-sink", + "nanorand", "pin-project", "spin 0.9.2", ] @@ -1272,8 +1322,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d39cd93900197114fa1fcb7ae84ca742095eed9442088988ae74fa744e930e77" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi 0.10.0+wasi-snapshot-preview1", + "wasm-bindgen", +] + +[[package]] +name = "gif" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3a7187e78088aead22ceedeee99779455b23fc231fe13ec443f99bb71694e5b" +dependencies = [ + "color_quant", + "weezl", ] [[package]] @@ -1301,6 +1363,12 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" + [[package]] name = "hashbrown" version = "0.11.2" @@ -1483,6 +1551,26 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "image" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db207d030ae38f1eb6f240d5a1c1c88ff422aa005d10f8c6c6fc5e75286ab30e" +dependencies = [ + "bytemuck", + "byteorder", + "color_quant", + "exr", + "gif", + "jpeg-decoder 0.2.2", + "num-iter", + "num-rational 0.4.0", + "num-traits", + "png", + "scoped_threadpool", + "tiff", +] + [[package]] name = "include_dir" version = "0.7.2" @@ -1512,6 +1600,15 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "inflate" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cdb29978cc5797bd8dcc8e5bf7de604891df2a8dc576973d71a281e916db2ff" +dependencies = [ + "adler32", +] + [[package]] name = "instant" version = "0.1.12" @@ -1602,6 +1699,21 @@ dependencies = [ "libc", ] +[[package]] +name = "jpeg-decoder" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "229d53d58899083193af11e15917b5640cd40b29ff475a1fe4ef725deb02d0f2" + +[[package]] +name = "jpeg-decoder" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "105fb082d64e2100074587f59a74231f771750c664af903f1f9f76c9dedfc6f1" +dependencies = [ + "rayon", +] + [[package]] name = "js-sys" version = "0.3.56" @@ -1635,6 +1747,12 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" +[[package]] +name = "lebe" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7efd1d698db0759e6ef11a7cd44407407399a910c774dd804c64c032da7826ff" + [[package]] name = "levenshtein_automata" version = "0.2.1" @@ -1707,6 +1825,15 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "libwebp-sys" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439fd1885aa28937e7edcd68d2e793cb4a22f8733460d2519fbafd2b215672bf" +dependencies = [ + "cc", +] + [[package]] name = "libz-sys" version = "1.1.5" @@ -2761,6 +2888,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "miniz_oxide" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2b29bd4bc3f33391105ebee3589c19197c4271e3e5a9ec9bfe8127eeff8f082" +dependencies = [ + "adler", +] + [[package]] name = "mio" version = "0.7.14" @@ -2806,6 +2942,15 @@ dependencies = [ "byteorder", ] +[[package]] +name = "nanorand" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3" +dependencies = [ + "getrandom 0.2.5", +] + [[package]] name = "native-tls" version = "0.2.8" @@ -2876,7 +3021,7 @@ dependencies = [ "num-complex", "num-integer", "num-iter", - "num-rational", + "num-rational 0.2.4", "num-traits", ] @@ -2934,6 +3079,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-rational" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d41702bd167c2df5520b384281bc111a4b5efcf7fbc4c9c222c815b07e0a6a6a" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.14" @@ -3280,6 +3436,18 @@ version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "58893f751c9b0412871a09abd62ecd2a00298c6c83befa223ef98c52aef40cbe" +[[package]] +name = "png" +version = "0.17.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc38c0ad57efb786dd57b9864e5b18bae478c00c824dc55a38bbc9da95dde3ba" +dependencies = [ + "bitflags", + "crc32fast", + "deflate", + "miniz_oxide 0.5.1", +] + [[package]] name = "polling" version = "2.2.0" @@ -3525,6 +3693,7 @@ dependencies = [ "futures-util", "gemini-fetch", "html5ever", + "image", "ipnetwork", "itertools", "kuchiki", @@ -3553,6 +3722,7 @@ dependencies = [ "smartstring", "tokio", "toml", + "webp", "zstd", ] @@ -3909,6 +4079,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "scoped_threadpool" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d51f5df5af43ab3f1360b429fa5e0152ac5ce8c0bd6485cae490332e96846a8" + [[package]] name = "scopeguard" version = "1.1.0" @@ -4531,6 +4707,26 @@ dependencies = [ "syn", ] +[[package]] +name = "threadpool" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" +dependencies = [ + "num_cpus", +] + +[[package]] +name = "tiff" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0247608e998cb6ce39dfc8f4a16c50361ce71e5b52e6d24ea1227ea8ea8ee0b2" +dependencies = [ + "flate2", + "jpeg-decoder 0.1.22", + "weezl", +] + [[package]] name = "time" version = "0.1.44" @@ -5023,6 +5219,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webp" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf022f821f166079a407d000ab57e84de020e66ffbbf4edde999bc7d6e371cae" +dependencies = [ + "image", + "libwebp-sys", +] + [[package]] name = "webpki" version = "0.21.4" @@ -5042,6 +5248,12 @@ dependencies = [ "webpki", ] +[[package]] +name = "weezl" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8b77fdfd5a253be4ab714e4ffa3c49caf146b4de743e97510c0656cf90f1e8e" + [[package]] name = "wepoll-ffi" version = "0.1.2" diff --git a/quickpeep_raker/Cargo.toml b/quickpeep_raker/Cargo.toml index 9d4ae46..89750ff 100644 --- a/quickpeep_raker/Cargo.toml +++ b/quickpeep_raker/Cargo.toml @@ -74,6 +74,10 @@ adblock = "0.5.0" # Language detection lingua = "1.3.3" +### Image processing +image = "0.24.1" +webp = { version = "0.2.2", features = ["img"] } + ### Metrics metrics = "0.18.1" metrics-exporter-prometheus = { version = "0.9.0", default-features = false, features = ["http-listener"] } diff --git a/quickpeep_raker/src/bin/qp-rake1.rs b/quickpeep_raker/src/bin/qp-rake1.rs index e67cd57..64e2b66 100644 --- a/quickpeep_raker/src/bin/qp-rake1.rs +++ b/quickpeep_raker/src/bin/qp-rake1.rs @@ -1,7 +1,7 @@ use adblock::lists::RuleTypes; use anyhow::Context; use clap::Parser; -use colour::{blue_ln, green_ln, red_ln, yellow_ln}; +use colour::{blue, blue_ln, cyan_ln, green_ln, red_ln, yellow_ln}; use env_logger::Env; use log::warn; use quickpeep_raker::raking::analysis::{preload_adblock_engine, IpSet}; @@ -28,6 +28,8 @@ pub const ADBLOCK_FILTER_PATHS: [(AnalysisAntifeatures, &'static str); 4] = [ #[derive(Clone, Debug, Parser)] pub struct Opts { url: Url, + #[clap(short = 'i')] + intent: Option, } #[tokio::main] @@ -114,6 +116,13 @@ pub async fn main() -> anyhow::Result<()> { RakeOutcome::PermanentFailure(fail) => { red_ln!("Permanent Failure\n\t{:?}", &fail.reason) } + RakeOutcome::RakedIcon(icon) => { + green_ln!("Icon"); + blue!("\tOriginal size: "); + cyan_ln!("{} bytes", icon.original_size_in_bytes); + blue!("\tPacked size: "); + cyan_ln!("{} bytes", icon.webp_bytes.len()); + } } Ok(()) diff --git a/quickpeep_raker/src/raking.rs b/quickpeep_raker/src/raking.rs index 18e5789..b9d88f0 100644 --- a/quickpeep_raker/src/raking.rs +++ b/quickpeep_raker/src/raking.rs @@ -1,10 +1,11 @@ use crate::raking::analysis::IpSet; use crate::raking::page_extraction::{ExtractedPage, PageExtractionService}; -use anyhow::{bail, Context}; +use anyhow::{anyhow, bail, Context}; use chrono::{DateTime, FixedOffset, Utc}; use cylon::Cylon; use futures_util::stream::StreamExt; use html5ever::tendril::fmt::Slice; +use image::ImageFormat; use itertools::Itertools; use lazy_static::lazy_static; use log::debug; @@ -13,7 +14,9 @@ use reqwest::header::HeaderMap; use reqwest::{Client, Response, Url}; use serde::{Deserialize, Serialize}; use sitemap::reader::SiteMapEntity; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; +use std::io::Cursor; +use std::str::FromStr; use std::time::Duration; use tokio::time::Instant; @@ -29,11 +32,29 @@ pub const SIZE_LIMIT: usize = 4 * 1024 * 1024; pub const TIME_LIMIT: Duration = Duration::from_secs(10); pub const RAKER_USER_AGENT: &'static str = "QuickPeepBot"; +lazy_static! { + pub static ref IMAGE_MIME_TYPES: HashMap<&'static str, ImageFormat> = { + [ + ("image/png", ImageFormat::Png), + ("image/webp", ImageFormat::WebP), + ("image/jpeg", ImageFormat::Jpeg), + ("image/gif", ImageFormat::Gif), + ("image/vnd.microsoft.icon", ImageFormat::Ico), + ("image/x-icon", ImageFormat::Ico), + ("image/icon", ImageFormat::Ico), + ("image/ico", ImageFormat::Ico), + ("application/ico", ImageFormat::Ico), + ] + .into_iter() + .collect() + }; +} + pub enum RakeOutcome { RakedPage(RakedPage), RakedFeed(Vec), RakedSitemap(Vec), - RakedIcon(()), + RakedIcon(RakedIcon), Redirect { reason: RedirectReason, new_url: Url, @@ -66,6 +87,11 @@ pub struct RakedPage { pub referrer_entry: RakedReferrerEntry, } +pub struct RakedIcon { + pub original_size_in_bytes: usize, + pub webp_bytes: Vec, +} + pub struct RobotsTxt { pub sitemaps: Vec, pub rules: Cylon, @@ -107,6 +133,23 @@ pub enum RakeIntent { Icon, } +impl FromStr for RakeIntent { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + Ok(match s.to_lowercase().as_ref() { + "any" => RakeIntent::Any, + "page" => RakeIntent::Page, + "feed" => RakeIntent::Feed, + "sitemap" => RakeIntent::SiteMap, + "icon" => RakeIntent::Icon, + other => { + bail!("Unrecognised intent: {:?}", other) + } + }) + } +} + impl From for RakeIntent { fn from(kind: ReferenceKind) -> Self { match kind { @@ -443,8 +486,27 @@ pub fn rake_sitemap(content: &[u8]) -> anyhow::Result> { Ok(urls) } -pub fn rake_icon(content: &[u8], content_type: &str) -> anyhow::Result<()> { - todo!() +pub fn rake_icon(content: &[u8], content_type: &str) -> anyhow::Result { + let format = match IMAGE_MIME_TYPES.get(content_type) { + Some(format) => format, + None => { + bail!("Unknown image format: {:?}", content_type); + } + }; + + let orig_size = content.len(); + + let mut cursor = Cursor::new(&content); + let image = image::load(&mut cursor, *format).context("Failed to load image")?; + + let webp_encoder = + webp::Encoder::from_image(&image).map_err(|err| anyhow!("webp fail: {}", err))?; + let encoded = webp_encoder.encode(0.6).to_vec(); + + Ok(RakedIcon { + original_size_in_bytes: orig_size, + webp_bytes: encoded, + }) } pub fn robots_txt_url_for(url: &Url) -> anyhow::Result { diff --git a/quickpeep_raker/src/raking/task.rs b/quickpeep_raker/src/raking/task.rs index a1db9d3..f635236 100644 --- a/quickpeep_raker/src/raking/task.rs +++ b/quickpeep_raker/src/raking/task.rs @@ -349,7 +349,7 @@ impl TaskContext { Ok(NextAction::Continue) } - RakeOutcome::RakedIcon(()) => { + RakeOutcome::RakedIcon(icon) => { // Store icon to icon store todo!(); }