From 8ec8003dbb1ea6a4525aa6e8b41015180f8d67d4 Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Sun, 27 Mar 2022 21:30:38 +0100 Subject: [PATCH] Store favicon BLAKE3 hashes in the doc store --- Cargo.lock | 72 ++++++++++++++++++++++++- quickpeep_densedoc/src/lib.rs | 10 ++++ quickpeep_index/Cargo.toml | 2 + quickpeep_index/src/backend.rs | 2 + quickpeep_index/src/backend/tantivy.rs | 4 ++ quickpeep_indexer/Cargo.toml | 2 + quickpeep_indexer/src/bin/qp-indexer.rs | 8 +++ quickpeep_raker/src/raking/task.rs | 7 +-- 8 files changed, 100 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 07ce1bf..37cec7d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -116,12 +116,24 @@ dependencies = [ "serde", ] +[[package]] +name = "arrayref" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544" + [[package]] name = "arrayvec" version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" +[[package]] +name = "arrayvec" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" + [[package]] name = "askama" version = "0.11.1" @@ -356,6 +368,20 @@ dependencies = [ "crunchy", ] +[[package]] +name = "blake3" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a08e53fc5a564bb15bfe6fae56bd71522205f1f91893f9c0116edad6496c183f" +dependencies = [ + "arrayref", + "arrayvec 0.7.2", + "cc", + "cfg-if", + "constant_time_eq", + "digest 0.10.3", +] + [[package]] name = "block-buffer" version = "0.7.3" @@ -377,6 +403,15 @@ dependencies = [ "generic-array 0.14.5", ] +[[package]] +name = "block-buffer" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf7fe51849ea569fd452f37822f606a5cabb684dc918707a0193fd4664ff324" +dependencies = [ + "generic-array 0.14.5", +] + [[package]] name = "block-padding" version = "0.1.5" @@ -606,6 +641,12 @@ dependencies = [ "tiny-keccak", ] +[[package]] +name = "constant_time_eq" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" + [[package]] name = "convert_case" version = "0.4.0" @@ -761,6 +802,16 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +[[package]] +name = "crypto-common" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57952ca27b5e3606ff4dd79b0020231aaf9d6aa76dc05fd30137538c50bd3ce8" +dependencies = [ + "generic-array 0.14.5", + "typenum", +] + [[package]] name = "cssparser" version = "0.27.2" @@ -895,6 +946,17 @@ dependencies = [ "generic-array 0.14.5", ] +[[package]] +name = "digest" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2fb860ca6fafa5552fb6d0e816a69c8e49f0908bf524e30a90d97c85892d506" +dependencies = [ + "block-buffer 0.10.2", + "crypto-common", + "subtle", +] + [[package]] name = "diplomatic-bag" version = "0.2.0" @@ -1765,7 +1827,7 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe" dependencies = [ - "arrayvec", + "arrayvec 0.5.2", "bitflags", "cfg-if", "ryu", @@ -3637,6 +3699,7 @@ name = "quickpeep_index" version = "0.1.0" dependencies = [ "anyhow", + "blake3", "env_logger", "fancy_mdbx", "log", @@ -3654,6 +3717,7 @@ name = "quickpeep_indexer" version = "0.1.0" dependencies = [ "anyhow", + "blake3", "clap", "colour", "env_logger", @@ -4544,6 +4608,12 @@ dependencies = [ "syn", ] +[[package]] +name = "subtle" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" + [[package]] name = "syn" version = "1.0.89" diff --git a/quickpeep_densedoc/src/lib.rs b/quickpeep_densedoc/src/lib.rs index 32cce5d..f9844a1 100644 --- a/quickpeep_densedoc/src/lib.rs +++ b/quickpeep_densedoc/src/lib.rs @@ -21,6 +21,16 @@ pub struct DenseHead { pub icon: String, } +impl DenseHead { + pub fn effective_favicon_url(&self) -> &str { + if self.icon.is_empty() { + "/favicon.ico" + } else { + self.icon.as_str() + } + } +} + #[derive(Serialize, Deserialize, Clone, Debug)] pub enum DenseTree { Heading1(Vec), diff --git a/quickpeep_index/Cargo.toml b/quickpeep_index/Cargo.toml index 59d05fc..9ca972b 100644 --- a/quickpeep_index/Cargo.toml +++ b/quickpeep_index/Cargo.toml @@ -16,6 +16,8 @@ serde = { version = "1.0.136", features = ["derive"] } serde_bare = "0.5.0" toml = "0.5.8" +blake3 = "1.3.1" + fancy_mdbx = { path = "../../../libraries/fancy_mdbx" } quickpeep_structs = { path = "../quickpeep_structs" } diff --git a/quickpeep_index/src/backend.rs b/quickpeep_index/src/backend.rs index cf76c01..fcef09b 100644 --- a/quickpeep_index/src/backend.rs +++ b/quickpeep_index/src/backend.rs @@ -19,6 +19,7 @@ pub struct BackendIndependentDocument { pub nonarticle_body: String, pub tags: Vec, pub url: String, + pub favicon_url_hash: u64, } /// A backend-independent document struct. @@ -29,4 +30,5 @@ pub struct SearchDocument { pub excerpt: String, pub tags: Vec, pub url: String, + pub favicon_url_hash: u64, } diff --git a/quickpeep_index/src/backend/tantivy.rs b/quickpeep_index/src/backend/tantivy.rs index e395c93..0784752 100644 --- a/quickpeep_index/src/backend/tantivy.rs +++ b/quickpeep_index/src/backend/tantivy.rs @@ -33,6 +33,8 @@ pub struct DocumentStoreRow { title: String, body: String, nonbody: String, + /// BLAKE3 of the favicon URL + favicon_url_hash: u64, } pub struct StoreTables { @@ -145,6 +147,7 @@ impl Backend for TantivyBackend { title: document.title, body: document.article_body, nonbody: document.nonarticle_body, + favicon_url_hash: document.favicon_url_hash, }, ) })?; @@ -200,6 +203,7 @@ impl Backend for TantivyBackend { excerpt, tags: vec![], url: url.to_owned(), + favicon_url_hash: doc_row.favicon_url_hash, }) } diff --git a/quickpeep_indexer/Cargo.toml b/quickpeep_indexer/Cargo.toml index c44db22..e737296 100644 --- a/quickpeep_indexer/Cargo.toml +++ b/quickpeep_indexer/Cargo.toml @@ -18,6 +18,8 @@ clap = { version = "3.1.6", features = ["derive"] } colour = "0.6.0" url = "2.2.2" +blake3 = "1.3.1" + smartstring = "1.0.1" # Used for efficient lookup of seeds (URL prefixes) diff --git a/quickpeep_indexer/src/bin/qp-indexer.rs b/quickpeep_indexer/src/bin/qp-indexer.rs index 80ca084..3f4ddf4 100644 --- a/quickpeep_indexer/src/bin/qp-indexer.rs +++ b/quickpeep_indexer/src/bin/qp-indexer.rs @@ -85,6 +85,13 @@ pub async fn main() -> anyhow::Result<()> { .unwrap_or_else(|| Vec::with_capacity(0)); // TODO Store the actual structure of the document in the store? + + let favicon_url = document.head.effective_favicon_url(); + let mut favicon_url_hash_long = [0u8; 8]; + favicon_url_hash_long + .copy_from_slice(&blake3::hash(favicon_url.as_bytes()).as_bytes()[0..8]); + let favicon_url_hash = u64::from_le_bytes(favicon_url_hash_long); + indexer_backend.add_document(BackendIndependentDocument { title: document.head.title, article_body, @@ -92,6 +99,7 @@ pub async fn main() -> anyhow::Result<()> { // TODO populate tags & antifeatures tags, url: page_record.url.to_string(), + favicon_url_hash, })?; } } diff --git a/quickpeep_raker/src/raking/task.rs b/quickpeep_raker/src/raking/task.rs index 56e0b6f..391a341 100644 --- a/quickpeep_raker/src/raking/task.rs +++ b/quickpeep_raker/src/raking/task.rs @@ -476,12 +476,7 @@ impl EventProcessor<'_> { )?; // If there's a favicon to be tried, add it to the list... - let favicon_url_rel = if page.document.head.icon.is_empty() { - "/favicon.ico" - } else { - page.document.head.icon.as_str() - }; - + let favicon_url_rel = page.document.head.effective_favicon_url(); if let Ok(favicon_url) = url.join(favicon_url_rel) { txn.enqueue_url(favicon_url.as_str(), None, RakeIntent::Icon)?; }