Store favicon BLAKE3 hashes in the doc store

This commit is contained in:
Olivier 'reivilibre' 2022-03-27 21:30:38 +01:00
parent a271f83805
commit 8ec8003dbb
8 changed files with 100 additions and 7 deletions

72
Cargo.lock generated
View File

@ -116,12 +116,24 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "arrayref"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544"
[[package]] [[package]]
name = "arrayvec" name = "arrayvec"
version = "0.5.2" version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b"
[[package]]
name = "arrayvec"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6"
[[package]] [[package]]
name = "askama" name = "askama"
version = "0.11.1" version = "0.11.1"
@ -356,6 +368,20 @@ dependencies = [
"crunchy", "crunchy",
] ]
[[package]]
name = "blake3"
version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a08e53fc5a564bb15bfe6fae56bd71522205f1f91893f9c0116edad6496c183f"
dependencies = [
"arrayref",
"arrayvec 0.7.2",
"cc",
"cfg-if",
"constant_time_eq",
"digest 0.10.3",
]
[[package]] [[package]]
name = "block-buffer" name = "block-buffer"
version = "0.7.3" version = "0.7.3"
@ -377,6 +403,15 @@ dependencies = [
"generic-array 0.14.5", "generic-array 0.14.5",
] ]
[[package]]
name = "block-buffer"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0bf7fe51849ea569fd452f37822f606a5cabb684dc918707a0193fd4664ff324"
dependencies = [
"generic-array 0.14.5",
]
[[package]] [[package]]
name = "block-padding" name = "block-padding"
version = "0.1.5" version = "0.1.5"
@ -606,6 +641,12 @@ dependencies = [
"tiny-keccak", "tiny-keccak",
] ]
[[package]]
name = "constant_time_eq"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc"
[[package]] [[package]]
name = "convert_case" name = "convert_case"
version = "0.4.0" version = "0.4.0"
@ -761,6 +802,16 @@ version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
[[package]]
name = "crypto-common"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57952ca27b5e3606ff4dd79b0020231aaf9d6aa76dc05fd30137538c50bd3ce8"
dependencies = [
"generic-array 0.14.5",
"typenum",
]
[[package]] [[package]]
name = "cssparser" name = "cssparser"
version = "0.27.2" version = "0.27.2"
@ -895,6 +946,17 @@ dependencies = [
"generic-array 0.14.5", "generic-array 0.14.5",
] ]
[[package]]
name = "digest"
version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2fb860ca6fafa5552fb6d0e816a69c8e49f0908bf524e30a90d97c85892d506"
dependencies = [
"block-buffer 0.10.2",
"crypto-common",
"subtle",
]
[[package]] [[package]]
name = "diplomatic-bag" name = "diplomatic-bag"
version = "0.2.0" version = "0.2.0"
@ -1765,7 +1827,7 @@ version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe" checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe"
dependencies = [ dependencies = [
"arrayvec", "arrayvec 0.5.2",
"bitflags", "bitflags",
"cfg-if", "cfg-if",
"ryu", "ryu",
@ -3637,6 +3699,7 @@ name = "quickpeep_index"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"blake3",
"env_logger", "env_logger",
"fancy_mdbx", "fancy_mdbx",
"log", "log",
@ -3654,6 +3717,7 @@ name = "quickpeep_indexer"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"blake3",
"clap", "clap",
"colour", "colour",
"env_logger", "env_logger",
@ -4544,6 +4608,12 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "subtle"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601"
[[package]] [[package]]
name = "syn" name = "syn"
version = "1.0.89" version = "1.0.89"

View File

@ -21,6 +21,16 @@ pub struct DenseHead {
pub icon: String, pub icon: String,
} }
impl DenseHead {
    /// Returns the favicon URL that should be used for this document.
    ///
    /// When the `icon` field is non-empty it is returned unchanged; when it
    /// is empty the literal `/favicon.ico` is returned instead. The value is
    /// not resolved against any base URL here.
    pub fn effective_favicon_url(&self) -> &str {
        match self.icon.as_str() {
            // No icon recorded in the head: fall back to the default path.
            "" => "/favicon.ico",
            declared => declared,
        }
    }
}
#[derive(Serialize, Deserialize, Clone, Debug)] #[derive(Serialize, Deserialize, Clone, Debug)]
pub enum DenseTree { pub enum DenseTree {
Heading1(Vec<DenseTree>), Heading1(Vec<DenseTree>),

View File

@ -16,6 +16,8 @@ serde = { version = "1.0.136", features = ["derive"] }
serde_bare = "0.5.0" serde_bare = "0.5.0"
toml = "0.5.8" toml = "0.5.8"
blake3 = "1.3.1"
fancy_mdbx = { path = "../../../libraries/fancy_mdbx" } fancy_mdbx = { path = "../../../libraries/fancy_mdbx" }
quickpeep_structs = { path = "../quickpeep_structs" } quickpeep_structs = { path = "../quickpeep_structs" }

View File

@ -19,6 +19,7 @@ pub struct BackendIndependentDocument {
pub nonarticle_body: String, pub nonarticle_body: String,
pub tags: Vec<String>, pub tags: Vec<String>,
pub url: String, pub url: String,
pub favicon_url_hash: u64,
} }
/// A backend-independent document struct. /// A backend-independent document struct.
@ -29,4 +30,5 @@ pub struct SearchDocument {
pub excerpt: String, pub excerpt: String,
pub tags: Vec<String>, pub tags: Vec<String>,
pub url: String, pub url: String,
pub favicon_url_hash: u64,
} }

View File

@ -33,6 +33,8 @@ pub struct DocumentStoreRow {
title: String, title: String,
body: String, body: String,
nonbody: String, nonbody: String,
/// First 8 bytes of the BLAKE3 hash of the favicon URL, read as a little-endian u64
favicon_url_hash: u64,
} }
pub struct StoreTables { pub struct StoreTables {
@ -145,6 +147,7 @@ impl Backend for TantivyBackend {
title: document.title, title: document.title,
body: document.article_body, body: document.article_body,
nonbody: document.nonarticle_body, nonbody: document.nonarticle_body,
favicon_url_hash: document.favicon_url_hash,
}, },
) )
})?; })?;
@ -200,6 +203,7 @@ impl Backend for TantivyBackend {
excerpt, excerpt,
tags: vec![], tags: vec![],
url: url.to_owned(), url: url.to_owned(),
favicon_url_hash: doc_row.favicon_url_hash,
}) })
} }

View File

@ -18,6 +18,8 @@ clap = { version = "3.1.6", features = ["derive"] }
colour = "0.6.0" colour = "0.6.0"
url = "2.2.2" url = "2.2.2"
blake3 = "1.3.1"
smartstring = "1.0.1" smartstring = "1.0.1"
# Used for efficient lookup of seeds (URL prefixes) # Used for efficient lookup of seeds (URL prefixes)

View File

@ -85,6 +85,13 @@ pub async fn main() -> anyhow::Result<()> {
.unwrap_or_else(|| Vec::with_capacity(0)); .unwrap_or_else(|| Vec::with_capacity(0));
// TODO Store the actual structure of the document in the store? // TODO Store the actual structure of the document in the store?
let favicon_url = document.head.effective_favicon_url();
let mut favicon_url_hash_long = [0u8; 8];
favicon_url_hash_long
.copy_from_slice(&blake3::hash(favicon_url.as_bytes()).as_bytes()[0..8]);
let favicon_url_hash = u64::from_le_bytes(favicon_url_hash_long);
indexer_backend.add_document(BackendIndependentDocument { indexer_backend.add_document(BackendIndependentDocument {
title: document.head.title, title: document.head.title,
article_body, article_body,
@ -92,6 +99,7 @@ pub async fn main() -> anyhow::Result<()> {
// TODO populate tags & antifeatures // TODO populate tags & antifeatures
tags, tags,
url: page_record.url.to_string(), url: page_record.url.to_string(),
favicon_url_hash,
})?; })?;
} }
} }

View File

@ -476,12 +476,7 @@ impl EventProcessor<'_> {
)?; )?;
// If there's a favicon to be tried, add it to the list... // If there's a favicon to be tried, add it to the list...
let favicon_url_rel = if page.document.head.icon.is_empty() { let favicon_url_rel = page.document.head.effective_favicon_url();
"/favicon.ico"
} else {
page.document.head.icon.as_str()
};
if let Ok(favicon_url) = url.join(favicon_url_rel) { if let Ok(favicon_url) = url.join(favicon_url_rel) {
txn.enqueue_url(favicon_url.as_str(), None, RakeIntent::Icon)?; txn.enqueue_url(favicon_url.as_str(), None, RakeIntent::Icon)?;
} }