diff --git a/qp_indexer.sample.toml b/qp_indexer.sample.toml
index 015b230..efece8e 100644
--- a/qp_indexer.sample.toml
+++ b/qp_indexer.sample.toml
@@ -1,4 +1,5 @@
 seed_dir = "../quickpeep_seeds"
+icon_store = "./index_icons"
 
 # Tantivy Backend
 # [backend.tantivy]
diff --git a/quickpeep_index/src/auxiliary.rs b/quickpeep_index/src/auxiliary.rs
new file mode 100644
index 0000000..ba9c11f
--- /dev/null
+++ b/quickpeep_index/src/auxiliary.rs
@@ -0,0 +1 @@
+pub mod icon_store;
diff --git a/quickpeep_index/src/auxiliary/icon_store.rs b/quickpeep_index/src/auxiliary/icon_store.rs
new file mode 100644
index 0000000..590f30d
--- /dev/null
+++ b/quickpeep_index/src/auxiliary/icon_store.rs
@@ -0,0 +1,26 @@
+use fancy_mdbx::database::RawTable;
+use fancy_mdbx::environment::Env;
+use std::path::Path;
+
+/// Store of icons, kept in a `fancy_mdbx` environment opened from a path.
+pub struct IconStore {
+    /// Environment that the `icons` table is opened from.
+    pub env: Env,
+
+    /// Icons table
+    /// u64 hashes (little endian) to WebP contents
+    pub icons: RawTable<[u8], [u8]>,
+}
+
+impl IconStore {
+    /// Opens the environment at `path` together with its `icons` table.
+    ///
+    /// # Errors
+    /// Fails if the environment or the table cannot be opened.
+    pub fn open(path: &Path) -> anyhow::Result<Self> {
+        let env = Env::open(path)?;
+        let icons = env.open_raw_table(Some("icons"), ())?;
+
+        Ok(IconStore { env, icons })
+    }
+}
diff --git a/quickpeep_index/src/lib.rs b/quickpeep_index/src/lib.rs
index 27b6d15..a02aad6 100644
--- a/quickpeep_index/src/lib.rs
+++ b/quickpeep_index/src/lib.rs
@@ -1,3 +1,5 @@
 pub mod config;
 
 pub mod backend;
+
+pub mod auxiliary;
diff --git a/quickpeep_indexer/src/bin/qp-indexer.rs b/quickpeep_indexer/src/bin/qp-indexer.rs
index 539019b..dd20ec0 100644
--- a/quickpeep_indexer/src/bin/qp-indexer.rs
+++ b/quickpeep_indexer/src/bin/qp-indexer.rs
@@ -8,12 +8,15 @@ use std::io::{BufRead, BufReader};
 
 use patricia_tree::PatriciaMap;
 use quickpeep_densedoc::DenseTree;
-use quickpeep_index::backend::BackendIndependentDocument;
+use quickpeep_index::auxiliary::icon_store::IconStore;
+use quickpeep_index::backend::{Backend, BackendIndependentDocument};
 use quickpeep_indexer::config::IndexerConfig;
 use quickpeep_seed_parser::loader::{
     find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION,
 };
-use quickpeep_structs::rake_entries::{PackRecord, RakedPageEntry, SCHEMA_RAKED_PAGES};
+use quickpeep_structs::rake_entries::{
+    IconEntry, PackRecord, RakedPageEntry, SCHEMA_RAKED_ICONS, SCHEMA_RAKED_PAGES,
+};
 use quickpeep_utils::urls::get_reduced_domain;
 use smartstring::alias::CompactString;
 use std::path::PathBuf;
@@ -40,6 +43,8 @@ pub async fn main() -> anyhow::Result<()> {
         .unwrap_or_else(|| PathBuf::from("qp_indexer.toml"));
     let config = IndexerConfig::load(&config_path).context("Failed to load config")?;
 
+    let icon_store = IconStore::open(config.icon_store.as_path())?;
+
     let seed_files = find_seed_files(config.seed_dir.clone(), SEED_EXTENSION).await?;
     let (seed_tx, seed_rx) = tokio::sync::mpsc::channel(64);
     let handle = tokio::spawn(async move {
@@ -62,46 +67,27 @@ pub async fn main() -> anyhow::Result<()> {
         // TODO the decompressor has a buffer already, but we need this to see the end
         let mut buf_reader = BufReader::new(decompressor);
         let schema: String = serde_bare::from_reader(&mut buf_reader)?;
-        if &schema != SCHEMA_RAKED_PAGES {
-            bail!(
-                "Wrong schema version: wanted {:?}, got {:?}",
-                SCHEMA_RAKED_PAGES,
-                &schema
-            );
-        }
 
-        // TODO(unstable): this condition is `.has_data_left()` but it's unstable.
-        while buf_reader.fill_buf().map(|b| !b.is_empty())? {
-            let page_record: PackRecord<RakedPageEntry> = serde_bare::from_reader(&mut buf_reader)?;
-
-            let document = page_record.record.document;
-
-            let article_body = DenseTree::generate_textual_format(&document.body_content, false);
-            let nonarticle_body =
-                DenseTree::generate_textual_format(&document.body_remainder, false);
-
-            let tags = seed_lookup
-                .look_up(&Url::parse(page_record.url.as_ref())?)?
-                .map(|seed: &Seed| seed.tags.iter().map(|cs| cs.to_string()).collect())
-                .unwrap_or_else(|| Vec::with_capacity(0));
-
-            // TODO Store the actual structure of the document in the store?
-
-            let favicon_url = document.head.effective_favicon_url();
-            let mut favicon_url_hash_long = [0u8; 8];
-            favicon_url_hash_long
-                .copy_from_slice(&blake3::hash(favicon_url.as_bytes()).as_bytes()[0..8]);
-            let favicon_url_hash = u64::from_le_bytes(favicon_url_hash_long);
-
-            indexer_backend.add_document(BackendIndependentDocument {
-                title: document.head.title,
-                article_body,
-                nonarticle_body,
-                // TODO populate tags & antifeatures
-                tags,
-                url: page_record.url.to_string(),
-                favicon_url_hash,
-            })?;
+        match schema.as_ref() {
+            SCHEMA_RAKED_PAGES => {
+                // TODO(unstable): this condition is `.has_data_left()` but it's unstable.
+                while buf_reader.fill_buf().map(|b| !b.is_empty())? {
+                    handle_page_pack(&mut buf_reader, &seed_lookup, &mut indexer_backend)?;
+                }
+            }
+            SCHEMA_RAKED_ICONS => {
+                // TODO(unstable): this condition is `.has_data_left()` but it's unstable.
+                while buf_reader.fill_buf().map(|b| !b.is_empty())? {
+                    handle_icon_pack(&mut buf_reader, &icon_store)?;
+                }
+            }
+            _ => {
+                bail!(
+                    "Wrong schema version: wanted e.g. {:?}, got {:?}",
+                    SCHEMA_RAKED_PAGES,
+                    &schema
+                );
+            }
         }
     }
     indexer_backend.flush()?;
@@ -109,6 +95,81 @@ pub async fn main() -> anyhow::Result<()> {
     Ok(())
 }
 
+/// Reads one raked page record from `buf_reader` and adds it to the index backend.
+///
+/// Tags come from the seed matching the page URL (none if no seed matches). The favicon
+/// URL is resolved against the page URL, and the first 8 bytes of its BLAKE3 hash (read
+/// as a little-endian `u64`) are passed on as `favicon_url_hash`.
+pub fn handle_page_pack(
+    buf_reader: &mut impl BufRead,
+    seed_lookup: &SeedLookupTable,
+    indexer_backend: &mut Box<dyn Backend>,
+) -> anyhow::Result<()> {
+    let page_record: PackRecord<RakedPageEntry> = serde_bare::from_reader(buf_reader)?;
+
+    let document = page_record.record.document;
+
+    let article_body = DenseTree::generate_textual_format(&document.body_content, false);
+    let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder, false);
+
+    // Parse the page URL once: the seed lookup and the favicon resolution both need it.
+    let page_url = Url::parse(page_record.url.as_ref())?;
+
+    let tags = seed_lookup
+        .look_up(&page_url)?
+        .map(|seed: &Seed| seed.tags.iter().map(|cs| cs.to_string()).collect())
+        .unwrap_or_default();
+
+    // TODO Store the actual structure of the document in the store?
+
+    let favicon_url_relative = document.head.effective_favicon_url();
+    let favicon_url = page_url.join(favicon_url_relative)?;
+    let favicon_url = favicon_url.as_str();
+    let mut favicon_url_hash_long = [0u8; 8];
+    favicon_url_hash_long.copy_from_slice(&blake3::hash(favicon_url.as_bytes()).as_bytes()[0..8]);
+    let favicon_url_hash = u64::from_le_bytes(favicon_url_hash_long);
+
+    indexer_backend.add_document(BackendIndependentDocument {
+        title: document.head.title,
+        article_body,
+        nonarticle_body,
+        // TODO populate antifeatures
+        tags,
+        url: page_record.url.to_string(),
+        favicon_url_hash,
+    })?;
+
+    Ok(())
+}
+
+/// Reads one raked icon record from `buf_reader` and writes its WebP bytes to `icon_store`.
+///
+/// The key is the first 8 bytes of the BLAKE3 hash of the record's URL, i.e. the
+/// little-endian encoding of the `u64` that `handle_page_pack` computes for a favicon.
+pub fn handle_icon_pack(
+    buf_reader: &mut impl BufRead,
+    icon_store: &IconStore,
+) -> anyhow::Result<()> {
+    let icon_record: PackRecord<IconEntry> = serde_bare::from_reader(buf_reader)?;
+
+    let webp_bytes = icon_record.record.webp_bytes;
+
+    // NOTE(review): this must hash the same bytes as `handle_page_pack` (the absolute,
+    // normalised favicon URL) for lookups to hit; confirm the raker stores icon URLs so.
+    let mut favicon_url_hash_long = [0u8; 8];
+    favicon_url_hash_long
+        .copy_from_slice(&blake3::hash(icon_record.url.as_ref().as_bytes()).as_bytes()[0..8]);
+    let favicon_url_hash = u64::from_le_bytes(favicon_url_hash_long);
+
+    icon_store.env.rw_txn(|txn| {
+        icon_store
+            .icons
+            .put(txn, &favicon_url_hash.to_le_bytes(), &webp_bytes)
+    })?;
+
+    Ok(())
+}
+
 pub struct SeedLookupTable {
     pub by_prefix: PatriciaMap,
     pub by_reduced_domain: HashMap,
diff --git a/quickpeep_indexer/src/config.rs b/quickpeep_indexer/src/config.rs
index 717667a..4c231ef 100644
--- a/quickpeep_indexer/src/config.rs
+++ b/quickpeep_indexer/src/config.rs
@@ -12,6 +12,9 @@ pub struct IndexerConfig {
     /// Path to seeds
     pub seed_dir: PathBuf,
 
+    /// Path to the icon store
+    pub icon_store: PathBuf,
+
     /// Configuration about which backend to use.
     pub backend: BackendConfig,
 }
@@ -25,6 +28,7 @@ impl IndexerConfig {
         let mut indexer_config: IndexerConfig = toml::from_slice(&bytes)?;
 
         indexer_config.seed_dir = config_dir.join(indexer_config.seed_dir);
+        indexer_config.icon_store = config_dir.join(indexer_config.icon_store);
         match &mut indexer_config.backend {
             BackendConfig::Tantivy(tantivy) => {
                 tantivy.index_dir = config_dir.join(&tantivy.index_dir);