Store icons in the icon store

This commit is contained in:
Olivier 'reivilibre' 2022-03-27 22:00:50 +01:00
parent eb899ac9a5
commit 25db9fdb24
6 changed files with 116 additions and 41 deletions

View File

@ -1,4 +1,5 @@
seed_dir = "../quickpeep_seeds" seed_dir = "../quickpeep_seeds"
icon_store = "./index_icons"
# Tantivy Backend # Tantivy Backend
# [backend.tantivy] # [backend.tantivy]

View File

@ -0,0 +1 @@
pub mod icon_store;

View File

@ -0,0 +1,20 @@
use fancy_mdbx::database::RawTable;
use fancy_mdbx::environment::Env;
use std::path::Path;
/// On-disk store for site icons, backed by a `fancy_mdbx` environment.
pub struct IconStore {
    /// The `fancy_mdbx` environment that owns the `icons` table.
    pub env: Env,

    /// Icons table.
    ///
    /// Keys are u64 hashes serialised as little-endian bytes; values are the
    /// WebP contents of the icon.
    pub icons: RawTable<[u8], [u8]>,
}
impl IconStore {
    /// Opens the icon store located at `path`.
    ///
    /// Opens the `fancy_mdbx` environment at `path` and then the raw table
    /// named `"icons"` inside it.
    ///
    /// # Errors
    ///
    /// Returns an error if either the environment or the `icons` table cannot
    /// be opened.
    pub fn open(path: &Path) -> anyhow::Result<IconStore> {
        let environment = Env::open(path)?;
        // NOTE(review): the `()` argument is presumably "no extra table
        // options" — confirm against the fancy_mdbx API.
        let icon_table = environment.open_raw_table(Some("icons"), ())?;

        Ok(Self {
            env: environment,
            icons: icon_table,
        })
    }
}

View File

@ -1,3 +1,5 @@
pub mod config; pub mod config;
pub mod backend; pub mod backend;
pub mod auxiliary;

View File

@ -8,12 +8,15 @@ use std::io::{BufRead, BufReader};
use patricia_tree::PatriciaMap; use patricia_tree::PatriciaMap;
use quickpeep_densedoc::DenseTree; use quickpeep_densedoc::DenseTree;
use quickpeep_index::backend::BackendIndependentDocument; use quickpeep_index::auxiliary::icon_store::IconStore;
use quickpeep_index::backend::{Backend, BackendIndependentDocument};
use quickpeep_indexer::config::IndexerConfig; use quickpeep_indexer::config::IndexerConfig;
use quickpeep_seed_parser::loader::{ use quickpeep_seed_parser::loader::{
find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION, find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION,
}; };
use quickpeep_structs::rake_entries::{PackRecord, RakedPageEntry, SCHEMA_RAKED_PAGES}; use quickpeep_structs::rake_entries::{
IconEntry, PackRecord, RakedPageEntry, SCHEMA_RAKED_ICONS, SCHEMA_RAKED_PAGES,
};
use quickpeep_utils::urls::get_reduced_domain; use quickpeep_utils::urls::get_reduced_domain;
use smartstring::alias::CompactString; use smartstring::alias::CompactString;
use std::path::PathBuf; use std::path::PathBuf;
@ -40,6 +43,8 @@ pub async fn main() -> anyhow::Result<()> {
.unwrap_or_else(|| PathBuf::from("qp_indexer.toml")); .unwrap_or_else(|| PathBuf::from("qp_indexer.toml"));
let config = IndexerConfig::load(&config_path).context("Failed to load config")?; let config = IndexerConfig::load(&config_path).context("Failed to load config")?;
let icon_store = IconStore::open(config.icon_store.as_path())?;
let seed_files = find_seed_files(config.seed_dir.clone(), SEED_EXTENSION).await?; let seed_files = find_seed_files(config.seed_dir.clone(), SEED_EXTENSION).await?;
let (seed_tx, seed_rx) = tokio::sync::mpsc::channel(64); let (seed_tx, seed_rx) = tokio::sync::mpsc::channel(64);
let handle = tokio::spawn(async move { let handle = tokio::spawn(async move {
@ -62,23 +67,45 @@ pub async fn main() -> anyhow::Result<()> {
// TODO the decompressor has a buffer already, but we need this to see the end // TODO the decompressor has a buffer already, but we need this to see the end
let mut buf_reader = BufReader::new(decompressor); let mut buf_reader = BufReader::new(decompressor);
let schema: String = serde_bare::from_reader(&mut buf_reader)?; let schema: String = serde_bare::from_reader(&mut buf_reader)?;
if &schema != SCHEMA_RAKED_PAGES {
match schema.as_ref() {
SCHEMA_RAKED_PAGES => {
// TODO(unstable): this condition is `.has_data_left()` but it's unstable.
while buf_reader.fill_buf().map(|b| !b.is_empty())? {
handle_page_pack(&mut buf_reader, &seed_lookup, &mut indexer_backend)?;
}
}
SCHEMA_RAKED_ICONS => {
// TODO(unstable): this condition is `.has_data_left()` but it's unstable.
while buf_reader.fill_buf().map(|b| !b.is_empty())? {
handle_icon_pack(&mut buf_reader, &icon_store)?;
}
}
_ => {
bail!( bail!(
"Wrong schema version: wanted {:?}, got {:?}", "Wrong schema version: wanted e.g. {:?}, got {:?}",
SCHEMA_RAKED_PAGES, SCHEMA_RAKED_PAGES,
&schema &schema
); );
} }
}
}
indexer_backend.flush()?;
// TODO(unstable): this condition is `.has_data_left()` but it's unstable. Ok(())
while buf_reader.fill_buf().map(|b| !b.is_empty())? { }
let page_record: PackRecord<RakedPageEntry> = serde_bare::from_reader(&mut buf_reader)?;
pub fn handle_page_pack(
buf_reader: &mut impl BufRead,
seed_lookup: &SeedLookupTable,
indexer_backend: &mut Box<dyn Backend>,
) -> anyhow::Result<()> {
let page_record: PackRecord<RakedPageEntry> = serde_bare::from_reader(buf_reader)?;
let document = page_record.record.document; let document = page_record.record.document;
let article_body = DenseTree::generate_textual_format(&document.body_content, false); let article_body = DenseTree::generate_textual_format(&document.body_content, false);
let nonarticle_body = let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder, false);
DenseTree::generate_textual_format(&document.body_remainder, false);
let tags = seed_lookup let tags = seed_lookup
.look_up(&Url::parse(page_record.url.as_ref())?)? .look_up(&Url::parse(page_record.url.as_ref())?)?
@ -87,10 +114,11 @@ pub async fn main() -> anyhow::Result<()> {
// TODO Store the actual structure of the document in the store? // TODO Store the actual structure of the document in the store?
let favicon_url = document.head.effective_favicon_url(); let favicon_url_relative = document.head.effective_favicon_url();
let favicon_url = Url::parse(page_record.url.as_ref())?.join(favicon_url_relative)?;
let favicon_url = favicon_url.as_str();
let mut favicon_url_hash_long = [0u8; 8]; let mut favicon_url_hash_long = [0u8; 8];
favicon_url_hash_long favicon_url_hash_long.copy_from_slice(&blake3::hash(favicon_url.as_bytes()).as_bytes()[0..8]);
.copy_from_slice(&blake3::hash(favicon_url.as_bytes()).as_bytes()[0..8]);
let favicon_url_hash = u64::from_le_bytes(favicon_url_hash_long); let favicon_url_hash = u64::from_le_bytes(favicon_url_hash_long);
indexer_backend.add_document(BackendIndependentDocument { indexer_backend.add_document(BackendIndependentDocument {
@ -102,9 +130,28 @@ pub async fn main() -> anyhow::Result<()> {
url: page_record.url.to_string(), url: page_record.url.to_string(),
favicon_url_hash, favicon_url_hash,
})?; })?;
}
} Ok(())
indexer_backend.flush()?; }
pub fn handle_icon_pack(
buf_reader: &mut impl BufRead,
icon_store: &IconStore,
) -> anyhow::Result<()> {
let page_record: PackRecord<IconEntry> = serde_bare::from_reader(buf_reader)?;
let webp_bytes = page_record.record.webp_bytes;
let mut favicon_url_hash_long = [0u8; 8];
favicon_url_hash_long
.copy_from_slice(&blake3::hash(page_record.url.as_ref().as_bytes()).as_bytes()[0..8]);
let favicon_url_hash = u64::from_le_bytes(favicon_url_hash_long);
icon_store.env.rw_txn(|txn| {
icon_store
.icons
.put(txn, &favicon_url_hash.to_le_bytes(), &webp_bytes)
})?;
Ok(()) Ok(())
} }

View File

@ -12,6 +12,9 @@ pub struct IndexerConfig {
/// Path to seeds /// Path to seeds
pub seed_dir: PathBuf, pub seed_dir: PathBuf,
/// Path to the icon store
pub icon_store: PathBuf,
/// Configuration about which backend to use. /// Configuration about which backend to use.
pub backend: BackendConfig, pub backend: BackendConfig,
} }
@ -25,6 +28,7 @@ impl IndexerConfig {
let mut indexer_config: IndexerConfig = toml::from_slice(&bytes)?; let mut indexer_config: IndexerConfig = toml::from_slice(&bytes)?;
indexer_config.seed_dir = config_dir.join(indexer_config.seed_dir); indexer_config.seed_dir = config_dir.join(indexer_config.seed_dir);
indexer_config.icon_store = config_dir.join(indexer_config.icon_store);
match &mut indexer_config.backend { match &mut indexer_config.backend {
BackendConfig::Tantivy(tantivy) => { BackendConfig::Tantivy(tantivy) => {
tantivy.index_dir = config_dir.join(&tantivy.index_dir); tantivy.index_dir = config_dir.join(&tantivy.index_dir);