Store icons in the icon store
This commit is contained in:
parent
eb899ac9a5
commit
25db9fdb24
|
@ -1,4 +1,5 @@
|
|||
seed_dir = "../quickpeep_seeds"
|
||||
icon_store = "./index_icons"
|
||||
|
||||
# Tantivy Backend
|
||||
# [backend.tantivy]
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
pub mod icon_store;
|
|
@ -0,0 +1,20 @@
|
|||
use fancy_mdbx::database::RawTable;
|
||||
use fancy_mdbx::environment::Env;
|
||||
use std::path::Path;
|
||||
|
||||
pub struct IconStore {
|
||||
pub env: Env,
|
||||
|
||||
/// Icons table
|
||||
/// u64 hashes (little endian) to WebP contents
|
||||
pub icons: RawTable<[u8], [u8]>,
|
||||
}
|
||||
|
||||
impl IconStore {
|
||||
pub fn open(path: &Path) -> anyhow::Result<IconStore> {
|
||||
let env = Env::open(path)?;
|
||||
let icons = env.open_raw_table(Some("icons"), ())?;
|
||||
|
||||
Ok(IconStore { env, icons })
|
||||
}
|
||||
}
|
|
@ -1,3 +1,5 @@
|
|||
pub mod config;
|
||||
|
||||
pub mod backend;
|
||||
|
||||
pub mod auxiliary;
|
||||
|
|
|
@ -8,12 +8,15 @@ use std::io::{BufRead, BufReader};
|
|||
|
||||
use patricia_tree::PatriciaMap;
|
||||
use quickpeep_densedoc::DenseTree;
|
||||
use quickpeep_index::backend::BackendIndependentDocument;
|
||||
use quickpeep_index::auxiliary::icon_store::IconStore;
|
||||
use quickpeep_index::backend::{Backend, BackendIndependentDocument};
|
||||
use quickpeep_indexer::config::IndexerConfig;
|
||||
use quickpeep_seed_parser::loader::{
|
||||
find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION,
|
||||
};
|
||||
use quickpeep_structs::rake_entries::{PackRecord, RakedPageEntry, SCHEMA_RAKED_PAGES};
|
||||
use quickpeep_structs::rake_entries::{
|
||||
IconEntry, PackRecord, RakedPageEntry, SCHEMA_RAKED_ICONS, SCHEMA_RAKED_PAGES,
|
||||
};
|
||||
use quickpeep_utils::urls::get_reduced_domain;
|
||||
use smartstring::alias::CompactString;
|
||||
use std::path::PathBuf;
|
||||
|
@ -40,6 +43,8 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
.unwrap_or_else(|| PathBuf::from("qp_indexer.toml"));
|
||||
let config = IndexerConfig::load(&config_path).context("Failed to load config")?;
|
||||
|
||||
let icon_store = IconStore::open(config.icon_store.as_path())?;
|
||||
|
||||
let seed_files = find_seed_files(config.seed_dir.clone(), SEED_EXTENSION).await?;
|
||||
let (seed_tx, seed_rx) = tokio::sync::mpsc::channel(64);
|
||||
let handle = tokio::spawn(async move {
|
||||
|
@ -62,23 +67,45 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
// TODO the decompressor has a buffer already, but we need this to see the end
|
||||
let mut buf_reader = BufReader::new(decompressor);
|
||||
let schema: String = serde_bare::from_reader(&mut buf_reader)?;
|
||||
if &schema != SCHEMA_RAKED_PAGES {
|
||||
|
||||
match schema.as_ref() {
|
||||
SCHEMA_RAKED_PAGES => {
|
||||
// TODO(unstable): this condition is `.has_data_left()` but it's unstable.
|
||||
while buf_reader.fill_buf().map(|b| !b.is_empty())? {
|
||||
handle_page_pack(&mut buf_reader, &seed_lookup, &mut indexer_backend)?;
|
||||
}
|
||||
}
|
||||
SCHEMA_RAKED_ICONS => {
|
||||
// TODO(unstable): this condition is `.has_data_left()` but it's unstable.
|
||||
while buf_reader.fill_buf().map(|b| !b.is_empty())? {
|
||||
handle_icon_pack(&mut buf_reader, &icon_store)?;
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
bail!(
|
||||
"Wrong schema version: wanted {:?}, got {:?}",
|
||||
"Wrong schema version: wanted e.g. {:?}, got {:?}",
|
||||
SCHEMA_RAKED_PAGES,
|
||||
&schema
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
indexer_backend.flush()?;
|
||||
|
||||
// TODO(unstable): this condition is `.has_data_left()` but it's unstable.
|
||||
while buf_reader.fill_buf().map(|b| !b.is_empty())? {
|
||||
let page_record: PackRecord<RakedPageEntry> = serde_bare::from_reader(&mut buf_reader)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn handle_page_pack(
|
||||
buf_reader: &mut impl BufRead,
|
||||
seed_lookup: &SeedLookupTable,
|
||||
indexer_backend: &mut Box<dyn Backend>,
|
||||
) -> anyhow::Result<()> {
|
||||
let page_record: PackRecord<RakedPageEntry> = serde_bare::from_reader(buf_reader)?;
|
||||
|
||||
let document = page_record.record.document;
|
||||
|
||||
let article_body = DenseTree::generate_textual_format(&document.body_content, false);
|
||||
let nonarticle_body =
|
||||
DenseTree::generate_textual_format(&document.body_remainder, false);
|
||||
let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder, false);
|
||||
|
||||
let tags = seed_lookup
|
||||
.look_up(&Url::parse(page_record.url.as_ref())?)?
|
||||
|
@ -87,10 +114,11 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
|
||||
// TODO Store the actual structure of the document in the store?
|
||||
|
||||
let favicon_url = document.head.effective_favicon_url();
|
||||
let favicon_url_relative = document.head.effective_favicon_url();
|
||||
let favicon_url = Url::parse(page_record.url.as_ref())?.join(favicon_url_relative)?;
|
||||
let favicon_url = favicon_url.as_str();
|
||||
let mut favicon_url_hash_long = [0u8; 8];
|
||||
favicon_url_hash_long
|
||||
.copy_from_slice(&blake3::hash(favicon_url.as_bytes()).as_bytes()[0..8]);
|
||||
favicon_url_hash_long.copy_from_slice(&blake3::hash(favicon_url.as_bytes()).as_bytes()[0..8]);
|
||||
let favicon_url_hash = u64::from_le_bytes(favicon_url_hash_long);
|
||||
|
||||
indexer_backend.add_document(BackendIndependentDocument {
|
||||
|
@ -102,9 +130,28 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
url: page_record.url.to_string(),
|
||||
favicon_url_hash,
|
||||
})?;
|
||||
}
|
||||
}
|
||||
indexer_backend.flush()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn handle_icon_pack(
|
||||
buf_reader: &mut impl BufRead,
|
||||
icon_store: &IconStore,
|
||||
) -> anyhow::Result<()> {
|
||||
let page_record: PackRecord<IconEntry> = serde_bare::from_reader(buf_reader)?;
|
||||
|
||||
let webp_bytes = page_record.record.webp_bytes;
|
||||
|
||||
let mut favicon_url_hash_long = [0u8; 8];
|
||||
favicon_url_hash_long
|
||||
.copy_from_slice(&blake3::hash(page_record.url.as_ref().as_bytes()).as_bytes()[0..8]);
|
||||
let favicon_url_hash = u64::from_le_bytes(favicon_url_hash_long);
|
||||
|
||||
icon_store.env.rw_txn(|txn| {
|
||||
icon_store
|
||||
.icons
|
||||
.put(txn, &favicon_url_hash.to_le_bytes(), &webp_bytes)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -12,6 +12,9 @@ pub struct IndexerConfig {
|
|||
/// Path to seeds
|
||||
pub seed_dir: PathBuf,
|
||||
|
||||
/// Path to the icon store
|
||||
pub icon_store: PathBuf,
|
||||
|
||||
/// Configuration about which backend to use.
|
||||
pub backend: BackendConfig,
|
||||
}
|
||||
|
@ -25,6 +28,7 @@ impl IndexerConfig {
|
|||
let mut indexer_config: IndexerConfig = toml::from_slice(&bytes)?;
|
||||
|
||||
indexer_config.seed_dir = config_dir.join(indexer_config.seed_dir);
|
||||
indexer_config.icon_store = config_dir.join(indexer_config.icon_store);
|
||||
match &mut indexer_config.backend {
|
||||
BackendConfig::Tantivy(tantivy) => {
|
||||
tantivy.index_dir = config_dir.join(&tantivy.index_dir);
|
||||
|
|
Loading…
Reference in New Issue