Store icons in the icon store
This commit is contained in:
parent
eb899ac9a5
commit
25db9fdb24
@ -1,4 +1,5 @@
|
|||||||
seed_dir = "../quickpeep_seeds"
|
seed_dir = "../quickpeep_seeds"
|
||||||
|
icon_store = "./index_icons"
|
||||||
|
|
||||||
# Tantivy Backend
|
# Tantivy Backend
|
||||||
# [backend.tantivy]
|
# [backend.tantivy]
|
||||||
|
1
quickpeep_index/src/auxiliary.rs
Normal file
1
quickpeep_index/src/auxiliary.rs
Normal file
@ -0,0 +1 @@
|
|||||||
|
pub mod icon_store;
|
20
quickpeep_index/src/auxiliary/icon_store.rs
Normal file
20
quickpeep_index/src/auxiliary/icon_store.rs
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
use fancy_mdbx::database::RawTable;
|
||||||
|
use fancy_mdbx::environment::Env;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
pub struct IconStore {
|
||||||
|
pub env: Env,
|
||||||
|
|
||||||
|
/// Icons table
|
||||||
|
/// u64 hashes (little endian) to WebP contents
|
||||||
|
pub icons: RawTable<[u8], [u8]>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl IconStore {
|
||||||
|
pub fn open(path: &Path) -> anyhow::Result<IconStore> {
|
||||||
|
let env = Env::open(path)?;
|
||||||
|
let icons = env.open_raw_table(Some("icons"), ())?;
|
||||||
|
|
||||||
|
Ok(IconStore { env, icons })
|
||||||
|
}
|
||||||
|
}
|
@ -1,3 +1,5 @@
|
|||||||
pub mod config;
|
pub mod config;
|
||||||
|
|
||||||
pub mod backend;
|
pub mod backend;
|
||||||
|
|
||||||
|
pub mod auxiliary;
|
||||||
|
@ -8,12 +8,15 @@ use std::io::{BufRead, BufReader};
|
|||||||
|
|
||||||
use patricia_tree::PatriciaMap;
|
use patricia_tree::PatriciaMap;
|
||||||
use quickpeep_densedoc::DenseTree;
|
use quickpeep_densedoc::DenseTree;
|
||||||
use quickpeep_index::backend::BackendIndependentDocument;
|
use quickpeep_index::auxiliary::icon_store::IconStore;
|
||||||
|
use quickpeep_index::backend::{Backend, BackendIndependentDocument};
|
||||||
use quickpeep_indexer::config::IndexerConfig;
|
use quickpeep_indexer::config::IndexerConfig;
|
||||||
use quickpeep_seed_parser::loader::{
|
use quickpeep_seed_parser::loader::{
|
||||||
find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION,
|
find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION,
|
||||||
};
|
};
|
||||||
use quickpeep_structs::rake_entries::{PackRecord, RakedPageEntry, SCHEMA_RAKED_PAGES};
|
use quickpeep_structs::rake_entries::{
|
||||||
|
IconEntry, PackRecord, RakedPageEntry, SCHEMA_RAKED_ICONS, SCHEMA_RAKED_PAGES,
|
||||||
|
};
|
||||||
use quickpeep_utils::urls::get_reduced_domain;
|
use quickpeep_utils::urls::get_reduced_domain;
|
||||||
use smartstring::alias::CompactString;
|
use smartstring::alias::CompactString;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
@ -40,6 +43,8 @@ pub async fn main() -> anyhow::Result<()> {
|
|||||||
.unwrap_or_else(|| PathBuf::from("qp_indexer.toml"));
|
.unwrap_or_else(|| PathBuf::from("qp_indexer.toml"));
|
||||||
let config = IndexerConfig::load(&config_path).context("Failed to load config")?;
|
let config = IndexerConfig::load(&config_path).context("Failed to load config")?;
|
||||||
|
|
||||||
|
let icon_store = IconStore::open(config.icon_store.as_path())?;
|
||||||
|
|
||||||
let seed_files = find_seed_files(config.seed_dir.clone(), SEED_EXTENSION).await?;
|
let seed_files = find_seed_files(config.seed_dir.clone(), SEED_EXTENSION).await?;
|
||||||
let (seed_tx, seed_rx) = tokio::sync::mpsc::channel(64);
|
let (seed_tx, seed_rx) = tokio::sync::mpsc::channel(64);
|
||||||
let handle = tokio::spawn(async move {
|
let handle = tokio::spawn(async move {
|
||||||
@ -62,46 +67,27 @@ pub async fn main() -> anyhow::Result<()> {
|
|||||||
// TODO the decompressor has a buffer already, but we need this to see the end
|
// TODO the decompressor has a buffer already, but we need this to see the end
|
||||||
let mut buf_reader = BufReader::new(decompressor);
|
let mut buf_reader = BufReader::new(decompressor);
|
||||||
let schema: String = serde_bare::from_reader(&mut buf_reader)?;
|
let schema: String = serde_bare::from_reader(&mut buf_reader)?;
|
||||||
if &schema != SCHEMA_RAKED_PAGES {
|
|
||||||
bail!(
|
|
||||||
"Wrong schema version: wanted {:?}, got {:?}",
|
|
||||||
SCHEMA_RAKED_PAGES,
|
|
||||||
&schema
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO(unstable): this condition is `.has_data_left()` but it's unstable.
|
match schema.as_ref() {
|
||||||
while buf_reader.fill_buf().map(|b| !b.is_empty())? {
|
SCHEMA_RAKED_PAGES => {
|
||||||
let page_record: PackRecord<RakedPageEntry> = serde_bare::from_reader(&mut buf_reader)?;
|
// TODO(unstable): this condition is `.has_data_left()` but it's unstable.
|
||||||
|
while buf_reader.fill_buf().map(|b| !b.is_empty())? {
|
||||||
let document = page_record.record.document;
|
handle_page_pack(&mut buf_reader, &seed_lookup, &mut indexer_backend)?;
|
||||||
|
}
|
||||||
let article_body = DenseTree::generate_textual_format(&document.body_content, false);
|
}
|
||||||
let nonarticle_body =
|
SCHEMA_RAKED_ICONS => {
|
||||||
DenseTree::generate_textual_format(&document.body_remainder, false);
|
// TODO(unstable): this condition is `.has_data_left()` but it's unstable.
|
||||||
|
while buf_reader.fill_buf().map(|b| !b.is_empty())? {
|
||||||
let tags = seed_lookup
|
handle_icon_pack(&mut buf_reader, &icon_store)?;
|
||||||
.look_up(&Url::parse(page_record.url.as_ref())?)?
|
}
|
||||||
.map(|seed: &Seed| seed.tags.iter().map(|cs| cs.to_string()).collect())
|
}
|
||||||
.unwrap_or_else(|| Vec::with_capacity(0));
|
_ => {
|
||||||
|
bail!(
|
||||||
// TODO Store the actual structure of the document in the store?
|
"Wrong schema version: wanted e.g. {:?}, got {:?}",
|
||||||
|
SCHEMA_RAKED_PAGES,
|
||||||
let favicon_url = document.head.effective_favicon_url();
|
&schema
|
||||||
let mut favicon_url_hash_long = [0u8; 8];
|
);
|
||||||
favicon_url_hash_long
|
}
|
||||||
.copy_from_slice(&blake3::hash(favicon_url.as_bytes()).as_bytes()[0..8]);
|
|
||||||
let favicon_url_hash = u64::from_le_bytes(favicon_url_hash_long);
|
|
||||||
|
|
||||||
indexer_backend.add_document(BackendIndependentDocument {
|
|
||||||
title: document.head.title,
|
|
||||||
article_body,
|
|
||||||
nonarticle_body,
|
|
||||||
// TODO populate tags & antifeatures
|
|
||||||
tags,
|
|
||||||
url: page_record.url.to_string(),
|
|
||||||
favicon_url_hash,
|
|
||||||
})?;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
indexer_backend.flush()?;
|
indexer_backend.flush()?;
|
||||||
@ -109,6 +95,67 @@ pub async fn main() -> anyhow::Result<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn handle_page_pack(
|
||||||
|
buf_reader: &mut impl BufRead,
|
||||||
|
seed_lookup: &SeedLookupTable,
|
||||||
|
indexer_backend: &mut Box<dyn Backend>,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let page_record: PackRecord<RakedPageEntry> = serde_bare::from_reader(buf_reader)?;
|
||||||
|
|
||||||
|
let document = page_record.record.document;
|
||||||
|
|
||||||
|
let article_body = DenseTree::generate_textual_format(&document.body_content, false);
|
||||||
|
let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder, false);
|
||||||
|
|
||||||
|
let tags = seed_lookup
|
||||||
|
.look_up(&Url::parse(page_record.url.as_ref())?)?
|
||||||
|
.map(|seed: &Seed| seed.tags.iter().map(|cs| cs.to_string()).collect())
|
||||||
|
.unwrap_or_else(|| Vec::with_capacity(0));
|
||||||
|
|
||||||
|
// TODO Store the actual structure of the document in the store?
|
||||||
|
|
||||||
|
let favicon_url_relative = document.head.effective_favicon_url();
|
||||||
|
let favicon_url = Url::parse(page_record.url.as_ref())?.join(favicon_url_relative)?;
|
||||||
|
let favicon_url = favicon_url.as_str();
|
||||||
|
let mut favicon_url_hash_long = [0u8; 8];
|
||||||
|
favicon_url_hash_long.copy_from_slice(&blake3::hash(favicon_url.as_bytes()).as_bytes()[0..8]);
|
||||||
|
let favicon_url_hash = u64::from_le_bytes(favicon_url_hash_long);
|
||||||
|
|
||||||
|
indexer_backend.add_document(BackendIndependentDocument {
|
||||||
|
title: document.head.title,
|
||||||
|
article_body,
|
||||||
|
nonarticle_body,
|
||||||
|
// TODO populate tags & antifeatures
|
||||||
|
tags,
|
||||||
|
url: page_record.url.to_string(),
|
||||||
|
favicon_url_hash,
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn handle_icon_pack(
|
||||||
|
buf_reader: &mut impl BufRead,
|
||||||
|
icon_store: &IconStore,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let page_record: PackRecord<IconEntry> = serde_bare::from_reader(buf_reader)?;
|
||||||
|
|
||||||
|
let webp_bytes = page_record.record.webp_bytes;
|
||||||
|
|
||||||
|
let mut favicon_url_hash_long = [0u8; 8];
|
||||||
|
favicon_url_hash_long
|
||||||
|
.copy_from_slice(&blake3::hash(page_record.url.as_ref().as_bytes()).as_bytes()[0..8]);
|
||||||
|
let favicon_url_hash = u64::from_le_bytes(favicon_url_hash_long);
|
||||||
|
|
||||||
|
icon_store.env.rw_txn(|txn| {
|
||||||
|
icon_store
|
||||||
|
.icons
|
||||||
|
.put(txn, &favicon_url_hash.to_le_bytes(), &webp_bytes)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
pub struct SeedLookupTable {
|
pub struct SeedLookupTable {
|
||||||
pub by_prefix: PatriciaMap<Seed>,
|
pub by_prefix: PatriciaMap<Seed>,
|
||||||
pub by_reduced_domain: HashMap<CompactString, Seed>,
|
pub by_reduced_domain: HashMap<CompactString, Seed>,
|
||||||
|
@ -12,6 +12,9 @@ pub struct IndexerConfig {
|
|||||||
/// Path to seeds
|
/// Path to seeds
|
||||||
pub seed_dir: PathBuf,
|
pub seed_dir: PathBuf,
|
||||||
|
|
||||||
|
/// Path to the icon store
|
||||||
|
pub icon_store: PathBuf,
|
||||||
|
|
||||||
/// Configuration about which backend to use.
|
/// Configuration about which backend to use.
|
||||||
pub backend: BackendConfig,
|
pub backend: BackendConfig,
|
||||||
}
|
}
|
||||||
@ -25,6 +28,7 @@ impl IndexerConfig {
|
|||||||
let mut indexer_config: IndexerConfig = toml::from_slice(&bytes)?;
|
let mut indexer_config: IndexerConfig = toml::from_slice(&bytes)?;
|
||||||
|
|
||||||
indexer_config.seed_dir = config_dir.join(indexer_config.seed_dir);
|
indexer_config.seed_dir = config_dir.join(indexer_config.seed_dir);
|
||||||
|
indexer_config.icon_store = config_dir.join(indexer_config.icon_store);
|
||||||
match &mut indexer_config.backend {
|
match &mut indexer_config.backend {
|
||||||
BackendConfig::Tantivy(tantivy) => {
|
BackendConfig::Tantivy(tantivy) => {
|
||||||
tantivy.index_dir = config_dir.join(&tantivy.index_dir);
|
tantivy.index_dir = config_dir.join(&tantivy.index_dir);
|
||||||
|
Loading…
Reference in New Issue
Block a user