|
|
|
@ -8,12 +8,15 @@ use std::io::{BufRead, BufReader};
|
|
|
|
|
|
|
|
|
|
use patricia_tree::PatriciaMap;
|
|
|
|
|
use quickpeep_densedoc::DenseTree;
|
|
|
|
|
use quickpeep_index::backend::BackendIndependentDocument;
|
|
|
|
|
use quickpeep_index::auxiliary::icon_store::IconStore;
|
|
|
|
|
use quickpeep_index::backend::{Backend, BackendIndependentDocument};
|
|
|
|
|
use quickpeep_indexer::config::IndexerConfig;
|
|
|
|
|
use quickpeep_seed_parser::loader::{
|
|
|
|
|
find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION,
|
|
|
|
|
};
|
|
|
|
|
use quickpeep_structs::rake_entries::{PackRecord, RakedPageEntry, SCHEMA_RAKED_PAGES};
|
|
|
|
|
use quickpeep_structs::rake_entries::{
|
|
|
|
|
IconEntry, PackRecord, RakedPageEntry, SCHEMA_RAKED_ICONS, SCHEMA_RAKED_PAGES,
|
|
|
|
|
};
|
|
|
|
|
use quickpeep_utils::urls::get_reduced_domain;
|
|
|
|
|
use smartstring::alias::CompactString;
|
|
|
|
|
use std::path::PathBuf;
|
|
|
|
@ -40,6 +43,8 @@ pub async fn main() -> anyhow::Result<()> {
|
|
|
|
|
.unwrap_or_else(|| PathBuf::from("qp_indexer.toml"));
|
|
|
|
|
let config = IndexerConfig::load(&config_path).context("Failed to load config")?;
|
|
|
|
|
|
|
|
|
|
let icon_store = IconStore::open(config.icon_store.as_path())?;
|
|
|
|
|
|
|
|
|
|
let seed_files = find_seed_files(config.seed_dir.clone(), SEED_EXTENSION).await?;
|
|
|
|
|
let (seed_tx, seed_rx) = tokio::sync::mpsc::channel(64);
|
|
|
|
|
let handle = tokio::spawn(async move {
|
|
|
|
@ -62,46 +67,27 @@ pub async fn main() -> anyhow::Result<()> {
|
|
|
|
|
// TODO the decompressor has a buffer already, but we need this to see the end
|
|
|
|
|
let mut buf_reader = BufReader::new(decompressor);
|
|
|
|
|
let schema: String = serde_bare::from_reader(&mut buf_reader)?;
|
|
|
|
|
if &schema != SCHEMA_RAKED_PAGES {
|
|
|
|
|
bail!(
|
|
|
|
|
"Wrong schema version: wanted {:?}, got {:?}",
|
|
|
|
|
SCHEMA_RAKED_PAGES,
|
|
|
|
|
&schema
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// TODO(unstable): this condition is `.has_data_left()` but it's unstable.
|
|
|
|
|
while buf_reader.fill_buf().map(|b| !b.is_empty())? {
|
|
|
|
|
let page_record: PackRecord<RakedPageEntry> = serde_bare::from_reader(&mut buf_reader)?;
|
|
|
|
|
|
|
|
|
|
let document = page_record.record.document;
|
|
|
|
|
|
|
|
|
|
let article_body = DenseTree::generate_textual_format(&document.body_content, false);
|
|
|
|
|
let nonarticle_body =
|
|
|
|
|
DenseTree::generate_textual_format(&document.body_remainder, false);
|
|
|
|
|
|
|
|
|
|
let tags = seed_lookup
|
|
|
|
|
.look_up(&Url::parse(page_record.url.as_ref())?)?
|
|
|
|
|
.map(|seed: &Seed| seed.tags.iter().map(|cs| cs.to_string()).collect())
|
|
|
|
|
.unwrap_or_else(|| Vec::with_capacity(0));
|
|
|
|
|
|
|
|
|
|
// TODO Store the actual structure of the document in the store?
|
|
|
|
|
|
|
|
|
|
let favicon_url = document.head.effective_favicon_url();
|
|
|
|
|
let mut favicon_url_hash_long = [0u8; 8];
|
|
|
|
|
favicon_url_hash_long
|
|
|
|
|
.copy_from_slice(&blake3::hash(favicon_url.as_bytes()).as_bytes()[0..8]);
|
|
|
|
|
let favicon_url_hash = u64::from_le_bytes(favicon_url_hash_long);
|
|
|
|
|
|
|
|
|
|
indexer_backend.add_document(BackendIndependentDocument {
|
|
|
|
|
title: document.head.title,
|
|
|
|
|
article_body,
|
|
|
|
|
nonarticle_body,
|
|
|
|
|
// TODO populate tags & antifeatures
|
|
|
|
|
tags,
|
|
|
|
|
url: page_record.url.to_string(),
|
|
|
|
|
favicon_url_hash,
|
|
|
|
|
})?;
|
|
|
|
|
match schema.as_ref() {
|
|
|
|
|
SCHEMA_RAKED_PAGES => {
|
|
|
|
|
// TODO(unstable): this condition is `.has_data_left()` but it's unstable.
|
|
|
|
|
while buf_reader.fill_buf().map(|b| !b.is_empty())? {
|
|
|
|
|
handle_page_pack(&mut buf_reader, &seed_lookup, &mut indexer_backend)?;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
SCHEMA_RAKED_ICONS => {
|
|
|
|
|
// TODO(unstable): this condition is `.has_data_left()` but it's unstable.
|
|
|
|
|
while buf_reader.fill_buf().map(|b| !b.is_empty())? {
|
|
|
|
|
handle_icon_pack(&mut buf_reader, &icon_store)?;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
_ => {
|
|
|
|
|
bail!(
|
|
|
|
|
"Wrong schema version: wanted e.g. {:?}, got {:?}",
|
|
|
|
|
SCHEMA_RAKED_PAGES,
|
|
|
|
|
&schema
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
indexer_backend.flush()?;
|
|
|
|
@ -109,6 +95,67 @@ pub async fn main() -> anyhow::Result<()> {
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn handle_page_pack(
|
|
|
|
|
buf_reader: &mut impl BufRead,
|
|
|
|
|
seed_lookup: &SeedLookupTable,
|
|
|
|
|
indexer_backend: &mut Box<dyn Backend>,
|
|
|
|
|
) -> anyhow::Result<()> {
|
|
|
|
|
let page_record: PackRecord<RakedPageEntry> = serde_bare::from_reader(buf_reader)?;
|
|
|
|
|
|
|
|
|
|
let document = page_record.record.document;
|
|
|
|
|
|
|
|
|
|
let article_body = DenseTree::generate_textual_format(&document.body_content, false);
|
|
|
|
|
let nonarticle_body = DenseTree::generate_textual_format(&document.body_remainder, false);
|
|
|
|
|
|
|
|
|
|
let tags = seed_lookup
|
|
|
|
|
.look_up(&Url::parse(page_record.url.as_ref())?)?
|
|
|
|
|
.map(|seed: &Seed| seed.tags.iter().map(|cs| cs.to_string()).collect())
|
|
|
|
|
.unwrap_or_else(|| Vec::with_capacity(0));
|
|
|
|
|
|
|
|
|
|
// TODO Store the actual structure of the document in the store?
|
|
|
|
|
|
|
|
|
|
let favicon_url_relative = document.head.effective_favicon_url();
|
|
|
|
|
let favicon_url = Url::parse(page_record.url.as_ref())?.join(favicon_url_relative)?;
|
|
|
|
|
let favicon_url = favicon_url.as_str();
|
|
|
|
|
let mut favicon_url_hash_long = [0u8; 8];
|
|
|
|
|
favicon_url_hash_long.copy_from_slice(&blake3::hash(favicon_url.as_bytes()).as_bytes()[0..8]);
|
|
|
|
|
let favicon_url_hash = u64::from_le_bytes(favicon_url_hash_long);
|
|
|
|
|
|
|
|
|
|
indexer_backend.add_document(BackendIndependentDocument {
|
|
|
|
|
title: document.head.title,
|
|
|
|
|
article_body,
|
|
|
|
|
nonarticle_body,
|
|
|
|
|
// TODO populate tags & antifeatures
|
|
|
|
|
tags,
|
|
|
|
|
url: page_record.url.to_string(),
|
|
|
|
|
favicon_url_hash,
|
|
|
|
|
})?;
|
|
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn handle_icon_pack(
|
|
|
|
|
buf_reader: &mut impl BufRead,
|
|
|
|
|
icon_store: &IconStore,
|
|
|
|
|
) -> anyhow::Result<()> {
|
|
|
|
|
let page_record: PackRecord<IconEntry> = serde_bare::from_reader(buf_reader)?;
|
|
|
|
|
|
|
|
|
|
let webp_bytes = page_record.record.webp_bytes;
|
|
|
|
|
|
|
|
|
|
let mut favicon_url_hash_long = [0u8; 8];
|
|
|
|
|
favicon_url_hash_long
|
|
|
|
|
.copy_from_slice(&blake3::hash(page_record.url.as_ref().as_bytes()).as_bytes()[0..8]);
|
|
|
|
|
let favicon_url_hash = u64::from_le_bytes(favicon_url_hash_long);
|
|
|
|
|
|
|
|
|
|
icon_store.env.rw_txn(|txn| {
|
|
|
|
|
icon_store
|
|
|
|
|
.icons
|
|
|
|
|
.put(txn, &favicon_url_hash.to_le_bytes(), &webp_bytes)
|
|
|
|
|
})?;
|
|
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub struct SeedLookupTable {
|
|
|
|
|
pub by_prefix: PatriciaMap<Seed>,
|
|
|
|
|
pub by_reduced_domain: HashMap<CompactString, Seed>,
|
|
|
|
|