Build seed lookup table

This commit is contained in:
Olivier 'reivilibre' 2022-03-27 20:53:29 +01:00
parent 6f596b54dc
commit 0a955bb2f5
3 changed files with 51 additions and 6 deletions

2
Cargo.lock generated
View File

@ -3663,12 +3663,14 @@ dependencies = [
"quickpeep_index",
"quickpeep_seed_parser",
"quickpeep_structs",
"quickpeep_utils",
"serde",
"serde_bare",
"serde_json",
"smartstring",
"tokio",
"toml",
"url",
"zstd",
]

View File

@ -16,6 +16,12 @@ serde_json = "1.0.79"
toml = "0.5.8"
clap = { version = "3.1.6", features = ["derive"] }
colour = "0.6.0"
url = "2.2.2"
smartstring = "1.0.1"
# Used for efficient lookup of seeds (URL prefixes)
patricia_tree = "0.3.1"
# For decompression of emitted packs. 0.11.1+zstd.1.5.2
zstd = "0.11.1"
@ -24,3 +30,4 @@ quickpeep_densedoc = { path = "../quickpeep_densedoc" }
quickpeep_index = { path = "../quickpeep_index" }
quickpeep_structs = { path = "../quickpeep_structs" }
quickpeep_seed_parser = { path = "../quickpeep_seed_parser" }
quickpeep_utils = { path = "../quickpeep_utils" }

View File

@ -2,15 +2,23 @@ use anyhow::{bail, Context};
use clap::Parser;
use colour::{blue, yellow_ln};
use env_logger::Env;
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufRead, BufReader};
use patricia_tree::PatriciaMap;
use quickpeep_densedoc::DenseTree;
use quickpeep_index::backend::BackendIndependentDocument;
use quickpeep_indexer::config::IndexerConfig;
use quickpeep_seed_parser::loader::{find_seed_files, seed_loader, SEED_EXTENSION};
use quickpeep_seed_parser::loader::{
find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION,
};
use quickpeep_structs::rake_entries::{PackRecord, RakedPageEntry, SCHEMA_RAKED_PAGES};
use quickpeep_utils::urls::get_reduced_domain;
use smartstring::alias::CompactString;
use std::path::PathBuf;
use tokio::sync::mpsc::Receiver;
use url::Url;
/// Seeds a raker's queue with URLs
#[derive(Clone, Debug, Parser)]
@ -21,6 +29,11 @@ pub struct Opts {
rakepacks: Vec<PathBuf>,
}
/// Lookup structure for seeds, populated by `build_seed_lookup_table`.
///
/// Seeds are split across two maps depending on whether the seed was
/// specified as a URL prefix or as a plain URL.
pub struct SeedLookupTable {
/// Seeds specified as URL prefixes, keyed by the prefix string.
pub by_prefix: PatriciaMap<Seed>,
/// Seeds specified as plain URLs, keyed by the reduced domain of that URL
/// (as computed by `get_reduced_domain`).
pub by_reduced_domain: HashMap<CompactString, Seed>,
}
#[tokio::main]
pub async fn main() -> anyhow::Result<()> {
env_logger::Builder::from_env(Env::default().default_filter_or("info,qp_indexer=debug")).init();
@ -33,16 +46,13 @@ pub async fn main() -> anyhow::Result<()> {
let config = IndexerConfig::load(&config_path).context("Failed to load config")?;
let seed_files = find_seed_files(config.seed_dir.clone(), SEED_EXTENSION).await?;
let (seed_tx, mut seed_rx) = tokio::sync::mpsc::channel(64);
let (seed_tx, seed_rx) = tokio::sync::mpsc::channel(64);
let handle = tokio::spawn(async move {
seed_loader(seed_files, &seed_tx).await?;
Ok(()) as anyhow::Result<()>
});
while let Some(seed) = seed_rx.recv().await {
// TODO store this seed in an efficient structure for looking up...
todo!();
}
let seed_lookup = build_seed_lookup_table(seed_rx).await?;
handle.await??;
@ -89,3 +99,29 @@ pub async fn main() -> anyhow::Result<()> {
Ok(())
}
/// Drains `seed_rx` and indexes every received seed into a [`SeedLookupTable`].
///
/// * A seed whose URL is a `UrlOrUrlPattern::Url` is parsed and stored in
///   `by_reduced_domain`, keyed by the result of `get_reduced_domain`.
/// * A seed whose URL is a `UrlOrUrlPattern::UrlPrefix` is stored in
///   `by_prefix`, keyed by the prefix string itself.
///
/// If two seeds map to the same key, the later one replaces the earlier one.
///
/// The function returns once the channel yields `None`, i.e. once every
/// sender has been dropped and all queued seeds have been consumed.
///
/// # Errors
///
/// Returns an error, annotated with the offending seed URL, if a seed URL
/// cannot be parsed or if its reduced domain cannot be determined.
pub async fn build_seed_lookup_table(
    mut seed_rx: Receiver<Seed>,
) -> anyhow::Result<SeedLookupTable> {
    let mut seed_lookup = SeedLookupTable {
        by_prefix: Default::default(),
        by_reduced_domain: HashMap::new(),
    };

    while let Some(seed) = seed_rx.recv().await {
        match &seed.url {
            UrlOrUrlPattern::Url(url_str) => {
                // Attach the seed URL to any failure so that a single bad seed
                // can be tracked down among many seed files.
                let url = Url::parse(url_str)
                    .with_context(|| format!("Failed to parse seed URL {:?}", url_str))?;
                let reduced_domain = get_reduced_domain(&url).with_context(|| {
                    format!("Failed to get reduced domain of seed URL {:?}", url_str)
                })?;
                seed_lookup
                    .by_reduced_domain
                    .insert(reduced_domain.into(), seed);
            }
            UrlOrUrlPattern::UrlPrefix(url_str) => {
                // The key must be cloned: `seed` (which owns `url_str`) is
                // moved into the map as the value.
                seed_lookup.by_prefix.insert(url_str.clone(), seed);
            }
        }
    }

    Ok(seed_lookup)
}