Build seed lookup table
This commit is contained in:
parent
6f596b54dc
commit
0a955bb2f5
2
Cargo.lock
generated
2
Cargo.lock
generated
@ -3663,12 +3663,14 @@ dependencies = [
|
||||
"quickpeep_index",
|
||||
"quickpeep_seed_parser",
|
||||
"quickpeep_structs",
|
||||
"quickpeep_utils",
|
||||
"serde",
|
||||
"serde_bare",
|
||||
"serde_json",
|
||||
"smartstring",
|
||||
"tokio",
|
||||
"toml",
|
||||
"url",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
|
@ -16,6 +16,12 @@ serde_json = "1.0.79"
|
||||
toml = "0.5.8"
|
||||
clap = { version = "3.1.6", features = ["derive"] }
|
||||
colour = "0.6.0"
|
||||
url = "2.2.2"
|
||||
|
||||
smartstring = "1.0.1"
|
||||
|
||||
# Used for efficient lookup of seeds (URL prefixes)
|
||||
patricia_tree = "0.3.1"
|
||||
|
||||
# For decompression of emitted packs. 0.11.1+zstd.1.5.2
|
||||
zstd = "0.11.1"
|
||||
@ -24,3 +30,4 @@ quickpeep_densedoc = { path = "../quickpeep_densedoc" }
|
||||
quickpeep_index = { path = "../quickpeep_index" }
|
||||
quickpeep_structs = { path = "../quickpeep_structs" }
|
||||
quickpeep_seed_parser = { path = "../quickpeep_seed_parser" }
|
||||
quickpeep_utils = { path = "../quickpeep_utils" }
|
@ -2,15 +2,23 @@ use anyhow::{bail, Context};
|
||||
use clap::Parser;
|
||||
use colour::{blue, yellow_ln};
|
||||
use env_logger::Env;
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader};
|
||||
|
||||
use patricia_tree::PatriciaMap;
|
||||
use quickpeep_densedoc::DenseTree;
|
||||
use quickpeep_index::backend::BackendIndependentDocument;
|
||||
use quickpeep_indexer::config::IndexerConfig;
|
||||
use quickpeep_seed_parser::loader::{find_seed_files, seed_loader, SEED_EXTENSION};
|
||||
use quickpeep_seed_parser::loader::{
|
||||
find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION,
|
||||
};
|
||||
use quickpeep_structs::rake_entries::{PackRecord, RakedPageEntry, SCHEMA_RAKED_PAGES};
|
||||
use quickpeep_utils::urls::get_reduced_domain;
|
||||
use smartstring::alias::CompactString;
|
||||
use std::path::PathBuf;
|
||||
use tokio::sync::mpsc::Receiver;
|
||||
use url::Url;
|
||||
|
||||
/// Seeds a raker's queue with URLs
|
||||
#[derive(Clone, Debug, Parser)]
|
||||
@ -21,6 +29,11 @@ pub struct Opts {
|
||||
rakepacks: Vec<PathBuf>,
|
||||
}
|
||||
|
||||
pub struct SeedLookupTable {
|
||||
pub by_prefix: PatriciaMap<Seed>,
|
||||
pub by_reduced_domain: HashMap<CompactString, Seed>,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
pub async fn main() -> anyhow::Result<()> {
|
||||
env_logger::Builder::from_env(Env::default().default_filter_or("info,qp_indexer=debug")).init();
|
||||
@ -33,16 +46,13 @@ pub async fn main() -> anyhow::Result<()> {
|
||||
let config = IndexerConfig::load(&config_path).context("Failed to load config")?;
|
||||
|
||||
let seed_files = find_seed_files(config.seed_dir.clone(), SEED_EXTENSION).await?;
|
||||
let (seed_tx, mut seed_rx) = tokio::sync::mpsc::channel(64);
|
||||
let (seed_tx, seed_rx) = tokio::sync::mpsc::channel(64);
|
||||
let handle = tokio::spawn(async move {
|
||||
seed_loader(seed_files, &seed_tx).await?;
|
||||
Ok(()) as anyhow::Result<()>
|
||||
});
|
||||
|
||||
while let Some(seed) = seed_rx.recv().await {
|
||||
// TODO store this seed in an efficient structure for looking up...
|
||||
todo!();
|
||||
}
|
||||
let seed_lookup = build_seed_lookup_table(seed_rx).await?;
|
||||
|
||||
handle.await??;
|
||||
|
||||
@ -89,3 +99,29 @@ pub async fn main() -> anyhow::Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn build_seed_lookup_table(
|
||||
mut seed_rx: Receiver<Seed>,
|
||||
) -> anyhow::Result<SeedLookupTable> {
|
||||
let mut seed_lookup = SeedLookupTable {
|
||||
by_prefix: Default::default(),
|
||||
by_reduced_domain: HashMap::new(),
|
||||
};
|
||||
|
||||
while let Some(seed) = seed_rx.recv().await {
|
||||
match &seed.url {
|
||||
UrlOrUrlPattern::Url(url_str) => {
|
||||
let url = Url::parse(url_str)?;
|
||||
let reduced_domain = get_reduced_domain(&url)?;
|
||||
seed_lookup
|
||||
.by_reduced_domain
|
||||
.insert(reduced_domain.into(), seed);
|
||||
}
|
||||
UrlOrUrlPattern::UrlPrefix(url_str) => {
|
||||
seed_lookup.by_prefix.insert(url_str.clone(), seed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(seed_lookup)
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user