From 4665bfd3a359d905f023d4ba610c53550bcb50b9 Mon Sep 17 00:00:00 2001 From: Olivier Date: Sat, 26 Mar 2022 17:55:19 +0000 Subject: [PATCH] Load seeds in the indexer --- Cargo.lock | 1 + quickpeep_indexer/Cargo.toml | 1 + quickpeep_indexer/src/bin/qp-indexer.rs | 18 +++++++++++++++++- quickpeep_raker/src/bin/qp-seedrake.rs | 7 +++---- quickpeep_seed_parser/src/loader.rs | 3 +++ 5 files changed, 25 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 619a444..e8dd65a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3483,6 +3483,7 @@ dependencies = [ "log", "quickpeep_densedoc", "quickpeep_index", + "quickpeep_seed_parser", "quickpeep_structs", "serde", "serde_bare", diff --git a/quickpeep_indexer/Cargo.toml b/quickpeep_indexer/Cargo.toml index c8112ad..ef8573c 100644 --- a/quickpeep_indexer/Cargo.toml +++ b/quickpeep_indexer/Cargo.toml @@ -23,3 +23,4 @@ zstd = "0.11.1" quickpeep_densedoc = { path = "../quickpeep_densedoc" } quickpeep_index = { path = "../quickpeep_index" } quickpeep_structs = { path = "../quickpeep_structs" } +quickpeep_seed_parser = { path = "../quickpeep_seed_parser" } diff --git a/quickpeep_indexer/src/bin/qp-indexer.rs b/quickpeep_indexer/src/bin/qp-indexer.rs index 4d94cdf..1a42e16 100644 --- a/quickpeep_indexer/src/bin/qp-indexer.rs +++ b/quickpeep_indexer/src/bin/qp-indexer.rs @@ -8,6 +8,7 @@ use std::io::{BufRead, BufReader}; use quickpeep_densedoc::DenseTree; use quickpeep_index::backend::BackendIndependentDocument; use quickpeep_indexer::config::IndexerConfig; +use quickpeep_seed_parser::loader::{find_seed_files, seed_loader, SEED_EXTENSION}; use quickpeep_structs::rake_entries::{PackRecord, RakedPageEntry, SCHEMA_RAKED_PAGES}; use std::path::PathBuf; @@ -20,7 +21,8 @@ pub struct Opts { rakepacks: Vec, } -pub fn main() -> anyhow::Result<()> { +#[tokio::main] +pub async fn main() -> anyhow::Result<()> { env_logger::Builder::from_env(Env::default().default_filter_or("info,qp_indexer=debug")).init(); let opts: Opts = Opts::parse(); @@ -30,6 +32,20 @@ pub fn main() -> anyhow::Result<()> { .unwrap_or_else(|| PathBuf::from("qp_indexer.toml")); let config = IndexerConfig::load(&config_path).context("Failed to load config")?; + let seed_files = find_seed_files(config.seed_dir.clone(), SEED_EXTENSION).await?; + let (seed_tx, mut seed_rx) = tokio::sync::mpsc::channel(64); + let handle = tokio::spawn(async move { + seed_loader(seed_files, &seed_tx).await?; + Ok(()) as anyhow::Result<()> + }); + + while let Some(seed) = seed_rx.recv().await { + // TODO store this seed in an efficient structure for looking up... + todo!(); + } + + handle.await??; + let mut indexer_backend = config.open_indexer_backend()?; for pack in opts.rakepacks { diff --git a/quickpeep_raker/src/bin/qp-seedrake.rs b/quickpeep_raker/src/bin/qp-seedrake.rs index 2fad0a7..6c6bda1 100644 --- a/quickpeep_raker/src/bin/qp-seedrake.rs +++ b/quickpeep_raker/src/bin/qp-seedrake.rs @@ -17,12 +17,11 @@ use quickpeep_raker::raking::analysis::get_reduced_domain; use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent}; use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord}; use quickpeep_raker::storage::{maintenance, RakerStore}; -use quickpeep_seed_parser::loader::{find_seed_files, seed_loader, Seed, UrlOrUrlPattern}; +use quickpeep_seed_parser::loader::{ + find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION, WEED_EXTENSION, +}; use quickpeep_utils::dirty::DirtyTracker; -pub const SEED_EXTENSION: &'static str = ".seed"; -pub const WEED_EXTENSION: &'static str = ".weed"; - /// Seeds a raker's queue with URLs #[derive(Clone, Debug, Parser)] pub struct Opts { diff --git a/quickpeep_seed_parser/src/loader.rs b/quickpeep_seed_parser/src/loader.rs index 7657ecf..a5dcb9e 100644 --- a/quickpeep_seed_parser/src/loader.rs +++ b/quickpeep_seed_parser/src/loader.rs @@ -5,6 +5,9 @@ use std::ffi::OsStr; use std::path::PathBuf; use tokio::sync::mpsc::Sender; +pub const SEED_EXTENSION: &'static str = ".seed"; +pub const WEED_EXTENSION: &'static str = ".weed"; + pub struct Seed { pub url: UrlOrUrlPattern, // TODO(later) These make more sense at the indexer stage. tags: BTreeSet,