Load seeds in the indexer
Some checks failed
continuous-integration/drone the build failed

This commit is contained in:
Olivier 'reivilibre' 2022-03-26 17:55:19 +00:00
parent f884324648
commit 4665bfd3a3
5 changed files with 25 additions and 5 deletions

1
Cargo.lock generated
View File

@ -3483,6 +3483,7 @@ dependencies = [
"log", "log",
"quickpeep_densedoc", "quickpeep_densedoc",
"quickpeep_index", "quickpeep_index",
"quickpeep_seed_parser",
"quickpeep_structs", "quickpeep_structs",
"serde", "serde",
"serde_bare", "serde_bare",

View File

@ -23,3 +23,4 @@ zstd = "0.11.1"
quickpeep_densedoc = { path = "../quickpeep_densedoc" } quickpeep_densedoc = { path = "../quickpeep_densedoc" }
quickpeep_index = { path = "../quickpeep_index" } quickpeep_index = { path = "../quickpeep_index" }
quickpeep_structs = { path = "../quickpeep_structs" } quickpeep_structs = { path = "../quickpeep_structs" }
quickpeep_seed_parser = { path = "../quickpeep_seed_parser" }

View File

@ -8,6 +8,7 @@ use std::io::{BufRead, BufReader};
use quickpeep_densedoc::DenseTree; use quickpeep_densedoc::DenseTree;
use quickpeep_index::backend::BackendIndependentDocument; use quickpeep_index::backend::BackendIndependentDocument;
use quickpeep_indexer::config::IndexerConfig; use quickpeep_indexer::config::IndexerConfig;
use quickpeep_seed_parser::loader::{find_seed_files, seed_loader, SEED_EXTENSION};
use quickpeep_structs::rake_entries::{PackRecord, RakedPageEntry, SCHEMA_RAKED_PAGES}; use quickpeep_structs::rake_entries::{PackRecord, RakedPageEntry, SCHEMA_RAKED_PAGES};
use std::path::PathBuf; use std::path::PathBuf;
@ -20,7 +21,8 @@ pub struct Opts {
rakepacks: Vec<PathBuf>, rakepacks: Vec<PathBuf>,
} }
pub fn main() -> anyhow::Result<()> { #[tokio::main]
pub async fn main() -> anyhow::Result<()> {
env_logger::Builder::from_env(Env::default().default_filter_or("info,qp_indexer=debug")).init(); env_logger::Builder::from_env(Env::default().default_filter_or("info,qp_indexer=debug")).init();
let opts: Opts = Opts::parse(); let opts: Opts = Opts::parse();
@ -30,6 +32,20 @@ pub fn main() -> anyhow::Result<()> {
.unwrap_or_else(|| PathBuf::from("qp_indexer.toml")); .unwrap_or_else(|| PathBuf::from("qp_indexer.toml"));
let config = IndexerConfig::load(&config_path).context("Failed to load config")?; let config = IndexerConfig::load(&config_path).context("Failed to load config")?;
let seed_files = find_seed_files(config.seed_dir.clone(), SEED_EXTENSION).await?;
let (seed_tx, mut seed_rx) = tokio::sync::mpsc::channel(64);
let handle = tokio::spawn(async move {
seed_loader(seed_files, &seed_tx).await?;
Ok(()) as anyhow::Result<()>
});
while let Some(seed) = seed_rx.recv().await {
// TODO store this seed in an efficient structure for looking up...
todo!();
}
handle.await??;
let mut indexer_backend = config.open_indexer_backend()?; let mut indexer_backend = config.open_indexer_backend()?;
for pack in opts.rakepacks { for pack in opts.rakepacks {

View File

@ -17,12 +17,11 @@ use quickpeep_raker::raking::analysis::get_reduced_domain;
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent}; use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord}; use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord};
use quickpeep_raker::storage::{maintenance, RakerStore}; use quickpeep_raker::storage::{maintenance, RakerStore};
use quickpeep_seed_parser::loader::{find_seed_files, seed_loader, Seed, UrlOrUrlPattern}; use quickpeep_seed_parser::loader::{
find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION, WEED_EXTENSION,
};
use quickpeep_utils::dirty::DirtyTracker; use quickpeep_utils::dirty::DirtyTracker;
pub const SEED_EXTENSION: &'static str = ".seed";
pub const WEED_EXTENSION: &'static str = ".weed";
/// Seeds a raker's queue with URLs /// Seeds a raker's queue with URLs
#[derive(Clone, Debug, Parser)] #[derive(Clone, Debug, Parser)]
pub struct Opts { pub struct Opts {

View File

@ -5,6 +5,9 @@ use std::ffi::OsStr;
use std::path::PathBuf; use std::path::PathBuf;
use tokio::sync::mpsc::Sender; use tokio::sync::mpsc::Sender;
pub const SEED_EXTENSION: &'static str = ".seed";
pub const WEED_EXTENSION: &'static str = ".weed";
pub struct Seed { pub struct Seed {
pub url: UrlOrUrlPattern, pub url: UrlOrUrlPattern,
// TODO(later) These make more sense at the indexer stage. tags: BTreeSet<CompactString>, // TODO(later) These make more sense at the indexer stage. tags: BTreeSet<CompactString>,