From 8df430c7f103fd78a2c1393ce7e8b4f4fc0e1c38 Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Sun, 20 Mar 2022 20:50:31 +0000 Subject: [PATCH] Load and parse seeds --- Cargo.lock | 1 + quickpeep_raker/Cargo.toml | 1 + quickpeep_raker/src/bin/qp-seedrake.rs | 48 ++++++++++++++++++++++---- 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 87eb6d1..8f07f2a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3034,6 +3034,7 @@ dependencies = [ "publicsuffix", "quickpeep_densedoc", "quickpeep_moz_readability", + "quickpeep_seed_parser", "quickpeep_structs", "quickpeep_utils", "rand 0.8.5", diff --git a/quickpeep_raker/Cargo.toml b/quickpeep_raker/Cargo.toml index 0f34960..b8b0b0c 100644 --- a/quickpeep_raker/Cargo.toml +++ b/quickpeep_raker/Cargo.toml @@ -9,6 +9,7 @@ edition = "2021" ### Subcrates quickpeep_moz_readability = { path = "../quickpeep_moz_readability" } quickpeep_densedoc = { path = "../quickpeep_densedoc" } +quickpeep_seed_parser = { path = "../quickpeep_seed_parser" } quickpeep_structs = { path = "../quickpeep_structs" } quickpeep_utils = { path = "../quickpeep_utils" } diff --git a/quickpeep_raker/src/bin/qp-seedrake.rs b/quickpeep_raker/src/bin/qp-seedrake.rs index 7d68541..1c312b5 100644 --- a/quickpeep_raker/src/bin/qp-seedrake.rs +++ b/quickpeep_raker/src/bin/qp-seedrake.rs @@ -3,15 +3,16 @@ use std::collections::BTreeSet; use env_logger::Env; -use anyhow::{bail, Context}; +use anyhow::{anyhow, bail, Context}; use arc_interner::ArcIntern; use smartstring::alias::CompactString; -use std::path::{PathBuf}; +use std::path::PathBuf; use tokio::sync::mpsc::{Receiver, Sender}; use quickpeep_raker::config; use quickpeep_raker::storage::RakerStore; +use quickpeep_seed_parser::parse_seeds; /// Seeds a raker's queue with URLs #[derive(Clone, Debug, Parser)] @@ -58,7 +59,7 @@ pub async fn main() -> anyhow::Result<()> { pub struct Seed { url: UrlOrUrlPattern, - tags: ArcIntern>, + tags: BTreeSet, } /// Either a URL or a URL prefix. @@ -71,14 +72,49 @@ pub enum UrlOrUrlPattern { /// Task that loads seeds from the filesystem async fn seed_loader(seed_files: Vec, send: &Sender) -> anyhow::Result<()> { for seed_file in seed_files { - // TODO parse the seed file - - // TODO send out seeds + // Parse the seed file and send out the seeds. + let seed_file_text = tokio::fs::read_to_string(&seed_file).await?; + match parse_seeds(&seed_file_text) { + Ok(seedblocks) => { + for seedblock in seedblocks { + for seed in seedblock.seeds { + let tags: BTreeSet = seedblock + .tags + .iter() + .chain(seed.extra_tags.iter()) + .cloned() + .collect(); + send.send(Seed { + url: seed_url_parse_pattern(seed.url), + tags, + }) + .await + .map_err(|_| anyhow!("Seed receiver shut down prematurely"))?; + } + } + } + Err(err) => { + eprintln!( + "~~~~~ Error in seed file ({:?}):\n{:?}\n~~~~~", + seed_file, err + ); + bail!("Failed to parse {:?}; see error above.", seed_file); + } + } } Ok(()) } +fn seed_url_parse_pattern(mut url: String) -> UrlOrUrlPattern { + if url.ends_with('*') { + url.pop(); + UrlOrUrlPattern::UrlPrefix(url) + } else { + UrlOrUrlPattern::Url(url) + } +} + async fn find_seed_files(seed_dir: PathBuf) -> anyhow::Result> { let mut dirs = vec![seed_dir]; let mut seedfiles = Vec::new();