Load and parse seeds

This commit is contained in:
Olivier 'reivilibre' 2022-03-20 20:50:31 +00:00
parent 39aa4eb9b7
commit 8df430c7f1
3 changed files with 44 additions and 6 deletions

1
Cargo.lock generated
View File

@ -3034,6 +3034,7 @@ dependencies = [
"publicsuffix", "publicsuffix",
"quickpeep_densedoc", "quickpeep_densedoc",
"quickpeep_moz_readability", "quickpeep_moz_readability",
"quickpeep_seed_parser",
"quickpeep_structs", "quickpeep_structs",
"quickpeep_utils", "quickpeep_utils",
"rand 0.8.5", "rand 0.8.5",

View File

@ -9,6 +9,7 @@ edition = "2021"
### Subcrates ### Subcrates
quickpeep_moz_readability = { path = "../quickpeep_moz_readability" } quickpeep_moz_readability = { path = "../quickpeep_moz_readability" }
quickpeep_densedoc = { path = "../quickpeep_densedoc" } quickpeep_densedoc = { path = "../quickpeep_densedoc" }
quickpeep_seed_parser = { path = "../quickpeep_seed_parser" }
quickpeep_structs = { path = "../quickpeep_structs" } quickpeep_structs = { path = "../quickpeep_structs" }
quickpeep_utils = { path = "../quickpeep_utils" } quickpeep_utils = { path = "../quickpeep_utils" }

View File

@ -3,15 +3,16 @@ use std::collections::BTreeSet;
use env_logger::Env; use env_logger::Env;
use anyhow::{bail, Context}; use anyhow::{anyhow, bail, Context};
use arc_interner::ArcIntern; use arc_interner::ArcIntern;
use smartstring::alias::CompactString; use smartstring::alias::CompactString;
use std::path::{PathBuf}; use std::path::PathBuf;
use tokio::sync::mpsc::{Receiver, Sender}; use tokio::sync::mpsc::{Receiver, Sender};
use quickpeep_raker::config; use quickpeep_raker::config;
use quickpeep_raker::storage::RakerStore; use quickpeep_raker::storage::RakerStore;
use quickpeep_seed_parser::parse_seeds;
/// Seeds a raker's queue with URLs /// Seeds a raker's queue with URLs
#[derive(Clone, Debug, Parser)] #[derive(Clone, Debug, Parser)]
@ -58,7 +59,7 @@ pub async fn main() -> anyhow::Result<()> {
/// A single seed: a URL (or URL prefix) together with the tags that apply to it.
pub struct Seed {
    /// The seed's URL, or a URL prefix.
    url: UrlOrUrlPattern,
    /// The set of tags attached to this seed.
    tags: BTreeSet<CompactString>,
}
/// Either a URL or a URL prefix. /// Either a URL or a URL prefix.
@ -71,14 +72,49 @@ pub enum UrlOrUrlPattern {
/// Task that loads seeds from the filesystem /// Task that loads seeds from the filesystem
async fn seed_loader(seed_files: Vec<PathBuf>, send: &Sender<Seed>) -> anyhow::Result<()> { async fn seed_loader(seed_files: Vec<PathBuf>, send: &Sender<Seed>) -> anyhow::Result<()> {
for seed_file in seed_files { for seed_file in seed_files {
// TODO parse the seed file // Parse the seed file and send out the seeds.
let seed_file_text = tokio::fs::read_to_string(&seed_file).await?;
// TODO send out seeds match parse_seeds(&seed_file_text) {
Ok(seedblocks) => {
for seedblock in seedblocks {
for seed in seedblock.seeds {
let tags: BTreeSet<CompactString> = seedblock
.tags
.iter()
.chain(seed.extra_tags.iter())
.cloned()
.collect();
send.send(Seed {
url: seed_url_parse_pattern(seed.url),
tags,
})
.await
.map_err(|_| anyhow!("Seed receiver shut down prematurely"))?;
}
}
}
Err(err) => {
eprintln!(
"~~~~~ Error in seed file ({:?}):\n{:?}\n~~~~~",
seed_file, err
);
bail!("Failed to parse {:?}; see error above.", seed_file);
}
}
} }
Ok(()) Ok(())
} }
/// Classifies a seed URL string as either an exact URL or a URL prefix.
///
/// A string ending in `*` is treated as a prefix pattern: exactly one trailing `*` is removed
/// and the remainder becomes a `UrlPrefix`. Any other string is returned unchanged as a `Url`.
fn seed_url_parse_pattern(mut url: String) -> UrlOrUrlPattern {
    // Only the length of the stripped prefix is kept, so the borrow of `url` ends before
    // the string is shortened in place.
    match url.strip_suffix('*').map(str::len) {
        Some(prefix_len) => {
            url.truncate(prefix_len);
            UrlOrUrlPattern::UrlPrefix(url)
        }
        None => UrlOrUrlPattern::Url(url),
    }
}
async fn find_seed_files(seed_dir: PathBuf) -> anyhow::Result<Vec<PathBuf>> { async fn find_seed_files(seed_dir: PathBuf) -> anyhow::Result<Vec<PathBuf>> {
let mut dirs = vec![seed_dir]; let mut dirs = vec![seed_dir];
let mut seedfiles = Vec::new(); let mut seedfiles = Vec::new();