Load and parse seeds
This commit is contained in:
parent
39aa4eb9b7
commit
8df430c7f1
|
@ -3034,6 +3034,7 @@ dependencies = [
|
|||
"publicsuffix",
|
||||
"quickpeep_densedoc",
|
||||
"quickpeep_moz_readability",
|
||||
"quickpeep_seed_parser",
|
||||
"quickpeep_structs",
|
||||
"quickpeep_utils",
|
||||
"rand 0.8.5",
|
||||
|
|
|
@ -9,6 +9,7 @@ edition = "2021"
|
|||
### Subcrates
|
||||
quickpeep_moz_readability = { path = "../quickpeep_moz_readability" }
|
||||
quickpeep_densedoc = { path = "../quickpeep_densedoc" }
|
||||
quickpeep_seed_parser = { path = "../quickpeep_seed_parser" }
|
||||
quickpeep_structs = { path = "../quickpeep_structs" }
|
||||
quickpeep_utils = { path = "../quickpeep_utils" }
|
||||
|
||||
|
|
|
@ -3,15 +3,16 @@ use std::collections::BTreeSet;
|
|||
|
||||
use env_logger::Env;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use arc_interner::ArcIntern;
|
||||
use smartstring::alias::CompactString;
|
||||
|
||||
use std::path::{PathBuf};
|
||||
use std::path::PathBuf;
|
||||
use tokio::sync::mpsc::{Receiver, Sender};
|
||||
|
||||
use quickpeep_raker::config;
|
||||
use quickpeep_raker::storage::RakerStore;
|
||||
use quickpeep_seed_parser::parse_seeds;
|
||||
|
||||
/// Seeds a raker's queue with URLs
|
||||
#[derive(Clone, Debug, Parser)]
|
||||
|
@ -58,7 +59,7 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
|
||||
pub struct Seed {
|
||||
url: UrlOrUrlPattern,
|
||||
tags: ArcIntern<BTreeSet<CompactString>>,
|
||||
tags: BTreeSet<CompactString>,
|
||||
}
|
||||
|
||||
/// Either a URL or a URL prefix.
|
||||
|
@ -71,14 +72,49 @@ pub enum UrlOrUrlPattern {
|
|||
/// Task that loads seeds from the filesystem
|
||||
async fn seed_loader(seed_files: Vec<PathBuf>, send: &Sender<Seed>) -> anyhow::Result<()> {
|
||||
for seed_file in seed_files {
|
||||
// TODO parse the seed file
|
||||
|
||||
// TODO send out seeds
|
||||
// Parse the seed file and send out the seeds.
|
||||
let seed_file_text = tokio::fs::read_to_string(&seed_file).await?;
|
||||
match parse_seeds(&seed_file_text) {
|
||||
Ok(seedblocks) => {
|
||||
for seedblock in seedblocks {
|
||||
for seed in seedblock.seeds {
|
||||
let tags: BTreeSet<CompactString> = seedblock
|
||||
.tags
|
||||
.iter()
|
||||
.chain(seed.extra_tags.iter())
|
||||
.cloned()
|
||||
.collect();
|
||||
send.send(Seed {
|
||||
url: seed_url_parse_pattern(seed.url),
|
||||
tags,
|
||||
})
|
||||
.await
|
||||
.map_err(|_| anyhow!("Seed receiver shut down prematurely"))?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
eprintln!(
|
||||
"~~~~~ Error in seed file ({:?}):\n{:?}\n~~~~~",
|
||||
seed_file, err
|
||||
);
|
||||
bail!("Failed to parse {:?}; see error above.", seed_file);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn seed_url_parse_pattern(mut url: String) -> UrlOrUrlPattern {
|
||||
if url.ends_with('*') {
|
||||
url.pop();
|
||||
UrlOrUrlPattern::UrlPrefix(url)
|
||||
} else {
|
||||
UrlOrUrlPattern::Url(url)
|
||||
}
|
||||
}
|
||||
|
||||
async fn find_seed_files(seed_dir: PathBuf) -> anyhow::Result<Vec<PathBuf>> {
|
||||
let mut dirs = vec![seed_dir];
|
||||
let mut seedfiles = Vec::new();
|
||||
|
|
Loading…
Reference in New Issue