Load and parse seeds
This commit is contained in:
parent
39aa4eb9b7
commit
8df430c7f1
|
@ -3034,6 +3034,7 @@ dependencies = [
|
||||||
"publicsuffix",
|
"publicsuffix",
|
||||||
"quickpeep_densedoc",
|
"quickpeep_densedoc",
|
||||||
"quickpeep_moz_readability",
|
"quickpeep_moz_readability",
|
||||||
|
"quickpeep_seed_parser",
|
||||||
"quickpeep_structs",
|
"quickpeep_structs",
|
||||||
"quickpeep_utils",
|
"quickpeep_utils",
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
|
|
|
@ -9,6 +9,7 @@ edition = "2021"
|
||||||
### Subcrates
|
### Subcrates
|
||||||
quickpeep_moz_readability = { path = "../quickpeep_moz_readability" }
|
quickpeep_moz_readability = { path = "../quickpeep_moz_readability" }
|
||||||
quickpeep_densedoc = { path = "../quickpeep_densedoc" }
|
quickpeep_densedoc = { path = "../quickpeep_densedoc" }
|
||||||
|
quickpeep_seed_parser = { path = "../quickpeep_seed_parser" }
|
||||||
quickpeep_structs = { path = "../quickpeep_structs" }
|
quickpeep_structs = { path = "../quickpeep_structs" }
|
||||||
quickpeep_utils = { path = "../quickpeep_utils" }
|
quickpeep_utils = { path = "../quickpeep_utils" }
|
||||||
|
|
||||||
|
|
|
@ -3,15 +3,16 @@ use std::collections::BTreeSet;
|
||||||
|
|
||||||
use env_logger::Env;
|
use env_logger::Env;
|
||||||
|
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{anyhow, bail, Context};
|
||||||
use arc_interner::ArcIntern;
|
use arc_interner::ArcIntern;
|
||||||
use smartstring::alias::CompactString;
|
use smartstring::alias::CompactString;
|
||||||
|
|
||||||
use std::path::{PathBuf};
|
use std::path::PathBuf;
|
||||||
use tokio::sync::mpsc::{Receiver, Sender};
|
use tokio::sync::mpsc::{Receiver, Sender};
|
||||||
|
|
||||||
use quickpeep_raker::config;
|
use quickpeep_raker::config;
|
||||||
use quickpeep_raker::storage::RakerStore;
|
use quickpeep_raker::storage::RakerStore;
|
||||||
|
use quickpeep_seed_parser::parse_seeds;
|
||||||
|
|
||||||
/// Seeds a raker's queue with URLs
|
/// Seeds a raker's queue with URLs
|
||||||
#[derive(Clone, Debug, Parser)]
|
#[derive(Clone, Debug, Parser)]
|
||||||
|
@ -58,7 +59,7 @@ pub async fn main() -> anyhow::Result<()> {
|
||||||
|
|
||||||
pub struct Seed {
|
pub struct Seed {
|
||||||
url: UrlOrUrlPattern,
|
url: UrlOrUrlPattern,
|
||||||
tags: ArcIntern<BTreeSet<CompactString>>,
|
tags: BTreeSet<CompactString>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Either a URL or a URL prefix.
|
/// Either a URL or a URL prefix.
|
||||||
|
@ -71,14 +72,49 @@ pub enum UrlOrUrlPattern {
|
||||||
/// Task that loads seeds from the filesystem
|
/// Task that loads seeds from the filesystem
|
||||||
async fn seed_loader(seed_files: Vec<PathBuf>, send: &Sender<Seed>) -> anyhow::Result<()> {
|
async fn seed_loader(seed_files: Vec<PathBuf>, send: &Sender<Seed>) -> anyhow::Result<()> {
|
||||||
for seed_file in seed_files {
|
for seed_file in seed_files {
|
||||||
// TODO parse the seed file
|
// Parse the seed file and send out the seeds.
|
||||||
|
let seed_file_text = tokio::fs::read_to_string(&seed_file).await?;
|
||||||
// TODO send out seeds
|
match parse_seeds(&seed_file_text) {
|
||||||
|
Ok(seedblocks) => {
|
||||||
|
for seedblock in seedblocks {
|
||||||
|
for seed in seedblock.seeds {
|
||||||
|
let tags: BTreeSet<CompactString> = seedblock
|
||||||
|
.tags
|
||||||
|
.iter()
|
||||||
|
.chain(seed.extra_tags.iter())
|
||||||
|
.cloned()
|
||||||
|
.collect();
|
||||||
|
send.send(Seed {
|
||||||
|
url: seed_url_parse_pattern(seed.url),
|
||||||
|
tags,
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.map_err(|_| anyhow!("Seed receiver shut down prematurely"))?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
eprintln!(
|
||||||
|
"~~~~~ Error in seed file ({:?}):\n{:?}\n~~~~~",
|
||||||
|
seed_file, err
|
||||||
|
);
|
||||||
|
bail!("Failed to parse {:?}; see error above.", seed_file);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn seed_url_parse_pattern(mut url: String) -> UrlOrUrlPattern {
|
||||||
|
if url.ends_with('*') {
|
||||||
|
url.pop();
|
||||||
|
UrlOrUrlPattern::UrlPrefix(url)
|
||||||
|
} else {
|
||||||
|
UrlOrUrlPattern::Url(url)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async fn find_seed_files(seed_dir: PathBuf) -> anyhow::Result<Vec<PathBuf>> {
|
async fn find_seed_files(seed_dir: PathBuf) -> anyhow::Result<Vec<PathBuf>> {
|
||||||
let mut dirs = vec![seed_dir];
|
let mut dirs = vec![seed_dir];
|
||||||
let mut seedfiles = Vec::new();
|
let mut seedfiles = Vec::new();
|
||||||
|
|
Loading…
Reference in New Issue