Factor out recursive loader

This commit is contained in:
Olivier 'reivilibre' 2022-03-26 17:45:31 +00:00
parent f4672181aa
commit f884324648
5 changed files with 122 additions and 111 deletions

2
Cargo.lock generated
View File

@ -3560,9 +3560,11 @@ name = "quickpeep_seed_parser"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"log",
"pest", "pest",
"pest_derive", "pest_derive",
"smartstring", "smartstring",
"tokio",
] ]
[[package]] [[package]]

View File

@ -1,17 +1,15 @@
use clap::Parser; use clap::Parser;
use std::borrow::{Borrow, BorrowMut}; use std::borrow::{Borrow, BorrowMut};
use std::ffi::OsStr;
use env_logger::Env; use env_logger::Env;
use anyhow::{anyhow, bail, Context}; use anyhow::{bail, Context};
use colour::{dark_green_ln, dark_red_ln, dark_yellow, green, red, yellow_ln}; use colour::{dark_green_ln, dark_red_ln, dark_yellow, green, red, yellow_ln};
use log::warn;
use reqwest::{Client, Url}; use reqwest::{Client, Url};
use std::path::PathBuf; use std::path::PathBuf;
use tokio::sync::mpsc; use tokio::sync::mpsc;
use tokio::sync::mpsc::{Receiver, Sender}; use tokio::sync::mpsc::Receiver;
use quickpeep_raker::config; use quickpeep_raker::config;
use quickpeep_raker::config::RakerConfig; use quickpeep_raker::config::RakerConfig;
@ -19,7 +17,7 @@ use quickpeep_raker::raking::analysis::get_reduced_domain;
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent}; use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord}; use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord};
use quickpeep_raker::storage::{maintenance, RakerStore}; use quickpeep_raker::storage::{maintenance, RakerStore};
use quickpeep_seed_parser::parse_seeds; use quickpeep_seed_parser::loader::{find_seed_files, seed_loader, Seed, UrlOrUrlPattern};
use quickpeep_utils::dirty::DirtyTracker; use quickpeep_utils::dirty::DirtyTracker;
pub const SEED_EXTENSION: &'static str = ".seed"; pub const SEED_EXTENSION: &'static str = ".seed";
@ -124,112 +122,6 @@ pub async fn import_weeds(store: RakerStore, config: &RakerConfig) -> anyhow::Re
Ok(()) Ok(())
} }
pub struct Seed {
url: UrlOrUrlPattern,
// TODO(later) These make more sense at the indexer stage. tags: BTreeSet<CompactString>,
}
/// Either a URL or a URL prefix.
#[derive(Clone, Debug)]
pub enum UrlOrUrlPattern {
Url(String),
UrlPrefix(String),
}
impl UrlOrUrlPattern {
pub fn as_str(&self) -> &str {
match self {
UrlOrUrlPattern::Url(url) => url.as_str(),
UrlOrUrlPattern::UrlPrefix(url_prefix) => url_prefix.as_str(),
}
}
}
/// Task that loads seeds from the filesystem
async fn seed_loader(seed_files: Vec<PathBuf>, send: &Sender<Seed>) -> anyhow::Result<()> {
for seed_file in seed_files {
// Parse the seed file and send out the seeds.
let seed_file_text = tokio::fs::read_to_string(&seed_file).await?;
match parse_seeds(&seed_file_text) {
Ok(seedblocks) => {
for seedblock in seedblocks {
for seed in seedblock.seeds {
/*
let tags: BTreeSet<CompactString> = seedblock
.tags
.iter()
.chain(seed.extra_tags.iter())
.cloned()
.collect();
*/
send.send(Seed {
url: seed_url_parse_pattern(seed.url),
// tags,
})
.await
.map_err(|_| anyhow!("Seed receiver shut down prematurely"))?;
}
}
}
Err(err) => {
eprintln!(
"~~~~~ Error in seed file ({:?}):\n{:?}\n~~~~~",
seed_file, err
);
bail!("Failed to parse {:?}; see error above.", seed_file);
}
}
}
Ok(())
}
fn seed_url_parse_pattern(mut url: String) -> UrlOrUrlPattern {
if url.ends_with('*') {
url.pop();
UrlOrUrlPattern::UrlPrefix(url)
} else {
UrlOrUrlPattern::Url(url)
}
}
async fn find_seed_files(seed_dir: PathBuf, extension: &str) -> anyhow::Result<Vec<PathBuf>> {
let mut dirs = vec![seed_dir];
let mut seedfiles = Vec::new();
while let Some(dir_to_scan) = dirs.pop() {
let mut dir = tokio::fs::read_dir(&dir_to_scan).await?;
while let Some(entry) = dir.next_entry().await? {
let path = entry.path();
let file_name = match path
.file_name()
.map(|osstr: &OsStr| osstr.to_str())
.flatten()
{
None => {
warn!("Skipping non-UTF-8 name.");
continue;
}
Some(file_name) => file_name,
};
if file_name.starts_with(".") {
continue;
}
if file_name.ends_with(extension) {
seedfiles.push(path);
continue;
}
if path.is_dir() {
// Recurse into this directory later.
dirs.push(path);
}
}
}
Ok(seedfiles)
}
const BATCH_SIZE: usize = 256; const BATCH_SIZE: usize = 256;
#[derive(Clone, Debug, Default)] #[derive(Clone, Debug, Default)]

View File

@ -10,3 +10,6 @@ pest = "2.1.3"
pest_derive = "2.1.0" pest_derive = "2.1.0"
smartstring = "1.0.0" smartstring = "1.0.0"
anyhow = "1.0.56" anyhow = "1.0.56"
log = "0.4.16"
tokio = { version = "1.17.0", features = ["full"] }

View File

@ -2,6 +2,8 @@ extern crate pest;
#[macro_use] #[macro_use]
extern crate pest_derive; extern crate pest_derive;
pub mod loader;
use anyhow::{bail, ensure, Context}; use anyhow::{bail, ensure, Context};
use smartstring::alias::CompactString; use smartstring::alias::CompactString;

View File

@ -0,0 +1,112 @@
use crate::parse_seeds;
use anyhow::{anyhow, bail};
use log::warn;
use std::ffi::OsStr;
use std::path::PathBuf;
use tokio::sync::mpsc::Sender;
pub struct Seed {
pub url: UrlOrUrlPattern,
// TODO(later) These make more sense at the indexer stage. tags: BTreeSet<CompactString>,
}
/// Either a URL or a URL prefix.
#[derive(Clone, Debug)]
pub enum UrlOrUrlPattern {
Url(String),
UrlPrefix(String),
}
impl UrlOrUrlPattern {
pub fn as_str(&self) -> &str {
match self {
UrlOrUrlPattern::Url(url) => url.as_str(),
UrlOrUrlPattern::UrlPrefix(url_prefix) => url_prefix.as_str(),
}
}
}
/// Task that loads seeds from the filesystem
pub async fn seed_loader(seed_files: Vec<PathBuf>, send: &Sender<Seed>) -> anyhow::Result<()> {
for seed_file in seed_files {
// Parse the seed file and send out the seeds.
let seed_file_text = tokio::fs::read_to_string(&seed_file).await?;
match parse_seeds(&seed_file_text) {
Ok(seedblocks) => {
for seedblock in seedblocks {
for seed in seedblock.seeds {
/*
let tags: BTreeSet<CompactString> = seedblock
.tags
.iter()
.chain(seed.extra_tags.iter())
.cloned()
.collect();
*/
send.send(Seed {
url: seed_url_parse_pattern(seed.url),
// tags,
})
.await
.map_err(|_| anyhow!("Seed receiver shut down prematurely"))?;
}
}
}
Err(err) => {
eprintln!(
"~~~~~ Error in seed file ({:?}):\n{:?}\n~~~~~",
seed_file, err
);
bail!("Failed to parse {:?}; see error above.", seed_file);
}
}
}
Ok(())
}
pub fn seed_url_parse_pattern(mut url: String) -> UrlOrUrlPattern {
if url.ends_with('*') {
url.pop();
UrlOrUrlPattern::UrlPrefix(url)
} else {
UrlOrUrlPattern::Url(url)
}
}
pub async fn find_seed_files(seed_dir: PathBuf, extension: &str) -> anyhow::Result<Vec<PathBuf>> {
let mut dirs = vec![seed_dir];
let mut seedfiles = Vec::new();
while let Some(dir_to_scan) = dirs.pop() {
let mut dir = tokio::fs::read_dir(&dir_to_scan).await?;
while let Some(entry) = dir.next_entry().await? {
let path = entry.path();
let file_name = match path
.file_name()
.map(|osstr: &OsStr| osstr.to_str())
.flatten()
{
None => {
warn!("Skipping non-UTF-8 name.");
continue;
}
Some(file_name) => file_name,
};
if file_name.starts_with(".") {
continue;
}
if file_name.ends_with(extension) {
seedfiles.push(path);
continue;
}
if path.is_dir() {
// Recurse into this directory later.
dirs.push(path);
}
}
}
Ok(seedfiles)
}