Factor out recursive loader
This commit is contained in:
parent
f4672181aa
commit
f884324648
2
Cargo.lock
generated
2
Cargo.lock
generated
@ -3560,9 +3560,11 @@ name = "quickpeep_seed_parser"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"log",
|
||||
"pest",
|
||||
"pest_derive",
|
||||
"smartstring",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -1,17 +1,15 @@
|
||||
use clap::Parser;
|
||||
use std::borrow::{Borrow, BorrowMut};
|
||||
use std::ffi::OsStr;
|
||||
|
||||
use env_logger::Env;
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use anyhow::{bail, Context};
|
||||
|
||||
use colour::{dark_green_ln, dark_red_ln, dark_yellow, green, red, yellow_ln};
|
||||
use log::warn;
|
||||
use reqwest::{Client, Url};
|
||||
use std::path::PathBuf;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio::sync::mpsc::{Receiver, Sender};
|
||||
use tokio::sync::mpsc::Receiver;
|
||||
|
||||
use quickpeep_raker::config;
|
||||
use quickpeep_raker::config::RakerConfig;
|
||||
@ -19,7 +17,7 @@ use quickpeep_raker::raking::analysis::get_reduced_domain;
|
||||
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
|
||||
use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord};
|
||||
use quickpeep_raker::storage::{maintenance, RakerStore};
|
||||
use quickpeep_seed_parser::parse_seeds;
|
||||
use quickpeep_seed_parser::loader::{find_seed_files, seed_loader, Seed, UrlOrUrlPattern};
|
||||
use quickpeep_utils::dirty::DirtyTracker;
|
||||
|
||||
pub const SEED_EXTENSION: &'static str = ".seed";
|
||||
@ -124,112 +122,6 @@ pub async fn import_weeds(store: RakerStore, config: &RakerConfig) -> anyhow::Re
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub struct Seed {
|
||||
url: UrlOrUrlPattern,
|
||||
// TODO(later) These make more sense at the indexer stage. tags: BTreeSet<CompactString>,
|
||||
}
|
||||
|
||||
/// Either a URL or a URL prefix.
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum UrlOrUrlPattern {
|
||||
Url(String),
|
||||
UrlPrefix(String),
|
||||
}
|
||||
|
||||
impl UrlOrUrlPattern {
|
||||
pub fn as_str(&self) -> &str {
|
||||
match self {
|
||||
UrlOrUrlPattern::Url(url) => url.as_str(),
|
||||
UrlOrUrlPattern::UrlPrefix(url_prefix) => url_prefix.as_str(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Task that loads seeds from the filesystem
|
||||
async fn seed_loader(seed_files: Vec<PathBuf>, send: &Sender<Seed>) -> anyhow::Result<()> {
|
||||
for seed_file in seed_files {
|
||||
// Parse the seed file and send out the seeds.
|
||||
let seed_file_text = tokio::fs::read_to_string(&seed_file).await?;
|
||||
match parse_seeds(&seed_file_text) {
|
||||
Ok(seedblocks) => {
|
||||
for seedblock in seedblocks {
|
||||
for seed in seedblock.seeds {
|
||||
/*
|
||||
let tags: BTreeSet<CompactString> = seedblock
|
||||
.tags
|
||||
.iter()
|
||||
.chain(seed.extra_tags.iter())
|
||||
.cloned()
|
||||
.collect();
|
||||
*/
|
||||
send.send(Seed {
|
||||
url: seed_url_parse_pattern(seed.url),
|
||||
// tags,
|
||||
})
|
||||
.await
|
||||
.map_err(|_| anyhow!("Seed receiver shut down prematurely"))?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
eprintln!(
|
||||
"~~~~~ Error in seed file ({:?}):\n{:?}\n~~~~~",
|
||||
seed_file, err
|
||||
);
|
||||
bail!("Failed to parse {:?}; see error above.", seed_file);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn seed_url_parse_pattern(mut url: String) -> UrlOrUrlPattern {
|
||||
if url.ends_with('*') {
|
||||
url.pop();
|
||||
UrlOrUrlPattern::UrlPrefix(url)
|
||||
} else {
|
||||
UrlOrUrlPattern::Url(url)
|
||||
}
|
||||
}
|
||||
|
||||
async fn find_seed_files(seed_dir: PathBuf, extension: &str) -> anyhow::Result<Vec<PathBuf>> {
|
||||
let mut dirs = vec![seed_dir];
|
||||
let mut seedfiles = Vec::new();
|
||||
|
||||
while let Some(dir_to_scan) = dirs.pop() {
|
||||
let mut dir = tokio::fs::read_dir(&dir_to_scan).await?;
|
||||
|
||||
while let Some(entry) = dir.next_entry().await? {
|
||||
let path = entry.path();
|
||||
let file_name = match path
|
||||
.file_name()
|
||||
.map(|osstr: &OsStr| osstr.to_str())
|
||||
.flatten()
|
||||
{
|
||||
None => {
|
||||
warn!("Skipping non-UTF-8 name.");
|
||||
continue;
|
||||
}
|
||||
Some(file_name) => file_name,
|
||||
};
|
||||
if file_name.starts_with(".") {
|
||||
continue;
|
||||
}
|
||||
if file_name.ends_with(extension) {
|
||||
seedfiles.push(path);
|
||||
continue;
|
||||
}
|
||||
if path.is_dir() {
|
||||
// Recurse into this directory later.
|
||||
dirs.push(path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(seedfiles)
|
||||
}
|
||||
|
||||
const BATCH_SIZE: usize = 256;
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
|
@ -10,3 +10,6 @@ pest = "2.1.3"
|
||||
pest_derive = "2.1.0"
|
||||
smartstring = "1.0.0"
|
||||
anyhow = "1.0.56"
|
||||
|
||||
log = "0.4.16"
|
||||
tokio = { version = "1.17.0", features = ["full"] }
|
||||
|
@ -2,6 +2,8 @@ extern crate pest;
|
||||
#[macro_use]
|
||||
extern crate pest_derive;
|
||||
|
||||
pub mod loader;
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use smartstring::alias::CompactString;
|
||||
|
||||
|
112
quickpeep_seed_parser/src/loader.rs
Normal file
112
quickpeep_seed_parser/src/loader.rs
Normal file
@ -0,0 +1,112 @@
|
||||
use crate::parse_seeds;
|
||||
use anyhow::{anyhow, bail};
|
||||
use log::warn;
|
||||
use std::ffi::OsStr;
|
||||
use std::path::PathBuf;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
|
||||
pub struct Seed {
|
||||
pub url: UrlOrUrlPattern,
|
||||
// TODO(later) These make more sense at the indexer stage. tags: BTreeSet<CompactString>,
|
||||
}
|
||||
|
||||
/// Either a URL or a URL prefix.
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum UrlOrUrlPattern {
|
||||
Url(String),
|
||||
UrlPrefix(String),
|
||||
}
|
||||
|
||||
impl UrlOrUrlPattern {
|
||||
pub fn as_str(&self) -> &str {
|
||||
match self {
|
||||
UrlOrUrlPattern::Url(url) => url.as_str(),
|
||||
UrlOrUrlPattern::UrlPrefix(url_prefix) => url_prefix.as_str(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Task that loads seeds from the filesystem
|
||||
pub async fn seed_loader(seed_files: Vec<PathBuf>, send: &Sender<Seed>) -> anyhow::Result<()> {
|
||||
for seed_file in seed_files {
|
||||
// Parse the seed file and send out the seeds.
|
||||
let seed_file_text = tokio::fs::read_to_string(&seed_file).await?;
|
||||
match parse_seeds(&seed_file_text) {
|
||||
Ok(seedblocks) => {
|
||||
for seedblock in seedblocks {
|
||||
for seed in seedblock.seeds {
|
||||
/*
|
||||
let tags: BTreeSet<CompactString> = seedblock
|
||||
.tags
|
||||
.iter()
|
||||
.chain(seed.extra_tags.iter())
|
||||
.cloned()
|
||||
.collect();
|
||||
*/
|
||||
send.send(Seed {
|
||||
url: seed_url_parse_pattern(seed.url),
|
||||
// tags,
|
||||
})
|
||||
.await
|
||||
.map_err(|_| anyhow!("Seed receiver shut down prematurely"))?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
eprintln!(
|
||||
"~~~~~ Error in seed file ({:?}):\n{:?}\n~~~~~",
|
||||
seed_file, err
|
||||
);
|
||||
bail!("Failed to parse {:?}; see error above.", seed_file);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn seed_url_parse_pattern(mut url: String) -> UrlOrUrlPattern {
|
||||
if url.ends_with('*') {
|
||||
url.pop();
|
||||
UrlOrUrlPattern::UrlPrefix(url)
|
||||
} else {
|
||||
UrlOrUrlPattern::Url(url)
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn find_seed_files(seed_dir: PathBuf, extension: &str) -> anyhow::Result<Vec<PathBuf>> {
|
||||
let mut dirs = vec![seed_dir];
|
||||
let mut seedfiles = Vec::new();
|
||||
|
||||
while let Some(dir_to_scan) = dirs.pop() {
|
||||
let mut dir = tokio::fs::read_dir(&dir_to_scan).await?;
|
||||
|
||||
while let Some(entry) = dir.next_entry().await? {
|
||||
let path = entry.path();
|
||||
let file_name = match path
|
||||
.file_name()
|
||||
.map(|osstr: &OsStr| osstr.to_str())
|
||||
.flatten()
|
||||
{
|
||||
None => {
|
||||
warn!("Skipping non-UTF-8 name.");
|
||||
continue;
|
||||
}
|
||||
Some(file_name) => file_name,
|
||||
};
|
||||
if file_name.starts_with(".") {
|
||||
continue;
|
||||
}
|
||||
if file_name.ends_with(extension) {
|
||||
seedfiles.push(path);
|
||||
continue;
|
||||
}
|
||||
if path.is_dir() {
|
||||
// Recurse into this directory later.
|
||||
dirs.push(path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(seedfiles)
|
||||
}
|
Loading…
Reference in New Issue
Block a user