Factor out recursive loader
This commit is contained in:
parent
f4672181aa
commit
f884324648
|
@ -3560,9 +3560,11 @@ name = "quickpeep_seed_parser"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"log",
|
||||||
"pest",
|
"pest",
|
||||||
"pest_derive",
|
"pest_derive",
|
||||||
"smartstring",
|
"smartstring",
|
||||||
|
"tokio",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
|
@ -1,17 +1,15 @@
|
||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
use std::borrow::{Borrow, BorrowMut};
|
use std::borrow::{Borrow, BorrowMut};
|
||||||
use std::ffi::OsStr;
|
|
||||||
|
|
||||||
use env_logger::Env;
|
use env_logger::Env;
|
||||||
|
|
||||||
use anyhow::{anyhow, bail, Context};
|
use anyhow::{bail, Context};
|
||||||
|
|
||||||
use colour::{dark_green_ln, dark_red_ln, dark_yellow, green, red, yellow_ln};
|
use colour::{dark_green_ln, dark_red_ln, dark_yellow, green, red, yellow_ln};
|
||||||
use log::warn;
|
|
||||||
use reqwest::{Client, Url};
|
use reqwest::{Client, Url};
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use tokio::sync::mpsc;
|
use tokio::sync::mpsc;
|
||||||
use tokio::sync::mpsc::{Receiver, Sender};
|
use tokio::sync::mpsc::Receiver;
|
||||||
|
|
||||||
use quickpeep_raker::config;
|
use quickpeep_raker::config;
|
||||||
use quickpeep_raker::config::RakerConfig;
|
use quickpeep_raker::config::RakerConfig;
|
||||||
|
@ -19,7 +17,7 @@ use quickpeep_raker::raking::analysis::get_reduced_domain;
|
||||||
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
|
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
|
||||||
use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord};
|
use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord};
|
||||||
use quickpeep_raker::storage::{maintenance, RakerStore};
|
use quickpeep_raker::storage::{maintenance, RakerStore};
|
||||||
use quickpeep_seed_parser::parse_seeds;
|
use quickpeep_seed_parser::loader::{find_seed_files, seed_loader, Seed, UrlOrUrlPattern};
|
||||||
use quickpeep_utils::dirty::DirtyTracker;
|
use quickpeep_utils::dirty::DirtyTracker;
|
||||||
|
|
||||||
pub const SEED_EXTENSION: &'static str = ".seed";
|
pub const SEED_EXTENSION: &'static str = ".seed";
|
||||||
|
@ -124,112 +122,6 @@ pub async fn import_weeds(store: RakerStore, config: &RakerConfig) -> anyhow::Re
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Seed {
|
|
||||||
url: UrlOrUrlPattern,
|
|
||||||
// TODO(later) These make more sense at the indexer stage. tags: BTreeSet<CompactString>,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Either a URL or a URL prefix.
|
|
||||||
#[derive(Clone, Debug)]
|
|
||||||
pub enum UrlOrUrlPattern {
|
|
||||||
Url(String),
|
|
||||||
UrlPrefix(String),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl UrlOrUrlPattern {
|
|
||||||
pub fn as_str(&self) -> &str {
|
|
||||||
match self {
|
|
||||||
UrlOrUrlPattern::Url(url) => url.as_str(),
|
|
||||||
UrlOrUrlPattern::UrlPrefix(url_prefix) => url_prefix.as_str(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Task that loads seeds from the filesystem
|
|
||||||
async fn seed_loader(seed_files: Vec<PathBuf>, send: &Sender<Seed>) -> anyhow::Result<()> {
|
|
||||||
for seed_file in seed_files {
|
|
||||||
// Parse the seed file and send out the seeds.
|
|
||||||
let seed_file_text = tokio::fs::read_to_string(&seed_file).await?;
|
|
||||||
match parse_seeds(&seed_file_text) {
|
|
||||||
Ok(seedblocks) => {
|
|
||||||
for seedblock in seedblocks {
|
|
||||||
for seed in seedblock.seeds {
|
|
||||||
/*
|
|
||||||
let tags: BTreeSet<CompactString> = seedblock
|
|
||||||
.tags
|
|
||||||
.iter()
|
|
||||||
.chain(seed.extra_tags.iter())
|
|
||||||
.cloned()
|
|
||||||
.collect();
|
|
||||||
*/
|
|
||||||
send.send(Seed {
|
|
||||||
url: seed_url_parse_pattern(seed.url),
|
|
||||||
// tags,
|
|
||||||
})
|
|
||||||
.await
|
|
||||||
.map_err(|_| anyhow!("Seed receiver shut down prematurely"))?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(err) => {
|
|
||||||
eprintln!(
|
|
||||||
"~~~~~ Error in seed file ({:?}):\n{:?}\n~~~~~",
|
|
||||||
seed_file, err
|
|
||||||
);
|
|
||||||
bail!("Failed to parse {:?}; see error above.", seed_file);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn seed_url_parse_pattern(mut url: String) -> UrlOrUrlPattern {
|
|
||||||
if url.ends_with('*') {
|
|
||||||
url.pop();
|
|
||||||
UrlOrUrlPattern::UrlPrefix(url)
|
|
||||||
} else {
|
|
||||||
UrlOrUrlPattern::Url(url)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn find_seed_files(seed_dir: PathBuf, extension: &str) -> anyhow::Result<Vec<PathBuf>> {
|
|
||||||
let mut dirs = vec![seed_dir];
|
|
||||||
let mut seedfiles = Vec::new();
|
|
||||||
|
|
||||||
while let Some(dir_to_scan) = dirs.pop() {
|
|
||||||
let mut dir = tokio::fs::read_dir(&dir_to_scan).await?;
|
|
||||||
|
|
||||||
while let Some(entry) = dir.next_entry().await? {
|
|
||||||
let path = entry.path();
|
|
||||||
let file_name = match path
|
|
||||||
.file_name()
|
|
||||||
.map(|osstr: &OsStr| osstr.to_str())
|
|
||||||
.flatten()
|
|
||||||
{
|
|
||||||
None => {
|
|
||||||
warn!("Skipping non-UTF-8 name.");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
Some(file_name) => file_name,
|
|
||||||
};
|
|
||||||
if file_name.starts_with(".") {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if file_name.ends_with(extension) {
|
|
||||||
seedfiles.push(path);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if path.is_dir() {
|
|
||||||
// Recurse into this directory later.
|
|
||||||
dirs.push(path);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(seedfiles)
|
|
||||||
}
|
|
||||||
|
|
||||||
const BATCH_SIZE: usize = 256;
|
const BATCH_SIZE: usize = 256;
|
||||||
|
|
||||||
#[derive(Clone, Debug, Default)]
|
#[derive(Clone, Debug, Default)]
|
||||||
|
|
|
@ -10,3 +10,6 @@ pest = "2.1.3"
|
||||||
pest_derive = "2.1.0"
|
pest_derive = "2.1.0"
|
||||||
smartstring = "1.0.0"
|
smartstring = "1.0.0"
|
||||||
anyhow = "1.0.56"
|
anyhow = "1.0.56"
|
||||||
|
|
||||||
|
log = "0.4.16"
|
||||||
|
tokio = { version = "1.17.0", features = ["full"] }
|
||||||
|
|
|
@ -2,6 +2,8 @@ extern crate pest;
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate pest_derive;
|
extern crate pest_derive;
|
||||||
|
|
||||||
|
pub mod loader;
|
||||||
|
|
||||||
use anyhow::{bail, ensure, Context};
|
use anyhow::{bail, ensure, Context};
|
||||||
use smartstring::alias::CompactString;
|
use smartstring::alias::CompactString;
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,112 @@
|
||||||
|
use crate::parse_seeds;
|
||||||
|
use anyhow::{anyhow, bail};
|
||||||
|
use log::warn;
|
||||||
|
use std::ffi::OsStr;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use tokio::sync::mpsc::Sender;
|
||||||
|
|
||||||
|
pub struct Seed {
|
||||||
|
pub url: UrlOrUrlPattern,
|
||||||
|
// TODO(later) These make more sense at the indexer stage. tags: BTreeSet<CompactString>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Either a URL or a URL prefix.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub enum UrlOrUrlPattern {
|
||||||
|
Url(String),
|
||||||
|
UrlPrefix(String),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl UrlOrUrlPattern {
|
||||||
|
pub fn as_str(&self) -> &str {
|
||||||
|
match self {
|
||||||
|
UrlOrUrlPattern::Url(url) => url.as_str(),
|
||||||
|
UrlOrUrlPattern::UrlPrefix(url_prefix) => url_prefix.as_str(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Task that loads seeds from the filesystem
|
||||||
|
pub async fn seed_loader(seed_files: Vec<PathBuf>, send: &Sender<Seed>) -> anyhow::Result<()> {
|
||||||
|
for seed_file in seed_files {
|
||||||
|
// Parse the seed file and send out the seeds.
|
||||||
|
let seed_file_text = tokio::fs::read_to_string(&seed_file).await?;
|
||||||
|
match parse_seeds(&seed_file_text) {
|
||||||
|
Ok(seedblocks) => {
|
||||||
|
for seedblock in seedblocks {
|
||||||
|
for seed in seedblock.seeds {
|
||||||
|
/*
|
||||||
|
let tags: BTreeSet<CompactString> = seedblock
|
||||||
|
.tags
|
||||||
|
.iter()
|
||||||
|
.chain(seed.extra_tags.iter())
|
||||||
|
.cloned()
|
||||||
|
.collect();
|
||||||
|
*/
|
||||||
|
send.send(Seed {
|
||||||
|
url: seed_url_parse_pattern(seed.url),
|
||||||
|
// tags,
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.map_err(|_| anyhow!("Seed receiver shut down prematurely"))?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
eprintln!(
|
||||||
|
"~~~~~ Error in seed file ({:?}):\n{:?}\n~~~~~",
|
||||||
|
seed_file, err
|
||||||
|
);
|
||||||
|
bail!("Failed to parse {:?}; see error above.", seed_file);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn seed_url_parse_pattern(mut url: String) -> UrlOrUrlPattern {
|
||||||
|
if url.ends_with('*') {
|
||||||
|
url.pop();
|
||||||
|
UrlOrUrlPattern::UrlPrefix(url)
|
||||||
|
} else {
|
||||||
|
UrlOrUrlPattern::Url(url)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn find_seed_files(seed_dir: PathBuf, extension: &str) -> anyhow::Result<Vec<PathBuf>> {
|
||||||
|
let mut dirs = vec![seed_dir];
|
||||||
|
let mut seedfiles = Vec::new();
|
||||||
|
|
||||||
|
while let Some(dir_to_scan) = dirs.pop() {
|
||||||
|
let mut dir = tokio::fs::read_dir(&dir_to_scan).await?;
|
||||||
|
|
||||||
|
while let Some(entry) = dir.next_entry().await? {
|
||||||
|
let path = entry.path();
|
||||||
|
let file_name = match path
|
||||||
|
.file_name()
|
||||||
|
.map(|osstr: &OsStr| osstr.to_str())
|
||||||
|
.flatten()
|
||||||
|
{
|
||||||
|
None => {
|
||||||
|
warn!("Skipping non-UTF-8 name.");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Some(file_name) => file_name,
|
||||||
|
};
|
||||||
|
if file_name.starts_with(".") {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if file_name.ends_with(extension) {
|
||||||
|
seedfiles.push(path);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if path.is_dir() {
|
||||||
|
// Recurse into this directory later.
|
||||||
|
dirs.push(path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(seedfiles)
|
||||||
|
}
|
Loading…
Reference in New Issue