diff --git a/Cargo.lock b/Cargo.lock index 58b385f..619a444 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3560,9 +3560,11 @@ name = "quickpeep_seed_parser" version = "0.1.0" dependencies = [ "anyhow", + "log", "pest", "pest_derive", "smartstring", + "tokio", ] [[package]] diff --git a/quickpeep_raker/src/bin/qp-seedrake.rs b/quickpeep_raker/src/bin/qp-seedrake.rs index 618b7b6..2fad0a7 100644 --- a/quickpeep_raker/src/bin/qp-seedrake.rs +++ b/quickpeep_raker/src/bin/qp-seedrake.rs @@ -1,17 +1,15 @@ use clap::Parser; use std::borrow::{Borrow, BorrowMut}; -use std::ffi::OsStr; use env_logger::Env; -use anyhow::{anyhow, bail, Context}; +use anyhow::{bail, Context}; use colour::{dark_green_ln, dark_red_ln, dark_yellow, green, red, yellow_ln}; -use log::warn; use reqwest::{Client, Url}; use std::path::PathBuf; use tokio::sync::mpsc; -use tokio::sync::mpsc::{Receiver, Sender}; +use tokio::sync::mpsc::Receiver; use quickpeep_raker::config; use quickpeep_raker::config::RakerConfig; @@ -19,7 +17,7 @@ use quickpeep_raker::raking::analysis::get_reduced_domain; use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent}; use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord}; use quickpeep_raker::storage::{maintenance, RakerStore}; -use quickpeep_seed_parser::parse_seeds; +use quickpeep_seed_parser::loader::{find_seed_files, seed_loader, Seed, UrlOrUrlPattern}; use quickpeep_utils::dirty::DirtyTracker; pub const SEED_EXTENSION: &'static str = ".seed"; @@ -124,112 +122,6 @@ pub async fn import_weeds(store: RakerStore, config: &RakerConfig) -> anyhow::Re Ok(()) } -pub struct Seed { - url: UrlOrUrlPattern, - // TODO(later) These make more sense at the indexer stage. tags: BTreeSet, -} - -/// Either a URL or a URL prefix. -#[derive(Clone, Debug)] -pub enum UrlOrUrlPattern { - Url(String), - UrlPrefix(String), -} - -impl UrlOrUrlPattern { - pub fn as_str(&self) -> &str { - match self { - UrlOrUrlPattern::Url(url) => url.as_str(), - UrlOrUrlPattern::UrlPrefix(url_prefix) => url_prefix.as_str(), - } - } -} - -/// Task that loads seeds from the filesystem -async fn seed_loader(seed_files: Vec, send: &Sender) -> anyhow::Result<()> { - for seed_file in seed_files { - // Parse the seed file and send out the seeds. - let seed_file_text = tokio::fs::read_to_string(&seed_file).await?; - match parse_seeds(&seed_file_text) { - Ok(seedblocks) => { - for seedblock in seedblocks { - for seed in seedblock.seeds { - /* - let tags: BTreeSet = seedblock - .tags - .iter() - .chain(seed.extra_tags.iter()) - .cloned() - .collect(); - */ - send.send(Seed { - url: seed_url_parse_pattern(seed.url), - // tags, - }) - .await - .map_err(|_| anyhow!("Seed receiver shut down prematurely"))?; - } - } - } - Err(err) => { - eprintln!( - "~~~~~ Error in seed file ({:?}):\n{:?}\n~~~~~", - seed_file, err - ); - bail!("Failed to parse {:?}; see error above.", seed_file); - } - } - } - - Ok(()) -} - -fn seed_url_parse_pattern(mut url: String) -> UrlOrUrlPattern { - if url.ends_with('*') { - url.pop(); - UrlOrUrlPattern::UrlPrefix(url) - } else { - UrlOrUrlPattern::Url(url) - } -} - -async fn find_seed_files(seed_dir: PathBuf, extension: &str) -> anyhow::Result> { - let mut dirs = vec![seed_dir]; - let mut seedfiles = Vec::new(); - - while let Some(dir_to_scan) = dirs.pop() { - let mut dir = tokio::fs::read_dir(&dir_to_scan).await?; - - while let Some(entry) = dir.next_entry().await? { - let path = entry.path(); - let file_name = match path - .file_name() - .map(|osstr: &OsStr| osstr.to_str()) - .flatten() - { - None => { - warn!("Skipping non-UTF-8 name."); - continue; - } - Some(file_name) => file_name, - }; - if file_name.starts_with(".") { - continue; - } - if file_name.ends_with(extension) { - seedfiles.push(path); - continue; - } - if path.is_dir() { - // Recurse into this directory later. - dirs.push(path); - } - } - } - - Ok(seedfiles) -} - const BATCH_SIZE: usize = 256; #[derive(Clone, Debug, Default)] diff --git a/quickpeep_seed_parser/Cargo.toml b/quickpeep_seed_parser/Cargo.toml index 4f5a9da..c8e3d3d 100644 --- a/quickpeep_seed_parser/Cargo.toml +++ b/quickpeep_seed_parser/Cargo.toml @@ -10,3 +10,6 @@ pest = "2.1.3" pest_derive = "2.1.0" smartstring = "1.0.0" anyhow = "1.0.56" + +log = "0.4.16" +tokio = { version = "1.17.0", features = ["full"] } diff --git a/quickpeep_seed_parser/src/lib.rs b/quickpeep_seed_parser/src/lib.rs index a37602e..1ce6ba4 100644 --- a/quickpeep_seed_parser/src/lib.rs +++ b/quickpeep_seed_parser/src/lib.rs @@ -2,6 +2,8 @@ extern crate pest; #[macro_use] extern crate pest_derive; +pub mod loader; + use anyhow::{bail, ensure, Context}; use smartstring::alias::CompactString; diff --git a/quickpeep_seed_parser/src/loader.rs b/quickpeep_seed_parser/src/loader.rs new file mode 100644 index 0000000..7657ecf --- /dev/null +++ b/quickpeep_seed_parser/src/loader.rs @@ -0,0 +1,112 @@ +use crate::parse_seeds; +use anyhow::{anyhow, bail}; +use log::warn; +use std::ffi::OsStr; +use std::path::PathBuf; +use tokio::sync::mpsc::Sender; + +pub struct Seed { + pub url: UrlOrUrlPattern, + // TODO(later) These make more sense at the indexer stage. tags: BTreeSet, +} + +/// Either a URL or a URL prefix. +#[derive(Clone, Debug)] +pub enum UrlOrUrlPattern { + Url(String), + UrlPrefix(String), +} + +impl UrlOrUrlPattern { + pub fn as_str(&self) -> &str { + match self { + UrlOrUrlPattern::Url(url) => url.as_str(), + UrlOrUrlPattern::UrlPrefix(url_prefix) => url_prefix.as_str(), + } + } +} + +/// Task that loads seeds from the filesystem +pub async fn seed_loader(seed_files: Vec, send: &Sender) -> anyhow::Result<()> { + for seed_file in seed_files { + // Parse the seed file and send out the seeds. + let seed_file_text = tokio::fs::read_to_string(&seed_file).await?; + match parse_seeds(&seed_file_text) { + Ok(seedblocks) => { + for seedblock in seedblocks { + for seed in seedblock.seeds { + /* + let tags: BTreeSet = seedblock + .tags + .iter() + .chain(seed.extra_tags.iter()) + .cloned() + .collect(); + */ + send.send(Seed { + url: seed_url_parse_pattern(seed.url), + // tags, + }) + .await + .map_err(|_| anyhow!("Seed receiver shut down prematurely"))?; + } + } + } + Err(err) => { + eprintln!( + "~~~~~ Error in seed file ({:?}):\n{:?}\n~~~~~", + seed_file, err + ); + bail!("Failed to parse {:?}; see error above.", seed_file); + } + } + } + + Ok(()) +} + +pub fn seed_url_parse_pattern(mut url: String) -> UrlOrUrlPattern { + if url.ends_with('*') { + url.pop(); + UrlOrUrlPattern::UrlPrefix(url) + } else { + UrlOrUrlPattern::Url(url) + } +} + +pub async fn find_seed_files(seed_dir: PathBuf, extension: &str) -> anyhow::Result> { + let mut dirs = vec![seed_dir]; + let mut seedfiles = Vec::new(); + + while let Some(dir_to_scan) = dirs.pop() { + let mut dir = tokio::fs::read_dir(&dir_to_scan).await?; + + while let Some(entry) = dir.next_entry().await? { + let path = entry.path(); + let file_name = match path + .file_name() + .map(|osstr: &OsStr| osstr.to_str()) + .flatten() + { + None => { + warn!("Skipping non-UTF-8 name."); + continue; + } + Some(file_name) => file_name, + }; + if file_name.starts_with(".") { + continue; + } + if file_name.ends_with(extension) { + seedfiles.push(path); + continue; + } + if path.is_dir() { + // Recurse into this directory later. + dirs.push(path); + } + } + } + + Ok(seedfiles) +}