quickpeep/quickpeep_raker/src/bin/qp-seedrake.rs
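
//! Seeds a raker's queue with URLs.
//!
//! Seed files enqueue URLs and mark their path prefixes as rakeable; weed
//! files mark path prefixes as not rakeable. A sketch of a typical
//! invocation (`quickpeep.ron` is the default config path):
//!
//! ```text
//! qp-seedrake --config quickpeep.ron
//! ```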

use anyhow::{bail, Context};
use clap::Parser;
use colour::{dark_green_ln, dark_red_ln, dark_yellow, green, red, yellow_ln};
use env_logger::Env;
use quickpeep_raker::config::RakerConfig;
use quickpeep_raker::raking::references::SUPPORTED_SCHEMES;
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
use quickpeep_raker::storage::{maintenance, RakerStore};
use quickpeep_seed_parser::loader::{
    find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION, WEED_EXTENSION,
};
use quickpeep_utils::urls::get_reduced_domain;
use reqwest::{Client, Url};
use std::borrow::Borrow;
use std::path::PathBuf;
use tokio::sync::mpsc;
use tokio::sync::mpsc::Receiver;

/// Seeds a raker's queue with URLs
#[derive(Clone, Debug, Parser)]
pub struct Opts {
    #[clap(long = "config")]
    config: Option<PathBuf>,

    /// Import the seeds into the workbench
    #[clap(long = "import")]
    import: bool,
}

#[tokio::main]
pub async fn main() -> anyhow::Result<()> {
    env_logger::Builder::from_env(Env::default().default_filter_or("info,quickpeep=debug")).init();

    let opts: Opts = Opts::parse();

    let config_path = opts.config.unwrap_or_else(|| PathBuf::from("quickpeep.ron"));
    let config = RakerConfig::load(&config_path).context("Failed to load config")?;

    if !config.raker.workbench_dir.exists() {
        bail!(
            "Workbench directory ({:?}) doesn't exist.",
            config.raker.workbench_dir
        );
    }
    if !config.seed_dir.exists() {
        bail!("Seed directory ({:?}) doesn't exist.", config.seed_dir);
    }

    let store = RakerStore::open(&config.raker.workbench_dir.join("raker.mdbx"))?;

    import_seeds(store.clone(), &config).await?;
    import_weeds(store.clone(), &config).await?;

    eprintln!("... re-applying seeds and weeds to on-hold URLs ...");
    store
        .async_rw_txn(|txn| maintenance::reapply_seeds_and_weeds_to_on_hold_urls(txn))
        .await?;
    eprintln!("... done!");

    Ok(())
}
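
/// Finds all seed files in the configured seed directory, streams their seeds
/// into the store, and prints a summary of what was imported.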
pub async fn import_seeds(store: RakerStore, config: &RakerConfig) -> anyhow::Result<()> {
    let (seed_tx, seed_rx) = mpsc::channel(128);
    let seed_files = find_seed_files(config.seed_dir.clone(), SEED_EXTENSION).await?;
    eprintln!("Seed files: {:?}", seed_files);
    tokio::spawn(async move {
        seed_loader(seed_files, &seed_tx).await?;
        Ok(()) as anyhow::Result<()>
    });

    let stats = importer(store, seed_rx, false).await?;

    dark_green_ln!("=== Seeds Imported! ===");
    green!("New URLs: ");
    yellow_ln!("{:?}", stats.new_urls);
    green!("New sitemaps: ");
    yellow_ln!("{:?}", stats.new_sitemaps);
    green!("New domains: ");
    yellow_ln!("{:?}", stats.new_domains);
    dark_yellow!("Seen URLs: ");
    yellow_ln!("{:?}", stats.already_present_urls);
    println!();
    Ok(())
}
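
/// Finds all weed files in the configured seed directory, streams their weeds
/// into the store, and prints a summary of what was imported.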
pub async fn import_weeds(store: RakerStore, config: &RakerConfig) -> anyhow::Result<()> {
    let (weed_tx, weed_rx) = mpsc::channel(128);
    let weed_files = find_seed_files(config.seed_dir.clone(), WEED_EXTENSION).await?;
    eprintln!("Weed files: {:?}", weed_files);
    tokio::spawn(async move {
        seed_loader(weed_files, &weed_tx).await?;
        Ok(()) as anyhow::Result<()>
    });

    let stats = importer(store, weed_rx, true).await?;

    dark_red_ln!("=== Weeds Imported! ===");
    red!("New domains: ");
    yellow_ln!("{:?}", stats.new_domains);
    println!();
    Ok(())
}
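
/// Number of seeds to buffer up before they are flushed to the store in one
/// read-write transaction.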
const BATCH_SIZE: usize = 256;
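
/// Counters reported at the end of a seed or weed import run.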
#[derive(Clone, Debug, Default)]
pub struct SeedImportStats {
    pub new_domains: u32,
    pub new_sitemaps: u32,
    pub new_urls: u32,
    pub already_present_urls: u32,
}

/// Task that imports seeds (or weeds) from `recv` into the store in batches.
async fn importer(
    store: RakerStore,
    mut recv: Receiver<Seed>,
    are_weeds: bool,
) -> anyhow::Result<SeedImportStats> {
    let mut buf = Vec::with_capacity(BATCH_SIZE);
    let mut stats = SeedImportStats::default();
    let client = Client::new();
    while let Some(seed) = recv.recv().await {
        buf.push(seed);
        if buf.len() == BATCH_SIZE {
            import_and_flush_batch_seeds_or_weeds(
                &store, &mut buf, &mut stats, &client, !are_weeds,
            )
            .await?;
        }
    }
    // Flush the final, possibly partial, batch.
    import_and_flush_batch_seeds_or_weeds(&store, &mut buf, &mut stats, &client, !are_weeds)
        .await?;
    Ok(stats)
}
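
/// Drains `buf`, applying each entry to the store in a single read-write
/// transaction: seeds (`is_seed == true`) are enqueued and marked rakeable,
/// weeds are marked not rakeable.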
async fn import_and_flush_batch_seeds_or_weeds(
    store: &RakerStore,
    buf: &mut Vec<Seed>,
    stats: &mut SeedImportStats,
    client: &Client,
    is_seed: bool,
) -> anyhow::Result<()> {
    let txn = store.rw_txn()?;
    for seed in buf.drain(..) {
        let as_url = Url::parse(seed.url.as_str())
            .with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
        let domain = get_reduced_domain(&as_url)
            .with_context(|| format!("No domain in seed URL '{as_url}'!"))?;

        // Load (or create) the domain record so its rakeable path prefixes can
        // be updated.
        let domain_record = txn.get_domain_record(domain.borrow())?;
        let is_domain_new = domain_record.is_none();
        let mut domain_record = domain_record.unwrap_or_default();
        if is_domain_new {
            stats.new_domains += 1;
        }
        let mut dirty = is_domain_new;

        // Register the domain. This is a no-op if it's already active or backing off.
        txn.insert_active_domain_with_new_raffle_ticket(domain.clone().into_owned())?;

        let url_like = match &seed.url {
            UrlOrUrlPattern::Url(url_str) => {
                let url = Url::parse(url_str.as_str())?;
                if is_seed {
                    if txn.enqueue_url(url.as_str(), None, RakeIntent::Any)? {
                        stats.new_urls += 1;
                    } else {
                        stats.already_present_urls += 1;
                    }
                }
                // A plain URL seeds/weeds the whole domain: store it under the
                // empty path prefix.
                dirty |= domain_record
                    .rakeable_path_prefixes
                    .insert(String::new(), is_seed)
                    != Some(is_seed);
                url
            }
            UrlOrUrlPattern::UrlPrefix(prefix) => {
                let prefix_as_url = Url::parse(prefix.as_str())?;
                if is_seed {
                    if txn.enqueue_url(prefix_as_url.as_str(), None, RakeIntent::Any)? {
                        stats.new_urls += 1;
                    } else {
                        stats.already_present_urls += 1;
                    }
                }
                dirty |= domain_record
                    .rakeable_path_prefixes
                    .insert(prefix_as_url.path().to_string(), is_seed)
                    != Some(is_seed);
                prefix_as_url
            }
        };

        if dirty {
            txn.put_domain_record(domain.borrow(), domain_record)?;
        }

        if is_seed {
            // Look at robots.txt and discover sitemaps!
            if let Some(robots_txt) = get_robots_txt_for(&url_like, client).await? {
                for sitemap in robots_txt.sitemaps {
                    if SUPPORTED_SCHEMES.contains(&sitemap.url.scheme()) {
                        // Only count sitemaps that weren't already queued.
                        if txn.enqueue_url(sitemap.url.as_str(), None, RakeIntent::SiteMap)? {
                            stats.new_sitemaps += 1;
                        }
                    }
                }
            }
        }
    }
    txn.commit()?;
    Ok(())
}