239 lines
7.1 KiB
Rust
239 lines
7.1 KiB
Rust
use clap::Parser;
|
|
use std::borrow::Borrow;
|
|
|
|
use env_logger::Env;
|
|
|
|
use anyhow::{bail, Context};
|
|
|
|
use colour::{dark_green_ln, dark_red_ln, dark_yellow, green, red, yellow_ln};
|
|
use reqwest::{Client, Url};
|
|
use std::path::PathBuf;
|
|
use tokio::sync::mpsc;
|
|
use tokio::sync::mpsc::Receiver;
|
|
|
|
use quickpeep_raker::config::RakerConfig;
|
|
use quickpeep_raker::raking::references::SUPPORTED_SCHEMES;
|
|
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
|
|
use quickpeep_raker::storage::{maintenance, RakerStore};
|
|
use quickpeep_seed_parser::loader::{
|
|
find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION, WEED_EXTENSION,
|
|
};
|
|
use quickpeep_utils::urls::get_reduced_domain;
|
|
|
|
/// Seeds a raker's queue with URLs
#[derive(Clone, Debug, Parser)]
pub struct Opts {
    // Path to the RON configuration file. When absent, `main` falls back to
    // `quickpeep.ron` in the current working directory.
    // (Plain `//` comment on purpose: a `///` doc comment here would change
    // the generated `--help` text for this clap-derived struct.)
    #[clap(long = "config")]
    config: Option<PathBuf>,

    // NOTE(review): this flag is parsed but does not appear to be consulted
    // anywhere in this file — `main` imports unconditionally. Confirm whether
    // it is vestigial or used elsewhere.
    #[clap(long = "import")]
    /// Import the seeds into the workbench
    import: bool,
}
|
|
|
|
#[tokio::main]
|
|
pub async fn main() -> anyhow::Result<()> {
|
|
env_logger::Builder::from_env(Env::default().default_filter_or("info,quickpeep=debug")).init();
|
|
|
|
let opts: Opts = Opts::parse();
|
|
|
|
let config_path = opts
|
|
.config
|
|
.unwrap_or_else(|| PathBuf::from("quickpeep.ron"));
|
|
let config = RakerConfig::load(&config_path).context("Failed to load config")?;
|
|
|
|
if !config.raker.workbench_dir.exists() {
|
|
bail!(
|
|
"Workbench directory ({:?}) doesn't exist.",
|
|
config.raker.workbench_dir
|
|
);
|
|
}
|
|
if !config.seed_dir.exists() {
|
|
bail!("Seed directory ({:?}) doesn't exist.", config.seed_dir);
|
|
}
|
|
|
|
let store = RakerStore::open(&config.raker.workbench_dir.join("raker.mdbx"))?;
|
|
|
|
import_seeds(store.clone(), &config).await?;
|
|
|
|
import_weeds(store.clone(), &config).await?;
|
|
|
|
eprintln!("... re-applying seeds and weeds to on-hold URLs ...");
|
|
store
|
|
.async_rw_txn(|txn| maintenance::reapply_seeds_and_weeds_to_on_hold_urls(txn))
|
|
.await?;
|
|
eprintln!("... done!");
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub async fn import_seeds(store: RakerStore, config: &RakerConfig) -> anyhow::Result<()> {
|
|
let (weed_tx, weed_rx) = mpsc::channel(128);
|
|
|
|
let weed_files = find_seed_files(config.seed_dir.clone(), SEED_EXTENSION).await?;
|
|
|
|
eprintln!("{:?}", weed_files);
|
|
|
|
tokio::spawn(async move {
|
|
seed_loader(weed_files, &weed_tx).await?;
|
|
|
|
Ok(()) as anyhow::Result<()>
|
|
});
|
|
|
|
let stats = importer(store, weed_rx, false).await?;
|
|
|
|
dark_green_ln!("=== Seeds Imported! ===");
|
|
green!("New URLs: ");
|
|
yellow_ln!("{:?}", stats.new_urls);
|
|
green!("New sitemaps: ");
|
|
yellow_ln!("{:?}", stats.new_sitemaps);
|
|
green!("New domains: ");
|
|
yellow_ln!("{:?}", stats.new_domains);
|
|
dark_yellow!("Seen URLs: ");
|
|
yellow_ln!("{:?}", stats.already_present_urls);
|
|
println!();
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub async fn import_weeds(store: RakerStore, config: &RakerConfig) -> anyhow::Result<()> {
|
|
let (weed_tx, weed_rx) = mpsc::channel(128);
|
|
|
|
let seed_files = find_seed_files(config.seed_dir.clone(), WEED_EXTENSION).await?;
|
|
|
|
eprintln!("{:?}", seed_files);
|
|
|
|
tokio::spawn(async move {
|
|
seed_loader(seed_files, &weed_tx).await?;
|
|
|
|
Ok(()) as anyhow::Result<()>
|
|
});
|
|
|
|
let stats = importer(store, weed_rx, true).await?;
|
|
|
|
dark_red_ln!("=== Weeds Imported! ===");
|
|
red!("New domains: ");
|
|
yellow_ln!("{:?}", stats.new_domains);
|
|
println!();
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Number of seeds accumulated before a batch is flushed to the store in a
/// single read-write transaction.
const BATCH_SIZE: usize = 256;
|
|
|
|
/// Counters accumulated while importing a batch of seeds or weeds.
#[derive(Clone, Debug, Default)]
pub struct SeedImportStats {
    /// Domains that had no record in the store before this import.
    pub new_domains: u32,
    /// Sitemap URLs newly enqueued (discovered via robots.txt; seeds only).
    pub new_sitemaps: u32,
    /// Seed URLs newly enqueued.
    pub new_urls: u32,
    /// Seed URLs that were already present in the queue.
    pub already_present_urls: u32,
}
|
|
|
|
/// Task that imports seeds into the store
|
|
async fn importer(
|
|
store: RakerStore,
|
|
mut recv: Receiver<Seed>,
|
|
are_weeds: bool,
|
|
) -> anyhow::Result<SeedImportStats> {
|
|
let mut buf = Vec::with_capacity(BATCH_SIZE);
|
|
let mut stats = SeedImportStats::default();
|
|
let client = Client::new();
|
|
while let Some(seed) = recv.recv().await {
|
|
buf.push(seed);
|
|
|
|
if buf.len() == BATCH_SIZE {
|
|
import_and_flush_batch_seeds_or_weeds(
|
|
&store, &mut buf, &mut stats, &client, !are_weeds,
|
|
)
|
|
.await?;
|
|
}
|
|
}
|
|
import_and_flush_batch_seeds_or_weeds(&store, &mut buf, &mut stats, &client, !are_weeds)
|
|
.await?;
|
|
|
|
Ok(stats)
|
|
}
|
|
|
|
/// Drains `buf` and writes the whole batch to the store in one read-write
/// transaction, updating `stats` as it goes.
///
/// `is_seed` selects the polarity: `true` enqueues the URLs and marks their
/// path prefixes as rakeable (also discovering sitemaps via robots.txt);
/// `false` (weeds) only marks the prefixes as non-rakeable.
///
/// Errors abort the batch before `commit`, so a failed batch leaves the
/// store unchanged (assuming the transaction rolls back on drop —
/// NOTE(review): confirm against `RakerStore`'s transaction semantics).
async fn import_and_flush_batch_seeds_or_weeds(
    store: &RakerStore,
    buf: &mut Vec<Seed>,
    stats: &mut SeedImportStats,
    client: &Client,
    is_seed: bool,
) -> anyhow::Result<()> {
    let txn = store.rw_txn()?;
    // `drain(..)` empties the buffer but keeps its capacity for the next batch.
    for seed in buf.drain(..) {
        let as_url = Url::parse(seed.url.as_str())
            .with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
        let domain = get_reduced_domain(&as_url)
            .with_context(|| format!("No domain in seed URL '{as_url}'!"))?;

        // A missing domain record means this domain is new to the store; we
        // then start from a default record and mark it dirty so it gets written.
        let domain_record = txn.get_domain_record(domain.borrow())?;
        let is_domain_new = domain_record.is_none();
        let mut domain_record = domain_record.unwrap_or_default();
        if is_domain_new {
            stats.new_domains += 1;
        }
        // `dirty` tracks whether `domain_record` diverged from what is stored,
        // so we only write it back when something actually changed.
        let mut dirty = is_domain_new;

        // Register the domain. This is a no-op if it's already active or backing off.
        txn.insert_active_domain_with_new_raffle_ticket(domain.clone().into_owned())?;

        let url_like = match &seed.url {
            UrlOrUrlPattern::Url(url_str) => {
                // NOTE(review): this re-parses the same string as `as_url`
                // above; presumably `url` == `as_url` — confirm whether the
                // second parse could be elided.
                let url = Url::parse(url_str.as_str())?;
                if is_seed {
                    // `enqueue_url` returning true appears to mean the URL was
                    // newly added (vs. already present).
                    if txn.enqueue_url(url.as_str(), None, RakeIntent::Any)? {
                        stats.new_urls += 1;
                    } else {
                        stats.already_present_urls += 1;
                    }
                }

                // Seed/weed with empty prefix
                // (`insert` returns the previous value; the record is dirty
                // only if the stored flag differed from `is_seed`.)
                dirty |= domain_record
                    .rakeable_path_prefixes
                    .insert(String::new(), is_seed)
                    != Some(is_seed);

                url
            }
            UrlOrUrlPattern::UrlPrefix(prefix) => {
                let prefix_as_url = Url::parse(prefix.as_str())?;
                if is_seed {
                    // For a prefix pattern, the prefix itself is enqueued as a
                    // starting URL.
                    if txn.enqueue_url(prefix_as_url.as_str(), None, RakeIntent::Any)? {
                        stats.new_urls += 1;
                    } else {
                        stats.already_present_urls += 1;
                    }
                }

                // Seed/weed with the pattern's path as the prefix.
                dirty |= domain_record
                    .rakeable_path_prefixes
                    .insert(prefix_as_url.path().to_string(), is_seed)
                    != Some(is_seed);

                prefix_as_url
            }
        };

        if dirty {
            txn.put_domain_record(domain.borrow(), domain_record)?;
        }

        if is_seed {
            // look at robots.txt and discover sitemaps!
            if let Some(robots_txt) = get_robots_txt_for(&url_like, &client).await? {
                for sitemap in robots_txt.sitemaps {
                    // Only enqueue sitemaps with schemes the raker can fetch.
                    if SUPPORTED_SCHEMES.contains(&sitemap.url.scheme()) {
                        txn.enqueue_url(sitemap.url.as_str(), None, RakeIntent::SiteMap)?;
                        stats.new_sitemaps += 1;
                    }
                }
            }
        }
    }
    txn.commit()?;
    Ok(())
}
|