From ff514e90b8cb22ca3ceb586d1410ae1fc1f83c6e Mon Sep 17 00:00:00 2001
From: Olivier 'reivilibre
Date: Fri, 31 Mar 2023 22:50:02 +0100
Subject: [PATCH] Simplify allowed_/weed_domains

---
 quickpeep_raker/src/bin/qp-raker-db.rs     |  18 +---
 quickpeep_raker/src/bin/qp-seedrake.rs     | 118 +++++++--------------
 quickpeep_raker/src/raking/task.rs         |  48 ++++-----
 quickpeep_raker/src/storage.rs             |  68 +++---------
 quickpeep_raker/src/storage/maintenance.rs |  39 +++----
 quickpeep_raker/src/storage/records.rs     |  61 ++---------
 quickpeep_utils/src/dirty.rs               |  40 -------
 quickpeep_utils/src/lib.rs                 |   1 -
 8 files changed, 110 insertions(+), 283 deletions(-)
 delete mode 100644 quickpeep_utils/src/dirty.rs

diff --git a/quickpeep_raker/src/bin/qp-raker-db.rs b/quickpeep_raker/src/bin/qp-raker-db.rs
index 79a12fe..b23c352 100644
--- a/quickpeep_raker/src/bin/qp-raker-db.rs
+++ b/quickpeep_raker/src/bin/qp-raker-db.rs
@@ -16,8 +16,8 @@ use quickpeep_raker::config;
 use quickpeep_raker::storage::mdbx_helper_types::MdbxBare;
 use quickpeep_raker::storage::records::{
-    ActiveDomainRecord, AllowedDomainRecord, BackingOffDomainRecord, OnHoldUrlRecord,
-    QueueUrlRecord, UrlVisitedRecord, WeedDomainRecord,
+    ActiveDomainRecord, BackingOffDomainRecord, DomainRecord, OnHoldUrlRecord, QueueUrlRecord,
+    UrlVisitedRecord,
 };
 use quickpeep_raker::storage::{RakerStore, RakerTxn};
 
@@ -111,11 +111,11 @@ pub async fn main() -> anyhow::Result<()> {
                 &txn,
             )?;
         }
-        "allowed_domains" => {
-            inspect::<MdbxBare<AllowedDomainRecord>>(
+        "domains" => {
+            inspect::<MdbxBare<DomainRecord>>(
                 opts.key_name.as_ref(),
                 opts.prefix,
-                &txn.mdbx.borrow_dbs().allowed_domains,
+                &txn.mdbx.borrow_dbs().domains,
                 &txn,
             )?;
         }
@@ -127,14 +127,6 @@ pub async fn main() -> anyhow::Result<()> {
                 &txn,
             )?;
         }
-        "weed_domains" => {
-            inspect::<MdbxBare<WeedDomainRecord>>(
-                opts.key_name.as_ref(),
-                opts.prefix,
-                &txn.mdbx.borrow_dbs().weed_domains,
-                &txn,
-            )?;
-        }
         other => {
             dark_yellow_ln!("Unknown database {:?}", other);
         }
diff --git a/quickpeep_raker/src/bin/qp-seedrake.rs b/quickpeep_raker/src/bin/qp-seedrake.rs
index 570eb6a..c586575 100644
--- a/quickpeep_raker/src/bin/qp-seedrake.rs
+++ b/quickpeep_raker/src/bin/qp-seedrake.rs
@@ -1,5 +1,5 @@
 use clap::Parser;
-use std::borrow::{Borrow, BorrowMut};
+use std::borrow::Borrow;
 
 use env_logger::Env;
 
@@ -14,12 +14,10 @@ use tokio::sync::mpsc::Receiver;
 use quickpeep_raker::config::RakerConfig;
 use quickpeep_raker::raking::references::SUPPORTED_SCHEMES;
 use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
-use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord};
 use quickpeep_raker::storage::{maintenance, RakerStore};
 use quickpeep_seed_parser::loader::{
     find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION, WEED_EXTENSION,
 };
-use quickpeep_utils::dirty::DirtyTracker;
 use quickpeep_utils::urls::get_reduced_domain;
 
 /// Seeds a raker's queue with URLs
@@ -144,27 +142,24 @@ async fn importer(
         buf.push(seed);
         if buf.len() == BATCH_SIZE {
-            if are_weeds {
-                import_and_flush_batch_weeds(&store, &mut buf, &mut stats).await?;
-            } else {
-                import_and_flush_batch_seeds(&store, &mut buf, &mut stats, &client).await?;
-            }
+            import_and_flush_batch_seeds_or_weeds(
+                &store, &mut buf, &mut stats, &client, !are_weeds,
+            )
+            .await?;
         }
     }
 
-    if are_weeds {
-        import_and_flush_batch_weeds(&store, &mut buf, &mut stats).await?;
-    } else {
-        import_and_flush_batch_seeds(&store, &mut buf, &mut stats, &client).await?;
-    }
+    import_and_flush_batch_seeds_or_weeds(&store, &mut buf, &mut stats, &client, !are_weeds)
+        .await?;
 
     Ok(stats)
 }
 
-async fn import_and_flush_batch_seeds(
+async fn import_and_flush_batch_seeds_or_weeds(
     store: &RakerStore,
     buf: &mut Vec<Seed>,
     stats: &mut SeedImportStats,
     client: &Client,
+    is_seed: bool,
 ) -> anyhow::Result<()> {
     let txn = store.rw_txn()?;
     for seed in buf.drain(..) {
@@ -173,20 +168,13 @@ async fn import_and_flush_batch_seeds(
         let domain = get_reduced_domain(&as_url)
             .with_context(|| format!("No domain in seed URL '{as_url}'!"))?;
 
-        let allowed_domain_record = txn.get_allowed_domain_record(domain.borrow())?;
-
-        let is_domain_new = allowed_domain_record.is_none();
+        let domain_record = txn.get_domain_record(domain.borrow())?;
+        let is_domain_new = domain_record.is_none();
+        let mut domain_record = domain_record.unwrap_or_default();
         if is_domain_new {
            stats.new_domains += 1;
         }
-
-        let mut allowed_domain_record = DirtyTracker::new(
-            allowed_domain_record.unwrap_or_else(|| AllowedDomainRecord::default()),
-        );
-        if is_domain_new {
-            // Mark it as dirty
-            let _: &mut AllowedDomainRecord = allowed_domain_record.borrow_mut();
-        }
+        let mut dirty = is_domain_new;
 
         // Register the domain. This is a no-op if it's already active or backing off.
         txn.insert_active_domain_with_new_raffle_ticket(domain.clone().into_owned())?;
@@ -194,36 +182,46 @@ async fn import_and_flush_batch_seeds(
         let url_like = match &seed.url {
             UrlOrUrlPattern::Url(url_str) => {
                 let url = Url::parse(url_str.as_str())?;
-                if txn.enqueue_url(url.as_str(), None, RakeIntent::Any)? {
-                    stats.new_urls += 1;
-                } else {
-                    stats.already_present_urls += 1;
+                if is_seed {
+                    if txn.enqueue_url(url.as_str(), None, RakeIntent::Any)? {
+                        stats.new_urls += 1;
+                    } else {
+                        stats.already_present_urls += 1;
+                    }
                 }
+
+                // Seed/weed with empty prefix
+                dirty |= domain_record
+                    .rakeable_path_prefixes
+                    .insert(String::new(), is_seed)
+                    != Some(is_seed);
+
                 url
             }
             UrlOrUrlPattern::UrlPrefix(prefix) => {
                 let prefix_as_url = Url::parse(prefix.as_str())?;
-                if txn.enqueue_url(prefix_as_url.as_str(), None, RakeIntent::Any)? {
-                    stats.new_urls += 1;
-                } else {
-                    stats.already_present_urls += 1;
-                }
-                if is_domain_new {
-                    let allowed_domain_record: &mut AllowedDomainRecord =
-                        allowed_domain_record.borrow_mut();
-                    allowed_domain_record
-                        .restricted_prefixes
-                        .insert(prefix_as_url.path().to_string());
+                if is_seed {
+                    if txn.enqueue_url(prefix_as_url.as_str(), None, RakeIntent::Any)? {
+                        stats.new_urls += 1;
+                    } else {
+                        stats.already_present_urls += 1;
+                    }
                 }
+
+                dirty |= domain_record
+                    .rakeable_path_prefixes
+                    .insert(prefix_as_url.path().to_string(), is_seed)
+                    != Some(is_seed);
+
                 prefix_as_url
             }
         };
 
-        if allowed_domain_record.is_dirty() {
-            txn.put_allowed_domain_record(domain.borrow(), allowed_domain_record.into_inner())?;
+        if dirty {
+            txn.put_domain_record(domain.borrow(), domain_record)?;
         }
 
-        if is_domain_new {
+        if is_seed {
             // look at robots.txt and discover sitemaps!
             if let Some(robots_txt) = get_robots_txt_for(&url_like, &client).await? {
                 for sitemap in robots_txt.sitemaps {
@@ -238,37 +236,3 @@ async fn import_and_flush_batch_seeds(
     txn.commit()?;
     Ok(())
 }
-
-async fn import_and_flush_batch_weeds(
-    store: &RakerStore,
-    buf: &mut Vec<Seed>,
-    stats: &mut SeedImportStats,
-) -> anyhow::Result<()> {
-    let txn = store.rw_txn()?;
-    for seed in buf.drain(..) {
-        let as_url = Url::parse(seed.url.as_str())
-            .with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
-        let domain = get_reduced_domain(&as_url)
-            .with_context(|| format!("No domain in weed URL '{as_url}'!"))?;
-
-        let weed_domain_record = txn.get_weed_domain_record(domain.borrow())?;
-
-        let is_domain_new = weed_domain_record.is_none();
-        if is_domain_new {
-            stats.new_domains += 1;
-        }
-
-        let mut weed_domain_record =
-            DirtyTracker::new(weed_domain_record.unwrap_or_else(|| WeedDomainRecord::default()));
-        if is_domain_new {
-            // Mark it as dirty
-            let _: &mut WeedDomainRecord = weed_domain_record.borrow_mut();
-        }
-
-        if weed_domain_record.is_dirty() {
-            txn.put_weed_domain_record(domain.borrow(), weed_domain_record.into_inner())?;
-        }
-    }
-    txn.commit()?;
-    Ok(())
-}
diff --git a/quickpeep_raker/src/raking/task.rs b/quickpeep_raker/src/raking/task.rs
index 8bbddb0..9bf1ec8 100644
--- a/quickpeep_raker/src/raking/task.rs
+++ b/quickpeep_raker/src/raking/task.rs
@@ -4,7 +4,7 @@ use crate::raking::{
     get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent,
     RakeOutcome, Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason,
 };
-use crate::storage::records::{AllowedDomainRecord, UrlVisitedRecord, WeedDomainRecord};
+use crate::storage::records::{DomainRecord, UrlVisitedRecord};
 use crate::storage::RakerStore;
 use anyhow::{anyhow, Context};
 use chrono::Utc;
@@ -564,31 +564,31 @@ impl EventProcessor<'_> {
                     format!("failed to reduce domain: {:?}", reference.target)
                 })?;
 
-                // First check if this URL is an allowed URL (hence should be enqueued)
+                // Check if this URL is an allowed URL (hence should be enqueued)
                 let allowed = txn
-                    .get_allowed_domain_record(domain.borrow())?
-                    .map(|record: AllowedDomainRecord| record.applies_to_url(&ref_url))
-                    .unwrap_or(false);
-                if allowed {
-                    let is_fresh = txn.enqueue_url(
-                        &reference.target,
-                        reference.last_mod,
-                        reference.kind.into(),
-                    )?;
-                    if is_fresh {
-                        increment_counter!("qprake_queue_new_url");
-                    }
-                    continue;
-                }
+                    .get_domain_record(domain.borrow())?
+                    .map(|record: DomainRecord| record.is_url_rakeable(&ref_url))
+                    .flatten();
 
-                // Then check if this URL is a weed (hence should be ignored)
-                let is_weed = txn
-                    .get_weed_domain_record(domain.borrow())?
-                    .map(|record: WeedDomainRecord| record.applies_to_url(&ref_url))
-                    .unwrap_or(false);
-                if !is_weed {
-                    // It's neither allowed nor weeded, so put it on hold for later inspection
-                    txn.put_url_on_hold(&reference.target, reference.kind.into())?;
+                match allowed {
+                    Some(true) => {
+                        let is_fresh = txn.enqueue_url(
+                            &reference.target,
+                            reference.last_mod,
+                            reference.kind.into(),
+                        )?;
+                        if is_fresh {
+                            increment_counter!("qprake_queue_new_url");
+                        }
+                        continue;
+                    }
+                    Some(false) => {
+                        // Weed! Do nothing.
+                    }
+                    None => {
+                        // It's neither allowed nor weeded, so put it on hold for later inspection
+                        txn.put_url_on_hold(&reference.target, reference.kind.into())?;
+                    }
                 }
             }
diff --git a/quickpeep_raker/src/storage.rs b/quickpeep_raker/src/storage.rs
index ddf057a..71d0693 100644
--- a/quickpeep_raker/src/storage.rs
+++ b/quickpeep_raker/src/storage.rs
@@ -2,8 +2,8 @@ use crate::raking::{RakeIntent, TemporaryFailure};
 use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString, MdbxU16BE, MdbxU32, MdbxU64};
 use crate::storage::migrations::{MIGRATION_KEY, MIGRATION_VERSION};
 use crate::storage::records::{
-    ActiveDomainRecord, AllowedDomainRecord, BackingOffDomainRecord, OnHoldUrlRecord,
-    QueueUrlRecord, UrlVisitedRecord, WeedDomainRecord,
+    ActiveDomainRecord, BackingOffDomainRecord, DomainRecord, OnHoldUrlRecord, QueueUrlRecord,
+    UrlVisitedRecord,
 };
 use anyhow::{anyhow, bail, ensure, Context};
 use libmdbx::{
@@ -45,12 +45,10 @@ pub struct Databases<'env> {
     pub backing_off_domains: Database<'env>,
     /// URL → VisitedDomainRecord
     pub visited_urls: Database<'env>,
-    /// Domain → AllowedDomainRecord
-    pub allowed_domains: Database<'env>,
+    /// Domain → DomainRecord
+    pub domains: Database<'env>,
     /// Domain \n URL → OnHoldUrlRecord Number of refs (INT VALUE)
     pub urls_on_hold: Database<'env>,
-    /// Domain → WeedDomainRecord
-    pub weed_domains: Database<'env>,
 }
 
 impl<'env> Databases<'env> {
@@ -66,16 +64,15 @@ impl<'env> Databases<'env> {
             ),
             ("backing_off_domains", &self.backing_off_domains),
             ("visited_urls", &self.visited_urls),
-            ("allowed_domains", &self.allowed_domains),
+            ("domains", &self.domains),
             ("urls_on_hold", &self.urls_on_hold),
-            ("weed_domains", &self.weed_domains),
         ]
         .into_iter()
     }
 }
 
 // Must match the order of the Databases struct fields.
-pub const DATABASES: [(&'static str, DatabaseFlags); 10] = [
+pub const DATABASES: [(&'static str, DatabaseFlags); 9] = [
     ("urls_queue", DatabaseFlags::empty()),
     ("rerake_queue", DatabaseFlags::DUP_SORT),
     ("active_domains", DatabaseFlags::empty()),
@@ -86,9 +83,8 @@ pub const DATABASES: [(&'static str, DatabaseFlags); 10] = [
     ),
     ("backing_off_domains", DatabaseFlags::empty()),
     ("urls_visited", DatabaseFlags::empty()),
-    ("allowed_domains", DatabaseFlags::empty()),
+    ("domains", DatabaseFlags::empty()),
     ("urls_on_hold", DatabaseFlags::empty()),
-    ("weed_domains", DatabaseFlags::empty()),
 ];
 
 #[self_referencing]
@@ -187,9 +183,8 @@ impl RakerStore {
                     backing_off_reinstatements: dbs.next().unwrap(),
                     backing_off_domains: dbs.next().unwrap(),
                     visited_urls: dbs.next().unwrap(),
-                    allowed_domains: dbs.next().unwrap(),
+                    domains: dbs.next().unwrap(),
                     urls_on_hold: dbs.next().unwrap(),
-                    weed_domains: dbs.next().unwrap(),
                 }
             },
         }
@@ -603,33 +598,17 @@ impl<'a> RakerTxn<'a, RW> {
         Ok(is_new)
     }
-    pub fn put_allowed_domain_record(
+    pub fn put_domain_record(
         &self,
         domain: &str,
-        allowed_domain_record: AllowedDomainRecord,
+        domain_record: DomainRecord,
     ) -> anyhow::Result<()> {
-        let allowed_domains = &self.mdbx.borrow_dbs().allowed_domains;
+        let domains = &self.mdbx.borrow_dbs().domains;
 
         self.mdbx_txn.put(
-            allowed_domains,
+            domains,
             domain.as_bytes(),
-            MdbxBare(allowed_domain_record).as_bytes(),
-            WriteFlags::empty(),
-        )?;
-        Ok(())
-    }
-
-    pub fn put_weed_domain_record(
-        &self,
-        domain: &str,
-        weed_domain_record: WeedDomainRecord,
-    ) -> anyhow::Result<()> {
-        let weed_domains = &self.mdbx.borrow_dbs().weed_domains;
-
-        self.mdbx_txn.put(
-            weed_domains,
-            domain.as_bytes(),
-            MdbxBare(weed_domain_record).as_bytes(),
+            MdbxBare(domain_record).as_bytes(),
             WriteFlags::empty(),
         )?;
         Ok(())
     }
@@ -779,27 +758,12 @@ impl<'a, K: TransactionKind> RakerTxn<'a, K> {
         }
     }
 
-    pub fn get_allowed_domain_record(
-        &self,
-        domain: &str,
-    ) -> anyhow::Result<Option<AllowedDomainRecord>> {
-        let allowed_domains = &self.mdbx.borrow_dbs().allowed_domains;
+    pub fn get_domain_record(&self, domain: &str) -> anyhow::Result<Option<DomainRecord>> {
+        let domains = &self.mdbx.borrow_dbs().domains;
 
         match self
             .mdbx_txn
-            .get::<MdbxBare<AllowedDomainRecord>>(allowed_domains, domain.as_bytes())?
-        {
-            None => Ok(None),
-            Some(MdbxBare(record)) => Ok(Some(record)),
-        }
-    }
-
-    pub fn get_weed_domain_record(&self, domain: &str) -> anyhow::Result<Option<WeedDomainRecord>> {
-        let weed_domains = &self.mdbx.borrow_dbs().weed_domains;
-
-        match self
-            .mdbx_txn
-            .get::<MdbxBare<WeedDomainRecord>>(weed_domains, domain.as_bytes())?
+            .get::<MdbxBare<DomainRecord>>(domains, domain.as_bytes())?
         {
             None => Ok(None),
             Some(MdbxBare(record)) => Ok(Some(record)),
diff --git a/quickpeep_raker/src/storage/maintenance.rs b/quickpeep_raker/src/storage/maintenance.rs
index f48f4d6..f0f1dd8 100644
--- a/quickpeep_raker/src/storage/maintenance.rs
+++ b/quickpeep_raker/src/storage/maintenance.rs
@@ -1,9 +1,8 @@
 use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString};
-use crate::storage::records::{AllowedDomainRecord, OnHoldUrlRecord, WeedDomainRecord};
+use crate::storage::records::{DomainRecord, OnHoldUrlRecord};
 use crate::storage::RakerTxn;
 use anyhow::Context;
 use libmdbx::{Database, WriteFlags, RW};
-use log::warn;
 use reqwest::Url;
 
 /// Runs one big transaction that:
@@ -16,8 +15,7 @@
 pub fn reapply_seeds_and_weeds_to_on_hold_urls(txn: RakerTxn<RW>) -> anyhow::Result<()> {
     struct DomainState {
         pub domain: String,
-        pub allowed_domain_record: Option<AllowedDomainRecord>,
-        pub weed_domain_record: Option<WeedDomainRecord>,
+        pub domain_record: Option<DomainRecord>,
     }
 
     let urls_on_hold: &Database = &txn.mdbx.borrow_dbs().urls_on_hold;
@@ -47,44 +45,33 @@ pub fn reapply_seeds_and_weeds_to_on_hold_urls(txn: RakerTxn<RW>) -> anyhow::Res
             // Then load the relevant records for it.
             domain_state = Some(DomainState {
                 domain: domain.to_owned(),
-                allowed_domain_record: txn.get_allowed_domain_record(domain)?,
-                weed_domain_record: txn.get_weed_domain_record(domain)?,
+                domain_record: txn.get_domain_record(domain)?,
             });
         }
 
         let url = Url::parse(url_str)?;
 
         let domain_state = domain_state.as_ref().unwrap();
-        let is_allowed = domain_state
-            .allowed_domain_record
-            .as_ref()
-            .map(|adr: &AllowedDomainRecord| adr.applies_to_url(&url))
-            .unwrap_or(false);
-        let is_weed = domain_state
-            .weed_domain_record
-            .as_ref()
-            .map(|wdr: &WeedDomainRecord| wdr.applies_to_url(&url))
-            .unwrap_or(false);
-        match (is_allowed, is_weed) {
-            (false, false) => { /* nop */ }
-            (true, true) => {
-                warn!(
-                    "Ambiguous: {:?} is both mentioned by a seed and a weed. Ignoring.",
-                    url
-                );
-            }
-            (true, false) => {
+        let is_rakeable = domain_state
+            .domain_record
+            .as_ref()
+            .map(|dr: &DomainRecord| dr.is_url_rakeable(&url))
+            .flatten();
+
+        match is_rakeable {
+            Some(true) => {
                 // ALLOWED
                 // Make it a queued URL
                 txn.enqueue_url(url_str, None, record.queue_record.intent)?;
                 cur.del(WriteFlags::empty())?;
             }
-            (false, true) => {
+            Some(false) => {
                 // WEED
                 // Just delete
                 cur.del(WriteFlags::empty())?;
             }
+            None => { /* nop: neither allowed nor a weed. Keep on hold. */ }
         }
     }
 
diff --git a/quickpeep_raker/src/storage/records.rs b/quickpeep_raker/src/storage/records.rs
index 4e8592c..e4c7f79 100644
--- a/quickpeep_raker/src/storage/records.rs
+++ b/quickpeep_raker/src/storage/records.rs
@@ -1,7 +1,7 @@
 use crate::raking::{RakeIntent, TemporaryFailure};
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
-use std::collections::BTreeSet;
+use std::collections::BTreeMap;
 
 #[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
 pub struct ActiveDomainRecord {
@@ -46,26 +46,20 @@ pub struct BackingOffDomainRecord {
 }
 
 #[derive(Clone, Debug, Serialize, Deserialize, Default)]
-pub struct AllowedDomainRecord {
-    /// Set of acceptable path prefixes.
-    /// Empty if ALL path prefixes are permitted.
-    pub restricted_prefixes: BTreeSet<String>,
+pub struct DomainRecord {
+    pub rakeable_path_prefixes: BTreeMap<String, bool>,
 }
 
-impl AllowedDomainRecord {
-    /// Returns true iff this record applies to this URL.
+impl DomainRecord {
+    /// Returns whether the URL is rakeable.
     ///
     /// Preconditions: it has been checked that the record applies to the domain
-    pub fn applies_to_url(&self, url: &Url) -> bool {
-        if self.restricted_prefixes.is_empty() {
-            return true;
-        }
-
-        let mut applies = false;
-        for prefix in self.restricted_prefixes.iter() {
+    pub fn is_url_rakeable(&self, url: &Url) -> Option<bool> {
+        let mut final_result = None;
+        // TODO This could be made more efficient.
+        for (prefix, &rakeable) in self.rakeable_path_prefixes.iter() {
             if url.path().starts_with(prefix) {
-                applies = true;
-                break;
+                final_result = Some(rakeable);
             }
             if prefix.as_str() > url.path() {
                 // e.g. /dog > /cat/xyz
@@ -74,39 +68,6 @@ impl AllowedDomainRecord {
                 break;
             }
         }
-        applies
-    }
-}
-
-#[derive(Clone, Debug, Serialize, Deserialize, Default)]
-pub struct WeedDomainRecord {
-    /// Set of weedy path prefixes.
-    /// Empty if ALL path prefixes are weedy.
-    pub restricted_prefixes: BTreeSet<String>,
-}
-
-impl WeedDomainRecord {
-    /// Returns true iff this record applies to this URL.
-    ///
-    /// Preconditions: it has been checked that the record applies to the domain
-    pub fn applies_to_url(&self, url: &Url) -> bool {
-        if self.restricted_prefixes.is_empty() {
-            return true;
-        }
-
-        let mut applies = false;
-        for prefix in self.restricted_prefixes.iter() {
-            if url.path().starts_with(prefix) {
-                applies = true;
-                break;
-            }
-            if prefix.as_str() > url.path() {
-                // e.g. /dog > /cat/xyz
-                // This means we've missed all chances to see our prefix,
-                // so we break here (efficiency).
-                break;
-            }
-        }
-        applies
+        final_result
     }
 }
diff --git a/quickpeep_utils/src/dirty.rs b/quickpeep_utils/src/dirty.rs
deleted file mode 100644
index 03bec03..0000000
--- a/quickpeep_utils/src/dirty.rs
+++ /dev/null
@@ -1,40 +0,0 @@
-use std::borrow::{Borrow, BorrowMut};
-
-pub struct DirtyTracker<T> {
-    inner: T,
-    dirty: bool,
-}
-
-impl<T> Borrow<T> for DirtyTracker<T> {
-    fn borrow(&self) -> &T {
-        &self.inner
-    }
-}
-
-impl<T> BorrowMut<T> for DirtyTracker<T> {
-    fn borrow_mut(&mut self) -> &mut T {
-        self.dirty = true;
-        &mut self.inner
-    }
-}
-
-impl<T> DirtyTracker<T> {
-    pub fn new(inner: T) -> DirtyTracker<T> {
-        DirtyTracker {
-            inner,
-            dirty: false,
-        }
-    }
-
-    pub fn is_dirty(&self) -> bool {
-        self.dirty
-    }
-
-    pub fn make_clean(&mut self) {
-        self.dirty = false;
-    }
-
-    pub fn into_inner(self) -> T {
-        self.inner
-    }
-}
diff --git a/quickpeep_utils/src/lib.rs b/quickpeep_utils/src/lib.rs
index 3e80b73..d27b9cd 100644
--- a/quickpeep_utils/src/lib.rs
+++ b/quickpeep_utils/src/lib.rs
@@ -1,4 +1,3 @@
 pub mod dates;
-pub mod dirty;
 pub mod lazy;
 pub mod urls;
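
For illustration only (not part of the commit): the patch folds AllowedDomainRecord and WeedDomainRecord into a single DomainRecord whose rakeable_path_prefixes maps each path prefix to true (seeded) or false (weeded), and is_url_rakeable returns Option<bool> so callers can tell "enqueue" from "weed" from "unknown, put on hold". The self-contained sketch below mirrors that lookup rule under assumed simplifications: it uses its own names (is_path_rakeable, main) and a plain path string instead of a parsed reqwest::Url.

use std::collections::BTreeMap;

/// Sketch of the merged record: path prefix -> rakeable? (true = seed, false = weed).
#[derive(Default)]
struct DomainRecord {
    rakeable_path_prefixes: BTreeMap<String, bool>,
}

impl DomainRecord {
    /// Some(true): enqueue, Some(false): drop as a weed, None: put the URL on hold.
    fn is_path_rakeable(&self, path: &str) -> Option<bool> {
        let mut result = None;
        for (prefix, &rakeable) in self.rakeable_path_prefixes.iter() {
            if path.starts_with(prefix) {
                // Keep scanning: a longer (more specific) prefix sorts later and overrides.
                result = Some(rakeable);
            }
            if prefix.as_str() > path {
                // BTreeMap iterates in sorted order, so no later prefix can match.
                break;
            }
        }
        result
    }
}

fn main() {
    let mut record = DomainRecord::default();
    // A seed with the empty prefix covers the whole domain...
    record.rakeable_path_prefixes.insert(String::new(), true);
    // ...while a weed entry carves out an exception under /ads/.
    record.rakeable_path_prefixes.insert("/ads/".to_string(), false);

    assert_eq!(record.is_path_rakeable("/blog/post"), Some(true));
    assert_eq!(record.is_path_rakeable("/ads/banner"), Some(false));
    // No prefixes at all: neither seeded nor weeded, so the raker holds the URL.
    assert_eq!(DomainRecord::default().is_path_rakeable("/x"), None);
}

The empty-string prefix is what qp-seedrake inserts for a plain seed URL, which is why a bare seed makes the whole domain rakeable unless a more specific weed prefix overrides it.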