diff --git a/quickpeep_raker/src/raking/task.rs b/quickpeep_raker/src/raking/task.rs
index d784f2a..256ed27 100644
--- a/quickpeep_raker/src/raking/task.rs
+++ b/quickpeep_raker/src/raking/task.rs
@@ -4,7 +4,7 @@ use crate::raking::{
     get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeOutcome,
     Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason,
 };
-use crate::storage::records::{AllowedDomainRecord, UrlVisitedRecord};
+use crate::storage::records::{AllowedDomainRecord, UrlVisitedRecord, WeedDomainRecord};
 use crate::storage::{RakerStore, RandomActiveDomainAcquisition};
 use anyhow::{anyhow, ensure, Context};
 use chrono::Utc;
@@ -469,38 +469,28 @@ impl EventProcessor<'_> {
                 for reference in refs.references {
                     let ref_url = Url::parse(&reference.target)?;
                     let domain = get_reduced_domain(&ref_url)?;
-                    let allowed = match txn.get_allowed_domain_record(domain.borrow())? {
-                        None => false,
-                        Some(AllowedDomainRecord {
-                            restricted_prefixes,
-                        }) => {
-                            if restricted_prefixes.is_empty() {
-                                true
-                            } else {
-                                let mut allowed = false;
-                                for prefix in restricted_prefixes.iter() {
-                                    if ref_url.path().starts_with(prefix) {
-                                        allowed = true;
-                                        break;
-                                    }
-                                    if prefix.as_str() > ref_url.path() {
-                                        // e.g. /dog > /cat/xyz
-                                        // This means we've missed all chances to see our prefix,
-                                        // so we break here (efficiency).
-                                        break;
-                                    }
-                                }
-                                allowed
-                            }
-                        }
-                    };
+
+                    // First check if this URL is an allowed URL (hence should be enqueued)
+                    let allowed = txn
+                        .get_allowed_domain_record(domain.borrow())?
+                        .map(|record: AllowedDomainRecord| record.applies_to_url(&ref_url))
+                        .unwrap_or(false);
                     if allowed {
                         txn.enqueue_url(
                             &reference.target,
                             reference.last_mod,
                             reference.kind.into(),
                         )?;
-                    } else {
+                        continue;
+                    }
+
+                    // Then check if this URL is a weed (hence should be ignored)
+                    let is_weed = txn
+                        .get_weed_domain_record(domain.borrow())?
+                        .map(|record: WeedDomainRecord| record.applies_to_url(&ref_url))
+                        .unwrap_or(false);
+                    if !is_weed {
+                        // It's neither allowed nor weeded, so put it on hold for later inspection
                         txn.put_url_on_hold(&reference.target, reference.kind.into())?;
                     }
                 }
diff --git a/quickpeep_raker/src/storage/records.rs b/quickpeep_raker/src/storage/records.rs
index 9861d40..bdc2e94 100644
--- a/quickpeep_raker/src/storage/records.rs
+++ b/quickpeep_raker/src/storage/records.rs
@@ -1,4 +1,5 @@
 use crate::raking::{RakeIntent, TemporaryFailure};
+use reqwest::Url;
 use serde::{Deserialize, Serialize};
 use std::collections::BTreeSet;
 
@@ -52,9 +53,61 @@ pub struct AllowedDomainRecord {
     pub restricted_prefixes: BTreeSet<String>,
 }
 
+impl AllowedDomainRecord {
+    /// Returns true iff this record applies to this URL.
+    ///
+    /// Preconditions: it has been checked that the record applies to the domain
+    pub fn applies_to_url(&self, url: &Url) -> bool {
+        if self.restricted_prefixes.is_empty() {
+            return true;
+        }
+
+        let mut applies = false;
+        for prefix in self.restricted_prefixes.iter() {
+            if url.path().starts_with(prefix) {
+                applies = true;
+                break;
+            }
+            if prefix.as_str() > url.path() {
+                // e.g. /dog > /cat/xyz
+                // This means we've missed all chances to see our prefix,
+                // so we break here (efficiency).
+                break;
+            }
+        }
+        applies
+    }
+}
+
 #[derive(Clone, Debug, Serialize, Deserialize, Default)]
 pub struct WeedDomainRecord {
     /// Set of weedy path prefixes.
     /// Empty if ALL path prefixes are weedy.
     pub restricted_prefixes: BTreeSet<String>,
 }
+
+impl WeedDomainRecord {
+    /// Returns true iff this record applies to this URL.
+    ///
+    /// Preconditions: it has been checked that the record applies to the domain
+    pub fn applies_to_url(&self, url: &Url) -> bool {
+        if self.restricted_prefixes.is_empty() {
+            return true;
+        }
+
+        let mut applies = false;
+        for prefix in self.restricted_prefixes.iter() {
+            if url.path().starts_with(prefix) {
+                applies = true;
+                break;
+            }
+            if prefix.as_str() > url.path() {
+                // e.g. /dog > /cat/xyz
+                // This means we've missed all chances to see our prefix,
+                // so we break here (efficiency).
+                break;
+            }
+        }
+        applies
+    }
+}