Simplify allowed_/weed_domains

rei/rakerstore_postgres_overhaul
Olivier 'reivilibre' 2023-03-31 22:50:02 +01:00
parent 1c10cb203a
commit ff514e90b8
8 changed files with 110 additions and 283 deletions

View File

@ -16,8 +16,8 @@ use quickpeep_raker::config;
use quickpeep_raker::storage::mdbx_helper_types::MdbxBare; use quickpeep_raker::storage::mdbx_helper_types::MdbxBare;
use quickpeep_raker::storage::records::{ use quickpeep_raker::storage::records::{
ActiveDomainRecord, AllowedDomainRecord, BackingOffDomainRecord, OnHoldUrlRecord, ActiveDomainRecord, BackingOffDomainRecord, DomainRecord, OnHoldUrlRecord, QueueUrlRecord,
QueueUrlRecord, UrlVisitedRecord, WeedDomainRecord, UrlVisitedRecord,
}; };
use quickpeep_raker::storage::{RakerStore, RakerTxn}; use quickpeep_raker::storage::{RakerStore, RakerTxn};
@ -111,11 +111,11 @@ pub async fn main() -> anyhow::Result<()> {
&txn, &txn,
)?; )?;
} }
"allowed_domains" => { "domains" => {
inspect::<MdbxBare<AllowedDomainRecord>>( inspect::<MdbxBare<DomainRecord>>(
opts.key_name.as_ref(), opts.key_name.as_ref(),
opts.prefix, opts.prefix,
&txn.mdbx.borrow_dbs().allowed_domains, &txn.mdbx.borrow_dbs().domains,
&txn, &txn,
)?; )?;
} }
@ -127,14 +127,6 @@ pub async fn main() -> anyhow::Result<()> {
&txn, &txn,
)?; )?;
} }
"weed_domains" => {
inspect::<MdbxBare<WeedDomainRecord>>(
opts.key_name.as_ref(),
opts.prefix,
&txn.mdbx.borrow_dbs().weed_domains,
&txn,
)?;
}
other => { other => {
dark_yellow_ln!("Unknown database {:?}", other); dark_yellow_ln!("Unknown database {:?}", other);
} }

View File

@ -1,5 +1,5 @@
use clap::Parser; use clap::Parser;
use std::borrow::{Borrow, BorrowMut}; use std::borrow::Borrow;
use env_logger::Env; use env_logger::Env;
@ -14,12 +14,10 @@ use tokio::sync::mpsc::Receiver;
use quickpeep_raker::config::RakerConfig; use quickpeep_raker::config::RakerConfig;
use quickpeep_raker::raking::references::SUPPORTED_SCHEMES; use quickpeep_raker::raking::references::SUPPORTED_SCHEMES;
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent}; use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord};
use quickpeep_raker::storage::{maintenance, RakerStore}; use quickpeep_raker::storage::{maintenance, RakerStore};
use quickpeep_seed_parser::loader::{ use quickpeep_seed_parser::loader::{
find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION, WEED_EXTENSION, find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION, WEED_EXTENSION,
}; };
use quickpeep_utils::dirty::DirtyTracker;
use quickpeep_utils::urls::get_reduced_domain; use quickpeep_utils::urls::get_reduced_domain;
/// Seeds a raker's queue with URLs /// Seeds a raker's queue with URLs
@ -144,27 +142,24 @@ async fn importer(
buf.push(seed); buf.push(seed);
if buf.len() == BATCH_SIZE { if buf.len() == BATCH_SIZE {
if are_weeds { import_and_flush_batch_seeds_or_weeds(
import_and_flush_batch_weeds(&store, &mut buf, &mut stats).await?; &store, &mut buf, &mut stats, &client, !are_weeds,
} else { )
import_and_flush_batch_seeds(&store, &mut buf, &mut stats, &client).await?; .await?;
}
} }
} }
if are_weeds { import_and_flush_batch_seeds_or_weeds(&store, &mut buf, &mut stats, &client, !are_weeds)
import_and_flush_batch_weeds(&store, &mut buf, &mut stats).await?; .await?;
} else {
import_and_flush_batch_seeds(&store, &mut buf, &mut stats, &client).await?;
}
Ok(stats) Ok(stats)
} }
async fn import_and_flush_batch_seeds( async fn import_and_flush_batch_seeds_or_weeds(
store: &RakerStore, store: &RakerStore,
buf: &mut Vec<Seed>, buf: &mut Vec<Seed>,
stats: &mut SeedImportStats, stats: &mut SeedImportStats,
client: &Client, client: &Client,
is_seed: bool,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let txn = store.rw_txn()?; let txn = store.rw_txn()?;
for seed in buf.drain(..) { for seed in buf.drain(..) {
@ -173,20 +168,13 @@ async fn import_and_flush_batch_seeds(
let domain = get_reduced_domain(&as_url) let domain = get_reduced_domain(&as_url)
.with_context(|| format!("No domain in seed URL '{as_url}'!"))?; .with_context(|| format!("No domain in seed URL '{as_url}'!"))?;
let allowed_domain_record = txn.get_allowed_domain_record(domain.borrow())?; let domain_record = txn.get_domain_record(domain.borrow())?;
let is_domain_new = domain_record.is_none();
let is_domain_new = allowed_domain_record.is_none(); let mut domain_record = domain_record.unwrap_or_default();
if is_domain_new { if is_domain_new {
stats.new_domains += 1; stats.new_domains += 1;
} }
let mut dirty = is_domain_new;
let mut allowed_domain_record = DirtyTracker::new(
allowed_domain_record.unwrap_or_else(|| AllowedDomainRecord::default()),
);
if is_domain_new {
// Mark it as dirty
let _: &mut AllowedDomainRecord = allowed_domain_record.borrow_mut();
}
// Register the domain. This is a no-op if it's already active or backing off. // Register the domain. This is a no-op if it's already active or backing off.
txn.insert_active_domain_with_new_raffle_ticket(domain.clone().into_owned())?; txn.insert_active_domain_with_new_raffle_ticket(domain.clone().into_owned())?;
@ -194,36 +182,46 @@ async fn import_and_flush_batch_seeds(
let url_like = match &seed.url { let url_like = match &seed.url {
UrlOrUrlPattern::Url(url_str) => { UrlOrUrlPattern::Url(url_str) => {
let url = Url::parse(url_str.as_str())?; let url = Url::parse(url_str.as_str())?;
if txn.enqueue_url(url.as_str(), None, RakeIntent::Any)? { if is_seed {
stats.new_urls += 1; if txn.enqueue_url(url.as_str(), None, RakeIntent::Any)? {
} else { stats.new_urls += 1;
stats.already_present_urls += 1; } else {
stats.already_present_urls += 1;
}
} }
// Seed/weed with empty prefix
dirty |= domain_record
.rakeable_path_prefixes
.insert(String::new(), is_seed)
!= Some(is_seed);
url url
} }
UrlOrUrlPattern::UrlPrefix(prefix) => { UrlOrUrlPattern::UrlPrefix(prefix) => {
let prefix_as_url = Url::parse(prefix.as_str())?; let prefix_as_url = Url::parse(prefix.as_str())?;
if txn.enqueue_url(prefix_as_url.as_str(), None, RakeIntent::Any)? { if is_seed {
stats.new_urls += 1; if txn.enqueue_url(prefix_as_url.as_str(), None, RakeIntent::Any)? {
} else { stats.new_urls += 1;
stats.already_present_urls += 1; } else {
} stats.already_present_urls += 1;
if is_domain_new { }
let allowed_domain_record: &mut AllowedDomainRecord =
allowed_domain_record.borrow_mut();
allowed_domain_record
.restricted_prefixes
.insert(prefix_as_url.path().to_string());
} }
dirty |= domain_record
.rakeable_path_prefixes
.insert(prefix_as_url.path().to_string(), is_seed)
!= Some(is_seed);
prefix_as_url prefix_as_url
} }
}; };
if allowed_domain_record.is_dirty() { if dirty {
txn.put_allowed_domain_record(domain.borrow(), allowed_domain_record.into_inner())?; txn.put_domain_record(domain.borrow(), domain_record)?;
} }
if is_domain_new { if is_seed {
// look at robots.txt and discover sitemaps! // look at robots.txt and discover sitemaps!
if let Some(robots_txt) = get_robots_txt_for(&url_like, &client).await? { if let Some(robots_txt) = get_robots_txt_for(&url_like, &client).await? {
for sitemap in robots_txt.sitemaps { for sitemap in robots_txt.sitemaps {
@ -238,37 +236,3 @@ async fn import_and_flush_batch_seeds(
txn.commit()?; txn.commit()?;
Ok(()) Ok(())
} }
async fn import_and_flush_batch_weeds(
store: &RakerStore,
buf: &mut Vec<Seed>,
stats: &mut SeedImportStats,
) -> anyhow::Result<()> {
let txn = store.rw_txn()?;
for seed in buf.drain(..) {
let as_url = Url::parse(seed.url.as_str())
.with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
let domain = get_reduced_domain(&as_url)
.with_context(|| format!("No domain in weed URL '{as_url}'!"))?;
let weed_domain_record = txn.get_weed_domain_record(domain.borrow())?;
let is_domain_new = weed_domain_record.is_none();
if is_domain_new {
stats.new_domains += 1;
}
let mut weed_domain_record =
DirtyTracker::new(weed_domain_record.unwrap_or_else(|| WeedDomainRecord::default()));
if is_domain_new {
// Mark it as dirty
let _: &mut WeedDomainRecord = weed_domain_record.borrow_mut();
}
if weed_domain_record.is_dirty() {
txn.put_weed_domain_record(domain.borrow(), weed_domain_record.into_inner())?;
}
}
txn.commit()?;
Ok(())
}

View File

@ -4,7 +4,7 @@ use crate::raking::{
get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent, get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent,
RakeOutcome, Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason, RakeOutcome, Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason,
}; };
use crate::storage::records::{AllowedDomainRecord, UrlVisitedRecord, WeedDomainRecord}; use crate::storage::records::{DomainRecord, UrlVisitedRecord};
use crate::storage::RakerStore; use crate::storage::RakerStore;
use anyhow::{anyhow, Context}; use anyhow::{anyhow, Context};
use chrono::Utc; use chrono::Utc;
@ -564,31 +564,31 @@ impl EventProcessor<'_> {
format!("failed to reduce domain: {:?}", reference.target) format!("failed to reduce domain: {:?}", reference.target)
})?; })?;
// First check if this URL is an allowed URL (hence should be enqueued) // Check if this URL is an allowed URL (hence should be enqueued)
let allowed = txn let allowed = txn
.get_allowed_domain_record(domain.borrow())? .get_domain_record(domain.borrow())?
.map(|record: AllowedDomainRecord| record.applies_to_url(&ref_url)) .map(|record: DomainRecord| record.is_url_rakeable(&ref_url))
.unwrap_or(false); .flatten();
if allowed {
let is_fresh = txn.enqueue_url(
&reference.target,
reference.last_mod,
reference.kind.into(),
)?;
if is_fresh {
increment_counter!("qprake_queue_new_url");
}
continue;
}
// Then check if this URL is a weed (hence should be ignored) match allowed {
let is_weed = txn Some(true) => {
.get_weed_domain_record(domain.borrow())? let is_fresh = txn.enqueue_url(
.map(|record: WeedDomainRecord| record.applies_to_url(&ref_url)) &reference.target,
.unwrap_or(false); reference.last_mod,
if !is_weed { reference.kind.into(),
// It's neither allowed nor weeded, so put it on hold for later inspection )?;
txn.put_url_on_hold(&reference.target, reference.kind.into())?; if is_fresh {
increment_counter!("qprake_queue_new_url");
}
continue;
}
Some(false) => {
// Weed! Do nothing.
}
None => {
// It's neither allowed nor weeded, so put it on hold for later inspection
txn.put_url_on_hold(&reference.target, reference.kind.into())?;
}
} }
} }

View File

@ -2,8 +2,8 @@ use crate::raking::{RakeIntent, TemporaryFailure};
use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString, MdbxU16BE, MdbxU32, MdbxU64}; use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString, MdbxU16BE, MdbxU32, MdbxU64};
use crate::storage::migrations::{MIGRATION_KEY, MIGRATION_VERSION}; use crate::storage::migrations::{MIGRATION_KEY, MIGRATION_VERSION};
use crate::storage::records::{ use crate::storage::records::{
ActiveDomainRecord, AllowedDomainRecord, BackingOffDomainRecord, OnHoldUrlRecord, ActiveDomainRecord, BackingOffDomainRecord, DomainRecord, OnHoldUrlRecord, QueueUrlRecord,
QueueUrlRecord, UrlVisitedRecord, WeedDomainRecord, UrlVisitedRecord,
}; };
use anyhow::{anyhow, bail, ensure, Context}; use anyhow::{anyhow, bail, ensure, Context};
use libmdbx::{ use libmdbx::{
@ -45,12 +45,10 @@ pub struct Databases<'env> {
pub backing_off_domains: Database<'env>, pub backing_off_domains: Database<'env>,
/// URL → VisitedDomainRecord /// URL → VisitedDomainRecord
pub visited_urls: Database<'env>, pub visited_urls: Database<'env>,
/// Domain → AllowedDomainRecord /// Domain → DomainRecord
pub allowed_domains: Database<'env>, pub domains: Database<'env>,
/// Domain \n URL → OnHoldUrlRecord Number of refs (INT VALUE) /// Domain \n URL → OnHoldUrlRecord Number of refs (INT VALUE)
pub urls_on_hold: Database<'env>, pub urls_on_hold: Database<'env>,
/// Domain → WeedDomainRecord
pub weed_domains: Database<'env>,
} }
impl<'env> Databases<'env> { impl<'env> Databases<'env> {
@ -66,16 +64,15 @@ impl<'env> Databases<'env> {
), ),
("backing_off_domains", &self.backing_off_domains), ("backing_off_domains", &self.backing_off_domains),
("visited_urls", &self.visited_urls), ("visited_urls", &self.visited_urls),
("allowed_domains", &self.allowed_domains), ("domains", &self.domains),
("urls_on_hold", &self.urls_on_hold), ("urls_on_hold", &self.urls_on_hold),
("weed_domains", &self.weed_domains),
] ]
.into_iter() .into_iter()
} }
} }
// Must match the order of the Databases struct fields. // Must match the order of the Databases struct fields.
pub const DATABASES: [(&'static str, DatabaseFlags); 10] = [ pub const DATABASES: [(&'static str, DatabaseFlags); 9] = [
("urls_queue", DatabaseFlags::empty()), ("urls_queue", DatabaseFlags::empty()),
("rerake_queue", DatabaseFlags::DUP_SORT), ("rerake_queue", DatabaseFlags::DUP_SORT),
("active_domains", DatabaseFlags::empty()), ("active_domains", DatabaseFlags::empty()),
@ -86,9 +83,8 @@ pub const DATABASES: [(&'static str, DatabaseFlags); 10] = [
), ),
("backing_off_domains", DatabaseFlags::empty()), ("backing_off_domains", DatabaseFlags::empty()),
("urls_visited", DatabaseFlags::empty()), ("urls_visited", DatabaseFlags::empty()),
("allowed_domains", DatabaseFlags::empty()), ("domains", DatabaseFlags::empty()),
("urls_on_hold", DatabaseFlags::empty()), ("urls_on_hold", DatabaseFlags::empty()),
("weed_domains", DatabaseFlags::empty()),
]; ];
#[self_referencing] #[self_referencing]
@ -187,9 +183,8 @@ impl RakerStore {
backing_off_reinstatements: dbs.next().unwrap(), backing_off_reinstatements: dbs.next().unwrap(),
backing_off_domains: dbs.next().unwrap(), backing_off_domains: dbs.next().unwrap(),
visited_urls: dbs.next().unwrap(), visited_urls: dbs.next().unwrap(),
allowed_domains: dbs.next().unwrap(), domains: dbs.next().unwrap(),
urls_on_hold: dbs.next().unwrap(), urls_on_hold: dbs.next().unwrap(),
weed_domains: dbs.next().unwrap(),
} }
}, },
} }
@ -603,33 +598,17 @@ impl<'a> RakerTxn<'a, RW> {
Ok(is_new) Ok(is_new)
} }
pub fn put_allowed_domain_record( pub fn put_domain_record(
&self, &self,
domain: &str, domain: &str,
allowed_domain_record: AllowedDomainRecord, domain_record: DomainRecord,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let allowed_domains = &self.mdbx.borrow_dbs().allowed_domains; let domains = &self.mdbx.borrow_dbs().domains;
self.mdbx_txn.put( self.mdbx_txn.put(
allowed_domains, domains,
domain.as_bytes(), domain.as_bytes(),
MdbxBare(allowed_domain_record).as_bytes(), MdbxBare(domain_record).as_bytes(),
WriteFlags::empty(),
)?;
Ok(())
}
pub fn put_weed_domain_record(
&self,
domain: &str,
weed_domain_record: WeedDomainRecord,
) -> anyhow::Result<()> {
let weed_domains = &self.mdbx.borrow_dbs().weed_domains;
self.mdbx_txn.put(
weed_domains,
domain.as_bytes(),
MdbxBare(weed_domain_record).as_bytes(),
WriteFlags::empty(), WriteFlags::empty(),
)?; )?;
Ok(()) Ok(())
@ -779,27 +758,12 @@ impl<'a, K: TransactionKind> RakerTxn<'a, K> {
} }
} }
pub fn get_allowed_domain_record( pub fn get_domain_record(&self, domain: &str) -> anyhow::Result<Option<DomainRecord>> {
&self, let domains = &self.mdbx.borrow_dbs().domains;
domain: &str,
) -> anyhow::Result<Option<AllowedDomainRecord>> {
let allowed_domains = &self.mdbx.borrow_dbs().allowed_domains;
match self match self
.mdbx_txn .mdbx_txn
.get::<MdbxBare<AllowedDomainRecord>>(allowed_domains, domain.as_bytes())? .get::<MdbxBare<DomainRecord>>(domains, domain.as_bytes())?
{
None => Ok(None),
Some(MdbxBare(record)) => Ok(Some(record)),
}
}
pub fn get_weed_domain_record(&self, domain: &str) -> anyhow::Result<Option<WeedDomainRecord>> {
let weed_domains = &self.mdbx.borrow_dbs().weed_domains;
match self
.mdbx_txn
.get::<MdbxBare<WeedDomainRecord>>(weed_domains, domain.as_bytes())?
{ {
None => Ok(None), None => Ok(None),
Some(MdbxBare(record)) => Ok(Some(record)), Some(MdbxBare(record)) => Ok(Some(record)),

View File

@ -1,9 +1,8 @@
use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString}; use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString};
use crate::storage::records::{AllowedDomainRecord, OnHoldUrlRecord, WeedDomainRecord}; use crate::storage::records::{DomainRecord, OnHoldUrlRecord};
use crate::storage::RakerTxn; use crate::storage::RakerTxn;
use anyhow::Context; use anyhow::Context;
use libmdbx::{Database, WriteFlags, RW}; use libmdbx::{Database, WriteFlags, RW};
use log::warn;
use reqwest::Url; use reqwest::Url;
/// Runs one big transaction that: /// Runs one big transaction that:
@ -16,8 +15,7 @@ use reqwest::Url;
pub fn reapply_seeds_and_weeds_to_on_hold_urls(txn: RakerTxn<RW>) -> anyhow::Result<()> { pub fn reapply_seeds_and_weeds_to_on_hold_urls(txn: RakerTxn<RW>) -> anyhow::Result<()> {
struct DomainState { struct DomainState {
pub domain: String, pub domain: String,
pub allowed_domain_record: Option<AllowedDomainRecord>, pub domain_record: Option<DomainRecord>,
pub weed_domain_record: Option<WeedDomainRecord>,
} }
let urls_on_hold: &Database = &txn.mdbx.borrow_dbs().urls_on_hold; let urls_on_hold: &Database = &txn.mdbx.borrow_dbs().urls_on_hold;
@ -47,44 +45,33 @@ pub fn reapply_seeds_and_weeds_to_on_hold_urls(txn: RakerTxn<RW>) -> anyhow::Res
// Then load the relevant records for it. // Then load the relevant records for it.
domain_state = Some(DomainState { domain_state = Some(DomainState {
domain: domain.to_owned(), domain: domain.to_owned(),
allowed_domain_record: txn.get_allowed_domain_record(domain)?, domain_record: txn.get_domain_record(domain)?,
weed_domain_record: txn.get_weed_domain_record(domain)?,
}); });
} }
let url = Url::parse(url_str)?; let url = Url::parse(url_str)?;
let domain_state = domain_state.as_ref().unwrap(); let domain_state = domain_state.as_ref().unwrap();
let is_allowed = domain_state
.allowed_domain_record
.as_ref()
.map(|adr: &AllowedDomainRecord| adr.applies_to_url(&url))
.unwrap_or(false);
let is_weed = domain_state
.weed_domain_record
.as_ref()
.map(|wdr: &WeedDomainRecord| wdr.applies_to_url(&url))
.unwrap_or(false);
match (is_allowed, is_weed) { let is_rakeable = domain_state
(false, false) => { /* nop */ } .domain_record
(true, true) => { .as_ref()
warn!( .map(|dr: &DomainRecord| dr.is_url_rakeable(&url))
"Ambiguous: {:?} is both mentioned by a seed and a weed. Ignoring.", .flatten();
url
); match is_rakeable {
} Some(true) => {
(true, false) => {
// ALLOWED // ALLOWED
// Make it a queued URL // Make it a queued URL
txn.enqueue_url(url_str, None, record.queue_record.intent)?; txn.enqueue_url(url_str, None, record.queue_record.intent)?;
cur.del(WriteFlags::empty())?; cur.del(WriteFlags::empty())?;
} }
(false, true) => { Some(false) => {
// WEED // WEED
// Just delete // Just delete
cur.del(WriteFlags::empty())?; cur.del(WriteFlags::empty())?;
} }
None => { /* nop: neither allowed nor a weed. Keep on hold. */ }
} }
} }

View File

@ -1,7 +1,7 @@
use crate::raking::{RakeIntent, TemporaryFailure}; use crate::raking::{RakeIntent, TemporaryFailure};
use reqwest::Url; use reqwest::Url;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::collections::BTreeSet; use std::collections::BTreeMap;
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)] #[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
pub struct ActiveDomainRecord { pub struct ActiveDomainRecord {
@ -46,26 +46,20 @@ pub struct BackingOffDomainRecord {
} }
#[derive(Clone, Debug, Serialize, Deserialize, Default)] #[derive(Clone, Debug, Serialize, Deserialize, Default)]
pub struct AllowedDomainRecord { pub struct DomainRecord {
/// Set of acceptable path prefixes. pub rakeable_path_prefixes: BTreeMap<String, bool>,
/// Empty if ALL path prefixes are permitted.
pub restricted_prefixes: BTreeSet<String>,
} }
impl AllowedDomainRecord { impl DomainRecord {
/// Returns true iff this record applies to this URL. /// Returns whether the URL is rakeable.
/// ///
/// Preconditions: it has been checked that the record applies to the domain /// Preconditions: it has been checked that the record applies to the domain
pub fn applies_to_url(&self, url: &Url) -> bool { pub fn is_url_rakeable(&self, url: &Url) -> Option<bool> {
if self.restricted_prefixes.is_empty() { let mut final_result = None;
return true; // TODO This could be made more efficient.
} for (prefix, &rakeable) in self.rakeable_path_prefixes.iter() {
let mut applies = false;
for prefix in self.restricted_prefixes.iter() {
if url.path().starts_with(prefix) { if url.path().starts_with(prefix) {
applies = true; final_result = Some(rakeable);
break;
} }
if prefix.as_str() > url.path() { if prefix.as_str() > url.path() {
// e.g. /dog > /cat/xyz // e.g. /dog > /cat/xyz
@ -74,39 +68,6 @@ impl AllowedDomainRecord {
break; break;
} }
} }
applies final_result
}
}
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
pub struct WeedDomainRecord {
/// Set of weedy path prefixes.
/// Empty if ALL path prefixes are weedy.
pub restricted_prefixes: BTreeSet<String>,
}
impl WeedDomainRecord {
/// Returns true iff this record applies to this URL.
///
/// Preconditions: it has been checked that the record applies to the domain
pub fn applies_to_url(&self, url: &Url) -> bool {
if self.restricted_prefixes.is_empty() {
return true;
}
let mut applies = false;
for prefix in self.restricted_prefixes.iter() {
if url.path().starts_with(prefix) {
applies = true;
break;
}
if prefix.as_str() > url.path() {
// e.g. /dog > /cat/xyz
// This means we've missed all chances to see our prefix,
// so we break here (efficiency).
break;
}
}
applies
} }
} }

View File

@ -1,40 +0,0 @@
use std::borrow::{Borrow, BorrowMut};
pub struct DirtyTracker<T> {
inner: T,
dirty: bool,
}
impl<T> Borrow<T> for DirtyTracker<T> {
fn borrow(&self) -> &T {
&self.inner
}
}
impl<T> BorrowMut<T> for DirtyTracker<T> {
fn borrow_mut(&mut self) -> &mut T {
self.dirty = true;
&mut self.inner
}
}
impl<T> DirtyTracker<T> {
pub fn new(inner: T) -> DirtyTracker<T> {
DirtyTracker {
inner,
dirty: false,
}
}
pub fn is_dirty(&self) -> bool {
self.dirty
}
pub fn make_clean(&mut self) {
self.dirty = false;
}
pub fn into_inner(self) -> T {
self.inner
}
}

View File

@ -1,4 +1,3 @@
pub mod dates; pub mod dates;
pub mod dirty;
pub mod lazy; pub mod lazy;
pub mod urls; pub mod urls;