Simplify allowed_/weed_domains
This commit is contained in:
parent
1c10cb203a
commit
ff514e90b8
@ -16,8 +16,8 @@ use quickpeep_raker::config;
|
|||||||
|
|
||||||
use quickpeep_raker::storage::mdbx_helper_types::MdbxBare;
|
use quickpeep_raker::storage::mdbx_helper_types::MdbxBare;
|
||||||
use quickpeep_raker::storage::records::{
|
use quickpeep_raker::storage::records::{
|
||||||
ActiveDomainRecord, AllowedDomainRecord, BackingOffDomainRecord, OnHoldUrlRecord,
|
ActiveDomainRecord, BackingOffDomainRecord, DomainRecord, OnHoldUrlRecord, QueueUrlRecord,
|
||||||
QueueUrlRecord, UrlVisitedRecord, WeedDomainRecord,
|
UrlVisitedRecord,
|
||||||
};
|
};
|
||||||
use quickpeep_raker::storage::{RakerStore, RakerTxn};
|
use quickpeep_raker::storage::{RakerStore, RakerTxn};
|
||||||
|
|
||||||
@ -111,11 +111,11 @@ pub async fn main() -> anyhow::Result<()> {
|
|||||||
&txn,
|
&txn,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
"allowed_domains" => {
|
"domains" => {
|
||||||
inspect::<MdbxBare<AllowedDomainRecord>>(
|
inspect::<MdbxBare<DomainRecord>>(
|
||||||
opts.key_name.as_ref(),
|
opts.key_name.as_ref(),
|
||||||
opts.prefix,
|
opts.prefix,
|
||||||
&txn.mdbx.borrow_dbs().allowed_domains,
|
&txn.mdbx.borrow_dbs().domains,
|
||||||
&txn,
|
&txn,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
@ -127,14 +127,6 @@ pub async fn main() -> anyhow::Result<()> {
|
|||||||
&txn,
|
&txn,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
"weed_domains" => {
|
|
||||||
inspect::<MdbxBare<WeedDomainRecord>>(
|
|
||||||
opts.key_name.as_ref(),
|
|
||||||
opts.prefix,
|
|
||||||
&txn.mdbx.borrow_dbs().weed_domains,
|
|
||||||
&txn,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
other => {
|
other => {
|
||||||
dark_yellow_ln!("Unknown database {:?}", other);
|
dark_yellow_ln!("Unknown database {:?}", other);
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
use std::borrow::{Borrow, BorrowMut};
|
use std::borrow::Borrow;
|
||||||
|
|
||||||
use env_logger::Env;
|
use env_logger::Env;
|
||||||
|
|
||||||
@ -14,12 +14,10 @@ use tokio::sync::mpsc::Receiver;
|
|||||||
use quickpeep_raker::config::RakerConfig;
|
use quickpeep_raker::config::RakerConfig;
|
||||||
use quickpeep_raker::raking::references::SUPPORTED_SCHEMES;
|
use quickpeep_raker::raking::references::SUPPORTED_SCHEMES;
|
||||||
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
|
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
|
||||||
use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord};
|
|
||||||
use quickpeep_raker::storage::{maintenance, RakerStore};
|
use quickpeep_raker::storage::{maintenance, RakerStore};
|
||||||
use quickpeep_seed_parser::loader::{
|
use quickpeep_seed_parser::loader::{
|
||||||
find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION, WEED_EXTENSION,
|
find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION, WEED_EXTENSION,
|
||||||
};
|
};
|
||||||
use quickpeep_utils::dirty::DirtyTracker;
|
|
||||||
use quickpeep_utils::urls::get_reduced_domain;
|
use quickpeep_utils::urls::get_reduced_domain;
|
||||||
|
|
||||||
/// Seeds a raker's queue with URLs
|
/// Seeds a raker's queue with URLs
|
||||||
@ -144,27 +142,24 @@ async fn importer(
|
|||||||
buf.push(seed);
|
buf.push(seed);
|
||||||
|
|
||||||
if buf.len() == BATCH_SIZE {
|
if buf.len() == BATCH_SIZE {
|
||||||
if are_weeds {
|
import_and_flush_batch_seeds_or_weeds(
|
||||||
import_and_flush_batch_weeds(&store, &mut buf, &mut stats).await?;
|
&store, &mut buf, &mut stats, &client, !are_weeds,
|
||||||
} else {
|
)
|
||||||
import_and_flush_batch_seeds(&store, &mut buf, &mut stats, &client).await?;
|
.await?;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if are_weeds {
|
import_and_flush_batch_seeds_or_weeds(&store, &mut buf, &mut stats, &client, !are_weeds)
|
||||||
import_and_flush_batch_weeds(&store, &mut buf, &mut stats).await?;
|
.await?;
|
||||||
} else {
|
|
||||||
import_and_flush_batch_seeds(&store, &mut buf, &mut stats, &client).await?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(stats)
|
Ok(stats)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn import_and_flush_batch_seeds(
|
async fn import_and_flush_batch_seeds_or_weeds(
|
||||||
store: &RakerStore,
|
store: &RakerStore,
|
||||||
buf: &mut Vec<Seed>,
|
buf: &mut Vec<Seed>,
|
||||||
stats: &mut SeedImportStats,
|
stats: &mut SeedImportStats,
|
||||||
client: &Client,
|
client: &Client,
|
||||||
|
is_seed: bool,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let txn = store.rw_txn()?;
|
let txn = store.rw_txn()?;
|
||||||
for seed in buf.drain(..) {
|
for seed in buf.drain(..) {
|
||||||
@ -173,20 +168,13 @@ async fn import_and_flush_batch_seeds(
|
|||||||
let domain = get_reduced_domain(&as_url)
|
let domain = get_reduced_domain(&as_url)
|
||||||
.with_context(|| format!("No domain in seed URL '{as_url}'!"))?;
|
.with_context(|| format!("No domain in seed URL '{as_url}'!"))?;
|
||||||
|
|
||||||
let allowed_domain_record = txn.get_allowed_domain_record(domain.borrow())?;
|
let domain_record = txn.get_domain_record(domain.borrow())?;
|
||||||
|
let is_domain_new = domain_record.is_none();
|
||||||
let is_domain_new = allowed_domain_record.is_none();
|
let mut domain_record = domain_record.unwrap_or_default();
|
||||||
if is_domain_new {
|
if is_domain_new {
|
||||||
stats.new_domains += 1;
|
stats.new_domains += 1;
|
||||||
}
|
}
|
||||||
|
let mut dirty = is_domain_new;
|
||||||
let mut allowed_domain_record = DirtyTracker::new(
|
|
||||||
allowed_domain_record.unwrap_or_else(|| AllowedDomainRecord::default()),
|
|
||||||
);
|
|
||||||
if is_domain_new {
|
|
||||||
// Mark it as dirty
|
|
||||||
let _: &mut AllowedDomainRecord = allowed_domain_record.borrow_mut();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Register the domain. This is a no-op if it's already active or backing off.
|
// Register the domain. This is a no-op if it's already active or backing off.
|
||||||
txn.insert_active_domain_with_new_raffle_ticket(domain.clone().into_owned())?;
|
txn.insert_active_domain_with_new_raffle_ticket(domain.clone().into_owned())?;
|
||||||
@ -194,36 +182,46 @@ async fn import_and_flush_batch_seeds(
|
|||||||
let url_like = match &seed.url {
|
let url_like = match &seed.url {
|
||||||
UrlOrUrlPattern::Url(url_str) => {
|
UrlOrUrlPattern::Url(url_str) => {
|
||||||
let url = Url::parse(url_str.as_str())?;
|
let url = Url::parse(url_str.as_str())?;
|
||||||
if txn.enqueue_url(url.as_str(), None, RakeIntent::Any)? {
|
if is_seed {
|
||||||
stats.new_urls += 1;
|
if txn.enqueue_url(url.as_str(), None, RakeIntent::Any)? {
|
||||||
} else {
|
stats.new_urls += 1;
|
||||||
stats.already_present_urls += 1;
|
} else {
|
||||||
|
stats.already_present_urls += 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Seed/weed with empty prefix
|
||||||
|
dirty |= domain_record
|
||||||
|
.rakeable_path_prefixes
|
||||||
|
.insert(String::new(), is_seed)
|
||||||
|
!= Some(is_seed);
|
||||||
|
|
||||||
url
|
url
|
||||||
}
|
}
|
||||||
UrlOrUrlPattern::UrlPrefix(prefix) => {
|
UrlOrUrlPattern::UrlPrefix(prefix) => {
|
||||||
let prefix_as_url = Url::parse(prefix.as_str())?;
|
let prefix_as_url = Url::parse(prefix.as_str())?;
|
||||||
if txn.enqueue_url(prefix_as_url.as_str(), None, RakeIntent::Any)? {
|
if is_seed {
|
||||||
stats.new_urls += 1;
|
if txn.enqueue_url(prefix_as_url.as_str(), None, RakeIntent::Any)? {
|
||||||
} else {
|
stats.new_urls += 1;
|
||||||
stats.already_present_urls += 1;
|
} else {
|
||||||
}
|
stats.already_present_urls += 1;
|
||||||
if is_domain_new {
|
}
|
||||||
let allowed_domain_record: &mut AllowedDomainRecord =
|
|
||||||
allowed_domain_record.borrow_mut();
|
|
||||||
allowed_domain_record
|
|
||||||
.restricted_prefixes
|
|
||||||
.insert(prefix_as_url.path().to_string());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
dirty |= domain_record
|
||||||
|
.rakeable_path_prefixes
|
||||||
|
.insert(prefix_as_url.path().to_string(), is_seed)
|
||||||
|
!= Some(is_seed);
|
||||||
|
|
||||||
prefix_as_url
|
prefix_as_url
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
if allowed_domain_record.is_dirty() {
|
if dirty {
|
||||||
txn.put_allowed_domain_record(domain.borrow(), allowed_domain_record.into_inner())?;
|
txn.put_domain_record(domain.borrow(), domain_record)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
if is_domain_new {
|
if is_seed {
|
||||||
// look at robots.txt and discover sitemaps!
|
// look at robots.txt and discover sitemaps!
|
||||||
if let Some(robots_txt) = get_robots_txt_for(&url_like, &client).await? {
|
if let Some(robots_txt) = get_robots_txt_for(&url_like, &client).await? {
|
||||||
for sitemap in robots_txt.sitemaps {
|
for sitemap in robots_txt.sitemaps {
|
||||||
@ -238,37 +236,3 @@ async fn import_and_flush_batch_seeds(
|
|||||||
txn.commit()?;
|
txn.commit()?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn import_and_flush_batch_weeds(
|
|
||||||
store: &RakerStore,
|
|
||||||
buf: &mut Vec<Seed>,
|
|
||||||
stats: &mut SeedImportStats,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
let txn = store.rw_txn()?;
|
|
||||||
for seed in buf.drain(..) {
|
|
||||||
let as_url = Url::parse(seed.url.as_str())
|
|
||||||
.with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
|
|
||||||
let domain = get_reduced_domain(&as_url)
|
|
||||||
.with_context(|| format!("No domain in weed URL '{as_url}'!"))?;
|
|
||||||
|
|
||||||
let weed_domain_record = txn.get_weed_domain_record(domain.borrow())?;
|
|
||||||
|
|
||||||
let is_domain_new = weed_domain_record.is_none();
|
|
||||||
if is_domain_new {
|
|
||||||
stats.new_domains += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut weed_domain_record =
|
|
||||||
DirtyTracker::new(weed_domain_record.unwrap_or_else(|| WeedDomainRecord::default()));
|
|
||||||
if is_domain_new {
|
|
||||||
// Mark it as dirty
|
|
||||||
let _: &mut WeedDomainRecord = weed_domain_record.borrow_mut();
|
|
||||||
}
|
|
||||||
|
|
||||||
if weed_domain_record.is_dirty() {
|
|
||||||
txn.put_weed_domain_record(domain.borrow(), weed_domain_record.into_inner())?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
txn.commit()?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
@ -4,7 +4,7 @@ use crate::raking::{
|
|||||||
get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent,
|
get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent,
|
||||||
RakeOutcome, Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason,
|
RakeOutcome, Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason,
|
||||||
};
|
};
|
||||||
use crate::storage::records::{AllowedDomainRecord, UrlVisitedRecord, WeedDomainRecord};
|
use crate::storage::records::{DomainRecord, UrlVisitedRecord};
|
||||||
use crate::storage::RakerStore;
|
use crate::storage::RakerStore;
|
||||||
use anyhow::{anyhow, Context};
|
use anyhow::{anyhow, Context};
|
||||||
use chrono::Utc;
|
use chrono::Utc;
|
||||||
@ -564,31 +564,31 @@ impl EventProcessor<'_> {
|
|||||||
format!("failed to reduce domain: {:?}", reference.target)
|
format!("failed to reduce domain: {:?}", reference.target)
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
// First check if this URL is an allowed URL (hence should be enqueued)
|
// Check if this URL is an allowed URL (hence should be enqueued)
|
||||||
let allowed = txn
|
let allowed = txn
|
||||||
.get_allowed_domain_record(domain.borrow())?
|
.get_domain_record(domain.borrow())?
|
||||||
.map(|record: AllowedDomainRecord| record.applies_to_url(&ref_url))
|
.map(|record: DomainRecord| record.is_url_rakeable(&ref_url))
|
||||||
.unwrap_or(false);
|
.flatten();
|
||||||
if allowed {
|
|
||||||
let is_fresh = txn.enqueue_url(
|
|
||||||
&reference.target,
|
|
||||||
reference.last_mod,
|
|
||||||
reference.kind.into(),
|
|
||||||
)?;
|
|
||||||
if is_fresh {
|
|
||||||
increment_counter!("qprake_queue_new_url");
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Then check if this URL is a weed (hence should be ignored)
|
match allowed {
|
||||||
let is_weed = txn
|
Some(true) => {
|
||||||
.get_weed_domain_record(domain.borrow())?
|
let is_fresh = txn.enqueue_url(
|
||||||
.map(|record: WeedDomainRecord| record.applies_to_url(&ref_url))
|
&reference.target,
|
||||||
.unwrap_or(false);
|
reference.last_mod,
|
||||||
if !is_weed {
|
reference.kind.into(),
|
||||||
// It's neither allowed nor weeded, so put it on hold for later inspection
|
)?;
|
||||||
txn.put_url_on_hold(&reference.target, reference.kind.into())?;
|
if is_fresh {
|
||||||
|
increment_counter!("qprake_queue_new_url");
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Some(false) => {
|
||||||
|
// Weed! Do nothing.
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
// It's neither allowed nor weeded, so put it on hold for later inspection
|
||||||
|
txn.put_url_on_hold(&reference.target, reference.kind.into())?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2,8 +2,8 @@ use crate::raking::{RakeIntent, TemporaryFailure};
|
|||||||
use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString, MdbxU16BE, MdbxU32, MdbxU64};
|
use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString, MdbxU16BE, MdbxU32, MdbxU64};
|
||||||
use crate::storage::migrations::{MIGRATION_KEY, MIGRATION_VERSION};
|
use crate::storage::migrations::{MIGRATION_KEY, MIGRATION_VERSION};
|
||||||
use crate::storage::records::{
|
use crate::storage::records::{
|
||||||
ActiveDomainRecord, AllowedDomainRecord, BackingOffDomainRecord, OnHoldUrlRecord,
|
ActiveDomainRecord, BackingOffDomainRecord, DomainRecord, OnHoldUrlRecord, QueueUrlRecord,
|
||||||
QueueUrlRecord, UrlVisitedRecord, WeedDomainRecord,
|
UrlVisitedRecord,
|
||||||
};
|
};
|
||||||
use anyhow::{anyhow, bail, ensure, Context};
|
use anyhow::{anyhow, bail, ensure, Context};
|
||||||
use libmdbx::{
|
use libmdbx::{
|
||||||
@ -45,12 +45,10 @@ pub struct Databases<'env> {
|
|||||||
pub backing_off_domains: Database<'env>,
|
pub backing_off_domains: Database<'env>,
|
||||||
/// URL → VisitedDomainRecord
|
/// URL → VisitedDomainRecord
|
||||||
pub visited_urls: Database<'env>,
|
pub visited_urls: Database<'env>,
|
||||||
/// Domain → AllowedDomainRecord
|
/// Domain → DomainRecord
|
||||||
pub allowed_domains: Database<'env>,
|
pub domains: Database<'env>,
|
||||||
/// Domain \n URL → OnHoldUrlRecord Number of refs (INT VALUE)
|
/// Domain \n URL → OnHoldUrlRecord Number of refs (INT VALUE)
|
||||||
pub urls_on_hold: Database<'env>,
|
pub urls_on_hold: Database<'env>,
|
||||||
/// Domain → WeedDomainRecord
|
|
||||||
pub weed_domains: Database<'env>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'env> Databases<'env> {
|
impl<'env> Databases<'env> {
|
||||||
@ -66,16 +64,15 @@ impl<'env> Databases<'env> {
|
|||||||
),
|
),
|
||||||
("backing_off_domains", &self.backing_off_domains),
|
("backing_off_domains", &self.backing_off_domains),
|
||||||
("visited_urls", &self.visited_urls),
|
("visited_urls", &self.visited_urls),
|
||||||
("allowed_domains", &self.allowed_domains),
|
("domains", &self.domains),
|
||||||
("urls_on_hold", &self.urls_on_hold),
|
("urls_on_hold", &self.urls_on_hold),
|
||||||
("weed_domains", &self.weed_domains),
|
|
||||||
]
|
]
|
||||||
.into_iter()
|
.into_iter()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Must match the order of the Databases struct fields.
|
// Must match the order of the Databases struct fields.
|
||||||
pub const DATABASES: [(&'static str, DatabaseFlags); 10] = [
|
pub const DATABASES: [(&'static str, DatabaseFlags); 9] = [
|
||||||
("urls_queue", DatabaseFlags::empty()),
|
("urls_queue", DatabaseFlags::empty()),
|
||||||
("rerake_queue", DatabaseFlags::DUP_SORT),
|
("rerake_queue", DatabaseFlags::DUP_SORT),
|
||||||
("active_domains", DatabaseFlags::empty()),
|
("active_domains", DatabaseFlags::empty()),
|
||||||
@ -86,9 +83,8 @@ pub const DATABASES: [(&'static str, DatabaseFlags); 10] = [
|
|||||||
),
|
),
|
||||||
("backing_off_domains", DatabaseFlags::empty()),
|
("backing_off_domains", DatabaseFlags::empty()),
|
||||||
("urls_visited", DatabaseFlags::empty()),
|
("urls_visited", DatabaseFlags::empty()),
|
||||||
("allowed_domains", DatabaseFlags::empty()),
|
("domains", DatabaseFlags::empty()),
|
||||||
("urls_on_hold", DatabaseFlags::empty()),
|
("urls_on_hold", DatabaseFlags::empty()),
|
||||||
("weed_domains", DatabaseFlags::empty()),
|
|
||||||
];
|
];
|
||||||
|
|
||||||
#[self_referencing]
|
#[self_referencing]
|
||||||
@ -187,9 +183,8 @@ impl RakerStore {
|
|||||||
backing_off_reinstatements: dbs.next().unwrap(),
|
backing_off_reinstatements: dbs.next().unwrap(),
|
||||||
backing_off_domains: dbs.next().unwrap(),
|
backing_off_domains: dbs.next().unwrap(),
|
||||||
visited_urls: dbs.next().unwrap(),
|
visited_urls: dbs.next().unwrap(),
|
||||||
allowed_domains: dbs.next().unwrap(),
|
domains: dbs.next().unwrap(),
|
||||||
urls_on_hold: dbs.next().unwrap(),
|
urls_on_hold: dbs.next().unwrap(),
|
||||||
weed_domains: dbs.next().unwrap(),
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
@ -603,33 +598,17 @@ impl<'a> RakerTxn<'a, RW> {
|
|||||||
Ok(is_new)
|
Ok(is_new)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn put_allowed_domain_record(
|
pub fn put_domain_record(
|
||||||
&self,
|
&self,
|
||||||
domain: &str,
|
domain: &str,
|
||||||
allowed_domain_record: AllowedDomainRecord,
|
domain_record: DomainRecord,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let allowed_domains = &self.mdbx.borrow_dbs().allowed_domains;
|
let domains = &self.mdbx.borrow_dbs().domains;
|
||||||
|
|
||||||
self.mdbx_txn.put(
|
self.mdbx_txn.put(
|
||||||
allowed_domains,
|
domains,
|
||||||
domain.as_bytes(),
|
domain.as_bytes(),
|
||||||
MdbxBare(allowed_domain_record).as_bytes(),
|
MdbxBare(domain_record).as_bytes(),
|
||||||
WriteFlags::empty(),
|
|
||||||
)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn put_weed_domain_record(
|
|
||||||
&self,
|
|
||||||
domain: &str,
|
|
||||||
weed_domain_record: WeedDomainRecord,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
let weed_domains = &self.mdbx.borrow_dbs().weed_domains;
|
|
||||||
|
|
||||||
self.mdbx_txn.put(
|
|
||||||
weed_domains,
|
|
||||||
domain.as_bytes(),
|
|
||||||
MdbxBare(weed_domain_record).as_bytes(),
|
|
||||||
WriteFlags::empty(),
|
WriteFlags::empty(),
|
||||||
)?;
|
)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -779,27 +758,12 @@ impl<'a, K: TransactionKind> RakerTxn<'a, K> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_allowed_domain_record(
|
pub fn get_domain_record(&self, domain: &str) -> anyhow::Result<Option<DomainRecord>> {
|
||||||
&self,
|
let domains = &self.mdbx.borrow_dbs().domains;
|
||||||
domain: &str,
|
|
||||||
) -> anyhow::Result<Option<AllowedDomainRecord>> {
|
|
||||||
let allowed_domains = &self.mdbx.borrow_dbs().allowed_domains;
|
|
||||||
|
|
||||||
match self
|
match self
|
||||||
.mdbx_txn
|
.mdbx_txn
|
||||||
.get::<MdbxBare<AllowedDomainRecord>>(allowed_domains, domain.as_bytes())?
|
.get::<MdbxBare<DomainRecord>>(domains, domain.as_bytes())?
|
||||||
{
|
|
||||||
None => Ok(None),
|
|
||||||
Some(MdbxBare(record)) => Ok(Some(record)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get_weed_domain_record(&self, domain: &str) -> anyhow::Result<Option<WeedDomainRecord>> {
|
|
||||||
let weed_domains = &self.mdbx.borrow_dbs().weed_domains;
|
|
||||||
|
|
||||||
match self
|
|
||||||
.mdbx_txn
|
|
||||||
.get::<MdbxBare<WeedDomainRecord>>(weed_domains, domain.as_bytes())?
|
|
||||||
{
|
{
|
||||||
None => Ok(None),
|
None => Ok(None),
|
||||||
Some(MdbxBare(record)) => Ok(Some(record)),
|
Some(MdbxBare(record)) => Ok(Some(record)),
|
||||||
|
@ -1,9 +1,8 @@
|
|||||||
use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString};
|
use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString};
|
||||||
use crate::storage::records::{AllowedDomainRecord, OnHoldUrlRecord, WeedDomainRecord};
|
use crate::storage::records::{DomainRecord, OnHoldUrlRecord};
|
||||||
use crate::storage::RakerTxn;
|
use crate::storage::RakerTxn;
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use libmdbx::{Database, WriteFlags, RW};
|
use libmdbx::{Database, WriteFlags, RW};
|
||||||
use log::warn;
|
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
|
|
||||||
/// Runs one big transaction that:
|
/// Runs one big transaction that:
|
||||||
@ -16,8 +15,7 @@ use reqwest::Url;
|
|||||||
pub fn reapply_seeds_and_weeds_to_on_hold_urls(txn: RakerTxn<RW>) -> anyhow::Result<()> {
|
pub fn reapply_seeds_and_weeds_to_on_hold_urls(txn: RakerTxn<RW>) -> anyhow::Result<()> {
|
||||||
struct DomainState {
|
struct DomainState {
|
||||||
pub domain: String,
|
pub domain: String,
|
||||||
pub allowed_domain_record: Option<AllowedDomainRecord>,
|
pub domain_record: Option<DomainRecord>,
|
||||||
pub weed_domain_record: Option<WeedDomainRecord>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let urls_on_hold: &Database = &txn.mdbx.borrow_dbs().urls_on_hold;
|
let urls_on_hold: &Database = &txn.mdbx.borrow_dbs().urls_on_hold;
|
||||||
@ -47,44 +45,33 @@ pub fn reapply_seeds_and_weeds_to_on_hold_urls(txn: RakerTxn<RW>) -> anyhow::Res
|
|||||||
// Then load the relevant records for it.
|
// Then load the relevant records for it.
|
||||||
domain_state = Some(DomainState {
|
domain_state = Some(DomainState {
|
||||||
domain: domain.to_owned(),
|
domain: domain.to_owned(),
|
||||||
allowed_domain_record: txn.get_allowed_domain_record(domain)?,
|
domain_record: txn.get_domain_record(domain)?,
|
||||||
weed_domain_record: txn.get_weed_domain_record(domain)?,
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
let url = Url::parse(url_str)?;
|
let url = Url::parse(url_str)?;
|
||||||
|
|
||||||
let domain_state = domain_state.as_ref().unwrap();
|
let domain_state = domain_state.as_ref().unwrap();
|
||||||
let is_allowed = domain_state
|
|
||||||
.allowed_domain_record
|
|
||||||
.as_ref()
|
|
||||||
.map(|adr: &AllowedDomainRecord| adr.applies_to_url(&url))
|
|
||||||
.unwrap_or(false);
|
|
||||||
let is_weed = domain_state
|
|
||||||
.weed_domain_record
|
|
||||||
.as_ref()
|
|
||||||
.map(|wdr: &WeedDomainRecord| wdr.applies_to_url(&url))
|
|
||||||
.unwrap_or(false);
|
|
||||||
|
|
||||||
match (is_allowed, is_weed) {
|
let is_rakeable = domain_state
|
||||||
(false, false) => { /* nop */ }
|
.domain_record
|
||||||
(true, true) => {
|
.as_ref()
|
||||||
warn!(
|
.map(|dr: &DomainRecord| dr.is_url_rakeable(&url))
|
||||||
"Ambiguous: {:?} is both mentioned by a seed and a weed. Ignoring.",
|
.flatten();
|
||||||
url
|
|
||||||
);
|
match is_rakeable {
|
||||||
}
|
Some(true) => {
|
||||||
(true, false) => {
|
|
||||||
// ALLOWED
|
// ALLOWED
|
||||||
// Make it a queued URL
|
// Make it a queued URL
|
||||||
txn.enqueue_url(url_str, None, record.queue_record.intent)?;
|
txn.enqueue_url(url_str, None, record.queue_record.intent)?;
|
||||||
cur.del(WriteFlags::empty())?;
|
cur.del(WriteFlags::empty())?;
|
||||||
}
|
}
|
||||||
(false, true) => {
|
Some(false) => {
|
||||||
// WEED
|
// WEED
|
||||||
// Just delete
|
// Just delete
|
||||||
cur.del(WriteFlags::empty())?;
|
cur.del(WriteFlags::empty())?;
|
||||||
}
|
}
|
||||||
|
None => { /* nop: neither allowed nor a weed. Keep on hold. */ }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use crate::raking::{RakeIntent, TemporaryFailure};
|
use crate::raking::{RakeIntent, TemporaryFailure};
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeMap;
|
||||||
|
|
||||||
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
|
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
|
||||||
pub struct ActiveDomainRecord {
|
pub struct ActiveDomainRecord {
|
||||||
@ -46,26 +46,20 @@ pub struct BackingOffDomainRecord {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
|
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
|
||||||
pub struct AllowedDomainRecord {
|
pub struct DomainRecord {
|
||||||
/// Set of acceptable path prefixes.
|
pub rakeable_path_prefixes: BTreeMap<String, bool>,
|
||||||
/// Empty if ALL path prefixes are permitted.
|
|
||||||
pub restricted_prefixes: BTreeSet<String>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AllowedDomainRecord {
|
impl DomainRecord {
|
||||||
/// Returns true iff this record applies to this URL.
|
/// Returns whether the URL is rakeable.
|
||||||
///
|
///
|
||||||
/// Preconditions: it has been checked that the record applies to the domain
|
/// Preconditions: it has been checked that the record applies to the domain
|
||||||
pub fn applies_to_url(&self, url: &Url) -> bool {
|
pub fn is_url_rakeable(&self, url: &Url) -> Option<bool> {
|
||||||
if self.restricted_prefixes.is_empty() {
|
let mut final_result = None;
|
||||||
return true;
|
// TODO This could be made more efficient.
|
||||||
}
|
for (prefix, &rakeable) in self.rakeable_path_prefixes.iter() {
|
||||||
|
|
||||||
let mut applies = false;
|
|
||||||
for prefix in self.restricted_prefixes.iter() {
|
|
||||||
if url.path().starts_with(prefix) {
|
if url.path().starts_with(prefix) {
|
||||||
applies = true;
|
final_result = Some(rakeable);
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
if prefix.as_str() > url.path() {
|
if prefix.as_str() > url.path() {
|
||||||
// e.g. /dog > /cat/xyz
|
// e.g. /dog > /cat/xyz
|
||||||
@ -74,39 +68,6 @@ impl AllowedDomainRecord {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
applies
|
final_result
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
|
|
||||||
pub struct WeedDomainRecord {
|
|
||||||
/// Set of weedy path prefixes.
|
|
||||||
/// Empty if ALL path prefixes are weedy.
|
|
||||||
pub restricted_prefixes: BTreeSet<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl WeedDomainRecord {
|
|
||||||
/// Returns true iff this record applies to this URL.
|
|
||||||
///
|
|
||||||
/// Preconditions: it has been checked that the record applies to the domain
|
|
||||||
pub fn applies_to_url(&self, url: &Url) -> bool {
|
|
||||||
if self.restricted_prefixes.is_empty() {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut applies = false;
|
|
||||||
for prefix in self.restricted_prefixes.iter() {
|
|
||||||
if url.path().starts_with(prefix) {
|
|
||||||
applies = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if prefix.as_str() > url.path() {
|
|
||||||
// e.g. /dog > /cat/xyz
|
|
||||||
// This means we've missed all chances to see our prefix,
|
|
||||||
// so we break here (efficiency).
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
applies
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,40 +0,0 @@
|
|||||||
use std::borrow::{Borrow, BorrowMut};
|
|
||||||
|
|
||||||
pub struct DirtyTracker<T> {
|
|
||||||
inner: T,
|
|
||||||
dirty: bool,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T> Borrow<T> for DirtyTracker<T> {
|
|
||||||
fn borrow(&self) -> &T {
|
|
||||||
&self.inner
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T> BorrowMut<T> for DirtyTracker<T> {
|
|
||||||
fn borrow_mut(&mut self) -> &mut T {
|
|
||||||
self.dirty = true;
|
|
||||||
&mut self.inner
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T> DirtyTracker<T> {
|
|
||||||
pub fn new(inner: T) -> DirtyTracker<T> {
|
|
||||||
DirtyTracker {
|
|
||||||
inner,
|
|
||||||
dirty: false,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn is_dirty(&self) -> bool {
|
|
||||||
self.dirty
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn make_clean(&mut self) {
|
|
||||||
self.dirty = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn into_inner(self) -> T {
|
|
||||||
self.inner
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,4 +1,3 @@
|
|||||||
pub mod dates;
|
pub mod dates;
|
||||||
pub mod dirty;
|
|
||||||
pub mod lazy;
|
pub mod lazy;
|
||||||
pub mod urls;
|
pub mod urls;
|
||||||
|
Loading…
Reference in New Issue
Block a user