Don't enqueue references if they're weeds

This commit is contained in:
Olivier 'reivilibre' 2022-03-22 19:56:10 +00:00
parent 641c575660
commit 2f5131e690
2 changed files with 70 additions and 27 deletions

View File

@ -4,7 +4,7 @@ use crate::raking::{
get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeOutcome,
Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason,
};
use crate::storage::records::{AllowedDomainRecord, UrlVisitedRecord};
use crate::storage::records::{AllowedDomainRecord, UrlVisitedRecord, WeedDomainRecord};
use crate::storage::{RakerStore, RandomActiveDomainAcquisition};
use anyhow::{anyhow, ensure, Context};
use chrono::Utc;
@ -469,38 +469,28 @@ impl EventProcessor<'_> {
for reference in refs.references {
let ref_url = Url::parse(&reference.target)?;
let domain = get_reduced_domain(&ref_url)?;
let allowed = match txn.get_allowed_domain_record(domain.borrow())? {
None => false,
Some(AllowedDomainRecord {
restricted_prefixes,
}) => {
if restricted_prefixes.is_empty() {
true
} else {
let mut allowed = false;
for prefix in restricted_prefixes.iter() {
if ref_url.path().starts_with(prefix) {
allowed = true;
break;
}
if prefix.as_str() > ref_url.path() {
// e.g. /dog > /cat/xyz
// This means we've missed all chances to see our prefix,
// so we break here (efficiency).
break;
}
}
allowed
}
}
};
// First check if this URL is an allowed URL (hence should be enqueued)
let allowed = txn
.get_allowed_domain_record(domain.borrow())?
.map(|record: AllowedDomainRecord| record.applies_to_url(&ref_url))
.unwrap_or(false);
if allowed {
txn.enqueue_url(
&reference.target,
reference.last_mod,
reference.kind.into(),
)?;
} else {
continue;
}
// Then check if this URL is a weed (hence should be ignored)
let is_weed = txn
.get_weed_domain_record(domain.borrow())?
.map(|record: WeedDomainRecord| record.applies_to_url(&ref_url))
.unwrap_or(false);
if !is_weed {
// It's neither allowed nor weeded, so put it on hold for later inspection
txn.put_url_on_hold(&reference.target, reference.kind.into())?;
}
}

View File

@ -1,4 +1,5 @@
use crate::raking::{RakeIntent, TemporaryFailure};
use reqwest::Url;
use serde::{Deserialize, Serialize};
use std::collections::BTreeSet;
@ -52,9 +53,61 @@ pub struct AllowedDomainRecord {
pub restricted_prefixes: BTreeSet<String>,
}
impl AllowedDomainRecord {
    /// Returns true iff this record applies to this URL.
    ///
    /// An empty `restricted_prefixes` set means the record applies to every path on
    /// the domain; otherwise the URL's path must start with at least one of the
    /// listed prefixes.
    ///
    /// Preconditions: it has been checked that the record applies to the domain
    pub fn applies_to_url(&self, url: &Url) -> bool {
        path_matches_prefixes(&self.restricted_prefixes, url.path())
    }
}

/// Returns true iff `prefixes` is empty or `path` starts with one of its entries.
///
/// Shared by [`AllowedDomainRecord::applies_to_url`] and
/// [`WeedDomainRecord::applies_to_url`], which use the same matching rule.
fn path_matches_prefixes(prefixes: &BTreeSet<String>, path: &str) -> bool {
    if prefixes.is_empty() {
        // No restriction recorded: every path matches.
        return true;
    }
    prefixes
        .iter()
        // A `BTreeSet` iterates in ascending order and any prefix of `path` compares
        // `<= path`, so once an entry is greater than `path` (e.g. /dog > /cat/xyz)
        // no later entry can match and we stop scanning (efficiency).
        .take_while(|prefix| prefix.as_str() <= path)
        .any(|prefix| path.starts_with(prefix.as_str()))
}

#[derive(Clone, Debug, Serialize, Deserialize, Default)]
pub struct WeedDomainRecord {
    /// Set of weedy path prefixes.
    /// Empty if ALL path prefixes are weedy.
    pub restricted_prefixes: BTreeSet<String>,
}

impl WeedDomainRecord {
    /// Returns true iff this record applies to this URL.
    ///
    /// An empty `restricted_prefixes` set means every path on the domain is weedy;
    /// otherwise the URL's path must start with at least one of the listed prefixes.
    ///
    /// Preconditions: it has been checked that the record applies to the domain
    pub fn applies_to_url(&self, url: &Url) -> bool {
        path_matches_prefixes(&self.restricted_prefixes, url.path())
    }
}