Don't enqueue references if they're weeds
This commit is contained in:
parent
641c575660
commit
2f5131e690
|
@ -4,7 +4,7 @@ use crate::raking::{
|
|||
get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeOutcome,
|
||||
Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason,
|
||||
};
|
||||
use crate::storage::records::{AllowedDomainRecord, UrlVisitedRecord};
|
||||
use crate::storage::records::{AllowedDomainRecord, UrlVisitedRecord, WeedDomainRecord};
|
||||
use crate::storage::{RakerStore, RandomActiveDomainAcquisition};
|
||||
use anyhow::{anyhow, ensure, Context};
|
||||
use chrono::Utc;
|
||||
|
@ -469,38 +469,28 @@ impl EventProcessor<'_> {
|
|||
for reference in refs.references {
|
||||
let ref_url = Url::parse(&reference.target)?;
|
||||
let domain = get_reduced_domain(&ref_url)?;
|
||||
let allowed = match txn.get_allowed_domain_record(domain.borrow())? {
|
||||
None => false,
|
||||
Some(AllowedDomainRecord {
|
||||
restricted_prefixes,
|
||||
}) => {
|
||||
if restricted_prefixes.is_empty() {
|
||||
true
|
||||
} else {
|
||||
let mut allowed = false;
|
||||
for prefix in restricted_prefixes.iter() {
|
||||
if ref_url.path().starts_with(prefix) {
|
||||
allowed = true;
|
||||
break;
|
||||
}
|
||||
if prefix.as_str() > ref_url.path() {
|
||||
// e.g. /dog > /cat/xyz
|
||||
// This means we've missed all chances to see our prefix,
|
||||
// so we break here (efficiency).
|
||||
break;
|
||||
}
|
||||
}
|
||||
allowed
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// First check if this URL is an allowed URL (hence should be enqueued)
|
||||
let allowed = txn
|
||||
.get_allowed_domain_record(domain.borrow())?
|
||||
.map(|record: AllowedDomainRecord| record.applies_to_url(&ref_url))
|
||||
.unwrap_or(false);
|
||||
if allowed {
|
||||
txn.enqueue_url(
|
||||
&reference.target,
|
||||
reference.last_mod,
|
||||
reference.kind.into(),
|
||||
)?;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Then check if this URL is a weed (hence should be ignored)
|
||||
let is_weed = txn
|
||||
.get_weed_domain_record(domain.borrow())?
|
||||
.map(|record: WeedDomainRecord| record.applies_to_url(&ref_url))
|
||||
.unwrap_or(false);
|
||||
if !is_weed {
|
||||
// It's neither allowed nor weeded, so put it on hold for later inspection
|
||||
txn.put_url_on_hold(&reference.target, reference.kind.into())?;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
use crate::raking::{RakeIntent, TemporaryFailure};
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
|
@ -52,9 +53,61 @@ pub struct AllowedDomainRecord {
|
|||
pub restricted_prefixes: BTreeSet<String>,
|
||||
}
|
||||
|
||||
impl AllowedDomainRecord {
|
||||
/// Returns true iff this record applies to this URL.
|
||||
///
|
||||
/// Preconditions: it has been checked that the record applies to the domain
|
||||
pub fn applies_to_url(&self, url: &Url) -> bool {
|
||||
if self.restricted_prefixes.is_empty() {
|
||||
return true;
|
||||
}
|
||||
|
||||
let mut applies = false;
|
||||
for prefix in self.restricted_prefixes.iter() {
|
||||
if url.path().starts_with(prefix) {
|
||||
applies = true;
|
||||
break;
|
||||
}
|
||||
if prefix.as_str() > url.path() {
|
||||
// e.g. /dog > /cat/xyz
|
||||
// This means we've missed all chances to see our prefix,
|
||||
// so we break here (efficiency).
|
||||
break;
|
||||
}
|
||||
}
|
||||
applies
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
|
||||
pub struct WeedDomainRecord {
|
||||
/// Set of weedy path prefixes.
|
||||
/// Empty if ALL path prefixes are weedy.
|
||||
pub restricted_prefixes: BTreeSet<String>,
|
||||
}
|
||||
|
||||
impl WeedDomainRecord {
|
||||
/// Returns true iff this record applies to this URL.
|
||||
///
|
||||
/// Preconditions: it has been checked that the record applies to the domain
|
||||
pub fn applies_to_url(&self, url: &Url) -> bool {
|
||||
if self.restricted_prefixes.is_empty() {
|
||||
return true;
|
||||
}
|
||||
|
||||
let mut applies = false;
|
||||
for prefix in self.restricted_prefixes.iter() {
|
||||
if url.path().starts_with(prefix) {
|
||||
applies = true;
|
||||
break;
|
||||
}
|
||||
if prefix.as_str() > url.path() {
|
||||
// e.g. /dog > /cat/xyz
|
||||
// This means we've missed all chances to see our prefix,
|
||||
// so we break here (efficiency).
|
||||
break;
|
||||
}
|
||||
}
|
||||
applies
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue