Don't put references on hold if they're weeds

2022-03-22 19:56:10 +00:00 · 2022-03-22 19:56:10 +00:00 · 2f5131e690
parent 641c575660
commit 2f5131e690
2 changed files with 70 additions and 27 deletions
--- a/quickpeep_raker/src/raking/task.rs
+++ b/quickpeep_raker/src/raking/task.rs
@ -4,7 +4,7 @@ use crate::raking::{
    get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeOutcome,
    Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason,
 };
-use crate::storage::records::{AllowedDomainRecord, UrlVisitedRecord};
+use crate::storage::records::{AllowedDomainRecord, UrlVisitedRecord, WeedDomainRecord};
 use crate::storage::{RakerStore, RandomActiveDomainAcquisition};
 use anyhow::{anyhow, ensure, Context};
 use chrono::Utc;
@ -469,38 +469,28 @@ impl EventProcessor<'_> {
                for reference in refs.references {
                    let ref_url = Url::parse(&reference.target)?;
                    let domain = get_reduced_domain(&ref_url)?;
-                    let allowed = match txn.get_allowed_domain_record(domain.borrow())? {
-                        None => false,
-                        Some(AllowedDomainRecord {
-                            restricted_prefixes,
-                        }) => {
-                            if restricted_prefixes.is_empty() {
-                                true
-                            } else {
-                                let mut allowed = false;
-                                for prefix in restricted_prefixes.iter() {
-                                    if ref_url.path().starts_with(prefix) {
-                                        allowed = true;
-                                        break;
-                                    }
-                                    if prefix.as_str() > ref_url.path() {
-                                        // e.g. /dog > /cat/xyz
-                                        // This means we've missed all chances to see our prefix,
-                                        // so we break here (efficiency).
-                                        break;
-                                    }
-                                }
-                                allowed
-                            }
-                        }
-                    };
+
+                    // First check if this URL is an allowed URL (hence should be enqueued)
+                    let allowed = txn
+                        .get_allowed_domain_record(domain.borrow())?
+                        .map(|record: AllowedDomainRecord| record.applies_to_url(&ref_url))
+                        .unwrap_or(false);
                    if allowed {
                        txn.enqueue_url(
                            &reference.target,
                            reference.last_mod,
                            reference.kind.into(),
                        )?;
-                    } else {
+                        continue;
+                    }
+
+                    // Then check if this URL is a weed (hence should be ignored)
+                    let is_weed = txn
+                        .get_weed_domain_record(domain.borrow())?
+                        .map(|record: WeedDomainRecord| record.applies_to_url(&ref_url))
+                        .unwrap_or(false);
+                    if !is_weed {
+                        // It's neither allowed nor weeded, so put it on hold for later inspection
                        txn.put_url_on_hold(&reference.target, reference.kind.into())?;
                    }
                }
--- a/quickpeep_raker/src/storage/records.rs
+++ b/quickpeep_raker/src/storage/records.rs
@ -1,4 +1,5 @@
 use crate::raking::{RakeIntent, TemporaryFailure};
+use reqwest::Url;
 use serde::{Deserialize, Serialize};
 use std::collections::BTreeSet;

@ -52,9 +53,61 @@ pub struct AllowedDomainRecord {
    pub restricted_prefixes: BTreeSet<String>,
 }

+impl AllowedDomainRecord {
+    /// Returns true iff no prefixes are restricted or `url`'s path starts with an allowed prefix.
+    ///
+    /// Precondition: the caller has already checked that this record applies to the URL's domain.
+    pub fn applies_to_url(&self, url: &Url) -> bool {
+        if self.restricted_prefixes.is_empty() {
+            return true;
+        }
+
+        let mut applies = false;
+        for prefix in self.restricted_prefixes.iter() {
+            if url.path().starts_with(prefix) {
+                applies = true;
+                break;
+            }
+            if prefix.as_str() > url.path() {
+                // e.g. prefix /dog > path /cat/xyz
+                // The BTreeSet yields prefixes in ascending order, so every remaining prefix
+                // also sorts after the path and cannot match; stop early (efficiency).
+                break;
+            }
+        }
+        applies
+    }
+}
+
 #[derive(Clone, Debug, Serialize, Deserialize, Default)]
 pub struct WeedDomainRecord {
    /// Set of weedy path prefixes.
    /// Empty if ALL path prefixes are weedy.
    pub restricted_prefixes: BTreeSet<String>,
 }
+
+impl WeedDomainRecord {
+    /// Returns true iff ALL paths are weedy (no prefixes) or `url`'s path starts with a weedy prefix.
+    ///
+    /// Precondition: the caller has already checked that this record applies to the URL's domain.
+    pub fn applies_to_url(&self, url: &Url) -> bool {
+        if self.restricted_prefixes.is_empty() {
+            return true;
+        }
+
+        let mut applies = false;
+        for prefix in self.restricted_prefixes.iter() {
+            if url.path().starts_with(prefix) {
+                applies = true;
+                break;
+            }
+            if prefix.as_str() > url.path() {
+                // e.g. prefix /dog > path /cat/xyz
+                // The BTreeSet yields prefixes in ascending order, so every remaining prefix
+                // also sorts after the path and cannot match; stop early (efficiency).
+                break;
+            }
+        }
+        applies
+    }
+}