From e07ac16bc4cc97b8290e0e4500357a9573943bba Mon Sep 17 00:00:00 2001
From: Olivier 'reivilibre
Date: Fri, 31 Mar 2023 22:59:23 +0100
Subject: [PATCH] Skip raking of weeded URLs

May be useful for retroactively clearing weeded URLs out of the queue.
---
 quickpeep_raker/src/raking/task.rs | 25 +++++++++++++++++++++++++
 quickpeep_raker/src/storage.rs     |  9 +++++++++
 2 files changed, 34 insertions(+)

diff --git a/quickpeep_raker/src/raking/task.rs b/quickpeep_raker/src/raking/task.rs
index 9bf1ec8..8c9dfdc 100644
--- a/quickpeep_raker/src/raking/task.rs
+++ b/quickpeep_raker/src/raking/task.rs
@@ -94,6 +94,17 @@ impl TaskContext {
         let mut current_robot_rules: Option = None;
         let mut wait_until: Option = None;
 
+        let domain_record = {
+            let txn = self.store.ro_txn()?;
+            let dr = txn.get_domain_record(&domain)?;
+            match dr {
+                None => {
+                    return Ok(());
+                }
+                Some(dr) => dr,
+            }
+        };
+
         while !self.graceful_stop.load(Ordering::Relaxed) {
             // Get a URL to process
             let url = {
@@ -138,6 +149,20 @@
             let url = Url::parse(&url_str)
                 .with_context(|| format!("failed to parse as URL: {url_str:?}"))?;
 
+            if !domain_record.is_url_rakeable(&url).unwrap_or(false) {
+                // This is now a weed: skip.
+                let domain = domain.clone();
+                let url = url.clone();
+                self.store
+                    .async_rw_txn(move |txn| {
+                        txn.dequeue_url(&domain, url.as_str())?;
+                        txn.commit()?;
+                        Ok(())
+                    })
+                    .await?;
+                continue;
+            }
+
             // Check our robot rules are valid for that URL.
             let robot_url = robots_txt_url_for(&url)
                 .with_context(|| format!("failed to get robots.txt URL for {url_str:?}"))?;
diff --git a/quickpeep_raker/src/storage.rs b/quickpeep_raker/src/storage.rs
index 71d0693..f9f6fdd 100644
--- a/quickpeep_raker/src/storage.rs
+++ b/quickpeep_raker/src/storage.rs
@@ -373,6 +373,15 @@ impl<'a> RakerTxn<'a, RW> {
         Ok(())
     }
 
+    /// Takes a URL out of the queue, without marking it as visited.
+    pub fn dequeue_url(&self, domain: &str, url_str: &str) -> anyhow::Result<()> {
+        let queue_urls = &self.mdbx.borrow_dbs().queue_urls;
+        let queue_key = format!("{}\n{}", domain, url_str);
+        self.mdbx_txn.del(&queue_urls, queue_key.as_bytes(), None)?;
+
+        Ok(())
+    }
+
     pub fn start_backing_off(
         &self,
         domain: &str,
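
Note on the rakeability check: the skip above relies on DomainRecord::is_url_rakeable
returning an Option<bool>, with `.unwrap_or(false)` treating "no answer" as "not
rakeable". The snippet below is a minimal, self-contained sketch of one plausible
reading of that check, written against a hypothetical, simplified DomainRecord keyed
by URL path prefixes (longest match wins) and using the `url` crate. It is not the
actual quickpeep type or logic, only an illustration of why a weeded URL ends up
dequeued and skipped.

    // Hypothetical sketch only: not the real quickpeep DomainRecord.
    use std::collections::BTreeMap;

    use url::Url;

    /// Simplified domain record: maps URL path prefixes to a rakeable flag
    /// (false = weed).
    struct DomainRecord {
        rakeable_path_prefixes: BTreeMap<String, bool>,
    }

    impl DomainRecord {
        /// Decide rakeability from the longest matching path prefix;
        /// None means "no opinion" (the caller's `.unwrap_or(false)` then
        /// treats the URL as not rakeable).
        fn is_url_rakeable(&self, url: &Url) -> Option<bool> {
            let path = url.path();
            self.rakeable_path_prefixes
                .iter()
                .filter(|(prefix, _)| path.starts_with(prefix.as_str()))
                .max_by_key(|(prefix, _)| prefix.len())
                .map(|(_, &rakeable)| rakeable)
        }
    }

    fn main() {
        let mut prefixes = BTreeMap::new();
        prefixes.insert("/".to_owned(), true); // rake the site by default...
        prefixes.insert("/weeds/".to_owned(), false); // ...but weed out this subtree

        let record = DomainRecord {
            rakeable_path_prefixes: prefixes,
        };

        let post = Url::parse("https://example.org/blog/post").unwrap();
        let weed = Url::parse("https://example.org/weeds/spam").unwrap();

        assert_eq!(record.is_url_rakeable(&post), Some(true));
        assert_eq!(record.is_url_rakeable(&weed), Some(false));
        println!("rakeability check behaves as expected");
    }

Under that reading, a URL whose longest matching prefix is flagged false, or which
matches no prefix at all, is removed from the queue via dequeue_url and never fetched.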