Skip raking of weeded URLs
May be useful for retroactively clearing out URLs
parent ff514e90b8 · commit e07ac16bc4
@@ -94,6 +94,17 @@ impl TaskContext {
        let mut current_robot_rules: Option<Cylon> = None;
        let mut wait_until: Option<Instant> = None;

        let domain_record = {
            let txn = self.store.ro_txn()?;
            let dr = txn.get_domain_record(&domain)?;
            match dr {
                None => {
                    return Ok(());
                }
                Some(dr) => dr,
            }
        };

        while !self.graceful_stop.load(Ordering::Relaxed) {
            // Get a URL to process
            let url = {
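The hunk above adds the domain-record lookup that the weed check further down relies on: if the store has no record for the domain, the task returns immediately. A minimal stand-alone sketch of that early-return shape (lookup_domain_record and the stub DomainRecord here are hypothetical stand-ins, not the crate's real API):

struct DomainRecord;

fn lookup_domain_record(domain: &str) -> Option<DomainRecord> {
    // Pretend store: only example.org has a record.
    (domain == "example.org").then_some(DomainRecord)
}

fn process_domain(domain: &str) -> anyhow::Result<()> {
    let _domain_record = match lookup_domain_record(domain) {
        None => return Ok(()), // unknown domain: nothing to rake
        Some(dr) => dr,
    };
    println!("would rake URLs for {domain}");
    Ok(())
}

fn main() -> anyhow::Result<()> {
    process_domain("example.org")?;
    process_domain("unknown.test")?; // returns early without raking
    Ok(())
}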
@@ -138,6 +149,20 @@ impl TaskContext {
            let url = Url::parse(&url_str)
                .with_context(|| format!("failed to parse as URL: {url_str:?}"))?;

            if !domain_record.is_url_rakeable(&url).unwrap_or(false) {
                // This is now a weed: skip.
                let domain = domain.clone();
                let url = url.clone();
                self.store
                    .async_rw_txn(move |txn| {
                        txn.dequeue_url(&domain, url.as_str())?;
                        txn.commit()?;
                        Ok(())
                    })
                    .await?;
                continue;
            }

            // Check our robot rules are valid for that URL.
            let robot_url = robots_txt_url_for(&url)
                .with_context(|| format!("failed to get robots.txt URL for {url_str:?}"))?;
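The weed check in the hunk above is the heart of this commit: any queued URL that the domain record no longer reports as rakeable is dequeued and skipped rather than fetched. A minimal stand-alone sketch of that decision (the DomainRecord stub and is_weed helper below are hypothetical, for illustration only; in the real code the dequeue happens inside the async read-write transaction shown above):

use url::Url;

struct DomainRecord {
    allowed_prefix: String, // stand-in for the record's real rakeability rules
}

impl DomainRecord {
    // Stand-in; None would mean "no opinion", which the caller treats as a weed.
    fn is_url_rakeable(&self, url: &Url) -> Option<bool> {
        Some(url.path().starts_with(&self.allowed_prefix))
    }
}

fn is_weed(record: &DomainRecord, url: &Url) -> bool {
    // Mirrors the diff: anything not positively rakeable is treated as a weed.
    !record.is_url_rakeable(url).unwrap_or(false)
}

fn main() {
    let record = DomainRecord { allowed_prefix: "/blog".into() };
    let keep = Url::parse("https://example.org/blog/post").unwrap();
    let weed = Url::parse("https://example.org/tmp/cache").unwrap();
    assert!(!is_weed(&record, &keep));
    assert!(is_weed(&record, &weed)); // would be dequeued, then `continue`
}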
@@ -373,6 +373,15 @@ impl<'a> RakerTxn<'a, RW> {
        Ok(())
    }

    /// Marks a URL as visited and takes it out of the queue.
    pub fn dequeue_url(&self, domain: &str, url_str: &str) -> anyhow::Result<()> {
        let queue_urls = &self.mdbx.borrow_dbs().queue_urls;
        let queue_key = format!("{}\n{}", domain, url_str);
        self.mdbx_txn.del(&queue_urls, queue_key.as_bytes(), None)?;

        Ok(())
    }

    pub fn start_backing_off(
        &self,
        domain: &str,
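For reference, the key that dequeue_url() deletes is the domain and the URL string joined by a newline, presumably matching the key under which the URL was originally enqueued. A minimal sketch of that layout (queue_key is a hypothetical helper name, not part of the crate):

// Hypothetical helper showing the queue key layout used by dequeue_url():
// the domain and the URL string joined by a newline.
fn queue_key(domain: &str, url_str: &str) -> Vec<u8> {
    format!("{}\n{}", domain, url_str).into_bytes()
}

fn main() {
    let key = queue_key("example.org", "https://example.org/page");
    assert_eq!(key, b"example.org\nhttps://example.org/page".to_vec());
    println!("queue key is {} bytes long", key.len());
}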