Skip raking of weeded URLs
ci/woodpecker/push/check Pipeline failed Details
ci/woodpecker/push/manual Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details

May be useful for retroactively clearing out URLs
This commit is contained in:
Olivier 'reivilibre' 2023-03-31 22:59:23 +01:00
parent ff514e90b8
commit e07ac16bc4
2 changed files with 34 additions and 0 deletions

View File

@@ -94,6 +94,17 @@ impl TaskContext {
let mut current_robot_rules: Option<Cylon> = None;
let mut wait_until: Option<Instant> = None;
let domain_record = {
let txn = self.store.ro_txn()?;
let dr = txn.get_domain_record(&domain)?;
match dr {
None => {
return Ok(());
}
Some(dr) => dr,
}
};
while !self.graceful_stop.load(Ordering::Relaxed) {
// Get a URL to process
let url = {
@@ -138,6 +149,20 @@ impl TaskContext {
let url = Url::parse(&url_str)
.with_context(|| format!("failed to parse as URL: {url_str:?}"))?;
if !domain_record.is_url_rakeable(&url).unwrap_or(false) {
// This is now a weed: skip.
let domain = domain.clone();
let url = url.clone();
self.store
.async_rw_txn(move |txn| {
txn.dequeue_url(&domain, url.as_str())?;
txn.commit()?;
Ok(())
})
.await?;
continue;
}
// Check our robot rules are valid for that URL.
let robot_url = robots_txt_url_for(&url)
.with_context(|| format!("failed to get robots.txt URL for {url_str:?}"))?;

View File

@@ -373,6 +373,15 @@ impl<'a> RakerTxn<'a, RW> {
Ok(())
}
/// Removes a URL from the queue of the given domain.
///
/// The queue entry is looked up in the `queue_urls` database under the key
/// `"{domain}\n{url_str}"`. Only that entry is deleted: this method records nothing else
/// about the URL (in particular it does not mark it as visited) and it does not commit the
/// transaction, so the caller must still call `commit` for the removal to take effect.
///
/// # Errors
///
/// Returns an error if the underlying delete operation on the transaction fails.
pub fn dequeue_url(&self, domain: &str, url_str: &str) -> anyhow::Result<()> {
    // `queue_urls` is already a reference, so it is passed on as-is below.
    let queue_urls = &self.mdbx.borrow_dbs().queue_urls;
    let queue_key = format!("{domain}\n{url_str}");
    // The success value of `del` is discarded on purpose; presumably it reports whether an
    // entry existed, so a URL that is not queued is not treated as an error — TODO confirm.
    self.mdbx_txn.del(queue_urls, queue_key.as_bytes(), None)?;
    Ok(())
}
pub fn start_backing_off(
&self,
domain: &str,