Skip raking of weeded URLs
May be useful for retroactively clearing out URLs
This commit is contained in:
parent
ff514e90b8
commit
e07ac16bc4
@ -94,6 +94,17 @@ impl TaskContext {
|
|||||||
let mut current_robot_rules: Option<Cylon> = None;
|
let mut current_robot_rules: Option<Cylon> = None;
|
||||||
let mut wait_until: Option<Instant> = None;
|
let mut wait_until: Option<Instant> = None;
|
||||||
|
|
||||||
|
let domain_record = {
|
||||||
|
let txn = self.store.ro_txn()?;
|
||||||
|
let dr = txn.get_domain_record(&domain)?;
|
||||||
|
match dr {
|
||||||
|
None => {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
Some(dr) => dr,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
while !self.graceful_stop.load(Ordering::Relaxed) {
|
while !self.graceful_stop.load(Ordering::Relaxed) {
|
||||||
// Get a URL to process
|
// Get a URL to process
|
||||||
let url = {
|
let url = {
|
||||||
@ -138,6 +149,20 @@ impl TaskContext {
|
|||||||
let url = Url::parse(&url_str)
|
let url = Url::parse(&url_str)
|
||||||
.with_context(|| format!("failed to parse as URL: {url_str:?}"))?;
|
.with_context(|| format!("failed to parse as URL: {url_str:?}"))?;
|
||||||
|
|
||||||
|
if !domain_record.is_url_rakeable(&url).unwrap_or(false) {
|
||||||
|
// This is now a weed: skip.
|
||||||
|
let domain = domain.clone();
|
||||||
|
let url = url.clone();
|
||||||
|
self.store
|
||||||
|
.async_rw_txn(move |txn| {
|
||||||
|
txn.dequeue_url(&domain, url.as_str())?;
|
||||||
|
txn.commit()?;
|
||||||
|
Ok(())
|
||||||
|
})
|
||||||
|
.await?;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// Check our robot rules are valid for that URL.
|
// Check our robot rules are valid for that URL.
|
||||||
let robot_url = robots_txt_url_for(&url)
|
let robot_url = robots_txt_url_for(&url)
|
||||||
.with_context(|| format!("failed to get robots.txt URL for {url_str:?}"))?;
|
.with_context(|| format!("failed to get robots.txt URL for {url_str:?}"))?;
|
||||||
|
@ -373,6 +373,15 @@ impl<'a> RakerTxn<'a, RW> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Takes a URL out of the rake queue.
///
/// Deletes the entry keyed `"{domain}\n{url_str}"` from the `queue_urls`
/// database. Only the queue entry is touched here; nothing in this function
/// records the URL as visited.
///
/// The transaction is not committed by this function.
///
/// # Errors
///
/// Propagates any error returned by the underlying delete.
|
||||||
|
pub fn dequeue_url(&self, domain: &str, url_str: &str) -> anyhow::Result<()> {
|
||||||
|
let queue_urls = &self.mdbx.borrow_dbs().queue_urls;
|
||||||
|
// Queue keys are the domain and the URL joined by a newline.
let queue_key = format!("{}\n{}", domain, url_str);
|
||||||
|
// NOTE(review): the trailing `None` presumably means "delete regardless of stored value" — confirm against the MDBX binding.
self.mdbx_txn.del(&queue_urls, queue_key.as_bytes(), None)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
pub fn start_backing_off(
|
pub fn start_backing_off(
|
||||||
&self,
|
&self,
|
||||||
domain: &str,
|
domain: &str,
|
||||||
|
Loading…
Reference in New Issue
Block a user