Skip raking of weeded URLs
May be useful for retroactively clearing out URLs
parent ff514e90b8
commit e07ac16bc4
@@ -94,6 +94,17 @@ impl TaskContext {
         let mut current_robot_rules: Option<Cylon> = None;
         let mut wait_until: Option<Instant> = None;
 
+        let domain_record = {
+            let txn = self.store.ro_txn()?;
+            let dr = txn.get_domain_record(&domain)?;
+            match dr {
+                None => {
+                    return Ok(());
+                }
+                Some(dr) => dr,
+            }
+        };
+
         while !self.graceful_stop.load(Ordering::Relaxed) {
             // Get a URL to process
             let url = {
@@ -138,6 +149,20 @@ impl TaskContext {
             let url = Url::parse(&url_str)
                 .with_context(|| format!("failed to parse as URL: {url_str:?}"))?;
 
+            if !domain_record.is_url_rakeable(&url).unwrap_or(false) {
+                // This is now a weed: skip.
+                let domain = domain.clone();
+                let url = url.clone();
+                self.store
+                    .async_rw_txn(move |txn| {
+                        txn.dequeue_url(&domain, url.as_str())?;
+                        txn.commit()?;
+                        Ok(())
+                    })
+                    .await?;
+                continue;
+            }
+
             // Check our robot rules are valid for that URL.
             let robot_url = robots_txt_url_for(&url)
                 .with_context(|| format!("failed to get robots.txt URL for {url_str:?}"))?;
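The skip hinges on DomainRecord::is_url_rakeable, whose implementation is not part of this diff; the .unwrap_or(false) call only tells us it returns an Option<bool>, with "no answer" treated as "not rakeable". A minimal sketch of what such a check could look like, assuming the domain record keeps a map of path prefixes to rakeable flags (the rakeable_path_prefixes field and the longest-prefix rule are assumptions, not taken from this commit):

use std::collections::BTreeMap;
use url::Url;

/// Assumed shape of the relevant part of the domain record (hypothetical field).
pub struct DomainRecord {
    /// Path prefix -> whether URLs under that prefix should be raked.
    pub rakeable_path_prefixes: BTreeMap<String, bool>,
}

impl DomainRecord {
    /// Some(flag) if a configured prefix covers the URL's path, None otherwise.
    /// The caller above treats None as "not rakeable" via .unwrap_or(false).
    pub fn is_url_rakeable(&self, url: &Url) -> Option<bool> {
        let path = url.path();
        // Let the longest matching prefix win, so more specific rules
        // override broader ones.
        self.rakeable_path_prefixes
            .iter()
            .filter(|(prefix, _)| path.starts_with(prefix.as_str()))
            .max_by_key(|(prefix, _)| prefix.len())
            .map(|(_, rakeable)| *rakeable)
    }
}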
@@ -373,6 +373,15 @@ impl<'a> RakerTxn<'a, RW> {
         Ok(())
     }
 
+    /// Marks a URL as visited and takes it out of the queue.
+    pub fn dequeue_url(&self, domain: &str, url_str: &str) -> anyhow::Result<()> {
+        let queue_urls = &self.mdbx.borrow_dbs().queue_urls;
+        let queue_key = format!("{}\n{}", domain, url_str);
+        self.mdbx_txn.del(&queue_urls, queue_key.as_bytes(), None)?;
+
+        Ok(())
+    }
+
     pub fn start_backing_off(
         &self,
         domain: &str,
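The commit message notes this may be useful for retroactively clearing out URLs. A hedged sketch of how dequeue_url could be driven for that purpose, assuming the caller has already obtained the queued URL strings for a domain (enumerating the queue is not shown in this commit, so queued_urls is a hypothetical input); committing the transaction is left to the caller, as in the async_rw_txn closure above:

use anyhow::Context;
use url::Url;

/// Hypothetical cleanup pass: drop every already-queued URL that the domain
/// record no longer considers rakeable, i.e. URLs that have become weeds.
fn clear_weeds_from_queue(
    txn: &RakerTxn<'_, RW>,
    domain_record: &DomainRecord,
    domain: &str,
    queued_urls: &[String],
) -> anyhow::Result<()> {
    for url_str in queued_urls {
        let url = Url::parse(url_str)
            .with_context(|| format!("failed to parse as URL: {url_str:?}"))?;
        if !domain_record.is_url_rakeable(&url).unwrap_or(false) {
            // Same removal the raking loop now performs when it hits a weed.
            txn.dequeue_url(domain, url.as_str())?;
        }
    }
    Ok(())
}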