quickpeep/quickpeep_raker/src/storage/maintenance.rs

81 lines
2.5 KiB
Rust

use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString};
use crate::storage::records::{DomainRecord, OnHoldUrlRecord};
use crate::storage::RakerTxn;
use anyhow::Context;
use libmdbx::{Database, WriteFlags, RW};
use reqwest::Url;
/// Re-evaluates every on-hold URL against the current domain records, in one big transaction.
///
/// For each entry in the `urls_on_hold` database:
/// - if the domain's record reports the URL as rakeable ('allowed'), the URL is enqueued with
///   the intent stored in its on-hold record and the on-hold entry is deleted;
/// - if the domain's record reports the URL as not rakeable (a 'weed'), the on-hold entry is
///   simply deleted;
/// - if there is no domain record, or the record gives no verdict, the entry is left on hold.
///
/// The transaction is consumed and committed once the whole scan has succeeded.
///
/// Ideally should be applied after importing seeds and weeds on an existing database.
///
/// # Errors
///
/// Returns early, before `commit` is reached, if a cursor or database operation fails, if an
/// on-hold key does not have the `domain\nurl` form, or if the URL part cannot be parsed.
pub fn reapply_seeds_and_weeds_to_on_hold_urls(txn: RakerTxn<RW>) -> anyhow::Result<()> {
    let urls_on_hold: &Database = &txn.mdbx.borrow_dbs().urls_on_hold;

    // Cache of the most recently looked-up domain and its record (if any), so that a run of
    // consecutive entries for the same domain only triggers one domain-record lookup.
    let mut cached_domain: Option<String> = None;
    let mut cached_record: Option<DomainRecord> = None;

    // Scan through the on-hold URLs
    let mut cur = txn.mdbx_txn.cursor(urls_on_hold)?;
    let mut entry = cur.first::<MdbxString, MdbxBare<OnHoldUrlRecord>>()?;
    while let Some((MdbxString(domain_then_url), MdbxBare(record))) = entry {
        // Keys have the form `<domain>\n<url>`; split only at the first newline so that the
        // URL part is taken whole.
        let key: &str = domain_then_url.as_ref();
        let (domain, url_str) = key.split_once('\n').context("No URL")?;

        // Is the domain new? Then load the relevant record for it.
        if cached_domain.as_deref() != Some(domain) {
            let owned_domain = domain.to_owned();
            cached_record = txn.get_domain_record(domain)?;
            cached_domain = Some(owned_domain);
        }

        let url = Url::parse(url_str)
            .with_context(|| format!("Failed to parse on-hold URL {:?}", url_str))?;

        // `None` here means either "no record for this domain" or "the record has no opinion
        // about this URL"; both cases keep the URL on hold.
        let is_rakeable = cached_record
            .as_ref()
            .and_then(|dr: &DomainRecord| dr.is_url_rakeable(&url));

        match is_rakeable {
            Some(true) => {
                // ALLOWED
                // Make it a queued URL, then drop the on-hold entry under the cursor.
                txn.enqueue_url(url_str, None, record.queue_record.intent)?;
                cur.del(WriteFlags::empty())?;
            }
            Some(false) => {
                // WEED
                // Just delete
                cur.del(WriteFlags::empty())?;
            }
            None => { /* nop: neither allowed nor a weed. Keep on hold. */ }
        }

        // Advance only after any deletion, preserving the original del-then-next ordering.
        entry = cur.next::<MdbxString, MdbxBare<OnHoldUrlRecord>>()?;
    }

    txn.commit()?;
    Ok(())
}