quickpeep/quickpeep_raker/src/storage/records.rs

74 lines
2.5 KiB
Rust

use crate::raking::{RakeIntent, TemporaryFailure};
use reqwest::Url;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
/// Serializable record holding the per-domain data kept for an active domain.
///
/// NOTE(review): what makes a domain "active" and how the raffle is drawn are defined
/// outside this file — confirm against the code that reads and writes this record.
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
pub struct ActiveDomainRecord {
/// The raffle ticket number owned by this domain.
pub raffle_ticket: u32,
}
/// Serializable record noting when a URL was last raked, at day granularity.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct UrlVisitedRecord {
/// Number of days since the QuickPeep Epoch that this page was last raked at.
/// A u16 is fine here, giving 179 years worth of values. This allows compact encoding.
/// We don't really care about a more granular timestamp: sitemaps and feeds usually only
/// give the date of last update anyway.
pub last_visited_days: u16,
}
/// Serializable record stored for a URL that is waiting in the queue.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct QueueUrlRecord {
/// The rake intent associated with the queued URL.
/// NOTE(review): `RakeIntent` is defined in `crate::raking`; presumably it describes what
/// kind of resource the URL is expected to be — confirm there.
pub intent: RakeIntent, // TODO CONSIDER
}
/// Serializable record for a URL that is being held back rather than queued.
///
/// It wraps the [`QueueUrlRecord`] to emit on release, together with a count of how many
/// times the URL has been enqueued while on hold.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct OnHoldUrlRecord {
/// Record that should be emitted once this is released.
pub queue_record: QueueUrlRecord,
/// Number of times this URL has been 'enqueued'; capped at 255.
pub refs: u8,
}
/// Serializable record describing a domain that is currently backing off after a
/// temporary failure.
///
/// NOTE(review): the units of `backoff` and `reinstate_at` are not visible in this file —
/// confirm against the code that writes this record and the reinstatements table.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct BackingOffDomainRecord {
/// The URL that caused the backoff.
pub failed_url: String,
/// The reason that this backoff is in place
pub failure: TemporaryFailure,
/// Duration of the backoff. Used to provide increasing backoffs if the failures persist.
pub backoff: u32,
/// When the domain should be reinstated
/// MUST match the timestamp present in the reinstatements table.
pub reinstate_at: u64,
}
/// Serializable per-domain record of which URL path prefixes may be raked.
///
/// The `Default` value has an empty prefix map.
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
pub struct DomainRecord {
/// Maps a URL path prefix to whether paths under that prefix are rakeable.
/// A sorted map is used so that `is_url_rakeable` can rely on key ordering when looking
/// for the prefixes that apply to a path.
pub rakeable_path_prefixes: BTreeMap<String, bool>,
}
impl DomainRecord {
    /// Returns whether the URL is rakeable.
    ///
    /// The answer is the flag stored against the longest entry of
    /// `rakeable_path_prefixes` that is a prefix of the URL's path. Returns `None` when no
    /// entry is a prefix of the path (including when the map is empty).
    ///
    /// Preconditions: it has been checked that the record applies to the domain
    pub fn is_url_rakeable(&self, url: &Url) -> Option<bool> {
        let path = url.path();

        // Any prefix of `path` sorts at or before `path` itself, so keys greater than
        // `path` can never match and are excluded by the range bound.
        // e.g. /dog > /cat/xyz
        //
        // Among the remaining keys, the prefixes of `path` sort by increasing length, so
        // walking backwards and stopping at the first match yields the longest matching
        // prefix without touching the entries before it.
        self.rakeable_path_prefixes
            .range::<str, _>((
                std::ops::Bound::Unbounded,
                std::ops::Bound::Included(path),
            ))
            .rev()
            .find(|(prefix, _)| path.starts_with(prefix.as_str()))
            .map(|(_, &rakeable)| rakeable)
    }
}