Add configurable re-rake times for different kinds of raked things
This commit is contained in:
parent
d5255410f5
commit
6ecbc0561f
@ -96,5 +96,11 @@
|
|||||||
pack_emitter: (
|
pack_emitter: (
|
||||||
|
|
||||||
),
|
),
|
||||||
|
|
||||||
|
rerake_timings: (
|
||||||
|
page: 300,
|
||||||
|
icon: 365,
|
||||||
|
feed: 10,
|
||||||
|
)
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
@ -268,6 +268,7 @@ pub async fn main() -> anyhow::Result<()> {
|
|||||||
submission,
|
submission,
|
||||||
graceful_stop,
|
graceful_stop,
|
||||||
notify: graceful_stop_notify,
|
notify: graceful_stop_notify,
|
||||||
|
rerake_timings: Arc::new(config.raker.rerake_timings.clone()),
|
||||||
};
|
};
|
||||||
|
|
||||||
// Reinstate old backoffs
|
// Reinstate old backoffs
|
||||||
|
@ -28,6 +28,23 @@ pub struct RakerOnlyConfig {
|
|||||||
pub metrics: MetricsConfig,
|
pub metrics: MetricsConfig,
|
||||||
|
|
||||||
pub pack_emitter: PackEmitterSettings,
|
pub pack_emitter: PackEmitterSettings,
|
||||||
|
|
||||||
|
pub rerake_timings: RerakeTimings,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
|
pub struct RerakeTimings {
|
||||||
|
/// How long, in days, between re-rakes of the same page?
|
||||||
|
/// Suggested: 300
|
||||||
|
pub page: u16,
|
||||||
|
|
||||||
|
/// How long, in days, between re-rakes of feeds?
|
||||||
|
/// Suggested: 10
|
||||||
|
pub feed: u16,
|
||||||
|
|
||||||
|
/// How long, in days, between re-rakes of icons?
|
||||||
|
/// Suggested: 365
|
||||||
|
pub icon: u16,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl RakerConfig {
|
impl RakerConfig {
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
use crate::config::RerakeTimings;
|
||||||
use crate::raking::references::references_from_urlrakes;
|
use crate::raking::references::references_from_urlrakes;
|
||||||
use crate::raking::{
|
use crate::raking::{
|
||||||
get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent,
|
get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent,
|
||||||
@ -78,6 +79,8 @@ pub struct TaskContext {
|
|||||||
/// Notifier used to wake up sleepers (either to stop them gracefully, or because work
|
/// Notifier used to wake up sleepers (either to stop them gracefully, or because work
|
||||||
/// is available (not implemented))
|
/// is available (not implemented))
|
||||||
pub notify: Arc<Notify>,
|
pub notify: Arc<Notify>,
|
||||||
|
|
||||||
|
pub rerake_timings: Arc<RerakeTimings>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TaskContext {
|
impl TaskContext {
|
||||||
@ -348,7 +351,7 @@ impl TaskContext {
|
|||||||
.process_page(url.clone(), page.page_entry, today)
|
.process_page(url.clone(), page.page_entry, today)
|
||||||
.await?;
|
.await?;
|
||||||
self.as_event_processor()
|
self.as_event_processor()
|
||||||
.process_refs(url.clone(), page.referrer_entry, today)
|
.process_refs(url.clone(), page.referrer_entry, today, false)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
Ok(NextAction::Continue)
|
Ok(NextAction::Continue)
|
||||||
@ -365,7 +368,7 @@ impl TaskContext {
|
|||||||
.context("Reference processor shut down; can't stream references!")?;
|
.context("Reference processor shut down; can't stream references!")?;
|
||||||
|
|
||||||
self.as_event_processor()
|
self.as_event_processor()
|
||||||
.process_refs(url.clone(), refs, today)
|
.process_refs(url.clone(), refs, today, true)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
Ok(NextAction::Continue)
|
Ok(NextAction::Continue)
|
||||||
@ -382,7 +385,7 @@ impl TaskContext {
|
|||||||
.context("Reference processor shut down; can't stream references!")?;
|
.context("Reference processor shut down; can't stream references!")?;
|
||||||
|
|
||||||
self.as_event_processor()
|
self.as_event_processor()
|
||||||
.process_refs(url.clone(), refs, today)
|
.process_refs(url.clone(), refs, today, true)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
Ok(NextAction::Continue)
|
Ok(NextAction::Continue)
|
||||||
@ -427,7 +430,7 @@ impl TaskContext {
|
|||||||
.context("Reference processor shut down; can't stream references!")?;
|
.context("Reference processor shut down; can't stream references!")?;
|
||||||
|
|
||||||
self.as_event_processor()
|
self.as_event_processor()
|
||||||
.process_refs(url.clone(), refs, today)
|
.process_refs(url.clone(), refs, today, false)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
Ok(NextAction::Continue)
|
Ok(NextAction::Continue)
|
||||||
@ -474,6 +477,7 @@ impl TaskContext {
|
|||||||
fn as_event_processor(&self) -> EventProcessor {
|
fn as_event_processor(&self) -> EventProcessor {
|
||||||
EventProcessor {
|
EventProcessor {
|
||||||
store: Cow::Borrowed(&self.store),
|
store: Cow::Borrowed(&self.store),
|
||||||
|
rerake_timings: &self.rerake_timings,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -483,6 +487,7 @@ impl TaskContext {
|
|||||||
/// just by replaying the stream of RakePacks and importing seeds.
|
/// just by replaying the stream of RakePacks and importing seeds.
|
||||||
pub struct EventProcessor<'a> {
|
pub struct EventProcessor<'a> {
|
||||||
store: Cow<'a, RakerStore>,
|
store: Cow<'a, RakerStore>,
|
||||||
|
rerake_timings: &'a RerakeTimings,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl EventProcessor<'_> {
|
impl EventProcessor<'_> {
|
||||||
@ -492,6 +497,7 @@ impl EventProcessor<'_> {
|
|||||||
page: RakedPageEntry,
|
page: RakedPageEntry,
|
||||||
datestamp: u16,
|
datestamp: u16,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
|
let rerake_on = Some(datestamp + self.rerake_timings.page);
|
||||||
self.store
|
self.store
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.async_rw_txn(move |txn| {
|
.async_rw_txn(move |txn| {
|
||||||
@ -502,6 +508,7 @@ impl EventProcessor<'_> {
|
|||||||
UrlVisitedRecord {
|
UrlVisitedRecord {
|
||||||
last_visited_days: datestamp,
|
last_visited_days: datestamp,
|
||||||
},
|
},
|
||||||
|
rerake_on,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// If there's a favicon to be tried, add it to the list...
|
// If there's a favicon to be tried, add it to the list...
|
||||||
@ -517,6 +524,8 @@ impl EventProcessor<'_> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub async fn process_icon(&self, url: Url, datestamp: u16) -> anyhow::Result<()> {
|
pub async fn process_icon(&self, url: Url, datestamp: u16) -> anyhow::Result<()> {
|
||||||
|
let rerake_on = Some(datestamp + self.rerake_timings.icon);
|
||||||
|
|
||||||
self.store
|
self.store
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.async_rw_txn(move |txn| {
|
.async_rw_txn(move |txn| {
|
||||||
@ -527,6 +536,7 @@ impl EventProcessor<'_> {
|
|||||||
UrlVisitedRecord {
|
UrlVisitedRecord {
|
||||||
last_visited_days: datestamp,
|
last_visited_days: datestamp,
|
||||||
},
|
},
|
||||||
|
rerake_on,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
txn.commit()?;
|
txn.commit()?;
|
||||||
@ -540,7 +550,13 @@ impl EventProcessor<'_> {
|
|||||||
url: Url,
|
url: Url,
|
||||||
refs: RakedReferrerEntry,
|
refs: RakedReferrerEntry,
|
||||||
datestamp: u16,
|
datestamp: u16,
|
||||||
|
rerakeable_feed: bool,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
|
let rerake_on = if rerakeable_feed {
|
||||||
|
Some(self.rerake_timings.feed)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
self.store
|
self.store
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.async_rw_txn(move |txn| {
|
.async_rw_txn(move |txn| {
|
||||||
@ -551,6 +567,7 @@ impl EventProcessor<'_> {
|
|||||||
UrlVisitedRecord {
|
UrlVisitedRecord {
|
||||||
last_visited_days: datestamp,
|
last_visited_days: datestamp,
|
||||||
},
|
},
|
||||||
|
rerake_on,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// track all the referred-to URLs!
|
// track all the referred-to URLs!
|
||||||
@ -603,6 +620,7 @@ impl EventProcessor<'_> {
|
|||||||
UrlVisitedRecord {
|
UrlVisitedRecord {
|
||||||
last_visited_days: datestamp,
|
last_visited_days: datestamp,
|
||||||
},
|
},
|
||||||
|
None,
|
||||||
)?;
|
)?;
|
||||||
txn.commit()?;
|
txn.commit()?;
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -32,6 +32,9 @@ pub mod records;
|
|||||||
pub struct Databases<'env> {
|
pub struct Databases<'env> {
|
||||||
/// Domain \n URL → QueueUrlRecord
|
/// Domain \n URL → QueueUrlRecord
|
||||||
pub queue_urls: Database<'env>,
|
pub queue_urls: Database<'env>,
|
||||||
|
/// u16 → URL. The u16 is the day-precision QuickPeep timestamp at which the URL should (MULTI-VALUE; INT16)
|
||||||
|
/// be enqueued again for reraking.
|
||||||
|
pub rerake_queue: Database<'env>,
|
||||||
/// Domain → ActiveDomainRecord
|
/// Domain → ActiveDomainRecord
|
||||||
pub active_domains: Database<'env>,
|
pub active_domains: Database<'env>,
|
||||||
/// u32 → domain name. Used to try and give some fairness.
|
/// u32 → domain name. Used to try and give some fairness.
|
||||||
@ -54,6 +57,7 @@ impl<'env> Databases<'env> {
|
|||||||
pub fn iter_all_databases(&self) -> impl Iterator<Item = (&'static str, &Database<'env>)> {
|
pub fn iter_all_databases(&self) -> impl Iterator<Item = (&'static str, &Database<'env>)> {
|
||||||
[
|
[
|
||||||
("queue_urls", &self.queue_urls),
|
("queue_urls", &self.queue_urls),
|
||||||
|
("rerake_queue", &self.rerake_queue),
|
||||||
("active_domains", &self.active_domains),
|
("active_domains", &self.active_domains),
|
||||||
("active_domain_raffle", &self.active_domain_raffle),
|
("active_domain_raffle", &self.active_domain_raffle),
|
||||||
(
|
(
|
||||||
@ -71,8 +75,9 @@ impl<'env> Databases<'env> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Must match the order of the Databases struct fields.
|
// Must match the order of the Databases struct fields.
|
||||||
pub const DATABASES: [(&'static str, DatabaseFlags); 9] = [
|
pub const DATABASES: [(&'static str, DatabaseFlags); 10] = [
|
||||||
("urls_queue", DatabaseFlags::empty()),
|
("urls_queue", DatabaseFlags::empty()),
|
||||||
|
("rerake_queue", DatabaseFlags::DUP_SORT),
|
||||||
("active_domains", DatabaseFlags::empty()),
|
("active_domains", DatabaseFlags::empty()),
|
||||||
("active_domain_raffle", DatabaseFlags::INTEGER_KEY),
|
("active_domain_raffle", DatabaseFlags::INTEGER_KEY),
|
||||||
(
|
(
|
||||||
@ -176,6 +181,7 @@ impl RakerStore {
|
|||||||
// Must match the order of the DATABASES constant and the struct field definitions
|
// Must match the order of the DATABASES constant and the struct field definitions
|
||||||
Databases {
|
Databases {
|
||||||
queue_urls: dbs.next().unwrap(),
|
queue_urls: dbs.next().unwrap(),
|
||||||
|
rerake_queue: dbs.next().unwrap(),
|
||||||
active_domains: dbs.next().unwrap(),
|
active_domains: dbs.next().unwrap(),
|
||||||
active_domain_raffle: dbs.next().unwrap(),
|
active_domain_raffle: dbs.next().unwrap(),
|
||||||
backing_off_reinstatements: dbs.next().unwrap(),
|
backing_off_reinstatements: dbs.next().unwrap(),
|
||||||
@ -337,9 +343,11 @@ impl<'a> RakerTxn<'a, RW> {
|
|||||||
domain: &str,
|
domain: &str,
|
||||||
url_str: &str,
|
url_str: &str,
|
||||||
record: UrlVisitedRecord,
|
record: UrlVisitedRecord,
|
||||||
|
rerake_on: Option<u16>,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let queue_urls = &self.mdbx.borrow_dbs().queue_urls;
|
let queue_urls = &self.mdbx.borrow_dbs().queue_urls;
|
||||||
let visited_urls = &self.mdbx.borrow_dbs().visited_urls;
|
let visited_urls = &self.mdbx.borrow_dbs().visited_urls;
|
||||||
|
let rerake_queue = &self.mdbx.borrow_dbs().rerake_queue;
|
||||||
|
|
||||||
let queue_key = format!("{}\n{}", domain, url_str);
|
let queue_key = format!("{}\n{}", domain, url_str);
|
||||||
|
|
||||||
@ -358,6 +366,15 @@ impl<'a> RakerTxn<'a, RW> {
|
|||||||
WriteFlags::empty(),
|
WriteFlags::empty(),
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
|
if let Some(rerake_on) = rerake_on {
|
||||||
|
self.mdbx_txn.put(
|
||||||
|
rerake_queue,
|
||||||
|
&rerake_on.to_be_bytes(),
|
||||||
|
url_str.as_bytes(),
|
||||||
|
WriteFlags::empty(),
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user