From 27c321809772dff63a62ec295d63843d0d8a10be Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Sun, 27 Mar 2022 19:40:12 +0100 Subject: [PATCH] Emit icons from the raker --- quickpeep_raker/src/bin/qp-raker.rs | 18 ++++++++++++- quickpeep_raker/src/raking/task.rs | 39 +++++++++++++++++++++++++-- quickpeep_structs/src/rake_entries.rs | 7 +++++ 3 files changed, 61 insertions(+), 3 deletions(-) diff --git a/quickpeep_raker/src/bin/qp-raker.rs b/quickpeep_raker/src/bin/qp-raker.rs index 7c5d3fc..54e5327 100644 --- a/quickpeep_raker/src/bin/qp-raker.rs +++ b/quickpeep_raker/src/bin/qp-raker.rs @@ -27,7 +27,8 @@ use quickpeep_raker::raking::task::{TaskContext, TaskResultSubmission}; use quickpeep_raker::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT}; use quickpeep_raker::storage::RakerStore; use quickpeep_structs::rake_entries::{ - AnalysisAntifeatures, SCHEMA_RAKED_PAGES, SCHEMA_RAKED_REFERENCES, SCHEMA_RAKED_REJECTIONS, + AnalysisAntifeatures, SCHEMA_RAKED_ICONS, SCHEMA_RAKED_PAGES, SCHEMA_RAKED_REFERENCES, + SCHEMA_RAKED_REJECTIONS, }; /// The ordering is slightly important on these: more specific things should come first. 
@@ -147,6 +148,7 @@ pub async fn main() -> anyhow::Result<()> { let (pages_tx, pages_rx) = mpsc::channel(32); let (refs_tx, refs_rx) = mpsc::channel(32); let (rejections_tx, rejections_rx) = mpsc::channel(32); + let (icons_tx, icons_rx) = mpsc::channel(32); let mut emitters = Vec::with_capacity(3); @@ -201,10 +203,24 @@ pub async fn main() -> anyhow::Result<()> { ); } + { + let emit_dir = config.emit_dir.clone(); + let settings = config.pack_emitter.clone(); + emitters.push( + std::thread::Builder::new() + .name("icons emitter".to_owned()) + .spawn(move || -> anyhow::Result<()> { + pack_emitter(&emit_dir, "icons", SCHEMA_RAKED_ICONS, icons_rx, &settings)?; + Ok(()) + })?, + ); + } + let submission = TaskResultSubmission { pages: pages_tx, references: refs_tx, rejections: rejections_tx, + icons: icons_tx, }; let graceful_stop = Arc::new(AtomicBool::new(false)); diff --git a/quickpeep_raker/src/raking/task.rs b/quickpeep_raker/src/raking/task.rs index 9c83597..280e9be 100644 --- a/quickpeep_raker/src/raking/task.rs +++ b/quickpeep_raker/src/raking/task.rs @@ -12,7 +12,7 @@ use cylon::Cylon; use log::warn; use lru::LruCache; use quickpeep_structs::rake_entries::{ - RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind, + IconEntry, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind, }; use quickpeep_utils::dates::date_to_quickpeep_days; use reqwest::{Client, Url}; @@ -44,6 +44,7 @@ pub struct TaskResultSubmission { pub pages: Sender<(Url, RakedPageEntry)>, pub references: Sender<(Url, RakedReferrerEntry)>, pub rejections: Sender<(Url, PermanentFailure)>, + pub icons: Sender<(Url, IconEntry)>, } #[derive(Clone)] @@ -359,7 +360,22 @@ impl TaskContext { } RakeOutcome::RakedIcon(icon) => { // Store icon to icon store - todo!(); + + self.submission + .icons + .send(( + url.clone(), + IconEntry { + webp_bytes: icon.webp_bytes, + }, + )) + .await?; + + self.as_event_processor() + .process_icon(url.clone(), today) + .await?; + + 
Ok(NextAction::Continue) } RakeOutcome::Redirect { reason, new_url } => { let refs = RakedReferrerEntry { @@ -476,6 +492,25 @@ impl EventProcessor<'_> { .await } + pub async fn process_icon(&self, url: Url, datestamp: u16) -> anyhow::Result<()> { + self.store + .as_ref() + .async_rw_txn(move |txn| { + let domain = get_reduced_domain(&url)?; + txn.mark_url_as_visited( + domain.as_ref(), + url.as_ref(), + UrlVisitedRecord { + last_visited_days: datestamp, + }, + )?; + + txn.commit()?; + Ok(()) + }) + .await + } + pub async fn process_refs( &self, url: Url, diff --git a/quickpeep_structs/src/rake_entries.rs b/quickpeep_structs/src/rake_entries.rs index 484c660..619557c 100644 --- a/quickpeep_structs/src/rake_entries.rs +++ b/quickpeep_structs/src/rake_entries.rs @@ -29,6 +29,7 @@ impl_serde_for_bitflags!(AnalysisAntifeatures); pub const SCHEMA_RAKED_PAGES: &str = "quickpeep_pages:0.1.0"; pub const SCHEMA_RAKED_REFERENCES: &str = "quickpeep_references:0.1.0"; pub const SCHEMA_RAKED_REJECTIONS: &str = "quickpeep_rejections:0.1.0"; +pub const SCHEMA_RAKED_ICONS: &str = "quickpeep_icons:0.1.0"; #[derive(Serialize, Deserialize, Debug, Clone)] pub struct RakedPageEntry { @@ -36,6 +37,12 @@ pub struct RakedPageEntry { pub document: DenseDocument, } +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct IconEntry { + /// Densely-packed WebP bytes (with low quality). + pub webp_bytes: Vec<u8>, +} + #[derive(Serialize, Deserialize, Debug, Clone)] pub struct RakedReferrerEntry { pub references: BTreeSet,