Emit icons from the raker
continuous-integration/drone the build failed Details

This commit is contained in:
Olivier 'reivilibre' 2022-03-27 19:40:12 +01:00
parent bdfacc643e
commit 27c3218097
3 changed files with 61 additions and 3 deletions

View File

@ -27,7 +27,8 @@ use quickpeep_raker::raking::task::{TaskContext, TaskResultSubmission};
use quickpeep_raker::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT}; use quickpeep_raker::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
use quickpeep_raker::storage::RakerStore; use quickpeep_raker::storage::RakerStore;
use quickpeep_structs::rake_entries::{ use quickpeep_structs::rake_entries::{
AnalysisAntifeatures, SCHEMA_RAKED_PAGES, SCHEMA_RAKED_REFERENCES, SCHEMA_RAKED_REJECTIONS, AnalysisAntifeatures, SCHEMA_RAKED_ICONS, SCHEMA_RAKED_PAGES, SCHEMA_RAKED_REFERENCES,
SCHEMA_RAKED_REJECTIONS,
}; };
/// The ordering is slightly important on these: more specific things should come first. /// The ordering is slightly important on these: more specific things should come first.
@ -147,6 +148,7 @@ pub async fn main() -> anyhow::Result<()> {
let (pages_tx, pages_rx) = mpsc::channel(32); let (pages_tx, pages_rx) = mpsc::channel(32);
let (refs_tx, refs_rx) = mpsc::channel(32); let (refs_tx, refs_rx) = mpsc::channel(32);
let (rejections_tx, rejections_rx) = mpsc::channel(32); let (rejections_tx, rejections_rx) = mpsc::channel(32);
let (icons_tx, icons_rx) = mpsc::channel(32);
let mut emitters = Vec::with_capacity(3); let mut emitters = Vec::with_capacity(3);
@ -201,10 +203,24 @@ pub async fn main() -> anyhow::Result<()> {
); );
} }
{
let emit_dir = config.emit_dir.clone();
let settings = config.pack_emitter.clone();
emitters.push(
std::thread::Builder::new()
.name("icons emitter".to_owned())
.spawn(move || -> anyhow::Result<()> {
pack_emitter(&emit_dir, "icons", SCHEMA_RAKED_ICONS, icons_rx, &settings)?;
Ok(())
})?,
);
}
let submission = TaskResultSubmission { let submission = TaskResultSubmission {
pages: pages_tx, pages: pages_tx,
references: refs_tx, references: refs_tx,
rejections: rejections_tx, rejections: rejections_tx,
icons: icons_tx,
}; };
let graceful_stop = Arc::new(AtomicBool::new(false)); let graceful_stop = Arc::new(AtomicBool::new(false));

View File

@ -12,7 +12,7 @@ use cylon::Cylon;
use log::warn; use log::warn;
use lru::LruCache; use lru::LruCache;
use quickpeep_structs::rake_entries::{ use quickpeep_structs::rake_entries::{
RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind, IconEntry, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind,
}; };
use quickpeep_utils::dates::date_to_quickpeep_days; use quickpeep_utils::dates::date_to_quickpeep_days;
use reqwest::{Client, Url}; use reqwest::{Client, Url};
@ -44,6 +44,7 @@ pub struct TaskResultSubmission {
pub pages: Sender<(Url, RakedPageEntry)>, pub pages: Sender<(Url, RakedPageEntry)>,
pub references: Sender<(Url, RakedReferrerEntry)>, pub references: Sender<(Url, RakedReferrerEntry)>,
pub rejections: Sender<(Url, PermanentFailure)>, pub rejections: Sender<(Url, PermanentFailure)>,
pub icons: Sender<(Url, IconEntry)>,
} }
#[derive(Clone)] #[derive(Clone)]
@ -359,7 +360,22 @@ impl TaskContext {
} }
RakeOutcome::RakedIcon(icon) => { RakeOutcome::RakedIcon(icon) => {
// Store icon to icon store // Store icon to icon store
todo!();
self.submission
.icons
.send((
url.clone(),
IconEntry {
webp_bytes: icon.webp_bytes,
},
))
.await?;
self.as_event_processor()
.process_icon(url.clone(), today)
.await?;
Ok(NextAction::Continue)
} }
RakeOutcome::Redirect { reason, new_url } => { RakeOutcome::Redirect { reason, new_url } => {
let refs = RakedReferrerEntry { let refs = RakedReferrerEntry {
@ -476,6 +492,25 @@ impl EventProcessor<'_> {
.await .await
} }
/// Records that the icon at `url` has been visited.
///
/// Within a single store transaction (opened via `async_rw_txn`), the URL is
/// marked as visited under its reduced domain, with `datestamp` stored as the
/// record's `last_visited_days`, and the transaction is then committed.
///
/// # Errors
///
/// Propagates any error from reducing the URL's domain, from marking the URL
/// as visited, or from committing the transaction.
pub async fn process_icon(&self, url: Url, datestamp: u16) -> anyhow::Result<()> {
    self.store
        .as_ref()
        .async_rw_txn(move |rw_txn| {
            // The record only carries the day on which the icon was fetched.
            let visit_record = UrlVisitedRecord {
                last_visited_days: datestamp,
            };
            let reduced_domain = get_reduced_domain(&url)?;

            rw_txn.mark_url_as_visited(reduced_domain.as_ref(), url.as_ref(), visit_record)?;
            // Nothing is persisted unless the transaction is committed explicitly.
            rw_txn.commit()?;
            Ok(())
        })
        .await
}
pub async fn process_refs( pub async fn process_refs(
&self, &self,
url: Url, url: Url,

View File

@ -29,6 +29,7 @@ impl_serde_for_bitflags!(AnalysisAntifeatures);
pub const SCHEMA_RAKED_PAGES: &str = "quickpeep_pages:0.1.0"; pub const SCHEMA_RAKED_PAGES: &str = "quickpeep_pages:0.1.0";
pub const SCHEMA_RAKED_REFERENCES: &str = "quickpeep_references:0.1.0"; pub const SCHEMA_RAKED_REFERENCES: &str = "quickpeep_references:0.1.0";
pub const SCHEMA_RAKED_REJECTIONS: &str = "quickpeep_rejections:0.1.0"; pub const SCHEMA_RAKED_REJECTIONS: &str = "quickpeep_rejections:0.1.0";
pub const SCHEMA_RAKED_ICONS: &str = "quickpeep_icons:0.1.0";
#[derive(Serialize, Deserialize, Debug, Clone)] #[derive(Serialize, Deserialize, Debug, Clone)]
pub struct RakedPageEntry { pub struct RakedPageEntry {
@ -36,6 +37,12 @@ pub struct RakedPageEntry {
pub document: DenseDocument, pub document: DenseDocument,
} }
/// An icon entry as emitted by the raker (sent alongside the icon's URL and
/// packed under the `SCHEMA_RAKED_ICONS` schema).
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct IconEntry {
/// Densely-packed WebP bytes (with low quality).
pub webp_bytes: Vec<u8>,
}
#[derive(Serialize, Deserialize, Debug, Clone)] #[derive(Serialize, Deserialize, Debug, Clone)]
pub struct RakedReferrerEntry { pub struct RakedReferrerEntry {
pub references: BTreeSet<RakedReference>, pub references: BTreeSet<RakedReference>,