From 7dc2369dd2d464ae1cf5bf46e7a961f9fd4c3990 Mon Sep 17 00:00:00 2001 From: Olivier Date: Mon, 28 Mar 2022 19:45:39 +0100 Subject: [PATCH] Some groundwork for pulling out JSON-LD dates --- Cargo.lock | 1 + quickpeep_raker/Cargo.toml | 1 + quickpeep_raker/src/raking/page_extraction.rs | 72 ++++++++++++++++++- 3 files changed, 73 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 37cec7d..15b75c5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3794,6 +3794,7 @@ dependencies = [ "reqwest", "serde", "serde_bare", + "serde_json", "signal-hook 0.3.13", "sitemap", "smartstring", diff --git a/quickpeep_raker/Cargo.toml b/quickpeep_raker/Cargo.toml index 89750ff..fe5c9d4 100644 --- a/quickpeep_raker/Cargo.toml +++ b/quickpeep_raker/Cargo.toml @@ -22,6 +22,7 @@ kuchiki = "0.8.1" html5ever = "0.25.1" serde = { version = "1.0.136", features = ["derive"] } serde_bare = "0.5.0" +serde_json = "1.0.79" toml = "0.5.8" diff --git a/quickpeep_raker/src/raking/page_extraction.rs b/quickpeep_raker/src/raking/page_extraction.rs index 1992b39..5e2b6ff 100644 --- a/quickpeep_raker/src/raking/page_extraction.rs +++ b/quickpeep_raker/src/raking/page_extraction.rs @@ -4,15 +4,17 @@ use crate::raking::analysis::{ use crate::raking::{normalise_language, RedirectReason, FEED_LINK_MIME_TYPES}; use adblock::engine::Engine; use anyhow::{bail, Context}; +use chrono::{DateTime, Utc}; use html5ever::tendril::TendrilSink; use itertools::Itertools; use kuchiki::NodeRef; -use log::{debug, error, warn}; +use log::{debug, error, trace, warn}; use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree}; use quickpeep_structs::rake_entries::AnalysisAntifeatures; use quickpeep_utils::lazy::Lazy; use reqwest::header::HeaderMap; use reqwest::Url; +use serde::{Deserialize, Serialize}; use tokio::runtime; use tokio::sync::mpsc::Sender; use tokio::sync::{mpsc, oneshot}; @@ -234,6 +236,8 @@ impl PageExtractionServiceInternal { } } + find_page_metadata(root_node.clone())?; + let mut readability 
= quickpeep_moz_readability::Readability::new_from_node(root_node.clone()); if let Err(err) = readability.parse(url.as_str()) { @@ -274,6 +278,47 @@ impl PageExtractionServiceInternal { } } +fn parse_json_ld_date(input: String) -> Option<DateTime<Utc>> { + DateTime::parse_from_rfc3339(&input) + .ok() + .map(|dt| dt.with_timezone(&Utc)) +} + +pub fn find_page_metadata(root_node: NodeRef) -> anyhow::Result<PageMetadata> { + let mut meta = PageMetadata { + publication_date: None, + creation_date: None, + }; + + match root_node.select("script[type='application/ld+json']") { + Ok(ld_nodes) => { + for ld_node in ld_nodes { + if let Some(_element) = ld_node.as_node().as_element() { + let json_text = ld_node.text_contents(); + match serde_json::de::from_str::<JsonLdMetadata>(json_text.as_str()) { + Ok(jld) => { + let date_published = + jld.date_published.map(parse_json_ld_date).flatten(); + let date_created = jld.date_created.map(parse_json_ld_date).flatten(); + if date_published.is_some() { + meta.publication_date = date_published; + } + if date_created.is_some() { + meta.creation_date = date_created; + } + } + Err(err) => { + trace!("Bad JSON-LD: {:?}", err); + } + } + } + } + Ok(meta) + } + Err(()) => Ok(meta), + } +} + pub enum ExtractedPage { Success { document: DenseDocument, @@ -285,3 +330,28 @@ pub enum ExtractedPage { new_url: Url, }, } + +/// Metadata that may be found in a JSON-LD snippet on the page. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct JsonLdMetadata { + #[serde(rename = "@context")] + #[serde(default)] + pub context: Option<String>, + + #[serde(rename = "@type")] + #[serde(default)] + pub entity_type: Option<String>, + + #[serde(rename = "datePublished")] + #[serde(default)] + pub date_published: Option<String>, + + #[serde(rename = "dateCreated")] + #[serde(default)] + pub date_created: Option<String>, +} + +pub struct PageMetadata { + publication_date: Option<DateTime<Utc>>, + creation_date: Option<DateTime<Utc>>, +}