Some groundwork for pulling out JSON-LD dates

This commit is contained in:
Olivier 'reivilibre' 2022-03-28 19:45:39 +01:00
parent 2f68d4d825
commit 7dc2369dd2
3 changed files with 73 additions and 1 deletions

1
Cargo.lock generated
View File

@ -3794,6 +3794,7 @@ dependencies = [
"reqwest", "reqwest",
"serde", "serde",
"serde_bare", "serde_bare",
"serde_json",
"signal-hook 0.3.13", "signal-hook 0.3.13",
"sitemap", "sitemap",
"smartstring", "smartstring",

View File

@ -22,6 +22,7 @@ kuchiki = "0.8.1"
html5ever = "0.25.1" html5ever = "0.25.1"
serde = { version = "1.0.136", features = ["derive"] } serde = { version = "1.0.136", features = ["derive"] }
serde_bare = "0.5.0" serde_bare = "0.5.0"
serde_json = "1.0.79"
toml = "0.5.8" toml = "0.5.8"

View File

@ -4,15 +4,17 @@ use crate::raking::analysis::{
use crate::raking::{normalise_language, RedirectReason, FEED_LINK_MIME_TYPES}; use crate::raking::{normalise_language, RedirectReason, FEED_LINK_MIME_TYPES};
use adblock::engine::Engine; use adblock::engine::Engine;
use anyhow::{bail, Context}; use anyhow::{bail, Context};
use chrono::{DateTime, Utc};
use html5ever::tendril::TendrilSink; use html5ever::tendril::TendrilSink;
use itertools::Itertools; use itertools::Itertools;
use kuchiki::NodeRef; use kuchiki::NodeRef;
use log::{debug, error, warn}; use log::{debug, error, trace, warn};
use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree}; use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
use quickpeep_structs::rake_entries::AnalysisAntifeatures; use quickpeep_structs::rake_entries::AnalysisAntifeatures;
use quickpeep_utils::lazy::Lazy; use quickpeep_utils::lazy::Lazy;
use reqwest::header::HeaderMap; use reqwest::header::HeaderMap;
use reqwest::Url; use reqwest::Url;
use serde::{Deserialize, Serialize};
use tokio::runtime; use tokio::runtime;
use tokio::sync::mpsc::Sender; use tokio::sync::mpsc::Sender;
use tokio::sync::{mpsc, oneshot}; use tokio::sync::{mpsc, oneshot};
@ -234,6 +236,8 @@ impl PageExtractionServiceInternal {
} }
} }
find_page_metadata(root_node.clone())?;
let mut readability = let mut readability =
quickpeep_moz_readability::Readability::new_from_node(root_node.clone()); quickpeep_moz_readability::Readability::new_from_node(root_node.clone());
if let Err(err) = readability.parse(url.as_str()) { if let Err(err) = readability.parse(url.as_str()) {
@ -274,6 +278,47 @@ impl PageExtractionServiceInternal {
} }
} }
/// Parses a date string taken from a JSON-LD snippet into a UTC timestamp.
///
/// Surrounding whitespace is ignored. The following forms are tried in order:
///
/// 1. A full RFC 3339 date-time with an offset (e.g. `2022-03-28T19:45:39+01:00`),
///    which is converted to UTC.
/// 2. A date-time without an offset (e.g. `2022-03-28T19:45:39`).
/// 3. A plain calendar date (e.g. `2022-03-28`), taken as midnight.
///
/// Forms 2 and 3 carry no offset of their own and are interpreted as UTC.
/// NOTE(review): treating offset-less values as UTC is an assumption; confirm it is
/// acceptable for how these dates are used downstream.
///
/// Returns `None` if the input matches none of these forms.
fn parse_json_ld_date(input: String) -> Option<DateTime<Utc>> {
    let text = input.trim();

    DateTime::parse_from_rfc3339(text)
        // Offset-less date-time: supply an explicit UTC designator.
        .or_else(|_| DateTime::parse_from_rfc3339(&format!("{}Z", text)))
        // Date only: supply midnight UTC.
        .or_else(|_| DateTime::parse_from_rfc3339(&format!("{}T00:00:00Z", text)))
        .ok()
        .map(|dt| dt.with_timezone(&Utc))
}
/// Collects page-level dates from the JSON-LD snippets embedded in a document.
///
/// Every `script[type='application/ld+json']` element below `root_node` has its text
/// content deserialised as a [`JsonLdMetadata`]. Its `datePublished` and `dateCreated`
/// values, when present and parseable by `parse_json_ld_date`, are stored in the
/// returned [`PageMetadata`]. If several snippets provide the same date, the value
/// from the last one in iteration order wins.
///
/// Snippets that fail to deserialise are logged at trace level and skipped.
///
/// # Errors
///
/// The current implementation never returns `Err`; the `Result` is kept so the
/// signature stays stable for callers that use `?`.
pub fn find_page_metadata(root_node: NodeRef) -> anyhow::Result<PageMetadata> {
    let mut meta = PageMetadata {
        publication_date: None,
        creation_date: None,
    };

    // If `select` reports an error, treat it the same as "no snippets found".
    if let Ok(ld_nodes) = root_node.select("script[type='application/ld+json']") {
        // `select` only yields element nodes, so no further element check is needed.
        for ld_node in ld_nodes {
            let json_text = ld_node.text_contents();

            let jld = match serde_json::from_str::<JsonLdMetadata>(json_text.as_str()) {
                Ok(jld) => jld,
                Err(err) => {
                    trace!("Bad JSON-LD: {:?}", err);
                    continue;
                }
            };

            // Only overwrite a previously found date when this snippet has a usable one.
            if let Some(published) = jld.date_published.and_then(parse_json_ld_date) {
                meta.publication_date = Some(published);
            }
            if let Some(created) = jld.date_created.and_then(parse_json_ld_date) {
                meta.creation_date = Some(created);
            }
        }
    }

    Ok(meta)
}
pub enum ExtractedPage { pub enum ExtractedPage {
Success { Success {
document: DenseDocument, document: DenseDocument,
@ -285,3 +330,28 @@ pub enum ExtractedPage {
new_url: Url, new_url: Url,
}, },
} }
/// The subset of a JSON-LD document that is read from a page's
/// `application/ld+json` snippets.
///
/// Every field is optional: a key that is absent from the JSON deserialises to `None`.
///
/// NOTE(review): each field is typed as a string, so a snippet whose `@context` or
/// `@type` is an object or an array will fail to deserialise as a whole.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JsonLdMetadata {
    /// Value of the `@context` key.
    #[serde(rename = "@context", default)]
    pub context: Option<String>,

    /// Value of the `@type` key.
    #[serde(rename = "@type", default)]
    pub entity_type: Option<String>,

    /// Raw, unparsed value of the `datePublished` key.
    #[serde(rename = "datePublished", default)]
    pub date_published: Option<String>,

    /// Raw, unparsed value of the `dateCreated` key.
    #[serde(rename = "dateCreated", default)]
    pub date_created: Option<String>,
}
/// Dates describing a page, as gathered by `find_page_metadata`.
///
/// Both fields are `None` when no usable value was found; `Default` yields exactly
/// that empty state.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct PageMetadata {
    /// When the page was published (taken from JSON-LD `datePublished`), in UTC.
    publication_date: Option<DateTime<Utc>>,
    /// When the page was created (taken from JSON-LD `dateCreated`), in UTC.
    creation_date: Option<DateTime<Utc>>,
}