Some groundwork for pulling out JSON-LD dates

This commit is contained in:
Olivier 'reivilibre' 2022-03-28 19:45:39 +01:00
parent 2f68d4d825
commit 7dc2369dd2
3 changed files with 73 additions and 1 deletions

1
Cargo.lock generated
View File

@ -3794,6 +3794,7 @@ dependencies = [
"reqwest",
"serde",
"serde_bare",
"serde_json",
"signal-hook 0.3.13",
"sitemap",
"smartstring",

View File

@ -22,6 +22,7 @@ kuchiki = "0.8.1"
html5ever = "0.25.1"
serde = { version = "1.0.136", features = ["derive"] }
serde_bare = "0.5.0"
serde_json = "1.0.79"
toml = "0.5.8"

View File

@ -4,15 +4,17 @@ use crate::raking::analysis::{
use crate::raking::{normalise_language, RedirectReason, FEED_LINK_MIME_TYPES};
use adblock::engine::Engine;
use anyhow::{bail, Context};
use chrono::{DateTime, Utc};
use html5ever::tendril::TendrilSink;
use itertools::Itertools;
use kuchiki::NodeRef;
use log::{debug, error, warn};
use log::{debug, error, trace, warn};
use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
use quickpeep_utils::lazy::Lazy;
use reqwest::header::HeaderMap;
use reqwest::Url;
use serde::{Deserialize, Serialize};
use tokio::runtime;
use tokio::sync::mpsc::Sender;
use tokio::sync::{mpsc, oneshot};
@ -234,6 +236,8 @@ impl PageExtractionServiceInternal {
}
}
find_page_metadata(root_node.clone())?;
let mut readability =
quickpeep_moz_readability::Readability::new_from_node(root_node.clone());
if let Err(err) = readability.parse(url.as_str()) {
@ -274,6 +278,47 @@ impl PageExtractionServiceInternal {
}
}
fn parse_json_ld_date(input: String) -> Option<DateTime<Utc>> {
DateTime::parse_from_rfc3339(&input)
.ok()
.map(|dt| dt.with_timezone(&Utc))
}
/// Collects date metadata from the JSON-LD `<script>` blocks of a document.
///
/// Every `script[type='application/ld+json']` element under `root_node` is read and its
/// text deserialised as a [`JsonLdMetadata`]. Snippets that fail to deserialise are
/// logged at trace level and skipped. When several snippets supply a parseable
/// `datePublished` or `dateCreated`, the value from the last one in iteration order wins.
///
/// The function currently never returns `Err`: if the selector query itself reports an
/// error, an empty [`PageMetadata`] is returned instead.
pub fn find_page_metadata(root_node: NodeRef) -> anyhow::Result<PageMetadata> {
    let mut meta = PageMetadata {
        publication_date: None,
        creation_date: None,
    };

    let scripts = match root_node.select("script[type='application/ld+json']") {
        Ok(scripts) => scripts,
        // The query could not be run; report that nothing was found.
        Err(()) => return Ok(meta),
    };

    for script in scripts {
        if script.as_node().as_element().is_none() {
            continue;
        }

        let raw_json = script.text_contents();
        let parsed: JsonLdMetadata = match serde_json::de::from_str(raw_json.as_str()) {
            Ok(parsed) => parsed,
            Err(err) => {
                trace!("Bad JSON-LD: {:?}", err);
                continue;
            }
        };

        // Only overwrite what earlier snippets found when this one has a usable date.
        if let Some(published) = parsed.date_published.and_then(parse_json_ld_date) {
            meta.publication_date = Some(published);
        }
        if let Some(created) = parsed.date_created.and_then(parse_json_ld_date) {
            meta.creation_date = Some(created);
        }
    }

    Ok(meta)
}
pub enum ExtractedPage {
Success {
document: DenseDocument,
@ -285,3 +330,28 @@ pub enum ExtractedPage {
new_url: Url,
},
}
/// The top-level JSON-LD properties that are read from a snippet embedded in a page.
///
/// Each field maps to the JSON property named in its `rename` attribute and is `None`
/// when that property is absent.
///
/// NOTE(review): all fields are typed as strings, so a snippet whose `@context` or
/// `@type` is an array or object will presumably fail to deserialise as a whole —
/// confirm whether that is intended.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JsonLdMetadata {
    /// Value of the `@context` property.
    #[serde(rename = "@context", default)]
    pub context: Option<String>,

    /// Value of the `@type` property.
    #[serde(rename = "@type", default)]
    pub entity_type: Option<String>,

    /// Raw, unparsed value of the `datePublished` property.
    #[serde(rename = "datePublished", default)]
    pub date_published: Option<String>,

    /// Raw, unparsed value of the `dateCreated` property.
    #[serde(rename = "dateCreated", default)]
    pub date_created: Option<String>,
}
/// Dates describing a page, as gathered by `find_page_metadata`.
///
/// A field is `None` when no parseable value for it was found on the page.
#[derive(Debug, Clone)]
pub struct PageMetadata {
    /// When the page was published, taken from a JSON-LD `datePublished` property.
    pub publication_date: Option<DateTime<Utc>>,
    /// When the page was created, taken from a JSON-LD `dateCreated` property.
    pub creation_date: Option<DateTime<Utc>>,
}