Some groundwork for pulling out JSON-LD dates
This commit is contained in:
parent
2f68d4d825
commit
7dc2369dd2
|
@ -3794,6 +3794,7 @@ dependencies = [
|
|||
"reqwest",
|
||||
"serde",
|
||||
"serde_bare",
|
||||
"serde_json",
|
||||
"signal-hook 0.3.13",
|
||||
"sitemap",
|
||||
"smartstring",
|
||||
|
|
|
@ -22,6 +22,7 @@ kuchiki = "0.8.1"
|
|||
html5ever = "0.25.1"
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
serde_bare = "0.5.0"
|
||||
serde_json = "1.0.79"
|
||||
|
||||
toml = "0.5.8"
|
||||
|
||||
|
|
|
@ -4,15 +4,17 @@ use crate::raking::analysis::{
|
|||
use crate::raking::{normalise_language, RedirectReason, FEED_LINK_MIME_TYPES};
|
||||
use adblock::engine::Engine;
|
||||
use anyhow::{bail, Context};
|
||||
use chrono::{DateTime, Utc};
|
||||
use html5ever::tendril::TendrilSink;
|
||||
use itertools::Itertools;
|
||||
use kuchiki::NodeRef;
|
||||
use log::{debug, error, warn};
|
||||
use log::{debug, error, trace, warn};
|
||||
use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
|
||||
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
|
||||
use quickpeep_utils::lazy::Lazy;
|
||||
use reqwest::header::HeaderMap;
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::runtime;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
use tokio::sync::{mpsc, oneshot};
|
||||
|
@ -234,6 +236,8 @@ impl PageExtractionServiceInternal {
|
|||
}
|
||||
}
|
||||
|
||||
find_page_metadata(root_node.clone())?;
|
||||
|
||||
let mut readability =
|
||||
quickpeep_moz_readability::Readability::new_from_node(root_node.clone());
|
||||
if let Err(err) = readability.parse(url.as_str()) {
|
||||
|
@ -274,6 +278,47 @@ impl PageExtractionServiceInternal {
|
|||
}
|
||||
}
|
||||
|
||||
fn parse_json_ld_date(input: String) -> Option<DateTime<Utc>> {
|
||||
DateTime::parse_from_rfc3339(&input)
|
||||
.ok()
|
||||
.map(|dt| dt.with_timezone(&Utc))
|
||||
}
|
||||
|
||||
pub fn find_page_metadata(root_node: NodeRef) -> anyhow::Result<PageMetadata> {
|
||||
let mut meta = PageMetadata {
|
||||
publication_date: None,
|
||||
creation_date: None,
|
||||
};
|
||||
|
||||
match root_node.select("script[type='application/ld+json']") {
|
||||
Ok(ld_nodes) => {
|
||||
for ld_node in ld_nodes {
|
||||
if let Some(_element) = ld_node.as_node().as_element() {
|
||||
let json_text = ld_node.text_contents();
|
||||
match serde_json::de::from_str::<JsonLdMetadata>(json_text.as_str()) {
|
||||
Ok(jld) => {
|
||||
let date_published =
|
||||
jld.date_published.map(parse_json_ld_date).flatten();
|
||||
let date_created = jld.date_created.map(parse_json_ld_date).flatten();
|
||||
if date_published.is_some() {
|
||||
meta.publication_date = date_published;
|
||||
}
|
||||
if date_created.is_some() {
|
||||
meta.creation_date = date_created;
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
trace!("Bad JSON-LD: {:?}", err);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(meta)
|
||||
}
|
||||
Err(()) => Ok(meta),
|
||||
}
|
||||
}
|
||||
|
||||
pub enum ExtractedPage {
|
||||
Success {
|
||||
document: DenseDocument,
|
||||
|
@ -285,3 +330,28 @@ pub enum ExtractedPage {
|
|||
new_url: Url,
|
||||
},
|
||||
}
|
||||
|
||||
/// Metadata that may be found in a JSON-LD snippet on the page.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct JsonLdMetadata {
|
||||
#[serde(rename = "@context")]
|
||||
#[serde(default)]
|
||||
pub context: Option<String>,
|
||||
|
||||
#[serde(rename = "@type")]
|
||||
#[serde(default)]
|
||||
pub entity_type: Option<String>,
|
||||
|
||||
#[serde(rename = "datePublished")]
|
||||
#[serde(default)]
|
||||
pub date_published: Option<String>,
|
||||
|
||||
#[serde(rename = "dateCreated")]
|
||||
#[serde(default)]
|
||||
pub date_created: Option<String>,
|
||||
}
|
||||
|
||||
pub struct PageMetadata {
|
||||
publication_date: Option<DateTime<Utc>>,
|
||||
creation_date: Option<DateTime<Utc>>,
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue