Some groundwork for pulling out JSON-LD dates
This commit is contained in:
parent
2f68d4d825
commit
7dc2369dd2
|
@ -3794,6 +3794,7 @@ dependencies = [
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_bare",
|
"serde_bare",
|
||||||
|
"serde_json",
|
||||||
"signal-hook 0.3.13",
|
"signal-hook 0.3.13",
|
||||||
"sitemap",
|
"sitemap",
|
||||||
"smartstring",
|
"smartstring",
|
||||||
|
|
|
@ -22,6 +22,7 @@ kuchiki = "0.8.1"
|
||||||
html5ever = "0.25.1"
|
html5ever = "0.25.1"
|
||||||
serde = { version = "1.0.136", features = ["derive"] }
|
serde = { version = "1.0.136", features = ["derive"] }
|
||||||
serde_bare = "0.5.0"
|
serde_bare = "0.5.0"
|
||||||
|
serde_json = "1.0.79"
|
||||||
|
|
||||||
toml = "0.5.8"
|
toml = "0.5.8"
|
||||||
|
|
||||||
|
|
|
@ -4,15 +4,17 @@ use crate::raking::analysis::{
|
||||||
use crate::raking::{normalise_language, RedirectReason, FEED_LINK_MIME_TYPES};
|
use crate::raking::{normalise_language, RedirectReason, FEED_LINK_MIME_TYPES};
|
||||||
use adblock::engine::Engine;
|
use adblock::engine::Engine;
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
|
use chrono::{DateTime, Utc};
|
||||||
use html5ever::tendril::TendrilSink;
|
use html5ever::tendril::TendrilSink;
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use kuchiki::NodeRef;
|
use kuchiki::NodeRef;
|
||||||
use log::{debug, error, warn};
|
use log::{debug, error, trace, warn};
|
||||||
use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
|
use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
|
||||||
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
|
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
|
||||||
use quickpeep_utils::lazy::Lazy;
|
use quickpeep_utils::lazy::Lazy;
|
||||||
use reqwest::header::HeaderMap;
|
use reqwest::header::HeaderMap;
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
use tokio::runtime;
|
use tokio::runtime;
|
||||||
use tokio::sync::mpsc::Sender;
|
use tokio::sync::mpsc::Sender;
|
||||||
use tokio::sync::{mpsc, oneshot};
|
use tokio::sync::{mpsc, oneshot};
|
||||||
|
@ -234,6 +236,8 @@ impl PageExtractionServiceInternal {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
find_page_metadata(root_node.clone())?;
|
||||||
|
|
||||||
let mut readability =
|
let mut readability =
|
||||||
quickpeep_moz_readability::Readability::new_from_node(root_node.clone());
|
quickpeep_moz_readability::Readability::new_from_node(root_node.clone());
|
||||||
if let Err(err) = readability.parse(url.as_str()) {
|
if let Err(err) = readability.parse(url.as_str()) {
|
||||||
|
@ -274,6 +278,47 @@ impl PageExtractionServiceInternal {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn parse_json_ld_date(input: String) -> Option<DateTime<Utc>> {
|
||||||
|
DateTime::parse_from_rfc3339(&input)
|
||||||
|
.ok()
|
||||||
|
.map(|dt| dt.with_timezone(&Utc))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn find_page_metadata(root_node: NodeRef) -> anyhow::Result<PageMetadata> {
|
||||||
|
let mut meta = PageMetadata {
|
||||||
|
publication_date: None,
|
||||||
|
creation_date: None,
|
||||||
|
};
|
||||||
|
|
||||||
|
match root_node.select("script[type='application/ld+json']") {
|
||||||
|
Ok(ld_nodes) => {
|
||||||
|
for ld_node in ld_nodes {
|
||||||
|
if let Some(_element) = ld_node.as_node().as_element() {
|
||||||
|
let json_text = ld_node.text_contents();
|
||||||
|
match serde_json::de::from_str::<JsonLdMetadata>(json_text.as_str()) {
|
||||||
|
Ok(jld) => {
|
||||||
|
let date_published =
|
||||||
|
jld.date_published.map(parse_json_ld_date).flatten();
|
||||||
|
let date_created = jld.date_created.map(parse_json_ld_date).flatten();
|
||||||
|
if date_published.is_some() {
|
||||||
|
meta.publication_date = date_published;
|
||||||
|
}
|
||||||
|
if date_created.is_some() {
|
||||||
|
meta.creation_date = date_created;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
trace!("Bad JSON-LD: {:?}", err);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(meta)
|
||||||
|
}
|
||||||
|
Err(()) => Ok(meta),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub enum ExtractedPage {
|
pub enum ExtractedPage {
|
||||||
Success {
|
Success {
|
||||||
document: DenseDocument,
|
document: DenseDocument,
|
||||||
|
@ -285,3 +330,28 @@ pub enum ExtractedPage {
|
||||||
new_url: Url,
|
new_url: Url,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Metadata that may be found in a JSON-LD snippet on the page.
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct JsonLdMetadata {
|
||||||
|
#[serde(rename = "@context")]
|
||||||
|
#[serde(default)]
|
||||||
|
pub context: Option<String>,
|
||||||
|
|
||||||
|
#[serde(rename = "@type")]
|
||||||
|
#[serde(default)]
|
||||||
|
pub entity_type: Option<String>,
|
||||||
|
|
||||||
|
#[serde(rename = "datePublished")]
|
||||||
|
#[serde(default)]
|
||||||
|
pub date_published: Option<String>,
|
||||||
|
|
||||||
|
#[serde(rename = "dateCreated")]
|
||||||
|
#[serde(default)]
|
||||||
|
pub date_created: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct PageMetadata {
|
||||||
|
publication_date: Option<DateTime<Utc>>,
|
||||||
|
creation_date: Option<DateTime<Utc>>,
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue