From 3bfd192c2891ce096d00f133c36fd129395c6490 Mon Sep 17 00:00:00 2001 From: Olivier Date: Mon, 28 Mar 2022 22:24:36 +0100 Subject: [PATCH] Store dates in the document head --- quickpeep_densedoc/Cargo.toml | 2 +- quickpeep_densedoc/src/lib.rs | 2 ++ quickpeep_raker/src/raking/page_extraction.rs | 8 +++++++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/quickpeep_densedoc/Cargo.toml b/quickpeep_densedoc/Cargo.toml index c07eca3..6e22615 100644 --- a/quickpeep_densedoc/Cargo.toml +++ b/quickpeep_densedoc/Cargo.toml @@ -11,4 +11,4 @@ serde = { version = "1.0.136", features = ["derive"] } kuchiki = "0.8.1" html5ever = "0.25.1" regex = "1.5.5" -lazy_static = "1.4.0" \ No newline at end of file +lazy_static = "1.4.0" diff --git a/quickpeep_densedoc/src/lib.rs b/quickpeep_densedoc/src/lib.rs index 719176c..9c0274f 100644 --- a/quickpeep_densedoc/src/lib.rs +++ b/quickpeep_densedoc/src/lib.rs @@ -19,6 +19,8 @@ pub struct DenseHead { pub language: String, /// URL to icon of the page. May be empty if none were discovered. pub icon: String, + /// Datetime of publication (or creation, as a fallback), in seconds since the epoch. 
+ pub datetime: Option<i64>, } impl DenseHead { diff --git a/quickpeep_raker/src/raking/page_extraction.rs b/quickpeep_raker/src/raking/page_extraction.rs index 5e2b6ff..8c7a6ab 100644 --- a/quickpeep_raker/src/raking/page_extraction.rs +++ b/quickpeep_raker/src/raking/page_extraction.rs @@ -236,7 +236,7 @@ impl PageExtractionServiceInternal { } } - find_page_metadata(root_node.clone())?; + let metadata = find_page_metadata(root_node.clone())?; let mut readability = quickpeep_moz_readability::Readability::new_from_node(root_node.clone()); @@ -249,6 +249,11 @@ impl PageExtractionServiceInternal { title = readability.metadata.title().to_owned(); } + let datetime = metadata + .publication_date + .or(metadata.creation_date) + .map(|dt| dt.timestamp()); + let mut document = DenseDocument { head: DenseHead { title, @@ -256,6 +261,7 @@ impl PageExtractionServiceInternal { icon: icon .map(|url| url.as_str().to_owned()) .unwrap_or(String::with_capacity(0)), + datetime, }, body_content: Vec::with_capacity(0), body_remainder: Vec::with_capacity(0),