diff --git a/quickpeep_densedoc/Cargo.toml b/quickpeep_densedoc/Cargo.toml index c07eca3..6e22615 100644 --- a/quickpeep_densedoc/Cargo.toml +++ b/quickpeep_densedoc/Cargo.toml @@ -11,4 +11,4 @@ serde = { version = "1.0.136", features = ["derive"] } kuchiki = "0.8.1" html5ever = "0.25.1" regex = "1.5.5" -lazy_static = "1.4.0" \ No newline at end of file +lazy_static = "1.4.0" diff --git a/quickpeep_densedoc/src/lib.rs b/quickpeep_densedoc/src/lib.rs index 719176c..9c0274f 100644 --- a/quickpeep_densedoc/src/lib.rs +++ b/quickpeep_densedoc/src/lib.rs @@ -19,6 +19,8 @@ pub struct DenseHead { pub language: String, /// URL to icon of the page. May be empty if none were discovered. pub icon: String, + /// Datetime of publication (or creation, as a fallback), in seconds since the epoch. + pub datetime: Option<i64>, } impl DenseHead { diff --git a/quickpeep_raker/src/raking/page_extraction.rs b/quickpeep_raker/src/raking/page_extraction.rs index 5e2b6ff..8c7a6ab 100644 --- a/quickpeep_raker/src/raking/page_extraction.rs +++ b/quickpeep_raker/src/raking/page_extraction.rs @@ -236,7 +236,7 @@ impl PageExtractionServiceInternal { } } - find_page_metadata(root_node.clone())?; + let metadata = find_page_metadata(root_node.clone())?; let mut readability = quickpeep_moz_readability::Readability::new_from_node(root_node.clone()); @@ -249,6 +249,11 @@ impl PageExtractionServiceInternal { title = readability.metadata.title().to_owned(); } + let datetime = metadata + .publication_date + .or(metadata.creation_date) + .map(|dt| dt.timestamp()); + let mut document = DenseDocument { head: DenseHead { title, @@ -256,6 +261,7 @@ impl PageExtractionServiceInternal { icon: icon .map(|url| url.as_str().to_owned()) .unwrap_or(String::with_capacity(0)), + datetime, }, body_content: Vec::with_capacity(0), body_remainder: Vec::with_capacity(0),