From 6a68757e30b65c29fe9f4bcf70b8652f89771208 Mon Sep 17 00:00:00 2001 From: Olivier Date: Tue, 29 Mar 2022 22:43:31 +0100 Subject: [PATCH] Use a non-readabilitised copy of the document for reference extraction Fixes #7. --- quickpeep_raker/src/raking.rs | 3 ++- quickpeep_raker/src/raking/page_extraction.rs | 5 +++++ quickpeep_raker/src/raking/references.rs | 7 +++---- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/quickpeep_raker/src/raking.rs b/quickpeep_raker/src/raking.rs index f714b41..b6a7ab6 100644 --- a/quickpeep_raker/src/raking.rs +++ b/quickpeep_raker/src/raking.rs @@ -393,11 +393,12 @@ impl Raker { .await? { ExtractedPage::Success { + unreadable_document, document, feeds, antifeature_flags, } => { - let references = references::find_references(&document, &feeds, url); + let references = references::find_references(&unreadable_document, &feeds, url); Ok(RakeOutcome::RakedPage(RakedPage { page_entry: RakedPageEntry { analysed_antifeatures: antifeature_flags, diff --git a/quickpeep_raker/src/raking/page_extraction.rs b/quickpeep_raker/src/raking/page_extraction.rs index d9642bd..511e742 100644 --- a/quickpeep_raker/src/raking/page_extraction.rs +++ b/quickpeep_raker/src/raking/page_extraction.rs @@ -249,6 +249,9 @@ impl PageExtractionServiceInternal { let metadata = find_page_metadata(root_node.clone())?; + // Capture a copy of the unreadable document before it's too late. + let unreadable_document = DenseTree::from_body(root_node.clone()); + let mut readability = quickpeep_moz_readability::Readability::new_from_node(root_node.clone()); if let Err(err) = readability.parse(url.as_str()) { @@ -288,6 +291,7 @@ impl PageExtractionServiceInternal { } Ok(ExtractedPage::Success { + unreadable_document, document, feeds, antifeature_flags, @@ -338,6 +342,7 @@ pub fn find_page_metadata(root_node: NodeRef) -> anyhow::Result { pub enum ExtractedPage { Success { + unreadable_document: Vec, document: DenseDocument, feeds: Vec, antifeature_flags: AnalysisAntifeatures, diff --git a/quickpeep_raker/src/raking/references.rs b/quickpeep_raker/src/raking/references.rs index 5e6d796..062798f 100644 --- a/quickpeep_raker/src/raking/references.rs +++ b/quickpeep_raker/src/raking/references.rs @@ -1,12 +1,12 @@ use crate::raking::UrlRaked; -use quickpeep_densedoc::{DenseDocument, DenseTree}; +use quickpeep_densedoc::DenseTree; use quickpeep_structs::rake_entries::{RakedReference, ReferenceKind}; use quickpeep_utils::dates::date_to_quickpeep_days; use reqwest::Url; use std::collections::BTreeSet; pub fn find_references( - doc: &DenseDocument, + doc: &Vec, feeds: &Vec, page_url: &Url, ) -> BTreeSet { @@ -55,8 +55,7 @@ pub fn find_references( } } - add_link_refs(&doc.body_content, &mut refs, &page_url); - add_link_refs(&doc.body_remainder, &mut refs, &page_url); + add_link_refs(&doc, &mut refs, &page_url); for feed in feeds { refs.insert(RakedReference {