Use a non-readabilitised copy of the document for reference extraction

Fixes #7.
Olivier 'reivilibre' 2022-03-29 22:43:31 +01:00
parent e6a402af19
commit 6a68757e30
3 changed files with 10 additions and 5 deletions
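
Taken together, the three changed files amount to the flow sketched below: capture a `DenseTree` copy of the body before the Readability pass prunes the DOM, carry it through `ExtractedPage::Success` as `unreadable_document`, and hand that copy (rather than the readabilitised `DenseDocument`) to `references::find_references`. This is a condensed, hypothetical sketch rather than code from the repository: the function name, the `kuchiki::NodeRef` import, the `crate::raking::references` path and the elided error handling are assumptions; only `DenseTree::from_body`, `Readability::new_from_node`/`parse` and the new `find_references` signature come from the diffs.

```rust
use kuchiki::NodeRef; // assumption: the DOM handle type the extractor already uses
use quickpeep_densedoc::DenseTree;
use quickpeep_structs::rake_entries::RakedReference;
use reqwest::Url;
use std::collections::BTreeSet;

use crate::raking::references; // assumed module path for find_references

/// Hypothetical condensation of the new flow; the real code also builds the
/// readable DenseDocument, feeds and antifeature flags and returns them all
/// in ExtractedPage::Success.
fn extract_and_find_references(
    root_node: NodeRef,
    url: &Url,
    feeds: &Vec<Url>,
) -> BTreeSet<RakedReference> {
    // 1. Capture the non-readabilitised copy while the DOM is still untouched.
    let unreadable_document: Vec<DenseTree> = DenseTree::from_body(root_node.clone());

    // 2. Readability still runs as before, pruning the DOM in place to
    //    produce the readable document used for indexing.
    let mut readability =
        quickpeep_moz_readability::Readability::new_from_node(root_node.clone());
    if let Err(_err) = readability.parse(url.as_str()) {
        // the real code handles/logs this error; elided in the sketch
    }

    // 3. Reference extraction now sees the unpruned copy, so links that
    //    Readability strips (navigation, footers, sidebars) are still found.
    references::find_references(&unreadable_document, feeds, url)
}
```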


@@ -393,11 +393,12 @@ impl Raker {
             .await?
         {
            ExtractedPage::Success {
+               unreadable_document,
                document,
                feeds,
                antifeature_flags,
            } => {
-               let references = references::find_references(&document, &feeds, url);
+               let references = references::find_references(&unreadable_document, &feeds, url);
                Ok(RakeOutcome::RakedPage(RakedPage {
                    page_entry: RakedPageEntry {
                        analysed_antifeatures: antifeature_flags,


@@ -249,6 +249,9 @@ impl PageExtractionServiceInternal {
         let metadata = find_page_metadata(root_node.clone())?;
+        // Capture a copy of the unreadable document before it's too late.
+        let unreadable_document = DenseTree::from_body(root_node.clone());
         let mut readability =
             quickpeep_moz_readability::Readability::new_from_node(root_node.clone());
         if let Err(err) = readability.parse(url.as_str()) {
@@ -288,6 +291,7 @@ impl PageExtractionServiceInternal {
         }
         Ok(ExtractedPage::Success {
+            unreadable_document,
             document,
             feeds,
             antifeature_flags,
@@ -338,6 +342,7 @@ pub fn find_page_metadata(root_node: NodeRef) -> anyhow::Result<PageMetadata> {
 pub enum ExtractedPage {
     Success {
+        unreadable_document: Vec<DenseTree>,
         document: DenseDocument,
         feeds: Vec<Url>,
         antifeature_flags: AnalysisAntifeatures,


@@ -1,12 +1,12 @@
 use crate::raking::UrlRaked;
-use quickpeep_densedoc::{DenseDocument, DenseTree};
+use quickpeep_densedoc::DenseTree;
 use quickpeep_structs::rake_entries::{RakedReference, ReferenceKind};
 use quickpeep_utils::dates::date_to_quickpeep_days;
 use reqwest::Url;
 use std::collections::BTreeSet;
 
 pub fn find_references(
-    doc: &DenseDocument,
+    doc: &Vec<DenseTree>,
     feeds: &Vec<Url>,
     page_url: &Url,
 ) -> BTreeSet<RakedReference> {
@@ -55,8 +55,7 @@ pub fn find_references(
         }
     }
 
-    add_link_refs(&doc.body_content, &mut refs, &page_url);
-    add_link_refs(&doc.body_remainder, &mut refs, &page_url);
+    add_link_refs(&doc, &mut refs, &page_url);
 
     for feed in feeds {
         refs.insert(RakedReference {