Use a non-readabilitised copy of the document for reference extraction
continuous-integration/drone the build failed Details

Fixes #7.
rei/raker_storage
Olivier 'reivilibre' 2022-03-29 22:43:31 +01:00
parent e6a402af19
commit 6a68757e30
3 changed files with 10 additions and 5 deletions

View File

@ -393,11 +393,12 @@ impl Raker {
.await?
{
ExtractedPage::Success {
unreadable_document,
document,
feeds,
antifeature_flags,
} => {
let references = references::find_references(&document, &feeds, url);
let references = references::find_references(&unreadable_document, &feeds, url);
Ok(RakeOutcome::RakedPage(RakedPage {
page_entry: RakedPageEntry {
analysed_antifeatures: antifeature_flags,

View File

@ -249,6 +249,9 @@ impl PageExtractionServiceInternal {
let metadata = find_page_metadata(root_node.clone())?;
// Capture a copy of the document as-is (the 'unreadable' document) before the readability pass below processes it.
let unreadable_document = DenseTree::from_body(root_node.clone());
let mut readability =
quickpeep_moz_readability::Readability::new_from_node(root_node.clone());
if let Err(err) = readability.parse(url.as_str()) {
@ -288,6 +291,7 @@ impl PageExtractionServiceInternal {
}
Ok(ExtractedPage::Success {
unreadable_document,
document,
feeds,
antifeature_flags,
@ -338,6 +342,7 @@ pub fn find_page_metadata(root_node: NodeRef) -> anyhow::Result<PageMetadata> {
pub enum ExtractedPage {
Success {
unreadable_document: Vec<DenseTree>,
document: DenseDocument,
feeds: Vec<Url>,
antifeature_flags: AnalysisAntifeatures,

View File

@ -1,12 +1,12 @@
use crate::raking::UrlRaked;
use quickpeep_densedoc::{DenseDocument, DenseTree};
use quickpeep_densedoc::DenseTree;
use quickpeep_structs::rake_entries::{RakedReference, ReferenceKind};
use quickpeep_utils::dates::date_to_quickpeep_days;
use reqwest::Url;
use std::collections::BTreeSet;
pub fn find_references(
doc: &DenseDocument,
doc: &Vec<DenseTree>,
feeds: &Vec<Url>,
page_url: &Url,
) -> BTreeSet<RakedReference> {
@ -55,8 +55,7 @@ pub fn find_references(
}
}
add_link_refs(&doc.body_content, &mut refs, &page_url);
add_link_refs(&doc.body_remainder, &mut refs, &page_url);
add_link_refs(&doc, &mut refs, &page_url);
for feed in feeds {
refs.insert(RakedReference {