Use a non-readabilitised copy of the document for reference extraction
Some checks failed
continuous-integration/drone: the build failed
Some checks failed
continuous-integration/drone: the build failed
Fixes #7.
This commit is contained in:
parent
e6a402af19
commit
6a68757e30
@ -393,11 +393,12 @@ impl Raker {
|
||||
.await?
|
||||
{
|
||||
ExtractedPage::Success {
|
||||
unreadable_document,
|
||||
document,
|
||||
feeds,
|
||||
antifeature_flags,
|
||||
} => {
|
||||
let references = references::find_references(&document, &feeds, url);
|
||||
let references = references::find_references(&unreadable_document, &feeds, url);
|
||||
Ok(RakeOutcome::RakedPage(RakedPage {
|
||||
page_entry: RakedPageEntry {
|
||||
analysed_antifeatures: antifeature_flags,
|
||||
|
@ -249,6 +249,9 @@ impl PageExtractionServiceInternal {
|
||||
|
||||
let metadata = find_page_metadata(root_node.clone())?;
|
||||
|
||||
// Capture a copy of the document before the readability pass below runs, so the full ("unreadable") page is still available for reference extraction.
|
||||
let unreadable_document = DenseTree::from_body(root_node.clone());
|
||||
|
||||
let mut readability =
|
||||
quickpeep_moz_readability::Readability::new_from_node(root_node.clone());
|
||||
if let Err(err) = readability.parse(url.as_str()) {
|
||||
@ -288,6 +291,7 @@ impl PageExtractionServiceInternal {
|
||||
}
|
||||
|
||||
Ok(ExtractedPage::Success {
|
||||
unreadable_document,
|
||||
document,
|
||||
feeds,
|
||||
antifeature_flags,
|
||||
@ -338,6 +342,7 @@ pub fn find_page_metadata(root_node: NodeRef) -> anyhow::Result<PageMetadata> {
|
||||
|
||||
pub enum ExtractedPage {
|
||||
Success {
|
||||
unreadable_document: Vec<DenseTree>,
|
||||
document: DenseDocument,
|
||||
feeds: Vec<Url>,
|
||||
antifeature_flags: AnalysisAntifeatures,
|
||||
|
@ -1,12 +1,12 @@
|
||||
use crate::raking::UrlRaked;
|
||||
use quickpeep_densedoc::{DenseDocument, DenseTree};
|
||||
use quickpeep_densedoc::DenseTree;
|
||||
use quickpeep_structs::rake_entries::{RakedReference, ReferenceKind};
|
||||
use quickpeep_utils::dates::date_to_quickpeep_days;
|
||||
use reqwest::Url;
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
pub fn find_references(
|
||||
doc: &DenseDocument,
|
||||
doc: &Vec<DenseTree>,
|
||||
feeds: &Vec<Url>,
|
||||
page_url: &Url,
|
||||
) -> BTreeSet<RakedReference> {
|
||||
@ -55,8 +55,7 @@ pub fn find_references(
|
||||
}
|
||||
}
|
||||
|
||||
add_link_refs(&doc.body_content, &mut refs, &page_url);
|
||||
add_link_refs(&doc.body_remainder, &mut refs, &page_url);
|
||||
add_link_refs(&doc, &mut refs, &page_url);
|
||||
|
||||
for feed in feeds {
|
||||
refs.insert(RakedReference {
|
||||
|
Loading…
Reference in New Issue
Block a user