Use a non-readabilitised copy of the document for reference extraction
continuous-integration/drone the build failed
Details
continuous-integration/drone the build failed
Details
Fixes #7.
This commit is contained in:
parent
e6a402af19
commit
6a68757e30
|
@ -393,11 +393,12 @@ impl Raker {
|
||||||
.await?
|
.await?
|
||||||
{
|
{
|
||||||
ExtractedPage::Success {
|
ExtractedPage::Success {
|
||||||
|
unreadable_document,
|
||||||
document,
|
document,
|
||||||
feeds,
|
feeds,
|
||||||
antifeature_flags,
|
antifeature_flags,
|
||||||
} => {
|
} => {
|
||||||
let references = references::find_references(&document, &feeds, url);
|
let references = references::find_references(&unreadable_document, &feeds, url);
|
||||||
Ok(RakeOutcome::RakedPage(RakedPage {
|
Ok(RakeOutcome::RakedPage(RakedPage {
|
||||||
page_entry: RakedPageEntry {
|
page_entry: RakedPageEntry {
|
||||||
analysed_antifeatures: antifeature_flags,
|
analysed_antifeatures: antifeature_flags,
|
||||||
|
|
|
@ -249,6 +249,9 @@ impl PageExtractionServiceInternal {
|
||||||
|
|
||||||
let metadata = find_page_metadata(root_node.clone())?;
|
let metadata = find_page_metadata(root_node.clone())?;
|
||||||
|
|
||||||
|
// Capture a copy of the not-yet-readabilitised document before the readability pass below runs.
|
||||||
|
let unreadable_document = DenseTree::from_body(root_node.clone());
|
||||||
|
|
||||||
let mut readability =
|
let mut readability =
|
||||||
quickpeep_moz_readability::Readability::new_from_node(root_node.clone());
|
quickpeep_moz_readability::Readability::new_from_node(root_node.clone());
|
||||||
if let Err(err) = readability.parse(url.as_str()) {
|
if let Err(err) = readability.parse(url.as_str()) {
|
||||||
|
@ -288,6 +291,7 @@ impl PageExtractionServiceInternal {
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(ExtractedPage::Success {
|
Ok(ExtractedPage::Success {
|
||||||
|
unreadable_document,
|
||||||
document,
|
document,
|
||||||
feeds,
|
feeds,
|
||||||
antifeature_flags,
|
antifeature_flags,
|
||||||
|
@ -338,6 +342,7 @@ pub fn find_page_metadata(root_node: NodeRef) -> anyhow::Result<PageMetadata> {
|
||||||
|
|
||||||
pub enum ExtractedPage {
|
pub enum ExtractedPage {
|
||||||
Success {
|
Success {
|
||||||
|
unreadable_document: Vec<DenseTree>,
|
||||||
document: DenseDocument,
|
document: DenseDocument,
|
||||||
feeds: Vec<Url>,
|
feeds: Vec<Url>,
|
||||||
antifeature_flags: AnalysisAntifeatures,
|
antifeature_flags: AnalysisAntifeatures,
|
||||||
|
|
|
@ -1,12 +1,12 @@
|
||||||
use crate::raking::UrlRaked;
|
use crate::raking::UrlRaked;
|
||||||
use quickpeep_densedoc::{DenseDocument, DenseTree};
|
use quickpeep_densedoc::DenseTree;
|
||||||
use quickpeep_structs::rake_entries::{RakedReference, ReferenceKind};
|
use quickpeep_structs::rake_entries::{RakedReference, ReferenceKind};
|
||||||
use quickpeep_utils::dates::date_to_quickpeep_days;
|
use quickpeep_utils::dates::date_to_quickpeep_days;
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
|
|
||||||
pub fn find_references(
|
pub fn find_references(
|
||||||
doc: &DenseDocument,
|
doc: &Vec<DenseTree>,
|
||||||
feeds: &Vec<Url>,
|
feeds: &Vec<Url>,
|
||||||
page_url: &Url,
|
page_url: &Url,
|
||||||
) -> BTreeSet<RakedReference> {
|
) -> BTreeSet<RakedReference> {
|
||||||
|
@ -55,8 +55,7 @@ pub fn find_references(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
add_link_refs(&doc.body_content, &mut refs, &page_url);
|
add_link_refs(&doc, &mut refs, &page_url);
|
||||||
add_link_refs(&doc.body_remainder, &mut refs, &page_url);
|
|
||||||
|
|
||||||
for feed in feeds {
|
for feed in feeds {
|
||||||
refs.insert(RakedReference {
|
refs.insert(RakedReference {
|
||||||
|
|
Loading…
Reference in New Issue