diff --git a/quickpeep_raker/src/raking/page_extraction.rs b/quickpeep_raker/src/raking/page_extraction.rs index 511e742..9422726 100644 --- a/quickpeep_raker/src/raking/page_extraction.rs +++ b/quickpeep_raker/src/raking/page_extraction.rs @@ -2,6 +2,7 @@ use crate::raking::analysis::{ analyse_with_ad_block_cosmetic_filter, analyse_with_ad_block_network_filter, guess_document_language, PreloadedEngine, }; +use crate::raking::references::dissolve_links; use crate::raking::{normalise_language, RedirectReason, FEED_LINK_MIME_TYPES}; use adblock::engine::Engine; use anyhow::{bail, Context}; @@ -290,6 +291,10 @@ impl PageExtractionServiceInternal { document.body_remainder = DenseTree::from_body(root_node); } + // Dissolve links to save space + dissolve_links(&mut document.body_content); + dissolve_links(&mut document.body_remainder); + Ok(ExtractedPage::Success { unreadable_document, document, diff --git a/quickpeep_raker/src/raking/references.rs b/quickpeep_raker/src/raking/references.rs index 5bb0cef..7572626 100644 --- a/quickpeep_raker/src/raking/references.rs +++ b/quickpeep_raker/src/raking/references.rs @@ -88,6 +88,56 @@ pub fn references_from_urlrakes( .collect() } +pub fn dissolve_links(dense_tree: &mut Vec) { + let mut idx = 0; + while idx < dense_tree.len() { + let is_link = match &mut dense_tree[idx] { + DenseTree::Heading1(children) => { + dissolve_links(children); + false + } + DenseTree::Heading2(children) => { + dissolve_links(children); + false + } + DenseTree::Heading3(children) => { + dissolve_links(children); + false + } + DenseTree::Heading4(children) => { + dissolve_links(children); + false + } + DenseTree::Heading5(children) => { + dissolve_links(children); + false + } + DenseTree::Heading6(children) => { + dissolve_links(children); + false + } + DenseTree::Link { children, .. } => { + dissolve_links(children); + true + } + DenseTree::Image { .. } => false, + DenseTree::Text(_) => false, + }; + if is_link { + match dense_tree.remove(idx) { + DenseTree::Link { children, .. } => { + dense_tree.splice(idx..idx, children); + } + _ => { + panic!("Implementation bug: Link being dissolved is not a Link.") + } + } + } else { + idx += 1; + } + } +} + pub fn clean_url(url: &Url) -> Url { let mut url = url.clone(); url.set_fragment(None);