Dissolve links before emitting documents to the pack store
continuous-integration/drone the build failed Details

Fixes #9
This commit is contained in:
Olivier 'reivilibre' 2022-04-03 10:47:18 +01:00
parent 6c2ff9daec
commit 96a01e0aaa
2 changed files with 55 additions and 0 deletions

View File

@ -2,6 +2,7 @@ use crate::raking::analysis::{
analyse_with_ad_block_cosmetic_filter, analyse_with_ad_block_network_filter,
guess_document_language, PreloadedEngine,
};
use crate::raking::references::dissolve_links;
use crate::raking::{normalise_language, RedirectReason, FEED_LINK_MIME_TYPES};
use adblock::engine::Engine;
use anyhow::{bail, Context};
@ -290,6 +291,10 @@ impl PageExtractionServiceInternal {
document.body_remainder = DenseTree::from_body(root_node);
}
// Dissolve links to save space
dissolve_links(&mut document.body_content);
dissolve_links(&mut document.body_remainder);
Ok(ExtractedPage::Success {
unreadable_document,
document,

View File

@ -88,6 +88,56 @@ pub fn references_from_urlrakes(
.collect()
}
/// Replaces every `DenseTree::Link` in `dense_tree` — at any depth — with the
/// link's own children, spliced in at the position the link occupied.
///
/// Headings are kept and their children are processed recursively. Images and
/// text nodes are left untouched. Every field of a link other than `children`
/// is dropped together with the link node. The relative order of all surviving
/// nodes is preserved.
pub fn dissolve_links(dense_tree: &mut Vec<DenseTree>) {
    let mut idx = 0;
    while idx < dense_tree.len() {
        // `Some(children)` when the node at `idx` is a link that must be
        // replaced by its (already link-free) children; `None` otherwise.
        let promoted_children = match &mut dense_tree[idx] {
            DenseTree::Heading1(children)
            | DenseTree::Heading2(children)
            | DenseTree::Heading3(children)
            | DenseTree::Heading4(children)
            | DenseTree::Heading5(children)
            | DenseTree::Heading6(children) => {
                dissolve_links(children);
                None
            }
            DenseTree::Link { children, .. } => {
                dissolve_links(children);
                // Move the children out while the variant is statically known,
                // so the node never has to be re-matched after removal.
                Some(std::mem::take(children))
            }
            DenseTree::Image { .. } | DenseTree::Text(_) => None,
        };

        match promoted_children {
            Some(children) => {
                let promoted = children.len();
                // One splice both removes the emptied link and inserts its
                // children in its place.
                dense_tree.splice(idx..=idx, children);
                // The promoted nodes were dissolved by the recursive call
                // above, so there is nothing left to find in them.
                idx += promoted;
            }
            None => idx += 1,
        }
    }
}
pub fn clean_url(url: &Url) -> Url {
let mut url = url.clone();
url.set_fragment(None);