Dissolve links before emitting documents to the pack store
continuous-integration/drone the build failed
Details
continuous-integration/drone the build failed
Details
Fixes #9
This commit is contained in:
parent
6c2ff9daec
commit
96a01e0aaa
|
@ -2,6 +2,7 @@ use crate::raking::analysis::{
|
|||
analyse_with_ad_block_cosmetic_filter, analyse_with_ad_block_network_filter,
|
||||
guess_document_language, PreloadedEngine,
|
||||
};
|
||||
use crate::raking::references::dissolve_links;
|
||||
use crate::raking::{normalise_language, RedirectReason, FEED_LINK_MIME_TYPES};
|
||||
use adblock::engine::Engine;
|
||||
use anyhow::{bail, Context};
|
||||
|
@ -290,6 +291,10 @@ impl PageExtractionServiceInternal {
|
|||
document.body_remainder = DenseTree::from_body(root_node);
|
||||
}
|
||||
|
||||
// Dissolve links to save space
|
||||
dissolve_links(&mut document.body_content);
|
||||
dissolve_links(&mut document.body_remainder);
|
||||
|
||||
Ok(ExtractedPage::Success {
|
||||
unreadable_document,
|
||||
document,
|
||||
|
|
|
@ -88,6 +88,56 @@ pub fn references_from_urlrakes(
|
|||
.collect()
|
||||
}
|
||||
|
||||
pub fn dissolve_links(dense_tree: &mut Vec<DenseTree>) {
|
||||
let mut idx = 0;
|
||||
while idx < dense_tree.len() {
|
||||
let is_link = match &mut dense_tree[idx] {
|
||||
DenseTree::Heading1(children) => {
|
||||
dissolve_links(children);
|
||||
false
|
||||
}
|
||||
DenseTree::Heading2(children) => {
|
||||
dissolve_links(children);
|
||||
false
|
||||
}
|
||||
DenseTree::Heading3(children) => {
|
||||
dissolve_links(children);
|
||||
false
|
||||
}
|
||||
DenseTree::Heading4(children) => {
|
||||
dissolve_links(children);
|
||||
false
|
||||
}
|
||||
DenseTree::Heading5(children) => {
|
||||
dissolve_links(children);
|
||||
false
|
||||
}
|
||||
DenseTree::Heading6(children) => {
|
||||
dissolve_links(children);
|
||||
false
|
||||
}
|
||||
DenseTree::Link { children, .. } => {
|
||||
dissolve_links(children);
|
||||
true
|
||||
}
|
||||
DenseTree::Image { .. } => false,
|
||||
DenseTree::Text(_) => false,
|
||||
};
|
||||
if is_link {
|
||||
match dense_tree.remove(idx) {
|
||||
DenseTree::Link { children, .. } => {
|
||||
dense_tree.splice(idx..idx, children);
|
||||
}
|
||||
_ => {
|
||||
panic!("Implementation bug: Link being dissolved is not a Link.")
|
||||
}
|
||||
}
|
||||
} else {
|
||||
idx += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn clean_url(url: &Url) -> Url {
|
||||
let mut url = url.clone();
|
||||
url.set_fragment(None);
|
||||
|
|
Loading…
Reference in New Issue