Dissolve links before emitting documents to the pack store
continuous-integration/drone the build failed
Details
continuous-integration/drone the build failed
Details
Fixes #9
This commit is contained in:
parent
6c2ff9daec
commit
96a01e0aaa
|
@ -2,6 +2,7 @@ use crate::raking::analysis::{
|
||||||
analyse_with_ad_block_cosmetic_filter, analyse_with_ad_block_network_filter,
|
analyse_with_ad_block_cosmetic_filter, analyse_with_ad_block_network_filter,
|
||||||
guess_document_language, PreloadedEngine,
|
guess_document_language, PreloadedEngine,
|
||||||
};
|
};
|
||||||
|
use crate::raking::references::dissolve_links;
|
||||||
use crate::raking::{normalise_language, RedirectReason, FEED_LINK_MIME_TYPES};
|
use crate::raking::{normalise_language, RedirectReason, FEED_LINK_MIME_TYPES};
|
||||||
use adblock::engine::Engine;
|
use adblock::engine::Engine;
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
|
@ -290,6 +291,10 @@ impl PageExtractionServiceInternal {
|
||||||
document.body_remainder = DenseTree::from_body(root_node);
|
document.body_remainder = DenseTree::from_body(root_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Dissolve links to save space
|
||||||
|
dissolve_links(&mut document.body_content);
|
||||||
|
dissolve_links(&mut document.body_remainder);
|
||||||
|
|
||||||
Ok(ExtractedPage::Success {
|
Ok(ExtractedPage::Success {
|
||||||
unreadable_document,
|
unreadable_document,
|
||||||
document,
|
document,
|
||||||
|
|
|
@ -88,6 +88,56 @@ pub fn references_from_urlrakes(
|
||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn dissolve_links(dense_tree: &mut Vec<DenseTree>) {
|
||||||
|
let mut idx = 0;
|
||||||
|
while idx < dense_tree.len() {
|
||||||
|
let is_link = match &mut dense_tree[idx] {
|
||||||
|
DenseTree::Heading1(children) => {
|
||||||
|
dissolve_links(children);
|
||||||
|
false
|
||||||
|
}
|
||||||
|
DenseTree::Heading2(children) => {
|
||||||
|
dissolve_links(children);
|
||||||
|
false
|
||||||
|
}
|
||||||
|
DenseTree::Heading3(children) => {
|
||||||
|
dissolve_links(children);
|
||||||
|
false
|
||||||
|
}
|
||||||
|
DenseTree::Heading4(children) => {
|
||||||
|
dissolve_links(children);
|
||||||
|
false
|
||||||
|
}
|
||||||
|
DenseTree::Heading5(children) => {
|
||||||
|
dissolve_links(children);
|
||||||
|
false
|
||||||
|
}
|
||||||
|
DenseTree::Heading6(children) => {
|
||||||
|
dissolve_links(children);
|
||||||
|
false
|
||||||
|
}
|
||||||
|
DenseTree::Link { children, .. } => {
|
||||||
|
dissolve_links(children);
|
||||||
|
true
|
||||||
|
}
|
||||||
|
DenseTree::Image { .. } => false,
|
||||||
|
DenseTree::Text(_) => false,
|
||||||
|
};
|
||||||
|
if is_link {
|
||||||
|
match dense_tree.remove(idx) {
|
||||||
|
DenseTree::Link { children, .. } => {
|
||||||
|
dense_tree.splice(idx..idx, children);
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
panic!("Implementation bug: Link being dissolved is not a Link.")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
idx += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn clean_url(url: &Url) -> Url {
|
pub fn clean_url(url: &Url) -> Url {
|
||||||
let mut url = url.clone();
|
let mut url = url.clone();
|
||||||
url.set_fragment(None);
|
url.set_fragment(None);
|
||||||
|
|
Loading…
Reference in New Issue