Lazily textify the document tree only if it's needed

This commit is contained in:
Olivier 'reivilibre' 2022-03-14 20:33:12 +00:00
parent 04b94b16ed
commit 5a94c825d7
6 changed files with 69 additions and 4 deletions

5
Cargo.lock generated
View File

@ -2033,6 +2033,7 @@ dependencies = [
"quickpeep_densedoc", "quickpeep_densedoc",
"quickpeep_moz_readability", "quickpeep_moz_readability",
"quickpeep_structs", "quickpeep_structs",
"quickpeep_utils",
"reqwest", "reqwest",
"serde", "serde",
"serde_bare", "serde_bare",
@ -2073,6 +2074,10 @@ dependencies = [
"quickpeep_densedoc", "quickpeep_densedoc",
] ]
[[package]]
name = "quickpeep_utils"
version = "0.1.0"
[[package]] [[package]]
name = "quote" name = "quote"
version = "1.0.15" version = "1.0.15"

View File

@ -3,7 +3,8 @@ members = [
"quickpeep", "quickpeep",
"quickpeep_densedoc", "quickpeep_densedoc",
"quickpeep_moz_readability", "quickpeep_moz_readability",
"quickpeep_structs" "quickpeep_structs",
"quickpeep_utils"
] ]

View File

@ -35,6 +35,8 @@ futures-util = "0.3.21"
lingua = "1.3.3" lingua = "1.3.3"
quickpeep_utils = { path = "../quickpeep_utils" }
### Raking helpers ### Raking helpers
# HTTP Requests # HTTP Requests
reqwest = { version = "0.11.9", features = ["stream"] } reqwest = { version = "0.11.9", features = ["stream"] }

View File

@ -14,6 +14,7 @@ use lazy_static::lazy_static;
use log::debug; use log::debug;
use quickpeep_densedoc::DenseTree; use quickpeep_densedoc::DenseTree;
use quickpeep_structs::rake_entries::AnalysisAntifeatures; use quickpeep_structs::rake_entries::AnalysisAntifeatures;
use quickpeep_utils::Lazy;
use reqwest::header::HeaderMap; use reqwest::header::HeaderMap;
use reqwest::{Client, Response, Url}; use reqwest::{Client, Response, Url};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@ -342,11 +343,11 @@ impl Raker {
} }
let dense_doc = DenseTree::from_body(root_node.clone()); let dense_doc = DenseTree::from_body(root_node.clone());
let dense_doc_text = DenseTree::generate_textual_format(&dense_doc); let dense_doc_text = Lazy::new(Box::new(|| DenseTree::generate_textual_format(&dense_doc)));
if language.is_none() { if language.is_none() {
// Final fallback: guess the language // Final fallback: guess the language
language = guess_document_language(&dense_doc_text); language = guess_document_language(&*dense_doc_text);
} }
// Try and enforce some consistency in the language code; // Try and enforce some consistency in the language code;
@ -355,7 +356,7 @@ impl Raker {
normalise_language(language); normalise_language(language);
} }
eprintln!("~~~~~\n{}\n~~~~~", dense_doc_text); eprintln!("~~~~~\n{}\n~~~~~", *dense_doc_text);
eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc); eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc);
let mut readability = quickpeep_moz_readability::Readability::new_from_node(root_node); let mut readability = quickpeep_moz_readability::Readability::new_from_node(root_node);

View File

@ -0,0 +1,8 @@
[package]
name = "quickpeep_utils"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]

View File

@ -0,0 +1,48 @@
use std::cell::RefCell;
use std::ops::Deref;
/// Internal state of a `Lazy`: either still waiting to be computed, or already computed.
enum LazyInner<'a, T> {
/// Not computed yet. Holds the boxed one-shot initialiser; it is wrapped in an `Option`
/// so that it can be moved out with `take()` and called by value (`FnOnce`).
Uncomputed(Option<Box<dyn FnOnce() -> T + 'a>>),
/// The initialiser has been run and this is its result.
Computed(T),
}
/// A value of type `T` that is produced by a stored closure the first time the `Lazy` is
/// dereferenced, rather than when the `Lazy` is constructed.
///
/// `'a` is the lifetime bound of the stored closure (`dyn FnOnce() -> T + 'a`).
pub struct Lazy<'a, T> {
// `RefCell` provides the interior mutability needed to swap the state from
// `Uncomputed` to `Computed` from inside `Deref::deref`, which only receives `&self`.
inner: RefCell<LazyInner<'a, T>>,
}
impl<'a, T> Lazy<'a, T> {
    /// Wraps `func` so that it can be run later to produce the value.
    ///
    /// `func` is only stored here, not called; dereferencing the returned `Lazy` is what
    /// triggers the computation. The closure may borrow data that lives for `'a`.
    ///
    /// The return type is spelled `Self` (i.e. `Lazy<'a, T>`) so that the lifetime tying
    /// the result to the closure is visible in the signature instead of being elided.
    pub fn new(func: Box<dyn FnOnce() -> T + 'a>) -> Self {
        Lazy {
            inner: RefCell::new(LazyInner::Uncomputed(Some(func))),
        }
    }
}
impl<'a, T: 'a> Deref for Lazy<'a, T> {
    type Target = T;

    /// Returns the computed value, running the stored initialiser first if this is the
    /// first access.
    ///
    /// # Panics
    ///
    /// * If the initialiser itself dereferences this same `Lazy` (the `RefCell` is
    ///   already mutably borrowed at that point).
    /// * If the state is `Uncomputed` but the initialiser has already been taken, which
    ///   happens when a previous call's initialiser panicked part-way through.
    fn deref(&self) -> &Self::Target {
        // Inspect the state through a *shared* borrow first. A mutable borrow must never
        // be taken once the value is computed, because earlier calls may have handed out
        // `&T` references into the cell that are still alive.
        let uncomputed = matches!(&*self.inner.borrow(), LazyInner::Uncomputed(_));

        if uncomputed {
            // No `&T` can exist yet (there is no `T`), so exclusive access is fine here.
            let mut slot = self.inner.borrow_mut();
            if let LazyInner::Uncomputed(func) = &mut *slot {
                match func.take() {
                    Some(func) => *slot = LazyInner::Computed(func()),
                    None => panic!("Unreachable: uncomputed but no function to compute with"),
                }
            }
        }

        let slot = self.inner.borrow();
        match &*slot {
            LazyInner::Computed(computed) => {
                let computed: *const T = computed;
                // SAFETY: the pointer comes from a live `&T` inside `self.inner`. Once the
                // state is `Computed` it is never overwritten and never mutably borrowed
                // again (see the shared-borrow check above), and `self` cannot be moved
                // or dropped while the returned reference, which is bounded by the
                // `&self` borrow, is alive. The pointee therefore stays valid and
                // unaliased by any `&mut` for the whole returned lifetime.
                unsafe { &*computed }
            }
            LazyInner::Uncomputed(_) => {
                panic!("Unreachable: Should have been computed");
            }
        }
    }
}