diff --git a/Cargo.lock b/Cargo.lock index 990bc0e..40e2fe9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2033,6 +2033,7 @@ dependencies = [ "quickpeep_densedoc", "quickpeep_moz_readability", "quickpeep_structs", + "quickpeep_utils", "reqwest", "serde", "serde_bare", @@ -2073,6 +2074,10 @@ dependencies = [ "quickpeep_densedoc", ] +[[package]] +name = "quickpeep_utils" +version = "0.1.0" + [[package]] name = "quote" version = "1.0.15" diff --git a/Cargo.toml b/Cargo.toml index 0487b5d..0940d25 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,8 @@ members = [ "quickpeep", "quickpeep_densedoc", "quickpeep_moz_readability", - "quickpeep_structs" + "quickpeep_structs", + "quickpeep_utils" ] diff --git a/quickpeep/Cargo.toml b/quickpeep/Cargo.toml index 2e42c76..4dce7d1 100644 --- a/quickpeep/Cargo.toml +++ b/quickpeep/Cargo.toml @@ -35,6 +35,8 @@ futures-util = "0.3.21" lingua = "1.3.3" +quickpeep_utils = { path = "../quickpeep_utils" } + ### Raking helpers # HTTP Requests reqwest = { version = "0.11.9", features = ["stream"] } diff --git a/quickpeep/src/raking.rs b/quickpeep/src/raking.rs index dc32b86..bbe2cde 100644 --- a/quickpeep/src/raking.rs +++ b/quickpeep/src/raking.rs @@ -14,6 +14,7 @@ use lazy_static::lazy_static; use log::debug; use quickpeep_densedoc::DenseTree; use quickpeep_structs::rake_entries::AnalysisAntifeatures; +use quickpeep_utils::Lazy; use reqwest::header::HeaderMap; use reqwest::{Client, Response, Url}; use serde::{Deserialize, Serialize}; @@ -342,11 +343,11 @@ impl Raker { } let dense_doc = DenseTree::from_body(root_node.clone()); - let dense_doc_text = DenseTree::generate_textual_format(&dense_doc); + let dense_doc_text = Lazy::new(Box::new(|| DenseTree::generate_textual_format(&dense_doc))); if language.is_none() { // Final fallback: guess the language - language = guess_document_language(&dense_doc_text); + language = guess_document_language(&*dense_doc_text); } // Try and enforce some consistency in the language code; @@ -355,7 +356,7 @@ 
impl Raker { normalise_language(language); } - eprintln!("~~~~~\n{}\n~~~~~", dense_doc_text); + eprintln!("~~~~~\n{}\n~~~~~", *dense_doc_text); eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc); let mut readability = quickpeep_moz_readability::Readability::new_from_node(root_node); diff --git a/quickpeep_utils/Cargo.toml b/quickpeep_utils/Cargo.toml new file mode 100644 index 0000000..b980a89 --- /dev/null +++ b/quickpeep_utils/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "quickpeep_utils" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/quickpeep_utils/src/lib.rs b/quickpeep_utils/src/lib.rs new file mode 100644 index 0000000..750d8f4 --- /dev/null +++ b/quickpeep_utils/src/lib.rs @@ -0,0 +1,48 @@ +use std::cell::RefCell; +use std::ops::Deref; + +enum LazyInner<'a, T> { + Uncomputed(Option<Box<dyn FnOnce() -> T + 'a>>), + Computed(T), +} + +pub struct Lazy<'a, T> { + inner: RefCell<LazyInner<'a, T>>, +} +impl<'a, T> Lazy<'a, T> { + pub fn new(func: Box<dyn FnOnce() -> T + 'a>) -> Lazy<T> { + Lazy { + inner: RefCell::new(LazyInner::Uncomputed(Some(func))), + } + } +} + +impl<'a, T: 'a> Deref for Lazy<'a, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + unsafe fn extend_lifetime<'a, 'b, A>(a: &'a A) -> &'b A { + std::mem::transmute(a) + } + + let mut inner_mut = self.inner.borrow_mut(); + if let LazyInner::Uncomputed(func) = &mut *inner_mut { + if let Some(func) = func.take() { + *inner_mut = LazyInner::Computed(func()); + } else { + panic!("Unreachable: uncomputed but no function to compute with") + } + } + + match &*inner_mut { + LazyInner::Computed(computed) => unsafe { + // Extending the lifetime *should* be safe because we don't ever overwrite + // a computed value... + extend_lifetime(computed) + }, + LazyInner::Uncomputed(_) => { + panic!("Unreachable: Should have been computed"); + } + } + } +}