Lazily textify the document tree only if it's needed

This commit is contained in:
Olivier 'reivilibre' 2022-03-14 20:33:12 +00:00
parent 04b94b16ed
commit 5a94c825d7
6 changed files with 69 additions and 4 deletions

5
Cargo.lock generated
View File

@ -2033,6 +2033,7 @@ dependencies = [
"quickpeep_densedoc",
"quickpeep_moz_readability",
"quickpeep_structs",
"quickpeep_utils",
"reqwest",
"serde",
"serde_bare",
@ -2073,6 +2074,10 @@ dependencies = [
"quickpeep_densedoc",
]
[[package]]
name = "quickpeep_utils"
version = "0.1.0"
[[package]]
name = "quote"
version = "1.0.15"

View File

@ -3,7 +3,8 @@ members = [
"quickpeep",
"quickpeep_densedoc",
"quickpeep_moz_readability",
"quickpeep_structs"
"quickpeep_structs",
"quickpeep_utils"
]

View File

@ -35,6 +35,8 @@ futures-util = "0.3.21"
lingua = "1.3.3"
quickpeep_utils = { path = "../quickpeep_utils" }
### Raking helpers
# HTTP Requests
reqwest = { version = "0.11.9", features = ["stream"] }

View File

@ -14,6 +14,7 @@ use lazy_static::lazy_static;
use log::debug;
use quickpeep_densedoc::DenseTree;
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
use quickpeep_utils::Lazy;
use reqwest::header::HeaderMap;
use reqwest::{Client, Response, Url};
use serde::{Deserialize, Serialize};
@ -342,11 +343,11 @@ impl Raker {
}
let dense_doc = DenseTree::from_body(root_node.clone());
let dense_doc_text = DenseTree::generate_textual_format(&dense_doc);
let dense_doc_text = Lazy::new(Box::new(|| DenseTree::generate_textual_format(&dense_doc)));
if language.is_none() {
// Final fallback: guess the language
language = guess_document_language(&dense_doc_text);
language = guess_document_language(&*dense_doc_text);
}
// Try and enforce some consistency in the language code;
@ -355,7 +356,7 @@ impl Raker {
normalise_language(language);
}
eprintln!("~~~~~\n{}\n~~~~~", dense_doc_text);
eprintln!("~~~~~\n{}\n~~~~~", *dense_doc_text);
eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc);
let mut readability = quickpeep_moz_readability::Readability::new_from_node(root_node);

View File

@ -0,0 +1,8 @@
[package]
name = "quickpeep_utils"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]

View File

@ -0,0 +1,48 @@
use std::cell::RefCell;
use std::ops::Deref;
/// Internal state of a `Lazy`: either a computation that has not run yet, or the
/// value that computation produced.
enum LazyInner<'a, T> {
/// The value has not been computed yet. Holds the boxed `FnOnce` closure that
/// produces it; the `Option` wrapper allows the closure to be moved out
/// (leaving `None` behind) so that it can be called by value.
Uncomputed(Option<Box<dyn FnOnce() -> T + 'a>>),
/// The closure has been run; this is the value it returned.
Computed(T),
}
/// A value of type `T` that is produced by a closure and stored for later use.
///
/// `'a` is the lifetime bound of the stored closure, so the closure may borrow
/// from its environment for `'a`.
pub struct Lazy<'a, T> {
// Wrapped in a `RefCell` so the state can be switched from `Uncomputed` to
// `Computed` through a shared reference.
inner: RefCell<LazyInner<'a, T>>,
}
impl<'a, T> Lazy<'a, T> {
    /// Creates a lazy value from the boxed closure `func`.
    ///
    /// The closure is only stored here, not called; the returned `Lazy` starts
    /// out in the `Uncomputed` state.
    ///
    /// The return type is spelled `Self` (i.e. `Lazy<'a, T>`) rather than
    /// `Lazy<T>` so that the lifetime tying the result to the closure is not
    /// hidden behind elision.
    pub fn new(func: Box<dyn FnOnce() -> T + 'a>) -> Self {
        Lazy {
            inner: RefCell::new(LazyInner::Uncomputed(Some(func))),
        }
    }
}
impl<'a, T: 'a> Deref for Lazy<'a, T> {
    type Target = T;

    /// Returns a reference to the value, running the stored closure first if the
    /// value has not been computed yet.
    ///
    /// The closure runs at most once; later calls return a reference to the
    /// stored result.
    ///
    /// # Panics
    ///
    /// * If the closure itself dereferences this same `Lazy` (the inner
    ///   `RefCell` is already mutably borrowed at that point).
    /// * If a previous call's closure panicked: the closure has been consumed,
    ///   so there is nothing left to compute the value with.
    fn deref(&self) -> &Self::Target {
        // Fast path: the value already exists. Only a *shared* borrow of the cell
        // is taken here, so no `&mut` is ever created while references handed out
        // by earlier calls may still be alive.
        {
            let state = self.inner.borrow();
            if let LazyInner::Computed(computed) = &*state {
                // SAFETY: once the state is `Computed` it is never written again
                // (the only write happens below, and only while the state is still
                // `Uncomputed`), and the value is stored inline in `self`, so it
                // stays valid and unmodified for as long as `self` is borrowed.
                return unsafe { &*(computed as *const T) };
            }
        }

        // Slow path: first access. No reference to a computed value can exist yet,
        // so taking the unique borrow is fine. It is held while the closure runs,
        // which turns a re-entrant `deref` from inside the closure into a panic.
        let mut state = self.inner.borrow_mut();
        let func = match &mut *state {
            LazyInner::Uncomputed(func) => func
                .take()
                .expect("Unreachable: uncomputed but no function to compute with"),
            LazyInner::Computed(_) => {
                unreachable!("state was observed as uncomputed just above")
            }
        };
        *state = LazyInner::Computed(func());

        match &*state {
            // SAFETY: same argument as in the fast path: the freshly stored value is
            // never overwritten and lives inside `self`.
            LazyInner::Computed(computed) => unsafe { &*(computed as *const T) },
            LazyInner::Uncomputed(_) => {
                panic!("Unreachable: Should have been computed");
            }
        }
    }
}