Lazily textify the document tree only if it's needed
This commit is contained in:
parent
04b94b16ed
commit
5a94c825d7
|
@ -2033,6 +2033,7 @@ dependencies = [
|
|||
"quickpeep_densedoc",
|
||||
"quickpeep_moz_readability",
|
||||
"quickpeep_structs",
|
||||
"quickpeep_utils",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_bare",
|
||||
|
@ -2073,6 +2074,10 @@ dependencies = [
|
|||
"quickpeep_densedoc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quickpeep_utils"
|
||||
version = "0.1.0"
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.15"
|
||||
|
|
|
@ -3,7 +3,8 @@ members = [
|
|||
"quickpeep",
|
||||
"quickpeep_densedoc",
|
||||
"quickpeep_moz_readability",
|
||||
"quickpeep_structs"
|
||||
"quickpeep_structs",
|
||||
"quickpeep_utils"
|
||||
]
|
||||
|
||||
|
||||
|
|
|
@ -35,6 +35,8 @@ futures-util = "0.3.21"
|
|||
|
||||
lingua = "1.3.3"
|
||||
|
||||
quickpeep_utils = { path = "../quickpeep_utils" }
|
||||
|
||||
### Raking helpers
|
||||
# HTTP Requests
|
||||
reqwest = { version = "0.11.9", features = ["stream"] }
|
||||
|
|
|
@ -14,6 +14,7 @@ use lazy_static::lazy_static;
|
|||
use log::debug;
|
||||
use quickpeep_densedoc::DenseTree;
|
||||
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
|
||||
use quickpeep_utils::Lazy;
|
||||
use reqwest::header::HeaderMap;
|
||||
use reqwest::{Client, Response, Url};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
@ -342,11 +343,11 @@ impl Raker {
|
|||
}
|
||||
|
||||
let dense_doc = DenseTree::from_body(root_node.clone());
|
||||
let dense_doc_text = DenseTree::generate_textual_format(&dense_doc);
|
||||
let dense_doc_text = Lazy::new(Box::new(|| DenseTree::generate_textual_format(&dense_doc)));
|
||||
|
||||
if language.is_none() {
|
||||
// Final fallback: guess the language
|
||||
language = guess_document_language(&dense_doc_text);
|
||||
language = guess_document_language(&*dense_doc_text);
|
||||
}
|
||||
|
||||
// Try and enforce some consistency in the language code;
|
||||
|
@ -355,7 +356,7 @@ impl Raker {
|
|||
normalise_language(language);
|
||||
}
|
||||
|
||||
eprintln!("~~~~~\n{}\n~~~~~", dense_doc_text);
|
||||
eprintln!("~~~~~\n{}\n~~~~~", *dense_doc_text);
|
||||
eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc);
|
||||
|
||||
let mut readability = quickpeep_moz_readability::Readability::new_from_node(root_node);
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
[package]
|
||||
name = "quickpeep_utils"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
|
@ -0,0 +1,48 @@
|
|||
use std::cell::RefCell;
|
||||
use std::ops::Deref;
|
||||
|
||||
/// Internal state of a [`Lazy`]: either still waiting to run its initialiser, or holding
/// the value that the initialiser produced.
enum LazyInner<'a, T> {
    /// Not evaluated yet. The `Option` allows the boxed `FnOnce` to be moved out through a
    /// `&mut` borrow; it is `None` only while the initialiser runs or after it panicked.
    Uncomputed(Option<Box<dyn FnOnce() -> T + 'a>>),
    /// Evaluated. Once this state is reached it is never replaced again.
    Computed(T),
}

/// A value that is computed on first dereference and cached afterwards.
///
/// The initialiser may borrow from its environment for the lifetime `'a`. It is run at
/// most once; if the `Lazy` is never dereferenced, it is never run at all.
///
/// The cell uses a [`RefCell`] internally, so a `Lazy` cannot be shared between threads.
pub struct Lazy<'a, T> {
    inner: RefCell<LazyInner<'a, T>>,
}

impl<'a, T> Lazy<'a, T> {
    /// Wraps `func` without calling it. The closure is invoked by the first dereference
    /// of the returned `Lazy`.
    pub fn new(func: Box<dyn FnOnce() -> T + 'a>) -> Lazy<'a, T> {
        Lazy {
            inner: RefCell::new(LazyInner::Uncomputed(Some(func))),
        }
    }
}

impl<'a, T: 'a> Deref for Lazy<'a, T> {
    type Target = T;

    /// Returns the cached value, running the initialiser first if it has not run yet.
    ///
    /// # Panics
    ///
    /// Panics if the initialiser dereferences this same `Lazy` (the cell is already
    /// borrowed at that point), or if an earlier call to the initialiser panicked and
    /// therefore left no function to compute the value with.
    fn deref(&self) -> &Self::Target {
        // Only take a mutable borrow while the value is still uncomputed. At that point no
        // `&T` can have been handed out, so the `&mut` cannot alias an escaped reference.
        let already_computed = matches!(*self.inner.borrow(), LazyInner::Computed(_));
        if !already_computed {
            let mut slot = self.inner.borrow_mut();
            let func = match &mut *slot {
                LazyInner::Uncomputed(func) => func.take(),
                LazyInner::Computed(_) => None,
            };
            match func {
                // The borrow is deliberately held while `func` runs so that a re-entrant
                // dereference fails loudly instead of observing a half-initialised cell.
                Some(func) => *slot = LazyInner::Computed(func()),
                None => panic!("Unreachable: uncomputed but no function to compute with"),
            }
        }

        let guard = self.inner.borrow();
        match &*guard {
            LazyInner::Computed(computed) => {
                let computed: *const T = computed;
                // SAFETY: the pointer comes from a live shared borrow of `self.inner`, and
                // the returned reference is bounded by the borrow of `self`, so the `Lazy`
                // can be neither moved nor dropped while it is alive. A `Computed` value is
                // never overwritten, and after reaching that state this type only ever
                // takes shared borrows of the cell, so nothing mutates or mutably aliases
                // the value behind the returned reference.
                unsafe { &*computed }
            }
            LazyInner::Uncomputed(_) => {
                panic!("Unreachable: Should have been computed");
            }
        }
    }
}
|
Loading…
Reference in New Issue