Some partial progress towards raking pages
parent 5bab279cc2
commit ea4f2d1332
@@ -594,6 +594,16 @@ dependencies = [
  "generic-array",
 ]
 
+[[package]]
+name = "diplomatic-bag"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6448adcf5616f6dd9627e545f447cf13dc5fda312f938364240d8feb84b0d12b"
+dependencies = [
+ "crossbeam-channel",
+ "once_cell",
+]
+
 [[package]]
 name = "dotenv"
 version = "0.15.0"
@@ -2687,6 +2697,7 @@ dependencies = [
  "clap",
  "colour",
  "cylon",
+ "diplomatic-bag",
  "env_logger",
  "feed-rs",
  "futures-util",
@@ -45,6 +45,7 @@ env_logger = "0.9.0"
 ouroboros = "0.14.2"
 rand = "0.8.5"
 lru = "0.7.3"
+diplomatic-bag = "0.2.0"
 
 ### Raking helpers
 # HTTP Requests
@@ -4,7 +4,8 @@ use clap::Parser;
 use colour::{blue_ln, green_ln, red_ln, yellow_ln};
 use env_logger::Env;
 use log::warn;
-use quickpeep_raker::raking::analysis::{load_adblock_engine, IpSet};
+use quickpeep_raker::raking::analysis::{preload_adblock_engine, IpSet};
+use quickpeep_raker::raking::page_extraction::PageExtractionService;
 use quickpeep_raker::raking::references::references_from_urlrakes;
 use quickpeep_raker::raking::{RakeIntent, RakeOutcome};
 use quickpeep_raker::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
@@ -57,7 +58,7 @@ pub async fn main() -> anyhow::Result<()> {
         let file = File::open(&path).await?;
         adblock_engines.push((
             *antifeature,
-            load_adblock_engine(file, RuleTypes::All).await?,
+            preload_adblock_engine(file, RuleTypes::All).await?,
         ));
     }
 
@@ -69,8 +70,8 @@ pub async fn main() -> anyhow::Result<()> {
     antifeature_ip_set.add_all_from_file(ips_file).await?;
 
     let raker = Raker {
-        adblock_engines,
         antifeature_ip_set,
+        page_extraction: PageExtractionService::new(adblock_engines)?,
     };
 
     let outcome = raker.rake(&opts.url, RakeIntent::Any, &client).await?;
@@ -3,18 +3,19 @@ use clap::Parser;
 use env_logger::Env;
 
 use anyhow::{bail, Context};
+use log::error;
+use lru::LruCache;
 use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
 use reqwest::redirect::Policy;
 use std::path::PathBuf;
 use std::sync::{Arc, Mutex, RwLock};
 use std::time::Duration;
-use log::error;
-use lru::LruCache;
 use tokio::sync::{mpsc, Semaphore};
 
 use quickpeep_raker::config;
-use quickpeep_raker::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
+use quickpeep_raker::raking::page_extraction::PageExtractionService;
 use quickpeep_raker::raking::task::{TaskContext, TaskResultSubmission};
+use quickpeep_raker::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
 use quickpeep_raker::storage::RakerStore;
 
 /// Seeds a raker's queue with URLs
@@ -66,8 +67,8 @@ pub async fn main() -> anyhow::Result<()> {
     let store = RakerStore::open(&config.workbench_dir.join("raker.mdbx"))?;
 
     let raker = Raker {
-        adblock_engines: vec![],
-        antifeature_ip_set: todo!()
+        antifeature_ip_set: todo!(),
+        page_extraction: PageExtractionService::new(vec![])?, // TODO
     };
 
     let num_tasks = opts.concurrent_jobs + opts.concurrent_sleepers;
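Note that the `todo!()` left in the `antifeature_ip_set` field will panic as soon as the `Raker` is constructed. A hedged sketch of how it might eventually be filled in, mirroring the one-page rake tool earlier in this commit; the `IpSet::default()` constructor and the `ips_file` variable are assumptions, not part of the commit:

```rust
// Assumption-labelled sketch, not part of the commit: populate the IP set the way the
// one-page tool does, instead of leaving todo!() in place.
let mut antifeature_ip_set = IpSet::default(); // assumed constructor
antifeature_ip_set.add_all_from_file(ips_file).await?; // `ips_file` assumed to come from config

let raker = Raker {
    antifeature_ip_set,
    page_extraction: PageExtractionService::new(vec![])?, // still no adblock engines here
};
```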
@@ -78,7 +79,7 @@ pub async fn main() -> anyhow::Result<()> {
 
     let submission = TaskResultSubmission {
         pages: pages_tx,
-        references: refs_tx
+        references: refs_tx,
     };
 
     let task_context = TaskContext {
@@ -88,7 +89,7 @@ pub async fn main() -> anyhow::Result<()> {
         busy_domains: Arc::new(Mutex::new(Default::default())),
         robotstxt_cache: Arc::new(RwLock::new(LruCache::new(64))),
         semaphore,
-        submission
+        submission,
     };
 
     let mut tasks = Vec::with_capacity(num_tasks as usize);
@@ -101,8 +102,6 @@ pub async fn main() -> anyhow::Result<()> {
                 error!("Raker task {:?} encountered an error: {:?}", task_num, err);
             }
         }));
-
-        // TODO spawn task
     }
 
     drop(task_context);
@@ -1,31 +1,24 @@
-use crate::raking::analysis::{
-    analyse_with_ad_block_cosmetic_filter, guess_document_language, IpSet,
-};
-use adblock::engine::Engine;
+use crate::raking::analysis::IpSet;
+use crate::raking::page_extraction::{ExtractedPage, PageExtractionService};
 use anyhow::{bail, Context};
 use chrono::{DateTime, FixedOffset, Utc};
 use cylon::Cylon;
 use futures_util::stream::StreamExt;
 use html5ever::tendril::fmt::Slice;
 use itertools::Itertools;
-use kuchiki::traits::TendrilSink;
-use kuchiki::NodeRef;
 use lazy_static::lazy_static;
 use log::debug;
-use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
-use quickpeep_structs::rake_entries::{AnalysisAntifeatures, RakedPageEntry, RakedReferrerEntry};
-use quickpeep_utils::lazy::Lazy;
+use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry};
 use reqwest::header::HeaderMap;
 use reqwest::{Client, Response, Url};
 use serde::{Deserialize, Serialize};
 use sitemap::reader::SiteMapEntity;
 use std::collections::HashSet;
 use std::time::Duration;
-use tokio::sync::Mutex;
 use tokio::time::Instant;
-use quickpeep_utils::threadguard::ThreadGuard;
 
 pub mod analysis;
+pub mod page_extraction;
 pub mod references;
 pub mod task;
@@ -92,6 +85,7 @@ pub struct PermanentFailure {
 pub enum TemporaryFailureReason {
     MissingInformation(String),
     ServerError(u16),
+    UnknownClientError(String),
 }
 
 #[derive(Debug)]
@@ -168,8 +162,8 @@ async fn response_to_bytes_limited(
 }
 
 pub struct Raker {
-    pub adblock_engines: Vec<(AnalysisAntifeatures, ThreadGuard<Engine, ()>)>,
     pub antifeature_ip_set: IpSet,
+    pub page_extraction: PageExtractionService,
 }
 
 impl Raker {
@@ -249,7 +243,8 @@ impl Raker {
         {
             // We don't try any fallbacks for an HTML page
             return Ok(self
-                .rake_html_page(&content, url, is_cf, &headers).await
+                .rake_html_page(&content, url, is_cf, &headers)
+                .await
                 .context("Raking HTML page")?);
         }
 
@@ -291,172 +286,31 @@ impl Raker {
         is_cf: bool,
         headers: &HeaderMap,
     ) -> anyhow::Result<RakeOutcome> {
-        let content_str = std::str::from_utf8(content)?;
+        let content_str = std::str::from_utf8(content)?.to_owned();
 
-        let root_node: NodeRef = kuchiki::parse_html().one(content_str);
-
-        // See whether this page is at the canonical URL for the page.
-        // If it's not, then we redirect the raker to the canonical URL.
-        if let Ok(canonical_link_node) = root_node.select_first("head link[rel=canonical]") {
-            if let Some(canonical_href) = canonical_link_node.attributes.borrow().get("href") {
-                let canonical_url = url
-                    .join(canonical_href)
-                    .context("Failed to resolve or parse canonical URL")?;
-
-                if &canonical_url != url {
-                    return Ok(RakeOutcome::Redirect {
-                        reason: RedirectReason::NotCanonical,
-                        new_url: canonical_url,
-                    });
-                }
-            }
-        }
-
-        // Try and dig up the page's language.
-        // First try <html lang=...> since this is the modern way, and potentially the most trustworthy...
-        let mut language = None;
-
-        if let Ok(html_node) = root_node.select_first("html") {
-            if let Some(lang) = html_node.attributes.borrow().get("lang") {
-                language = Some(lang.trim().to_string());
-            }
-        }
-
-        if language.is_none() {
-            // Next fallback: prefer the content-language header baked into the page itself
-            if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") {
-                if let Some(lang) = meta_node.attributes.borrow().get("content") {
-                    language = Some(lang.trim().to_string());
-                }
-            }
-        }
-
-        if language.is_none() {
-            // Next fallback: prefer the content-language received as a header
-            if let Some(lang) = headers.get("content-language") {
-                language = Some(lang.to_str()?.to_owned());
-            }
-        }
-
-        let mut antifeature_flags = AnalysisAntifeatures::empty();
-
-        if is_cf {
-            antifeature_flags |= AnalysisAntifeatures::CLOUDFLARE;
-        }
-
-        for (engine_antifeature_flag, adblock_engine) in &self.adblock_engines {
-            adblock_engine.apply(Box::new(|adblock_engine| {
-                match analyse_with_ad_block_cosmetic_filter(
-                    &root_node,
-                    &adblock_engine,
-                    url.as_str(),
-                    true,
-                ) {
-                    Ok(cosmetic_filters_tripped) => {
-                        eprintln!("?cosmetic filters tripped: {}", cosmetic_filters_tripped);
-                        antifeature_flags |= *engine_antifeature_flag;
-                    }
-                    Err(err) => {
-                        eprintln!("Cosmetic Filter Err {:?}", err);
-                    }
-                };
-                Ok(())
-            }));
-        }
-
-        let dense_doc = DenseTree::from_body(root_node.clone());
-        let dense_doc_text = Lazy::new(Box::new(|| DenseTree::generate_textual_format(&dense_doc)));
-        //eprintln!("^^^^^\n{}\n^^^^^", *dense_doc_text);
-
-        if language.is_none() {
-            // Final fallback: guess the language
-            language = guess_document_language(&*dense_doc_text);
-        }
-
-        // Try and enforce some consistency in the language code;
-        // we want codes like en_US rather than en-us.
-        if let Some(language) = language.as_mut() {
-            normalise_language(language);
-        }
-
-        let mut title = "".to_owned();
-
-        if let Ok(title_node) = root_node.select_first("head title") {
-            title = title_node.text_contents();
-        }
-
-        let mut feeds = Vec::new();
-        let mut icon = None;
-
-        for link_node in root_node.select("head link").into_iter().flatten() {
-            if let Some(rel) = link_node.attributes.borrow().get("rel") {
-                let rels = rel.split_whitespace().collect_vec();
-                if rels.contains(&"icon") {
-                    // This is an icon
-                    if let Some(href) = link_node.attributes.borrow().get("href") {
-                        let icon_url = url
-                            .join(href)
-                            .context("Failed to resolve or parse canonical URL to icon")?;
-
-                        icon = Some(icon_url);
-                    }
-                } else if rels.contains(&"alternate") {
-                    if let Some(rel_type) = link_node.attributes.borrow().get("type") {
-                        if FEED_LINK_MIME_TYPES.contains(rel_type) {
-                            if let Some(href) = link_node.attributes.borrow().get("href") {
-                                let feed_url = url
-                                    .join(href)
-                                    .context("Failed to resolve or parse canonical URL to feed")?;
-
-                                feeds.push(feed_url);
-                            }
-                        }
-                    }
-                }
-            }
-        }
-
-        let mut readability =
-            quickpeep_moz_readability::Readability::new_from_node(root_node.clone());
-        if let Err(err) = readability.parse(url.as_str()) {
-            debug!("Failed to analyse readability: {:?}", err);
-        }
-
-        eprintln!("{:#?}", readability.metadata);
-
-        if title.is_empty() && !readability.metadata.title().is_empty() {
-            // Fall back to the readability-derived page title
-            title = readability.metadata.title().to_owned();
-        }
-
-        let mut document = DenseDocument {
-            head: DenseHead {
-                title,
-                language: language.unwrap_or(String::with_capacity(0)),
-                icon: icon
-                    .map(|url| url.as_str().to_owned())
-                    .unwrap_or(String::with_capacity(0)),
-            },
-            body_content: Vec::with_capacity(0),
-            body_remainder: Vec::with_capacity(0),
-        };
-
-        if let Some(article_node) = readability.article_node {
-            document.body_remainder = DenseTree::from_body(root_node.clone());
-            document.body_content = DenseTree::from_body(article_node);
-        }
-
-        let bare_size = serde_bare::to_vec(&dense_doc)?.len();
-        eprintln!("CS {:?} → {:?}", content.len(), bare_size);
-
-        let references = references::find_references(&document, &feeds, url);
-        Ok(RakeOutcome::RakedPage(RakedPage {
-            page_entry: RakedPageEntry {
-                analysed_antifeatures: antifeature_flags,
-                document,
-            },
-            referrer_entry: RakedReferrerEntry { references },
-        }))
+        match self
+            .page_extraction
+            .extract(content_str, url.clone(), headers.clone(), is_cf)
+            .await?
+        {
+            ExtractedPage::Success {
+                document,
+                feeds,
+                antifeature_flags,
+            } => {
+                let references = references::find_references(&document, &feeds, url);
+                Ok(RakeOutcome::RakedPage(RakedPage {
+                    page_entry: RakedPageEntry {
+                        analysed_antifeatures: antifeature_flags,
+                        document,
+                    },
+                    referrer_entry: RakedReferrerEntry { references },
+                }))
+            }
+            ExtractedPage::Redirect { reason, new_url } => {
+                Ok(RakeOutcome::Redirect { reason, new_url })
+            }
+        }
     }
 }
@@ -10,10 +10,30 @@ use std::collections::{BTreeSet, HashSet};
 use std::net::IpAddr;
 use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
 
-pub async fn load_adblock_engine<R: AsyncRead + Unpin>(
+pub struct PreloadedEngine {
+    rules: Vec<String>,
+    rule_types: RuleTypes,
+}
+
+impl PreloadedEngine {
+    pub fn build(&self) -> Engine {
+        Engine::from_rules(
+            &self.rules,
+            ParseOptions {
+                format: Default::default(),
+                include_redirect_urls: false,
+                rule_types: self.rule_types,
+            },
+        )
+    }
+}
+
+/// 'Pre-load' the adblock engine.
+/// This doesn't construct the adblock engine yet because the adblock engine is not `Send`.
+pub async fn preload_adblock_engine<R: AsyncRead + Unpin>(
     reader: R,
     rule_types: RuleTypes,
-) -> anyhow::Result<Engine> {
+) -> anyhow::Result<PreloadedEngine> {
     let mut br = BufReader::new(reader);
     let mut rules = Vec::new();
     let mut buf = String::new();
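The split above exists because the `adblock` `Engine` is not `Send`, so it cannot be built on the async side and then handed across threads; only the raw rule strings are collected asynchronously. A minimal sketch of the intended flow, assuming the items shown in this hunk plus `tokio::fs::File` and the `adblock::lists` import path for `RuleTypes` (both assumptions, not confirmed by the diff):

```rust
// Sketch only: read rules asynchronously, build the Engine later on the extraction thread.
use adblock::lists::RuleTypes; // assumed import path
use tokio::fs::File;

async fn load_rules() -> anyhow::Result<PreloadedEngine> {
    let file = File::open("cosmetic-filters.txt").await?; // placeholder path
    preload_adblock_engine(file, RuleTypes::All).await
}

fn on_extraction_thread(preloaded: PreloadedEngine) {
    // The real (non-Send) Engine is only constructed here.
    let engine = preloaded.build();
    let _ = engine;
}
```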
@@ -24,14 +44,7 @@ pub async fn load_adblock_engine<R: AsyncRead + Unpin>(
         }
         rules.push(buf.trim().to_owned());
     }
-    Ok(Engine::from_rules(
-        &rules,
-        ParseOptions {
-            format: Default::default(),
-            include_redirect_urls: false,
-            rule_types,
-        },
-    ))
+    Ok(PreloadedEngine { rules, rule_types })
 }
 
 // Relevant:
@@ -0,0 +1,285 @@
+use crate::raking::analysis::{
+    analyse_with_ad_block_cosmetic_filter, guess_document_language, PreloadedEngine,
+};
+use crate::raking::{normalise_language, RedirectReason, FEED_LINK_MIME_TYPES};
+use adblock::engine::Engine;
+use anyhow::{bail, Context};
+use html5ever::tendril::TendrilSink;
+use itertools::Itertools;
+use kuchiki::NodeRef;
+use log::{debug, warn};
+use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
+use quickpeep_structs::rake_entries::AnalysisAntifeatures;
+use quickpeep_utils::lazy::Lazy;
+use reqwest::header::HeaderMap;
+use reqwest::Url;
+use tokio::runtime;
+use tokio::sync::mpsc::Sender;
+use tokio::sync::{mpsc, oneshot};
+use tokio::task::LocalSet;
+
+/// As page extraction requires dealing with things are are not `Send`, we send those tasks off to
+/// another thread.
+pub struct PageExtractionService {
+    submission: Sender<ExtractionTask>,
+}
+
+pub struct ExtractionTask {
+    content: String,
+    url: Url,
+    headers: HeaderMap,
+    is_cf: bool,
+    response: oneshot::Sender<anyhow::Result<ExtractedPage>>,
+}
+
+impl PageExtractionService {
+    pub async fn extract(
+        &self,
+        content: String,
+        url: Url,
+        headers: HeaderMap,
+        is_cf: bool,
+    ) -> anyhow::Result<ExtractedPage> {
+        let (resp_tx, resp_rx) = oneshot::channel();
+        if let Err(_) = self
+            .submission
+            .send(ExtractionTask {
+                content,
+                url,
+                headers,
+                is_cf,
+                response: resp_tx,
+            })
+            .await
+        {
+            bail!("Page extraction service has shut down");
+        }
+        resp_rx.await?
+    }
+}
+
+impl PageExtractionService {
+    pub fn new(
+        adblock_engines: Vec<(AnalysisAntifeatures, PreloadedEngine)>,
+    ) -> anyhow::Result<PageExtractionService> {
+        let (submission, mut recv) = mpsc::channel::<ExtractionTask>(16);
+
+        let rt = runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()?;
+
+        std::thread::spawn(move || {
+            let internal = PageExtractionServiceInternal {
+                adblock_engines: adblock_engines
+                    .into_iter()
+                    .map(|(aa, engine)| (aa, engine.build()))
+                    .collect(),
+            };
+
+            let local = LocalSet::new();
+
+            local.spawn_local(async move {
+                while let Some(new_task) = recv.recv().await {
+                    let result = internal.extract_page(
+                        new_task.content,
+                        new_task.url,
+                        new_task.headers,
+                        new_task.is_cf,
+                    );
+                    if let Err(_) = new_task.response.send(result) {
+                        warn!("Requester went away before could send extracted page");
+                    }
+                }
+            });
+
+            // This will return once all senders are dropped and all spawned tasks have returned.
+            rt.block_on(local);
+        });
+
+        Ok(PageExtractionService { submission })
+    }
+}
+
+struct PageExtractionServiceInternal {
+    adblock_engines: Vec<(AnalysisAntifeatures, Engine)>,
+}
+
+impl PageExtractionServiceInternal {
+    fn extract_page(
+        &self,
+        content_str: String,
+        url: Url,
+        headers: HeaderMap,
+        is_cf: bool,
+    ) -> anyhow::Result<ExtractedPage> {
+        let root_node: NodeRef = kuchiki::parse_html().one(content_str.as_ref());
+
+        // See whether this page is at the canonical URL for the page.
+        // If it's not, then we redirect the raker to the canonical URL.
+        if let Ok(canonical_link_node) = root_node.select_first("head link[rel=canonical]") {
+            if let Some(canonical_href) = canonical_link_node.attributes.borrow().get("href") {
+                let canonical_url = url
+                    .join(canonical_href)
+                    .context("Failed to resolve or parse canonical URL")?;
+
+                if &canonical_url != &url {
+                    return Ok(ExtractedPage::Redirect {
+                        reason: RedirectReason::NotCanonical,
+                        new_url: canonical_url,
+                    });
+                }
+            }
+        }
+
+        // Try and dig up the page's language.
+        // First try <html lang=...> since this is the modern way, and potentially the most trustworthy...
+        let mut language = None;
+
+        if let Ok(html_node) = root_node.select_first("html") {
+            if let Some(lang) = html_node.attributes.borrow().get("lang") {
+                language = Some(lang.trim().to_string());
+            }
+        }
+
+        if language.is_none() {
+            // Next fallback: prefer the content-language header baked into the page itself
+            if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") {
+                if let Some(lang) = meta_node.attributes.borrow().get("content") {
+                    language = Some(lang.trim().to_string());
+                }
+            }
+        }
+
+        if language.is_none() {
+            // Next fallback: prefer the content-language received as a header
+            if let Some(lang) = headers.get("content-language") {
+                language = Some(lang.to_str()?.to_owned());
+            }
+        }
+
+        let mut antifeature_flags = AnalysisAntifeatures::empty();
+
+        if is_cf {
+            antifeature_flags |= AnalysisAntifeatures::CLOUDFLARE;
+        }
+
+        for (engine_antifeature_flag, adblock_engine) in &self.adblock_engines {
+            match analyse_with_ad_block_cosmetic_filter(
+                &root_node,
+                &adblock_engine,
+                url.as_str(),
+                true,
+            ) {
+                Ok(cosmetic_filters_tripped) => {
+                    eprintln!("?cosmetic filters tripped: {}", cosmetic_filters_tripped);
+                    antifeature_flags |= *engine_antifeature_flag;
+                }
+                Err(err) => {
+                    eprintln!("Cosmetic Filter Err {:?}", err);
+                }
+            };
+        }
+
+        let dense_doc = DenseTree::from_body(root_node.clone());
+        let dense_doc_text = Lazy::new(Box::new(|| DenseTree::generate_textual_format(&dense_doc)));
+        //eprintln!("^^^^^\n{}\n^^^^^", *dense_doc_text);
+
+        if language.is_none() {
+            // Final fallback: guess the language
+            language = guess_document_language(&*dense_doc_text);
+        }
+
+        // Try and enforce some consistency in the language code;
+        // we want codes like en_US rather than en-us.
+        if let Some(language) = language.as_mut() {
+            normalise_language(language);
+        }
+
+        let mut title = "".to_owned();
+
+        if let Ok(title_node) = root_node.select_first("head title") {
+            title = title_node.text_contents();
+        }
+
+        let mut feeds = Vec::new();
+        let mut icon = None;
+
+        for link_node in root_node.select("head link").into_iter().flatten() {
+            if let Some(rel) = link_node.attributes.borrow().get("rel") {
+                let rels = rel.split_whitespace().collect_vec();
+                if rels.contains(&"icon") {
+                    // This is an icon
+                    if let Some(href) = link_node.attributes.borrow().get("href") {
+                        let icon_url = url
+                            .join(href)
+                            .context("Failed to resolve or parse canonical URL to icon")?;
+
+                        icon = Some(icon_url);
+                    }
+                } else if rels.contains(&"alternate") {
+                    if let Some(rel_type) = link_node.attributes.borrow().get("type") {
+                        if FEED_LINK_MIME_TYPES.contains(rel_type) {
+                            if let Some(href) = link_node.attributes.borrow().get("href") {
+                                let feed_url = url
+                                    .join(href)
+                                    .context("Failed to resolve or parse canonical URL to feed")?;
+
+                                feeds.push(feed_url);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        let mut readability =
+            quickpeep_moz_readability::Readability::new_from_node(root_node.clone());
+        if let Err(err) = readability.parse(url.as_str()) {
+            debug!("Failed to analyse readability: {:?}", err);
+        }
+
+        eprintln!("{:#?}", readability.metadata);
+
+        if title.is_empty() && !readability.metadata.title().is_empty() {
+            // Fall back to the readability-derived page title
+            title = readability.metadata.title().to_owned();
+        }
+
+        let mut document = DenseDocument {
+            head: DenseHead {
+                title,
+                language: language.unwrap_or(String::with_capacity(0)),
+                icon: icon
+                    .map(|url| url.as_str().to_owned())
+                    .unwrap_or(String::with_capacity(0)),
+            },
+            body_content: Vec::with_capacity(0),
+            body_remainder: Vec::with_capacity(0),
+        };
+
+        if let Some(article_node) = readability.article_node {
+            document.body_remainder = DenseTree::from_body(root_node.clone());
+            document.body_content = DenseTree::from_body(article_node);
+        } else {
+            todo!()
+        }
+
+        Ok(ExtractedPage::Success {
+            document,
+            feeds,
+            antifeature_flags,
+        })
+    }
+}
+
+pub enum ExtractedPage {
+    Success {
+        document: DenseDocument,
+        feeds: Vec<Url>,
+        antifeature_flags: AnalysisAntifeatures,
+    },
+    Redirect {
+        reason: RedirectReason,
+        new_url: Url,
+    },
+}
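For orientation, a hedged usage sketch of the new service from async code follows; the wrapper function, variable names and the empty engine list are placeholders, not part of the commit. Because every request carries its own `oneshot` reply channel, each caller simply awaits its own result while the dedicated thread works through requests in order.

```rust
// Sketch only: drive PageExtractionService from an async caller.
async fn example(
    html: String,
    url: reqwest::Url,
    headers: reqwest::header::HeaderMap,
) -> anyhow::Result<()> {
    // No adblock engines in this sketch; real callers pass preloaded engines.
    let service = PageExtractionService::new(vec![])?;

    match service.extract(html, url, headers, /* is_cf */ false).await? {
        ExtractedPage::Success { document, feeds, antifeature_flags } => {
            // Hand the DenseDocument, discovered feeds and antifeature flags back to the raker.
            let _ = (document, feeds, antifeature_flags);
        }
        ExtractedPage::Redirect { reason, new_url } => {
            // The caller is expected to requeue the canonical URL.
            let _ = (reason, new_url);
        }
    }
    Ok(())
}
```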
@@ -1,14 +1,22 @@
-use crate::raking::{get_robots_txt_for, RakeOutcome, Raker, RobotsTxt, UrlRaked};
+use crate::raking::{
+    get_robots_txt_for, robots_txt_url_for, RakeOutcome, Raker, RobotsTxt, TemporaryFailure,
+    TemporaryFailureReason, UrlRaked,
+};
+use crate::storage::records::UrlVisitedRecord;
 use crate::storage::RakerStore;
+use chrono::Utc;
 use cylon::Cylon;
+use log::warn;
 use lru::LruCache;
 use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry};
+use quickpeep_utils::dates::date_to_quickpeep_days;
 use reqwest::{Client, Url};
 use std::collections::HashSet;
 use std::sync::{Arc, Mutex as StdMutex, RwLock};
 use std::time::Duration;
 use tokio::sync::mpsc::Sender;
 use tokio::sync::Semaphore;
+use tokio::time::Instant;
 
 /// A crawl delay that is greater than 61 seconds will cause the domain to lose its place in the
 /// queue and get turned into a backoff.
|
||||||
pub async fn process_domain(&mut self, domain: String) -> anyhow::Result<()> {
|
pub async fn process_domain(&mut self, domain: String) -> anyhow::Result<()> {
|
||||||
let mut current_robot_rules_url: Option<Url> = None;
|
let mut current_robot_rules_url: Option<Url> = None;
|
||||||
let mut current_robot_rules: Option<Cylon> = None;
|
let mut current_robot_rules: Option<Cylon> = None;
|
||||||
|
let mut wait_until: Option<Instant> = None;
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
// Get a URL to process
|
// Get a URL to process
|
||||||
|
@@ -76,30 +85,96 @@ impl TaskContext {
                 txn.choose_url_for_domain(&domain)?
             };
 
-            let url = todo!();
-            let intent = todo!();
+            let (url_str, url_record) = if let Some(url) = url {
+                url
+            } else {
+                // Exhausted all the URLs for this domain. Let's remove it from the active domains
+                // queue.
+                let domain = domain.to_owned();
+                let out_of_urls = self
+                    .store
+                    .async_rw_txn(move |txn| {
+                        // Double-check we're still out of URLs (another could have been added since
+                        // we last checked!)
+                        let out_of_urls = txn.choose_url_for_domain(&domain)?.is_none();
 
-            // Check our robot rules are valid for that URL
-            todo!();
+                        if !out_of_urls {
+                            return Ok(false);
+                        }
 
-            let permit = self.semaphore.acquire().await?;
+                        // TODO delete the domain from the store
 
-            // TODO process errors
-            match self.raker.rake(url, intent, &self.client).await {
-                Ok(rake_outcome) => match self.process_outcome(&url, rake_outcome).await? {
-                    NextAction::Continue => {
-                        todo!()
-                    }
-                    NextAction::ChangeDomain => {
-                        todo!()
-                    }
-                },
-                Err(_) => {
+                        Ok(true)
+                    })
+                    .await?;
+                if out_of_urls {
+                    break;
+                } else {
+                    continue;
+                }
+            };
+
+            let url = Url::parse(&url_str)?;
+
+            // Check our robot rules are valid for that URL.
+            let robot_url = robots_txt_url_for(&url)?;
+            if Some(&robot_url) != current_robot_rules_url.as_ref() {
+                // We need to update our robot rules!
+                current_robot_rules = self.get_robot_rules(&url).await?;
+                current_robot_rules_url = Some(robot_url);
+            }
+
+            // Check our robot rules allow us to visit the desired URL
+            if let Some(robot_rules) = current_robot_rules.as_ref() {
+                if !robot_rules.allow(url.path().as_bytes()) {
+                    // Permanently reject this.
                     todo!();
                 }
             }
+
+            if let Some(wait_until) = wait_until.take() {
+                // Sleep to respect a crawl-delay
+                tokio::time::sleep_until(wait_until).await;
+            }
+
+            // Now acquire a permit to go and fetch the desired URL
+            let permit = self.semaphore.acquire().await?;
+            let raked = self.raker.rake(&url, url_record.intent, &self.client).await;
             drop(permit);
+
+            let rake_outcome = raked.unwrap_or_else(|err| {
+                warn!("Failed to rake {:?}: {:?}", url, err);
+                // Treat this as a temporary rejection (backoff).
+                RakeOutcome::TemporaryFailure(TemporaryFailure {
+                    reason: TemporaryFailureReason::UnknownClientError(format!(
+                        "Failed to rake {:?}: {:?}",
+                        url, err
+                    )),
+                    // Back off for a day: this ought to be enough time for the operator to fix the problem... maybe?
+                    backoff_sec: 86400,
+                })
+            });
+
+            match self.process_outcome(&url, rake_outcome).await? {
+                NextAction::Continue => {
+                    // Take that URL off the queue
+                    let now = Utc::today();
+                    let record = UrlVisitedRecord {
+                        last_visited_days: date_to_quickpeep_days(&now)?,
+                    };
+
+                    let domain = domain.clone();
+                    self.store
+                        .async_rw_txn(move |txn| {
+                            txn.mark_url_as_visited(domain, url_str, record)?;
+                            Ok(())
+                        })
+                        .await?;
+                }
+                NextAction::ChangeDomain => {
+                    todo!()
+                }
+            }
         }
 
         Ok(())
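`wait_until` is declared and consumed in this hunk but never set yet; presumably a later change will derive it from the robots.txt crawl-delay. A hedged sketch of that calculation (the crawl-delay accessor itself is an assumption, not shown in this commit):

```rust
// Assumption-labelled sketch: turn an optional crawl delay (in seconds) into the next allowed
// fetch time, suitable for `tokio::time::sleep_until`.
use std::time::Duration;
use tokio::time::Instant;

fn next_wait(crawl_delay_secs: Option<u64>) -> Option<Instant> {
    crawl_delay_secs.map(|secs| Instant::now() + Duration::from_secs(secs))
}
```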
@@ -1,6 +1,6 @@
 use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString, MdbxU32};
 use crate::storage::migrations::{MIGRATION_KEY, MIGRATION_VERSION};
-use crate::storage::records::{ActiveDomainRecord, QueueUrlRecord};
+use crate::storage::records::{ActiveDomainRecord, QueueUrlRecord, UrlVisitedRecord};
 use anyhow::{bail, ensure, Context};
 use libmdbx::{
     Database, DatabaseFlags, Environment, EnvironmentFlags, Transaction, TransactionKind,
@@ -14,7 +14,7 @@ use std::sync::Arc;
 
 mod mdbx_helper_types;
 mod migrations;
-mod records;
+pub mod records;
 
 /// The databases available in an environment.
 pub struct Databases<'env> {
@@ -249,6 +249,33 @@ impl<'a> RakerTxn<'a, RW> {
             Ok(false)
         }
     }
+
+    /// Marks a URL as visited and takes it out of the queue.
+    pub fn mark_url_as_visited(
+        &self,
+        domain: String,
+        url_str: String,
+        record: UrlVisitedRecord,
+    ) -> anyhow::Result<()> {
+        let queue_urls = &self.mdbx.borrow_dbs().queue_urls;
+        let visited_urls = &self.mdbx.borrow_dbs().visited;
+
+        let queue_key = format!("{}\n{}", domain, url_str);
+
+        ensure!(
+            self.mdbx_txn.del(&queue_urls, queue_key.as_bytes(), None)?,
+            "No queued URL to delete"
+        );
+
+        self.mdbx_txn.put(
+            visited_urls,
+            url_str.as_bytes(),
+            &MdbxBare(record).as_bytes(),
+            WriteFlags::empty(),
+        )?;
+
+        Ok(())
+    }
 }
 
 /// Read-only implementations (but can also be used on RW transactions)
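As the task loop above already shows, this method is meant to be called from inside the store's asynchronous read-write transaction helper. A condensed sketch of that call site, mirroring task.rs; the wrapper function is illustrative, and `RakerStore`/`UrlVisitedRecord` are assumed to be imported from the raker's storage module:

```rust
use chrono::Utc;
use quickpeep_utils::dates::date_to_quickpeep_days;

// Sketch: record a visit and pop the URL off the queue in one transaction.
async fn mark_visited(store: &RakerStore, domain: String, url_str: String) -> anyhow::Result<()> {
    let record = UrlVisitedRecord {
        last_visited_days: date_to_quickpeep_days(&Utc::today())?,
    };
    store
        .async_rw_txn(move |txn| {
            // Deletes the "{domain}\n{url_str}" queue entry and writes the visited record.
            txn.mark_url_as_visited(domain, url_str, record)?;
            Ok(())
        })
        .await
}
```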
@@ -1,3 +1,4 @@
+use crate::raking::RakeIntent;
 use serde::{Deserialize, Serialize};
 
 #[derive(Clone, Debug, Deserialize, Serialize)]
@@ -7,7 +8,7 @@ pub struct ActiveDomainRecord {
 }
 
 #[derive(Clone, Debug, Deserialize, Serialize)]
-pub struct VisitedDomainRecord {
+pub struct UrlVisitedRecord {
     /// Number of minutes since the QuickPeep Epoch that this page was last raked at.
     /// We store minutes to give us 60× the range of times.
     /// We'd really rather stick with 32-bit ints to reduce the space storage requirements.
|
||||||
|
|
||||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||||
pub struct QueueUrlRecord {
|
pub struct QueueUrlRecord {
|
||||||
// TODO
|
pub intent: RakeIntent, // TODO
|
||||||
}
|
}
|
||||||
|
|
|
@@ -1,3 +1,2 @@
 pub mod dates;
 pub mod lazy;
-pub mod threadguard;
@@ -1,56 +0,0 @@
-use std::sync::mpsc;
-use std::sync::mpsc::Sender;
-use std::thread::JoinHandle;
-use anyhow::anyhow;
-use tokio::sync::oneshot::Sender as OneshotSender;
-use log::warn;
-
-pub type ThreadGuardJob<T, R> = Box<dyn FnOnce(&T) -> anyhow::Result<R> + Send + Sync>;
-
-/// Helper to use structs that aren't Sync or Send.
-/// Spawns a thread upon which operations can be sent to.
-pub struct ThreadGuard<T, R> {
-    jobs: Sender<(ThreadGuardJob<T, R>, OneshotSender<anyhow::Result<R>>)>,
-    thread: Option<JoinHandle<anyhow::Result<()>>>
-}
-
-impl<T, R> Drop for ThreadGuard<T, R> {
-    fn drop(&mut self) {
-        if let Some(thread) = self.thread.take() {
-            if let Err(err) = thread.join()
-                .unwrap_or_else(|e| Err(anyhow!("Can't join: {:?}", e))) {
-                warn!("Failed to join threadguard, or it returned an error.");
-            }
-        }
-
-    }
-}
-
-impl<T: 'static, R: Send + 'static> ThreadGuard<T, R> {
-    pub fn new(initialiser: Box<dyn FnOnce() -> T + Send>) -> ThreadGuard<T, R> {
-        let (jobs_tx, jobs_rx) = mpsc::channel::<(ThreadGuardJob<T, R>, OneshotSender<anyhow::Result<R>>)>();
-
-        let thread = Some(std::thread::spawn(move || -> anyhow::Result<()> {
-            let object = initialiser();
-
-            while let Ok((job, job_return_tx)) = jobs_rx.recv() {
-                let result = job(&object);
-                job_return_tx.send(result);
-            }
-
-            Ok(())
-        }));
-
-        ThreadGuard {
-            jobs: jobs_tx,
-            thread
-        }
-    }
-
-    pub async fn apply(&self, function: ThreadGuardJob<T, R>) -> anyhow::Result<R> {
-        let (return_tx, return_rx) = tokio::sync::oneshot::channel();
-        self.jobs.send((function, return_tx))?;
-
-        Ok(return_rx.await??)
-    }
-}