662 lines
21 KiB
Rust
662 lines
21 KiB
Rust
use crate::raking::analysis::{
|
|
analyse_with_ad_block_cosmetic_filter, guess_document_language, IpSet,
|
|
};
|
|
use adblock::engine::Engine;
|
|
use anyhow::{bail, Context};
|
|
use chrono::{DateTime, FixedOffset, Utc};
|
|
use cylon::Cylon;
|
|
use futures_util::stream::StreamExt;
|
|
use html5ever::tendril::fmt::Slice;
|
|
use itertools::Itertools;
|
|
use kuchiki::traits::TendrilSink;
|
|
use kuchiki::NodeRef;
|
|
use lazy_static::lazy_static;
|
|
use log::debug;
|
|
use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
|
|
use quickpeep_structs::rake_entries::{
|
|
AnalysisAntifeatures, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind,
|
|
};
|
|
use quickpeep_utils::Lazy;
|
|
use reqwest::header::HeaderMap;
|
|
use reqwest::{Client, Response, Url};
|
|
use serde::{Deserialize, Serialize};
|
|
use sitemap::reader::SiteMapEntity;
|
|
use std::collections::HashSet;
|
|
use std::time::Duration;
|
|
use tokio::time::Instant;
|
|
|
|
pub mod analysis;
|
|
|
|
/// 4 MiB ought to be enough for anybody.
pub const SIZE_LIMIT: usize = 4 * 1024 * 1024;

/// If it's not loaded in ten seconds, that's pretty severe.
/// 10 seconds is almost too generous (assuming that the best of things can run slowly sometimes).
pub const TIME_LIMIT: Duration = Duration::from_secs(10);

/// User-Agent token presented to remote servers and matched against robots.txt rules.
// `&str` in a const is implicitly `&'static str`; the explicit lifetime was redundant.
pub const RAKER_USER_AGENT: &str = "QuickPeepBot";
|
|
|
|
/// Overall result of raking one URL: a successfully-processed resource
/// (page, feed or sitemap), a redirect to follow, or a failure.
pub enum RakeOutcome {
    /// An HTML page was raked; carries the page entry and its references.
    RakedPage(RakedPage),
    /// A feed was raked; carries the URLs discovered in the feed.
    RakedFeed(Vec<UrlRaked>),
    /// A sitemap was raked; carries the URLs discovered in the sitemap.
    RakedSitemap(Vec<UrlRaked>),
    /// The resource lives elsewhere (HTTP redirect or non-canonical page).
    Redirect {
        reason: RedirectReason,
        new_url: Url,
    },
    /// A failure that may be worth retrying after a backoff.
    TemporaryFailure(TemporaryFailure),
    /// A failure that is not expected to resolve by retrying.
    PermanentFailure(PermanentFailure),
}
|
|
|
|
/// Why a rake produced a `Redirect` outcome rather than content.
#[derive(Debug)]
pub enum RedirectReason {
    /// The page redirected somewhere else.
    Redirected {
        /// HTTP Status Code of the redirect
        http_code: u16,
    },
    /// The page was not canonical, and should not be indexed.
    NotCanonical,
}
|
|
|
|
/// A URL discovered during raking (from a feed, a sitemap, or robots.txt),
/// together with what kind of resource we expect to find there.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct UrlRaked {
    pub url: Url,
    /// When the resource last changed, if the source declared it.
    pub last_changed: Option<DateTime<Utc>>,
    /// What kind of resource we expect at this URL.
    pub intent: RakeIntent,
}
|
|
|
|
/// The product of raking an HTML page: the analysed page content plus the
/// references (links and feeds) found on it.
#[derive(Serialize)]
pub struct RakedPage {
    // The analysed document and its antifeature flags.
    page_entry: RakedPageEntry,
    // Outbound references discovered on the page.
    referrer_entry: RakedReferrerEntry,
}
|
|
|
|
/// Parsed robots.txt information for a site.
pub struct RobotsTxt {
    /// Sitemap URLs advertised by `Sitemap:` lines in the robots.txt file.
    pub sitemaps: Vec<UrlRaked>,
    /// Compiled access rules (compiled for RAKER_USER_AGENT).
    pub rules: Cylon,
}
|
|
|
|
/// A failure that may resolve on its own; the URL can be retried later.
#[derive(Debug)]
pub struct TemporaryFailure {
    /// Why the rake failed.
    pub reason: TemporaryFailureReason,
    /// How long to wait, in seconds, before retrying this URL.
    pub backoff_sec: u32,
}
|
|
|
|
/// A failure that is not expected to resolve by retrying.
#[derive(Debug)]
pub struct PermanentFailure {
    /// Why the rake failed permanently.
    pub reason: PermanentFailureReason,
}
|
|
|
|
/// Reasons a rake can fail temporarily.
#[derive(Debug)]
pub enum TemporaryFailureReason {
    /// A needed piece of information was absent (the string names what was
    /// missing, e.g. the "content-type" header).
    MissingInformation(String),
    /// The remote server returned a 5xx status (the code is included).
    ServerError(u16),
}
|
|
|
|
/// Reasons a rake fails permanently.
#[derive(Debug)]
pub enum PermanentFailureReason {
    /// The server denied access (4xx status code included).
    ResourceDenied(u16),
    /// The document is in a language we don't handle (language code included).
    // NOTE(review): constructed by callers outside this file — confirm semantics there.
    WrongLanguage(String),
    /// The content-type is not one we can rake (MIME type included).
    UnknownContentType(String),
}
|
|
|
|
/// What kind of resource the raker expects (or is willing) to find at a URL.
/// `rake` uses this to gate which raking strategies are attempted.
#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub enum RakeIntent {
    /// Accept any supported resource kind.
    Any,
    /// Expect an HTML page.
    Page,
    /// Expect a feed (RSS / Atom / JSON Feed).
    Feed,
    /// Expect an XML sitemap (or sitemap index).
    SiteMap,
}
|
|
|
|
lazy_static! {
    /// MIME types we might expect in content-type headers for sitemaps
    static ref SITEMAP_MIME_TYPES: HashSet<&'static str> =
        HashSet::from_iter(vec!["text/xml", "application/xml",]);

    /// MIME types we might expect in content-type headers
    static ref FEED_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
        "text/xml",
        "application/xml",
        "application/atom+xml",
        "application/rss+xml",
        "application/rdf+xml",
        "application/json",
        "application/feed+json"
    ]);

    /// MIME types we might expect in <link> tags
    static ref FEED_LINK_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
        "application/atom+xml",
        "application/rss+xml",
        "application/rdf+xml",
        "application/feed+json"
    ]);
}
|
|
|
|
async fn response_to_bytes_limited(
|
|
response: Response,
|
|
size_limit: usize,
|
|
time_limit: Duration,
|
|
) -> anyhow::Result<Vec<u8>> {
|
|
let deadline = Instant::now() + time_limit;
|
|
let mut buffer = Vec::new();
|
|
let mut bytestream = response.bytes_stream();
|
|
|
|
loop {
|
|
tokio::select! {
|
|
next_chunk = bytestream.next() => {
|
|
match next_chunk {
|
|
Some(next_chunk) => {
|
|
buffer.extend_from_slice(next_chunk?.as_bytes());
|
|
if buffer.len() > size_limit {
|
|
bail!("Exceeds size limit");
|
|
}
|
|
},
|
|
None => {
|
|
// Finished! :)
|
|
break;
|
|
}
|
|
}
|
|
},
|
|
_ = tokio::time::sleep_until(deadline) => {
|
|
bail!("Exceeded time limit");
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(buffer)
|
|
}
|
|
|
|
/// The raking engine: fetches URLs and analyses what comes back.
pub struct Raker {
    /// Ad-block engines, each paired with the antifeature flag it detects
    /// when its cosmetic filters trip on a page.
    pub adblock_engines: Vec<(AnalysisAntifeatures, Engine)>,
    /// IP addresses whose use marks a page with an antifeature
    /// (used to detect Cloudflare-served pages).
    pub antifeature_ip_set: IpSet,
}
|
|
|
|
impl Raker {
|
|
pub async fn rake(
|
|
&self,
|
|
url: &Url,
|
|
intent: RakeIntent,
|
|
client: &Client,
|
|
) -> anyhow::Result<RakeOutcome> {
|
|
let response = client.get(url.clone()).send().await?;
|
|
|
|
let is_cf = if let Some(remote_addr) = response.remote_addr() {
|
|
self.antifeature_ip_set.contains(remote_addr.ip())
|
|
} else {
|
|
false
|
|
};
|
|
|
|
let http_code = response.status().as_u16();
|
|
|
|
if response.status().is_redirection() {
|
|
if let Some(redirect_target) = response.headers().get("location") {
|
|
let new_url = url
|
|
.join(
|
|
redirect_target
|
|
.to_str()
|
|
.context("Failed to convert Location header to str")?,
|
|
)
|
|
.context("Failed to resolve Location header target")?;
|
|
|
|
return Ok(RakeOutcome::Redirect {
|
|
reason: RedirectReason::Redirected { http_code },
|
|
new_url,
|
|
});
|
|
} else {
|
|
bail!(
|
|
"Redirection {:?} received, but no Location header.",
|
|
response.status()
|
|
);
|
|
}
|
|
}
|
|
|
|
if response.status().is_client_error() {
|
|
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
|
|
reason: PermanentFailureReason::ResourceDenied(http_code),
|
|
}));
|
|
}
|
|
|
|
if response.status().is_server_error() {
|
|
return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure {
|
|
reason: TemporaryFailureReason::ServerError(http_code),
|
|
// Try again tomorrow. Maybe the server is overloaded?
|
|
backoff_sec: 86400,
|
|
}));
|
|
}
|
|
|
|
if !response.status().is_success() {
|
|
bail!("Unknown failure code: {:?}", response.status());
|
|
}
|
|
|
|
let content_type = if let Some(content_type) = response.headers().get("content-type") {
|
|
let content_type = content_type
|
|
.to_str()
|
|
.context("Can't convert content-type to str")?;
|
|
eprintln!("CT {:?}", content_type);
|
|
content_type.split(";").next().unwrap().trim().to_owned()
|
|
} else {
|
|
return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure {
|
|
reason: TemporaryFailureReason::MissingInformation("content-type".to_owned()),
|
|
backoff_sec: 86400 * 7,
|
|
}));
|
|
};
|
|
|
|
let headers = response.headers().clone();
|
|
let content = response_to_bytes_limited(response, SIZE_LIMIT, TIME_LIMIT).await?;
|
|
|
|
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page)
|
|
{
|
|
// We don't try any fallbacks for an HTML page
|
|
return Ok(self
|
|
.rake_html_page(&content, url, is_cf, &headers)
|
|
.context("Raking HTML page")?);
|
|
}
|
|
|
|
if FEED_MIME_TYPES.contains(content_type.as_str())
|
|
&& (intent == RakeIntent::Any || intent == RakeIntent::Feed)
|
|
{
|
|
match rake_feed(&content, url) {
|
|
Ok(feed) => {
|
|
return Ok(RakeOutcome::RakedFeed(feed));
|
|
}
|
|
Err(error) => {
|
|
debug!("Failed to rake as feed: {:?}", error);
|
|
}
|
|
}
|
|
}
|
|
|
|
if SITEMAP_MIME_TYPES.contains(content_type.as_str())
|
|
&& (intent == RakeIntent::Any || intent == RakeIntent::SiteMap)
|
|
{
|
|
match rake_sitemap(&content) {
|
|
Ok(sitemap) => {
|
|
return Ok(RakeOutcome::RakedSitemap(sitemap));
|
|
}
|
|
Err(error) => {
|
|
debug!("Failed to rake as sitemap: {:?}", error);
|
|
}
|
|
}
|
|
}
|
|
|
|
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
|
|
reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()),
|
|
}));
|
|
}
|
|
|
|
pub fn rake_html_page(
|
|
&self,
|
|
content: &[u8],
|
|
url: &Url,
|
|
is_cf: bool,
|
|
headers: &HeaderMap,
|
|
) -> anyhow::Result<RakeOutcome> {
|
|
let content_str = std::str::from_utf8(content)?;
|
|
|
|
let root_node: NodeRef = kuchiki::parse_html().one(content_str);
|
|
|
|
// See whether this page is at the canonical URL for the page.
|
|
// If it's not, then we redirect the raker to the canonical URL.
|
|
if let Ok(canonical_link_node) = root_node.select_first("head link[rel=canonical]") {
|
|
if let Some(canonical_href) = canonical_link_node.attributes.borrow().get("href") {
|
|
let canonical_url = url
|
|
.join(canonical_href)
|
|
.context("Failed to resolve or parse canonical URL")?;
|
|
|
|
if &canonical_url != url {
|
|
return Ok(RakeOutcome::Redirect {
|
|
reason: RedirectReason::NotCanonical,
|
|
new_url: canonical_url,
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
// Try and dig up the page's language.
|
|
// First try <html lang=...> since this is the modern way, and potentially the most trustworthy...
|
|
let mut language = None;
|
|
|
|
if let Ok(html_node) = root_node.select_first("html") {
|
|
if let Some(lang) = html_node.attributes.borrow().get("lang") {
|
|
language = Some(lang.trim().to_string());
|
|
}
|
|
}
|
|
|
|
if language.is_none() {
|
|
// Next fallback: prefer the content-language header baked into the page itself
|
|
if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") {
|
|
if let Some(lang) = meta_node.attributes.borrow().get("content") {
|
|
language = Some(lang.trim().to_string());
|
|
}
|
|
}
|
|
}
|
|
|
|
if language.is_none() {
|
|
// Next fallback: prefer the content-language received as a header
|
|
if let Some(lang) = headers.get("content-language") {
|
|
language = Some(lang.to_str()?.to_owned());
|
|
}
|
|
}
|
|
|
|
let mut antifeature_flags = AnalysisAntifeatures::empty();
|
|
|
|
if is_cf {
|
|
antifeature_flags |= AnalysisAntifeatures::CLOUDFLARE;
|
|
}
|
|
|
|
for (engine_antifeature_flag, adblock_engine) in &self.adblock_engines {
|
|
match analyse_with_ad_block_cosmetic_filter(
|
|
&root_node,
|
|
adblock_engine,
|
|
url.as_str(),
|
|
true,
|
|
) {
|
|
Ok(cosmetic_filters_tripped) => {
|
|
eprintln!("?cosmetic filters tripped: {}", cosmetic_filters_tripped);
|
|
antifeature_flags |= *engine_antifeature_flag;
|
|
}
|
|
Err(err) => {
|
|
eprintln!("Cosmetic Filter Err {:?}", err);
|
|
}
|
|
};
|
|
}
|
|
|
|
let dense_doc = DenseTree::from_body(root_node.clone());
|
|
let dense_doc_text = Lazy::new(Box::new(|| DenseTree::generate_textual_format(&dense_doc)));
|
|
//eprintln!("^^^^^\n{}\n^^^^^", *dense_doc_text);
|
|
|
|
if language.is_none() {
|
|
// Final fallback: guess the language
|
|
language = guess_document_language(&*dense_doc_text);
|
|
}
|
|
|
|
// Try and enforce some consistency in the language code;
|
|
// we want codes like en_US rather than en-us.
|
|
if let Some(language) = language.as_mut() {
|
|
normalise_language(language);
|
|
}
|
|
|
|
let mut title = "".to_owned();
|
|
|
|
if let Ok(title_node) = root_node.select_first("head title") {
|
|
title = title_node.text_contents();
|
|
}
|
|
|
|
let mut feeds = Vec::new();
|
|
let mut icon = None;
|
|
|
|
for link_node in root_node.select("head link").into_iter().flatten() {
|
|
if let Some(rel) = link_node.attributes.borrow().get("rel") {
|
|
let rels = rel.split_whitespace().collect_vec();
|
|
if rels.contains(&"icon") {
|
|
// This is an icon
|
|
if let Some(href) = link_node.attributes.borrow().get("href") {
|
|
let icon_url = url
|
|
.join(href)
|
|
.context("Failed to resolve or parse canonical URL to icon")?;
|
|
|
|
icon = Some(icon_url);
|
|
}
|
|
} else if rels.contains(&"alternate") {
|
|
if let Some(rel_type) = link_node.attributes.borrow().get("type") {
|
|
if FEED_LINK_MIME_TYPES.contains(rel_type) {
|
|
if let Some(href) = link_node.attributes.borrow().get("href") {
|
|
let feed_url = url
|
|
.join(href)
|
|
.context("Failed to resolve or parse canonical URL to feed")?;
|
|
|
|
feeds.push(feed_url);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
let mut readability =
|
|
quickpeep_moz_readability::Readability::new_from_node(root_node.clone());
|
|
if let Err(err) = readability.parse(url.as_str()) {
|
|
debug!("Failed to analyse readability: {:?}", err);
|
|
}
|
|
|
|
eprintln!("{:#?}", readability.metadata);
|
|
|
|
if title.is_empty() && !readability.metadata.title().is_empty() {
|
|
// Fall back to the readability-derived page title
|
|
title = readability.metadata.title().to_owned();
|
|
}
|
|
|
|
let mut document = DenseDocument {
|
|
head: DenseHead {
|
|
title,
|
|
language: language.unwrap_or(String::with_capacity(0)),
|
|
icon: icon
|
|
.map(|url| url.as_str().to_owned())
|
|
.unwrap_or(String::with_capacity(0)),
|
|
},
|
|
body_content: Vec::with_capacity(0),
|
|
body_remainder: Vec::with_capacity(0),
|
|
};
|
|
|
|
if let Some(article_node) = readability.article_node {
|
|
document.body_remainder = DenseTree::from_body(root_node.clone());
|
|
document.body_content = DenseTree::from_body(article_node);
|
|
}
|
|
|
|
let bare_size = serde_bare::to_vec(&dense_doc)?.len();
|
|
eprintln!("CS {:?} → {:?}", content.len(), bare_size);
|
|
|
|
let references = find_references(&document, &feeds, url);
|
|
Ok(RakeOutcome::RakedPage(RakedPage {
|
|
page_entry: RakedPageEntry {
|
|
analysed_antifeatures: antifeature_flags,
|
|
document,
|
|
},
|
|
referrer_entry: RakedReferrerEntry { references },
|
|
}))
|
|
}
|
|
}
|
|
|
|
pub fn find_references(
|
|
doc: &DenseDocument,
|
|
feeds: &Vec<Url>,
|
|
page_url: &Url,
|
|
) -> Vec<RakedReference> {
|
|
let mut refs = Vec::new();
|
|
|
|
fn add_link_refs(tree: &Vec<DenseTree>, refs: &mut Vec<RakedReference>, page_url: &Url) {
|
|
for node in tree {
|
|
match node {
|
|
DenseTree::Heading1(children) => {
|
|
add_link_refs(children, refs, page_url);
|
|
}
|
|
DenseTree::Heading2(children) => {
|
|
add_link_refs(children, refs, page_url);
|
|
}
|
|
DenseTree::Heading3(children) => {
|
|
add_link_refs(children, refs, page_url);
|
|
}
|
|
DenseTree::Heading4(children) => {
|
|
add_link_refs(children, refs, page_url);
|
|
}
|
|
DenseTree::Heading5(children) => {
|
|
add_link_refs(children, refs, page_url);
|
|
}
|
|
DenseTree::Heading6(children) => {
|
|
add_link_refs(children, refs, page_url);
|
|
}
|
|
DenseTree::Link {
|
|
children,
|
|
href,
|
|
nofollow,
|
|
} => {
|
|
if !nofollow {
|
|
if let Ok(full_url) = page_url.join(&href) {
|
|
refs.push(RakedReference {
|
|
target: full_url.to_string(),
|
|
kind: ReferenceKind::CanonicalUrl,
|
|
})
|
|
}
|
|
}
|
|
add_link_refs(children, refs, page_url);
|
|
}
|
|
DenseTree::Image { .. } => {}
|
|
DenseTree::Text(_) => {}
|
|
}
|
|
}
|
|
}
|
|
|
|
add_link_refs(&doc.body_content, &mut refs, &page_url);
|
|
add_link_refs(&doc.body_remainder, &mut refs, &page_url);
|
|
|
|
for feed in feeds {
|
|
refs.push(RakedReference {
|
|
target: feed.as_str().to_owned(),
|
|
kind: ReferenceKind::HeaderLinkedFeed,
|
|
});
|
|
}
|
|
|
|
refs
|
|
}
|
|
|
|
/// Normalise a language code in place: lowercase the primary subtag, use `_`
/// as the separator, and uppercase the dialect/region subtag — e.g.
/// `en-us` → `en_US`, `PT-br` → `pt_BR`, `FR` → `fr`.
pub fn normalise_language(lang_string: &mut String) {
    let mut pieces = lang_string
        .to_lowercase()
        .replace('-', "_")
        .split('_')
        .map(|s| s.to_owned())
        .collect::<Vec<_>>();
    if let Some(dialect) = pieces.get_mut(1) {
        *dialect = dialect.to_uppercase();
    }
    // BUGFIX: the normalised pieces were never written back, so the function
    // previously only lowercased the string and never produced e.g. `en_US`.
    *lang_string = pieces.join("_");
}
|
|
|
|
pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
|
|
let feed = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?;
|
|
|
|
let mut urls = Vec::new();
|
|
|
|
for entry in feed.entries {
|
|
let link = if let Some(link) = entry.links.get(0) {
|
|
link
|
|
} else {
|
|
continue;
|
|
};
|
|
let url = Url::parse(&link.href).context("parsing URL in feed")?; // TODO ignore failure here...?
|
|
|
|
let last_changed = entry.updated.or(entry.published);
|
|
|
|
urls.push(UrlRaked {
|
|
url,
|
|
last_changed,
|
|
intent: RakeIntent::Page,
|
|
});
|
|
}
|
|
|
|
// TODO paginated feeds (e.g. JSON Feed next_url)
|
|
|
|
Ok(urls)
|
|
}
|
|
|
|
pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<Vec<UrlRaked>> {
|
|
let curs = std::io::Cursor::new(content);
|
|
let reader = sitemap::reader::SiteMapReader::new(curs);
|
|
|
|
let mut urls = Vec::new();
|
|
|
|
for entry in reader {
|
|
match &entry {
|
|
SiteMapEntity::Url(url) => {
|
|
let loc = if let Some(loc) = url.loc.get_url() {
|
|
loc
|
|
} else {
|
|
continue;
|
|
};
|
|
|
|
urls.push(UrlRaked {
|
|
url: loc,
|
|
last_changed: url
|
|
.lastmod
|
|
.get_time()
|
|
.map(|dt: DateTime<FixedOffset>| dt.into()),
|
|
intent: RakeIntent::Page,
|
|
});
|
|
}
|
|
SiteMapEntity::SiteMap(sitemap) => {
|
|
let loc = if let Some(loc) = sitemap.loc.get_url() {
|
|
loc
|
|
} else {
|
|
continue;
|
|
};
|
|
|
|
urls.push(UrlRaked {
|
|
url: loc,
|
|
last_changed: sitemap
|
|
.lastmod
|
|
.get_time()
|
|
.map(|dt: DateTime<FixedOffset>| dt.into()),
|
|
intent: RakeIntent::SiteMap,
|
|
});
|
|
}
|
|
SiteMapEntity::Err(error) => {
|
|
debug!("Sitemap error {:?}", error);
|
|
}
|
|
}
|
|
eprintln!("{:?}", entry);
|
|
}
|
|
|
|
if urls.is_empty() {
|
|
bail!("No URLs or Sitemaps picked up from sitemap; is it bad?");
|
|
}
|
|
|
|
Ok(urls)
|
|
}
|
|
|
|
pub async fn get_robots_txt_for(url: &Url, client: &Client) -> anyhow::Result<Option<RobotsTxt>> {
|
|
let robots_url = url
|
|
.join("/robots.txt")
|
|
.context("Whilst resolving /robots.txt on URL")?;
|
|
let resp = client.get(robots_url.clone()).send().await?;
|
|
|
|
if !resp.status().is_success() {
|
|
let code = resp.status().as_u16();
|
|
if code == 404 || code == 410 {
|
|
// not found or gone? Assume there is intentionally no robots.txt file.
|
|
return Ok(None);
|
|
}
|
|
|
|
bail!("Failed to get {:?}: {:?}", robots_url, resp.status());
|
|
}
|
|
|
|
let bytes = resp.bytes().await?;
|
|
|
|
Ok(decode_robots_txt(&bytes).await?)
|
|
}
|
|
|
|
pub async fn decode_robots_txt(bytes: &[u8]) -> anyhow::Result<Option<RobotsTxt>> {
|
|
let mut sitemaps = Vec::new();
|
|
|
|
for line in bytes.split(|b| *b == b'\n') {
|
|
let line = line.to_ascii_lowercase();
|
|
if line.starts_with(b"sitemap:") {
|
|
if let Ok(value) = std::str::from_utf8(&line[8..]) {
|
|
if let Ok(url) = Url::parse(value.trim()) {
|
|
sitemaps.push(UrlRaked {
|
|
url,
|
|
last_changed: None,
|
|
intent: RakeIntent::SiteMap,
|
|
});
|
|
} else {
|
|
debug!("Failed to parse sitemap value as a URL")
|
|
}
|
|
} else {
|
|
debug!("Failed to parse sitemap value as UTF-8")
|
|
}
|
|
}
|
|
}
|
|
|
|
let rules = cylon::Compiler::new(RAKER_USER_AGENT)
|
|
.compile(bytes.as_bytes())
|
|
.await?;
|
|
|
|
Ok(Some(RobotsTxt { sitemaps, rules }))
|
|
}
|