// quickpeep/quickpeep_raker/src/raking.rs
use std::collections::{HashMap, HashSet};
use std::error::Error;
use std::fmt::{Debug, Display, Formatter};
use std::io::Cursor;
use std::str::FromStr;
use std::time::Duration;
use ::metrics::increment_counter;
use anyhow::{anyhow, bail, Context};
use chrono::{DateTime, FixedOffset, Utc};
use cylon::Cylon;
use futures_util::stream::StreamExt;
use html5ever::tendril::fmt::Slice;
use image::imageops::FilterType;
use image::{GenericImageView, ImageFormat};
use itertools::Itertools;
use lazy_static::lazy_static;
use log::{debug, info};
use reqwest::header::HeaderMap;
use reqwest::{Client, Response, Url};
use serde::{Deserialize, Serialize};
use sitemap::reader::SiteMapEntity;
use tokio::time::Instant;
use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind};
use crate::raking::analysis::IpSet;
use crate::raking::page_extraction::{ExtractedPage, PageExtractionService};
pub mod analysis;
pub mod page_extraction;
pub mod rakemetrics;
pub mod references;
pub mod task;
/// 4 MiB ought to be enough for anybody.
pub const SIZE_LIMIT: usize = 4 * 1024 * 1024;
/// If it's not loaded in ten seconds, that's pretty severe.
/// Ten seconds is almost too generous, even allowing that the best of servers can run slowly sometimes.
pub const TIME_LIMIT: Duration = Duration::from_secs(10);
pub const RAKER_USER_AGENT: &str = "QuickPeepBot";
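/// The overall outcome of raking one URL.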
pub enum RakeOutcome {
RakedPage(RakedPage),
RakedFeed(Vec<UrlRaked>),
RakedSitemap(Vec<UrlRaked>),
RakedIcon(RakedIcon),
Redirect {
reason: RedirectReason,
new_url: Url,
},
TemporaryFailure(TemporaryFailure),
PermanentFailure(PermanentFailure),
}
#[derive(Debug)]
pub enum RedirectReason {
/// The page redirected somewhere else.
Redirected {
/// HTTP Status Code of the redirect
http_code: u16,
},
/// The page was not canonical, and should not be indexed.
NotCanonical,
/// Upgrade from a HTTP to HTTPS URL (or equivalent).
SecureUpgrade,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct UrlRaked {
pub url: Url,
pub last_changed: Option<DateTime<Utc>>,
pub intent: RakeIntent,
}
#[derive(Serialize)]
pub struct RakedPage {
pub page_entry: RakedPageEntry,
pub referrer_entry: RakedReferrerEntry,
}
pub struct RakedIcon {
pub original_size_in_bytes: usize,
pub webp_bytes: Vec<u8>,
}
pub struct RobotsTxt {
pub sitemaps: Vec<UrlRaked>,
pub rules: Cylon,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TemporaryFailure {
pub reason: TemporaryFailureReason,
pub backoff_sec: u32,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PermanentFailure {
pub reason: PermanentFailureReason,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum TemporaryFailureReason {
MissingInformation(String),
ServerError(u16),
UnknownClientError(String),
ExcruciatingCrawlDelay(u64),
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum PermanentFailureReason {
ResourceDenied(u16),
DeniedToRobots,
WrongLanguage(String),
UnknownContentType(String),
ExceedsSizeLimit,
}
impl Display for PermanentFailure {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
Debug::fmt(&self, f)
}
}
impl Error for PermanentFailure {}
#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub enum RakeIntent {
Any,
Page,
Feed,
SiteMap,
Icon,
}
impl FromStr for RakeIntent {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
Ok(match s.to_lowercase().as_ref() {
"any" => RakeIntent::Any,
"page" => RakeIntent::Page,
"feed" => RakeIntent::Feed,
"sitemap" => RakeIntent::SiteMap,
"icon" => RakeIntent::Icon,
other => {
bail!("Unrecognised intent: {:?}", other)
}
})
}
}
impl From<ReferenceKind> for RakeIntent {
fn from(kind: ReferenceKind) -> Self {
match kind {
ReferenceKind::CanonicalUrl | ReferenceKind::SecureUpgrade => {
// FIXME We don't know what this is a canonical URL for. Suppose it doesn't matter...
RakeIntent::Any
}
ReferenceKind::Redirect => {
// FIXME We don't know what this is a redirect for. Suppose it doesn't matter...
RakeIntent::Any
}
ReferenceKind::Link => {
// Links can go to pages but also to RSS feeds
RakeIntent::Any
}
ReferenceKind::HeaderLinkedFeed => RakeIntent::Feed,
ReferenceKind::FeedEntry => RakeIntent::Page,
ReferenceKind::SitemapEntry => RakeIntent::Page,
}
}
}
impl RakeIntent {
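    /// Returns true if this intent will accept a resource served with the
    /// given MIME type (expected to be lowercased, with any parameters
    /// already stripped off).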
pub fn supports_mime_type(&self, mime_type: &str) -> bool {
match self {
RakeIntent::Any => ALL_MIME_TYPES.contains(mime_type),
RakeIntent::Page => PAGE_MIME_TYPES.contains(mime_type),
RakeIntent::Feed => FEED_MIME_TYPES.contains(mime_type),
RakeIntent::SiteMap => SITEMAP_MIME_TYPES.contains(mime_type),
RakeIntent::Icon => IMAGE_MIME_TYPES.contains_key(mime_type),
}
}
}
lazy_static! {
static ref PAGE_MIME_TYPES: HashSet<&'static str> =
HashSet::from_iter(vec!["text/html", "text/gemini",]);
static ref SITEMAP_MIME_TYPES: HashSet<&'static str> =
HashSet::from_iter(vec!["text/xml", "application/xml",]);
    /// MIME types we might expect feeds to be served with in Content-Type headers
static ref FEED_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
"text/xml",
"application/xml",
"application/atom+xml",
"application/rss+xml",
"application/rdf+xml",
"application/json",
"application/feed+json"
]);
    /// MIME types we might expect for feeds advertised in <link> tags
static ref FEED_LINK_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
"application/atom+xml",
"application/rss+xml",
"application/rdf+xml",
"application/feed+json"
]);
pub static ref IMAGE_MIME_TYPES: HashMap<&'static str, ImageFormat> = {
[
("image/png", ImageFormat::Png),
("image/webp", ImageFormat::WebP),
("image/jpeg", ImageFormat::Jpeg),
("image/gif", ImageFormat::Gif),
("image/vnd.microsoft.icon", ImageFormat::Ico),
("image/x-icon", ImageFormat::Ico),
("image/icon", ImageFormat::Ico),
("image/ico", ImageFormat::Ico),
("application/ico", ImageFormat::Ico),
]
.into_iter()
.collect()
};
pub static ref ALL_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(
PAGE_MIME_TYPES.iter().cloned()
.chain(SITEMAP_MIME_TYPES.iter().cloned())
.chain(FEED_MIME_TYPES.iter().cloned())
.chain(FEED_LINK_MIME_TYPES.iter().cloned())
.chain(IMAGE_MIME_TYPES.keys().cloned())
);
}
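/// Reads a response body into memory, enforcing both a size limit and a time
/// limit. If the Content-Length header already exceeds the size limit, fails
/// without downloading anything at all.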
async fn response_to_bytes_limited(
response: Response,
size_limit: usize,
time_limit: Duration,
) -> anyhow::Result<Vec<u8>> {
    // Check the Content-Length header first: if it already exceeds the size
    // limit, we can fail without downloading anything.
    let content_length = response
        .headers()
        .get("content-length")
        .and_then(|len| len.to_str().ok())
        .and_then(|len| len.parse::<u64>().ok());
if let Some(content_length) = content_length {
if content_length > size_limit as u64 {
// We can avoid downloading it: we already know it exceeds the limit.
increment_counter!("qprake_rake_specific_fail_count", "reason" => "SizeLimit");
return Err(PermanentFailure {
reason: PermanentFailureReason::ExceedsSizeLimit,
}
.into());
}
}
let deadline = Instant::now() + time_limit;
let mut buffer = Vec::new();
let mut bytestream = response.bytes_stream();
loop {
tokio::select! {
next_chunk = bytestream.next() => {
match next_chunk {
Some(next_chunk) => {
buffer.extend_from_slice(next_chunk?.as_bytes());
if buffer.len() > size_limit {
increment_counter!("qprake_rake_specific_fail_count", "reason" => "SizeLimit");
return Err(PermanentFailure {
reason: PermanentFailureReason::ExceedsSizeLimit,
}.into());
}
},
None => {
// Finished! :)
break;
}
}
},
_ = tokio::time::sleep_until(deadline) => {
increment_counter!("qprake_rake_specific_fail_count", "reason" => "TimeLimit");
bail!("Exceeded time limit");
}
}
}
Ok(buffer)
}
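/// Fetches and analyses resources: pages, feeds, sitemaps and icons.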
pub struct Raker {
pub antifeature_ip_set: IpSet,
pub page_extraction: PageExtractionService,
}
impl Raker {
/// Figure out whether we can upgrade a URL to HTTPS.
pub async fn try_upgrade_to_https(
&self,
url: &Url,
client: &Client,
) -> anyhow::Result<Option<Url>> {
if url.scheme().eq_ignore_ascii_case("http") {
// Try to upgrade to HTTPS if we can.
let mut https_url = url.clone();
https_url.set_scheme("https").unwrap();
client
.head(https_url.clone())
.timeout(Duration::from_secs(10))
.send()
.await
.context("failed to make HEAD request")?
.error_for_status()
.context("bad response for HEAD requesst")?;
Ok(Some(https_url))
} else {
Ok(None)
}
}
/// Rakes a resource by URL.
///
/// `intent` specifies the kind of resource we're expecting. This matters in a few circumstances,
/// most notably when picking up favicons.
pub async fn rake(
&self,
url: &Url,
intent: RakeIntent,
client: &Client,
) -> anyhow::Result<RakeOutcome> {
match self.try_upgrade_to_https(url, client).await {
Ok(Some(upgraded)) => {
return Ok(RakeOutcome::Redirect {
reason: RedirectReason::SecureUpgrade,
new_url: upgraded,
});
}
Ok(None) => {
// continue
}
Err(err) => {
info!("can't upgrade {url} to HTTPS: {err:?}");
// continue
}
}
let response = client.get(url.clone()).send().await?;
let is_cf = if let Some(remote_addr) = response.remote_addr() {
self.antifeature_ip_set.contains(remote_addr.ip())
} else {
false
};
let http_code = response.status().as_u16();
if response.status().is_redirection() {
if let Some(redirect_target) = response.headers().get("location") {
let new_url = url
.join(
redirect_target
.to_str()
.context("Failed to convert Location header to str")?,
)
.context("Failed to resolve Location header target")?;
if intent == RakeIntent::Icon {
                // Icons get special handling around redirects: a separate
                // client dereferences them for us, so a redirect surfacing
                // here means we ran out of redirects, and we must not store it.
bail!("Ran out of redirects to fetch icon with.");
}
return Ok(RakeOutcome::Redirect {
reason: RedirectReason::Redirected { http_code },
new_url,
});
} else {
bail!(
"Redirection {:?} received, but no Location header.",
response.status()
);
}
}
let code = response.status().as_u16().to_string();
increment_counter!("qprake_rake_status_count", "status" => code);
if response.status().is_client_error() {
increment_counter!("qprake_rake_status_count", "status" => "4xx");
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
reason: PermanentFailureReason::ResourceDenied(http_code),
}));
}
if response.status().is_server_error() {
return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure {
reason: TemporaryFailureReason::ServerError(http_code),
// Try again tomorrow. Maybe the server is overloaded?
backoff_sec: 86400,
}));
}
if !response.status().is_success() {
bail!("Unknown failure code: {:?}", response.status());
}
let content_type = if let Some(content_type) = response.headers().get("content-type") {
let content_type = content_type
.to_str()
.context("Can't convert content-type to str")?;
content_type
.split(";")
.next()
.unwrap()
.trim()
.to_lowercase()
} else {
increment_counter!("qprake_rake_specific_fail_count", "reason" => "NoCT");
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
reason: PermanentFailureReason::UnknownContentType("not specified".to_owned()),
}));
};
if !intent.supports_mime_type(&content_type) {
increment_counter!("qprake_rake_specific_fail_count", "reason" => "OtherCT");
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()),
}));
}
let headers = response.headers().clone();
let content = response_to_bytes_limited(response, SIZE_LIMIT, TIME_LIMIT).await?;
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page)
{
// We don't try any fallbacks for an HTML page
return Ok(self
.rake_html_page(content, url, is_cf, &headers)
.await
.context("Raking HTML page")?);
}
if FEED_MIME_TYPES.contains(content_type.as_str())
&& (intent == RakeIntent::Any || intent == RakeIntent::Feed)
{
match rake_feed(&content, url) {
Ok(feed) => {
return Ok(RakeOutcome::RakedFeed(feed));
}
Err(error) => {
debug!("Failed to rake as feed: {:?}", error);
}
}
}
if SITEMAP_MIME_TYPES.contains(content_type.as_str())
&& (intent == RakeIntent::Any || intent == RakeIntent::SiteMap)
{
match rake_sitemap(&content) {
Ok(sitemap) => {
return Ok(RakeOutcome::RakedSitemap(sitemap));
}
Err(error) => {
debug!("Failed to rake as sitemap: {:?}", error);
}
}
}
if intent == RakeIntent::Icon {
match rake_icon(&content, &content_type) {
Ok(icon) => {
return Ok(RakeOutcome::RakedIcon(icon));
}
Err(error) => {
debug!("Failed to rake as icon: {:?}", error);
}
}
}
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()),
}));
}
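    /// Rakes the body of an HTML page: extracts the readable document, its
    /// references and antifeature flags, or reports a redirect (e.g. to a
    /// canonical URL).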
pub async fn rake_html_page(
&self,
content: Vec<u8>,
url: &Url,
is_cf: bool,
headers: &HeaderMap,
) -> anyhow::Result<RakeOutcome> {
match self
.page_extraction
.extract(content, url.clone(), headers.clone(), is_cf)
.await?
{
ExtractedPage::Success {
unreadable_document,
document,
feeds,
antifeature_flags,
} => {
let references = references::find_references(&unreadable_document, &feeds, url);
Ok(RakeOutcome::RakedPage(RakedPage {
page_entry: RakedPageEntry {
analysed_antifeatures: antifeature_flags,
document,
},
referrer_entry: RakedReferrerEntry { references },
}))
}
ExtractedPage::Redirect { reason, new_url } => {
Ok(RakeOutcome::Redirect { reason, new_url })
}
}
}
}
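// A minimal usage sketch (hypothetical: `ip_set`, `extractor` and the client
// configuration are assumptions, not part of this file). `rake` inspects
// redirect statuses itself, so the client should not follow them:
//
//     let client = Client::builder()
//         .user_agent(RAKER_USER_AGENT)
//         .redirect(reqwest::redirect::Policy::none())
//         .build()?;
//     let raker = Raker { antifeature_ip_set: ip_set, page_extraction: extractor };
//     match raker.rake(&url, RakeIntent::Any, &client).await? {
//         RakeOutcome::RakedPage(page) => { /* store the page and its references */ }
//         RakeOutcome::Redirect { new_url, .. } => { /* requeue new_url */ }
//         other => { /* record the failure and back off */ }
//     }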
pub fn normalise_language(lang_string: &mut String) {
    // Normalise to the `lower_UPPER` convention, e.g. `en-GB` → `en_GB`.
    let mut pieces = lang_string
        .to_lowercase()
        .replace('-', "_")
        .split('_')
        .map(|s| s.to_owned())
        .collect_vec();
    if let Some(dialect) = pieces.get_mut(1) {
        *dialect = dialect.to_uppercase();
    }
    // Reassemble the normalised tag into the output string.
    *lang_string = pieces.join("_");
}
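// A hypothetical check of the expected behaviour (this test module is not in
// the original file):
#[cfg(test)]
mod normalise_language_tests {
    use super::normalise_language;

    #[test]
    fn normalises_dialect_casing() {
        let mut lang = String::from("EN-gb");
        normalise_language(&mut lang);
        assert_eq!(lang, "en_GB");
    }
}

/// Rakes a feed (RSS, Atom or JSON Feed), returning the entries' URLs as page
/// targets along with their last-changed timestamps where available.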
pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
let feed = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?;
let mut urls = Vec::new();
for entry in feed.entries {
let link = if let Some(link) = entry.links.first() {
link
} else {
continue;
};
let url = Url::parse(&link.href).context("parsing URL in feed")?; // TODO(robustness) ignore failure here...?
let last_changed = entry.updated.or(entry.published);
urls.push(UrlRaked {
url,
last_changed,
intent: RakeIntent::Page,
});
}
// TODO(feature) paginated feeds (e.g. JSON Feed next_url)
Ok(urls)
}
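/// Rakes a sitemap, returning the URLs it lists (as page targets) and any
/// nested sitemaps (as sitemap targets).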
pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<Vec<UrlRaked>> {
let curs = Cursor::new(content);
let reader = sitemap::reader::SiteMapReader::new(curs);
let mut urls = Vec::new();
for entry in reader {
match &entry {
SiteMapEntity::Url(url) => {
let loc = if let Some(loc) = url.loc.get_url() {
loc
} else {
continue;
};
urls.push(UrlRaked {
url: loc,
last_changed: url
.lastmod
.get_time()
.map(|dt: DateTime<FixedOffset>| dt.into()),
intent: RakeIntent::Page,
});
}
SiteMapEntity::SiteMap(sitemap) => {
let loc = if let Some(loc) = sitemap.loc.get_url() {
loc
} else {
continue;
};
urls.push(UrlRaked {
url: loc,
last_changed: sitemap
.lastmod
.get_time()
.map(|dt: DateTime<FixedOffset>| dt.into()),
intent: RakeIntent::SiteMap,
});
}
SiteMapEntity::Err(error) => {
debug!("Sitemap error {:?}", error);
}
}
}
if urls.is_empty() {
bail!("No URLs or Sitemaps picked up from sitemap; is it bad?");
}
Ok(urls)
}
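/// Rakes an icon: decodes the image, downscales it to 32×32 if it is larger,
/// and re-encodes it as low-quality WebP for compact storage.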
pub fn rake_icon(content: &[u8], content_type: &str) -> anyhow::Result<RakedIcon> {
let format = match IMAGE_MIME_TYPES.get(content_type) {
Some(format) => format,
None => {
bail!("Unknown image format: {:?}", content_type);
}
};
let orig_size = content.len();
let mut cursor = Cursor::new(&content);
let mut image = image::load(&mut cursor, *format).context("Failed to load image")?;
const WANTED_DIMENSIONS: u32 = 32;
/// Between 0 and 100.
const WEBP_QUALITY: f32 = 5.0;
let (w, h) = image.dimensions();
if w.max(h) > WANTED_DIMENSIONS {
// image = image.thumbnail(WANTED_DIMENSIONS, WANTED_DIMENSIONS);
// Triangle is slightly better quality than nearest neighbour, but less expensive than
// Cubic or Lanczos.
// .thumbnail() is apparently very fast, but the artifacts were a little bit unfortunate for
// this.
image = image.resize_to_fill(WANTED_DIMENSIONS, WANTED_DIMENSIONS, FilterType::Triangle);
}
let webp_encoder =
webp::Encoder::from_image(&image).map_err(|err| anyhow!("webp fail: {}", err))?;
let encoded = webp_encoder.encode(WEBP_QUALITY).to_vec();
Ok(RakedIcon {
original_size_in_bytes: orig_size,
webp_bytes: encoded,
})
}
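/// Resolves the `/robots.txt` URL for the site that the given URL belongs to.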
pub fn robots_txt_url_for(url: &Url) -> anyhow::Result<Url> {
url.join("/robots.txt")
.context("Whilst resolving /robots.txt on URL")
}
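/// Fetches and decodes the robots.txt file covering the given URL.
/// Returns `Ok(None)` if the file is absent (404/410) or denied to us (403).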
pub async fn get_robots_txt_for(url: &Url, client: &Client) -> anyhow::Result<Option<RobotsTxt>> {
let robots_url = robots_txt_url_for(url)?;
let resp = client.get(robots_url.clone()).send().await?;
if !resp.status().is_success() {
let code = resp.status().as_u16();
if code == 403 || code == 404 || code == 410 {
// not found or gone? Assume there is intentionally no robots.txt file.
// If they deny us access to the robots file, then they deserve whatever they get and
// we proceed.
return Ok(None);
}
bail!("Failed to get {:?}: {:?}", robots_url, resp.status());
}
let bytes = resp.bytes().await?;
decode_robots_txt(&bytes).await
}
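/// Decodes a robots.txt file: collects `Sitemap:` directives and compiles the
/// access rules that apply to our user-agent.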
pub async fn decode_robots_txt(bytes: &[u8]) -> anyhow::Result<Option<RobotsTxt>> {
let mut sitemaps = Vec::new();
for line in bytes.split(|b| *b == b'\n') {
    // Match the directive name case-insensitively, but keep the value's
    // original case: URL paths can be case-sensitive.
    if line.len() >= 8 && line[..8].eq_ignore_ascii_case(b"sitemap:") {
        if let Ok(value) = std::str::from_utf8(&line[8..]) {
if let Ok(url) = Url::parse(value.trim()) {
sitemaps.push(UrlRaked {
url,
last_changed: None,
intent: RakeIntent::SiteMap,
});
} else {
debug!("Failed to parse sitemap value as a URL")
}
} else {
debug!("Failed to parse sitemap value as UTF-8")
}
}
}
let rules = cylon::Compiler::new(RAKER_USER_AGENT)
.compile(bytes.as_bytes())
.await?;
Ok(Some(RobotsTxt { sitemaps, rules }))
}
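// A hypothetical check (this test module is not in the original file):
// /robots.txt should resolve against the site root regardless of the URL's path.
#[cfg(test)]
mod robots_txt_url_tests {
    use super::robots_txt_url_for;
    use reqwest::Url;

    #[test]
    fn resolves_to_site_root() {
        let url = Url::parse("https://example.org/deep/path?q=1").unwrap();
        assert_eq!(
            robots_txt_url_for(&url).unwrap().as_str(),
            "https://example.org/robots.txt"
        );
    }
}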