// quickpeep/quickpeep_raker/src/raking.rs
use std::collections::{HashMap, HashSet};
use std::error::Error;
use std::fmt::{Debug, Display, Formatter};
use std::io::Cursor;
use std::str::FromStr;
use std::time::Duration;
use ::metrics::increment_counter;
use anyhow::{anyhow, bail, Context};
use chrono::{DateTime, FixedOffset, Utc};
use cylon::Cylon;
use futures_util::stream::StreamExt;
use html5ever::tendril::fmt::Slice;
use image::imageops::FilterType;
use image::{GenericImageView, ImageFormat};
use itertools::Itertools;
use lazy_static::lazy_static;
use log::{debug, info};
use reqwest::header::HeaderMap;
use reqwest::{Client, Response, Url};
use serde::{Deserialize, Serialize};
use sitemap::reader::SiteMapEntity;
use tokio::time::Instant;
use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind};
use crate::raking::analysis::IpSet;
use crate::raking::page_extraction::{ExtractedPage, PageExtractionService};
pub mod analysis;
pub mod page_extraction;
pub mod rakemetrics;
pub mod references;
pub mod task;
/// 4 MiB ought to be enough for anybody.
pub const SIZE_LIMIT: usize = 4 * 1024 * 1024;
/// If it's not loaded in ten seconds, that's pretty severe.
/// Ten seconds is almost too generous, even allowing that the best of servers can run slowly sometimes.
pub const TIME_LIMIT: Duration = Duration::from_secs(10);
pub const RAKER_USER_AGENT: &str = "QuickPeepBot";
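/// The overall outcome of raking one URL.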
pub enum RakeOutcome {
RakedPage(RakedPage),
RakedFeed(Vec<UrlRaked>),
RakedSitemap(Vec<UrlRaked>),
RakedIcon(RakedIcon),
Redirect {
reason: RedirectReason,
new_url: Url,
},
TemporaryFailure(TemporaryFailure),
PermanentFailure(PermanentFailure),
}
#[derive(Debug)]
pub enum RedirectReason {
/// The page redirected somewhere else.
Redirected {
/// HTTP Status Code of the redirect
http_code: u16,
},
/// The page was not canonical, and should not be indexed.
NotCanonical,
/// Upgrade from a HTTP to HTTPS URL (or equivalent).
SecureUpgrade,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct UrlRaked {
pub url: Url,
pub last_changed: Option<DateTime<Utc>>,
pub intent: RakeIntent,
}
#[derive(Serialize)]
pub struct RakedPage {
pub page_entry: RakedPageEntry,
pub referrer_entry: RakedReferrerEntry,
}
pub struct RakedIcon {
pub original_size_in_bytes: usize,
pub webp_bytes: Vec<u8>,
}
pub struct RobotsTxt {
pub sitemaps: Vec<UrlRaked>,
pub rules: Cylon,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TemporaryFailure {
pub reason: TemporaryFailureReason,
pub backoff_sec: u32,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PermanentFailure {
pub reason: PermanentFailureReason,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum TemporaryFailureReason {
MissingInformation(String),
ServerError(u16),
UnknownClientError(String),
ExcruciatingCrawlDelay(u64),
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum PermanentFailureReason {
ResourceDenied(u16),
DeniedToRobots,
WrongLanguage(String),
UnknownContentType(String),
ExceedsSizeLimit,
}
impl Display for PermanentFailure {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
Debug::fmt(&self, f)
}
}
impl Error for PermanentFailure {}
#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub enum RakeIntent {
Any,
Page,
Feed,
SiteMap,
Icon,
}
impl FromStr for RakeIntent {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
Ok(match s.to_lowercase().as_ref() {
"any" => RakeIntent::Any,
"page" => RakeIntent::Page,
"feed" => RakeIntent::Feed,
"sitemap" => RakeIntent::SiteMap,
"icon" => RakeIntent::Icon,
other => {
bail!("Unrecognised intent: {:?}", other)
}
})
}
}
impl From<ReferenceKind> for RakeIntent {
fn from(kind: ReferenceKind) -> Self {
match kind {
ReferenceKind::CanonicalUrl | ReferenceKind::SecureUpgrade => {
// FIXME We don't know what this is a canonical URL for. Suppose it doesn't matter...
RakeIntent::Any
}
ReferenceKind::Redirect => {
// FIXME We don't know what this is a redirect for. Suppose it doesn't matter...
RakeIntent::Any
}
ReferenceKind::Link => {
// Links can go to pages but also to RSS feeds
RakeIntent::Any
}
ReferenceKind::HeaderLinkedFeed => RakeIntent::Feed,
ReferenceKind::FeedEntry => RakeIntent::Page,
ReferenceKind::SitemapEntry => RakeIntent::Page,
}
}
}
impl RakeIntent {
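    /// Returns true if this intent will accept a resource served with the
    /// given MIME type (expected to be lowercased, with any parameters
    /// already stripped off).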
pub fn supports_mime_type(&self, mime_type: &str) -> bool {
match self {
RakeIntent::Any => ALL_MIME_TYPES.contains(mime_type),
RakeIntent::Page => PAGE_MIME_TYPES.contains(mime_type),
RakeIntent::Feed => FEED_MIME_TYPES.contains(mime_type),
RakeIntent::SiteMap => SITEMAP_MIME_TYPES.contains(mime_type),
RakeIntent::Icon => IMAGE_MIME_TYPES.contains_key(mime_type),
}
}
}
lazy_static! {
static ref PAGE_MIME_TYPES: HashSet<&'static str> =
HashSet::from_iter(vec!["text/html", "text/gemini",]);
static ref SITEMAP_MIME_TYPES: HashSet<&'static str> =
HashSet::from_iter(vec!["text/xml", "application/xml",]);
    /// MIME types we might expect feeds to be served with in Content-Type headers
static ref FEED_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
"text/xml",
"application/xml",
"application/atom+xml",
"application/rss+xml",
"application/rdf+xml",
"application/json",
"application/feed+json"
]);
    /// MIME types we might expect for feeds advertised in <link> tags
static ref FEED_LINK_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
"application/atom+xml",
"application/rss+xml",
"application/rdf+xml",
"application/feed+json"
]);
pub static ref IMAGE_MIME_TYPES: HashMap<&'static str, ImageFormat> = {
[
("image/png", ImageFormat::Png),
("image/webp", ImageFormat::WebP),
("image/jpeg", ImageFormat::Jpeg),
("image/gif", ImageFormat::Gif),
("image/vnd.microsoft.icon", ImageFormat::Ico),
("image/x-icon", ImageFormat::Ico),
("image/icon", ImageFormat::Ico),
("image/ico", ImageFormat::Ico),
("application/ico", ImageFormat::Ico),
]
.into_iter()
.collect()
};
pub static ref ALL_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(
PAGE_MIME_TYPES.iter().cloned()
.chain(SITEMAP_MIME_TYPES.iter().cloned())
.chain(FEED_MIME_TYPES.iter().cloned())
.chain(FEED_LINK_MIME_TYPES.iter().cloned())
.chain(IMAGE_MIME_TYPES.keys().cloned())
);
}
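/// Reads a response body into memory, enforcing both a size limit and a time
/// limit. If the Content-Length header already exceeds the size limit, fails
/// without downloading anything at all.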
async fn response_to_bytes_limited(
response: Response,
size_limit: usize,
time_limit: Duration,
) -> anyhow::Result<Vec<u8>> {
    // Check the Content-Length header first: if it already exceeds the size
    // limit, we can fail without downloading anything.
    let content_length = response
        .headers()
        .get("content-length")
        .and_then(|len| len.to_str().ok())
        .and_then(|len| len.parse::<u64>().ok());
if let Some(content_length) = content_length {
if content_length > size_limit as u64 {
// We can avoid downloading it: we already know it exceeds the limit.
increment_counter!("qprake_rake_specific_fail_count", "reason" => "SizeLimit");
return Err(PermanentFailure {
reason: PermanentFailureReason::ExceedsSizeLimit,
}
.into());
}
}
let deadline = Instant::now() + time_limit;
let mut buffer = Vec::new();
let mut bytestream = response.bytes_stream();
loop {
tokio::select! {
next_chunk = bytestream.next() => {
match next_chunk {
Some(next_chunk) => {
buffer.extend_from_slice(next_chunk?.as_bytes());
if buffer.len() > size_limit {
increment_counter!("qprake_rake_specific_fail_count", "reason" => "SizeLimit");
return Err(PermanentFailure {
reason: PermanentFailureReason::ExceedsSizeLimit,
}.into());
}
},
None => {
// Finished! :)
break;
}
}
},
_ = tokio::time::sleep_until(deadline) => {
increment_counter!("qprake_rake_specific_fail_count", "reason" => "TimeLimit");
bail!("Exceeded time limit");
}
}
}
Ok(buffer)
}
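/// Fetches and analyses resources: pages, feeds, sitemaps and icons.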
pub struct Raker {
pub antifeature_ip_set: IpSet,
pub page_extraction: PageExtractionService,
}
impl Raker {
/// Figure out whether we can upgrade a URL to HTTPS.
pub async fn try_upgrade_to_https(
&self,
url: &Url,
client: &Client,
) -> anyhow::Result<Option<Url>> {
if url.scheme().eq_ignore_ascii_case("http") {
// Try to upgrade to HTTPS if we can.
let mut https_url = url.clone();
https_url.set_scheme("https").unwrap();
client
.head(https_url.clone())
.timeout(Duration::from_secs(10))
.send()
.await
.context("failed to make HEAD request")?
.error_for_status()
.context("bad response for HEAD requesst")?;
Ok(Some(https_url))
} else {
Ok(None)
}
}
/// Rakes a resource by URL.
///
/// `intent` specifies the kind of resource we're expecting. This matters in a few circumstances,
/// most notably when picking up favicons.
pub async fn rake(
&self,
url: &Url,
intent: RakeIntent,
client: &Client,
) -> anyhow::Result<RakeOutcome> {
match self.try_upgrade_to_https(url, client).await {
Ok(Some(upgraded)) => {
return Ok(RakeOutcome::Redirect {
reason: RedirectReason::SecureUpgrade,
new_url: upgraded,
});
}
Ok(None) => {
// continue
}
Err(err) => {
info!("can't upgrade {url} to HTTPS: {err:?}");
// continue
}
}
let response = client.get(url.clone()).send().await?;
let is_cf = if let Some(remote_addr) = response.remote_addr() {
self.antifeature_ip_set.contains(remote_addr.ip())
} else {
false
};
let http_code = response.status().as_u16();
if response.status().is_redirection() {
if let Some(redirect_target) = response.headers().get("location") {
let new_url = url
.join(
redirect_target
.to_str()
.context("Failed to convert Location header to str")?,
)
.context("Failed to resolve Location header target")?;
if intent == RakeIntent::Icon {
                // Icons get special handling around redirects: a separate
                // client dereferences them for us, so a redirect surfacing
                // here means we ran out of redirects, and we must not store it.
bail!("Ran out of redirects to fetch icon with.");
}
return Ok(RakeOutcome::Redirect {
reason: RedirectReason::Redirected { http_code },
new_url,
});
} else {
bail!(
"Redirection {:?} received, but no Location header.",
response.status()
);
}
}
let code = response.status().as_u16().to_string();
increment_counter!("qprake_rake_status_count", "status" => code);
if response.status().is_client_error() {
increment_counter!("qprake_rake_status_count", "status" => "4xx");
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
reason: PermanentFailureReason::ResourceDenied(http_code),
}));
}
if response.status().is_server_error() {
return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure {
reason: TemporaryFailureReason::ServerError(http_code),
// Try again tomorrow. Maybe the server is overloaded?
backoff_sec: 86400,
}));
}
if !response.status().is_success() {
bail!("Unknown failure code: {:?}", response.status());
}
let content_type = if let Some(content_type) = response.headers().get("content-type") {
let content_type = content_type
.to_str()
.context("Can't convert content-type to str")?;
content_type
.split(";")
.next()
.unwrap()
.trim()
.to_lowercase()
} else {
increment_counter!("qprake_rake_specific_fail_count", "reason" => "NoCT");
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
reason: PermanentFailureReason::UnknownContentType("not specified".to_owned()),
}));
};
if !intent.supports_mime_type(&content_type) {
increment_counter!("qprake_rake_specific_fail_count", "reason" => "OtherCT");
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()),
}));
}
let headers = response.headers().clone();
let content = response_to_bytes_limited(response, SIZE_LIMIT, TIME_LIMIT).await?;
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page)
{
// We don't try any fallbacks for an HTML page
return Ok(self
.rake_html_page(content, url, is_cf, &headers)
.await
.context("Raking HTML page")?);
}
if FEED_MIME_TYPES.contains(content_type.as_str())
&& (intent == RakeIntent::Any || intent == RakeIntent::Feed)
{
match rake_feed(&content, url) {
Ok(feed) => {
return Ok(RakeOutcome::RakedFeed(feed));
}
Err(error) => {
debug!("Failed to rake as feed: {:?}", error);
}
}
}
if SITEMAP_MIME_TYPES.contains(content_type.as_str())
&& (intent == RakeIntent::Any || intent == RakeIntent::SiteMap)
{
match rake_sitemap(&content) {
Ok(sitemap) => {
return Ok(RakeOutcome::RakedSitemap(sitemap));
}
Err(error) => {
debug!("Failed to rake as sitemap: {:?}", error);
}
}
}
if intent == RakeIntent::Icon {
match rake_icon(&content, &content_type) {
Ok(icon) => {
return Ok(RakeOutcome::RakedIcon(icon));
}
Err(error) => {
debug!("Failed to rake as icon: {:?}", error);
}
}
}
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()),
}));
}
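    /// Rakes the body of an HTML page: extracts the readable document, its
    /// references and antifeature flags, or reports a redirect (e.g. to a
    /// canonical URL).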
pub async fn rake_html_page(
&self,
content: Vec<u8>,
url: &Url,
is_cf: bool,
headers: &HeaderMap,
) -> anyhow::Result<RakeOutcome> {
match self
.page_extraction
.extract(content, url.clone(), headers.clone(), is_cf)
.await?
{
ExtractedPage::Success {
unreadable_document,
document,
feeds,
antifeature_flags,
} => {
let references = references::find_references(&unreadable_document, &feeds, url);
Ok(RakeOutcome::RakedPage(RakedPage {
page_entry: RakedPageEntry {
analysed_antifeatures: antifeature_flags,
document,
},
referrer_entry: RakedReferrerEntry { references },
}))
}
ExtractedPage::Redirect { reason, new_url } => {
Ok(RakeOutcome::Redirect { reason, new_url })
}
}
}
}
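// A minimal usage sketch (hypothetical: `ip_set`, `extractor` and the client
// configuration are assumptions, not part of this file). `rake` inspects
// redirect statuses itself, so the client should not follow them:
//
//     let client = Client::builder()
//         .user_agent(RAKER_USER_AGENT)
//         .redirect(reqwest::redirect::Policy::none())
//         .build()?;
//     let raker = Raker { antifeature_ip_set: ip_set, page_extraction: extractor };
//     match raker.rake(&url, RakeIntent::Any, &client).await? {
//         RakeOutcome::RakedPage(page) => { /* store the page and its references */ }
//         RakeOutcome::Redirect { new_url, .. } => { /* requeue new_url */ }
//         other => { /* record the failure and back off */ }
//     }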
pub fn normalise_language(lang_string: &mut String) {
    // Normalise to the `lower_UPPER` convention, e.g. `en-GB` → `en_GB`.
    let mut pieces = lang_string
        .to_lowercase()
        .replace('-', "_")
        .split('_')
        .map(|s| s.to_owned())
        .collect_vec();
    if let Some(dialect) = pieces.get_mut(1) {
        *dialect = dialect.to_uppercase();
    }
    // Reassemble the normalised tag into the output string.
    *lang_string = pieces.join("_");
}
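// A hypothetical check of the expected behaviour (this test module is not in
// the original file):
#[cfg(test)]
mod normalise_language_tests {
    use super::normalise_language;

    #[test]
    fn normalises_dialect_casing() {
        let mut lang = String::from("EN-gb");
        normalise_language(&mut lang);
        assert_eq!(lang, "en_GB");
    }
}

/// Rakes a feed (RSS, Atom or JSON Feed), returning the entries' URLs as page
/// targets along with their last-changed timestamps where available.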
pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
let feed = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?;
let mut urls = Vec::new();
for entry in feed.entries {
let link = if let Some(link) = entry.links.first() {
link
} else {
continue;
};
let url = Url::parse(&link.href).context("parsing URL in feed")?; // TODO(robustness) ignore failure here...?
let last_changed = entry.updated.or(entry.published);
urls.push(UrlRaked {
url,
last_changed,
intent: RakeIntent::Page,
});
}
// TODO(feature) paginated feeds (e.g. JSON Feed next_url)
Ok(urls)
}
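/// Rakes a sitemap, returning the URLs it lists (as page targets) and any
/// nested sitemaps (as sitemap targets).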
pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<Vec<UrlRaked>> {
let curs = Cursor::new(content);
let reader = sitemap::reader::SiteMapReader::new(curs);
let mut urls = Vec::new();
for entry in reader {
match &entry {
SiteMapEntity::Url(url) => {
let loc = if let Some(loc) = url.loc.get_url() {
loc
} else {
continue;
};
urls.push(UrlRaked {
url: loc,
last_changed: url
.lastmod
.get_time()
.map(|dt: DateTime<FixedOffset>| dt.into()),
intent: RakeIntent::Page,
});
}
SiteMapEntity::SiteMap(sitemap) => {
let loc = if let Some(loc) = sitemap.loc.get_url() {
loc
} else {
continue;
};
urls.push(UrlRaked {
url: loc,
last_changed: sitemap
.lastmod
.get_time()
.map(|dt: DateTime<FixedOffset>| dt.into()),
intent: RakeIntent::SiteMap,
});
}
SiteMapEntity::Err(error) => {
debug!("Sitemap error {:?}", error);
}
}
}
if urls.is_empty() {
bail!("No URLs or Sitemaps picked up from sitemap; is it bad?");
}
Ok(urls)
}
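/// Rakes an icon: decodes the image, downscales it to 32×32 if it is larger,
/// and re-encodes it as low-quality WebP for compact storage.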
pub fn rake_icon(content: &[u8], content_type: &str) -> anyhow::Result<RakedIcon> {
let format = match IMAGE_MIME_TYPES.get(content_type) {
Some(format) => format,
None => {
bail!("Unknown image format: {:?}", content_type);
}
};
let orig_size = content.len();
let mut cursor = Cursor::new(&content);
let mut image = image::load(&mut cursor, *format).context("Failed to load image")?;
const WANTED_DIMENSIONS: u32 = 32;
/// Between 0 and 100.
const WEBP_QUALITY: f32 = 5.0;
let (w, h) = image.dimensions();
if w.max(h) > WANTED_DIMENSIONS {
// image = image.thumbnail(WANTED_DIMENSIONS, WANTED_DIMENSIONS);
// Triangle is slightly better quality than nearest neighbour, but less expensive than
// Cubic or Lanczos.
// .thumbnail() is apparently very fast, but the artifacts were a little bit unfortunate for
// this.
image = image.resize_to_fill(WANTED_DIMENSIONS, WANTED_DIMENSIONS, FilterType::Triangle);
}
let webp_encoder =
webp::Encoder::from_image(&image).map_err(|err| anyhow!("webp fail: {}", err))?;
let encoded = webp_encoder.encode(WEBP_QUALITY).to_vec();
Ok(RakedIcon {
original_size_in_bytes: orig_size,
webp_bytes: encoded,
})
}
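/// Resolves the `/robots.txt` URL for the site that the given URL belongs to.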
pub fn robots_txt_url_for(url: &Url) -> anyhow::Result<Url> {
url.join("/robots.txt")
.context("Whilst resolving /robots.txt on URL")
}
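/// Fetches and decodes the robots.txt file covering the given URL.
/// Returns `Ok(None)` if the file is absent (404/410) or denied to us (403).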
pub async fn get_robots_txt_for(url: &Url, client: &Client) -> anyhow::Result<Option<RobotsTxt>> {
let robots_url = robots_txt_url_for(url)?;
let resp = client.get(robots_url.clone()).send().await?;
if !resp.status().is_success() {
let code = resp.status().as_u16();
if code == 403 || code == 404 || code == 410 {
// not found or gone? Assume there is intentionally no robots.txt file.
// If they deny us access to the robots file, then they deserve whatever they get and
// we proceed.
return Ok(None);
}
bail!("Failed to get {:?}: {:?}", robots_url, resp.status());
}
let bytes = resp.bytes().await?;
decode_robots_txt(&bytes).await
}
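/// Decodes a robots.txt file: collects `Sitemap:` directives and compiles the
/// access rules that apply to our user-agent.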
pub async fn decode_robots_txt(bytes: &[u8]) -> anyhow::Result<Option<RobotsTxt>> {
let mut sitemaps = Vec::new();
for line in bytes.split(|b| *b == b'\n') {
    // Match the directive name case-insensitively, but keep the value's
    // original case: URL paths can be case-sensitive.
    if line.len() >= 8 && line[..8].eq_ignore_ascii_case(b"sitemap:") {
        if let Ok(value) = std::str::from_utf8(&line[8..]) {
if let Ok(url) = Url::parse(value.trim()) {
sitemaps.push(UrlRaked {
url,
last_changed: None,
intent: RakeIntent::SiteMap,
});
} else {
debug!("Failed to parse sitemap value as a URL")
}
} else {
debug!("Failed to parse sitemap value as UTF-8")
}
}
}
let rules = cylon::Compiler::new(RAKER_USER_AGENT)
.compile(bytes.as_bytes())
.await?;
Ok(Some(RobotsTxt { sitemaps, rules }))
}
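// A hypothetical check (this test module is not in the original file):
// /robots.txt should resolve against the site root regardless of the URL's path.
#[cfg(test)]
mod robots_txt_url_tests {
    use super::robots_txt_url_for;
    use reqwest::Url;

    #[test]
    fn resolves_to_site_root() {
        let url = Url::parse("https://example.org/deep/path?q=1").unwrap();
        assert_eq!(
            robots_txt_url_for(&url).unwrap().as_str(),
            "https://example.org/robots.txt"
        );
    }
}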