// quickpeep/quickpeep/src/raking.rs
// (662 lines, 21 KiB, Rust)

use crate::raking::analysis::{
analyse_with_ad_block_cosmetic_filter, guess_document_language, IpSet,
};
use adblock::engine::Engine;
use anyhow::{bail, Context};
use chrono::{DateTime, FixedOffset, Utc};
use cylon::Cylon;
use futures_util::stream::StreamExt;
use html5ever::tendril::fmt::Slice;
use itertools::Itertools;
use kuchiki::traits::TendrilSink;
use kuchiki::NodeRef;
use lazy_static::lazy_static;
use log::debug;
use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
use quickpeep_structs::rake_entries::{
AnalysisAntifeatures, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind,
};
use quickpeep_utils::Lazy;
use reqwest::header::HeaderMap;
use reqwest::{Client, Response, Url};
use serde::{Deserialize, Serialize};
use sitemap::reader::SiteMapEntity;
use std::collections::HashSet;
use std::time::Duration;
use tokio::time::Instant;
pub mod analysis;
/// 4 MiB ought to be enough for anybody.
pub const SIZE_LIMIT: usize = 4 * 1024 * 1024;

/// If it's not loaded in ten seconds, that's pretty severe.
/// 10 seconds is almost too generous (assuming that the best of things can run slowly sometimes).
pub const TIME_LIMIT: Duration = Duration::from_secs(10);

/// User-Agent token identifying this raker to web servers (also matched against robots.txt).
// Note: `'static` is implied for const references, so it is omitted (clippy: redundant_static_lifetimes).
pub const RAKER_USER_AGENT: &str = "QuickPeepBot";
/// The overall result of raking one URL.
pub enum RakeOutcome {
    /// An HTML page was raked successfully.
    RakedPage(RakedPage),
    /// A syndication feed was raked; carries the page URLs found in it.
    RakedFeed(Vec<UrlRaked>),
    /// A sitemap was raked; carries the page/sitemap URLs found in it.
    RakedSitemap(Vec<UrlRaked>),
    /// The raker should follow a redirect to `new_url` instead.
    Redirect {
        reason: RedirectReason,
        new_url: Url,
    },
    /// Raking failed, but may succeed if retried after a backoff.
    TemporaryFailure(TemporaryFailure),
    /// Raking failed and is not expected to ever succeed.
    PermanentFailure(PermanentFailure),
}
/// Why the raker is being pointed at a different URL.
#[derive(Debug)]
pub enum RedirectReason {
    /// The page redirected somewhere else.
    Redirected {
        /// HTTP Status Code of the redirect
        http_code: u16,
    },
    /// The page was not canonical, and should not be indexed.
    NotCanonical,
}
/// A URL discovered during raking (from a feed, sitemap or robots.txt),
/// along with what we intend to do with it.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct UrlRaked {
    pub url: Url,
    /// When the resource last changed, if the source declared it.
    pub last_changed: Option<DateTime<Utc>>,
    /// What kind of resource we expect to find at this URL.
    pub intent: RakeIntent,
}
/// The product of raking an HTML page: the analysed page itself plus the
/// references (outbound links and feeds) found on it.
#[derive(Serialize)]
pub struct RakedPage {
    page_entry: RakedPageEntry,
    referrer_entry: RakedReferrerEntry,
}
/// A decoded robots.txt file.
pub struct RobotsTxt {
    /// Sitemaps advertised by `Sitemap:` lines.
    pub sitemaps: Vec<UrlRaked>,
    /// Compiled robots exclusion rules for our user-agent.
    pub rules: Cylon,
}
/// A failure that may go away if the URL is retried later.
#[derive(Debug)]
pub struct TemporaryFailure {
    pub reason: TemporaryFailureReason,
    /// How long to back off before retrying, in seconds.
    pub backoff_sec: u32,
}
/// A failure that is not expected to go away; the URL should not be retried.
#[derive(Debug)]
pub struct PermanentFailure {
    pub reason: PermanentFailureReason,
}
/// Reasons for a temporary (retryable) failure.
#[derive(Debug)]
pub enum TemporaryFailureReason {
    /// The response was missing information we need; the `String` names it
    /// (e.g. "content-type").
    MissingInformation(String),
    /// The server returned the carried 5xx status code.
    ServerError(u16),
}
/// Reasons for a permanent (non-retryable) failure.
#[derive(Debug)]
pub enum PermanentFailureReason {
    /// The server denied access with the carried 4xx status code.
    ResourceDenied(u16),
    /// The resource is in an unsupported language (code carried here).
    /// NOTE(review): not constructed in this file — presumably produced by callers; verify.
    WrongLanguage(String),
    /// The content-type (carried here) is not one we know how to rake.
    UnknownContentType(String),
}
/// What kind of resource we expect (or are told) a URL to be.
#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub enum RakeIntent {
    /// Unknown: `rake` will try HTML, then feed, then sitemap in turn.
    Any,
    Page,
    Feed,
    SiteMap,
}
lazy_static! {
    /// MIME types we might expect sitemaps to be served with
    static ref SITEMAP_MIME_TYPES: HashSet<&'static str> =
        HashSet::from_iter(vec!["text/xml", "application/xml",]);

    /// MIME types we might expect in content-type headers
    static ref FEED_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
        "text/xml",
        "application/xml",
        "application/atom+xml",
        "application/rss+xml",
        "application/rdf+xml",
        "application/json",
        "application/feed+json"
    ]);

    /// MIME types we might expect in <link> tags
    static ref FEED_LINK_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![
        "application/atom+xml",
        "application/rss+xml",
        "application/rdf+xml",
        "application/feed+json"
    ]);
}
/// Downloads a response body into memory, enforcing both a byte-size limit
/// and a wall-clock time limit; errors if either limit is exceeded.
async fn response_to_bytes_limited(
    response: Response,
    size_limit: usize,
    time_limit: Duration,
) -> anyhow::Result<Vec<u8>> {
    // Fix the deadline up-front so a slow trickle of chunks can't extend it.
    let deadline = Instant::now() + time_limit;
    let mut buffer = Vec::new();
    let mut bytestream = response.bytes_stream();

    loop {
        tokio::select! {
            next_chunk = bytestream.next() => {
                match next_chunk {
                    Some(next_chunk) => {
                        buffer.extend_from_slice(next_chunk?.as_bytes());
                        // Checked after appending: the buffer may briefly exceed
                        // the limit by up to one chunk before we bail.
                        if buffer.len() > size_limit {
                            bail!("Exceeds size limit");
                        }
                    },
                    None => {
                        // Finished! :)
                        break;
                    }
                }
            },
            // Fires once the fixed deadline passes, even mid-download.
            _ = tokio::time::sleep_until(deadline) => {
                bail!("Exceeded time limit");
            }
        }
    }

    Ok(buffer)
}
/// Configuration and analysis resources used to rake resources.
pub struct Raker {
    /// Adblock engines, each paired with the antifeature flag it detects.
    pub adblock_engines: Vec<(AnalysisAntifeatures, Engine)>,
    /// IPs whose use is flagged as an antifeature (e.g. Cloudflare).
    pub antifeature_ip_set: IpSet,
}
impl Raker {
    /// Rakes the resource at `url`, yielding a `RakeOutcome`.
    ///
    /// `intent` hints at what kind of resource we expect to find there
    /// (`RakeIntent::Any` if unknown); non-matching formats are not attempted.
    pub async fn rake(
        &self,
        url: &Url,
        intent: RakeIntent,
        client: &Client,
    ) -> anyhow::Result<RakeOutcome> {
        let response = client.get(url.clone()).send().await?;

        // Was the response served from an IP in the antifeature set (e.g. Cloudflare)?
        let is_cf = if let Some(remote_addr) = response.remote_addr() {
            self.antifeature_ip_set.contains(remote_addr.ip())
        } else {
            false
        };

        let http_code = response.status().as_u16();

        if response.status().is_redirection() {
            if let Some(redirect_target) = response.headers().get("location") {
                let new_url = url
                    .join(
                        redirect_target
                            .to_str()
                            .context("Failed to convert Location header to str")?,
                    )
                    .context("Failed to resolve Location header target")?;
                return Ok(RakeOutcome::Redirect {
                    reason: RedirectReason::Redirected { http_code },
                    new_url,
                });
            } else {
                bail!(
                    "Redirection {:?} received, but no Location header.",
                    response.status()
                );
            }
        }

        if response.status().is_client_error() {
            // 4xx: treat as a permanent denial of the resource.
            return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
                reason: PermanentFailureReason::ResourceDenied(http_code),
            }));
        }

        if response.status().is_server_error() {
            return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure {
                reason: TemporaryFailureReason::ServerError(http_code),
                // Try again tomorrow. Maybe the server is overloaded?
                backoff_sec: 86400,
            }));
        }

        if !response.status().is_success() {
            bail!("Unknown failure code: {:?}", response.status());
        }

        // Strip any parameters (e.g. `; charset=utf-8`) off the media type.
        let content_type = if let Some(content_type) = response.headers().get("content-type") {
            let content_type = content_type
                .to_str()
                .context("Can't convert content-type to str")?;
            debug!("CT {:?}", content_type);
            // `split` always yields at least one piece, so `unwrap` can't fail.
            content_type.split(';').next().unwrap().trim().to_owned()
        } else {
            return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure {
                reason: TemporaryFailureReason::MissingInformation("content-type".to_owned()),
                backoff_sec: 86400 * 7,
            }));
        };

        let headers = response.headers().clone();
        let content = response_to_bytes_limited(response, SIZE_LIMIT, TIME_LIMIT).await?;

        if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page)
        {
            // We don't try any fallbacks for an HTML page
            return Ok(self
                .rake_html_page(&content, url, is_cf, &headers)
                .context("Raking HTML page")?);
        }

        if FEED_MIME_TYPES.contains(content_type.as_str())
            && (intent == RakeIntent::Any || intent == RakeIntent::Feed)
        {
            match rake_feed(&content, url) {
                Ok(feed) => {
                    return Ok(RakeOutcome::RakedFeed(feed));
                }
                Err(error) => {
                    debug!("Failed to rake as feed: {:?}", error);
                }
            }
        }

        if SITEMAP_MIME_TYPES.contains(content_type.as_str())
            && (intent == RakeIntent::Any || intent == RakeIntent::SiteMap)
        {
            match rake_sitemap(&content) {
                Ok(sitemap) => {
                    return Ok(RakeOutcome::RakedSitemap(sitemap));
                }
                Err(error) => {
                    debug!("Failed to rake as sitemap: {:?}", error);
                }
            }
        }

        // `content_type` is already owned; no `.to_owned()` needed here.
        Ok(RakeOutcome::PermanentFailure(PermanentFailure {
            reason: PermanentFailureReason::UnknownContentType(content_type),
        }))
    }

    /// Rakes an HTML page: canonical-URL check, language detection, title,
    /// icon and feed discovery, antifeature analysis, dense-document
    /// extraction and outbound-reference collection.
    pub fn rake_html_page(
        &self,
        content: &[u8],
        url: &Url,
        is_cf: bool,
        headers: &HeaderMap,
    ) -> anyhow::Result<RakeOutcome> {
        let content_str = std::str::from_utf8(content)?;
        let root_node: NodeRef = kuchiki::parse_html().one(content_str);

        // See whether this page is at the canonical URL for the page.
        // If it's not, then we redirect the raker to the canonical URL.
        if let Ok(canonical_link_node) = root_node.select_first("head link[rel=canonical]") {
            if let Some(canonical_href) = canonical_link_node.attributes.borrow().get("href") {
                let canonical_url = url
                    .join(canonical_href)
                    .context("Failed to resolve or parse canonical URL")?;
                if &canonical_url != url {
                    return Ok(RakeOutcome::Redirect {
                        reason: RedirectReason::NotCanonical,
                        new_url: canonical_url,
                    });
                }
            }
        }

        // Try and dig up the page's language.
        // First try <html lang=...> since this is the modern way, and potentially the most trustworthy...
        let mut language = None;
        if let Ok(html_node) = root_node.select_first("html") {
            if let Some(lang) = html_node.attributes.borrow().get("lang") {
                language = Some(lang.trim().to_string());
            }
        }
        if language.is_none() {
            // Next fallback: prefer the content-language header baked into the page itself
            if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") {
                if let Some(lang) = meta_node.attributes.borrow().get("content") {
                    language = Some(lang.trim().to_string());
                }
            }
        }
        if language.is_none() {
            // Next fallback: prefer the content-language received as a header
            if let Some(lang) = headers.get("content-language") {
                language = Some(lang.to_str()?.to_owned());
            }
        }

        let mut antifeature_flags = AnalysisAntifeatures::empty();
        if is_cf {
            antifeature_flags |= AnalysisAntifeatures::CLOUDFLARE;
        }

        for (engine_antifeature_flag, adblock_engine) in &self.adblock_engines {
            match analyse_with_ad_block_cosmetic_filter(
                &root_node,
                adblock_engine,
                url.as_str(),
                true,
            ) {
                Ok(cosmetic_filters_tripped) => {
                    debug!("cosmetic filters tripped: {}", cosmetic_filters_tripped);
                    // NOTE(review): the flag is applied regardless of the value of
                    // `cosmetic_filters_tripped` — confirm this is intended.
                    antifeature_flags |= *engine_antifeature_flag;
                }
                Err(err) => {
                    debug!("Cosmetic Filter Err {:?}", err);
                }
            };
        }

        let dense_doc = DenseTree::from_body(root_node.clone());
        // Textual form is computed lazily: it is only needed if we must guess the language.
        let dense_doc_text = Lazy::new(Box::new(|| DenseTree::generate_textual_format(&dense_doc)));

        if language.is_none() {
            // Final fallback: guess the language
            language = guess_document_language(&*dense_doc_text);
        }

        // Try and enforce some consistency in the language code;
        // we want codes like en_US rather than en-us.
        if let Some(language) = language.as_mut() {
            normalise_language(language);
        }

        let mut title = "".to_owned();
        if let Ok(title_node) = root_node.select_first("head title") {
            title = title_node.text_contents();
        }

        let mut feeds = Vec::new();
        let mut icon = None;

        // Harvest <head> <link> tags for icons and alternate-format feeds.
        for link_node in root_node.select("head link").into_iter().flatten() {
            if let Some(rel) = link_node.attributes.borrow().get("rel") {
                let rels = rel.split_whitespace().collect_vec();
                if rels.contains(&"icon") {
                    // This is an icon
                    if let Some(href) = link_node.attributes.borrow().get("href") {
                        let icon_url = url
                            .join(href)
                            .context("Failed to resolve or parse canonical URL to icon")?;
                        icon = Some(icon_url);
                    }
                } else if rels.contains(&"alternate") {
                    if let Some(rel_type) = link_node.attributes.borrow().get("type") {
                        if FEED_LINK_MIME_TYPES.contains(rel_type) {
                            if let Some(href) = link_node.attributes.borrow().get("href") {
                                let feed_url = url
                                    .join(href)
                                    .context("Failed to resolve or parse canonical URL to feed")?;
                                feeds.push(feed_url);
                            }
                        }
                    }
                }
            }
        }

        let mut readability =
            quickpeep_moz_readability::Readability::new_from_node(root_node.clone());
        if let Err(err) = readability.parse(url.as_str()) {
            debug!("Failed to analyse readability: {:?}", err);
        }
        debug!("readability metadata: {:#?}", readability.metadata);

        if title.is_empty() && !readability.metadata.title().is_empty() {
            // Fall back to the readability-derived page title
            title = readability.metadata.title().to_owned();
        }

        let mut document = DenseDocument {
            head: DenseHead {
                title,
                language: language.unwrap_or_default(),
                icon: icon.map(|url| url.as_str().to_owned()).unwrap_or_default(),
            },
            body_content: Vec::new(),
            body_remainder: Vec::new(),
        };

        if let Some(article_node) = readability.article_node {
            document.body_remainder = DenseTree::from_body(root_node.clone());
            document.body_content = DenseTree::from_body(article_node);
        }

        // Serialised size is computed only for this diagnostic.
        let bare_size = serde_bare::to_vec(&dense_doc)?.len();
        debug!("CS {:?}{:?}", content.len(), bare_size);

        let references = find_references(&document, &feeds, url);
        Ok(RakeOutcome::RakedPage(RakedPage {
            page_entry: RakedPageEntry {
                analysed_antifeatures: antifeature_flags,
                document,
            },
            referrer_entry: RakedReferrerEntry { references },
        }))
    }
}
/// Collects outbound references from a raked document: followable links from
/// both body sections, plus any feeds discovered in the page head.
///
/// Accepts slices (`&[Url]`) rather than `&Vec<Url>`; existing `&Vec` callers
/// still work via deref coercion.
pub fn find_references(
    doc: &DenseDocument,
    feeds: &[Url],
    page_url: &Url,
) -> Vec<RakedReference> {
    let mut refs = Vec::new();

    // Recursively collect followable (non-nofollow, resolvable) links into `refs`.
    fn add_link_refs(tree: &[DenseTree], refs: &mut Vec<RakedReference>, page_url: &Url) {
        for node in tree {
            match node {
                // All heading levels are handled identically: recurse into children.
                DenseTree::Heading1(children)
                | DenseTree::Heading2(children)
                | DenseTree::Heading3(children)
                | DenseTree::Heading4(children)
                | DenseTree::Heading5(children)
                | DenseTree::Heading6(children) => {
                    add_link_refs(children, refs, page_url);
                }
                DenseTree::Link {
                    children,
                    href,
                    nofollow,
                } => {
                    if !nofollow {
                        // Unresolvable hrefs are silently skipped.
                        if let Ok(full_url) = page_url.join(href) {
                            refs.push(RakedReference {
                                target: full_url.to_string(),
                                kind: ReferenceKind::CanonicalUrl,
                            })
                        }
                    }
                    add_link_refs(children, refs, page_url);
                }
                DenseTree::Image { .. } => {}
                DenseTree::Text(_) => {}
            }
        }
    }

    add_link_refs(&doc.body_content, &mut refs, page_url);
    add_link_refs(&doc.body_remainder, &mut refs, page_url);

    for feed in feeds {
        refs.push(RakedReference {
            target: feed.as_str().to_owned(),
            kind: ReferenceKind::HeaderLinkedFeed,
        });
    }

    refs
}
/// Normalises a language code in place to the `ll_DD` shape (e.g. `en-us`,
/// `EN_US` and `en-US` all become `en_US`): lowercase language, `_` separator,
/// uppercase dialect.
///
/// Bug fix: the previous version computed the normalised pieces but never
/// wrote them back, so the string was merely lowercased and `-` was kept.
pub fn normalise_language(lang_string: &mut String) {
    // Lowercase everything, unify `-` to `_`, then split into pieces.
    let mut pieces: Vec<String> = lang_string
        .to_lowercase()
        .replace('-', "_")
        .split('_')
        .map(|s| s.to_owned())
        .collect();

    // The second piece (dialect/region) is conventionally uppercase.
    if let Some(dialect) = pieces.get_mut(1) {
        *dialect = dialect.to_uppercase();
    }

    *lang_string = pieces.join("_");
}
/// Rakes a syndication feed (RSS/Atom/JSON Feed and friends), returning the
/// entries' pages as URLs to rake, each with its last-changed time if known.
pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
    let parsed_feed = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?;

    let mut raked = Vec::new();
    for feed_entry in parsed_feed.entries {
        // Entries with no link at all give us nothing to rake; skip them.
        let first_link = match feed_entry.links.get(0) {
            Some(found) => found,
            None => continue,
        };

        let entry_url = Url::parse(&first_link.href).context("parsing URL in feed")?; // TODO ignore failure here...?
        let changed_at = feed_entry.updated.or(feed_entry.published);

        raked.push(UrlRaked {
            url: entry_url,
            last_changed: changed_at,
            intent: RakeIntent::Page,
        });
    }

    // TODO paginated feeds (e.g. JSON Feed next_url)

    Ok(raked)
}
/// Rakes a sitemap XML document, returning both page URLs and nested sitemap
/// URLs found within (tagged with the appropriate `RakeIntent`).
///
/// Errors if nothing usable was found (which suggests a malformed sitemap).
pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<Vec<UrlRaked>> {
    let curs = std::io::Cursor::new(content);
    let reader = sitemap::reader::SiteMapReader::new(curs);

    let mut urls = Vec::new();

    for entry in reader {
        match &entry {
            SiteMapEntity::Url(url) => {
                // Entries without a resolvable location are skipped.
                let loc = if let Some(loc) = url.loc.get_url() {
                    loc
                } else {
                    continue;
                };

                urls.push(UrlRaked {
                    url: loc,
                    last_changed: url
                        .lastmod
                        .get_time()
                        .map(|dt: DateTime<FixedOffset>| dt.into()),
                    intent: RakeIntent::Page,
                });
            }
            SiteMapEntity::SiteMap(sitemap) => {
                // Nested sitemaps are queued for raking as sitemaps themselves.
                let loc = if let Some(loc) = sitemap.loc.get_url() {
                    loc
                } else {
                    continue;
                };

                urls.push(UrlRaked {
                    url: loc,
                    last_changed: sitemap
                        .lastmod
                        .get_time()
                        .map(|dt: DateTime<FixedOffset>| dt.into()),
                    intent: RakeIntent::SiteMap,
                });
            }
            SiteMapEntity::Err(error) => {
                debug!("Sitemap error {:?}", error);
            }
        }
        // Was a leftover `eprintln!` per entry; demoted to debug logging.
        debug!("{:?}", entry);
    }

    if urls.is_empty() {
        bail!("No URLs or Sitemaps picked up from sitemap; is it bad?");
    }

    Ok(urls)
}
/// Fetches and decodes `/robots.txt` for the host of `url`.
///
/// Returns `Ok(None)` when the server responds 404 or 410 (deliberately no
/// robots.txt); errors on any other non-success status.
pub async fn get_robots_txt_for(url: &Url, client: &Client) -> anyhow::Result<Option<RobotsTxt>> {
    let robots_url = url
        .join("/robots.txt")
        .context("Whilst resolving /robots.txt on URL")?;
    let resp = client.get(robots_url.clone()).send().await?;

    if !resp.status().is_success() {
        let code = resp.status().as_u16();
        if code == 404 || code == 410 {
            // not found or gone? Assume there is intentionally no robots.txt file.
            return Ok(None);
        }
        bail!("Failed to get {:?}: {:?}", robots_url, resp.status());
    }

    let bytes = resp.bytes().await?;
    // Propagate directly; `Ok(...?)` around an already-Result value was redundant.
    decode_robots_txt(&bytes).await
}
/// Decodes a robots.txt file: extracts `Sitemap:` lines and compiles the
/// exclusion rules for our user-agent.
///
/// Bug fix: the previous version lowercased the entire line before parsing the
/// sitemap URL, corrupting case-sensitive URL paths. Per the robots exclusion
/// protocol, only the field *name* is case-insensitive — so we now match the
/// `sitemap:` prefix case-insensitively but parse the value verbatim.
pub async fn decode_robots_txt(bytes: &[u8]) -> anyhow::Result<Option<RobotsTxt>> {
    let mut sitemaps = Vec::new();

    for line in bytes.split(|b| *b == b'\n') {
        if line.len() >= 8 && line[..8].eq_ignore_ascii_case(b"sitemap:") {
            if let Ok(value) = std::str::from_utf8(&line[8..]) {
                // `trim` also removes a trailing '\r' from CRLF line endings.
                if let Ok(url) = Url::parse(value.trim()) {
                    sitemaps.push(UrlRaked {
                        url,
                        last_changed: None,
                        intent: RakeIntent::SiteMap,
                    });
                } else {
                    debug!("Failed to parse sitemap value as a URL")
                }
            } else {
                debug!("Failed to parse sitemap value as UTF-8")
            }
        }
    }

    // `bytes` is already `&[u8]`; the previous `.as_bytes()` call was a no-op
    // routed through html5ever's `Slice` trait.
    let rules = cylon::Compiler::new(RAKER_USER_AGENT)
        .compile(bytes)
        .await?;

    Ok(Some(RobotsTxt { sitemaps, rules }))
}