Respect nofollow and noindex <meta> robots tags

Besides being the correct behaviour, this should speed up raking for us: noindex pages are rejected outright and references found on nofollow pages are discarded.
2023-03-30 23:09:39 +01:00 · 2023-03-30 23:09:39 +01:00 · 1e8aa95e7a
commit 1e8aa95e7a
parent 18d2023550
3 changed files with 55 additions and 4 deletions
--- a/quickpeep_densedoc/src/lib.rs
+++ b/quickpeep_densedoc/src/lib.rs
@@ -287,7 +287,8 @@ impl DenseTreeBuilder {
                        let nofollow = attrs
                            .get("rel")
                            .map(|rel: &str| {
-                                rel.split_whitespace()
+                                rel.split(|c: char| c.is_whitespace() || c == ',')
                                    .filter(|s| !s.is_empty())
                                    .any(|rel_word: &str| rel_word.eq_ignore_ascii_case("nofollow"))
                            })
                            .unwrap_or(false);
--- a/quickpeep_raker/src/raking.rs
+++ b/quickpeep_raker/src/raking.rs
@@ -1,4 +1,4 @@
-use std::collections::{HashMap, HashSet};
+use std::collections::{BTreeSet, HashMap, HashSet};
 use std::error::Error;
 use std::fmt::{Debug, Display, Formatter};
 use std::io::Cursor;
@@ -15,7 +15,7 @@ use image::imageops::FilterType;
 use image::{GenericImageView, ImageFormat};
 use itertools::Itertools;
 use lazy_static::lazy_static;
-use log::{debug, info};
+use log::{debug, info, warn};
 use reqwest::header::HeaderMap;
 use reqwest::{Client, Response, Url};
 use serde::{Deserialize, Serialize};
@@ -112,6 +112,7 @@ pub enum TemporaryFailureReason {
 pub enum PermanentFailureReason {
    ResourceDenied(u16),
    DeniedToRobots,
+    IndexingDenied,
    WrongLanguage(String),
    UnknownContentType(String),
    ExceedsSizeLimit,
@@ -505,8 +506,34 @@ impl Raker {
                document,
                feeds,
                antifeature_flags,
+                no_follow,
+                no_index,
            } => {
-                let references = references::find_references(&unreadable_document, &feeds, url);
+                if no_index {
+                    return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
+                        reason: PermanentFailureReason::IndexingDenied,
+                    }));
+                }
+                let mut references = references::find_references(&unreadable_document, &feeds, url);
+                if no_follow {
+                    // Remove any link references
+                    for reference in references {
+                        match reference.kind {
+                            ReferenceKind::Link | ReferenceKind::HeaderLinkedFeed => (),
+                            ReferenceKind::CanonicalUrl
+                            | ReferenceKind::FeedEntry
+                            | ReferenceKind::SitemapEntry
+                            | ReferenceKind::SecureUpgrade
+                            | ReferenceKind::Redirect => {
+                                warn!("unexpected: refkind of {:?} being filtered due to meta nofollow. This is a bug.", reference.kind);
+                            }
+                        }
+                    }
+                    references = BTreeSet::new();
+                }
                Ok(RakeOutcome::RakedPage(RakedPage {
                    page_entry: RakedPageEntry {
                        analysed_antifeatures: antifeature_flags,
--- a/quickpeep_raker/src/raking/page_extraction.rs
+++ b/quickpeep_raker/src/raking/page_extraction.rs
@@ -156,6 +156,25 @@ impl PageExtractionServiceInternal {
            }
        }
+        let mut no_follow = false;
+        let mut no_index = false;
+        // Find any restrictions on indexing this page or following any links.
+        if let Ok(robots_nodes) = root_node.select("meta[name=robots]") {
+            for node in robots_nodes {
+                if let Some(content) = node.attributes.borrow().get("content") {
+                    for directive in content
+                        .split(|c: char| c.is_whitespace() || c == ',')
+                        .filter(|s| !s.is_empty())
+                    {
+                        let none = directive.eq_ignore_ascii_case("none");
+                        no_follow |= directive.eq_ignore_ascii_case("nofollow") | none;
+                        no_index |= directive.eq_ignore_ascii_case("noindex") | none;
+                    }
+                }
+            }
+        }
        if language.is_none() {
            // Next fallback: prefer the content-language header baked into the page itself
            if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") {
@@ -311,6 +330,8 @@ impl PageExtractionServiceInternal {
            document,
            feeds,
            antifeature_flags,
+            no_follow,
+            no_index,
        })
    }
 }
@@ -362,6 +383,8 @@ pub enum ExtractedPage {
        document: DenseDocument,
        feeds: Vec<Url>,
        antifeature_flags: AnalysisAntifeatures,
+        no_follow: bool,
+        no_index: bool,
    },
    Redirect {
        reason: RedirectReason,