From 1e8aa95e7a93766e407cf2e08ae42504b16de27f Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Thu, 30 Mar 2023 23:09:39 +0100 Subject: [PATCH] Respect nofollow and noindex robots tags Along with doing the right thing, this should speed up raking for us --- quickpeep_densedoc/src/lib.rs | 3 +- quickpeep_raker/src/raking.rs | 33 +++++++++++++++++-- quickpeep_raker/src/raking/page_extraction.rs | 23 +++++++++++++ 3 files changed, 55 insertions(+), 4 deletions(-) diff --git a/quickpeep_densedoc/src/lib.rs b/quickpeep_densedoc/src/lib.rs index e4af533..6741375 100644 --- a/quickpeep_densedoc/src/lib.rs +++ b/quickpeep_densedoc/src/lib.rs @@ -287,7 +287,8 @@ impl DenseTreeBuilder { let nofollow = attrs .get("rel") .map(|rel: &str| { - rel.split_whitespace() + rel.split(|c: char| c.is_whitespace() || c == ',') + .filter(|s| !s.is_empty()) .any(|rel_word: &str| rel_word.eq_ignore_ascii_case("nofollow")) }) .unwrap_or(false); diff --git a/quickpeep_raker/src/raking.rs b/quickpeep_raker/src/raking.rs index 38c3937..78e3dae 100644 --- a/quickpeep_raker/src/raking.rs +++ b/quickpeep_raker/src/raking.rs @@ -1,4 +1,4 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeSet, HashMap, HashSet}; use std::error::Error; use std::fmt::{Debug, Display, Formatter}; use std::io::Cursor; @@ -15,7 +15,7 @@ use image::imageops::FilterType; use image::{GenericImageView, ImageFormat}; use itertools::Itertools; use lazy_static::lazy_static; -use log::{debug, info}; +use log::{debug, info, warn}; use reqwest::header::HeaderMap; use reqwest::{Client, Response, Url}; use serde::{Deserialize, Serialize}; @@ -112,6 +112,7 @@ pub enum TemporaryFailureReason { pub enum PermanentFailureReason { ResourceDenied(u16), DeniedToRobots, + IndexingDenied, WrongLanguage(String), UnknownContentType(String), ExceedsSizeLimit, @@ -505,8 +506,34 @@ impl Raker { document, feeds, antifeature_flags, + no_follow, + no_index, } => { - let references = references::find_references(&unreadable_document, &feeds, url); + if no_index { + return Ok(RakeOutcome::PermanentFailure(PermanentFailure { + reason: PermanentFailureReason::IndexingDenied, + })); + } + + let mut references = references::find_references(&unreadable_document, &feeds, url); + + if no_follow { + // Remove any link references + for reference in references { + match reference.kind { + ReferenceKind::Link | ReferenceKind::HeaderLinkedFeed => (), + ReferenceKind::CanonicalUrl + | ReferenceKind::FeedEntry + | ReferenceKind::SitemapEntry + | ReferenceKind::SecureUpgrade + | ReferenceKind::Redirect => { + warn!("unexpected: refkind of {:?} being filtered due to meta nofollow. This is a bug.", reference.kind); + } + } + } + references = BTreeSet::new(); + } + Ok(RakeOutcome::RakedPage(RakedPage { page_entry: RakedPageEntry { analysed_antifeatures: antifeature_flags, diff --git a/quickpeep_raker/src/raking/page_extraction.rs b/quickpeep_raker/src/raking/page_extraction.rs index 2ffb595..4ff6612 100644 --- a/quickpeep_raker/src/raking/page_extraction.rs +++ b/quickpeep_raker/src/raking/page_extraction.rs @@ -156,6 +156,25 @@ impl PageExtractionServiceInternal { } } + let mut no_follow = false; + let mut no_index = false; + + // Find any restrictions on indexing this page or following any links. + if let Ok(robots_nodes) = root_node.select("meta[name=robots]") { + for node in robots_nodes { + if let Some(content) = node.attributes.borrow().get("content") { + for directive in content + .split(|c: char| c.is_whitespace() || c == ',') + .filter(|s| !s.is_empty()) + { + let none = directive.eq_ignore_ascii_case("none"); + no_follow |= directive.eq_ignore_ascii_case("nofollow") | none; + no_index |= directive.eq_ignore_ascii_case("noindex") | none; + } + } + } + } + if language.is_none() { // Next fallback: prefer the content-language header baked into the page itself if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") { @@ -311,6 +330,8 @@ impl PageExtractionServiceInternal { document, feeds, antifeature_flags, + no_follow, + no_index, }) } } @@ -362,6 +383,8 @@ pub enum ExtractedPage { document: DenseDocument, feeds: Vec, antifeature_flags: AnalysisAntifeatures, + no_follow: bool, + no_index: bool, }, Redirect { reason: RedirectReason,