Respect nofollow and noindex <meta> robots tags
Some checks failed
ci/woodpecker/push/check Pipeline failed
ci/woodpecker/push/manual Pipeline failed
ci/woodpecker/push/release Pipeline was successful

Along with doing the right thing, this should speed up raking for us: pages marked noindex are rejected early, and references found on nofollow pages are discarded.
This commit is contained in:
Olivier 'reivilibre' 2023-03-30 23:09:39 +01:00
parent 18d2023550
commit 1e8aa95e7a
3 changed files with 55 additions and 4 deletions

View File

@ -287,7 +287,8 @@ impl DenseTreeBuilder {
let nofollow = attrs let nofollow = attrs
.get("rel") .get("rel")
.map(|rel: &str| { .map(|rel: &str| {
rel.split_whitespace() rel.split(|c: char| c.is_whitespace() || c == ',')
.filter(|s| !s.is_empty())
.any(|rel_word: &str| rel_word.eq_ignore_ascii_case("nofollow")) .any(|rel_word: &str| rel_word.eq_ignore_ascii_case("nofollow"))
}) })
.unwrap_or(false); .unwrap_or(false);

View File

@ -1,4 +1,4 @@
use std::collections::{HashMap, HashSet}; use std::collections::{BTreeSet, HashMap, HashSet};
use std::error::Error; use std::error::Error;
use std::fmt::{Debug, Display, Formatter}; use std::fmt::{Debug, Display, Formatter};
use std::io::Cursor; use std::io::Cursor;
@ -15,7 +15,7 @@ use image::imageops::FilterType;
use image::{GenericImageView, ImageFormat}; use image::{GenericImageView, ImageFormat};
use itertools::Itertools; use itertools::Itertools;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use log::{debug, info}; use log::{debug, info, warn};
use reqwest::header::HeaderMap; use reqwest::header::HeaderMap;
use reqwest::{Client, Response, Url}; use reqwest::{Client, Response, Url};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@ -112,6 +112,7 @@ pub enum TemporaryFailureReason {
pub enum PermanentFailureReason { pub enum PermanentFailureReason {
ResourceDenied(u16), ResourceDenied(u16),
DeniedToRobots, DeniedToRobots,
IndexingDenied,
WrongLanguage(String), WrongLanguage(String),
UnknownContentType(String), UnknownContentType(String),
ExceedsSizeLimit, ExceedsSizeLimit,
@ -505,8 +506,34 @@ impl Raker {
document, document,
feeds, feeds,
antifeature_flags, antifeature_flags,
no_follow,
no_index,
} => { } => {
let references = references::find_references(&unreadable_document, &feeds, url); if no_index {
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
reason: PermanentFailureReason::IndexingDenied,
}));
}
let mut references = references::find_references(&unreadable_document, &feeds, url);
if no_follow {
// Remove any link references
for reference in references {
match reference.kind {
ReferenceKind::Link | ReferenceKind::HeaderLinkedFeed => (),
ReferenceKind::CanonicalUrl
| ReferenceKind::FeedEntry
| ReferenceKind::SitemapEntry
| ReferenceKind::SecureUpgrade
| ReferenceKind::Redirect => {
warn!("unexpected: refkind of {:?} being filtered due to meta nofollow. This is a bug.", reference.kind);
}
}
}
references = BTreeSet::new();
}
Ok(RakeOutcome::RakedPage(RakedPage { Ok(RakeOutcome::RakedPage(RakedPage {
page_entry: RakedPageEntry { page_entry: RakedPageEntry {
analysed_antifeatures: antifeature_flags, analysed_antifeatures: antifeature_flags,

View File

@ -156,6 +156,25 @@ impl PageExtractionServiceInternal {
} }
} }
let mut no_follow = false;
let mut no_index = false;
// Find any restrictions on indexing this page or following any links.
if let Ok(robots_nodes) = root_node.select("meta[name=robots]") {
for node in robots_nodes {
if let Some(content) = node.attributes.borrow().get("content") {
for directive in content
.split(|c: char| c.is_whitespace() || c == ',')
.filter(|s| !s.is_empty())
{
let none = directive.eq_ignore_ascii_case("none");
no_follow |= directive.eq_ignore_ascii_case("nofollow") | none;
no_index |= directive.eq_ignore_ascii_case("noindex") | none;
}
}
}
}
if language.is_none() { if language.is_none() {
// Next fallback: prefer the content-language header baked into the page itself // Next fallback: prefer the content-language header baked into the page itself
if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") { if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") {
@ -311,6 +330,8 @@ impl PageExtractionServiceInternal {
document, document,
feeds, feeds,
antifeature_flags, antifeature_flags,
no_follow,
no_index,
}) })
} }
} }
@ -362,6 +383,8 @@ pub enum ExtractedPage {
document: DenseDocument, document: DenseDocument,
feeds: Vec<Url>, feeds: Vec<Url>,
antifeature_flags: AnalysisAntifeatures, antifeature_flags: AnalysisAntifeatures,
no_follow: bool,
no_index: bool,
}, },
Redirect { Redirect {
reason: RedirectReason, reason: RedirectReason,