Respect nofollow and noindex <meta> robots tags
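Besides being the correct behaviour, this should also speed up raking for us: pages marked noindex are rejected outright, and references found on nofollow pages are discarded.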
This commit is contained in:
parent
18d2023550
commit
1e8aa95e7a
|
@ -287,7 +287,8 @@ impl DenseTreeBuilder {
|
|||
let nofollow = attrs
|
||||
.get("rel")
|
||||
.map(|rel: &str| {
|
||||
rel.split_whitespace()
|
||||
rel.split(|c: char| c.is_whitespace() || c == ',')
|
||||
.filter(|s| !s.is_empty())
|
||||
.any(|rel_word: &str| rel_word.eq_ignore_ascii_case("nofollow"))
|
||||
})
|
||||
.unwrap_or(false);
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
use std::collections::{HashMap, HashSet};
|
||||
use std::collections::{BTreeSet, HashMap, HashSet};
|
||||
use std::error::Error;
|
||||
use std::fmt::{Debug, Display, Formatter};
|
||||
use std::io::Cursor;
|
||||
|
@ -15,7 +15,7 @@ use image::imageops::FilterType;
|
|||
use image::{GenericImageView, ImageFormat};
|
||||
use itertools::Itertools;
|
||||
use lazy_static::lazy_static;
|
||||
use log::{debug, info};
|
||||
use log::{debug, info, warn};
|
||||
use reqwest::header::HeaderMap;
|
||||
use reqwest::{Client, Response, Url};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
@ -112,6 +112,7 @@ pub enum TemporaryFailureReason {
|
|||
pub enum PermanentFailureReason {
|
||||
ResourceDenied(u16),
|
||||
DeniedToRobots,
|
||||
IndexingDenied,
|
||||
WrongLanguage(String),
|
||||
UnknownContentType(String),
|
||||
ExceedsSizeLimit,
|
||||
|
@ -505,8 +506,34 @@ impl Raker {
|
|||
document,
|
||||
feeds,
|
||||
antifeature_flags,
|
||||
no_follow,
|
||||
no_index,
|
||||
} => {
|
||||
let references = references::find_references(&unreadable_document, &feeds, url);
|
||||
if no_index {
|
||||
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
|
||||
reason: PermanentFailureReason::IndexingDenied,
|
||||
}));
|
||||
}
|
||||
|
||||
let mut references = references::find_references(&unreadable_document, &feeds, url);
|
||||
|
||||
if no_follow {
|
||||
// Remove any link references
|
||||
for reference in references {
|
||||
match reference.kind {
|
||||
ReferenceKind::Link | ReferenceKind::HeaderLinkedFeed => (),
|
||||
ReferenceKind::CanonicalUrl
|
||||
| ReferenceKind::FeedEntry
|
||||
| ReferenceKind::SitemapEntry
|
||||
| ReferenceKind::SecureUpgrade
|
||||
| ReferenceKind::Redirect => {
|
||||
warn!("unexpected: refkind of {:?} being filtered due to meta nofollow. This is a bug.", reference.kind);
|
||||
}
|
||||
}
|
||||
}
|
||||
references = BTreeSet::new();
|
||||
}
|
||||
|
||||
Ok(RakeOutcome::RakedPage(RakedPage {
|
||||
page_entry: RakedPageEntry {
|
||||
analysed_antifeatures: antifeature_flags,
|
||||
|
|
|
@ -156,6 +156,25 @@ impl PageExtractionServiceInternal {
|
|||
}
|
||||
}
|
||||
|
||||
let mut no_follow = false;
|
||||
let mut no_index = false;
|
||||
|
||||
// Find any restrictions on indexing this page or following any links.
|
||||
if let Ok(robots_nodes) = root_node.select("meta[name=robots]") {
|
||||
for node in robots_nodes {
|
||||
if let Some(content) = node.attributes.borrow().get("content") {
|
||||
for directive in content
|
||||
.split(|c: char| c.is_whitespace() || c == ',')
|
||||
.filter(|s| !s.is_empty())
|
||||
{
|
||||
let none = directive.eq_ignore_ascii_case("none");
|
||||
no_follow |= directive.eq_ignore_ascii_case("nofollow") | none;
|
||||
no_index |= directive.eq_ignore_ascii_case("noindex") | none;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if language.is_none() {
|
||||
// Next fallback: prefer the content-language header baked into the page itself
|
||||
if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") {
|
||||
|
@ -311,6 +330,8 @@ impl PageExtractionServiceInternal {
|
|||
document,
|
||||
feeds,
|
||||
antifeature_flags,
|
||||
no_follow,
|
||||
no_index,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -362,6 +383,8 @@ pub enum ExtractedPage {
|
|||
document: DenseDocument,
|
||||
feeds: Vec<Url>,
|
||||
antifeature_flags: AnalysisAntifeatures,
|
||||
no_follow: bool,
|
||||
no_index: bool,
|
||||
},
|
||||
Redirect {
|
||||
reason: RedirectReason,
|
||||
|
|
Loading…
Reference in New Issue