Respect nofollow and noindex <meta> robots tags
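Along with doing the right thing, this should speed up raking for us: noindex pages are now recorded as permanent failures (IndexingDenied) and nofollow pages contribute no references to follow.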
This commit is contained in:
parent
18d2023550
commit
1e8aa95e7a
|
@ -287,7 +287,8 @@ impl DenseTreeBuilder {
|
||||||
let nofollow = attrs
|
let nofollow = attrs
|
||||||
.get("rel")
|
.get("rel")
|
||||||
.map(|rel: &str| {
|
.map(|rel: &str| {
|
||||||
rel.split_whitespace()
|
rel.split(|c: char| c.is_whitespace() || c == ',')
|
||||||
|
.filter(|s| !s.is_empty())
|
||||||
.any(|rel_word: &str| rel_word.eq_ignore_ascii_case("nofollow"))
|
.any(|rel_word: &str| rel_word.eq_ignore_ascii_case("nofollow"))
|
||||||
})
|
})
|
||||||
.unwrap_or(false);
|
.unwrap_or(false);
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{BTreeSet, HashMap, HashSet};
|
||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
use std::fmt::{Debug, Display, Formatter};
|
use std::fmt::{Debug, Display, Formatter};
|
||||||
use std::io::Cursor;
|
use std::io::Cursor;
|
||||||
|
@ -15,7 +15,7 @@ use image::imageops::FilterType;
|
||||||
use image::{GenericImageView, ImageFormat};
|
use image::{GenericImageView, ImageFormat};
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use log::{debug, info};
|
use log::{debug, info, warn};
|
||||||
use reqwest::header::HeaderMap;
|
use reqwest::header::HeaderMap;
|
||||||
use reqwest::{Client, Response, Url};
|
use reqwest::{Client, Response, Url};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
@ -112,6 +112,7 @@ pub enum TemporaryFailureReason {
|
||||||
pub enum PermanentFailureReason {
|
pub enum PermanentFailureReason {
|
||||||
ResourceDenied(u16),
|
ResourceDenied(u16),
|
||||||
DeniedToRobots,
|
DeniedToRobots,
|
||||||
|
IndexingDenied,
|
||||||
WrongLanguage(String),
|
WrongLanguage(String),
|
||||||
UnknownContentType(String),
|
UnknownContentType(String),
|
||||||
ExceedsSizeLimit,
|
ExceedsSizeLimit,
|
||||||
|
@ -505,8 +506,34 @@ impl Raker {
|
||||||
document,
|
document,
|
||||||
feeds,
|
feeds,
|
||||||
antifeature_flags,
|
antifeature_flags,
|
||||||
|
no_follow,
|
||||||
|
no_index,
|
||||||
} => {
|
} => {
|
||||||
let references = references::find_references(&unreadable_document, &feeds, url);
|
if no_index {
|
||||||
|
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
|
||||||
|
reason: PermanentFailureReason::IndexingDenied,
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut references = references::find_references(&unreadable_document, &feeds, url);
|
||||||
|
|
||||||
|
if no_follow {
|
||||||
|
// Remove any link references
|
||||||
|
for reference in references {
|
||||||
|
match reference.kind {
|
||||||
|
ReferenceKind::Link | ReferenceKind::HeaderLinkedFeed => (),
|
||||||
|
ReferenceKind::CanonicalUrl
|
||||||
|
| ReferenceKind::FeedEntry
|
||||||
|
| ReferenceKind::SitemapEntry
|
||||||
|
| ReferenceKind::SecureUpgrade
|
||||||
|
| ReferenceKind::Redirect => {
|
||||||
|
warn!("unexpected: refkind of {:?} being filtered due to meta nofollow. This is a bug.", reference.kind);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
references = BTreeSet::new();
|
||||||
|
}
|
||||||
|
|
||||||
Ok(RakeOutcome::RakedPage(RakedPage {
|
Ok(RakeOutcome::RakedPage(RakedPage {
|
||||||
page_entry: RakedPageEntry {
|
page_entry: RakedPageEntry {
|
||||||
analysed_antifeatures: antifeature_flags,
|
analysed_antifeatures: antifeature_flags,
|
||||||
|
|
|
@ -156,6 +156,25 @@ impl PageExtractionServiceInternal {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let mut no_follow = false;
|
||||||
|
let mut no_index = false;
|
||||||
|
|
||||||
|
// Find any restrictions on indexing this page or following any links.
|
||||||
|
if let Ok(robots_nodes) = root_node.select("meta[name=robots]") {
|
||||||
|
for node in robots_nodes {
|
||||||
|
if let Some(content) = node.attributes.borrow().get("content") {
|
||||||
|
for directive in content
|
||||||
|
.split(|c: char| c.is_whitespace() || c == ',')
|
||||||
|
.filter(|s| !s.is_empty())
|
||||||
|
{
|
||||||
|
let none = directive.eq_ignore_ascii_case("none");
|
||||||
|
no_follow |= directive.eq_ignore_ascii_case("nofollow") | none;
|
||||||
|
no_index |= directive.eq_ignore_ascii_case("noindex") | none;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if language.is_none() {
|
if language.is_none() {
|
||||||
// Next fallback: prefer the content-language header baked into the page itself
|
// Next fallback: prefer the content-language header baked into the page itself
|
||||||
if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") {
|
if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") {
|
||||||
|
@ -311,6 +330,8 @@ impl PageExtractionServiceInternal {
|
||||||
document,
|
document,
|
||||||
feeds,
|
feeds,
|
||||||
antifeature_flags,
|
antifeature_flags,
|
||||||
|
no_follow,
|
||||||
|
no_index,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -362,6 +383,8 @@ pub enum ExtractedPage {
|
||||||
document: DenseDocument,
|
document: DenseDocument,
|
||||||
feeds: Vec<Url>,
|
feeds: Vec<Url>,
|
||||||
antifeature_flags: AnalysisAntifeatures,
|
antifeature_flags: AnalysisAntifeatures,
|
||||||
|
no_follow: bool,
|
||||||
|
no_index: bool,
|
||||||
},
|
},
|
||||||
Redirect {
|
Redirect {
|
||||||
reason: RedirectReason,
|
reason: RedirectReason,
|
||||||
|
|
Loading…
Reference in New Issue