Completely isolate the crawl-delay code behind the `crawl-delay` Cargo feature flag

This commit is contained in:
r.portalez 2021-03-10 16:57:37 +01:00
parent e904e91972
commit f331be990f
2 changed files with 17 additions and 2 deletions

View File

@ -7,6 +7,7 @@ use serde_derive::{Deserialize, Serialize};
pub enum Rule<'a> { pub enum Rule<'a> {
Allow(&'a str), Allow(&'a str),
Disallow(&'a str), Disallow(&'a str),
#[cfg(feature = "crawl-delay")]
Delay(&'a str), Delay(&'a str),
} }
@ -15,6 +16,7 @@ impl<'a> Rule<'a> {
match self { match self {
Rule::Allow(inner) => inner, Rule::Allow(inner) => inner,
Rule::Disallow(inner) => inner, Rule::Disallow(inner) => inner,
#[cfg(feature = "crawl-delay")]
Rule::Delay(inner) => inner, Rule::Delay(inner) => inner,
} }
} }

View File

@ -5,6 +5,7 @@ use futures_util::{
}; };
use serde_derive::{Deserialize, Serialize}; use serde_derive::{Deserialize, Serialize};
const UA_PREFIX: &str = "user-agent:"; const UA_PREFIX: &str = "user-agent:";
#[cfg(feature = "crawl-delay")]
const DELAY_PREFIX: &str = "crawl-delay:"; const DELAY_PREFIX: &str = "crawl-delay:";
const ALLOW_PREFIX: &str = "allow:"; const ALLOW_PREFIX: &str = "allow:";
const DISALLOW_PREFIX: &str = "disallow:"; const DISALLOW_PREFIX: &str = "disallow:";
@ -13,6 +14,7 @@ const DISALLOW_PREFIX: &str = "disallow:";
enum ParsedRule { enum ParsedRule {
Allow(String), Allow(String),
Disallow(String), Disallow(String),
#[cfg(feature = "crawl-delay")]
Delay(String), Delay(String),
} }
@ -21,6 +23,7 @@ impl<'a> Into<Rule<'a>> for &'a ParsedRule {
match self { match self {
ParsedRule::Allow(path) => Rule::Allow(&path[..]), ParsedRule::Allow(path) => Rule::Allow(&path[..]),
ParsedRule::Disallow(path) => Rule::Disallow(&path[..]), ParsedRule::Disallow(path) => Rule::Disallow(&path[..]),
#[cfg(feature = "crawl-delay")]
ParsedRule::Delay(delay) => Rule::Delay(delay), ParsedRule::Delay(delay) => Rule::Delay(delay),
} }
} }
@ -170,12 +173,21 @@ fn parse_line(line: String) -> ParsedLine {
// This tries to parse lines roughly in order of most frequent kind to // This tries to parse lines roughly in order of most frequent kind to
// least frequent kind in order to minimize CPU cycles on average. // least frequent kind in order to minimize CPU cycles on average.
parse_disallow(line)
#[cfg(feature = "crawl-delay")]
return parse_disallow(line)
.map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into()))) .map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into())))
.or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase()))) .or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase())))
.or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into())))) .or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into()))))
.or_else(|| parse_delay(line).map(|s| ParsedLine::Rule(ParsedRule::Delay(s.into())))) .or_else(|| parse_delay(line).map(|s| ParsedLine::Rule(ParsedRule::Delay(s.into()))))
.unwrap_or(ParsedLine::Nothing) .unwrap_or(ParsedLine::Nothing);
#[cfg(not(feature = "crawl-delay"))]
return parse_disallow(line)
.map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into())))
.or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase())))
.or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into()))))
.unwrap_or(ParsedLine::Nothing);
} }
fn strip_comments(line: &str) -> &str { fn strip_comments(line: &str) -> &str {
@ -199,6 +211,7 @@ fn parse_user_agent(line: &str) -> Option<&str> {
} }
} }
#[cfg(feature = "crawl-delay")]
fn parse_delay(line: &str) -> Option<&str> { fn parse_delay(line: &str) -> Option<&str> {
if line.len() < DELAY_PREFIX.len() { if line.len() < DELAY_PREFIX.len() {
return None; return None;