Merge pull request #1 from JeWaVe/master

support crawl-delay
This commit is contained in:
Creston Bunch 2021-04-08 17:34:52 -05:00 committed by GitHub
commit 42e6d06983
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 164 additions and 39 deletions

45
Cargo.lock generated
View File

@ -247,9 +247,9 @@ dependencies = [
[[package]] [[package]]
name = "futures-channel" name = "futures-channel"
version = "0.3.12" version = "0.3.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846" checksum = "8c2dd2df839b57db9ab69c2c9d8f3e8c81984781937fe2807dc6dcf3b2ad2939"
dependencies = [ dependencies = [
"futures-core", "futures-core",
"futures-sink", "futures-sink",
@ -257,9 +257,9 @@ dependencies = [
[[package]] [[package]]
name = "futures-core" name = "futures-core"
version = "0.3.12" version = "0.3.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65" checksum = "15496a72fabf0e62bdc3df11a59a3787429221dd0710ba8ef163d6f7a9112c94"
[[package]] [[package]]
name = "futures-executor" name = "futures-executor"
@ -274,15 +274,15 @@ dependencies = [
[[package]] [[package]]
name = "futures-io" name = "futures-io"
version = "0.3.12" version = "0.3.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500" checksum = "d71c2c65c57704c32f5241c1223167c2c3294fd34ac020c807ddbe6db287ba59"
[[package]] [[package]]
name = "futures-macro" name = "futures-macro"
version = "0.3.12" version = "0.3.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd" checksum = "ea405816a5139fb39af82c2beb921d52143f556038378d6db21183a5c37fbfb7"
dependencies = [ dependencies = [
"proc-macro-hack", "proc-macro-hack",
"proc-macro2", "proc-macro2",
@ -292,24 +292,21 @@ dependencies = [
[[package]] [[package]]
name = "futures-sink" name = "futures-sink"
version = "0.3.12" version = "0.3.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6" checksum = "85754d98985841b7d4f5e8e6fbfa4a4ac847916893ec511a2917ccd8525b8bb3"
[[package]] [[package]]
name = "futures-task" name = "futures-task"
version = "0.3.12" version = "0.3.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86" checksum = "fa189ef211c15ee602667a6fcfe1c1fd9e07d42250d2156382820fba33c9df80"
dependencies = [
"once_cell",
]
[[package]] [[package]]
name = "futures-util" name = "futures-util"
version = "0.3.12" version = "0.3.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b" checksum = "1812c7ab8aedf8d6f2701a43e1243acdbcc2b36ab26e2ad421eb99ac963d96d1"
dependencies = [ dependencies = [
"futures-channel", "futures-channel",
"futures-core", "futures-core",
@ -428,12 +425,6 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "once_cell"
version = "1.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13bd41f508810a131401606d54ac32a467c97172d74ba7662562ebba5ad07fa0"
[[package]] [[package]]
name = "oorandom" name = "oorandom"
version = "11.1.3" version = "11.1.3"
@ -606,9 +597,9 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
[[package]] [[package]]
name = "serde" name = "serde"
version = "1.0.123" version = "1.0.124"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92d5161132722baa40d802cc70b15262b98258453e85e5d1d365c757c73869ae" checksum = "bd761ff957cb2a45fbb9ab3da6512de9de55872866160b23c25f1a841e99d29f"
[[package]] [[package]]
name = "serde_cbor" name = "serde_cbor"
@ -622,9 +613,9 @@ dependencies = [
[[package]] [[package]]
name = "serde_derive" name = "serde_derive"
version = "1.0.123" version = "1.0.124"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9391c295d64fc0abb2c556bad848f33cb8296276b1ad2677d1ae1ace4f258f31" checksum = "1800f7693e94e186f5e25a28291ae1570da908aff7d97a095dec1e56ff99069b"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",

View File

@ -10,6 +10,9 @@ license = "MIT"
keywords = ["robots", "txt", "parse", "compile"] keywords = ["robots", "txt", "parse", "compile"]
repository = "https://github.com/crestonbunch/cylon" repository = "https://github.com/crestonbunch/cylon"
[features]
crawl-delay = []
[dependencies] [dependencies]
futures-util = "0.3" futures-util = "0.3"
serde = "1.0" serde = "1.0"

View File

@ -9,6 +9,7 @@ Disallow: /
Allow: /a Allow: /a
Allow: /abc Allow: /abc
Allow: /b Allow: /b
Crawl-Delay: 20
"# "#
.as_bytes(); .as_bytes();

View File

@ -1,9 +1,14 @@
#[cfg(feature = "crawl-delay")]
use std::cmp::Ordering;
use serde_derive::{Deserialize, Serialize}; use serde_derive::{Deserialize, Serialize};
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Rule<'a> { pub enum Rule<'a> {
Allow(&'a str), Allow(&'a str),
Disallow(&'a str), Disallow(&'a str),
#[cfg(feature = "crawl-delay")]
Delay(&'a str),
} }
impl<'a> Rule<'a> { impl<'a> Rule<'a> {
@ -11,6 +16,8 @@ impl<'a> Rule<'a> {
match self { match self {
Rule::Allow(inner) => inner, Rule::Allow(inner) => inner,
Rule::Disallow(inner) => inner, Rule::Disallow(inner) => inner,
#[cfg(feature = "crawl-delay")]
Rule::Delay(inner) => inner,
} }
} }
} }
@ -29,6 +36,8 @@ struct Transition(Edge, usize);
enum State { enum State {
Allow, Allow,
Disallow, Disallow,
#[cfg(feature = "crawl-delay")]
Delay,
Intermediate, Intermediate,
} }
@ -40,12 +49,31 @@ enum State {
pub struct Cylon { pub struct Cylon {
states: Vec<State>, states: Vec<State>,
transitions: Vec<Vec<Transition>>, transitions: Vec<Vec<Transition>>,
#[cfg(feature = "crawl-delay")]
delay: Option<u64>,
} }
impl Cylon { impl Cylon {
/// Returns the crawl delay stored in this machine, if any.
///
/// When the machine is built by the feature-gated constructor path, the
/// stored value is the smallest `Delay` rule value that parsed as a `u64`;
/// it is `None` when there was no `Delay` rule or none of them parsed.
///
/// Only available when the `crawl-delay` feature is enabled.
// NOTE(review): the unit is presumably seconds, as is conventional for the
// robots.txt `Crawl-Delay` directive — confirm before documenting publicly.
#[cfg(feature = "crawl-delay")]
pub fn delay(&self) -> Option<u64> {
self.delay
}
/// Match whether the rules allow or disallow the target path. /// Match whether the rules allow or disallow the target path.
pub fn allow(&self, path: &str) -> bool { pub fn allow(&self, path: &str) -> bool {
let mut state = path.chars().fold(2, |state, path_char| { match self.states[self.state(path)] {
State::Allow => true,
State::Disallow => false,
// Intermediate states are not preserved in the DFA
#[cfg(feature = "crawl-delay")]
State::Intermediate | State::Delay => unreachable!(),
#[cfg(not(feature = "crawl-delay"))]
State::Intermediate => unreachable!(),
}
}
fn state(&self, path: &str) -> usize {
let state = path.chars().fold(2, |state, path_char| {
let t = &self.transitions[state]; let t = &self.transitions[state];
t.iter() t.iter()
.rev() .rev()
@ -64,7 +92,7 @@ impl Cylon {
// Follow the EoW transition, if necessary // Follow the EoW transition, if necessary
let t = &self.transitions[state]; let t = &self.transitions[state];
state = t t
.iter() .iter()
.rev() .rev()
.find(|transition| match transition { .find(|transition| match transition {
@ -73,14 +101,7 @@ impl Cylon {
_ => false, _ => false,
}) })
.map(|Transition(.., next_state)| *next_state) .map(|Transition(.., next_state)| *next_state)
.unwrap_or(state); .unwrap_or(state)
match self.states[state] {
State::Allow => true,
State::Disallow => false,
// Intermediate states are not preserved in the DFA
State::Intermediate => unreachable!(),
}
} }
/// Compile a machine from a list of rules. /// Compile a machine from a list of rules.
@ -114,6 +135,8 @@ impl Cylon {
State::Allow => 0, State::Allow => 0,
State::Disallow if last_char == Some('$') => wildcard_state, State::Disallow if last_char == Some('$') => wildcard_state,
State::Disallow => 1, State::Disallow => 1,
#[cfg(feature = "crawl-delay")]
State::Delay => 1,
State::Intermediate => wildcard_state, State::Intermediate => wildcard_state,
}; };
@ -155,6 +178,8 @@ impl Cylon {
let state = match (rule, eow) { let state = match (rule, eow) {
(Rule::Allow(..), true) => State::Allow, (Rule::Allow(..), true) => State::Allow,
(Rule::Disallow(..), true) => State::Disallow, (Rule::Disallow(..), true) => State::Disallow,
#[cfg(feature = "crawl-delay")]
(Rule::Delay(..), true) => State::Delay,
_ => State::Intermediate, _ => State::Intermediate,
}; };
@ -184,12 +209,42 @@ impl Cylon {
}); });
states.push(match state { states.push(match state {
#[cfg(feature = "crawl-delay")]
State::Allow | State::Disallow | State::Delay => state,
#[cfg(not(feature = "crawl-delay"))]
State::Allow | State::Disallow => state, State::Allow | State::Disallow => state,
State::Intermediate => states[wildcard_state], State::Intermediate => states[wildcard_state],
}); });
transitions.push(t); transitions.push(t);
} }
#[cfg(feature = "crawl-delay")]
{
let mut delays: Vec<Option<u64>> = rules.iter().filter(|rule| {
match rule {
Rule::Delay(_) => true,
_ => false
}
}).map(|r| {
r.inner().parse::<u64>().ok()
}).collect();
delays.sort_unstable_by(|a, b| {
match (a, b) {
(None, Some(_)) => Ordering::Greater,
(Some(_), None) => Ordering::Less,
(None, None) => Ordering::Equal,
(Some(aa), Some(bb)) => aa.cmp(bb)
}
});
Self {
delay: *delays.get(0).unwrap_or(&None),
states,
transitions,
}
}
#[cfg(not(feature = "crawl-delay"))]
Self { Self {
states, states,
transitions, transitions,

View File

@ -5,6 +5,8 @@ use futures_util::{
}; };
use serde_derive::{Deserialize, Serialize}; use serde_derive::{Deserialize, Serialize};
const UA_PREFIX: &str = "user-agent:"; const UA_PREFIX: &str = "user-agent:";
#[cfg(feature = "crawl-delay")]
const DELAY_PREFIX: &str = "crawl-delay:";
const ALLOW_PREFIX: &str = "allow:"; const ALLOW_PREFIX: &str = "allow:";
const DISALLOW_PREFIX: &str = "disallow:"; const DISALLOW_PREFIX: &str = "disallow:";
@ -12,6 +14,8 @@ const DISALLOW_PREFIX: &str = "disallow:";
enum ParsedRule { enum ParsedRule {
Allow(String), Allow(String),
Disallow(String), Disallow(String),
#[cfg(feature = "crawl-delay")]
Delay(String),
} }
impl<'a> Into<Rule<'a>> for &'a ParsedRule { impl<'a> Into<Rule<'a>> for &'a ParsedRule {
@ -19,6 +23,8 @@ impl<'a> Into<Rule<'a>> for &'a ParsedRule {
match self { match self {
ParsedRule::Allow(path) => Rule::Allow(&path[..]), ParsedRule::Allow(path) => Rule::Allow(&path[..]),
ParsedRule::Disallow(path) => Rule::Disallow(&path[..]), ParsedRule::Disallow(path) => Rule::Disallow(&path[..]),
#[cfg(feature = "crawl-delay")]
ParsedRule::Delay(delay) => Rule::Delay(delay),
} }
} }
} }
@ -167,11 +173,21 @@ fn parse_line(line: String) -> ParsedLine {
// This tries to parse lines roughly in order of most frequent kind to // This tries to parse lines roughly in order of most frequent kind to
// least frequent kind in order to minimize CPU cycles on average. // least frequent kind in order to minimize CPU cycles on average.
parse_disallow(line)
#[cfg(feature = "crawl-delay")]
return parse_disallow(line)
.map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into()))) .map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into())))
.or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase()))) .or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase())))
.or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into())))) .or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into()))))
.unwrap_or(ParsedLine::Nothing) .or_else(|| parse_delay(line).map(|s| ParsedLine::Rule(ParsedRule::Delay(s.into()))))
.unwrap_or(ParsedLine::Nothing);
#[cfg(not(feature = "crawl-delay"))]
return parse_disallow(line)
.map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into())))
.or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase())))
.or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into()))))
.unwrap_or(ParsedLine::Nothing);
} }
fn strip_comments(line: &str) -> &str { fn strip_comments(line: &str) -> &str {
@ -195,6 +211,21 @@ fn parse_user_agent(line: &str) -> Option<&str> {
} }
} }
/// Extracts the value of a `Crawl-Delay:` directive from a single line.
///
/// The directive name is compared ASCII case-insensitively against
/// `DELAY_PREFIX`. On a match the remainder of the line is returned with
/// surrounding whitespace trimmed; otherwise `None` is returned. The value
/// is returned as text and is not validated as a number here.
#[cfg(feature = "crawl-delay")]
fn parse_delay(line: &str) -> Option<&str> {
    // `str::get` yields `None` both when the line is shorter than the prefix
    // and when the cut would land inside a multi-byte UTF-8 character, so
    // arbitrary robots.txt content cannot trigger a slicing panic.
    let prefix = line.get(..DELAY_PREFIX.len())?;

    // Compare in place instead of allocating a lowercased copy of the prefix.
    if !prefix.eq_ignore_ascii_case(DELAY_PREFIX) {
        return None;
    }

    // The `get` above succeeded, so this offset is a valid char boundary.
    Some(line[DELAY_PREFIX.len()..].trim())
}
fn parse_allow(line: &str) -> Option<&str> { fn parse_allow(line: &str) -> Option<&str> {
if line.len() < ALLOW_PREFIX.len() { if line.len() < ALLOW_PREFIX.len() {
return None; return None;
@ -293,6 +324,46 @@ mod tests {
} }
} }
/// Checks that each user agent picks up the crawl delay declared in its own
/// group: a single value, the smaller of two values, or none at all.
#[test]
#[cfg(feature = "crawl-delay")]
fn test_crawl_delay() {
    tokio_test::block_on(async {
        let example_robots = r#"
User-agent: jones-bot
Disallow: /
Crawl-Delay: 30
User-agent: foobar
Crawl-Delay: 60
User-agent: googlebot
Allow: /
User-agent: barfoo
Crawl-Delay: 60
Crawl-Delay: 20
"#
        .as_bytes();

        // (user agent, delay expected for that agent's group)
        let expectations: [(&str, Option<u64>); 4] = [
            ("foobar", Some(60)),
            ("barfoo", Some(20)),
            ("jones-bot", Some(30)),
            ("googlebot", None),
        ];

        for (agent, expected) in expectations.iter() {
            let machine = Compiler::new(*agent)
                .compile(example_robots)
                .await
                .unwrap();
            assert_eq!(*expected, machine.delay());
        }
    });
}
#[test] #[test]
fn test_end_to_end() { fn test_end_to_end() {
tokio_test::block_on(async { tokio_test::block_on(async {
@ -300,6 +371,10 @@ mod tests {
User-agent: jones-bot User-agent: jones-bot
Disallow: / Disallow: /
User-agent: foo
Allow: /
Crawl-Delay: 20
User-agent: jones User-agent: jones
User-agent: foobar User-agent: foobar
Allow: / Allow: /