add crawl-delay

This commit is contained in:
r.portalez 2021-03-10 15:57:55 +01:00
parent 86ee746b96
commit fe11216642
4 changed files with 979 additions and 902 deletions

View File

@ -11,9 +11,9 @@ keywords = ["robots", "txt", "parse", "compile"]
repository = "https://github.com/crestonbunch/cylon" repository = "https://github.com/crestonbunch/cylon"
[dependencies] [dependencies]
futures-util = "0.3.13" futures-util = "0.3"
serde = "1.0.124" serde = "1.0"
serde_derive = "1.0.124" serde_derive = "1.0"
[dev-dependencies] [dev-dependencies]
criterion = { version = "0.3", features = ["async_futures"] } criterion = { version = "0.3", features = ["async_futures"] }

View File

@ -9,6 +9,7 @@ Disallow: /
Allow: /a Allow: /a
Allow: /abc Allow: /abc
Allow: /b Allow: /b
Crawl-Delay: 20
"# "#
.as_bytes(); .as_bytes();

View File

@ -1,10 +1,12 @@
use std::cmp::Ordering;
use serde_derive::{Deserialize, Serialize}; use serde_derive::{Deserialize, Serialize};
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Rule<'a> { pub enum Rule<'a> {
Allow(&'a str), Allow(&'a str),
Disallow(&'a str), Disallow(&'a str),
Delay(u64), Delay(&'a str),
} }
impl<'a> Rule<'a> { impl<'a> Rule<'a> {
@ -31,6 +33,7 @@ struct Transition(Edge, usize);
enum State { enum State {
Allow, Allow,
Disallow, Disallow,
Delay,
Intermediate, Intermediate,
} }
@ -42,12 +45,26 @@ enum State {
pub struct Cylon { pub struct Cylon {
states: Vec<State>, states: Vec<State>,
transitions: Vec<Vec<Transition>>, transitions: Vec<Vec<Transition>>,
delay: Option<u64>,
} }
impl Cylon { impl Cylon {
/// Returns the crawl delay stored in this machine, or `None` when no
/// delay is stored.
// NOTE(review): the unit is presumably seconds, as in the robots.txt
// `Crawl-delay` convention — confirm against the parser.
pub fn delay(&self) -> Option<u64> {
self.delay
}
/// Match whether the rules allow or disallow the target path. /// Match whether the rules allow or disallow the target path.
pub fn allow(&self, path: &str) -> bool { pub fn allow(&self, path: &str) -> bool {
let mut state = path.chars().fold(2, |state, path_char| { match self.states[self.state(path)] {
State::Allow => true,
State::Disallow => false,
// Intermediate states are not preserved in the DFA
State::Intermediate | State::Delay => unreachable!(),
}
}
fn state(&self, path: &str) -> usize {
let state = path.chars().fold(2, |state, path_char| {
let t = &self.transitions[state]; let t = &self.transitions[state];
t.iter() t.iter()
.rev() .rev()
@ -66,7 +83,7 @@ impl Cylon {
// Follow the EoW transition, if necessary // Follow the EoW transition, if necessary
let t = &self.transitions[state]; let t = &self.transitions[state];
state = t t
.iter() .iter()
.rev() .rev()
.find(|transition| match transition { .find(|transition| match transition {
@ -75,14 +92,7 @@ impl Cylon {
_ => false, _ => false,
}) })
.map(|Transition(.., next_state)| *next_state) .map(|Transition(.., next_state)| *next_state)
.unwrap_or(state); .unwrap_or(state)
match self.states[state] {
State::Allow => true,
State::Disallow => false,
// Intermediate states are not preserved in the DFA
State::Intermediate => unreachable!(),
}
} }
/// Compile a machine from a list of rules. /// Compile a machine from a list of rules.
@ -116,6 +126,7 @@ impl Cylon {
State::Allow => 0, State::Allow => 0,
State::Disallow if last_char == Some('$') => wildcard_state, State::Disallow if last_char == Some('$') => wildcard_state,
State::Disallow => 1, State::Disallow => 1,
State::Delay => 1,
State::Intermediate => wildcard_state, State::Intermediate => wildcard_state,
}; };
@ -157,6 +168,7 @@ impl Cylon {
let state = match (rule, eow) { let state = match (rule, eow) {
(Rule::Allow(..), true) => State::Allow, (Rule::Allow(..), true) => State::Allow,
(Rule::Disallow(..), true) => State::Disallow, (Rule::Disallow(..), true) => State::Disallow,
(Rule::Delay(..), true) => State::Delay,
_ => State::Intermediate, _ => State::Intermediate,
}; };
@ -186,13 +198,34 @@ impl Cylon {
}); });
states.push(match state { states.push(match state {
State::Allow | State::Disallow => state, State::Allow | State::Disallow | State::Delay => state,
State::Intermediate => states[wildcard_state], State::Intermediate => states[wildcard_state],
}); });
transitions.push(t); transitions.push(t);
} }
let mut delays: Vec<Option<u64>> = rules.iter().filter(|rule| {
match rule {
Rule::Delay(_) => true,
_ => false
}
}).map(|r| {
r.inner().parse::<u64>().ok()
}).collect();
delays.sort_unstable_by(|a, b| {
match (a, b) {
(None, Some(_)) => Ordering::Greater,
(Some(_), None) => Ordering::Less,
(None, None) => Ordering::Equal,
(Some(aa), Some(bb)) => aa.cmp(bb)
}
});
Self { Self {
delay: *delays.get(0).unwrap_or(&None),
states, states,
transitions, transitions,
} }

View File

@ -13,7 +13,7 @@ const DISALLOW_PREFIX: &str = "disallow:";
enum ParsedRule { enum ParsedRule {
Allow(String), Allow(String),
Disallow(String), Disallow(String),
Delay(u64), Delay(String),
} }
impl<'a> Into<Rule<'a>> for &'a ParsedRule { impl<'a> Into<Rule<'a>> for &'a ParsedRule {
@ -21,7 +21,7 @@ impl<'a> Into<Rule<'a>> for &'a ParsedRule {
match self { match self {
ParsedRule::Allow(path) => Rule::Allow(&path[..]), ParsedRule::Allow(path) => Rule::Allow(&path[..]),
ParsedRule::Disallow(path) => Rule::Disallow(&path[..]), ParsedRule::Disallow(path) => Rule::Disallow(&path[..]),
ParsedRule::Delay(delay) => Rule.Delay(delay), ParsedRule::Delay(delay) => Rule::Delay(delay),
} }
} }
} }
@ -199,7 +199,7 @@ fn parse_user_agent(line: &str) -> Option<&str> {
} }
} }
fn parse_delay(line: &str) -> Option<u64> { fn parse_delay(line: &str) -> Option<&str> {
if line.len() < DELAY_PREFIX.len() { if line.len() < DELAY_PREFIX.len() {
return None; return None;
} }
@ -311,6 +311,45 @@ mod tests {
} }
} }
// Compiles the same robots.txt fixture for four different user agents and
// checks the value reported by `delay()` on each compiled machine.
#[test]
fn test_crawl_delay() {
tokio_test::block_on(async {
// Fixture: four user-agent groups. `jones-bot` and `foobar` each carry one
// Crawl-Delay line, `googlebot` carries none, and `barfoo` carries two.
let example_robots = r#"
User-agent: jones-bot
Disallow: /
Crawl-Delay: 30
User-agent: foobar
Crawl-Delay: 60
User-agent: googlebot
Allow: /
User-agent: barfoo
Crawl-Delay: 60
Crawl-Delay: 20
"#
.as_bytes();
// Build one machine per user agent from the same input bytes.
let parser = Compiler::new("foobar");
let foobar_machine = parser.compile(example_robots).await.unwrap();
let parser = Compiler::new("googlebot");
let googlebot_machine = parser.compile(example_robots).await.unwrap();
let parser = Compiler::new("barfoo");
let barfoo_machine = parser.compile(example_robots).await.unwrap();
let parser = Compiler::new("jones-bot");
let jonesbot_machine = parser.compile(example_robots).await.unwrap();
// A single Crawl-Delay line is expected to be reported as-is.
assert_eq!(Some(60), foobar_machine.delay());
// With two Crawl-Delay lines in one group (60 and 20), the smaller is expected.
assert_eq!(Some(20), barfoo_machine.delay());
assert_eq!(Some(30), jonesbot_machine.delay());
// A group without any Crawl-Delay line is expected to report no delay.
assert_eq!(None, googlebot_machine.delay());
});
}
#[test] #[test]
fn test_end_to_end() { fn test_end_to_end() {
tokio_test::block_on(async { tokio_test::block_on(async {
@ -318,6 +357,10 @@ mod tests {
User-agent: jones-bot User-agent: jones-bot
Disallow: / Disallow: /
User-agent: foo
Allow: /
Crawl-Delay: 20
User-agent: jones User-agent: jones
User-agent: foobar User-agent: foobar
Allow: / Allow: /