add crawl-delay

This commit is contained in:
r.portalez 2021-03-10 15:57:55 +01:00
parent 86ee746b96
commit fe11216642
4 changed files with 979 additions and 902 deletions

View File

@ -11,9 +11,9 @@ keywords = ["robots", "txt", "parse", "compile"]
repository = "https://github.com/crestonbunch/cylon"
[dependencies]
futures-util = "0.3.13"
serde = "1.0.124"
serde_derive = "1.0.124"
futures-util = "0.3"
serde = "1.0"
serde_derive = "1.0"
[dev-dependencies]
criterion = { version = "0.3", features = ["async_futures"] }

View File

@ -9,6 +9,7 @@ Disallow: /
Allow: /a
Allow: /abc
Allow: /b
Crawl-Delay: 20
"#
.as_bytes();

View File

@ -1,10 +1,12 @@
use std::cmp::Ordering;
use serde_derive::{Deserialize, Serialize};
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Rule<'a> {
Allow(&'a str),
Disallow(&'a str),
Delay(u64),
Delay(&'a str),
}
impl<'a> Rule<'a> {
@ -31,6 +33,7 @@ struct Transition(Edge, usize);
/// Classification attached to each state of the compiled machine.
enum State {
/// Ending a match here makes `allow` return `true`.
Allow,
/// Ending a match here makes `allow` return `false`.
Disallow,
/// Assigned to the end-of-rule state of a `Rule::Delay`.
/// NOTE(review): `allow` lists this variant in its `unreachable!()` arm, while the
/// final state list keeps `Delay` states as-is — confirm no path can end here.
Delay,
/// Any state that does not end a rule. When the final state list is built these
/// are replaced by the classification of their wildcard state, which is why
/// `allow` treats the variant as unreachable.
Intermediate,
}
@ -42,12 +45,26 @@ enum State {
pub struct Cylon {
states: Vec<State>,
transitions: Vec<Vec<Transition>>,
delay: Option<u64>,
}
impl Cylon {
/// Returns the crawl delay recorded for this machine, if any.
///
/// This is a plain copy of the `delay` field. Where this file constructs `Self`,
/// that field is set to the smallest `Rule::Delay` value that parses as a `u64`;
/// it is `None` when there is no delay rule or none of them parses.
/// NOTE(review): the unit is not stated in the code — presumably seconds, confirm.
pub fn delay(&self) -> Option<u64> {
self.delay
}
/// Match whether the rules allow or disallow the target path.
pub fn allow(&self, path: &str) -> bool {
let mut state = path.chars().fold(2, |state, path_char| {
match self.states[self.state(path)] {
State::Allow => true,
State::Disallow => false,
// Intermediate states are not preserved in the DFA
State::Intermediate | State::Delay => unreachable!(),
}
}
fn state(&self, path: &str) -> usize {
let state = path.chars().fold(2, |state, path_char| {
let t = &self.transitions[state];
t.iter()
.rev()
@ -66,7 +83,7 @@ impl Cylon {
// Follow the EoW transition, if necessary
let t = &self.transitions[state];
state = t
t
.iter()
.rev()
.find(|transition| match transition {
@ -75,14 +92,7 @@ impl Cylon {
_ => false,
})
.map(|Transition(.., next_state)| *next_state)
.unwrap_or(state);
match self.states[state] {
State::Allow => true,
State::Disallow => false,
// Intermediate states are not preserved in the DFA
State::Intermediate => unreachable!(),
}
.unwrap_or(state)
}
/// Compile a machine from a list of rules.
@ -116,6 +126,7 @@ impl Cylon {
State::Allow => 0,
State::Disallow if last_char == Some('$') => wildcard_state,
State::Disallow => 1,
State::Delay => 1,
State::Intermediate => wildcard_state,
};
@ -157,6 +168,7 @@ impl Cylon {
let state = match (rule, eow) {
(Rule::Allow(..), true) => State::Allow,
(Rule::Disallow(..), true) => State::Disallow,
(Rule::Delay(..), true) => State::Delay,
_ => State::Intermediate,
};
@ -186,13 +198,34 @@ impl Cylon {
});
states.push(match state {
State::Allow | State::Disallow => state,
State::Allow | State::Disallow | State::Delay => state,
State::Intermediate => states[wildcard_state],
});
transitions.push(t);
}
let mut delays: Vec<Option<u64>> = rules.iter().filter(|rule| {
match rule {
Rule::Delay(_) => true,
_ => false
}
}).map(|r| {
r.inner().parse::<u64>().ok()
}).collect();
delays.sort_unstable_by(|a, b| {
match (a, b) {
(None, Some(_)) => Ordering::Greater,
(Some(_), None) => Ordering::Less,
(None, None) => Ordering::Equal,
(Some(aa), Some(bb)) => aa.cmp(bb)
}
});
Self {
delay: *delays.get(0).unwrap_or(&None),
states,
transitions,
}

View File

@ -13,7 +13,7 @@ const DISALLOW_PREFIX: &str = "disallow:";
enum ParsedRule {
Allow(String),
Disallow(String),
Delay(u64),
Delay(String),
}
impl<'a> Into<Rule<'a>> for &'a ParsedRule {
@ -21,7 +21,7 @@ impl<'a> Into<Rule<'a>> for &'a ParsedRule {
match self {
ParsedRule::Allow(path) => Rule::Allow(&path[..]),
ParsedRule::Disallow(path) => Rule::Disallow(&path[..]),
ParsedRule::Delay(delay) => Rule.Delay(delay),
ParsedRule::Delay(delay) => Rule::Delay(delay),
}
}
}
@ -199,7 +199,7 @@ fn parse_user_agent(line: &str) -> Option<&str> {
}
}
fn parse_delay(line: &str) -> Option<u64> {
fn parse_delay(line: &str) -> Option<&str> {
if line.len() < DELAY_PREFIX.len() {
return None;
}
@ -311,6 +311,45 @@ mod tests {
}
}
#[test]
fn test_crawl_delay() {
    // One robots.txt, four user agents: a single delay next to a Disallow, a lone
    // delay, no delay at all, and two delays in the same group.
    tokio_test::block_on(async {
        let robots_txt = r#"
User-agent: jones-bot
Disallow: /
Crawl-Delay: 30
User-agent: foobar
Crawl-Delay: 60
User-agent: googlebot
Allow: /
User-agent: barfoo
Crawl-Delay: 60
Crawl-Delay: 20
"#
        .as_bytes();

        // Compile one machine per user agent from the same input bytes.
        let foobar = Compiler::new("foobar").compile(robots_txt).await.unwrap();
        let googlebot = Compiler::new("googlebot").compile(robots_txt).await.unwrap();
        let barfoo = Compiler::new("barfoo").compile(robots_txt).await.unwrap();
        let jonesbot = Compiler::new("jones-bot").compile(robots_txt).await.unwrap();

        assert_eq!(Some(60), foobar.delay());
        // With two delays in one group the smaller one is reported.
        assert_eq!(Some(20), barfoo.delay());
        assert_eq!(Some(30), jonesbot.delay());
        assert_eq!(None, googlebot.delay());
    });
}
#[test]
fn test_end_to_end() {
tokio_test::block_on(async {
@ -318,6 +357,10 @@ mod tests {
User-agent: jones-bot
Disallow: /
User-agent: foo
Allow: /
Crawl-Delay: 20
User-agent: jones
User-agent: foobar
Allow: /