add crawl-delay
This commit is contained in:
parent
86ee746b96
commit
fe11216642
|
@ -11,9 +11,9 @@ keywords = ["robots", "txt", "parse", "compile"]
|
||||||
repository = "https://github.com/crestonbunch/cylon"
|
repository = "https://github.com/crestonbunch/cylon"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
futures-util = "0.3.13"
|
futures-util = "0.3"
|
||||||
serde = "1.0.124"
|
serde = "1.0"
|
||||||
serde_derive = "1.0.124"
|
serde_derive = "1.0"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
criterion = { version = "0.3", features = ["async_futures"] }
|
criterion = { version = "0.3", features = ["async_futures"] }
|
||||||
|
|
|
@ -9,6 +9,7 @@ Disallow: /
|
||||||
Allow: /a
|
Allow: /a
|
||||||
Allow: /abc
|
Allow: /abc
|
||||||
Allow: /b
|
Allow: /b
|
||||||
|
Crawl-Delay: 20
|
||||||
"#
|
"#
|
||||||
.as_bytes();
|
.as_bytes();
|
||||||
|
|
||||||
|
|
57
src/dfa.rs
57
src/dfa.rs
|
@ -1,10 +1,12 @@
|
||||||
|
use std::cmp::Ordering;
|
||||||
|
|
||||||
use serde_derive::{Deserialize, Serialize};
|
use serde_derive::{Deserialize, Serialize};
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
|
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||||
pub enum Rule<'a> {
|
pub enum Rule<'a> {
|
||||||
Allow(&'a str),
|
Allow(&'a str),
|
||||||
Disallow(&'a str),
|
Disallow(&'a str),
|
||||||
Delay(u64),
|
Delay(&'a str),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Rule<'a> {
|
impl<'a> Rule<'a> {
|
||||||
|
@ -31,6 +33,7 @@ struct Transition(Edge, usize);
|
||||||
enum State {
|
enum State {
|
||||||
Allow,
|
Allow,
|
||||||
Disallow,
|
Disallow,
|
||||||
|
Delay,
|
||||||
Intermediate,
|
Intermediate,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -42,12 +45,26 @@ enum State {
|
||||||
pub struct Cylon {
|
pub struct Cylon {
|
||||||
states: Vec<State>,
|
states: Vec<State>,
|
||||||
transitions: Vec<Vec<Transition>>,
|
transitions: Vec<Vec<Transition>>,
|
||||||
|
delay: Option<u64>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Cylon {
|
impl Cylon {
|
||||||
|
pub fn delay(&self) -> Option<u64> {
|
||||||
|
self.delay
|
||||||
|
}
|
||||||
|
|
||||||
/// Match whether the rules allow or disallow the target path.
|
/// Match whether the rules allow or disallow the target path.
|
||||||
pub fn allow(&self, path: &str) -> bool {
|
pub fn allow(&self, path: &str) -> bool {
|
||||||
let mut state = path.chars().fold(2, |state, path_char| {
|
match self.states[self.state(path)] {
|
||||||
|
State::Allow => true,
|
||||||
|
State::Disallow => false,
|
||||||
|
// Intermediate states are not preserved in the DFA
|
||||||
|
State::Intermediate | State::Delay => unreachable!(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn state(&self, path: &str) -> usize {
|
||||||
|
let state = path.chars().fold(2, |state, path_char| {
|
||||||
let t = &self.transitions[state];
|
let t = &self.transitions[state];
|
||||||
t.iter()
|
t.iter()
|
||||||
.rev()
|
.rev()
|
||||||
|
@ -66,7 +83,7 @@ impl Cylon {
|
||||||
|
|
||||||
// Follow the EoW transition, if necessary
|
// Follow the EoW transition, if necessary
|
||||||
let t = &self.transitions[state];
|
let t = &self.transitions[state];
|
||||||
state = t
|
t
|
||||||
.iter()
|
.iter()
|
||||||
.rev()
|
.rev()
|
||||||
.find(|transition| match transition {
|
.find(|transition| match transition {
|
||||||
|
@ -75,14 +92,7 @@ impl Cylon {
|
||||||
_ => false,
|
_ => false,
|
||||||
})
|
})
|
||||||
.map(|Transition(.., next_state)| *next_state)
|
.map(|Transition(.., next_state)| *next_state)
|
||||||
.unwrap_or(state);
|
.unwrap_or(state)
|
||||||
|
|
||||||
match self.states[state] {
|
|
||||||
State::Allow => true,
|
|
||||||
State::Disallow => false,
|
|
||||||
// Intermediate states are not preserved in the DFA
|
|
||||||
State::Intermediate => unreachable!(),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compile a machine from a list of rules.
|
/// Compile a machine from a list of rules.
|
||||||
|
@ -116,6 +126,7 @@ impl Cylon {
|
||||||
State::Allow => 0,
|
State::Allow => 0,
|
||||||
State::Disallow if last_char == Some('$') => wildcard_state,
|
State::Disallow if last_char == Some('$') => wildcard_state,
|
||||||
State::Disallow => 1,
|
State::Disallow => 1,
|
||||||
|
State::Delay => 1,
|
||||||
State::Intermediate => wildcard_state,
|
State::Intermediate => wildcard_state,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -157,6 +168,7 @@ impl Cylon {
|
||||||
let state = match (rule, eow) {
|
let state = match (rule, eow) {
|
||||||
(Rule::Allow(..), true) => State::Allow,
|
(Rule::Allow(..), true) => State::Allow,
|
||||||
(Rule::Disallow(..), true) => State::Disallow,
|
(Rule::Disallow(..), true) => State::Disallow,
|
||||||
|
(Rule::Delay(..), true) => State::Delay,
|
||||||
_ => State::Intermediate,
|
_ => State::Intermediate,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -186,13 +198,34 @@ impl Cylon {
|
||||||
});
|
});
|
||||||
|
|
||||||
states.push(match state {
|
states.push(match state {
|
||||||
State::Allow | State::Disallow => state,
|
State::Allow | State::Disallow | State::Delay => state,
|
||||||
State::Intermediate => states[wildcard_state],
|
State::Intermediate => states[wildcard_state],
|
||||||
});
|
});
|
||||||
transitions.push(t);
|
transitions.push(t);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let mut delays: Vec<Option<u64>> = rules.iter().filter(|rule| {
|
||||||
|
match rule {
|
||||||
|
Rule::Delay(_) => true,
|
||||||
|
_ => false
|
||||||
|
}
|
||||||
|
}).map(|r| {
|
||||||
|
r.inner().parse::<u64>().ok()
|
||||||
|
}).collect();
|
||||||
|
delays.sort_unstable_by(|a, b| {
|
||||||
|
match (a, b) {
|
||||||
|
(None, Some(_)) => Ordering::Greater,
|
||||||
|
(Some(_), None) => Ordering::Less,
|
||||||
|
(None, None) => Ordering::Equal,
|
||||||
|
(Some(aa), Some(bb)) => aa.cmp(bb)
|
||||||
|
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Self {
|
Self {
|
||||||
|
delay: *delays.get(0).unwrap_or(&None),
|
||||||
states,
|
states,
|
||||||
transitions,
|
transitions,
|
||||||
}
|
}
|
||||||
|
|
49
src/parse.rs
49
src/parse.rs
|
@ -13,7 +13,7 @@ const DISALLOW_PREFIX: &str = "disallow:";
|
||||||
enum ParsedRule {
|
enum ParsedRule {
|
||||||
Allow(String),
|
Allow(String),
|
||||||
Disallow(String),
|
Disallow(String),
|
||||||
Delay(u64),
|
Delay(String),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Into<Rule<'a>> for &'a ParsedRule {
|
impl<'a> Into<Rule<'a>> for &'a ParsedRule {
|
||||||
|
@ -21,7 +21,7 @@ impl<'a> Into<Rule<'a>> for &'a ParsedRule {
|
||||||
match self {
|
match self {
|
||||||
ParsedRule::Allow(path) => Rule::Allow(&path[..]),
|
ParsedRule::Allow(path) => Rule::Allow(&path[..]),
|
||||||
ParsedRule::Disallow(path) => Rule::Disallow(&path[..]),
|
ParsedRule::Disallow(path) => Rule::Disallow(&path[..]),
|
||||||
ParsedRule::Delay(delay) => Rule.Delay(delay),
|
ParsedRule::Delay(delay) => Rule::Delay(delay),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -199,7 +199,7 @@ fn parse_user_agent(line: &str) -> Option<&str> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_delay(line: &str) -> Option<u64> {
|
fn parse_delay(line: &str) -> Option<&str> {
|
||||||
if line.len() < DELAY_PREFIX.len() {
|
if line.len() < DELAY_PREFIX.len() {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
@ -311,6 +311,45 @@ mod tests {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_crawl_delay() {
|
||||||
|
tokio_test::block_on(async {
|
||||||
|
let example_robots = r#"
|
||||||
|
User-agent: jones-bot
|
||||||
|
Disallow: /
|
||||||
|
Crawl-Delay: 30
|
||||||
|
|
||||||
|
User-agent: foobar
|
||||||
|
Crawl-Delay: 60
|
||||||
|
|
||||||
|
User-agent: googlebot
|
||||||
|
Allow: /
|
||||||
|
|
||||||
|
User-agent: barfoo
|
||||||
|
Crawl-Delay: 60
|
||||||
|
Crawl-Delay: 20
|
||||||
|
"#
|
||||||
|
.as_bytes();
|
||||||
|
|
||||||
|
let parser = Compiler::new("foobar");
|
||||||
|
let foobar_machine = parser.compile(example_robots).await.unwrap();
|
||||||
|
|
||||||
|
let parser = Compiler::new("googlebot");
|
||||||
|
let googlebot_machine = parser.compile(example_robots).await.unwrap();
|
||||||
|
|
||||||
|
let parser = Compiler::new("barfoo");
|
||||||
|
let barfoo_machine = parser.compile(example_robots).await.unwrap();
|
||||||
|
|
||||||
|
let parser = Compiler::new("jones-bot");
|
||||||
|
let jonesbot_machine = parser.compile(example_robots).await.unwrap();
|
||||||
|
|
||||||
|
assert_eq!(Some(60), foobar_machine.delay());
|
||||||
|
assert_eq!(Some(20), barfoo_machine.delay());
|
||||||
|
assert_eq!(Some(30), jonesbot_machine.delay());
|
||||||
|
assert_eq!(None, googlebot_machine.delay());
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_end_to_end() {
|
fn test_end_to_end() {
|
||||||
tokio_test::block_on(async {
|
tokio_test::block_on(async {
|
||||||
|
@ -318,6 +357,10 @@ mod tests {
|
||||||
User-agent: jones-bot
|
User-agent: jones-bot
|
||||||
Disallow: /
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: foo
|
||||||
|
Allow: /
|
||||||
|
Crawl-Delay: 20
|
||||||
|
|
||||||
User-agent: jones
|
User-agent: jones
|
||||||
User-agent: foobar
|
User-agent: foobar
|
||||||
Allow: /
|
Allow: /
|
||||||
|
|
Loading…
Reference in New Issue