add crawl-delay

This commit is contained in:
r.portalez 2021-03-10 15:57:55 +01:00
parent 86ee746b96
commit fe11216642
4 changed files with 979 additions and 902 deletions

View File

@ -11,9 +11,9 @@ keywords = ["robots", "txt", "parse", "compile"]
repository = "https://github.com/crestonbunch/cylon" repository = "https://github.com/crestonbunch/cylon"
[dependencies] [dependencies]
futures-util = "0.3.13" futures-util = "0.3"
serde = "1.0.124" serde = "1.0"
serde_derive = "1.0.124" serde_derive = "1.0"
[dev-dependencies] [dev-dependencies]
criterion = { version = "0.3", features = ["async_futures"] } criterion = { version = "0.3", features = ["async_futures"] }

View File

@ -9,6 +9,7 @@ Disallow: /
Allow: /a Allow: /a
Allow: /abc Allow: /abc
Allow: /b Allow: /b
Crawl-Delay: 20
"# "#
.as_bytes(); .as_bytes();

1033
src/dfa.rs

File diff suppressed because it is too large Load Diff

View File

@ -1,399 +1,442 @@
use super::dfa::{Cylon, Rule}; use super::dfa::{Cylon, Rule};
use futures_util::{ use futures_util::{
io::{AsyncBufRead, AsyncRead, BufReader, Result}, io::{AsyncBufRead, AsyncRead, BufReader, Result},
AsyncBufReadExt, AsyncBufReadExt,
}; };
use serde_derive::{Deserialize, Serialize}; use serde_derive::{Deserialize, Serialize};
// Directive prefixes recognised by the line parsers. They are written in
// lower case because the parsers compare them against the head of each line
// without regard to ASCII case.

/// Prefix of a line naming a user agent.
const UA_PREFIX: &str = "user-agent:";
/// Prefix of a crawl-delay rule line.
const DELAY_PREFIX: &str = "crawl-delay:";
/// Prefix of an allow rule line.
const ALLOW_PREFIX: &str = "allow:";
/// Prefix of a disallow rule line.
const DISALLOW_PREFIX: &str = "disallow:";
/// One rule line of a robots.txt group, owning the text that followed the
/// directive prefix.
#[derive(Debug, PartialEq, Clone)]
enum ParsedRule {
    /// `Allow:` line; holds the trimmed path pattern.
    Allow(String),
    /// `Disallow:` line; holds the trimmed path pattern.
    Disallow(String),
    /// `Crawl-delay:` line; holds the trimmed text after the prefix. The
    /// value is kept as text here and is not validated by the parser.
    Delay(String),
}
impl<'a> Into<Rule<'a>> for &'a ParsedRule { impl<'a> Into<Rule<'a>> for &'a ParsedRule {
fn into(self) -> Rule<'a> { fn into(self) -> Rule<'a> {
match self { match self {
ParsedRule::Allow(path) => Rule::Allow(&path[..]), ParsedRule::Allow(path) => Rule::Allow(&path[..]),
ParsedRule::Disallow(path) => Rule::Disallow(&path[..]), ParsedRule::Disallow(path) => Rule::Disallow(&path[..]),
ParsedRule::Delay(delay) => Rule.Delay(delay), ParsedRule::Delay(delay) => Rule::Delay(delay),
} }
} }
} }
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
enum ParsedLine { enum ParsedLine {
UserAgent(String), UserAgent(String),
Rule(ParsedRule), Rule(ParsedRule),
Nothing, Nothing,
} }
/// A compiler takes an input robots.txt file and outputs a compiled Cylon,
/// which can be used to efficiently match a large number of paths against
/// the robots.txt file.
#[derive(Debug, Serialize, Deserialize)]
pub struct Compiler {
    /// The user agent whose rules are extracted, stored lower-cased so it
    /// can be compared against the lower-cased agent names read from the file.
    user_agent: String,
}
impl Compiler { impl Compiler {
/// Build a new compiler that parses rules for the given user agent from /// Build a new compiler that parses rules for the given user agent from
/// a robots.txt file. /// a robots.txt file.
pub fn new(user_agent: &str) -> Self { pub fn new(user_agent: &str) -> Self {
Self { Self {
user_agent: user_agent.to_lowercase(), user_agent: user_agent.to_lowercase(),
} }
} }
/// Parse an input robots.txt file into a Cylon that can recognize /// Parse an input robots.txt file into a Cylon that can recognize
/// whether or not a path matches the rules for the Parser's user agent. /// whether or not a path matches the rules for the Parser's user agent.
pub async fn compile<R: AsyncRead + Unpin>(&self, file: R) -> Result<Cylon> { pub async fn compile<R: AsyncRead + Unpin>(&self, file: R) -> Result<Cylon> {
let reader = BufReader::new(file); let reader = BufReader::new(file);
let mut agent = String::new(); let mut agent = String::new();
let mut rules: Vec<ParsedRule> = vec![]; let mut rules: Vec<ParsedRule> = vec![];
let mut group_reader = GroupReader::new(reader); let mut group_reader = GroupReader::new(reader);
// find the most specific matching group in the robots file // find the most specific matching group in the robots file
while let Some(agents) = group_reader.next_header().await? { while let Some(agents) = group_reader.next_header().await? {
let matching_agent = agents.iter().find(|a| { let matching_agent = agents.iter().find(|a| {
let matches = &a[..] == "*" || self.user_agent.contains(*a); let matches = &a[..] == "*" || self.user_agent.contains(*a);
let more_specific = a.len() > agent.len(); let more_specific = a.len() > agent.len();
matches && more_specific matches && more_specific
}); });
if let Some(matching_agent) = matching_agent { if let Some(matching_agent) = matching_agent {
agent = matching_agent.clone(); agent = matching_agent.clone();
rules = group_reader.next_rules().await?; rules = group_reader.next_rules().await?;
} }
} }
let rules = rules.iter().map(|r| r.into()).collect(); let rules = rules.iter().map(|r| r.into()).collect();
Ok(Cylon::compile(rules)) Ok(Cylon::compile(rules))
} }
} }
struct GroupReader<R: AsyncBufRead + Unpin> { struct GroupReader<R: AsyncBufRead + Unpin> {
parsing_agents: bool, parsing_agents: bool,
agents: Vec<String>, agents: Vec<String>,
rules: Vec<ParsedRule>, rules: Vec<ParsedRule>,
reader: R, reader: R,
} }
impl<R: AsyncBufRead + Unpin> GroupReader<R> { impl<R: AsyncBufRead + Unpin> GroupReader<R> {
fn new(reader: R) -> Self { fn new(reader: R) -> Self {
Self { Self {
parsing_agents: true, parsing_agents: true,
agents: vec![], agents: vec![],
rules: vec![], rules: vec![],
reader, reader,
} }
} }
/// Scan forward until the next group header defined by one or more /// Scan forward until the next group header defined by one or more
/// user agent lines. This lets us optimize the lines we need to copy /// user agent lines. This lets us optimize the lines we need to copy
/// so we can skip over groups that don't match the desired user agent. /// so we can skip over groups that don't match the desired user agent.
async fn next_header(&mut self) -> Result<Option<Vec<String>>> { async fn next_header(&mut self) -> Result<Option<Vec<String>>> {
let mut buf = String::new(); let mut buf = String::new();
while self.reader.read_line(&mut buf).await? != 0 { while self.reader.read_line(&mut buf).await? != 0 {
let parsed_line = parse_line(buf.clone()); let parsed_line = parse_line(buf.clone());
match parsed_line { match parsed_line {
ParsedLine::UserAgent(ua) if self.parsing_agents => { ParsedLine::UserAgent(ua) if self.parsing_agents => {
self.agents.push(ua); self.agents.push(ua);
} }
ParsedLine::UserAgent(ua) => { ParsedLine::UserAgent(ua) => {
self.agents = vec![ua]; self.agents = vec![ua];
self.rules = vec![]; self.rules = vec![];
self.parsing_agents = true; self.parsing_agents = true;
} }
ParsedLine::Rule(rule) if self.parsing_agents => { ParsedLine::Rule(rule) if self.parsing_agents => {
// Preserve the rule in case we need it in next_rules(). // Preserve the rule in case we need it in next_rules().
self.rules.push(rule); self.rules.push(rule);
self.parsing_agents = false; self.parsing_agents = false;
break; break;
} }
// Skip over lines until we get to the next user agent. // Skip over lines until we get to the next user agent.
ParsedLine::Rule(..) => (), ParsedLine::Rule(..) => (),
ParsedLine::Nothing => (), ParsedLine::Nothing => (),
} }
buf.clear(); buf.clear();
} }
let agents = self.agents.clone(); let agents = self.agents.clone();
self.agents = vec![]; self.agents = vec![];
if agents.is_empty() { if agents.is_empty() {
return Ok(None); return Ok(None);
} }
Ok(Some(agents)) Ok(Some(agents))
} }
async fn next_rules(&mut self) -> Result<Vec<ParsedRule>> { async fn next_rules(&mut self) -> Result<Vec<ParsedRule>> {
let mut buf = String::new(); let mut buf = String::new();
while self.reader.read_line(&mut buf).await? != 0 { while self.reader.read_line(&mut buf).await? != 0 {
let parsed_line = parse_line(buf.clone()); let parsed_line = parse_line(buf.clone());
match parsed_line { match parsed_line {
ParsedLine::Rule(rule) => { ParsedLine::Rule(rule) => {
self.rules.push(rule); self.rules.push(rule);
self.parsing_agents = false; self.parsing_agents = false;
} }
ParsedLine::UserAgent(ua) if !self.parsing_agents => { ParsedLine::UserAgent(ua) if !self.parsing_agents => {
// Preserve the agent in case we need it in next_agents(). // Preserve the agent in case we need it in next_agents().
self.agents.push(ua); self.agents.push(ua);
self.parsing_agents = true; self.parsing_agents = true;
break; break;
} }
// Skip over lines until we get to the next rule. // Skip over lines until we get to the next rule.
ParsedLine::UserAgent(..) => (), ParsedLine::UserAgent(..) => (),
ParsedLine::Nothing => (), ParsedLine::Nothing => (),
} }
buf.clear(); buf.clear();
} }
let rules = self.rules.clone(); let rules = self.rules.clone();
self.rules = vec![]; self.rules = vec![];
Ok(rules) Ok(rules)
} }
} }
fn parse_line(line: String) -> ParsedLine { fn parse_line(line: String) -> ParsedLine {
let line = strip_comments(&line[..]).trim(); let line = strip_comments(&line[..]).trim();
// This tries to parse lines roughly in order of most frequent kind to // This tries to parse lines roughly in order of most frequent kind to
// least frequent kind in order to minimize CPU cycles on average. // least frequent kind in order to minimize CPU cycles on average.
parse_disallow(line) parse_disallow(line)
.map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into()))) .map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into())))
.or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase()))) .or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase())))
.or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into())))) .or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into()))))
.or_else(|| parse_delay(line).map(|s| ParsedLine::Rule(ParsedRule::Delay(s.into())))) .or_else(|| parse_delay(line).map(|s| ParsedLine::Rule(ParsedRule::Delay(s.into()))))
.unwrap_or(ParsedLine::Nothing) .unwrap_or(ParsedLine::Nothing)
} }
/// Return the part of `line` that precedes the first `#`, or the whole line
/// when it contains no `#`. Whitespace is left untouched.
fn strip_comments(line: &str) -> &str {
    // `#` is a single-byte character, so the found index is always a valid
    // slice boundary.
    line.find('#').map_or(line, |idx| &line[..idx])
}
fn parse_user_agent(line: &str) -> Option<&str> { fn parse_user_agent(line: &str) -> Option<&str> {
if line.len() < UA_PREFIX.len() { if line.len() < UA_PREFIX.len() {
return None; return None;
} }
let prefix = &line[..UA_PREFIX.len()].to_ascii_lowercase(); let prefix = &line[..UA_PREFIX.len()].to_ascii_lowercase();
let suffix = &line[UA_PREFIX.len()..]; let suffix = &line[UA_PREFIX.len()..];
if prefix == UA_PREFIX { if prefix == UA_PREFIX {
Some(suffix.trim()) Some(suffix.trim())
} else { } else {
None None
} }
} }
fn parse_delay(line: &str) -> Option<u64> { fn parse_delay(line: &str) -> Option<&str> {
if line.len() < DELAY_PREFIX.len() { if line.len() < DELAY_PREFIX.len() {
return None; return None;
} }
let prefix = &line[..DELAY_PREFIX.len()].to_ascii_lowercase(); let prefix = &line[..DELAY_PREFIX.len()].to_ascii_lowercase();
let suffix = &line[DELAY_PREFIX.len()..]; let suffix = &line[DELAY_PREFIX.len()..];
if prefix == DELAY_PREFIX { if prefix == DELAY_PREFIX {
Some(suffix.trim()) Some(suffix.trim())
} else { } else {
None None
} }
} }
fn parse_allow(line: &str) -> Option<&str> { fn parse_allow(line: &str) -> Option<&str> {
if line.len() < ALLOW_PREFIX.len() { if line.len() < ALLOW_PREFIX.len() {
return None; return None;
} }
let prefix = &line[..ALLOW_PREFIX.len()].to_ascii_lowercase(); let prefix = &line[..ALLOW_PREFIX.len()].to_ascii_lowercase();
let suffix = &line[ALLOW_PREFIX.len()..]; let suffix = &line[ALLOW_PREFIX.len()..];
if prefix == ALLOW_PREFIX { if prefix == ALLOW_PREFIX {
Some(suffix.trim()) Some(suffix.trim())
} else { } else {
None None
} }
} }
fn parse_disallow(line: &str) -> Option<&str> { fn parse_disallow(line: &str) -> Option<&str> {
if line.len() < DISALLOW_PREFIX.len() { if line.len() < DISALLOW_PREFIX.len() {
return None; return None;
} }
let prefix = &line[..DISALLOW_PREFIX.len()].to_ascii_lowercase(); let prefix = &line[..DISALLOW_PREFIX.len()].to_ascii_lowercase();
let suffix = &line[DISALLOW_PREFIX.len()..]; let suffix = &line[DISALLOW_PREFIX.len()..];
if prefix == DISALLOW_PREFIX { if prefix == DISALLOW_PREFIX {
Some(suffix.trim()) Some(suffix.trim())
} else { } else {
None None
} }
} }
#[cfg(test)]
mod tests {
    use super::*;

    /// `Allow:` lines parse regardless of case, padding and trailing comments.
    #[test]
    fn test_parse_allow() {
        let test_cases = vec![
            ("Allow: /", "/"),
            ("allow: / # Root with comment", "/"),
            ("ALLOW: /abc/def ", "/abc/def"),
            ("Allow: /abc/def ", "/abc/def"),
            (" Allow: /*/foo", "/*/foo"),
        ];
        for (i, o) in test_cases {
            assert_eq!(
                parse_line(i.into()),
                ParsedLine::Rule(ParsedRule::Allow(o.into()))
            );
        }
    }

    /// `Disallow:` lines parse regardless of case, padding and trailing comments.
    #[test]
    fn test_parse_disallow() {
        let test_cases = vec![
            ("Disallow: /", "/"),
            ("disallow: / # Root with comment", "/"),
            ("DISALLOW: /abc/def ", "/abc/def"),
            ("Disallow: /abc/def ", "/abc/def"),
            (" Disallow: /*/foo", "/*/foo"),
        ];
        for (i, o) in test_cases {
            assert_eq!(
                parse_line(i.into()),
                ParsedLine::Rule(ParsedRule::Disallow(o.into()))
            );
        }
    }

    /// `User-agent:` lines parse in any case and the agent name is lower-cased.
    #[test]
    fn test_parse_user_agent() {
        let test_cases = vec![
            ("User-agent: *", "*"),
            ("user-agent: ImABot # User agent with comment", "imabot"),
            (" USER-AGENT: ImABot ", "imabot"),
        ];
        for (i, o) in test_cases {
            assert_eq!(parse_line(i.into()), ParsedLine::UserAgent(o.into()));
        }
    }

    /// Comments, blank lines and misspelled directives are ignored.
    #[test]
    fn test_parse_nothing() {
        let test_cases = vec![
            "Useragent: *",
            "# Comment",
            "",
            " ",
            "\t",
            "alow: /",
            "disalow: /",
        ];
        for i in test_cases {
            assert_eq!(parse_line(i.into()), ParsedLine::Nothing);
        }
    }

    /// Each agent gets the crawl delay of its own group; when a group lists
    /// several delays the last one wins, and a group without one yields `None`.
    #[test]
    fn test_crawl_delay() {
        tokio_test::block_on(async {
            let example_robots = r#"
            User-agent: jones-bot
            Disallow: /
            Crawl-Delay: 30

            User-agent: foobar
            Crawl-Delay: 60

            User-agent: googlebot
            Allow: /

            User-agent: barfoo
            Crawl-Delay: 60
            Crawl-Delay: 20
            "#
            .as_bytes();

            let parser = Compiler::new("foobar");
            let foobar_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("googlebot");
            let googlebot_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("barfoo");
            let barfoo_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("jones-bot");
            let jonesbot_machine = parser.compile(example_robots).await.unwrap();

            assert_eq!(Some(60), foobar_machine.delay());
            assert_eq!(Some(20), barfoo_machine.delay());
            assert_eq!(Some(30), jonesbot_machine.delay());
            assert_eq!(None, googlebot_machine.delay());
        });
    }

    /// The most specific matching group decides; unknown agents fall back to `*`.
    #[test]
    fn test_end_to_end() {
        tokio_test::block_on(async {
            let example_robots = r#"
            User-agent: jones-bot
            Disallow: /

            User-agent: foo
            Allow: /
            Crawl-Delay: 20

            User-agent: jones
            User-agent: foobar
            Allow: /

            User-agent: *
            Disallow: /
            "#
            .as_bytes();

            let parser = Compiler::new("foobar");
            let foobar_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("jones-bot");
            let jonesbot_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("imabot");
            let imabot_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("abc");
            let abc_machine = parser.compile(example_robots).await.unwrap();

            assert_eq!(true, foobar_machine.allow("/index.html"));
            assert_eq!(false, jonesbot_machine.allow("/index.html"));
            assert_eq!(false, imabot_machine.allow("/index.html"));
            assert_eq!(false, abc_machine.allow("/index.html"));
        });
    }

    /// A rule that appears before any user-agent line.
    #[test]
    fn test_invalid_1() {
        tokio_test::block_on(async {
            let example_robots = r#"
            # Instead of treating this as an error, we'll just consider
            # this behavior undefined.
            Allow: /

            User-agent: jones
            User-agent: foobar
            Disallow: /
            "#
            .as_bytes();

            let parser = Compiler::new("foobar");
            let foobar_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("imabot");
            let imabot_machine = parser.compile(example_robots).await.unwrap();

            // Everything is allowed because next_header() returns None
            assert_eq!(true, foobar_machine.allow("/index.html"));
            assert_eq!(true, imabot_machine.allow("/index.html"));
        });
    }

    /// A trailing user-agent line with no rules after it.
    #[test]
    fn test_invalid_2() {
        tokio_test::block_on(async {
            let example_robots = r#"
            User-agent: jones
            User-agent: foobar
            Disallow: /

            # Instead of treating this as an error, we consider this
            # behavior undefined.
            User-agent: imabot
            "#
            .as_bytes();

            let parser = Compiler::new("foobar");
            let foobar_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("imabot");
            let imabot_machine = parser.compile(example_robots).await.unwrap();

            assert_eq!(false, foobar_machine.allow("/index.html"));
            assert_eq!(true, imabot_machine.allow("/index.html"));
        });
    }
}