add crawl-delay

This commit is contained in:
r.portalez 2021-03-10 15:57:55 +01:00
parent 86ee746b96
commit fe11216642
4 changed files with 979 additions and 902 deletions

View File

@ -11,9 +11,9 @@ keywords = ["robots", "txt", "parse", "compile"]
repository = "https://github.com/crestonbunch/cylon"
[dependencies]
futures-util = "0.3.13"
serde = "1.0.124"
serde_derive = "1.0.124"
futures-util = "0.3"
serde = "1.0"
serde_derive = "1.0"
[dev-dependencies]
criterion = { version = "0.3", features = ["async_futures"] }

View File

@ -9,6 +9,7 @@ Disallow: /
Allow: /a
Allow: /abc
Allow: /b
Crawl-Delay: 20
"#
.as_bytes();

1033
src/dfa.rs

File diff suppressed because it is too large Load Diff

View File

@ -1,399 +1,442 @@
use super::dfa::{Cylon, Rule};
use futures_util::{
io::{AsyncBufRead, AsyncRead, BufReader, Result},
AsyncBufReadExt,
};
use serde_derive::{Deserialize, Serialize};
// Directive prefixes recognized in robots.txt lines. Kept lowercase:
// each parser lowercases the line's prefix before comparing, making the
// match case-insensitive.
const UA_PREFIX: &str = "user-agent:";
const DELAY_PREFIX: &str = "crawl-delay:";
const ALLOW_PREFIX: &str = "allow:";
const DISALLOW_PREFIX: &str = "disallow:";
/// A single rule parsed from a robots.txt line, owning its data so it
/// can outlive the per-line read buffer.
#[derive(Debug, PartialEq, Clone)]
enum ParsedRule {
    // Path pattern from an `allow:` line.
    Allow(String),
    // Path pattern from a `disallow:` line.
    Disallow(String),
    // Value from a `crawl-delay:` line.
    Delay(u64),
}
// Borrowing conversion from an owned `ParsedRule` to the dfa's `Rule`
// view. Implemented as `From` rather than `Into` — the std blanket impl
// derives `Into<Rule>` automatically, so `r.into()` call sites keep
// working (and clippy's `from_over_into` is satisfied).
impl<'a> From<&'a ParsedRule> for Rule<'a> {
    fn from(rule: &'a ParsedRule) -> Rule<'a> {
        match rule {
            ParsedRule::Allow(path) => Rule::Allow(&path[..]),
            ParsedRule::Disallow(path) => Rule::Disallow(&path[..]),
            // Fixed: `Rule.Delay(delay)` was a syntax error (`.` instead
            // of `::`). The u64 payload is Copy, so dereference the
            // borrow. NOTE(review): assumes `Rule::Delay` takes a `u64`
            // — confirm against src/dfa.rs, which is not visible here.
            ParsedRule::Delay(delay) => Rule::Delay(*delay),
        }
    }
}
/// One classified line of a robots.txt file.
#[derive(Debug, PartialEq)]
enum ParsedLine {
    // A `user-agent:` line; the agent name is already lowercased.
    UserAgent(String),
    // An `allow:`, `disallow:`, or `crawl-delay:` line.
    Rule(ParsedRule),
    // Anything else: blank lines, comments, unrecognized directives.
    Nothing,
}
/// A compiler takes an input robots.txt file and outputs a compiled Cylon,
/// which can be used to efficiently match a large number of paths against
/// the robots.txt file.
#[derive(Debug, Serialize, Deserialize)]
pub struct Compiler {
    // Lowercased user agent used to select the matching rule group.
    user_agent: String,
}
impl Compiler {
    /// Build a new compiler that parses rules for the given user agent from
    /// a robots.txt file.
    pub fn new(user_agent: &str) -> Self {
        Self {
            // Lowercased once up front so every later comparison against
            // (already lowercased) parsed agents is case-insensitive.
            user_agent: user_agent.to_lowercase(),
        }
    }

    /// Parse an input robots.txt file into a Cylon that can recognize
    /// whether or not a path matches the rules for the Parser's user agent.
    pub async fn compile<R: AsyncRead + Unpin>(&self, file: R) -> Result<Cylon> {
        let reader = BufReader::new(file);
        let mut agent = String::new();
        let mut rules: Vec<ParsedRule> = vec![];
        let mut group_reader = GroupReader::new(reader);

        // find the most specific matching group in the robots file
        while let Some(agents) = group_reader.next_header().await? {
            let matching_agent = agents.iter().find(|a| {
                // "*" matches any agent; otherwise match when the
                // configured agent string contains the group's token.
                let matches = &a[..] == "*" || self.user_agent.contains(*a);
                // A longer token is treated as more specific and may
                // replace the rules of a previously matched group.
                let more_specific = a.len() > agent.len();
                matches && more_specific
            });

            if let Some(matching_agent) = matching_agent {
                agent = matching_agent.clone();
                // Only the matching group's rules are materialized;
                // non-matching groups are skipped by next_header().
                rules = group_reader.next_rules().await?;
            }
        }

        let rules = rules.iter().map(|r| r.into()).collect();
        Ok(Cylon::compile(rules))
    }
}
/// Streams a robots.txt file as alternating runs of user-agent header
/// lines and rule lines, so whole non-matching groups can be skipped
/// without copying their rules.
struct GroupReader<R: AsyncBufRead + Unpin> {
    // True while consuming consecutive `user-agent:` lines (a header).
    parsing_agents: bool,
    // Agents collected for the current (possibly still open) header.
    agents: Vec<String>,
    // Rules buffered for the current group.
    rules: Vec<ParsedRule>,
    reader: R,
}
impl<R: AsyncBufRead + Unpin> GroupReader<R> {
    fn new(reader: R) -> Self {
        Self {
            parsing_agents: true,
            agents: vec![],
            rules: vec![],
            reader,
        }
    }

    /// Scan forward until the next group header defined by one or more
    /// user agent lines. This lets us optimize the lines we need to copy
    /// so we can skip over groups that don't match the desired user agent.
    async fn next_header(&mut self) -> Result<Option<Vec<String>>> {
        let mut buf = String::new();
        while self.reader.read_line(&mut buf).await? != 0 {
            // NOTE(review): parse_line takes ownership, so each line is
            // cloned out of the reused buffer.
            let parsed_line = parse_line(buf.clone());

            match parsed_line {
                ParsedLine::UserAgent(ua) if self.parsing_agents => {
                    self.agents.push(ua);
                }
                ParsedLine::UserAgent(ua) => {
                    // A user-agent line after rules starts a new group;
                    // drop any state left from the previous group.
                    self.agents = vec![ua];
                    self.rules = vec![];
                    self.parsing_agents = true;
                }
                ParsedLine::Rule(rule) if self.parsing_agents => {
                    // Preserve the rule in case we need it in next_rules().
                    self.rules.push(rule);
                    self.parsing_agents = false;
                    break;
                }
                // Skip over lines until we get to the next user agent.
                ParsedLine::Rule(..) => (),
                ParsedLine::Nothing => (),
            }

            buf.clear();
        }

        let agents = self.agents.clone();
        self.agents = vec![];

        // No agents at EOF means there are no further groups.
        if agents.is_empty() {
            return Ok(None);
        }

        Ok(Some(agents))
    }

    async fn next_rules(&mut self) -> Result<Vec<ParsedRule>> {
        let mut buf = String::new();
        while self.reader.read_line(&mut buf).await? != 0 {
            let parsed_line = parse_line(buf.clone());

            match parsed_line {
                ParsedLine::Rule(rule) => {
                    self.rules.push(rule);
                    self.parsing_agents = false;
                }
                ParsedLine::UserAgent(ua) if !self.parsing_agents => {
                    // Preserve the agent in case we need it in next_agents().
                    self.agents.push(ua);
                    self.parsing_agents = true;
                    break;
                }
                // Skip over lines until we get to the next rule.
                ParsedLine::UserAgent(..) => (),
                ParsedLine::Nothing => (),
            }

            buf.clear();
        }

        let rules = self.rules.clone();
        self.rules = vec![];
        Ok(rules)
    }
}
/// Classify one raw robots.txt line, ignoring comments and surrounding
/// whitespace.
fn parse_line(line: String) -> ParsedLine {
    let trimmed = strip_comments(&line).trim();

    // Directives are checked roughly from most to least common so the
    // typical line exits after as few prefix comparisons as possible.
    if let Some(path) = parse_disallow(trimmed) {
        return ParsedLine::Rule(ParsedRule::Disallow(path.into()));
    }
    if let Some(agent) = parse_user_agent(trimmed) {
        return ParsedLine::UserAgent(agent.to_lowercase());
    }
    if let Some(path) = parse_allow(trimmed) {
        return ParsedLine::Rule(ParsedRule::Allow(path.into()));
    }
    if let Some(delay) = parse_delay(trimmed) {
        return ParsedLine::Rule(ParsedRule::Delay(delay.into()));
    }

    ParsedLine::Nothing
}
/// Return the portion of `line` before the first `#` comment marker, or
/// the whole line when there is no comment.
fn strip_comments(line: &str) -> &str {
    // `split` always yields at least one piece, so the fallback can
    // never actually trigger; it just keeps the expression total.
    line.split('#').next().unwrap_or(line)
}
/// Parse a `user-agent:` line, returning the trimmed agent name.
///
/// Returns `None` when the line does not start with the prefix
/// (case-insensitively). The checked `get` slice also returns `None`
/// (instead of panicking, as the old unchecked `&line[..n]` did) when a
/// multi-byte UTF-8 character straddles the prefix boundary, and
/// `eq_ignore_ascii_case` avoids allocating a lowercased copy per line.
fn parse_user_agent(line: &str) -> Option<&str> {
    let prefix = line.get(..UA_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(UA_PREFIX) {
        // Safe: `get` above proved this index is a char boundary.
        Some(line[UA_PREFIX.len()..].trim())
    } else {
        None
    }
}
/// Parse a `crawl-delay:` line into a delay value.
///
/// Returns `None` when the line does not start with the prefix
/// (case-insensitively) or the value does not parse as a `u64`. The
/// checked `get` slice avoids panicking on a multi-byte UTF-8 boundary.
fn parse_delay(line: &str) -> Option<u64> {
    let prefix = line.get(..DELAY_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(DELAY_PREFIX) {
        // Fixed: the original returned `Some(suffix.trim())` (a &str),
        // which does not type-check against `Option<u64>`; parse the
        // trimmed value and map malformed numbers to `None`.
        line[DELAY_PREFIX.len()..].trim().parse().ok()
    } else {
        None
    }
}
/// Parse an `allow:` line, returning the trimmed path pattern.
///
/// Returns `None` when the line does not start with the prefix
/// (case-insensitively). The checked `get` slice returns `None` instead
/// of panicking when a multi-byte UTF-8 character straddles the prefix
/// boundary, and `eq_ignore_ascii_case` avoids a per-line allocation.
fn parse_allow(line: &str) -> Option<&str> {
    let prefix = line.get(..ALLOW_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(ALLOW_PREFIX) {
        Some(line[ALLOW_PREFIX.len()..].trim())
    } else {
        None
    }
}
/// Parse a `disallow:` line, returning the trimmed path pattern.
///
/// Returns `None` when the line does not start with the prefix
/// (case-insensitively). The checked `get` slice returns `None` instead
/// of panicking when a multi-byte UTF-8 character straddles the prefix
/// boundary, and `eq_ignore_ascii_case` avoids a per-line allocation.
fn parse_disallow(line: &str) -> Option<&str> {
    let prefix = line.get(..DISALLOW_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(DISALLOW_PREFIX) {
        Some(line[DISALLOW_PREFIX.len()..].trim())
    } else {
        None
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Allow lines must tolerate mixed case, extra whitespace, and
    // trailing `#` comments.
    #[test]
    fn test_parse_allow() {
        let test_cases = vec![
            ("Allow: /", "/"),
            ("allow: / # Root with comment", "/"),
            ("ALLOW: /abc/def ", "/abc/def"),
            ("Allow: /abc/def ", "/abc/def"),
            (" Allow: /*/foo", "/*/foo"),
        ];

        for (i, o) in test_cases {
            assert_eq!(
                parse_line(i.into()),
                ParsedLine::Rule(ParsedRule::Allow(o.into()))
            );
        }
    }

    // Same tolerance checks as above, for Disallow lines.
    #[test]
    fn test_parse_disallow() {
        let test_cases = vec![
            ("Disallow: /", "/"),
            ("disallow: / # Root with comment", "/"),
            ("DISALLOW: /abc/def ", "/abc/def"),
            ("Disallow: /abc/def ", "/abc/def"),
            (" Disallow: /*/foo", "/*/foo"),
        ];

        for (i, o) in test_cases {
            assert_eq!(
                parse_line(i.into()),
                ParsedLine::Rule(ParsedRule::Disallow(o.into()))
            );
        }
    }

    // Agent names are lowercased by parse_line; "*" passes through.
    #[test]
    fn test_parse_user_agent() {
        let test_cases = vec![
            ("User-agent: *", "*"),
            ("user-agent: ImABot # User agent with comment", "imabot"),
            (" USER-AGENT: ImABot ", "imabot"),
        ];

        for (i, o) in test_cases {
            assert_eq!(parse_line(i.into()), ParsedLine::UserAgent(o.into()));
        }
    }

    // Misspelled directives, comments, and blank lines all classify as
    // Nothing rather than erroring.
    #[test]
    fn test_parse_nothing() {
        let test_cases = vec![
            "Useragent: *",
            "# Comment",
            "",
            " ",
            "\t",
            "alow: /",
            "disalow: /",
        ];

        for i in test_cases {
            assert_eq!(parse_line(i.into()), ParsedLine::Nothing);
        }
    }

    // Full pipeline: the most specific matching group's rules win, and
    // unmatched agents fall back to the `*` group.
    #[test]
    fn test_end_to_end() {
        tokio_test::block_on(async {
            let example_robots = r#"
User-agent: jones-bot
Disallow: /
User-agent: jones
User-agent: foobar
Allow: /
User-agent: *
Disallow: /
"#
            .as_bytes();

            let parser = Compiler::new("foobar");
            let foobar_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("jones-bot");
            let jonesbot_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("imabot");
            let imabot_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("abc");
            let abc_machine = parser.compile(example_robots).await.unwrap();

            assert_eq!(true, foobar_machine.allow("/index.html"));
            assert_eq!(false, jonesbot_machine.allow("/index.html"));
            assert_eq!(false, imabot_machine.allow("/index.html"));
            assert_eq!(false, abc_machine.allow("/index.html"));
        });
    }

    // Rules before any user-agent header: undefined behavior, treated
    // as "no groups" so everything is allowed.
    #[test]
    fn test_invalid_1() {
        tokio_test::block_on(async {
            let example_robots = r#"
# Instead of treating this as an error, we'll just consider
# this behavior undefined.
Allow: /
User-agent: jones
User-agent: foobar
Disallow: /
"#
            .as_bytes();

            let parser = Compiler::new("foobar");
            let foobar_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("imabot");
            let imabot_machine = parser.compile(example_robots).await.unwrap();

            // Everything is allowed because next_header() returns None
            assert_eq!(true, foobar_machine.allow("/index.html"));
            assert_eq!(true, imabot_machine.allow("/index.html"));
        });
    }

    // A trailing user-agent header with no rules: also undefined; the
    // empty group produces no restrictions.
    #[test]
    fn test_invalid_2() {
        tokio_test::block_on(async {
            let example_robots = r#"
User-agent: jones
User-agent: foobar
Disallow: /
# Instead of treating this as an error, we consider this
# behavior undefined.
User-agent: imabot
"#
            .as_bytes();

            let parser = Compiler::new("foobar");
            let foobar_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("imabot");
            let imabot_machine = parser.compile(example_robots).await.unwrap();

            assert_eq!(false, foobar_machine.allow("/index.html"));
            assert_eq!(true, imabot_machine.allow("/index.html"));
        });
    }
}
use super::dfa::{Cylon, Rule};
use futures_util::{
io::{AsyncBufRead, AsyncRead, BufReader, Result},
AsyncBufReadExt,
};
use serde_derive::{Deserialize, Serialize};
// Directive prefixes recognized in robots.txt lines. Kept lowercase:
// each parser lowercases the line's prefix before comparing, making the
// match case-insensitive.
const UA_PREFIX: &str = "user-agent:";
const DELAY_PREFIX: &str = "crawl-delay:";
const ALLOW_PREFIX: &str = "allow:";
const DISALLOW_PREFIX: &str = "disallow:";
/// A single rule parsed from a robots.txt line, owning its data so it
/// can outlive the per-line read buffer.
#[derive(Debug, PartialEq, Clone)]
enum ParsedRule {
    // Path pattern from an `allow:` line.
    Allow(String),
    // Path pattern from a `disallow:` line.
    Disallow(String),
    // Raw value text from a `crawl-delay:` line (not parsed to a
    // number at this stage).
    Delay(String),
}
// Borrowing conversion from an owned `ParsedRule` to the dfa's `Rule`
// view. Implemented as `From` rather than `Into` — the std blanket impl
// derives `Into<Rule>` automatically, so `r.into()` call sites keep
// working (and clippy's `from_over_into` is satisfied). The match arms
// are unchanged from the original.
impl<'a> From<&'a ParsedRule> for Rule<'a> {
    fn from(rule: &'a ParsedRule) -> Rule<'a> {
        match rule {
            ParsedRule::Allow(path) => Rule::Allow(&path[..]),
            ParsedRule::Disallow(path) => Rule::Disallow(&path[..]),
            // NOTE(review): passes the borrowed delay straight through;
            // confirm `Rule::Delay`'s payload type in src/dfa.rs (not
            // visible here).
            ParsedRule::Delay(delay) => Rule::Delay(delay),
        }
    }
}
/// One classified line of a robots.txt file.
#[derive(Debug, PartialEq)]
enum ParsedLine {
    // A `user-agent:` line; the agent name is already lowercased.
    UserAgent(String),
    // An `allow:`, `disallow:`, or `crawl-delay:` line.
    Rule(ParsedRule),
    // Anything else: blank lines, comments, unrecognized directives.
    Nothing,
}
/// A compiler takes an input robots.txt file and outputs a compiled Cylon,
/// which can be used to efficiently match a large number of paths against
/// the robots.txt file.
#[derive(Debug, Serialize, Deserialize)]
pub struct Compiler {
    // Lowercased user agent used to select the matching rule group.
    user_agent: String,
}
impl Compiler {
    /// Build a new compiler that parses rules for the given user agent from
    /// a robots.txt file.
    pub fn new(user_agent: &str) -> Self {
        Self {
            // Lowercased once up front so every later comparison against
            // (already lowercased) parsed agents is case-insensitive.
            user_agent: user_agent.to_lowercase(),
        }
    }

    /// Parse an input robots.txt file into a Cylon that can recognize
    /// whether or not a path matches the rules for the Parser's user agent.
    pub async fn compile<R: AsyncRead + Unpin>(&self, file: R) -> Result<Cylon> {
        let reader = BufReader::new(file);
        let mut agent = String::new();
        let mut rules: Vec<ParsedRule> = vec![];
        let mut group_reader = GroupReader::new(reader);

        // find the most specific matching group in the robots file
        while let Some(agents) = group_reader.next_header().await? {
            let matching_agent = agents.iter().find(|a| {
                // "*" matches any agent; otherwise match when the
                // configured agent string contains the group's token.
                let matches = &a[..] == "*" || self.user_agent.contains(*a);
                // A longer token is treated as more specific and may
                // replace the rules of a previously matched group.
                let more_specific = a.len() > agent.len();
                matches && more_specific
            });

            if let Some(matching_agent) = matching_agent {
                agent = matching_agent.clone();
                // Only the matching group's rules are materialized;
                // non-matching groups are skipped by next_header().
                rules = group_reader.next_rules().await?;
            }
        }

        let rules = rules.iter().map(|r| r.into()).collect();
        Ok(Cylon::compile(rules))
    }
}
/// Streams a robots.txt file as alternating runs of user-agent header
/// lines and rule lines, so whole non-matching groups can be skipped
/// without copying their rules.
struct GroupReader<R: AsyncBufRead + Unpin> {
    // True while consuming consecutive `user-agent:` lines (a header).
    parsing_agents: bool,
    // Agents collected for the current (possibly still open) header.
    agents: Vec<String>,
    // Rules buffered for the current group.
    rules: Vec<ParsedRule>,
    reader: R,
}
impl<R: AsyncBufRead + Unpin> GroupReader<R> {
    fn new(reader: R) -> Self {
        Self {
            parsing_agents: true,
            agents: vec![],
            rules: vec![],
            reader,
        }
    }

    /// Scan forward until the next group header defined by one or more
    /// user agent lines. This lets us optimize the lines we need to copy
    /// so we can skip over groups that don't match the desired user agent.
    async fn next_header(&mut self) -> Result<Option<Vec<String>>> {
        let mut buf = String::new();
        while self.reader.read_line(&mut buf).await? != 0 {
            // NOTE(review): parse_line takes ownership, so each line is
            // cloned out of the reused buffer.
            let parsed_line = parse_line(buf.clone());

            match parsed_line {
                ParsedLine::UserAgent(ua) if self.parsing_agents => {
                    self.agents.push(ua);
                }
                ParsedLine::UserAgent(ua) => {
                    // A user-agent line after rules starts a new group;
                    // drop any state left from the previous group.
                    self.agents = vec![ua];
                    self.rules = vec![];
                    self.parsing_agents = true;
                }
                ParsedLine::Rule(rule) if self.parsing_agents => {
                    // Preserve the rule in case we need it in next_rules().
                    self.rules.push(rule);
                    self.parsing_agents = false;
                    break;
                }
                // Skip over lines until we get to the next user agent.
                ParsedLine::Rule(..) => (),
                ParsedLine::Nothing => (),
            }

            buf.clear();
        }

        let agents = self.agents.clone();
        self.agents = vec![];

        // No agents at EOF means there are no further groups.
        if agents.is_empty() {
            return Ok(None);
        }

        Ok(Some(agents))
    }

    async fn next_rules(&mut self) -> Result<Vec<ParsedRule>> {
        let mut buf = String::new();
        while self.reader.read_line(&mut buf).await? != 0 {
            let parsed_line = parse_line(buf.clone());

            match parsed_line {
                ParsedLine::Rule(rule) => {
                    self.rules.push(rule);
                    self.parsing_agents = false;
                }
                ParsedLine::UserAgent(ua) if !self.parsing_agents => {
                    // Preserve the agent in case we need it in next_agents().
                    self.agents.push(ua);
                    self.parsing_agents = true;
                    break;
                }
                // Skip over lines until we get to the next rule.
                ParsedLine::UserAgent(..) => (),
                ParsedLine::Nothing => (),
            }

            buf.clear();
        }

        let rules = self.rules.clone();
        self.rules = vec![];
        Ok(rules)
    }
}
/// Classify one raw robots.txt line, ignoring comments and surrounding
/// whitespace.
fn parse_line(line: String) -> ParsedLine {
    let trimmed = strip_comments(&line).trim();

    // Directives are checked roughly from most to least common so the
    // typical line exits after as few prefix comparisons as possible.
    if let Some(path) = parse_disallow(trimmed) {
        return ParsedLine::Rule(ParsedRule::Disallow(path.into()));
    }
    if let Some(agent) = parse_user_agent(trimmed) {
        return ParsedLine::UserAgent(agent.to_lowercase());
    }
    if let Some(path) = parse_allow(trimmed) {
        return ParsedLine::Rule(ParsedRule::Allow(path.into()));
    }
    if let Some(delay) = parse_delay(trimmed) {
        return ParsedLine::Rule(ParsedRule::Delay(delay.into()));
    }

    ParsedLine::Nothing
}
/// Return the portion of `line` before the first `#` comment marker, or
/// the whole line when there is no comment.
fn strip_comments(line: &str) -> &str {
    // `split` always yields at least one piece, so the fallback can
    // never actually trigger; it just keeps the expression total.
    line.split('#').next().unwrap_or(line)
}
/// Parse a `user-agent:` line, returning the trimmed agent name.
///
/// Returns `None` when the line does not start with the prefix
/// (case-insensitively). The checked `get` slice also returns `None`
/// (instead of panicking, as the old unchecked `&line[..n]` did) when a
/// multi-byte UTF-8 character straddles the prefix boundary, and
/// `eq_ignore_ascii_case` avoids allocating a lowercased copy per line.
fn parse_user_agent(line: &str) -> Option<&str> {
    let prefix = line.get(..UA_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(UA_PREFIX) {
        // Safe: `get` above proved this index is a char boundary.
        Some(line[UA_PREFIX.len()..].trim())
    } else {
        None
    }
}
/// Parse a `crawl-delay:` line, returning the trimmed raw value text.
///
/// Returns `None` when the line does not start with the prefix
/// (case-insensitively). The checked `get` slice returns `None` instead
/// of panicking when a multi-byte UTF-8 character straddles the prefix
/// boundary, and `eq_ignore_ascii_case` avoids a per-line allocation.
fn parse_delay(line: &str) -> Option<&str> {
    let prefix = line.get(..DELAY_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(DELAY_PREFIX) {
        Some(line[DELAY_PREFIX.len()..].trim())
    } else {
        None
    }
}
/// Parse an `allow:` line, returning the trimmed path pattern.
///
/// Returns `None` when the line does not start with the prefix
/// (case-insensitively). The checked `get` slice returns `None` instead
/// of panicking when a multi-byte UTF-8 character straddles the prefix
/// boundary, and `eq_ignore_ascii_case` avoids a per-line allocation.
fn parse_allow(line: &str) -> Option<&str> {
    let prefix = line.get(..ALLOW_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(ALLOW_PREFIX) {
        Some(line[ALLOW_PREFIX.len()..].trim())
    } else {
        None
    }
}
/// Parse a `disallow:` line, returning the trimmed path pattern.
///
/// Returns `None` when the line does not start with the prefix
/// (case-insensitively). The checked `get` slice returns `None` instead
/// of panicking when a multi-byte UTF-8 character straddles the prefix
/// boundary, and `eq_ignore_ascii_case` avoids a per-line allocation.
fn parse_disallow(line: &str) -> Option<&str> {
    let prefix = line.get(..DISALLOW_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(DISALLOW_PREFIX) {
        Some(line[DISALLOW_PREFIX.len()..].trim())
    } else {
        None
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Allow lines must tolerate mixed case, extra whitespace, and
    // trailing `#` comments.
    #[test]
    fn test_parse_allow() {
        let test_cases = vec![
            ("Allow: /", "/"),
            ("allow: / # Root with comment", "/"),
            ("ALLOW: /abc/def ", "/abc/def"),
            ("Allow: /abc/def ", "/abc/def"),
            (" Allow: /*/foo", "/*/foo"),
        ];

        for (i, o) in test_cases {
            assert_eq!(
                parse_line(i.into()),
                ParsedLine::Rule(ParsedRule::Allow(o.into()))
            );
        }
    }

    // Same tolerance checks as above, for Disallow lines.
    #[test]
    fn test_parse_disallow() {
        let test_cases = vec![
            ("Disallow: /", "/"),
            ("disallow: / # Root with comment", "/"),
            ("DISALLOW: /abc/def ", "/abc/def"),
            ("Disallow: /abc/def ", "/abc/def"),
            (" Disallow: /*/foo", "/*/foo"),
        ];

        for (i, o) in test_cases {
            assert_eq!(
                parse_line(i.into()),
                ParsedLine::Rule(ParsedRule::Disallow(o.into()))
            );
        }
    }

    // Agent names are lowercased by parse_line; "*" passes through.
    #[test]
    fn test_parse_user_agent() {
        let test_cases = vec![
            ("User-agent: *", "*"),
            ("user-agent: ImABot # User agent with comment", "imabot"),
            (" USER-AGENT: ImABot ", "imabot"),
        ];

        for (i, o) in test_cases {
            assert_eq!(parse_line(i.into()), ParsedLine::UserAgent(o.into()));
        }
    }

    // Misspelled directives, comments, and blank lines all classify as
    // Nothing rather than erroring.
    #[test]
    fn test_parse_nothing() {
        let test_cases = vec![
            "Useragent: *",
            "# Comment",
            "",
            " ",
            "\t",
            "alow: /",
            "disalow: /",
        ];

        for i in test_cases {
            assert_eq!(parse_line(i.into()), ParsedLine::Nothing);
        }
    }

    // Crawl-Delay handling: a matching group's delay is exposed via
    // delay(); a later duplicate Crawl-Delay in the same group wins
    // (barfoo), and a group without one reports None (googlebot).
    #[test]
    fn test_crawl_delay() {
        tokio_test::block_on(async {
            let example_robots = r#"
User-agent: jones-bot
Disallow: /
Crawl-Delay: 30
User-agent: foobar
Crawl-Delay: 60
User-agent: googlebot
Allow: /
User-agent: barfoo
Crawl-Delay: 60
Crawl-Delay: 20
"#
            .as_bytes();

            let parser = Compiler::new("foobar");
            let foobar_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("googlebot");
            let googlebot_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("barfoo");
            let barfoo_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("jones-bot");
            let jonesbot_machine = parser.compile(example_robots).await.unwrap();

            assert_eq!(Some(60), foobar_machine.delay());
            assert_eq!(Some(20), barfoo_machine.delay());
            assert_eq!(Some(30), jonesbot_machine.delay());
            assert_eq!(None, googlebot_machine.delay());
        });
    }

    // Full pipeline: the most specific matching group's rules win, and
    // unmatched agents fall back to the `*` group.
    #[test]
    fn test_end_to_end() {
        tokio_test::block_on(async {
            let example_robots = r#"
User-agent: jones-bot
Disallow: /
User-agent: foo
Allow: /
Crawl-Delay: 20
User-agent: jones
User-agent: foobar
Allow: /
User-agent: *
Disallow: /
"#
            .as_bytes();

            let parser = Compiler::new("foobar");
            let foobar_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("jones-bot");
            let jonesbot_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("imabot");
            let imabot_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("abc");
            let abc_machine = parser.compile(example_robots).await.unwrap();

            assert_eq!(true, foobar_machine.allow("/index.html"));
            assert_eq!(false, jonesbot_machine.allow("/index.html"));
            assert_eq!(false, imabot_machine.allow("/index.html"));
            assert_eq!(false, abc_machine.allow("/index.html"));
        });
    }

    // Rules before any user-agent header: undefined behavior, treated
    // as "no groups" so everything is allowed.
    #[test]
    fn test_invalid_1() {
        tokio_test::block_on(async {
            let example_robots = r#"
# Instead of treating this as an error, we'll just consider
# this behavior undefined.
Allow: /
User-agent: jones
User-agent: foobar
Disallow: /
"#
            .as_bytes();

            let parser = Compiler::new("foobar");
            let foobar_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("imabot");
            let imabot_machine = parser.compile(example_robots).await.unwrap();

            // Everything is allowed because next_header() returns None
            assert_eq!(true, foobar_machine.allow("/index.html"));
            assert_eq!(true, imabot_machine.allow("/index.html"));
        });
    }

    // A trailing user-agent header with no rules: also undefined; the
    // empty group produces no restrictions.
    #[test]
    fn test_invalid_2() {
        tokio_test::block_on(async {
            let example_robots = r#"
User-agent: jones
User-agent: foobar
Disallow: /
# Instead of treating this as an error, we consider this
# behavior undefined.
User-agent: imabot
"#
            .as_bytes();

            let parser = Compiler::new("foobar");
            let foobar_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("imabot");
            let imabot_machine = parser.compile(example_robots).await.unwrap();

            assert_eq!(false, foobar_machine.allow("/index.html"));
            assert_eq!(true, imabot_machine.allow("/index.html"));
        });
    }
}