add crawl-delay

This commit is contained in:
r.portalez 2021-03-10 15:57:55 +01:00
parent 86ee746b96
commit fe11216642
4 changed files with 979 additions and 902 deletions

View File

@ -11,9 +11,9 @@ keywords = ["robots", "txt", "parse", "compile"]
repository = "https://github.com/crestonbunch/cylon"
[dependencies]
futures-util = "0.3.13"
serde = "1.0.124"
serde_derive = "1.0.124"
futures-util = "0.3"
serde = "1.0"
serde_derive = "1.0"
[dev-dependencies]
criterion = { version = "0.3", features = ["async_futures"] }

View File

@ -9,6 +9,7 @@ Disallow: /
Allow: /a
Allow: /abc
Allow: /b
Crawl-Delay: 20
"#
.as_bytes();

1033
src/dfa.rs

File diff suppressed because it is too large Load Diff

View File

@ -1,399 +1,442 @@
use super::dfa::{Cylon, Rule};
use futures_util::{
io::{AsyncBufRead, AsyncRead, BufReader, Result},
AsyncBufReadExt,
};
use serde_derive::{Deserialize, Serialize};
// Directive prefixes recognized in robots.txt lines. Kept lowercase:
// each parser lowercases the line's prefix before comparing, making the
// match case-insensitive.
const UA_PREFIX: &str = "user-agent:";
const DELAY_PREFIX: &str = "crawl-delay:";
const ALLOW_PREFIX: &str = "allow:";
const DISALLOW_PREFIX: &str = "disallow:";
/// A single rule parsed from a robots.txt line, owning its data so it
/// can outlive the per-line read buffer.
#[derive(Debug, PartialEq, Clone)]
enum ParsedRule {
    // Path pattern from an `allow:` line.
    Allow(String),
    // Path pattern from a `disallow:` line.
    Disallow(String),
    // Value from a `crawl-delay:` line.
    Delay(u64),
}
// Borrowing conversion from an owned `ParsedRule` to the dfa's `Rule`
// view. Implemented as `From` rather than `Into` — the std blanket impl
// derives `Into<Rule>` automatically, so `r.into()` call sites keep
// working (and clippy's `from_over_into` is satisfied).
impl<'a> From<&'a ParsedRule> for Rule<'a> {
    fn from(rule: &'a ParsedRule) -> Rule<'a> {
        match rule {
            ParsedRule::Allow(path) => Rule::Allow(&path[..]),
            ParsedRule::Disallow(path) => Rule::Disallow(&path[..]),
            // Fixed: `Rule.Delay(delay)` was a syntax error (`.` instead
            // of `::`). The u64 payload is Copy, so dereference the
            // borrow. NOTE(review): assumes `Rule::Delay` takes a `u64`
            // — confirm against src/dfa.rs, which is not visible here.
            ParsedRule::Delay(delay) => Rule::Delay(*delay),
        }
    }
}
/// One classified line of a robots.txt file.
#[derive(Debug, PartialEq)]
enum ParsedLine {
    // A `user-agent:` line; the agent name is already lowercased.
    UserAgent(String),
    // An `allow:`, `disallow:`, or `crawl-delay:` line.
    Rule(ParsedRule),
    // Anything else: blank lines, comments, unrecognized directives.
    Nothing,
}
/// A compiler takes an input robots.txt file and outputs a compiled Cylon,
/// which can be used to efficiently match a large number of paths against
/// the robots.txt file.
#[derive(Debug, Serialize, Deserialize)]
pub struct Compiler {
    // Lowercased user agent used to select the matching rule group.
    user_agent: String,
}
impl Compiler {
    /// Build a new compiler that parses rules for the given user agent from
    /// a robots.txt file.
    pub fn new(user_agent: &str) -> Self {
        Self {
            // Lowercased once up front so every later comparison against
            // (already lowercased) parsed agents is case-insensitive.
            user_agent: user_agent.to_lowercase(),
        }
    }

    /// Parse an input robots.txt file into a Cylon that can recognize
    /// whether or not a path matches the rules for the Parser's user agent.
    pub async fn compile<R: AsyncRead + Unpin>(&self, file: R) -> Result<Cylon> {
        let reader = BufReader::new(file);
        let mut agent = String::new();
        let mut rules: Vec<ParsedRule> = vec![];
        let mut group_reader = GroupReader::new(reader);

        // find the most specific matching group in the robots file
        while let Some(agents) = group_reader.next_header().await? {
            let matching_agent = agents.iter().find(|a| {
                // "*" matches any agent; otherwise match when the
                // configured agent string contains the group's token.
                let matches = &a[..] == "*" || self.user_agent.contains(*a);
                // A longer token is treated as more specific and may
                // replace the rules of a previously matched group.
                let more_specific = a.len() > agent.len();
                matches && more_specific
            });

            if let Some(matching_agent) = matching_agent {
                agent = matching_agent.clone();
                // Only the matching group's rules are materialized;
                // non-matching groups are skipped by next_header().
                rules = group_reader.next_rules().await?;
            }
        }

        let rules = rules.iter().map(|r| r.into()).collect();
        Ok(Cylon::compile(rules))
    }
}
/// Streams a robots.txt file as alternating runs of user-agent header
/// lines and rule lines, so whole non-matching groups can be skipped
/// without copying their rules.
struct GroupReader<R: AsyncBufRead + Unpin> {
    // True while consuming consecutive `user-agent:` lines (a header).
    parsing_agents: bool,
    // Agents collected for the current (possibly still open) header.
    agents: Vec<String>,
    // Rules buffered for the current group.
    rules: Vec<ParsedRule>,
    reader: R,
}
impl<R: AsyncBufRead + Unpin> GroupReader<R> {
    fn new(reader: R) -> Self {
        Self {
            parsing_agents: true,
            agents: vec![],
            rules: vec![],
            reader,
        }
    }

    /// Scan forward until the next group header defined by one or more
    /// user agent lines. This lets us optimize the lines we need to copy
    /// so we can skip over groups that don't match the desired user agent.
    async fn next_header(&mut self) -> Result<Option<Vec<String>>> {
        let mut buf = String::new();
        while self.reader.read_line(&mut buf).await? != 0 {
            // NOTE(review): parse_line takes ownership, so each line is
            // cloned out of the reused buffer.
            let parsed_line = parse_line(buf.clone());

            match parsed_line {
                ParsedLine::UserAgent(ua) if self.parsing_agents => {
                    self.agents.push(ua);
                }
                ParsedLine::UserAgent(ua) => {
                    // A user-agent line after rules starts a new group;
                    // drop any state left from the previous group.
                    self.agents = vec![ua];
                    self.rules = vec![];
                    self.parsing_agents = true;
                }
                ParsedLine::Rule(rule) if self.parsing_agents => {
                    // Preserve the rule in case we need it in next_rules().
                    self.rules.push(rule);
                    self.parsing_agents = false;
                    break;
                }
                // Skip over lines until we get to the next user agent.
                ParsedLine::Rule(..) => (),
                ParsedLine::Nothing => (),
            }

            buf.clear();
        }

        let agents = self.agents.clone();
        self.agents = vec![];

        // No agents at EOF means there are no further groups.
        if agents.is_empty() {
            return Ok(None);
        }

        Ok(Some(agents))
    }

    async fn next_rules(&mut self) -> Result<Vec<ParsedRule>> {
        let mut buf = String::new();
        while self.reader.read_line(&mut buf).await? != 0 {
            let parsed_line = parse_line(buf.clone());

            match parsed_line {
                ParsedLine::Rule(rule) => {
                    self.rules.push(rule);
                    self.parsing_agents = false;
                }
                ParsedLine::UserAgent(ua) if !self.parsing_agents => {
                    // Preserve the agent in case we need it in next_agents().
                    self.agents.push(ua);
                    self.parsing_agents = true;
                    break;
                }
                // Skip over lines until we get to the next rule.
                ParsedLine::UserAgent(..) => (),
                ParsedLine::Nothing => (),
            }

            buf.clear();
        }

        let rules = self.rules.clone();
        self.rules = vec![];
        Ok(rules)
    }
}
/// Classify one raw robots.txt line, ignoring comments and surrounding
/// whitespace.
fn parse_line(line: String) -> ParsedLine {
    let trimmed = strip_comments(&line).trim();

    // Directives are checked roughly from most to least common so the
    // typical line exits after as few prefix comparisons as possible.
    if let Some(path) = parse_disallow(trimmed) {
        return ParsedLine::Rule(ParsedRule::Disallow(path.into()));
    }
    if let Some(agent) = parse_user_agent(trimmed) {
        return ParsedLine::UserAgent(agent.to_lowercase());
    }
    if let Some(path) = parse_allow(trimmed) {
        return ParsedLine::Rule(ParsedRule::Allow(path.into()));
    }
    if let Some(delay) = parse_delay(trimmed) {
        return ParsedLine::Rule(ParsedRule::Delay(delay.into()));
    }

    ParsedLine::Nothing
}
/// Return the portion of `line` before the first `#` comment marker, or
/// the whole line when there is no comment.
fn strip_comments(line: &str) -> &str {
    // `split` always yields at least one piece, so the fallback can
    // never actually trigger; it just keeps the expression total.
    line.split('#').next().unwrap_or(line)
}
/// Parse a `user-agent:` line, returning the trimmed agent name.
///
/// Returns `None` when the line does not start with the prefix
/// (case-insensitively). The checked `get` slice also returns `None`
/// (instead of panicking, as the old unchecked `&line[..n]` did) when a
/// multi-byte UTF-8 character straddles the prefix boundary, and
/// `eq_ignore_ascii_case` avoids allocating a lowercased copy per line.
fn parse_user_agent(line: &str) -> Option<&str> {
    let prefix = line.get(..UA_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(UA_PREFIX) {
        // Safe: `get` above proved this index is a char boundary.
        Some(line[UA_PREFIX.len()..].trim())
    } else {
        None
    }
}
/// Parse a `crawl-delay:` line into a delay value.
///
/// Returns `None` when the line does not start with the prefix
/// (case-insensitively) or the value does not parse as a `u64`. The
/// checked `get` slice avoids panicking on a multi-byte UTF-8 boundary.
fn parse_delay(line: &str) -> Option<u64> {
    let prefix = line.get(..DELAY_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(DELAY_PREFIX) {
        // Fixed: the original returned `Some(suffix.trim())` (a &str),
        // which does not type-check against `Option<u64>`; parse the
        // trimmed value and map malformed numbers to `None`.
        line[DELAY_PREFIX.len()..].trim().parse().ok()
    } else {
        None
    }
}
/// Parse an `allow:` line, returning the trimmed path pattern.
///
/// Returns `None` when the line does not start with the prefix
/// (case-insensitively). The checked `get` slice returns `None` instead
/// of panicking when a multi-byte UTF-8 character straddles the prefix
/// boundary, and `eq_ignore_ascii_case` avoids a per-line allocation.
fn parse_allow(line: &str) -> Option<&str> {
    let prefix = line.get(..ALLOW_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(ALLOW_PREFIX) {
        Some(line[ALLOW_PREFIX.len()..].trim())
    } else {
        None
    }
}
/// Parse a `disallow:` line, returning the trimmed path pattern.
///
/// Returns `None` when the line does not start with the prefix
/// (case-insensitively). The checked `get` slice returns `None` instead
/// of panicking when a multi-byte UTF-8 character straddles the prefix
/// boundary, and `eq_ignore_ascii_case` avoids a per-line allocation.
fn parse_disallow(line: &str) -> Option<&str> {
    let prefix = line.get(..DISALLOW_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(DISALLOW_PREFIX) {
        Some(line[DISALLOW_PREFIX.len()..].trim())
    } else {
        None
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Allow lines must tolerate mixed case, extra whitespace, and
    // trailing `#` comments.
    #[test]
    fn test_parse_allow() {
        let test_cases = vec![
            ("Allow: /", "/"),
            ("allow: / # Root with comment", "/"),
            ("ALLOW: /abc/def ", "/abc/def"),
            ("Allow: /abc/def ", "/abc/def"),
            (" Allow: /*/foo", "/*/foo"),
        ];

        for (i, o) in test_cases {
            assert_eq!(
                parse_line(i.into()),
                ParsedLine::Rule(ParsedRule::Allow(o.into()))
            );
        }
    }

    // Same tolerance checks as above, for Disallow lines.
    #[test]
    fn test_parse_disallow() {
        let test_cases = vec![
            ("Disallow: /", "/"),
            ("disallow: / # Root with comment", "/"),
            ("DISALLOW: /abc/def ", "/abc/def"),
            ("Disallow: /abc/def ", "/abc/def"),
            (" Disallow: /*/foo", "/*/foo"),
        ];

        for (i, o) in test_cases {
            assert_eq!(
                parse_line(i.into()),
                ParsedLine::Rule(ParsedRule::Disallow(o.into()))
            );
        }
    }

    // Agent names are lowercased by parse_line; "*" passes through.
    #[test]
    fn test_parse_user_agent() {
        let test_cases = vec![
            ("User-agent: *", "*"),
            ("user-agent: ImABot # User agent with comment", "imabot"),
            (" USER-AGENT: ImABot ", "imabot"),
        ];

        for (i, o) in test_cases {
            assert_eq!(parse_line(i.into()), ParsedLine::UserAgent(o.into()));
        }
    }

    // Misspelled directives, comments, and blank lines all classify as
    // Nothing rather than erroring.
    #[test]
    fn test_parse_nothing() {
        let test_cases = vec![
            "Useragent: *",
            "# Comment",
            "",
            " ",
            "\t",
            "alow: /",
            "disalow: /",
        ];

        for i in test_cases {
            assert_eq!(parse_line(i.into()), ParsedLine::Nothing);
        }
    }

    // Full pipeline: the most specific matching group's rules win, and
    // unmatched agents fall back to the `*` group.
    #[test]
    fn test_end_to_end() {
        tokio_test::block_on(async {
            let example_robots = r#"
User-agent: jones-bot
Disallow: /
User-agent: jones
User-agent: foobar
Allow: /
User-agent: *
Disallow: /
"#
            .as_bytes();

            let parser = Compiler::new("foobar");
            let foobar_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("jones-bot");
            let jonesbot_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("imabot");
            let imabot_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("abc");
            let abc_machine = parser.compile(example_robots).await.unwrap();

            assert_eq!(true, foobar_machine.allow("/index.html"));
            assert_eq!(false, jonesbot_machine.allow("/index.html"));
            assert_eq!(false, imabot_machine.allow("/index.html"));
            assert_eq!(false, abc_machine.allow("/index.html"));
        });
    }

    // Rules before any user-agent header: undefined behavior, treated
    // as "no groups" so everything is allowed.
    #[test]
    fn test_invalid_1() {
        tokio_test::block_on(async {
            let example_robots = r#"
# Instead of treating this as an error, we'll just consider
# this behavior undefined.
Allow: /
User-agent: jones
User-agent: foobar
Disallow: /
"#
            .as_bytes();

            let parser = Compiler::new("foobar");
            let foobar_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("imabot");
            let imabot_machine = parser.compile(example_robots).await.unwrap();

            // Everything is allowed because next_header() returns None
            assert_eq!(true, foobar_machine.allow("/index.html"));
            assert_eq!(true, imabot_machine.allow("/index.html"));
        });
    }

    // A trailing user-agent header with no rules: also undefined; the
    // empty group produces no restrictions.
    #[test]
    fn test_invalid_2() {
        tokio_test::block_on(async {
            let example_robots = r#"
User-agent: jones
User-agent: foobar
Disallow: /
# Instead of treating this as an error, we consider this
# behavior undefined.
User-agent: imabot
"#
            .as_bytes();

            let parser = Compiler::new("foobar");
            let foobar_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("imabot");
            let imabot_machine = parser.compile(example_robots).await.unwrap();

            assert_eq!(false, foobar_machine.allow("/index.html"));
            assert_eq!(true, imabot_machine.allow("/index.html"));
        });
    }
}
use super::dfa::{Cylon, Rule};
use futures_util::{
io::{AsyncBufRead, AsyncRead, BufReader, Result},
AsyncBufReadExt,
};
use serde_derive::{Deserialize, Serialize};
// Directive prefixes recognized in robots.txt lines. Kept lowercase:
// each parser lowercases the line's prefix before comparing, making the
// match case-insensitive.
const UA_PREFIX: &str = "user-agent:";
const DELAY_PREFIX: &str = "crawl-delay:";
const ALLOW_PREFIX: &str = "allow:";
const DISALLOW_PREFIX: &str = "disallow:";
/// A single rule parsed from a robots.txt line, owning its data so it
/// can outlive the per-line read buffer.
#[derive(Debug, PartialEq, Clone)]
enum ParsedRule {
    // Path pattern from an `allow:` line.
    Allow(String),
    // Path pattern from a `disallow:` line.
    Disallow(String),
    // Raw value text from a `crawl-delay:` line (not parsed to a
    // number at this stage).
    Delay(String),
}
// Borrowing conversion from an owned `ParsedRule` to the dfa's `Rule`
// view. Implemented as `From` rather than `Into` — the std blanket impl
// derives `Into<Rule>` automatically, so `r.into()` call sites keep
// working (and clippy's `from_over_into` is satisfied). The match arms
// are unchanged from the original.
impl<'a> From<&'a ParsedRule> for Rule<'a> {
    fn from(rule: &'a ParsedRule) -> Rule<'a> {
        match rule {
            ParsedRule::Allow(path) => Rule::Allow(&path[..]),
            ParsedRule::Disallow(path) => Rule::Disallow(&path[..]),
            // NOTE(review): passes the borrowed delay straight through;
            // confirm `Rule::Delay`'s payload type in src/dfa.rs (not
            // visible here).
            ParsedRule::Delay(delay) => Rule::Delay(delay),
        }
    }
}
/// One classified line of a robots.txt file.
#[derive(Debug, PartialEq)]
enum ParsedLine {
    // A `user-agent:` line; the agent name is already lowercased.
    UserAgent(String),
    // An `allow:`, `disallow:`, or `crawl-delay:` line.
    Rule(ParsedRule),
    // Anything else: blank lines, comments, unrecognized directives.
    Nothing,
}
/// A compiler takes an input robots.txt file and outputs a compiled Cylon,
/// which can be used to efficiently match a large number of paths against
/// the robots.txt file.
#[derive(Debug, Serialize, Deserialize)]
pub struct Compiler {
    // Lowercased user agent used to select the matching rule group.
    user_agent: String,
}
impl Compiler {
    /// Build a new compiler that parses rules for the given user agent from
    /// a robots.txt file.
    pub fn new(user_agent: &str) -> Self {
        Self {
            // Lowercased once up front so every later comparison against
            // (already lowercased) parsed agents is case-insensitive.
            user_agent: user_agent.to_lowercase(),
        }
    }

    /// Parse an input robots.txt file into a Cylon that can recognize
    /// whether or not a path matches the rules for the Parser's user agent.
    pub async fn compile<R: AsyncRead + Unpin>(&self, file: R) -> Result<Cylon> {
        let reader = BufReader::new(file);
        let mut agent = String::new();
        let mut rules: Vec<ParsedRule> = vec![];
        let mut group_reader = GroupReader::new(reader);

        // find the most specific matching group in the robots file
        while let Some(agents) = group_reader.next_header().await? {
            let matching_agent = agents.iter().find(|a| {
                // "*" matches any agent; otherwise match when the
                // configured agent string contains the group's token.
                let matches = &a[..] == "*" || self.user_agent.contains(*a);
                // A longer token is treated as more specific and may
                // replace the rules of a previously matched group.
                let more_specific = a.len() > agent.len();
                matches && more_specific
            });

            if let Some(matching_agent) = matching_agent {
                agent = matching_agent.clone();
                // Only the matching group's rules are materialized;
                // non-matching groups are skipped by next_header().
                rules = group_reader.next_rules().await?;
            }
        }

        let rules = rules.iter().map(|r| r.into()).collect();
        Ok(Cylon::compile(rules))
    }
}
/// Streams a robots.txt file as alternating runs of user-agent header
/// lines and rule lines, so whole non-matching groups can be skipped
/// without copying their rules.
struct GroupReader<R: AsyncBufRead + Unpin> {
    // True while consuming consecutive `user-agent:` lines (a header).
    parsing_agents: bool,
    // Agents collected for the current (possibly still open) header.
    agents: Vec<String>,
    // Rules buffered for the current group.
    rules: Vec<ParsedRule>,
    reader: R,
}
impl<R: AsyncBufRead + Unpin> GroupReader<R> {
    fn new(reader: R) -> Self {
        Self {
            parsing_agents: true,
            agents: vec![],
            rules: vec![],
            reader,
        }
    }

    /// Scan forward until the next group header defined by one or more
    /// user agent lines. This lets us optimize the lines we need to copy
    /// so we can skip over groups that don't match the desired user agent.
    async fn next_header(&mut self) -> Result<Option<Vec<String>>> {
        let mut buf = String::new();
        while self.reader.read_line(&mut buf).await? != 0 {
            // NOTE(review): parse_line takes ownership, so each line is
            // cloned out of the reused buffer.
            let parsed_line = parse_line(buf.clone());

            match parsed_line {
                ParsedLine::UserAgent(ua) if self.parsing_agents => {
                    self.agents.push(ua);
                }
                ParsedLine::UserAgent(ua) => {
                    // A user-agent line after rules starts a new group;
                    // drop any state left from the previous group.
                    self.agents = vec![ua];
                    self.rules = vec![];
                    self.parsing_agents = true;
                }
                ParsedLine::Rule(rule) if self.parsing_agents => {
                    // Preserve the rule in case we need it in next_rules().
                    self.rules.push(rule);
                    self.parsing_agents = false;
                    break;
                }
                // Skip over lines until we get to the next user agent.
                ParsedLine::Rule(..) => (),
                ParsedLine::Nothing => (),
            }

            buf.clear();
        }

        let agents = self.agents.clone();
        self.agents = vec![];

        // No agents at EOF means there are no further groups.
        if agents.is_empty() {
            return Ok(None);
        }

        Ok(Some(agents))
    }

    async fn next_rules(&mut self) -> Result<Vec<ParsedRule>> {
        let mut buf = String::new();
        while self.reader.read_line(&mut buf).await? != 0 {
            let parsed_line = parse_line(buf.clone());

            match parsed_line {
                ParsedLine::Rule(rule) => {
                    self.rules.push(rule);
                    self.parsing_agents = false;
                }
                ParsedLine::UserAgent(ua) if !self.parsing_agents => {
                    // Preserve the agent in case we need it in next_agents().
                    self.agents.push(ua);
                    self.parsing_agents = true;
                    break;
                }
                // Skip over lines until we get to the next rule.
                ParsedLine::UserAgent(..) => (),
                ParsedLine::Nothing => (),
            }

            buf.clear();
        }

        let rules = self.rules.clone();
        self.rules = vec![];
        Ok(rules)
    }
}
/// Classify one raw robots.txt line, ignoring comments and surrounding
/// whitespace.
fn parse_line(line: String) -> ParsedLine {
    let trimmed = strip_comments(&line).trim();

    // Directives are checked roughly from most to least common so the
    // typical line exits after as few prefix comparisons as possible.
    if let Some(path) = parse_disallow(trimmed) {
        return ParsedLine::Rule(ParsedRule::Disallow(path.into()));
    }
    if let Some(agent) = parse_user_agent(trimmed) {
        return ParsedLine::UserAgent(agent.to_lowercase());
    }
    if let Some(path) = parse_allow(trimmed) {
        return ParsedLine::Rule(ParsedRule::Allow(path.into()));
    }
    if let Some(delay) = parse_delay(trimmed) {
        return ParsedLine::Rule(ParsedRule::Delay(delay.into()));
    }

    ParsedLine::Nothing
}
/// Return the portion of `line` before the first `#` comment marker, or
/// the whole line when there is no comment.
fn strip_comments(line: &str) -> &str {
    // `split` always yields at least one piece, so the fallback can
    // never actually trigger; it just keeps the expression total.
    line.split('#').next().unwrap_or(line)
}
/// Parse a `user-agent:` line, returning the trimmed agent name.
///
/// Returns `None` when the line does not start with the prefix
/// (case-insensitively). The checked `get` slice also returns `None`
/// (instead of panicking, as the old unchecked `&line[..n]` did) when a
/// multi-byte UTF-8 character straddles the prefix boundary, and
/// `eq_ignore_ascii_case` avoids allocating a lowercased copy per line.
fn parse_user_agent(line: &str) -> Option<&str> {
    let prefix = line.get(..UA_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(UA_PREFIX) {
        // Safe: `get` above proved this index is a char boundary.
        Some(line[UA_PREFIX.len()..].trim())
    } else {
        None
    }
}
/// Parse a `crawl-delay:` line, returning the trimmed raw value text.
///
/// Returns `None` when the line does not start with the prefix
/// (case-insensitively). The checked `get` slice returns `None` instead
/// of panicking when a multi-byte UTF-8 character straddles the prefix
/// boundary, and `eq_ignore_ascii_case` avoids a per-line allocation.
fn parse_delay(line: &str) -> Option<&str> {
    let prefix = line.get(..DELAY_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(DELAY_PREFIX) {
        Some(line[DELAY_PREFIX.len()..].trim())
    } else {
        None
    }
}
/// Parse an `allow:` line, returning the trimmed path pattern.
///
/// Returns `None` when the line does not start with the prefix
/// (case-insensitively). The checked `get` slice returns `None` instead
/// of panicking when a multi-byte UTF-8 character straddles the prefix
/// boundary, and `eq_ignore_ascii_case` avoids a per-line allocation.
fn parse_allow(line: &str) -> Option<&str> {
    let prefix = line.get(..ALLOW_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(ALLOW_PREFIX) {
        Some(line[ALLOW_PREFIX.len()..].trim())
    } else {
        None
    }
}
/// Parse a `disallow:` line, returning the trimmed path pattern.
///
/// Returns `None` when the line does not start with the prefix
/// (case-insensitively). The checked `get` slice returns `None` instead
/// of panicking when a multi-byte UTF-8 character straddles the prefix
/// boundary, and `eq_ignore_ascii_case` avoids a per-line allocation.
fn parse_disallow(line: &str) -> Option<&str> {
    let prefix = line.get(..DISALLOW_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(DISALLOW_PREFIX) {
        Some(line[DISALLOW_PREFIX.len()..].trim())
    } else {
        None
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Allow lines must tolerate mixed case, extra whitespace, and
    // trailing `#` comments.
    #[test]
    fn test_parse_allow() {
        let test_cases = vec![
            ("Allow: /", "/"),
            ("allow: / # Root with comment", "/"),
            ("ALLOW: /abc/def ", "/abc/def"),
            ("Allow: /abc/def ", "/abc/def"),
            (" Allow: /*/foo", "/*/foo"),
        ];

        for (i, o) in test_cases {
            assert_eq!(
                parse_line(i.into()),
                ParsedLine::Rule(ParsedRule::Allow(o.into()))
            );
        }
    }

    // Same tolerance checks as above, for Disallow lines.
    #[test]
    fn test_parse_disallow() {
        let test_cases = vec![
            ("Disallow: /", "/"),
            ("disallow: / # Root with comment", "/"),
            ("DISALLOW: /abc/def ", "/abc/def"),
            ("Disallow: /abc/def ", "/abc/def"),
            (" Disallow: /*/foo", "/*/foo"),
        ];

        for (i, o) in test_cases {
            assert_eq!(
                parse_line(i.into()),
                ParsedLine::Rule(ParsedRule::Disallow(o.into()))
            );
        }
    }

    // Agent names are lowercased by parse_line; "*" passes through.
    #[test]
    fn test_parse_user_agent() {
        let test_cases = vec![
            ("User-agent: *", "*"),
            ("user-agent: ImABot # User agent with comment", "imabot"),
            (" USER-AGENT: ImABot ", "imabot"),
        ];

        for (i, o) in test_cases {
            assert_eq!(parse_line(i.into()), ParsedLine::UserAgent(o.into()));
        }
    }

    // Misspelled directives, comments, and blank lines all classify as
    // Nothing rather than erroring.
    #[test]
    fn test_parse_nothing() {
        let test_cases = vec![
            "Useragent: *",
            "# Comment",
            "",
            " ",
            "\t",
            "alow: /",
            "disalow: /",
        ];

        for i in test_cases {
            assert_eq!(parse_line(i.into()), ParsedLine::Nothing);
        }
    }

    // Crawl-Delay handling: a matching group's delay is exposed via
    // delay(); a later duplicate Crawl-Delay in the same group wins
    // (barfoo), and a group without one reports None (googlebot).
    #[test]
    fn test_crawl_delay() {
        tokio_test::block_on(async {
            let example_robots = r#"
User-agent: jones-bot
Disallow: /
Crawl-Delay: 30
User-agent: foobar
Crawl-Delay: 60
User-agent: googlebot
Allow: /
User-agent: barfoo
Crawl-Delay: 60
Crawl-Delay: 20
"#
            .as_bytes();

            let parser = Compiler::new("foobar");
            let foobar_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("googlebot");
            let googlebot_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("barfoo");
            let barfoo_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("jones-bot");
            let jonesbot_machine = parser.compile(example_robots).await.unwrap();

            assert_eq!(Some(60), foobar_machine.delay());
            assert_eq!(Some(20), barfoo_machine.delay());
            assert_eq!(Some(30), jonesbot_machine.delay());
            assert_eq!(None, googlebot_machine.delay());
        });
    }

    // Full pipeline: the most specific matching group's rules win, and
    // unmatched agents fall back to the `*` group.
    #[test]
    fn test_end_to_end() {
        tokio_test::block_on(async {
            let example_robots = r#"
User-agent: jones-bot
Disallow: /
User-agent: foo
Allow: /
Crawl-Delay: 20
User-agent: jones
User-agent: foobar
Allow: /
User-agent: *
Disallow: /
"#
            .as_bytes();

            let parser = Compiler::new("foobar");
            let foobar_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("jones-bot");
            let jonesbot_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("imabot");
            let imabot_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("abc");
            let abc_machine = parser.compile(example_robots).await.unwrap();

            assert_eq!(true, foobar_machine.allow("/index.html"));
            assert_eq!(false, jonesbot_machine.allow("/index.html"));
            assert_eq!(false, imabot_machine.allow("/index.html"));
            assert_eq!(false, abc_machine.allow("/index.html"));
        });
    }

    // Rules before any user-agent header: undefined behavior, treated
    // as "no groups" so everything is allowed.
    #[test]
    fn test_invalid_1() {
        tokio_test::block_on(async {
            let example_robots = r#"
# Instead of treating this as an error, we'll just consider
# this behavior undefined.
Allow: /
User-agent: jones
User-agent: foobar
Disallow: /
"#
            .as_bytes();

            let parser = Compiler::new("foobar");
            let foobar_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("imabot");
            let imabot_machine = parser.compile(example_robots).await.unwrap();

            // Everything is allowed because next_header() returns None
            assert_eq!(true, foobar_machine.allow("/index.html"));
            assert_eq!(true, imabot_machine.allow("/index.html"));
        });
    }

    // A trailing user-agent header with no rules: also undefined; the
    // empty group produces no restrictions.
    #[test]
    fn test_invalid_2() {
        tokio_test::block_on(async {
            let example_robots = r#"
User-agent: jones
User-agent: foobar
Disallow: /
# Instead of treating this as an error, we consider this
# behavior undefined.
User-agent: imabot
"#
            .as_bytes();

            let parser = Compiler::new("foobar");
            let foobar_machine = parser.compile(example_robots).await.unwrap();

            let parser = Compiler::new("imabot");
            let imabot_machine = parser.compile(example_robots).await.unwrap();

            assert_eq!(false, foobar_machine.allow("/index.html"));
            assert_eq!(true, imabot_machine.allow("/index.html"));
        });
    }
}