From fe11216642a6ec9fec05d8bc7bab3d3eaf6ee2ca Mon Sep 17 00:00:00 2001
From: "r.portalez" <r.portalez@qwant.com>
Date: Wed, 10 Mar 2021 15:57:55 +0100
Subject: [PATCH] add crawl-delay

---
 Cargo.toml       |    6 +-
 benches/parse.rs |    1 +
 src/dfa.rs       | 1033 ++++++++++++++++++++++++----------------------
 src/parse.rs     |  841 +++++++++++++++++++------------------
 4 files changed, 979 insertions(+), 902 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 795f606..c82cf6d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,9 +11,9 @@ keywords = ["robots", "txt", "parse", "compile"]
 repository = "https://github.com/crestonbunch/cylon"
 
 [dependencies]
-futures-util = "0.3.13"
-serde = "1.0.124"
-serde_derive = "1.0.124"
+futures-util = "0.3"
+serde = "1.0"
+serde_derive = "1.0"
 
 [dev-dependencies]
 criterion = { version = "0.3", features = ["async_futures"] }
diff --git a/benches/parse.rs b/benches/parse.rs
index 2084b00..ba0851e 100644
--- a/benches/parse.rs
+++ b/benches/parse.rs
@@ -9,6 +9,7 @@ Disallow: /
 Allow: /a
 Allow: /abc
 Allow: /b
+Crawl-Delay: 20
 "#
 .as_bytes();
 
diff --git a/src/dfa.rs b/src/dfa.rs
index 5e57ea5..6c39eb4 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -1,500 +1,533 @@
-use serde_derive::{Deserialize, Serialize};
-
-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
-pub enum Rule<'a> {
-    Allow(&'a str),
-    Disallow(&'a str),
-    Delay(u64),
-}
-
-impl<'a> Rule<'a> {
-    fn inner(&self) -> &str {
-        match self {
-            Rule::Allow(inner) => inner,
-            Rule::Disallow(inner) => inner,
-            Rule::Delay(inner) => inner,
-        }
-    }
-}
-
-#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)]
-enum Edge {
-    MatchChar(char),
-    MatchAny,
-    MatchEow,
-}
-
-#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)]
-struct Transition(Edge, usize);
-
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
-enum State {
-    Allow,
-    Disallow,
-    Intermediate,
-}
-
-/// A Cylon is a DFA that recognizes rules from a compiled robots.txt
-/// file. By providing it a URL path, it can decide whether or not
-/// the robots file that compiled it allows or disallows that path in
-/// roughly O(n) time, where n is the length of the path.
-#[derive(Debug, Serialize, Deserialize)]
-pub struct Cylon {
-    states: Vec<State>,
-    transitions: Vec<Vec<Transition>>,
-}
-
-impl Cylon {
-    /// Match whether the rules allow or disallow the target path.
-    pub fn allow(&self, path: &str) -> bool {
-        let mut state = path.chars().fold(2, |state, path_char| {
-            let t = &self.transitions[state];
-            t.iter()
-                .rev()
-                // Pick the last transition to always prioritize MatchChar
-                // over MatchAny (which will always be the first transition.)
-                .find(|transition| match transition {
-                    Transition(Edge::MatchAny, ..) => true,
-                    Transition(Edge::MatchEow, ..) => false,
-                    Transition(Edge::MatchChar(edge_char), ..) => *edge_char == path_char,
-                })
-                .map(|Transition(.., next_state)| *next_state)
-                // We are guaranteed at least one matching state because of
-                // the way the DFA is constructed.
-                .unwrap()
-        });
-
-        // Follow the EoW transition, if necessary
-        let t = &self.transitions[state];
-        state = t
-            .iter()
-            .rev()
-            .find(|transition| match transition {
-                Transition(Edge::MatchEow, ..) => true,
-                Transition(Edge::MatchAny, ..) => true,
-                _ => false,
-            })
-            .map(|Transition(.., next_state)| *next_state)
-            .unwrap_or(state);
-
-        match self.states[state] {
-            State::Allow => true,
-            State::Disallow => false,
-            // Intermediate states are not preserved in the DFA
-            State::Intermediate => unreachable!(),
-        }
-    }
-
-    /// Compile a machine from a list of rules.
-    pub fn compile(mut rules: Vec<Rule>) -> Self {
-        // This algorithm constructs a DFA by doing BFS over the prefix tree of
-        // paths in the provided list of rules. However, for performance reasons
-        // it does not actually build a tree structure. (Vecs have better
-        // cache-locality by avoiding random memory access.)
-
-        let mut transitions: Vec<Vec<Transition>> = vec![
-            vec![Transition(Edge::MatchAny, 0)],
-            vec![Transition(Edge::MatchAny, 1)],
-        ];
-        let mut states: Vec<State> = vec![State::Allow, State::Disallow];
-
-        rules.sort_by(|a, b| Ord::cmp(a.inner(), b.inner()));
-
-        let mut queue = vec![("", 0, 0, State::Intermediate)];
-        while !queue.is_empty() {
-            // parent_prefix is the "parent node" in the prefix tree. We are
-            // going to visit its children by filtering from the list of
-            // paths only the paths that start with the parent_prefix.
-            // wildcard_state is a node to jump to when an unmatched character
-            // is encountered. This is usually a node higher up in the tree
-            // that can match any character legally, but is also a prefix
-            // (read: ancestor) of the current node.
-            let (parent_prefix, mut wildcard_state, parent_state, state) = queue.remove(0);
-            let last_char = parent_prefix.chars().last();
-
-            wildcard_state = match state {
-                State::Allow => 0,
-                State::Disallow if last_char == Some('$') => wildcard_state,
-                State::Disallow => 1,
-                State::Intermediate => wildcard_state,
-            };
-
-            let mut t = match last_char {
-                Some('$') => {
-                    // The EOW character cannot match anything else
-                    vec![Transition(Edge::MatchAny, wildcard_state)]
-                }
-                Some('*') => {
-                    // The wildcard character overrides the wildcard state
-                    vec![Transition(Edge::MatchAny, transitions.len())]
-                }
-                _ => {
-                    // Every other state has a self-loop that matches anything
-                    vec![Transition(Edge::MatchAny, wildcard_state)]
-                }
-            };
-
-            let mut curr_prefix = "";
-            rules
-                .iter()
-                .map(Rule::inner)
-                .zip(&rules)
-                .filter(|(path, _)| (*path).starts_with(parent_prefix))
-                .filter(|(path, _)| (*path) != parent_prefix)
-                .for_each(|(path, rule)| {
-                    let child_prefix = &path[0..parent_prefix.len() + 1];
-                    if curr_prefix == child_prefix {
-                        // We only want to visit a child node once, but
-                        // many rules might have the same child_prefix, so
-                        // we skip the duplicates after the first time
-                        // we see a prefix. (This could be a filter(), but
-                        // it's a bit hard to encode earlier in the chain.)
-                        return;
-                    }
-                    curr_prefix = child_prefix;
-
-                    let eow = child_prefix == path;
-                    let state = match (rule, eow) {
-                        (Rule::Allow(..), true) => State::Allow,
-                        (Rule::Disallow(..), true) => State::Disallow,
-                        _ => State::Intermediate,
-                    };
-
-                    queue.push((child_prefix, wildcard_state, transitions.len(), state));
-
-                    // NB: we can predict what state index the child
-                    // will have before it's even pushed onto the state vec.
-                    let child_index = transitions.len() + queue.len();
-                    let edge_char = child_prefix.chars().last().unwrap();
-                    let transition = Transition(
-                        match edge_char {
-                            '*' => Edge::MatchAny,
-                            '$' => Edge::MatchEow,
-                            c => Edge::MatchChar(c),
-                        },
-                        child_index,
-                    );
-
-                    // Add transitions from the parent state to the child state
-                    // so that the wildcard character matches are optional.
-                    if last_char == Some('*') {
-                        let parent_t = &mut transitions[parent_state];
-                        parent_t.push(transition);
-                    }
-
-                    t.push(transition);
-                });
-
-            states.push(match state {
-                State::Allow | State::Disallow => state,
-                State::Intermediate => states[wildcard_state],
-            });
-            transitions.push(t);
-        }
-
-        Self {
-            states,
-            transitions,
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    macro_rules! t {
-        ('*' => $x:expr) => {
-            Transition(Edge::MatchAny, $x)
-        };
-        ('$' => $x:expr) => {
-            Transition(Edge::MatchEow, $x)
-        };
-        ($x:expr => $y:expr) => {
-            Transition(Edge::MatchChar($x), $y)
-        };
-    }
-
-    #[test]
-    fn test_compile() {
-        let rules = vec![
-            Rule::Disallow("/"),
-            Rule::Allow("/a"),
-            Rule::Allow("/abc"),
-            Rule::Allow("/b"),
-        ];
-
-        let expect_transitions = vec![
-            vec![t!('*' => 0)],
-            vec![t!('*' => 1)],
-            vec![t!('*' => 0), t!('/' => 3)],               // ""
-            vec![t!('*' => 1), t!('a' => 4), t!('b' => 5)], // "/"
-            vec![t!('*' => 0), t!('b' => 6)],               // "/a"
-            vec![t!('*' => 0)],                             // "/b"
-            vec![t!('*' => 0), t!('c' => 7)],               // "/ab"
-            vec![t!('*' => 0)],                             // "/abc"
-        ];
-
-        let expect_states = vec![
-            State::Allow,
-            State::Disallow,
-            State::Allow,
-            State::Disallow,
-            State::Allow,
-            State::Allow,
-            State::Allow,
-            State::Allow,
-        ];
-
-        let actual = Cylon::compile(rules);
-        assert_eq!(actual.transitions, expect_transitions);
-        assert_eq!(actual.states, expect_states);
-    }
-
-    #[test]
-    fn test_compile_with_wildcard() {
-        let rules = vec![Rule::Disallow("/"), Rule::Allow("/a"), Rule::Allow("/*.b")];
-
-        let expect_transitions = vec![
-            vec![t!('*' => 0)],
-            vec![t!('*' => 1)],
-            vec![t!('*' => 0), t!('/' => 3)], // ""
-            vec![t!('*' => 1), t!('*' => 4), t!('a' => 5), t!('.' => 6)], // "/"
-            vec![t!('*' => 4), t!('.' => 6)], // "/*"
-            vec![t!('*' => 0)],               // "/a"
-            vec![t!('*' => 1), t!('b' => 7)], // "/*."
-            vec![t!('*' => 0)],               // "/*.b"
-        ];
-
-        let expect_states = vec![
-            State::Allow,
-            State::Disallow,
-            State::Allow,
-            State::Disallow,
-            State::Disallow,
-            State::Allow,
-            State::Disallow,
-            State::Allow,
-        ];
-
-        let actual = Cylon::compile(rules);
-        assert_eq!(actual.transitions, expect_transitions);
-        assert_eq!(actual.states, expect_states);
-    }
-
-    #[test]
-    fn test_compile_tricky_wildcard() {
-        let rules = vec![Rule::Disallow("/"), Rule::Allow("/*.")];
-
-        let expect_transitions = vec![
-            vec![t!('*' => 0)],
-            vec![t!('*' => 1)],
-            vec![t!('*' => 0), t!('/' => 3)],               // ""
-            vec![t!('*' => 1), t!('*' => 4), t!('.' => 5)], // "/"
-            vec![t!('*' => 4), t!('.' => 5)],               // "/*"
-            vec![t!('*' => 0)],                             // "/*."
-        ];
-
-        let expect_states = vec![
-            State::Allow,
-            State::Disallow,
-            State::Allow,
-            State::Disallow,
-            State::Disallow,
-            State::Allow,
-        ];
-
-        let actual = Cylon::compile(rules);
-        assert_eq!(actual.transitions, expect_transitions);
-        assert_eq!(actual.states, expect_states);
-    }
-
-    #[test]
-    fn test_compile_with_eow() {
-        let rules = vec![
-            Rule::Allow("/"),
-            Rule::Disallow("/a$"),
-            // Note that this rule is nonsensical. It will compile, but
-            // no guarantees are made as to how it's matched. Rules should
-            // use url-encoded strings to escape $.
-            Rule::Disallow("/x$y"),
-        ];
-
-        let expect_transitions = vec![
-            vec![t!('*' => 0)],
-            vec![t!('*' => 1)],
-            vec![t!('*' => 0), t!('/' => 3)],               // ""
-            vec![t!('*' => 0), t!('a' => 4), t!('x' => 5)], // "/"
-            vec![t!('*' => 0), t!('$' => 6)],               // "/a"
-            vec![t!('*' => 0), t!('$' => 7)],               // "/x"
-            vec![t!('*' => 0)],                             // "/a$"
-            vec![t!('*' => 0), t!('y' => 8)],               // "/x$"
-            vec![t!('*' => 1)],                             // "/x$y"
-        ];
-
-        let expect_states = vec![
-            State::Allow,
-            State::Disallow,
-            State::Allow,
-            State::Allow,
-            State::Allow,
-            State::Allow,
-            State::Disallow,
-            State::Allow,
-            State::Disallow,
-        ];
-
-        let actual = Cylon::compile(rules);
-        assert_eq!(actual.transitions, expect_transitions);
-        assert_eq!(actual.states, expect_states);
-    }
-
-    #[test]
-    fn test_allow() {
-        let rules = vec![
-            Rule::Disallow("/"),
-            Rule::Allow("/a"),
-            Rule::Allow("/abc"),
-            Rule::Allow("/b"),
-        ];
-
-        let machine = Cylon::compile(rules);
-        assert_eq!(false, machine.allow("/"));
-        assert_eq!(true, machine.allow("/a"));
-        assert_eq!(true, machine.allow("/a/b"));
-        assert_eq!(true, machine.allow("/a"));
-        assert_eq!(true, machine.allow("/abc"));
-        assert_eq!(true, machine.allow("/abc/def"));
-        assert_eq!(true, machine.allow("/b"));
-        assert_eq!(true, machine.allow("/b/c"));
-    }
-
-    #[test]
-    fn test_allow_match_any() {
-        let rules = vec![
-            Rule::Allow("/"),
-            Rule::Disallow("/secret/*.txt"),
-            Rule::Disallow("/private/*"),
-        ];
-
-        let machine = Cylon::compile(rules);
-        assert_eq!(true, machine.allow("/"));
-        assert_eq!(true, machine.allow("/abc"));
-        assert_eq!(false, machine.allow("/secret/abc.txt"));
-        assert_eq!(false, machine.allow("/secret/123.txt"));
-        assert_eq!(true, machine.allow("/secret/abc.csv"));
-        assert_eq!(true, machine.allow("/secret/123.csv"));
-        assert_eq!(false, machine.allow("/private/abc.txt"));
-        assert_eq!(false, machine.allow("/private/123.txt"));
-        assert_eq!(false, machine.allow("/private/abc.csv"));
-        assert_eq!(false, machine.allow("/private/123.csv"));
-    }
-
-    #[test]
-    fn test_allow_match_eow() {
-        let rules = vec![
-            Rule::Allow("/"),
-            Rule::Disallow("/ignore$"),
-            Rule::Disallow("/foo$bar"),
-        ];
-
-        let machine = Cylon::compile(rules);
-        assert_eq!(true, machine.allow("/"));
-        assert_eq!(true, machine.allow("/abc"));
-        assert_eq!(false, machine.allow("/ignore"));
-        assert_eq!(true, machine.allow("/ignoreabc"));
-        assert_eq!(true, machine.allow("/ignore/abc"));
-        // These are technically undefined, and no behavior
-        // is guaranteed since the rule is malformed. However
-        // it is safer to accept them rather than reject them.
-        assert_eq!(true, machine.allow("/foo"));
-        assert_eq!(true, machine.allow("/foo$bar"));
-    }
-
-    #[test]
-    fn test_allow_more_complicated() {
-        let rules = vec![
-            Rule::Allow("/"),
-            Rule::Disallow("/a$"),
-            Rule::Disallow("/abc"),
-            Rule::Allow("/abc/*"),
-            Rule::Disallow("/foo/bar"),
-            Rule::Allow("/*/bar"),
-            Rule::Disallow("/www/*/images"),
-            Rule::Allow("/www/public/images"),
-        ];
-
-        let machine = Cylon::compile(rules);
-        assert_eq!(true, machine.allow("/"));
-        assert_eq!(true, machine.allow("/directory"));
-        assert_eq!(false, machine.allow("/a"));
-        assert_eq!(true, machine.allow("/ab"));
-        assert_eq!(false, machine.allow("/abc"));
-        assert_eq!(true, machine.allow("/abc/123"));
-        assert_eq!(true, machine.allow("/foo"));
-        assert_eq!(true, machine.allow("/foobar"));
-        assert_eq!(false, machine.allow("/foo/bar"));
-        assert_eq!(false, machine.allow("/foo/bar/baz"));
-        assert_eq!(true, machine.allow("/baz/bar"));
-        assert_eq!(false, machine.allow("/www/cat/images"));
-        assert_eq!(true, machine.allow("/www/public/images"));
-    }
-
-    #[test]
-    fn test_matches() {
-        // Test cases from:
-        // https://developers.google.com/search/reference/robots_txt#group-member-rules
-
-        let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish")]);
-        assert_eq!(true, machine.allow("/fish"));
-        assert_eq!(true, machine.allow("/fish.html"));
-        assert_eq!(true, machine.allow("/fish/salmon.html"));
-        assert_eq!(true, machine.allow("/fishheads.html"));
-        assert_eq!(true, machine.allow("/fishheads/yummy.html"));
-        assert_eq!(true, machine.allow("/fish.php?id=anything"));
-        assert_eq!(false, machine.allow("/Fish.asp"));
-        assert_eq!(false, machine.allow("/catfish"));
-        assert_eq!(false, machine.allow("/?id=fish"));
-
-        let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*")]);
-        assert_eq!(true, machine.allow("/fish"));
-        assert_eq!(true, machine.allow("/fish.html"));
-        assert_eq!(true, machine.allow("/fish/salmon.html"));
-        assert_eq!(true, machine.allow("/fishheads.html"));
-        assert_eq!(true, machine.allow("/fishheads/yummy.html"));
-        assert_eq!(true, machine.allow("/fish.php?id=anything"));
-        assert_eq!(false, machine.allow("/Fish.asp"));
-        assert_eq!(false, machine.allow("/catfish"));
-        assert_eq!(false, machine.allow("/?id=fish"));
-
-        let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish/")]);
-        assert_eq!(true, machine.allow("/fish/"));
-        assert_eq!(true, machine.allow("/fish/?id=anything"));
-        assert_eq!(true, machine.allow("/fish/salmon.htm"));
-        assert_eq!(false, machine.allow("/fish"));
-        assert_eq!(false, machine.allow("/fish.html"));
-        assert_eq!(false, machine.allow("/Fish/Salmon.asp"));
-
-        let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php")]);
-        assert_eq!(true, machine.allow("/filename.php"));
-        assert_eq!(true, machine.allow("/folder/filename.php"));
-        assert_eq!(true, machine.allow("/folder/filename.php?parameters"));
-        assert_eq!(true, machine.allow("/folder/any.php.file.html"));
-        assert_eq!(true, machine.allow("/filename.php/"));
-        assert_eq!(false, machine.allow("/"));
-        assert_eq!(false, machine.allow("/windows.PHP"));
-
-        let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php$")]);
-        assert_eq!(true, machine.allow("/filename.php"));
-        assert_eq!(true, machine.allow("/folder/filename.php"));
-        assert_eq!(false, machine.allow("/filename.php?parameters"));
-        assert_eq!(false, machine.allow("/filename.php/"));
-        assert_eq!(false, machine.allow("/filename.php5"));
-        assert_eq!(false, machine.allow("/windows.PHP"));
-
-        let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*.php")]);
-        assert_eq!(true, machine.allow("/fish.php"));
-        assert_eq!(true, machine.allow("/fishheads/catfish.php?parameters"));
-        assert_eq!(false, machine.allow("/Fish.PHP"));
-    }
-}
+use std::cmp::Ordering;
+
+use serde_derive::{Deserialize, Serialize};
+
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum Rule<'a> {
+    Allow(&'a str), // path pattern; compiles to an allowing state
+    Disallow(&'a str), // path pattern; compiles to a disallowing state
+    Delay(&'a str), // raw crawl-delay text; Cylon::compile parses it as u64
+}
+
+impl<'a> Rule<'a> {
+    /// Borrow the text carried by the rule: the path pattern of an
+    /// `Allow`/`Disallow`, or the raw value of a `Delay`.
+    fn inner(&self) -> &str {
+        match *self {
+            Rule::Allow(text) | Rule::Disallow(text) | Rule::Delay(text) => text,
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)]
+enum Edge {
+    MatchChar(char), // followed when the next path character equals this one
+    MatchAny, // followed for any path character, and also at end of path
+    MatchEow, // never followed for a character, only at end of path
+}
+
+#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)]
+struct Transition(Edge, usize); // (edge label, index of the target state)
+
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+enum State {
+    Allow, // Cylon::allow returns true when matching ends here
+    Disallow, // Cylon::allow returns false when matching ends here
+    Delay, // assigned by compile to the node that ends a Rule::Delay value
+    Intermediate, // compile-time only; stored as its wildcard state's verdict
+}
+
+/// A Cylon is a DFA that recognizes rules from a compiled robots.txt
+/// file. By providing it a URL path, it can decide whether or not
+/// the robots file that compiled it allows or disallows that path in
+/// roughly O(n) time, where n is the length of the path.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Cylon {
+    states: Vec<State>, // verdict per node; compile seeds 0 = Allow, 1 = Disallow
+    transitions: Vec<Vec<Transition>>, // outgoing edges per node, same indexing
+    delay: Option<u64>, // smallest Rule::Delay value that parsed as u64, if any
+}
+
+impl Cylon {
+    /// Crawl delay requested by the compiled rules: the smallest
+    /// `Rule::Delay` value that parsed as an integer, if any.
+    pub fn delay(&self) -> Option<u64> {
+        self.delay
+    }
+
+    /// Match whether the rules allow or disallow the target path.
+    pub fn allow(&self, path: &str) -> bool {
+        match self.states[self.state(path)] {
+            State::Allow => true,
+            State::Disallow => false,
+            // compile() stores neither Intermediate nor Delay states
+            State::Intermediate | State::Delay => unreachable!(),
+        }
+    }
+
+    /// Walk the DFA over `path` and return the index of the final state.
+    fn state(&self, path: &str) -> usize {
+        // Index 2 is the root of the prefix tree (the empty prefix).
+        let state = path.chars().fold(2, |state, path_char| {
+            let t = &self.transitions[state];
+            t.iter()
+                .rev()
+                // Pick the last transition to always prioritize MatchChar
+                // over MatchAny (which will always be the first transition.)
+                .find(|transition| match transition {
+                    Transition(Edge::MatchAny, ..) => true,
+                    Transition(Edge::MatchEow, ..) => false,
+                    Transition(Edge::MatchChar(edge_char), ..) => *edge_char == path_char,
+                })
+                .map(|Transition(.., next_state)| *next_state)
+                // We are guaranteed at least one matching state because of
+                // the way the DFA is constructed.
+                .unwrap()
+        });
+
+        // Follow the EoW transition, if necessary
+        self.transitions[state]
+            .iter()
+            .rev()
+            .find(|transition| match transition {
+                Transition(Edge::MatchEow, ..) => true,
+                Transition(Edge::MatchAny, ..) => true,
+                _ => false,
+            })
+            .map(|Transition(.., next_state)| *next_state)
+            .unwrap_or(state)
+    }
+
+    /// Compile a machine from a list of rules.
+    ///
+    /// `Rule::Delay` values only feed [`Cylon::delay`]; they are never
+    /// compiled into the path-matching DFA.
+    pub fn compile(mut rules: Vec<Rule>) -> Self {
+        // Crawl-delay is a per-machine setting, not a path pattern: keep the
+        // smallest value that parses as an integer.
+        let delay = rules
+            .iter()
+            .filter_map(|rule| match rule {
+                Rule::Delay(raw) => Some(raw.parse::<u64>().ok()),
+                _ => None,
+            })
+            // Parsed values order ascending; unparsable ones order last.
+            .min_by(|a, b| match (a, b) {
+                (Some(x), Some(y)) => x.cmp(y),
+                (Some(_), None) => Ordering::Less,
+                (None, Some(_)) => Ordering::Greater,
+                (None, None) => Ordering::Equal,
+            })
+            .flatten();
+
+        // Delay values must stay out of the DFA: compiled as paths they
+        // would change the verdict for URLs that merely resemble the value.
+        rules.retain(|rule| !matches!(rule, Rule::Delay(_)));
+
+        // This algorithm constructs a DFA by doing BFS over the prefix tree of
+        // paths in the provided list of rules. However, for performance reasons
+        // it does not actually build a tree structure. (Vecs have better
+        // cache-locality by avoiding random memory access.)
+        let mut transitions: Vec<Vec<Transition>> = vec![
+            vec![Transition(Edge::MatchAny, 0)],
+            vec![Transition(Edge::MatchAny, 1)],
+        ];
+        let mut states: Vec<State> = vec![State::Allow, State::Disallow];
+
+        rules.sort_by(|a, b| Ord::cmp(a.inner(), b.inner()));
+
+        let mut queue = vec![("", 0, 0, State::Intermediate)];
+        while !queue.is_empty() {
+            // parent_prefix is the "parent node" in the prefix tree. We are
+            // going to visit its children by filtering from the list of
+            // paths only the paths that start with the parent_prefix.
+            // wildcard_state is a node to jump to when an unmatched character
+            // is encountered. This is usually a node higher up in the tree
+            // that can match any character legally, but is also a prefix
+            // (read: ancestor) of the current node.
+            let (parent_prefix, mut wildcard_state, parent_state, state) = queue.remove(0);
+            let last_char = parent_prefix.chars().last();
+
+            wildcard_state = match state {
+                State::Allow => 0,
+                State::Disallow if last_char == Some('$') => wildcard_state,
+                State::Disallow => 1,
+                State::Delay => 1,
+                State::Intermediate => wildcard_state,
+            };
+
+            let mut t = match last_char {
+                // The wildcard character overrides the wildcard state
+                Some('*') => vec![Transition(Edge::MatchAny, transitions.len())],
+                // Every other state, including one ending in the EOW character,
+                // falls back to the wildcard state on an unmatched character
+                _ => vec![Transition(Edge::MatchAny, wildcard_state)],
+            };
+
+            let mut curr_prefix = "";
+            rules
+                .iter()
+                .map(Rule::inner)
+                .zip(&rules)
+                .filter(|(path, _)| (*path).starts_with(parent_prefix))
+                .filter(|(path, _)| (*path) != parent_prefix)
+                .for_each(|(path, rule)| {
+                    // Extend the prefix by one whole character; slicing by a
+                    // single byte would panic inside a multi-byte character.
+                    let rest = &path[parent_prefix.len()..];
+                    let step = rest.chars().next().map_or(0, char::len_utf8);
+                    let child_prefix = &path[..parent_prefix.len() + step];
+                    if curr_prefix == child_prefix {
+                        // Many rules can share a child_prefix, but each
+                        // child node must only be visited once.
+                        return;
+                    }
+                    curr_prefix = child_prefix;
+
+                    let eow = child_prefix == path;
+                    let state = match (rule, eow) {
+                        (Rule::Allow(..), true) => State::Allow,
+                        (Rule::Disallow(..), true) => State::Disallow,
+                        (Rule::Delay(..), true) => State::Delay, // not hit: removed above
+                        _ => State::Intermediate,
+                    };
+
+                    queue.push((child_prefix, wildcard_state, transitions.len(), state));
+
+                    // NB: we can predict what state index the child
+                    // will have before it's even pushed onto the state vec.
+                    let child_index = transitions.len() + queue.len();
+                    let edge_char = child_prefix.chars().last().unwrap();
+                    let transition = Transition(
+                        match edge_char {
+                            '*' => Edge::MatchAny,
+                            '$' => Edge::MatchEow,
+                            c => Edge::MatchChar(c),
+                        },
+                        child_index,
+                    );
+
+                    // Add transitions from the parent state to the child state
+                    // so that the wildcard character matches are optional.
+                    if last_char == Some('*') {
+                        let parent_t = &mut transitions[parent_state];
+                        parent_t.push(transition);
+                    }
+
+                    t.push(transition);
+                });
+
+            states.push(match state {
+                State::Allow | State::Disallow | State::Delay => state,
+                State::Intermediate => states[wildcard_state],
+            });
+            transitions.push(t);
+        }
+
+        Self {
+            delay,
+            states,
+            transitions,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    macro_rules! t {
+        ('*' => $x:expr) => {
+            Transition(Edge::MatchAny, $x)
+        };
+        ('$' => $x:expr) => {
+            Transition(Edge::MatchEow, $x)
+        };
+        ($x:expr => $y:expr) => {
+            Transition(Edge::MatchChar($x), $y)
+        };
+    }
+
+    #[test]
+    fn test_compile() {
+        let rules = vec![
+            Rule::Disallow("/"),
+            Rule::Allow("/a"),
+            Rule::Allow("/abc"),
+            Rule::Allow("/b"),
+        ];
+
+        let expect_transitions = vec![
+            vec![t!('*' => 0)],
+            vec![t!('*' => 1)],
+            vec![t!('*' => 0), t!('/' => 3)],               // ""
+            vec![t!('*' => 1), t!('a' => 4), t!('b' => 5)], // "/"
+            vec![t!('*' => 0), t!('b' => 6)],               // "/a"
+            vec![t!('*' => 0)],                             // "/b"
+            vec![t!('*' => 0), t!('c' => 7)],               // "/ab"
+            vec![t!('*' => 0)],                             // "/abc"
+        ];
+
+        let expect_states = vec![
+            State::Allow,
+            State::Disallow,
+            State::Allow,
+            State::Disallow,
+            State::Allow,
+            State::Allow,
+            State::Allow,
+            State::Allow,
+        ];
+
+        let actual = Cylon::compile(rules);
+        assert_eq!(actual.transitions, expect_transitions);
+        assert_eq!(actual.states, expect_states);
+    }
+
+    #[test]
+    fn test_compile_with_wildcard() {
+        let input = vec![Rule::Disallow("/"), Rule::Allow("/a"), Rule::Allow("/*.b")];
+        // Expected transition table; trailing comments name a rule prefix.
+        let want_transitions = vec![
+            vec![t!('*' => 0)],
+            vec![t!('*' => 1)],
+            vec![t!('*' => 0), t!('/' => 3)], // ""
+            vec![t!('*' => 1), t!('*' => 4), t!('a' => 5), t!('.' => 6)], // "/"
+            vec![t!('*' => 4), t!('.' => 6)], // "/*"
+            vec![t!('*' => 0)],               // "/a"
+            vec![t!('*' => 1), t!('b' => 7)], // "/*."
+            vec![t!('*' => 0)],               // "/*.b"
+        ];
+        // Expected contents of `states`, one entry per row above.
+        let want_states = vec![
+            State::Allow,
+            State::Disallow,
+            State::Allow,
+            State::Disallow,
+            State::Disallow,
+            State::Allow,
+            State::Disallow,
+            State::Allow,
+        ];
+
+        let machine = Cylon::compile(input);
+        assert_eq!(machine.transitions, want_transitions);
+        assert_eq!(machine.states, want_states);
+    }
+
+    #[test]
+    fn test_compile_tricky_wildcard() {
+        let input = vec![Rule::Disallow("/"), Rule::Allow("/*.")];
+        // Expected transition table; trailing comments name a rule prefix.
+        let want_transitions = vec![
+            vec![t!('*' => 0)],
+            vec![t!('*' => 1)],
+            vec![t!('*' => 0), t!('/' => 3)],               // ""
+            vec![t!('*' => 1), t!('*' => 4), t!('.' => 5)], // "/"
+            vec![t!('*' => 4), t!('.' => 5)],               // "/*"
+            vec![t!('*' => 0)],                             // "/*."
+        ];
+        // Expected contents of `states`, one entry per row above.
+        let want_states = vec![
+            State::Allow,
+            State::Disallow,
+            State::Allow,
+            State::Disallow,
+            State::Disallow,
+            State::Allow,
+        ];
+
+        let machine = Cylon::compile(input);
+        assert_eq!(machine.transitions, want_transitions);
+        assert_eq!(machine.states, want_states);
+    }
+
+    #[test]
+    fn test_compile_with_eow() {
+        let input = vec![
+            Rule::Allow("/"),
+            Rule::Disallow("/a$"),
+            // The next rule is malformed on purpose. It is still expected to
+            // compile, but how it matches is left unspecified; a literal "$"
+            // in a rule should be url-encoded instead.
+            Rule::Disallow("/x$y"),
+        ];
+        // Expected transition table; trailing comments name a rule prefix.
+        let want_transitions = vec![
+            vec![t!('*' => 0)],
+            vec![t!('*' => 1)],
+            vec![t!('*' => 0), t!('/' => 3)],               // ""
+            vec![t!('*' => 0), t!('a' => 4), t!('x' => 5)], // "/"
+            vec![t!('*' => 0), t!('$' => 6)],               // "/a"
+            vec![t!('*' => 0), t!('$' => 7)],               // "/x"
+            vec![t!('*' => 0)],                             // "/a$"
+            vec![t!('*' => 0), t!('y' => 8)],               // "/x$"
+            vec![t!('*' => 1)],                             // "/x$y"
+        ];
+        // Expected contents of `states`, one entry per row above.
+        let want_states = vec![
+            State::Allow,
+            State::Disallow,
+            State::Allow,
+            State::Allow,
+            State::Allow,
+            State::Allow,
+            State::Disallow,
+            State::Allow,
+            State::Disallow,
+        ];
+
+        let machine = Cylon::compile(input);
+        assert_eq!(machine.transitions, want_transitions);
+        assert_eq!(machine.states, want_states);
+    }
+
+    #[test]
+    fn test_allow() {
+        let dfa = Cylon::compile(vec![
+            Rule::Disallow("/"),
+            Rule::Allow("/a"),
+            Rule::Allow("/abc"),
+            Rule::Allow("/b"),
+        ]);
+
+        // Only "/" itself is expected to be refused; every other path listed passes.
+        assert!(!dfa.allow("/"));
+        assert!(dfa.allow("/a"));
+        assert!(dfa.allow("/a/b"));
+        assert!(dfa.allow("/a"));
+        assert!(dfa.allow("/abc"));
+        assert!(dfa.allow("/abc/def"));
+        assert!(dfa.allow("/b"));
+        assert!(dfa.allow("/b/c"));
+    }
+
+    #[test]
+    fn test_allow_match_any() {
+        let dfa = Cylon::compile(vec![
+            Rule::Allow("/"),
+            Rule::Disallow("/secret/*.txt"),
+            Rule::Disallow("/private/*"),
+        ]);
+
+        // Expected: .txt under /secret/ and everything under /private/ is refused.
+        assert!(dfa.allow("/"));
+        assert!(dfa.allow("/abc"));
+        assert!(!dfa.allow("/secret/abc.txt"));
+        assert!(!dfa.allow("/secret/123.txt"));
+        assert!(dfa.allow("/secret/abc.csv"));
+        assert!(dfa.allow("/secret/123.csv"));
+        assert!(!dfa.allow("/private/abc.txt"));
+        assert!(!dfa.allow("/private/123.txt"));
+        assert!(!dfa.allow("/private/abc.csv"));
+        assert!(!dfa.allow("/private/123.csv"));
+    }
+
+    #[test]
+    fn test_allow_match_eow() {
+        let dfa = Cylon::compile(vec![
+            Rule::Allow("/"),
+            Rule::Disallow("/ignore$"),
+            Rule::Disallow("/foo$bar"),
+        ]);
+
+        // "/ignore$" is expected to refuse exactly "/ignore" and nothing longer.
+        assert!(dfa.allow("/"));
+        assert!(dfa.allow("/abc"));
+        assert!(!dfa.allow("/ignore"));
+        assert!(dfa.allow("/ignoreabc"));
+        assert!(dfa.allow("/ignore/abc"));
+        // "/foo$bar" is a malformed rule, so the outcome here is technically
+        // undefined and not guaranteed. Accepting these paths is the safer
+        // choice compared with rejecting them.
+        assert!(dfa.allow("/foo"));
+        assert!(dfa.allow("/foo$bar"));
+    }
+
+    #[test]
+    fn test_allow_more_complicated() {
+        let dfa = Cylon::compile(vec![
+            Rule::Allow("/"),
+            Rule::Disallow("/a$"),
+            Rule::Disallow("/abc"),
+            Rule::Allow("/abc/*"),
+            Rule::Disallow("/foo/bar"),
+            Rule::Allow("/*/bar"),
+            Rule::Disallow("/www/*/images"),
+            Rule::Allow("/www/public/images"),
+        ]);
+
+        // Overlapping allow and disallow rules, with wildcards and an end anchor.
+        assert!(dfa.allow("/"));
+        assert!(dfa.allow("/directory"));
+        assert!(!dfa.allow("/a"));
+        assert!(dfa.allow("/ab"));
+        assert!(!dfa.allow("/abc"));
+        assert!(dfa.allow("/abc/123"));
+        assert!(dfa.allow("/foo"));
+        assert!(dfa.allow("/foobar"));
+        assert!(!dfa.allow("/foo/bar"));
+        assert!(!dfa.allow("/foo/bar/baz"));
+        assert!(dfa.allow("/baz/bar"));
+        assert!(!dfa.allow("/www/cat/images"));
+        assert!(dfa.allow("/www/public/images"));
+    }
+
+    #[test]
+    fn test_matches() {
+        // The cases below follow the path-matching examples published at
+        // https://developers.google.com/search/reference/robots_txt#group-member-rules
+
+        let dfa = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish")]);
+        assert!(dfa.allow("/fish"));
+        assert!(dfa.allow("/fish.html"));
+        assert!(dfa.allow("/fish/salmon.html"));
+        assert!(dfa.allow("/fishheads.html"));
+        assert!(dfa.allow("/fishheads/yummy.html"));
+        assert!(dfa.allow("/fish.php?id=anything"));
+        assert!(!dfa.allow("/Fish.asp"));
+        assert!(!dfa.allow("/catfish"));
+        assert!(!dfa.allow("/?id=fish"));
+
+        let dfa = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*")]);
+        assert!(dfa.allow("/fish"));
+        assert!(dfa.allow("/fish.html"));
+        assert!(dfa.allow("/fish/salmon.html"));
+        assert!(dfa.allow("/fishheads.html"));
+        assert!(dfa.allow("/fishheads/yummy.html"));
+        assert!(dfa.allow("/fish.php?id=anything"));
+        assert!(!dfa.allow("/Fish.asp"));
+        assert!(!dfa.allow("/catfish"));
+        assert!(!dfa.allow("/?id=fish"));
+
+        let dfa = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish/")]);
+        assert!(dfa.allow("/fish/"));
+        assert!(dfa.allow("/fish/?id=anything"));
+        assert!(dfa.allow("/fish/salmon.htm"));
+        assert!(!dfa.allow("/fish"));
+        assert!(!dfa.allow("/fish.html"));
+        assert!(!dfa.allow("/Fish/Salmon.asp"));
+
+        let dfa = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php")]);
+        assert!(dfa.allow("/filename.php"));
+        assert!(dfa.allow("/folder/filename.php"));
+        assert!(dfa.allow("/folder/filename.php?parameters"));
+        assert!(dfa.allow("/folder/any.php.file.html"));
+        assert!(dfa.allow("/filename.php/"));
+        assert!(!dfa.allow("/"));
+        assert!(!dfa.allow("/windows.PHP"));
+
+        let dfa = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php$")]);
+        assert!(dfa.allow("/filename.php"));
+        assert!(dfa.allow("/folder/filename.php"));
+        assert!(!dfa.allow("/filename.php?parameters"));
+        assert!(!dfa.allow("/filename.php/"));
+        assert!(!dfa.allow("/filename.php5"));
+        assert!(!dfa.allow("/windows.PHP"));
+
+        let dfa = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*.php")]);
+        assert!(dfa.allow("/fish.php"));
+        assert!(dfa.allow("/fishheads/catfish.php?parameters"));
+        assert!(!dfa.allow("/Fish.PHP"));
+    }
+}
diff --git a/src/parse.rs b/src/parse.rs
index 921eaba..ebc1ba5 100644
--- a/src/parse.rs
+++ b/src/parse.rs
@@ -1,399 +1,442 @@
-use super::dfa::{Cylon, Rule};
-use futures_util::{
-    io::{AsyncBufRead, AsyncRead, BufReader, Result},
-    AsyncBufReadExt,
-};
-use serde_derive::{Deserialize, Serialize};
-const UA_PREFIX: &str = "user-agent:";
-const DELAY_PREFIX: &str = "crawl-delay:";
-const ALLOW_PREFIX: &str = "allow:";
-const DISALLOW_PREFIX: &str = "disallow:";
-
-#[derive(Debug, PartialEq, Clone)]
-enum ParsedRule {
-    Allow(String),
-    Disallow(String),
-    Delay(u64),
-}
-
-impl<'a> Into<Rule<'a>> for &'a ParsedRule {
-    fn into(self) -> Rule<'a> {
-        match self {
-            ParsedRule::Allow(path) => Rule::Allow(&path[..]),
-            ParsedRule::Disallow(path) => Rule::Disallow(&path[..]),
-            ParsedRule::Delay(delay) => Rule.Delay(delay),
-        }
-    }
-}
-
-#[derive(Debug, PartialEq)]
-enum ParsedLine {
-    UserAgent(String),
-    Rule(ParsedRule),
-    Nothing,
-}
-
-/// A compiler takes an input robots.txt file and outputs a compiled Cylon,
-/// which can be used to efficiently match a large number of paths against
-/// the robots.txt file.
-#[derive(Debug, Serialize, Deserialize)]
-pub struct Compiler {
-    user_agent: String,
-}
-
-impl Compiler {
-    /// Build a new compiler that parses rules for the given user agent from
-    /// a robots.txt file.
-    pub fn new(user_agent: &str) -> Self {
-        Self {
-            user_agent: user_agent.to_lowercase(),
-        }
-    }
-
-    /// Parse an input robots.txt file into a Cylon that can recognize
-    /// whether or not a path matches the rules for the Parser's user agent.
-    pub async fn compile<R: AsyncRead + Unpin>(&self, file: R) -> Result<Cylon> {
-        let reader = BufReader::new(file);
-        let mut agent = String::new();
-        let mut rules: Vec<ParsedRule> = vec![];
-        let mut group_reader = GroupReader::new(reader);
-
-        // find the most specific matching group in the robots file
-        while let Some(agents) = group_reader.next_header().await? {
-            let matching_agent = agents.iter().find(|a| {
-                let matches = &a[..] == "*" || self.user_agent.contains(*a);
-                let more_specific = a.len() > agent.len();
-                matches && more_specific
-            });
-
-            if let Some(matching_agent) = matching_agent {
-                agent = matching_agent.clone();
-                rules = group_reader.next_rules().await?;
-            }
-        }
-
-        let rules = rules.iter().map(|r| r.into()).collect();
-        Ok(Cylon::compile(rules))
-    }
-}
-
-struct GroupReader<R: AsyncBufRead + Unpin> {
-    parsing_agents: bool,
-    agents: Vec<String>,
-    rules: Vec<ParsedRule>,
-    reader: R,
-}
-
-impl<R: AsyncBufRead + Unpin> GroupReader<R> {
-    fn new(reader: R) -> Self {
-        Self {
-            parsing_agents: true,
-            agents: vec![],
-            rules: vec![],
-            reader,
-        }
-    }
-
-    /// Scan forward until the next group header defined by one or more
-    /// user agent lines. This lets us optimize the lines we need to copy
-    /// so we can skip over groups that don't match the desired user agent.
-    async fn next_header(&mut self) -> Result<Option<Vec<String>>> {
-        let mut buf = String::new();
-        while self.reader.read_line(&mut buf).await? != 0 {
-            let parsed_line = parse_line(buf.clone());
-
-            match parsed_line {
-                ParsedLine::UserAgent(ua) if self.parsing_agents => {
-                    self.agents.push(ua);
-                }
-                ParsedLine::UserAgent(ua) => {
-                    self.agents = vec![ua];
-                    self.rules = vec![];
-                    self.parsing_agents = true;
-                }
-                ParsedLine::Rule(rule) if self.parsing_agents => {
-                    // Preserve the rule in case we need it in next_rules().
-                    self.rules.push(rule);
-                    self.parsing_agents = false;
-                    break;
-                }
-                // Skip over lines until we get to the next user agent.
-                ParsedLine::Rule(..) => (),
-                ParsedLine::Nothing => (),
-            }
-
-            buf.clear();
-        }
-
-        let agents = self.agents.clone();
-        self.agents = vec![];
-
-        if agents.is_empty() {
-            return Ok(None);
-        }
-
-        Ok(Some(agents))
-    }
-
-    async fn next_rules(&mut self) -> Result<Vec<ParsedRule>> {
-        let mut buf = String::new();
-        while self.reader.read_line(&mut buf).await? != 0 {
-            let parsed_line = parse_line(buf.clone());
-
-            match parsed_line {
-                ParsedLine::Rule(rule) => {
-                    self.rules.push(rule);
-                    self.parsing_agents = false;
-                }
-                ParsedLine::UserAgent(ua) if !self.parsing_agents => {
-                    // Preserve the agent in case we need it in next_agents().
-                    self.agents.push(ua);
-                    self.parsing_agents = true;
-                    break;
-                }
-                // Skip over lines until we get to the next rule.
-                ParsedLine::UserAgent(..) => (),
-                ParsedLine::Nothing => (),
-            }
-
-            buf.clear();
-        }
-
-        let rules = self.rules.clone();
-        self.rules = vec![];
-        Ok(rules)
-    }
-}
-
-fn parse_line(line: String) -> ParsedLine {
-    let line = strip_comments(&line[..]).trim();
-
-    // This tries to parse lines roughly in order of most frequent kind to
-    // least frequent kind in order to minimize CPU cycles on average.
-    parse_disallow(line)
-        .map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into())))
-        .or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase())))
-        .or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into()))))
-        .or_else(|| parse_delay(line).map(|s| ParsedLine::Rule(ParsedRule::Delay(s.into()))))
-        .unwrap_or(ParsedLine::Nothing)
-}
-
-fn strip_comments(line: &str) -> &str {
-    if let Some(before) = line.split('#').next() {
-        return before;
-    }
-    return line;
-}
-
-fn parse_user_agent(line: &str) -> Option<&str> {
-    if line.len() < UA_PREFIX.len() {
-        return None;
-    }
-    let prefix = &line[..UA_PREFIX.len()].to_ascii_lowercase();
-    let suffix = &line[UA_PREFIX.len()..];
-
-    if prefix == UA_PREFIX {
-        Some(suffix.trim())
-    } else {
-        None
-    }
-}
-
-fn parse_delay(line: &str) -> Option<u64> {
-    if line.len() < DELAY_PREFIX.len() {
-        return None;
-    }
-
-    let prefix = &line[..DELAY_PREFIX.len()].to_ascii_lowercase();
-    let suffix = &line[DELAY_PREFIX.len()..];
-    if prefix == DELAY_PREFIX {
-        Some(suffix.trim())
-    } else {
-        None
-    }
-}
-
-fn parse_allow(line: &str) -> Option<&str> {
-    if line.len() < ALLOW_PREFIX.len() {
-        return None;
-    }
-    let prefix = &line[..ALLOW_PREFIX.len()].to_ascii_lowercase();
-    let suffix = &line[ALLOW_PREFIX.len()..];
-
-    if prefix == ALLOW_PREFIX {
-        Some(suffix.trim())
-    } else {
-        None
-    }
-}
-
-fn parse_disallow(line: &str) -> Option<&str> {
-    if line.len() < DISALLOW_PREFIX.len() {
-        return None;
-    }
-    let prefix = &line[..DISALLOW_PREFIX.len()].to_ascii_lowercase();
-    let suffix = &line[DISALLOW_PREFIX.len()..];
-
-    if prefix == DISALLOW_PREFIX {
-        Some(suffix.trim())
-    } else {
-        None
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_parse_allow() {
-        let test_cases = vec![
-            ("Allow: /", "/"),
-            ("allow: /   #  Root with comment", "/"),
-            ("ALLOW: /abc/def  ", "/abc/def"),
-            ("Allow:   /abc/def  ", "/abc/def"),
-            ("  Allow: /*/foo", "/*/foo"),
-        ];
-
-        for (i, o) in test_cases {
-            assert_eq!(
-                parse_line(i.into()),
-                ParsedLine::Rule(ParsedRule::Allow(o.into()))
-            );
-        }
-    }
-
-    #[test]
-    fn test_parse_disallow() {
-        let test_cases = vec![
-            ("Disallow: /", "/"),
-            ("disallow: /   #  Root with comment", "/"),
-            ("DISALLOW: /abc/def  ", "/abc/def"),
-            ("Disallow:   /abc/def  ", "/abc/def"),
-            ("  Disallow: /*/foo", "/*/foo"),
-        ];
-
-        for (i, o) in test_cases {
-            assert_eq!(
-                parse_line(i.into()),
-                ParsedLine::Rule(ParsedRule::Disallow(o.into()))
-            );
-        }
-    }
-
-    #[test]
-    fn test_parse_user_agent() {
-        let test_cases = vec![
-            ("User-agent: *", "*"),
-            ("user-agent: ImABot   #  User agent with comment", "imabot"),
-            ("  USER-AGENT:   ImABot  ", "imabot"),
-        ];
-
-        for (i, o) in test_cases {
-            assert_eq!(parse_line(i.into()), ParsedLine::UserAgent(o.into()));
-        }
-    }
-
-    #[test]
-    fn test_parse_nothing() {
-        let test_cases = vec![
-            "Useragent: *",
-            "# Comment",
-            "",
-            "    ",
-            "\t",
-            "alow: /",
-            "disalow: /",
-        ];
-
-        for i in test_cases {
-            assert_eq!(parse_line(i.into()), ParsedLine::Nothing);
-        }
-    }
-
-    #[test]
-    fn test_end_to_end() {
-        tokio_test::block_on(async {
-            let example_robots = r#"
-            User-agent: jones-bot
-            Disallow: /
-
-            User-agent: jones
-            User-agent: foobar
-            Allow: /
-
-            User-agent: *
-            Disallow: /
-            "#
-            .as_bytes();
-
-            let parser = Compiler::new("foobar");
-            let foobar_machine = parser.compile(example_robots).await.unwrap();
-
-            let parser = Compiler::new("jones-bot");
-            let jonesbot_machine = parser.compile(example_robots).await.unwrap();
-
-            let parser = Compiler::new("imabot");
-            let imabot_machine = parser.compile(example_robots).await.unwrap();
-
-            let parser = Compiler::new("abc");
-            let abc_machine = parser.compile(example_robots).await.unwrap();
-
-            assert_eq!(true, foobar_machine.allow("/index.html"));
-            assert_eq!(false, jonesbot_machine.allow("/index.html"));
-            assert_eq!(false, imabot_machine.allow("/index.html"));
-            assert_eq!(false, abc_machine.allow("/index.html"));
-        });
-    }
-
-    #[test]
-    fn test_invalid_1() {
-        tokio_test::block_on(async {
-            let example_robots = r#"
-            # Instead of treating this as an error, we'll just consider
-            # this behavior undefined.
-            Allow: /
-
-            User-agent: jones
-            User-agent: foobar
-            Disallow: /
-            "#
-            .as_bytes();
-
-            let parser = Compiler::new("foobar");
-            let foobar_machine = parser.compile(example_robots).await.unwrap();
-
-            let parser = Compiler::new("imabot");
-            let imabot_machine = parser.compile(example_robots).await.unwrap();
-
-            // Everything is allowed because next_header() returns None
-            assert_eq!(true, foobar_machine.allow("/index.html"));
-            assert_eq!(true, imabot_machine.allow("/index.html"));
-        });
-    }
-
-    #[test]
-    fn test_invalid_2() {
-        tokio_test::block_on(async {
-            let example_robots = r#"
-            User-agent: jones
-            User-agent: foobar
-            Disallow: /
-
-            # Instead of treating this as an error, we consider this
-            # behavior undefined.
-            User-agent: imabot
-            "#
-            .as_bytes();
-
-            let parser = Compiler::new("foobar");
-            let foobar_machine = parser.compile(example_robots).await.unwrap();
-
-            let parser = Compiler::new("imabot");
-            let imabot_machine = parser.compile(example_robots).await.unwrap();
-
-            assert_eq!(false, foobar_machine.allow("/index.html"));
-            assert_eq!(true, imabot_machine.allow("/index.html"));
-        });
-    }
-}
+use super::dfa::{Cylon, Rule};
+use futures_util::{
+    io::{AsyncBufRead, AsyncRead, BufReader, Result},
+    AsyncBufReadExt,
+};
+use serde_derive::{Deserialize, Serialize};
+const UA_PREFIX: &str = "user-agent:";
+const DELAY_PREFIX: &str = "crawl-delay:";
+const ALLOW_PREFIX: &str = "allow:";
+const DISALLOW_PREFIX: &str = "disallow:";
+
+#[derive(Debug, PartialEq, Clone)]
+enum ParsedRule { // one rule line of a robots.txt group, owning its value
+    Allow(String),    // trimmed value of an `allow:` line
+    Disallow(String), // trimmed value of a `disallow:` line
+    Delay(String),    // trimmed value of a `crawl-delay:` line, kept as text
+}
+
+impl<'a> From<&'a ParsedRule> for Rule<'a> { // `From` also gives callers `.into()`
+    fn from(parsed: &'a ParsedRule) -> Self {
+        match parsed { // lend the owned text to the borrowed rule
+            ParsedRule::Allow(path) => Rule::Allow(path.as_str()),
+            ParsedRule::Disallow(path) => Rule::Disallow(path.as_str()),
+            ParsedRule::Delay(delay) => Rule::Delay(delay),
+        }
+    }
+}
+
+#[derive(Debug, PartialEq)]
+enum ParsedLine { // classification of one robots.txt line, as built by `parse_line`
+    UserAgent(String), // lowercased value of a `user-agent:` line
+    Rule(ParsedRule),  // an allow, disallow or crawl-delay directive
+    Nothing,           // any line that matched none of the directives
+}
+
+/// Compiles a robots.txt file into a [`Cylon`] for one particular user agent.
+/// The resulting machine can then be used to check many paths against the
+/// rules selected for that agent.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Compiler {
+    user_agent: String,
+}
+
+impl Compiler {
+    /// Creates a compiler that selects the rules meant for `user_agent`. The
+    /// name is stored lowercased.
+    pub fn new(user_agent: &str) -> Self {
+        // Agent names read from the file are lowercased too, so both sides agree.
+        let user_agent = user_agent.to_lowercase();
+        Self { user_agent }
+    }
+
+    /// Reads `file` as robots.txt and compiles the rules of the group chosen
+    /// for this compiler's user agent. Read errors are returned to the caller.
+    pub async fn compile<R: AsyncRead + Unpin>(&self, file: R) -> Result<Cylon> {
+        let mut groups = GroupReader::new(BufReader::new(file));
+        let mut best_agent = String::new();
+        let mut best_rules: Vec<ParsedRule> = vec![];
+
+        // Walk the groups in file order. A group replaces the current pick only
+        // when one of its agents is "*" or a substring of our user agent AND is
+        // longer than the agent name that selected the current pick.
+        while let Some(agents) = groups.next_header().await? {
+            let candidate = agents.iter().find(|name| {
+                name.len() > best_agent.len()
+                    && (name.as_str() == "*" || self.user_agent.contains(name.as_str()))
+            });
+
+            if let Some(name) = candidate {
+                best_agent = name.clone();
+                best_rules = groups.next_rules().await?;
+            }
+        }
+
+        let chosen = best_rules.iter().map(|rule| rule.into()).collect();
+        Ok(Cylon::compile(chosen))
+    }
+}
+
+/// Reads a robots.txt stream one group at a time: first a group's user-agent
+/// header lines, then (only when asked) the rules that follow them.
+struct GroupReader<R: AsyncBufRead + Unpin> {
+    parsing_agents: bool,
+    agents: Vec<String>,
+    rules: Vec<ParsedRule>,
+    reader: R,
+}
+
+impl<R: AsyncBufRead + Unpin> GroupReader<R> {
+    fn new(reader: R) -> Self {
+        Self {
+            parsing_agents: true,
+            agents: vec![],
+            rules: vec![],
+            reader,
+        }
+    }
+
+    /// Scans forward to the next group header (one or more user-agent lines)
+    /// and returns its agent names, or `None` when no further header is found.
+    /// The first rule line after the header is kept for `next_rules`.
+    async fn next_header(&mut self) -> Result<Option<Vec<String>>> {
+        let mut buf = String::new();
+        while self.reader.read_line(&mut buf).await? != 0 {
+            let parsed_line = parse_line(buf.clone());
+
+            match parsed_line {
+                ParsedLine::UserAgent(ua) if self.parsing_agents => {
+                    self.agents.push(ua);
+                }
+                ParsedLine::UserAgent(ua) => {
+                    self.agents = vec![ua];
+                    self.rules = vec![];
+                    self.parsing_agents = true;
+                }
+                ParsedLine::Rule(rule) if self.parsing_agents => {
+                    // Preserve the rule in case we need it in next_rules().
+                    self.rules.push(rule);
+                    self.parsing_agents = false;
+                    break;
+                }
+                // Skip over lines until we get to the next user agent.
+                ParsedLine::Rule(..) => (),
+                ParsedLine::Nothing => (),
+            }
+
+            buf.clear();
+        }
+        // Move the names out (no clone), leaving the field empty for the next header.
+        let agents = std::mem::take(&mut self.agents);
+        if agents.is_empty() {
+            return Ok(None);
+        }
+
+        Ok(Some(agents))
+    }
+
+    /// Collects the rule lines of the current group, stopping at the first
+    /// user-agent line seen after a rule (which is kept for `next_header`).
+    async fn next_rules(&mut self) -> Result<Vec<ParsedRule>> {
+        let mut buf = String::new();
+        while self.reader.read_line(&mut buf).await? != 0 {
+            let parsed_line = parse_line(buf.clone());
+
+            match parsed_line {
+                ParsedLine::Rule(rule) => {
+                    self.rules.push(rule);
+                    self.parsing_agents = false;
+                }
+                ParsedLine::UserAgent(ua) if !self.parsing_agents => {
+                    // Preserve the agent in case we need it in next_header().
+                    self.agents.push(ua);
+                    self.parsing_agents = true;
+                    break;
+                }
+                // Skip over lines until we get to the next rule.
+                ParsedLine::UserAgent(..) => (),
+                ParsedLine::Nothing => (),
+            }
+
+            buf.clear();
+        }
+        // Move the rules out (no clone), leaving the field empty for the next group.
+        Ok(std::mem::take(&mut self.rules))
+    }
+}
+
+/// Classifies a single robots.txt line. Comments and surrounding whitespace
+/// are removed first; anything unrecognised becomes `ParsedLine::Nothing`.
+fn parse_line(line: String) -> ParsedLine {
+    let text = strip_comments(&line).trim();
+    // Directives are probed roughly from most to least common.
+    parse_disallow(text)
+        .map(|path| ParsedLine::Rule(ParsedRule::Disallow(path.into())))
+        .or_else(|| parse_user_agent(text).map(|ua| ParsedLine::UserAgent(ua.to_lowercase())))
+        .or_else(|| parse_allow(text).map(|path| ParsedLine::Rule(ParsedRule::Allow(path.into()))))
+        .or_else(|| parse_delay(text).map(|secs| ParsedLine::Rule(ParsedRule::Delay(secs.into()))))
+        .unwrap_or(ParsedLine::Nothing)
+}
+
+/// Returns the part of `line` before its first `#`, or all of it without one.
+fn strip_comments(line: &str) -> &str {
+    match line.find('#') {
+        Some(hash) => &line[..hash],
+        None => line,
+    }
+}
+
+/// Returns the trimmed value of a `User-agent:` line, or `None` otherwise.
+/// The directive name is matched ASCII case-insensitively.
+fn parse_user_agent(line: &str) -> Option<&str> {
+    // `str::get` is `None` when the line is too short or when the cut would
+    // land inside a multi-byte character, so non-ASCII input cannot panic.
+    let prefix = line.get(..UA_PREFIX.len())?;
+    let suffix = line.get(UA_PREFIX.len()..)?;
+    if prefix.eq_ignore_ascii_case(UA_PREFIX) {
+        Some(suffix.trim())
+    } else {
+        None
+    }
+}
+
+/// Returns the trimmed value of a `Crawl-delay:` line, or `None` otherwise.
+/// The value is handed back as text; it is not parsed as a number here.
+fn parse_delay(line: &str) -> Option<&str> {
+    // `str::get` is `None` when the line is too short or when the cut would
+    // land inside a multi-byte character, so non-ASCII input cannot panic.
+    let prefix = line.get(..DELAY_PREFIX.len())?;
+    let suffix = line.get(DELAY_PREFIX.len()..)?;
+    if prefix.eq_ignore_ascii_case(DELAY_PREFIX) {
+        Some(suffix.trim())
+    } else {
+        None
+    }
+}
+
+/// Returns the trimmed value of an `Allow:` line, or `None` otherwise.
+/// The directive name is matched ASCII case-insensitively.
+fn parse_allow(line: &str) -> Option<&str> {
+    // `str::get` is `None` when the line is too short or when the cut would
+    // land inside a multi-byte character, so non-ASCII input cannot panic.
+    let prefix = line.get(..ALLOW_PREFIX.len())?;
+    let suffix = line.get(ALLOW_PREFIX.len()..)?;
+    if prefix.eq_ignore_ascii_case(ALLOW_PREFIX) {
+        Some(suffix.trim())
+    } else {
+        None
+    }
+}
+
+/// Returns the trimmed value of a `Disallow:` line, or `None` otherwise.
+/// The directive name is matched ASCII case-insensitively.
+fn parse_disallow(line: &str) -> Option<&str> {
+    // `str::get` is `None` when the line is too short or when the cut would
+    // land inside a multi-byte character, so non-ASCII input cannot panic.
+    let prefix = line.get(..DISALLOW_PREFIX.len())?;
+    let suffix = line.get(DISALLOW_PREFIX.len()..)?;
+    if prefix.eq_ignore_ascii_case(DISALLOW_PREFIX) {
+        Some(suffix.trim())
+    } else {
+        None
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_allow() {
+        // Each pair is (raw line, expected path). The inputs vary the case of
+        // the directive, the surrounding whitespace and a trailing comment.
+        let cases = vec![
+            ("Allow: /", "/"),
+            ("allow: /   #  Root with comment", "/"),
+            ("ALLOW: /abc/def  ", "/abc/def"),
+            ("Allow:   /abc/def  ", "/abc/def"),
+            ("  Allow: /*/foo", "/*/foo"),
+        ];
+
+        for (raw, path) in cases {
+            let want = ParsedLine::Rule(ParsedRule::Allow(path.to_string()));
+            assert_eq!(parse_line(raw.to_string()), want);
+        }
+    }
+
+    #[test]
+    fn test_parse_disallow() {
+        // Each pair is (raw line, expected path). The inputs vary the case of
+        // the directive, the surrounding whitespace and a trailing comment.
+        let cases = vec![
+            ("Disallow: /", "/"),
+            ("disallow: /   #  Root with comment", "/"),
+            ("DISALLOW: /abc/def  ", "/abc/def"),
+            ("Disallow:   /abc/def  ", "/abc/def"),
+            ("  Disallow: /*/foo", "/*/foo"),
+        ];
+
+        for (raw, path) in cases {
+            let want = ParsedLine::Rule(ParsedRule::Disallow(path.to_string()));
+            assert_eq!(parse_line(raw.to_string()), want);
+        }
+    }
+
+    #[test]
+    fn test_parse_user_agent() {
+        let cases = vec![
+            ("User-agent: *", "*"),
+            ("user-agent: ImABot   #  User agent with comment", "imabot"),
+            ("  USER-AGENT:   ImABot  ", "imabot"),
+        ];
+        // The agent name is expected back trimmed and lowercased.
+        for (raw, agent) in cases {
+            assert_eq!(parse_line(raw.to_string()), ParsedLine::UserAgent(agent.to_string()));
+        }
+    }
+
+    #[test]
+    fn test_parse_nothing() {
+        let not_directives = vec![
+            "Useragent: *",
+            "# Comment",
+            "",
+            "    ",
+            "\t",
+            "alow: /",
+            "disalow: /",
+        ];
+        // Misspelt directives, blank lines and pure comments must all be ignored.
+        for raw in not_directives {
+            assert_eq!(parse_line(raw.to_string()), ParsedLine::Nothing);
+        }
+    }
+
+    #[test]
+    fn test_crawl_delay() {
+        tokio_test::block_on(async {
+            let robots = r#"
+            User-agent: jones-bot
+            Disallow: /
+            Crawl-Delay: 30
+
+            User-agent: foobar
+            Crawl-Delay: 60
+
+            User-agent: googlebot
+            Allow: /
+
+            User-agent: barfoo
+            Crawl-Delay: 60
+            Crawl-Delay: 20
+            "#
+            .as_bytes();
+
+            // Compile the same file once for each of the four crawlers. The
+            // input is a shared byte slice, so it can be handed to every call.
+            let foobar = Compiler::new("foobar").compile(robots).await.unwrap();
+            let googlebot = Compiler::new("googlebot").compile(robots).await.unwrap();
+            let barfoo = Compiler::new("barfoo").compile(robots).await.unwrap();
+            let jones_bot = Compiler::new("jones-bot").compile(robots).await.unwrap();
+
+            // foobar's group holds a single delay line.
+            assert_eq!(foobar.delay(), Some(60));
+            // barfoo's group lists two delays (60, then 20); the test expects
+            // the later one.
+            assert_eq!(barfoo.delay(), Some(20));
+            // jones-bot's delay sits next to a Disallow rule.
+            assert_eq!(jones_bot.delay(), Some(30));
+            // googlebot's group has no delay line at all.
+            assert_eq!(googlebot.delay(), None);
+        });
+    }
+
+    /// Compiles one robots.txt once per agent and checks whether `/index.html`
+    /// may be fetched: only `foobar` is expected to be allowed, while
+    /// `jones-bot`, `imabot` and `abc` are all expected to be refused.
+    #[test]
+    fn test_end_to_end() {
+        tokio_test::block_on(async {
+            let example_robots = r#"
+            User-agent: jones-bot
+            Disallow: /
+
+            User-agent: foo
+            Allow: /
+            Crawl-Delay: 20
+
+            User-agent: jones
+            User-agent: foobar
+            Allow: /
+
+            User-agent: *
+            Disallow: /
+            "#
+            .as_bytes();
+
+            // (agent, whether `/index.html` may be fetched)
+            let expectations = vec![
+                ("foobar", true),
+                ("jones-bot", false),
+                ("imabot", false),
+                ("abc", false),
+            ];
+
+            for (agent, allowed) in expectations {
+                let parser = Compiler::new(agent);
+                let machine = parser.compile(example_robots).await.unwrap();
+                assert_eq!(allowed, machine.allow("/index.html"), "agent: {}", agent);
+            }
+        });
+    }
+
+    /// A rule that precedes every `User-agent` header is not treated as an
+    /// error: compilation still succeeds, and the test pins the current outcome
+    /// that both agents are allowed to fetch `/index.html`.
+    #[test]
+    fn test_invalid_1() {
+        tokio_test::block_on(async {
+            let example_robots = r#"
+            # Instead of treating this as an error, we'll just consider
+            # this behavior undefined.
+            Allow: /
+
+            User-agent: jones
+            User-agent: foobar
+            Disallow: /
+            "#
+            .as_bytes();
+
+            // Everything is allowed because next_header() returns None
+            for agent in ["foobar", "imabot"].iter().copied() {
+                let parser = Compiler::new(agent);
+                let machine = parser.compile(example_robots).await.unwrap();
+                assert!(machine.allow("/index.html"), "agent: {}", agent);
+            }
+        });
+    }
+
+    /// A trailing `User-agent` header with no rules is not treated as an error:
+    /// `foobar` stays disallowed and the rule-less `imabot` is expected allowed.
+    #[test]
+    fn test_invalid_2() {
+        tokio_test::block_on(async {
+            let example_robots = r#"
+            User-agent: jones
+            User-agent: foobar
+            Disallow: /
+
+            # Instead of treating this as an error, we consider this
+            # behavior undefined.
+            User-agent: imabot
+            "#
+            .as_bytes();
+
+            let expectations = vec![("foobar", false), ("imabot", true)];
+            for (agent, allowed) in expectations {
+                let parser = Compiler::new(agent);
+                let machine = parser.compile(example_robots).await.unwrap();
+                assert_eq!(allowed, machine.allow("/index.html"), "agent: {}", agent);
+            }
+        });
+    }
+}