From 86ee746b96d4a0ab718ec6b580c5f70d01401eaa Mon Sep 17 00:00:00 2001 From: "r.portalez" Date: Wed, 10 Mar 2021 14:08:41 +0100 Subject: [PATCH 1/6] test - bug-report --- Cargo.lock | 45 ++++++++++++++++++--------------------------- Cargo.toml | 6 +++--- src/dfa.rs | 2 ++ src/parse.rs | 18 ++++++++++++++++++ 4 files changed, 41 insertions(+), 30 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9ba33dc..8026f48 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -247,9 +247,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846" +checksum = "8c2dd2df839b57db9ab69c2c9d8f3e8c81984781937fe2807dc6dcf3b2ad2939" dependencies = [ "futures-core", "futures-sink", @@ -257,9 +257,9 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65" +checksum = "15496a72fabf0e62bdc3df11a59a3787429221dd0710ba8ef163d6f7a9112c94" [[package]] name = "futures-executor" @@ -274,15 +274,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500" +checksum = "d71c2c65c57704c32f5241c1223167c2c3294fd34ac020c807ddbe6db287ba59" [[package]] name = "futures-macro" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd" +checksum = "ea405816a5139fb39af82c2beb921d52143f556038378d6db21183a5c37fbfb7" dependencies = [ "proc-macro-hack", "proc-macro2", @@ -292,24 +292,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6" +checksum = "85754d98985841b7d4f5e8e6fbfa4a4ac847916893ec511a2917ccd8525b8bb3" [[package]] name = "futures-task" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86" -dependencies = [ - "once_cell", -] +checksum = "fa189ef211c15ee602667a6fcfe1c1fd9e07d42250d2156382820fba33c9df80" [[package]] name = "futures-util" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b" +checksum = "1812c7ab8aedf8d6f2701a43e1243acdbcc2b36ab26e2ad421eb99ac963d96d1" dependencies = [ "futures-channel", "futures-core", @@ -428,12 +425,6 @@ dependencies = [ "libc", ] -[[package]] -name = "once_cell" -version = "1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13bd41f508810a131401606d54ac32a467c97172d74ba7662562ebba5ad07fa0" - [[package]] name = "oorandom" version = "11.1.3" @@ -606,9 +597,9 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" [[package]] name = "serde" -version = "1.0.123" +version = "1.0.124" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92d5161132722baa40d802cc70b15262b98258453e85e5d1d365c757c73869ae" +checksum = "bd761ff957cb2a45fbb9ab3da6512de9de55872866160b23c25f1a841e99d29f" [[package]] name = "serde_cbor" @@ -622,9 +613,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.123" +version = "1.0.124" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9391c295d64fc0abb2c556bad848f33cb8296276b1ad2677d1ae1ace4f258f31" +checksum = "1800f7693e94e186f5e25a28291ae1570da908aff7d97a095dec1e56ff99069b" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index c82cf6d..795f606 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,9 +11,9 @@ keywords = ["robots", "txt", "parse", "compile"] repository = "https://github.com/crestonbunch/cylon" [dependencies] -futures-util = "0.3" -serde = "1.0" -serde_derive = "1.0" +futures-util = "0.3.13" +serde = "1.0.124" +serde_derive = "1.0.124" [dev-dependencies] criterion = { version = "0.3", features = ["async_futures"] } diff --git a/src/dfa.rs b/src/dfa.rs index 461670e..5e57ea5 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -4,6 +4,7 @@ use serde_derive::{Deserialize, Serialize}; pub enum Rule<'a> { Allow(&'a str), Disallow(&'a str), + Delay(u64), } impl<'a> Rule<'a> { @@ -11,6 +12,7 @@ impl<'a> Rule<'a> { match self { Rule::Allow(inner) => inner, Rule::Disallow(inner) => inner, + Rule::Delay(inner) => inner, } } } diff --git a/src/parse.rs b/src/parse.rs index e147409..921eaba 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -5,6 +5,7 @@ use futures_util::{ }; use serde_derive::{Deserialize, Serialize}; const UA_PREFIX: &str = "user-agent:"; +const DELAY_PREFIX: &str = "crawl-delay:"; const ALLOW_PREFIX: &str = "allow:"; const DISALLOW_PREFIX: &str = "disallow:"; @@ -12,6 +13,7 @@ const DISALLOW_PREFIX: &str = "disallow:"; enum ParsedRule { Allow(String), Disallow(String), + Delay(u64), } impl<'a> Into> for &'a ParsedRule { @@ -19,6 +21,7 @@ impl<'a> Into> for &'a ParsedRule { match self { ParsedRule::Allow(path) => Rule::Allow(&path[..]), ParsedRule::Disallow(path) => Rule::Disallow(&path[..]), + ParsedRule::Delay(delay) => Rule.Delay(delay), } } } @@ -171,6 +174,7 @@ fn parse_line(line: String) -> ParsedLine { .map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into()))) .or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase()))) .or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into())))) + .or_else(|| parse_delay(line).map(|s| ParsedLine::Rule(ParsedRule::Delay(s.into())))) .unwrap_or(ParsedLine::Nothing) } @@ -195,6 +199,20 @@ fn parse_user_agent(line: &str) -> Option<&str> { } } +fn parse_delay(line: &str) -> Option { + if line.len() < DELAY_PREFIX.len() { + return None; + } + + let prefix = &line[..DELAY_PREFIX.len()].to_ascii_lowercase(); + let suffix = &line[DELAY_PREFIX.len()..]; + if prefix == DELAY_PREFIX { + Some(suffix.trim()) + } else { + None + } +} + fn parse_allow(line: &str) -> Option<&str> { if line.len() < ALLOW_PREFIX.len() { return None; From fe11216642a6ec9fec05d8bc7bab3d3eaf6ee2ca Mon Sep 17 00:00:00 2001 From: "r.portalez" Date: Wed, 10 Mar 2021 15:57:55 +0100 Subject: [PATCH 2/6] add crawl-delay --- Cargo.toml | 6 +- benches/parse.rs | 1 + src/dfa.rs | 1033 ++++++++++++++++++++++++---------------------- src/parse.rs | 841 +++++++++++++++++++------------------ 4 files changed, 979 insertions(+), 902 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 795f606..c82cf6d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,9 +11,9 @@ keywords = ["robots", "txt", "parse", "compile"] repository = "https://github.com/crestonbunch/cylon" [dependencies] -futures-util = "0.3.13" -serde = "1.0.124" -serde_derive = "1.0.124" +futures-util = "0.3" +serde = "1.0" +serde_derive = "1.0" [dev-dependencies] criterion = { version = "0.3", features = ["async_futures"] } diff --git a/benches/parse.rs b/benches/parse.rs index 2084b00..ba0851e 100644 --- a/benches/parse.rs +++ b/benches/parse.rs @@ -9,6 +9,7 @@ Disallow: / Allow: /a Allow: /abc Allow: /b +Crawl-Delay: 20 "# .as_bytes(); diff --git a/src/dfa.rs b/src/dfa.rs index 5e57ea5..6c39eb4 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -1,500 +1,533 @@ -use serde_derive::{Deserialize, Serialize}; - -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] -pub enum Rule<'a> { - Allow(&'a str), - Disallow(&'a str), - Delay(u64), -} - -impl<'a> Rule<'a> { - fn inner(&self) -> &str { - match self { - Rule::Allow(inner) => inner, - Rule::Disallow(inner) => inner, - Rule::Delay(inner) => inner, - } - } -} - -#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)] -enum Edge { - MatchChar(char), - MatchAny, - MatchEow, -} - -#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)] -struct Transition(Edge, usize); - -#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] -enum State { - Allow, - Disallow, - Intermediate, -} - -/// A Cylon is a DFA that recognizes rules from a compiled robots.txt -/// file. By providing it a URL path, it can decide whether or not -/// the robots file that compiled it allows or disallows that path in -/// roughly O(n) time, where n is the length of the path. -#[derive(Debug, Serialize, Deserialize)] -pub struct Cylon { - states: Vec, - transitions: Vec>, -} - -impl Cylon { - /// Match whether the rules allow or disallow the target path. - pub fn allow(&self, path: &str) -> bool { - let mut state = path.chars().fold(2, |state, path_char| { - let t = &self.transitions[state]; - t.iter() - .rev() - // Pick the last transition to always prioritize MatchChar - // over MatchAny (which will always be the first transition.) - .find(|transition| match transition { - Transition(Edge::MatchAny, ..) => true, - Transition(Edge::MatchEow, ..) => false, - Transition(Edge::MatchChar(edge_char), ..) => *edge_char == path_char, - }) - .map(|Transition(.., next_state)| *next_state) - // We are guaranteed at least one matching state because of - // the way the DFA is constructed. - .unwrap() - }); - - // Follow the EoW transition, if necessary - let t = &self.transitions[state]; - state = t - .iter() - .rev() - .find(|transition| match transition { - Transition(Edge::MatchEow, ..) => true, - Transition(Edge::MatchAny, ..) => true, - _ => false, - }) - .map(|Transition(.., next_state)| *next_state) - .unwrap_or(state); - - match self.states[state] { - State::Allow => true, - State::Disallow => false, - // Intermediate states are not preserved in the DFA - State::Intermediate => unreachable!(), - } - } - - /// Compile a machine from a list of rules. - pub fn compile(mut rules: Vec) -> Self { - // This algorithm constructs a DFA by doing BFS over the prefix tree of - // paths in the provided list of rules. However, for performance reasons - // it does not actually build a tree structure. (Vecs have better - // cache-locality by avoiding random memory access.) - - let mut transitions: Vec> = vec![ - vec![Transition(Edge::MatchAny, 0)], - vec![Transition(Edge::MatchAny, 1)], - ]; - let mut states: Vec = vec![State::Allow, State::Disallow]; - - rules.sort_by(|a, b| Ord::cmp(a.inner(), b.inner())); - - let mut queue = vec![("", 0, 0, State::Intermediate)]; - while !queue.is_empty() { - // parent_prefix is the "parent node" in the prefix tree. We are - // going to visit its children by filtering from the list of - // paths only the paths that start with the parent_prefix. - // wildcard_state is a node to jump to when an unmatched character - // is encountered. This is usually a node higher up in the tree - // that can match any character legally, but is also a prefix - // (read: ancestor) of the current node. - let (parent_prefix, mut wildcard_state, parent_state, state) = queue.remove(0); - let last_char = parent_prefix.chars().last(); - - wildcard_state = match state { - State::Allow => 0, - State::Disallow if last_char == Some('$') => wildcard_state, - State::Disallow => 1, - State::Intermediate => wildcard_state, - }; - - let mut t = match last_char { - Some('$') => { - // The EOW character cannot match anything else - vec![Transition(Edge::MatchAny, wildcard_state)] - } - Some('*') => { - // The wildcard character overrides the wildcard state - vec![Transition(Edge::MatchAny, transitions.len())] - } - _ => { - // Every other state has a self-loop that matches anything - vec![Transition(Edge::MatchAny, wildcard_state)] - } - }; - - let mut curr_prefix = ""; - rules - .iter() - .map(Rule::inner) - .zip(&rules) - .filter(|(path, _)| (*path).starts_with(parent_prefix)) - .filter(|(path, _)| (*path) != parent_prefix) - .for_each(|(path, rule)| { - let child_prefix = &path[0..parent_prefix.len() + 1]; - if curr_prefix == child_prefix { - // We only want to visit a child node once, but - // many rules might have the same child_prefix, so - // we skip the duplicates after the first time - // we see a prefix. (This could be a filter(), but - // it's a bit hard to encode earlier in the chain.) - return; - } - curr_prefix = child_prefix; - - let eow = child_prefix == path; - let state = match (rule, eow) { - (Rule::Allow(..), true) => State::Allow, - (Rule::Disallow(..), true) => State::Disallow, - _ => State::Intermediate, - }; - - queue.push((child_prefix, wildcard_state, transitions.len(), state)); - - // NB: we can predict what state index the child - // will have before it's even pushed onto the state vec. - let child_index = transitions.len() + queue.len(); - let edge_char = child_prefix.chars().last().unwrap(); - let transition = Transition( - match edge_char { - '*' => Edge::MatchAny, - '$' => Edge::MatchEow, - c => Edge::MatchChar(c), - }, - child_index, - ); - - // Add transitions from the parent state to the child state - // so that the wildcard character matches are optional. - if last_char == Some('*') { - let parent_t = &mut transitions[parent_state]; - parent_t.push(transition); - } - - t.push(transition); - }); - - states.push(match state { - State::Allow | State::Disallow => state, - State::Intermediate => states[wildcard_state], - }); - transitions.push(t); - } - - Self { - states, - transitions, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - macro_rules! t { - ('*' => $x:expr) => { - Transition(Edge::MatchAny, $x) - }; - ('$' => $x:expr) => { - Transition(Edge::MatchEow, $x) - }; - ($x:expr => $y:expr) => { - Transition(Edge::MatchChar($x), $y) - }; - } - - #[test] - fn test_compile() { - let rules = vec![ - Rule::Disallow("/"), - Rule::Allow("/a"), - Rule::Allow("/abc"), - Rule::Allow("/b"), - ]; - - let expect_transitions = vec![ - vec![t!('*' => 0)], - vec![t!('*' => 1)], - vec![t!('*' => 0), t!('/' => 3)], // "" - vec![t!('*' => 1), t!('a' => 4), t!('b' => 5)], // "/" - vec![t!('*' => 0), t!('b' => 6)], // "/a" - vec![t!('*' => 0)], // "/b" - vec![t!('*' => 0), t!('c' => 7)], // "/ab" - vec![t!('*' => 0)], // "/abc" - ]; - - let expect_states = vec![ - State::Allow, - State::Disallow, - State::Allow, - State::Disallow, - State::Allow, - State::Allow, - State::Allow, - State::Allow, - ]; - - let actual = Cylon::compile(rules); - assert_eq!(actual.transitions, expect_transitions); - assert_eq!(actual.states, expect_states); - } - - #[test] - fn test_compile_with_wildcard() { - let rules = vec![Rule::Disallow("/"), Rule::Allow("/a"), Rule::Allow("/*.b")]; - - let expect_transitions = vec![ - vec![t!('*' => 0)], - vec![t!('*' => 1)], - vec![t!('*' => 0), t!('/' => 3)], // "" - vec![t!('*' => 1), t!('*' => 4), t!('a' => 5), t!('.' => 6)], // "/" - vec![t!('*' => 4), t!('.' => 6)], // "/*" - vec![t!('*' => 0)], // "/a" - vec![t!('*' => 1), t!('b' => 7)], // "/*." - vec![t!('*' => 0)], // "/*.b" - ]; - - let expect_states = vec![ - State::Allow, - State::Disallow, - State::Allow, - State::Disallow, - State::Disallow, - State::Allow, - State::Disallow, - State::Allow, - ]; - - let actual = Cylon::compile(rules); - assert_eq!(actual.transitions, expect_transitions); - assert_eq!(actual.states, expect_states); - } - - #[test] - fn test_compile_tricky_wildcard() { - let rules = vec![Rule::Disallow("/"), Rule::Allow("/*.")]; - - let expect_transitions = vec![ - vec![t!('*' => 0)], - vec![t!('*' => 1)], - vec![t!('*' => 0), t!('/' => 3)], // "" - vec![t!('*' => 1), t!('*' => 4), t!('.' => 5)], // "/" - vec![t!('*' => 4), t!('.' => 5)], // "/*" - vec![t!('*' => 0)], // "/*." - ]; - - let expect_states = vec![ - State::Allow, - State::Disallow, - State::Allow, - State::Disallow, - State::Disallow, - State::Allow, - ]; - - let actual = Cylon::compile(rules); - assert_eq!(actual.transitions, expect_transitions); - assert_eq!(actual.states, expect_states); - } - - #[test] - fn test_compile_with_eow() { - let rules = vec![ - Rule::Allow("/"), - Rule::Disallow("/a$"), - // Note that this rule is nonsensical. It will compile, but - // no guarantees are made as to how it's matched. Rules should - // use url-encoded strings to escape $. - Rule::Disallow("/x$y"), - ]; - - let expect_transitions = vec![ - vec![t!('*' => 0)], - vec![t!('*' => 1)], - vec![t!('*' => 0), t!('/' => 3)], // "" - vec![t!('*' => 0), t!('a' => 4), t!('x' => 5)], // "/" - vec![t!('*' => 0), t!('$' => 6)], // "/a" - vec![t!('*' => 0), t!('$' => 7)], // "/x" - vec![t!('*' => 0)], // "/a$" - vec![t!('*' => 0), t!('y' => 8)], // "/x$" - vec![t!('*' => 1)], // "/x$y" - ]; - - let expect_states = vec![ - State::Allow, - State::Disallow, - State::Allow, - State::Allow, - State::Allow, - State::Allow, - State::Disallow, - State::Allow, - State::Disallow, - ]; - - let actual = Cylon::compile(rules); - assert_eq!(actual.transitions, expect_transitions); - assert_eq!(actual.states, expect_states); - } - - #[test] - fn test_allow() { - let rules = vec![ - Rule::Disallow("/"), - Rule::Allow("/a"), - Rule::Allow("/abc"), - Rule::Allow("/b"), - ]; - - let machine = Cylon::compile(rules); - assert_eq!(false, machine.allow("/")); - assert_eq!(true, machine.allow("/a")); - assert_eq!(true, machine.allow("/a/b")); - assert_eq!(true, machine.allow("/a")); - assert_eq!(true, machine.allow("/abc")); - assert_eq!(true, machine.allow("/abc/def")); - assert_eq!(true, machine.allow("/b")); - assert_eq!(true, machine.allow("/b/c")); - } - - #[test] - fn test_allow_match_any() { - let rules = vec![ - Rule::Allow("/"), - Rule::Disallow("/secret/*.txt"), - Rule::Disallow("/private/*"), - ]; - - let machine = Cylon::compile(rules); - assert_eq!(true, machine.allow("/")); - assert_eq!(true, machine.allow("/abc")); - assert_eq!(false, machine.allow("/secret/abc.txt")); - assert_eq!(false, machine.allow("/secret/123.txt")); - assert_eq!(true, machine.allow("/secret/abc.csv")); - assert_eq!(true, machine.allow("/secret/123.csv")); - assert_eq!(false, machine.allow("/private/abc.txt")); - assert_eq!(false, machine.allow("/private/123.txt")); - assert_eq!(false, machine.allow("/private/abc.csv")); - assert_eq!(false, machine.allow("/private/123.csv")); - } - - #[test] - fn test_allow_match_eow() { - let rules = vec![ - Rule::Allow("/"), - Rule::Disallow("/ignore$"), - Rule::Disallow("/foo$bar"), - ]; - - let machine = Cylon::compile(rules); - assert_eq!(true, machine.allow("/")); - assert_eq!(true, machine.allow("/abc")); - assert_eq!(false, machine.allow("/ignore")); - assert_eq!(true, machine.allow("/ignoreabc")); - assert_eq!(true, machine.allow("/ignore/abc")); - // These are technically undefined, and no behavior - // is guaranteed since the rule is malformed. However - // it is safer to accept them rather than reject them. - assert_eq!(true, machine.allow("/foo")); - assert_eq!(true, machine.allow("/foo$bar")); - } - - #[test] - fn test_allow_more_complicated() { - let rules = vec![ - Rule::Allow("/"), - Rule::Disallow("/a$"), - Rule::Disallow("/abc"), - Rule::Allow("/abc/*"), - Rule::Disallow("/foo/bar"), - Rule::Allow("/*/bar"), - Rule::Disallow("/www/*/images"), - Rule::Allow("/www/public/images"), - ]; - - let machine = Cylon::compile(rules); - assert_eq!(true, machine.allow("/")); - assert_eq!(true, machine.allow("/directory")); - assert_eq!(false, machine.allow("/a")); - assert_eq!(true, machine.allow("/ab")); - assert_eq!(false, machine.allow("/abc")); - assert_eq!(true, machine.allow("/abc/123")); - assert_eq!(true, machine.allow("/foo")); - assert_eq!(true, machine.allow("/foobar")); - assert_eq!(false, machine.allow("/foo/bar")); - assert_eq!(false, machine.allow("/foo/bar/baz")); - assert_eq!(true, machine.allow("/baz/bar")); - assert_eq!(false, machine.allow("/www/cat/images")); - assert_eq!(true, machine.allow("/www/public/images")); - } - - #[test] - fn test_matches() { - // Test cases from: - // https://developers.google.com/search/reference/robots_txt#group-member-rules - - let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish")]); - assert_eq!(true, machine.allow("/fish")); - assert_eq!(true, machine.allow("/fish.html")); - assert_eq!(true, machine.allow("/fish/salmon.html")); - assert_eq!(true, machine.allow("/fishheads.html")); - assert_eq!(true, machine.allow("/fishheads/yummy.html")); - assert_eq!(true, machine.allow("/fish.php?id=anything")); - assert_eq!(false, machine.allow("/Fish.asp")); - assert_eq!(false, machine.allow("/catfish")); - assert_eq!(false, machine.allow("/?id=fish")); - - let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*")]); - assert_eq!(true, machine.allow("/fish")); - assert_eq!(true, machine.allow("/fish.html")); - assert_eq!(true, machine.allow("/fish/salmon.html")); - assert_eq!(true, machine.allow("/fishheads.html")); - assert_eq!(true, machine.allow("/fishheads/yummy.html")); - assert_eq!(true, machine.allow("/fish.php?id=anything")); - assert_eq!(false, machine.allow("/Fish.asp")); - assert_eq!(false, machine.allow("/catfish")); - assert_eq!(false, machine.allow("/?id=fish")); - - let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish/")]); - assert_eq!(true, machine.allow("/fish/")); - assert_eq!(true, machine.allow("/fish/?id=anything")); - assert_eq!(true, machine.allow("/fish/salmon.htm")); - assert_eq!(false, machine.allow("/fish")); - assert_eq!(false, machine.allow("/fish.html")); - assert_eq!(false, machine.allow("/Fish/Salmon.asp")); - - let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php")]); - assert_eq!(true, machine.allow("/filename.php")); - assert_eq!(true, machine.allow("/folder/filename.php")); - assert_eq!(true, machine.allow("/folder/filename.php?parameters")); - assert_eq!(true, machine.allow("/folder/any.php.file.html")); - assert_eq!(true, machine.allow("/filename.php/")); - assert_eq!(false, machine.allow("/")); - assert_eq!(false, machine.allow("/windows.PHP")); - - let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php$")]); - assert_eq!(true, machine.allow("/filename.php")); - assert_eq!(true, machine.allow("/folder/filename.php")); - assert_eq!(false, machine.allow("/filename.php?parameters")); - assert_eq!(false, machine.allow("/filename.php/")); - assert_eq!(false, machine.allow("/filename.php5")); - assert_eq!(false, machine.allow("/windows.PHP")); - - let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*.php")]); - assert_eq!(true, machine.allow("/fish.php")); - assert_eq!(true, machine.allow("/fishheads/catfish.php?parameters")); - assert_eq!(false, machine.allow("/Fish.PHP")); - } -} +use std::cmp::Ordering; + +use serde_derive::{Deserialize, Serialize}; + +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum Rule<'a> { + Allow(&'a str), + Disallow(&'a str), + Delay(&'a str), +} + +impl<'a> Rule<'a> { + fn inner(&self) -> &str { + match self { + Rule::Allow(inner) => inner, + Rule::Disallow(inner) => inner, + Rule::Delay(inner) => inner, + } + } +} + +#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)] +enum Edge { + MatchChar(char), + MatchAny, + MatchEow, +} + +#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)] +struct Transition(Edge, usize); + +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +enum State { + Allow, + Disallow, + Delay, + Intermediate, +} + +/// A Cylon is a DFA that recognizes rules from a compiled robots.txt +/// file. By providing it a URL path, it can decide whether or not +/// the robots file that compiled it allows or disallows that path in +/// roughly O(n) time, where n is the length of the path. +#[derive(Debug, Serialize, Deserialize)] +pub struct Cylon { + states: Vec, + transitions: Vec>, + delay: Option, +} + +impl Cylon { + pub fn delay(&self) -> Option { + self.delay + } + + /// Match whether the rules allow or disallow the target path. + pub fn allow(&self, path: &str) -> bool { + match self.states[self.state(path)] { + State::Allow => true, + State::Disallow => false, + // Intermediate states are not preserved in the DFA + State::Intermediate | State::Delay => unreachable!(), + } + } + + fn state(&self, path: &str) -> usize { + let state = path.chars().fold(2, |state, path_char| { + let t = &self.transitions[state]; + t.iter() + .rev() + // Pick the last transition to always prioritize MatchChar + // over MatchAny (which will always be the first transition.) + .find(|transition| match transition { + Transition(Edge::MatchAny, ..) => true, + Transition(Edge::MatchEow, ..) => false, + Transition(Edge::MatchChar(edge_char), ..) => *edge_char == path_char, + }) + .map(|Transition(.., next_state)| *next_state) + // We are guaranteed at least one matching state because of + // the way the DFA is constructed. + .unwrap() + }); + + // Follow the EoW transition, if necessary + let t = &self.transitions[state]; + t + .iter() + .rev() + .find(|transition| match transition { + Transition(Edge::MatchEow, ..) => true, + Transition(Edge::MatchAny, ..) => true, + _ => false, + }) + .map(|Transition(.., next_state)| *next_state) + .unwrap_or(state) + } + + /// Compile a machine from a list of rules. + pub fn compile(mut rules: Vec) -> Self { + // This algorithm constructs a DFA by doing BFS over the prefix tree of + // paths in the provided list of rules. However, for performance reasons + // it does not actually build a tree structure. (Vecs have better + // cache-locality by avoiding random memory access.) + + let mut transitions: Vec> = vec![ + vec![Transition(Edge::MatchAny, 0)], + vec![Transition(Edge::MatchAny, 1)], + ]; + let mut states: Vec = vec![State::Allow, State::Disallow]; + + rules.sort_by(|a, b| Ord::cmp(a.inner(), b.inner())); + + let mut queue = vec![("", 0, 0, State::Intermediate)]; + while !queue.is_empty() { + // parent_prefix is the "parent node" in the prefix tree. We are + // going to visit its children by filtering from the list of + // paths only the paths that start with the parent_prefix. + // wildcard_state is a node to jump to when an unmatched character + // is encountered. This is usually a node higher up in the tree + // that can match any character legally, but is also a prefix + // (read: ancestor) of the current node. + let (parent_prefix, mut wildcard_state, parent_state, state) = queue.remove(0); + let last_char = parent_prefix.chars().last(); + + wildcard_state = match state { + State::Allow => 0, + State::Disallow if last_char == Some('$') => wildcard_state, + State::Disallow => 1, + State::Delay => 1, + State::Intermediate => wildcard_state, + }; + + let mut t = match last_char { + Some('$') => { + // The EOW character cannot match anything else + vec![Transition(Edge::MatchAny, wildcard_state)] + } + Some('*') => { + // The wildcard character overrides the wildcard state + vec![Transition(Edge::MatchAny, transitions.len())] + } + _ => { + // Every other state has a self-loop that matches anything + vec![Transition(Edge::MatchAny, wildcard_state)] + } + }; + + let mut curr_prefix = ""; + rules + .iter() + .map(Rule::inner) + .zip(&rules) + .filter(|(path, _)| (*path).starts_with(parent_prefix)) + .filter(|(path, _)| (*path) != parent_prefix) + .for_each(|(path, rule)| { + let child_prefix = &path[0..parent_prefix.len() + 1]; + if curr_prefix == child_prefix { + // We only want to visit a child node once, but + // many rules might have the same child_prefix, so + // we skip the duplicates after the first time + // we see a prefix. (This could be a filter(), but + // it's a bit hard to encode earlier in the chain.) + return; + } + curr_prefix = child_prefix; + + let eow = child_prefix == path; + let state = match (rule, eow) { + (Rule::Allow(..), true) => State::Allow, + (Rule::Disallow(..), true) => State::Disallow, + (Rule::Delay(..), true) => State::Delay, + _ => State::Intermediate, + }; + + queue.push((child_prefix, wildcard_state, transitions.len(), state)); + + // NB: we can predict what state index the child + // will have before it's even pushed onto the state vec. + let child_index = transitions.len() + queue.len(); + let edge_char = child_prefix.chars().last().unwrap(); + let transition = Transition( + match edge_char { + '*' => Edge::MatchAny, + '$' => Edge::MatchEow, + c => Edge::MatchChar(c), + }, + child_index, + ); + + // Add transitions from the parent state to the child state + // so that the wildcard character matches are optional. + if last_char == Some('*') { + let parent_t = &mut transitions[parent_state]; + parent_t.push(transition); + } + + t.push(transition); + }); + + states.push(match state { + State::Allow | State::Disallow | State::Delay => state, + State::Intermediate => states[wildcard_state], + }); + transitions.push(t); + } + + let mut delays: Vec> = rules.iter().filter(|rule| { + match rule { + Rule::Delay(_) => true, + _ => false + } + }).map(|r| { + r.inner().parse::().ok() + }).collect(); + delays.sort_unstable_by(|a, b| { + match (a, b) { + (None, Some(_)) => Ordering::Greater, + (Some(_), None) => Ordering::Less, + (None, None) => Ordering::Equal, + (Some(aa), Some(bb)) => aa.cmp(bb) + + } + }); + + + + Self { + delay: *delays.get(0).unwrap_or(&None), + states, + transitions, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + macro_rules! t { + ('*' => $x:expr) => { + Transition(Edge::MatchAny, $x) + }; + ('$' => $x:expr) => { + Transition(Edge::MatchEow, $x) + }; + ($x:expr => $y:expr) => { + Transition(Edge::MatchChar($x), $y) + }; + } + + #[test] + fn test_compile() { + let rules = vec![ + Rule::Disallow("/"), + Rule::Allow("/a"), + Rule::Allow("/abc"), + Rule::Allow("/b"), + ]; + + let expect_transitions = vec![ + vec![t!('*' => 0)], + vec![t!('*' => 1)], + vec![t!('*' => 0), t!('/' => 3)], // "" + vec![t!('*' => 1), t!('a' => 4), t!('b' => 5)], // "/" + vec![t!('*' => 0), t!('b' => 6)], // "/a" + vec![t!('*' => 0)], // "/b" + vec![t!('*' => 0), t!('c' => 7)], // "/ab" + vec![t!('*' => 0)], // "/abc" + ]; + + let expect_states = vec![ + State::Allow, + State::Disallow, + State::Allow, + State::Disallow, + State::Allow, + State::Allow, + State::Allow, + State::Allow, + ]; + + let actual = Cylon::compile(rules); + assert_eq!(actual.transitions, expect_transitions); + assert_eq!(actual.states, expect_states); + } + + #[test] + fn test_compile_with_wildcard() { + let rules = vec![Rule::Disallow("/"), Rule::Allow("/a"), Rule::Allow("/*.b")]; + + let expect_transitions = vec![ + vec![t!('*' => 0)], + vec![t!('*' => 1)], + vec![t!('*' => 0), t!('/' => 3)], // "" + vec![t!('*' => 1), t!('*' => 4), t!('a' => 5), t!('.' => 6)], // "/" + vec![t!('*' => 4), t!('.' => 6)], // "/*" + vec![t!('*' => 0)], // "/a" + vec![t!('*' => 1), t!('b' => 7)], // "/*." + vec![t!('*' => 0)], // "/*.b" + ]; + + let expect_states = vec![ + State::Allow, + State::Disallow, + State::Allow, + State::Disallow, + State::Disallow, + State::Allow, + State::Disallow, + State::Allow, + ]; + + let actual = Cylon::compile(rules); + assert_eq!(actual.transitions, expect_transitions); + assert_eq!(actual.states, expect_states); + } + + #[test] + fn test_compile_tricky_wildcard() { + let rules = vec![Rule::Disallow("/"), Rule::Allow("/*.")]; + + let expect_transitions = vec![ + vec![t!('*' => 0)], + vec![t!('*' => 1)], + vec![t!('*' => 0), t!('/' => 3)], // "" + vec![t!('*' => 1), t!('*' => 4), t!('.' => 5)], // "/" + vec![t!('*' => 4), t!('.' => 5)], // "/*" + vec![t!('*' => 0)], // "/*." + ]; + + let expect_states = vec![ + State::Allow, + State::Disallow, + State::Allow, + State::Disallow, + State::Disallow, + State::Allow, + ]; + + let actual = Cylon::compile(rules); + assert_eq!(actual.transitions, expect_transitions); + assert_eq!(actual.states, expect_states); + } + + #[test] + fn test_compile_with_eow() { + let rules = vec![ + Rule::Allow("/"), + Rule::Disallow("/a$"), + // Note that this rule is nonsensical. It will compile, but + // no guarantees are made as to how it's matched. Rules should + // use url-encoded strings to escape $. + Rule::Disallow("/x$y"), + ]; + + let expect_transitions = vec![ + vec![t!('*' => 0)], + vec![t!('*' => 1)], + vec![t!('*' => 0), t!('/' => 3)], // "" + vec![t!('*' => 0), t!('a' => 4), t!('x' => 5)], // "/" + vec![t!('*' => 0), t!('$' => 6)], // "/a" + vec![t!('*' => 0), t!('$' => 7)], // "/x" + vec![t!('*' => 0)], // "/a$" + vec![t!('*' => 0), t!('y' => 8)], // "/x$" + vec![t!('*' => 1)], // "/x$y" + ]; + + let expect_states = vec![ + State::Allow, + State::Disallow, + State::Allow, + State::Allow, + State::Allow, + State::Allow, + State::Disallow, + State::Allow, + State::Disallow, + ]; + + let actual = Cylon::compile(rules); + assert_eq!(actual.transitions, expect_transitions); + assert_eq!(actual.states, expect_states); + } + + #[test] + fn test_allow() { + let rules = vec![ + Rule::Disallow("/"), + Rule::Allow("/a"), + Rule::Allow("/abc"), + Rule::Allow("/b"), + ]; + + let machine = Cylon::compile(rules); + assert_eq!(false, machine.allow("/")); + assert_eq!(true, machine.allow("/a")); + assert_eq!(true, machine.allow("/a/b")); + assert_eq!(true, machine.allow("/a")); + assert_eq!(true, machine.allow("/abc")); + assert_eq!(true, machine.allow("/abc/def")); + assert_eq!(true, machine.allow("/b")); + assert_eq!(true, machine.allow("/b/c")); + } + + #[test] + fn test_allow_match_any() { + let rules = vec![ + Rule::Allow("/"), + Rule::Disallow("/secret/*.txt"), + Rule::Disallow("/private/*"), + ]; + + let machine = Cylon::compile(rules); + assert_eq!(true, machine.allow("/")); + assert_eq!(true, machine.allow("/abc")); + assert_eq!(false, machine.allow("/secret/abc.txt")); + assert_eq!(false, machine.allow("/secret/123.txt")); + assert_eq!(true, machine.allow("/secret/abc.csv")); + assert_eq!(true, machine.allow("/secret/123.csv")); + assert_eq!(false, machine.allow("/private/abc.txt")); + assert_eq!(false, machine.allow("/private/123.txt")); + assert_eq!(false, machine.allow("/private/abc.csv")); + assert_eq!(false, machine.allow("/private/123.csv")); + } + + #[test] + fn test_allow_match_eow() { + let rules = vec![ + Rule::Allow("/"), + Rule::Disallow("/ignore$"), + Rule::Disallow("/foo$bar"), + ]; + + let machine = Cylon::compile(rules); + assert_eq!(true, machine.allow("/")); + assert_eq!(true, machine.allow("/abc")); + assert_eq!(false, machine.allow("/ignore")); + assert_eq!(true, machine.allow("/ignoreabc")); + assert_eq!(true, machine.allow("/ignore/abc")); + // These are technically undefined, and no behavior + // is guaranteed since the rule is malformed. However + // it is safer to accept them rather than reject them. + assert_eq!(true, machine.allow("/foo")); + assert_eq!(true, machine.allow("/foo$bar")); + } + + #[test] + fn test_allow_more_complicated() { + let rules = vec![ + Rule::Allow("/"), + Rule::Disallow("/a$"), + Rule::Disallow("/abc"), + Rule::Allow("/abc/*"), + Rule::Disallow("/foo/bar"), + Rule::Allow("/*/bar"), + Rule::Disallow("/www/*/images"), + Rule::Allow("/www/public/images"), + ]; + + let machine = Cylon::compile(rules); + assert_eq!(true, machine.allow("/")); + assert_eq!(true, machine.allow("/directory")); + assert_eq!(false, machine.allow("/a")); + assert_eq!(true, machine.allow("/ab")); + assert_eq!(false, machine.allow("/abc")); + assert_eq!(true, machine.allow("/abc/123")); + assert_eq!(true, machine.allow("/foo")); + assert_eq!(true, machine.allow("/foobar")); + assert_eq!(false, machine.allow("/foo/bar")); + assert_eq!(false, machine.allow("/foo/bar/baz")); + assert_eq!(true, machine.allow("/baz/bar")); + assert_eq!(false, machine.allow("/www/cat/images")); + assert_eq!(true, machine.allow("/www/public/images")); + } + + #[test] + fn test_matches() { + // Test cases from: + // https://developers.google.com/search/reference/robots_txt#group-member-rules + + let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish")]); + assert_eq!(true, machine.allow("/fish")); + assert_eq!(true, machine.allow("/fish.html")); + assert_eq!(true, machine.allow("/fish/salmon.html")); + assert_eq!(true, machine.allow("/fishheads.html")); + assert_eq!(true, machine.allow("/fishheads/yummy.html")); + assert_eq!(true, machine.allow("/fish.php?id=anything")); + assert_eq!(false, machine.allow("/Fish.asp")); + assert_eq!(false, machine.allow("/catfish")); + assert_eq!(false, machine.allow("/?id=fish")); + + let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*")]); + assert_eq!(true, machine.allow("/fish")); + assert_eq!(true, machine.allow("/fish.html")); + assert_eq!(true, machine.allow("/fish/salmon.html")); + assert_eq!(true, machine.allow("/fishheads.html")); + assert_eq!(true, machine.allow("/fishheads/yummy.html")); + assert_eq!(true, machine.allow("/fish.php?id=anything")); + assert_eq!(false, machine.allow("/Fish.asp")); + assert_eq!(false, machine.allow("/catfish")); + assert_eq!(false, machine.allow("/?id=fish")); + + let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish/")]); + assert_eq!(true, machine.allow("/fish/")); + assert_eq!(true, machine.allow("/fish/?id=anything")); + assert_eq!(true, machine.allow("/fish/salmon.htm")); + assert_eq!(false, machine.allow("/fish")); + assert_eq!(false, machine.allow("/fish.html")); + assert_eq!(false, machine.allow("/Fish/Salmon.asp")); + + let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php")]); + assert_eq!(true, machine.allow("/filename.php")); + assert_eq!(true, machine.allow("/folder/filename.php")); + assert_eq!(true, machine.allow("/folder/filename.php?parameters")); + assert_eq!(true, machine.allow("/folder/any.php.file.html")); + assert_eq!(true, machine.allow("/filename.php/")); + assert_eq!(false, machine.allow("/")); + assert_eq!(false, machine.allow("/windows.PHP")); + + let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php$")]); + assert_eq!(true, machine.allow("/filename.php")); + assert_eq!(true, machine.allow("/folder/filename.php")); + assert_eq!(false, machine.allow("/filename.php?parameters")); + assert_eq!(false, machine.allow("/filename.php/")); + assert_eq!(false, machine.allow("/filename.php5")); + assert_eq!(false, machine.allow("/windows.PHP")); + + let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*.php")]); + assert_eq!(true, machine.allow("/fish.php")); + assert_eq!(true, machine.allow("/fishheads/catfish.php?parameters")); + assert_eq!(false, machine.allow("/Fish.PHP")); + } +} diff --git a/src/parse.rs b/src/parse.rs index 921eaba..ebc1ba5 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -1,399 +1,442 @@ -use super::dfa::{Cylon, Rule}; -use futures_util::{ - io::{AsyncBufRead, AsyncRead, BufReader, Result}, - AsyncBufReadExt, -}; -use serde_derive::{Deserialize, Serialize}; -const UA_PREFIX: &str = "user-agent:"; -const DELAY_PREFIX: &str = "crawl-delay:"; -const ALLOW_PREFIX: &str = "allow:"; -const DISALLOW_PREFIX: &str = "disallow:"; - -#[derive(Debug, PartialEq, Clone)] -enum ParsedRule { - Allow(String), - Disallow(String), - Delay(u64), -} - -impl<'a> Into> for &'a ParsedRule { - fn into(self) -> Rule<'a> { - match self { - ParsedRule::Allow(path) => Rule::Allow(&path[..]), - ParsedRule::Disallow(path) => Rule::Disallow(&path[..]), - ParsedRule::Delay(delay) => Rule.Delay(delay), - } - } -} - -#[derive(Debug, PartialEq)] -enum ParsedLine { - UserAgent(String), - Rule(ParsedRule), - Nothing, -} - -/// A compiler takes an input robots.txt file and outputs a compiled Cylon, -/// which can be used to efficiently match a large number of paths against -/// the robots.txt file. -#[derive(Debug, Serialize, Deserialize)] -pub struct Compiler { - user_agent: String, -} - -impl Compiler { - /// Build a new compiler that parses rules for the given user agent from - /// a robots.txt file. - pub fn new(user_agent: &str) -> Self { - Self { - user_agent: user_agent.to_lowercase(), - } - } - - /// Parse an input robots.txt file into a Cylon that can recognize - /// whether or not a path matches the rules for the Parser's user agent. - pub async fn compile(&self, file: R) -> Result { - let reader = BufReader::new(file); - let mut agent = String::new(); - let mut rules: Vec = vec![]; - let mut group_reader = GroupReader::new(reader); - - // find the most specific matching group in the robots file - while let Some(agents) = group_reader.next_header().await? { - let matching_agent = agents.iter().find(|a| { - let matches = &a[..] == "*" || self.user_agent.contains(*a); - let more_specific = a.len() > agent.len(); - matches && more_specific - }); - - if let Some(matching_agent) = matching_agent { - agent = matching_agent.clone(); - rules = group_reader.next_rules().await?; - } - } - - let rules = rules.iter().map(|r| r.into()).collect(); - Ok(Cylon::compile(rules)) - } -} - -struct GroupReader { - parsing_agents: bool, - agents: Vec, - rules: Vec, - reader: R, -} - -impl GroupReader { - fn new(reader: R) -> Self { - Self { - parsing_agents: true, - agents: vec![], - rules: vec![], - reader, - } - } - - /// Scan forward until the next group header defined by one or more - /// user agent lines. This lets us optimize the lines we need to copy - /// so we can skip over groups that don't match the desired user agent. - async fn next_header(&mut self) -> Result>> { - let mut buf = String::new(); - while self.reader.read_line(&mut buf).await? != 0 { - let parsed_line = parse_line(buf.clone()); - - match parsed_line { - ParsedLine::UserAgent(ua) if self.parsing_agents => { - self.agents.push(ua); - } - ParsedLine::UserAgent(ua) => { - self.agents = vec![ua]; - self.rules = vec![]; - self.parsing_agents = true; - } - ParsedLine::Rule(rule) if self.parsing_agents => { - // Preserve the rule in case we need it in next_rules(). - self.rules.push(rule); - self.parsing_agents = false; - break; - } - // Skip over lines until we get to the next user agent. - ParsedLine::Rule(..) => (), - ParsedLine::Nothing => (), - } - - buf.clear(); - } - - let agents = self.agents.clone(); - self.agents = vec![]; - - if agents.is_empty() { - return Ok(None); - } - - Ok(Some(agents)) - } - - async fn next_rules(&mut self) -> Result> { - let mut buf = String::new(); - while self.reader.read_line(&mut buf).await? != 0 { - let parsed_line = parse_line(buf.clone()); - - match parsed_line { - ParsedLine::Rule(rule) => { - self.rules.push(rule); - self.parsing_agents = false; - } - ParsedLine::UserAgent(ua) if !self.parsing_agents => { - // Preserve the agent in case we need it in next_agents(). - self.agents.push(ua); - self.parsing_agents = true; - break; - } - // Skip over lines until we get to the next rule. - ParsedLine::UserAgent(..) => (), - ParsedLine::Nothing => (), - } - - buf.clear(); - } - - let rules = self.rules.clone(); - self.rules = vec![]; - Ok(rules) - } -} - -fn parse_line(line: String) -> ParsedLine { - let line = strip_comments(&line[..]).trim(); - - // This tries to parse lines roughly in order of most frequent kind to - // least frequent kind in order to minimize CPU cycles on average. - parse_disallow(line) - .map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into()))) - .or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase()))) - .or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into())))) - .or_else(|| parse_delay(line).map(|s| ParsedLine::Rule(ParsedRule::Delay(s.into())))) - .unwrap_or(ParsedLine::Nothing) -} - -fn strip_comments(line: &str) -> &str { - if let Some(before) = line.split('#').next() { - return before; - } - return line; -} - -fn parse_user_agent(line: &str) -> Option<&str> { - if line.len() < UA_PREFIX.len() { - return None; - } - let prefix = &line[..UA_PREFIX.len()].to_ascii_lowercase(); - let suffix = &line[UA_PREFIX.len()..]; - - if prefix == UA_PREFIX { - Some(suffix.trim()) - } else { - None - } -} - -fn parse_delay(line: &str) -> Option { - if line.len() < DELAY_PREFIX.len() { - return None; - } - - let prefix = &line[..DELAY_PREFIX.len()].to_ascii_lowercase(); - let suffix = &line[DELAY_PREFIX.len()..]; - if prefix == DELAY_PREFIX { - Some(suffix.trim()) - } else { - None - } -} - -fn parse_allow(line: &str) -> Option<&str> { - if line.len() < ALLOW_PREFIX.len() { - return None; - } - let prefix = &line[..ALLOW_PREFIX.len()].to_ascii_lowercase(); - let suffix = &line[ALLOW_PREFIX.len()..]; - - if prefix == ALLOW_PREFIX { - Some(suffix.trim()) - } else { - None - } -} - -fn parse_disallow(line: &str) -> Option<&str> { - if line.len() < DISALLOW_PREFIX.len() { - return None; - } - let prefix = &line[..DISALLOW_PREFIX.len()].to_ascii_lowercase(); - let suffix = &line[DISALLOW_PREFIX.len()..]; - - if prefix == DISALLOW_PREFIX { - Some(suffix.trim()) - } else { - None - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_parse_allow() { - let test_cases = vec![ - ("Allow: /", "/"), - ("allow: / # Root with comment", "/"), - ("ALLOW: /abc/def ", "/abc/def"), - ("Allow: /abc/def ", "/abc/def"), - (" Allow: /*/foo", "/*/foo"), - ]; - - for (i, o) in test_cases { - assert_eq!( - parse_line(i.into()), - ParsedLine::Rule(ParsedRule::Allow(o.into())) - ); - } - } - - #[test] - fn test_parse_disallow() { - let test_cases = vec![ - ("Disallow: /", "/"), - ("disallow: / # Root with comment", "/"), - ("DISALLOW: /abc/def ", "/abc/def"), - ("Disallow: /abc/def ", "/abc/def"), - (" Disallow: /*/foo", "/*/foo"), - ]; - - for (i, o) in test_cases { - assert_eq!( - parse_line(i.into()), - ParsedLine::Rule(ParsedRule::Disallow(o.into())) - ); - } - } - - #[test] - fn test_parse_user_agent() { - let test_cases = vec![ - ("User-agent: *", "*"), - ("user-agent: ImABot # User agent with comment", "imabot"), - (" USER-AGENT: ImABot ", "imabot"), - ]; - - for (i, o) in test_cases { - assert_eq!(parse_line(i.into()), ParsedLine::UserAgent(o.into())); - } - } - - #[test] - fn test_parse_nothing() { - let test_cases = vec![ - "Useragent: *", - "# Comment", - "", - " ", - "\t", - "alow: /", - "disalow: /", - ]; - - for i in test_cases { - assert_eq!(parse_line(i.into()), ParsedLine::Nothing); - } - } - - #[test] - fn test_end_to_end() { - tokio_test::block_on(async { - let example_robots = r#" - User-agent: jones-bot - Disallow: / - - User-agent: jones - User-agent: foobar - Allow: / - - User-agent: * - Disallow: / - "# - .as_bytes(); - - let parser = Compiler::new("foobar"); - let foobar_machine = parser.compile(example_robots).await.unwrap(); - - let parser = Compiler::new("jones-bot"); - let jonesbot_machine = parser.compile(example_robots).await.unwrap(); - - let parser = Compiler::new("imabot"); - let imabot_machine = parser.compile(example_robots).await.unwrap(); - - let parser = Compiler::new("abc"); - let abc_machine = parser.compile(example_robots).await.unwrap(); - - assert_eq!(true, foobar_machine.allow("/index.html")); - assert_eq!(false, jonesbot_machine.allow("/index.html")); - assert_eq!(false, imabot_machine.allow("/index.html")); - assert_eq!(false, abc_machine.allow("/index.html")); - }); - } - - #[test] - fn test_invalid_1() { - tokio_test::block_on(async { - let example_robots = r#" - # Instead of treating this as an error, we'll just consider - # this behavior undefined. - Allow: / - - User-agent: jones - User-agent: foobar - Disallow: / - "# - .as_bytes(); - - let parser = Compiler::new("foobar"); - let foobar_machine = parser.compile(example_robots).await.unwrap(); - - let parser = Compiler::new("imabot"); - let imabot_machine = parser.compile(example_robots).await.unwrap(); - - // Everything is allowed because next_header() returns None - assert_eq!(true, foobar_machine.allow("/index.html")); - assert_eq!(true, imabot_machine.allow("/index.html")); - }); - } - - #[test] - fn test_invalid_2() { - tokio_test::block_on(async { - let example_robots = r#" - User-agent: jones - User-agent: foobar - Disallow: / - - # Instead of treating this as an error, we consider this - # behavior undefined. - User-agent: imabot - "# - .as_bytes(); - - let parser = Compiler::new("foobar"); - let foobar_machine = parser.compile(example_robots).await.unwrap(); - - let parser = Compiler::new("imabot"); - let imabot_machine = parser.compile(example_robots).await.unwrap(); - - assert_eq!(false, foobar_machine.allow("/index.html")); - assert_eq!(true, imabot_machine.allow("/index.html")); - }); - } -} +use super::dfa::{Cylon, Rule}; +use futures_util::{ + io::{AsyncBufRead, AsyncRead, BufReader, Result}, + AsyncBufReadExt, +}; +use serde_derive::{Deserialize, Serialize}; +const UA_PREFIX: &str = "user-agent:"; +const DELAY_PREFIX: &str = "crawl-delay:"; +const ALLOW_PREFIX: &str = "allow:"; +const DISALLOW_PREFIX: &str = "disallow:"; + +#[derive(Debug, PartialEq, Clone)] +enum ParsedRule { + Allow(String), + Disallow(String), + Delay(String), +} + +impl<'a> Into> for &'a ParsedRule { + fn into(self) -> Rule<'a> { + match self { + ParsedRule::Allow(path) => Rule::Allow(&path[..]), + ParsedRule::Disallow(path) => Rule::Disallow(&path[..]), + ParsedRule::Delay(delay) => Rule::Delay(delay), + } + } +} + +#[derive(Debug, PartialEq)] +enum ParsedLine { + UserAgent(String), + Rule(ParsedRule), + Nothing, +} + +/// A compiler takes an input robots.txt file and outputs a compiled Cylon, +/// which can be used to efficiently match a large number of paths against +/// the robots.txt file. +#[derive(Debug, Serialize, Deserialize)] +pub struct Compiler { + user_agent: String, +} + +impl Compiler { + /// Build a new compiler that parses rules for the given user agent from + /// a robots.txt file. + pub fn new(user_agent: &str) -> Self { + Self { + user_agent: user_agent.to_lowercase(), + } + } + + /// Parse an input robots.txt file into a Cylon that can recognize + /// whether or not a path matches the rules for the Parser's user agent. + pub async fn compile(&self, file: R) -> Result { + let reader = BufReader::new(file); + let mut agent = String::new(); + let mut rules: Vec = vec![]; + let mut group_reader = GroupReader::new(reader); + + // find the most specific matching group in the robots file + while let Some(agents) = group_reader.next_header().await? { + let matching_agent = agents.iter().find(|a| { + let matches = &a[..] == "*" || self.user_agent.contains(*a); + let more_specific = a.len() > agent.len(); + matches && more_specific + }); + + if let Some(matching_agent) = matching_agent { + agent = matching_agent.clone(); + rules = group_reader.next_rules().await?; + } + } + + let rules = rules.iter().map(|r| r.into()).collect(); + Ok(Cylon::compile(rules)) + } +} + +struct GroupReader { + parsing_agents: bool, + agents: Vec, + rules: Vec, + reader: R, +} + +impl GroupReader { + fn new(reader: R) -> Self { + Self { + parsing_agents: true, + agents: vec![], + rules: vec![], + reader, + } + } + + /// Scan forward until the next group header defined by one or more + /// user agent lines. This lets us optimize the lines we need to copy + /// so we can skip over groups that don't match the desired user agent. + async fn next_header(&mut self) -> Result>> { + let mut buf = String::new(); + while self.reader.read_line(&mut buf).await? != 0 { + let parsed_line = parse_line(buf.clone()); + + match parsed_line { + ParsedLine::UserAgent(ua) if self.parsing_agents => { + self.agents.push(ua); + } + ParsedLine::UserAgent(ua) => { + self.agents = vec![ua]; + self.rules = vec![]; + self.parsing_agents = true; + } + ParsedLine::Rule(rule) if self.parsing_agents => { + // Preserve the rule in case we need it in next_rules(). + self.rules.push(rule); + self.parsing_agents = false; + break; + } + // Skip over lines until we get to the next user agent. + ParsedLine::Rule(..) => (), + ParsedLine::Nothing => (), + } + + buf.clear(); + } + + let agents = self.agents.clone(); + self.agents = vec![]; + + if agents.is_empty() { + return Ok(None); + } + + Ok(Some(agents)) + } + + async fn next_rules(&mut self) -> Result> { + let mut buf = String::new(); + while self.reader.read_line(&mut buf).await? != 0 { + let parsed_line = parse_line(buf.clone()); + + match parsed_line { + ParsedLine::Rule(rule) => { + self.rules.push(rule); + self.parsing_agents = false; + } + ParsedLine::UserAgent(ua) if !self.parsing_agents => { + // Preserve the agent in case we need it in next_agents(). + self.agents.push(ua); + self.parsing_agents = true; + break; + } + // Skip over lines until we get to the next rule. + ParsedLine::UserAgent(..) => (), + ParsedLine::Nothing => (), + } + + buf.clear(); + } + + let rules = self.rules.clone(); + self.rules = vec![]; + Ok(rules) + } +} + +fn parse_line(line: String) -> ParsedLine { + let line = strip_comments(&line[..]).trim(); + + // This tries to parse lines roughly in order of most frequent kind to + // least frequent kind in order to minimize CPU cycles on average. + parse_disallow(line) + .map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into()))) + .or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase()))) + .or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into())))) + .or_else(|| parse_delay(line).map(|s| ParsedLine::Rule(ParsedRule::Delay(s.into())))) + .unwrap_or(ParsedLine::Nothing) +} + +fn strip_comments(line: &str) -> &str { + if let Some(before) = line.split('#').next() { + return before; + } + return line; +} + +fn parse_user_agent(line: &str) -> Option<&str> { + if line.len() < UA_PREFIX.len() { + return None; + } + let prefix = &line[..UA_PREFIX.len()].to_ascii_lowercase(); + let suffix = &line[UA_PREFIX.len()..]; + + if prefix == UA_PREFIX { + Some(suffix.trim()) + } else { + None + } +} + +fn parse_delay(line: &str) -> Option<&str> { + if line.len() < DELAY_PREFIX.len() { + return None; + } + + let prefix = &line[..DELAY_PREFIX.len()].to_ascii_lowercase(); + let suffix = &line[DELAY_PREFIX.len()..]; + if prefix == DELAY_PREFIX { + Some(suffix.trim()) + } else { + None + } +} + +fn parse_allow(line: &str) -> Option<&str> { + if line.len() < ALLOW_PREFIX.len() { + return None; + } + let prefix = &line[..ALLOW_PREFIX.len()].to_ascii_lowercase(); + let suffix = &line[ALLOW_PREFIX.len()..]; + + if prefix == ALLOW_PREFIX { + Some(suffix.trim()) + } else { + None + } +} + +fn parse_disallow(line: &str) -> Option<&str> { + if line.len() < DISALLOW_PREFIX.len() { + return None; + } + let prefix = &line[..DISALLOW_PREFIX.len()].to_ascii_lowercase(); + let suffix = &line[DISALLOW_PREFIX.len()..]; + + if prefix == DISALLOW_PREFIX { + Some(suffix.trim()) + } else { + None + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_allow() { + let test_cases = vec![ + ("Allow: /", "/"), + ("allow: / # Root with comment", "/"), + ("ALLOW: /abc/def ", "/abc/def"), + ("Allow: /abc/def ", "/abc/def"), + (" Allow: /*/foo", "/*/foo"), + ]; + + for (i, o) in test_cases { + assert_eq!( + parse_line(i.into()), + ParsedLine::Rule(ParsedRule::Allow(o.into())) + ); + } + } + + #[test] + fn test_parse_disallow() { + let test_cases = vec![ + ("Disallow: /", "/"), + ("disallow: / # Root with comment", "/"), + ("DISALLOW: /abc/def ", "/abc/def"), + ("Disallow: /abc/def ", "/abc/def"), + (" Disallow: /*/foo", "/*/foo"), + ]; + + for (i, o) in test_cases { + assert_eq!( + parse_line(i.into()), + ParsedLine::Rule(ParsedRule::Disallow(o.into())) + ); + } + } + + #[test] + fn test_parse_user_agent() { + let test_cases = vec![ + ("User-agent: *", "*"), + ("user-agent: ImABot # User agent with comment", "imabot"), + (" USER-AGENT: ImABot ", "imabot"), + ]; + + for (i, o) in test_cases { + assert_eq!(parse_line(i.into()), ParsedLine::UserAgent(o.into())); + } + } + + #[test] + fn test_parse_nothing() { + let test_cases = vec![ + "Useragent: *", + "# Comment", + "", + " ", + "\t", + "alow: /", + "disalow: /", + ]; + + for i in test_cases { + assert_eq!(parse_line(i.into()), ParsedLine::Nothing); + } + } + + #[test] + fn test_crawl_delay() { + tokio_test::block_on(async { + let example_robots = r#" + User-agent: jones-bot + Disallow: / + Crawl-Delay: 30 + + User-agent: foobar + Crawl-Delay: 60 + + User-agent: googlebot + Allow: / + + User-agent: barfoo + Crawl-Delay: 60 + Crawl-Delay: 20 + "# + .as_bytes(); + + let parser = Compiler::new("foobar"); + let foobar_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("googlebot"); + let googlebot_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("barfoo"); + let barfoo_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("jones-bot"); + let jonesbot_machine = parser.compile(example_robots).await.unwrap(); + + assert_eq!(Some(60), foobar_machine.delay()); + assert_eq!(Some(20), barfoo_machine.delay()); + assert_eq!(Some(30), jonesbot_machine.delay()); + assert_eq!(None, googlebot_machine.delay()); + }); + } + + #[test] + fn test_end_to_end() { + tokio_test::block_on(async { + let example_robots = r#" + User-agent: jones-bot + Disallow: / + + User-agent: foo + Allow: / + Crawl-Delay: 20 + + User-agent: jones + User-agent: foobar + Allow: / + + User-agent: * + Disallow: / + "# + .as_bytes(); + + let parser = Compiler::new("foobar"); + let foobar_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("jones-bot"); + let jonesbot_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("imabot"); + let imabot_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("abc"); + let abc_machine = parser.compile(example_robots).await.unwrap(); + + assert_eq!(true, foobar_machine.allow("/index.html")); + assert_eq!(false, jonesbot_machine.allow("/index.html")); + assert_eq!(false, imabot_machine.allow("/index.html")); + assert_eq!(false, abc_machine.allow("/index.html")); + }); + } + + #[test] + fn test_invalid_1() { + tokio_test::block_on(async { + let example_robots = r#" + # Instead of treating this as an error, we'll just consider + # this behavior undefined. + Allow: / + + User-agent: jones + User-agent: foobar + Disallow: / + "# + .as_bytes(); + + let parser = Compiler::new("foobar"); + let foobar_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("imabot"); + let imabot_machine = parser.compile(example_robots).await.unwrap(); + + // Everything is allowed because next_header() returns None + assert_eq!(true, foobar_machine.allow("/index.html")); + assert_eq!(true, imabot_machine.allow("/index.html")); + }); + } + + #[test] + fn test_invalid_2() { + tokio_test::block_on(async { + let example_robots = r#" + User-agent: jones + User-agent: foobar + Disallow: / + + # Instead of treating this as an error, we consider this + # behavior undefined. + User-agent: imabot + "# + .as_bytes(); + + let parser = Compiler::new("foobar"); + let foobar_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("imabot"); + let imabot_machine = parser.compile(example_robots).await.unwrap(); + + assert_eq!(false, foobar_machine.allow("/index.html")); + assert_eq!(true, imabot_machine.allow("/index.html")); + }); + } +} From 07a4ccadf0ff0118348b7880cda5607bbc0d9f00 Mon Sep 17 00:00:00 2001 From: "r.portalez" Date: Wed, 10 Mar 2021 16:28:38 +0100 Subject: [PATCH 3/6] move crawl-delay into a feature as it won't be user by many --- Cargo.toml | 3 +++ src/dfa.rs | 45 +++++++++++++++++++++++++++------------------ src/parse.rs | 1 + 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c82cf6d..8e93acc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,9 @@ license = "MIT" keywords = ["robots", "txt", "parse", "compile"] repository = "https://github.com/crestonbunch/cylon" +[features] +crawl-delay = [] + [dependencies] futures-util = "0.3" serde = "1.0" diff --git a/src/dfa.rs b/src/dfa.rs index 6c39eb4..f239d73 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -1,3 +1,4 @@ +#[cfg(feature = "crawl-delay")] use std::cmp::Ordering; use serde_derive::{Deserialize, Serialize}; @@ -45,10 +46,12 @@ enum State { pub struct Cylon { states: Vec, transitions: Vec>, + #[cfg(feature = "crawl-delay")] delay: Option, } impl Cylon { + #[cfg(feature = "crawl-delay")] pub fn delay(&self) -> Option { self.delay } @@ -204,28 +207,34 @@ impl Cylon { transitions.push(t); } - let mut delays: Vec> = rules.iter().filter(|rule| { - match rule { - Rule::Delay(_) => true, - _ => false - } - }).map(|r| { - r.inner().parse::().ok() - }).collect(); - delays.sort_unstable_by(|a, b| { - match (a, b) { - (None, Some(_)) => Ordering::Greater, - (Some(_), None) => Ordering::Less, - (None, None) => Ordering::Equal, - (Some(aa), Some(bb)) => aa.cmp(bb) + #[cfg(feature = "crawl-delay")] + { + let mut delays: Vec> = rules.iter().filter(|rule| { + match rule { + Rule::Delay(_) => true, + _ => false + } + }).map(|r| { + r.inner().parse::().ok() + }).collect(); + delays.sort_unstable_by(|a, b| { + match (a, b) { + (None, Some(_)) => Ordering::Greater, + (Some(_), None) => Ordering::Less, + (None, None) => Ordering::Equal, + (Some(aa), Some(bb)) => aa.cmp(bb) + } + }); + Self { + delay: *delays.get(0).unwrap_or(&None), + states, + transitions, } - }); - - + } + #[cfg(not(feature = "crawl-delay"))] Self { - delay: *delays.get(0).unwrap_or(&None), states, transitions, } diff --git a/src/parse.rs b/src/parse.rs index ebc1ba5..1bfb750 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -312,6 +312,7 @@ mod tests { } #[test] + #[cfg(feature = "crawl-delay")] fn test_crawl_delay() { tokio_test::block_on(async { let example_robots = r#" From 33e718edcce788bd46a70fcdcdf09e75d9379bcd Mon Sep 17 00:00:00 2001 From: "r.portalez" Date: Wed, 10 Mar 2021 16:30:13 +0100 Subject: [PATCH 4/6] fix end of line --- src/dfa.rs | 1084 +++++++++++++++++++++++++------------------------- src/parse.rs | 886 ++++++++++++++++++++--------------------- 2 files changed, 985 insertions(+), 985 deletions(-) diff --git a/src/dfa.rs b/src/dfa.rs index f239d73..da1e4e2 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -1,542 +1,542 @@ -#[cfg(feature = "crawl-delay")] -use std::cmp::Ordering; - -use serde_derive::{Deserialize, Serialize}; - -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] -pub enum Rule<'a> { - Allow(&'a str), - Disallow(&'a str), - Delay(&'a str), -} - -impl<'a> Rule<'a> { - fn inner(&self) -> &str { - match self { - Rule::Allow(inner) => inner, - Rule::Disallow(inner) => inner, - Rule::Delay(inner) => inner, - } - } -} - -#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)] -enum Edge { - MatchChar(char), - MatchAny, - MatchEow, -} - -#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)] -struct Transition(Edge, usize); - -#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] -enum State { - Allow, - Disallow, - Delay, - Intermediate, -} - -/// A Cylon is a DFA that recognizes rules from a compiled robots.txt -/// file. By providing it a URL path, it can decide whether or not -/// the robots file that compiled it allows or disallows that path in -/// roughly O(n) time, where n is the length of the path. -#[derive(Debug, Serialize, Deserialize)] -pub struct Cylon { - states: Vec, - transitions: Vec>, - #[cfg(feature = "crawl-delay")] - delay: Option, -} - -impl Cylon { - #[cfg(feature = "crawl-delay")] - pub fn delay(&self) -> Option { - self.delay - } - - /// Match whether the rules allow or disallow the target path. - pub fn allow(&self, path: &str) -> bool { - match self.states[self.state(path)] { - State::Allow => true, - State::Disallow => false, - // Intermediate states are not preserved in the DFA - State::Intermediate | State::Delay => unreachable!(), - } - } - - fn state(&self, path: &str) -> usize { - let state = path.chars().fold(2, |state, path_char| { - let t = &self.transitions[state]; - t.iter() - .rev() - // Pick the last transition to always prioritize MatchChar - // over MatchAny (which will always be the first transition.) - .find(|transition| match transition { - Transition(Edge::MatchAny, ..) => true, - Transition(Edge::MatchEow, ..) => false, - Transition(Edge::MatchChar(edge_char), ..) => *edge_char == path_char, - }) - .map(|Transition(.., next_state)| *next_state) - // We are guaranteed at least one matching state because of - // the way the DFA is constructed. - .unwrap() - }); - - // Follow the EoW transition, if necessary - let t = &self.transitions[state]; - t - .iter() - .rev() - .find(|transition| match transition { - Transition(Edge::MatchEow, ..) => true, - Transition(Edge::MatchAny, ..) => true, - _ => false, - }) - .map(|Transition(.., next_state)| *next_state) - .unwrap_or(state) - } - - /// Compile a machine from a list of rules. - pub fn compile(mut rules: Vec) -> Self { - // This algorithm constructs a DFA by doing BFS over the prefix tree of - // paths in the provided list of rules. However, for performance reasons - // it does not actually build a tree structure. (Vecs have better - // cache-locality by avoiding random memory access.) - - let mut transitions: Vec> = vec![ - vec![Transition(Edge::MatchAny, 0)], - vec![Transition(Edge::MatchAny, 1)], - ]; - let mut states: Vec = vec![State::Allow, State::Disallow]; - - rules.sort_by(|a, b| Ord::cmp(a.inner(), b.inner())); - - let mut queue = vec![("", 0, 0, State::Intermediate)]; - while !queue.is_empty() { - // parent_prefix is the "parent node" in the prefix tree. We are - // going to visit its children by filtering from the list of - // paths only the paths that start with the parent_prefix. - // wildcard_state is a node to jump to when an unmatched character - // is encountered. This is usually a node higher up in the tree - // that can match any character legally, but is also a prefix - // (read: ancestor) of the current node. - let (parent_prefix, mut wildcard_state, parent_state, state) = queue.remove(0); - let last_char = parent_prefix.chars().last(); - - wildcard_state = match state { - State::Allow => 0, - State::Disallow if last_char == Some('$') => wildcard_state, - State::Disallow => 1, - State::Delay => 1, - State::Intermediate => wildcard_state, - }; - - let mut t = match last_char { - Some('$') => { - // The EOW character cannot match anything else - vec![Transition(Edge::MatchAny, wildcard_state)] - } - Some('*') => { - // The wildcard character overrides the wildcard state - vec![Transition(Edge::MatchAny, transitions.len())] - } - _ => { - // Every other state has a self-loop that matches anything - vec![Transition(Edge::MatchAny, wildcard_state)] - } - }; - - let mut curr_prefix = ""; - rules - .iter() - .map(Rule::inner) - .zip(&rules) - .filter(|(path, _)| (*path).starts_with(parent_prefix)) - .filter(|(path, _)| (*path) != parent_prefix) - .for_each(|(path, rule)| { - let child_prefix = &path[0..parent_prefix.len() + 1]; - if curr_prefix == child_prefix { - // We only want to visit a child node once, but - // many rules might have the same child_prefix, so - // we skip the duplicates after the first time - // we see a prefix. (This could be a filter(), but - // it's a bit hard to encode earlier in the chain.) - return; - } - curr_prefix = child_prefix; - - let eow = child_prefix == path; - let state = match (rule, eow) { - (Rule::Allow(..), true) => State::Allow, - (Rule::Disallow(..), true) => State::Disallow, - (Rule::Delay(..), true) => State::Delay, - _ => State::Intermediate, - }; - - queue.push((child_prefix, wildcard_state, transitions.len(), state)); - - // NB: we can predict what state index the child - // will have before it's even pushed onto the state vec. - let child_index = transitions.len() + queue.len(); - let edge_char = child_prefix.chars().last().unwrap(); - let transition = Transition( - match edge_char { - '*' => Edge::MatchAny, - '$' => Edge::MatchEow, - c => Edge::MatchChar(c), - }, - child_index, - ); - - // Add transitions from the parent state to the child state - // so that the wildcard character matches are optional. - if last_char == Some('*') { - let parent_t = &mut transitions[parent_state]; - parent_t.push(transition); - } - - t.push(transition); - }); - - states.push(match state { - State::Allow | State::Disallow | State::Delay => state, - State::Intermediate => states[wildcard_state], - }); - transitions.push(t); - } - - #[cfg(feature = "crawl-delay")] - { - let mut delays: Vec> = rules.iter().filter(|rule| { - match rule { - Rule::Delay(_) => true, - _ => false - } - }).map(|r| { - r.inner().parse::().ok() - }).collect(); - delays.sort_unstable_by(|a, b| { - match (a, b) { - (None, Some(_)) => Ordering::Greater, - (Some(_), None) => Ordering::Less, - (None, None) => Ordering::Equal, - (Some(aa), Some(bb)) => aa.cmp(bb) - - } - }); - Self { - delay: *delays.get(0).unwrap_or(&None), - states, - transitions, - } - } - - #[cfg(not(feature = "crawl-delay"))] - Self { - states, - transitions, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - macro_rules! t { - ('*' => $x:expr) => { - Transition(Edge::MatchAny, $x) - }; - ('$' => $x:expr) => { - Transition(Edge::MatchEow, $x) - }; - ($x:expr => $y:expr) => { - Transition(Edge::MatchChar($x), $y) - }; - } - - #[test] - fn test_compile() { - let rules = vec![ - Rule::Disallow("/"), - Rule::Allow("/a"), - Rule::Allow("/abc"), - Rule::Allow("/b"), - ]; - - let expect_transitions = vec![ - vec![t!('*' => 0)], - vec![t!('*' => 1)], - vec![t!('*' => 0), t!('/' => 3)], // "" - vec![t!('*' => 1), t!('a' => 4), t!('b' => 5)], // "/" - vec![t!('*' => 0), t!('b' => 6)], // "/a" - vec![t!('*' => 0)], // "/b" - vec![t!('*' => 0), t!('c' => 7)], // "/ab" - vec![t!('*' => 0)], // "/abc" - ]; - - let expect_states = vec![ - State::Allow, - State::Disallow, - State::Allow, - State::Disallow, - State::Allow, - State::Allow, - State::Allow, - State::Allow, - ]; - - let actual = Cylon::compile(rules); - assert_eq!(actual.transitions, expect_transitions); - assert_eq!(actual.states, expect_states); - } - - #[test] - fn test_compile_with_wildcard() { - let rules = vec![Rule::Disallow("/"), Rule::Allow("/a"), Rule::Allow("/*.b")]; - - let expect_transitions = vec![ - vec![t!('*' => 0)], - vec![t!('*' => 1)], - vec![t!('*' => 0), t!('/' => 3)], // "" - vec![t!('*' => 1), t!('*' => 4), t!('a' => 5), t!('.' => 6)], // "/" - vec![t!('*' => 4), t!('.' => 6)], // "/*" - vec![t!('*' => 0)], // "/a" - vec![t!('*' => 1), t!('b' => 7)], // "/*." - vec![t!('*' => 0)], // "/*.b" - ]; - - let expect_states = vec![ - State::Allow, - State::Disallow, - State::Allow, - State::Disallow, - State::Disallow, - State::Allow, - State::Disallow, - State::Allow, - ]; - - let actual = Cylon::compile(rules); - assert_eq!(actual.transitions, expect_transitions); - assert_eq!(actual.states, expect_states); - } - - #[test] - fn test_compile_tricky_wildcard() { - let rules = vec![Rule::Disallow("/"), Rule::Allow("/*.")]; - - let expect_transitions = vec![ - vec![t!('*' => 0)], - vec![t!('*' => 1)], - vec![t!('*' => 0), t!('/' => 3)], // "" - vec![t!('*' => 1), t!('*' => 4), t!('.' => 5)], // "/" - vec![t!('*' => 4), t!('.' => 5)], // "/*" - vec![t!('*' => 0)], // "/*." - ]; - - let expect_states = vec![ - State::Allow, - State::Disallow, - State::Allow, - State::Disallow, - State::Disallow, - State::Allow, - ]; - - let actual = Cylon::compile(rules); - assert_eq!(actual.transitions, expect_transitions); - assert_eq!(actual.states, expect_states); - } - - #[test] - fn test_compile_with_eow() { - let rules = vec![ - Rule::Allow("/"), - Rule::Disallow("/a$"), - // Note that this rule is nonsensical. It will compile, but - // no guarantees are made as to how it's matched. Rules should - // use url-encoded strings to escape $. - Rule::Disallow("/x$y"), - ]; - - let expect_transitions = vec![ - vec![t!('*' => 0)], - vec![t!('*' => 1)], - vec![t!('*' => 0), t!('/' => 3)], // "" - vec![t!('*' => 0), t!('a' => 4), t!('x' => 5)], // "/" - vec![t!('*' => 0), t!('$' => 6)], // "/a" - vec![t!('*' => 0), t!('$' => 7)], // "/x" - vec![t!('*' => 0)], // "/a$" - vec![t!('*' => 0), t!('y' => 8)], // "/x$" - vec![t!('*' => 1)], // "/x$y" - ]; - - let expect_states = vec![ - State::Allow, - State::Disallow, - State::Allow, - State::Allow, - State::Allow, - State::Allow, - State::Disallow, - State::Allow, - State::Disallow, - ]; - - let actual = Cylon::compile(rules); - assert_eq!(actual.transitions, expect_transitions); - assert_eq!(actual.states, expect_states); - } - - #[test] - fn test_allow() { - let rules = vec![ - Rule::Disallow("/"), - Rule::Allow("/a"), - Rule::Allow("/abc"), - Rule::Allow("/b"), - ]; - - let machine = Cylon::compile(rules); - assert_eq!(false, machine.allow("/")); - assert_eq!(true, machine.allow("/a")); - assert_eq!(true, machine.allow("/a/b")); - assert_eq!(true, machine.allow("/a")); - assert_eq!(true, machine.allow("/abc")); - assert_eq!(true, machine.allow("/abc/def")); - assert_eq!(true, machine.allow("/b")); - assert_eq!(true, machine.allow("/b/c")); - } - - #[test] - fn test_allow_match_any() { - let rules = vec![ - Rule::Allow("/"), - Rule::Disallow("/secret/*.txt"), - Rule::Disallow("/private/*"), - ]; - - let machine = Cylon::compile(rules); - assert_eq!(true, machine.allow("/")); - assert_eq!(true, machine.allow("/abc")); - assert_eq!(false, machine.allow("/secret/abc.txt")); - assert_eq!(false, machine.allow("/secret/123.txt")); - assert_eq!(true, machine.allow("/secret/abc.csv")); - assert_eq!(true, machine.allow("/secret/123.csv")); - assert_eq!(false, machine.allow("/private/abc.txt")); - assert_eq!(false, machine.allow("/private/123.txt")); - assert_eq!(false, machine.allow("/private/abc.csv")); - assert_eq!(false, machine.allow("/private/123.csv")); - } - - #[test] - fn test_allow_match_eow() { - let rules = vec![ - Rule::Allow("/"), - Rule::Disallow("/ignore$"), - Rule::Disallow("/foo$bar"), - ]; - - let machine = Cylon::compile(rules); - assert_eq!(true, machine.allow("/")); - assert_eq!(true, machine.allow("/abc")); - assert_eq!(false, machine.allow("/ignore")); - assert_eq!(true, machine.allow("/ignoreabc")); - assert_eq!(true, machine.allow("/ignore/abc")); - // These are technically undefined, and no behavior - // is guaranteed since the rule is malformed. However - // it is safer to accept them rather than reject them. - assert_eq!(true, machine.allow("/foo")); - assert_eq!(true, machine.allow("/foo$bar")); - } - - #[test] - fn test_allow_more_complicated() { - let rules = vec![ - Rule::Allow("/"), - Rule::Disallow("/a$"), - Rule::Disallow("/abc"), - Rule::Allow("/abc/*"), - Rule::Disallow("/foo/bar"), - Rule::Allow("/*/bar"), - Rule::Disallow("/www/*/images"), - Rule::Allow("/www/public/images"), - ]; - - let machine = Cylon::compile(rules); - assert_eq!(true, machine.allow("/")); - assert_eq!(true, machine.allow("/directory")); - assert_eq!(false, machine.allow("/a")); - assert_eq!(true, machine.allow("/ab")); - assert_eq!(false, machine.allow("/abc")); - assert_eq!(true, machine.allow("/abc/123")); - assert_eq!(true, machine.allow("/foo")); - assert_eq!(true, machine.allow("/foobar")); - assert_eq!(false, machine.allow("/foo/bar")); - assert_eq!(false, machine.allow("/foo/bar/baz")); - assert_eq!(true, machine.allow("/baz/bar")); - assert_eq!(false, machine.allow("/www/cat/images")); - assert_eq!(true, machine.allow("/www/public/images")); - } - - #[test] - fn test_matches() { - // Test cases from: - // https://developers.google.com/search/reference/robots_txt#group-member-rules - - let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish")]); - assert_eq!(true, machine.allow("/fish")); - assert_eq!(true, machine.allow("/fish.html")); - assert_eq!(true, machine.allow("/fish/salmon.html")); - assert_eq!(true, machine.allow("/fishheads.html")); - assert_eq!(true, machine.allow("/fishheads/yummy.html")); - assert_eq!(true, machine.allow("/fish.php?id=anything")); - assert_eq!(false, machine.allow("/Fish.asp")); - assert_eq!(false, machine.allow("/catfish")); - assert_eq!(false, machine.allow("/?id=fish")); - - let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*")]); - assert_eq!(true, machine.allow("/fish")); - assert_eq!(true, machine.allow("/fish.html")); - assert_eq!(true, machine.allow("/fish/salmon.html")); - assert_eq!(true, machine.allow("/fishheads.html")); - assert_eq!(true, machine.allow("/fishheads/yummy.html")); - assert_eq!(true, machine.allow("/fish.php?id=anything")); - assert_eq!(false, machine.allow("/Fish.asp")); - assert_eq!(false, machine.allow("/catfish")); - assert_eq!(false, machine.allow("/?id=fish")); - - let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish/")]); - assert_eq!(true, machine.allow("/fish/")); - assert_eq!(true, machine.allow("/fish/?id=anything")); - assert_eq!(true, machine.allow("/fish/salmon.htm")); - assert_eq!(false, machine.allow("/fish")); - assert_eq!(false, machine.allow("/fish.html")); - assert_eq!(false, machine.allow("/Fish/Salmon.asp")); - - let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php")]); - assert_eq!(true, machine.allow("/filename.php")); - assert_eq!(true, machine.allow("/folder/filename.php")); - assert_eq!(true, machine.allow("/folder/filename.php?parameters")); - assert_eq!(true, machine.allow("/folder/any.php.file.html")); - assert_eq!(true, machine.allow("/filename.php/")); - assert_eq!(false, machine.allow("/")); - assert_eq!(false, machine.allow("/windows.PHP")); - - let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php$")]); - assert_eq!(true, machine.allow("/filename.php")); - assert_eq!(true, machine.allow("/folder/filename.php")); - assert_eq!(false, machine.allow("/filename.php?parameters")); - assert_eq!(false, machine.allow("/filename.php/")); - assert_eq!(false, machine.allow("/filename.php5")); - assert_eq!(false, machine.allow("/windows.PHP")); - - let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*.php")]); - assert_eq!(true, machine.allow("/fish.php")); - assert_eq!(true, machine.allow("/fishheads/catfish.php?parameters")); - assert_eq!(false, machine.allow("/Fish.PHP")); - } -} +#[cfg(feature = "crawl-delay")] +use std::cmp::Ordering; + +use serde_derive::{Deserialize, Serialize}; + +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum Rule<'a> { + Allow(&'a str), + Disallow(&'a str), + Delay(&'a str), +} + +impl<'a> Rule<'a> { + fn inner(&self) -> &str { + match self { + Rule::Allow(inner) => inner, + Rule::Disallow(inner) => inner, + Rule::Delay(inner) => inner, + } + } +} + +#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)] +enum Edge { + MatchChar(char), + MatchAny, + MatchEow, +} + +#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)] +struct Transition(Edge, usize); + +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +enum State { + Allow, + Disallow, + Delay, + Intermediate, +} + +/// A Cylon is a DFA that recognizes rules from a compiled robots.txt +/// file. By providing it a URL path, it can decide whether or not +/// the robots file that compiled it allows or disallows that path in +/// roughly O(n) time, where n is the length of the path. +#[derive(Debug, Serialize, Deserialize)] +pub struct Cylon { + states: Vec, + transitions: Vec>, + #[cfg(feature = "crawl-delay")] + delay: Option, +} + +impl Cylon { + #[cfg(feature = "crawl-delay")] + pub fn delay(&self) -> Option { + self.delay + } + + /// Match whether the rules allow or disallow the target path. + pub fn allow(&self, path: &str) -> bool { + match self.states[self.state(path)] { + State::Allow => true, + State::Disallow => false, + // Intermediate states are not preserved in the DFA + State::Intermediate | State::Delay => unreachable!(), + } + } + + fn state(&self, path: &str) -> usize { + let state = path.chars().fold(2, |state, path_char| { + let t = &self.transitions[state]; + t.iter() + .rev() + // Pick the last transition to always prioritize MatchChar + // over MatchAny (which will always be the first transition.) + .find(|transition| match transition { + Transition(Edge::MatchAny, ..) => true, + Transition(Edge::MatchEow, ..) => false, + Transition(Edge::MatchChar(edge_char), ..) => *edge_char == path_char, + }) + .map(|Transition(.., next_state)| *next_state) + // We are guaranteed at least one matching state because of + // the way the DFA is constructed. + .unwrap() + }); + + // Follow the EoW transition, if necessary + let t = &self.transitions[state]; + t + .iter() + .rev() + .find(|transition| match transition { + Transition(Edge::MatchEow, ..) => true, + Transition(Edge::MatchAny, ..) => true, + _ => false, + }) + .map(|Transition(.., next_state)| *next_state) + .unwrap_or(state) + } + + /// Compile a machine from a list of rules. + pub fn compile(mut rules: Vec) -> Self { + // This algorithm constructs a DFA by doing BFS over the prefix tree of + // paths in the provided list of rules. However, for performance reasons + // it does not actually build a tree structure. (Vecs have better + // cache-locality by avoiding random memory access.) + + let mut transitions: Vec> = vec![ + vec![Transition(Edge::MatchAny, 0)], + vec![Transition(Edge::MatchAny, 1)], + ]; + let mut states: Vec = vec![State::Allow, State::Disallow]; + + rules.sort_by(|a, b| Ord::cmp(a.inner(), b.inner())); + + let mut queue = vec![("", 0, 0, State::Intermediate)]; + while !queue.is_empty() { + // parent_prefix is the "parent node" in the prefix tree. We are + // going to visit its children by filtering from the list of + // paths only the paths that start with the parent_prefix. + // wildcard_state is a node to jump to when an unmatched character + // is encountered. This is usually a node higher up in the tree + // that can match any character legally, but is also a prefix + // (read: ancestor) of the current node. + let (parent_prefix, mut wildcard_state, parent_state, state) = queue.remove(0); + let last_char = parent_prefix.chars().last(); + + wildcard_state = match state { + State::Allow => 0, + State::Disallow if last_char == Some('$') => wildcard_state, + State::Disallow => 1, + State::Delay => 1, + State::Intermediate => wildcard_state, + }; + + let mut t = match last_char { + Some('$') => { + // The EOW character cannot match anything else + vec![Transition(Edge::MatchAny, wildcard_state)] + } + Some('*') => { + // The wildcard character overrides the wildcard state + vec![Transition(Edge::MatchAny, transitions.len())] + } + _ => { + // Every other state has a self-loop that matches anything + vec![Transition(Edge::MatchAny, wildcard_state)] + } + }; + + let mut curr_prefix = ""; + rules + .iter() + .map(Rule::inner) + .zip(&rules) + .filter(|(path, _)| (*path).starts_with(parent_prefix)) + .filter(|(path, _)| (*path) != parent_prefix) + .for_each(|(path, rule)| { + let child_prefix = &path[0..parent_prefix.len() + 1]; + if curr_prefix == child_prefix { + // We only want to visit a child node once, but + // many rules might have the same child_prefix, so + // we skip the duplicates after the first time + // we see a prefix. (This could be a filter(), but + // it's a bit hard to encode earlier in the chain.) + return; + } + curr_prefix = child_prefix; + + let eow = child_prefix == path; + let state = match (rule, eow) { + (Rule::Allow(..), true) => State::Allow, + (Rule::Disallow(..), true) => State::Disallow, + (Rule::Delay(..), true) => State::Delay, + _ => State::Intermediate, + }; + + queue.push((child_prefix, wildcard_state, transitions.len(), state)); + + // NB: we can predict what state index the child + // will have before it's even pushed onto the state vec. + let child_index = transitions.len() + queue.len(); + let edge_char = child_prefix.chars().last().unwrap(); + let transition = Transition( + match edge_char { + '*' => Edge::MatchAny, + '$' => Edge::MatchEow, + c => Edge::MatchChar(c), + }, + child_index, + ); + + // Add transitions from the parent state to the child state + // so that the wildcard character matches are optional. + if last_char == Some('*') { + let parent_t = &mut transitions[parent_state]; + parent_t.push(transition); + } + + t.push(transition); + }); + + states.push(match state { + State::Allow | State::Disallow | State::Delay => state, + State::Intermediate => states[wildcard_state], + }); + transitions.push(t); + } + + #[cfg(feature = "crawl-delay")] + { + let mut delays: Vec> = rules.iter().filter(|rule| { + match rule { + Rule::Delay(_) => true, + _ => false + } + }).map(|r| { + r.inner().parse::().ok() + }).collect(); + delays.sort_unstable_by(|a, b| { + match (a, b) { + (None, Some(_)) => Ordering::Greater, + (Some(_), None) => Ordering::Less, + (None, None) => Ordering::Equal, + (Some(aa), Some(bb)) => aa.cmp(bb) + + } + }); + Self { + delay: *delays.get(0).unwrap_or(&None), + states, + transitions, + } + } + + #[cfg(not(feature = "crawl-delay"))] + Self { + states, + transitions, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + macro_rules! t { + ('*' => $x:expr) => { + Transition(Edge::MatchAny, $x) + }; + ('$' => $x:expr) => { + Transition(Edge::MatchEow, $x) + }; + ($x:expr => $y:expr) => { + Transition(Edge::MatchChar($x), $y) + }; + } + + #[test] + fn test_compile() { + let rules = vec![ + Rule::Disallow("/"), + Rule::Allow("/a"), + Rule::Allow("/abc"), + Rule::Allow("/b"), + ]; + + let expect_transitions = vec![ + vec![t!('*' => 0)], + vec![t!('*' => 1)], + vec![t!('*' => 0), t!('/' => 3)], // "" + vec![t!('*' => 1), t!('a' => 4), t!('b' => 5)], // "/" + vec![t!('*' => 0), t!('b' => 6)], // "/a" + vec![t!('*' => 0)], // "/b" + vec![t!('*' => 0), t!('c' => 7)], // "/ab" + vec![t!('*' => 0)], // "/abc" + ]; + + let expect_states = vec![ + State::Allow, + State::Disallow, + State::Allow, + State::Disallow, + State::Allow, + State::Allow, + State::Allow, + State::Allow, + ]; + + let actual = Cylon::compile(rules); + assert_eq!(actual.transitions, expect_transitions); + assert_eq!(actual.states, expect_states); + } + + #[test] + fn test_compile_with_wildcard() { + let rules = vec![Rule::Disallow("/"), Rule::Allow("/a"), Rule::Allow("/*.b")]; + + let expect_transitions = vec![ + vec![t!('*' => 0)], + vec![t!('*' => 1)], + vec![t!('*' => 0), t!('/' => 3)], // "" + vec![t!('*' => 1), t!('*' => 4), t!('a' => 5), t!('.' => 6)], // "/" + vec![t!('*' => 4), t!('.' => 6)], // "/*" + vec![t!('*' => 0)], // "/a" + vec![t!('*' => 1), t!('b' => 7)], // "/*." + vec![t!('*' => 0)], // "/*.b" + ]; + + let expect_states = vec![ + State::Allow, + State::Disallow, + State::Allow, + State::Disallow, + State::Disallow, + State::Allow, + State::Disallow, + State::Allow, + ]; + + let actual = Cylon::compile(rules); + assert_eq!(actual.transitions, expect_transitions); + assert_eq!(actual.states, expect_states); + } + + #[test] + fn test_compile_tricky_wildcard() { + let rules = vec![Rule::Disallow("/"), Rule::Allow("/*.")]; + + let expect_transitions = vec![ + vec![t!('*' => 0)], + vec![t!('*' => 1)], + vec![t!('*' => 0), t!('/' => 3)], // "" + vec![t!('*' => 1), t!('*' => 4), t!('.' => 5)], // "/" + vec![t!('*' => 4), t!('.' => 5)], // "/*" + vec![t!('*' => 0)], // "/*." + ]; + + let expect_states = vec![ + State::Allow, + State::Disallow, + State::Allow, + State::Disallow, + State::Disallow, + State::Allow, + ]; + + let actual = Cylon::compile(rules); + assert_eq!(actual.transitions, expect_transitions); + assert_eq!(actual.states, expect_states); + } + + #[test] + fn test_compile_with_eow() { + let rules = vec![ + Rule::Allow("/"), + Rule::Disallow("/a$"), + // Note that this rule is nonsensical. It will compile, but + // no guarantees are made as to how it's matched. Rules should + // use url-encoded strings to escape $. + Rule::Disallow("/x$y"), + ]; + + let expect_transitions = vec![ + vec![t!('*' => 0)], + vec![t!('*' => 1)], + vec![t!('*' => 0), t!('/' => 3)], // "" + vec![t!('*' => 0), t!('a' => 4), t!('x' => 5)], // "/" + vec![t!('*' => 0), t!('$' => 6)], // "/a" + vec![t!('*' => 0), t!('$' => 7)], // "/x" + vec![t!('*' => 0)], // "/a$" + vec![t!('*' => 0), t!('y' => 8)], // "/x$" + vec![t!('*' => 1)], // "/x$y" + ]; + + let expect_states = vec![ + State::Allow, + State::Disallow, + State::Allow, + State::Allow, + State::Allow, + State::Allow, + State::Disallow, + State::Allow, + State::Disallow, + ]; + + let actual = Cylon::compile(rules); + assert_eq!(actual.transitions, expect_transitions); + assert_eq!(actual.states, expect_states); + } + + #[test] + fn test_allow() { + let rules = vec![ + Rule::Disallow("/"), + Rule::Allow("/a"), + Rule::Allow("/abc"), + Rule::Allow("/b"), + ]; + + let machine = Cylon::compile(rules); + assert_eq!(false, machine.allow("/")); + assert_eq!(true, machine.allow("/a")); + assert_eq!(true, machine.allow("/a/b")); + assert_eq!(true, machine.allow("/a")); + assert_eq!(true, machine.allow("/abc")); + assert_eq!(true, machine.allow("/abc/def")); + assert_eq!(true, machine.allow("/b")); + assert_eq!(true, machine.allow("/b/c")); + } + + #[test] + fn test_allow_match_any() { + let rules = vec![ + Rule::Allow("/"), + Rule::Disallow("/secret/*.txt"), + Rule::Disallow("/private/*"), + ]; + + let machine = Cylon::compile(rules); + assert_eq!(true, machine.allow("/")); + assert_eq!(true, machine.allow("/abc")); + assert_eq!(false, machine.allow("/secret/abc.txt")); + assert_eq!(false, machine.allow("/secret/123.txt")); + assert_eq!(true, machine.allow("/secret/abc.csv")); + assert_eq!(true, machine.allow("/secret/123.csv")); + assert_eq!(false, machine.allow("/private/abc.txt")); + assert_eq!(false, machine.allow("/private/123.txt")); + assert_eq!(false, machine.allow("/private/abc.csv")); + assert_eq!(false, machine.allow("/private/123.csv")); + } + + #[test] + fn test_allow_match_eow() { + let rules = vec![ + Rule::Allow("/"), + Rule::Disallow("/ignore$"), + Rule::Disallow("/foo$bar"), + ]; + + let machine = Cylon::compile(rules); + assert_eq!(true, machine.allow("/")); + assert_eq!(true, machine.allow("/abc")); + assert_eq!(false, machine.allow("/ignore")); + assert_eq!(true, machine.allow("/ignoreabc")); + assert_eq!(true, machine.allow("/ignore/abc")); + // These are technically undefined, and no behavior + // is guaranteed since the rule is malformed. However + // it is safer to accept them rather than reject them. + assert_eq!(true, machine.allow("/foo")); + assert_eq!(true, machine.allow("/foo$bar")); + } + + #[test] + fn test_allow_more_complicated() { + let rules = vec![ + Rule::Allow("/"), + Rule::Disallow("/a$"), + Rule::Disallow("/abc"), + Rule::Allow("/abc/*"), + Rule::Disallow("/foo/bar"), + Rule::Allow("/*/bar"), + Rule::Disallow("/www/*/images"), + Rule::Allow("/www/public/images"), + ]; + + let machine = Cylon::compile(rules); + assert_eq!(true, machine.allow("/")); + assert_eq!(true, machine.allow("/directory")); + assert_eq!(false, machine.allow("/a")); + assert_eq!(true, machine.allow("/ab")); + assert_eq!(false, machine.allow("/abc")); + assert_eq!(true, machine.allow("/abc/123")); + assert_eq!(true, machine.allow("/foo")); + assert_eq!(true, machine.allow("/foobar")); + assert_eq!(false, machine.allow("/foo/bar")); + assert_eq!(false, machine.allow("/foo/bar/baz")); + assert_eq!(true, machine.allow("/baz/bar")); + assert_eq!(false, machine.allow("/www/cat/images")); + assert_eq!(true, machine.allow("/www/public/images")); + } + + #[test] + fn test_matches() { + // Test cases from: + // https://developers.google.com/search/reference/robots_txt#group-member-rules + + let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish")]); + assert_eq!(true, machine.allow("/fish")); + assert_eq!(true, machine.allow("/fish.html")); + assert_eq!(true, machine.allow("/fish/salmon.html")); + assert_eq!(true, machine.allow("/fishheads.html")); + assert_eq!(true, machine.allow("/fishheads/yummy.html")); + assert_eq!(true, machine.allow("/fish.php?id=anything")); + assert_eq!(false, machine.allow("/Fish.asp")); + assert_eq!(false, machine.allow("/catfish")); + assert_eq!(false, machine.allow("/?id=fish")); + + let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*")]); + assert_eq!(true, machine.allow("/fish")); + assert_eq!(true, machine.allow("/fish.html")); + assert_eq!(true, machine.allow("/fish/salmon.html")); + assert_eq!(true, machine.allow("/fishheads.html")); + assert_eq!(true, machine.allow("/fishheads/yummy.html")); + assert_eq!(true, machine.allow("/fish.php?id=anything")); + assert_eq!(false, machine.allow("/Fish.asp")); + assert_eq!(false, machine.allow("/catfish")); + assert_eq!(false, machine.allow("/?id=fish")); + + let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish/")]); + assert_eq!(true, machine.allow("/fish/")); + assert_eq!(true, machine.allow("/fish/?id=anything")); + assert_eq!(true, machine.allow("/fish/salmon.htm")); + assert_eq!(false, machine.allow("/fish")); + assert_eq!(false, machine.allow("/fish.html")); + assert_eq!(false, machine.allow("/Fish/Salmon.asp")); + + let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php")]); + assert_eq!(true, machine.allow("/filename.php")); + assert_eq!(true, machine.allow("/folder/filename.php")); + assert_eq!(true, machine.allow("/folder/filename.php?parameters")); + assert_eq!(true, machine.allow("/folder/any.php.file.html")); + assert_eq!(true, machine.allow("/filename.php/")); + assert_eq!(false, machine.allow("/")); + assert_eq!(false, machine.allow("/windows.PHP")); + + let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php$")]); + assert_eq!(true, machine.allow("/filename.php")); + assert_eq!(true, machine.allow("/folder/filename.php")); + assert_eq!(false, machine.allow("/filename.php?parameters")); + assert_eq!(false, machine.allow("/filename.php/")); + assert_eq!(false, machine.allow("/filename.php5")); + assert_eq!(false, machine.allow("/windows.PHP")); + + let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*.php")]); + assert_eq!(true, machine.allow("/fish.php")); + assert_eq!(true, machine.allow("/fishheads/catfish.php?parameters")); + assert_eq!(false, machine.allow("/Fish.PHP")); + } +} diff --git a/src/parse.rs b/src/parse.rs index 1bfb750..b1f7cf6 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -1,443 +1,443 @@ -use super::dfa::{Cylon, Rule}; -use futures_util::{ - io::{AsyncBufRead, AsyncRead, BufReader, Result}, - AsyncBufReadExt, -}; -use serde_derive::{Deserialize, Serialize}; -const UA_PREFIX: &str = "user-agent:"; -const DELAY_PREFIX: &str = "crawl-delay:"; -const ALLOW_PREFIX: &str = "allow:"; -const DISALLOW_PREFIX: &str = "disallow:"; - -#[derive(Debug, PartialEq, Clone)] -enum ParsedRule { - Allow(String), - Disallow(String), - Delay(String), -} - -impl<'a> Into> for &'a ParsedRule { - fn into(self) -> Rule<'a> { - match self { - ParsedRule::Allow(path) => Rule::Allow(&path[..]), - ParsedRule::Disallow(path) => Rule::Disallow(&path[..]), - ParsedRule::Delay(delay) => Rule::Delay(delay), - } - } -} - -#[derive(Debug, PartialEq)] -enum ParsedLine { - UserAgent(String), - Rule(ParsedRule), - Nothing, -} - -/// A compiler takes an input robots.txt file and outputs a compiled Cylon, -/// which can be used to efficiently match a large number of paths against -/// the robots.txt file. -#[derive(Debug, Serialize, Deserialize)] -pub struct Compiler { - user_agent: String, -} - -impl Compiler { - /// Build a new compiler that parses rules for the given user agent from - /// a robots.txt file. - pub fn new(user_agent: &str) -> Self { - Self { - user_agent: user_agent.to_lowercase(), - } - } - - /// Parse an input robots.txt file into a Cylon that can recognize - /// whether or not a path matches the rules for the Parser's user agent. - pub async fn compile(&self, file: R) -> Result { - let reader = BufReader::new(file); - let mut agent = String::new(); - let mut rules: Vec = vec![]; - let mut group_reader = GroupReader::new(reader); - - // find the most specific matching group in the robots file - while let Some(agents) = group_reader.next_header().await? { - let matching_agent = agents.iter().find(|a| { - let matches = &a[..] == "*" || self.user_agent.contains(*a); - let more_specific = a.len() > agent.len(); - matches && more_specific - }); - - if let Some(matching_agent) = matching_agent { - agent = matching_agent.clone(); - rules = group_reader.next_rules().await?; - } - } - - let rules = rules.iter().map(|r| r.into()).collect(); - Ok(Cylon::compile(rules)) - } -} - -struct GroupReader { - parsing_agents: bool, - agents: Vec, - rules: Vec, - reader: R, -} - -impl GroupReader { - fn new(reader: R) -> Self { - Self { - parsing_agents: true, - agents: vec![], - rules: vec![], - reader, - } - } - - /// Scan forward until the next group header defined by one or more - /// user agent lines. This lets us optimize the lines we need to copy - /// so we can skip over groups that don't match the desired user agent. - async fn next_header(&mut self) -> Result>> { - let mut buf = String::new(); - while self.reader.read_line(&mut buf).await? != 0 { - let parsed_line = parse_line(buf.clone()); - - match parsed_line { - ParsedLine::UserAgent(ua) if self.parsing_agents => { - self.agents.push(ua); - } - ParsedLine::UserAgent(ua) => { - self.agents = vec![ua]; - self.rules = vec![]; - self.parsing_agents = true; - } - ParsedLine::Rule(rule) if self.parsing_agents => { - // Preserve the rule in case we need it in next_rules(). - self.rules.push(rule); - self.parsing_agents = false; - break; - } - // Skip over lines until we get to the next user agent. - ParsedLine::Rule(..) => (), - ParsedLine::Nothing => (), - } - - buf.clear(); - } - - let agents = self.agents.clone(); - self.agents = vec![]; - - if agents.is_empty() { - return Ok(None); - } - - Ok(Some(agents)) - } - - async fn next_rules(&mut self) -> Result> { - let mut buf = String::new(); - while self.reader.read_line(&mut buf).await? != 0 { - let parsed_line = parse_line(buf.clone()); - - match parsed_line { - ParsedLine::Rule(rule) => { - self.rules.push(rule); - self.parsing_agents = false; - } - ParsedLine::UserAgent(ua) if !self.parsing_agents => { - // Preserve the agent in case we need it in next_agents(). - self.agents.push(ua); - self.parsing_agents = true; - break; - } - // Skip over lines until we get to the next rule. - ParsedLine::UserAgent(..) => (), - ParsedLine::Nothing => (), - } - - buf.clear(); - } - - let rules = self.rules.clone(); - self.rules = vec![]; - Ok(rules) - } -} - -fn parse_line(line: String) -> ParsedLine { - let line = strip_comments(&line[..]).trim(); - - // This tries to parse lines roughly in order of most frequent kind to - // least frequent kind in order to minimize CPU cycles on average. - parse_disallow(line) - .map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into()))) - .or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase()))) - .or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into())))) - .or_else(|| parse_delay(line).map(|s| ParsedLine::Rule(ParsedRule::Delay(s.into())))) - .unwrap_or(ParsedLine::Nothing) -} - -fn strip_comments(line: &str) -> &str { - if let Some(before) = line.split('#').next() { - return before; - } - return line; -} - -fn parse_user_agent(line: &str) -> Option<&str> { - if line.len() < UA_PREFIX.len() { - return None; - } - let prefix = &line[..UA_PREFIX.len()].to_ascii_lowercase(); - let suffix = &line[UA_PREFIX.len()..]; - - if prefix == UA_PREFIX { - Some(suffix.trim()) - } else { - None - } -} - -fn parse_delay(line: &str) -> Option<&str> { - if line.len() < DELAY_PREFIX.len() { - return None; - } - - let prefix = &line[..DELAY_PREFIX.len()].to_ascii_lowercase(); - let suffix = &line[DELAY_PREFIX.len()..]; - if prefix == DELAY_PREFIX { - Some(suffix.trim()) - } else { - None - } -} - -fn parse_allow(line: &str) -> Option<&str> { - if line.len() < ALLOW_PREFIX.len() { - return None; - } - let prefix = &line[..ALLOW_PREFIX.len()].to_ascii_lowercase(); - let suffix = &line[ALLOW_PREFIX.len()..]; - - if prefix == ALLOW_PREFIX { - Some(suffix.trim()) - } else { - None - } -} - -fn parse_disallow(line: &str) -> Option<&str> { - if line.len() < DISALLOW_PREFIX.len() { - return None; - } - let prefix = &line[..DISALLOW_PREFIX.len()].to_ascii_lowercase(); - let suffix = &line[DISALLOW_PREFIX.len()..]; - - if prefix == DISALLOW_PREFIX { - Some(suffix.trim()) - } else { - None - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_parse_allow() { - let test_cases = vec![ - ("Allow: /", "/"), - ("allow: / # Root with comment", "/"), - ("ALLOW: /abc/def ", "/abc/def"), - ("Allow: /abc/def ", "/abc/def"), - (" Allow: /*/foo", "/*/foo"), - ]; - - for (i, o) in test_cases { - assert_eq!( - parse_line(i.into()), - ParsedLine::Rule(ParsedRule::Allow(o.into())) - ); - } - } - - #[test] - fn test_parse_disallow() { - let test_cases = vec![ - ("Disallow: /", "/"), - ("disallow: / # Root with comment", "/"), - ("DISALLOW: /abc/def ", "/abc/def"), - ("Disallow: /abc/def ", "/abc/def"), - (" Disallow: /*/foo", "/*/foo"), - ]; - - for (i, o) in test_cases { - assert_eq!( - parse_line(i.into()), - ParsedLine::Rule(ParsedRule::Disallow(o.into())) - ); - } - } - - #[test] - fn test_parse_user_agent() { - let test_cases = vec![ - ("User-agent: *", "*"), - ("user-agent: ImABot # User agent with comment", "imabot"), - (" USER-AGENT: ImABot ", "imabot"), - ]; - - for (i, o) in test_cases { - assert_eq!(parse_line(i.into()), ParsedLine::UserAgent(o.into())); - } - } - - #[test] - fn test_parse_nothing() { - let test_cases = vec![ - "Useragent: *", - "# Comment", - "", - " ", - "\t", - "alow: /", - "disalow: /", - ]; - - for i in test_cases { - assert_eq!(parse_line(i.into()), ParsedLine::Nothing); - } - } - - #[test] - #[cfg(feature = "crawl-delay")] - fn test_crawl_delay() { - tokio_test::block_on(async { - let example_robots = r#" - User-agent: jones-bot - Disallow: / - Crawl-Delay: 30 - - User-agent: foobar - Crawl-Delay: 60 - - User-agent: googlebot - Allow: / - - User-agent: barfoo - Crawl-Delay: 60 - Crawl-Delay: 20 - "# - .as_bytes(); - - let parser = Compiler::new("foobar"); - let foobar_machine = parser.compile(example_robots).await.unwrap(); - - let parser = Compiler::new("googlebot"); - let googlebot_machine = parser.compile(example_robots).await.unwrap(); - - let parser = Compiler::new("barfoo"); - let barfoo_machine = parser.compile(example_robots).await.unwrap(); - - let parser = Compiler::new("jones-bot"); - let jonesbot_machine = parser.compile(example_robots).await.unwrap(); - - assert_eq!(Some(60), foobar_machine.delay()); - assert_eq!(Some(20), barfoo_machine.delay()); - assert_eq!(Some(30), jonesbot_machine.delay()); - assert_eq!(None, googlebot_machine.delay()); - }); - } - - #[test] - fn test_end_to_end() { - tokio_test::block_on(async { - let example_robots = r#" - User-agent: jones-bot - Disallow: / - - User-agent: foo - Allow: / - Crawl-Delay: 20 - - User-agent: jones - User-agent: foobar - Allow: / - - User-agent: * - Disallow: / - "# - .as_bytes(); - - let parser = Compiler::new("foobar"); - let foobar_machine = parser.compile(example_robots).await.unwrap(); - - let parser = Compiler::new("jones-bot"); - let jonesbot_machine = parser.compile(example_robots).await.unwrap(); - - let parser = Compiler::new("imabot"); - let imabot_machine = parser.compile(example_robots).await.unwrap(); - - let parser = Compiler::new("abc"); - let abc_machine = parser.compile(example_robots).await.unwrap(); - - assert_eq!(true, foobar_machine.allow("/index.html")); - assert_eq!(false, jonesbot_machine.allow("/index.html")); - assert_eq!(false, imabot_machine.allow("/index.html")); - assert_eq!(false, abc_machine.allow("/index.html")); - }); - } - - #[test] - fn test_invalid_1() { - tokio_test::block_on(async { - let example_robots = r#" - # Instead of treating this as an error, we'll just consider - # this behavior undefined. - Allow: / - - User-agent: jones - User-agent: foobar - Disallow: / - "# - .as_bytes(); - - let parser = Compiler::new("foobar"); - let foobar_machine = parser.compile(example_robots).await.unwrap(); - - let parser = Compiler::new("imabot"); - let imabot_machine = parser.compile(example_robots).await.unwrap(); - - // Everything is allowed because next_header() returns None - assert_eq!(true, foobar_machine.allow("/index.html")); - assert_eq!(true, imabot_machine.allow("/index.html")); - }); - } - - #[test] - fn test_invalid_2() { - tokio_test::block_on(async { - let example_robots = r#" - User-agent: jones - User-agent: foobar - Disallow: / - - # Instead of treating this as an error, we consider this - # behavior undefined. - User-agent: imabot - "# - .as_bytes(); - - let parser = Compiler::new("foobar"); - let foobar_machine = parser.compile(example_robots).await.unwrap(); - - let parser = Compiler::new("imabot"); - let imabot_machine = parser.compile(example_robots).await.unwrap(); - - assert_eq!(false, foobar_machine.allow("/index.html")); - assert_eq!(true, imabot_machine.allow("/index.html")); - }); - } -} +use super::dfa::{Cylon, Rule}; +use futures_util::{ + io::{AsyncBufRead, AsyncRead, BufReader, Result}, + AsyncBufReadExt, +}; +use serde_derive::{Deserialize, Serialize}; +const UA_PREFIX: &str = "user-agent:"; +const DELAY_PREFIX: &str = "crawl-delay:"; +const ALLOW_PREFIX: &str = "allow:"; +const DISALLOW_PREFIX: &str = "disallow:"; + +#[derive(Debug, PartialEq, Clone)] +enum ParsedRule { + Allow(String), + Disallow(String), + Delay(String), +} + +impl<'a> Into> for &'a ParsedRule { + fn into(self) -> Rule<'a> { + match self { + ParsedRule::Allow(path) => Rule::Allow(&path[..]), + ParsedRule::Disallow(path) => Rule::Disallow(&path[..]), + ParsedRule::Delay(delay) => Rule::Delay(delay), + } + } +} + +#[derive(Debug, PartialEq)] +enum ParsedLine { + UserAgent(String), + Rule(ParsedRule), + Nothing, +} + +/// A compiler takes an input robots.txt file and outputs a compiled Cylon, +/// which can be used to efficiently match a large number of paths against +/// the robots.txt file. +#[derive(Debug, Serialize, Deserialize)] +pub struct Compiler { + user_agent: String, +} + +impl Compiler { + /// Build a new compiler that parses rules for the given user agent from + /// a robots.txt file. + pub fn new(user_agent: &str) -> Self { + Self { + user_agent: user_agent.to_lowercase(), + } + } + + /// Parse an input robots.txt file into a Cylon that can recognize + /// whether or not a path matches the rules for the Parser's user agent. + pub async fn compile(&self, file: R) -> Result { + let reader = BufReader::new(file); + let mut agent = String::new(); + let mut rules: Vec = vec![]; + let mut group_reader = GroupReader::new(reader); + + // find the most specific matching group in the robots file + while let Some(agents) = group_reader.next_header().await? { + let matching_agent = agents.iter().find(|a| { + let matches = &a[..] == "*" || self.user_agent.contains(*a); + let more_specific = a.len() > agent.len(); + matches && more_specific + }); + + if let Some(matching_agent) = matching_agent { + agent = matching_agent.clone(); + rules = group_reader.next_rules().await?; + } + } + + let rules = rules.iter().map(|r| r.into()).collect(); + Ok(Cylon::compile(rules)) + } +} + +struct GroupReader { + parsing_agents: bool, + agents: Vec, + rules: Vec, + reader: R, +} + +impl GroupReader { + fn new(reader: R) -> Self { + Self { + parsing_agents: true, + agents: vec![], + rules: vec![], + reader, + } + } + + /// Scan forward until the next group header defined by one or more + /// user agent lines. This lets us optimize the lines we need to copy + /// so we can skip over groups that don't match the desired user agent. + async fn next_header(&mut self) -> Result>> { + let mut buf = String::new(); + while self.reader.read_line(&mut buf).await? != 0 { + let parsed_line = parse_line(buf.clone()); + + match parsed_line { + ParsedLine::UserAgent(ua) if self.parsing_agents => { + self.agents.push(ua); + } + ParsedLine::UserAgent(ua) => { + self.agents = vec![ua]; + self.rules = vec![]; + self.parsing_agents = true; + } + ParsedLine::Rule(rule) if self.parsing_agents => { + // Preserve the rule in case we need it in next_rules(). + self.rules.push(rule); + self.parsing_agents = false; + break; + } + // Skip over lines until we get to the next user agent. + ParsedLine::Rule(..) => (), + ParsedLine::Nothing => (), + } + + buf.clear(); + } + + let agents = self.agents.clone(); + self.agents = vec![]; + + if agents.is_empty() { + return Ok(None); + } + + Ok(Some(agents)) + } + + async fn next_rules(&mut self) -> Result> { + let mut buf = String::new(); + while self.reader.read_line(&mut buf).await? != 0 { + let parsed_line = parse_line(buf.clone()); + + match parsed_line { + ParsedLine::Rule(rule) => { + self.rules.push(rule); + self.parsing_agents = false; + } + ParsedLine::UserAgent(ua) if !self.parsing_agents => { + // Preserve the agent in case we need it in next_agents(). + self.agents.push(ua); + self.parsing_agents = true; + break; + } + // Skip over lines until we get to the next rule. + ParsedLine::UserAgent(..) => (), + ParsedLine::Nothing => (), + } + + buf.clear(); + } + + let rules = self.rules.clone(); + self.rules = vec![]; + Ok(rules) + } +} + +fn parse_line(line: String) -> ParsedLine { + let line = strip_comments(&line[..]).trim(); + + // This tries to parse lines roughly in order of most frequent kind to + // least frequent kind in order to minimize CPU cycles on average. + parse_disallow(line) + .map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into()))) + .or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase()))) + .or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into())))) + .or_else(|| parse_delay(line).map(|s| ParsedLine::Rule(ParsedRule::Delay(s.into())))) + .unwrap_or(ParsedLine::Nothing) +} + +fn strip_comments(line: &str) -> &str { + if let Some(before) = line.split('#').next() { + return before; + } + return line; +} + +fn parse_user_agent(line: &str) -> Option<&str> { + if line.len() < UA_PREFIX.len() { + return None; + } + let prefix = &line[..UA_PREFIX.len()].to_ascii_lowercase(); + let suffix = &line[UA_PREFIX.len()..]; + + if prefix == UA_PREFIX { + Some(suffix.trim()) + } else { + None + } +} + +fn parse_delay(line: &str) -> Option<&str> { + if line.len() < DELAY_PREFIX.len() { + return None; + } + + let prefix = &line[..DELAY_PREFIX.len()].to_ascii_lowercase(); + let suffix = &line[DELAY_PREFIX.len()..]; + if prefix == DELAY_PREFIX { + Some(suffix.trim()) + } else { + None + } +} + +fn parse_allow(line: &str) -> Option<&str> { + if line.len() < ALLOW_PREFIX.len() { + return None; + } + let prefix = &line[..ALLOW_PREFIX.len()].to_ascii_lowercase(); + let suffix = &line[ALLOW_PREFIX.len()..]; + + if prefix == ALLOW_PREFIX { + Some(suffix.trim()) + } else { + None + } +} + +fn parse_disallow(line: &str) -> Option<&str> { + if line.len() < DISALLOW_PREFIX.len() { + return None; + } + let prefix = &line[..DISALLOW_PREFIX.len()].to_ascii_lowercase(); + let suffix = &line[DISALLOW_PREFIX.len()..]; + + if prefix == DISALLOW_PREFIX { + Some(suffix.trim()) + } else { + None + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_allow() { + let test_cases = vec![ + ("Allow: /", "/"), + ("allow: / # Root with comment", "/"), + ("ALLOW: /abc/def ", "/abc/def"), + ("Allow: /abc/def ", "/abc/def"), + (" Allow: /*/foo", "/*/foo"), + ]; + + for (i, o) in test_cases { + assert_eq!( + parse_line(i.into()), + ParsedLine::Rule(ParsedRule::Allow(o.into())) + ); + } + } + + #[test] + fn test_parse_disallow() { + let test_cases = vec![ + ("Disallow: /", "/"), + ("disallow: / # Root with comment", "/"), + ("DISALLOW: /abc/def ", "/abc/def"), + ("Disallow: /abc/def ", "/abc/def"), + (" Disallow: /*/foo", "/*/foo"), + ]; + + for (i, o) in test_cases { + assert_eq!( + parse_line(i.into()), + ParsedLine::Rule(ParsedRule::Disallow(o.into())) + ); + } + } + + #[test] + fn test_parse_user_agent() { + let test_cases = vec![ + ("User-agent: *", "*"), + ("user-agent: ImABot # User agent with comment", "imabot"), + (" USER-AGENT: ImABot ", "imabot"), + ]; + + for (i, o) in test_cases { + assert_eq!(parse_line(i.into()), ParsedLine::UserAgent(o.into())); + } + } + + #[test] + fn test_parse_nothing() { + let test_cases = vec![ + "Useragent: *", + "# Comment", + "", + " ", + "\t", + "alow: /", + "disalow: /", + ]; + + for i in test_cases { + assert_eq!(parse_line(i.into()), ParsedLine::Nothing); + } + } + + #[test] + #[cfg(feature = "crawl-delay")] + fn test_crawl_delay() { + tokio_test::block_on(async { + let example_robots = r#" + User-agent: jones-bot + Disallow: / + Crawl-Delay: 30 + + User-agent: foobar + Crawl-Delay: 60 + + User-agent: googlebot + Allow: / + + User-agent: barfoo + Crawl-Delay: 60 + Crawl-Delay: 20 + "# + .as_bytes(); + + let parser = Compiler::new("foobar"); + let foobar_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("googlebot"); + let googlebot_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("barfoo"); + let barfoo_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("jones-bot"); + let jonesbot_machine = parser.compile(example_robots).await.unwrap(); + + assert_eq!(Some(60), foobar_machine.delay()); + assert_eq!(Some(20), barfoo_machine.delay()); + assert_eq!(Some(30), jonesbot_machine.delay()); + assert_eq!(None, googlebot_machine.delay()); + }); + } + + #[test] + fn test_end_to_end() { + tokio_test::block_on(async { + let example_robots = r#" + User-agent: jones-bot + Disallow: / + + User-agent: foo + Allow: / + Crawl-Delay: 20 + + User-agent: jones + User-agent: foobar + Allow: / + + User-agent: * + Disallow: / + "# + .as_bytes(); + + let parser = Compiler::new("foobar"); + let foobar_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("jones-bot"); + let jonesbot_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("imabot"); + let imabot_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("abc"); + let abc_machine = parser.compile(example_robots).await.unwrap(); + + assert_eq!(true, foobar_machine.allow("/index.html")); + assert_eq!(false, jonesbot_machine.allow("/index.html")); + assert_eq!(false, imabot_machine.allow("/index.html")); + assert_eq!(false, abc_machine.allow("/index.html")); + }); + } + + #[test] + fn test_invalid_1() { + tokio_test::block_on(async { + let example_robots = r#" + # Instead of treating this as an error, we'll just consider + # this behavior undefined. + Allow: / + + User-agent: jones + User-agent: foobar + Disallow: / + "# + .as_bytes(); + + let parser = Compiler::new("foobar"); + let foobar_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("imabot"); + let imabot_machine = parser.compile(example_robots).await.unwrap(); + + // Everything is allowed because next_header() returns None + assert_eq!(true, foobar_machine.allow("/index.html")); + assert_eq!(true, imabot_machine.allow("/index.html")); + }); + } + + #[test] + fn test_invalid_2() { + tokio_test::block_on(async { + let example_robots = r#" + User-agent: jones + User-agent: foobar + Disallow: / + + # Instead of treating this as an error, we consider this + # behavior undefined. + User-agent: imabot + "# + .as_bytes(); + + let parser = Compiler::new("foobar"); + let foobar_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("imabot"); + let imabot_machine = parser.compile(example_robots).await.unwrap(); + + assert_eq!(false, foobar_machine.allow("/index.html")); + assert_eq!(true, imabot_machine.allow("/index.html")); + }); + } +} From e904e9197293215ce8017ef0abfa6a82848b209f Mon Sep 17 00:00:00 2001 From: "r.portalez" Date: Wed, 10 Mar 2021 16:53:14 +0100 Subject: [PATCH 5/6] more conditional compilation --- src/dfa.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/dfa.rs b/src/dfa.rs index da1e4e2..8b846f4 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -34,6 +34,7 @@ struct Transition(Edge, usize); enum State { Allow, Disallow, + #[cfg(feature = "crawl-delay")] Delay, Intermediate, } @@ -62,7 +63,10 @@ impl Cylon { State::Allow => true, State::Disallow => false, // Intermediate states are not preserved in the DFA + #[cfg(feature = "crawl-delay")] State::Intermediate | State::Delay => unreachable!(), + #[cfg(not(feature = "crawl-delay"))] + State::Intermediate => unreachable!(), } } @@ -129,6 +133,7 @@ impl Cylon { State::Allow => 0, State::Disallow if last_char == Some('$') => wildcard_state, State::Disallow => 1, + #[cfg(feature = "crawl-delay")] State::Delay => 1, State::Intermediate => wildcard_state, }; @@ -171,6 +176,7 @@ impl Cylon { let state = match (rule, eow) { (Rule::Allow(..), true) => State::Allow, (Rule::Disallow(..), true) => State::Disallow, + #[cfg(feature = "crawl-delay")] (Rule::Delay(..), true) => State::Delay, _ => State::Intermediate, }; @@ -201,7 +207,10 @@ impl Cylon { }); states.push(match state { + #[cfg(feature = "crawl-delay")] State::Allow | State::Disallow | State::Delay => state, + #[cfg(not(feature = "crawl-delay"))] + State::Allow | State::Disallow => state, State::Intermediate => states[wildcard_state], }); transitions.push(t); From f331be990fcc19c9b7436e85117e27b486815dfe Mon Sep 17 00:00:00 2001 From: "r.portalez" Date: Wed, 10 Mar 2021 16:57:37 +0100 Subject: [PATCH 6/6] completely isolate crawl-delay code into a feature --- src/dfa.rs | 2 ++ src/parse.rs | 17 +++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/dfa.rs b/src/dfa.rs index 8b846f4..7f46d54 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -7,6 +7,7 @@ use serde_derive::{Deserialize, Serialize}; pub enum Rule<'a> { Allow(&'a str), Disallow(&'a str), + #[cfg(feature = "crawl-delay")] Delay(&'a str), } @@ -15,6 +16,7 @@ impl<'a> Rule<'a> { match self { Rule::Allow(inner) => inner, Rule::Disallow(inner) => inner, + #[cfg(feature = "crawl-delay")] Rule::Delay(inner) => inner, } } diff --git a/src/parse.rs b/src/parse.rs index b1f7cf6..94b86da 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -5,6 +5,7 @@ use futures_util::{ }; use serde_derive::{Deserialize, Serialize}; const UA_PREFIX: &str = "user-agent:"; +#[cfg(feature = "crawl-delay")] const DELAY_PREFIX: &str = "crawl-delay:"; const ALLOW_PREFIX: &str = "allow:"; const DISALLOW_PREFIX: &str = "disallow:"; @@ -13,6 +14,7 @@ const DISALLOW_PREFIX: &str = "disallow:"; enum ParsedRule { Allow(String), Disallow(String), + #[cfg(feature = "crawl-delay")] Delay(String), } @@ -21,6 +23,7 @@ impl<'a> Into> for &'a ParsedRule { match self { ParsedRule::Allow(path) => Rule::Allow(&path[..]), ParsedRule::Disallow(path) => Rule::Disallow(&path[..]), + #[cfg(feature = "crawl-delay")] ParsedRule::Delay(delay) => Rule::Delay(delay), } } @@ -170,12 +173,21 @@ fn parse_line(line: String) -> ParsedLine { // This tries to parse lines roughly in order of most frequent kind to // least frequent kind in order to minimize CPU cycles on average. - parse_disallow(line) + + #[cfg(feature = "crawl-delay")] + return parse_disallow(line) .map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into()))) .or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase()))) .or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into())))) .or_else(|| parse_delay(line).map(|s| ParsedLine::Rule(ParsedRule::Delay(s.into())))) - .unwrap_or(ParsedLine::Nothing) + .unwrap_or(ParsedLine::Nothing); + + #[cfg(not(feature = "crawl-delay"))] + return parse_disallow(line) + .map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into()))) + .or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase()))) + .or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into())))) + .unwrap_or(ParsedLine::Nothing); } fn strip_comments(line: &str) -> &str { @@ -199,6 +211,7 @@ fn parse_user_agent(line: &str) -> Option<&str> { } } +#[cfg(feature = "crawl-delay")] fn parse_delay(line: &str) -> Option<&str> { if line.len() < DELAY_PREFIX.len() { return None;