diff --git a/Cargo.lock b/Cargo.lock index 0dc8578..3d6ca1d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -215,7 +215,7 @@ dependencies = [ [[package]] name = "cylon" -version = "0.1.3" +version = "0.2.0" dependencies = [ "criterion", "futures-util", diff --git a/Cargo.toml b/Cargo.toml index a8cff3d..acb71dd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "cylon" description = "An efficient compiler for robots.txt files" -version = "0.1.3" +version = "0.2.0" authors = ["Creston Bunch "] edition = "2018" diff --git a/src/dfa.rs b/src/dfa.rs index 299641c..ad54b6a 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -3,16 +3,19 @@ use std::cmp::Ordering; use serde_derive::{Deserialize, Serialize}; +const EOW_BYTE: u8 = 36; // '$' +const WILDCARD_BYTE: u8 = 42; // '*' + #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] pub enum Rule<'a> { - Allow(&'a str), - Disallow(&'a str), + Allow(&'a [u8]), + Disallow(&'a [u8]), #[cfg(feature = "crawl-delay")] - Delay(&'a str), + Delay(&'a [u8]), } impl<'a> Rule<'a> { - fn inner(&self) -> &str { + fn inner(&self) -> &[u8] { match self { Rule::Allow(inner) => inner, Rule::Disallow(inner) => inner, @@ -24,7 +27,7 @@ impl<'a> Rule<'a> { #[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)] enum Edge { - MatchChar(char), + MatchByte(u8), MatchAny, MatchEow, } @@ -60,8 +63,8 @@ impl Cylon { } /// Match whether the rules allow or disallow the target path. - pub fn allow(&self, path: &str) -> bool { - match self.states[self.state(path)] { + pub fn allow>(&self, path: T) -> bool { + match self.states[self.state(path.as_ref())] { State::Allow => true, State::Disallow => false, // Intermediate states are not preserved in the DFA @@ -72,8 +75,8 @@ impl Cylon { } } - fn state(&self, path: &str) -> usize { - let state = path.chars().fold(2, |state, path_char| { + fn state(&self, path: &[u8]) -> usize { + let state = path.into_iter().fold(2, |state, path_char| { let t = &self.transitions[state]; t.iter() .rev() @@ -82,7 +85,7 @@ impl Cylon { .find(|transition| match transition { Transition(Edge::MatchAny, ..) => true, Transition(Edge::MatchEow, ..) => false, - Transition(Edge::MatchChar(edge_char), ..) => *edge_char == path_char, + Transition(Edge::MatchByte(edge_char), ..) => edge_char == path_char, }) .map(|Transition(.., next_state)| *next_state) // We are guaranteed at least one matching state because of @@ -118,7 +121,7 @@ impl Cylon { rules.sort_by(|a, b| Ord::cmp(a.inner(), b.inner())); - let mut queue = vec![("", 0, 0, State::Intermediate)]; + let mut queue = vec![(b"" as &[u8], 0, 0, State::Intermediate)]; while !queue.is_empty() { // parent_prefix is the "parent node" in the prefix tree. We are // going to visit its children by filtering from the list of @@ -128,23 +131,23 @@ impl Cylon { // that can match any character legally, but is also a prefix // (read: ancestor) of the current node. let (parent_prefix, mut wildcard_state, parent_state, state) = queue.remove(0); - let last_char = parent_prefix.chars().last(); + let last_byte = parent_prefix.last(); wildcard_state = match state { State::Allow => 0, - State::Disallow if last_char == Some('$') => wildcard_state, + State::Disallow if last_byte == Some(&EOW_BYTE) => wildcard_state, State::Disallow => 1, #[cfg(feature = "crawl-delay")] State::Delay => 1, State::Intermediate => wildcard_state, }; - let mut t = match last_char { - Some('$') => { + let mut t = match last_byte { + Some(&EOW_BYTE) => { // The EOW character cannot match anything else vec![Transition(Edge::MatchAny, wildcard_state)] } - Some('*') => { + Some(&WILDCARD_BYTE) => { // The wildcard character overrides the wildcard state vec![Transition(Edge::MatchAny, transitions.len())] } @@ -154,7 +157,7 @@ impl Cylon { } }; - let mut curr_prefix = ""; + let mut curr_prefix: &[u8] = b""; rules .iter() .map(Rule::inner) @@ -187,19 +190,19 @@ impl Cylon { // NB: we can predict what state index the child // will have before it's even pushed onto the state vec. let child_index = transitions.len() + queue.len(); - let edge_char = child_prefix.chars().last().unwrap(); + let edge_char = child_prefix.last().unwrap(); let transition = Transition( - match edge_char { - '*' => Edge::MatchAny, - '$' => Edge::MatchEow, - c => Edge::MatchChar(c), + match *edge_char { + WILDCARD_BYTE => Edge::MatchAny, + EOW_BYTE => Edge::MatchEow, + c => Edge::MatchByte(c), }, child_index, ); // Add transitions from the parent state to the child state // so that the wildcard character matches are optional. - if last_char == Some('*') { + if last_byte == Some(&WILDCARD_BYTE) { let parent_t = &mut transitions[parent_state]; parent_t.push(transition); } @@ -225,7 +228,9 @@ impl Cylon { Rule::Delay(_) => true, _ => false, }) - .map(|r| r.inner().parse::().ok()) + .map(|r| r.inner()) + .flat_map(|r| std::str::from_utf8(r).ok()) + .map(|r| r.parse::().ok()) .collect(); delays.sort_unstable_by(|a, b| match (a, b) { (None, Some(_)) => Ordering::Greater, @@ -260,28 +265,55 @@ mod tests { Transition(Edge::MatchEow, $x) }; ($x:expr => $y:expr) => { - Transition(Edge::MatchChar($x), $y) + Transition(Edge::MatchByte($x), $y) + }; + } + + macro_rules! b { + ('.') => { + 46 + }; + ('/') => { + 47 + }; + ('a') => { + 97 + }; + ('b') => { + 98 + }; + ('c') => { + 99 + }; + ('d') => { + 100 + }; + ('x') => { + 120 + }; + ('y') => { + 121 }; } #[test] fn test_compile() { let rules = vec![ - Rule::Disallow("/"), - Rule::Allow("/a"), - Rule::Allow("/abc"), - Rule::Allow("/b"), + Rule::Disallow(b"/"), + Rule::Allow(b"/a"), + Rule::Allow(b"/abc"), + Rule::Allow(b"/b"), ]; let expect_transitions = vec![ vec![t!('*' => 0)], vec![t!('*' => 1)], - vec![t!('*' => 0), t!('/' => 3)], // "" - vec![t!('*' => 1), t!('a' => 4), t!('b' => 5)], // "/" - vec![t!('*' => 0), t!('b' => 6)], // "/a" - vec![t!('*' => 0)], // "/b" - vec![t!('*' => 0), t!('c' => 7)], // "/ab" - vec![t!('*' => 0)], // "/abc" + vec![t!('*' => 0), t!(b!('/') => 3)], // "" + vec![t!('*' => 1), t!(b!('a') => 4), t!(b!('b') => 5)], // "/" + vec![t!('*' => 0), t!(b!('b') => 6)], // "/a" + vec![t!('*' => 0)], // "/b" + vec![t!('*' => 0), t!(b!('c') => 7)], // "/ab" + vec![t!('*' => 0)], // "/abc" ]; let expect_states = vec![ @@ -302,17 +334,26 @@ mod tests { #[test] fn test_compile_with_wildcard() { - let rules = vec![Rule::Disallow("/"), Rule::Allow("/a"), Rule::Allow("/*.b")]; + let rules = vec![ + Rule::Disallow(b"/"), + Rule::Allow(b"/a"), + Rule::Allow(b"/*.b"), + ]; let expect_transitions = vec![ vec![t!('*' => 0)], vec![t!('*' => 1)], - vec![t!('*' => 0), t!('/' => 3)], // "" - vec![t!('*' => 1), t!('*' => 4), t!('a' => 5), t!('.' => 6)], // "/" - vec![t!('*' => 4), t!('.' => 6)], // "/*" - vec![t!('*' => 0)], // "/a" - vec![t!('*' => 1), t!('b' => 7)], // "/*." - vec![t!('*' => 0)], // "/*.b" + vec![t!('*' => 0), t!(b!('/') => 3)], // "" + vec![ + t!('*' => 1), + t!('*' => 4), + t!(b!('a') => 5), + t!(b!('.') => 6), + ], // "/" + vec![t!('*' => 4), t!(b!('.') => 6)], // "/*" + vec![t!('*' => 0)], // "/a" + vec![t!('*' => 1), t!(b!('b') => 7)], // "/*." + vec![t!('*' => 0)], // "/*.b" ]; let expect_states = vec![ @@ -333,15 +374,15 @@ mod tests { #[test] fn test_compile_tricky_wildcard() { - let rules = vec![Rule::Disallow("/"), Rule::Allow("/*.")]; + let rules = vec![Rule::Disallow(b"/"), Rule::Allow(b"/*.")]; let expect_transitions = vec![ vec![t!('*' => 0)], vec![t!('*' => 1)], - vec![t!('*' => 0), t!('/' => 3)], // "" - vec![t!('*' => 1), t!('*' => 4), t!('.' => 5)], // "/" - vec![t!('*' => 4), t!('.' => 5)], // "/*" - vec![t!('*' => 0)], // "/*." + vec![t!('*' => 0), t!(b!('/') => 3)], // "" + vec![t!('*' => 1), t!('*' => 4), t!(b!('.') => 5)], // "/" + vec![t!('*' => 4), t!(b!('.') => 5)], // "/*" + vec![t!('*' => 0)], // "/*." ]; let expect_states = vec![ @@ -361,24 +402,24 @@ mod tests { #[test] fn test_compile_with_eow() { let rules = vec![ - Rule::Allow("/"), - Rule::Disallow("/a$"), + Rule::Allow(b"/"), + Rule::Disallow(b"/a$"), // Note that this rule is nonsensical. It will compile, but // no guarantees are made as to how it's matched. Rules should // use url-encoded strings to escape $. - Rule::Disallow("/x$y"), + Rule::Disallow(b"/x$y"), ]; let expect_transitions = vec![ vec![t!('*' => 0)], vec![t!('*' => 1)], - vec![t!('*' => 0), t!('/' => 3)], // "" - vec![t!('*' => 0), t!('a' => 4), t!('x' => 5)], // "/" - vec![t!('*' => 0), t!('$' => 6)], // "/a" - vec![t!('*' => 0), t!('$' => 7)], // "/x" - vec![t!('*' => 0)], // "/a$" - vec![t!('*' => 0), t!('y' => 8)], // "/x$" - vec![t!('*' => 1)], // "/x$y" + vec![t!('*' => 0), t!(b!('/') => 3)], // "" + vec![t!('*' => 0), t!(b!('a') => 4), t!(b!('x') => 5)], // "/" + vec![t!('*' => 0), t!('$' => 6)], // "/a" + vec![t!('*' => 0), t!('$' => 7)], // "/x" + vec![t!('*' => 0)], // "/a$" + vec![t!('*' => 0), t!(b!('y') => 8)], // "/x$" + vec![t!('*' => 1)], // "/x$y" ]; let expect_states = vec![ @@ -401,10 +442,10 @@ mod tests { #[test] fn test_allow() { let rules = vec![ - Rule::Disallow("/"), - Rule::Allow("/a"), - Rule::Allow("/abc"), - Rule::Allow("/b"), + Rule::Disallow(b"/"), + Rule::Allow(b"/a"), + Rule::Allow(b"/abc"), + Rule::Allow(b"/b"), ]; let machine = Cylon::compile(rules); @@ -421,9 +462,9 @@ mod tests { #[test] fn test_allow_match_any() { let rules = vec![ - Rule::Allow("/"), - Rule::Disallow("/secret/*.txt"), - Rule::Disallow("/private/*"), + Rule::Allow(b"/"), + Rule::Disallow(b"/secret/*.txt"), + Rule::Disallow(b"/private/*"), ]; let machine = Cylon::compile(rules); @@ -442,9 +483,9 @@ mod tests { #[test] fn test_allow_match_eow() { let rules = vec![ - Rule::Allow("/"), - Rule::Disallow("/ignore$"), - Rule::Disallow("/foo$bar"), + Rule::Allow(b"/"), + Rule::Disallow(b"/ignore$"), + Rule::Disallow(b"/foo$bar"), ]; let machine = Cylon::compile(rules); @@ -463,14 +504,14 @@ mod tests { #[test] fn test_allow_more_complicated() { let rules = vec![ - Rule::Allow("/"), - Rule::Disallow("/a$"), - Rule::Disallow("/abc"), - Rule::Allow("/abc/*"), - Rule::Disallow("/foo/bar"), - Rule::Allow("/*/bar"), - Rule::Disallow("/www/*/images"), - Rule::Allow("/www/public/images"), + Rule::Allow(b"/"), + Rule::Disallow(b"/a$"), + Rule::Disallow(b"/abc"), + Rule::Allow(b"/abc/*"), + Rule::Disallow(b"/foo/bar"), + Rule::Allow(b"/*/bar"), + Rule::Disallow(b"/www/*/images"), + Rule::Allow(b"/www/public/images"), ]; let machine = Cylon::compile(rules); @@ -494,7 +535,7 @@ mod tests { // Test cases from: // https://developers.google.com/search/reference/robots_txt#group-member-rules - let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish")]); + let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/fish")]); assert_eq!(true, machine.allow("/fish")); assert_eq!(true, machine.allow("/fish.html")); assert_eq!(true, machine.allow("/fish/salmon.html")); @@ -505,7 +546,7 @@ mod tests { assert_eq!(false, machine.allow("/catfish")); assert_eq!(false, machine.allow("/?id=fish")); - let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*")]); + let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/fish*")]); assert_eq!(true, machine.allow("/fish")); assert_eq!(true, machine.allow("/fish.html")); assert_eq!(true, machine.allow("/fish/salmon.html")); @@ -516,7 +557,7 @@ mod tests { assert_eq!(false, machine.allow("/catfish")); assert_eq!(false, machine.allow("/?id=fish")); - let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish/")]); + let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/fish/")]); assert_eq!(true, machine.allow("/fish/")); assert_eq!(true, machine.allow("/fish/?id=anything")); assert_eq!(true, machine.allow("/fish/salmon.htm")); @@ -524,7 +565,7 @@ mod tests { assert_eq!(false, machine.allow("/fish.html")); assert_eq!(false, machine.allow("/Fish/Salmon.asp")); - let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php")]); + let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/*.php")]); assert_eq!(true, machine.allow("/filename.php")); assert_eq!(true, machine.allow("/folder/filename.php")); assert_eq!(true, machine.allow("/folder/filename.php?parameters")); @@ -533,7 +574,7 @@ mod tests { assert_eq!(false, machine.allow("/")); assert_eq!(false, machine.allow("/windows.PHP")); - let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php$")]); + let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/*.php$")]); assert_eq!(true, machine.allow("/filename.php")); assert_eq!(true, machine.allow("/folder/filename.php")); assert_eq!(false, machine.allow("/filename.php?parameters")); @@ -541,7 +582,7 @@ mod tests { assert_eq!(false, machine.allow("/filename.php5")); assert_eq!(false, machine.allow("/windows.PHP")); - let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*.php")]); + let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/fish*.php")]); assert_eq!(true, machine.allow("/fish.php")); assert_eq!(true, machine.allow("/fishheads/catfish.php?parameters")); assert_eq!(false, machine.allow("/Fish.PHP")); diff --git a/src/parse.rs b/src/parse.rs index 514c7fe..beb8098 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -21,10 +21,10 @@ enum ParsedRule { impl<'a> Into> for &'a ParsedRule { fn into(self) -> Rule<'a> { match self { - ParsedRule::Allow(path) => Rule::Allow(&path[..]), - ParsedRule::Disallow(path) => Rule::Disallow(&path[..]), + ParsedRule::Allow(path) => Rule::Allow(path.as_bytes()), + ParsedRule::Disallow(path) => Rule::Disallow(path.as_bytes()), #[cfg(feature = "crawl-delay")] - ParsedRule::Delay(delay) => Rule::Delay(delay), + ParsedRule::Delay(delay) => Rule::Delay(delay.as_bytes()), } } } @@ -453,4 +453,36 @@ mod tests { assert_eq!(true, imabot_machine.allow("/index.html")); }); } + + #[test] + fn test_unicode_support() { + tokio_test::block_on(async { + // From: wikipedia.org/robots.txt + let example_robots = r#" + User-agent: test + Disallow: /wiki/ויקיפדיה:רשימת_מועמדים_למחיקה/ + Disallow: /wiki/ויקיפדיה%3Aרשימת_מועמדים_למחיקה/ + Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94:%D7%A8%D7%A9%D7%99%D7%9E%D7%AA_%D7%9E%D7%95%D7%A2%D7%9E%D7%93%D7%99%D7%9D_%D7%9C%D7%9E%D7%97%D7%99%D7%A7%D7%94/ + Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94%3A%D7%A8%D7%A9%D7%99%D7%9E%D7%AA_%D7%9E%D7%95%D7%A2%D7%9E%D7%93%D7%99%D7%9D_%D7%9C%D7%9E%D7%97%D7%99%D7%A7%D7%94/ + Disallow: /wiki/ויקיפדיה:ערכים_לא_קיימים_ומוגנים + Disallow: /wiki/ויקיפדיה%3Aערכים_לא_קיימים_ומוגנים + Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94:%D7%A2%D7%A8%D7%9B%D7%99%D7%9D_%D7%9C%D7%90_%D7%A7%D7%99%D7%99%D7%9E%D7%99%D7%9D_%D7%95%D7%9E%D7%95%D7%92%D7%A0%D7%99%D7%9D + Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94%3A%D7%A2%D7%A8%D7%9B%D7%99%D7%9D_%D7%9C%D7%90_%D7%A7%D7%99%D7%99%D7%9E%D7%99%D7%9D_%D7%95%D7%9E%D7%95%D7%92%D7%A0%D7%99%D7%9D + Disallow: /wiki/ויקיפדיה:דפים_לא_קיימים_ומוגנים + Disallow: /wiki/ויקיפדיה%3Aדפים_לא_קיימים_ומוגנים + Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94:%D7%93%D7%A4%D7%99%D7%9D_%D7%9C%D7%90_%D7%A7%D7%99%D7%99%D7%9E%D7%99%D7%9D_%D7%95%D7%9E%D7%95%D7%92%D7%A0%D7%99%D7%9D + Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94%3A%D7%93%D7%A4%D7%99%D7%9D_%D7%9C%D7%90_%D7%A7%D7%99%D7%99%D7%9E%D7%99%D7%9D_%D7%95%D7%9E%D7%95%D7%92%D7%A0%D7%99%D7%9D + "# + .as_bytes(); + + let parser = Compiler::new("test"); + let machine = parser.compile(example_robots).await.unwrap(); + + assert_eq!(true, machine.allow("/index.html")); + assert_eq!( + false, + machine.allow("/wiki/ויקיפדיה:ערכים_לא_קיימים_ומוגנים") + ); + }); + } }