Fix unicode support by operating on bytes

This commit is contained in:
Creston Bunch 2021-04-15 17:20:59 -05:00
parent b6b30d01c7
commit 380dbb57a5
4 changed files with 159 additions and 86 deletions

2
Cargo.lock generated
View File

@ -215,7 +215,7 @@ dependencies = [
[[package]]
name = "cylon"
version = "0.1.3"
version = "0.2.0"
dependencies = [
"criterion",
"futures-util",

View File

@ -1,7 +1,7 @@
[package]
name = "cylon"
description = "An efficient compiler for robots.txt files"
version = "0.1.3"
version = "0.2.0"
authors = ["Creston Bunch <rust@bunch.im>"]
edition = "2018"

View File

@ -3,16 +3,19 @@ use std::cmp::Ordering;
use serde_derive::{Deserialize, Serialize};
const EOW_BYTE: u8 = 36; // '$'
const WILDCARD_BYTE: u8 = 42; // '*'
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Rule<'a> {
Allow(&'a str),
Disallow(&'a str),
Allow(&'a [u8]),
Disallow(&'a [u8]),
#[cfg(feature = "crawl-delay")]
Delay(&'a str),
Delay(&'a [u8]),
}
impl<'a> Rule<'a> {
fn inner(&self) -> &str {
fn inner(&self) -> &[u8] {
match self {
Rule::Allow(inner) => inner,
Rule::Disallow(inner) => inner,
@ -24,7 +27,7 @@ impl<'a> Rule<'a> {
#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)]
enum Edge {
MatchChar(char),
MatchByte(u8),
MatchAny,
MatchEow,
}
@ -60,8 +63,8 @@ impl Cylon {
}
/// Match whether the rules allow or disallow the target path.
pub fn allow(&self, path: &str) -> bool {
match self.states[self.state(path)] {
pub fn allow<T: AsRef<[u8]>>(&self, path: T) -> bool {
match self.states[self.state(path.as_ref())] {
State::Allow => true,
State::Disallow => false,
// Intermediate states are not preserved in the DFA
@ -72,8 +75,8 @@ impl Cylon {
}
}
fn state(&self, path: &str) -> usize {
let state = path.chars().fold(2, |state, path_char| {
fn state(&self, path: &[u8]) -> usize {
let state = path.into_iter().fold(2, |state, path_char| {
let t = &self.transitions[state];
t.iter()
.rev()
@ -82,7 +85,7 @@ impl Cylon {
.find(|transition| match transition {
Transition(Edge::MatchAny, ..) => true,
Transition(Edge::MatchEow, ..) => false,
Transition(Edge::MatchChar(edge_char), ..) => *edge_char == path_char,
Transition(Edge::MatchByte(edge_char), ..) => edge_char == path_char,
})
.map(|Transition(.., next_state)| *next_state)
// We are guaranteed at least one matching state because of
@ -118,7 +121,7 @@ impl Cylon {
rules.sort_by(|a, b| Ord::cmp(a.inner(), b.inner()));
let mut queue = vec![("", 0, 0, State::Intermediate)];
let mut queue = vec![(b"" as &[u8], 0, 0, State::Intermediate)];
while !queue.is_empty() {
// parent_prefix is the "parent node" in the prefix tree. We are
// going to visit its children by filtering from the list of
@ -128,23 +131,23 @@ impl Cylon {
// that can match any character legally, but is also a prefix
// (read: ancestor) of the current node.
let (parent_prefix, mut wildcard_state, parent_state, state) = queue.remove(0);
let last_char = parent_prefix.chars().last();
let last_byte = parent_prefix.last();
wildcard_state = match state {
State::Allow => 0,
State::Disallow if last_char == Some('$') => wildcard_state,
State::Disallow if last_byte == Some(&EOW_BYTE) => wildcard_state,
State::Disallow => 1,
#[cfg(feature = "crawl-delay")]
State::Delay => 1,
State::Intermediate => wildcard_state,
};
let mut t = match last_char {
Some('$') => {
let mut t = match last_byte {
Some(&EOW_BYTE) => {
// The EOW character cannot match anything else
vec![Transition(Edge::MatchAny, wildcard_state)]
}
Some('*') => {
Some(&WILDCARD_BYTE) => {
// The wildcard character overrides the wildcard state
vec![Transition(Edge::MatchAny, transitions.len())]
}
@ -154,7 +157,7 @@ impl Cylon {
}
};
let mut curr_prefix = "";
let mut curr_prefix: &[u8] = b"";
rules
.iter()
.map(Rule::inner)
@ -187,19 +190,19 @@ impl Cylon {
// NB: we can predict what state index the child
// will have before it's even pushed onto the state vec.
let child_index = transitions.len() + queue.len();
let edge_char = child_prefix.chars().last().unwrap();
let edge_char = child_prefix.last().unwrap();
let transition = Transition(
match edge_char {
'*' => Edge::MatchAny,
'$' => Edge::MatchEow,
c => Edge::MatchChar(c),
match *edge_char {
WILDCARD_BYTE => Edge::MatchAny,
EOW_BYTE => Edge::MatchEow,
c => Edge::MatchByte(c),
},
child_index,
);
// Add transitions from the parent state to the child state
// so that the wildcard character matches are optional.
if last_char == Some('*') {
if last_byte == Some(&WILDCARD_BYTE) {
let parent_t = &mut transitions[parent_state];
parent_t.push(transition);
}
@ -225,7 +228,9 @@ impl Cylon {
Rule::Delay(_) => true,
_ => false,
})
.map(|r| r.inner().parse::<u64>().ok())
.map(|r| r.inner())
.flat_map(|r| std::str::from_utf8(r).ok())
.map(|r| r.parse::<u64>().ok())
.collect();
delays.sort_unstable_by(|a, b| match (a, b) {
(None, Some(_)) => Ordering::Greater,
@ -260,27 +265,54 @@ mod tests {
Transition(Edge::MatchEow, $x)
};
($x:expr => $y:expr) => {
Transition(Edge::MatchChar($x), $y)
Transition(Edge::MatchByte($x), $y)
};
}
macro_rules! b {
('.') => {
46
};
('/') => {
47
};
('a') => {
97
};
('b') => {
98
};
('c') => {
99
};
('d') => {
100
};
('x') => {
120
};
('y') => {
121
};
}
#[test]
fn test_compile() {
let rules = vec![
Rule::Disallow("/"),
Rule::Allow("/a"),
Rule::Allow("/abc"),
Rule::Allow("/b"),
Rule::Disallow(b"/"),
Rule::Allow(b"/a"),
Rule::Allow(b"/abc"),
Rule::Allow(b"/b"),
];
let expect_transitions = vec![
vec![t!('*' => 0)],
vec![t!('*' => 1)],
vec![t!('*' => 0), t!('/' => 3)], // ""
vec![t!('*' => 1), t!('a' => 4), t!('b' => 5)], // "/"
vec![t!('*' => 0), t!('b' => 6)], // "/a"
vec![t!('*' => 0), t!(b!('/') => 3)], // ""
vec![t!('*' => 1), t!(b!('a') => 4), t!(b!('b') => 5)], // "/"
vec![t!('*' => 0), t!(b!('b') => 6)], // "/a"
vec![t!('*' => 0)], // "/b"
vec![t!('*' => 0), t!('c' => 7)], // "/ab"
vec![t!('*' => 0), t!(b!('c') => 7)], // "/ab"
vec![t!('*' => 0)], // "/abc"
];
@ -302,16 +334,25 @@ mod tests {
#[test]
fn test_compile_with_wildcard() {
let rules = vec![Rule::Disallow("/"), Rule::Allow("/a"), Rule::Allow("/*.b")];
let rules = vec![
Rule::Disallow(b"/"),
Rule::Allow(b"/a"),
Rule::Allow(b"/*.b"),
];
let expect_transitions = vec![
vec![t!('*' => 0)],
vec![t!('*' => 1)],
vec![t!('*' => 0), t!('/' => 3)], // ""
vec![t!('*' => 1), t!('*' => 4), t!('a' => 5), t!('.' => 6)], // "/"
vec![t!('*' => 4), t!('.' => 6)], // "/*"
vec![t!('*' => 0), t!(b!('/') => 3)], // ""
vec![
t!('*' => 1),
t!('*' => 4),
t!(b!('a') => 5),
t!(b!('.') => 6),
], // "/"
vec![t!('*' => 4), t!(b!('.') => 6)], // "/*"
vec![t!('*' => 0)], // "/a"
vec![t!('*' => 1), t!('b' => 7)], // "/*."
vec![t!('*' => 1), t!(b!('b') => 7)], // "/*."
vec![t!('*' => 0)], // "/*.b"
];
@ -333,14 +374,14 @@ mod tests {
#[test]
fn test_compile_tricky_wildcard() {
let rules = vec![Rule::Disallow("/"), Rule::Allow("/*.")];
let rules = vec![Rule::Disallow(b"/"), Rule::Allow(b"/*.")];
let expect_transitions = vec![
vec![t!('*' => 0)],
vec![t!('*' => 1)],
vec![t!('*' => 0), t!('/' => 3)], // ""
vec![t!('*' => 1), t!('*' => 4), t!('.' => 5)], // "/"
vec![t!('*' => 4), t!('.' => 5)], // "/*"
vec![t!('*' => 0), t!(b!('/') => 3)], // ""
vec![t!('*' => 1), t!('*' => 4), t!(b!('.') => 5)], // "/"
vec![t!('*' => 4), t!(b!('.') => 5)], // "/*"
vec![t!('*' => 0)], // "/*."
];
@ -361,23 +402,23 @@ mod tests {
#[test]
fn test_compile_with_eow() {
let rules = vec![
Rule::Allow("/"),
Rule::Disallow("/a$"),
Rule::Allow(b"/"),
Rule::Disallow(b"/a$"),
// Note that this rule is nonsensical. It will compile, but
// no guarantees are made as to how it's matched. Rules should
// use url-encoded strings to escape $.
Rule::Disallow("/x$y"),
Rule::Disallow(b"/x$y"),
];
let expect_transitions = vec![
vec![t!('*' => 0)],
vec![t!('*' => 1)],
vec![t!('*' => 0), t!('/' => 3)], // ""
vec![t!('*' => 0), t!('a' => 4), t!('x' => 5)], // "/"
vec![t!('*' => 0), t!(b!('/') => 3)], // ""
vec![t!('*' => 0), t!(b!('a') => 4), t!(b!('x') => 5)], // "/"
vec![t!('*' => 0), t!('$' => 6)], // "/a"
vec![t!('*' => 0), t!('$' => 7)], // "/x"
vec![t!('*' => 0)], // "/a$"
vec![t!('*' => 0), t!('y' => 8)], // "/x$"
vec![t!('*' => 0), t!(b!('y') => 8)], // "/x$"
vec![t!('*' => 1)], // "/x$y"
];
@ -401,10 +442,10 @@ mod tests {
#[test]
fn test_allow() {
let rules = vec![
Rule::Disallow("/"),
Rule::Allow("/a"),
Rule::Allow("/abc"),
Rule::Allow("/b"),
Rule::Disallow(b"/"),
Rule::Allow(b"/a"),
Rule::Allow(b"/abc"),
Rule::Allow(b"/b"),
];
let machine = Cylon::compile(rules);
@ -421,9 +462,9 @@ mod tests {
#[test]
fn test_allow_match_any() {
let rules = vec![
Rule::Allow("/"),
Rule::Disallow("/secret/*.txt"),
Rule::Disallow("/private/*"),
Rule::Allow(b"/"),
Rule::Disallow(b"/secret/*.txt"),
Rule::Disallow(b"/private/*"),
];
let machine = Cylon::compile(rules);
@ -442,9 +483,9 @@ mod tests {
#[test]
fn test_allow_match_eow() {
let rules = vec![
Rule::Allow("/"),
Rule::Disallow("/ignore$"),
Rule::Disallow("/foo$bar"),
Rule::Allow(b"/"),
Rule::Disallow(b"/ignore$"),
Rule::Disallow(b"/foo$bar"),
];
let machine = Cylon::compile(rules);
@ -463,14 +504,14 @@ mod tests {
#[test]
fn test_allow_more_complicated() {
let rules = vec![
Rule::Allow("/"),
Rule::Disallow("/a$"),
Rule::Disallow("/abc"),
Rule::Allow("/abc/*"),
Rule::Disallow("/foo/bar"),
Rule::Allow("/*/bar"),
Rule::Disallow("/www/*/images"),
Rule::Allow("/www/public/images"),
Rule::Allow(b"/"),
Rule::Disallow(b"/a$"),
Rule::Disallow(b"/abc"),
Rule::Allow(b"/abc/*"),
Rule::Disallow(b"/foo/bar"),
Rule::Allow(b"/*/bar"),
Rule::Disallow(b"/www/*/images"),
Rule::Allow(b"/www/public/images"),
];
let machine = Cylon::compile(rules);
@ -494,7 +535,7 @@ mod tests {
// Test cases from:
// https://developers.google.com/search/reference/robots_txt#group-member-rules
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish")]);
let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/fish")]);
assert_eq!(true, machine.allow("/fish"));
assert_eq!(true, machine.allow("/fish.html"));
assert_eq!(true, machine.allow("/fish/salmon.html"));
@ -505,7 +546,7 @@ mod tests {
assert_eq!(false, machine.allow("/catfish"));
assert_eq!(false, machine.allow("/?id=fish"));
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*")]);
let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/fish*")]);
assert_eq!(true, machine.allow("/fish"));
assert_eq!(true, machine.allow("/fish.html"));
assert_eq!(true, machine.allow("/fish/salmon.html"));
@ -516,7 +557,7 @@ mod tests {
assert_eq!(false, machine.allow("/catfish"));
assert_eq!(false, machine.allow("/?id=fish"));
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish/")]);
let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/fish/")]);
assert_eq!(true, machine.allow("/fish/"));
assert_eq!(true, machine.allow("/fish/?id=anything"));
assert_eq!(true, machine.allow("/fish/salmon.htm"));
@ -524,7 +565,7 @@ mod tests {
assert_eq!(false, machine.allow("/fish.html"));
assert_eq!(false, machine.allow("/Fish/Salmon.asp"));
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php")]);
let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/*.php")]);
assert_eq!(true, machine.allow("/filename.php"));
assert_eq!(true, machine.allow("/folder/filename.php"));
assert_eq!(true, machine.allow("/folder/filename.php?parameters"));
@ -533,7 +574,7 @@ mod tests {
assert_eq!(false, machine.allow("/"));
assert_eq!(false, machine.allow("/windows.PHP"));
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php$")]);
let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/*.php$")]);
assert_eq!(true, machine.allow("/filename.php"));
assert_eq!(true, machine.allow("/folder/filename.php"));
assert_eq!(false, machine.allow("/filename.php?parameters"));
@ -541,7 +582,7 @@ mod tests {
assert_eq!(false, machine.allow("/filename.php5"));
assert_eq!(false, machine.allow("/windows.PHP"));
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*.php")]);
let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/fish*.php")]);
assert_eq!(true, machine.allow("/fish.php"));
assert_eq!(true, machine.allow("/fishheads/catfish.php?parameters"));
assert_eq!(false, machine.allow("/Fish.PHP"));

View File

@ -21,10 +21,10 @@ enum ParsedRule {
impl<'a> Into<Rule<'a>> for &'a ParsedRule {
fn into(self) -> Rule<'a> {
match self {
ParsedRule::Allow(path) => Rule::Allow(&path[..]),
ParsedRule::Disallow(path) => Rule::Disallow(&path[..]),
ParsedRule::Allow(path) => Rule::Allow(path.as_bytes()),
ParsedRule::Disallow(path) => Rule::Disallow(path.as_bytes()),
#[cfg(feature = "crawl-delay")]
ParsedRule::Delay(delay) => Rule::Delay(delay),
ParsedRule::Delay(delay) => Rule::Delay(delay.as_bytes()),
}
}
}
@ -453,4 +453,36 @@ mod tests {
assert_eq!(true, imabot_machine.allow("/index.html"));
});
}
#[test]
fn test_unicode_support() {
tokio_test::block_on(async {
// From: wikipedia.org/robots.txt
let example_robots = r#"
User-agent: test
Disallow: /wiki/ויקיפדיה:רשימתועמדים_למחיקה/
Disallow: /wiki/ויקיפדיה%3Aרשימת_מועמדים_למחיקה/
Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94:%D7%A8%D7%A9%D7%99%D7%9E%D7%AA_%D7%9E%D7%95%D7%A2%D7%9E%D7%93%D7%99%D7%9D_%D7%9C%D7%9E%D7%97%D7%99%D7%A7%D7%94/
Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94%3A%D7%A8%D7%A9%D7%99%D7%9E%D7%AA_%D7%9E%D7%95%D7%A2%D7%9E%D7%93%D7%99%D7%9D_%D7%9C%D7%9E%D7%97%D7%99%D7%A7%D7%94/
Disallow: /wiki/ויקיפדיה:ערכים_לא_קיימים_ומוגנים
Disallow: /wiki/ויקיפדיה%3Aערכים_לא_קיימים_ומוגנים
Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94:%D7%A2%D7%A8%D7%9B%D7%99%D7%9D_%D7%9C%D7%90_%D7%A7%D7%99%D7%99%D7%9E%D7%99%D7%9D_%D7%95%D7%9E%D7%95%D7%92%D7%A0%D7%99%D7%9D
Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94%3A%D7%A2%D7%A8%D7%9B%D7%99%D7%9D_%D7%9C%D7%90_%D7%A7%D7%99%D7%99%D7%9E%D7%99%D7%9D_%D7%95%D7%9E%D7%95%D7%92%D7%A0%D7%99%D7%9D
Disallow: /wiki/ויקיפדיה:דפים_לא_קיימים_ומוגנים
Disallow: /wiki/ויקיפדיה%3Aדפים_לא_קיימים_ומוגנים
Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94:%D7%93%D7%A4%D7%99%D7%9D_%D7%9C%D7%90_%D7%A7%D7%99%D7%99%D7%9E%D7%99%D7%9D_%D7%95%D7%9E%D7%95%D7%92%D7%A0%D7%99%D7%9D
Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94%3A%D7%93%D7%A4%D7%99%D7%9D_%D7%9C%D7%90_%D7%A7%D7%99%D7%99%D7%9E%D7%99%D7%9D_%D7%95%D7%9E%D7%95%D7%92%D7%A0%D7%99%D7%9D
"#
.as_bytes();
let parser = Compiler::new("test");
let machine = parser.compile(example_robots).await.unwrap();
assert_eq!(true, machine.allow("/index.html"));
assert_eq!(
false,
machine.allow("/wiki/ויקיפדיה:ערכים_לא_קיימים_ומוגנים")
);
});
}
}