Fix unicode support by operating on bytes
This commit is contained in:
parent
b6b30d01c7
commit
380dbb57a5
|
@ -215,7 +215,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cylon"
|
name = "cylon"
|
||||||
version = "0.1.3"
|
version = "0.2.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"criterion",
|
"criterion",
|
||||||
"futures-util",
|
"futures-util",
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
[package]
|
[package]
|
||||||
name = "cylon"
|
name = "cylon"
|
||||||
description = "An efficient compiler for robots.txt files"
|
description = "An efficient compiler for robots.txt files"
|
||||||
version = "0.1.3"
|
version = "0.2.0"
|
||||||
authors = ["Creston Bunch <rust@bunch.im>"]
|
authors = ["Creston Bunch <rust@bunch.im>"]
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
|
|
||||||
|
|
185
src/dfa.rs
185
src/dfa.rs
|
@ -3,16 +3,19 @@ use std::cmp::Ordering;
|
||||||
|
|
||||||
use serde_derive::{Deserialize, Serialize};
|
use serde_derive::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
const EOW_BYTE: u8 = 36; // '$'
|
||||||
|
const WILDCARD_BYTE: u8 = 42; // '*'
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
|
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||||
pub enum Rule<'a> {
|
pub enum Rule<'a> {
|
||||||
Allow(&'a str),
|
Allow(&'a [u8]),
|
||||||
Disallow(&'a str),
|
Disallow(&'a [u8]),
|
||||||
#[cfg(feature = "crawl-delay")]
|
#[cfg(feature = "crawl-delay")]
|
||||||
Delay(&'a str),
|
Delay(&'a [u8]),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Rule<'a> {
|
impl<'a> Rule<'a> {
|
||||||
fn inner(&self) -> &str {
|
fn inner(&self) -> &[u8] {
|
||||||
match self {
|
match self {
|
||||||
Rule::Allow(inner) => inner,
|
Rule::Allow(inner) => inner,
|
||||||
Rule::Disallow(inner) => inner,
|
Rule::Disallow(inner) => inner,
|
||||||
|
@ -24,7 +27,7 @@ impl<'a> Rule<'a> {
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)]
|
#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)]
|
||||||
enum Edge {
|
enum Edge {
|
||||||
MatchChar(char),
|
MatchByte(u8),
|
||||||
MatchAny,
|
MatchAny,
|
||||||
MatchEow,
|
MatchEow,
|
||||||
}
|
}
|
||||||
|
@ -60,8 +63,8 @@ impl Cylon {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Match whether the rules allow or disallow the target path.
|
/// Match whether the rules allow or disallow the target path.
|
||||||
pub fn allow(&self, path: &str) -> bool {
|
pub fn allow<T: AsRef<[u8]>>(&self, path: T) -> bool {
|
||||||
match self.states[self.state(path)] {
|
match self.states[self.state(path.as_ref())] {
|
||||||
State::Allow => true,
|
State::Allow => true,
|
||||||
State::Disallow => false,
|
State::Disallow => false,
|
||||||
// Intermediate states are not preserved in the DFA
|
// Intermediate states are not preserved in the DFA
|
||||||
|
@ -72,8 +75,8 @@ impl Cylon {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn state(&self, path: &str) -> usize {
|
fn state(&self, path: &[u8]) -> usize {
|
||||||
let state = path.chars().fold(2, |state, path_char| {
|
let state = path.into_iter().fold(2, |state, path_char| {
|
||||||
let t = &self.transitions[state];
|
let t = &self.transitions[state];
|
||||||
t.iter()
|
t.iter()
|
||||||
.rev()
|
.rev()
|
||||||
|
@ -82,7 +85,7 @@ impl Cylon {
|
||||||
.find(|transition| match transition {
|
.find(|transition| match transition {
|
||||||
Transition(Edge::MatchAny, ..) => true,
|
Transition(Edge::MatchAny, ..) => true,
|
||||||
Transition(Edge::MatchEow, ..) => false,
|
Transition(Edge::MatchEow, ..) => false,
|
||||||
Transition(Edge::MatchChar(edge_char), ..) => *edge_char == path_char,
|
Transition(Edge::MatchByte(edge_char), ..) => edge_char == path_char,
|
||||||
})
|
})
|
||||||
.map(|Transition(.., next_state)| *next_state)
|
.map(|Transition(.., next_state)| *next_state)
|
||||||
// We are guaranteed at least one matching state because of
|
// We are guaranteed at least one matching state because of
|
||||||
|
@ -118,7 +121,7 @@ impl Cylon {
|
||||||
|
|
||||||
rules.sort_by(|a, b| Ord::cmp(a.inner(), b.inner()));
|
rules.sort_by(|a, b| Ord::cmp(a.inner(), b.inner()));
|
||||||
|
|
||||||
let mut queue = vec![("", 0, 0, State::Intermediate)];
|
let mut queue = vec![(b"" as &[u8], 0, 0, State::Intermediate)];
|
||||||
while !queue.is_empty() {
|
while !queue.is_empty() {
|
||||||
// parent_prefix is the "parent node" in the prefix tree. We are
|
// parent_prefix is the "parent node" in the prefix tree. We are
|
||||||
// going to visit its children by filtering from the list of
|
// going to visit its children by filtering from the list of
|
||||||
|
@ -128,23 +131,23 @@ impl Cylon {
|
||||||
// that can match any character legally, but is also a prefix
|
// that can match any character legally, but is also a prefix
|
||||||
// (read: ancestor) of the current node.
|
// (read: ancestor) of the current node.
|
||||||
let (parent_prefix, mut wildcard_state, parent_state, state) = queue.remove(0);
|
let (parent_prefix, mut wildcard_state, parent_state, state) = queue.remove(0);
|
||||||
let last_char = parent_prefix.chars().last();
|
let last_byte = parent_prefix.last();
|
||||||
|
|
||||||
wildcard_state = match state {
|
wildcard_state = match state {
|
||||||
State::Allow => 0,
|
State::Allow => 0,
|
||||||
State::Disallow if last_char == Some('$') => wildcard_state,
|
State::Disallow if last_byte == Some(&EOW_BYTE) => wildcard_state,
|
||||||
State::Disallow => 1,
|
State::Disallow => 1,
|
||||||
#[cfg(feature = "crawl-delay")]
|
#[cfg(feature = "crawl-delay")]
|
||||||
State::Delay => 1,
|
State::Delay => 1,
|
||||||
State::Intermediate => wildcard_state,
|
State::Intermediate => wildcard_state,
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut t = match last_char {
|
let mut t = match last_byte {
|
||||||
Some('$') => {
|
Some(&EOW_BYTE) => {
|
||||||
// The EOW character cannot match anything else
|
// The EOW character cannot match anything else
|
||||||
vec![Transition(Edge::MatchAny, wildcard_state)]
|
vec![Transition(Edge::MatchAny, wildcard_state)]
|
||||||
}
|
}
|
||||||
Some('*') => {
|
Some(&WILDCARD_BYTE) => {
|
||||||
// The wildcard character overrides the wildcard state
|
// The wildcard character overrides the wildcard state
|
||||||
vec![Transition(Edge::MatchAny, transitions.len())]
|
vec![Transition(Edge::MatchAny, transitions.len())]
|
||||||
}
|
}
|
||||||
|
@ -154,7 +157,7 @@ impl Cylon {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut curr_prefix = "";
|
let mut curr_prefix: &[u8] = b"";
|
||||||
rules
|
rules
|
||||||
.iter()
|
.iter()
|
||||||
.map(Rule::inner)
|
.map(Rule::inner)
|
||||||
|
@ -187,19 +190,19 @@ impl Cylon {
|
||||||
// NB: we can predict what state index the child
|
// NB: we can predict what state index the child
|
||||||
// will have before it's even pushed onto the state vec.
|
// will have before it's even pushed onto the state vec.
|
||||||
let child_index = transitions.len() + queue.len();
|
let child_index = transitions.len() + queue.len();
|
||||||
let edge_char = child_prefix.chars().last().unwrap();
|
let edge_char = child_prefix.last().unwrap();
|
||||||
let transition = Transition(
|
let transition = Transition(
|
||||||
match edge_char {
|
match *edge_char {
|
||||||
'*' => Edge::MatchAny,
|
WILDCARD_BYTE => Edge::MatchAny,
|
||||||
'$' => Edge::MatchEow,
|
EOW_BYTE => Edge::MatchEow,
|
||||||
c => Edge::MatchChar(c),
|
c => Edge::MatchByte(c),
|
||||||
},
|
},
|
||||||
child_index,
|
child_index,
|
||||||
);
|
);
|
||||||
|
|
||||||
// Add transitions from the parent state to the child state
|
// Add transitions from the parent state to the child state
|
||||||
// so that the wildcard character matches are optional.
|
// so that the wildcard character matches are optional.
|
||||||
if last_char == Some('*') {
|
if last_byte == Some(&WILDCARD_BYTE) {
|
||||||
let parent_t = &mut transitions[parent_state];
|
let parent_t = &mut transitions[parent_state];
|
||||||
parent_t.push(transition);
|
parent_t.push(transition);
|
||||||
}
|
}
|
||||||
|
@ -225,7 +228,9 @@ impl Cylon {
|
||||||
Rule::Delay(_) => true,
|
Rule::Delay(_) => true,
|
||||||
_ => false,
|
_ => false,
|
||||||
})
|
})
|
||||||
.map(|r| r.inner().parse::<u64>().ok())
|
.map(|r| r.inner())
|
||||||
|
.flat_map(|r| std::str::from_utf8(r).ok())
|
||||||
|
.map(|r| r.parse::<u64>().ok())
|
||||||
.collect();
|
.collect();
|
||||||
delays.sort_unstable_by(|a, b| match (a, b) {
|
delays.sort_unstable_by(|a, b| match (a, b) {
|
||||||
(None, Some(_)) => Ordering::Greater,
|
(None, Some(_)) => Ordering::Greater,
|
||||||
|
@ -260,27 +265,54 @@ mod tests {
|
||||||
Transition(Edge::MatchEow, $x)
|
Transition(Edge::MatchEow, $x)
|
||||||
};
|
};
|
||||||
($x:expr => $y:expr) => {
|
($x:expr => $y:expr) => {
|
||||||
Transition(Edge::MatchChar($x), $y)
|
Transition(Edge::MatchByte($x), $y)
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
macro_rules! b {
|
||||||
|
('.') => {
|
||||||
|
46
|
||||||
|
};
|
||||||
|
('/') => {
|
||||||
|
47
|
||||||
|
};
|
||||||
|
('a') => {
|
||||||
|
97
|
||||||
|
};
|
||||||
|
('b') => {
|
||||||
|
98
|
||||||
|
};
|
||||||
|
('c') => {
|
||||||
|
99
|
||||||
|
};
|
||||||
|
('d') => {
|
||||||
|
100
|
||||||
|
};
|
||||||
|
('x') => {
|
||||||
|
120
|
||||||
|
};
|
||||||
|
('y') => {
|
||||||
|
121
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_compile() {
|
fn test_compile() {
|
||||||
let rules = vec![
|
let rules = vec![
|
||||||
Rule::Disallow("/"),
|
Rule::Disallow(b"/"),
|
||||||
Rule::Allow("/a"),
|
Rule::Allow(b"/a"),
|
||||||
Rule::Allow("/abc"),
|
Rule::Allow(b"/abc"),
|
||||||
Rule::Allow("/b"),
|
Rule::Allow(b"/b"),
|
||||||
];
|
];
|
||||||
|
|
||||||
let expect_transitions = vec![
|
let expect_transitions = vec![
|
||||||
vec![t!('*' => 0)],
|
vec![t!('*' => 0)],
|
||||||
vec![t!('*' => 1)],
|
vec![t!('*' => 1)],
|
||||||
vec![t!('*' => 0), t!('/' => 3)], // ""
|
vec![t!('*' => 0), t!(b!('/') => 3)], // ""
|
||||||
vec![t!('*' => 1), t!('a' => 4), t!('b' => 5)], // "/"
|
vec![t!('*' => 1), t!(b!('a') => 4), t!(b!('b') => 5)], // "/"
|
||||||
vec![t!('*' => 0), t!('b' => 6)], // "/a"
|
vec![t!('*' => 0), t!(b!('b') => 6)], // "/a"
|
||||||
vec![t!('*' => 0)], // "/b"
|
vec![t!('*' => 0)], // "/b"
|
||||||
vec![t!('*' => 0), t!('c' => 7)], // "/ab"
|
vec![t!('*' => 0), t!(b!('c') => 7)], // "/ab"
|
||||||
vec![t!('*' => 0)], // "/abc"
|
vec![t!('*' => 0)], // "/abc"
|
||||||
];
|
];
|
||||||
|
|
||||||
|
@ -302,16 +334,25 @@ mod tests {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_compile_with_wildcard() {
|
fn test_compile_with_wildcard() {
|
||||||
let rules = vec![Rule::Disallow("/"), Rule::Allow("/a"), Rule::Allow("/*.b")];
|
let rules = vec![
|
||||||
|
Rule::Disallow(b"/"),
|
||||||
|
Rule::Allow(b"/a"),
|
||||||
|
Rule::Allow(b"/*.b"),
|
||||||
|
];
|
||||||
|
|
||||||
let expect_transitions = vec![
|
let expect_transitions = vec![
|
||||||
vec![t!('*' => 0)],
|
vec![t!('*' => 0)],
|
||||||
vec![t!('*' => 1)],
|
vec![t!('*' => 1)],
|
||||||
vec![t!('*' => 0), t!('/' => 3)], // ""
|
vec![t!('*' => 0), t!(b!('/') => 3)], // ""
|
||||||
vec![t!('*' => 1), t!('*' => 4), t!('a' => 5), t!('.' => 6)], // "/"
|
vec![
|
||||||
vec![t!('*' => 4), t!('.' => 6)], // "/*"
|
t!('*' => 1),
|
||||||
|
t!('*' => 4),
|
||||||
|
t!(b!('a') => 5),
|
||||||
|
t!(b!('.') => 6),
|
||||||
|
], // "/"
|
||||||
|
vec![t!('*' => 4), t!(b!('.') => 6)], // "/*"
|
||||||
vec![t!('*' => 0)], // "/a"
|
vec![t!('*' => 0)], // "/a"
|
||||||
vec![t!('*' => 1), t!('b' => 7)], // "/*."
|
vec![t!('*' => 1), t!(b!('b') => 7)], // "/*."
|
||||||
vec![t!('*' => 0)], // "/*.b"
|
vec![t!('*' => 0)], // "/*.b"
|
||||||
];
|
];
|
||||||
|
|
||||||
|
@ -333,14 +374,14 @@ mod tests {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_compile_tricky_wildcard() {
|
fn test_compile_tricky_wildcard() {
|
||||||
let rules = vec![Rule::Disallow("/"), Rule::Allow("/*.")];
|
let rules = vec![Rule::Disallow(b"/"), Rule::Allow(b"/*.")];
|
||||||
|
|
||||||
let expect_transitions = vec![
|
let expect_transitions = vec![
|
||||||
vec![t!('*' => 0)],
|
vec![t!('*' => 0)],
|
||||||
vec![t!('*' => 1)],
|
vec![t!('*' => 1)],
|
||||||
vec![t!('*' => 0), t!('/' => 3)], // ""
|
vec![t!('*' => 0), t!(b!('/') => 3)], // ""
|
||||||
vec![t!('*' => 1), t!('*' => 4), t!('.' => 5)], // "/"
|
vec![t!('*' => 1), t!('*' => 4), t!(b!('.') => 5)], // "/"
|
||||||
vec![t!('*' => 4), t!('.' => 5)], // "/*"
|
vec![t!('*' => 4), t!(b!('.') => 5)], // "/*"
|
||||||
vec![t!('*' => 0)], // "/*."
|
vec![t!('*' => 0)], // "/*."
|
||||||
];
|
];
|
||||||
|
|
||||||
|
@ -361,23 +402,23 @@ mod tests {
|
||||||
#[test]
|
#[test]
|
||||||
fn test_compile_with_eow() {
|
fn test_compile_with_eow() {
|
||||||
let rules = vec![
|
let rules = vec![
|
||||||
Rule::Allow("/"),
|
Rule::Allow(b"/"),
|
||||||
Rule::Disallow("/a$"),
|
Rule::Disallow(b"/a$"),
|
||||||
// Note that this rule is nonsensical. It will compile, but
|
// Note that this rule is nonsensical. It will compile, but
|
||||||
// no guarantees are made as to how it's matched. Rules should
|
// no guarantees are made as to how it's matched. Rules should
|
||||||
// use url-encoded strings to escape $.
|
// use url-encoded strings to escape $.
|
||||||
Rule::Disallow("/x$y"),
|
Rule::Disallow(b"/x$y"),
|
||||||
];
|
];
|
||||||
|
|
||||||
let expect_transitions = vec![
|
let expect_transitions = vec![
|
||||||
vec![t!('*' => 0)],
|
vec![t!('*' => 0)],
|
||||||
vec![t!('*' => 1)],
|
vec![t!('*' => 1)],
|
||||||
vec![t!('*' => 0), t!('/' => 3)], // ""
|
vec![t!('*' => 0), t!(b!('/') => 3)], // ""
|
||||||
vec![t!('*' => 0), t!('a' => 4), t!('x' => 5)], // "/"
|
vec![t!('*' => 0), t!(b!('a') => 4), t!(b!('x') => 5)], // "/"
|
||||||
vec![t!('*' => 0), t!('$' => 6)], // "/a"
|
vec![t!('*' => 0), t!('$' => 6)], // "/a"
|
||||||
vec![t!('*' => 0), t!('$' => 7)], // "/x"
|
vec![t!('*' => 0), t!('$' => 7)], // "/x"
|
||||||
vec![t!('*' => 0)], // "/a$"
|
vec![t!('*' => 0)], // "/a$"
|
||||||
vec![t!('*' => 0), t!('y' => 8)], // "/x$"
|
vec![t!('*' => 0), t!(b!('y') => 8)], // "/x$"
|
||||||
vec![t!('*' => 1)], // "/x$y"
|
vec![t!('*' => 1)], // "/x$y"
|
||||||
];
|
];
|
||||||
|
|
||||||
|
@ -401,10 +442,10 @@ mod tests {
|
||||||
#[test]
|
#[test]
|
||||||
fn test_allow() {
|
fn test_allow() {
|
||||||
let rules = vec![
|
let rules = vec![
|
||||||
Rule::Disallow("/"),
|
Rule::Disallow(b"/"),
|
||||||
Rule::Allow("/a"),
|
Rule::Allow(b"/a"),
|
||||||
Rule::Allow("/abc"),
|
Rule::Allow(b"/abc"),
|
||||||
Rule::Allow("/b"),
|
Rule::Allow(b"/b"),
|
||||||
];
|
];
|
||||||
|
|
||||||
let machine = Cylon::compile(rules);
|
let machine = Cylon::compile(rules);
|
||||||
|
@ -421,9 +462,9 @@ mod tests {
|
||||||
#[test]
|
#[test]
|
||||||
fn test_allow_match_any() {
|
fn test_allow_match_any() {
|
||||||
let rules = vec![
|
let rules = vec![
|
||||||
Rule::Allow("/"),
|
Rule::Allow(b"/"),
|
||||||
Rule::Disallow("/secret/*.txt"),
|
Rule::Disallow(b"/secret/*.txt"),
|
||||||
Rule::Disallow("/private/*"),
|
Rule::Disallow(b"/private/*"),
|
||||||
];
|
];
|
||||||
|
|
||||||
let machine = Cylon::compile(rules);
|
let machine = Cylon::compile(rules);
|
||||||
|
@ -442,9 +483,9 @@ mod tests {
|
||||||
#[test]
|
#[test]
|
||||||
fn test_allow_match_eow() {
|
fn test_allow_match_eow() {
|
||||||
let rules = vec![
|
let rules = vec![
|
||||||
Rule::Allow("/"),
|
Rule::Allow(b"/"),
|
||||||
Rule::Disallow("/ignore$"),
|
Rule::Disallow(b"/ignore$"),
|
||||||
Rule::Disallow("/foo$bar"),
|
Rule::Disallow(b"/foo$bar"),
|
||||||
];
|
];
|
||||||
|
|
||||||
let machine = Cylon::compile(rules);
|
let machine = Cylon::compile(rules);
|
||||||
|
@ -463,14 +504,14 @@ mod tests {
|
||||||
#[test]
|
#[test]
|
||||||
fn test_allow_more_complicated() {
|
fn test_allow_more_complicated() {
|
||||||
let rules = vec![
|
let rules = vec![
|
||||||
Rule::Allow("/"),
|
Rule::Allow(b"/"),
|
||||||
Rule::Disallow("/a$"),
|
Rule::Disallow(b"/a$"),
|
||||||
Rule::Disallow("/abc"),
|
Rule::Disallow(b"/abc"),
|
||||||
Rule::Allow("/abc/*"),
|
Rule::Allow(b"/abc/*"),
|
||||||
Rule::Disallow("/foo/bar"),
|
Rule::Disallow(b"/foo/bar"),
|
||||||
Rule::Allow("/*/bar"),
|
Rule::Allow(b"/*/bar"),
|
||||||
Rule::Disallow("/www/*/images"),
|
Rule::Disallow(b"/www/*/images"),
|
||||||
Rule::Allow("/www/public/images"),
|
Rule::Allow(b"/www/public/images"),
|
||||||
];
|
];
|
||||||
|
|
||||||
let machine = Cylon::compile(rules);
|
let machine = Cylon::compile(rules);
|
||||||
|
@ -494,7 +535,7 @@ mod tests {
|
||||||
// Test cases from:
|
// Test cases from:
|
||||||
// https://developers.google.com/search/reference/robots_txt#group-member-rules
|
// https://developers.google.com/search/reference/robots_txt#group-member-rules
|
||||||
|
|
||||||
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish")]);
|
let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/fish")]);
|
||||||
assert_eq!(true, machine.allow("/fish"));
|
assert_eq!(true, machine.allow("/fish"));
|
||||||
assert_eq!(true, machine.allow("/fish.html"));
|
assert_eq!(true, machine.allow("/fish.html"));
|
||||||
assert_eq!(true, machine.allow("/fish/salmon.html"));
|
assert_eq!(true, machine.allow("/fish/salmon.html"));
|
||||||
|
@ -505,7 +546,7 @@ mod tests {
|
||||||
assert_eq!(false, machine.allow("/catfish"));
|
assert_eq!(false, machine.allow("/catfish"));
|
||||||
assert_eq!(false, machine.allow("/?id=fish"));
|
assert_eq!(false, machine.allow("/?id=fish"));
|
||||||
|
|
||||||
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*")]);
|
let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/fish*")]);
|
||||||
assert_eq!(true, machine.allow("/fish"));
|
assert_eq!(true, machine.allow("/fish"));
|
||||||
assert_eq!(true, machine.allow("/fish.html"));
|
assert_eq!(true, machine.allow("/fish.html"));
|
||||||
assert_eq!(true, machine.allow("/fish/salmon.html"));
|
assert_eq!(true, machine.allow("/fish/salmon.html"));
|
||||||
|
@ -516,7 +557,7 @@ mod tests {
|
||||||
assert_eq!(false, machine.allow("/catfish"));
|
assert_eq!(false, machine.allow("/catfish"));
|
||||||
assert_eq!(false, machine.allow("/?id=fish"));
|
assert_eq!(false, machine.allow("/?id=fish"));
|
||||||
|
|
||||||
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish/")]);
|
let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/fish/")]);
|
||||||
assert_eq!(true, machine.allow("/fish/"));
|
assert_eq!(true, machine.allow("/fish/"));
|
||||||
assert_eq!(true, machine.allow("/fish/?id=anything"));
|
assert_eq!(true, machine.allow("/fish/?id=anything"));
|
||||||
assert_eq!(true, machine.allow("/fish/salmon.htm"));
|
assert_eq!(true, machine.allow("/fish/salmon.htm"));
|
||||||
|
@ -524,7 +565,7 @@ mod tests {
|
||||||
assert_eq!(false, machine.allow("/fish.html"));
|
assert_eq!(false, machine.allow("/fish.html"));
|
||||||
assert_eq!(false, machine.allow("/Fish/Salmon.asp"));
|
assert_eq!(false, machine.allow("/Fish/Salmon.asp"));
|
||||||
|
|
||||||
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php")]);
|
let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/*.php")]);
|
||||||
assert_eq!(true, machine.allow("/filename.php"));
|
assert_eq!(true, machine.allow("/filename.php"));
|
||||||
assert_eq!(true, machine.allow("/folder/filename.php"));
|
assert_eq!(true, machine.allow("/folder/filename.php"));
|
||||||
assert_eq!(true, machine.allow("/folder/filename.php?parameters"));
|
assert_eq!(true, machine.allow("/folder/filename.php?parameters"));
|
||||||
|
@ -533,7 +574,7 @@ mod tests {
|
||||||
assert_eq!(false, machine.allow("/"));
|
assert_eq!(false, machine.allow("/"));
|
||||||
assert_eq!(false, machine.allow("/windows.PHP"));
|
assert_eq!(false, machine.allow("/windows.PHP"));
|
||||||
|
|
||||||
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php$")]);
|
let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/*.php$")]);
|
||||||
assert_eq!(true, machine.allow("/filename.php"));
|
assert_eq!(true, machine.allow("/filename.php"));
|
||||||
assert_eq!(true, machine.allow("/folder/filename.php"));
|
assert_eq!(true, machine.allow("/folder/filename.php"));
|
||||||
assert_eq!(false, machine.allow("/filename.php?parameters"));
|
assert_eq!(false, machine.allow("/filename.php?parameters"));
|
||||||
|
@ -541,7 +582,7 @@ mod tests {
|
||||||
assert_eq!(false, machine.allow("/filename.php5"));
|
assert_eq!(false, machine.allow("/filename.php5"));
|
||||||
assert_eq!(false, machine.allow("/windows.PHP"));
|
assert_eq!(false, machine.allow("/windows.PHP"));
|
||||||
|
|
||||||
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*.php")]);
|
let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/fish*.php")]);
|
||||||
assert_eq!(true, machine.allow("/fish.php"));
|
assert_eq!(true, machine.allow("/fish.php"));
|
||||||
assert_eq!(true, machine.allow("/fishheads/catfish.php?parameters"));
|
assert_eq!(true, machine.allow("/fishheads/catfish.php?parameters"));
|
||||||
assert_eq!(false, machine.allow("/Fish.PHP"));
|
assert_eq!(false, machine.allow("/Fish.PHP"));
|
||||||
|
|
38
src/parse.rs
38
src/parse.rs
|
@ -21,10 +21,10 @@ enum ParsedRule {
|
||||||
impl<'a> Into<Rule<'a>> for &'a ParsedRule {
|
impl<'a> Into<Rule<'a>> for &'a ParsedRule {
|
||||||
fn into(self) -> Rule<'a> {
|
fn into(self) -> Rule<'a> {
|
||||||
match self {
|
match self {
|
||||||
ParsedRule::Allow(path) => Rule::Allow(&path[..]),
|
ParsedRule::Allow(path) => Rule::Allow(path.as_bytes()),
|
||||||
ParsedRule::Disallow(path) => Rule::Disallow(&path[..]),
|
ParsedRule::Disallow(path) => Rule::Disallow(path.as_bytes()),
|
||||||
#[cfg(feature = "crawl-delay")]
|
#[cfg(feature = "crawl-delay")]
|
||||||
ParsedRule::Delay(delay) => Rule::Delay(delay),
|
ParsedRule::Delay(delay) => Rule::Delay(delay.as_bytes()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -453,4 +453,36 @@ mod tests {
|
||||||
assert_eq!(true, imabot_machine.allow("/index.html"));
|
assert_eq!(true, imabot_machine.allow("/index.html"));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_unicode_support() {
|
||||||
|
tokio_test::block_on(async {
|
||||||
|
// From: wikipedia.org/robots.txt
|
||||||
|
let example_robots = r#"
|
||||||
|
User-agent: test
|
||||||
|
Disallow: /wiki/ויקיפדיה:רשימת_מועמדים_למחיקה/
|
||||||
|
Disallow: /wiki/ויקיפדיה%3Aרשימת_מועמדים_למחיקה/
|
||||||
|
Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94:%D7%A8%D7%A9%D7%99%D7%9E%D7%AA_%D7%9E%D7%95%D7%A2%D7%9E%D7%93%D7%99%D7%9D_%D7%9C%D7%9E%D7%97%D7%99%D7%A7%D7%94/
|
||||||
|
Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94%3A%D7%A8%D7%A9%D7%99%D7%9E%D7%AA_%D7%9E%D7%95%D7%A2%D7%9E%D7%93%D7%99%D7%9D_%D7%9C%D7%9E%D7%97%D7%99%D7%A7%D7%94/
|
||||||
|
Disallow: /wiki/ויקיפדיה:ערכים_לא_קיימים_ומוגנים
|
||||||
|
Disallow: /wiki/ויקיפדיה%3Aערכים_לא_קיימים_ומוגנים
|
||||||
|
Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94:%D7%A2%D7%A8%D7%9B%D7%99%D7%9D_%D7%9C%D7%90_%D7%A7%D7%99%D7%99%D7%9E%D7%99%D7%9D_%D7%95%D7%9E%D7%95%D7%92%D7%A0%D7%99%D7%9D
|
||||||
|
Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94%3A%D7%A2%D7%A8%D7%9B%D7%99%D7%9D_%D7%9C%D7%90_%D7%A7%D7%99%D7%99%D7%9E%D7%99%D7%9D_%D7%95%D7%9E%D7%95%D7%92%D7%A0%D7%99%D7%9D
|
||||||
|
Disallow: /wiki/ויקיפדיה:דפים_לא_קיימים_ומוגנים
|
||||||
|
Disallow: /wiki/ויקיפדיה%3Aדפים_לא_קיימים_ומוגנים
|
||||||
|
Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94:%D7%93%D7%A4%D7%99%D7%9D_%D7%9C%D7%90_%D7%A7%D7%99%D7%99%D7%9E%D7%99%D7%9D_%D7%95%D7%9E%D7%95%D7%92%D7%A0%D7%99%D7%9D
|
||||||
|
Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94%3A%D7%93%D7%A4%D7%99%D7%9D_%D7%9C%D7%90_%D7%A7%D7%99%D7%99%D7%9E%D7%99%D7%9D_%D7%95%D7%9E%D7%95%D7%92%D7%A0%D7%99%D7%9D
|
||||||
|
"#
|
||||||
|
.as_bytes();
|
||||||
|
|
||||||
|
let parser = Compiler::new("test");
|
||||||
|
let machine = parser.compile(example_robots).await.unwrap();
|
||||||
|
|
||||||
|
assert_eq!(true, machine.allow("/index.html"));
|
||||||
|
assert_eq!(
|
||||||
|
false,
|
||||||
|
machine.allow("/wiki/ויקיפדיה:ערכים_לא_קיימים_ומוגנים")
|
||||||
|
);
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue