Fix unicode support by operating on bytes

This commit is contained in:
Creston Bunch 2021-04-15 17:20:59 -05:00
parent b6b30d01c7
commit 380dbb57a5
4 changed files with 159 additions and 86 deletions

2
Cargo.lock generated
View File

@ -215,7 +215,7 @@ dependencies = [
[[package]] [[package]]
name = "cylon" name = "cylon"
version = "0.1.3" version = "0.2.0"
dependencies = [ dependencies = [
"criterion", "criterion",
"futures-util", "futures-util",

View File

@ -1,7 +1,7 @@
[package] [package]
name = "cylon" name = "cylon"
description = "An efficient compiler for robots.txt files" description = "An efficient compiler for robots.txt files"
version = "0.1.3" version = "0.2.0"
authors = ["Creston Bunch <rust@bunch.im>"] authors = ["Creston Bunch <rust@bunch.im>"]
edition = "2018" edition = "2018"

View File

@ -3,16 +3,19 @@ use std::cmp::Ordering;
use serde_derive::{Deserialize, Serialize}; use serde_derive::{Deserialize, Serialize};
const EOW_BYTE: u8 = 36; // '$'
const WILDCARD_BYTE: u8 = 42; // '*'
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Rule<'a> { pub enum Rule<'a> {
Allow(&'a str), Allow(&'a [u8]),
Disallow(&'a str), Disallow(&'a [u8]),
#[cfg(feature = "crawl-delay")] #[cfg(feature = "crawl-delay")]
Delay(&'a str), Delay(&'a [u8]),
} }
impl<'a> Rule<'a> { impl<'a> Rule<'a> {
fn inner(&self) -> &str { fn inner(&self) -> &[u8] {
match self { match self {
Rule::Allow(inner) => inner, Rule::Allow(inner) => inner,
Rule::Disallow(inner) => inner, Rule::Disallow(inner) => inner,
@ -24,7 +27,7 @@ impl<'a> Rule<'a> {
#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)] #[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)]
enum Edge { enum Edge {
MatchChar(char), MatchByte(u8),
MatchAny, MatchAny,
MatchEow, MatchEow,
} }
@ -60,8 +63,8 @@ impl Cylon {
} }
/// Match whether the rules allow or disallow the target path. /// Match whether the rules allow or disallow the target path.
pub fn allow(&self, path: &str) -> bool { pub fn allow<T: AsRef<[u8]>>(&self, path: T) -> bool {
match self.states[self.state(path)] { match self.states[self.state(path.as_ref())] {
State::Allow => true, State::Allow => true,
State::Disallow => false, State::Disallow => false,
// Intermediate states are not preserved in the DFA // Intermediate states are not preserved in the DFA
@ -72,8 +75,8 @@ impl Cylon {
} }
} }
fn state(&self, path: &str) -> usize { fn state(&self, path: &[u8]) -> usize {
let state = path.chars().fold(2, |state, path_char| { let state = path.into_iter().fold(2, |state, path_char| {
let t = &self.transitions[state]; let t = &self.transitions[state];
t.iter() t.iter()
.rev() .rev()
@ -82,7 +85,7 @@ impl Cylon {
.find(|transition| match transition { .find(|transition| match transition {
Transition(Edge::MatchAny, ..) => true, Transition(Edge::MatchAny, ..) => true,
Transition(Edge::MatchEow, ..) => false, Transition(Edge::MatchEow, ..) => false,
Transition(Edge::MatchChar(edge_char), ..) => *edge_char == path_char, Transition(Edge::MatchByte(edge_char), ..) => edge_char == path_char,
}) })
.map(|Transition(.., next_state)| *next_state) .map(|Transition(.., next_state)| *next_state)
// We are guaranteed at least one matching state because of // We are guaranteed at least one matching state because of
@ -118,7 +121,7 @@ impl Cylon {
rules.sort_by(|a, b| Ord::cmp(a.inner(), b.inner())); rules.sort_by(|a, b| Ord::cmp(a.inner(), b.inner()));
let mut queue = vec![("", 0, 0, State::Intermediate)]; let mut queue = vec![(b"" as &[u8], 0, 0, State::Intermediate)];
while !queue.is_empty() { while !queue.is_empty() {
// parent_prefix is the "parent node" in the prefix tree. We are // parent_prefix is the "parent node" in the prefix tree. We are
// going to visit its children by filtering from the list of // going to visit its children by filtering from the list of
@ -128,23 +131,23 @@ impl Cylon {
// that can match any character legally, but is also a prefix // that can match any character legally, but is also a prefix
// (read: ancestor) of the current node. // (read: ancestor) of the current node.
let (parent_prefix, mut wildcard_state, parent_state, state) = queue.remove(0); let (parent_prefix, mut wildcard_state, parent_state, state) = queue.remove(0);
let last_char = parent_prefix.chars().last(); let last_byte = parent_prefix.last();
wildcard_state = match state { wildcard_state = match state {
State::Allow => 0, State::Allow => 0,
State::Disallow if last_char == Some('$') => wildcard_state, State::Disallow if last_byte == Some(&EOW_BYTE) => wildcard_state,
State::Disallow => 1, State::Disallow => 1,
#[cfg(feature = "crawl-delay")] #[cfg(feature = "crawl-delay")]
State::Delay => 1, State::Delay => 1,
State::Intermediate => wildcard_state, State::Intermediate => wildcard_state,
}; };
let mut t = match last_char { let mut t = match last_byte {
Some('$') => { Some(&EOW_BYTE) => {
// The EOW character cannot match anything else // The EOW character cannot match anything else
vec![Transition(Edge::MatchAny, wildcard_state)] vec![Transition(Edge::MatchAny, wildcard_state)]
} }
Some('*') => { Some(&WILDCARD_BYTE) => {
// The wildcard character overrides the wildcard state // The wildcard character overrides the wildcard state
vec![Transition(Edge::MatchAny, transitions.len())] vec![Transition(Edge::MatchAny, transitions.len())]
} }
@ -154,7 +157,7 @@ impl Cylon {
} }
}; };
let mut curr_prefix = ""; let mut curr_prefix: &[u8] = b"";
rules rules
.iter() .iter()
.map(Rule::inner) .map(Rule::inner)
@ -187,19 +190,19 @@ impl Cylon {
// NB: we can predict what state index the child // NB: we can predict what state index the child
// will have before it's even pushed onto the state vec. // will have before it's even pushed onto the state vec.
let child_index = transitions.len() + queue.len(); let child_index = transitions.len() + queue.len();
let edge_char = child_prefix.chars().last().unwrap(); let edge_char = child_prefix.last().unwrap();
let transition = Transition( let transition = Transition(
match edge_char { match *edge_char {
'*' => Edge::MatchAny, WILDCARD_BYTE => Edge::MatchAny,
'$' => Edge::MatchEow, EOW_BYTE => Edge::MatchEow,
c => Edge::MatchChar(c), c => Edge::MatchByte(c),
}, },
child_index, child_index,
); );
// Add transitions from the parent state to the child state // Add transitions from the parent state to the child state
// so that the wildcard character matches are optional. // so that the wildcard character matches are optional.
if last_char == Some('*') { if last_byte == Some(&WILDCARD_BYTE) {
let parent_t = &mut transitions[parent_state]; let parent_t = &mut transitions[parent_state];
parent_t.push(transition); parent_t.push(transition);
} }
@ -225,7 +228,9 @@ impl Cylon {
Rule::Delay(_) => true, Rule::Delay(_) => true,
_ => false, _ => false,
}) })
.map(|r| r.inner().parse::<u64>().ok()) .map(|r| r.inner())
.flat_map(|r| std::str::from_utf8(r).ok())
.map(|r| r.parse::<u64>().ok())
.collect(); .collect();
delays.sort_unstable_by(|a, b| match (a, b) { delays.sort_unstable_by(|a, b| match (a, b) {
(None, Some(_)) => Ordering::Greater, (None, Some(_)) => Ordering::Greater,
@ -260,27 +265,54 @@ mod tests {
Transition(Edge::MatchEow, $x) Transition(Edge::MatchEow, $x)
}; };
($x:expr => $y:expr) => { ($x:expr => $y:expr) => {
Transition(Edge::MatchChar($x), $y) Transition(Edge::MatchByte($x), $y)
};
}
macro_rules! b {
('.') => {
46
};
('/') => {
47
};
('a') => {
97
};
('b') => {
98
};
('c') => {
99
};
('d') => {
100
};
('x') => {
120
};
('y') => {
121
}; };
} }
#[test] #[test]
fn test_compile() { fn test_compile() {
let rules = vec![ let rules = vec![
Rule::Disallow("/"), Rule::Disallow(b"/"),
Rule::Allow("/a"), Rule::Allow(b"/a"),
Rule::Allow("/abc"), Rule::Allow(b"/abc"),
Rule::Allow("/b"), Rule::Allow(b"/b"),
]; ];
let expect_transitions = vec![ let expect_transitions = vec![
vec![t!('*' => 0)], vec![t!('*' => 0)],
vec![t!('*' => 1)], vec![t!('*' => 1)],
vec![t!('*' => 0), t!('/' => 3)], // "" vec![t!('*' => 0), t!(b!('/') => 3)], // ""
vec![t!('*' => 1), t!('a' => 4), t!('b' => 5)], // "/" vec![t!('*' => 1), t!(b!('a') => 4), t!(b!('b') => 5)], // "/"
vec![t!('*' => 0), t!('b' => 6)], // "/a" vec![t!('*' => 0), t!(b!('b') => 6)], // "/a"
vec![t!('*' => 0)], // "/b" vec![t!('*' => 0)], // "/b"
vec![t!('*' => 0), t!('c' => 7)], // "/ab" vec![t!('*' => 0), t!(b!('c') => 7)], // "/ab"
vec![t!('*' => 0)], // "/abc" vec![t!('*' => 0)], // "/abc"
]; ];
@ -302,16 +334,25 @@ mod tests {
#[test] #[test]
fn test_compile_with_wildcard() { fn test_compile_with_wildcard() {
let rules = vec![Rule::Disallow("/"), Rule::Allow("/a"), Rule::Allow("/*.b")]; let rules = vec![
Rule::Disallow(b"/"),
Rule::Allow(b"/a"),
Rule::Allow(b"/*.b"),
];
let expect_transitions = vec![ let expect_transitions = vec![
vec![t!('*' => 0)], vec![t!('*' => 0)],
vec![t!('*' => 1)], vec![t!('*' => 1)],
vec![t!('*' => 0), t!('/' => 3)], // "" vec![t!('*' => 0), t!(b!('/') => 3)], // ""
vec![t!('*' => 1), t!('*' => 4), t!('a' => 5), t!('.' => 6)], // "/" vec![
vec![t!('*' => 4), t!('.' => 6)], // "/*" t!('*' => 1),
t!('*' => 4),
t!(b!('a') => 5),
t!(b!('.') => 6),
], // "/"
vec![t!('*' => 4), t!(b!('.') => 6)], // "/*"
vec![t!('*' => 0)], // "/a" vec![t!('*' => 0)], // "/a"
vec![t!('*' => 1), t!('b' => 7)], // "/*." vec![t!('*' => 1), t!(b!('b') => 7)], // "/*."
vec![t!('*' => 0)], // "/*.b" vec![t!('*' => 0)], // "/*.b"
]; ];
@ -333,14 +374,14 @@ mod tests {
#[test] #[test]
fn test_compile_tricky_wildcard() { fn test_compile_tricky_wildcard() {
let rules = vec![Rule::Disallow("/"), Rule::Allow("/*.")]; let rules = vec![Rule::Disallow(b"/"), Rule::Allow(b"/*.")];
let expect_transitions = vec![ let expect_transitions = vec![
vec![t!('*' => 0)], vec![t!('*' => 0)],
vec![t!('*' => 1)], vec![t!('*' => 1)],
vec![t!('*' => 0), t!('/' => 3)], // "" vec![t!('*' => 0), t!(b!('/') => 3)], // ""
vec![t!('*' => 1), t!('*' => 4), t!('.' => 5)], // "/" vec![t!('*' => 1), t!('*' => 4), t!(b!('.') => 5)], // "/"
vec![t!('*' => 4), t!('.' => 5)], // "/*" vec![t!('*' => 4), t!(b!('.') => 5)], // "/*"
vec![t!('*' => 0)], // "/*." vec![t!('*' => 0)], // "/*."
]; ];
@ -361,23 +402,23 @@ mod tests {
#[test] #[test]
fn test_compile_with_eow() { fn test_compile_with_eow() {
let rules = vec![ let rules = vec![
Rule::Allow("/"), Rule::Allow(b"/"),
Rule::Disallow("/a$"), Rule::Disallow(b"/a$"),
// Note that this rule is nonsensical. It will compile, but // Note that this rule is nonsensical. It will compile, but
// no guarantees are made as to how it's matched. Rules should // no guarantees are made as to how it's matched. Rules should
// use url-encoded strings to escape $. // use url-encoded strings to escape $.
Rule::Disallow("/x$y"), Rule::Disallow(b"/x$y"),
]; ];
let expect_transitions = vec![ let expect_transitions = vec![
vec![t!('*' => 0)], vec![t!('*' => 0)],
vec![t!('*' => 1)], vec![t!('*' => 1)],
vec![t!('*' => 0), t!('/' => 3)], // "" vec![t!('*' => 0), t!(b!('/') => 3)], // ""
vec![t!('*' => 0), t!('a' => 4), t!('x' => 5)], // "/" vec![t!('*' => 0), t!(b!('a') => 4), t!(b!('x') => 5)], // "/"
vec![t!('*' => 0), t!('$' => 6)], // "/a" vec![t!('*' => 0), t!('$' => 6)], // "/a"
vec![t!('*' => 0), t!('$' => 7)], // "/x" vec![t!('*' => 0), t!('$' => 7)], // "/x"
vec![t!('*' => 0)], // "/a$" vec![t!('*' => 0)], // "/a$"
vec![t!('*' => 0), t!('y' => 8)], // "/x$" vec![t!('*' => 0), t!(b!('y') => 8)], // "/x$"
vec![t!('*' => 1)], // "/x$y" vec![t!('*' => 1)], // "/x$y"
]; ];
@ -401,10 +442,10 @@ mod tests {
#[test] #[test]
fn test_allow() { fn test_allow() {
let rules = vec![ let rules = vec![
Rule::Disallow("/"), Rule::Disallow(b"/"),
Rule::Allow("/a"), Rule::Allow(b"/a"),
Rule::Allow("/abc"), Rule::Allow(b"/abc"),
Rule::Allow("/b"), Rule::Allow(b"/b"),
]; ];
let machine = Cylon::compile(rules); let machine = Cylon::compile(rules);
@ -421,9 +462,9 @@ mod tests {
#[test] #[test]
fn test_allow_match_any() { fn test_allow_match_any() {
let rules = vec![ let rules = vec![
Rule::Allow("/"), Rule::Allow(b"/"),
Rule::Disallow("/secret/*.txt"), Rule::Disallow(b"/secret/*.txt"),
Rule::Disallow("/private/*"), Rule::Disallow(b"/private/*"),
]; ];
let machine = Cylon::compile(rules); let machine = Cylon::compile(rules);
@ -442,9 +483,9 @@ mod tests {
#[test] #[test]
fn test_allow_match_eow() { fn test_allow_match_eow() {
let rules = vec![ let rules = vec![
Rule::Allow("/"), Rule::Allow(b"/"),
Rule::Disallow("/ignore$"), Rule::Disallow(b"/ignore$"),
Rule::Disallow("/foo$bar"), Rule::Disallow(b"/foo$bar"),
]; ];
let machine = Cylon::compile(rules); let machine = Cylon::compile(rules);
@ -463,14 +504,14 @@ mod tests {
#[test] #[test]
fn test_allow_more_complicated() { fn test_allow_more_complicated() {
let rules = vec![ let rules = vec![
Rule::Allow("/"), Rule::Allow(b"/"),
Rule::Disallow("/a$"), Rule::Disallow(b"/a$"),
Rule::Disallow("/abc"), Rule::Disallow(b"/abc"),
Rule::Allow("/abc/*"), Rule::Allow(b"/abc/*"),
Rule::Disallow("/foo/bar"), Rule::Disallow(b"/foo/bar"),
Rule::Allow("/*/bar"), Rule::Allow(b"/*/bar"),
Rule::Disallow("/www/*/images"), Rule::Disallow(b"/www/*/images"),
Rule::Allow("/www/public/images"), Rule::Allow(b"/www/public/images"),
]; ];
let machine = Cylon::compile(rules); let machine = Cylon::compile(rules);
@ -494,7 +535,7 @@ mod tests {
// Test cases from: // Test cases from:
// https://developers.google.com/search/reference/robots_txt#group-member-rules // https://developers.google.com/search/reference/robots_txt#group-member-rules
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish")]); let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/fish")]);
assert_eq!(true, machine.allow("/fish")); assert_eq!(true, machine.allow("/fish"));
assert_eq!(true, machine.allow("/fish.html")); assert_eq!(true, machine.allow("/fish.html"));
assert_eq!(true, machine.allow("/fish/salmon.html")); assert_eq!(true, machine.allow("/fish/salmon.html"));
@ -505,7 +546,7 @@ mod tests {
assert_eq!(false, machine.allow("/catfish")); assert_eq!(false, machine.allow("/catfish"));
assert_eq!(false, machine.allow("/?id=fish")); assert_eq!(false, machine.allow("/?id=fish"));
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*")]); let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/fish*")]);
assert_eq!(true, machine.allow("/fish")); assert_eq!(true, machine.allow("/fish"));
assert_eq!(true, machine.allow("/fish.html")); assert_eq!(true, machine.allow("/fish.html"));
assert_eq!(true, machine.allow("/fish/salmon.html")); assert_eq!(true, machine.allow("/fish/salmon.html"));
@ -516,7 +557,7 @@ mod tests {
assert_eq!(false, machine.allow("/catfish")); assert_eq!(false, machine.allow("/catfish"));
assert_eq!(false, machine.allow("/?id=fish")); assert_eq!(false, machine.allow("/?id=fish"));
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish/")]); let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/fish/")]);
assert_eq!(true, machine.allow("/fish/")); assert_eq!(true, machine.allow("/fish/"));
assert_eq!(true, machine.allow("/fish/?id=anything")); assert_eq!(true, machine.allow("/fish/?id=anything"));
assert_eq!(true, machine.allow("/fish/salmon.htm")); assert_eq!(true, machine.allow("/fish/salmon.htm"));
@ -524,7 +565,7 @@ mod tests {
assert_eq!(false, machine.allow("/fish.html")); assert_eq!(false, machine.allow("/fish.html"));
assert_eq!(false, machine.allow("/Fish/Salmon.asp")); assert_eq!(false, machine.allow("/Fish/Salmon.asp"));
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php")]); let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/*.php")]);
assert_eq!(true, machine.allow("/filename.php")); assert_eq!(true, machine.allow("/filename.php"));
assert_eq!(true, machine.allow("/folder/filename.php")); assert_eq!(true, machine.allow("/folder/filename.php"));
assert_eq!(true, machine.allow("/folder/filename.php?parameters")); assert_eq!(true, machine.allow("/folder/filename.php?parameters"));
@ -533,7 +574,7 @@ mod tests {
assert_eq!(false, machine.allow("/")); assert_eq!(false, machine.allow("/"));
assert_eq!(false, machine.allow("/windows.PHP")); assert_eq!(false, machine.allow("/windows.PHP"));
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php$")]); let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/*.php$")]);
assert_eq!(true, machine.allow("/filename.php")); assert_eq!(true, machine.allow("/filename.php"));
assert_eq!(true, machine.allow("/folder/filename.php")); assert_eq!(true, machine.allow("/folder/filename.php"));
assert_eq!(false, machine.allow("/filename.php?parameters")); assert_eq!(false, machine.allow("/filename.php?parameters"));
@ -541,7 +582,7 @@ mod tests {
assert_eq!(false, machine.allow("/filename.php5")); assert_eq!(false, machine.allow("/filename.php5"));
assert_eq!(false, machine.allow("/windows.PHP")); assert_eq!(false, machine.allow("/windows.PHP"));
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*.php")]); let machine = Cylon::compile(vec![Rule::Disallow(b"/"), Rule::Allow(b"/fish*.php")]);
assert_eq!(true, machine.allow("/fish.php")); assert_eq!(true, machine.allow("/fish.php"));
assert_eq!(true, machine.allow("/fishheads/catfish.php?parameters")); assert_eq!(true, machine.allow("/fishheads/catfish.php?parameters"));
assert_eq!(false, machine.allow("/Fish.PHP")); assert_eq!(false, machine.allow("/Fish.PHP"));

View File

@ -21,10 +21,10 @@ enum ParsedRule {
impl<'a> Into<Rule<'a>> for &'a ParsedRule { impl<'a> Into<Rule<'a>> for &'a ParsedRule {
fn into(self) -> Rule<'a> { fn into(self) -> Rule<'a> {
match self { match self {
ParsedRule::Allow(path) => Rule::Allow(&path[..]), ParsedRule::Allow(path) => Rule::Allow(path.as_bytes()),
ParsedRule::Disallow(path) => Rule::Disallow(&path[..]), ParsedRule::Disallow(path) => Rule::Disallow(path.as_bytes()),
#[cfg(feature = "crawl-delay")] #[cfg(feature = "crawl-delay")]
ParsedRule::Delay(delay) => Rule::Delay(delay), ParsedRule::Delay(delay) => Rule::Delay(delay.as_bytes()),
} }
} }
} }
@ -453,4 +453,36 @@ mod tests {
assert_eq!(true, imabot_machine.allow("/index.html")); assert_eq!(true, imabot_machine.allow("/index.html"));
}); });
} }
/// Checks that rules containing non-ASCII (Hebrew) paths compile and match.
///
/// The fixture lists each disallowed path in several spellings: raw UTF-8,
/// raw UTF-8 with the colon written as `%3A`, and fully percent-encoded.
/// The assertions only probe an unrelated path and one raw UTF-8 path.
#[test]
fn test_unicode_support() {
// `block_on` runs the async compile call to completion inside this
// synchronous test.
tokio_test::block_on(async {
// From: wikipedia.org/robots.txt
let example_robots = r#"
User-agent: test
Disallow: /wiki/ויקיפדיה:רשימתועמדים_למחיקה/
Disallow: /wiki/ויקיפדיה%3Aרשימת_מועמדים_למחיקה/
Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94:%D7%A8%D7%A9%D7%99%D7%9E%D7%AA_%D7%9E%D7%95%D7%A2%D7%9E%D7%93%D7%99%D7%9D_%D7%9C%D7%9E%D7%97%D7%99%D7%A7%D7%94/
Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94%3A%D7%A8%D7%A9%D7%99%D7%9E%D7%AA_%D7%9E%D7%95%D7%A2%D7%9E%D7%93%D7%99%D7%9D_%D7%9C%D7%9E%D7%97%D7%99%D7%A7%D7%94/
Disallow: /wiki/ויקיפדיה:ערכים_לא_קיימים_ומוגנים
Disallow: /wiki/ויקיפדיה%3Aערכים_לא_קיימים_ומוגנים
Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94:%D7%A2%D7%A8%D7%9B%D7%99%D7%9D_%D7%9C%D7%90_%D7%A7%D7%99%D7%99%D7%9E%D7%99%D7%9D_%D7%95%D7%9E%D7%95%D7%92%D7%A0%D7%99%D7%9D
Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94%3A%D7%A2%D7%A8%D7%9B%D7%99%D7%9D_%D7%9C%D7%90_%D7%A7%D7%99%D7%99%D7%9E%D7%99%D7%9D_%D7%95%D7%9E%D7%95%D7%92%D7%A0%D7%99%D7%9D
Disallow: /wiki/ויקיפדיה:דפים_לא_קיימים_ומוגנים
Disallow: /wiki/ויקיפדיה%3Aדפים_לא_קיימים_ומוגנים
Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94:%D7%93%D7%A4%D7%99%D7%9D_%D7%9C%D7%90_%D7%A7%D7%99%D7%99%D7%9E%D7%99%D7%9D_%D7%95%D7%9E%D7%95%D7%92%D7%A0%D7%99%D7%9D
Disallow: /wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94%3A%D7%93%D7%A4%D7%99%D7%9D_%D7%9C%D7%90_%D7%A7%D7%99%D7%99%D7%9E%D7%99%D7%9D_%D7%95%D7%9E%D7%95%D7%92%D7%A0%D7%99%D7%9D
"#
.as_bytes();
// "test" is the same name as the fixture's `User-agent` line; presumably
// this selects that group's rules (confirm against `Compiler`).
let parser = Compiler::new("test");
let machine = parser.compile(example_robots).await.unwrap();
// No rule in the fixture covers this path, so it is expected to be allowed.
assert_eq!(true, machine.allow("/index.html"));
// This is the raw UTF-8 spelling of one of the `Disallow` lines above
// and must be rejected.
assert_eq!(
false,
machine.allow("/wiki/ויקיפדיה:ערכים_לא_קיימים_ומוגנים")
);
});
}
} }