add crawl-delay
This commit is contained in:
parent
86ee746b96
commit
fe11216642
|
@ -11,9 +11,9 @@ keywords = ["robots", "txt", "parse", "compile"]
|
||||||
repository = "https://github.com/crestonbunch/cylon"
|
repository = "https://github.com/crestonbunch/cylon"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
futures-util = "0.3.13"
|
futures-util = "0.3"
|
||||||
serde = "1.0.124"
|
serde = "1.0"
|
||||||
serde_derive = "1.0.124"
|
serde_derive = "1.0"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
criterion = { version = "0.3", features = ["async_futures"] }
|
criterion = { version = "0.3", features = ["async_futures"] }
|
||||||
|
|
|
@ -9,6 +9,7 @@ Disallow: /
|
||||||
Allow: /a
|
Allow: /a
|
||||||
Allow: /abc
|
Allow: /abc
|
||||||
Allow: /b
|
Allow: /b
|
||||||
|
Crawl-Delay: 20
|
||||||
"#
|
"#
|
||||||
.as_bytes();
|
.as_bytes();
|
||||||
|
|
||||||
|
|
1033
src/dfa.rs
1033
src/dfa.rs
File diff suppressed because it is too large
Load Diff
841
src/parse.rs
841
src/parse.rs
|
@ -1,399 +1,442 @@
|
||||||
use super::dfa::{Cylon, Rule};
|
use super::dfa::{Cylon, Rule};
|
||||||
use futures_util::{
|
use futures_util::{
|
||||||
io::{AsyncBufRead, AsyncRead, BufReader, Result},
|
io::{AsyncBufRead, AsyncRead, BufReader, Result},
|
||||||
AsyncBufReadExt,
|
AsyncBufReadExt,
|
||||||
};
|
};
|
||||||
use serde_derive::{Deserialize, Serialize};
|
use serde_derive::{Deserialize, Serialize};
|
||||||
const UA_PREFIX: &str = "user-agent:";
|
const UA_PREFIX: &str = "user-agent:";
|
||||||
const DELAY_PREFIX: &str = "crawl-delay:";
|
const DELAY_PREFIX: &str = "crawl-delay:";
|
||||||
const ALLOW_PREFIX: &str = "allow:";
|
const ALLOW_PREFIX: &str = "allow:";
|
||||||
const DISALLOW_PREFIX: &str = "disallow:";
|
const DISALLOW_PREFIX: &str = "disallow:";
|
||||||
|
|
||||||
/// A single directive parsed from the body of a robots.txt group.
///
/// Every variant owns the trimmed text that followed the directive's prefix
/// on its line; no further validation is performed at parse time.
#[derive(Debug, PartialEq, Clone)]
enum ParsedRule {
    /// An `allow:` line and the path pattern it names.
    Allow(String),
    /// A `disallow:` line and the path pattern it names.
    Disallow(String),
    /// A `crawl-delay:` line. The value is kept as the raw trimmed text
    /// because `parse_line` builds it straight from the `&str` returned by
    /// `parse_delay`.
    Delay(String),
}
|
||||||
|
|
||||||
impl<'a> Into<Rule<'a>> for &'a ParsedRule {
|
impl<'a> Into<Rule<'a>> for &'a ParsedRule {
|
||||||
fn into(self) -> Rule<'a> {
|
fn into(self) -> Rule<'a> {
|
||||||
match self {
|
match self {
|
||||||
ParsedRule::Allow(path) => Rule::Allow(&path[..]),
|
ParsedRule::Allow(path) => Rule::Allow(&path[..]),
|
||||||
ParsedRule::Disallow(path) => Rule::Disallow(&path[..]),
|
ParsedRule::Disallow(path) => Rule::Disallow(&path[..]),
|
||||||
ParsedRule::Delay(delay) => Rule.Delay(delay),
|
ParsedRule::Delay(delay) => Rule::Delay(delay),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
enum ParsedLine {
|
enum ParsedLine {
|
||||||
UserAgent(String),
|
UserAgent(String),
|
||||||
Rule(ParsedRule),
|
Rule(ParsedRule),
|
||||||
Nothing,
|
Nothing,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A compiler takes an input robots.txt file and outputs a compiled Cylon,
|
/// A compiler takes an input robots.txt file and outputs a compiled Cylon,
|
||||||
/// which can be used to efficiently match a large number of paths against
|
/// which can be used to efficiently match a large number of paths against
|
||||||
/// the robots.txt file.
|
/// the robots.txt file.
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
pub struct Compiler {
|
pub struct Compiler {
|
||||||
user_agent: String,
|
user_agent: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Compiler {
|
impl Compiler {
|
||||||
/// Build a new compiler that parses rules for the given user agent from
|
/// Build a new compiler that parses rules for the given user agent from
|
||||||
/// a robots.txt file.
|
/// a robots.txt file.
|
||||||
pub fn new(user_agent: &str) -> Self {
|
pub fn new(user_agent: &str) -> Self {
|
||||||
Self {
|
Self {
|
||||||
user_agent: user_agent.to_lowercase(),
|
user_agent: user_agent.to_lowercase(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parse an input robots.txt file into a Cylon that can recognize
|
/// Parse an input robots.txt file into a Cylon that can recognize
|
||||||
/// whether or not a path matches the rules for the Parser's user agent.
|
/// whether or not a path matches the rules for the Parser's user agent.
|
||||||
pub async fn compile<R: AsyncRead + Unpin>(&self, file: R) -> Result<Cylon> {
|
pub async fn compile<R: AsyncRead + Unpin>(&self, file: R) -> Result<Cylon> {
|
||||||
let reader = BufReader::new(file);
|
let reader = BufReader::new(file);
|
||||||
let mut agent = String::new();
|
let mut agent = String::new();
|
||||||
let mut rules: Vec<ParsedRule> = vec![];
|
let mut rules: Vec<ParsedRule> = vec![];
|
||||||
let mut group_reader = GroupReader::new(reader);
|
let mut group_reader = GroupReader::new(reader);
|
||||||
|
|
||||||
// find the most specific matching group in the robots file
|
// find the most specific matching group in the robots file
|
||||||
while let Some(agents) = group_reader.next_header().await? {
|
while let Some(agents) = group_reader.next_header().await? {
|
||||||
let matching_agent = agents.iter().find(|a| {
|
let matching_agent = agents.iter().find(|a| {
|
||||||
let matches = &a[..] == "*" || self.user_agent.contains(*a);
|
let matches = &a[..] == "*" || self.user_agent.contains(*a);
|
||||||
let more_specific = a.len() > agent.len();
|
let more_specific = a.len() > agent.len();
|
||||||
matches && more_specific
|
matches && more_specific
|
||||||
});
|
});
|
||||||
|
|
||||||
if let Some(matching_agent) = matching_agent {
|
if let Some(matching_agent) = matching_agent {
|
||||||
agent = matching_agent.clone();
|
agent = matching_agent.clone();
|
||||||
rules = group_reader.next_rules().await?;
|
rules = group_reader.next_rules().await?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let rules = rules.iter().map(|r| r.into()).collect();
|
let rules = rules.iter().map(|r| r.into()).collect();
|
||||||
Ok(Cylon::compile(rules))
|
Ok(Cylon::compile(rules))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct GroupReader<R: AsyncBufRead + Unpin> {
|
struct GroupReader<R: AsyncBufRead + Unpin> {
|
||||||
parsing_agents: bool,
|
parsing_agents: bool,
|
||||||
agents: Vec<String>,
|
agents: Vec<String>,
|
||||||
rules: Vec<ParsedRule>,
|
rules: Vec<ParsedRule>,
|
||||||
reader: R,
|
reader: R,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<R: AsyncBufRead + Unpin> GroupReader<R> {
|
impl<R: AsyncBufRead + Unpin> GroupReader<R> {
|
||||||
fn new(reader: R) -> Self {
|
fn new(reader: R) -> Self {
|
||||||
Self {
|
Self {
|
||||||
parsing_agents: true,
|
parsing_agents: true,
|
||||||
agents: vec![],
|
agents: vec![],
|
||||||
rules: vec![],
|
rules: vec![],
|
||||||
reader,
|
reader,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Scan forward until the next group header defined by one or more
|
/// Scan forward until the next group header defined by one or more
|
||||||
/// user agent lines. This lets us optimize the lines we need to copy
|
/// user agent lines. This lets us optimize the lines we need to copy
|
||||||
/// so we can skip over groups that don't match the desired user agent.
|
/// so we can skip over groups that don't match the desired user agent.
|
||||||
async fn next_header(&mut self) -> Result<Option<Vec<String>>> {
|
async fn next_header(&mut self) -> Result<Option<Vec<String>>> {
|
||||||
let mut buf = String::new();
|
let mut buf = String::new();
|
||||||
while self.reader.read_line(&mut buf).await? != 0 {
|
while self.reader.read_line(&mut buf).await? != 0 {
|
||||||
let parsed_line = parse_line(buf.clone());
|
let parsed_line = parse_line(buf.clone());
|
||||||
|
|
||||||
match parsed_line {
|
match parsed_line {
|
||||||
ParsedLine::UserAgent(ua) if self.parsing_agents => {
|
ParsedLine::UserAgent(ua) if self.parsing_agents => {
|
||||||
self.agents.push(ua);
|
self.agents.push(ua);
|
||||||
}
|
}
|
||||||
ParsedLine::UserAgent(ua) => {
|
ParsedLine::UserAgent(ua) => {
|
||||||
self.agents = vec![ua];
|
self.agents = vec![ua];
|
||||||
self.rules = vec![];
|
self.rules = vec![];
|
||||||
self.parsing_agents = true;
|
self.parsing_agents = true;
|
||||||
}
|
}
|
||||||
ParsedLine::Rule(rule) if self.parsing_agents => {
|
ParsedLine::Rule(rule) if self.parsing_agents => {
|
||||||
// Preserve the rule in case we need it in next_rules().
|
// Preserve the rule in case we need it in next_rules().
|
||||||
self.rules.push(rule);
|
self.rules.push(rule);
|
||||||
self.parsing_agents = false;
|
self.parsing_agents = false;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// Skip over lines until we get to the next user agent.
|
// Skip over lines until we get to the next user agent.
|
||||||
ParsedLine::Rule(..) => (),
|
ParsedLine::Rule(..) => (),
|
||||||
ParsedLine::Nothing => (),
|
ParsedLine::Nothing => (),
|
||||||
}
|
}
|
||||||
|
|
||||||
buf.clear();
|
buf.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
let agents = self.agents.clone();
|
let agents = self.agents.clone();
|
||||||
self.agents = vec![];
|
self.agents = vec![];
|
||||||
|
|
||||||
if agents.is_empty() {
|
if agents.is_empty() {
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(Some(agents))
|
Ok(Some(agents))
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn next_rules(&mut self) -> Result<Vec<ParsedRule>> {
|
async fn next_rules(&mut self) -> Result<Vec<ParsedRule>> {
|
||||||
let mut buf = String::new();
|
let mut buf = String::new();
|
||||||
while self.reader.read_line(&mut buf).await? != 0 {
|
while self.reader.read_line(&mut buf).await? != 0 {
|
||||||
let parsed_line = parse_line(buf.clone());
|
let parsed_line = parse_line(buf.clone());
|
||||||
|
|
||||||
match parsed_line {
|
match parsed_line {
|
||||||
ParsedLine::Rule(rule) => {
|
ParsedLine::Rule(rule) => {
|
||||||
self.rules.push(rule);
|
self.rules.push(rule);
|
||||||
self.parsing_agents = false;
|
self.parsing_agents = false;
|
||||||
}
|
}
|
||||||
ParsedLine::UserAgent(ua) if !self.parsing_agents => {
|
ParsedLine::UserAgent(ua) if !self.parsing_agents => {
|
||||||
// Preserve the agent in case we need it in next_agents().
|
// Preserve the agent in case we need it in next_agents().
|
||||||
self.agents.push(ua);
|
self.agents.push(ua);
|
||||||
self.parsing_agents = true;
|
self.parsing_agents = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// Skip over lines until we get to the next rule.
|
// Skip over lines until we get to the next rule.
|
||||||
ParsedLine::UserAgent(..) => (),
|
ParsedLine::UserAgent(..) => (),
|
||||||
ParsedLine::Nothing => (),
|
ParsedLine::Nothing => (),
|
||||||
}
|
}
|
||||||
|
|
||||||
buf.clear();
|
buf.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
let rules = self.rules.clone();
|
let rules = self.rules.clone();
|
||||||
self.rules = vec![];
|
self.rules = vec![];
|
||||||
Ok(rules)
|
Ok(rules)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_line(line: String) -> ParsedLine {
|
fn parse_line(line: String) -> ParsedLine {
|
||||||
let line = strip_comments(&line[..]).trim();
|
let line = strip_comments(&line[..]).trim();
|
||||||
|
|
||||||
// This tries to parse lines roughly in order of most frequent kind to
|
// This tries to parse lines roughly in order of most frequent kind to
|
||||||
// least frequent kind in order to minimize CPU cycles on average.
|
// least frequent kind in order to minimize CPU cycles on average.
|
||||||
parse_disallow(line)
|
parse_disallow(line)
|
||||||
.map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into())))
|
.map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into())))
|
||||||
.or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase())))
|
.or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase())))
|
||||||
.or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into()))))
|
.or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into()))))
|
||||||
.or_else(|| parse_delay(line).map(|s| ParsedLine::Rule(ParsedRule::Delay(s.into()))))
|
.or_else(|| parse_delay(line).map(|s| ParsedLine::Rule(ParsedRule::Delay(s.into()))))
|
||||||
.unwrap_or(ParsedLine::Nothing)
|
.unwrap_or(ParsedLine::Nothing)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the part of `line` that precedes the first `#`, or the whole line
/// when it contains no `#`.
///
/// The result is not trimmed; whitespace before the `#` is preserved.
fn strip_comments(line: &str) -> &str {
    match line.find('#') {
        Some(comment_start) => &line[..comment_start],
        None => line,
    }
}
|
||||||
|
|
||||||
fn parse_user_agent(line: &str) -> Option<&str> {
|
fn parse_user_agent(line: &str) -> Option<&str> {
|
||||||
if line.len() < UA_PREFIX.len() {
|
if line.len() < UA_PREFIX.len() {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
let prefix = &line[..UA_PREFIX.len()].to_ascii_lowercase();
|
let prefix = &line[..UA_PREFIX.len()].to_ascii_lowercase();
|
||||||
let suffix = &line[UA_PREFIX.len()..];
|
let suffix = &line[UA_PREFIX.len()..];
|
||||||
|
|
||||||
if prefix == UA_PREFIX {
|
if prefix == UA_PREFIX {
|
||||||
Some(suffix.trim())
|
Some(suffix.trim())
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_delay(line: &str) -> Option<u64> {
|
fn parse_delay(line: &str) -> Option<&str> {
|
||||||
if line.len() < DELAY_PREFIX.len() {
|
if line.len() < DELAY_PREFIX.len() {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
|
||||||
let prefix = &line[..DELAY_PREFIX.len()].to_ascii_lowercase();
|
let prefix = &line[..DELAY_PREFIX.len()].to_ascii_lowercase();
|
||||||
let suffix = &line[DELAY_PREFIX.len()..];
|
let suffix = &line[DELAY_PREFIX.len()..];
|
||||||
if prefix == DELAY_PREFIX {
|
if prefix == DELAY_PREFIX {
|
||||||
Some(suffix.trim())
|
Some(suffix.trim())
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_allow(line: &str) -> Option<&str> {
|
fn parse_allow(line: &str) -> Option<&str> {
|
||||||
if line.len() < ALLOW_PREFIX.len() {
|
if line.len() < ALLOW_PREFIX.len() {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
let prefix = &line[..ALLOW_PREFIX.len()].to_ascii_lowercase();
|
let prefix = &line[..ALLOW_PREFIX.len()].to_ascii_lowercase();
|
||||||
let suffix = &line[ALLOW_PREFIX.len()..];
|
let suffix = &line[ALLOW_PREFIX.len()..];
|
||||||
|
|
||||||
if prefix == ALLOW_PREFIX {
|
if prefix == ALLOW_PREFIX {
|
||||||
Some(suffix.trim())
|
Some(suffix.trim())
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_disallow(line: &str) -> Option<&str> {
|
fn parse_disallow(line: &str) -> Option<&str> {
|
||||||
if line.len() < DISALLOW_PREFIX.len() {
|
if line.len() < DISALLOW_PREFIX.len() {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
let prefix = &line[..DISALLOW_PREFIX.len()].to_ascii_lowercase();
|
let prefix = &line[..DISALLOW_PREFIX.len()].to_ascii_lowercase();
|
||||||
let suffix = &line[DISALLOW_PREFIX.len()..];
|
let suffix = &line[DISALLOW_PREFIX.len()..];
|
||||||
|
|
||||||
if prefix == DISALLOW_PREFIX {
|
if prefix == DISALLOW_PREFIX {
|
||||||
Some(suffix.trim())
|
Some(suffix.trim())
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
mod tests {
    use super::*;

    /// `allow:` lines are recognised regardless of case, padding or comments.
    #[test]
    fn test_parse_allow() {
        let cases = [
            ("Allow: /", "/"),
            ("allow: / # Root with comment", "/"),
            ("ALLOW: /abc/def ", "/abc/def"),
            ("Allow: /abc/def ", "/abc/def"),
            (" Allow: /*/foo", "/*/foo"),
        ];

        for (input, expected) in cases.iter() {
            let want = ParsedLine::Rule(ParsedRule::Allow((*expected).into()));
            assert_eq!(parse_line((*input).into()), want);
        }
    }

    /// `disallow:` lines are recognised regardless of case, padding or comments.
    #[test]
    fn test_parse_disallow() {
        let cases = [
            ("Disallow: /", "/"),
            ("disallow: / # Root with comment", "/"),
            ("DISALLOW: /abc/def ", "/abc/def"),
            ("Disallow: /abc/def ", "/abc/def"),
            (" Disallow: /*/foo", "/*/foo"),
        ];

        for (input, expected) in cases.iter() {
            let want = ParsedLine::Rule(ParsedRule::Disallow((*expected).into()));
            assert_eq!(parse_line((*input).into()), want);
        }
    }

    /// `user-agent:` values come back lowercased.
    #[test]
    fn test_parse_user_agent() {
        let cases = [
            ("User-agent: *", "*"),
            ("user-agent: ImABot # User agent with comment", "imabot"),
            (" USER-AGENT: ImABot ", "imabot"),
        ];

        for (input, expected) in cases.iter() {
            let want = ParsedLine::UserAgent((*expected).into());
            assert_eq!(parse_line((*input).into()), want);
        }
    }

    /// Blank lines, comments and misspelled directives parse to `Nothing`.
    #[test]
    fn test_parse_nothing() {
        let cases = [
            "Useragent: *",
            "# Comment",
            "",
            " ",
            "\t",
            "alow: /",
            "disalow: /",
        ];

        for input in cases.iter() {
            assert_eq!(parse_line((*input).into()), ParsedLine::Nothing);
        }
    }

    /// Each agent gets the crawl delay declared in its own group, if any.
    #[test]
    fn test_crawl_delay() {
        tokio_test::block_on(async {
            let robots = r#"
            User-agent: jones-bot
            Disallow: /
            Crawl-Delay: 30

            User-agent: foobar
            Crawl-Delay: 60

            User-agent: googlebot
            Allow: /

            User-agent: barfoo
            Crawl-Delay: 60
            Crawl-Delay: 20
            "#
            .as_bytes();

            let foobar = Compiler::new("foobar").compile(robots).await.unwrap();
            let googlebot = Compiler::new("googlebot").compile(robots).await.unwrap();
            let barfoo = Compiler::new("barfoo").compile(robots).await.unwrap();
            let jonesbot = Compiler::new("jones-bot").compile(robots).await.unwrap();

            assert_eq!(Some(60), foobar.delay());
            assert_eq!(Some(20), barfoo.delay());
            assert_eq!(Some(30), jonesbot.delay());
            assert_eq!(None, googlebot.delay());
        });
    }

    /// The most specific matching group decides whether a path is allowed.
    #[test]
    fn test_end_to_end() {
        tokio_test::block_on(async {
            let robots = r#"
            User-agent: jones-bot
            Disallow: /

            User-agent: foo
            Allow: /
            Crawl-Delay: 20

            User-agent: jones
            User-agent: foobar
            Allow: /

            User-agent: *
            Disallow: /
            "#
            .as_bytes();

            let foobar = Compiler::new("foobar").compile(robots).await.unwrap();
            let jonesbot = Compiler::new("jones-bot").compile(robots).await.unwrap();
            let imabot = Compiler::new("imabot").compile(robots).await.unwrap();
            let abc = Compiler::new("abc").compile(robots).await.unwrap();

            assert!(foobar.allow("/index.html"));
            assert!(!jonesbot.allow("/index.html"));
            assert!(!imabot.allow("/index.html"));
            assert!(!abc.allow("/index.html"));
        });
    }

    /// A rule that appears before any user-agent line.
    #[test]
    fn test_invalid_1() {
        tokio_test::block_on(async {
            let robots = r#"
            # Instead of treating this as an error, we'll just consider
            # this behavior undefined.
            Allow: /

            User-agent: jones
            User-agent: foobar
            Disallow: /
            "#
            .as_bytes();

            let foobar = Compiler::new("foobar").compile(robots).await.unwrap();
            let imabot = Compiler::new("imabot").compile(robots).await.unwrap();

            // Everything is allowed because next_header() returns None
            assert!(foobar.allow("/index.html"));
            assert!(imabot.allow("/index.html"));
        });
    }

    /// A trailing user-agent line that has no rules after it.
    #[test]
    fn test_invalid_2() {
        tokio_test::block_on(async {
            let robots = r#"
            User-agent: jones
            User-agent: foobar
            Disallow: /

            # Instead of treating this as an error, we consider this
            # behavior undefined.
            User-agent: imabot
            "#
            .as_bytes();

            let foobar = Compiler::new("foobar").compile(robots).await.unwrap();
            let imabot = Compiler::new("imabot").compile(robots).await.unwrap();

            assert!(!foobar.allow("/index.html"));
            assert!(imabot.allow("/index.html"));
        });
    }
}
|
||||||
|
|
Loading…
Reference in New Issue