diff --git a/Cargo.lock b/Cargo.lock index 9ba33dc..8026f48 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -247,9 +247,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846" +checksum = "8c2dd2df839b57db9ab69c2c9d8f3e8c81984781937fe2807dc6dcf3b2ad2939" dependencies = [ "futures-core", "futures-sink", @@ -257,9 +257,9 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65" +checksum = "15496a72fabf0e62bdc3df11a59a3787429221dd0710ba8ef163d6f7a9112c94" [[package]] name = "futures-executor" @@ -274,15 +274,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500" +checksum = "d71c2c65c57704c32f5241c1223167c2c3294fd34ac020c807ddbe6db287ba59" [[package]] name = "futures-macro" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd" +checksum = "ea405816a5139fb39af82c2beb921d52143f556038378d6db21183a5c37fbfb7" dependencies = [ "proc-macro-hack", "proc-macro2", @@ -292,24 +292,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6" +checksum = "85754d98985841b7d4f5e8e6fbfa4a4ac847916893ec511a2917ccd8525b8bb3" [[package]] name = "futures-task" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86" -dependencies = [ - "once_cell", -] +checksum = "fa189ef211c15ee602667a6fcfe1c1fd9e07d42250d2156382820fba33c9df80" [[package]] name = "futures-util" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b" +checksum = "1812c7ab8aedf8d6f2701a43e1243acdbcc2b36ab26e2ad421eb99ac963d96d1" dependencies = [ "futures-channel", "futures-core", @@ -428,12 +425,6 @@ dependencies = [ "libc", ] -[[package]] -name = "once_cell" -version = "1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13bd41f508810a131401606d54ac32a467c97172d74ba7662562ebba5ad07fa0" - [[package]] name = "oorandom" version = "11.1.3" @@ -606,9 +597,9 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" [[package]] name = "serde" -version = "1.0.123" +version = "1.0.124" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92d5161132722baa40d802cc70b15262b98258453e85e5d1d365c757c73869ae" +checksum = "bd761ff957cb2a45fbb9ab3da6512de9de55872866160b23c25f1a841e99d29f" [[package]] name = "serde_cbor" @@ -622,9 +613,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.123" +version = "1.0.124" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9391c295d64fc0abb2c556bad848f33cb8296276b1ad2677d1ae1ace4f258f31" +checksum = "1800f7693e94e186f5e25a28291ae1570da908aff7d97a095dec1e56ff99069b" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index c82cf6d..8e93acc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,9 @@ license = "MIT" keywords = ["robots", "txt", "parse", "compile"] repository = "https://github.com/crestonbunch/cylon" +[features] +crawl-delay = [] + [dependencies] futures-util = "0.3" serde = "1.0" diff --git a/benches/parse.rs b/benches/parse.rs index 2084b00..ba0851e 100644 --- a/benches/parse.rs +++ b/benches/parse.rs @@ -9,6 +9,7 @@ Disallow: / Allow: /a Allow: /abc Allow: /b +Crawl-Delay: 20 "# .as_bytes(); diff --git a/src/dfa.rs b/src/dfa.rs index 461670e..7f46d54 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -1,9 +1,14 @@ +#[cfg(feature = "crawl-delay")] +use std::cmp::Ordering; + use serde_derive::{Deserialize, Serialize}; #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] pub enum Rule<'a> { Allow(&'a str), Disallow(&'a str), + #[cfg(feature = "crawl-delay")] + Delay(&'a str), } impl<'a> Rule<'a> { @@ -11,6 +16,8 @@ impl<'a> Rule<'a> { match self { Rule::Allow(inner) => inner, Rule::Disallow(inner) => inner, + #[cfg(feature = "crawl-delay")] + Rule::Delay(inner) => inner, } } } @@ -29,6 +36,8 @@ struct Transition(Edge, usize); enum State { Allow, Disallow, + #[cfg(feature = "crawl-delay")] + Delay, Intermediate, } @@ -40,12 +49,31 @@ enum State { pub struct Cylon { states: Vec, transitions: Vec>, + #[cfg(feature = "crawl-delay")] + delay: Option, } impl Cylon { + #[cfg(feature = "crawl-delay")] + pub fn delay(&self) -> Option { + self.delay + } + /// Match whether the rules allow or disallow the target path. pub fn allow(&self, path: &str) -> bool { - let mut state = path.chars().fold(2, |state, path_char| { + match self.states[self.state(path)] { + State::Allow => true, + State::Disallow => false, + // Intermediate states are not preserved in the DFA + #[cfg(feature = "crawl-delay")] + State::Intermediate | State::Delay => unreachable!(), + #[cfg(not(feature = "crawl-delay"))] + State::Intermediate => unreachable!(), + } + } + + fn state(&self, path: &str) -> usize { + let state = path.chars().fold(2, |state, path_char| { let t = &self.transitions[state]; t.iter() .rev() @@ -64,7 +92,7 @@ impl Cylon { // Follow the EoW transition, if necessary let t = &self.transitions[state]; - state = t + t .iter() .rev() .find(|transition| match transition { @@ -73,14 +101,7 @@ impl Cylon { _ => false, }) .map(|Transition(.., next_state)| *next_state) - .unwrap_or(state); - - match self.states[state] { - State::Allow => true, - State::Disallow => false, - // Intermediate states are not preserved in the DFA - State::Intermediate => unreachable!(), - } + .unwrap_or(state) } /// Compile a machine from a list of rules. @@ -114,6 +135,8 @@ impl Cylon { State::Allow => 0, State::Disallow if last_char == Some('$') => wildcard_state, State::Disallow => 1, + #[cfg(feature = "crawl-delay")] + State::Delay => 1, State::Intermediate => wildcard_state, }; @@ -155,6 +178,8 @@ impl Cylon { let state = match (rule, eow) { (Rule::Allow(..), true) => State::Allow, (Rule::Disallow(..), true) => State::Disallow, + #[cfg(feature = "crawl-delay")] + (Rule::Delay(..), true) => State::Delay, _ => State::Intermediate, }; @@ -184,12 +209,42 @@ impl Cylon { }); states.push(match state { + #[cfg(feature = "crawl-delay")] + State::Allow | State::Disallow | State::Delay => state, + #[cfg(not(feature = "crawl-delay"))] State::Allow | State::Disallow => state, State::Intermediate => states[wildcard_state], }); transitions.push(t); } + #[cfg(feature = "crawl-delay")] + { + let mut delays: Vec> = rules.iter().filter(|rule| { + match rule { + Rule::Delay(_) => true, + _ => false + } + }).map(|r| { + r.inner().parse::().ok() + }).collect(); + delays.sort_unstable_by(|a, b| { + match (a, b) { + (None, Some(_)) => Ordering::Greater, + (Some(_), None) => Ordering::Less, + (None, None) => Ordering::Equal, + (Some(aa), Some(bb)) => aa.cmp(bb) + + } + }); + Self { + delay: *delays.get(0).unwrap_or(&None), + states, + transitions, + } + } + + #[cfg(not(feature = "crawl-delay"))] Self { states, transitions, diff --git a/src/parse.rs b/src/parse.rs index e147409..94b86da 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -5,6 +5,8 @@ use futures_util::{ }; use serde_derive::{Deserialize, Serialize}; const UA_PREFIX: &str = "user-agent:"; +#[cfg(feature = "crawl-delay")] +const DELAY_PREFIX: &str = "crawl-delay:"; const ALLOW_PREFIX: &str = "allow:"; const DISALLOW_PREFIX: &str = "disallow:"; @@ -12,6 +14,8 @@ const DISALLOW_PREFIX: &str = "disallow:"; enum ParsedRule { Allow(String), Disallow(String), + #[cfg(feature = "crawl-delay")] + Delay(String), } impl<'a> Into> for &'a ParsedRule { @@ -19,6 +23,8 @@ impl<'a> Into> for &'a ParsedRule { match self { ParsedRule::Allow(path) => Rule::Allow(&path[..]), ParsedRule::Disallow(path) => Rule::Disallow(&path[..]), + #[cfg(feature = "crawl-delay")] + ParsedRule::Delay(delay) => Rule::Delay(delay), } } } @@ -167,11 +173,21 @@ fn parse_line(line: String) -> ParsedLine { // This tries to parse lines roughly in order of most frequent kind to // least frequent kind in order to minimize CPU cycles on average. - parse_disallow(line) + + #[cfg(feature = "crawl-delay")] + return parse_disallow(line) .map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into()))) .or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase()))) .or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into())))) - .unwrap_or(ParsedLine::Nothing) + .or_else(|| parse_delay(line).map(|s| ParsedLine::Rule(ParsedRule::Delay(s.into())))) + .unwrap_or(ParsedLine::Nothing); + + #[cfg(not(feature = "crawl-delay"))] + return parse_disallow(line) + .map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into()))) + .or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase()))) + .or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into())))) + .unwrap_or(ParsedLine::Nothing); } fn strip_comments(line: &str) -> &str { @@ -195,6 +211,21 @@ fn parse_user_agent(line: &str) -> Option<&str> { } } +#[cfg(feature = "crawl-delay")] +fn parse_delay(line: &str) -> Option<&str> { + if line.len() < DELAY_PREFIX.len() { + return None; + } + + let prefix = &line[..DELAY_PREFIX.len()].to_ascii_lowercase(); + let suffix = &line[DELAY_PREFIX.len()..]; + if prefix == DELAY_PREFIX { + Some(suffix.trim()) + } else { + None + } +} + fn parse_allow(line: &str) -> Option<&str> { if line.len() < ALLOW_PREFIX.len() { return None; @@ -293,6 +324,46 @@ mod tests { } } + #[test] + #[cfg(feature = "crawl-delay")] + fn test_crawl_delay() { + tokio_test::block_on(async { + let example_robots = r#" + User-agent: jones-bot + Disallow: / + Crawl-Delay: 30 + + User-agent: foobar + Crawl-Delay: 60 + + User-agent: googlebot + Allow: / + + User-agent: barfoo + Crawl-Delay: 60 + Crawl-Delay: 20 + "# + .as_bytes(); + + let parser = Compiler::new("foobar"); + let foobar_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("googlebot"); + let googlebot_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("barfoo"); + let barfoo_machine = parser.compile(example_robots).await.unwrap(); + + let parser = Compiler::new("jones-bot"); + let jonesbot_machine = parser.compile(example_robots).await.unwrap(); + + assert_eq!(Some(60), foobar_machine.delay()); + assert_eq!(Some(20), barfoo_machine.delay()); + assert_eq!(Some(30), jonesbot_machine.delay()); + assert_eq!(None, googlebot_machine.delay()); + }); + } + #[test] fn test_end_to_end() { tokio_test::block_on(async { @@ -300,6 +371,10 @@ mod tests { User-agent: jones-bot Disallow: / + User-agent: foo + Allow: / + Crawl-Delay: 20 + User-agent: jones User-agent: foobar Allow: /