diff --git a/Cargo.lock b/Cargo.lock index 9ba33dc..8026f48 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -247,9 +247,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846" +checksum = "8c2dd2df839b57db9ab69c2c9d8f3e8c81984781937fe2807dc6dcf3b2ad2939" dependencies = [ "futures-core", "futures-sink", @@ -257,9 +257,9 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65" +checksum = "15496a72fabf0e62bdc3df11a59a3787429221dd0710ba8ef163d6f7a9112c94" [[package]] name = "futures-executor" @@ -274,15 +274,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500" +checksum = "d71c2c65c57704c32f5241c1223167c2c3294fd34ac020c807ddbe6db287ba59" [[package]] name = "futures-macro" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd" +checksum = "ea405816a5139fb39af82c2beb921d52143f556038378d6db21183a5c37fbfb7" dependencies = [ "proc-macro-hack", "proc-macro2", @@ -292,24 +292,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6" +checksum = "85754d98985841b7d4f5e8e6fbfa4a4ac847916893ec511a2917ccd8525b8bb3" [[package]] name = "futures-task" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86" -dependencies = [ - "once_cell", -] +checksum = "fa189ef211c15ee602667a6fcfe1c1fd9e07d42250d2156382820fba33c9df80" [[package]] name = "futures-util" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b" +checksum = "1812c7ab8aedf8d6f2701a43e1243acdbcc2b36ab26e2ad421eb99ac963d96d1" dependencies = [ "futures-channel", "futures-core", @@ -428,12 +425,6 @@ dependencies = [ "libc", ] -[[package]] -name = "once_cell" -version = "1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13bd41f508810a131401606d54ac32a467c97172d74ba7662562ebba5ad07fa0" - [[package]] name = "oorandom" version = "11.1.3" @@ -606,9 +597,9 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" [[package]] name = "serde" -version = "1.0.123" +version = "1.0.124" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92d5161132722baa40d802cc70b15262b98258453e85e5d1d365c757c73869ae" +checksum = "bd761ff957cb2a45fbb9ab3da6512de9de55872866160b23c25f1a841e99d29f" [[package]] name = "serde_cbor" @@ -622,9 +613,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.123" +version = "1.0.124" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9391c295d64fc0abb2c556bad848f33cb8296276b1ad2677d1ae1ace4f258f31" +checksum = "1800f7693e94e186f5e25a28291ae1570da908aff7d97a095dec1e56ff99069b" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index c82cf6d..795f606 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,9 +11,9 @@ keywords = ["robots", "txt", "parse", "compile"] repository = "https://github.com/crestonbunch/cylon" [dependencies] -futures-util = "0.3" -serde = "1.0" -serde_derive = "1.0" +futures-util = "0.3.13" +serde = "1.0.124" +serde_derive = "1.0.124" [dev-dependencies] criterion = { version = "0.3", features = ["async_futures"] } diff --git a/src/dfa.rs b/src/dfa.rs index 461670e..5e57ea5 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -4,6 +4,7 @@ use serde_derive::{Deserialize, Serialize}; pub enum Rule<'a> { Allow(&'a str), Disallow(&'a str), + Delay(u64), } impl<'a> Rule<'a> { @@ -11,6 +12,7 @@ impl<'a> Rule<'a> { match self { Rule::Allow(inner) => inner, Rule::Disallow(inner) => inner, + Rule::Delay(inner) => inner, } } } diff --git a/src/parse.rs b/src/parse.rs index e147409..921eaba 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -5,6 +5,7 @@ use futures_util::{ }; use serde_derive::{Deserialize, Serialize}; const UA_PREFIX: &str = "user-agent:"; +const DELAY_PREFIX: &str = "crawl-delay:"; const ALLOW_PREFIX: &str = "allow:"; const DISALLOW_PREFIX: &str = "disallow:"; @@ -12,6 +13,7 @@ const DISALLOW_PREFIX: &str = "disallow:"; enum ParsedRule { Allow(String), Disallow(String), + Delay(u64), } impl<'a> Into> for &'a ParsedRule { @@ -19,6 +21,7 @@ impl<'a> Into> for &'a ParsedRule { match self { ParsedRule::Allow(path) => Rule::Allow(&path[..]), ParsedRule::Disallow(path) => Rule::Disallow(&path[..]), + ParsedRule::Delay(delay) => Rule.Delay(delay), } } } @@ -171,6 +174,7 @@ fn parse_line(line: String) -> ParsedLine { .map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into()))) .or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase()))) .or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into())))) + .or_else(|| parse_delay(line).map(|s| ParsedLine::Rule(ParsedRule::Delay(s.into())))) .unwrap_or(ParsedLine::Nothing) } @@ -195,6 +199,20 @@ fn parse_user_agent(line: &str) -> Option<&str> { } } +fn parse_delay(line: &str) -> Option { + if line.len() < DELAY_PREFIX.len() { + return None; + } + + let prefix = &line[..DELAY_PREFIX.len()].to_ascii_lowercase(); + let suffix = &line[DELAY_PREFIX.len()..]; + if prefix == DELAY_PREFIX { + Some(suffix.trim()) + } else { + None + } +} + fn parse_allow(line: &str) -> Option<&str> { if line.len() < ALLOW_PREFIX.len() { return None;