diff --git a/Cargo.lock b/Cargo.lock
index 09cb268..0dc8578 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -215,7 +215,7 @@ dependencies = [
 [[package]]
 name = "cylon"
-version = "0.1.2"
+version = "0.1.3"
 dependencies = [
  "criterion",
  "futures-util",
diff --git a/Cargo.toml b/Cargo.toml
index 21b0b54..a8cff3d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "cylon"
 description = "An efficient compiler for robots.txt files"
-version = "0.1.2"
+version = "0.1.3"
 authors = ["Creston Bunch "]
 edition = "2018"
diff --git a/src/dfa.rs b/src/dfa.rs
index 7f46d54..299641c 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -45,7 +45,7 @@ enum State {
 /// file. By providing it a URL path, it can decide whether or not
 /// the robots file that compiled it allows or disallows that path in
 /// roughly O(n) time, where n is the length of the path.
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct Cylon {
     states: Vec<State>,
     transitions: Vec<Vec<Transition>>,
@@ -92,8 +92,7 @@ impl Cylon {
         // Follow the EoW transition, if necessary
         let t = &self.transitions[state];
-        t
-            .iter()
+        t.iter()
             .rev()
             .find(|transition| match transition {
                 Transition(Edge::MatchEow, ..) => true,
@@ -210,7 +209,7 @@ impl Cylon {
             states.push(match state {
                 #[cfg(feature = "crawl-delay")]
-                State::Allow | State::Disallow | State::Delay => state,
+                State::Allow | State::Disallow | State::Delay => state,
                 #[cfg(not(feature = "crawl-delay"))]
                 State::Allow | State::Disallow => state,
                 State::Intermediate => states[wildcard_state],
@@ -220,22 +219,19 @@
         #[cfg(feature = "crawl-delay")]
         {
-            let mut delays: Vec<Option<u64>> = rules.iter().filter(|rule| {
-                match rule {
+            let mut delays: Vec<Option<u64>> = rules
+                .iter()
+                .filter(|rule| match rule {
                     Rule::Delay(_) => true,
-                    _ => false
-                }
-            }).map(|r| {
-                r.inner().parse::<u64>().ok()
-            }).collect();
-            delays.sort_unstable_by(|a, b| {
-                match (a, b) {
-                    (None, Some(_)) => Ordering::Greater,
-                    (Some(_), None) => Ordering::Less,
-                    (None, None) => Ordering::Equal,
-                    (Some(aa), Some(bb)) => aa.cmp(bb)
-
-                }
+                    _ => false,
+                })
+                .map(|r| r.inner().parse::<u64>().ok())
+                .collect();
+            delays.sort_unstable_by(|a, b| match (a, b) {
+                (None, Some(_)) => Ordering::Greater,
+                (Some(_), None) => Ordering::Less,
+                (None, None) => Ordering::Equal,
+                (Some(aa), Some(bb)) => aa.cmp(bb),
             });
             Self {
                 delay: *delays.get(0).unwrap_or(&None),
diff --git a/src/parse.rs b/src/parse.rs
index 94b86da..514c7fe 100644
--- a/src/parse.rs
+++ b/src/parse.rs
@@ -39,7 +39,7 @@ enum ParsedLine {
 /// A compiler takes an input robots.txt file and outputs a compiled Cylon,
 /// which can be used to efficiently match a large number of paths against
 /// the robots.txt file.
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct Compiler {
     user_agent: String,
 }
@@ -173,7 +173,7 @@ fn parse_line(line: String) -> ParsedLine {
     // This tries to parse lines roughly in order of most frequent kind to
     // least frequent kind in order to minimize CPU cycles on average.
-
+
     #[cfg(feature = "crawl-delay")]
     return parse_disallow(line)
         .map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into())))
@@ -181,7 +181,7 @@ fn parse_line(line: String) -> ParsedLine {
         .or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into()))))
         .or_else(|| parse_delay(line).map(|s| ParsedLine::Rule(ParsedRule::Delay(s.into()))))
         .unwrap_or(ParsedLine::Nothing);
-
+
    #[cfg(not(feature = "crawl-delay"))]
    return parse_disallow(line)
        .map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into())))
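
For reference, the reformatted crawl-delay block in src/dfa.rs keeps the same behaviour: parsed `Crawl-delay` values are collected as `Option<u64>`, sorted so the smallest successfully parsed value comes first and unparsable values sort last, and the compiled `Cylon` stores `delays.get(0)`. A minimal standalone sketch of that comparator, using made-up input values rather than a real robots.txt:

```rust
use std::cmp::Ordering;

fn main() {
    // Hypothetical parsed Crawl-delay values; `None` stands for a directive
    // whose value failed to parse as a u64.
    let mut delays: Vec<Option<u64>> = vec![Some(10), None, Some(3), Some(7)];

    // Same comparator as in the diff: parsed delays sort before `None`,
    // and among parsed delays the smallest comes first.
    delays.sort_unstable_by(|a, b| match (a, b) {
        (None, Some(_)) => Ordering::Greater,
        (Some(_), None) => Ordering::Less,
        (None, None) => Ordering::Equal,
        (Some(aa), Some(bb)) => aa.cmp(bb),
    });

    // The first entry is the effective delay: the smallest parsed value,
    // or `None` if nothing parsed.
    let delay: Option<u64> = *delays.get(0).unwrap_or(&None);
    assert_eq!(delay, Some(3));
}
```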