Initial release

This commit is contained in:
Creston Bunch 2021-02-14 18:08:24 -06:00
commit 56cf5d8d91
9 changed files with 1912 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/target

831
Cargo.lock generated Normal file
View File

@ -0,0 +1,831 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "async-stream"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3670df70cbc01729f901f94c887814b3c68db038aad1329a418bae178bc5295c"
dependencies = [
"async-stream-impl",
"futures-core",
]
[[package]]
name = "async-stream-impl"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a3548b8efc9f8e8a5a0a2808c5bd8451a9031b9e5b879a79590304ae928b0a70"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "atty"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
dependencies = [
"hermit-abi",
"libc",
"winapi",
]
[[package]]
name = "autocfg"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
[[package]]
name = "bitflags"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
[[package]]
name = "bstr"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a40b47ad93e1a5404e6c18dec46b628214fee441c70f4ab5d6942142cc268a3d"
dependencies = [
"lazy_static",
"memchr",
"regex-automata",
"serde",
]
[[package]]
name = "bumpalo"
version = "3.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "099e596ef14349721d9016f6b80dd3419ea1bf289ab9b44df8e4dfd3a005d5d9"
[[package]]
name = "byteorder"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae44d1a3d5a19df61dd0c8beb138458ac2a53a7ac09eba97d55592540004306b"
[[package]]
name = "bytes"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b700ce4376041dcd0a327fd0097c41095743c4c8af8887265942faf1100bd040"
[[package]]
name = "cast"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b9434b9a5aa1450faa3f9cb14ea0e8c53bb5d2b3c1bfd1ab4fc03e9f33fbfb0"
dependencies = [
"rustc_version",
]
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "clap"
version = "2.33.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002"
dependencies = [
"bitflags",
"textwrap",
"unicode-width",
]
[[package]]
name = "const_fn"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28b9d6de7f49e22cf97ad17fc4036ece69300032f45f78f30b4a4482cdc3f4a6"
[[package]]
name = "criterion"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab327ed7354547cc2ef43cbe20ef68b988e70b4b593cbd66a2a61733123a3d23"
dependencies = [
"atty",
"cast",
"clap",
"criterion-plot",
"csv",
"futures",
"itertools 0.10.0",
"lazy_static",
"num-traits",
"oorandom",
"plotters",
"rayon",
"regex",
"serde",
"serde_cbor",
"serde_derive",
"serde_json",
"tinytemplate",
"walkdir",
]
[[package]]
name = "criterion-plot"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e022feadec601fba1649cfa83586381a4ad31c6bf3a9ab7d408118b05dd9889d"
dependencies = [
"cast",
"itertools 0.9.0",
]
[[package]]
name = "crossbeam-channel"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775"
dependencies = [
"cfg-if",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9"
dependencies = [
"cfg-if",
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1aaa739f95311c2c7887a76863f500026092fb1dce0161dab577e559ef3569d"
dependencies = [
"cfg-if",
"const_fn",
"crossbeam-utils",
"lazy_static",
"memoffset",
"scopeguard",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02d96d1e189ef58269ebe5b97953da3274d83a93af647c2ddd6f9dab28cedb8d"
dependencies = [
"autocfg",
"cfg-if",
"lazy_static",
]
[[package]]
name = "csv"
version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9d58633299b24b515ac72a3f869f8b91306a3cec616a602843a383acd6f9e97"
dependencies = [
"bstr",
"csv-core",
"itoa",
"ryu",
"serde",
]
[[package]]
name = "csv-core"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
dependencies = [
"memchr",
]
[[package]]
name = "cylon"
version = "0.1.0"
dependencies = [
"criterion",
"futures-util",
"serde",
"serde_derive",
"tokio-test",
]
[[package]]
name = "either"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
[[package]]
name = "futures"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da9052a1a50244d8d5aa9bf55cbc2fb6f357c86cc52e46c62ed390a7180cf150"
dependencies = [
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-channel"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846"
dependencies = [
"futures-core",
"futures-sink",
]
[[package]]
name = "futures-core"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65"
[[package]]
name = "futures-executor"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9e59fdc009a4b3096bf94f740a0f2424c082521f20a9b08c5c07c48d90fd9b9"
dependencies = [
"futures-core",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-io"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500"
[[package]]
name = "futures-macro"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd"
dependencies = [
"proc-macro-hack",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "futures-sink"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6"
[[package]]
name = "futures-task"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86"
dependencies = [
"once_cell",
]
[[package]]
name = "futures-util"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b"
dependencies = [
"futures-channel",
"futures-core",
"futures-io",
"futures-macro",
"futures-sink",
"futures-task",
"memchr",
"pin-project-lite",
"pin-utils",
"proc-macro-hack",
"proc-macro-nested",
"slab",
]
[[package]]
name = "half"
version = "1.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62aca2aba2d62b4a7f5b33f3712cb1b0692779a56fb510499d5c0aa594daeaf3"
[[package]]
name = "hermit-abi"
version = "0.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "322f4de77956e22ed0e5032c359a0f1273f1f7f0d79bfa3b8ffbc730d7fbcc5c"
dependencies = [
"libc",
]
[[package]]
name = "itertools"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b"
dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
[[package]]
name = "js-sys"
version = "0.3.47"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5cfb73131c35423a367daf8cbd24100af0d077668c8c2943f0e7dd775fef0f65"
dependencies = [
"wasm-bindgen",
]
[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7282d924be3275cec7f6756ff4121987bc6481325397dde6ba3e7802b1a8b1c"
[[package]]
name = "log"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710"
dependencies = [
"cfg-if",
]
[[package]]
name = "memchr"
version = "2.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525"
[[package]]
name = "memoffset"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "157b4208e3059a8f9e78d559edc658e13df41410cb3ae03979c83130067fdd87"
dependencies = [
"autocfg",
]
[[package]]
name = "num-traits"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
dependencies = [
"autocfg",
]
[[package]]
name = "num_cpus"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3"
dependencies = [
"hermit-abi",
"libc",
]
[[package]]
name = "once_cell"
version = "1.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13bd41f508810a131401606d54ac32a467c97172d74ba7662562ebba5ad07fa0"
[[package]]
name = "oorandom"
version = "11.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
[[package]]
name = "pin-project-lite"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439697af366c49a6d0a010c56a0d97685bc140ce0d377b13a2ea2aa42d64a827"
[[package]]
name = "pin-utils"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "plotters"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "45ca0ae5f169d0917a7c7f5a9c1a3d3d9598f18f529dd2b8373ed988efea307a"
dependencies = [
"num-traits",
"plotters-backend",
"plotters-svg",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "plotters-backend"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b07fffcddc1cb3a1de753caa4e4df03b79922ba43cf882acc1bdd7e8df9f4590"
[[package]]
name = "plotters-svg"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b38a02e23bd9604b842a812063aec4ef702b57989c37b655254bb61c471ad211"
dependencies = [
"plotters-backend",
]
[[package]]
name = "proc-macro-hack"
version = "0.5.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5"
[[package]]
name = "proc-macro-nested"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086"
[[package]]
name = "proc-macro2"
version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71"
dependencies = [
"unicode-xid",
]
[[package]]
name = "quote"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rayon"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b0d8e0819fadc20c74ea8373106ead0600e3a67ef1fe8da56e39b9ae7275674"
dependencies = [
"autocfg",
"crossbeam-deque",
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ab346ac5921dc62ffa9f89b7a773907511cdfa5490c572ae9be1be33e8afa4a"
dependencies = [
"crossbeam-channel",
"crossbeam-deque",
"crossbeam-utils",
"lazy_static",
"num_cpus",
]
[[package]]
name = "regex"
version = "1.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9251239e129e16308e70d853559389de218ac275b515068abc96829d05b948a"
dependencies = [
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4"
dependencies = [
"byteorder",
]
[[package]]
name = "regex-syntax"
version = "0.6.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5eb417147ba9860a96cfe72a0b93bf88fee1744b5636ec99ab20c1aa9376581"
[[package]]
name = "rustc_version"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
dependencies = [
"semver",
]
[[package]]
name = "ryu"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e"
[[package]]
name = "same-file"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
dependencies = [
"winapi-util",
]
[[package]]
name = "scopeguard"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "semver"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
dependencies = [
"semver-parser",
]
[[package]]
name = "semver-parser"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
[[package]]
name = "serde"
version = "1.0.123"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92d5161132722baa40d802cc70b15262b98258453e85e5d1d365c757c73869ae"
[[package]]
name = "serde_cbor"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e18acfa2f90e8b735b2836ab8d538de304cbb6729a7360729ea5a895d15a622"
dependencies = [
"half",
"serde",
]
[[package]]
name = "serde_derive"
version = "1.0.123"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9391c295d64fc0abb2c556bad848f33cb8296276b1ad2677d1ae1ace4f258f31"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.62"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea1c6153794552ea7cf7cf63b1231a25de00ec90db326ba6264440fa08e31486"
dependencies = [
"itoa",
"ryu",
"serde",
]
[[package]]
name = "slab"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8"
[[package]]
name = "syn"
version = "1.0.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c700597eca8a5a762beb35753ef6b94df201c81cca676604f547495a0d7f0081"
dependencies = [
"proc-macro2",
"quote",
"unicode-xid",
]
[[package]]
name = "textwrap"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
dependencies = [
"unicode-width",
]
[[package]]
name = "tinytemplate"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2ada8616fad06a2d0c455adc530de4ef57605a8120cc65da9653e0e9623ca74"
dependencies = [
"serde",
"serde_json",
]
[[package]]
name = "tokio"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8190d04c665ea9e6b6a0dc45523ade572c088d2e6566244c1122671dbf4ae3a"
dependencies = [
"autocfg",
"pin-project-lite",
]
[[package]]
name = "tokio-stream"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1981ad97df782ab506a1f43bf82c967326960d278acf3bf8279809648c3ff3ea"
dependencies = [
"futures-core",
"pin-project-lite",
"tokio",
]
[[package]]
name = "tokio-test"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c7d205f6f59b03f9e824ac86eaba635a98395f287756ecc8a06464779c399bf"
dependencies = [
"async-stream",
"bytes",
"futures-core",
"tokio",
"tokio-stream",
]
[[package]]
name = "unicode-width"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3"
[[package]]
name = "unicode-xid"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
[[package]]
name = "walkdir"
version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "777182bc735b6424e1a57516d35ed72cb8019d85c8c9bf536dccb3445c1a2f7d"
dependencies = [
"same-file",
"winapi",
"winapi-util",
]
[[package]]
name = "wasm-bindgen"
version = "0.2.70"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55c0f7123de74f0dab9b7d00fd614e7b19349cd1e2f5252bbe9b1754b59433be"
dependencies = [
"cfg-if",
"wasm-bindgen-macro",
]
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.70"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7bc45447f0d4573f3d65720f636bbcc3dd6ce920ed704670118650bcd47764c7"
dependencies = [
"bumpalo",
"lazy_static",
"log",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.70"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b8853882eef39593ad4174dd26fc9865a64e84026d223f63bb2c42affcbba2c"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.70"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4133b5e7f2a531fa413b3a1695e925038a05a71cf67e87dafa295cb645a01385"
dependencies = [
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.70"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd4945e4943ae02d15c13962b38a5b1e81eadd4b71214eee75af64a4d6a4fd64"
[[package]]
name = "web-sys"
version = "0.3.47"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c40dc691fc48003eba817c38da7113c15698142da971298003cac3ef175680b3"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-util"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
dependencies = [
"winapi",
]
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"

22
Cargo.toml Normal file
View File

@ -0,0 +1,22 @@
[package]
name = "cylon"
version = "0.1.0"
authors = ["Creston Bunch <rust@bunch.im>"]
edition = "2018"
categories = ["parser-implementations"]
license = "MIT"
keywords = ["robots", "txt", "parse", "compile"]
[dependencies]
futures-util = "0.3"
serde = "1.0"
serde_derive = "1.0"
[dev-dependencies]
criterion = { version = "0.3", features = ["async_futures"] }
tokio-test = "0.4"
[[bench]]
name = "parse"
harness = false

7
LICENSE Normal file
View File

@ -0,0 +1,7 @@
Copyright © 2021 Creston Bunch
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

97
README.md Normal file
View File

@ -0,0 +1,97 @@
# Cylon
Cylon is a library for reading robots.txt files.
## Features
There is no universal standard for what rules a web crawler
is required to support in a robots.txt file. Cylon supports
the following directives (notably `Sitemap` is missing):
- `User-agent`
- `Allow`
- `Disallow`
In addition, Cylon supports `*` as a wildcard character to
match any length substring of 0 or more characters, as well
as the `$` character to match the end of a path.
## Usage
Using Cylon is very simple. Simply create a new compiler
for your user agent, then compile the robots.txt file.
```rust
// You can use something like hyper or reqwest to download
// the robots.txt file instead.
let example_robots = r#"
User-agent: googlebot
Allow: /
User-agent: *
Disallow: /
"#
.as_bytes();
// Create a new compiler that compiles a robots.txt file looking for
// rules that apply to the "googlebot" user agent.
let compiler = Compiler::new("googlebot");
let cylon = compiler.compile(example_robots).await.unwrap();
assert_eq!(true, cylon.allow("/index.html"));
assert_eq!(true, cylon.allow("/directory"));
// Create a new compiler that compiles a robots.txt file looking for
// rules that apply to the "bing" user agent.
let compiler = Compiler::new("bing");
let cylon = compiler.compile(example_robots).await.unwrap();
assert_eq!(false, cylon.allow("/index.html"));
assert_eq!(false, cylon.allow("/directory"));
```
## Contributing
Contributions are welcome! Please make a pull request. Issues may not
be addressed in a timely manner unless they expose fundamental issues
or security concerns.
## Implementation
### Async
This library uses an async API by default. This library does not assume
any async runtime, so you can use it with any of them (tokio, async-std, etc.).
A synchronous API may be an optional feature in the future, but there
are no current plans to add one. If you need a synchronous API consider
adding one yourself (contributions are welcome).
### Performance
Cylon compiles robots.txt files into very efficient DFAs. This means it is
well-suited for web crawlers that need to use the same robots.txt file for
multiple URLs.
The compiler avoids any random memory access when compiling the DFA (e.g. by
not using hashmaps or tree structures), so it has very good cache locality.
The DFA can match input paths in roughly O(n) time, where n is the length of
the input path. (Compare that to the alternative O(n \* m) complexity of
matching the input path against every rule in the robots.txt file.)
### (De-)serialization
This library uses serde to allow serializing/deserializing the compiled Cylon
DFA structs. This is useful e.g. if you need to cache the DFA in something like
Memcached or Redis. (Use a format like bincode or msgpack to convert it to
bytes first.)
### Error handling
Robots.txt files are more like guidelines than actual rules.
In general, Cylon tries not to cause errors for things that might be considered
an invalid robots.txt file, which means there are very few failure cases.
## License
MIT

70
benches/parse.rs Normal file
View File

@ -0,0 +1,70 @@
use cylon::Compiler;
use criterion::async_executor::FuturesExecutor;
use criterion::{criterion_group, criterion_main, Criterion};
/// Small robots.txt fixture: one wildcard user-agent group holding a single
/// `Disallow` rule followed by three `Allow` rules.
const SMALL_FILE: &[u8] = br#"
User-agent: *
Disallow: /
Allow: /a
Allow: /abc
Allow: /b
"#;
/// Larger robots.txt fixture: one wildcard user-agent group mixing plain
/// prefixes with `*` wildcards and a `$` end-of-path anchor.
///
/// Every directive is written as `Name: value`, including `Allow: /*/bar`,
/// so that each listed rule is a well-formed line.
const LARGE_FILE: &[u8] = r#"
User-agent: *
Allow: /
Disallow: /a$
Disallow: /abc
Allow: /abc/*
Disallow: /foo/bar
Allow: /*/bar
Disallow: /www/*/images
Allow: /www/public/images
"#
.as_bytes();
fn bench(c: &mut Criterion) {
c.bench_function("compile small", |b| {
b.to_async(FuturesExecutor).iter(|| async {
let parser = Compiler::new("ImABot");
parser.compile(SMALL_FILE).await.unwrap();
})
});
c.bench_function("compile large", |b| {
b.to_async(FuturesExecutor).iter(|| async {
let parser = Compiler::new("ImABot");
parser.compile(LARGE_FILE).await.unwrap();
})
});
let parser = Compiler::new("ImABot");
let small_machine = &tokio_test::block_on(parser.compile(SMALL_FILE)).unwrap();
c.bench_function("allow small A", move |b| {
b.iter(|| {
small_machine.allow("/abc");
});
});
c.bench_function("allow small B", move |b| {
b.iter(|| {
small_machine.allow("/www/cat/images");
});
});
let large_machine = &tokio_test::block_on(parser.compile(LARGE_FILE)).unwrap();
c.bench_function("allow large A", move |b| {
b.iter(|| {
large_machine.allow("/abc");
});
});
c.bench_function("allow large B", move |b| {
b.iter(|| {
large_machine.allow("/www/cat/images");
});
});
}
criterion_group!(benches, bench);
criterion_main!(benches);

498
src/dfa.rs Normal file
View File

@ -0,0 +1,498 @@
use serde_derive::{Deserialize, Serialize};
/// One path rule handed to the DFA compiler.
///
/// The wrapped string is the path pattern of the rule. The compiler gives
/// `*` and `$` inside the pattern their wildcard / end-of-path meaning.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Rule<'a> {
    /// Paths matching the pattern are allowed.
    Allow(&'a str),
    /// Paths matching the pattern are disallowed.
    Disallow(&'a str),
}

impl<'a> Rule<'a> {
    /// Returns the path pattern, whichever variant holds it.
    fn inner(&self) -> &str {
        match self {
            Rule::Allow(pattern) | Rule::Disallow(pattern) => pattern,
        }
    }
}
/// Label on a DFA transition, deciding which input the transition accepts.
#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)]
enum Edge {
/// Accepts exactly this character of the path.
MatchChar(char),
/// Accepts any character of the path. It is compiled from `*`, is the first
/// transition of every state, and may also be followed in the final
/// end-of-path step of `Cylon::allow`.
MatchAny,
/// Never accepts a path character; it is only followed once the whole path
/// has been consumed. Compiled from `$`.
MatchEow,
}
/// A labelled edge of the DFA: the `Edge` that must match, and the index of
/// the state to move to (an index into `Cylon::states` and
/// `Cylon::transitions`).
#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)]
struct Transition(Edge, usize);
/// Verdict attached to a DFA state.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
enum State {
/// A path that ends in this state is allowed.
Allow,
/// A path that ends in this state is disallowed.
Disallow,
/// Placeholder used while compiling for a prefix that is not itself the end
/// of a rule. `Cylon::compile` replaces it with a concrete verdict before
/// the state is stored, so `Cylon::allow` treats it as unreachable.
Intermediate,
}
/// A Cylon is a DFA that recognizes rules from a compiled robots.txt
/// file. By providing it a URL path, it can decide whether or not
/// the robots file that compiled it allows or disallows that path in
/// roughly O(n) time, where n is the length of the path.
#[derive(Debug, Serialize, Deserialize)]
pub struct Cylon {
// Verdict of each state, indexed by state number. `Cylon::compile` puts a
// catch-all allow state at index 0 and a catch-all disallow state at index 1;
// index 2 is the root state where `Cylon::allow` starts matching.
states: Vec<State>,
// Outgoing transitions of each state, parallel to `states`. The first entry
// of every list is a `MatchAny` fallback; more specific edges follow it.
transitions: Vec<Vec<Transition>>,
}
impl Cylon {
/// Match whether the rules allow or disallow the target path.
pub fn allow(&self, path: &str) -> bool {
let mut state = path.chars().fold(2, |state, path_char| {
let t = &self.transitions[state];
t.iter()
.rev()
// Pick the last transition to always prioritize MatchChar
// over MatchAny (which will always be the first transition.)
.find(|transition| match transition {
Transition(Edge::MatchAny, ..) => true,
Transition(Edge::MatchEow, ..) => false,
Transition(Edge::MatchChar(edge_char), ..) => *edge_char == path_char,
})
.map(|Transition(.., next_state)| *next_state)
// We are guaranteed at least one matching state because of
// the way the DFA is constructed.
.unwrap()
});
// Follow the EoW transition, if necessary
let t = &self.transitions[state];
state = t
.iter()
.rev()
.find(|transition| match transition {
Transition(Edge::MatchEow, ..) => true,
Transition(Edge::MatchAny, ..) => true,
_ => false,
})
.map(|Transition(.., next_state)| *next_state)
.unwrap_or(state);
match self.states[state] {
State::Allow => true,
State::Disallow => false,
// Intermediate states are not preserved in the DFA
State::Intermediate => unreachable!(),
}
}
/// Compile a machine from a list of rules.
pub fn compile(mut rules: Vec<Rule>) -> Self {
// This algorithm constructs a DFA by doing BFS over the prefix tree of
// paths in the provided list of rules. However, for performance reasons
// it does not actually build a tree structure. (Vecs have better
// cache-locality by avoiding random memory access.)
let mut transitions: Vec<Vec<Transition>> = vec![
vec![Transition(Edge::MatchAny, 0)],
vec![Transition(Edge::MatchAny, 1)],
];
let mut states: Vec<State> = vec![State::Allow, State::Disallow];
rules.sort_by(|a, b| Ord::cmp(a.inner(), b.inner()));
let mut queue = vec![("", 0, 0, State::Intermediate)];
while !queue.is_empty() {
// parent_prefix is the "parent node" in the prefix tree. We are
// going to visit its children by filtering from the list of
// paths only the paths that start with the parent_prefix.
// wildcard_state is a node to jump to when an unmatched character
// is encountered. This is usually a node higher up in the tree
// that can match any character legally, but is also a prefix
// (read: ancestor) of the current node.
let (parent_prefix, mut wildcard_state, parent_state, state) = queue.remove(0);
let last_char = parent_prefix.chars().last();
wildcard_state = match state {
State::Allow => 0,
State::Disallow if last_char == Some('$') => wildcard_state,
State::Disallow => 1,
State::Intermediate => wildcard_state,
};
let mut t = match last_char {
Some('$') => {
// The EOW character cannot match anything else
vec![Transition(Edge::MatchAny, wildcard_state)]
}
Some('*') => {
// The wildcard character overrides the wildcard state
vec![Transition(Edge::MatchAny, transitions.len())]
}
_ => {
// Every other state has a self-loop that matches anything
vec![Transition(Edge::MatchAny, wildcard_state)]
}
};
let mut curr_prefix = "";
rules
.iter()
.map(Rule::inner)
.zip(&rules)
.filter(|(path, _)| (*path).starts_with(parent_prefix))
.filter(|(path, _)| (*path) != parent_prefix)
.for_each(|(path, rule)| {
let child_prefix = &path[0..parent_prefix.len() + 1];
if curr_prefix == child_prefix {
// We only want to visit a child node once, but
// many rules might have the same child_prefix, so
// we skip the duplicates after the first time
// we see a prefix. (This could be a filter(), but
// it's a bit hard to encode earlier in the chain.)
return;
}
curr_prefix = child_prefix;
let eow = child_prefix == path;
let state = match (rule, eow) {
(Rule::Allow(..), true) => State::Allow,
(Rule::Disallow(..), true) => State::Disallow,
_ => State::Intermediate,
};
queue.push((child_prefix, wildcard_state, transitions.len(), state));
// NB: we can predict what state index the child
// will have before it's even pushed onto the state vec.
let child_index = transitions.len() + queue.len();
let edge_char = child_prefix.chars().last().unwrap();
let transition = Transition(
match edge_char {
'*' => Edge::MatchAny,
'$' => Edge::MatchEow,
c => Edge::MatchChar(c),
},
child_index,
);
// Add transitions from the parent state to the child state
// so that the wildcard character matches are optional.
if last_char == Some('*') {
let parent_t = &mut transitions[parent_state];
parent_t.push(transition);
}
t.push(transition);
});
states.push(match state {
State::Allow | State::Disallow => state,
State::Intermediate => states[wildcard_state],
});
transitions.push(t);
}
Self {
states,
transitions,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
// Test shorthand for building a `Transition`.
// `t!('*' => n)` yields a `MatchAny` edge and `t!('$' => n)` a `MatchEow`
// edge; any other expression is wrapped in `Edge::MatchChar`. The two literal
// arms are listed first so that '*' and '$' are not treated as plain chars.
macro_rules! t {
('*' => $x:expr) => {
Transition(Edge::MatchAny, $x)
};
('$' => $x:expr) => {
Transition(Edge::MatchEow, $x)
};
($x:expr => $y:expr) => {
Transition(Edge::MatchChar($x), $y)
};
}
// Compiles a wildcard-free rule set and checks the exact transition table and
// the verdict stored for every state. Indices 0 and 1 are the pre-seeded
// Allow / Disallow states that only loop back to themselves; the trailing
// comments name the rule prefix each later row corresponds to.
#[test]
fn test_compile() {
let rules = vec![
Rule::Disallow("/"),
Rule::Allow("/a"),
Rule::Allow("/abc"),
Rule::Allow("/b"),
];
let expect_transitions = vec![
vec![t!('*' => 0)],
vec![t!('*' => 1)],
vec![t!('*' => 0), t!('/' => 3)], // ""
vec![t!('*' => 1), t!('a' => 4), t!('b' => 5)], // "/"
vec![t!('*' => 0), t!('b' => 6)], // "/a"
vec![t!('*' => 0)], // "/b"
vec![t!('*' => 0), t!('c' => 7)], // "/ab"
vec![t!('*' => 0)], // "/abc"
];
let expect_states = vec![
State::Allow,
State::Disallow,
State::Allow,
State::Disallow,
State::Allow,
State::Allow,
State::Allow,
State::Allow,
];
let actual = Cylon::compile(rules);
assert_eq!(actual.transitions, expect_transitions);
assert_eq!(actual.states, expect_states);
}
// Compiles a rule set containing a `*` in the middle of a pattern and checks
// the resulting tables. The row for "/*" loops to itself on any character,
// and the edge leaving the wildcard ('.') also appears on the parent row "/",
// which makes the wildcard match optional.
#[test]
fn test_compile_with_wildcard() {
let rules = vec![Rule::Disallow("/"), Rule::Allow("/a"), Rule::Allow("/*.b")];
let expect_transitions = vec![
vec![t!('*' => 0)],
vec![t!('*' => 1)],
vec![t!('*' => 0), t!('/' => 3)], // ""
vec![t!('*' => 1), t!('*' => 4), t!('a' => 5), t!('.' => 6)], // "/"
vec![t!('*' => 4), t!('.' => 6)], // "/*"
vec![t!('*' => 0)], // "/a"
vec![t!('*' => 1), t!('b' => 7)], // "/*."
vec![t!('*' => 0)], // "/*.b"
];
let expect_states = vec![
State::Allow,
State::Disallow,
State::Allow,
State::Disallow,
State::Disallow,
State::Allow,
State::Disallow,
State::Allow,
];
let actual = Cylon::compile(rules);
assert_eq!(actual.transitions, expect_transitions);
assert_eq!(actual.states, expect_states);
}
// Compiles a pattern whose wildcard is followed by a single terminating
// character ("/*.") and checks that the final state is Allow while the
// wildcard state itself inherits Disallow.
#[test]
fn test_compile_tricky_wildcard() {
let rules = vec![Rule::Disallow("/"), Rule::Allow("/*.")];
let expect_transitions = vec![
vec![t!('*' => 0)],
vec![t!('*' => 1)],
vec![t!('*' => 0), t!('/' => 3)], // ""
vec![t!('*' => 1), t!('*' => 4), t!('.' => 5)], // "/"
vec![t!('*' => 4), t!('.' => 5)], // "/*"
vec![t!('*' => 0)], // "/*."
];
let expect_states = vec![
State::Allow,
State::Disallow,
State::Allow,
State::Disallow,
State::Disallow,
State::Allow,
];
let actual = Cylon::compile(rules);
assert_eq!(actual.transitions, expect_transitions);
assert_eq!(actual.states, expect_states);
}
// Compiles rules that use the `$` end-of-word marker and checks the tables.
// A `$` in a pattern becomes a `MatchEow` edge; the state reached through it
// falls back to the enclosing wildcard state on any further character.
#[test]
fn test_compile_with_eow() {
let rules = vec![
Rule::Allow("/"),
Rule::Disallow("/a$"),
// Note that this rule is nonsensical. It will compile, but
// no guarantees are made as to how it's matched. Rules should
// use url-encoded strings to escape $.
Rule::Disallow("/x$y"),
];
let expect_transitions = vec![
vec![t!('*' => 0)],
vec![t!('*' => 1)],
vec![t!('*' => 0), t!('/' => 3)], // ""
vec![t!('*' => 0), t!('a' => 4), t!('x' => 5)], // "/"
vec![t!('*' => 0), t!('$' => 6)], // "/a"
vec![t!('*' => 0), t!('$' => 7)], // "/x"
vec![t!('*' => 0)], // "/a$"
vec![t!('*' => 0), t!('y' => 8)], // "/x$"
vec![t!('*' => 1)], // "/x$y"
];
let expect_states = vec![
State::Allow,
State::Disallow,
State::Allow,
State::Allow,
State::Allow,
State::Allow,
State::Disallow,
State::Allow,
State::Disallow,
];
let actual = Cylon::compile(rules);
assert_eq!(actual.transitions, expect_transitions);
assert_eq!(actual.states, expect_states);
}
// Checks `allow` for plain prefix rules: everything under "/" is rejected
// except paths that start with one of the allowed prefixes.
#[test]
fn test_allow() {
let rules = vec![
Rule::Disallow("/"),
Rule::Allow("/a"),
Rule::Allow("/abc"),
Rule::Allow("/b"),
];
let machine = Cylon::compile(rules);
assert_eq!(false, machine.allow("/"));
assert_eq!(true, machine.allow("/a"));
assert_eq!(true, machine.allow("/a/b"));
// NOTE(review): this repeats the "/a" assertion two lines above.
assert_eq!(true, machine.allow("/a"));
assert_eq!(true, machine.allow("/abc"));
assert_eq!(true, machine.allow("/abc/def"));
assert_eq!(true, machine.allow("/b"));
assert_eq!(true, machine.allow("/b/c"));
}
// Checks `allow` for patterns containing `*`: a wildcard followed by a
// suffix only rejects paths with that suffix, while a trailing wildcard
// rejects everything below the prefix.
#[test]
fn test_allow_match_any() {
let rules = vec![
Rule::Allow("/"),
Rule::Disallow("/secret/*.txt"),
Rule::Disallow("/private/*"),
];
let machine = Cylon::compile(rules);
assert_eq!(true, machine.allow("/"));
assert_eq!(true, machine.allow("/abc"));
assert_eq!(false, machine.allow("/secret/abc.txt"));
assert_eq!(false, machine.allow("/secret/123.txt"));
assert_eq!(true, machine.allow("/secret/abc.csv"));
assert_eq!(true, machine.allow("/secret/123.csv"));
assert_eq!(false, machine.allow("/private/abc.txt"));
assert_eq!(false, machine.allow("/private/123.txt"));
assert_eq!(false, machine.allow("/private/abc.csv"));
assert_eq!(false, machine.allow("/private/123.csv"));
}
// Checks `allow` for patterns ending in `$`: only the exact path is
// rejected, longer paths sharing the prefix stay allowed.
#[test]
fn test_allow_match_eow() {
let rules = vec![
Rule::Allow("/"),
Rule::Disallow("/ignore$"),
Rule::Disallow("/foo$bar"),
];
let machine = Cylon::compile(rules);
assert_eq!(true, machine.allow("/"));
assert_eq!(true, machine.allow("/abc"));
assert_eq!(false, machine.allow("/ignore"));
assert_eq!(true, machine.allow("/ignoreabc"));
assert_eq!(true, machine.allow("/ignore/abc"));
// These are technically undefined, and no behavior
// is guaranteed since the rule is malformed. However
// it is safer to accept them rather than reject them.
assert_eq!(true, machine.allow("/foo"));
assert_eq!(true, machine.allow("/foo$bar"));
}
// Checks `allow` on a rule set that mixes plain prefixes, `$` and `*`, with
// allow and disallow rules that overlap each other.
#[test]
fn test_allow_more_complicated() {
let rules = vec![
Rule::Allow("/"),
Rule::Disallow("/a$"),
Rule::Disallow("/abc"),
Rule::Allow("/abc/*"),
Rule::Disallow("/foo/bar"),
Rule::Allow("/*/bar"),
Rule::Disallow("/www/*/images"),
Rule::Allow("/www/public/images"),
];
let machine = Cylon::compile(rules);
assert_eq!(true, machine.allow("/"));
assert_eq!(true, machine.allow("/directory"));
assert_eq!(false, machine.allow("/a"));
assert_eq!(true, machine.allow("/ab"));
assert_eq!(false, machine.allow("/abc"));
assert_eq!(true, machine.allow("/abc/123"));
assert_eq!(true, machine.allow("/foo"));
assert_eq!(true, machine.allow("/foobar"));
assert_eq!(false, machine.allow("/foo/bar"));
assert_eq!(false, machine.allow("/foo/bar/baz"));
assert_eq!(true, machine.allow("/baz/bar"));
assert_eq!(false, machine.allow("/www/cat/images"));
assert_eq!(true, machine.allow("/www/public/images"));
}
// Runs the path-matching examples from Google's robots.txt reference. Each
// section compiles "Disallow: /" plus one Allow pattern and checks which
// paths the pattern is expected to match.
#[test]
fn test_matches() {
// Test cases from:
// https://developers.google.com/search/reference/robots_txt#group-member-rules
// Pattern "/fish": plain prefix.
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish")]);
assert_eq!(true, machine.allow("/fish"));
assert_eq!(true, machine.allow("/fish.html"));
assert_eq!(true, machine.allow("/fish/salmon.html"));
assert_eq!(true, machine.allow("/fishheads.html"));
assert_eq!(true, machine.allow("/fishheads/yummy.html"));
assert_eq!(true, machine.allow("/fish.php?id=anything"));
assert_eq!(false, machine.allow("/Fish.asp"));
assert_eq!(false, machine.allow("/catfish"));
assert_eq!(false, machine.allow("/?id=fish"));
// Pattern "/fish*": same expectations as "/fish".
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*")]);
assert_eq!(true, machine.allow("/fish"));
assert_eq!(true, machine.allow("/fish.html"));
assert_eq!(true, machine.allow("/fish/salmon.html"));
assert_eq!(true, machine.allow("/fishheads.html"));
assert_eq!(true, machine.allow("/fishheads/yummy.html"));
assert_eq!(true, machine.allow("/fish.php?id=anything"));
assert_eq!(false, machine.allow("/Fish.asp"));
assert_eq!(false, machine.allow("/catfish"));
assert_eq!(false, machine.allow("/?id=fish"));
// Pattern "/fish/": the trailing slash is part of the prefix.
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish/")]);
assert_eq!(true, machine.allow("/fish/"));
assert_eq!(true, machine.allow("/fish/?id=anything"));
assert_eq!(true, machine.allow("/fish/salmon.htm"));
assert_eq!(false, machine.allow("/fish"));
assert_eq!(false, machine.allow("/fish.html"));
assert_eq!(false, machine.allow("/Fish/Salmon.asp"));
// Pattern "/*.php": ".php" may appear anywhere after the leading slash.
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php")]);
assert_eq!(true, machine.allow("/filename.php"));
assert_eq!(true, machine.allow("/folder/filename.php"));
assert_eq!(true, machine.allow("/folder/filename.php?parameters"));
assert_eq!(true, machine.allow("/folder/any.php.file.html"));
assert_eq!(true, machine.allow("/filename.php/"));
assert_eq!(false, machine.allow("/"));
assert_eq!(false, machine.allow("/windows.PHP"));
// Pattern "/*.php$": the path must end with ".php".
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php$")]);
assert_eq!(true, machine.allow("/filename.php"));
assert_eq!(true, machine.allow("/folder/filename.php"));
assert_eq!(false, machine.allow("/filename.php?parameters"));
assert_eq!(false, machine.allow("/filename.php/"));
assert_eq!(false, machine.allow("/filename.php5"));
assert_eq!(false, machine.allow("/windows.PHP"));
// Pattern "/fish*.php": prefix, then anything, then ".php".
let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*.php")]);
assert_eq!(true, machine.allow("/fish.php"));
assert_eq!(true, machine.allow("/fishheads/catfish.php?parameters"));
assert_eq!(false, machine.allow("/Fish.PHP"));
}
}

5
src/lib.rs Normal file
View File

@ -0,0 +1,5 @@
// Private implementation modules: `dfa` defines the compiled matcher and
// `parse` the robots.txt reader that feeds it.
mod dfa;
mod parse;
// Only these two types are re-exported as the crate's public surface.
pub use dfa::Cylon;
pub use parse::Compiler;

381
src/parse.rs Normal file
View File

@ -0,0 +1,381 @@
use super::dfa::{Cylon, Rule};
use futures_util::{
io::{AsyncBufRead, AsyncRead, BufReader, Result},
AsyncBufReadExt,
};
use serde_derive::{Deserialize, Serialize};
// Directive prefixes recognized by the line parser. They are kept in lower
// case; the start of each line is matched against them without regard to
// ASCII case.
const UA_PREFIX: &str = "user-agent:";
const ALLOW_PREFIX: &str = "allow:";
const DISALLOW_PREFIX: &str = "disallow:";
/// A single rule read from a robots.txt line, owning its path pattern.
#[derive(Debug, PartialEq, Clone)]
enum ParsedRule {
/// Pattern taken from an `allow:` line.
Allow(String),
/// Pattern taken from a `disallow:` line.
Disallow(String),
}
/// Borrows an owned [`ParsedRule`] as the string-slice based [`Rule`] that the
/// DFA compiler consumes. The returned rule points into the parsed rule's
/// `String`, so it cannot outlive it.
///
/// Implemented as `From` rather than `Into`: the standard blanket impl still
/// provides `(&parsed_rule).into()` for existing callers.
impl<'a> From<&'a ParsedRule> for Rule<'a> {
    fn from(rule: &'a ParsedRule) -> Self {
        match rule {
            ParsedRule::Allow(path) => Rule::Allow(path.as_str()),
            ParsedRule::Disallow(path) => Rule::Disallow(path.as_str()),
        }
    }
}
/// The classification of one robots.txt line after comments and surrounding
/// whitespace have been removed.
#[derive(Debug, PartialEq)]
enum ParsedLine {
/// A `user-agent:` line; the agent name is stored lower-cased.
UserAgent(String),
/// An `allow:` or `disallow:` line.
Rule(ParsedRule),
/// Anything else: blank lines, comments, unrecognized directives.
Nothing,
}
/// A compiler takes an input robots.txt file and outputs a compiled Cylon,
/// which can be used to efficiently match a large number of paths against
/// the robots.txt file.
#[derive(Debug, Serialize, Deserialize)]
pub struct Compiler {
// The user agent whose rules are selected; `Compiler::new` stores it
// lower-cased so it can be compared with the lower-cased agent names
// produced by the line parser.
user_agent: String,
}
impl Compiler {
/// Build a new compiler that parses rules for the given user agent from
/// a robots.txt file.
pub fn new(user_agent: &str) -> Self {
Self {
user_agent: user_agent.to_lowercase(),
}
}
/// Parse an input robots.txt file into a Cylon that can recognize
/// whether or not a path matches the rules for the Parser's user agent.
pub async fn compile<R: AsyncRead + Unpin>(&self, file: R) -> Result<Cylon> {
let reader = BufReader::new(file);
let mut agent = String::new();
let mut rules: Vec<ParsedRule> = vec![];
let mut group_reader = GroupReader::new(reader);
// find the most specific matching group in the robots file
while let Some(agents) = group_reader.next_header().await? {
let matching_agent = agents.iter().find(|a| {
let matches = &a[..] == "*" || self.user_agent.contains(*a);
let more_specific = a.len() > agent.len();
matches && more_specific
});
if let Some(matching_agent) = matching_agent {
agent = matching_agent.clone();
rules = group_reader.next_rules().await?;
}
}
let rules = rules.iter().map(|r| r.into()).collect();
Ok(Cylon::compile(rules))
}
}
/// Incremental reader that walks a robots.txt stream one group at a time,
/// alternating between a group's user-agent header and its rules.
struct GroupReader<R: AsyncBufRead + Unpin> {
// True while user-agent lines are being collected; cleared as soon as a
// rule line is seen.
parsing_agents: bool,
// Agent names collected for the header currently being read, including an
// agent line that `next_rules` consumed while looking for the end of a group.
agents: Vec<String>,
// Rules collected for the current group, including the first rule line
// that `next_header` consumed while looking for the end of a header.
rules: Vec<ParsedRule>,
// Underlying line source.
reader: R,
}
impl<R: AsyncBufRead + Unpin> GroupReader<R> {
    /// Wraps `reader`. The reader starts out expecting user-agent lines.
    fn new(reader: R) -> Self {
        Self {
            parsing_agents: true,
            agents: vec![],
            rules: vec![],
            reader,
        }
    }

    /// Scan forward until the next group header defined by one or more
    /// user agent lines. This lets us optimize the lines we need to copy
    /// so we can skip over groups that don't match the desired user agent.
    ///
    /// Returns the agent names of the header, or `None` when no agent was
    /// collected (end of input, or a rule line that precedes any agent line).
    ///
    /// # Errors
    ///
    /// Returns any I/O error produced by the underlying reader.
    async fn next_header(&mut self) -> Result<Option<Vec<String>>> {
        let mut buf = String::new();
        while self.reader.read_line(&mut buf).await? != 0 {
            match parse_line(buf.clone()) {
                ParsedLine::UserAgent(ua) if self.parsing_agents => {
                    self.agents.push(ua);
                }
                ParsedLine::UserAgent(ua) => {
                    // First agent of a new group: discard whatever was kept
                    // from the group that is being skipped.
                    self.agents = vec![ua];
                    self.rules.clear();
                    self.parsing_agents = true;
                }
                ParsedLine::Rule(rule) if self.parsing_agents => {
                    // Preserve the rule in case we need it in next_rules().
                    self.rules.push(rule);
                    self.parsing_agents = false;
                    break;
                }
                // Skip over lines until we get to the next user agent.
                ParsedLine::Rule(..) | ParsedLine::Nothing => (),
            }
            buf.clear();
        }
        // Hand the collected agents to the caller and leave an empty list
        // behind, without copying the strings.
        let agents = std::mem::take(&mut self.agents);
        Ok(if agents.is_empty() { None } else { Some(agents) })
    }

    /// Collects the rules of the current group, reading up to the first
    /// user-agent line that follows them (or to end of input).
    ///
    /// The returned list includes a rule line that `next_header` already
    /// consumed while finding the end of the header.
    ///
    /// # Errors
    ///
    /// Returns any I/O error produced by the underlying reader.
    async fn next_rules(&mut self) -> Result<Vec<ParsedRule>> {
        let mut buf = String::new();
        while self.reader.read_line(&mut buf).await? != 0 {
            match parse_line(buf.clone()) {
                ParsedLine::Rule(rule) => {
                    self.rules.push(rule);
                    self.parsing_agents = false;
                }
                ParsedLine::UserAgent(ua) if !self.parsing_agents => {
                    // Preserve the agent in case we need it in next_header().
                    self.agents.push(ua);
                    self.parsing_agents = true;
                    break;
                }
                // Skip over lines until we get to the next rule.
                ParsedLine::UserAgent(..) | ParsedLine::Nothing => (),
            }
            buf.clear();
        }
        // Move the rules out instead of cloning them and then resetting.
        Ok(std::mem::take(&mut self.rules))
    }
}
/// Classifies one raw robots.txt line.
///
/// Everything from the first `#` onward is dropped and the remainder is
/// trimmed before it is tested against the known directives. Agent names are
/// lower-cased; rule paths are kept as written. Lines that match no directive
/// yield `ParsedLine::Nothing`.
fn parse_line(line: String) -> ParsedLine {
    let line = strip_comments(line.as_str()).trim();
    // The directives are tried roughly from most to least common so that the
    // typical line is recognized by the first check.
    if let Some(path) = parse_disallow(line) {
        return ParsedLine::Rule(ParsedRule::Disallow(String::from(path)));
    }
    if let Some(agent) = parse_user_agent(line) {
        return ParsedLine::UserAgent(agent.to_lowercase());
    }
    if let Some(path) = parse_allow(line) {
        return ParsedLine::Rule(ParsedRule::Allow(String::from(path)));
    }
    ParsedLine::Nothing
}
/// Returns the part of `line` that precedes the first `#`, or the whole line
/// when it contains no `#`. Whitespace before the `#` is left in place.
fn strip_comments(line: &str) -> &str {
    match line.find('#') {
        Some(comment_start) => &line[..comment_start],
        None => line,
    }
}
/// Extracts the value of a `user-agent:` line.
///
/// Returns the trimmed text following the prefix when `line` starts with
/// `user-agent:` (compared without regard to ASCII case), and `None`
/// otherwise, including for lines shorter than the prefix.
fn parse_user_agent(line: &str) -> Option<&str> {
    // `get` returns `None` when the line is too short and also when the cut
    // would fall inside a multi-byte character, so arbitrary non-ASCII input
    // is rejected instead of panicking on a bad slice index.
    let prefix = line.get(..UA_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(UA_PREFIX) {
        // The prefix matched pure ASCII, so this index is a char boundary.
        Some(line[UA_PREFIX.len()..].trim())
    } else {
        None
    }
}
/// Extracts the path pattern of an `allow:` line.
///
/// Returns the trimmed text following the prefix when `line` starts with
/// `allow:` (compared without regard to ASCII case), and `None` otherwise,
/// including for lines shorter than the prefix.
fn parse_allow(line: &str) -> Option<&str> {
    // `get` returns `None` when the line is too short and also when the cut
    // would fall inside a multi-byte character, so arbitrary non-ASCII input
    // is rejected instead of panicking on a bad slice index.
    let prefix = line.get(..ALLOW_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(ALLOW_PREFIX) {
        // The prefix matched pure ASCII, so this index is a char boundary.
        Some(line[ALLOW_PREFIX.len()..].trim())
    } else {
        None
    }
}
/// Extracts the path pattern of a `disallow:` line.
///
/// Returns the trimmed text following the prefix when `line` starts with
/// `disallow:` (compared without regard to ASCII case), and `None`
/// otherwise, including for lines shorter than the prefix.
fn parse_disallow(line: &str) -> Option<&str> {
    // `get` returns `None` when the line is too short and also when the cut
    // would fall inside a multi-byte character, so arbitrary non-ASCII input
    // is rejected instead of panicking on a bad slice index.
    let prefix = line.get(..DISALLOW_PREFIX.len())?;
    if prefix.eq_ignore_ascii_case(DISALLOW_PREFIX) {
        // The prefix matched pure ASCII, so this index is a char boundary.
        Some(line[DISALLOW_PREFIX.len()..].trim())
    } else {
        None
    }
}
#[cfg(test)]
mod tests {
use super::*;
// Each input is an `allow` line in a different case or with extra
// whitespace or a trailing comment; every one must parse to an Allow rule
// holding just the trimmed path.
#[test]
fn test_parse_allow() {
let test_cases = vec![
("Allow: /", "/"),
("allow: / # Root with comment", "/"),
("ALLOW: /abc/def ", "/abc/def"),
("Allow: /abc/def ", "/abc/def"),
(" Allow: /*/foo", "/*/foo"),
];
for (i, o) in test_cases {
assert_eq!(
parse_line(i.into()),
ParsedLine::Rule(ParsedRule::Allow(o.into()))
);
}
}
// Mirror of the allow cases for `disallow` lines: case, whitespace and
// trailing comments must not affect the parsed path.
#[test]
fn test_parse_disallow() {
let test_cases = vec![
("Disallow: /", "/"),
("disallow: / # Root with comment", "/"),
("DISALLOW: /abc/def ", "/abc/def"),
("Disallow: /abc/def ", "/abc/def"),
(" Disallow: /*/foo", "/*/foo"),
];
for (i, o) in test_cases {
assert_eq!(
parse_line(i.into()),
ParsedLine::Rule(ParsedRule::Disallow(o.into()))
);
}
}
// User-agent lines must parse regardless of case and padding, and the agent
// name itself is expected back in lower case.
#[test]
fn test_parse_user_agent() {
let test_cases = vec![
("User-agent: *", "*"),
("user-agent: ImABot # User agent with comment", "imabot"),
(" USER-AGENT: ImABot ", "imabot"),
];
for (i, o) in test_cases {
assert_eq!(parse_line(i.into()), ParsedLine::UserAgent(o.into()));
}
}
// Misspelled directives, comment-only lines and blank lines must all be
// classified as `ParsedLine::Nothing`.
#[test]
fn test_parse_nothing() {
let test_cases = vec![
"Useragent: *",
"# Comment",
"",
" ",
"\t",
"alow: /",
"disalow: /",
];
for i in test_cases {
assert_eq!(parse_line(i.into()), ParsedLine::Nothing);
}
}
// Compiles one robots.txt for four different user agents and checks that
// each picks up the expected group: "foobar" gets the explicit Allow group,
// "jones-bot" its own Disallow group, and the two agents that are not named
// in the file fall back to the "*" group.
#[test]
fn test_end_to_end() {
tokio_test::block_on(async {
let example_robots = r#"
User-agent: jones-bot
Disallow: /
User-agent: jones
User-agent: foobar
Allow: /
User-agent: *
Disallow: /
"#
.as_bytes();
let parser = Compiler::new("foobar");
let foobar_machine = parser.compile(example_robots).await.unwrap();
let parser = Compiler::new("jones-bot");
let jonesbot_machine = parser.compile(example_robots).await.unwrap();
let parser = Compiler::new("imabot");
let imabot_machine = parser.compile(example_robots).await.unwrap();
let parser = Compiler::new("abc");
let abc_machine = parser.compile(example_robots).await.unwrap();
assert_eq!(true, foobar_machine.allow("/index.html"));
assert_eq!(false, jonesbot_machine.allow("/index.html"));
assert_eq!(false, imabot_machine.allow("/index.html"));
assert_eq!(false, abc_machine.allow("/index.html"));
});
}
// Malformed input: a rule line appears before any user-agent line. The file
// must still compile, and the resulting machines allow everything.
#[test]
fn test_invalid_1() {
tokio_test::block_on(async {
let example_robots = r#"
# Instead of treating this as an error, we'll just consider
# this behavior undefined.
Allow: /
User-agent: jones
User-agent: foobar
Disallow: /
"#
.as_bytes();
let parser = Compiler::new("foobar");
let foobar_machine = parser.compile(example_robots).await.unwrap();
let parser = Compiler::new("imabot");
let imabot_machine = parser.compile(example_robots).await.unwrap();
// Everything is allowed because next_header() returns None
assert_eq!(true, foobar_machine.allow("/index.html"));
assert_eq!(true, imabot_machine.allow("/index.html"));
});
}
// Malformed input: the file ends with a user-agent line that has no rules.
// The file must still compile; the agent of the rule-less group ends up with
// nothing disallowed, while the earlier group keeps its Disallow rule.
#[test]
fn test_invalid_2() {
tokio_test::block_on(async {
let example_robots = r#"
User-agent: jones
User-agent: foobar
Disallow: /
# Instead of treating this as an error, we consider this
# behavior undefined.
User-agent: imabot
"#
.as_bytes();
let parser = Compiler::new("foobar");
let foobar_machine = parser.compile(example_robots).await.unwrap();
let parser = Compiler::new("imabot");
let imabot_machine = parser.compile(example_robots).await.unwrap();
assert_eq!(false, foobar_machine.allow("/index.html"));
assert_eq!(true, imabot_machine.allow("/index.html"));
});
}
}