Initial release
This commit is contained in:
commit
56cf5d8d91
|
@ -0,0 +1 @@
|
|||
/target
|
|
@ -0,0 +1,831 @@
|
|||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "async-stream"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3670df70cbc01729f901f94c887814b3c68db038aad1329a418bae178bc5295c"
|
||||
dependencies = [
|
||||
"async-stream-impl",
|
||||
"futures-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-stream-impl"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a3548b8efc9f8e8a5a0a2808c5bd8451a9031b9e5b879a79590304ae928b0a70"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "atty"
|
||||
version = "0.2.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
|
||||
|
||||
[[package]]
|
||||
name = "bstr"
|
||||
version = "0.2.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a40b47ad93e1a5404e6c18dec46b628214fee441c70f4ab5d6942142cc268a3d"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bumpalo"
|
||||
version = "3.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "099e596ef14349721d9016f6b80dd3419ea1bf289ab9b44df8e4dfd3a005d5d9"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae44d1a3d5a19df61dd0c8beb138458ac2a53a7ac09eba97d55592540004306b"
|
||||
|
||||
[[package]]
|
||||
name = "bytes"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b700ce4376041dcd0a327fd0097c41095743c4c8af8887265942faf1100bd040"
|
||||
|
||||
[[package]]
|
||||
name = "cast"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4b9434b9a5aa1450faa3f9cb14ea0e8c53bb5d2b3c1bfd1ab4fc03e9f33fbfb0"
|
||||
dependencies = [
|
||||
"rustc_version",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "2.33.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"textwrap",
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const_fn"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "28b9d6de7f49e22cf97ad17fc4036ece69300032f45f78f30b4a4482cdc3f4a6"
|
||||
|
||||
[[package]]
|
||||
name = "criterion"
|
||||
version = "0.3.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ab327ed7354547cc2ef43cbe20ef68b988e70b4b593cbd66a2a61733123a3d23"
|
||||
dependencies = [
|
||||
"atty",
|
||||
"cast",
|
||||
"clap",
|
||||
"criterion-plot",
|
||||
"csv",
|
||||
"futures",
|
||||
"itertools 0.10.0",
|
||||
"lazy_static",
|
||||
"num-traits",
|
||||
"oorandom",
|
||||
"plotters",
|
||||
"rayon",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_cbor",
|
||||
"serde_derive",
|
||||
"serde_json",
|
||||
"tinytemplate",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "criterion-plot"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e022feadec601fba1649cfa83586381a4ad31c6bf3a9ab7d408118b05dd9889d"
|
||||
dependencies = [
|
||||
"cast",
|
||||
"itertools 0.9.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-channel"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-deque"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-epoch"
|
||||
version = "0.9.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1aaa739f95311c2c7887a76863f500026092fb1dce0161dab577e559ef3569d"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"const_fn",
|
||||
"crossbeam-utils",
|
||||
"lazy_static",
|
||||
"memoffset",
|
||||
"scopeguard",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "02d96d1e189ef58269ebe5b97953da3274d83a93af647c2ddd6f9dab28cedb8d"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"cfg-if",
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "csv"
|
||||
version = "1.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f9d58633299b24b515ac72a3f869f8b91306a3cec616a602843a383acd6f9e97"
|
||||
dependencies = [
|
||||
"bstr",
|
||||
"csv-core",
|
||||
"itoa",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "csv-core"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cylon"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"criterion",
|
||||
"futures-util",
|
||||
"serde",
|
||||
"serde_derive",
|
||||
"tokio-test",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
|
||||
|
||||
[[package]]
|
||||
name = "futures"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da9052a1a50244d8d5aa9bf55cbc2fb6f357c86cc52e46c62ed390a7180cf150"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-executor",
|
||||
"futures-io",
|
||||
"futures-sink",
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-channel"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-core"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65"
|
||||
|
||||
[[package]]
|
||||
name = "futures-executor"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e9e59fdc009a4b3096bf94f740a0f2424c082521f20a9b08c5c07c48d90fd9b9"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-io"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500"
|
||||
|
||||
[[package]]
|
||||
name = "futures-macro"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd"
|
||||
dependencies = [
|
||||
"proc-macro-hack",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-sink"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6"
|
||||
|
||||
[[package]]
|
||||
name = "futures-task"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-util"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-io",
|
||||
"futures-macro",
|
||||
"futures-sink",
|
||||
"futures-task",
|
||||
"memchr",
|
||||
"pin-project-lite",
|
||||
"pin-utils",
|
||||
"proc-macro-hack",
|
||||
"proc-macro-nested",
|
||||
"slab",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "half"
|
||||
version = "1.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "62aca2aba2d62b4a7f5b33f3712cb1b0692779a56fb510499d5c0aa594daeaf3"
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.1.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "322f4de77956e22ed0e5032c359a0f1273f1f7f0d79bfa3b8ffbc730d7fbcc5c"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b"
|
||||
dependencies = [
|
||||
"either",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319"
|
||||
dependencies = [
|
||||
"either",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "0.4.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.47"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5cfb73131c35423a367daf8cbd24100af0d077668c8c2943f0e7dd775fef0f65"
|
||||
dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.86"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b7282d924be3275cec7f6756ff4121987bc6481325397dde6ba3e7802b1a8b1c"
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
version = "0.4.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.3.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525"
|
||||
|
||||
[[package]]
|
||||
name = "memoffset"
|
||||
version = "0.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "157b4208e3059a8f9e78d559edc658e13df41410cb3ae03979c83130067fdd87"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_cpus"
|
||||
version = "1.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "13bd41f508810a131401606d54ac32a467c97172d74ba7662562ebba5ad07fa0"
|
||||
|
||||
[[package]]
|
||||
name = "oorandom"
|
||||
version = "11.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-lite"
|
||||
version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "439697af366c49a6d0a010c56a0d97685bc140ce0d377b13a2ea2aa42d64a827"
|
||||
|
||||
[[package]]
|
||||
name = "pin-utils"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
|
||||
|
||||
[[package]]
|
||||
name = "plotters"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "45ca0ae5f169d0917a7c7f5a9c1a3d3d9598f18f529dd2b8373ed988efea307a"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
"plotters-backend",
|
||||
"plotters-svg",
|
||||
"wasm-bindgen",
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "plotters-backend"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b07fffcddc1cb3a1de753caa4e4df03b79922ba43cf882acc1bdd7e8df9f4590"
|
||||
|
||||
[[package]]
|
||||
name = "plotters-svg"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b38a02e23bd9604b842a812063aec4ef702b57989c37b655254bb61c471ad211"
|
||||
dependencies = [
|
||||
"plotters-backend",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-hack"
|
||||
version = "0.5.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-nested"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71"
|
||||
dependencies = [
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b0d8e0819fadc20c74ea8373106ead0600e3a67ef1fe8da56e39b9ae7275674"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"crossbeam-deque",
|
||||
"either",
|
||||
"rayon-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon-core"
|
||||
version = "1.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9ab346ac5921dc62ffa9f89b7a773907511cdfa5490c572ae9be1be33e8afa4a"
|
||||
dependencies = [
|
||||
"crossbeam-channel",
|
||||
"crossbeam-deque",
|
||||
"crossbeam-utils",
|
||||
"lazy_static",
|
||||
"num_cpus",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9251239e129e16308e70d853559389de218ac275b515068abc96829d05b948a"
|
||||
dependencies = [
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.6.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b5eb417147ba9860a96cfe72a0b93bf88fee1744b5636ec99ab20c1aa9376581"
|
||||
|
||||
[[package]]
|
||||
name = "rustc_version"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
|
||||
dependencies = [
|
||||
"semver",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
version = "1.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e"
|
||||
|
||||
[[package]]
|
||||
name = "same-file"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
|
||||
dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scopeguard"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
|
||||
|
||||
[[package]]
|
||||
name = "semver"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
|
||||
dependencies = [
|
||||
"semver-parser",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "semver-parser"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.123"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "92d5161132722baa40d802cc70b15262b98258453e85e5d1d365c757c73869ae"
|
||||
|
||||
[[package]]
|
||||
name = "serde_cbor"
|
||||
version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e18acfa2f90e8b735b2836ab8d538de304cbb6729a7360729ea5a895d15a622"
|
||||
dependencies = [
|
||||
"half",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.123"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9391c295d64fc0abb2c556bad848f33cb8296276b1ad2677d1ae1ace4f258f31"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_json"
|
||||
version = "1.0.62"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ea1c6153794552ea7cf7cf63b1231a25de00ec90db326ba6264440fa08e31486"
|
||||
dependencies = [
|
||||
"itoa",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "slab"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.60"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c700597eca8a5a762beb35753ef6b94df201c81cca676604f547495a0d7f0081"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "textwrap"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
|
||||
dependencies = [
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinytemplate"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a2ada8616fad06a2d0c455adc530de4ef57605a8120cc65da9653e0e9623ca74"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e8190d04c665ea9e6b6a0dc45523ade572c088d2e6566244c1122671dbf4ae3a"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-stream"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1981ad97df782ab506a1f43bf82c967326960d278acf3bf8279809648c3ff3ea"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-test"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c7d205f6f59b03f9e824ac86eaba635a98395f287756ecc8a06464779c399bf"
|
||||
dependencies = [
|
||||
"async-stream",
|
||||
"bytes",
|
||||
"futures-core",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.1.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-xid"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
|
||||
|
||||
[[package]]
|
||||
name = "walkdir"
|
||||
version = "2.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "777182bc735b6424e1a57516d35ed72cb8019d85c8c9bf536dccb3445c1a2f7d"
|
||||
dependencies = [
|
||||
"same-file",
|
||||
"winapi",
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen"
|
||||
version = "0.2.70"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "55c0f7123de74f0dab9b7d00fd614e7b19349cd1e2f5252bbe9b1754b59433be"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"wasm-bindgen-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-backend"
|
||||
version = "0.2.70"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7bc45447f0d4573f3d65720f636bbcc3dd6ce920ed704670118650bcd47764c7"
|
||||
dependencies = [
|
||||
"bumpalo",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro"
|
||||
version = "0.2.70"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b8853882eef39593ad4174dd26fc9865a64e84026d223f63bb2c42affcbba2c"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"wasm-bindgen-macro-support",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro-support"
|
||||
version = "0.2.70"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4133b5e7f2a531fa413b3a1695e925038a05a71cf67e87dafa295cb645a01385"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"wasm-bindgen-backend",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-shared"
|
||||
version = "0.2.70"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dd4945e4943ae02d15c13962b38a5b1e81eadd4b71214eee75af64a4d6a4fd64"
|
||||
|
||||
[[package]]
|
||||
name = "web-sys"
|
||||
version = "0.3.47"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c40dc691fc48003eba817c38da7113c15698142da971298003cac3ef175680b3"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
|
||||
dependencies = [
|
||||
"winapi-i686-pc-windows-gnu",
|
||||
"winapi-x86_64-pc-windows-gnu",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-i686-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
|
||||
[[package]]
|
||||
name = "winapi-util"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-x86_64-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
|
@ -0,0 +1,22 @@
|
|||
[package]
|
||||
name = "cylon"
|
||||
version = "0.1.0"
|
||||
authors = ["Creston Bunch <rust@bunch.im>"]
|
||||
edition = "2018"
|
||||
|
||||
categories = ["parser-implementations"]
|
||||
license = "MIT"
|
||||
keywords = ["robots", "txt", "parse", "compile"]
|
||||
|
||||
[dependencies]
|
||||
futures-util = "0.3"
|
||||
serde = "1.0"
|
||||
serde_derive = "1.0"
|
||||
|
||||
[dev-dependencies]
|
||||
criterion = { version = "0.3", features = ["async_futures"] }
|
||||
tokio-test = "0.4"
|
||||
|
||||
[[bench]]
|
||||
name = "parse"
|
||||
harness = false
|
|
@ -0,0 +1,7 @@
|
|||
Copyright © 2021 Creston Bunch
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@ -0,0 +1,97 @@
|
|||
# Cylon
|
||||
|
||||
Cylon is a library for reading robots.txt files.
|
||||
|
||||
## Features
|
||||
|
||||
There is no universal standard for what rules a web crawler
|
||||
is required to support in a robots.txt file. Cylon supports
|
||||
the following directives (notably `Sitemap` is missing):
|
||||
|
||||
- `User-agent`
|
||||
- `Allow`
|
||||
- `Disallow`
|
||||
|
||||
In addition, Cylon supports `*` as a wildcard character to
|
||||
match any length substring of 0 or more characters, as well
|
||||
as the `$` character to match the end of a path.
|
||||
|
||||
## Usage
|
||||
|
||||
Using Cylon is very simple. Simply create a new compiler
|
||||
for your user agent, then compile the robots.txt file.
|
||||
|
||||
```rust
|
||||
// You can use something like hyper or reqwest to download
|
||||
// the robots.txt file instead.
|
||||
let example_robots = r#"
|
||||
User-agent: googlebot
|
||||
Allow: /
|
||||
|
||||
User-agent: *
|
||||
Disallow: /
|
||||
"#
|
||||
.as_bytes();
|
||||
|
||||
// Create a new compiler that compiles a robots.txt file looking for
|
||||
// rules that apply to the "googlebot" user agent.
|
||||
let compiler = Compiler::new("googlebot");
|
||||
let cylon = compiler.compile(example_robots).await.unwrap();
|
||||
assert_eq!(true, cylon.allow("/index.html"));
|
||||
assert_eq!(true, cylon.allow("/directory"));
|
||||
|
||||
// Create a new compiler that compiles a robots.txt file looking for
|
||||
// rules that apply to the "bing" user agent.
|
||||
let compiler = Compiler::new("bing");
|
||||
let cylon = compiler.compile(example_robots).await.unwrap();
|
||||
assert_eq!(false, cylon.allow("/index.html"));
|
||||
assert_eq!(false, cylon.allow("/directory"));
|
||||
```
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributions are welcome! Please make a pull request. Issues may not
|
||||
be addressed in a timely manner unless they expose fundamental issues
|
||||
or security concerns.
|
||||
|
||||
## Implementation
|
||||
|
||||
### Async
|
||||
|
||||
This library uses an async API by default. This library does not assume
|
||||
any async runtime, so you can use it with any runtime (tokio, async-std, etc.).
|
||||
|
||||
A synchronous API may be an optional feature in the future, but there
|
||||
are no current plans to add one. If you need a synchronous API consider
|
||||
adding one yourself (contributions are welcome).
|
||||
|
||||
### Performance
|
||||
|
||||
Cylon compiles robots.txt files into very efficient DFAs. This means it is
|
||||
well-suited for web crawlers that need to use the same robots.txt file for
|
||||
multiple URLs.
|
||||
|
||||
The compiler avoids any random memory access when compiling the DFA (e.g. by
|
||||
not using hashmaps or tree structures), so it has very good cache locality.
|
||||
|
||||
The DFA can match input paths in roughly O(n) time, where n is the length of
|
||||
the input path. (Compare that to the alternative O(n \* m) complexity of
|
||||
matching the input path against each of the m rules in the robots.txt file.)
|
||||
|
||||
### (De-)serialization
|
||||
|
||||
This library uses serde to allow serializing/deserializing the compiled Cylon
|
||||
DFA structs. This is useful e.g. if you need to cache the DFA in something like
|
||||
Memcached or Redis. (Use a format like bincode or msgpack to convert it to
|
||||
bytes first.)
|
||||
|
||||
### Error handling
|
||||
|
||||
Robots.txt files are more like guidelines than actual rules.
|
||||
|
||||
In general, Cylon tries not to cause errors for things that might be considered
|
||||
an invalid robots.txt file, which means there are very few failure cases.
|
||||
|
||||
## License
|
||||
|
||||
MIT
|
|
@ -0,0 +1,70 @@
|
|||
use cylon::Compiler;
|
||||
|
||||
use criterion::async_executor::FuturesExecutor;
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
|
||||
const SMALL_FILE: &[u8] = r#"
|
||||
User-agent: *
|
||||
Disallow: /
|
||||
Allow: /a
|
||||
Allow: /abc
|
||||
Allow: /b
|
||||
"#
|
||||
.as_bytes();
|
||||
|
||||
/// A robots.txt fixture with a larger rule set that mixes plain prefixes,
/// `*` wildcards and `$` end-of-path anchors.
///
/// Every rule line must contain the `:` separator, otherwise the parser
/// treats the line as noise and the rule is silently left out of the benchmark.
const LARGE_FILE: &[u8] = r#"
User-agent: *
Allow: /
Disallow: /a$
Disallow: /abc
Allow: /abc/*
Disallow: /foo/bar
Allow: /*/bar
Disallow: /www/*/images
Allow: /www/public/images
"#
.as_bytes();
|
||||
|
||||
fn bench(c: &mut Criterion) {
|
||||
c.bench_function("compile small", |b| {
|
||||
b.to_async(FuturesExecutor).iter(|| async {
|
||||
let parser = Compiler::new("ImABot");
|
||||
parser.compile(SMALL_FILE).await.unwrap();
|
||||
})
|
||||
});
|
||||
|
||||
c.bench_function("compile large", |b| {
|
||||
b.to_async(FuturesExecutor).iter(|| async {
|
||||
let parser = Compiler::new("ImABot");
|
||||
parser.compile(LARGE_FILE).await.unwrap();
|
||||
})
|
||||
});
|
||||
|
||||
let parser = Compiler::new("ImABot");
|
||||
let small_machine = &tokio_test::block_on(parser.compile(SMALL_FILE)).unwrap();
|
||||
c.bench_function("allow small A", move |b| {
|
||||
b.iter(|| {
|
||||
small_machine.allow("/abc");
|
||||
});
|
||||
});
|
||||
c.bench_function("allow small B", move |b| {
|
||||
b.iter(|| {
|
||||
small_machine.allow("/www/cat/images");
|
||||
});
|
||||
});
|
||||
|
||||
let large_machine = &tokio_test::block_on(parser.compile(LARGE_FILE)).unwrap();
|
||||
c.bench_function("allow large A", move |b| {
|
||||
b.iter(|| {
|
||||
large_machine.allow("/abc");
|
||||
});
|
||||
});
|
||||
c.bench_function("allow large B", move |b| {
|
||||
b.iter(|| {
|
||||
large_machine.allow("/www/cat/images");
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench);
|
||||
criterion_main!(benches);
|
|
@ -0,0 +1,498 @@
|
|||
use serde_derive::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum Rule<'a> {
|
||||
Allow(&'a str),
|
||||
Disallow(&'a str),
|
||||
}
|
||||
|
||||
impl<'a> Rule<'a> {
|
||||
fn inner(&self) -> &str {
|
||||
match self {
|
||||
Rule::Allow(inner) => inner,
|
||||
Rule::Disallow(inner) => inner,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)]
|
||||
enum Edge {
|
||||
MatchChar(char),
|
||||
MatchAny,
|
||||
MatchEow,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize)]
|
||||
struct Transition(Edge, usize);
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
|
||||
enum State {
|
||||
Allow,
|
||||
Disallow,
|
||||
Intermediate,
|
||||
}
|
||||
|
||||
/// A Cylon is a DFA that recognizes rules from a compiled robots.txt
|
||||
/// file. By providing it a URL path, it can decide whether or not
|
||||
/// the robots file that compiled it allows or disallows that path in
|
||||
/// roughly O(n) time, where n is the length of the path.
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct Cylon {
|
||||
states: Vec<State>,
|
||||
transitions: Vec<Vec<Transition>>,
|
||||
}
|
||||
|
||||
impl Cylon {
|
||||
/// Match whether the rules allow or disallow the target path.
|
||||
pub fn allow(&self, path: &str) -> bool {
|
||||
let mut state = path.chars().fold(2, |state, path_char| {
|
||||
let t = &self.transitions[state];
|
||||
t.iter()
|
||||
.rev()
|
||||
// Pick the last transition to always prioritize MatchChar
|
||||
// over MatchAny (which will always be the first transition.)
|
||||
.find(|transition| match transition {
|
||||
Transition(Edge::MatchAny, ..) => true,
|
||||
Transition(Edge::MatchEow, ..) => false,
|
||||
Transition(Edge::MatchChar(edge_char), ..) => *edge_char == path_char,
|
||||
})
|
||||
.map(|Transition(.., next_state)| *next_state)
|
||||
// We are guaranteed at least one matching state because of
|
||||
// the way the DFA is constructed.
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
// Follow the EoW transition, if necessary
|
||||
let t = &self.transitions[state];
|
||||
state = t
|
||||
.iter()
|
||||
.rev()
|
||||
.find(|transition| match transition {
|
||||
Transition(Edge::MatchEow, ..) => true,
|
||||
Transition(Edge::MatchAny, ..) => true,
|
||||
_ => false,
|
||||
})
|
||||
.map(|Transition(.., next_state)| *next_state)
|
||||
.unwrap_or(state);
|
||||
|
||||
match self.states[state] {
|
||||
State::Allow => true,
|
||||
State::Disallow => false,
|
||||
// Intermediate states are not preserved in the DFA
|
||||
State::Intermediate => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Compile a machine from a list of rules.
|
||||
pub fn compile(mut rules: Vec<Rule>) -> Self {
|
||||
// This algorithm constructs a DFA by doing BFS over the prefix tree of
|
||||
// paths in the provided list of rules. However, for performance reasons
|
||||
// it does not actually build a tree structure. (Vecs have better
|
||||
// cache-locality by avoiding random memory access.)
|
||||
|
||||
let mut transitions: Vec<Vec<Transition>> = vec![
|
||||
vec![Transition(Edge::MatchAny, 0)],
|
||||
vec![Transition(Edge::MatchAny, 1)],
|
||||
];
|
||||
let mut states: Vec<State> = vec![State::Allow, State::Disallow];
|
||||
|
||||
rules.sort_by(|a, b| Ord::cmp(a.inner(), b.inner()));
|
||||
|
||||
let mut queue = vec![("", 0, 0, State::Intermediate)];
|
||||
while !queue.is_empty() {
|
||||
// parent_prefix is the "parent node" in the prefix tree. We are
|
||||
// going to visit its children by filtering from the list of
|
||||
// paths only the paths that start with the parent_prefix.
|
||||
// wildcard_state is a node to jump to when an unmatched character
|
||||
// is encountered. This is usually a node higher up in the tree
|
||||
// that can match any character legally, but is also a prefix
|
||||
// (read: ancestor) of the current node.
|
||||
let (parent_prefix, mut wildcard_state, parent_state, state) = queue.remove(0);
|
||||
let last_char = parent_prefix.chars().last();
|
||||
|
||||
wildcard_state = match state {
|
||||
State::Allow => 0,
|
||||
State::Disallow if last_char == Some('$') => wildcard_state,
|
||||
State::Disallow => 1,
|
||||
State::Intermediate => wildcard_state,
|
||||
};
|
||||
|
||||
let mut t = match last_char {
|
||||
Some('$') => {
|
||||
// The EOW character cannot match anything else
|
||||
vec![Transition(Edge::MatchAny, wildcard_state)]
|
||||
}
|
||||
Some('*') => {
|
||||
// The wildcard character overrides the wildcard state
|
||||
vec![Transition(Edge::MatchAny, transitions.len())]
|
||||
}
|
||||
_ => {
|
||||
// Every other state has a self-loop that matches anything
|
||||
vec![Transition(Edge::MatchAny, wildcard_state)]
|
||||
}
|
||||
};
|
||||
|
||||
let mut curr_prefix = "";
|
||||
rules
|
||||
.iter()
|
||||
.map(Rule::inner)
|
||||
.zip(&rules)
|
||||
.filter(|(path, _)| (*path).starts_with(parent_prefix))
|
||||
.filter(|(path, _)| (*path) != parent_prefix)
|
||||
.for_each(|(path, rule)| {
|
||||
let child_prefix = &path[0..parent_prefix.len() + 1];
|
||||
if curr_prefix == child_prefix {
|
||||
// We only want to visit a child node once, but
|
||||
// many rules might have the same child_prefix, so
|
||||
// we skip the duplicates after the first time
|
||||
// we see a prefix. (This could be a filter(), but
|
||||
// it's a bit hard to encode earlier in the chain.)
|
||||
return;
|
||||
}
|
||||
curr_prefix = child_prefix;
|
||||
|
||||
let eow = child_prefix == path;
|
||||
let state = match (rule, eow) {
|
||||
(Rule::Allow(..), true) => State::Allow,
|
||||
(Rule::Disallow(..), true) => State::Disallow,
|
||||
_ => State::Intermediate,
|
||||
};
|
||||
|
||||
queue.push((child_prefix, wildcard_state, transitions.len(), state));
|
||||
|
||||
// NB: we can predict what state index the child
|
||||
// will have before it's even pushed onto the state vec.
|
||||
let child_index = transitions.len() + queue.len();
|
||||
let edge_char = child_prefix.chars().last().unwrap();
|
||||
let transition = Transition(
|
||||
match edge_char {
|
||||
'*' => Edge::MatchAny,
|
||||
'$' => Edge::MatchEow,
|
||||
c => Edge::MatchChar(c),
|
||||
},
|
||||
child_index,
|
||||
);
|
||||
|
||||
// Add transitions from the parent state to the child state
|
||||
// so that the wildcard character matches are optional.
|
||||
if last_char == Some('*') {
|
||||
let parent_t = &mut transitions[parent_state];
|
||||
parent_t.push(transition);
|
||||
}
|
||||
|
||||
t.push(transition);
|
||||
});
|
||||
|
||||
states.push(match state {
|
||||
State::Allow | State::Disallow => state,
|
||||
State::Intermediate => states[wildcard_state],
|
||||
});
|
||||
transitions.push(t);
|
||||
}
|
||||
|
||||
Self {
|
||||
states,
|
||||
transitions,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for a `Transition`: `'*'` is a match-any edge, `'$'` an
    /// end-of-word edge, and any other character a literal edge.
    macro_rules! t {
        ('*' => $target:expr) => {
            Transition(Edge::MatchAny, $target)
        };
        ('$' => $target:expr) => {
            Transition(Edge::MatchEow, $target)
        };
        ($edge:expr => $target:expr) => {
            Transition(Edge::MatchChar($edge), $target)
        };
    }

    /// Compiles `rules` and compares both tables of the machine with the
    /// expected ones.
    fn assert_compiles_to(
        rules: Vec<Rule>,
        expect_transitions: Vec<Vec<Transition>>,
        expect_states: Vec<State>,
    ) {
        let actual = Cylon::compile(rules);
        assert_eq!(actual.transitions, expect_transitions);
        assert_eq!(actual.states, expect_states);
    }

    /// Checks, in order, that `machine` gives the expected verdict for each path.
    fn assert_verdicts(machine: &Cylon, cases: &[(bool, &str)]) {
        for &(expected, path) in cases {
            assert_eq!(expected, machine.allow(path));
        }
    }

    #[test]
    fn test_compile() {
        assert_compiles_to(
            vec![
                Rule::Disallow("/"),
                Rule::Allow("/a"),
                Rule::Allow("/abc"),
                Rule::Allow("/b"),
            ],
            vec![
                vec![t!('*' => 0)],
                vec![t!('*' => 1)],
                vec![t!('*' => 0), t!('/' => 3)],               // ""
                vec![t!('*' => 1), t!('a' => 4), t!('b' => 5)], // "/"
                vec![t!('*' => 0), t!('b' => 6)],               // "/a"
                vec![t!('*' => 0)],                             // "/b"
                vec![t!('*' => 0), t!('c' => 7)],               // "/ab"
                vec![t!('*' => 0)],                             // "/abc"
            ],
            vec![
                State::Allow,
                State::Disallow,
                State::Allow,
                State::Disallow,
                State::Allow,
                State::Allow,
                State::Allow,
                State::Allow,
            ],
        );
    }

    #[test]
    fn test_compile_with_wildcard() {
        assert_compiles_to(
            vec![Rule::Disallow("/"), Rule::Allow("/a"), Rule::Allow("/*.b")],
            vec![
                vec![t!('*' => 0)],
                vec![t!('*' => 1)],
                vec![t!('*' => 0), t!('/' => 3)],                             // ""
                vec![t!('*' => 1), t!('*' => 4), t!('a' => 5), t!('.' => 6)], // "/"
                vec![t!('*' => 4), t!('.' => 6)],                             // "/*"
                vec![t!('*' => 0)],                                           // "/a"
                vec![t!('*' => 1), t!('b' => 7)],                             // "/*."
                vec![t!('*' => 0)],                                           // "/*.b"
            ],
            vec![
                State::Allow,
                State::Disallow,
                State::Allow,
                State::Disallow,
                State::Disallow,
                State::Allow,
                State::Disallow,
                State::Allow,
            ],
        );
    }

    #[test]
    fn test_compile_tricky_wildcard() {
        assert_compiles_to(
            vec![Rule::Disallow("/"), Rule::Allow("/*.")],
            vec![
                vec![t!('*' => 0)],
                vec![t!('*' => 1)],
                vec![t!('*' => 0), t!('/' => 3)],               // ""
                vec![t!('*' => 1), t!('*' => 4), t!('.' => 5)], // "/"
                vec![t!('*' => 4), t!('.' => 5)],               // "/*"
                vec![t!('*' => 0)],                             // "/*."
            ],
            vec![
                State::Allow,
                State::Disallow,
                State::Allow,
                State::Disallow,
                State::Disallow,
                State::Allow,
            ],
        );
    }

    #[test]
    fn test_compile_with_eow() {
        assert_compiles_to(
            vec![
                Rule::Allow("/"),
                Rule::Disallow("/a$"),
                // Note that this rule is nonsensical. It will compile, but
                // no guarantees are made as to how it's matched. Rules should
                // use url-encoded strings to escape $.
                Rule::Disallow("/x$y"),
            ],
            vec![
                vec![t!('*' => 0)],
                vec![t!('*' => 1)],
                vec![t!('*' => 0), t!('/' => 3)],               // ""
                vec![t!('*' => 0), t!('a' => 4), t!('x' => 5)], // "/"
                vec![t!('*' => 0), t!('$' => 6)],               // "/a"
                vec![t!('*' => 0), t!('$' => 7)],               // "/x"
                vec![t!('*' => 0)],                             // "/a$"
                vec![t!('*' => 0), t!('y' => 8)],               // "/x$"
                vec![t!('*' => 1)],                             // "/x$y"
            ],
            vec![
                State::Allow,
                State::Disallow,
                State::Allow,
                State::Allow,
                State::Allow,
                State::Allow,
                State::Disallow,
                State::Allow,
                State::Disallow,
            ],
        );
    }

    #[test]
    fn test_allow() {
        let machine = Cylon::compile(vec![
            Rule::Disallow("/"),
            Rule::Allow("/a"),
            Rule::Allow("/abc"),
            Rule::Allow("/b"),
        ]);
        assert_verdicts(
            &machine,
            &[
                (false, "/"),
                (true, "/a"),
                (true, "/a/b"),
                (true, "/a"),
                (true, "/abc"),
                (true, "/abc/def"),
                (true, "/b"),
                (true, "/b/c"),
            ],
        );
    }

    #[test]
    fn test_allow_match_any() {
        let machine = Cylon::compile(vec![
            Rule::Allow("/"),
            Rule::Disallow("/secret/*.txt"),
            Rule::Disallow("/private/*"),
        ]);
        assert_verdicts(
            &machine,
            &[
                (true, "/"),
                (true, "/abc"),
                (false, "/secret/abc.txt"),
                (false, "/secret/123.txt"),
                (true, "/secret/abc.csv"),
                (true, "/secret/123.csv"),
                (false, "/private/abc.txt"),
                (false, "/private/123.txt"),
                (false, "/private/abc.csv"),
                (false, "/private/123.csv"),
            ],
        );
    }

    #[test]
    fn test_allow_match_eow() {
        let machine = Cylon::compile(vec![
            Rule::Allow("/"),
            Rule::Disallow("/ignore$"),
            Rule::Disallow("/foo$bar"),
        ]);
        assert_verdicts(
            &machine,
            &[
                (true, "/"),
                (true, "/abc"),
                (false, "/ignore"),
                (true, "/ignoreabc"),
                (true, "/ignore/abc"),
                // These are technically undefined, and no behavior
                // is guaranteed since the rule is malformed. However
                // it is safer to accept them rather than reject them.
                (true, "/foo"),
                (true, "/foo$bar"),
            ],
        );
    }

    #[test]
    fn test_allow_more_complicated() {
        let machine = Cylon::compile(vec![
            Rule::Allow("/"),
            Rule::Disallow("/a$"),
            Rule::Disallow("/abc"),
            Rule::Allow("/abc/*"),
            Rule::Disallow("/foo/bar"),
            Rule::Allow("/*/bar"),
            Rule::Disallow("/www/*/images"),
            Rule::Allow("/www/public/images"),
        ]);
        assert_verdicts(
            &machine,
            &[
                (true, "/"),
                (true, "/directory"),
                (false, "/a"),
                (true, "/ab"),
                (false, "/abc"),
                (true, "/abc/123"),
                (true, "/foo"),
                (true, "/foobar"),
                (false, "/foo/bar"),
                (false, "/foo/bar/baz"),
                (true, "/baz/bar"),
                (false, "/www/cat/images"),
                (true, "/www/public/images"),
            ],
        );
    }

    #[test]
    fn test_matches() {
        // Test cases from:
        // https://developers.google.com/search/reference/robots_txt#group-member-rules

        // "/fish" and "/fish*" are expected to behave identically.
        let fish_cases = [
            (true, "/fish"),
            (true, "/fish.html"),
            (true, "/fish/salmon.html"),
            (true, "/fishheads.html"),
            (true, "/fishheads/yummy.html"),
            (true, "/fish.php?id=anything"),
            (false, "/Fish.asp"),
            (false, "/catfish"),
            (false, "/?id=fish"),
        ];

        let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish")]);
        assert_verdicts(&machine, &fish_cases);

        let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*")]);
        assert_verdicts(&machine, &fish_cases);

        let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish/")]);
        assert_verdicts(
            &machine,
            &[
                (true, "/fish/"),
                (true, "/fish/?id=anything"),
                (true, "/fish/salmon.htm"),
                (false, "/fish"),
                (false, "/fish.html"),
                (false, "/Fish/Salmon.asp"),
            ],
        );

        let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php")]);
        assert_verdicts(
            &machine,
            &[
                (true, "/filename.php"),
                (true, "/folder/filename.php"),
                (true, "/folder/filename.php?parameters"),
                (true, "/folder/any.php.file.html"),
                (true, "/filename.php/"),
                (false, "/"),
                (false, "/windows.PHP"),
            ],
        );

        let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/*.php$")]);
        assert_verdicts(
            &machine,
            &[
                (true, "/filename.php"),
                (true, "/folder/filename.php"),
                (false, "/filename.php?parameters"),
                (false, "/filename.php/"),
                (false, "/filename.php5"),
                (false, "/windows.PHP"),
            ],
        );

        let machine = Cylon::compile(vec![Rule::Disallow("/"), Rule::Allow("/fish*.php")]);
        assert_verdicts(
            &machine,
            &[
                (true, "/fish.php"),
                (true, "/fishheads/catfish.php?parameters"),
                (false, "/Fish.PHP"),
            ],
        );
    }
}
|
|
@ -0,0 +1,5 @@
|
|||
mod dfa;
|
||||
mod parse;
|
||||
|
||||
pub use dfa::Cylon;
|
||||
pub use parse::Compiler;
|
|
@ -0,0 +1,381 @@
|
|||
use super::dfa::{Cylon, Rule};
|
||||
use futures_util::{
|
||||
io::{AsyncBufRead, AsyncRead, BufReader, Result},
|
||||
AsyncBufReadExt,
|
||||
};
|
||||
use serde_derive::{Deserialize, Serialize};
|
||||
const UA_PREFIX: &str = "user-agent:";
|
||||
const ALLOW_PREFIX: &str = "allow:";
|
||||
const DISALLOW_PREFIX: &str = "disallow:";
|
||||
|
||||
#[derive(Debug, PartialEq, Clone)]
|
||||
enum ParsedRule {
|
||||
Allow(String),
|
||||
Disallow(String),
|
||||
}
|
||||
|
||||
impl<'a> Into<Rule<'a>> for &'a ParsedRule {
|
||||
fn into(self) -> Rule<'a> {
|
||||
match self {
|
||||
ParsedRule::Allow(path) => Rule::Allow(&path[..]),
|
||||
ParsedRule::Disallow(path) => Rule::Disallow(&path[..]),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
enum ParsedLine {
|
||||
UserAgent(String),
|
||||
Rule(ParsedRule),
|
||||
Nothing,
|
||||
}
|
||||
|
||||
/// A compiler takes an input robots.txt file and outputs a compiled Cylon,
|
||||
/// which can be used to efficiently match a large number of paths against
|
||||
/// the robots.txt file.
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct Compiler {
|
||||
user_agent: String,
|
||||
}
|
||||
|
||||
impl Compiler {
|
||||
/// Build a new compiler that parses rules for the given user agent from
|
||||
/// a robots.txt file.
|
||||
pub fn new(user_agent: &str) -> Self {
|
||||
Self {
|
||||
user_agent: user_agent.to_lowercase(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse an input robots.txt file into a Cylon that can recognize
|
||||
/// whether or not a path matches the rules for the Parser's user agent.
|
||||
pub async fn compile<R: AsyncRead + Unpin>(&self, file: R) -> Result<Cylon> {
|
||||
let reader = BufReader::new(file);
|
||||
let mut agent = String::new();
|
||||
let mut rules: Vec<ParsedRule> = vec![];
|
||||
let mut group_reader = GroupReader::new(reader);
|
||||
|
||||
// find the most specific matching group in the robots file
|
||||
while let Some(agents) = group_reader.next_header().await? {
|
||||
let matching_agent = agents.iter().find(|a| {
|
||||
let matches = &a[..] == "*" || self.user_agent.contains(*a);
|
||||
let more_specific = a.len() > agent.len();
|
||||
matches && more_specific
|
||||
});
|
||||
|
||||
if let Some(matching_agent) = matching_agent {
|
||||
agent = matching_agent.clone();
|
||||
rules = group_reader.next_rules().await?;
|
||||
}
|
||||
}
|
||||
|
||||
let rules = rules.iter().map(|r| r.into()).collect();
|
||||
Ok(Cylon::compile(rules))
|
||||
}
|
||||
}
|
||||
|
||||
struct GroupReader<R: AsyncBufRead + Unpin> {
|
||||
parsing_agents: bool,
|
||||
agents: Vec<String>,
|
||||
rules: Vec<ParsedRule>,
|
||||
reader: R,
|
||||
}
|
||||
|
||||
impl<R: AsyncBufRead + Unpin> GroupReader<R> {
|
||||
fn new(reader: R) -> Self {
|
||||
Self {
|
||||
parsing_agents: true,
|
||||
agents: vec![],
|
||||
rules: vec![],
|
||||
reader,
|
||||
}
|
||||
}
|
||||
|
||||
/// Scan forward until the next group header defined by one or more
|
||||
/// user agent lines. This lets us optimize the lines we need to copy
|
||||
/// so we can skip over groups that don't match the desired user agent.
|
||||
async fn next_header(&mut self) -> Result<Option<Vec<String>>> {
|
||||
let mut buf = String::new();
|
||||
while self.reader.read_line(&mut buf).await? != 0 {
|
||||
let parsed_line = parse_line(buf.clone());
|
||||
|
||||
match parsed_line {
|
||||
ParsedLine::UserAgent(ua) if self.parsing_agents => {
|
||||
self.agents.push(ua);
|
||||
}
|
||||
ParsedLine::UserAgent(ua) => {
|
||||
self.agents = vec![ua];
|
||||
self.rules = vec![];
|
||||
self.parsing_agents = true;
|
||||
}
|
||||
ParsedLine::Rule(rule) if self.parsing_agents => {
|
||||
// Preserve the rule in case we need it in next_rules().
|
||||
self.rules.push(rule);
|
||||
self.parsing_agents = false;
|
||||
break;
|
||||
}
|
||||
// Skip over lines until we get to the next user agent.
|
||||
ParsedLine::Rule(..) => (),
|
||||
ParsedLine::Nothing => (),
|
||||
}
|
||||
|
||||
buf.clear();
|
||||
}
|
||||
|
||||
let agents = self.agents.clone();
|
||||
self.agents = vec![];
|
||||
|
||||
if agents.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Ok(Some(agents))
|
||||
}
|
||||
|
||||
async fn next_rules(&mut self) -> Result<Vec<ParsedRule>> {
|
||||
let mut buf = String::new();
|
||||
while self.reader.read_line(&mut buf).await? != 0 {
|
||||
let parsed_line = parse_line(buf.clone());
|
||||
|
||||
match parsed_line {
|
||||
ParsedLine::Rule(rule) => {
|
||||
self.rules.push(rule);
|
||||
self.parsing_agents = false;
|
||||
}
|
||||
ParsedLine::UserAgent(ua) if !self.parsing_agents => {
|
||||
// Preserve the agent in case we need it in next_agents().
|
||||
self.agents.push(ua);
|
||||
self.parsing_agents = true;
|
||||
break;
|
||||
}
|
||||
// Skip over lines until we get to the next rule.
|
||||
ParsedLine::UserAgent(..) => (),
|
||||
ParsedLine::Nothing => (),
|
||||
}
|
||||
|
||||
buf.clear();
|
||||
}
|
||||
|
||||
let rules = self.rules.clone();
|
||||
self.rules = vec![];
|
||||
Ok(rules)
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_line(line: String) -> ParsedLine {
|
||||
let line = strip_comments(&line[..]).trim();
|
||||
|
||||
// This tries to parse lines roughly in order of most frequent kind to
|
||||
// least frequent kind in order to minimize CPU cycles on average.
|
||||
parse_disallow(line)
|
||||
.map(|s| ParsedLine::Rule(ParsedRule::Disallow(s.into())))
|
||||
.or_else(|| parse_user_agent(line).map(|s| ParsedLine::UserAgent(s.to_lowercase())))
|
||||
.or_else(|| parse_allow(line).map(|s| ParsedLine::Rule(ParsedRule::Allow(s.into()))))
|
||||
.unwrap_or(ParsedLine::Nothing)
|
||||
}
|
||||
|
||||
/// Returns the part of `line` that precedes the first `#`, or the whole
/// line when it contains no `#`.
fn strip_comments(line: &str) -> &str {
    line.find('#').map_or(line, |comment_start| &line[..comment_start])
}
|
||||
|
||||
fn parse_user_agent(line: &str) -> Option<&str> {
|
||||
if line.len() < UA_PREFIX.len() {
|
||||
return None;
|
||||
}
|
||||
let prefix = &line[..UA_PREFIX.len()].to_ascii_lowercase();
|
||||
let suffix = &line[UA_PREFIX.len()..];
|
||||
|
||||
if prefix == UA_PREFIX {
|
||||
Some(suffix.trim())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_allow(line: &str) -> Option<&str> {
|
||||
if line.len() < ALLOW_PREFIX.len() {
|
||||
return None;
|
||||
}
|
||||
let prefix = &line[..ALLOW_PREFIX.len()].to_ascii_lowercase();
|
||||
let suffix = &line[ALLOW_PREFIX.len()..];
|
||||
|
||||
if prefix == ALLOW_PREFIX {
|
||||
Some(suffix.trim())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_disallow(line: &str) -> Option<&str> {
|
||||
if line.len() < DISALLOW_PREFIX.len() {
|
||||
return None;
|
||||
}
|
||||
let prefix = &line[..DISALLOW_PREFIX.len()].to_ascii_lowercase();
|
||||
let suffix = &line[DISALLOW_PREFIX.len()..];
|
||||
|
||||
if prefix == DISALLOW_PREFIX {
|
||||
Some(suffix.trim())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Compiles `robots` on behalf of `user_agent`, blocking until done.
    fn machine_for(user_agent: &str, robots: &[u8]) -> Cylon {
        let parser = Compiler::new(user_agent);
        tokio_test::block_on(parser.compile(robots)).unwrap()
    }

    #[test]
    fn test_parse_allow() {
        let test_cases = [
            ("Allow: /", "/"),
            ("allow: / # Root with comment", "/"),
            ("ALLOW: /abc/def ", "/abc/def"),
            ("Allow: /abc/def ", "/abc/def"),
            (" Allow: /*/foo", "/*/foo"),
        ];

        for &(input, path) in test_cases.iter() {
            let expected = ParsedLine::Rule(ParsedRule::Allow(path.into()));
            assert_eq!(parse_line(input.into()), expected);
        }
    }

    #[test]
    fn test_parse_disallow() {
        let test_cases = [
            ("Disallow: /", "/"),
            ("disallow: / # Root with comment", "/"),
            ("DISALLOW: /abc/def ", "/abc/def"),
            ("Disallow: /abc/def ", "/abc/def"),
            (" Disallow: /*/foo", "/*/foo"),
        ];

        for &(input, path) in test_cases.iter() {
            let expected = ParsedLine::Rule(ParsedRule::Disallow(path.into()));
            assert_eq!(parse_line(input.into()), expected);
        }
    }

    #[test]
    fn test_parse_user_agent() {
        let test_cases = [
            ("User-agent: *", "*"),
            ("user-agent: ImABot # User agent with comment", "imabot"),
            (" USER-AGENT: ImABot ", "imabot"),
        ];

        for &(input, agent) in test_cases.iter() {
            assert_eq!(parse_line(input.into()), ParsedLine::UserAgent(agent.into()));
        }
    }

    #[test]
    fn test_parse_nothing() {
        let test_cases = [
            "Useragent: *",
            "# Comment",
            "",
            " ",
            "\t",
            "alow: /",
            "disalow: /",
        ];

        for &input in test_cases.iter() {
            assert_eq!(parse_line(input.into()), ParsedLine::Nothing);
        }
    }

    #[test]
    fn test_end_to_end() {
        let example_robots = r#"
User-agent: jones-bot
Disallow: /

User-agent: jones
User-agent: foobar
Allow: /

User-agent: *
Disallow: /
"#
        .as_bytes();

        let foobar_machine = machine_for("foobar", example_robots);
        let jonesbot_machine = machine_for("jones-bot", example_robots);
        let imabot_machine = machine_for("imabot", example_robots);
        let abc_machine = machine_for("abc", example_robots);

        assert_eq!(true, foobar_machine.allow("/index.html"));
        assert_eq!(false, jonesbot_machine.allow("/index.html"));
        assert_eq!(false, imabot_machine.allow("/index.html"));
        assert_eq!(false, abc_machine.allow("/index.html"));
    }

    #[test]
    fn test_invalid_1() {
        let example_robots = r#"
# Instead of treating this as an error, we'll just consider
# this behavior undefined.
Allow: /

User-agent: jones
User-agent: foobar
Disallow: /
"#
        .as_bytes();

        let foobar_machine = machine_for("foobar", example_robots);
        let imabot_machine = machine_for("imabot", example_robots);

        // Everything is allowed because next_header() returns None
        assert_eq!(true, foobar_machine.allow("/index.html"));
        assert_eq!(true, imabot_machine.allow("/index.html"));
    }

    #[test]
    fn test_invalid_2() {
        let example_robots = r#"
User-agent: jones
User-agent: foobar
Disallow: /

# Instead of treating this as an error, we consider this
# behavior undefined.
User-agent: imabot
"#
        .as_bytes();

        let foobar_machine = machine_for("foobar", example_robots);
        let imabot_machine = machine_for("imabot", example_robots);

        assert_eq!(false, foobar_machine.allow("/index.html"));
        assert_eq!(true, imabot_machine.allow("/index.html"));
    }
}
|
Loading…
Reference in New Issue