diff --git a/Cargo.lock b/Cargo.lock index ffb415d..990bc0e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -104,6 +104,27 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" +[[package]] +name = "bzip2" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6afcd980b5f3a45017c57e57a2fcccbb351cc43a356ce117ef760ef8052b89b0" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "cc" version = "1.0.73" @@ -169,6 +190,50 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-channel" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e54ea8bc3fb1ee042f5aace6e3c6e025d3874866da222930f70ce62aceba0bfa" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c00d6d2ea26e8b151d99093005cb442fb9a37aeaca582a03ec70946f49ab5ed9" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "lazy_static", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e5bed1f1c269533fa816a0a5492b3545209a205ca1a54842be180eb63a16a6" +dependencies = [ + "cfg-if", + "lazy_static", +] + [[package]] name = 
"cssparser" version = "0.27.2" @@ -332,6 +397,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fraction" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bb65943183b6b3cbf00f64c181e8178217e30194381b150e4f87ec59864c803" +dependencies = [ + "lazy_static", + "num", +] + [[package]] name = "futf" version = "0.1.5" @@ -476,6 +551,15 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "hermit-abi" version = "0.1.19" @@ -587,6 +671,25 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "include_dir" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "482a2e29200b7eed25d7fdbd14423326760b7f6658d21a4cf12d55a50713c69f" +dependencies = [ + "include_dir_macros", +] + +[[package]] +name = "include_dir_macros" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e074c19deab2501407c91ba1860fa3d6820bfde307db6d8cb851b55a10be89b" +dependencies = [ + "proc-macro2", + "quote", +] + [[package]] name = "indexmap" version = "1.8.0" @@ -681,6 +784,776 @@ version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "89be94dbd775db37b46ca4f4bf5cf89adfb13ba197bfbcb69b2122848ee73c26" +[[package]] +name = "lingua" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccdec52ba7d7377a580b0d22ef3835612ac953ed1fa41d6e29d87620b96cbd63" +dependencies = [ + "fraction", + "include_dir", + "itertools", + "lingua-afrikaans-language-model", + "lingua-albanian-language-model", + 
"lingua-arabic-language-model", + "lingua-armenian-language-model", + "lingua-azerbaijani-language-model", + "lingua-basque-language-model", + "lingua-belarusian-language-model", + "lingua-bengali-language-model", + "lingua-bokmal-language-model", + "lingua-bosnian-language-model", + "lingua-bulgarian-language-model", + "lingua-catalan-language-model", + "lingua-chinese-language-model", + "lingua-croatian-language-model", + "lingua-czech-language-model", + "lingua-danish-language-model", + "lingua-dutch-language-model", + "lingua-english-language-model", + "lingua-esperanto-language-model", + "lingua-estonian-language-model", + "lingua-finnish-language-model", + "lingua-french-language-model", + "lingua-ganda-language-model", + "lingua-georgian-language-model", + "lingua-german-language-model", + "lingua-greek-language-model", + "lingua-gujarati-language-model", + "lingua-hebrew-language-model", + "lingua-hindi-language-model", + "lingua-hungarian-language-model", + "lingua-icelandic-language-model", + "lingua-indonesian-language-model", + "lingua-irish-language-model", + "lingua-italian-language-model", + "lingua-japanese-language-model", + "lingua-kazakh-language-model", + "lingua-korean-language-model", + "lingua-latin-language-model", + "lingua-latvian-language-model", + "lingua-lithuanian-language-model", + "lingua-macedonian-language-model", + "lingua-malay-language-model", + "lingua-maori-language-model", + "lingua-marathi-language-model", + "lingua-mongolian-language-model", + "lingua-nynorsk-language-model", + "lingua-persian-language-model", + "lingua-polish-language-model", + "lingua-portuguese-language-model", + "lingua-punjabi-language-model", + "lingua-romanian-language-model", + "lingua-russian-language-model", + "lingua-serbian-language-model", + "lingua-shona-language-model", + "lingua-slovak-language-model", + "lingua-slovene-language-model", + "lingua-somali-language-model", + "lingua-sotho-language-model", + "lingua-spanish-language-model", + 
"lingua-swahili-language-model", + "lingua-swedish-language-model", + "lingua-tagalog-language-model", + "lingua-tamil-language-model", + "lingua-telugu-language-model", + "lingua-thai-language-model", + "lingua-tsonga-language-model", + "lingua-tswana-language-model", + "lingua-turkish-language-model", + "lingua-ukrainian-language-model", + "lingua-urdu-language-model", + "lingua-vietnamese-language-model", + "lingua-welsh-language-model", + "lingua-xhosa-language-model", + "lingua-yoruba-language-model", + "lingua-zulu-language-model", + "maplit", + "once_cell", + "rayon", + "regex", + "serde", + "serde_json", + "strum", + "strum_macros", + "zip", +] + +[[package]] +name = "lingua-afrikaans-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a5a9ec0e06d8afce002d93b8d3c41a5ae489228c7b6cc2dda882a41e186b9cb" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-albanian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a332101361229cfbf1c723fd6d5a31ea6925ffa031f5b848f26df7642e133f2" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-arabic-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea063b53093ebc05ba56130317a6cbe80723fa5d8da152fd5d7bc9b58450f19" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-armenian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9060db52a551fac2712b3f5051de8942716b4e1e2967856d864a956392d6ce3" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-azerbaijani-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d68232a1eb939d251f9f3468996aec8184647eb35d5575280ff22f102a7334f6" +dependencies = [ + "include_dir", +] + +[[package]] +name = 
"lingua-basque-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd8e0bf799ddf0fce013759fd77099c9a50360108b603b849cfe78820317efbe" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-belarusian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65226128d55fa9be0128caddbca7f7866b242d793df0f2f20dd02c7f0c5aadf1" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-bengali-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47002195ac6aa3283986023a8d7384fd469b403af1093dff0a4abfa913ddfeea" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-bokmal-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e69a2e872f146246d05254884b17100c732d663c73422413f0dd5a65a351798" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-bosnian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6cf6b0e45c26c805eebc59b21856f7b4860146ec5e3984e6291d5bfc422c127c" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-bulgarian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a904551e986c5bd5f55ed832893aba0bba80315e4f06f3aed45b9d7c13c4328f" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-catalan-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "099aaee95f04177b785bb791b603d53a8b0e3eda5eceef5c043633b0631457d7" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-chinese-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6015fe96965c7c4b1f1552931cfd993b710cc9c6a8f2b28655d79f20ad43092" 
+dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-croatian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1c07a2adaf7ee6986bcbf40d2787873e46c400fd103babc6427f506123e21e5" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-czech-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd8daaec821d88775d247154dd75962a3a5daf83dacae0e40aa33718c310e46e" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-danish-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04af91fa8c79572331842d111ba3ca75f785925dc3b144957c1cce7d1c5f728c" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-dutch-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae4a303904373d2b5d8165c579a3cd01dc0f2c11cfbeef73ff941c64c09e27f5" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-english-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa6ba281afeb7f767fa6a08b0e10eedc45944eb3a7994fa4dd49fdcc0c3981b2" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-esperanto-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b60ed4f4bfeef65890f9fa17c937b8258c12d1060bda4779a7b09555f67d7727" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-estonian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a95a66f981f6d3b09ec69b78ea3ef30e861c4380d7a1940a27380d053353cc" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-finnish-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"90657afd99b586d8c2ad2229e3de62a79cc7d631fbdcd6e148056b8363364caa" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-french-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b41d3ef75e2c6b4c363feabe66c34409add33bc04eeed48a06b1e196d9700d85" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-ganda-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb4c076bafdf27a59c35a58d0fa8067fa2dd048ee0a5e7ff1fd8db725ce986b" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-georgian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "647526a2b32d1e095dd23e3666a3104f3e69a0fe2f44dc0e904272dedfb0f189" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-german-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b878b826926f260313afb4568f10f87c1e55a2c32640f30ee127a83d87fcc115" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-greek-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c8f219cd4249326196eded87d284be6a1b12542b0d178a62b293e52e79b4095" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-gujarati-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2343159f19ef0b398e1390a6c829a5280c295349d44d171bd85ba909c5e793a2" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-hebrew-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26beada9b9791a8bb5fa1ef7715e1df182c8c5abdc88731b3d4afed7f9dced4e" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-hindi-language-model" +version = "1.0.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "30699440ee47c7b91cb2e0631985f05c225460f1e319a8136ca72a97c0bf5a84" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-hungarian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b048a1f8d446c0ecd73f16b14d8e7e66a16cd0382fd5baafa4e6a5bd5ea99e2" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-icelandic-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8129d934943bc989aa60371e2f1ff315d17a1129f2dc2a2096650921d296fe98" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-indonesian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee4ed10b915f8b29156adfa26ac33f64a77ea86d235ff83dc52b22384e78891e" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-irish-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "592fee9f34d0fcd24fd171d5320031606626304d40f860ce87532ad7830022a0" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-italian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24cec9eab6c9c137ef348c173da12bed857dd9d4d3fce8972de6d5ee80ca41ae" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-japanese-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d00bd26e2de75e813a1e9f204d047aae469f6d451b65773407f52422c780929" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-kazakh-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21da0ab799a77a6221f86c15227eb95fe3e092d2a31263f276a4c9aa43b75786" +dependencies = [ + "include_dir", +] + +[[package]] +name = 
"lingua-korean-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa37965f7449f2c731d830aab18d2a528b920befbb6e4405772d92d539242a0b" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-latin-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c48bace6da840f1a2627878219c0fa50d629d02add35e3561e5c75006b54d4b1" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-latvian-language-model" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fe0b49052fa9399bc70ccaf820c13fd514f132a027f966d7df5be2eba504e1f" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-lithuanian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ccfd1a636365e72433a7ca43bb32e5acc8b5e239919747a89d6afca9d60d7cb" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-macedonian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45db07681c6c72683a83fa97288135dd1ed090db0d1fefd563d01b49428183db" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-malay-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15efc930c9b3bce23b261527faf6cf8f3819f0f91c573760d7327d2f11a183e5" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-maori-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e517b2bf83f7cbd3cdc3b1603899612a71a54d830dd96be63b082ebab7dea000" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-marathi-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88462495f40ddd18c4e05fe6d067351e1337ddab8ae7fd7eba39b89630809367" 
+dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-mongolian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82ca6a89a7500525d2ec5e24a7ee7ed1cb93cda4116a3ac2a3cabe689ff03951" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-nynorsk-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "445d81e0d9b171c23eb8b8c37660dc6594d0bee801f5d0789cd7db17ceef58fe" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-persian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "795011db21b24cd51ad7cec8b8fe1a2452b4c26451d1bb91737eeb3434f6b507" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-polish-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17673ac03a100e9df1588fcc8f7dba0124d2055f96d1169626e805cedb332205" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-portuguese-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfb604ee45a96a8be1c4b80776bf29399fcedc55b65912d28aa7e3ef726dda77" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-punjabi-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d9868ddd52a367dd2cd99eb135a328f2493631a03908894193bcf286d2a1d0" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-romanian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3645b4134f3aa7e756212ca5f3b79eb2acfda815aef9b35da15ae299c2423a76" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-russian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e2b092363a5834298cb7f1632c184f0877103fb88ad82bf6f07edcb871f0fd85" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-serbian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f19064a23062e81711d7cdc1feb6c51fad175ca3558cb0ad18af8324d0eb5" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-shona-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3894f3c223ffc05b7a9f8aab860e61242eb7c63ad109c7c800e959a6879fe1e5" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-slovak-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "150ccbde4f5e0c2ea3b4e50e5438918df0d54e14734478d59bd446b66efac561" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-slovene-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7991cf3a04fc61fe3b6624dd2077c46a5c2b321de4440a809b5efddb94e9a689" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-somali-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccb563dc178d1642f94976e3b3e54f62fa4526c39ad80c4a9405dc7507baf52b" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-sotho-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00e2eadbac81290fe6af5ce634af3de635a2c66025e9774cb0843ae1660c0a0a" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-spanish-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fa5ff519621176934e2edd0eb38bdc9ffe8f187e02acacc8c8102192d76f51e" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-swahili-language-model" +version = "1.0.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "19e9901282f3061f1ca137dc7e0187a982da23044be1176342e15770f0ce29a5" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-swedish-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c434ce21aefb3bc8d88d91235f806ac12406ed035dff36c284777b721f300c0" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-tagalog-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93bc8cf068b1ac4839263f9ded63106597178c53d4189a3d8c48f994fbd38e4c" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-tamil-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d953994ec7e7bc80d5930bbb5ded5436a34f67734101a108d2b62c36a3a9310e" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-telugu-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "232aad872671ff53c50a96a91e5d1c886b191217adf41674b777f651fecef287" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-thai-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c01dc8ced43c87d37c869528a353681a2bac9c393352e96b820f99120374dc93" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-tsonga-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5990437427d9dee91f8d3f3059d6861dce98719022b8345cbd096043a45e242" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-tswana-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19fb235b7b05852f6bded0ffc1890b1da708f1490a41d1c41a417583e406dbf7" +dependencies = [ + "include_dir", +] + +[[package]] +name = 
"lingua-turkish-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f58513bff0ce9503eed82522dc65813370e0116c897c30703f66aa8894788373" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-ukrainian-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c39f40906c186265e12458e73d46767a5211dfad25d5df538f15cee6c348782" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-urdu-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "984de2de71dda05cf0c09af752d0370dd8d2abffa25bee54749963918663406b" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-vietnamese-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9490c7234aed122e201e629af6854de5752bb879ff3f6ed5aa0ff65fa52fb5c" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-welsh-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320df870ddd01919b4815e1a361d753ce1f77dd1abe8db10526ca9093af9f4b9" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-xhosa-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4f5efd0a5cd523fa59d2051cb073b94f4c0ebe4653f6bbeb00e8bd2965be536" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-yoruba-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1db77f9689eb87a89eb715a212ac381f132cc92972f160b4e2c58fe1a2dacb3" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-zulu-language-model" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4750bd5fe9fcb76c7626b6d267a930758509317947ab0931e7fe055c7c099403" 
+dependencies = [ + "include_dir", +] + [[package]] name = "lock_api" version = "0.4.6" @@ -705,6 +1578,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" +[[package]] +name = "maplit" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" + [[package]] name = "markup5ever" version = "0.10.1" @@ -731,6 +1610,15 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" +[[package]] +name = "memoffset" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" +dependencies = [ + "autocfg", +] + [[package]] name = "mime" version = "0.3.16" @@ -809,6 +1697,41 @@ dependencies = [ "winapi", ] +[[package]] +name = "num" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8536030f9fea7127f841b45bb6243b27255787fb4eb83958aa1ef9d2fdc0c36" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "090c7f9998ee0ff65aa5b723e4009f7b217707f1fb5ea551329cc4d6231fb304" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6b19411a9719e753aff12e5187b74d60d3dc449ec3f4dc21e3989c3f554bc95" +dependencies = [ + "autocfg", + "num-traits", +] + [[package]] name = "num-integer" version = "0.1.44" @@ -819,6 +1742,29 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" 
+version = "0.1.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2021c8337a54d21aca0d59a92577a029af9431cb59b909b03252b9c164fad59" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c000134b5dbf44adc5cb772486d335293351644b801551abe8f75c84cfa4aef" +dependencies = [ + "autocfg", + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.14" @@ -1082,6 +2028,7 @@ dependencies = [ "itertools", "kuchiki", "lazy_static", + "lingua", "log", "quickpeep_densedoc", "quickpeep_moz_readability", @@ -1186,6 +2133,31 @@ dependencies = [ "rand_core", ] +[[package]] +name = "rayon" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +dependencies = [ + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "lazy_static", + "num_cpus", +] + [[package]] name = "redox_syscall" version = "0.2.11" @@ -1327,6 +2299,12 @@ dependencies = [ "webpki", ] +[[package]] +name = "rustversion" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2cc38e8fa666e2de3c4aba7edeb5ffc5246c1c2ed0e3d17e560aeeba736b23f" + [[package]] name = "ryu" version = "1.0.9" @@ -1563,6 +2541,25 @@ dependencies = [ "quote", ] +[[package]] +name = "strum" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb" + +[[package]] +name = 
"strum_macros" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + [[package]] name = "syn" version = "1.0.86" @@ -1788,6 +2785,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-segmentation" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99" + [[package]] name = "unicode-xid" version = "0.2.2" @@ -2054,3 +3057,17 @@ name = "xml-rs" version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2d7d3948613f75c98fd9328cfdcc45acc4d360655289d0a7d4ec931392200a3" + +[[package]] +name = "zip" +version = "0.5.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93ab48844d61251bb3835145c521d88aa4031d7139e8485990f60ca911fa0815" +dependencies = [ + "byteorder", + "bzip2", + "crc32fast", + "flate2", + "thiserror", + "time", +] diff --git a/quickpeep/Cargo.toml b/quickpeep/Cargo.toml index 4db5359..2e42c76 100644 --- a/quickpeep/Cargo.toml +++ b/quickpeep/Cargo.toml @@ -33,6 +33,8 @@ ipnetwork = "0.18.0" futures-util = "0.3.21" +lingua = "1.3.3" + ### Raking helpers # HTTP Requests reqwest = { version = "0.11.9", features = ["stream"] } diff --git a/quickpeep/src/raking.rs b/quickpeep/src/raking.rs index f2ca1a8..dc32b86 100644 --- a/quickpeep/src/raking.rs +++ b/quickpeep/src/raking.rs @@ -1,16 +1,20 @@ -use crate::raking::analysis::{analyse_with_ad_block_cosmetic_filter, IpSet}; +use crate::raking::analysis::{ + analyse_with_ad_block_cosmetic_filter, guess_document_language, IpSet, +}; use adblock::engine::Engine; use anyhow::{bail, Context}; use chrono::{DateTime, FixedOffset, Utc}; use cylon::Cylon; use futures_util::stream::StreamExt; use html5ever::tendril::fmt::Slice; +use 
itertools::Itertools; use kuchiki::traits::TendrilSink; use kuchiki::NodeRef; use lazy_static::lazy_static; use log::debug; use quickpeep_densedoc::DenseTree; use quickpeep_structs::rake_entries::AnalysisAntifeatures; +use reqwest::header::HeaderMap; use reqwest::{Client, Response, Url}; use serde::{Deserialize, Serialize}; use sitemap::reader::SiteMapEntity; @@ -214,11 +218,12 @@ impl Raker { })); }; + let headers = response.headers().clone(); let content = response_to_bytes_limited(response, SIZE_LIMIT, TIME_LIMIT).await?; if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) { - match self.rake_html_page(&content, url, is_cf) { + match self.rake_html_page(&content, url, is_cf, &headers) { Ok(page_rake) => { return Ok(page_rake); } @@ -264,6 +269,7 @@ impl Raker { content: &[u8], url: &Url, is_cf: bool, + headers: &HeaderMap, ) -> anyhow::Result { let content_str = std::str::from_utf8(content)?; @@ -286,6 +292,32 @@ impl Raker { } } + // Try and dig up the page's language. + // First try since this is the modern way, and potentially the most trustworthy... 
+ let mut language = None; + + if let Ok(html_node) = root_node.select_first("html") { + if let Some(lang) = html_node.attributes.borrow().get("lang") { + language = Some(lang.trim().to_string()); + } + } + + if language.is_none() { + // Next fallback: prefer the content-language header baked into the page itself + if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") { + if let Some(lang) = meta_node.attributes.borrow().get("content") { + language = Some(lang.trim().to_string()); + } + } + } + + if language.is_none() { + // Next fallback: prefer the content-language received as a header + if let Some(lang) = headers.get("content-language") { + language = Some(lang.to_str()?.to_owned()); + } + } + let mut antifeature_flags = AnalysisAntifeatures::empty(); if is_cf { @@ -311,6 +343,18 @@ impl Raker { let dense_doc = DenseTree::from_body(root_node.clone()); let dense_doc_text = DenseTree::generate_textual_format(&dense_doc); + + if language.is_none() { + // Final fallback: guess the language + language = guess_document_language(&dense_doc_text); + } + + // Try and enforce some consistency in the language code; + // we want codes like en_US rather than en-us. 
+        if let Some(language) = language.as_mut() {
+            normalise_language(language);
+        }
+
         eprintln!("~~~~~\n{}\n~~~~~", dense_doc_text);
         eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc);
 
@@ -334,6 +378,23 @@ impl Raker {
     }
 }
 
+/// Normalises a language code in place to the `ll_CC` form (e.g. `en-us` becomes `en_US`).
+///
+/// Subtags may be separated by `-` or `_`. The second subtag (the dialect) is
+/// upper-cased, every other subtag is lower-cased, and the subtags are re-joined with `_`.
+/// A code with a single subtag is simply lower-cased; an empty string stays empty.
+pub fn normalise_language(lang_string: &mut String) {
+    let mut pieces = lang_string
+        .split(|c: char| c == '-' || c == '_')
+        .map(str::to_lowercase)
+        .collect_vec();
+    if let Some(dialect) = pieces.get_mut(1) {
+        *dialect = dialect.to_uppercase();
+    }
+    // Write the rebuilt code back: the caller only sees changes made through `lang_string`.
+    *lang_string = pieces.join("_");
+}
+
 pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result> {
     let feed = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?;
 
diff --git a/quickpeep/src/raking/analysis.rs b/quickpeep/src/raking/analysis.rs
index 2101da0..8204a3b 100644
--- a/quickpeep/src/raking/analysis.rs
+++ b/quickpeep/src/raking/analysis.rs
@@ -3,6 +3,7 @@ use adblock::lists::{ParseOptions, RuleTypes};
 use anyhow::Context;
 use ipnetwork::IpNetwork;
 use kuchiki::NodeRef;
+use lingua::Language;
 use std::collections::{BTreeSet, HashSet};
 use std::net::IpAddr;
 use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
@@ -103,6 +104,17 @@ pub fn analyse_with_ad_block_cosmetic_filter(
     Ok(matches > 0)
 }
 
+/// Guesses the language of `text` using a `lingua` detector built from all languages.
+///
+/// Returns the ISO 639-1 code of the detected language as a string, or `None` when
+/// `detect_language_of` yields no language.
+pub fn guess_document_language(text: &str) -> Option<String> {
+    let detector = lingua::LanguageDetectorBuilder::from_all_languages().build();
+    detector
+        .detect_language_of(text)
+        .map(|lang: Language| lang.iso_code_639_1().to_string())
+}
+
 // TODO this isn't particularly efficient. Probably want a trie if it's important...
 pub struct IpSet {
     ips: BTreeSet,