Compare commits
	
		
			1 Commit
		
	
	
		
			main
			...
			rei/rakers
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 2876939b59 | 
							
								
								
									
										343
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										343
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @ -15,7 +15,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "7ab3f32d1eb0f323dcdd51cbb8d68cff415153870ff3bd60e12d5d56298bfcb1" | ||||
| dependencies = [ | ||||
|  "addr", | ||||
|  "base64", | ||||
|  "base64 0.13.0", | ||||
|  "bitflags", | ||||
|  "flate2", | ||||
|  "idna", | ||||
| @ -206,9 +206,9 @@ dependencies = [ | ||||
| 
 | ||||
| [[package]] | ||||
| name = "atoi" | ||||
| version = "0.4.0" | ||||
| version = "1.0.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "616896e05fc0e2649463a93a15183c6a16bf03413a7af88ef1285ddedfa9cda5" | ||||
| checksum = "d7c57d12312ff59c811c0643f4d80830505833c9ffaebd193d819392b265be8e" | ||||
| dependencies = [ | ||||
|  "num-traits", | ||||
| ] | ||||
| @ -318,6 +318,12 @@ version = "0.13.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "base64" | ||||
| version = "0.21.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "bindgen" | ||||
| version = "0.59.2" | ||||
| @ -394,15 +400,6 @@ dependencies = [ | ||||
|  "generic-array 0.12.4", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "block-buffer" | ||||
| version = "0.9.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" | ||||
| dependencies = [ | ||||
|  "generic-array 0.14.5", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "block-buffer" | ||||
| version = "0.10.2" | ||||
| @ -700,18 +697,18 @@ dependencies = [ | ||||
| 
 | ||||
| [[package]] | ||||
| name = "crc" | ||||
| version = "2.1.0" | ||||
| version = "3.0.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "49fc9a695bca7f35f5f4c15cddc84415f66a74ea78eef08e90c5024f2b540e23" | ||||
| checksum = "86ec7a15cbe22e59248fc7eadb1907dab5ba09372595da4d73dd805ed4417dfe" | ||||
| dependencies = [ | ||||
|  "crc-catalog", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "crc-catalog" | ||||
| version = "1.1.1" | ||||
| version = "2.2.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "ccaeedb56da03b09f598226e25e80088cb4cd25f316e6e4df7d695f0feeb1403" | ||||
| checksum = "9cace84e55f07e7301bae1c519df89cdad8cc3cd868413d3fdbdeca9ff3db484" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "crc32fast" | ||||
| @ -947,15 +944,6 @@ dependencies = [ | ||||
|  "generic-array 0.12.4", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "digest" | ||||
| version = "0.9.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" | ||||
| dependencies = [ | ||||
|  "generic-array 0.14.5", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "digest" | ||||
| version = "0.10.3" | ||||
| @ -978,10 +966,30 @@ dependencies = [ | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "dotenv" | ||||
| version = "0.15.0" | ||||
| name = "dirs" | ||||
| version = "4.0.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "77c90badedccf4105eca100756a0b1289e191f6fcbdadd3cee1d2f614f97da8f" | ||||
| checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059" | ||||
| dependencies = [ | ||||
|  "dirs-sys", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "dirs-sys" | ||||
| version = "0.3.7" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6" | ||||
| dependencies = [ | ||||
|  "libc", | ||||
|  "redox_users", | ||||
|  "winapi", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "dotenvy" | ||||
| version = "0.15.7" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "downcast-rs" | ||||
| @ -1009,6 +1017,9 @@ name = "either" | ||||
| version = "1.6.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" | ||||
| dependencies = [ | ||||
|  "serde", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "encoding_rs" | ||||
| @ -1243,9 +1254,9 @@ dependencies = [ | ||||
| 
 | ||||
| [[package]] | ||||
| name = "futures-core" | ||||
| version = "0.3.21" | ||||
| version = "0.3.28" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3" | ||||
| checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "futures-executor" | ||||
| @ -1349,13 +1360,13 @@ checksum = "aa12dfaa57be769c6681b4d193398cae8db7f7b9af3e86d362d7f0a3c294a1a0" | ||||
| dependencies = [ | ||||
|  "anyhow", | ||||
|  "ring", | ||||
|  "rustls", | ||||
|  "rustls 0.19.1", | ||||
|  "thiserror", | ||||
|  "tokio", | ||||
|  "tokio-rustls", | ||||
|  "tokio-rustls 0.22.0", | ||||
|  "url", | ||||
|  "webpki", | ||||
|  "webpki-roots", | ||||
|  "webpki 0.21.4", | ||||
|  "webpki-roots 0.21.1", | ||||
|  "x509-signature", | ||||
| ] | ||||
| 
 | ||||
| @ -1453,12 +1464,21 @@ dependencies = [ | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "hashlink" | ||||
| version = "0.7.0" | ||||
| name = "hashbrown" | ||||
| version = "0.12.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "7249a3129cbc1ffccd74857f81464a323a152173cdb134e0fd81bc803b29facf" | ||||
| checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" | ||||
| dependencies = [ | ||||
|  "hashbrown", | ||||
|  "ahash 0.7.6", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "hashlink" | ||||
| version = "0.8.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "69fe1fcf8b4278d860ad0548329f892a3631fb63f82574df68275f34cdbe0ffa" | ||||
| dependencies = [ | ||||
|  "hashbrown 0.12.3", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| @ -1467,7 +1487,7 @@ version = "7.5.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "31672b7011be2c4f7456c4ddbcb40e7e9a4a9fad8efe49a6ebaf5f307d0109c0" | ||||
| dependencies = [ | ||||
|  "base64", | ||||
|  "base64 0.13.0", | ||||
|  "byteorder", | ||||
|  "crossbeam-channel", | ||||
|  "flate2", | ||||
| @ -1489,6 +1509,9 @@ name = "heck" | ||||
| version = "0.4.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" | ||||
| dependencies = [ | ||||
|  "unicode-segmentation", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "hermit-abi" | ||||
| @ -1505,6 +1528,24 @@ version = "0.4.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "hkdf" | ||||
| version = "0.12.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "791a029f6b9fc27657f6f188ec6e5e43f6911f6f878e0dc5501396e09809d437" | ||||
| dependencies = [ | ||||
|  "hmac", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "hmac" | ||||
| version = "0.12.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" | ||||
| dependencies = [ | ||||
|  "digest 0.10.3", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "html5ever" | ||||
| version = "0.25.1" | ||||
| @ -1671,7 +1712,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "282a6247722caba404c065016bbfa522806e51714c34f5dfc3e4a3a46fcb4223" | ||||
| dependencies = [ | ||||
|  "autocfg", | ||||
|  "hashbrown", | ||||
|  "hashbrown 0.11.2", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| @ -1890,9 +1931,9 @@ dependencies = [ | ||||
| 
 | ||||
| [[package]] | ||||
| name = "libsqlite3-sys" | ||||
| version = "0.23.2" | ||||
| version = "0.24.2" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "d2cafc7c74096c336d9d27145f7ebd4f4b6f95ba16aa5a282387267e6925cb58" | ||||
| checksum = "898745e570c7d0453cc1fbc4a701eb6c662ed54e8fec8b7d14be137ebeeb9d14" | ||||
| dependencies = [ | ||||
|  "cc", | ||||
|  "pkg-config", | ||||
| @ -2720,7 +2761,7 @@ version = "0.7.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "fcb87f3080f6d1d69e8c564c0fcfde1d7aa8cc451ce40cae89479111f03bc0eb" | ||||
| dependencies = [ | ||||
|  "hashbrown", | ||||
|  "hashbrown 0.11.2", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| @ -2779,6 +2820,15 @@ version = "0.4.6" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "9376a4f0340565ad675d11fc1419227faf5f60cd7ac9cb2e7185a471f30af833" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "md-5" | ||||
| version = "0.10.4" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "66b48670c893079d3c2ed79114e3644b7004df1c361a4e0ad52e2e6940d07c3d" | ||||
| dependencies = [ | ||||
|  "digest 0.10.3", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "mdbx-sys" | ||||
| version = "0.11.4-git.20210105" | ||||
| @ -2924,7 +2974,7 @@ dependencies = [ | ||||
|  "atomic-shim", | ||||
|  "crossbeam-epoch", | ||||
|  "crossbeam-utils", | ||||
|  "hashbrown", | ||||
|  "hashbrown 0.11.2", | ||||
|  "metrics 0.18.1", | ||||
|  "num_cpus", | ||||
|  "parking_lot 0.11.2", | ||||
| @ -3219,12 +3269,6 @@ version = "0.2.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "opaque-debug" | ||||
| version = "0.3.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "openssl" | ||||
| version = "0.10.38" | ||||
| @ -3615,11 +3659,11 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "proc-macro2" | ||||
| version = "1.0.36" | ||||
| version = "1.0.55" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" | ||||
| checksum = "1d0dd4be24fcdcfeaa12a432d588dc59bbad6cad3510c67e74a2b6b2fc950564" | ||||
| dependencies = [ | ||||
|  "unicode-xid", | ||||
|  "unicode-ident", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| @ -3659,7 +3703,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "292972edad6bbecc137ab84c5e36421a4a6c979ea31d3cc73540dd04315b33e1" | ||||
| dependencies = [ | ||||
|  "byteorder", | ||||
|  "hashbrown", | ||||
|  "hashbrown 0.11.2", | ||||
|  "idna", | ||||
|  "psl-types", | ||||
| ] | ||||
| @ -3807,6 +3851,7 @@ dependencies = [ | ||||
|  "diplomatic-bag", | ||||
|  "env_logger", | ||||
|  "feed-rs", | ||||
|  "futures-core", | ||||
|  "futures-util", | ||||
|  "gemini-fetch", | ||||
|  "html5ever", | ||||
| @ -3815,12 +3860,10 @@ dependencies = [ | ||||
|  "itertools", | ||||
|  "kuchiki", | ||||
|  "lazy_static", | ||||
|  "libmdbx", | ||||
|  "lingua", | ||||
|  "log", | ||||
|  "lru", | ||||
|  "markup5ever", | ||||
|  "mdbx-sys", | ||||
|  "metrics 0.18.1", | ||||
|  "metrics-exporter-prometheus", | ||||
|  "metrics-process-promstyle", | ||||
| @ -3843,7 +3886,7 @@ dependencies = [ | ||||
|  "signal-hook 0.3.13", | ||||
|  "sitemap", | ||||
|  "smartstring", | ||||
|  "tempfile", | ||||
|  "sqlx", | ||||
|  "tikv-jemallocator", | ||||
|  "tokio", | ||||
|  "webp", | ||||
| @ -4017,6 +4060,17 @@ dependencies = [ | ||||
|  "bitflags", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "redox_users" | ||||
| version = "0.4.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" | ||||
| dependencies = [ | ||||
|  "getrandom 0.2.5", | ||||
|  "redox_syscall", | ||||
|  "thiserror", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "regex" | ||||
| version = "1.5.5" | ||||
| @ -4066,7 +4120,7 @@ version = "0.11.10" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "46a1f7aa4f35e5e8b4160449f51afc758f0ce6454315a9fa7d0d113e958c41eb" | ||||
| dependencies = [ | ||||
|  "base64", | ||||
|  "base64 0.13.0", | ||||
|  "bytes", | ||||
|  "encoding_rs", | ||||
|  "futures-core", | ||||
| @ -4150,7 +4204,7 @@ version = "0.7.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "1b861ecaade43ac97886a512b360d01d66be9f41f3c61088b42cedf92e03d678" | ||||
| dependencies = [ | ||||
|  "base64", | ||||
|  "base64 0.13.0", | ||||
|  "bitflags", | ||||
|  "serde", | ||||
| ] | ||||
| @ -4186,11 +4240,32 @@ version = "0.19.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7" | ||||
| dependencies = [ | ||||
|  "base64", | ||||
|  "base64 0.13.0", | ||||
|  "log", | ||||
|  "ring", | ||||
|  "sct", | ||||
|  "webpki", | ||||
|  "sct 0.6.1", | ||||
|  "webpki 0.21.4", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "rustls" | ||||
| version = "0.20.8" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "fff78fc74d175294f4e83b28343315ffcfb114b156f0185e9741cb5570f50e2f" | ||||
| dependencies = [ | ||||
|  "log", | ||||
|  "ring", | ||||
|  "sct 0.7.0", | ||||
|  "webpki 0.22.0", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "rustls-pemfile" | ||||
| version = "1.0.2" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b" | ||||
| dependencies = [ | ||||
|  "base64 0.21.0", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| @ -4237,6 +4312,16 @@ dependencies = [ | ||||
|  "untrusted", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "sct" | ||||
| version = "0.7.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" | ||||
| dependencies = [ | ||||
|  "ring", | ||||
|  "untrusted", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "seahash" | ||||
| version = "3.0.7" | ||||
| @ -4363,20 +4448,29 @@ dependencies = [ | ||||
|  "block-buffer 0.7.3", | ||||
|  "digest 0.8.1", | ||||
|  "fake-simd", | ||||
|  "opaque-debug 0.2.3", | ||||
|  "opaque-debug", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "sha1" | ||||
| version = "0.10.4" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "006769ba83e921b3085caa8334186b00cf92b4cb1a6cf4632fbccc8eff5c7549" | ||||
| dependencies = [ | ||||
|  "cfg-if", | ||||
|  "cpufeatures", | ||||
|  "digest 0.10.3", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "sha2" | ||||
| version = "0.9.9" | ||||
| version = "0.10.5" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800" | ||||
| checksum = "cf9db03534dff993187064c4e0c05a5708d2a9728ace9a8959b77bedf415dac5" | ||||
| dependencies = [ | ||||
|  "block-buffer 0.9.0", | ||||
|  "cfg-if", | ||||
|  "cpufeatures", | ||||
|  "digest 0.9.0", | ||||
|  "opaque-debug 0.3.0", | ||||
|  "digest 0.10.3", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| @ -4500,9 +4594,9 @@ dependencies = [ | ||||
| 
 | ||||
| [[package]] | ||||
| name = "sqlformat" | ||||
| version = "0.1.8" | ||||
| version = "0.2.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "b4b7922be017ee70900be125523f38bdd644f4f06a1b16e8fa5a8ee8c34bffd4" | ||||
| checksum = "0c12bc9199d1db8234678b7051747c07f517cdcf019262d1847b94ec8b1aee3e" | ||||
| dependencies = [ | ||||
|  "itertools", | ||||
|  "nom 7.1.1", | ||||
| @ -4511,9 +4605,9 @@ dependencies = [ | ||||
| 
 | ||||
| [[package]] | ||||
| name = "sqlx" | ||||
| version = "0.5.11" | ||||
| version = "0.6.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "fc15591eb44ffb5816a4a70a7efd5dd87bfd3aa84c4c200401c4396140525826" | ||||
| checksum = "f8de3b03a925878ed54a954f621e64bf55a3c1bd29652d0d1a17830405350188" | ||||
| dependencies = [ | ||||
|  "sqlx-core", | ||||
|  "sqlx-macros", | ||||
| @ -4521,18 +4615,22 @@ dependencies = [ | ||||
| 
 | ||||
| [[package]] | ||||
| name = "sqlx-core" | ||||
| version = "0.5.11" | ||||
| version = "0.6.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "195183bf6ff8328bb82c0511a83faf60aacf75840103388851db61d7a9854ae3" | ||||
| checksum = "fa8241483a83a3f33aa5fff7e7d9def398ff9990b2752b6c6112b83c6d246029" | ||||
| dependencies = [ | ||||
|  "ahash 0.7.6", | ||||
|  "atoi", | ||||
|  "base64 0.13.0", | ||||
|  "bitflags", | ||||
|  "byteorder", | ||||
|  "bytes", | ||||
|  "crc", | ||||
|  "crossbeam-queue", | ||||
|  "dirs", | ||||
|  "dotenvy", | ||||
|  "either", | ||||
|  "event-listener", | ||||
|  "flume", | ||||
|  "futures-channel", | ||||
|  "futures-core", | ||||
| @ -4541,16 +4639,24 @@ dependencies = [ | ||||
|  "futures-util", | ||||
|  "hashlink", | ||||
|  "hex", | ||||
|  "hkdf", | ||||
|  "hmac", | ||||
|  "indexmap", | ||||
|  "itoa 1.0.1", | ||||
|  "libc", | ||||
|  "libsqlite3-sys", | ||||
|  "log", | ||||
|  "md-5", | ||||
|  "memchr", | ||||
|  "once_cell", | ||||
|  "paste", | ||||
|  "percent-encoding", | ||||
|  "rustls", | ||||
|  "rand 0.8.5", | ||||
|  "rustls 0.20.8", | ||||
|  "rustls-pemfile", | ||||
|  "serde", | ||||
|  "serde_json", | ||||
|  "sha1", | ||||
|  "sha2", | ||||
|  "smallvec", | ||||
|  "sqlformat", | ||||
| @ -4559,22 +4665,25 @@ dependencies = [ | ||||
|  "thiserror", | ||||
|  "tokio-stream", | ||||
|  "url", | ||||
|  "webpki", | ||||
|  "webpki-roots", | ||||
|  "webpki-roots 0.22.6", | ||||
|  "whoami", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "sqlx-macros" | ||||
| version = "0.5.11" | ||||
| version = "0.6.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "eee35713129561f5e55c554bba1c378e2a7e67f81257b7311183de98c50e6f94" | ||||
| checksum = "9966e64ae989e7e575b19d7265cb79d7fc3cbbdf179835cb0d716f294c2049c9" | ||||
| dependencies = [ | ||||
|  "dotenv", | ||||
|  "dotenvy", | ||||
|  "either", | ||||
|  "heck 0.3.3", | ||||
|  "heck 0.4.0", | ||||
|  "hex", | ||||
|  "once_cell", | ||||
|  "proc-macro2", | ||||
|  "quote", | ||||
|  "serde", | ||||
|  "serde_json", | ||||
|  "sha2", | ||||
|  "sqlx-core", | ||||
|  "sqlx-rt", | ||||
| @ -4584,13 +4693,13 @@ dependencies = [ | ||||
| 
 | ||||
| [[package]] | ||||
| name = "sqlx-rt" | ||||
| version = "0.5.11" | ||||
| version = "0.6.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "b555e70fbbf84e269ec3858b7a6515bcfe7a166a7cc9c636dd6efd20431678b6" | ||||
| checksum = "804d3f245f894e61b1e6263c84b23ca675d96753b5abfd5cc8597d86806e8024" | ||||
| dependencies = [ | ||||
|  "once_cell", | ||||
|  "tokio", | ||||
|  "tokio-rustls", | ||||
|  "tokio-rustls 0.23.4", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| @ -4683,13 +4792,13 @@ checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "syn" | ||||
| version = "1.0.89" | ||||
| version = "1.0.109" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "ea297be220d52398dcc07ce15a209fce436d361735ac1db700cab3b6cdfb9f54" | ||||
| checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" | ||||
| dependencies = [ | ||||
|  "proc-macro2", | ||||
|  "quote", | ||||
|  "unicode-xid", | ||||
|  "unicode-ident", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| @ -4705,7 +4814,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "264c2549892aa83975386a924ef8d0b8e909674c837d37ea58b4bd8739495c6e" | ||||
| dependencies = [ | ||||
|  "async-trait", | ||||
|  "base64", | ||||
|  "base64 0.13.0", | ||||
|  "bitpacking", | ||||
|  "byteorder", | ||||
|  "census", | ||||
| @ -4994,9 +5103,20 @@ version = "0.22.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6" | ||||
| dependencies = [ | ||||
|  "rustls", | ||||
|  "rustls 0.19.1", | ||||
|  "tokio", | ||||
|  "webpki", | ||||
|  "webpki 0.21.4", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "tokio-rustls" | ||||
| version = "0.23.4" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" | ||||
| dependencies = [ | ||||
|  "rustls 0.20.8", | ||||
|  "tokio", | ||||
|  "webpki 0.22.0", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| @ -5203,6 +5323,12 @@ version = "0.3.7" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "1a01404663e3db436ed2746d9fefef640d868edae3cceb81c3b8d5732fda678f" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "unicode-ident" | ||||
| version = "1.0.8" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "unicode-normalization" | ||||
| version = "0.1.19" | ||||
| @ -5218,12 +5344,6 @@ version = "1.9.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "unicode-xid" | ||||
| version = "0.2.2" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "unicode_categories" | ||||
| version = "0.1.1" | ||||
| @ -5412,13 +5532,32 @@ dependencies = [ | ||||
|  "untrusted", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "webpki" | ||||
| version = "0.22.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "f095d78192e208183081cc07bc5515ef55216397af48b873e5edcd72637fa1bd" | ||||
| dependencies = [ | ||||
|  "ring", | ||||
|  "untrusted", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "webpki-roots" | ||||
| version = "0.21.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "aabe153544e473b775453675851ecc86863d2a81d786d741f6b76778f2a48940" | ||||
| dependencies = [ | ||||
|  "webpki", | ||||
|  "webpki 0.21.4", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "webpki-roots" | ||||
| version = "0.22.6" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "b6c71e40d7d2c34a5106301fb632274ca37242cd0c9d3e64dbece371a40a2d87" | ||||
| dependencies = [ | ||||
|  "webpki 0.22.0", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| @ -5436,6 +5575,16 @@ dependencies = [ | ||||
|  "cc", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "whoami" | ||||
| version = "1.4.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "2c70234412ca409cc04e864e89523cb0fc37f5e1344ebed5a3ebf4192b6b9f68" | ||||
| dependencies = [ | ||||
|  "wasm-bindgen", | ||||
|  "web-sys", | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "winapi" | ||||
| version = "0.3.9" | ||||
|  | ||||
| @ -17,7 +17,7 @@ ron = "0.7.0" | ||||
| tower-http = { version = "0.2.5", features = ["fs"] } | ||||
| log = "0.4.14" | ||||
| env_logger = "0.9.0" | ||||
| sqlx = { version = "0.5.11", features = ["sqlite", "runtime-tokio-rustls"] } | ||||
| sqlx = { version = "0.6.3", features = ["sqlite", "runtime-tokio-rustls"] } | ||||
| itertools = "0.10.3" | ||||
| colour = "0.6.0" | ||||
| futures-util = "0.3.21" | ||||
|  | ||||
							
								
								
									
										1
									
								
								quickpeep_raker/.env
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								quickpeep_raker/.env
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1 @@ | ||||
| DATABASE_URL=${RAKER_DATABASE_URL} | ||||
| @ -34,9 +34,7 @@ bytesize = {version = "1.1.0", features = ["serde"]} | ||||
| chrono = "0.4.19" | ||||
| 
 | ||||
| ### Storage | ||||
| libmdbx = "0.1.1" | ||||
| # Used for FFI. Must match the version in libmdbx. | ||||
| mdbx-sys = "0.11.4-git.20210105" | ||||
| sqlx = { version = "0.6.3", features = ["postgres", "runtime-tokio-rustls", "offline", "json"] } | ||||
| # For compression of emitted packs. 0.11.1+zstd.1.5.2 | ||||
| zstd = "0.11.1" | ||||
| 
 | ||||
| @ -45,6 +43,7 @@ lazy_static = "1.4.0" | ||||
| bytes = "1.1.0" | ||||
| itertools = "0.10.3" | ||||
| ipnetwork = "0.18.0" | ||||
| futures-core = "0.3.28" | ||||
| futures-util = "0.3.21" | ||||
| tokio = { version = "1.17.0", features = ["full"] } | ||||
| anyhow = "1.0.55" | ||||
| @ -91,6 +90,3 @@ metrics = "0.18.1" | ||||
| metrics-exporter-prometheus = { version = "0.9.0", default-features = false, features = ["http-listener"] } | ||||
| metrics-process-promstyle = "0.18.0" | ||||
| bare-metrics-recorder = "0.1.0" | ||||
| 
 | ||||
| [dev-dependencies] | ||||
| tempfile = "3.3.0" | ||||
|  | ||||
							
								
								
									
										5
									
								
								quickpeep_raker/build.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								quickpeep_raker/build.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,5 @@ | ||||
| // generated by `sqlx migrate build-script`
 | ||||
| fn main() { | ||||
|     // trigger recompilation when a new migration is added
 | ||||
|     println!("cargo:rerun-if-changed=migrations"); | ||||
| } | ||||
							
								
								
									
										21
									
								
								quickpeep_raker/devdb.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								quickpeep_raker/devdb.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,21 @@ | ||||
| #!/bin/sh | ||||
| 
 | ||||
| cname=`docker run -d --rm -e POSTGRES_PASSWORD=password -p 127.0.0.10:55432:5432 postgres:15` | ||||
| 
 | ||||
| onstop() { | ||||
|   docker stop $cname | ||||
| } | ||||
| 
 | ||||
| trap onstop EXIT | ||||
| 
 | ||||
| export RAKER_DATABASE_URL=postgres://postgres:password@127.0.0.10:55432/postgres | ||||
| 
 | ||||
| echo "Running migrations" | ||||
| sqlx migrate run | ||||
| echo "Preparing offline mode SQLx data" | ||||
| #cargo sqlx prepare | ||||
| cargo sqlx prepare -- --lib | ||||
| #cargo sqlx prepare --merged -- --all-features --all-targets --lib | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| @ -0,0 +1,42 @@ | ||||
| -- All timestamps are in sec since 2023-01-01. | ||||
| 
 | ||||
| CREATE TYPE rakeintent AS ENUM ('Any', 'Page', 'Feed', 'SiteMap', 'Icon'); | ||||
| 
 | ||||
| CREATE TABLE domains ( | ||||
|     domain_id SERIAL PRIMARY KEY NOT NULL, | ||||
|     name TEXT UNIQUE NOT NULL, | ||||
|     domain_record JSONB NOT NULL | ||||
| ); | ||||
| 
 | ||||
| CREATE TABLE urls ( | ||||
|     url_id SERIAL PRIMARY KEY NOT NULL, | ||||
|     domain_id INTEGER NOT NULL REFERENCES domains(domain_id), | ||||
|     url TEXT NOT NULL, | ||||
|     intent rakeintent NOT NULL, | ||||
|     visited_at_ts INTEGER, | ||||
| 
 | ||||
|     UNIQUE (domain_id, url) | ||||
| ); | ||||
| 
 | ||||
| CREATE TABLE url_queue ( | ||||
|     url_id INTEGER PRIMARY KEY NOT NULL REFERENCES urls(url_id), | ||||
|     rake_after_ts INTEGER NOT NULL | ||||
| ); | ||||
| -- Used for finding things to rake. | ||||
| CREATE INDEX url_queue_rake_after_ts_idx ON url_queue(rake_after_ts); | ||||
| 
 | ||||
| 
 | ||||
| CREATE TABLE active_domain_raffle ( | ||||
|     domain_id INTEGER PRIMARY KEY NOT NULL REFERENCES domains(domain_id), | ||||
|     raffle INTEGER UNIQUE NOT NULL | ||||
| ); | ||||
| 
 | ||||
| 
 | ||||
| CREATE TABLE domain_backoffs ( | ||||
|     domain_id INTEGER PRIMARY KEY NOT NULL REFERENCES domains(domain_id), | ||||
|     backoff_until_ts INTEGER NOT NULL, | ||||
|     backoff_sec INTEGER NOT NULL, | ||||
|     reason TEXT NOT NULL | ||||
| ); | ||||
| -- Used for finding things to rake. | ||||
| CREATE INDEX domain_backoffs_backoff_until_ts_idx ON domain_backoffs(backoff_until_ts); | ||||
							
								
								
									
										3
									
								
								quickpeep_raker/sqlx-data.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								quickpeep_raker/sqlx-data.json
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,3 @@ | ||||
| { | ||||
|   "db": "PostgreSQL" | ||||
| } | ||||
| @ -1,173 +0,0 @@ | ||||
| use clap::Parser; | ||||
| use std::borrow::Cow; | ||||
| 
 | ||||
| use std::fmt::Debug; | ||||
| 
 | ||||
| use env_logger::Env; | ||||
| 
 | ||||
| use anyhow::{bail, Context}; | ||||
| 
 | ||||
| use colour::{dark_yellow_ln, red_ln}; | ||||
| use libmdbx::{Database, TableObject, RO}; | ||||
| 
 | ||||
| use std::path::PathBuf; | ||||
| 
 | ||||
| use quickpeep_raker::config; | ||||
| 
 | ||||
| use quickpeep_raker::storage::mdbx_helper_types::MdbxBare; | ||||
| use quickpeep_raker::storage::records::{ | ||||
|     ActiveDomainRecord, BackingOffDomainRecord, DomainRecord, OnHoldUrlRecord, QueueUrlRecord, | ||||
|     UrlVisitedRecord, | ||||
| }; | ||||
| use quickpeep_raker::storage::{RakerStore, RakerTxn}; | ||||
| 
 | ||||
| /// Seeds a raker's queue with URLs
 | ||||
| #[derive(Clone, Debug, Parser)] | ||||
| pub struct Opts { | ||||
|     #[clap(long = "config")] | ||||
|     config: Option<PathBuf>, | ||||
| 
 | ||||
|     /// Table name
 | ||||
|     table: String, | ||||
| 
 | ||||
|     /// Key name to look up
 | ||||
|     key_name: String, | ||||
| 
 | ||||
|     /// Search for any prefix, not an exact match.
 | ||||
|     #[clap(long = "prefix", short = 'p')] | ||||
|     prefix: bool, | ||||
| } | ||||
| 
 | ||||
| #[tokio::main] | ||||
| pub async fn main() -> anyhow::Result<()> { | ||||
|     env_logger::Builder::from_env(Env::default().default_filter_or("info,quickpeep=debug")).init(); | ||||
| 
 | ||||
|     let opts: Opts = Opts::parse(); | ||||
| 
 | ||||
|     let config_path = opts | ||||
|         .config | ||||
|         .unwrap_or_else(|| PathBuf::from("quickpeep.ron")); | ||||
|     let config = config::RakerConfig::load(&config_path).context("Failed to load config")?; | ||||
| 
 | ||||
|     if !config.raker.workbench_dir.exists() { | ||||
|         bail!( | ||||
|             "Workbench directory ({:?}) doesn't exist.", | ||||
|             config.raker.workbench_dir | ||||
|         ); | ||||
|     } | ||||
|     if !config.seed_dir.exists() { | ||||
|         bail!("Seed directory ({:?}) doesn't exist.", config.seed_dir); | ||||
|     } | ||||
| 
 | ||||
|     let store = RakerStore::open(&config.raker.workbench_dir.join("raker.mdbx"))?; | ||||
| 
 | ||||
|     let txn = store.ro_txn()?; | ||||
|     match opts.table.as_ref() { | ||||
|         "queue_urls" | "urls_queue" => { | ||||
|             inspect::<MdbxBare<QueueUrlRecord>>( | ||||
|                 opts.key_name.as_ref(), | ||||
|                 opts.prefix, | ||||
|                 &txn.mdbx.borrow_dbs().queue_urls, | ||||
|                 &txn, | ||||
|             )?; | ||||
|         } | ||||
|         "active_domains" => { | ||||
|             inspect::<MdbxBare<ActiveDomainRecord>>( | ||||
|                 opts.key_name.as_ref(), | ||||
|                 opts.prefix, | ||||
|                 &txn.mdbx.borrow_dbs().active_domains, | ||||
|                 &txn, | ||||
|             )?; | ||||
|         } | ||||
|         "active_domains_raffle" => { | ||||
|             inspect::<MdbxBare<String>>( | ||||
|                 opts.key_name.as_ref(), | ||||
|                 opts.prefix, | ||||
|                 &txn.mdbx.borrow_dbs().active_domain_raffle, | ||||
|                 &txn, | ||||
|             )?; | ||||
|         } | ||||
|         "backing_off_reinstatements" => { | ||||
|             inspect::<MdbxBare<String>>( | ||||
|                 opts.key_name.as_ref(), | ||||
|                 opts.prefix, | ||||
|                 &txn.mdbx.borrow_dbs().backing_off_reinstatements, | ||||
|                 &txn, | ||||
|             )?; | ||||
|         } | ||||
|         "backing_off_domains" => { | ||||
|             inspect::<MdbxBare<BackingOffDomainRecord>>( | ||||
|                 opts.key_name.as_ref(), | ||||
|                 opts.prefix, | ||||
|                 &txn.mdbx.borrow_dbs().backing_off_domains, | ||||
|                 &txn, | ||||
|             )?; | ||||
|         } | ||||
|         "visited_urls" => { | ||||
|             inspect::<MdbxBare<UrlVisitedRecord>>( | ||||
|                 opts.key_name.as_ref(), | ||||
|                 opts.prefix, | ||||
|                 &txn.mdbx.borrow_dbs().visited_urls, | ||||
|                 &txn, | ||||
|             )?; | ||||
|         } | ||||
|         "domains" => { | ||||
|             inspect::<MdbxBare<DomainRecord>>( | ||||
|                 opts.key_name.as_ref(), | ||||
|                 opts.prefix, | ||||
|                 &txn.mdbx.borrow_dbs().domains, | ||||
|                 &txn, | ||||
|             )?; | ||||
|         } | ||||
|         "urls_on_hold" => { | ||||
|             inspect::<MdbxBare<OnHoldUrlRecord>>( | ||||
|                 opts.key_name.as_ref(), | ||||
|                 opts.prefix, | ||||
|                 &txn.mdbx.borrow_dbs().urls_on_hold, | ||||
|                 &txn, | ||||
|             )?; | ||||
|         } | ||||
|         other => { | ||||
|             dark_yellow_ln!("Unknown database {:?}", other); | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     Ok(()) | ||||
| } | ||||
| 
 | ||||
| trait Inspectable { | ||||
|     fn inspect(&self) -> String; | ||||
| } | ||||
| 
 | ||||
| impl<T: Debug> Inspectable for MdbxBare<T> { | ||||
|     fn inspect(&self) -> String { | ||||
|         format!("{:?}", &self.0) | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| fn inspect<'a, IV: Inspectable + TableObject<'a> + 'static>( | ||||
|     key: &str, | ||||
|     prefix: bool, | ||||
|     database: &Database<'a>, | ||||
|     txn: &'a RakerTxn<'a, RO>, | ||||
| ) -> anyhow::Result<()> { | ||||
|     if prefix { | ||||
|         let mut cur = txn.mdbx_txn.cursor(database)?; | ||||
|         for item in cur.iter_from::<Cow<'_, [u8]>, IV>(key.as_bytes()) { | ||||
|             let (k, v) = item?; | ||||
|             if !k.starts_with(key.as_bytes()) { | ||||
|                 break; | ||||
|             } | ||||
|             println!("• {}", std::str::from_utf8(&k).unwrap_or("<Not UTF-8>")); | ||||
|             println!("  = {}", v.inspect()); | ||||
|         } | ||||
|     } else { | ||||
|         if let Some(entry) = txn.mdbx_txn.get::<IV>(database, key.as_bytes())? { | ||||
|             println!("{}", entry.inspect()); | ||||
|         } else { | ||||
|             red_ln!("no value"); | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     Ok(()) | ||||
| } | ||||
| @ -1,13 +1,11 @@ | ||||
| use anyhow::{bail, Context}; | ||||
| use anyhow::Context; | ||||
| use clap::Parser; | ||||
| use colour::{blue, yellow_ln}; | ||||
| use env_logger::Env; | ||||
| use itertools::Itertools; | ||||
| use libmdbx::Database; | ||||
| use quickpeep_raker::config; | ||||
| use quickpeep_raker::storage::mdbx_helper_types::{MdbxBare, MdbxString}; | ||||
| use quickpeep_raker::storage::records::OnHoldUrlRecord; | ||||
| use quickpeep_raker::storage::RakerStore; | ||||
| use sqlx::query; | ||||
| use std::collections::HashMap; | ||||
| use std::path::PathBuf; | ||||
| 
 | ||||
| @ -16,12 +14,9 @@ use std::path::PathBuf; | ||||
| pub struct Opts { | ||||
|     #[clap(long = "config")] | ||||
|     config: Option<PathBuf>, | ||||
| 
 | ||||
|     /// Whether to show URLs instead of domains
 | ||||
|     #[clap(long = "urls")] | ||||
|     urls: bool, | ||||
| } | ||||
| 
 | ||||
| // TODO re-introduce refcounting
 | ||||
| #[tokio::main] | ||||
| pub async fn main() -> anyhow::Result<()> { | ||||
|     env_logger::Builder::from_env( | ||||
| @ -36,19 +31,11 @@ pub async fn main() -> anyhow::Result<()> { | ||||
|         .unwrap_or_else(|| PathBuf::from("quickpeep.ron")); | ||||
|     let config = config::RakerConfig::load(&config_path).context("Failed to load config")?; | ||||
| 
 | ||||
|     if !config.raker.workbench_dir.exists() { | ||||
|         bail!( | ||||
|             "Workbench directory ({:?}) doesn't exist.", | ||||
|             config.raker.workbench_dir | ||||
|         ); | ||||
|     } | ||||
|     let store = RakerStore::open(&config.raker.database_uri).await?; | ||||
| 
 | ||||
|     let store = RakerStore::open(&config.raker.workbench_dir.join("raker.mdbx"))?; | ||||
|     let is_urls = opts.urls; | ||||
|     let counts = count_on_hold(&store).await?; | ||||
| 
 | ||||
|     let counts = tokio::task::spawn_blocking(move || -> anyhow::Result<Vec<_>> { | ||||
|         let counts = count_on_hold(&store, is_urls)?; | ||||
| 
 | ||||
|         let sorted_counts = counts | ||||
|             .into_iter() | ||||
|             .map(|(string, count)| (count, string)) | ||||
| @ -60,12 +47,8 @@ pub async fn main() -> anyhow::Result<()> { | ||||
|     }) | ||||
|     .await??; | ||||
| 
 | ||||
|     blue!("№ Refs   "); | ||||
|     if opts.urls { | ||||
|         yellow_ln!("URL"); | ||||
|     } else { | ||||
|         yellow_ln!("Domain"); | ||||
|     } | ||||
|     blue!("№ URLs   "); | ||||
|     yellow_ln!("Domain"); | ||||
|     for (count, string) in counts { | ||||
|         println!("{:>6}   {}", count, string); | ||||
|     } | ||||
| @ -73,25 +56,27 @@ pub async fn main() -> anyhow::Result<()> { | ||||
|     Ok(()) | ||||
| } | ||||
| 
 | ||||
| pub fn count_on_hold(store: &RakerStore, urls: bool) -> anyhow::Result<HashMap<String, u32>> { | ||||
|     let mut map: HashMap<String, u32> = Default::default(); | ||||
| pub async fn count_on_hold(store: &RakerStore) -> anyhow::Result<HashMap<String, u32>> { | ||||
|     store | ||||
|         .ro_txn(move |txn| { | ||||
|             Box::pin(async move { | ||||
|                 let rows = query!( | ||||
|                     r#" | ||||
|             SELECT d.name AS "domain_name", COUNT(1) AS "url_count" FROM urls u | ||||
|             LEFT JOIN url_queue q ON u.url_id = q.url_id | ||||
|             JOIN domains d USING (domain_id) | ||||
|             WHERE u.visited_at_ts IS NULL AND q.url_id IS NULL | ||||
|             GROUP BY d.name | ||||
|         "#
 | ||||
|                 ) | ||||
|                 .fetch_all(&mut *txn.txn) | ||||
|                 .await?; | ||||
| 
 | ||||
|     let txn = store.ro_txn()?; | ||||
|     let urls_on_hold: &Database = &txn.mdbx.borrow_dbs().urls_on_hold; | ||||
| 
 | ||||
|     let mut cur = txn.mdbx_txn.cursor(urls_on_hold)?; | ||||
| 
 | ||||
|     for row in cur.iter_start::<MdbxString, MdbxBare<OnHoldUrlRecord>>() { | ||||
|         let (domain_then_url, record) = row?; | ||||
|         let mut split = domain_then_url.0.as_ref().split('\n'); | ||||
|         if urls { | ||||
|             // Skip one
 | ||||
|             split.next(); | ||||
|         } | ||||
|         let piece = split.next().context("Missing piece")?; | ||||
|         let count = map.entry(piece.to_owned()).or_insert(0); | ||||
|         *count += record.0.refs as u32; | ||||
|     } | ||||
| 
 | ||||
|     Ok(map) | ||||
|                 Ok(rows | ||||
|                     .into_iter() | ||||
|                     .map(|row| (row.domain_name, row.url_count.unwrap_or(0) as u32)) | ||||
|                     .collect::<HashMap<String, u32>>()) | ||||
|             }) | ||||
|         }) | ||||
|         .await | ||||
| } | ||||
|  | ||||
| @ -3,8 +3,7 @@ use clap::Parser; | ||||
| use env_logger::Env; | ||||
| 
 | ||||
| use adblock::lists::RuleTypes; | ||||
| use anyhow::{anyhow, bail, ensure, Context}; | ||||
| use chrono::Utc; | ||||
| use anyhow::{ensure, Context}; | ||||
| use log::{debug, error, info, warn}; | ||||
| use lru::LruCache; | ||||
| use metrics_exporter_prometheus::PrometheusBuilder; | ||||
| @ -14,10 +13,10 @@ use signal_hook::consts::{SIGINT, SIGTERM}; | ||||
| use signal_hook::iterator::Signals; | ||||
| use std::path::PathBuf; | ||||
| use std::sync::atomic::{AtomicBool, Ordering}; | ||||
| use std::sync::{Arc, Mutex, RwLock}; | ||||
| use std::sync::{Arc, RwLock}; | ||||
| use std::time::{Duration, Instant, SystemTime}; | ||||
| use tokio::fs::File; | ||||
| use tokio::sync::{mpsc, oneshot, Notify, Semaphore}; | ||||
| use tokio::sync::{mpsc, oneshot, Mutex, Notify, Semaphore}; | ||||
| use tokio::time::MissedTickBehavior; | ||||
| 
 | ||||
| use quickpeep_raker::config; | ||||
| @ -32,7 +31,6 @@ use quickpeep_structs::rake_entries::{ | ||||
|     AnalysisAntifeatures, SCHEMA_RAKED_ICONS, SCHEMA_RAKED_PAGES, SCHEMA_RAKED_REFERENCES, | ||||
|     SCHEMA_RAKED_REJECTIONS, | ||||
| }; | ||||
| use quickpeep_utils::dates::date_to_quickpeep_days; | ||||
| 
 | ||||
| /// The ordering is slightly important on these: more specific things should come first.
 | ||||
| /// This means they filter out the troublesome elements before the broader filters do.
 | ||||
| @ -78,13 +76,6 @@ pub async fn main() -> anyhow::Result<()> { | ||||
|         .unwrap_or_else(|| PathBuf::from("quickpeep.ron")); | ||||
|     let config = config::RakerConfig::load(&config_path).context("Failed to load config")?; | ||||
| 
 | ||||
|     if !config.raker.workbench_dir.exists() { | ||||
|         bail!( | ||||
|             "Workbench directory ({:?}) doesn't exist.", | ||||
|             config.raker.workbench_dir | ||||
|         ); | ||||
|     } | ||||
| 
 | ||||
|     let mut header_map = HeaderMap::new(); | ||||
|     header_map.insert(USER_AGENT, HeaderValue::from_static(RAKER_USER_AGENT)); | ||||
| 
 | ||||
| @ -106,7 +97,7 @@ pub async fn main() -> anyhow::Result<()> { | ||||
|         .redirect(Policy::limited(5)) | ||||
|         .build()?; | ||||
| 
 | ||||
|     let store = RakerStore::open(&config.raker.workbench_dir.join("raker.mdbx"))?; | ||||
|     let store = RakerStore::open(&config.raker.database_uri).await?; | ||||
| 
 | ||||
|     let mut adblock_engines = Vec::new(); | ||||
|     for (antifeature, name) in &ADBLOCK_FILTER_PATHS { | ||||
| @ -277,12 +268,8 @@ pub async fn main() -> anyhow::Result<()> { | ||||
| 
 | ||||
|     // Reinstate old backoffs and re-rakable URLs
 | ||||
|     store | ||||
|         .async_rw_txn(|txn| { | ||||
|             let today = date_to_quickpeep_days(&Utc::today())?; | ||||
|             txn.reinstate_backoffs(SystemTime::now())?; | ||||
|             txn.reinstate_rerakables(today)?; | ||||
|             txn.commit()?; | ||||
|             Ok(()) | ||||
|         .rw_txn(move |mut txn| { | ||||
|             Box::pin(async move { txn.reinstate_backoffs(SystemTime::now()).await }) | ||||
|         }) | ||||
|         .await?; | ||||
| 
 | ||||
| @ -295,8 +282,9 @@ pub async fn main() -> anyhow::Result<()> { | ||||
|             loop { | ||||
|                 tokio::select! { | ||||
|                     _ = interval.tick() => { | ||||
|                         let txn = store.ro_txn()?; | ||||
|                         txn.emit_datastore_metrics()?; | ||||
|                         store.ro_txn(move |mut txn| Box::pin(async move { | ||||
|                             txn.emit_datastore_metrics().await | ||||
|                         })).await?; | ||||
|                         metrics_process_promstyle::emit_now()?; | ||||
|                     } | ||||
|                     _ = &mut dsmu_cancel_rx => { | ||||
| @ -341,14 +329,19 @@ pub async fn main() -> anyhow::Result<()> { | ||||
| 
 | ||||
| async fn acquire_active_domain(task_context: &TaskContext) -> anyhow::Result<Option<String>> { | ||||
|     // Acquire a domain for the task to run against
 | ||||
|     let domain = { | ||||
|         let txn = task_context.store.ro_txn()?; | ||||
|         // TODO: don't clone teh Arc here — conv to ref.
 | ||||
|         txn.acquire_random_active_domain(task_context.busy_domains.clone())? | ||||
|     }; | ||||
|     let busy_domains = task_context.busy_domains.clone(); | ||||
|     let domain = task_context | ||||
|         .store | ||||
|         .ro_txn(move |mut txn| { | ||||
|             Box::pin(async move { | ||||
|                 // TODO: don't clone teh Arc here — conv to ref.
 | ||||
|                 txn.acquire_random_active_domain(busy_domains).await | ||||
|             }) | ||||
|         }) | ||||
|         .await?; | ||||
| 
 | ||||
|     match domain { | ||||
|         RandomActiveDomainAcquisition::GotOne { domain, record: _ } => Ok(Some(domain)), | ||||
|         RandomActiveDomainAcquisition::GotOne { domain } => Ok(Some(domain)), | ||||
|         RandomActiveDomainAcquisition::AllBusy => Ok(None), | ||||
|         RandomActiveDomainAcquisition::NoneLeft => Ok(None), | ||||
|     } | ||||
| @ -370,7 +363,7 @@ async fn orchestrator(task_context: TaskContext, semaphore: Arc<Semaphore>) -> a | ||||
|         if domain_to_process.is_none() && semaphore.available_permits() == max_permits { | ||||
|             // There's nothing to do and nothing is being processed.
 | ||||
|             ensure!( | ||||
|                 task_context.busy_domains.lock().unwrap().is_empty(), | ||||
|                 task_context.busy_domains.lock().await.is_empty(), | ||||
|                 "Shutting down orchestrator but set of busy domains is not empty." | ||||
|             ); | ||||
|         } | ||||
| @ -378,13 +371,9 @@ async fn orchestrator(task_context: TaskContext, semaphore: Arc<Semaphore>) -> a | ||||
|         tokio::select! { | ||||
|             _ = tokio::time::sleep_until(next_reinstate.into()) => { | ||||
|                 // Reinstate backoffs and rerakables
 | ||||
|                 if let Err(err) = task_context.store.async_rw_txn(|txn| { | ||||
|                     txn.reinstate_backoffs(SystemTime::now())?; | ||||
|                     let today = date_to_quickpeep_days(&Utc::today())?; | ||||
|                     txn.reinstate_rerakables(today)?; | ||||
|                     txn.commit()?; | ||||
|                     Ok(()) | ||||
|                 }).await { | ||||
|                 if let Err(err) = task_context.store.rw_txn(move |mut txn| Box::pin(async move { | ||||
|                     txn.reinstate_backoffs(SystemTime::now()).await | ||||
|                 })).await { | ||||
|                     error!("Error performing periodic reinstatements: {err:?}"); | ||||
|                 } | ||||
| 
 | ||||
| @ -402,8 +391,7 @@ async fn orchestrator(task_context: TaskContext, semaphore: Arc<Semaphore>) -> a | ||||
|                     } | ||||
|                     ensure!( | ||||
|                         task_context.busy_domains | ||||
|                             .lock() | ||||
|                             .map_err(|_| anyhow!("busy domains set poisoned"))? | ||||
|                             .lock().await | ||||
|                             .remove(&domain), | ||||
|                         "Our domain was not busy after processing!" | ||||
|                     ); | ||||
|  | ||||
| @ -8,6 +8,8 @@ use anyhow::{bail, Context}; | ||||
| use colour::{dark_green_ln, dark_red_ln, dark_yellow, green, red, yellow_ln}; | ||||
| use reqwest::{Client, Url}; | ||||
| use std::path::PathBuf; | ||||
| use std::sync::atomic::{AtomicU32, Ordering}; | ||||
| use std::sync::Arc; | ||||
| use tokio::sync::mpsc; | ||||
| use tokio::sync::mpsc::Receiver; | ||||
| 
 | ||||
| @ -42,17 +44,11 @@ pub async fn main() -> anyhow::Result<()> { | ||||
|         .unwrap_or_else(|| PathBuf::from("quickpeep.ron")); | ||||
|     let config = RakerConfig::load(&config_path).context("Failed to load config")?; | ||||
| 
 | ||||
|     if !config.raker.workbench_dir.exists() { | ||||
|         bail!( | ||||
|             "Workbench directory ({:?}) doesn't exist.", | ||||
|             config.raker.workbench_dir | ||||
|         ); | ||||
|     } | ||||
|     if !config.seed_dir.exists() { | ||||
|         bail!("Seed directory ({:?}) doesn't exist.", config.seed_dir); | ||||
|     } | ||||
| 
 | ||||
|     let store = RakerStore::open(&config.raker.workbench_dir.join("raker.mdbx"))?; | ||||
|     let store = RakerStore::open(&config.raker.database_uri).await?; | ||||
| 
 | ||||
|     import_seeds(store.clone(), &config).await?; | ||||
| 
 | ||||
| @ -60,7 +56,9 @@ pub async fn main() -> anyhow::Result<()> { | ||||
| 
 | ||||
|     eprintln!("... re-applying seeds and weeds to on-hold URLs ..."); | ||||
|     store | ||||
|         .async_rw_txn(|txn| maintenance::reapply_seeds_and_weeds_to_on_hold_urls(txn)) | ||||
|         .rw_txn(move |mut txn| { | ||||
|             Box::pin(async move { maintenance::reapply_seeds_and_weeds_to_on_hold_urls(txn).await }) | ||||
|         }) | ||||
|         .await?; | ||||
|     eprintln!("... done!"); | ||||
| 
 | ||||
| @ -137,18 +135,23 @@ async fn importer( | ||||
| ) -> anyhow::Result<SeedImportStats> { | ||||
|     let mut buf = Vec::with_capacity(BATCH_SIZE); | ||||
|     let mut stats = SeedImportStats::default(); | ||||
|     let client = Client::new(); | ||||
|     let client = Arc::new(Client::new()); | ||||
|     while let Some(seed) = recv.recv().await { | ||||
|         buf.push(seed); | ||||
| 
 | ||||
|         if buf.len() == BATCH_SIZE { | ||||
|             import_and_flush_batch_seeds_or_weeds( | ||||
|                 &store, &mut buf, &mut stats, &client, !are_weeds, | ||||
|             stats = import_and_flush_batch_seeds_or_weeds( | ||||
|                 &store, | ||||
|                 buf, | ||||
|                 stats, | ||||
|                 client.clone(), | ||||
|                 !are_weeds, | ||||
|             ) | ||||
|             .await?; | ||||
|             buf = Vec::new(); | ||||
|         } | ||||
|     } | ||||
|     import_and_flush_batch_seeds_or_weeds(&store, &mut buf, &mut stats, &client, !are_weeds) | ||||
|     stats = import_and_flush_batch_seeds_or_weeds(&store, buf, stats, client.clone(), !are_weeds) | ||||
|         .await?; | ||||
| 
 | ||||
|     Ok(stats) | ||||
| @ -156,83 +159,106 @@ async fn importer( | ||||
| 
 | ||||
| async fn import_and_flush_batch_seeds_or_weeds( | ||||
|     store: &RakerStore, | ||||
|     buf: &mut Vec<Seed>, | ||||
|     stats: &mut SeedImportStats, | ||||
|     client: &Client, | ||||
|     mut buf: Vec<Seed>, | ||||
|     mut stats: SeedImportStats, | ||||
|     client: Arc<Client>, | ||||
|     is_seed: bool, | ||||
| ) -> anyhow::Result<()> { | ||||
|     let txn = store.rw_txn()?; | ||||
|     for seed in buf.drain(..) { | ||||
|         let as_url = Url::parse(seed.url.as_str()) | ||||
|             .with_context(|| format!("Failed to parse {:?} as URL", seed.url))?; | ||||
|         let domain = get_reduced_domain(&as_url) | ||||
|             .with_context(|| format!("No domain in seed URL '{as_url}'!"))?; | ||||
| ) -> anyhow::Result<SeedImportStats> { | ||||
|     store | ||||
|         .rw_txn(move |mut txn| { | ||||
|             Box::pin(async move { | ||||
|                 for seed in buf.drain(..) { | ||||
|                     let as_url = Url::parse(seed.url.as_str()) | ||||
|                         .with_context(|| format!("Failed to parse {:?} as URL", seed.url))?; | ||||
|                     let domain = get_reduced_domain(&as_url) | ||||
|                         .with_context(|| format!("No domain in seed URL '{as_url}'!"))?; | ||||
| 
 | ||||
|         let domain_record = txn.get_domain_record(domain.borrow())?; | ||||
|         let is_domain_new = domain_record.is_none(); | ||||
|         let mut domain_record = domain_record.unwrap_or_default(); | ||||
|         if is_domain_new { | ||||
|             stats.new_domains += 1; | ||||
|         } | ||||
|         let mut dirty = is_domain_new; | ||||
|                     let domain_record = txn.get_domain_record(domain.borrow()).await?; | ||||
|                     let is_domain_new = domain_record.is_none(); | ||||
|                     let mut domain_record = domain_record.unwrap_or_default(); | ||||
|                     if is_domain_new { | ||||
|                         stats.new_domains += 1; | ||||
|                     } | ||||
|                     let mut dirty = is_domain_new; | ||||
| 
 | ||||
|         // Register the domain. This is a no-op if it's already active or backing off.
 | ||||
|         txn.insert_active_domain_with_new_raffle_ticket(domain.clone().into_owned())?; | ||||
|                     // Register the domain. This is a no-op if it's already active or backing off.
 | ||||
|                     txn.insert_active_domain_with_new_raffle_ticket(domain.clone().into_owned()) | ||||
|                         .await?; | ||||
| 
 | ||||
|         let url_like = match &seed.url { | ||||
|             UrlOrUrlPattern::Url(url_str) => { | ||||
|                 let url = Url::parse(url_str.as_str())?; | ||||
|                 if is_seed { | ||||
|                     if txn.enqueue_url(url.as_str(), None, RakeIntent::Any)? { | ||||
|                         stats.new_urls += 1; | ||||
|                     } else { | ||||
|                         stats.already_present_urls += 1; | ||||
|                     let url_like = match &seed.url { | ||||
|                         UrlOrUrlPattern::Url(url_str) => { | ||||
|                             let url = Url::parse(url_str.as_str())?; | ||||
|                             if is_seed { | ||||
|                                 if txn | ||||
|                                     .enqueue_url(url.as_str(), None, RakeIntent::Any, false) | ||||
|                                     .await? | ||||
|                                 { | ||||
|                                     stats.new_urls += 1; | ||||
|                                 } else { | ||||
|                                     stats.already_present_urls += 1; | ||||
|                                 } | ||||
|                             } | ||||
| 
 | ||||
|                             // Seed/weed with empty prefix
 | ||||
|                             dirty |= domain_record | ||||
|                                 .rakeable_path_prefixes | ||||
|                                 .insert(String::new(), is_seed) | ||||
|                                 != Some(is_seed); | ||||
| 
 | ||||
|                             url | ||||
|                         } | ||||
|                         UrlOrUrlPattern::UrlPrefix(prefix) => { | ||||
|                             let prefix_as_url = Url::parse(prefix.as_str())?; | ||||
|                             if is_seed { | ||||
|                                 if txn | ||||
|                                     .enqueue_url( | ||||
|                                         prefix_as_url.as_str(), | ||||
|                                         None, | ||||
|                                         RakeIntent::Any, | ||||
|                                         false, | ||||
|                                     ) | ||||
|                                     .await? | ||||
|                                 { | ||||
|                                     stats.new_urls += 1; | ||||
|                                 } else { | ||||
|                                     stats.already_present_urls += 1; | ||||
|                                 } | ||||
|                             } | ||||
| 
 | ||||
|                             dirty |= domain_record | ||||
|                                 .rakeable_path_prefixes | ||||
|                                 .insert(prefix_as_url.path().to_string(), is_seed) | ||||
|                                 != Some(is_seed); | ||||
| 
 | ||||
|                             prefix_as_url | ||||
|                         } | ||||
|                     }; | ||||
| 
 | ||||
|                     if dirty { | ||||
|                         txn.put_domain_record(domain.borrow(), domain_record) | ||||
|                             .await?; | ||||
|                     } | ||||
| 
 | ||||
|                     if is_seed { | ||||
|                         // look at robots.txt and discover sitemaps!
 | ||||
|                         if let Some(robots_txt) = get_robots_txt_for(&url_like, &client).await? { | ||||
|                             for sitemap in robots_txt.sitemaps { | ||||
|                                 if SUPPORTED_SCHEMES.contains(&sitemap.url.scheme()) { | ||||
|                                     txn.enqueue_url( | ||||
|                                         sitemap.url.as_str(), | ||||
|                                         None, | ||||
|                                         RakeIntent::SiteMap, | ||||
|                                         false, | ||||
|                                     ) | ||||
|                                     .await?; | ||||
|                                     stats.new_sitemaps += 1; | ||||
|                                 } | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
| 
 | ||||
|                 // Seed/weed with empty prefix
 | ||||
|                 dirty |= domain_record | ||||
|                     .rakeable_path_prefixes | ||||
|                     .insert(String::new(), is_seed) | ||||
|                     != Some(is_seed); | ||||
| 
 | ||||
|                 url | ||||
|             } | ||||
|             UrlOrUrlPattern::UrlPrefix(prefix) => { | ||||
|                 let prefix_as_url = Url::parse(prefix.as_str())?; | ||||
|                 if is_seed { | ||||
|                     if txn.enqueue_url(prefix_as_url.as_str(), None, RakeIntent::Any)? { | ||||
|                         stats.new_urls += 1; | ||||
|                     } else { | ||||
|                         stats.already_present_urls += 1; | ||||
|                     } | ||||
|                 } | ||||
| 
 | ||||
|                 dirty |= domain_record | ||||
|                     .rakeable_path_prefixes | ||||
|                     .insert(prefix_as_url.path().to_string(), is_seed) | ||||
|                     != Some(is_seed); | ||||
| 
 | ||||
|                 prefix_as_url | ||||
|             } | ||||
|         }; | ||||
| 
 | ||||
|         if dirty { | ||||
|             txn.put_domain_record(domain.borrow(), domain_record)?; | ||||
|         } | ||||
| 
 | ||||
|         if is_seed { | ||||
|             // look at robots.txt and discover sitemaps!
 | ||||
|             if let Some(robots_txt) = get_robots_txt_for(&url_like, &client).await? { | ||||
|                 for sitemap in robots_txt.sitemaps { | ||||
|                     if SUPPORTED_SCHEMES.contains(&sitemap.url.scheme()) { | ||||
|                         txn.enqueue_url(sitemap.url.as_str(), None, RakeIntent::SiteMap)?; | ||||
|                         stats.new_sitemaps += 1; | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|     txn.commit()?; | ||||
|     Ok(()) | ||||
|                 Ok(stats) | ||||
|             }) | ||||
|         }) | ||||
|         .await | ||||
| } | ||||
|  | ||||
| @ -19,8 +19,9 @@ pub struct RakerOnlyConfig { | ||||
|     /// Path to data files
 | ||||
|     pub data_dir: PathBuf, | ||||
| 
 | ||||
|     /// Path to the raker's workbench (queue etc)
 | ||||
|     pub workbench_dir: PathBuf, | ||||
|     /// URI to connect to Postgres.
 | ||||
|     /// e.g. `postgres://user:password@host:port/dbname`
 | ||||
|     pub database_uri: String, | ||||
| 
 | ||||
|     /// Directory where new rake packs will be emitted
 | ||||
|     pub emit_dir: PathBuf, | ||||
| @ -36,17 +37,19 @@ pub struct RakerOnlyConfig { | ||||
| pub struct RerakeTimings { | ||||
|     /// How long, in days, between re-rakes of the same page?
 | ||||
|     /// Suggested: 300
 | ||||
|     pub page: u16, | ||||
|     pub page: i32, | ||||
| 
 | ||||
|     /// How long, in days, between re-rakes of feeds?
 | ||||
|     /// Suggested: 10
 | ||||
|     pub feed: u16, | ||||
|     pub feed: i32, | ||||
| 
 | ||||
|     /// How long, in days, between re-rakes of icons?
 | ||||
|     /// Suggested: 365
 | ||||
|     pub icon: u16, | ||||
|     pub icon: i32, | ||||
| } | ||||
| 
 | ||||
| pub const DAY_SEC: i32 = 86400; | ||||
| 
 | ||||
| impl RakerConfig { | ||||
|     /// Loads a config at the specified path.
 | ||||
|     /// Will resolve all the paths in the RakerConfig for you.
 | ||||
| @ -57,7 +60,6 @@ impl RakerConfig { | ||||
| 
 | ||||
|         raker_config.raker.data_dir = config_dir.join(raker_config.raker.data_dir); | ||||
|         raker_config.seed_dir = config_dir.join(raker_config.seed_dir); | ||||
|         raker_config.raker.workbench_dir = config_dir.join(raker_config.raker.workbench_dir); | ||||
|         raker_config.raker.emit_dir = config_dir.join(raker_config.raker.emit_dir); | ||||
| 
 | ||||
|         Ok(raker_config) | ||||
|  | ||||
| @ -126,7 +126,8 @@ impl Display for PermanentFailure { | ||||
| 
 | ||||
| impl Error for PermanentFailure {} | ||||
| 
 | ||||
| #[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] | ||||
| #[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq, sqlx::Type)] | ||||
| // supposedly we need this, but doesn't seem to work ... #[sqlx(postgres(oid = 16386))]
 | ||||
| pub enum RakeIntent { | ||||
|     Any, | ||||
|     Page, | ||||
| @ -140,11 +141,11 @@ impl FromStr for RakeIntent { | ||||
| 
 | ||||
|     fn from_str(s: &str) -> Result<Self, Self::Err> { | ||||
|         Ok(match s.to_lowercase().as_ref() { | ||||
|             "any" => RakeIntent::Any, | ||||
|             "page" => RakeIntent::Page, | ||||
|             "feed" => RakeIntent::Feed, | ||||
|             "sitemap" => RakeIntent::SiteMap, | ||||
|             "icon" => RakeIntent::Icon, | ||||
|             "Any" => RakeIntent::Any, | ||||
|             "Page" => RakeIntent::Page, | ||||
|             "Feed" => RakeIntent::Feed, | ||||
|             "SiteMap" => RakeIntent::SiteMap, | ||||
|             "Icon" => RakeIntent::Icon, | ||||
|             other => { | ||||
|                 bail!("Unrecognised intent: {:?}", other) | ||||
|             } | ||||
| @ -152,6 +153,18 @@ impl FromStr for RakeIntent { | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| impl Into<&'static str> for RakeIntent { | ||||
|     fn into(self) -> &'static str { | ||||
|         match self { | ||||
|             RakeIntent::Any => "Any", | ||||
|             RakeIntent::Page => "Page", | ||||
|             RakeIntent::Feed => "Feed", | ||||
|             RakeIntent::SiteMap => "SiteMap", | ||||
|             RakeIntent::Icon => "Icon", | ||||
|         } | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| impl From<ReferenceKind> for RakeIntent { | ||||
|     fn from(kind: ReferenceKind) -> Self { | ||||
|         match kind { | ||||
|  | ||||
| @ -1,13 +1,12 @@ | ||||
| use crate::config::RerakeTimings; | ||||
| use crate::config::{RerakeTimings, DAY_SEC}; | ||||
| use crate::raking::references::{clean_url, references_from_urlrakes, SUPPORTED_SCHEMES}; | ||||
| use crate::raking::{ | ||||
|     get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent, | ||||
|     RakeOutcome, Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason, | ||||
| }; | ||||
| use crate::storage::records::{DomainRecord, UrlVisitedRecord}; | ||||
| use crate::storage::records::DomainRecord; | ||||
| use crate::storage::RakerStore; | ||||
| use anyhow::{anyhow, Context}; | ||||
| use chrono::Utc; | ||||
| use cylon::Cylon; | ||||
| use log::{debug, warn}; | ||||
| use lru::LruCache; | ||||
| @ -15,16 +14,16 @@ use metrics::increment_counter; | ||||
| use quickpeep_structs::rake_entries::{ | ||||
|     IconEntry, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind, | ||||
| }; | ||||
| use quickpeep_utils::dates::date_to_quickpeep_days; | ||||
| use quickpeep_utils::dates::QUICKPEEP_EPOCH_ST; | ||||
| use quickpeep_utils::urls::get_reduced_domain; | ||||
| use reqwest::{Client, Url}; | ||||
| use std::borrow::{Borrow, Cow}; | ||||
| use std::collections::HashSet; | ||||
| use std::sync::atomic::{AtomicBool, Ordering}; | ||||
| use std::sync::{Arc, Mutex as StdMutex, RwLock}; | ||||
| use std::time::Duration; | ||||
| use std::sync::{Arc, RwLock}; | ||||
| use std::time::{Duration, SystemTime}; | ||||
| use tokio::sync::mpsc::Sender; | ||||
| use tokio::sync::{Notify, Semaphore}; | ||||
| use tokio::sync::{Mutex, Notify, Semaphore}; | ||||
| use tokio::time::Instant; | ||||
| 
 | ||||
| /// A crawl delay that is greater than 61 seconds will cause the domain to lose its place in the
 | ||||
| @ -64,7 +63,7 @@ pub struct TaskContext { | ||||
|     pub raker: Arc<Raker>, | ||||
| 
 | ||||
|     /// Busy domains (that are being processed by other tasks)
 | ||||
|     pub busy_domains: Arc<StdMutex<HashSet<String>>>, | ||||
|     pub busy_domains: Arc<Mutex<HashSet<String>>>, | ||||
| 
 | ||||
|     /// Cache of robots.txt entries for recently-made dormant sites
 | ||||
|     pub robotstxt_cache: Arc<RwLock<LruCache<String, Option<Cylon>>>>, | ||||
| @ -94,24 +93,31 @@ impl TaskContext { | ||||
|         let mut current_robot_rules: Option<Cylon> = None; | ||||
|         let mut wait_until: Option<Instant> = None; | ||||
| 
 | ||||
|         let domain_record = { | ||||
|             let txn = self.store.ro_txn()?; | ||||
|             let dr = txn.get_domain_record(&domain)?; | ||||
|             match dr { | ||||
|                 None => { | ||||
|                     return Ok(()); | ||||
|                 } | ||||
|                 Some(dr) => dr, | ||||
|         let domain2 = domain.clone(); | ||||
|         let dr = self | ||||
|             .store | ||||
|             .ro_txn(move |mut txn| Box::pin(async move { txn.get_domain_record(&domain2).await })) | ||||
|             .await?; | ||||
|         let domain_record = match dr { | ||||
|             None => { | ||||
|                 return Ok(()); | ||||
|             } | ||||
|             Some(dr) => dr, | ||||
|         }; | ||||
| 
 | ||||
|         while !self.graceful_stop.load(Ordering::Relaxed) { | ||||
|             // Get a URL to process
 | ||||
|             let url = { | ||||
|                 let txn = self.store.ro_txn()?; | ||||
|                 txn.choose_url_for_domain(&domain) | ||||
|                     .context("failed to choose URL for domain")? | ||||
|             }; | ||||
|             let domain2 = domain.clone(); | ||||
|             let url = self | ||||
|                 .store | ||||
|                 .ro_txn(move |mut txn| { | ||||
|                     Box::pin(async move { | ||||
|                         txn.choose_url_for_domain(&domain2) | ||||
|                             .await | ||||
|                             .context("failed to choose URL for domain") | ||||
|                     }) | ||||
|                 }) | ||||
|                 .await?; | ||||
| 
 | ||||
|             let (url_str, url_record) = if let Some(url) = url { | ||||
|                 url | ||||
| @ -121,21 +127,22 @@ impl TaskContext { | ||||
|                 let domain = domain.to_owned(); | ||||
|                 let out_of_urls = self | ||||
|                     .store | ||||
|                     .async_rw_txn(move |txn| { | ||||
|                         // Double-check we're still out of URLs (another could have been added since
 | ||||
|                         // we last checked!)
 | ||||
|                         let out_of_urls = txn.choose_url_for_domain(&domain)?.is_none(); | ||||
|                     .rw_txn(move |mut txn| { | ||||
|                         Box::pin(async move { | ||||
|                             // Double-check we're still out of URLs (another could have been added since
 | ||||
|                             // we last checked!)
 | ||||
|                             let out_of_urls = txn.choose_url_for_domain(&domain).await?.is_none(); | ||||
| 
 | ||||
|                         if !out_of_urls { | ||||
|                             return Ok(false); | ||||
|                         } | ||||
|                             if !out_of_urls { | ||||
|                                 return Ok(false); | ||||
|                             } | ||||
| 
 | ||||
|                         // Delete the active domain from the store
 | ||||
|                         txn.remove_active_domain(&domain) | ||||
|                             .context("failed to remove active domain")?; | ||||
| 
 | ||||
|                         txn.commit()?; | ||||
|                         Ok(true) | ||||
|                             // Delete the active domain from the store
 | ||||
|                             txn.remove_active_domain(&domain) | ||||
|                                 .await | ||||
|                                 .context("failed to remove active domain")?; | ||||
|                             Ok(true) | ||||
|                         }) | ||||
|                     }) | ||||
|                     .await | ||||
|                     .context("failed to check if we're out of URLs")?; | ||||
| @ -154,10 +161,8 @@ impl TaskContext { | ||||
|                 let domain = domain.clone(); | ||||
|                 let url = url.clone(); | ||||
|                 self.store | ||||
|                     .async_rw_txn(move |txn| { | ||||
|                         txn.dequeue_url(&domain, url.as_str())?; | ||||
|                         txn.commit()?; | ||||
|                         Ok(()) | ||||
|                     .rw_txn(move |mut txn| { | ||||
|                         Box::pin(async move { txn.dequeue_url(&domain, url.as_str()).await }) | ||||
|                     }) | ||||
|                     .await?; | ||||
|                 continue; | ||||
| @ -290,24 +295,26 @@ impl TaskContext { | ||||
| 
 | ||||
|                 let domain = domain.clone(); | ||||
|                 let url = url.clone(); | ||||
|                 let backoff = delay.as_secs().try_into().unwrap_or(u32::MAX); | ||||
|                 let backoff = delay.as_secs().try_into().unwrap_or(i32::MAX); | ||||
| 
 | ||||
|                 self.store | ||||
|                     .async_rw_txn(move |txn| { | ||||
|                         txn.start_backing_off( | ||||
|                             &domain, | ||||
|                             backoff, | ||||
|                             url.to_string(), | ||||
|                             TemporaryFailure { | ||||
|                                 reason: TemporaryFailureReason::ExcruciatingCrawlDelay( | ||||
|                                     delay.as_secs(), | ||||
|                                 ), | ||||
|                                 // Don't stack this up with a backoff; it's not an actual failure!
 | ||||
|                                 backoff_sec: 0, | ||||
|                             }, | ||||
|                         )?; | ||||
|                         txn.commit()?; | ||||
|                         Ok(()) | ||||
|                     .rw_txn(move |mut txn| { | ||||
|                         Box::pin(async move { | ||||
|                             txn.start_backing_off( | ||||
|                                 &domain, | ||||
|                                 backoff, | ||||
|                                 url.to_string(), | ||||
|                                 TemporaryFailure { | ||||
|                                     reason: TemporaryFailureReason::ExcruciatingCrawlDelay( | ||||
|                                         delay.as_secs(), | ||||
|                                     ), | ||||
|                                     // Don't stack this up with a backoff; it's not an actual failure!
 | ||||
|                                     backoff_sec: 0, | ||||
|                                 }, | ||||
|                             ) | ||||
|                             .await?; | ||||
|                             Ok(()) | ||||
|                         }) | ||||
|                     }) | ||||
|                     .await | ||||
|                     .context("failure whilst turning long crawl delay into backoff")?; | ||||
| @ -319,7 +326,10 @@ impl TaskContext { | ||||
| 
 | ||||
|     /// Processes the outcome of
 | ||||
|     async fn process_outcome(&self, url: &Url, outcome: RakeOutcome) -> anyhow::Result<NextAction> { | ||||
|         let today = date_to_quickpeep_days(&Utc::today())?; | ||||
|         let visited_on_ts = SystemTime::now() | ||||
|             .duration_since(*QUICKPEEP_EPOCH_ST) | ||||
|             .unwrap() | ||||
|             .as_secs() as i32; | ||||
|         match outcome { | ||||
|             RakeOutcome::RakedPage(page) => { | ||||
|                 self.submission | ||||
| @ -335,11 +345,11 @@ impl TaskContext { | ||||
|                     .context("Reference processor shut down; can't stream references!")?; | ||||
| 
 | ||||
|                 self.as_event_processor() | ||||
|                     .process_page(url.clone(), page.page_entry, today) | ||||
|                     .process_page(url.clone(), page.page_entry, visited_on_ts) | ||||
|                     .await | ||||
|                     .context("failure processing page for RakedPage")?; | ||||
|                 self.as_event_processor() | ||||
|                     .process_refs(url.clone(), page.referrer_entry, today, false) | ||||
|                     .process_refs(url.clone(), page.referrer_entry, visited_on_ts, false) | ||||
|                     .await | ||||
|                     .context("failure processing refs for RakedPage")?; | ||||
| 
 | ||||
| @ -357,7 +367,7 @@ impl TaskContext { | ||||
|                     .context("Reference processor shut down; can't stream references!")?; | ||||
| 
 | ||||
|                 self.as_event_processor() | ||||
|                     .process_refs(url.clone(), refs, today, true) | ||||
|                     .process_refs(url.clone(), refs, visited_on_ts, true) | ||||
|                     .await | ||||
|                     .context("failure processing refs for RakedFeed")?; | ||||
| 
 | ||||
| @ -375,7 +385,7 @@ impl TaskContext { | ||||
|                     .context("Reference processor shut down; can't stream references!")?; | ||||
| 
 | ||||
|                 self.as_event_processor() | ||||
|                     .process_refs(url.clone(), refs, today, true) | ||||
|                     .process_refs(url.clone(), refs, visited_on_ts, true) | ||||
|                     .await | ||||
|                     .context("failure processing refs for RakedSitemap")?; | ||||
| 
 | ||||
| @ -395,7 +405,7 @@ impl TaskContext { | ||||
|                     .await?; | ||||
| 
 | ||||
|                 self.as_event_processor() | ||||
|                     .process_icon(url.clone(), today) | ||||
|                     .process_icon(url.clone(), visited_on_ts) | ||||
|                     .await | ||||
|                     .context("failure processing icon for RakedIcon")?; | ||||
| 
 | ||||
| @ -423,7 +433,7 @@ impl TaskContext { | ||||
|                     .context("Reference processor shut down; can't stream references!")?; | ||||
| 
 | ||||
|                 self.as_event_processor() | ||||
|                     .process_refs(url.clone(), refs, today, false) | ||||
|                     .process_refs(url.clone(), refs, visited_on_ts, false) | ||||
|                     .await | ||||
|                     .context("Failure processing refs for Redirect")?; | ||||
| 
 | ||||
| @ -439,14 +449,15 @@ impl TaskContext { | ||||
|                 let url = url.clone(); | ||||
| 
 | ||||
|                 // TODO(feature) add 1.1× the previous backoff, if there was one.
 | ||||
|                 let new_backoff = failure.backoff_sec; | ||||
|                 let new_backoff = failure.backoff_sec as i32; | ||||
|                 let domain = domain.into_owned(); | ||||
| 
 | ||||
|                 self.store | ||||
|                     .async_rw_txn(move |txn| { | ||||
|                         txn.start_backing_off(&domain, new_backoff, url.to_string(), failure)?; | ||||
|                         txn.commit()?; | ||||
|                         Ok(()) | ||||
|                     .rw_txn(move |mut txn| { | ||||
|                         Box::pin(async move { | ||||
|                             txn.start_backing_off(&domain, new_backoff, url.to_string(), failure) | ||||
|                                 .await | ||||
|                         }) | ||||
|                     }) | ||||
|                     .await | ||||
|                     .context("failed to store backoff")?; | ||||
| @ -461,7 +472,7 @@ impl TaskContext { | ||||
|                     .await | ||||
|                     .context("Rejection processor shut down; can't stream rejection!!")?; | ||||
|                 self.as_event_processor() | ||||
|                     .process_rejection(url.clone(), today) | ||||
|                     .process_rejection(url.clone(), visited_on_ts) | ||||
|                     .await | ||||
|                     .context("failed to process rejection for PermanentFailure")?; | ||||
| 
 | ||||
| @ -493,58 +504,58 @@ impl EventProcessor<'_> { | ||||
|         &self, | ||||
|         url: Url, | ||||
|         page: RakedPageEntry, | ||||
|         datestamp: u16, | ||||
|         visited_on_ts: i32, | ||||
|     ) -> anyhow::Result<()> { | ||||
|         let rerake_on = Some(datestamp + self.rerake_timings.page); | ||||
|         let rerake_on = Some(visited_on_ts + self.rerake_timings.page * DAY_SEC); | ||||
|         self.store | ||||
|             .as_ref() | ||||
|             .async_rw_txn(move |txn| { | ||||
|                 let domain = get_reduced_domain(&url).with_context(|| { | ||||
|                     format!("No domain for URL '{url}' for which we are processing the page!") | ||||
|                 })?; | ||||
|                 txn.mark_url_as_visited( | ||||
|                     domain.as_ref(), | ||||
|                     url.as_ref(), | ||||
|                     UrlVisitedRecord { | ||||
|                         last_visited_days: datestamp, | ||||
|                     }, | ||||
|                     rerake_on, | ||||
|                 )?; | ||||
|             .rw_txn(move |mut txn| { | ||||
|                 Box::pin(async move { | ||||
|                     let domain = get_reduced_domain(&url).with_context(|| { | ||||
|                         format!("No domain for URL '{url}' for which we are processing the page!") | ||||
|                     })?; | ||||
|                     txn.mark_url_as_visited( | ||||
|                         domain.as_ref(), | ||||
|                         url.as_ref(), | ||||
|                         visited_on_ts, | ||||
|                         rerake_on, | ||||
|                     ) | ||||
|                     .await?; | ||||
| 
 | ||||
|                 // If there's a favicon to be tried, add it to the list...
 | ||||
|                 let favicon_url_rel = page.document.head.effective_favicon_url(); | ||||
|                 if let Ok(favicon_url) = url.join(favicon_url_rel) { | ||||
|                     if SUPPORTED_SCHEMES.contains(&favicon_url.scheme()) { | ||||
|                         txn.enqueue_url(favicon_url.as_str(), None, RakeIntent::Icon)?; | ||||
|                     // If there's a favicon to be tried, add it to the list...
 | ||||
|                     let favicon_url_rel = page.document.head.effective_favicon_url(); | ||||
|                     if let Ok(favicon_url) = url.join(favicon_url_rel) { | ||||
|                         if SUPPORTED_SCHEMES.contains(&favicon_url.scheme()) { | ||||
|                             txn.enqueue_url(favicon_url.as_str(), None, RakeIntent::Icon, false) | ||||
|                                 .await?; | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
| 
 | ||||
|                 txn.commit()?; | ||||
|                 Ok(()) | ||||
|                     Ok(()) | ||||
|                 }) | ||||
|             }) | ||||
|             .await | ||||
|     } | ||||
| 
 | ||||
|     pub async fn process_icon(&self, url: Url, datestamp: u16) -> anyhow::Result<()> { | ||||
|         let rerake_on = Some(datestamp + self.rerake_timings.icon); | ||||
|     pub async fn process_icon(&self, url: Url, visited_on_ts: i32) -> anyhow::Result<()> { | ||||
|         let rerake_on = Some(visited_on_ts + self.rerake_timings.icon * DAY_SEC); | ||||
| 
 | ||||
|         self.store | ||||
|             .as_ref() | ||||
|             .async_rw_txn(move |txn| { | ||||
|                 let domain = get_reduced_domain(&url).with_context(|| { | ||||
|                     format!("No domain for URL '{url}' for which we are processing an icon!") | ||||
|                 })?; | ||||
|                 txn.mark_url_as_visited( | ||||
|                     domain.as_ref(), | ||||
|                     url.as_ref(), | ||||
|                     UrlVisitedRecord { | ||||
|                         last_visited_days: datestamp, | ||||
|                     }, | ||||
|                     rerake_on, | ||||
|                 )?; | ||||
| 
 | ||||
|                 txn.commit()?; | ||||
|                 Ok(()) | ||||
|             .rw_txn(move |mut txn| { | ||||
|                 Box::pin(async move { | ||||
|                     let domain = get_reduced_domain(&url).with_context(|| { | ||||
|                         format!("No domain for URL '{url}' for which we are processing an icon!") | ||||
|                     })?; | ||||
|                     txn.mark_url_as_visited( | ||||
|                         domain.as_ref(), | ||||
|                         url.as_ref(), | ||||
|                         visited_on_ts, | ||||
|                         rerake_on, | ||||
|                     ) | ||||
|                     .await?; | ||||
|                     Ok(()) | ||||
|                 }) | ||||
|             }) | ||||
|             .await | ||||
|     } | ||||
| @ -553,93 +564,105 @@ impl EventProcessor<'_> { | ||||
|         &self, | ||||
|         url: Url, | ||||
|         refs: RakedReferrerEntry, | ||||
|         datestamp: u16, | ||||
|         visited_on_ts: i32, | ||||
|         rerakeable_feed: bool, | ||||
|     ) -> anyhow::Result<()> { | ||||
|         let rerake_on = if rerakeable_feed { | ||||
|             Some(self.rerake_timings.feed) | ||||
|             Some(visited_on_ts + self.rerake_timings.feed * DAY_SEC) | ||||
|         } else { | ||||
|             None | ||||
|         }; | ||||
|         self.store | ||||
|             .as_ref() | ||||
|             .async_rw_txn(move |txn| { | ||||
|                 let domain = get_reduced_domain(&url).with_context(|| { | ||||
|                     format!("No domain for URL '{url}' for which we are processing refs!") | ||||
|                 })?; | ||||
|                 txn.mark_url_as_visited( | ||||
|                     domain.as_ref(), | ||||
|                     url.as_ref(), | ||||
|                     UrlVisitedRecord { | ||||
|                         last_visited_days: datestamp, | ||||
|                     }, | ||||
|                     rerake_on, | ||||
|                 ) | ||||
|                 .context("failed to mark URL as visited")?; | ||||
| 
 | ||||
|                 // track all the referred-to URLs!
 | ||||
|                 for reference in refs.references { | ||||
|                     let ref_url = Url::parse(&reference.target).with_context(|| { | ||||
|                         format!( | ||||
|                             "failed to parse target URL of reference: {:?}", | ||||
|                             reference.target | ||||
|                         ) | ||||
|                     })?; | ||||
|                     let domain = get_reduced_domain(&ref_url).with_context(|| { | ||||
|                         format!("failed to reduce domain: {:?}", reference.target) | ||||
|             .rw_txn(move |mut txn| { | ||||
|                 Box::pin(async move { | ||||
|                     let domain = get_reduced_domain(&url).with_context(|| { | ||||
|                         format!("No domain for URL '{url}' for which we are processing refs!") | ||||
|                     })?; | ||||
|                     txn.mark_url_as_visited( | ||||
|                         domain.as_ref(), | ||||
|                         url.as_ref(), | ||||
|                         visited_on_ts, | ||||
|                         rerake_on, | ||||
|                     ) | ||||
|                     .await | ||||
|                     .context("failed to mark URL as visited")?; | ||||
| 
 | ||||
|                     // Check if this URL is an allowed URL (hence should be enqueued)
 | ||||
|                     let allowed = txn | ||||
|                         .get_domain_record(domain.borrow())? | ||||
|                         .map(|record: DomainRecord| record.is_url_rakeable(&ref_url)) | ||||
|                         .flatten(); | ||||
|                     // track all the referred-to URLs!
 | ||||
|                     for reference in refs.references { | ||||
|                         let ref_url = Url::parse(&reference.target).with_context(|| { | ||||
|                             format!( | ||||
|                                 "failed to parse target URL of reference: {:?}", | ||||
|                                 reference.target | ||||
|                             ) | ||||
|                         })?; | ||||
|                         let domain = get_reduced_domain(&ref_url).with_context(|| { | ||||
|                             format!("failed to reduce domain: {:?}", reference.target) | ||||
|                         })?; | ||||
| 
 | ||||
|                     match allowed { | ||||
|                         Some(true) => { | ||||
|                             let is_fresh = txn.enqueue_url( | ||||
|                                 &reference.target, | ||||
|                                 reference.last_mod, | ||||
|                                 reference.kind.into(), | ||||
|                             )?; | ||||
|                             if is_fresh { | ||||
|                                 increment_counter!("qprake_queue_new_url"); | ||||
|                         // Check if this URL is an allowed URL (hence should be enqueued)
 | ||||
|                         let allowed = txn | ||||
|                             .get_domain_record(domain.borrow()) | ||||
|                             .await? | ||||
|                             .map(|record: DomainRecord| record.is_url_rakeable(&ref_url)) | ||||
|                             .flatten(); | ||||
| 
 | ||||
|                         match allowed { | ||||
|                             Some(true) => { | ||||
|                                 let is_fresh = txn | ||||
|                                     .enqueue_url( | ||||
|                                         &reference.target, | ||||
|                                         reference.last_mod.map(|days| { | ||||
|                                             *QUICKPEEP_EPOCH_ST | ||||
|                                                 + Duration::from_secs(days as u64 * DAY_SEC as u64) | ||||
|                                         }), | ||||
|                                         reference.kind.into(), | ||||
|                                         false, | ||||
|                                     ) | ||||
|                                     .await?; | ||||
|                                 if is_fresh { | ||||
|                                     increment_counter!("qprake_queue_new_url"); | ||||
|                                 } | ||||
|                                 continue; | ||||
|                             } | ||||
|                             Some(false) => { | ||||
|                                 // Weed! Do nothing.
 | ||||
|                             } | ||||
|                             None => { | ||||
|                                 // It's neither allowed nor weeded, so put it on hold for later inspection
 | ||||
|                                 txn.enqueue_url( | ||||
|                                     &reference.target, | ||||
|                                     reference.last_mod.map(|days| { | ||||
|                                         *QUICKPEEP_EPOCH_ST | ||||
|                                             + Duration::from_secs(days as u64 * DAY_SEC as u64) | ||||
|                                     }), | ||||
|                                     reference.kind.into(), | ||||
|                                     true, | ||||
|                                 ) | ||||
|                                 .await?; | ||||
|                             } | ||||
|                             continue; | ||||
|                         } | ||||
|                         Some(false) => { | ||||
|                             // Weed! Do nothing.
 | ||||
|                         } | ||||
|                         None => { | ||||
|                             // It's neither allowed nor weeded, so put it on hold for later inspection
 | ||||
|                             txn.put_url_on_hold(&reference.target, reference.kind.into())?; | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
| 
 | ||||
|                 txn.commit()?; | ||||
|                 Ok(()) | ||||
|                     Ok(()) | ||||
|                 }) | ||||
|             }) | ||||
|             .await | ||||
|     } | ||||
| 
 | ||||
|     pub async fn process_rejection(&self, url: Url, datestamp: u16) -> anyhow::Result<()> { | ||||
|     pub async fn process_rejection(&self, url: Url, visited_on_ts: i32) -> anyhow::Result<()> { | ||||
|         self.store | ||||
|             .as_ref() | ||||
|             .async_rw_txn(move |txn| { | ||||
|                 let domain = get_reduced_domain(&url).with_context(|| { | ||||
|                     format!("No domain for URL '{url}' for which we are processing a rejection!") | ||||
|                 })?; | ||||
|                 txn.mark_url_as_visited( | ||||
|                     domain.as_ref(), | ||||
|                     url.as_ref(), | ||||
|                     UrlVisitedRecord { | ||||
|                         last_visited_days: datestamp, | ||||
|                     }, | ||||
|                     None, | ||||
|                 )?; | ||||
|                 txn.commit()?; | ||||
|                 Ok(()) | ||||
|             .rw_txn(move |mut txn| { | ||||
|                 Box::pin(async move { | ||||
|                     let domain = get_reduced_domain(&url).with_context(|| { | ||||
|                         format!( | ||||
|                             "No domain for URL '{url}' for which we are processing a rejection!" | ||||
|                         ) | ||||
|                     })?; | ||||
|                     txn.mark_url_as_visited(domain.as_ref(), url.as_ref(), visited_on_ts, None) | ||||
|                         .await?; | ||||
|                     Ok(()) | ||||
|                 }) | ||||
|             }) | ||||
|             .await | ||||
|     } | ||||
|  | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @ -1,80 +1,41 @@ | ||||
| use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString}; | ||||
| use crate::storage::records::{DomainRecord, OnHoldUrlRecord}; | ||||
| use crate::storage::records::DomainRecord; | ||||
| use crate::storage::RakerTxn; | ||||
| use anyhow::Context; | ||||
| use libmdbx::{Database, WriteFlags, RW}; | ||||
| use reqwest::Url; | ||||
| use itertools::Itertools; | ||||
| use sqlx::query_as; | ||||
| use sqlx::types::Json; | ||||
| 
 | ||||
| /// Runs one big transaction that:
 | ||||
| /// - scans on-hold URLs
 | ||||
| ///     - moves 'allowed' ones to the queue
 | ||||
| ///     - deletes 'weeds'
 | ||||
| ///     - leaves unknown ones alone
 | ||||
| ///
 | ||||
| /// Re-enqueues domains
 | ||||
| /// Ideally should be applied after importing seeds and weeds on an existing database.
 | ||||
| pub fn reapply_seeds_and_weeds_to_on_hold_urls(txn: RakerTxn<RW>) -> anyhow::Result<()> { | ||||
| pub async fn reapply_seeds_and_weeds_to_on_hold_urls( | ||||
|     mut txn: RakerTxn<'_, '_, true>, | ||||
| ) -> anyhow::Result<()> { | ||||
|     struct DomainState { | ||||
|         pub domain: String, | ||||
|         pub domain_record: Option<DomainRecord>, | ||||
|         pub domain_id: i32, | ||||
|         pub domain_record: Json<DomainRecord>, | ||||
|     } | ||||
| 
 | ||||
|     let urls_on_hold: &Database = &txn.mdbx.borrow_dbs().urls_on_hold; | ||||
|     let reinstatable_domains: Vec<DomainState> = query_as!(DomainState, r#" | ||||
|         SELECT DISTINCT u.domain_id, domain_record AS "domain_record: Json<DomainRecord>" FROM url_queue | ||||
|         JOIN urls u USING (url_id) | ||||
|         JOIN domains USING (domain_id) | ||||
|         LEFT JOIN domain_backoffs db ON u.domain_id = db.domain_id | ||||
|         WHERE db.domain_id IS NULL | ||||
|     "#).fetch_all(&mut *txn.txn)
 | ||||
|         .await?; | ||||
| 
 | ||||
|     let mut domain_state = None; | ||||
| 
 | ||||
|     // Scan through the on-hold URLs
 | ||||
|     let mut cur = txn.mdbx_txn.cursor(urls_on_hold)?; | ||||
|     let mut first_iteration = true; | ||||
| 
 | ||||
|     while let Some((MdbxString(domain_then_url), MdbxBare(record))) = if first_iteration { | ||||
|         first_iteration = false; | ||||
|         cur.first::<MdbxString, MdbxBare<OnHoldUrlRecord>>() | ||||
|     } else { | ||||
|         cur.next::<MdbxString, MdbxBare<OnHoldUrlRecord>>() | ||||
|     }? { | ||||
|         let mut split = domain_then_url.as_ref().split("\n"); | ||||
|         let domain = split.next().context("No first split..?")?; | ||||
|         let url_str = split.next().context("No URL")?; | ||||
| 
 | ||||
|         // Is the domain new?
 | ||||
|         if domain_state | ||||
|             .as_ref() | ||||
|             .map(|ds: &DomainState| &ds.domain != domain) | ||||
|             .unwrap_or(true) | ||||
|         { | ||||
|             // Then load the relevant records for it.
 | ||||
|             domain_state = Some(DomainState { | ||||
|                 domain: domain.to_owned(), | ||||
|                 domain_record: txn.get_domain_record(domain)?, | ||||
|             }); | ||||
|         } | ||||
| 
 | ||||
|         let url = Url::parse(url_str)?; | ||||
| 
 | ||||
|         let domain_state = domain_state.as_ref().unwrap(); | ||||
| 
 | ||||
|         let is_rakeable = domain_state | ||||
|     for domain_state in reinstatable_domains { | ||||
|         let any_seeds = domain_state | ||||
|             .domain_record | ||||
|             .as_ref() | ||||
|             .map(|dr: &DomainRecord| dr.is_url_rakeable(&url)) | ||||
|             .flatten(); | ||||
| 
 | ||||
|         match is_rakeable { | ||||
|             Some(true) => { | ||||
|                 // ALLOWED
 | ||||
|                 // Make it a queued URL
 | ||||
|                 txn.enqueue_url(url_str, None, record.queue_record.intent)?; | ||||
|                 cur.del(WriteFlags::empty())?; | ||||
|             } | ||||
|             Some(false) => { | ||||
|                 // WEED
 | ||||
|                 // Just delete
 | ||||
|                 cur.del(WriteFlags::empty())?; | ||||
|             } | ||||
|             None => { /* nop: neither allowed nor a weed. Keep on hold. */ } | ||||
|             .rakeable_path_prefixes | ||||
|             .values() | ||||
|             .contains(&true); | ||||
|         if !any_seeds { | ||||
|             continue; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     txn.commit()?; | ||||
|         // This domain has *some* seeds, let's just reinstate it.
 | ||||
|         txn.insert_active_domain_id_with_new_raffle_ticket(domain_state.domain_id) | ||||
|             .await?; | ||||
|     } | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| @ -1,190 +0,0 @@ | ||||
| use anyhow::{anyhow, Context}; | ||||
| use libmdbx::{Error, TableObject, TransactionKind}; | ||||
| use serde::de::DeserializeOwned; | ||||
| use serde::Serialize; | ||||
| use std::borrow::Cow; | ||||
| 
 | ||||
| /// u16 in BIG byte endianness (u16 not supported by INTEGERKEY mode!)
 | ||||
| #[derive(Copy, Clone, Debug)] | ||||
| pub struct MdbxU16BE(pub u16); | ||||
| 
 | ||||
| impl MdbxU16BE { | ||||
|     pub fn as_bytes(&self) -> Cow<'_, [u8]> { | ||||
|         Cow::Owned(self.0.to_be_bytes().to_vec()) | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| impl TableObject<'_> for MdbxU16BE { | ||||
|     fn decode(data_val: &[u8]) -> Result<Self, libmdbx::Error> | ||||
|     where | ||||
|         Self: Sized, | ||||
|     { | ||||
|         if data_val.len() != 2 { | ||||
|             return Err(libmdbx::Error::DecodeError( | ||||
|                 anyhow!("MDBX Key not 2 bytes; can't be decoded as u16").into(), | ||||
|             )); | ||||
|         } | ||||
|         let mut buf = [0u8; 2]; | ||||
|         buf.copy_from_slice(&data_val); | ||||
|         Ok(MdbxU16BE(u16::from_be_bytes(buf))) | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| /// u32 in native byte endianness (as required by INTEGERKEY mode)
 | ||||
| #[derive(Copy, Clone, Debug)] | ||||
| pub struct MdbxU32(pub u32); | ||||
| 
 | ||||
| impl MdbxU32 { | ||||
|     pub fn as_bytes(&self) -> Cow<'_, [u8]> { | ||||
|         Cow::Owned(self.0.to_ne_bytes().to_vec()) | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| impl TableObject<'_> for MdbxU32 { | ||||
|     fn decode(data_val: &[u8]) -> Result<Self, libmdbx::Error> | ||||
|     where | ||||
|         Self: Sized, | ||||
|     { | ||||
|         if data_val.len() != 4 { | ||||
|             return Err(libmdbx::Error::DecodeError( | ||||
|                 anyhow!("MDBX Key not 4 bytes; can't be decoded as u32").into(), | ||||
|             )); | ||||
|         } | ||||
|         let mut buf = [0u8; 4]; | ||||
|         buf.copy_from_slice(&data_val); | ||||
|         Ok(MdbxU32(u32::from_ne_bytes(buf))) | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| /// u64 in native byte endianness (as required by INTEGERKEY mode)
 | ||||
| #[derive(Copy, Clone, Debug)] | ||||
| pub struct MdbxU64(pub u64); | ||||
| 
 | ||||
| impl MdbxU64 { | ||||
|     pub fn as_bytes(&self) -> Cow<'_, [u8]> { | ||||
|         Cow::Owned(self.0.to_ne_bytes().to_vec()) | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| impl TableObject<'_> for MdbxU64 { | ||||
|     fn decode(data_val: &[u8]) -> Result<Self, libmdbx::Error> | ||||
|     where | ||||
|         Self: Sized, | ||||
|     { | ||||
|         if data_val.len() != 8 { | ||||
|             return Err(libmdbx::Error::DecodeError( | ||||
|                 anyhow!("MDBX Key not 8 bytes; can't be decoded as u64").into(), | ||||
|             )); | ||||
|         } | ||||
|         let mut buf = [0u8; 8]; | ||||
|         buf.copy_from_slice(&data_val); | ||||
|         Ok(MdbxU64(u64::from_ne_bytes(buf))) | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| /// UTF-8 String
 | ||||
| #[derive(Clone, Debug)] | ||||
| pub struct MdbxString<'txn>(pub Cow<'txn, str>); | ||||
| 
 | ||||
| impl MdbxString<'_> { | ||||
|     pub fn as_bytes(&self) -> &[u8] { | ||||
|         self.0.as_bytes() | ||||
|     } | ||||
| 
 | ||||
|     pub fn into_string(self) -> String { | ||||
|         self.0.into_owned() | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| impl<'a> TableObject<'_> for MdbxString<'a> { | ||||
|     fn decode(_data_val: &[u8]) -> Result<Self, libmdbx::Error> | ||||
|     where | ||||
|         Self: Sized, | ||||
|     { | ||||
|         unreachable!() | ||||
|     } | ||||
| 
 | ||||
|     unsafe fn decode_val<K: TransactionKind>( | ||||
|         txnptr: *const mdbx_sys::MDBX_txn, | ||||
|         data_val: &mdbx_sys::MDBX_val, | ||||
|     ) -> Result<Self, Error> | ||||
|     where | ||||
|         Self: Sized, | ||||
|     { | ||||
|         let bytes = MdbxBytes::decode_val::<K>(txnptr, data_val)?; | ||||
|         let string_cow = match bytes { | ||||
|             Cow::Borrowed(data) => { | ||||
|                 let string = std::str::from_utf8(data) | ||||
|                     .context("Failed to decode MDBX key as string") | ||||
|                     .map_err(|e| libmdbx::Error::DecodeError(e.into()))?; | ||||
|                 Cow::Borrowed(string) | ||||
|             } | ||||
|             Cow::Owned(data) => { | ||||
|                 let string = String::from_utf8(data) | ||||
|                     .context("Failed to decode MDBX key as string") | ||||
|                     .map_err(|e| libmdbx::Error::DecodeError(e.into()))?; | ||||
|                 Cow::Owned(string) | ||||
|             } | ||||
|         }; | ||||
|         Ok(MdbxString(string_cow)) | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| // /// UTF-8 String
 | ||||
| // /// Using Cow<'txn, str> would've needed some unsafe code (see `Cow<'txn, [u8]>` for inspiration),
 | ||||
| // /// so I didn't bother for now.
 | ||||
| // #[derive(Clone, Debug)]
 | ||||
| // pub struct MdbxString(pub String);
 | ||||
| //
 | ||||
| // impl MdbxString {
 | ||||
| //     pub fn as_bytes(&self) -> &[u8] {
 | ||||
| //         self.0.as_bytes()
 | ||||
| //     }
 | ||||
| // }
 | ||||
| //
 | ||||
| // impl TableObject<'_> for MdbxString {
 | ||||
| //     fn decode(data_val: &[u8]) -> Result<Self, libmdbx::Error> where Self: Sized {
 | ||||
| //         let string = String::from_utf8(data_val.to_vec())
 | ||||
| //             .context("Failed to decode MDBX key as string")
 | ||||
| //             .map_err(|e| libmdbx::Error::DecodeError(e.into()))?;
 | ||||
| //         Ok(MdbxString(string))
 | ||||
| //     }
 | ||||
| // }
 | ||||
| 
 | ||||
| /// Any BARE payload
 | ||||
| #[derive(Clone, Debug)] | ||||
| pub struct MdbxBare<T>(pub T); | ||||
| 
 | ||||
| impl<T: Serialize> MdbxBare<T> { | ||||
|     pub fn as_bytes(&self) -> Vec<u8> { | ||||
|         serde_bare::to_vec(&self.0).expect("It's unreasonable to expect serialisation will fail") | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| impl<T: DeserializeOwned> TableObject<'_> for MdbxBare<T> { | ||||
|     fn decode(_data_val: &[u8]) -> Result<Self, libmdbx::Error> | ||||
|     where | ||||
|         Self: Sized, | ||||
|     { | ||||
|         unreachable!() | ||||
|     } | ||||
| 
 | ||||
|     unsafe fn decode_val<K: TransactionKind>( | ||||
|         txnptr: *const mdbx_sys::MDBX_txn, | ||||
|         data_val: &mdbx_sys::MDBX_val, | ||||
|     ) -> Result<Self, Error> | ||||
|     where | ||||
|         Self: Sized, | ||||
|     { | ||||
|         let bytes = MdbxBytes::decode_val::<K>(txnptr, data_val)?; | ||||
| 
 | ||||
|         let record = serde_bare::from_slice(bytes.as_ref()) | ||||
|             .context("Failed to decode MDBX key as BARE object") | ||||
|             .map_err(|e| libmdbx::Error::DecodeError(e.into()))?; | ||||
| 
 | ||||
|         Ok(MdbxBare(record)) | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| /// Supported natively by libmdbx.
 | ||||
| pub type MdbxBytes<'txn> = Cow<'txn, [u8]>; | ||||
| @ -1,2 +0,0 @@ | ||||
| pub const MIGRATION_KEY: &[u8] = b"MIGRATION_VERSION"; | ||||
| pub const MIGRATION_VERSION: &str = "quickpeep_raker:0.1.0"; | ||||
| @ -1,50 +1,13 @@ | ||||
| use crate::raking::{RakeIntent, TemporaryFailure}; | ||||
| use crate::raking::RakeIntent; | ||||
| use reqwest::Url; | ||||
| use serde::{Deserialize, Serialize}; | ||||
| use std::collections::BTreeMap; | ||||
| 
 | ||||
| #[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)] | ||||
| pub struct ActiveDomainRecord { | ||||
|     /// The raffle ticket number owned by this domain.
 | ||||
|     pub raffle_ticket: u32, | ||||
| } | ||||
| 
 | ||||
| #[derive(Clone, Debug, Deserialize, Serialize)] | ||||
| pub struct UrlVisitedRecord { | ||||
|     /// Number of days since the QuickPeep Epoch that this page was last raked at.
 | ||||
|     /// A u16 is fine here, giving 179 years worth of values. This allows compact encoding.
 | ||||
|     /// We don't really care about a more granular timestamp: sitemaps and feeds usually only
 | ||||
|     /// give the date of last update anyway.
 | ||||
|     pub last_visited_days: u16, | ||||
| } | ||||
| 
 | ||||
| #[derive(Clone, Debug, Deserialize, Serialize)] | ||||
| pub struct QueueUrlRecord { | ||||
|     pub intent: RakeIntent, // TODO CONSIDER
 | ||||
| } | ||||
| 
 | ||||
| #[derive(Clone, Debug, Deserialize, Serialize)] | ||||
| pub struct OnHoldUrlRecord { | ||||
|     /// Record that should be emitted once this is released.
 | ||||
|     pub queue_record: QueueUrlRecord, | ||||
| 
 | ||||
|     /// Number of times this URL has been 'enqueued'; capped at 255.
 | ||||
|     pub refs: u8, | ||||
| } | ||||
| 
 | ||||
| #[derive(Clone, Debug, Deserialize, Serialize)] | ||||
| pub struct BackingOffDomainRecord { | ||||
|     /// The URL that caused the backoff.
 | ||||
|     pub failed_url: String, | ||||
|     /// The reason that this backoff is in place
 | ||||
|     pub failure: TemporaryFailure, | ||||
|     /// Duration of the backoff. Used to provide increasing backoffs if the failures persist.
 | ||||
|     pub backoff: u32, | ||||
|     /// When the domain should be reinstated
 | ||||
|     /// MUST match the timestamp present in the reinstatements table.
 | ||||
|     pub reinstate_at: u64, | ||||
| } | ||||
| 
 | ||||
| #[derive(Clone, Debug, Serialize, Deserialize, Default)] | ||||
| pub struct DomainRecord { | ||||
|     pub rakeable_path_prefixes: BTreeMap<String, bool>, | ||||
|  | ||||
| @ -1,11 +1,15 @@ | ||||
| use anyhow::Context; | ||||
| use chrono::{Date, Duration, TimeZone, Utc}; | ||||
| use lazy_static::lazy_static; | ||||
| use std::time::SystemTime; | ||||
| 
 | ||||
| lazy_static! { | ||||
|     /// The QuickPeep Epoch is 2022-01-01, as this gives us 52 years of extra headroom compared to the
 | ||||
|     /// Unix one. QuickPeep didn't exist before 2022 so we needn't worry about negative dates!
 | ||||
|     pub static ref QUICKPEEP_EPOCH: Date<Utc> = Utc.ymd(2022, 1, 1); | ||||
|     /// The QuickPeep Epoch is 2023-01-01, as this gives us 53 years of extra headroom compared to the
 | ||||
|     /// Unix one. This QuickPeep database format didn't exist before 2023 so we needn't worry about negative dates!
 | ||||
|     pub static ref QUICKPEEP_EPOCH: Date<Utc> = Utc.ymd(2023, 1, 1); | ||||
| 
 | ||||
|     /// 2023-01-01
 | ||||
|     pub static ref QUICKPEEP_EPOCH_ST: SystemTime = SystemTime::UNIX_EPOCH + std::time::Duration::from_secs(1672531200); | ||||
| } | ||||
| 
 | ||||
| pub fn date_from_quickpeep_days(days: u16) -> Date<Utc> { | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user