Compare commits
76 Commits
v0.1.0-alp
...
main
|
@ -1,8 +1,8 @@
|
|||
platform: linux/arm64
|
||||
platform: linux/amd64
|
||||
|
||||
pipeline:
|
||||
deployManual:
|
||||
image: docker.bics.ga/rei_ci/mdbook:latest-arm64
|
||||
image: git.emunest.net/rei_oci_pub/mdbook:latest-amd64
|
||||
when:
|
||||
branch:
|
||||
- main
|
||||
|
|
|
@ -4,7 +4,8 @@ platform: linux/${ARCH}
|
|||
|
||||
matrix:
|
||||
ARCH:
|
||||
- arm64
|
||||
# We don't have an arm64 runner for now.
|
||||
#- arm64
|
||||
- amd64
|
||||
|
||||
.a1: &when
|
||||
|
|
|
@ -14,9 +14,11 @@
|
|||
/workbench
|
||||
/rakepacks
|
||||
/index
|
||||
/nix_flake/result
|
||||
/nix_flake/test_vm/nixos.qcow2
|
||||
/nix_flake/test_vm/result
|
||||
/result
|
||||
/test_vm/nixos.qcow2
|
||||
/test_vm/result
|
||||
/quickpeep.ron
|
||||
/index_icons
|
||||
/index_icons-lck
|
||||
|
||||
target
|
|
@ -523,6 +523,17 @@ version = "1.0.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "chardetng"
|
||||
version = "0.1.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"encoding_rs",
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.19"
|
||||
|
@ -1001,9 +1012,9 @@ checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
|
|||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.8.30"
|
||||
version = "0.8.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7896dc8abb250ffdda33912550faa54c88ec8b998dec0b2c55ab224921ce11df"
|
||||
checksum = "9852635589dc9f9ea1b6fe9f05b50ef208c85c834a562f0c6abb1c475736ec2b"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
@ -1062,9 +1073,9 @@ checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed"
|
|||
|
||||
[[package]]
|
||||
name = "fancy_mdbx"
|
||||
version = "0.1.0"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "83535ae4359168a08578b1329699712ba3d43a3a800395f2c14e64506861d449"
|
||||
checksum = "1de3f56e0423d5cfd4dea0628c619a36dad4441e8cc027cfdaf81d5f6caf3463"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"libmdbx",
|
||||
|
@ -3712,6 +3723,15 @@ dependencies = [
|
|||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quickpeep_html_charset_detection"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"chardetng",
|
||||
"encoding_rs",
|
||||
"subslice",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quickpeep_index"
|
||||
version = "0.1.0"
|
||||
|
@ -3746,11 +3766,13 @@ dependencies = [
|
|||
"quickpeep_seed_parser",
|
||||
"quickpeep_structs",
|
||||
"quickpeep_utils",
|
||||
"reqwest",
|
||||
"ron",
|
||||
"serde",
|
||||
"serde_bare",
|
||||
"serde_json",
|
||||
"smartstring",
|
||||
"tempfile",
|
||||
"tokio",
|
||||
"url",
|
||||
"zstd",
|
||||
|
@ -3806,6 +3828,7 @@ dependencies = [
|
|||
"ouroboros",
|
||||
"publicsuffix",
|
||||
"quickpeep_densedoc",
|
||||
"quickpeep_html_charset_detection",
|
||||
"quickpeep_moz_readability",
|
||||
"quickpeep_seed_parser",
|
||||
"quickpeep_structs",
|
||||
|
@ -3820,6 +3843,8 @@ dependencies = [
|
|||
"signal-hook 0.3.13",
|
||||
"sitemap",
|
||||
"smartstring",
|
||||
"tempfile",
|
||||
"tikv-jemallocator",
|
||||
"tokio",
|
||||
"webp",
|
||||
"zstd",
|
||||
|
@ -4641,6 +4666,15 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "subslice"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e0a8e4809a3bb02de01f1f7faf1ba01a83af9e8eabcd4d31dd6e413d14d56aae"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "subtle"
|
||||
version = "2.4.1"
|
||||
|
@ -4846,6 +4880,26 @@ dependencies = [
|
|||
"weezl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tikv-jemalloc-sys"
|
||||
version = "0.5.3+5.3.0-patched"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a678df20055b43e57ef8cddde41cdfda9a3c1a060b67f4c5836dfb1d78543ba8"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tikv-jemallocator"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "20612db8a13a6c06d57ec83953694185a367e16945f66565e8028d2c0bd76979"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"tikv-jemalloc-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.1.44"
|
||||
|
|
|
@ -8,5 +8,6 @@ members = [
|
|||
"quickpeep_moz_readability",
|
||||
"quickpeep_seed_parser",
|
||||
"quickpeep_structs",
|
||||
"quickpeep_utils"
|
||||
"quickpeep_utils",
|
||||
"quickpeep_html_charset_detection",
|
||||
]
|
||||
|
|
16
README.md
16
README.md
|
@ -26,11 +26,11 @@ If you need to fall back to a conventional search engine, this will eventually b
|
|||
|
||||
*Crossed-out things are aspirational and not yet implemented.*
|
||||
|
||||
- ~~Shareable 'rakepacks', so that anyone can run their own search instance without needing to rake (crawl) themselves~~
|
||||
- ~~Dense encoding to minimise disk space usage; compressed with Zstd?~~
|
||||
- Shareable 'rakepacks', so that anyone can run their own search instance without needing to rake (crawl) themselves
|
||||
- Dense encoding to minimise disk space usage; compressed with Zstd.
|
||||
- Raking (crawling) support for
|
||||
- HTML (including redirecting to Canonical URLs)
|
||||
- ~~Language detection~~
|
||||
- Language detection for when the metadata is absent.
|
||||
- Redirects
|
||||
- ~~Gemtext over Gemini~~
|
||||
- RSS, Atom and JSON feeds
|
||||
|
@ -43,9 +43,9 @@ If you need to fall back to a conventional search engine, this will eventually b
|
|||
- Article content extraction, to provide more weight to words found within the article content (based on a Rust version of Mozilla's *Readability* engine)
|
||||
- (Misc)
|
||||
- ~~Use of the Public Suffix List~~
|
||||
- ~~Tagging URL patterns; e.g. to mark documentation as 'old'.~~
|
||||
- Tagging URL patterns; e.g. to mark documentation as 'old'.
|
||||
- ~~Page duplicate content detection (e.g. to detect `/` and `/index.html`, or non-HTTPS and HTTPS, or non-`www` and `www`...)~~
|
||||
- ~~Language detection for pages that don't have that metadata available.~~
|
||||
|
||||
|
||||
|
||||
## Limitations
|
||||
|
@ -62,11 +62,17 @@ If you need to fall back to a conventional search engine, this will eventually b
|
|||
|
||||
*Not written yet.*
|
||||
|
||||
The stages of the QuickPeep pipeline are briefly described in [an introductory blog post][qp_intro_blog].
|
||||
|
||||
[qp_intro_blog]: https://o.librepush.net/blog/2022-07-02-quickpeep-small-scale-web-search-engine
|
||||
|
||||
|
||||
## Development and Running
|
||||
|
||||
*Not written yet.*
|
||||
|
||||
Some hints may be obtained from the introductory blog post mentioned in the 'Architecture' section, but it's probably quite difficult to follow right now.
|
||||
|
||||
|
||||
### Helper scripts
|
||||
|
||||
|
|
|
@ -8,6 +8,6 @@ description = "Documentation for QuickPeep"
|
|||
|
||||
[output.html]
|
||||
default-theme = "coal"
|
||||
git-repository-url = "https://bics.ga/reivilibre/quickpeep.git"
|
||||
git-repository-url = "https://git.emunest.net/reivilibre/quickpeep.git"
|
||||
git-repository-icon = "fa-git-alt"
|
||||
fold = { enable = true, level = 1 }
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
- [QuickPeep]()
|
||||
- [Running and Hosting]()
|
||||
- [QuickPeep Search]()
|
||||
- [QuickPeep Seed Collection Service]()
|
||||
- [QuickPeep Seed Collection Service](./running/seed_collection_service.md)
|
||||
- [QuickPeep Raker]()
|
||||
- [QuickPeep Indexer]()
|
||||
- [Internals](./internals/index.md)
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
# Running a QuickPeep Seed Collection Service
|
||||
|
||||
The QuickPeep web interface has a built-in seed collection page at `/seeds`.
|
||||
This is a simple form where anyone can submit a URL and tag it with appropriate tags.
|
||||
|
||||
The list of tags can be changed in `quickpeep.ron` (`web` → `seed_collection` → ...).
|
||||
|
||||
|
||||
## Retrieving seeds from the Seed Collection Service
|
||||
|
||||
First use the `qp-seedcoll-sort` utility included with QuickPeep to sort through the seeds that you have received.
|
||||
For each seed that has been received on the web interface, but not yet sorted, you will be given the option to:
|
||||
|
||||
- accept (`y`)
|
||||
- reject
|
||||
- generic reason (`n`)
|
||||
- because it duplicates an existing entry (`dupe`)
|
||||
- because it's spam (`spam`)
|
||||
- because it's invalid for some reason; e.g. the URL isn't valid (`inv`)
|
||||
|
||||
This just marks the seeds in the database but doesn't yet emit them in a format usable by the QuickPeep raker.
|
||||
To export the seeds, use `qp-seedcoll-dump path/to/new/file.seed`.
|
||||
This command writes a seed file (usable by the Raker) for all the seeds that haven't yet been dumped.
|
||||
|
||||
Getting the seeds from your web UI to the Raker is left as an exercise for the reader, but I will note that I do this by committing them to a Git repository, with each seed file being dated.
|
||||
This also has the benefit of making it easy to publish the seeds for others to use; my seeds are available at https://git.emunest.net/reivilibre/quickpeep_seeds.
|
|
@ -0,0 +1,63 @@
|
|||
{
|
||||
"nodes": {
|
||||
"naersk": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1654608517,
|
||||
"narHash": "sha256-KIxHjDDJYhoiLanLjpeAk5AuZsfip8M62JhkuloEGb0=",
|
||||
"owner": "nix-community",
|
||||
"repo": "naersk",
|
||||
"rev": "14997a79cd78fe34ad6390f18a327ee0593e5eec",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-community",
|
||||
"repo": "naersk",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1714971268,
|
||||
"narHash": "sha256-IKwMSwHj9+ec660l+I4tki/1NRoeGpyA2GdtdYpAgEw=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "27c13997bf450a01219899f5a83bd6ffbfc70d3c",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"id": "nixpkgs",
|
||||
"ref": "nixos-23.11",
|
||||
"type": "indirect"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"naersk": "naersk",
|
||||
"nixpkgs": "nixpkgs",
|
||||
"utils": "utils"
|
||||
}
|
||||
},
|
||||
"utils": {
|
||||
"locked": {
|
||||
"lastModified": 1653893745,
|
||||
"narHash": "sha256-0jntwV3Z8//YwuOjzhV2sgJJPt+HY6KhU7VZUL0fKZQ=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "1ed9fb1935d260de5fe1c2f7ee0ebaae17ed2fa1",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
|
@ -2,13 +2,15 @@
|
|||
description = "QuickPeep Search Engine Flake for Nix";
|
||||
|
||||
inputs = {
|
||||
nixpkgs.url = "nixpkgs/nixos-23.11";
|
||||
utils.url = "github:numtide/flake-utils";
|
||||
naersk.url = "github:nix-community/naersk";
|
||||
src.url = "path:..";
|
||||
src.flake = false;
|
||||
naersk = {
|
||||
url = "github:nix-community/naersk";
|
||||
inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
};
|
||||
|
||||
outputs = { self, nixpkgs, utils, naersk, src }:
|
||||
outputs = { self, nixpkgs, utils, naersk }:
|
||||
utils.lib.eachDefaultSystem (system: let
|
||||
pkgs = nixpkgs.legacyPackages."${system}";
|
||||
naersk-lib = naersk.lib."${system}";
|
||||
|
@ -16,10 +18,10 @@
|
|||
# `nix build`
|
||||
packages.quickpeep = naersk-lib.buildPackage {
|
||||
pname = "quickpeep";
|
||||
root = src;
|
||||
root = ./.;
|
||||
buildInputs = with pkgs; [
|
||||
openssl
|
||||
pkgconfig
|
||||
pkg-config
|
||||
|
||||
];
|
||||
nativeBuildInputs = with pkgs; [
|
||||
|
@ -37,11 +39,48 @@
|
|||
};
|
||||
};
|
||||
|
||||
# packages.quickpeepWebStatic = pkgs.stdenv.mkDerivation {
|
||||
# name = "quickpeepWebStatic";
|
||||
#
|
||||
# src = ./quickpeep_static;
|
||||
#
|
||||
# buildInputs = [ pkgs.yarn ];
|
||||
#
|
||||
# preparePhase = ''
|
||||
# yarn install
|
||||
# '';
|
||||
#
|
||||
# buildPhase = ''
|
||||
# yarn build
|
||||
# cp -r dist/* $out
|
||||
# '';
|
||||
# };
|
||||
|
||||
packages.quickpeepWebStatic = pkgs.mkYarnPackage {
|
||||
name = "quickpeepWebStatic";
|
||||
src = ./quickpeep_static;
|
||||
packageJSON = ./quickpeep_static/package.json;
|
||||
yarnLock = ./quickpeep_static/yarn.lock;
|
||||
yarnNix = ./quickpeep_static/yarn.nix;
|
||||
|
||||
postBuild = ''
|
||||
yarn build
|
||||
'';
|
||||
installPhase = ''
|
||||
# nop ?
|
||||
'';
|
||||
distPhase = ''
|
||||
mkdir $out
|
||||
cp -r deps/quickpeep_static/dist/* $out/
|
||||
'';
|
||||
};
|
||||
|
||||
defaultPackage = packages.quickpeep;
|
||||
|
||||
# NixOS Modules
|
||||
nixosModules = {
|
||||
quickpeepSearch = import ./modules/quickpeepSearch.nix self;
|
||||
quickpeepSearch = import ./nixos_modules/quickpeepSearch.nix self;
|
||||
quickpeepRaker = import ./nixos_modules/quickpeepRaker.nix self;
|
||||
};
|
||||
|
||||
# `nix run`
|
|
@ -0,0 +1,693 @@
|
|||
{
|
||||
"__inputs": [
|
||||
{
|
||||
"name": "DS_PROMETHEUS",
|
||||
"label": "Prometheus",
|
||||
"description": "",
|
||||
"type": "datasource",
|
||||
"pluginId": "prometheus",
|
||||
"pluginName": "Prometheus"
|
||||
}
|
||||
],
|
||||
"__elements": [],
|
||||
"__requires": [
|
||||
{
|
||||
"type": "grafana",
|
||||
"id": "grafana",
|
||||
"name": "Grafana",
|
||||
"version": "8.5.4"
|
||||
},
|
||||
{
|
||||
"type": "datasource",
|
||||
"id": "prometheus",
|
||||
"name": "Prometheus",
|
||||
"version": "1.0.0"
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "timeseries",
|
||||
"name": "Time series",
|
||||
"version": ""
|
||||
}
|
||||
],
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "grafana",
|
||||
"uid": "-- Grafana --"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"target": {
|
||||
"limit": 100,
|
||||
"matchAny": false,
|
||||
"tags": [],
|
||||
"type": "dashboard"
|
||||
},
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 16,
|
||||
"panels": [],
|
||||
"title": "Raking",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"options": {
|
||||
"200": {
|
||||
"color": "green",
|
||||
"index": 0
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
},
|
||||
{
|
||||
"options": {
|
||||
"pattern": "5\\d\\d",
|
||||
"result": {
|
||||
"color": "orange",
|
||||
"index": 1
|
||||
}
|
||||
},
|
||||
"type": "regex"
|
||||
},
|
||||
{
|
||||
"options": {
|
||||
"pattern": "4\\d\\d",
|
||||
"result": {
|
||||
"color": "red",
|
||||
"index": 2
|
||||
}
|
||||
},
|
||||
"type": "regex"
|
||||
},
|
||||
{
|
||||
"options": {
|
||||
"pattern": "3\\d\\d",
|
||||
"result": {
|
||||
"color": "purple",
|
||||
"index": 3
|
||||
}
|
||||
},
|
||||
"type": "regex"
|
||||
}
|
||||
],
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "hertz"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"id": 10,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "rate(quickpeep_qprake_rake_status_count[1m])",
|
||||
"legendFormat": "{{status}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Response Codes",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "binBps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 1
|
||||
},
|
||||
"id": 12,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "rate(quickpeep_emitted_pack_bytes[1m])",
|
||||
"legendFormat": "{{pack}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "RakePack Write Rate",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 9
|
||||
},
|
||||
"id": 14,
|
||||
"panels": [],
|
||||
"title": "Database",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "rows"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 10
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "quickpeep_db_entries",
|
||||
"legendFormat": "{{db}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Database Rows",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 10
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "quickpeep_db_size_bytes",
|
||||
"legendFormat": "{{db}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Database Size",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "hertz"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 18
|
||||
},
|
||||
"id": 4,
|
||||
"interval": "30s",
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": false,
|
||||
"expr": "delta(quickpeep_db_entries{db=\"visited_urls\"}[1m])",
|
||||
"instant": false,
|
||||
"legendFormat": "URLs",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Rake (Visit) Rate (by DB rows)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "hertz"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 18
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "delta(quickpeep_db_entries{db=\"queue_urls\"}[1m])",
|
||||
"legendFormat": "URLs",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Queue Growth/Reduction Rate (by DB rows)",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 36,
|
||||
"style": "dark",
|
||||
"tags": [],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-30m",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "QuickPeep Raking",
|
||||
"uid": "e4Z8Q_97k",
|
||||
"version": 2,
|
||||
"weekStart": ""
|
||||
}
|
|
@ -1,87 +0,0 @@
|
|||
{
|
||||
"nodes": {
|
||||
"naersk": {
|
||||
"inputs": {
|
||||
"nixpkgs": "nixpkgs"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1649096192,
|
||||
"narHash": "sha256-7O8e+eZEYeU+ET98u/zW5epuoN/xYx9G+CIh4DjZVzY=",
|
||||
"owner": "nix-community",
|
||||
"repo": "naersk",
|
||||
"rev": "d626f73332a8f587b613b0afe7293dd0777be07d",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-community",
|
||||
"repo": "naersk",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1648219316,
|
||||
"narHash": "sha256-Ctij+dOi0ZZIfX5eMhgwugfvB+WZSrvVNAyAuANOsnQ=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "30d3d79b7d3607d56546dd2a6b49e156ba0ec634",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"id": "nixpkgs",
|
||||
"type": "indirect"
|
||||
}
|
||||
},
|
||||
"nixpkgs_2": {
|
||||
"locked": {
|
||||
"lastModified": 1648219316,
|
||||
"narHash": "sha256-Ctij+dOi0ZZIfX5eMhgwugfvB+WZSrvVNAyAuANOsnQ=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "30d3d79b7d3607d56546dd2a6b49e156ba0ec634",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"id": "nixpkgs",
|
||||
"type": "indirect"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"naersk": "naersk",
|
||||
"nixpkgs": "nixpkgs_2",
|
||||
"src": "src",
|
||||
"utils": "utils"
|
||||
}
|
||||
},
|
||||
"src": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"narHash": "sha256-1/06n6MpN2m2LMteAwFqZ2qWADpGSo7MFQplLFEJRSc=",
|
||||
"path": "..",
|
||||
"type": "path"
|
||||
},
|
||||
"original": {
|
||||
"path": "..",
|
||||
"type": "path"
|
||||
}
|
||||
},
|
||||
"utils": {
|
||||
"locked": {
|
||||
"lastModified": 1648297722,
|
||||
"narHash": "sha256-W+qlPsiZd8F3XkzXOzAoR+mpFqzm3ekQkJNa+PIh1BQ=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "0f8662f1319ad6abf89b3380dd2722369fc51ade",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
|
@ -0,0 +1,88 @@
|
|||
flake: {config, pkgs, lib, ...}:
|
||||
|
||||
let
|
||||
cfg = config.services.quickpeepRaker;
|
||||
inherit (flake.packages.${pkgs.stdenv.hostPlatform.system}) quickpeep;
|
||||
in
|
||||
|
||||
with lib;
|
||||
|
||||
{
|
||||
options = {
|
||||
services.quickpeepRaker = {
|
||||
enable = mkOption {
|
||||
default = false;
|
||||
type = with types; bool;
|
||||
description = ''
|
||||
Start the QuickPeep Raker.
|
||||
'';
|
||||
};
|
||||
|
||||
user = mkOption {
|
||||
default = "quickpeep";
|
||||
type = with types; uniq str;
|
||||
description = ''
|
||||
Name of the user.
|
||||
'';
|
||||
};
|
||||
|
||||
# metricsBind = mkOption {
|
||||
# default = null;
|
||||
# example = "127.0.0.1:1234";
|
||||
# type = with types; nullOr str;
|
||||
# description = ''
|
||||
# Host and port upon which to bind the Prometheus/OpenMetrics interface.
|
||||
# '';
|
||||
# };
|
||||
|
||||
configPath = mkOption {
|
||||
type = with types; path;
|
||||
description = ''
|
||||
Config path to use, in RON format.
|
||||
'';
|
||||
};
|
||||
|
||||
concurrency = mkOption {
|
||||
type = types.int;
|
||||
default = 8;
|
||||
description = ''
|
||||
Number of concurrent fetches to allow.
|
||||
'';
|
||||
};
|
||||
|
||||
sleepers = mkOption {
|
||||
type = types.int;
|
||||
default = 56;
|
||||
description = ''
|
||||
An additional number of tasks to permit in the sleeping state.
|
||||
'';
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
config = mkIf cfg.enable {
|
||||
users.users."${cfg.user}" = {
|
||||
description = "QuickPeep User";
|
||||
isSystemUser = true;
|
||||
group = "${cfg.user}";
|
||||
};
|
||||
users.groups."${cfg.user}" = {};
|
||||
|
||||
systemd.services.quickpeepRaker = {
|
||||
wantedBy = [ "multi-user.target" ];
|
||||
after = [ "network.target" ];
|
||||
description = "Rakes websites and converts them to an indexable format.";
|
||||
|
||||
environment = {
|
||||
QUICKPEEP_CONFIG = cfg.configPath;
|
||||
};
|
||||
serviceConfig = {
|
||||
# TODO disable automatic restart?
|
||||
# TODO start it on a timer to ensure it picks up updates to stuff..?
|
||||
Type = "simple";
|
||||
User = "${cfg.user}";
|
||||
ExecStart = ''${quickpeep}/bin/qp-raker --config ${cfg.configPath} --concurrency ${builtins.toString cfg.concurrency} --sleepers ${builtins.toString cfg.sleepers}'';
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
|
@ -48,6 +48,23 @@ with lib;
|
|||
Config path to use, in RON format.
|
||||
'';
|
||||
};
|
||||
|
||||
workingDir = mkOption {
|
||||
type = with types; path;
|
||||
description = ''
|
||||
Path to a working directory to run the web interface and indexer from.
|
||||
This is the base from which paths in the config file are looked up from.
|
||||
'';
|
||||
};
|
||||
|
||||
autoIndexUrl = mkOption {
|
||||
default = null;
|
||||
type = with types; nullOr str;
|
||||
description = ''
|
||||
HTTP(S) URL to an index (list) of rakepacks.
|
||||
If specified, the indexer will periodically fetch new packs from that list and then add the pages within to the search index.
|
||||
'';
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -71,7 +88,23 @@ with lib;
|
|||
Type = "simple";
|
||||
User = "${cfg.user}";
|
||||
ExecStart = ''${quickpeep}/bin/quickpeep ${cfg.bindHost}:${builtins.toString cfg.bindPort}'';
|
||||
WorkingDirectory = cfg.workingDir;
|
||||
};
|
||||
};
|
||||
|
||||
systemd.services.quickpeepIndex = mkIf (cfg.autoIndexUrl != null) {
|
||||
after = [ "network.target" ];
|
||||
description = "Fetches rakepacks from a feed and adds pages to the search index.";
|
||||
serviceConfig = {
|
||||
Type = "simple";
|
||||
User = "${cfg.user}";
|
||||
ExecStart = ''${quickpeep}/bin/qp-indexer --config ${lib.strings.escapeShellArg cfg.configPath} --feed ${lib.strings.escapeShellArg cfg.autoIndexUrl}'';
|
||||
WorkingDirectory = cfg.workingDir;
|
||||
};
|
||||
};
|
||||
|
||||
# TODO systemd.timers.quickpeepIndex = mkIf (cfg.autoIndexUrl != null) {
|
||||
#
|
||||
# };
|
||||
};
|
||||
}
|
|
@ -63,6 +63,8 @@
|
|||
],
|
||||
|
||||
sqlite_db_path: "data/dev_qp_web.sqlite3",
|
||||
|
||||
public_base: "http://127.0.0.1:9001",
|
||||
),
|
||||
|
||||
// Index (indexer, web)
|
||||
|
@ -96,5 +98,11 @@
|
|||
pack_emitter: (
|
||||
|
||||
),
|
||||
|
||||
rerake_timings: (
|
||||
page: 300,
|
||||
icon: 365,
|
||||
feed: 10,
|
||||
)
|
||||
),
|
||||
)
|
||||
|
|
|
@ -27,8 +27,9 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
.context("Must specify output file as arg № 1! :)")?,
|
||||
);
|
||||
|
||||
let config_path =
|
||||
PathBuf::from(std::env::var("QP_WEB_CONFIG").unwrap_or_else(|_| "qp_web.ron".to_owned()));
|
||||
let config_path = PathBuf::from(
|
||||
std::env::var("QP_WEB_CONFIG").unwrap_or_else(|_| "quickpeep.ron".to_owned()),
|
||||
);
|
||||
|
||||
if !config_path.exists() {
|
||||
bail!(
|
||||
|
|
|
@ -24,8 +24,9 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
)
|
||||
.init();
|
||||
|
||||
let config_path =
|
||||
PathBuf::from(std::env::var("QP_WEB_CONFIG").unwrap_or_else(|_| "qp_web.ron".to_owned()));
|
||||
let config_path = PathBuf::from(
|
||||
std::env::var("QP_WEB_CONFIG").unwrap_or_else(|_| "quickpeep.ron".to_owned()),
|
||||
);
|
||||
|
||||
if !config_path.exists() {
|
||||
bail!(
|
||||
|
|
|
@ -7,9 +7,11 @@ use env_logger::Env;
|
|||
use log::info;
|
||||
use quickpeep::config::WebConfig;
|
||||
use quickpeep::web::icon_retrieval::retrieve_icon;
|
||||
use quickpeep::web::metadata::get_opensearch_xml;
|
||||
use quickpeep::web::searcher::{search_root, search_search};
|
||||
use quickpeep::web::seed_collector::{seed_collection_root, seed_collection_root_post};
|
||||
use quickpeep::web::IndexAccess;
|
||||
use quickpeep_index::auxiliary::icon_store::IconStore;
|
||||
use sqlx::sqlite::SqlitePoolOptions;
|
||||
use std::net::SocketAddr;
|
||||
use std::path::PathBuf;
|
||||
|
@ -67,6 +69,7 @@ async fn main() -> anyhow::Result<()> {
|
|||
let backend = Arc::new(web_config.open_indexer_backend()?);
|
||||
|
||||
let index_access = IndexAccess { backend };
|
||||
let icon_store = IconStore::open(web_config.index.icon_store.as_path())?;
|
||||
|
||||
let app = Router::new()
|
||||
.route("/seeds/", get(seed_collection_root))
|
||||
|
@ -74,9 +77,11 @@ async fn main() -> anyhow::Result<()> {
|
|||
.route("/", get(search_root))
|
||||
.route("/search", get(search_search))
|
||||
.route("/icon.webp", get(retrieve_icon))
|
||||
.route("/opensearch.xml", get(get_opensearch_xml))
|
||||
.layer(Extension(web_config))
|
||||
.layer(Extension(pool))
|
||||
.layer(Extension(index_access))
|
||||
.layer(Extension(Arc::new(icon_store)))
|
||||
.nest(
|
||||
"/static",
|
||||
get_service(ServeDir::new("./quickpeep_static/dist")).handle_error(
|
||||
|
|
|
@ -22,7 +22,7 @@ pub struct WebConfig {
|
|||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub struct IndexConfig {
|
||||
pub backend: BackendConfig,
|
||||
// TODO icon_store
|
||||
pub icon_store: PathBuf,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
|
@ -31,6 +31,10 @@ pub struct WebOnlyConfig {
|
|||
pub sqlite_db_path: PathBuf,
|
||||
/// Name, URL pairs
|
||||
pub contact: Vec<(String, String)>,
|
||||
|
||||
/// URL prefix for QuickPeep. Should include protocol. No trailing slash.
|
||||
/// Example: https://quickpeep.net
|
||||
pub public_base: String,
|
||||
}
|
||||
|
||||
impl WebConfig {
|
||||
|
@ -49,6 +53,8 @@ impl WebConfig {
|
|||
BackendConfig::Meili(_) => {}
|
||||
}
|
||||
|
||||
web_config.index.icon_store = config_dir.join(web_config.index.icon_store);
|
||||
|
||||
Ok(web_config)
|
||||
}
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ use quickpeep_index::backend::Backend;
|
|||
use std::sync::Arc;
|
||||
|
||||
pub mod icon_retrieval;
|
||||
pub mod metadata;
|
||||
pub mod searcher;
|
||||
pub mod seed_collector;
|
||||
|
||||
|
|
|
@ -0,0 +1,28 @@
|
|||
use crate::config::WebConfig;
|
||||
use axum::extract::Extension;
|
||||
use axum::response::{IntoResponse, Response};
|
||||
|
||||
pub async fn get_opensearch_xml(Extension(web_config): Extension<WebConfig>) -> impl IntoResponse {
|
||||
let public_base = &web_config.web.public_base;
|
||||
let formatted = format!(
|
||||
r#"
|
||||
<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/"
|
||||
xmlns:moz="http://www.mozilla.org/2006/browser/search/">
|
||||
<ShortName>QuickPeep</ShortName>
|
||||
<Description>small-scale web search engine</Description>
|
||||
<InputEncoding>UTF-8</InputEncoding>
|
||||
<Image width="16" height="16" type="image/x-icon">{public_base}/favicon.ico</Image>
|
||||
<Url type="text/html" template="{public_base}/search?q=%s"/>
|
||||
</OpenSearchDescription>
|
||||
"#
|
||||
);
|
||||
|
||||
// Extras for the future:
|
||||
// <Url type="application/x-suggestions+json" template="[suggestionURL]"/>
|
||||
// <moz:SearchForm>[https://example.com/search]</moz:SearchForm>
|
||||
|
||||
Response::builder()
|
||||
.header("content-type", "application/opensearchdescription+xml")
|
||||
.body(formatted.into_response())
|
||||
.unwrap()
|
||||
}
|
|
@ -4,7 +4,9 @@ use crate::webutil::{internal_error, TemplatedHtml};
|
|||
use askama::Template;
|
||||
use axum::extract::{Extension, Query};
|
||||
use axum::response::IntoResponse;
|
||||
use quickpeep_index::auxiliary::icon_store::IconStore;
|
||||
use serde::Deserialize;
|
||||
use std::sync::Arc;
|
||||
|
||||
#[derive(Clone, Template)]
|
||||
#[template(path = "search.html.askama")]
|
||||
|
@ -12,11 +14,12 @@ pub struct SearchTemplate {
|
|||
pub search_term: String,
|
||||
pub results: Vec<SearchResult>,
|
||||
pub contact: Vec<(String, String)>,
|
||||
pub show_spiel: bool,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct SearchResult {
|
||||
pub favicon_url: String,
|
||||
pub favicon_url: Option<String>,
|
||||
pub url: String,
|
||||
pub title: String,
|
||||
pub excerpt: String,
|
||||
|
@ -34,6 +37,7 @@ pub async fn search_root(Extension(web_config): Extension<WebConfig>) -> impl In
|
|||
search_term: String::with_capacity(0),
|
||||
results: vec![],
|
||||
contact: web_config.web.contact.clone(),
|
||||
show_spiel: true,
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -47,8 +51,9 @@ pub async fn search_search(
|
|||
web_config: Extension<WebConfig>,
|
||||
index_access: Extension<IndexAccess>,
|
||||
params: Query<QueryParameters>,
|
||||
icon_store: Extension<Arc<IconStore>>,
|
||||
) -> impl IntoResponse {
|
||||
search_search_inner(web_config, index_access, params)
|
||||
search_search_inner(web_config, index_access, params, icon_store)
|
||||
.await
|
||||
.map_err(internal_error)
|
||||
}
|
||||
|
@ -57,14 +62,22 @@ pub async fn search_search_inner(
|
|||
Extension(web_config): Extension<WebConfig>,
|
||||
Extension(index_access): Extension<IndexAccess>,
|
||||
Query(params): Query<QueryParameters>,
|
||||
Extension(icon_store): Extension<Arc<IconStore>>,
|
||||
) -> anyhow::Result<impl IntoResponse> {
|
||||
let raw_results = index_access.backend.query(params.q.clone())?;
|
||||
|
||||
let mut results = Vec::with_capacity(raw_results.len());
|
||||
|
||||
let txn = icon_store.env.ro_txn()?;
|
||||
|
||||
for search_doc in raw_results {
|
||||
let favicon_url_hash: [u8; 8] = search_doc.favicon_url_hash.to_le_bytes();
|
||||
let favicon_url = icon_store
|
||||
.icons
|
||||
.get(&txn, &favicon_url_hash)?
|
||||
.map(|_| format!("{:016x}", search_doc.favicon_url_hash));
|
||||
results.push(SearchResult {
|
||||
favicon_url: format!("{:16x}", search_doc.favicon_url_hash),
|
||||
favicon_url,
|
||||
url: search_doc.url,
|
||||
title: search_doc.title,
|
||||
excerpt: search_doc.excerpt,
|
||||
|
@ -76,5 +89,6 @@ pub async fn search_search_inner(
|
|||
search_term: params.q.clone(),
|
||||
results,
|
||||
contact: web_config.web.contact.clone(),
|
||||
show_spiel: false,
|
||||
}))
|
||||
}
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
<header>
|
||||
<form method="GET" action="search">
|
||||
<fieldset class="horizontal">
|
||||
<img src="/static/quickpeep_logo_sml.png" class="bar_logo">
|
||||
<a href="/" title="QuickPeep"><img src="/static/quickpeep_logo_sml.png" alt="QuickPeep Logo" class="bar_logo"></a>
|
||||
<input type="search" id="search" name="q" placeholder="..." value="{{ search_term }}" class="grow">
|
||||
|
||||
<input type="submit" value="Search" class="shrink">
|
||||
|
@ -23,10 +23,28 @@
|
|||
|
||||
<!-- Main -->
|
||||
<main class="search">
|
||||
{% if show_spiel %}
|
||||
<p>
|
||||
QuickPeep is a hobbyist, open-source and very immature (for now) web search engine. It's intended to help you encounter webpages that are interesting and from a real person, rather than from a 'content mill' or other source of SEO spam. In general, websites that don't respect the reader are unwelcome.
|
||||
</p>
|
||||
<p>
|
||||
QuickPeep's approach to rubbish websites is to 'just' not index them! This also helps with another goal of the project, which is to allow anyone to run an instance of QuickPeep with only modest hardware requirements (especially storage space which could easily be problematic).
|
||||
</p>
|
||||
<p>
|
||||
This is an ambitious project and it is probably not very usable right now. It may never be. With that said, I'm hoping to see how far I can take it.
|
||||
</p>
|
||||
<p>
|
||||
There is an <a href="https://o.librepush.net/blog/2022-07-02-quickpeep-small-scale-web-search-engine">article introducing the project on my personal blog</a>.<br>
|
||||
The source code is <a href="https://git.emunest.net/reivilibre/quickpeep.git">available on my personal Gitea instance</a>.
|
||||
</p>
|
||||
{% endif %}
|
||||
|
||||
<ul class="search_results">
|
||||
{%- for result in results %}
|
||||
<li>
|
||||
<img src="/icon.webp?b={{ result.favicon_url }}">
|
||||
{%- if result.favicon_url.is_some() -%}
|
||||
<img src="/icon.webp?b={{ result.favicon_url.as_ref().unwrap() }}">
|
||||
{%- endif -%}
|
||||
<div class="result_title"><a href="{{ result.url }}" rel="nofollow noreferrer">{{ result.title }}</a></div>
|
||||
<div class="result_excerpt">
|
||||
{{- result.excerpt|safe -}}
|
||||
|
|
|
@ -73,7 +73,7 @@
|
|||
<h3>Open Data</h3>
|
||||
<label for="opendata_agree">
|
||||
<input type="checkbox" role="switch" id="opendata_agree" name="opendata_agree" value="true" required>
|
||||
I'm happy for this data to be Open Data under the <a href="https://bics.ga/reivilibre/quickpeep_seeds/src/branch/main/LICENCE" target="_blank">CC0 licence</a>.
|
||||
I'm happy for this data to be Open Data under the <a href="https://git.emunest.net/reivilibre/quickpeep_seeds/src/branch/main/LICENCE" target="_blank">CC0 licence</a>.
|
||||
</label>
|
||||
|
||||
<input type="submit" value="Submit seed">
|
||||
|
|
|
@ -287,7 +287,8 @@ impl DenseTreeBuilder {
|
|||
let nofollow = attrs
|
||||
.get("rel")
|
||||
.map(|rel: &str| {
|
||||
rel.split_whitespace()
|
||||
rel.split(|c: char| c.is_whitespace() || c == ',')
|
||||
.filter(|s| !s.is_empty())
|
||||
.any(|rel_word: &str| rel_word.eq_ignore_ascii_case("nofollow"))
|
||||
})
|
||||
.unwrap_or(false);
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
[package]
|
||||
name = "quickpeep_html_charset_detection"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
encoding_rs = "0.8.31"
|
||||
subslice = "0.2.3"
|
||||
chardetng = "0.1.17"
|
|
@ -0,0 +1,114 @@
|
|||
use crate::steps::{bom_sniff, prescan, BOM_SNIFF_NEEDED_BYTES};
|
||||
use chardetng::EncodingDetector;
|
||||
use encoding_rs::Encoding;
|
||||
|
||||
pub mod steps;
|
||||
|
||||
/// The spec requires document authors to place their <meta> tags in the first 1024 bytes.
|
||||
pub const SNIFF_WINDOW_SIZE: usize = 1024;
|
||||
|
||||
/// Attempts to implement the 'certain' stages of the encoding sniffing algorithm described at:
|
||||
/// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
|
||||
///
|
||||
/// You should pass in at least the first 3 bytes as the `sniff_window` here.
|
||||
pub fn sniff_with_certain_confidence(
|
||||
sniff_window: &[u8],
|
||||
content_type_header: Option<&[u8]>,
|
||||
) -> Option<&'static Encoding> {
|
||||
// 1. BOM sniffing.
|
||||
if sniff_window.len() > BOM_SNIFF_NEEDED_BYTES {
|
||||
if let Some(encoding) = bom_sniff(
|
||||
sniff_window[0..BOM_SNIFF_NEEDED_BYTES]
|
||||
.try_into()
|
||||
.expect("checked size cast"),
|
||||
) {
|
||||
return Some(encoding);
|
||||
}
|
||||
}
|
||||
|
||||
// 2. User override (Not implemented)
|
||||
|
||||
// 3. 'Wait for bytes' — we already have 1024
|
||||
|
||||
// 4. If the transport layer specifies an encoding, return as certain.
|
||||
extract_encoding_from_content_type_header(content_type_header.unwrap_or(b""))
|
||||
}
|
||||
|
||||
/// Implementing the all the stages of the encoding sniffing algorithm described at:
|
||||
/// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
|
||||
/// except where those stages rely on other information (e.g. 'prior knowledge', a user default
|
||||
/// or a parent window context).
|
||||
///
|
||||
/// You should pass in at least the first 1024 bytes as the `sniff_window` here.
|
||||
pub fn sniff(
|
||||
sniff_window: &[u8],
|
||||
is_sniff_window_the_entire_document: bool,
|
||||
content_type_header: Option<&[u8]>,
|
||||
) -> &'static Encoding {
|
||||
if let Some(certain) = sniff_with_certain_confidence(sniff_window, content_type_header) {
|
||||
return certain;
|
||||
}
|
||||
|
||||
// 5. Optionally prescan the byte stream to determine its encoding
|
||||
if let Some(prescan_tentative) = prescan(sniff_window) {
|
||||
return prescan_tentative;
|
||||
}
|
||||
|
||||
// 8. The user agent may attempt to autodetect the character encoding from applying frequency
|
||||
// analysis or other algorithms to the data stream. Such algorithms may use information about
|
||||
// the resource other than the resource's contents, including the address of the resource.
|
||||
// If autodetection succeeds in determining a character encoding, and that encoding is a
|
||||
// supported encoding, then return that encoding, with the confidence tentative.
|
||||
let mut detector = EncodingDetector::new();
|
||||
detector.feed(sniff_window, is_sniff_window_the_entire_document);
|
||||
// 'Allow UTF-8' should be set to false for non-file: URIs, apparently.
|
||||
// We do that here, though I'm not sure this is what we actually want outside of a browser...
|
||||
detector.guess(None, false)
|
||||
}
|
||||
|
||||
pub fn extract_encoding_from_content_type_header(
|
||||
content_type_header: &[u8],
|
||||
) -> Option<&'static Encoding> {
|
||||
for header_part in content_type_header.split(|b| *b == b';') {
|
||||
// To-UTF-8-lossy is definitely not spec compliant, but trim_ascii() on byte slices is
|
||||
// unstable, so let's just help ourselves out of a pickle.
|
||||
let header_part_ascii_ish = String::from_utf8_lossy(header_part);
|
||||
let key_value: Vec<&str> = header_part_ascii_ish.trim().split("=").collect();
|
||||
let key = key_value.get(0).cloned().unwrap_or("");
|
||||
if key.to_ascii_lowercase() == "charset" {
|
||||
let value = key_value.get(1).cloned().unwrap_or("");
|
||||
return Encoding::for_label(value.as_bytes());
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use crate::sniff;

    /// With no hints at all, pure-ASCII content falls through to the statistical detector.
    #[test]
    fn test_no_hints_falls_back_to_detector() {
        let detected = sniff(b"<u>hi</u>", true, None);
        assert_eq!(detected.name(), "windows-1252");
    }

    /// A `<meta charset>` tag is picked up by the prescan.
    #[test]
    fn test_meta_charset() {
        let detected = sniff(b"<meta charset=UTF8><u>hi</u>", true, None);
        assert_eq!(detected.name(), "UTF-8");
    }

    /// The transport-layer header takes priority over anything in the document.
    #[test]
    fn test_content_type_header_beats_meta() {
        let header = "text/html; charset=Shift-JIS".as_bytes();
        let detected = sniff(b"<meta charset=UTF8><u>hi</u>", true, Some(header));
        assert_eq!(detected.name(), "Shift_JIS");
    }

    /// `<meta>` tags inside comments are ignored; the pragma form is honoured.
    #[test]
    fn test_commented_meta_is_skipped() {
        let document = b"<!-- haha we wish <meta charset=UTF8> --><meta http-equiv='content-type' content='text/html; charset=Shift-JIS'><u>hi</u>";
        let detected = sniff(document, true, None);
        assert_eq!(detected.name(), "Shift_JIS");
    }
}
|
|
@ -0,0 +1,352 @@
|
|||
use crate::extract_encoding_from_content_type_header;
|
||||
use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8, WINDOWS_1252, X_USER_DEFINED};
|
||||
use subslice::SubsliceExt;
|
||||
|
||||
pub const BOM_SNIFF_NEEDED_BYTES: usize = 3;
|
||||
|
||||
/// Implements BOM sniffing
|
||||
/// https://encoding.spec.whatwg.org/#bom-sniff
|
||||
pub fn bom_sniff(first_3_bytes: [u8; 3]) -> Option<&'static Encoding> {
|
||||
if first_3_bytes == [0xEF, 0xBB, 0xBF] {
|
||||
Some(&UTF_8)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
|
||||
// Let fallback encoding be null.
|
||||
// TODO let fallback = None; ??
|
||||
|
||||
// Let position be a pointer to a byte in the input byte stream, initially pointing at the first byte.
|
||||
let mut position = 0;
|
||||
|
||||
// Prescan for UTF-16 XML declarations: If position points to:
|
||||
//
|
||||
// A sequence of bytes starting with: 0x3C, 0x0, 0x3F, 0x0, 0x78, 0x0 (case-sensitive UTF-16 little-endian '<?x')
|
||||
if bytes.starts_with(&[0x3C, 0x0, 0x3F, 0x0, 0x78, 0x0]) {
|
||||
// Return UTF-16LE.
|
||||
return Some(UTF_16LE);
|
||||
}
|
||||
|
||||
// A sequence of bytes starting with: 0x0, 0x3C, 0x0, 0x3F, 0x0, 0x78 (case-sensitive UTF-16 big-endian '<?x')
|
||||
if bytes.starts_with(&[0x0, 0x3C, 0x0, 0x3F, 0x0, 0x78]) {
|
||||
// Return UTF-16BE.
|
||||
return Some(UTF_16BE);
|
||||
}
|
||||
// For historical reasons, the prefix is two bytes longer than in Appendix F of XML and the encoding name is not checked.
|
||||
|
||||
// Loop: If position points to:
|
||||
while position < bytes.len() {
|
||||
// A sequence of bytes starting with: 0x3C 0x21 0x2D 0x2D (`<!--`)
|
||||
if bytes[position..].starts_with(b"<!--") {
|
||||
// Advance the position pointer so that it points at the first 0x3E byte which is
|
||||
// preceded by two 0x2D bytes (i.e. at the end of an ASCII '-->' sequence) and comes
|
||||
// after the 0x3C byte that was found. (The two 0x2D bytes can be the same as those in
|
||||
// the '<!--' sequence.)
|
||||
match bytes[position..].find(b"-->") {
|
||||
Some(location_of_closer) => {
|
||||
position += location_of_closer + 3;
|
||||
continue;
|
||||
}
|
||||
None => {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// A sequence of bytes starting with: 0x3C, 0x4D or 0x6D, 0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61, and one of 0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F (case-insensitive ASCII '<meta' followed by a space or slash)
|
||||
let check = bytes
|
||||
.get(position..position + b"<meta ".len())?
|
||||
.to_ascii_lowercase();
|
||||
let ends_in_whitespace = bytes
|
||||
.get(position + b"<meta".len())
|
||||
.map(|c| matches!(c, 0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F))
|
||||
.unwrap_or(false);
|
||||
if check.starts_with(b"<meta") && ends_in_whitespace {
|
||||
// Advance the position pointer so that it points at the next 0x09, 0x0A, 0x0C, 0x0D, 0x20, or 0x2F byte (the one in sequence of characters matched above).
|
||||
position += 6;
|
||||
|
||||
// Let attribute list be an empty list of strings.
|
||||
let mut attributes = Vec::new();
|
||||
|
||||
// Let got pragma be false.
|
||||
let mut got_pragma = false;
|
||||
|
||||
// Let need pragma be null.
|
||||
let mut need_pragma = None;
|
||||
|
||||
// Let charset be the null value (which, for the purposes of this algorithm, is distinct from an unrecognized encoding or the empty string).
|
||||
// We'll represent 'unrecognised' as Some(None) and 'empty string' as Some(Some("")).
|
||||
let mut charset: Option<Option<&'static Encoding>> = None;
|
||||
|
||||
// Attributes: Get an attribute and its value. If no attribute was sniffed, then jump to the processing step below.
|
||||
while let Some((key, value)) = prescan_get_attribute(&bytes, &mut position) {
|
||||
// If the attribute's name is already in attribute list, then return to the step labeled attributes.
|
||||
if attributes.contains(&key) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Add the attribute's name to attribute list.
|
||||
attributes.push(key.clone());
|
||||
|
||||
// Run the appropriate step from the following list, if one applies:
|
||||
|
||||
match &key.to_ascii_lowercase()[..] {
|
||||
// If the attribute's name is "http-equiv"
|
||||
b"http-equiv" => {
|
||||
// If the attribute's value is "content-type", then set got pragma to true.
|
||||
if value.to_ascii_lowercase() == b"content-type" {
|
||||
got_pragma = true;
|
||||
}
|
||||
}
|
||||
// If the attribute's name is "content"
|
||||
b"content" => {
|
||||
// Apply the algorithm for extracting a character encoding from a meta element,
|
||||
// giving the attribute's value as the string to parse.
|
||||
// If a character encoding is returned, and if charset is still set to null,
|
||||
// let charset be the encoding returned, and set need pragma to true.
|
||||
let content_charset = extract_encoding_from_content_type_header(&value);
|
||||
if let Some(content_charset) = content_charset {
|
||||
if charset.is_none() {
|
||||
charset = Some(Some(content_charset));
|
||||
need_pragma = Some(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
// If the attribute's name is "charset"
|
||||
b"charset" => {
|
||||
// Let charset be the result of getting an encoding from the attribute's value,
|
||||
// and set need pragma to false.
|
||||
charset = Some(Encoding::for_label(&value));
|
||||
need_pragma = Some(false);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
// Return to the step labeled attributes.
|
||||
}
|
||||
|
||||
// Processing: If need pragma is null, then jump to the step below labeled next byte.
|
||||
if let Some(need_pragma) = need_pragma {
|
||||
// If need pragma is true but got pragma is false, then jump to the step below labeled next byte.
|
||||
if need_pragma && !got_pragma {
|
||||
position += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
// If charset is failure, then jump to the step below labeled next byte.
|
||||
if charset == Some(None) {
|
||||
position += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
// If charset is UTF-16BE/LE, then set charset to UTF-8.
|
||||
if charset
|
||||
.map(|i| i.map(|i| i.name() == UTF_16BE.name() || i.name() == UTF_16LE.name()))
|
||||
.flatten()
|
||||
.unwrap_or(false)
|
||||
{
|
||||
charset = Some(Some(UTF_8));
|
||||
}
|
||||
|
||||
// If charset is x-user-defined, then set charset to windows-1252.
|
||||
if charset
|
||||
.map(|i| i.map(|i| i.name() == X_USER_DEFINED.name()))
|
||||
.flatten()
|
||||
.unwrap_or(false)
|
||||
{
|
||||
charset = Some(Some(WINDOWS_1252));
|
||||
}
|
||||
|
||||
// Return charset.
|
||||
return charset.flatten();
|
||||
} else {
|
||||
position += 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// A sequence of bytes starting with a 0x3C byte (<), optionally a 0x2F byte (/), and finally a byte in the range 0x41-0x5A or 0x61-0x7A (A-Z or a-z)
|
||||
if (bytes[position..].starts_with(b"</")
|
||||
&& bytes[position..].len() > 3
|
||||
&& bytes[position + 2].is_ascii_alphabetic())
|
||||
|| (bytes[position..].starts_with(b"<")
|
||||
&& bytes[position..].len() > 2
|
||||
&& bytes[position + 1].is_ascii_alphabetic())
|
||||
{
|
||||
// Advance the position pointer so that it points at the next 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>) byte.
|
||||
while position < bytes.len()
|
||||
&& !matches!(bytes[position], 0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x3E)
|
||||
{
|
||||
position += 1;
|
||||
}
|
||||
|
||||
// Repeatedly get an attribute until no further attributes can be found, then jump to the step below labeled next byte.
|
||||
while prescan_get_attribute(&bytes, &mut position).is_some() {}
|
||||
position += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
// A sequence of bytes starting with: 0x3C 0x21 (`<!`)
|
||||
// A sequence of bytes starting with: 0x3C 0x2F (`</`)
|
||||
// A sequence of bytes starting with: 0x3C 0x3F (`<?`)
|
||||
if bytes[position..].starts_with(b"<!")
|
||||
|| bytes[position..].starts_with(b"</")
|
||||
|| bytes[position..].starts_with(b"<?")
|
||||
{
|
||||
// Advance the position pointer so that it points at the first 0x3E byte (>) that comes after the 0x3C byte that was found.
|
||||
match bytes[position..].find(b">") {
|
||||
None => {
|
||||
return None;
|
||||
}
|
||||
Some(at) => {
|
||||
position += at + 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Any other byte
|
||||
//
|
||||
// Do nothing with that byte.
|
||||
|
||||
// Next byte: Move position so it points at the next byte in the input byte stream, and return to the step above labeled loop.
|
||||
position += 1;
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Implements the 'get an attribute' step of the HTML specification's encoding prescan.
///
/// Starting at `*position`, skips leading whitespace and slashes, then parses one attribute.
/// Both the name and the value are ASCII-lowercased.
///
/// Returns:
/// - `Some((name, value))` when an attribute was parsed; `value` is empty when the attribute
///   has no value. `*position` is left just past whatever was consumed.
/// - `None` when `*position` is at a `>` (the tag has no further attributes) or when the input
///   ends before the attribute is complete.
///
/// Whitespace is permitted on either side of the `=`, so `charset = "utf-8"` yields
/// `("charset", "utf-8")`.
pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec<u8>, Vec<u8>)> {
    // If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP),
    // or 0x2F (/) then advance position to the next byte and redo this step.
    while matches!(
        bytes.get(*position),
        Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F)
    ) {
        *position += 1;
    }

    // If the byte at position is 0x3E (>), then abort: there isn't an attribute.
    if bytes.get(*position) == Some(&0x3E) {
        return None;
    }

    // Otherwise, the byte at position is the start of the attribute name.
    let mut attribute_name = Vec::new();
    let mut attribute_value = Vec::new();

    // Name: accumulate bytes until we reach the `=` that introduces the value.
    loop {
        match bytes.get(*position) {
            // `=` after a non-empty name: step over it and go on to the value.
            Some(0x3D) if !attribute_name.is_empty() => {
                *position += 1;
                break;
            }
            // Whitespace ends the name. (Spaces step.)
            Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20) => {
                while matches!(bytes.get(*position), Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20)) {
                    *position += 1;
                }

                // No `=` after the whitespace: the attribute has an empty value.
                if bytes.get(*position) != Some(&0x3D) {
                    return Some((attribute_name, Vec::new()));
                }

                // Advance past the `=` and go on to the value. Without this `break` the byte
                // after the `=` would wrongly be treated as part of the name.
                *position += 1;
                break;
            }
            // `/` or `>` ends the attribute: the value is the empty string.
            Some(0x2F | 0x3E) => {
                return Some((attribute_name, Vec::new()));
            }
            // Anything else (including a leading `=`) is part of the name, lowercased.
            // (It doesn't actually matter how bytes outside the ASCII range are handled here,
            // since only ASCII bytes can contribute to the detection of a character encoding.)
            Some(name_byte) => {
                attribute_name.push(name_byte.to_ascii_lowercase());
                *position += 1;
            }
            // Ran out of input in the middle of an attribute.
            None => return None,
        }
    }

    // Value: skip any whitespace between the `=` and the value.
    while matches!(bytes.get(*position), Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20)) {
        *position += 1;
    }

    // Quoted value: 0x22 (") or 0x27 (').
    if matches!(bytes.get(*position), Some(0x22 | 0x27)) {
        let quote_byte = bytes[*position];
        // Quote loop: collect lowercased bytes up to the matching closing quote.
        loop {
            *position += 1;
            let value_byte = *bytes.get(*position)?;
            if value_byte == quote_byte {
                // Step over the closing quote; the attribute is complete.
                *position += 1;
                return Some((attribute_name, attribute_value));
            }
            attribute_value.push(value_byte.to_ascii_lowercase());
        }
    }

    // `>` immediately after the `=`: the value is the empty string.
    if bytes.get(*position) == Some(&0x3E) {
        return Some((attribute_name, Vec::new()));
    }

    // Unquoted value: collect lowercased bytes until whitespace or `>`.
    // The terminating byte is left unconsumed for the caller.
    loop {
        attribute_value.push(bytes.get(*position)?.to_ascii_lowercase());
        *position += 1;

        if matches!(
            bytes.get(*position),
            Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x3E)
        ) {
            return Some((attribute_name, attribute_value));
        }
    }
}
|
||||
|
||||
/// Placeholder for the "get an XML encoding" step of charset detection.
///
/// Not implemented yet: it takes no input and unconditionally returns `None`.
fn get_an_xml_encoding() -> Option<&'static Encoding> {
|
||||
// TODO NOT IMPLEMENTED
|
||||
None
|
||||
}
|
|
@ -20,6 +20,6 @@ toml = "0.5.8"
|
|||
|
||||
blake3 = "1.3.1"
|
||||
|
||||
fancy_mdbx = "0.1.0"
|
||||
fancy_mdbx = "0.1.1"
|
||||
|
||||
quickpeep_structs = { path = "../quickpeep_structs" }
|
||||
|
|
|
@ -50,7 +50,7 @@ impl TantivyBackend {
|
|||
let dir_path = path.join("tantivy");
|
||||
|
||||
let (index, fields) = if dir_path.exists() {
|
||||
let index = Index::open_in_dir(&dir_path)?;
|
||||
let index = Index::open_in_dir(&dir_path).context("failed to open index")?;
|
||||
|
||||
let schema = index.schema();
|
||||
let mut field_map: HashMap<_, _> = schema
|
||||
|
@ -81,7 +81,8 @@ impl TantivyBackend {
|
|||
let schema = schema_builder.build();
|
||||
|
||||
std::fs::create_dir(&dir_path)?;
|
||||
let index = Index::create_in_dir(&dir_path, schema)?;
|
||||
let index =
|
||||
Index::create_in_dir(&dir_path, schema).context("failed to create index")?;
|
||||
|
||||
(index, fields)
|
||||
};
|
||||
|
@ -165,7 +166,11 @@ impl Backend for TantivyBackend {
|
|||
let reader = self.index.reader()?;
|
||||
let parser = QueryParser::new(
|
||||
self.index.schema(),
|
||||
vec![self.fields.title, self.fields.article, self.fields.article],
|
||||
vec![
|
||||
self.fields.title,
|
||||
self.fields.article,
|
||||
self.fields.nonarticle,
|
||||
],
|
||||
TokenizerManager::default(),
|
||||
);
|
||||
|
||||
|
@ -197,14 +202,27 @@ impl Backend for TantivyBackend {
|
|||
let snippet = article_snippet_generator.snippet(&doc_row.body);
|
||||
let excerpt = snippet.to_html();
|
||||
|
||||
let tags = doc
|
||||
.get_all(self.fields.tags)
|
||||
.map(|fv| {
|
||||
String::from(
|
||||
*fv.as_facet()
|
||||
.expect("tags must be facet!")
|
||||
.to_path()
|
||||
.last()
|
||||
.unwrap_or(&""),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
out.push(SearchDocument {
|
||||
score,
|
||||
title: doc_row.title,
|
||||
excerpt,
|
||||
tags: vec![],
|
||||
tags,
|
||||
url: url.to_owned(),
|
||||
favicon_url_hash: doc_row.favicon_url_hash,
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
Ok(out)
|
||||
|
|
|
@ -30,6 +30,11 @@ patricia_tree = "0.3.1"
|
|||
# For decompression of emitted packs. 0.11.1+zstd.1.5.2
|
||||
zstd = "0.11.1"
|
||||
|
||||
# HTTP Requests
|
||||
reqwest = { version = "0.11.9", features = ["blocking"] }
|
||||
|
||||
tempfile = "3.3.0"
|
||||
|
||||
quickpeep_densedoc = { path = "../quickpeep_densedoc" }
|
||||
quickpeep_index = { path = "../quickpeep_index" }
|
||||
quickpeep_structs = { path = "../quickpeep_structs" }
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
use anyhow::Context;
|
||||
use clap::Parser;
|
||||
use colour::{dark_grey_ln, grey_ln, yellow_ln};
|
||||
use colour::{dark_grey_ln, yellow_ln};
|
||||
use env_logger::Env;
|
||||
|
||||
use quickpeep_indexer::config::IndexerConfig;
|
||||
|
@ -26,7 +26,7 @@ pub fn main() -> anyhow::Result<()> {
|
|||
|
||||
let config_path = opts
|
||||
.config
|
||||
.unwrap_or_else(|| PathBuf::from("qp_indexer.toml"));
|
||||
.unwrap_or_else(|| PathBuf::from("quickpeep.ron"));
|
||||
let config = IndexerConfig::load(&config_path).context("Failed to load config")?;
|
||||
|
||||
let indexer_backend = config.open_indexer_backend()?;
|
||||
|
|
|
@ -2,9 +2,9 @@ use anyhow::{bail, Context};
|
|||
use clap::Parser;
|
||||
use colour::{blue, yellow_ln};
|
||||
use env_logger::Env;
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader};
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{BufRead, BufReader, Write};
|
||||
|
||||
use patricia_tree::PatriciaMap;
|
||||
use quickpeep_densedoc::DenseTree;
|
||||
|
@ -20,7 +20,8 @@ use quickpeep_structs::rake_entries::{
|
|||
};
|
||||
use quickpeep_utils::urls::get_reduced_domain;
|
||||
use smartstring::alias::CompactString;
|
||||
use std::path::PathBuf;
|
||||
use std::path::{Path, PathBuf};
|
||||
use tempfile::NamedTempFile;
|
||||
use tokio::sync::mpsc::Receiver;
|
||||
use url::Url;
|
||||
|
||||
|
@ -30,6 +31,11 @@ pub struct Opts {
|
|||
#[clap(long = "config")]
|
||||
config: Option<PathBuf>,
|
||||
|
||||
/// If specified, rakepacks from a feed will automatically be fetched and indexed.
|
||||
/// The rakepacks are tracked as having been processed.
|
||||
#[clap(long = "feed")]
|
||||
feed: Option<Url>,
|
||||
|
||||
rakepacks: Vec<PathBuf>,
|
||||
}
|
||||
|
||||
|
@ -44,9 +50,12 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
.unwrap_or_else(|| PathBuf::from("quickpeep.ron"));
|
||||
let config = IndexerConfig::load(&config_path).context("Failed to load config")?;
|
||||
|
||||
let icon_store = IconStore::open(config.index.icon_store.as_path())?;
|
||||
let icon_store =
|
||||
IconStore::open(config.index.icon_store.as_path()).context("failed to open icon store")?;
|
||||
|
||||
let seed_files = find_seed_files(config.seed_dir.clone(), SEED_EXTENSION).await?;
|
||||
let seed_files = find_seed_files(config.seed_dir.clone(), SEED_EXTENSION)
|
||||
.await
|
||||
.context("failed to find seed files")?;
|
||||
let (seed_tx, seed_rx) = tokio::sync::mpsc::channel(64);
|
||||
let handle = tokio::spawn(async move {
|
||||
seed_loader(seed_files, &seed_tx).await?;
|
||||
|
@ -59,43 +68,155 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
|
||||
let mut indexer_backend = config.open_indexer_backend()?;
|
||||
|
||||
if let Some(feed) = opts.feed {
|
||||
let processed_rakepack_path = config
|
||||
.processed_rakepack_path()
|
||||
.context("can't get a suitable location to track processed rakepacks")?;
|
||||
handle_pack_feed(
|
||||
feed,
|
||||
&mut indexer_backend,
|
||||
processed_rakepack_path,
|
||||
&seed_lookup,
|
||||
&icon_store,
|
||||
)
|
||||
.context("failed to handle pack feed")?;
|
||||
}
|
||||
|
||||
for pack in opts.rakepacks {
|
||||
blue!("Indexing: ");
|
||||
yellow_ln!("{:?}", pack);
|
||||
|
||||
let file = File::open(&pack)?;
|
||||
let decompressor = zstd::stream::Decoder::new(file)?;
|
||||
// TODO the decompressor has a buffer already, but we need this to see the end
|
||||
let mut buf_reader = BufReader::new(decompressor);
|
||||
let schema: String = serde_bare::from_reader(&mut buf_reader)?;
|
||||
|
||||
match schema.as_ref() {
|
||||
SCHEMA_RAKED_PAGES => {
|
||||
// TODO(unstable): this condition is `.has_data_left()` but it's unstable.
|
||||
while buf_reader.fill_buf().map(|b| !b.is_empty())? {
|
||||
handle_page_pack(&mut buf_reader, &seed_lookup, &mut indexer_backend)?;
|
||||
}
|
||||
}
|
||||
SCHEMA_RAKED_ICONS => {
|
||||
// TODO(unstable): this condition is `.has_data_left()` but it's unstable.
|
||||
while buf_reader.fill_buf().map(|b| !b.is_empty())? {
|
||||
handle_icon_pack(&mut buf_reader, &icon_store)?;
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
bail!(
|
||||
"Wrong schema version: wanted e.g. {:?}, got {:?}",
|
||||
SCHEMA_RAKED_PAGES,
|
||||
&schema
|
||||
);
|
||||
}
|
||||
}
|
||||
handle_pack(&pack, &mut indexer_backend, &seed_lookup, &icon_store)
|
||||
.with_context(|| format!("Whilst handling pack: {pack:?}"))?;
|
||||
}
|
||||
indexer_backend.flush()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Downloads and indexes every not-yet-processed rakepack advertised by a feed.
///
/// The feed at `feed_url` is scanned with `find_new_packs`. Each new pack name is resolved
/// relative to `feed_url`, downloaded into a temporary file, indexed with `handle_pack`, and
/// finally appended to the text file at `processed_list_path` so that it is skipped next time.
/// A pack is only recorded as processed after it has been indexed successfully.
///
/// # Errors
/// Returns an error (and stops at the offending pack) if the feed cannot be scanned, the
/// processed list cannot be opened or written, a pack cannot be downloaded, or indexing fails.
// NOTE(review): this uses `reqwest::blocking`; the caller appears to be an `async fn main` —
// confirm that blocking requests are acceptable in that context.
pub fn handle_pack_feed(
    feed_url: Url,
    indexer_backend: &mut Box<dyn Backend>,
    processed_list_path: PathBuf,
    seed_lookup: &SeedLookupTable,
    icon_store: &IconStore,
) -> anyhow::Result<()> {
    blue!("Scanning feed: ");
    yellow_ln!("{:?}", feed_url);

    let new_packs =
        find_new_packs(feed_url.clone(), &processed_list_path).context("finding new packs")?;
    let mut processed_log = OpenOptions::new()
        .append(true)
        .create(true)
        .open(&processed_list_path)
        .context("can't open processed list for append")?;

    for pack_name in new_packs {
        let pack_url = feed_url
            .join(&pack_name)
            .context("Can't resolve URL of new pack")?;

        blue!("Downloading: ");
        yellow_ln!("{:?}", pack_url);
        let mut temp_file = NamedTempFile::new().context("opening temp file")?;

        reqwest::blocking::get(pack_url.clone())
            .context("failed to request pack")?
            .error_for_status()
            .context("pack request returned an error status")?
            .copy_to(temp_file.as_file_mut())
            .context("failed to download pack to temp file")?;

        handle_pack(temp_file.path(), indexer_backend, seed_lookup, icon_store).with_context(
            || {
                format!(
                    "Whilst handling pack: {:?} ({:?})",
                    temp_file.path(),
                    pack_url
                )
            },
        )?;

        // `write_all` rather than `write`: a short write would otherwise record a truncated
        // pack name, and that pack would be downloaded and indexed again on the next run.
        // The leading newline keeps the on-disk format of existing processed lists.
        processed_log
            .write_all(format!("\n{}", &pack_name).as_bytes())
            .context("failed to record pack in processed list")?;
        processed_log.flush()?;
    }
    Ok(())
}
|
||||
|
||||
/// File-name suffixes of the rakepacks on a feed that are worth downloading and indexing.
const USEFUL_RAKEPACKS_TO_PULL_FROM_FEED: [&str; 2] = [".icons.pack", ".pages.pack"];
|
||||
|
||||
fn find_new_packs(feed_url: Url, processed_list_path: &Path) -> anyhow::Result<BTreeSet<String>> {
|
||||
let processed_file = OpenOptions::new()
|
||||
.read(true)
|
||||
.create(true)
|
||||
.open(processed_list_path)?;
|
||||
let br = BufReader::new(processed_file);
|
||||
let processed: Result<BTreeSet<String>, _> = br.lines().collect();
|
||||
let processed = processed.context("failed to read local processed list")?;
|
||||
|
||||
let mut unprocessed: BTreeSet<String> = BTreeSet::new();
|
||||
|
||||
let feed_lines = BufReader::new(reqwest::blocking::get(feed_url)?.error_for_status()?).lines();
|
||||
for line in feed_lines {
|
||||
let line = line?;
|
||||
if line.is_empty() {
|
||||
continue;
|
||||
}
|
||||
if !USEFUL_RAKEPACKS_TO_PULL_FROM_FEED
|
||||
.iter()
|
||||
.any(|ext| line.ends_with(ext))
|
||||
{
|
||||
// not a sort of rakepack we care about
|
||||
continue;
|
||||
}
|
||||
if processed.contains(&line) {
|
||||
continue;
|
||||
}
|
||||
unprocessed.insert(line.to_owned());
|
||||
}
|
||||
|
||||
Ok(unprocessed)
|
||||
}
|
||||
|
||||
/// Indexes a single zstd-compressed rakepack file.
///
/// The pack begins with a schema string which selects how the remaining records are handled:
/// page packs are fed to `handle_page_pack` (into `indexer_backend`), icon packs to
/// `handle_icon_pack` (into `icon_store`). Records are consumed until the stream is exhausted.
///
/// # Errors
/// Fails if the file cannot be opened or decompressed, if the schema string cannot be read or
/// is not a recognised schema, or if handling any record fails.
pub fn handle_pack(
    pack: &Path,
    indexer_backend: &mut Box<dyn Backend>,
    seed_lookup: &SeedLookupTable,
    icon_store: &IconStore,
) -> anyhow::Result<()> {
    // TODO(unstable): this is `BufRead::has_data_left()`, but that is unstable.
    fn more_to_read(reader: &mut impl BufRead) -> std::io::Result<bool> {
        reader.fill_buf().map(|pending| !pending.is_empty())
    }

    blue!("Indexing: ");
    yellow_ln!("{:?}", pack);

    let pack_file = File::open(pack)?;
    // TODO the decompressor has a buffer already, but we need this to see the end
    let mut records = BufReader::new(zstd::stream::Decoder::new(pack_file)?);
    let schema: String =
        serde_bare::from_reader(&mut records).context("failed to read schema ver")?;

    match schema.as_str() {
        SCHEMA_RAKED_PAGES => {
            while more_to_read(&mut records)? {
                handle_page_pack(&mut records, seed_lookup, indexer_backend)
                    .context("failed to handle page pack")?;
            }
        }
        SCHEMA_RAKED_ICONS => {
            while more_to_read(&mut records)? {
                handle_icon_pack(&mut records, icon_store)
                    .context("failed to handle icon pack")?;
            }
        }
        _ => bail!(
            "Wrong schema version: wanted e.g. {:?}, got {:?}",
            SCHEMA_RAKED_PAGES,
            schema
        ),
    }
    Ok(())
}
|
||||
|
||||
pub fn handle_page_pack(
|
||||
buf_reader: &mut impl BufRead,
|
||||
seed_lookup: &SeedLookupTable,
|
||||
|
@ -143,15 +264,17 @@ pub fn handle_page_pack(
|
|||
favicon_url_hash_long.copy_from_slice(&blake3::hash(favicon_url.as_bytes()).as_bytes()[0..8]);
|
||||
let favicon_url_hash = u64::from_le_bytes(favicon_url_hash_long);
|
||||
|
||||
indexer_backend.add_document(BackendIndependentDocument {
|
||||
title: document.head.title,
|
||||
article_body,
|
||||
nonarticle_body,
|
||||
// TODO populate tags & antifeatures
|
||||
tags,
|
||||
url: page_record.url.to_string(),
|
||||
favicon_url_hash,
|
||||
})?;
|
||||
indexer_backend
|
||||
.add_document(BackendIndependentDocument {
|
||||
title: document.head.title,
|
||||
article_body,
|
||||
nonarticle_body,
|
||||
// TODO populate tags & antifeatures
|
||||
tags,
|
||||
url: page_record.url.to_string(),
|
||||
favicon_url_hash,
|
||||
})
|
||||
.context("failed to add document to indexer backend")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -195,7 +318,8 @@ pub async fn build_seed_lookup_table(
|
|||
match &seed.url {
|
||||
UrlOrUrlPattern::Url(url_str) => {
|
||||
let url = Url::parse(url_str)?;
|
||||
let reduced_domain = get_reduced_domain(&url)?;
|
||||
let reduced_domain = get_reduced_domain(&url)
|
||||
.with_context(|| format!("No domain in seed: '{url}'!"))?;
|
||||
seed_lookup
|
||||
.by_reduced_domain
|
||||
.insert(reduced_domain.into(), seed);
|
||||
|
@ -215,7 +339,8 @@ impl SeedLookupTable {
|
|||
return Ok(Some(seed));
|
||||
}
|
||||
|
||||
let domain = get_reduced_domain(url)?;
|
||||
let domain = get_reduced_domain(url)
|
||||
.with_context(|| format!("No domain in looked up URL: '{url}'"))?;
|
||||
if let Some(seed) = self.by_reduced_domain.get(domain.as_ref()) {
|
||||
return Ok(Some(seed));
|
||||
}
|
||||
|
|
|
@ -45,9 +45,23 @@ impl IndexerConfig {
|
|||
}
|
||||
|
||||
pub fn open_indexer_backend(&self) -> anyhow::Result<Box<dyn Backend>> {
|
||||
match &self.index.backend {
|
||||
BackendConfig::Tantivy(tantivy) => Ok(Box::new(
|
||||
TantivyBackend::open(&tantivy.index_dir)
|
||||
.context("failed to open Tantivy backend")?,
|
||||
)),
|
||||
BackendConfig::Meili(_) => {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the path to a text file which can be used for storing a list of processed rakepacks
|
||||
/// (needed for following rakepack streams over a network).
|
||||
pub fn processed_rakepack_path(&self) -> anyhow::Result<PathBuf> {
|
||||
match &self.index.backend {
|
||||
BackendConfig::Tantivy(tantivy) => {
|
||||
Ok(Box::new(TantivyBackend::open(&tantivy.index_dir)?))
|
||||
Ok(tantivy.index_dir.join("processed_rakepacks.lst"))
|
||||
}
|
||||
BackendConfig::Meili(_) => {
|
||||
todo!()
|
||||
|
|
|
@ -58,6 +58,8 @@ arc-interner = "0.7.0"
|
|||
smartstring = "1.0.0"
|
||||
signal-hook = "0.3.13"
|
||||
nix = "0.23.1"
|
||||
quickpeep_html_charset_detection = { version = "0.1.0", path = "../quickpeep_html_charset_detection" }
|
||||
tikv-jemallocator = "0.5.0"
|
||||
|
||||
### Raking helpers
|
||||
# HTTP Requests
|
||||
|
@ -89,3 +91,6 @@ metrics = "0.18.1"
|
|||
metrics-exporter-prometheus = { version = "0.9.0", default-features = false, features = ["http-listener"] }
|
||||
metrics-process-promstyle = "0.18.0"
|
||||
bare-metrics-recorder = "0.1.0"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.3.0"
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
use clap::Parser;
|
||||
use std::borrow::Cow;
|
||||
|
||||
use std::fmt::Debug;
|
||||
|
||||
|
@ -14,7 +15,10 @@ use std::path::PathBuf;
|
|||
use quickpeep_raker::config;
|
||||
|
||||
use quickpeep_raker::storage::mdbx_helper_types::MdbxBare;
|
||||
use quickpeep_raker::storage::records::AllowedDomainRecord;
|
||||
use quickpeep_raker::storage::records::{
|
||||
ActiveDomainRecord, BackingOffDomainRecord, DomainRecord, OnHoldUrlRecord, QueueUrlRecord,
|
||||
UrlVisitedRecord,
|
||||
};
|
||||
use quickpeep_raker::storage::{RakerStore, RakerTxn};
|
||||
|
||||
/// Seeds a raker's queue with URLs
|
||||
|
@ -42,7 +46,7 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
|
||||
let config_path = opts
|
||||
.config
|
||||
.unwrap_or_else(|| PathBuf::from("qp_raker.toml"));
|
||||
.unwrap_or_else(|| PathBuf::from("quickpeep.ron"));
|
||||
let config = config::RakerConfig::load(&config_path).context("Failed to load config")?;
|
||||
|
||||
if !config.raker.workbench_dir.exists() {
|
||||
|
@ -59,11 +63,67 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
|
||||
let txn = store.ro_txn()?;
|
||||
match opts.table.as_ref() {
|
||||
"allowed_domains" => {
|
||||
inspect::<MdbxBare<AllowedDomainRecord>>(
|
||||
"queue_urls" | "urls_queue" => {
|
||||
inspect::<MdbxBare<QueueUrlRecord>>(
|
||||
opts.key_name.as_ref(),
|
||||
opts.prefix,
|
||||
&txn.mdbx.borrow_dbs().allowed_domains,
|
||||
&txn.mdbx.borrow_dbs().queue_urls,
|
||||
&txn,
|
||||
)?;
|
||||
}
|
||||
"active_domains" => {
|
||||
inspect::<MdbxBare<ActiveDomainRecord>>(
|
||||
opts.key_name.as_ref(),
|
||||
opts.prefix,
|
||||
&txn.mdbx.borrow_dbs().active_domains,
|
||||
&txn,
|
||||
)?;
|
||||
}
|
||||
"active_domains_raffle" => {
|
||||
inspect::<MdbxBare<String>>(
|
||||
opts.key_name.as_ref(),
|
||||
opts.prefix,
|
||||
&txn.mdbx.borrow_dbs().active_domain_raffle,
|
||||
&txn,
|
||||
)?;
|
||||
}
|
||||
"backing_off_reinstatements" => {
|
||||
inspect::<MdbxBare<String>>(
|
||||
opts.key_name.as_ref(),
|
||||
opts.prefix,
|
||||
&txn.mdbx.borrow_dbs().backing_off_reinstatements,
|
||||
&txn,
|
||||
)?;
|
||||
}
|
||||
"backing_off_domains" => {
|
||||
inspect::<MdbxBare<BackingOffDomainRecord>>(
|
||||
opts.key_name.as_ref(),
|
||||
opts.prefix,
|
||||
&txn.mdbx.borrow_dbs().backing_off_domains,
|
||||
&txn,
|
||||
)?;
|
||||
}
|
||||
"visited_urls" => {
|
||||
inspect::<MdbxBare<UrlVisitedRecord>>(
|
||||
opts.key_name.as_ref(),
|
||||
opts.prefix,
|
||||
&txn.mdbx.borrow_dbs().visited_urls,
|
||||
&txn,
|
||||
)?;
|
||||
}
|
||||
"domains" => {
|
||||
inspect::<MdbxBare<DomainRecord>>(
|
||||
opts.key_name.as_ref(),
|
||||
opts.prefix,
|
||||
&txn.mdbx.borrow_dbs().domains,
|
||||
&txn,
|
||||
)?;
|
||||
}
|
||||
"urls_on_hold" => {
|
||||
inspect::<MdbxBare<OnHoldUrlRecord>>(
|
||||
opts.key_name.as_ref(),
|
||||
opts.prefix,
|
||||
&txn.mdbx.borrow_dbs().urls_on_hold,
|
||||
&txn,
|
||||
)?;
|
||||
}
|
||||
|
@ -85,13 +145,22 @@ impl<T: Debug> Inspectable for MdbxBare<T> {
|
|||
}
|
||||
}
|
||||
|
||||
fn inspect<'a, IV: Inspectable + TableObject<'a>>(
|
||||
fn inspect<'a, IV: Inspectable + TableObject<'a> + 'static>(
|
||||
key: &str,
|
||||
prefix: bool,
|
||||
database: &Database<'a>,
|
||||
txn: &'a RakerTxn<'a, RO>,
|
||||
) -> anyhow::Result<()> {
|
||||
if prefix {
|
||||
let mut cur = txn.mdbx_txn.cursor(database)?;
|
||||
for item in cur.iter_from::<Cow<'_, [u8]>, IV>(key.as_bytes()) {
|
||||
let (k, v) = item?;
|
||||
if !k.starts_with(key.as_bytes()) {
|
||||
break;
|
||||
}
|
||||
println!("• {}", std::str::from_utf8(&k).unwrap_or("<Not UTF-8>"));
|
||||
println!(" = {}", v.inspect());
|
||||
}
|
||||
} else {
|
||||
if let Some(entry) = txn.mdbx_txn.get::<IV>(database, key.as_bytes())? {
|
||||
println!("{}", entry.inspect());
|
||||
|
|
|
@ -33,7 +33,7 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
|
||||
let config_path = opts
|
||||
.config
|
||||
.unwrap_or_else(|| PathBuf::from("qp_raker.toml"));
|
||||
.unwrap_or_else(|| PathBuf::from("quickpeep.ron"));
|
||||
let config = config::RakerConfig::load(&config_path).context("Failed to load config")?;
|
||||
|
||||
if !config.raker.workbench_dir.exists() {
|
||||
|
|
|
@ -3,8 +3,9 @@ use clap::Parser;
|
|||
use env_logger::Env;
|
||||
|
||||
use adblock::lists::RuleTypes;
|
||||
use anyhow::{bail, Context};
|
||||
use log::{debug, error, warn};
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use chrono::Utc;
|
||||
use log::{debug, error, info, warn};
|
||||
use lru::LruCache;
|
||||
use metrics_exporter_prometheus::PrometheusBuilder;
|
||||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
||||
|
@ -14,7 +15,7 @@ use signal_hook::iterator::Signals;
|
|||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{Arc, Mutex, RwLock};
|
||||
use std::time::Duration;
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
use tokio::fs::File;
|
||||
use tokio::sync::{mpsc, oneshot, Notify, Semaphore};
|
||||
use tokio::time::MissedTickBehavior;
|
||||
|
@ -26,11 +27,12 @@ use quickpeep_raker::raking::page_extraction::PageExtractionService;
|
|||
use quickpeep_raker::raking::rakemetrics::describe_raking_metrics;
|
||||
use quickpeep_raker::raking::task::{TaskContext, TaskResultSubmission};
|
||||
use quickpeep_raker::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
|
||||
use quickpeep_raker::storage::RakerStore;
|
||||
use quickpeep_raker::storage::{RakerStore, RandomActiveDomainAcquisition};
|
||||
use quickpeep_structs::rake_entries::{
|
||||
AnalysisAntifeatures, SCHEMA_RAKED_ICONS, SCHEMA_RAKED_PAGES, SCHEMA_RAKED_REFERENCES,
|
||||
SCHEMA_RAKED_REJECTIONS,
|
||||
};
|
||||
use quickpeep_utils::dates::date_to_quickpeep_days;
|
||||
|
||||
/// The ordering is slightly important on these: more specific things should come first.
|
||||
/// This means they filter out the troublesome elements before the broader filters do.
|
||||
|
@ -58,6 +60,9 @@ pub struct Opts {
|
|||
concurrent_sleepers: u32,
|
||||
}
|
||||
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
#[tokio::main]
|
||||
pub async fn main() -> anyhow::Result<()> {
|
||||
env_logger::Builder::from_env(
|
||||
|
@ -145,8 +150,7 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
describe_raking_metrics();
|
||||
}
|
||||
|
||||
let num_tasks = opts.concurrent_jobs + opts.concurrent_sleepers;
|
||||
let semaphore = Arc::new(Semaphore::new(opts.concurrent_jobs as usize));
|
||||
let active_fetch_semaphore = Arc::new(Semaphore::new(opts.concurrent_jobs as usize));
|
||||
|
||||
let (pages_tx, pages_rx) = mpsc::channel(32);
|
||||
let (refs_tx, refs_rx) = mpsc::channel(32);
|
||||
|
@ -264,29 +268,29 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
raker: Arc::new(raker),
|
||||
busy_domains: Arc::new(Mutex::new(Default::default())),
|
||||
robotstxt_cache: Arc::new(RwLock::new(LruCache::new(64))),
|
||||
semaphore,
|
||||
semaphore: active_fetch_semaphore,
|
||||
submission,
|
||||
graceful_stop,
|
||||
notify: graceful_stop_notify,
|
||||
rerake_timings: Arc::new(config.raker.rerake_timings.clone()),
|
||||
};
|
||||
|
||||
let mut tasks = Vec::with_capacity(num_tasks as usize);
|
||||
|
||||
for task_num in 0..num_tasks {
|
||||
let task_context = task_context.clone();
|
||||
|
||||
tasks.push(tokio::spawn(async move {
|
||||
if let Err(err) = task_context.run().await {
|
||||
error!("Raker task {:?} encountered an error: {:?}", task_num, err);
|
||||
}
|
||||
}));
|
||||
}
|
||||
// Reinstate old backoffs and re-rakable URLs
|
||||
store
|
||||
.async_rw_txn(|txn| {
|
||||
let today = date_to_quickpeep_days(&Utc::today())?;
|
||||
txn.reinstate_backoffs(SystemTime::now())?;
|
||||
txn.reinstate_rerakables(today)?;
|
||||
txn.commit()?;
|
||||
Ok(())
|
||||
})
|
||||
.await?;
|
||||
|
||||
let (dsmu_cancel_tx, mut dsmu_cancel_rx) = oneshot::channel();
|
||||
let datastore_metrics_updater = {
|
||||
let store = task_context.store.clone();
|
||||
tokio::spawn(async move {
|
||||
let mut interval = tokio::time::interval(Duration::from_secs(60));
|
||||
let mut interval = tokio::time::interval(Duration::from_secs(30));
|
||||
interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
|
||||
loop {
|
||||
tokio::select! {
|
||||
|
@ -306,21 +310,22 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
})
|
||||
};
|
||||
|
||||
let TaskContext {
|
||||
graceful_stop,
|
||||
notify,
|
||||
submission,
|
||||
..
|
||||
} = task_context;
|
||||
let graceful_stop = task_context.graceful_stop.clone();
|
||||
let notify = task_context.notify.clone();
|
||||
|
||||
// Manually drop submission otherwise the senders don't hang up.
|
||||
drop(submission);
|
||||
let worker_semaphore =
|
||||
Semaphore::new((opts.concurrent_jobs + opts.concurrent_sleepers) as usize);
|
||||
let orchestrator_handle = tokio::spawn(async move {
|
||||
if let Err(err) = orchestrator(task_context, Arc::new(worker_semaphore)).await {
|
||||
error!("Error in orchestrator: {err:?}");
|
||||
}
|
||||
});
|
||||
|
||||
// ^C is SIGINT; systemd sends SIGTERM
|
||||
start_signal_handler(Signals::new([SIGINT, SIGTERM])?, graceful_stop, notify)?;
|
||||
|
||||
for task in tasks {
|
||||
task.await?;
|
||||
if let Err(panic_err) = orchestrator_handle.await {
|
||||
error!("orchestrator panic: {panic_err:?}");
|
||||
}
|
||||
|
||||
for task in emitters {
|
||||
|
@ -334,6 +339,101 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
/// Tries to pick an active domain for a worker task to process.
///
/// Returns `Ok(Some(domain))` when a domain was acquired, and `Ok(None)` when every active
/// domain is busy or there are no active domains left.
async fn acquire_active_domain(task_context: &TaskContext) -> anyhow::Result<Option<String>> {
    // The read transaction is scoped so that it is closed before the result is inspected.
    let acquisition = {
        let read_txn = task_context.store.ro_txn()?;
        // TODO: don't clone the Arc here — convert to a reference.
        read_txn.acquire_random_active_domain(Arc::clone(&task_context.busy_domains))?
    };

    Ok(match acquisition {
        RandomActiveDomainAcquisition::GotOne { domain, .. } => Some(domain),
        RandomActiveDomainAcquisition::AllBusy | RandomActiveDomainAcquisition::NoneLeft => None,
    })
}
|
||||
|
||||
/// Spawns tasks to do the work as necessary.
|
||||
/// Performs back-off and re-rake reinstatements periodically and spawns up new workers if needed.
|
||||
/// Spawns tasks to do the work as necessary.
///
/// Until a graceful stop is requested, each loop iteration tries to acquire an active domain
/// and then waits for whichever happens first:
/// - the periodic reinstatement timer (back-offs and re-rakable URLs, every 30 minutes);
/// - a wake-up on `task_context.notify`;
/// - a free worker permit (only if a domain was acquired), in which case a task is spawned
///   to process that domain. The permit is held by the task and released when it finishes.
///
/// On shutdown the submission senders are dropped and the function waits for all running
/// rake tasks to finish by acquiring every worker permit.
///
/// # Errors
/// Fails if acquiring an active domain fails, or if the orchestrator is idle while the set
/// of busy domains is unexpectedly non-empty (or its lock is poisoned).
async fn orchestrator(task_context: TaskContext, semaphore: Arc<Semaphore>) -> anyhow::Result<()> {
    const REINSTATE_INTERVAL: Duration = Duration::from_secs(1800);

    let mut next_reinstate = Instant::now() + REINSTATE_INTERVAL;
    let max_permits = semaphore.available_permits();

    while !task_context.graceful_stop.load(Ordering::Relaxed) {
        // Spawn up new tasks if there are available worker permits.

        let domain_to_process = acquire_active_domain(&task_context)
            .await
            .context("failed trying to acquire active domain")?;

        if domain_to_process.is_none() && semaphore.available_permits() == max_permits {
            // There's nothing to do and nothing is being processed.
            ensure!(
                task_context
                    .busy_domains
                    .lock()
                    .map_err(|_| anyhow!("busy domains set poisoned"))?
                    .is_empty(),
                "Shutting down orchestrator but set of busy domains is not empty."
            );
        }

        // NOTE(review): if a domain was acquired but one of the first two branches wins the
        // select, that domain is not processed in this iteration. If acquisition marks the
        // domain as busy, it may stay marked — confirm against `acquire_random_active_domain`.
        tokio::select! {
            _ = tokio::time::sleep_until(next_reinstate.into()) => {
                // Reinstate backoffs and rerakables
                if let Err(err) = task_context.store.async_rw_txn(|txn| {
                    txn.reinstate_backoffs(SystemTime::now())?;
                    let today = date_to_quickpeep_days(&Utc::today())?;
                    txn.reinstate_rerakables(today)?;
                    txn.commit()?;
                    Ok(())
                }).await {
                    error!("Error performing periodic reinstatements: {err:?}");
                }

                next_reinstate = Instant::now() + REINSTATE_INTERVAL;
            },
            _ = task_context.notify.notified() => {
                // nop: just wake from the loop
            }
            Ok(new_permit) = semaphore.clone().acquire_owned(), if domain_to_process.is_some() => {
                let domain = domain_to_process.unwrap();
                let mut task_context = task_context.clone();
                tokio::spawn(async move {
                    if let Err(err) = task_context.process_domain(domain.clone()).await {
                        error!("Encountered error processing {:?}: {:?}", domain, err);
                    }
                    ensure!(
                        task_context.busy_domains
                            .lock()
                            .map_err(|_| anyhow!("busy domains set poisoned"))?
                            .remove(&domain),
                        "Our domain was not busy after processing!"
                    );

                    // Release the permit here, within the task.
                    drop(new_permit);
                    Ok(())
                });
            }
        };
    }

    info!("Orchestrator shutting down gracefully...");

    // Wind up:
    let TaskContext { submission, .. } = task_context;
    // Manually drop submission otherwise the senders don't hang up.
    drop(submission);

    let num_active_tasks = max_permits - semaphore.available_permits();
    info!("Waiting for {num_active_tasks} rake tasks to close.");

    // Every running task holds exactly one permit, so all tasks have finished only once
    // *all* permits can be acquired. Acquiring just `num_active_tasks` permits could succeed
    // immediately from the already-free ones without waiting for any task.
    // `max_permits` originates from a sum of `u32` options, so the cast mirrors the original.
    info!(
        "Acquired all remaining permits: {:?}",
        semaphore.acquire_many(max_permits as u32).await
    );

    Ok(())
}
|
||||
|
||||
fn start_signal_handler(
|
||||
mut signals: Signals,
|
||||
shutdown: Arc<AtomicBool>,
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
use clap::Parser;
|
||||
use std::borrow::{Borrow, BorrowMut};
|
||||
use std::borrow::Borrow;
|
||||
|
||||
use env_logger::Env;
|
||||
|
||||
|
@ -11,15 +11,13 @@ use std::path::PathBuf;
|
|||
use tokio::sync::mpsc;
|
||||
use tokio::sync::mpsc::Receiver;
|
||||
|
||||
use quickpeep_raker::config;
|
||||
use quickpeep_raker::config::RakerConfig;
|
||||
use quickpeep_raker::raking::references::SUPPORTED_SCHEMES;
|
||||
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
|
||||
use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord};
|
||||
use quickpeep_raker::storage::{maintenance, RakerStore};
|
||||
use quickpeep_seed_parser::loader::{
|
||||
find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION, WEED_EXTENSION,
|
||||
};
|
||||
use quickpeep_utils::dirty::DirtyTracker;
|
||||
use quickpeep_utils::urls::get_reduced_domain;
|
||||
|
||||
/// Seeds a raker's queue with URLs
|
||||
|
@ -41,8 +39,8 @@ pub async fn main() -> anyhow::Result<()> {
|
|||
|
||||
let config_path = opts
|
||||
.config
|
||||
.unwrap_or_else(|| PathBuf::from("qp_raker.toml"));
|
||||
let config = config::RakerConfig::load(&config_path).context("Failed to load config")?;
|
||||
.unwrap_or_else(|| PathBuf::from("quickpeep.ron"));
|
||||
let config = RakerConfig::load(&config_path).context("Failed to load config")?;
|
||||
|
||||
if !config.raker.workbench_dir.exists() {
|
||||
bail!(
|
||||
|
@ -144,48 +142,39 @@ async fn importer(
|
|||
buf.push(seed);
|
||||
|
||||
if buf.len() == BATCH_SIZE {
|
||||
if are_weeds {
|
||||
import_and_flush_batch_weeds(&store, &mut buf, &mut stats).await?;
|
||||
} else {
|
||||
import_and_flush_batch_seeds(&store, &mut buf, &mut stats, &client).await?;
|
||||
}
|
||||
import_and_flush_batch_seeds_or_weeds(
|
||||
&store, &mut buf, &mut stats, &client, !are_weeds,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
if are_weeds {
|
||||
import_and_flush_batch_weeds(&store, &mut buf, &mut stats).await?;
|
||||
} else {
|
||||
import_and_flush_batch_seeds(&store, &mut buf, &mut stats, &client).await?;
|
||||
}
|
||||
import_and_flush_batch_seeds_or_weeds(&store, &mut buf, &mut stats, &client, !are_weeds)
|
||||
.await?;
|
||||
|
||||
Ok(stats)
|
||||
}
|
||||
|
||||
async fn import_and_flush_batch_seeds(
|
||||
async fn import_and_flush_batch_seeds_or_weeds(
|
||||
store: &RakerStore,
|
||||
buf: &mut Vec<Seed>,
|
||||
stats: &mut SeedImportStats,
|
||||
client: &Client,
|
||||
is_seed: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let txn = store.rw_txn()?;
|
||||
for seed in buf.drain(..) {
|
||||
let as_url = Url::parse(seed.url.as_str())
|
||||
.with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
|
||||
let domain = get_reduced_domain(&as_url)?;
|
||||
let domain = get_reduced_domain(&as_url)
|
||||
.with_context(|| format!("No domain in seed URL '{as_url}'!"))?;
|
||||
|
||||
let allowed_domain_record = txn.get_allowed_domain_record(domain.borrow())?;
|
||||
|
||||
let is_domain_new = allowed_domain_record.is_none();
|
||||
let domain_record = txn.get_domain_record(domain.borrow())?;
|
||||
let is_domain_new = domain_record.is_none();
|
||||
let mut domain_record = domain_record.unwrap_or_default();
|
||||
if is_domain_new {
|
||||
stats.new_domains += 1;
|
||||
}
|
||||
|
||||
let mut allowed_domain_record = DirtyTracker::new(
|
||||
allowed_domain_record.unwrap_or_else(|| AllowedDomainRecord::default()),
|
||||
);
|
||||
if is_domain_new {
|
||||
// Mark it as dirty
|
||||
let _: &mut AllowedDomainRecord = allowed_domain_record.borrow_mut();
|
||||
}
|
||||
let mut dirty = is_domain_new;
|
||||
|
||||
// Register the domain. This is a no-op if it's already active or backing off.
|
||||
txn.insert_active_domain_with_new_raffle_ticket(domain.clone().into_owned())?;
|
||||
|
@ -193,41 +182,53 @@ async fn import_and_flush_batch_seeds(
|
|||
let url_like = match &seed.url {
|
||||
UrlOrUrlPattern::Url(url_str) => {
|
||||
let url = Url::parse(url_str.as_str())?;
|
||||
if txn.enqueue_url(url.as_str(), None, RakeIntent::Any)? {
|
||||
stats.new_urls += 1;
|
||||
} else {
|
||||
stats.already_present_urls += 1;
|
||||
if is_seed {
|
||||
if txn.enqueue_url(url.as_str(), None, RakeIntent::Any)? {
|
||||
stats.new_urls += 1;
|
||||
} else {
|
||||
stats.already_present_urls += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Seed/weed with empty prefix
|
||||
dirty |= domain_record
|
||||
.rakeable_path_prefixes
|
||||
.insert(String::new(), is_seed)
|
||||
!= Some(is_seed);
|
||||
|
||||
url
|
||||
}
|
||||
UrlOrUrlPattern::UrlPrefix(prefix) => {
|
||||
let prefix_as_url = Url::parse(prefix.as_str())?;
|
||||
if txn.enqueue_url(prefix_as_url.as_str(), None, RakeIntent::Any)? {
|
||||
stats.new_urls += 1;
|
||||
} else {
|
||||
stats.already_present_urls += 1;
|
||||
}
|
||||
if is_domain_new {
|
||||
let allowed_domain_record: &mut AllowedDomainRecord =
|
||||
allowed_domain_record.borrow_mut();
|
||||
allowed_domain_record
|
||||
.restricted_prefixes
|
||||
.insert(prefix_as_url.path().to_string());
|
||||
if is_seed {
|
||||
if txn.enqueue_url(prefix_as_url.as_str(), None, RakeIntent::Any)? {
|
||||
stats.new_urls += 1;
|
||||
} else {
|
||||
stats.already_present_urls += 1;
|
||||
}
|
||||
}
|
||||
|
||||
dirty |= domain_record
|
||||
.rakeable_path_prefixes
|
||||
.insert(prefix_as_url.path().to_string(), is_seed)
|
||||
!= Some(is_seed);
|
||||
|
||||
prefix_as_url
|
||||
}
|
||||
};
|
||||
|
||||
if allowed_domain_record.is_dirty() {
|
||||
txn.put_allowed_domain_record(domain.borrow(), allowed_domain_record.into_inner())?;
|
||||
if dirty {
|
||||
txn.put_domain_record(domain.borrow(), domain_record)?;
|
||||
}
|
||||
|
||||
if is_domain_new {
|
||||
if is_seed {
|
||||
// look at robots.txt and discover sitemaps!
|
||||
if let Some(robots_txt) = get_robots_txt_for(&url_like, &client).await? {
|
||||
for sitemap in robots_txt.sitemaps {
|
||||
txn.enqueue_url(sitemap.url.as_str(), None, RakeIntent::SiteMap)?;
|
||||
stats.new_sitemaps += 1;
|
||||
if SUPPORTED_SCHEMES.contains(&sitemap.url.scheme()) {
|
||||
txn.enqueue_url(sitemap.url.as_str(), None, RakeIntent::SiteMap)?;
|
||||
stats.new_sitemaps += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -235,36 +236,3 @@ async fn import_and_flush_batch_seeds(
|
|||
txn.commit()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn import_and_flush_batch_weeds(
|
||||
store: &RakerStore,
|
||||
buf: &mut Vec<Seed>,
|
||||
stats: &mut SeedImportStats,
|
||||
) -> anyhow::Result<()> {
|
||||
let txn = store.rw_txn()?;
|
||||
for seed in buf.drain(..) {
|
||||
let as_url = Url::parse(seed.url.as_str())
|
||||
.with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
|
||||
let domain = get_reduced_domain(&as_url)?;
|
||||
|
||||
let weed_domain_record = txn.get_weed_domain_record(domain.borrow())?;
|
||||
|
||||
let is_domain_new = weed_domain_record.is_none();
|
||||
if is_domain_new {
|
||||
stats.new_domains += 1;
|
||||
}
|
||||
|
||||
let mut weed_domain_record =
|
||||
DirtyTracker::new(weed_domain_record.unwrap_or_else(|| WeedDomainRecord::default()));
|
||||
if is_domain_new {
|
||||
// Mark it as dirty
|
||||
let _: &mut WeedDomainRecord = weed_domain_record.borrow_mut();
|
||||
}
|
||||
|
||||
if weed_domain_record.is_dirty() {
|
||||
txn.put_weed_domain_record(domain.borrow(), weed_domain_record.into_inner())?;
|
||||
}
|
||||
}
|
||||
txn.commit()?;
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -28,6 +28,23 @@ pub struct RakerOnlyConfig {
|
|||
pub metrics: MetricsConfig,
|
||||
|
||||
pub pack_emitter: PackEmitterSettings,
|
||||
|
||||
pub rerake_timings: RerakeTimings,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
pub struct RerakeTimings {
|
||||
/// How long, in days, between re-rakes of the same page?
|
||||
/// Suggested: 300
|
||||
pub page: u16,
|
||||
|
||||
/// How long, in days, between re-rakes of feeds?
|
||||
/// Suggested: 10
|
||||
pub feed: u16,
|
||||
|
||||
/// How long, in days, between re-rakes of icons?
|
||||
/// Suggested: 365
|
||||
pub icon: u16,
|
||||
}
|
||||
|
||||
impl RakerConfig {
|
||||
|
|
|
@ -57,12 +57,15 @@ pub fn pack_emitter<T: Serialize + Send + 'static>(
|
|||
Unit::Count,
|
||||
"Records emitted into a pack file"
|
||||
);
|
||||
|
||||
let pack_index_file = directory.join("index");
|
||||
|
||||
loop {
|
||||
let now = Utc::now();
|
||||
// 2022-01-01 01:01:01
|
||||
let new_pack_file_path = loop {
|
||||
let new_pack_file_path =
|
||||
directory.join(format!("{}.{}.pack", now.format("%F_%T"), name));
|
||||
// 2022-01-01_01:01:01
|
||||
let (pack_name, new_pack_file_path) = loop {
|
||||
let pack_name = format!("{}.{}.pack", now.format("%F_%T"), name);
|
||||
let new_pack_file_path = directory.join(&pack_name);
|
||||
if new_pack_file_path.exists() {
|
||||
warn!(
|
||||
"{:?} already exists; sleeping to generate new timestamp.",
|
||||
|
@ -70,11 +73,11 @@ pub fn pack_emitter<T: Serialize + Send + 'static>(
|
|||
);
|
||||
std::thread::sleep(Duration::from_secs(2));
|
||||
} else {
|
||||
break new_pack_file_path;
|
||||
break (pack_name, new_pack_file_path);
|
||||
}
|
||||
};
|
||||
|
||||
if !pack_emitter_to_file(
|
||||
let file_cutoff_reached = pack_emitter_to_file(
|
||||
&new_pack_file_path,
|
||||
&mut rx,
|
||||
name,
|
||||
|
@ -82,8 +85,19 @@ pub fn pack_emitter<T: Serialize + Send + 'static>(
|
|||
settings,
|
||||
shutdown.clone(),
|
||||
shutdown_notify.clone(),
|
||||
)? {
|
||||
// File wasn't filled; the receiver was exhausted (we're shutting down).
|
||||
)?;
|
||||
|
||||
// Add an entry to the index. This essentially marks it as 'done' and enables
|
||||
// a follower to catch up.
|
||||
let mut index_file = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&pack_index_file)?;
|
||||
index_file.write(format!("\n{}", pack_name).as_bytes())?;
|
||||
index_file.flush()?;
|
||||
|
||||
if !file_cutoff_reached {
|
||||
// File wasn't filled; the receiver was exhausted (that means we're shutting down).
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,10 @@
|
|||
use crate::raking::analysis::IpSet;
|
||||
use crate::raking::page_extraction::{ExtractedPage, PageExtractionService};
|
||||
use std::collections::{BTreeSet, HashMap, HashSet};
|
||||
use std::error::Error;
|
||||
use std::fmt::{Debug, Display, Formatter};
|
||||
use std::io::Cursor;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
use ::metrics::increment_counter;
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use chrono::{DateTime, FixedOffset, Utc};
|
||||
|
@ -10,18 +15,18 @@ use image::imageops::FilterType;
|
|||
use image::{GenericImageView, ImageFormat};
|
||||
use itertools::Itertools;
|
||||
use lazy_static::lazy_static;
|
||||
use log::debug;
|
||||
use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind};
|
||||
use log::{debug, info, warn};
|
||||
use reqwest::header::HeaderMap;
|
||||
use reqwest::{Client, Response, Url};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sitemap::reader::SiteMapEntity;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::io::Cursor;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
use tokio::time::Instant;
|
||||
|
||||
use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind};
|
||||
|
||||
use crate::raking::analysis::IpSet;
|
||||
use crate::raking::page_extraction::{ExtractedPage, PageExtractionService};
|
||||
|
||||
pub mod analysis;
|
||||
pub mod page_extraction;
|
||||
pub mod rakemetrics;
|
||||
|
@ -35,24 +40,6 @@ pub const SIZE_LIMIT: usize = 4 * 1024 * 1024;
|
|||
pub const TIME_LIMIT: Duration = Duration::from_secs(10);
|
||||
pub const RAKER_USER_AGENT: &'static str = "QuickPeepBot";
|
||||
|
||||
lazy_static! {
    /// Maps each image MIME type that we accept to the `ImageFormat` used to decode it.
    ///
    /// Several non-standard spellings of the ICO MIME type are listed, all mapping to
    /// `ImageFormat::Ico`.
    pub static ref IMAGE_MIME_TYPES: HashMap<&'static str, ImageFormat> = HashMap::from([
        ("image/png", ImageFormat::Png),
        ("image/webp", ImageFormat::WebP),
        ("image/jpeg", ImageFormat::Jpeg),
        ("image/gif", ImageFormat::Gif),
        ("image/vnd.microsoft.icon", ImageFormat::Ico),
        ("image/x-icon", ImageFormat::Ico),
        ("image/icon", ImageFormat::Ico),
        ("image/ico", ImageFormat::Ico),
        ("application/ico", ImageFormat::Ico),
    ]);
}
|
||||
|
||||
pub enum RakeOutcome {
|
||||
RakedPage(RakedPage),
|
||||
RakedFeed(Vec<UrlRaked>),
|
||||
|
@ -75,6 +62,8 @@ pub enum RedirectReason {
|
|||
},
|
||||
/// The page was not canonical, and should not be indexed.
|
||||
NotCanonical,
|
||||
/// Upgrade from a HTTP to HTTPS URL (or equivalent).
|
||||
SecureUpgrade,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
|
@ -123,10 +112,20 @@ pub enum TemporaryFailureReason {
|
|||
pub enum PermanentFailureReason {
|
||||
ResourceDenied(u16),
|
||||
DeniedToRobots,
|
||||
IndexingDenied,
|
||||
WrongLanguage(String),
|
||||
UnknownContentType(String),
|
||||
ExceedsSizeLimit,
|
||||
}
|
||||
|
||||
impl Display for PermanentFailure {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
||||
Debug::fmt(&self, f)
|
||||
}
|
||||
}
|
||||
|
||||
impl Error for PermanentFailure {}
|
||||
|
||||
#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub enum RakeIntent {
|
||||
Any,
|
||||
|
@ -156,7 +155,7 @@ impl FromStr for RakeIntent {
|
|||
impl From<ReferenceKind> for RakeIntent {
|
||||
fn from(kind: ReferenceKind) -> Self {
|
||||
match kind {
|
||||
ReferenceKind::CanonicalUrl => {
|
||||
ReferenceKind::CanonicalUrl | ReferenceKind::SecureUpgrade => {
|
||||
// FIXME We don't know what this is a canonical URL for. Suppose it doesn't matter...
|
||||
RakeIntent::Any
|
||||
}
|
||||
|
@ -175,7 +174,22 @@ impl From<ReferenceKind> for RakeIntent {
|
|||
}
|
||||
}
|
||||
|
||||
impl RakeIntent {
|
||||
pub fn supports_mime_type(&self, mime_type: &str) -> bool {
|
||||
match self {
|
||||
RakeIntent::Any => ALL_MIME_TYPES.contains(mime_type),
|
||||
RakeIntent::Page => PAGE_MIME_TYPES.contains(mime_type),
|
||||
RakeIntent::Feed => FEED_MIME_TYPES.contains(mime_type),
|
||||
RakeIntent::SiteMap => SITEMAP_MIME_TYPES.contains(mime_type),
|
||||
RakeIntent::Icon => IMAGE_MIME_TYPES.contains_key(mime_type),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref PAGE_MIME_TYPES: HashSet<&'static str> =
|
||||
HashSet::from_iter(vec!["text/html", "text/gemini",]);
|
||||
|
||||
static ref SITEMAP_MIME_TYPES: HashSet<&'static str> =
|
||||
HashSet::from_iter(vec!["text/xml", "application/xml",]);
|
||||
|
||||
|
@ -197,6 +211,30 @@ lazy_static! {
|
|||
"application/rdf+xml",
|
||||
"application/feed+json"
|
||||
]);
|
||||
|
||||
pub static ref IMAGE_MIME_TYPES: HashMap<&'static str, ImageFormat> = {
|
||||
[
|
||||
("image/png", ImageFormat::Png),
|
||||
("image/webp", ImageFormat::WebP),
|
||||
("image/jpeg", ImageFormat::Jpeg),
|
||||
("image/gif", ImageFormat::Gif),
|
||||
("image/vnd.microsoft.icon", ImageFormat::Ico),
|
||||
("image/x-icon", ImageFormat::Ico),
|
||||
("image/icon", ImageFormat::Ico),
|
||||
("image/ico", ImageFormat::Ico),
|
||||
("application/ico", ImageFormat::Ico),
|
||||
]
|
||||
.into_iter()
|
||||
.collect()
|
||||
};
|
||||
|
||||
pub static ref ALL_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(
|
||||
PAGE_MIME_TYPES.iter().cloned()
|
||||
.chain(SITEMAP_MIME_TYPES.iter().cloned())
|
||||
.chain(FEED_MIME_TYPES.iter().cloned())
|
||||
.chain(FEED_LINK_MIME_TYPES.iter().cloned())
|
||||
.chain(IMAGE_MIME_TYPES.keys().cloned())
|
||||
);
|
||||
}
|
||||
|
||||
async fn response_to_bytes_limited(
|
||||
|
@ -204,6 +242,26 @@ async fn response_to_bytes_limited(
|
|||
size_limit: usize,
|
||||
time_limit: Duration,
|
||||
) -> anyhow::Result<Vec<u8>> {
|
||||
// Check the content-length header first, so that an oversized response can be rejected without downloading its body.
|
||||
let content_length = response
|
||||
.headers()
|
||||
.get("content-length")
|
||||
.map(|len| len.to_str().ok())
|
||||
.flatten()
|
||||
.map(|len| len.parse::<u64>().ok())
|
||||
.flatten();
|
||||
|
||||
if let Some(content_length) = content_length {
|
||||
if content_length > size_limit as u64 {
|
||||
// We can avoid downloading it: we already know it exceeds the limit.
|
||||
increment_counter!("qprake_rake_specific_fail_count", "reason" => "SizeLimit");
|
||||
return Err(PermanentFailure {
|
||||
reason: PermanentFailureReason::ExceedsSizeLimit,
|
||||
}
|
||||
.into());
|
||||
}
|
||||
}
|
||||
|
||||
let deadline = Instant::now() + time_limit;
|
||||
let mut buffer = Vec::new();
|
||||
let mut bytestream = response.bytes_stream();
|
||||
|
@ -216,7 +274,9 @@ async fn response_to_bytes_limited(
|
|||
buffer.extend_from_slice(next_chunk?.as_bytes());
|
||||
if buffer.len() > size_limit {
|
||||
increment_counter!("qprake_rake_specific_fail_count", "reason" => "SizeLimit");
|
||||
bail!("Exceeds size limit");
|
||||
return Err(PermanentFailure {
|
||||
reason: PermanentFailureReason::ExceedsSizeLimit,
|
||||
}.into());
|
||||
}
|
||||
},
|
||||
None => {
|
||||
|
@ -241,6 +301,30 @@ pub struct Raker {
|
|||
}
|
||||
|
||||
impl Raker {
|
||||
/// Figure out whether we can upgrade a URL to HTTPS.
|
||||
pub async fn try_upgrade_to_https(
|
||||
&self,
|
||||
url: &Url,
|
||||
client: &Client,
|
||||
) -> anyhow::Result<Option<Url>> {
|
||||
if url.scheme().eq_ignore_ascii_case("http") {
|
||||
// Try to upgrade to HTTPS if we can.
|
||||
let mut https_url = url.clone();
|
||||
https_url.set_scheme("https").unwrap();
|
||||
client
|
||||
.head(https_url.clone())
|
||||
.timeout(Duration::from_secs(10))
|
||||
.send()
|
||||
.await
|
||||
.context("failed to make HEAD request")?
|
||||
.error_for_status()
|
||||
.context("bad response for HEAD requesst")?;
|
||||
Ok(Some(https_url))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
/// Rakes a resource by URL.
|
||||
///
|
||||
/// `intent` specifies the kind of resource we're expecting. This matters in a few circumstances,
|
||||
|
@ -251,6 +335,22 @@ impl Raker {
|
|||
intent: RakeIntent,
|
||||
client: &Client,
|
||||
) -> anyhow::Result<RakeOutcome> {
|
||||
match self.try_upgrade_to_https(url, client).await {
|
||||
Ok(Some(upgraded)) => {
|
||||
return Ok(RakeOutcome::Redirect {
|
||||
reason: RedirectReason::SecureUpgrade,
|
||||
new_url: upgraded,
|
||||
});
|
||||
}
|
||||
Ok(None) => {
|
||||
// continue
|
||||
}
|
||||
Err(err) => {
|
||||
info!("can't upgrade {url} to HTTPS: {err:?}");
|
||||
// continue
|
||||
}
|
||||
}
|
||||
|
||||
let response = client.get(url.clone()).send().await?;
|
||||
|
||||
let is_cf = if let Some(remote_addr) = response.remote_addr() {
|
||||
|
@ -315,15 +415,26 @@ impl Raker {
|
|||
let content_type = content_type
|
||||
.to_str()
|
||||
.context("Can't convert content-type to str")?;
|
||||
content_type.split(";").next().unwrap().trim().to_owned()
|
||||
content_type
|
||||
.split(";")
|
||||
.next()
|
||||
.unwrap()
|
||||
.trim()
|
||||
.to_lowercase()
|
||||
} else {
|
||||
increment_counter!("qprake_rake_specific_fail_count", "reason" => "NoCT");
|
||||
return Ok(RakeOutcome::TemporaryFailure(TemporaryFailure {
|
||||
reason: TemporaryFailureReason::MissingInformation("content-type".to_owned()),
|
||||
backoff_sec: 86400 * 7,
|
||||
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
|
||||
reason: PermanentFailureReason::UnknownContentType("not specified".to_owned()),
|
||||
}));
|
||||
};
|
||||
|
||||
if !intent.supports_mime_type(&content_type) {
|
||||
increment_counter!("qprake_rake_specific_fail_count", "reason" => "OtherCT");
|
||||
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
|
||||
reason: PermanentFailureReason::UnknownContentType(content_type.to_owned()),
|
||||
}));
|
||||
}
|
||||
|
||||
let headers = response.headers().clone();
|
||||
let content = response_to_bytes_limited(response, SIZE_LIMIT, TIME_LIMIT).await?;
|
||||
|
||||
|
@ -331,7 +442,7 @@ impl Raker {
|
|||
{
|
||||
// We don't try any fallbacks for an HTML page
|
||||
return Ok(self
|
||||
.rake_html_page(&content, url, is_cf, &headers)
|
||||
.rake_html_page(content, url, is_cf, &headers)
|
||||
.await
|
||||
.context("Raking HTML page")?);
|
||||
}
|
||||
|
@ -380,16 +491,14 @@ impl Raker {
|
|||
|
||||
pub async fn rake_html_page(
|
||||
&self,
|
||||
content: &[u8],
|
||||
content: Vec<u8>,
|
||||
url: &Url,
|
||||
is_cf: bool,
|
||||
headers: &HeaderMap,
|
||||
) -> anyhow::Result<RakeOutcome> {
|
||||
let content_str = std::str::from_utf8(content)?.to_owned();
|
||||
|
||||
match self
|
||||
.page_extraction
|
||||
.extract(content_str, url.clone(), headers.clone(), is_cf)
|
||||
.extract(content, url.clone(), headers.clone(), is_cf)
|
||||
.await?
|
||||
{
|
||||
ExtractedPage::Success {
|
||||
|
@ -397,8 +506,34 @@ impl Raker {
|
|||
document,
|
||||
feeds,
|
||||
antifeature_flags,
|
||||
no_follow,
|
||||
no_index,
|
||||
} => {
|
||||
let references = references::find_references(&unreadable_document, &feeds, url);
|
||||
if no_index {
|
||||
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
|
||||
reason: PermanentFailureReason::IndexingDenied,
|
||||
}));
|
||||
}
|
||||
|
||||
let mut references = references::find_references(&unreadable_document, &feeds, url);
|
||||
|
||||
if no_follow {
|
||||
// Remove any link references
|
||||
for reference in references {
|
||||
match reference.kind {
|
||||
ReferenceKind::Link | ReferenceKind::HeaderLinkedFeed => (),
|
||||
ReferenceKind::CanonicalUrl
|
||||
| ReferenceKind::FeedEntry
|
||||
| ReferenceKind::SitemapEntry
|
||||
| ReferenceKind::SecureUpgrade
|
||||
| ReferenceKind::Redirect => {
|
||||
warn!("unexpected: refkind of {:?} being filtered due to meta nofollow. This is a bug.", reference.kind);
|
||||
}
|
||||
}
|
||||
}
|
||||
references = BTreeSet::new();
|
||||
}
|
||||
|
||||
Ok(RakeOutcome::RakedPage(RakedPage {
|
||||
page_entry: RakedPageEntry {
|
||||
analysed_antifeatures: antifeature_flags,
|
||||
|
@ -497,7 +632,6 @@ pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<Vec<UrlRaked>> {
|
|||
debug!("Sitemap error {:?}", error);
|
||||
}
|
||||
}
|
||||
eprintln!("{:?}", entry);
|
||||
}
|
||||
|
||||
if urls.is_empty() {
|
||||
|
@ -555,8 +689,10 @@ pub async fn get_robots_txt_for(url: &Url, client: &Client) -> anyhow::Result<Op
|
|||
|
||||
if !resp.status().is_success() {
|
||||
let code = resp.status().as_u16();
|
||||
if code == 404 || code == 410 {
|
||||
if code == 403 || code == 404 || code == 410 {
|
||||
// not found or gone? Assume there is intentionally no robots.txt file.
|
||||
// If they deny us access to the robots file, then they deserve whatever they get and
|
||||
// we proceed.
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
|
|
|
@ -12,11 +12,13 @@ use itertools::Itertools;
|
|||
use kuchiki::NodeRef;
|
||||
use log::{debug, error, trace, warn};
|
||||
use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
|
||||
use quickpeep_html_charset_detection::sniff;
|
||||
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
|
||||
use quickpeep_utils::lazy::Lazy;
|
||||
use reqwest::header::HeaderMap;
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::borrow::Borrow;
|
||||
use tokio::runtime;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
use tokio::sync::{mpsc, oneshot};
|
||||
|
@ -29,7 +31,7 @@ pub struct PageExtractionService {
|
|||
}
|
||||
|
||||
pub struct ExtractionTask {
|
||||
content: String,
|
||||
content: Vec<u8>,
|
||||
url: Url,
|
||||
headers: HeaderMap,
|
||||
is_cf: bool,
|
||||
|
@ -39,7 +41,7 @@ pub struct ExtractionTask {
|
|||
impl PageExtractionService {
|
||||
pub async fn extract(
|
||||
&self,
|
||||
content: String,
|
||||
content: Vec<u8>,
|
||||
url: Url,
|
||||
headers: HeaderMap,
|
||||
is_cf: bool,
|
||||
|
@ -111,12 +113,21 @@ struct PageExtractionServiceInternal {
|
|||
impl PageExtractionServiceInternal {
|
||||
fn extract_page(
|
||||
&self,
|
||||
content_str: String,
|
||||
content_bytes: Vec<u8>,
|
||||
url: Url,
|
||||
headers: HeaderMap,
|
||||
is_cf: bool,
|
||||
) -> anyhow::Result<ExtractedPage> {
|
||||
let root_node: NodeRef = kuchiki::parse_html().one(content_str.as_ref());
|
||||
let encoding = sniff(
|
||||
&content_bytes,
|
||||
true,
|
||||
headers.get("content-type").map(|hv| hv.as_bytes()),
|
||||
);
|
||||
let (content_text, _actual_codec_used, replacements_made) = encoding.decode(&content_bytes);
|
||||
if replacements_made {
|
||||
warn!("Character replacements made!");
|
||||
}
|
||||
let root_node: NodeRef = kuchiki::parse_html().one(content_text.borrow());
|
||||
|
||||
// See whether this page is at the canonical URL for the page.
|
||||
// If it's not, then we redirect the raker to the canonical URL.
|
||||
|
@ -145,6 +156,25 @@ impl PageExtractionServiceInternal {
|
|||
}
|
||||
}
|
||||
|
||||
let mut no_follow = false;
|
||||
let mut no_index = false;
|
||||
|
||||
// Find any restrictions on indexing this page or following any links.
|
||||
if let Ok(robots_nodes) = root_node.select("meta[name=robots]") {
|
||||
for node in robots_nodes {
|
||||
if let Some(content) = node.attributes.borrow().get("content") {
|
||||
for directive in content
|
||||
.split(|c: char| c.is_whitespace() || c == ',')
|
||||
.filter(|s| !s.is_empty())
|
||||
{
|
||||
let none = directive.eq_ignore_ascii_case("none");
|
||||
no_follow |= directive.eq_ignore_ascii_case("nofollow") | none;
|
||||
no_index |= directive.eq_ignore_ascii_case("noindex") | none;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if language.is_none() {
|
||||
// Next fallback: prefer the content-language header baked into the page itself
|
||||
if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") {
|
||||
|
@ -300,6 +330,8 @@ impl PageExtractionServiceInternal {
|
|||
document,
|
||||
feeds,
|
||||
antifeature_flags,
|
||||
no_follow,
|
||||
no_index,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -351,6 +383,8 @@ pub enum ExtractedPage {
|
|||
document: DenseDocument,
|
||||
feeds: Vec<Url>,
|
||||
antifeature_flags: AnalysisAntifeatures,
|
||||
no_follow: bool,
|
||||
no_index: bool,
|
||||
},
|
||||
Redirect {
|
||||
reason: RedirectReason,
|
||||
|
|
|
@ -18,4 +18,10 @@ pub fn describe_raking_metrics() {
|
|||
Unit::Bytes,
|
||||
"Number of content bytes raked."
|
||||
);
|
||||
|
||||
describe_counter!(
|
||||
"qprake_queue_new_url",
|
||||
Unit::Count,
|
||||
"Number of new URLs enqueued"
|
||||
);
|
||||
}
|
||||
|
|
|
@ -6,6 +6,10 @@ use quickpeep_utils::dates::date_to_quickpeep_days;
|
|||
use reqwest::Url;
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
/// Supported schemes.
|
||||
/// References in all other schemes will be ignored.
|
||||
pub const SUPPORTED_SCHEMES: [&'static str; 2] = ["http", "https"];
|
||||
|
||||
pub fn find_references(
|
||||
doc: &Vec<DenseTree>,
|
||||
feeds: &Vec<Url>,
|
||||
|
@ -41,11 +45,24 @@ pub fn find_references(
|
|||
} => {
|
||||
if !nofollow {
|
||||
if let Ok(full_url) = page_url.join(&href) {
|
||||
refs.insert(RakedReference {
|
||||
target: clean_url(&full_url).to_string(),
|
||||
kind: ReferenceKind::Link,
|
||||
last_mod: None,
|
||||
});
|
||||
if full_url.domain().is_none() {
|
||||
// Skip URLs that don't have a domain after being made absolute.
|
||||
// This also skips IP addresses: we probably don't want to bother
|
||||
// indexing content from explicit IP addresses.
|
||||
continue;
|
||||
}
|
||||
if SUPPORTED_SCHEMES.contains(&full_url.scheme()) {
|
||||
refs.insert(RakedReference {
|
||||
target: clean_url(&full_url).to_string(),
|
||||
kind: ReferenceKind::Link,
|
||||
last_mod: None,
|
||||
});
|
||||
} else {
|
||||
debug!(
|
||||
"ignoring reference {:?}: not a supported scheme",
|
||||
full_url.as_str()
|
||||
);
|
||||
}
|
||||
} else {
|
||||
debug!("Can't join {:?} + {:?} to get full URL", page_url, href);
|
||||
}
|
||||
|
@ -61,6 +78,10 @@ pub fn find_references(
|
|||
add_link_refs(&doc, &mut refs, &page_url);
|
||||
|
||||
for feed in feeds {
|
||||
if feed.domain().is_none() {
|
||||
// same rationale as above.
|
||||
continue;
|
||||
}
|
||||
refs.insert(RakedReference {
|
||||
target: clean_url(feed).as_str().to_owned(),
|
||||
kind: ReferenceKind::HeaderLinkedFeed,
|
||||
|
@ -78,7 +99,7 @@ pub fn references_from_urlrakes(
|
|||
input
|
||||
.iter()
|
||||
.map(|url_raked| RakedReference {
|
||||
target: url_raked.url.to_string(),
|
||||
target: clean_url(&url_raked.url).to_string(),
|
||||
kind: ref_kind,
|
||||
last_mod: url_raked
|
||||
.last_changed
|
||||
|
|
|
@ -1,15 +1,17 @@
|
|||
use crate::raking::references::references_from_urlrakes;
|
||||
use crate::config::RerakeTimings;
|
||||
use crate::raking::references::{clean_url, references_from_urlrakes, SUPPORTED_SCHEMES};
|
||||
use crate::raking::{
|
||||
get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent,
|
||||
RakeOutcome, Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason,
|
||||
};
|
||||
use crate::storage::records::{AllowedDomainRecord, UrlVisitedRecord, WeedDomainRecord};
|
||||
use crate::storage::{RakerStore, RandomActiveDomainAcquisition};
|
||||
use anyhow::{anyhow, ensure, Context};
|
||||
use crate::storage::records::{DomainRecord, UrlVisitedRecord};
|
||||
use crate::storage::RakerStore;
|
||||
use anyhow::{anyhow, Context};
|
||||
use chrono::Utc;
|
||||
use cylon::Cylon;
|
||||
use log::{error, warn};
|
||||
use log::{debug, warn};
|
||||
use lru::LruCache;
|
||||
use metrics::increment_counter;
|
||||
use quickpeep_structs::rake_entries::{
|
||||
IconEntry, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind,
|
||||
};
|
||||
|
@ -31,8 +33,8 @@ use tokio::time::Instant;
|
|||
pub const MAX_CRAWL_DELAY_BEFORE_BACKOFF: Duration = Duration::from_secs(61);
|
||||
|
||||
/// Most sites request a crawl delay of 10 sec or less.
|
||||
/// If unspecified, let's go with a reasonable-sounding number of 15 secs.
|
||||
pub const DEFAULT_CRAWL_DELAY: Duration = Duration::from_secs(15);
|
||||
/// If unspecified, let's go with a reasonable-sounding number of 10 secs.
|
||||
pub const DEFAULT_CRAWL_DELAY: Duration = Duration::from_secs(10);
|
||||
|
||||
enum NextAction {
|
||||
Continue,
|
||||
|
@ -77,56 +79,11 @@ pub struct TaskContext {
|
|||
/// Notifier used to wake up sleepers (either to stop them gracefully, or because work
|
||||
/// is available (not implemented))
|
||||
pub notify: Arc<Notify>,
|
||||
|
||||
pub rerake_timings: Arc<RerakeTimings>,
|
||||
}
|
||||
|
||||
impl TaskContext {
|
||||
pub async fn run(mut self) -> anyhow::Result<()> {
|
||||
// Get a domain to process
|
||||
while !self.graceful_stop.load(Ordering::SeqCst) {
|
||||
let domain = {
|
||||
let txn = self.store.ro_txn()?;
|
||||
txn.acquire_random_active_domain(self.busy_domains.clone())?
|
||||
};
|
||||
|
||||
match domain {
|
||||
RandomActiveDomainAcquisition::GotOne {
|
||||
domain,
|
||||
record: _active_record,
|
||||
} => {
|
||||
if let Err(err) = self.process_domain(domain.clone()).await {
|
||||
error!("Encountered error processing {:?}: {:?}", domain, err);
|
||||
}
|
||||
ensure!(
|
||||
self.busy_domains
|
||||
.lock()
|
||||
.map_err(|_| anyhow!("busy domains set poisoned"))?
|
||||
.remove(&domain),
|
||||
"Our domain was not busy after processing!"
|
||||
);
|
||||
}
|
||||
RandomActiveDomainAcquisition::AllBusy => {
|
||||
// TODO(perf): notify waiters when new domains are available.
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(Duration::from_secs(60)) => {
|
||||
// nop
|
||||
},
|
||||
_ = self.notify.notified() => {
|
||||
// nop (we allow the notifier to wake us up in case we need to gracefully
|
||||
// stop).
|
||||
},
|
||||
};
|
||||
}
|
||||
RandomActiveDomainAcquisition::NoneLeft => {
|
||||
// Nothing left to do, and it's not temporary because there aren't even any
|
||||
// busy domains left.
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn get_robot_rules(&self, url_of_site: &Url) -> anyhow::Result<Option<Cylon>> {
|
||||
let robots = get_robots_txt_for(url_of_site, &self.redirect_following_client).await?;
|
||||
Ok(robots.map(|robots: RobotsTxt| robots.rules))
|
||||
|
@ -137,11 +94,23 @@ impl TaskContext {
|
|||
let mut current_robot_rules: Option<Cylon> = None;
|
||||
let mut wait_until: Option<Instant> = None;
|
||||
|
||||
let domain_record = {
|
||||
let txn = self.store.ro_txn()?;
|
||||
let dr = txn.get_domain_record(&domain)?;
|
||||
match dr {
|
||||
None => {
|
||||
return Ok(());
|
||||
}
|
||||
Some(dr) => dr,
|
||||
}
|
||||
};
|
||||
|
||||
while !self.graceful_stop.load(Ordering::Relaxed) {
|
||||
// Get a URL to process
|
||||
let url = {
|
||||
let txn = self.store.ro_txn()?;
|
||||
txn.choose_url_for_domain(&domain)?
|
||||
txn.choose_url_for_domain(&domain)
|
||||
.context("failed to choose URL for domain")?
|
||||
};
|
||||
|
||||
let (url_str, url_record) = if let Some(url) = url {
|
||||
|
@ -162,12 +131,14 @@ impl TaskContext {
|
|||
}
|
||||
|
||||
// Delete the active domain from the store
|
||||
txn.remove_active_domain(&domain)?;
|
||||
txn.remove_active_domain(&domain)
|
||||
.context("failed to remove active domain")?;
|
||||
|
||||
txn.commit()?;
|
||||
Ok(true)
|
||||
})
|
||||
.await?;
|
||||
.await
|
||||
.context("failed to check if we're out of URLs")?;
|
||||
if out_of_urls {
|
||||
break;
|
||||
} else {
|
||||
|
@ -175,13 +146,50 @@ impl TaskContext {
|
|||
}
|
||||
};
|
||||
|
||||
let url = Url::parse(&url_str)?;
|
||||
let url = Url::parse(&url_str)
|
||||
.with_context(|| format!("failed to parse as URL: {url_str:?}"))?;
|
||||
|
||||
if !domain_record.is_url_rakeable(&url).unwrap_or(false) {
|
||||
// This is now a weed: skip.
|
||||
let domain = domain.clone();
|
||||
let url = url.clone();
|
||||
self.store
|
||||
.async_rw_txn(move |txn| {
|
||||
txn.dequeue_url(&domain, url.as_str())?;
|
||||
txn.commit()?;
|
||||
Ok(())
|
||||
})
|
||||
.await?;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check our robot rules are valid for that URL.
|
||||
let robot_url = robots_txt_url_for(&url)?;
|
||||
let robot_url = robots_txt_url_for(&url)
|
||||
.with_context(|| format!("failed to get robots.txt URL for {url_str:?}"))?;
|
||||
if Some(&robot_url) != current_robot_rules_url.as_ref() {
|
||||
// We need to update our robot rules!
|
||||
current_robot_rules = self.get_robot_rules(&url).await?;
|
||||
match self.get_robot_rules(&url).await {
|
||||
Ok(rules) => {
|
||||
current_robot_rules = rules;
|
||||
}
|
||||
Err(err) => {
|
||||
self.process_outcome(
|
||||
&url,
|
||||
RakeOutcome::TemporaryFailure(TemporaryFailure {
|
||||
reason: TemporaryFailureReason::UnknownClientError(format!(
|
||||
"robots.txt failure {:?}: {:?}",
|
||||
url, err
|
||||
)),
|
||||
// Back off for a day: this ought to be enough time for the operator to fix the problem... maybe?
|
||||
backoff_sec: 86400,
|
||||
}),
|
||||
)
|
||||
.await
|
||||
.context("failed to handle TemporaryFailure outcome for robots.txt")?;
|
||||
// Forcefully change domain
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
current_robot_rules_url = Some(robot_url);
|
||||
}
|
||||
|
||||
|
@ -195,7 +203,8 @@ impl TaskContext {
|
|||
reason: PermanentFailureReason::DeniedToRobots,
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.context("failed to process PermanentFailure outcome for robots.txt")?;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -231,6 +240,7 @@ impl TaskContext {
|
|||
} else {
|
||||
&self.client
|
||||
};
|
||||
debug!("Rake: {url}");
|
||||
let raked = self.raker.rake(&url, url_record.intent, client).await;
|
||||
drop(permit);
|
||||
|
||||
|
@ -239,15 +249,21 @@ impl TaskContext {
|
|||
|
||||
let rake_outcome = raked.unwrap_or_else(|err| {
|
||||
warn!("Failed to rake {:?}: {:?}", url, err);
|
||||
// Treat this as a temporary rejection (backoff).
|
||||
RakeOutcome::TemporaryFailure(TemporaryFailure {
|
||||
reason: TemporaryFailureReason::UnknownClientError(format!(
|
||||
"Failed to rake {:?}: {:?}",
|
||||
url, err
|
||||
)),
|
||||
// Back off for a day: this ought to be enough time for the operator to fix the problem... maybe?
|
||||
backoff_sec: 86400,
|
||||
})
|
||||
|
||||
match err.downcast::<PermanentFailure>() {
|
||||
Ok(permanent) => RakeOutcome::PermanentFailure(permanent),
|
||||
Err(err) => {
|
||||
// Treat this as a temporary rejection (backoff).
|
||||
RakeOutcome::TemporaryFailure(TemporaryFailure {
|
||||
reason: TemporaryFailureReason::UnknownClientError(format!(
|
||||
"Failed to rake {:?}: {:?}",
|
||||
url, err
|
||||
)),
|
||||
// Back off for a day: this ought to be enough time for the operator to fix the problem... maybe?
|
||||
backoff_sec: 86400,
|
||||
})
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
match self.process_outcome(&url, rake_outcome).await? {
|
||||
|
@ -293,7 +309,8 @@ impl TaskContext {
|
|||
txn.commit()?;
|
||||
Ok(())
|
||||
})
|
||||
.await?;
|
||||
.await
|
||||
.context("failure whilst turning long crawl delay into backoff")?;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -319,10 +336,12 @@ impl TaskContext {
|
|||
|
||||
self.as_event_processor()
|
||||
.process_page(url.clone(), page.page_entry, today)
|
||||
.await?;
|
||||
.await
|
||||
.context("failure processing page for RakedPage")?;
|
||||
self.as_event_processor()
|
||||
.process_refs(url.clone(), page.referrer_entry, today)
|
||||
.await?;
|
||||
.process_refs(url.clone(), page.referrer_entry, today, false)
|
||||
.await
|
||||
.context("failure processing refs for RakedPage")?;
|
||||
|
||||
Ok(NextAction::Continue)
|
||||
}
|
||||
|
@ -338,8 +357,9 @@ impl TaskContext {
|
|||
.context("Reference processor shut down; can't stream references!")?;
|
||||
|
||||
self.as_event_processor()
|
||||
.process_refs(url.clone(), refs, today)
|
||||
.await?;
|
||||
.process_refs(url.clone(), refs, today, true)
|
||||
.await
|
||||
.context("failure processing refs for RakedFeed")?;
|
||||
|
||||
Ok(NextAction::Continue)
|
||||
}
|
||||
|
@ -355,8 +375,9 @@ impl TaskContext {
|
|||
.context("Reference processor shut down; can't stream references!")?;
|
||||
|
||||
self.as_event_processor()
|
||||
.process_refs(url.clone(), refs, today)
|
||||
.await?;
|
||||
.process_refs(url.clone(), refs, today, true)
|
||||
.await
|
||||
.context("failure processing refs for RakedSitemap")?;
|
||||
|
||||
Ok(NextAction::Continue)
|
||||
}
|
||||
|
@ -375,17 +396,19 @@ impl TaskContext {
|
|||
|
||||
self.as_event_processor()
|
||||
.process_icon(url.clone(), today)
|
||||
.await?;
|
||||
.await
|
||||
.context("failure processing icon for RakedIcon")?;
|
||||
|
||||
Ok(NextAction::Continue)
|
||||
}
|
||||
RakeOutcome::Redirect { reason, new_url } => {
|
||||
let refs = RakedReferrerEntry {
|
||||
references: [RakedReference {
|
||||
target: new_url.to_string(),
|
||||
target: clean_url(&new_url).to_string(),
|
||||
kind: match reason {
|
||||
RedirectReason::Redirected { .. } => ReferenceKind::Redirect,
|
||||
RedirectReason::NotCanonical { .. } => ReferenceKind::CanonicalUrl,
|
||||
RedirectReason::SecureUpgrade => ReferenceKind::SecureUpgrade,
|
||||
},
|
||||
last_mod: None,
|
||||
}]
|
||||
|
@ -400,8 +423,9 @@ impl TaskContext {
|
|||
.context("Reference processor shut down; can't stream references!")?;
|
||||
|
||||
self.as_event_processor()
|
||||
.process_refs(url.clone(), refs, today)
|
||||
.await?;
|
||||
.process_refs(url.clone(), refs, today, false)
|
||||
.await
|
||||
.context("Failure processing refs for Redirect")?;
|
||||
|
||||
Ok(NextAction::Continue)
|
||||
}
|
||||
|
@ -409,7 +433,9 @@ impl TaskContext {
|
|||
// TODO(future) do we want to log this somewhere?
|
||||
// or at least a metric
|
||||
|
||||
let domain = get_reduced_domain(url)?;
|
||||
let domain = get_reduced_domain(url).with_context(|| {
|
||||
format!("No domain in URL '{url}' for which we are processing the outcome!")
|
||||
})?;
|
||||
let url = url.clone();
|
||||
|
||||
// TODO(feature) add 1.1× the previous backoff, if there was one.
|
||||
|
@ -422,7 +448,8 @@ impl TaskContext {
|
|||
txn.commit()?;
|
||||
Ok(())
|
||||
})
|
||||
.await?;
|
||||
.await
|
||||
.context("failed to store backoff")?;
|
||||
|
||||
// Change domain now
|
||||
Ok(NextAction::ChangeDomain)
|
||||
|
@ -435,7 +462,8 @@ impl TaskContext {
|
|||
.context("Rejection processor shut down; can't stream rejection!!")?;
|
||||
self.as_event_processor()
|
||||
.process_rejection(url.clone(), today)
|
||||
.await?;
|
||||
.await
|
||||
.context("failed to process rejection for PermanentFailure")?;
|
||||
|
||||
// Reasons for permanent rejection aren't our fault or a site-wide fault;
|
||||
// so don't worry about carrying on.
|
||||
|
@ -447,6 +475,7 @@ impl TaskContext {
|
|||
fn as_event_processor(&self) -> EventProcessor {
|
||||
EventProcessor {
|
||||
store: Cow::Borrowed(&self.store),
|
||||
rerake_timings: &self.rerake_timings,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -456,6 +485,7 @@ impl TaskContext {
|
|||
/// just by replaying the stream of RakePacks and importing seeds.
|
||||
pub struct EventProcessor<'a> {
|
||||
store: Cow<'a, RakerStore>,
|
||||
rerake_timings: &'a RerakeTimings,
|
||||
}
|
||||
|
||||
impl EventProcessor<'_> {
|
||||
|
@ -465,22 +495,28 @@ impl EventProcessor<'_> {
|
|||
page: RakedPageEntry,
|
||||
datestamp: u16,
|
||||
) -> anyhow::Result<()> {
|
||||
let rerake_on = Some(datestamp + self.rerake_timings.page);
|
||||
self.store
|
||||
.as_ref()
|
||||
.async_rw_txn(move |txn| {
|
||||
let domain = get_reduced_domain(&url)?;
|
||||
let domain = get_reduced_domain(&url).with_context(|| {
|
||||
format!("No domain for URL '{url}' for which we are processing the page!")
|
||||
})?;
|
||||
txn.mark_url_as_visited(
|
||||
domain.as_ref(),
|
||||
url.as_ref(),
|
||||
UrlVisitedRecord {
|
||||
last_visited_days: datestamp,
|
||||
},
|
||||
rerake_on,
|
||||
)?;
|
||||
|
||||
// If there's a favicon to be tried, add it to the list...
|
||||
let favicon_url_rel = page.document.head.effective_favicon_url();
|
||||
if let Ok(favicon_url) = url.join(favicon_url_rel) {
|
||||
txn.enqueue_url(favicon_url.as_str(), None, RakeIntent::Icon)?;
|
||||
if SUPPORTED_SCHEMES.contains(&favicon_url.scheme()) {
|
||||
txn.enqueue_url(favicon_url.as_str(), None, RakeIntent::Icon)?;
|
||||
}
|
||||
}
|
||||
|
||||
txn.commit()?;
|
||||
|
@ -490,16 +526,21 @@ impl EventProcessor<'_> {
|
|||
}
|
||||
|
||||
pub async fn process_icon(&self, url: Url, datestamp: u16) -> anyhow::Result<()> {
|
||||
let rerake_on = Some(datestamp + self.rerake_timings.icon);
|
||||
|
||||
self.store
|
||||
.as_ref()
|
||||
.async_rw_txn(move |txn| {
|
||||
let domain = get_reduced_domain(&url)?;
|
||||
let domain = get_reduced_domain(&url).with_context(|| {
|
||||
format!("No domain for URL '{url}' for which we are processing an icon!")
|
||||
})?;
|
||||
txn.mark_url_as_visited(
|
||||
domain.as_ref(),
|
||||
url.as_ref(),
|
||||
UrlVisitedRecord {
|
||||
last_visited_days: datestamp,
|
||||
},
|
||||
rerake_on,
|
||||
)?;
|
||||
|
||||
txn.commit()?;
|
||||
|
@ -513,46 +554,66 @@ impl EventProcessor<'_> {
|
|||
url: Url,
|
||||
refs: RakedReferrerEntry,
|
||||
datestamp: u16,
|
||||
rerakeable_feed: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let rerake_on = if rerakeable_feed {
|
||||
Some(self.rerake_timings.feed)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
self.store
|
||||
.as_ref()
|
||||
.async_rw_txn(move |txn| {
|
||||
let domain = get_reduced_domain(&url)?;
|
||||
let domain = get_reduced_domain(&url).with_context(|| {
|
||||
format!("No domain for URL '{url}' for which we are processing refs!")
|
||||
})?;
|
||||
txn.mark_url_as_visited(
|
||||
domain.as_ref(),
|
||||
url.as_ref(),
|
||||
UrlVisitedRecord {
|
||||
last_visited_days: datestamp,
|
||||
},
|
||||
)?;
|
||||
rerake_on,
|
||||
)
|
||||
.context("failed to mark URL as visited")?;
|
||||
|
||||
// track all the referred-to URLs!
|
||||
for reference in refs.references {
|
||||
let ref_url = Url::parse(&reference.target)?;
|
||||
let domain = get_reduced_domain(&ref_url)?;
|
||||
let ref_url = Url::parse(&reference.target).with_context(|| {
|
||||
format!(
|
||||
"failed to parse target URL of reference: {:?}",
|
||||
reference.target
|
||||
)
|
||||
})?;
|
||||
let domain = get_reduced_domain(&ref_url).with_context(|| {
|
||||
format!("failed to reduce domain: {:?}", reference.target)
|
||||
})?;
|
||||
|
||||
// First check if this URL is an allowed URL (hence should be enqueued)
|
||||
// Check if this URL is an allowed URL (hence should be enqueued)
|
||||
let allowed = txn
|
||||
.get_allowed_domain_record(domain.borrow())?
|
||||
.map(|record: AllowedDomainRecord| record.applies_to_url(&ref_url))
|
||||
.unwrap_or(false);
|
||||
if allowed {
|
||||
txn.enqueue_url(
|
||||
&reference.target,
|
||||
reference.last_mod,
|
||||
reference.kind.into(),
|
||||
)?;
|
||||
continue;
|
||||
}
|
||||
.get_domain_record(domain.borrow())?
|
||||
.map(|record: DomainRecord| record.is_url_rakeable(&ref_url))
|
||||
.flatten();
|
||||
|
||||
// Then check if this URL is a weed (hence should be ignored)
|
||||
let is_weed = txn
|
||||
.get_weed_domain_record(domain.borrow())?
|
||||
.map(|record: WeedDomainRecord| record.applies_to_url(&ref_url))
|
||||
.unwrap_or(false);
|
||||
if !is_weed {
|
||||
// It's neither allowed nor weeded, so put it on hold for later inspection
|
||||
txn.put_url_on_hold(&reference.target, reference.kind.into())?;
|
||||
match allowed {
|
||||
Some(true) => {
|
||||
let is_fresh = txn.enqueue_url(
|
||||
&reference.target,
|
||||
reference.last_mod,
|
||||
reference.kind.into(),
|
||||
)?;
|
||||
if is_fresh {
|
||||
increment_counter!("qprake_queue_new_url");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
Some(false) => {
|
||||
// Weed! Do nothing.
|
||||
}
|
||||
None => {
|
||||
// It's neither allowed nor weeded, so put it on hold for later inspection
|
||||
txn.put_url_on_hold(&reference.target, reference.kind.into())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -566,13 +627,16 @@ impl EventProcessor<'_> {
|
|||
self.store
|
||||
.as_ref()
|
||||
.async_rw_txn(move |txn| {
|
||||
let domain = get_reduced_domain(&url)?;
|
||||
let domain = get_reduced_domain(&url).with_context(|| {
|
||||
format!("No domain for URL '{url}' for which we are processing a rejection!")
|
||||
})?;
|
||||
txn.mark_url_as_visited(
|
||||
domain.as_ref(),
|
||||
url.as_ref(),
|
||||
UrlVisitedRecord {
|
||||
last_visited_days: datestamp,
|
||||
},
|
||||
None,
|
||||
)?;
|
||||
txn.commit()?;
|
||||
Ok(())
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
use crate::raking::{RakeIntent, TemporaryFailure};
|
||||
use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString, MdbxU32, MdbxU64};
|
||||
use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString, MdbxU16BE, MdbxU32, MdbxU64};
|
||||
use crate::storage::migrations::{MIGRATION_KEY, MIGRATION_VERSION};
|
||||
use crate::storage::records::{
|
||||
ActiveDomainRecord, AllowedDomainRecord, BackingOffDomainRecord, OnHoldUrlRecord,
|
||||
QueueUrlRecord, UrlVisitedRecord, WeedDomainRecord,
|
||||
ActiveDomainRecord, BackingOffDomainRecord, DomainRecord, OnHoldUrlRecord, QueueUrlRecord,
|
||||
UrlVisitedRecord,
|
||||
};
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use libmdbx::{
|
||||
Database, DatabaseFlags, Environment, EnvironmentFlags, Transaction, TransactionKind,
|
||||
Database, DatabaseFlags, Environment, EnvironmentFlags, Geometry, Transaction, TransactionKind,
|
||||
WriteFlags, WriteMap, RO, RW,
|
||||
};
|
||||
use log::info;
|
||||
|
@ -16,11 +16,12 @@ use ouroboros::self_referencing;
|
|||
use quickpeep_utils::urls::get_reduced_domain;
|
||||
use reqwest::Url;
|
||||
use std::borrow::{Borrow, Cow};
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use std::ops::Add;
|
||||
use std::path::Path;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
|
||||
pub mod maintenance;
|
||||
pub mod mdbx_helper_types;
|
||||
|
@ -31,6 +32,9 @@ pub mod records;
|
|||
pub struct Databases<'env> {
|
||||
/// Domain \n URL → QueueUrlRecord
|
||||
pub queue_urls: Database<'env>,
|
||||
/// u16 → URL (MULTI-VALUE; INT16). The u16 is the day-precision QuickPeep timestamp at which the URL should
|
||||
/// be enqueued again for reraking.
|
||||
pub rerake_queue: Database<'env>,
|
||||
/// Domain → ActiveDomainRecord
|
||||
pub active_domains: Database<'env>,
|
||||
/// u32 → domain name. Used to try and give some fairness.
|
||||
|
@ -41,18 +45,17 @@ pub struct Databases<'env> {
|
|||
pub backing_off_domains: Database<'env>,
|
||||
/// URL → UrlVisitedRecord
|
||||
pub visited_urls: Database<'env>,
|
||||
/// Domain → AllowedDomainRecord
|
||||
pub allowed_domains: Database<'env>,
|
||||
/// Domain → DomainRecord
|
||||
pub domains: Database<'env>,
|
||||
/// Domain \n URL → OnHoldUrlRecord Number of refs (INT VALUE)
|
||||
pub urls_on_hold: Database<'env>,
|
||||
/// Domain → WeedDomainRecord
|
||||
pub weed_domains: Database<'env>,
|
||||
}
|
||||
|
||||
impl<'env> Databases<'env> {
|
||||
pub fn iter_all_databases(&self) -> impl Iterator<Item = (&'static str, &Database<'env>)> {
|
||||
[
|
||||
("queue_urls", &self.queue_urls),
|
||||
("rerake_queue", &self.rerake_queue),
|
||||
("active_domains", &self.active_domains),
|
||||
("active_domain_raffle", &self.active_domain_raffle),
|
||||
(
|
||||
|
@ -61,9 +64,8 @@ impl<'env> Databases<'env> {
|
|||
),
|
||||
("backing_off_domains", &self.backing_off_domains),
|
||||
("visited_urls", &self.visited_urls),
|
||||
("allowed_domains", &self.allowed_domains),
|
||||
("domains", &self.domains),
|
||||
("urls_on_hold", &self.urls_on_hold),
|
||||
("weed_domains", &self.weed_domains),
|
||||
]
|
||||
.into_iter()
|
||||
}
|
||||
|
@ -72,6 +74,7 @@ impl<'env> Databases<'env> {
|
|||
// Must match the order of the Databases struct fields.
|
||||
pub const DATABASES: [(&'static str, DatabaseFlags); 9] = [
|
||||
("urls_queue", DatabaseFlags::empty()),
|
||||
("rerake_queue", DatabaseFlags::DUP_SORT),
|
||||
("active_domains", DatabaseFlags::empty()),
|
||||
("active_domain_raffle", DatabaseFlags::INTEGER_KEY),
|
||||
(
|
||||
|
@ -80,9 +83,8 @@ pub const DATABASES: [(&'static str, DatabaseFlags); 9] = [
|
|||
),
|
||||
("backing_off_domains", DatabaseFlags::empty()),
|
||||
("urls_visited", DatabaseFlags::empty()),
|
||||
("allowed_domains", DatabaseFlags::empty()),
|
||||
("domains", DatabaseFlags::empty()),
|
||||
("urls_on_hold", DatabaseFlags::empty()),
|
||||
("weed_domains", DatabaseFlags::empty()),
|
||||
];
|
||||
|
||||
#[self_referencing]
|
||||
|
@ -106,7 +108,19 @@ impl RakerStore {
|
|||
let mut flags = EnvironmentFlags::default();
|
||||
flags.no_sub_dir = true;
|
||||
|
||||
let mut geom = Geometry::default();
|
||||
// Don't stop the database growing until it hits 64 GiB.
|
||||
// (The default is 1 MiB which is just not enough!)
|
||||
geom.size = Some(1024 * 1024..64 * 1024 * 1024 * 1024);
|
||||
|
||||
// Grow 16 MiB at a time.
|
||||
geom.growth_step = Some(16 * 1024 * 1024);
|
||||
// Only shrink once at least 64 MiB of space can be reclaimed.
|
||||
geom.shrink_threshold = Some(64 * 1024 * 1024);
|
||||
// (Yes these numbers represent a large database).
|
||||
|
||||
let env = Environment::new()
|
||||
.set_geometry(geom)
|
||||
.set_max_dbs(256)
|
||||
.set_flags(flags)
|
||||
.open(path)?;
|
||||
|
@ -163,14 +177,14 @@ impl RakerStore {
|
|||
// Must match the order of the DATABASES constant and the struct field definitions
|
||||
Databases {
|
||||
queue_urls: dbs.next().unwrap(),
|
||||
rerake_queue: dbs.next().unwrap(),
|
||||
active_domains: dbs.next().unwrap(),
|
||||
active_domain_raffle: dbs.next().unwrap(),
|
||||
backing_off_reinstatements: dbs.next().unwrap(),
|
||||
backing_off_domains: dbs.next().unwrap(),
|
||||
visited_urls: dbs.next().unwrap(),
|
||||
allowed_domains: dbs.next().unwrap(),
|
||||
domains: dbs.next().unwrap(),
|
||||
urls_on_hold: dbs.next().unwrap(),
|
||||
weed_domains: dbs.next().unwrap(),
|
||||
}
|
||||
},
|
||||
}
|
||||
|
@ -324,9 +338,11 @@ impl<'a> RakerTxn<'a, RW> {
|
|||
domain: &str,
|
||||
url_str: &str,
|
||||
record: UrlVisitedRecord,
|
||||
rerake_on: Option<u16>,
|
||||
) -> anyhow::Result<()> {
|
||||
let queue_urls = &self.mdbx.borrow_dbs().queue_urls;
|
||||
let visited_urls = &self.mdbx.borrow_dbs().visited_urls;
|
||||
let rerake_queue = &self.mdbx.borrow_dbs().rerake_queue;
|
||||
|
||||
let queue_key = format!("{}\n{}", domain, url_str);
|
||||
|
||||
|
@ -345,6 +361,24 @@ impl<'a> RakerTxn<'a, RW> {
|
|||
WriteFlags::empty(),
|
||||
)?;
|
||||
|
||||
if let Some(rerake_on) = rerake_on {
|
||||
self.mdbx_txn.put(
|
||||
rerake_queue,
|
||||
&rerake_on.to_be_bytes(),
|
||||
url_str.as_bytes(),
|
||||
WriteFlags::empty(),
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Takes a URL out of the queue without marking it as visited.
|
||||
pub fn dequeue_url(&self, domain: &str, url_str: &str) -> anyhow::Result<()> {
|
||||
let queue_urls = &self.mdbx.borrow_dbs().queue_urls;
|
||||
let queue_key = format!("{}\n{}", domain, url_str);
|
||||
self.mdbx_txn.del(&queue_urls, queue_key.as_bytes(), None)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -388,6 +422,87 @@ impl<'a> RakerTxn<'a, RW> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
/// Reinstates backing-off domains up to the specified time.
|
||||
/// Returns the time of the next reinstatement, if there is one.
|
||||
pub fn reinstate_backoffs(&self, up_to_ts: SystemTime) -> anyhow::Result<Option<SystemTime>> {
|
||||
let backing_off_domains = &self.mdbx.borrow_dbs().backing_off_domains;
|
||||
let backing_off_reinstatements = &self.mdbx.borrow_dbs().backing_off_reinstatements;
|
||||
|
||||
let reinstate_up_to = up_to_ts.duration_since(UNIX_EPOCH)?.as_secs();
|
||||
|
||||
let mut cur = self.mdbx_txn.cursor(backing_off_reinstatements)?;
|
||||
cur.first::<MdbxU64, MdbxString>()?;
|
||||
loop {
|
||||
let (MdbxU64(reinstatement_time), domain_to_reinstate) =
|
||||
match cur.get_current::<MdbxU64, MdbxString>()? {
|
||||
Some(x) => x,
|
||||
None => break,
|
||||
};
|
||||
|
||||
if reinstatement_time > reinstate_up_to {
|
||||
return Ok(Some(
|
||||
UNIX_EPOCH.add(Duration::from_secs(reinstatement_time)),
|
||||
));
|
||||
}
|
||||
|
||||
let dom_str = domain_to_reinstate.into_string();
|
||||
self.mdbx_txn
|
||||
.del(backing_off_domains, dom_str.clone(), None)?;
|
||||
self.insert_active_domain_with_new_raffle_ticket(dom_str)?;
|
||||
cur.del(WriteFlags::empty())?;
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
/// Reinstates URLs that are now re-rakable.
|
||||
pub fn reinstate_rerakables(&self, today: u16) -> anyhow::Result<()> {
|
||||
let queue_urls = &self.mdbx.borrow_dbs().queue_urls;
|
||||
let rerake_queue = &self.mdbx.borrow_dbs().rerake_queue;
|
||||
|
||||
let mut reinstatable_domains: BTreeSet<String> = BTreeSet::new();
|
||||
|
||||
let mut cur = self.mdbx_txn.cursor(rerake_queue)?;
|
||||
cur.first::<MdbxU16BE, MdbxString>()?;
|
||||
loop {
|
||||
let (MdbxU16BE(rerake_datestamp), url_to_rerake) =
|
||||
match cur.get_current::<MdbxU16BE, MdbxString>()? {
|
||||
Some(x) => x,
|
||||
None => break,
|
||||
};
|
||||
|
||||
if rerake_datestamp > today {
|
||||
break;
|
||||
}
|
||||
|
||||
let url_str = url_to_rerake.into_string();
|
||||
let url = Url::parse(&url_str).context("Failed to parse rerakable URL")?;
|
||||
let url_domain =
|
||||
get_reduced_domain(&url).context("Unable to reduce domain for rerakable URL")?;
|
||||
|
||||
self.mdbx_txn.put(
|
||||
queue_urls,
|
||||
format!("{}\n{}", url_domain, url_str).as_bytes(),
|
||||
// TODO(correctness): should specify the same intent as before.
|
||||
&MdbxBare(QueueUrlRecord {
|
||||
intent: RakeIntent::Any,
|
||||
})
|
||||
.as_bytes(),
|
||||
WriteFlags::NO_OVERWRITE,
|
||||
)?;
|
||||
|
||||
reinstatable_domains.insert(url_domain.into_owned());
|
||||
|
||||
cur.del(WriteFlags::empty())?;
|
||||
}
|
||||
|
||||
for domain in reinstatable_domains {
|
||||
self.insert_active_domain_with_new_raffle_ticket(domain)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Enqueues a URL.
|
||||
/// If `only_if_not_visited_since` is specified, then this is a no-op if the page has already been
|
||||
/// visited since then.
|
||||
|
@ -405,7 +520,8 @@ impl<'a> RakerTxn<'a, RW> {
|
|||
let visited_urls = &self.mdbx.borrow_dbs().visited_urls;
|
||||
|
||||
let url = Url::parse(url_str)?;
|
||||
let url_domain = get_reduced_domain(&url)?;
|
||||
let url_domain = get_reduced_domain(&url)
|
||||
.with_context(|| format!("No domain for to-be-enqueued URL: '{url}'!"))?;
|
||||
|
||||
let queue_key = format!("{}\n{}", url_domain, url);
|
||||
|
||||
|
@ -457,7 +573,8 @@ impl<'a> RakerTxn<'a, RW> {
|
|||
let urls_on_hold = &self.mdbx.borrow_dbs().urls_on_hold;
|
||||
|
||||
let url = Url::parse(url_str)?;
|
||||
let url_domain = get_reduced_domain(&url)?;
|
||||
let url_domain = get_reduced_domain(&url)
|
||||
.with_context(|| format!("No domain for to-be-put-on-hold URL: '{url}'!"))?;
|
||||
|
||||
let queue_key = format!("{}\n{}", url_domain, url);
|
||||
|
||||
|
@ -490,33 +607,17 @@ impl<'a> RakerTxn<'a, RW> {
|
|||
Ok(is_new)
|
||||
}
|
||||
|
||||
pub fn put_allowed_domain_record(
|
||||
pub fn put_domain_record(
|
||||
&self,
|
||||
domain: &str,
|
||||
allowed_domain_record: AllowedDomainRecord,
|
||||
domain_record: DomainRecord,
|
||||
) -> anyhow::Result<()> {
|
||||
let allowed_domains = &self.mdbx.borrow_dbs().allowed_domains;
|
||||
let domains = &self.mdbx.borrow_dbs().domains;
|
||||
|
||||
self.mdbx_txn.put(
|
||||
allowed_domains,
|
||||
domains,
|
||||
domain.as_bytes(),
|
||||
MdbxBare(allowed_domain_record).as_bytes(),
|
||||
WriteFlags::empty(),
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn put_weed_domain_record(
|
||||
&self,
|
||||
domain: &str,
|
||||
weed_domain_record: WeedDomainRecord,
|
||||
) -> anyhow::Result<()> {
|
||||
let weed_domains = &self.mdbx.borrow_dbs().weed_domains;
|
||||
|
||||
self.mdbx_txn.put(
|
||||
weed_domains,
|
||||
domain.as_bytes(),
|
||||
MdbxBare(weed_domain_record).as_bytes(),
|
||||
MdbxBare(domain_record).as_bytes(),
|
||||
WriteFlags::empty(),
|
||||
)?;
|
||||
Ok(())
|
||||
|
@ -541,6 +642,7 @@ pub fn register_datastore_metrics() -> anyhow::Result<()> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
pub enum RandomActiveDomainAcquisition {
|
||||
GotOne {
|
||||
domain: String,
|
||||
|
@ -665,27 +767,12 @@ impl<'a, K: TransactionKind> RakerTxn<'a, K> {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn get_allowed_domain_record(
|
||||
&self,
|
||||
domain: &str,
|
||||
) -> anyhow::Result<Option<AllowedDomainRecord>> {
|
||||
let allowed_domains = &self.mdbx.borrow_dbs().allowed_domains;
|
||||
pub fn get_domain_record(&self, domain: &str) -> anyhow::Result<Option<DomainRecord>> {
|
||||
let domains = &self.mdbx.borrow_dbs().domains;
|
||||
|
||||
match self
|
||||
.mdbx_txn
|
||||
.get::<MdbxBare<AllowedDomainRecord>>(allowed_domains, domain.as_bytes())?
|
||||
{
|
||||
None => Ok(None),
|
||||
Some(MdbxBare(record)) => Ok(Some(record)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_weed_domain_record(&self, domain: &str) -> anyhow::Result<Option<WeedDomainRecord>> {
|
||||
let weed_domains = &self.mdbx.borrow_dbs().weed_domains;
|
||||
|
||||
match self
|
||||
.mdbx_txn
|
||||
.get::<MdbxBare<WeedDomainRecord>>(weed_domains, domain.as_bytes())?
|
||||
.get::<MdbxBare<DomainRecord>>(domains, domain.as_bytes())?
|
||||
{
|
||||
None => Ok(None),
|
||||
Some(MdbxBare(record)) => Ok(Some(record)),
|
||||
|
@ -705,3 +792,89 @@ impl<'a, K: TransactionKind> RakerTxn<'a, K> {
|
|||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod test {
|
||||
use super::*;
|
||||
use crate::raking::TemporaryFailureReason;
|
||||
use std::collections::BTreeSet;
|
||||
use tempfile::NamedTempFile;
|
||||
|
||||
#[test]
|
||||
fn test_reinstate_multiple_domains() -> anyhow::Result<()> {
|
||||
let tfile = NamedTempFile::new()?;
|
||||
let store = RakerStore::open(tfile.path())?;
|
||||
{
|
||||
let txn = store.rw_txn()?;
|
||||
txn.insert_active_domain_with_new_raffle_ticket("a.invalid".to_owned())?;
|
||||
txn.insert_active_domain_with_new_raffle_ticket("b.invalid".to_owned())?;
|
||||
txn.commit()?;
|
||||
}
|
||||
|
||||
let now = SystemTime::now();
|
||||
|
||||
{
|
||||
let txn = store.rw_txn()?;
|
||||
txn.start_backing_off(
|
||||
"a.invalid",
|
||||
300,
|
||||
"".to_owned(),
|
||||
TemporaryFailure {
|
||||
reason: TemporaryFailureReason::ExcruciatingCrawlDelay(1),
|
||||
backoff_sec: 300,
|
||||
},
|
||||
)?;
|
||||
txn.start_backing_off(
|
||||
"b.invalid",
|
||||
300,
|
||||
"".to_owned(),
|
||||
TemporaryFailure {
|
||||
reason: TemporaryFailureReason::ExcruciatingCrawlDelay(1),
|
||||
backoff_sec: 300,
|
||||
},
|
||||
)?;
|
||||
txn.commit()?;
|
||||
}
|
||||
|
||||
{
|
||||
let txn = store.ro_txn()?;
|
||||
assert_eq!(
|
||||
txn.acquire_random_active_domain(Default::default())?,
|
||||
RandomActiveDomainAcquisition::NoneLeft
|
||||
);
|
||||
}
|
||||
|
||||
{
|
||||
let txn = store.rw_txn()?;
|
||||
txn.reinstate_backoffs(now + Duration::from_secs(600))?;
|
||||
txn.commit()?;
|
||||
}
|
||||
|
||||
{
|
||||
let txn = store.ro_txn()?;
|
||||
let busy = Default::default();
|
||||
|
||||
let acq1 = txn.acquire_random_active_domain(Arc::clone(&busy))?;
|
||||
let acq2 = txn.acquire_random_active_domain(Arc::clone(&busy))?;
|
||||
|
||||
assert!(
|
||||
matches!((acq1.clone(), acq2.clone()), (
|
||||
RandomActiveDomainAcquisition::GotOne {
|
||||
domain: dom1,
|
||||
..
|
||||
},
|
||||
RandomActiveDomainAcquisition::GotOne {
|
||||
domain: dom2,
|
||||
..
|
||||
}
|
||||
) if vec![dom1.as_ref(), dom2.as_ref()].into_iter().collect::<BTreeSet<&str>>() == vec![
|
||||
"a.invalid", "b.invalid"
|
||||
].into_iter().collect::<BTreeSet<&str>>()),
|
||||
"{:#?}",
|
||||
(acq1, acq2)
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,9 +1,8 @@
|
|||
use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString};
|
||||
use crate::storage::records::{AllowedDomainRecord, OnHoldUrlRecord, WeedDomainRecord};
|
||||
use crate::storage::records::{DomainRecord, OnHoldUrlRecord};
|
||||
use crate::storage::RakerTxn;
|
||||
use anyhow::Context;
|
||||
use libmdbx::{Database, WriteFlags, RW};
|
||||
use log::warn;
|
||||
use reqwest::Url;
|
||||
|
||||
/// Runs one big transaction that:
|
||||
|
@ -16,8 +15,7 @@ use reqwest::Url;
|
|||
pub fn reapply_seeds_and_weeds_to_on_hold_urls(txn: RakerTxn<RW>) -> anyhow::Result<()> {
|
||||
struct DomainState {
|
||||
pub domain: String,
|
||||
pub allowed_domain_record: Option<AllowedDomainRecord>,
|
||||
pub weed_domain_record: Option<WeedDomainRecord>,
|
||||
pub domain_record: Option<DomainRecord>,
|
||||
}
|
||||
|
||||
let urls_on_hold: &Database = &txn.mdbx.borrow_dbs().urls_on_hold;
|
||||
|
@ -47,44 +45,33 @@ pub fn reapply_seeds_and_weeds_to_on_hold_urls(txn: RakerTxn<RW>) -> anyhow::Res
|
|||
// Then load the relevant records for it.
|
||||
domain_state = Some(DomainState {
|
||||
domain: domain.to_owned(),
|
||||
allowed_domain_record: txn.get_allowed_domain_record(domain)?,
|
||||
weed_domain_record: txn.get_weed_domain_record(domain)?,
|
||||
domain_record: txn.get_domain_record(domain)?,
|
||||
});
|
||||
}
|
||||
|
||||
let url = Url::parse(url_str)?;
|
||||
|
||||
let domain_state = domain_state.as_ref().unwrap();
|
||||
let is_allowed = domain_state
|
||||
.allowed_domain_record
|
||||
.as_ref()
|
||||
.map(|adr: &AllowedDomainRecord| adr.applies_to_url(&url))
|
||||
.unwrap_or(false);
|
||||
let is_weed = domain_state
|
||||
.weed_domain_record
|
||||
.as_ref()
|
||||
.map(|wdr: &WeedDomainRecord| wdr.applies_to_url(&url))
|
||||
.unwrap_or(false);
|
||||
|
||||
match (is_allowed, is_weed) {
|
||||
(false, false) => { /* nop */ }
|
||||
(true, true) => {
|
||||
warn!(
|
||||
"Ambiguous: {:?} is both mentioned by a seed and a weed. Ignoring.",
|
||||
url
|
||||
);
|
||||
}
|
||||
(true, false) => {
|
||||
let is_rakeable = domain_state
|
||||
.domain_record
|
||||
.as_ref()
|
||||
.map(|dr: &DomainRecord| dr.is_url_rakeable(&url))
|
||||
.flatten();
|
||||
|
||||
match is_rakeable {
|
||||
Some(true) => {
|
||||
// ALLOWED
|
||||
// Make it a queued URL
|
||||
txn.enqueue_url(url_str, None, record.queue_record.intent)?;
|
||||
cur.del(WriteFlags::empty())?;
|
||||
}
|
||||
(false, true) => {
|
||||
Some(false) => {
|
||||
// WEED
|
||||
// Just delete
|
||||
cur.del(WriteFlags::empty())?;
|
||||
}
|
||||
None => { /* nop: neither allowed nor a weed. Keep on hold. */ }
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -4,6 +4,32 @@ use serde::de::DeserializeOwned;
|
|||
use serde::Serialize;
|
||||
use std::borrow::Cow;
|
||||
|
||||
/// u16 in BIG byte endianness (u16 not supported by INTEGERKEY mode!)
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct MdbxU16BE(pub u16);
|
||||
|
||||
impl MdbxU16BE {
|
||||
pub fn as_bytes(&self) -> Cow<'_, [u8]> {
|
||||
Cow::Owned(self.0.to_be_bytes().to_vec())
|
||||
}
|
||||
}
|
||||
|
||||
impl TableObject<'_> for MdbxU16BE {
|
||||
fn decode(data_val: &[u8]) -> Result<Self, libmdbx::Error>
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
if data_val.len() != 2 {
|
||||
return Err(libmdbx::Error::DecodeError(
|
||||
anyhow!("MDBX Key not 2 bytes; can't be decoded as u16").into(),
|
||||
));
|
||||
}
|
||||
let mut buf = [0u8; 2];
|
||||
buf.copy_from_slice(&data_val);
|
||||
Ok(MdbxU16BE(u16::from_be_bytes(buf)))
|
||||
}
|
||||
}
|
||||
|
||||
/// u32 in native byte endianness (as required by INTEGERKEY mode)
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct MdbxU32(pub u32);
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
use crate::raking::{RakeIntent, TemporaryFailure};
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
|
||||
pub struct ActiveDomainRecord {
|
||||
/// The raffle ticket number owned by this domain.
|
||||
pub raffle_ticket: u32,
|
||||
|
@ -11,11 +11,10 @@ pub struct ActiveDomainRecord {
|
|||
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
pub struct UrlVisitedRecord {
|
||||
/// Number of minutes since the QuickPeep Epoch that this page was last raked at.
|
||||
/// We store minutes to give us 60× the range of times.
|
||||
/// We'd really rather stick with 32-bit ints to reduce the space storage requirements.
|
||||
/// We could *possibly* go for a u16 in the future and store number of days (179 years' range):
|
||||
/// sitemaps and feeds usually only tell you the date the page was last updated.
|
||||
/// Number of days since the QuickPeep Epoch that this page was last raked at.
|
||||
/// A u16 is fine here, giving 179 years' worth of values. This allows compact encoding.
|
||||
/// We don't really care about a more granular timestamp: sitemaps and feeds usually only
|
||||
/// give the date of last update anyway.
|
||||
pub last_visited_days: u16,
|
||||
}
|
||||
|
||||
|
@ -47,26 +46,20 @@ pub struct BackingOffDomainRecord {
|
|||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
|
||||
pub struct AllowedDomainRecord {
|
||||
/// Set of acceptable path prefixes.
|
||||
/// Empty if ALL path prefixes are permitted.
|
||||
pub restricted_prefixes: BTreeSet<String>,
|
||||
pub struct DomainRecord {
|
||||
pub rakeable_path_prefixes: BTreeMap<String, bool>,
|
||||
}
|
||||
|
||||
impl AllowedDomainRecord {
|
||||
/// Returns true iff this record applies to this URL.
|
||||
impl DomainRecord {
|
||||
/// Returns whether the URL is rakeable.
|
||||
///
|
||||
/// Preconditions: it has been checked that the record applies to the domain
|
||||
pub fn applies_to_url(&self, url: &Url) -> bool {
|
||||
if self.restricted_prefixes.is_empty() {
|
||||
return true;
|
||||
}
|
||||
|
||||
let mut applies = false;
|
||||
for prefix in self.restricted_prefixes.iter() {
|
||||
pub fn is_url_rakeable(&self, url: &Url) -> Option<bool> {
|
||||
let mut final_result = None;
|
||||
// TODO This could be made more efficient.
|
||||
for (prefix, &rakeable) in self.rakeable_path_prefixes.iter() {
|
||||
if url.path().starts_with(prefix) {
|
||||
applies = true;
|
||||
break;
|
||||
final_result = Some(rakeable);
|
||||
}
|
||||
if prefix.as_str() > url.path() {
|
||||
// e.g. /dog > /cat/xyz
|
||||
|
@ -75,39 +68,6 @@ impl AllowedDomainRecord {
|
|||
break;
|
||||
}
|
||||
}
|
||||
applies
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
|
||||
pub struct WeedDomainRecord {
|
||||
/// Set of weedy path prefixes.
|
||||
/// Empty if ALL path prefixes are weedy.
|
||||
pub restricted_prefixes: BTreeSet<String>,
|
||||
}
|
||||
|
||||
impl WeedDomainRecord {
|
||||
/// Returns true iff this record applies to this URL.
|
||||
///
|
||||
/// Preconditions: it has been checked that the record applies to the domain
|
||||
pub fn applies_to_url(&self, url: &Url) -> bool {
|
||||
if self.restricted_prefixes.is_empty() {
|
||||
return true;
|
||||
}
|
||||
|
||||
let mut applies = false;
|
||||
for prefix in self.restricted_prefixes.iter() {
|
||||
if url.path().starts_with(prefix) {
|
||||
applies = true;
|
||||
break;
|
||||
}
|
||||
if prefix.as_str() > url.path() {
|
||||
// e.g. /dog > /cat/xyz
|
||||
// This means we've missed all chances to see our prefix,
|
||||
// so we break here (efficiency).
|
||||
break;
|
||||
}
|
||||
}
|
||||
applies
|
||||
final_result
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,132 @@
|
|||
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>{{ search_term }} — QuickPeep</title>
    <link rel="stylesheet" type="text/css" href="dist/main.css">
</head>
<body>
<!-- Header -->
<div class="container_overall">
    <div class="left_side_container">
        <header>
            <form method="GET" action="search">
                <fieldset class="horizontal">
                    <a href="/" title="QuickPeep"><img src="/static/quickpeep_logo_sml.png" alt="QuickPeep Logo" class="bar_logo"></a>
                    <input type="search" id="search" name="q" placeholder="..." value="{{ search_term }}" class="grow">

                    <input type="submit" value="Search" class="shrink">
                </fieldset>
            </form>
        </header><!-- ./ Header -->

        <!-- Main -->
        <main class="search">
            {% if show_spiel %}
            <p>
                QuickPeep is a hobbyist, open-source and very immature (for now) web search engine. It's intended to help you encounter webpages that are interesting and from a real person, rather than from a 'content mill' or other source of SEO spam. In general, websites that don't respect the reader are unwelcome.
            </p>
            <p>
                QuickPeep's approach to rubbish websites is to 'just' not index them! This also helps with another goal of the project, which is to allow anyone to run an instance of QuickPeep with only modest hardware requirements (especially storage space which could easily be problematic).
            </p>
            <p>
                This is an ambitious project and it is probably not very usable right now. It may never be. With that said, I'm hoping to see how far I can take it.
            </p>
            <p>
                There is an <a href="https://o.librepush.net/blog/2022-07-02-quickpeep-small-scale-web-search-engine">article introducing the project on my personal blog</a>.<br>
                The source code is <a href="https://git.emunest.net/reivilibre/quickpeep.git">available on my personal Gitea instance</a>.
            </p>
            {% endif %}

            {# The favicons below are decorative (the adjacent link already names the result), so they carry an empty alt text. #}
            {# NOTE(review): result.excerpt is emitted with |safe, i.e. unescaped; confirm it is sanitised before it reaches this template. #}
            <ul class="search_results">
                <li>
                    <img src="/icon.webp?b={{ result.favicon_url }}" alt="">
                    <div class="result_title"><a href="{{ result.url }}" rel="nofollow noreferrer">{{ result.title }}</a></div>
                    <div class="result_excerpt">
                        {{- result.excerpt|safe -}}
                    </div>
                    <ul class="result_tags">
                        {%- for tag in result.tags -%}
                        <li>{{ tag }}</li>
                        {%- endfor -%}
                    </ul>
                    <div class="result_url">{{ result.url }}</div>
                </li>
                <li>
                    <img src="/icon.webp?b={{ result.favicon_url }}" alt="">
                    <div class="result_title"><a href="{{ result.url }}" rel="nofollow noreferrer">{{ result.title }}</a></div>
                    <div class="result_excerpt">
                        {{- result.excerpt|safe -}}
                    </div>
                    <ul class="result_tags">
                        {%- for tag in result.tags -%}
                        <li>{{ tag }}</li>
                        {%- endfor -%}
                    </ul>
                    <div class="result_url">{{ result.url }}</div>
                </li>
                <li>
                    <img src="/icon.webp?b={{ result.favicon_url }}" alt="">
                    <div class="result_title"><a href="{{ result.url }}" rel="nofollow noreferrer">{{ result.title }}</a></div>
                    <div class="result_excerpt">
                        {{- result.excerpt|safe -}}
                    </div>
                    <ul class="result_tags">
                        <li>{{ tag }}</li>
                        <li>{{ tag }}</li>
                        <li>{{ tag }}</li>
                        <li>{{ tag }}</li>
                        <li>{{ tag }}</li>
                    </ul>
                    <div class="result_url">{{ result.url }}</div>
                </li>
                <li>
                    <img src="/icon.webp?b={{ result.favicon_url }}" alt="">
                    <div class="result_title"><a href="{{ result.url }}" rel="nofollow noreferrer">{{ result.title }}</a></div>
                    <div class="result_excerpt">
                        {{- result.excerpt|safe -}}
                    </div>
                    <ul class="result_tags">
                        <li>{{ tag }}</li>
                        <li>{{ tag }}</li>
                        <li>{{ tag }}</li>
                        <li>{{ tag }}</li>
                        <li>{{ tag }}</li>
                    </ul>
                    <div class="result_url">{{ result.url }}</div>
                </li>
                <li>
                    <img src="/icon.webp?b={{ result.favicon_url }}" alt="">
                    <div class="result_title"><a href="{{ result.url }}" rel="nofollow noreferrer">{{ result.title }}</a></div>
                    <div class="result_excerpt">
                        {{- result.excerpt|safe -}}
                    </div>
                    <ul class="result_tags">
                        <li>{{ tag }}</li>
                        <li>{{ tag }}</li>
                        <li>{{ tag }}</li>
                        <li>{{ tag }}</li>
                        <li>{{ tag }}</li>
                    </ul>
                    <div class="result_url">{{ result.url }}</div>
                </li>
            </ul>
        </main>
    </div>

    <div class="right_side_container">
        <!-- Preview pane -->

    </div>
</div>


<footer class="container">
    {% for (method, url) in contact %}
    <a href="{{ url }}">{{ method }}</a> •
    {% endfor %}
    <a href="/">Return to QuickPeep Root</a>
</footer>
</body>
</html>
|
|
@ -0,0 +1,10 @@
|
|||
|
||||
// Colour of the link inside a search result's title.
.result_title a {
  color: palegoldenrod;
}

// Colour of each tag that is a direct child of a result's tag list.
.result_tags > li {
  color: palegreen;
}
|
|
@ -4,6 +4,11 @@
|
|||
--typography-spacing-vertical: 1rem;
|
||||
}
|
||||
|
||||
// light green theming
|
||||
$primary-500: #8bc34a;
|
||||
$primary-600: #7cb342;
|
||||
$primary-700: #689f38;
|
||||
|
||||
.bar_happy {
|
||||
padding: 1em;
|
||||
margin: 3em;
|
||||
|
@ -16,8 +21,6 @@
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@media only screen and (max-width: 960px) {
|
||||
.left_side_container {
|
||||
@extends(.container);
|
||||
|
@ -91,7 +94,7 @@ ul.search_results {
|
|||
}
|
||||
|
||||
.result_title a {
|
||||
color: palegoldenrod;
|
||||
color: brown;
|
||||
}
|
||||
|
||||
.result_excerpt {
|
||||
|
@ -114,7 +117,7 @@ ul.search_results {
|
|||
list-style-type: none;
|
||||
display: inline-block;
|
||||
//background-color: palegreen;
|
||||
color: palegreen;
|
||||
color: darkgreen;
|
||||
//padding: 0.2em;
|
||||
//border-radius: 8px;
|
||||
|
||||
|
@ -128,3 +131,14 @@ ul.search_results {
|
|||
margin-right: 0.4em;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@media only screen and (prefers-color-scheme: dark) {
|
||||
:root:not([data-theme=light]) {
|
||||
@import "dark.scss";
|
||||
}
|
||||
}
|
||||
|
||||
[data-theme=dark] {
|
||||
@import "dark.scss";
|
||||
}
|
||||
|
|
|
@ -1122,11 +1122,6 @@ mdn-data@2.0.14:
|
|||
resolved "https://registry.yarnpkg.com/mdn-data/-/mdn-data-2.0.14.tgz#7113fc4281917d63ce29b43446f701e68c25ba50"
|
||||
integrity sha512-dn6wd0uw5GsdswPFfsgMp5NSB0/aDe6fK94YJV/AJDYXL6HVLWBsxeq7js7Ad+mU2K9LAlwpk6kN2D5mwCPVow==
|
||||
|
||||
mini.css@^3.0.1:
|
||||
version "3.0.1"
|
||||
resolved "https://registry.yarnpkg.com/mini.css/-/mini.css-3.0.1.tgz#f6236e99997bbd19484d5655d087ec96b887af68"
|
||||
integrity sha512-FmuuBL0wuyDO1UA66TkAo8w2RxxuHmNPaUqUHcYlHtM9CJkrscQaNAJ/ParEahYFwtZOSgfEA7flbMoSPkzrPA==
|
||||
|
||||
minimist@^1.2.5:
|
||||
version "1.2.5"
|
||||
resolved "https://registry.yarnpkg.com/minimist/-/minimist-1.2.5.tgz#67d66014b66a6a8aaa0c083c5fd58df4e4e97602"
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -60,6 +60,8 @@ pub struct RakedReference {
|
|||
pub enum ReferenceKind {
|
||||
/// Canonical URL for the same document, as declared in the page.
|
||||
CanonicalUrl,
|
||||
/// HTTP -> HTTPS upgrade, automatically caused by QuickPeep
|
||||
SecureUpgrade,
|
||||
/// HTTP-level redirect.
|
||||
Redirect,
|
||||
/// Link in a page (<a>). Could be to another page or to a feed.
|
||||
|
|
|
@ -1,40 +0,0 @@
|
|||
use std::borrow::{Borrow, BorrowMut};
|
||||
|
||||
/// Wraps a value and records whether it has been borrowed mutably since it
/// was created or last marked clean.
pub struct DirtyTracker<T> {
    inner: T,
    dirty: bool,
}

impl<T> DirtyTracker<T> {
    /// Wraps `inner`; the tracker starts out clean.
    pub fn new(inner: T) -> DirtyTracker<T> {
        Self {
            dirty: false,
            inner,
        }
    }

    /// Reports whether a mutable borrow has been taken since construction
    /// or since the last call to [`DirtyTracker::make_clean`].
    pub fn is_dirty(&self) -> bool {
        self.dirty
    }

    /// Resets the dirty flag.
    pub fn make_clean(&mut self) {
        self.dirty = false;
    }

    /// Unwraps the tracker, discarding the dirty flag.
    pub fn into_inner(self) -> T {
        let DirtyTracker { inner, .. } = self;
        inner
    }
}

impl<T> Borrow<T> for DirtyTracker<T> {
    /// Shared access to the wrapped value; leaves the dirty flag untouched.
    fn borrow(&self) -> &T {
        let DirtyTracker { inner, .. } = self;
        inner
    }
}

impl<T> BorrowMut<T> for DirtyTracker<T> {
    /// Exclusive access to the wrapped value.
    ///
    /// The flag is set as soon as the borrow is taken, whether or not the
    /// caller goes on to modify the value.
    fn borrow_mut(&mut self) -> &mut T {
        let DirtyTracker { inner, dirty } = self;
        *dirty = true;
        inner
    }
}
|
|
@ -1,4 +1,3 @@
|
|||
pub mod dates;
|
||||
pub mod dirty;
|
||||
pub mod lazy;
|
||||
pub mod urls;
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
use anyhow::Context;
|
||||
use std::borrow::Cow;
|
||||
use url::Url;
|
||||
|
||||
pub fn get_reduced_domain(url: &Url) -> anyhow::Result<Cow<'_, str>> {
|
||||
let domain = url.domain().context("URLs must have domains")?;
|
||||
pub fn get_reduced_domain(url: &Url) -> Option<Cow<'_, str>> {
|
||||
// If the URL has no host, or its host is not a domain name (e.g. it is an IP address), this returns None here.
|
||||
let domain = url.domain()?;
|
||||
|
||||
Ok(Cow::Borrowed(match domain.strip_prefix("www.") {
|
||||
Some(Cow::Borrowed(match domain.strip_prefix("www.") {
|
||||
Some(stripped) => stripped,
|
||||
None => domain,
|
||||
}))
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
# Development shell: Rust toolchain plus what is needed to build crates that
# link against OpenSSL and generate bindings with bindgen.
{ pkgs ? import <nixpkgs> {} }:

let
  # We may need some packages from nixpkgs-unstable
  #unstable = import <nixpkgs-unstable> {};

  # Single store path containing rustc, cargo, rustfmt and the rustc sources.
  rust-toolchain = pkgs.symlinkJoin {
    name = "rust-toolchain";
    paths = [pkgs.rustc pkgs.cargo pkgs.rustfmt pkgs.rustPlatform.rustcSrc];
  };
in

pkgs.mkShell {

  # Tools that are executed during the build.
  nativeBuildInputs = [
    rust-toolchain

    pkgs.pkg-config

    #pkgs.libclang # ??
  ];

  # Libraries that the built artefacts link against.
  buildInputs = [
    pkgs.openssl
  ];

  # Tells bindgen (via clang-sys) where to find libclang.
  LIBCLANG_PATH="${pkgs.llvmPackages_latest.libclang.lib}/lib";

  # Cargo culted:
  # Add to rustc search path
  # RUSTFLAGS = (builtins.map (a: ''-L ${a}/lib'') [
  # ]);
  # Add to bindgen search path
  BINDGEN_EXTRA_CLANG_ARGS =
    # Includes with normal include path
    (builtins.map (a: ''-I"${a}/include"'') [
      pkgs.glibc.dev
    ])
    # Includes with special directory paths
    ++ [
      ''-I"${pkgs.llvmPackages_latest.libclang.lib}/lib/clang/${pkgs.llvmPackages_latest.libclang.version}/include"''
      #''-I"${pkgs.glib.dev}/include/glib-2.0"''
      #''-I${pkgs.glib.out}/lib/glib-2.0/include/''
    ];


}
|
Loading…
Reference in New Issue