Compare commits

...

47 Commits

Author SHA1 Message Date
b1d2c49f96 Fix flake and make nixpkgs top-level so it can follow 2024-05-08 20:34:52 +01:00
e07ac16bc4 Skip raking of weeded URLs
May be useful for retroactively clearing out URLs
2023-03-31 22:59:23 +01:00
ff514e90b8 Simplify allowed_/weed_domains 2023-03-31 22:50:02 +01:00
1c10cb203a Dodge some places where we enqueue URLs without checking they have supported schemes
2023-03-30 23:40:43 +01:00
1e8aa95e7a Respect nofollow and noindex <meta> robots tags
Along with doing the right thing, this should speed up raking for us
2023-03-30 23:09:39 +01:00
18d2023550 Add a debug line when we rake something
2023-03-30 21:17:15 +01:00
83fecf1464 Improve the raker to perform a reinstate periodically and to respawn workers
2023-03-28 21:09:24 +01:00
626b448245 raker: Switch to Jemalloc for the global allocator
2023-03-22 23:08:08 +00:00
4f5977002b nixos: Add way of building the static files
2023-03-22 01:20:13 +00:00
0811be9ae0 nixos: fix path to import module
2023-03-21 23:58:26 +00:00
63f94577c9 nixos: Add working directory config option to quickpeepSearch
2023-03-21 23:56:47 +00:00
6d37a07d3e Clarify and handle 'No domain for URL' error in a better way
2023-03-21 23:36:47 +00:00
73c72bce25 nixos: Add a quickpeepIndex service if an autoIndexUrl is specified 2023-03-21 23:31:17 +00:00
0bebfc0025 Fix unfinished work around SecureUpgrade
2022-12-03 15:13:06 +00:00
99fcbf77f6 Hide icons that don't exist 2022-12-03 15:08:59 +00:00
e2a4835536 Fix light theme having unreadable search results
2022-11-30 22:01:58 +00:00
51c90ecaa0 (yarn.lock update) 2022-11-30 22:01:52 +00:00
cfb2fca649 Add a page for testing search theme 2022-11-30 22:01:49 +00:00
05a60af389 tantivy backend: return tags in search results
2022-11-28 23:19:11 +00:00
bff48f35f4 Make the raker attempt HTTPS upgrades
Not only does this improve security for searchers later on, it also enables us to cut down on the number of duplicates quite easily.
2022-11-28 23:15:37 +00:00
34a05f84ff Add OpenSearch XML 2022-11-28 22:49:18 +00:00
8b439c1550 Remove noisy and obsolete debug output in the sitemap extractor
2022-11-27 10:14:47 +00:00
b254ab1231 rakepack feeds: Only pull the ones that we can index 2022-11-27 10:11:13 +00:00
8578ee4b10 Fix manual-pushing pipeline
2022-11-27 10:02:52 +00:00
0654d1aa07 Fix raker tools having wrong default config path 2022-11-27 00:02:33 +00:00
4bba2fc89b Don't fall over on unknown schemes e.g. mailto:
2022-11-26 23:47:23 +00:00
c940900fab Add missing URL clean
2022-11-26 22:59:24 +00:00
438beed86a Add more error context 2022-11-26 22:59:14 +00:00
08f4b7aeaa Add a lot of debug output
2022-11-26 22:45:51 +00:00
2ce8e2ba8e Fix qp-seedrake
2022-11-26 22:30:40 +00:00
54a468d079 Add facility to qp-indexer that lets it download rakepacks from a feed
2022-11-26 20:48:06 +00:00
bd16f58d9e Maintain an index file of rakepacks and append when a rakepack is finished 2022-11-26 20:07:12 +00:00
52d0183942 Reinstate re-rakable URLs on startup 2022-11-26 19:22:34 +00:00
6ecbc0561f Add configurable re-rake times for different kinds of raked things 2022-11-26 19:05:36 +00:00
d5255410f5 Fix comment on last_visited_days 2022-11-26 18:15:53 +00:00
94a0a588cf Add documentation about the seed collection service 2022-11-26 18:09:34 +00:00
20d9fb956a Update qp-seedcoll* to use quickpeep.ron instead of qp_web.ron
like all the other utilities
2022-11-26 17:19:31 +00:00
402135a6fa Add Nix shell 2022-11-21 15:21:52 +00:00
a7b3ed711c Fix git links 2022-11-05 14:40:12 +00:00
d8d6f13f7e Update the README a little bit
2022-07-02 22:55:18 +01:00
09f70ad8ce Link to blog post on home page 2022-07-02 22:53:48 +01:00
555ad2eab3 Fix typo in tantivy backend query 2022-07-02 22:53:24 +01:00
dba851879b Fix qp-index-search looking in wrong place for config
2022-07-02 22:41:47 +01:00
9dc49f12bd Add some 'About' info
2022-06-27 22:21:51 +01:00
545e5dd11f Fix icon store not being registered 2022-06-27 19:45:50 +01:00
fda08b20b4 Add context for unreadable rakepacks 2022-06-26 21:32:23 +01:00
d24083c755 Use amd64 image for mdbook 2022-06-26 20:10:46 +01:00
51 changed files with 3010 additions and 534 deletions

View File

@@ -2,7 +2,7 @@ platform: linux/amd64
pipeline:
  deployManual:
-    image: docker.bics.ga/rei_ci/mdbook:latest-arm64
+    image: git.emunest.net/rei_oci_pub/mdbook:latest-amd64
    when:
      branch:
        - main

2
.envrc Normal file
View File

@@ -0,0 +1,2 @@
use nix

23
Cargo.lock generated
View File

@@ -3766,11 +3766,13 @@ dependencies = [
"quickpeep_seed_parser",
"quickpeep_structs",
"quickpeep_utils",
+"reqwest",
"ron",
"serde",
"serde_bare",
"serde_json",
"smartstring",
+"tempfile",
"tokio",
"url",
"zstd",
@@ -3842,6 +3844,7 @@ dependencies = [
"sitemap",
"smartstring",
"tempfile",
+"tikv-jemallocator",
"tokio",
"webp",
"zstd",
@@ -4877,6 +4880,26 @@ dependencies = [
"weezl",
]

+[[package]]
+name = "tikv-jemalloc-sys"
+version = "0.5.3+5.3.0-patched"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a678df20055b43e57ef8cddde41cdfda9a3c1a060b67f4c5836dfb1d78543ba8"
+dependencies = [
+"cc",
+"libc",
+]
+
+[[package]]
+name = "tikv-jemallocator"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20612db8a13a6c06d57ec83953694185a367e16945f66565e8028d2c0bd76979"
+dependencies = [
+"libc",
+"tikv-jemalloc-sys",
+]
+
[[package]]
name = "time"
version = "0.1.44"

View File

@@ -26,11 +26,11 @@ If you need to fall back to a conventional search engine, this will eventually b
*Crossed-out things are aspirational and not yet implemented.*
-- ~~Shareable 'rakepacks', so that anyone can run their own search instance without needing to rake (crawl) themselves~~
+- Shareable 'rakepacks', so that anyone can run their own search instance without needing to rake (crawl) themselves
-- ~~Dense encoding to minimise disk space usage; compressed with Zstd?~~
+- Dense encoding to minimise disk space usage; compressed with Zstd.
- Raking (crawling) support for
  - HTML (including redirecting to Canonical URLs)
-  - ~~Language detection~~
+  - Language detection for when the metadata is absent.
  - Redirects
  - ~~Gemtext over Gemini~~
  - RSS, Atom and JSON feeds
@@ -43,9 +43,9 @@ If you need to fall back to a conventional search engine, this will eventually b
  - Article content extraction, to provide more weight to words found within the article content (based on a Rust version of Mozilla's *Readability* engine)
- (Misc)
  - ~~Use of the Public Suffix List~~
-  - ~~Tagging URL patterns; e.g. to mark documentation as 'old'.~~
+  - Tagging URL patterns; e.g. to mark documentation as 'old'.
  - ~~Page duplicate content detection (e.g. to detect `/` and `/index.html`, or non-HTTPS and HTTPS, or non-`www` and `www`...)~~
-  - ~~Language detection for pages that don't have that metadata available.~~

## Limitations

@@ -62,11 +62,17 @@ If you need to fall back to a conventional search engine, this will eventually b

*Not written yet.*

+The stages of the QuickPeep pipeline are briefly described in [an introductory blog post][qp_intro_blog].
+
+[qp_intro_blog]: https://o.librepush.net/blog/2022-07-02-quickpeep-small-scale-web-search-engine

## Development and Running

*Not written yet.*

+Some hints may be obtained from the introductory blog post mentioned in the 'Architecture' section, but it's probably quite difficult to follow right now.

### Helper scripts

View File

@@ -8,6 +8,6 @@ description = "Documentation for QuickPeep"

[output.html]
default-theme = "coal"
-git-repository-url = "https://bics.ga/reivilibre/quickpeep.git"
+git-repository-url = "https://git.emunest.net/reivilibre/quickpeep.git"
git-repository-icon = "fa-git-alt"
fold = { enable = true, level = 1 }

View File

@@ -3,7 +3,7 @@
- [QuickPeep]()
  - [Running and Hosting]()
    - [QuickPeep Search]()
-    - [QuickPeep Seed Collection Service]()
+    - [QuickPeep Seed Collection Service](./running/seed_collection_service.md)
    - [QuickPeep Raker]()
    - [QuickPeep Indexer]()
  - [Internals](./internals/index.md)

View File

@@ -0,0 +1,26 @@
# Running a QuickPeep Seed Collection Service
The QuickPeep web interface has a built-in seed collection page at `/seeds`.
This is a simple form where anyone can submit a URL and tag it with appropriate tags.
The list of tags can be changed in `quickpeep.ron` (`web` → `seed_collection` → ...).
## Retrieving seeds from the Seed Collection Service
First use the `qp-seedcoll-sort` utility included with QuickPeep to sort through the seeds that you have received.
For each seed that has been received on the web interface, but not yet sorted, you will be given the option to:
- accept (`y`)
- reject
  - generic reason (`n`)
  - because it duplicates an existing entry (`dupe`)
  - because it's spam (`spam`)
  - because it's invalid for some reason; e.g. the URL isn't valid (`inv`)
This just marks the seeds in the database but doesn't yet emit them in a format usable by the QuickPeep raker.
To export the seeds, use `qp-seedcoll-dump path/to/new/file.seed`.
This command writes a seed file (usable by the Raker) for all the seeds that haven't yet been dumped.
Getting the seeds from your web UI to the Raker is an exercise left for the reader, but I will note that I do this by committing them into a Git repository, with each seed file being dated.
This also has the benefit of being able to easily publish them for others to use; my seeds are available at https://git.emunest.net/reivilibre/quickpeep_seeds.

27
flake.lock generated
View File

@@ -2,7 +2,9 @@
"nodes": {
"naersk": {
"inputs": {
-"nixpkgs": "nixpkgs"
+"nixpkgs": [
+"nixpkgs"
+]
},
"locked": {
"lastModified": 1654608517,
@@ -20,36 +22,23 @@
},
"nixpkgs": {
"locked": {
-"lastModified": 1654845941,
-"narHash": "sha256-uXulXu4BQ9ch1ItV0FlL2Ns8X83m6unT5h/0X//VRLQ=",
+"lastModified": 1714971268,
+"narHash": "sha256-IKwMSwHj9+ec660l+I4tki/1NRoeGpyA2GdtdYpAgEw=",
"owner": "NixOS",
"repo": "nixpkgs",
-"rev": "7b3e907a6fef935794b5049c2c57c519853deb90",
+"rev": "27c13997bf450a01219899f5a83bd6ffbfc70d3c",
-"type": "github"
-},
-"original": {
-"id": "nixpkgs",
-"type": "indirect"
-}
-},
-"nixpkgs_2": {
-"locked": {
-"lastModified": 1654845941,
-"narHash": "sha256-uXulXu4BQ9ch1ItV0FlL2Ns8X83m6unT5h/0X//VRLQ=",
-"owner": "NixOS",
-"repo": "nixpkgs",
-"rev": "7b3e907a6fef935794b5049c2c57c519853deb90",
"type": "github"
},
"original": {
"id": "nixpkgs",
+"ref": "nixos-23.11",
"type": "indirect"
}
},
"root": {
"inputs": {
"naersk": "naersk",
-"nixpkgs": "nixpkgs_2",
+"nixpkgs": "nixpkgs",
"utils": "utils"
}
},

View File

@@ -2,8 +2,12 @@
description = "QuickPeep Search Engine Flake for Nix";

inputs = {
+nixpkgs.url = "nixpkgs/nixos-23.11";
utils.url = "github:numtide/flake-utils";
-naersk.url = "github:nix-community/naersk";
+naersk = {
+  url = "github:nix-community/naersk";
+  inputs.nixpkgs.follows = "nixpkgs";
+};
};

outputs = { self, nixpkgs, utils, naersk }:
@@ -17,7 +21,7 @@
root = ./.;

buildInputs = with pkgs; [
  openssl
-  pkgconfig
+  pkg-config
];

nativeBuildInputs = with pkgs; [
@@ -35,12 +39,48 @@
};
};
# packages.quickpeepWebStatic = pkgs.stdenv.mkDerivation {
# name = "quickpeepWebStatic";
#
# src = ./quickpeep_static;
#
# buildInputs = [ pkgs.yarn ];
#
# preparePhase = ''
# yarn install
# '';
#
# buildPhase = ''
# yarn build
# cp -r dist/* $out
# '';
# };
packages.quickpeepWebStatic = pkgs.mkYarnPackage {
name = "quickpeepWebStatic";
src = ./quickpeep_static;
packageJSON = ./quickpeep_static/package.json;
yarnLock = ./quickpeep_static/yarn.lock;
yarnNix = ./quickpeep_static/yarn.nix;
postBuild = ''
yarn build
'';
installPhase = ''
# nop ?
'';
distPhase = ''
mkdir $out
cp -r deps/quickpeep_static/dist/* $out/
'';
};
defaultPackage = packages.quickpeep;

# NixOS Modules
nixosModules = {
-quickpeepSearch = import ./modules/quickpeepSearch.nix self;
-quickpeepRaker = import ./modules/quickpeepRaker.nix self;
+quickpeepSearch = import ./nixos_modules/quickpeepSearch.nix self;
+quickpeepRaker = import ./nixos_modules/quickpeepRaker.nix self;
};

# `nix run`

View File

@@ -48,6 +48,23 @@ with lib;
Config path to use, in RON format.
'';
};
workingDir = mkOption {
type = with types; path;
description = ''
Path to a working directory to run the web interface and indexer from.
This is the base from which paths in the config file are looked up from.
'';
};
autoIndexUrl = mkOption {
default = null;
type = with types; nullOr str;
description = ''
HTTP(S) URL to an index (list) of rakepacks.
If specified, the indexer will periodically fetch new packs from that list and then add the pages within to the search index.
'';
};
};
};
@@ -71,7 +88,23 @@ with lib;
Type = "simple";
User = "${cfg.user}";
ExecStart = ''${quickpeep}/bin/quickpeep ${cfg.bindHost}:${builtins.toString cfg.bindPort}'';
+WorkingDirectory = cfg.workingDir;
};
};
systemd.services.quickpeepIndex = mkIf (cfg.autoIndexUrl != null) {
after = [ "network.target" ];
description = "Fetches rakepacks from a feed and adds pages to the search index.";
serviceConfig = {
Type = "simple";
User = "${cfg.user}";
ExecStart = ''${quickpeep}/bin/qp-indexer --config ${lib.strings.escapeShellArg cfg.configPath} --feed ${lib.strings.escapeShellArg cfg.autoIndexUrl}'';
WorkingDirectory = cfg.workingDir;
};
};
# TODO systemd.timers.quickpeepIndex = mkIf (cfg.autoIndexUrl != null) {
#
# };
};
}

View File

@@ -63,6 +63,8 @@
],
sqlite_db_path: "data/dev_qp_web.sqlite3",
+public_base: "http://127.0.0.1:9001",
),

// Index (indexer, web)
@@ -96,5 +98,11 @@
pack_emitter: (
),
+rerake_timings: (
+    page: 300,
+    icon: 365,
+    feed: 10,
+)
),
)
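For orientation, the new `rerake_timings` block is read into the raker's configuration. Below is only a sketch of the shape it might deserialize into: the field names follow the RON above, but the concrete struct, integer types and units (presumably days) live in quickpeep_raker's config module and may differ.

use serde::Deserialize;

// Hypothetical mirror of the `rerake_timings` RON block shown above.
#[derive(Clone, Debug, Deserialize)]
pub struct RerakeTimings {
    /// Re-rake ordinary pages after this many days (300 in the example config).
    pub page: u16,
    /// Re-rake icons after this many days (365 in the example config).
    pub icon: u16,
    /// Re-rake feeds after this many days (10 in the example config).
    pub feed: u16,
}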

View File

@@ -27,8 +27,9 @@ pub async fn main() -> anyhow::Result<()> {
.context("Must specify output file as arg № 1! :)")?,
);
-let config_path =
-    PathBuf::from(std::env::var("QP_WEB_CONFIG").unwrap_or_else(|_| "qp_web.ron".to_owned()));
+let config_path = PathBuf::from(
+    std::env::var("QP_WEB_CONFIG").unwrap_or_else(|_| "quickpeep.ron".to_owned()),
+);
if !config_path.exists() {
bail!(

View File

@@ -24,8 +24,9 @@ pub async fn main() -> anyhow::Result<()> {
)
.init();
-let config_path =
-    PathBuf::from(std::env::var("QP_WEB_CONFIG").unwrap_or_else(|_| "qp_web.ron".to_owned()));
+let config_path = PathBuf::from(
+    std::env::var("QP_WEB_CONFIG").unwrap_or_else(|_| "quickpeep.ron".to_owned()),
+);
if !config_path.exists() {
bail!(

View File

@@ -7,9 +7,11 @@ use env_logger::Env;
use log::info;
use quickpeep::config::WebConfig;
use quickpeep::web::icon_retrieval::retrieve_icon;
+use quickpeep::web::metadata::get_opensearch_xml;
use quickpeep::web::searcher::{search_root, search_search};
use quickpeep::web::seed_collector::{seed_collection_root, seed_collection_root_post};
use quickpeep::web::IndexAccess;
+use quickpeep_index::auxiliary::icon_store::IconStore;
use sqlx::sqlite::SqlitePoolOptions;
use std::net::SocketAddr;
use std::path::PathBuf;
@@ -67,6 +69,7 @@ async fn main() -> anyhow::Result<()> {
let backend = Arc::new(web_config.open_indexer_backend()?);
let index_access = IndexAccess { backend };
+let icon_store = IconStore::open(web_config.index.icon_store.as_path())?;

let app = Router::new()
.route("/seeds/", get(seed_collection_root))
@@ -74,9 +77,11 @@
.route("/", get(search_root))
.route("/search", get(search_search))
.route("/icon.webp", get(retrieve_icon))
+.route("/opensearch.xml", get(get_opensearch_xml))
.layer(Extension(web_config))
.layer(Extension(pool))
.layer(Extension(index_access))
+.layer(Extension(Arc::new(icon_store)))
.nest(
"/static",
get_service(ServeDir::new("./quickpeep_static/dist")).handle_error(

View File

@@ -22,7 +22,7 @@ pub struct WebConfig {
#[derive(Debug, Clone, Deserialize)]
pub struct IndexConfig {
pub backend: BackendConfig,
-// TODO icon_store
+pub icon_store: PathBuf,
}

#[derive(Debug, Clone, Deserialize)]
@@ -31,6 +31,10 @@ pub struct WebOnlyConfig {
pub sqlite_db_path: PathBuf,
/// Name, URL pairs
pub contact: Vec<(String, String)>,
+/// URL prefix for QuickPeep. Should include protocol. No trailing slash.
+/// Example: https://quickpeep.net
+pub public_base: String,
}

impl WebConfig {
@@ -49,6 +53,8 @@ impl WebConfig {
BackendConfig::Meili(_) => {}
}
+web_config.index.icon_store = config_dir.join(web_config.index.icon_store);
Ok(web_config)
}
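The `config_dir.join(...)` line above makes a relative `icon_store` path resolve against the directory containing the config file. A small standalone sketch of how `Path::join` behaves under those two cases (the paths used here are invented for illustration):

use std::path::Path;

fn main() {
    let config_dir = Path::new("/etc/quickpeep");

    // A relative path from the config is resolved against the config's directory...
    assert_eq!(
        config_dir.join("data/icons.mdbx").as_path(),
        Path::new("/etc/quickpeep/data/icons.mdbx")
    );

    // ...while an absolute path wins outright, because `join` replaces the base
    // when its argument is absolute.
    assert_eq!(
        config_dir.join("/var/lib/quickpeep/icons.mdbx").as_path(),
        Path::new("/var/lib/quickpeep/icons.mdbx")
    );
}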

View File

@@ -2,6 +2,7 @@ use quickpeep_index::backend::Backend;
use std::sync::Arc;

pub mod icon_retrieval;
+pub mod metadata;
pub mod searcher;
pub mod seed_collector;

View File

@@ -0,0 +1,28 @@
use crate::config::WebConfig;
use axum::extract::Extension;
use axum::response::{IntoResponse, Response};
pub async fn get_opensearch_xml(Extension(web_config): Extension<WebConfig>) -> impl IntoResponse {
let public_base = &web_config.web.public_base;
let formatted = format!(
r#"
<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/"
xmlns:moz="http://www.mozilla.org/2006/browser/search/">
<ShortName>QuickPeep</ShortName>
<Description>small-scale web search engine</Description>
<InputEncoding>UTF-8</InputEncoding>
<Image width="16" height="16" type="image/x-icon">{public_base}/favicon.ico</Image>
<Url type="text/html" template="{public_base}/search?q=%s"/>
</OpenSearchDescription>
"#
);
// Extras for the future:
// <Url type="application/x-suggestions+json" template="[suggestionURL]"/>
// <moz:SearchForm>[https://example.com/search]</moz:SearchForm>
Response::builder()
.header("content-type", "application/opensearchdescription+xml")
.body(formatted.into_response())
.unwrap()
}

View File

@@ -4,7 +4,9 @@ use crate::webutil::{internal_error, TemplatedHtml};
use askama::Template;
use axum::extract::{Extension, Query};
use axum::response::IntoResponse;
+use quickpeep_index::auxiliary::icon_store::IconStore;
use serde::Deserialize;
+use std::sync::Arc;

#[derive(Clone, Template)]
#[template(path = "search.html.askama")]
@@ -12,11 +14,12 @@ pub struct SearchTemplate {
pub search_term: String,
pub results: Vec<SearchResult>,
pub contact: Vec<(String, String)>,
+pub show_spiel: bool,
}

#[derive(Clone)]
pub struct SearchResult {
-pub favicon_url: String,
+pub favicon_url: Option<String>,
pub url: String,
pub title: String,
pub excerpt: String,
@@ -34,6 +37,7 @@ pub async fn search_root(Extension(web_config): Extension<WebConfig>) -> impl In
search_term: String::with_capacity(0),
results: vec![],
contact: web_config.web.contact.clone(),
+show_spiel: true,
})
}
@@ -47,8 +51,9 @@
web_config: Extension<WebConfig>,
index_access: Extension<IndexAccess>,
params: Query<QueryParameters>,
+icon_store: Extension<Arc<IconStore>>,
) -> impl IntoResponse {
-search_search_inner(web_config, index_access, params)
+search_search_inner(web_config, index_access, params, icon_store)
.await
.map_err(internal_error)
}
@@ -57,14 +62,22 @@ pub async fn search_search_inner(
Extension(web_config): Extension<WebConfig>,
Extension(index_access): Extension<IndexAccess>,
Query(params): Query<QueryParameters>,
+Extension(icon_store): Extension<Arc<IconStore>>,
) -> anyhow::Result<impl IntoResponse> {
let raw_results = index_access.backend.query(params.q.clone())?;

let mut results = Vec::with_capacity(raw_results.len());

+let txn = icon_store.env.ro_txn()?;
for search_doc in raw_results {
+let favicon_url_hash: [u8; 8] = search_doc.favicon_url_hash.to_le_bytes();
+let favicon_url = icon_store
+.icons
+.get(&txn, &favicon_url_hash)?
+.map(|_| format!("{:016x}", search_doc.favicon_url_hash));
results.push(SearchResult {
-favicon_url: format!("{:16x}", search_doc.favicon_url_hash),
+favicon_url,
url: search_doc.url,
title: search_doc.title,
excerpt: search_doc.excerpt,
@@ -76,5 +89,6 @@ pub async fn search_search_inner(
search_term: params.q.clone(),
results,
contact: web_config.web.contact.clone(),
+show_spiel: false,
}))
}
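One detail worth calling out in this hunk is the format-string change from `{:16x}` to `{:016x}`: the former pads to width 16 with spaces, which produces a broken `/icon.webp?b=...` URL, while the latter zero-pads. A tiny sketch of the difference (the hash is assumed to be a `u64` here, which this hunk does not itself show):

fn main() {
    // Assumed type for illustration; the real favicon_url_hash type isn't shown above.
    let favicon_url_hash: u64 = 0x2a;

    // "{:16x}" pads with spaces, so the value starts with whitespace.
    let spaced = format!("{:16x}", favicon_url_hash);
    assert_eq!(spaced.len(), 16);
    assert_eq!(spaced.trim_start(), "2a");

    // "{:016x}" zero-pads instead, giving a stable 16-character hex key.
    let zeroed = format!("{:016x}", favicon_url_hash);
    assert_eq!(zeroed, "000000000000002a");
}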

View File

@@ -13,7 +13,7 @@
<header>
<form method="GET" action="search">
<fieldset class="horizontal">
-<img src="/static/quickpeep_logo_sml.png" class="bar_logo">
+<a href="/" title="QuickPeep"><img src="/static/quickpeep_logo_sml.png" alt="QuickPeep Logo" class="bar_logo"></a>
<input type="search" id="search" name="q" placeholder="..." value="{{ search_term }}" class="grow">
<input type="submit" value="Search" class="shrink">

@@ -23,10 +23,28 @@
<!-- Main -->
<main class="search">
{% if show_spiel %}
<p>
QuickPeep is a hobbyist, open-source and very immature (for now) web search engine. It's intended to help you encounter webpages that are interesting and from a real person, rather than from a 'content mill' or other source of SEO spam. In general, websites that don't respect the reader are unwelcome.
</p>
<p>
QuickPeep's approach to rubbish websites is to 'just' not index them! This also helps with another goal of the project, which is to allow anyone to run an instance of QuickPeep with only modest hardware requirements (especially storage space which could easily be problematic).
</p>
<p>
This is an ambitious project and it is probably not very usable right now. It may never be. With that said, I'm hoping to see how far I can take it.
</p>
<p>
There is an <a href="https://o.librepush.net/blog/2022-07-02-quickpeep-small-scale-web-search-engine">article introducing the project on my personal blog</a>.<br>
The source code is <a href="https://git.emunest.net/reivilibre/quickpeep.git">available on my personal Gitea instance</a>.
</p>
{% endif %}
<ul class="search_results">
{%- for result in results %}
<li>
-<img src="/icon.webp?b={{ result.favicon_url }}">
+{%- if result.favicon_url.is_some() -%}
+<img src="/icon.webp?b={{ result.favicon_url.as_ref().unwrap() }}">
+{%- endif -%}
<div class="result_title"><a href="{{ result.url }}" rel="nofollow noreferrer">{{ result.title }}</a></div>
<div class="result_excerpt">
{{- result.excerpt|safe -}}

View File

@@ -73,7 +73,7 @@
<h3>Open Data</h3>
<label for="opendata_agree">
<input type="checkbox" role="switch" id="opendata_agree" name="opendata_agree" value="true" required>
-I'm happy for this data to be Open Data under the <a href="https://bics.ga/reivilibre/quickpeep_seeds/src/branch/main/LICENCE" target="_blank">CC0 licence</a>.
+I'm happy for this data to be Open Data under the <a href="https://git.emunest.net/reivilibre/quickpeep_seeds/src/branch/main/LICENCE" target="_blank">CC0 licence</a>.
</label>
<input type="submit" value="Submit seed">

View File

@@ -287,7 +287,8 @@ impl DenseTreeBuilder {
let nofollow = attrs
.get("rel")
.map(|rel: &str| {
-rel.split_whitespace()
+rel.split(|c: char| c.is_whitespace() || c == ',')
+.filter(|s| !s.is_empty())
.any(|rel_word: &str| rel_word.eq_ignore_ascii_case("nofollow"))
})
.unwrap_or(false);
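The change above makes the `nofollow` check tolerate comma-separated `rel` values as well as whitespace-separated ones. A self-contained sketch of that splitting logic (the helper name is invented for illustration):

fn has_nofollow(rel: &str) -> bool {
    // Split on whitespace *or* commas, skip empty fragments, and compare case-insensitively.
    rel.split(|c: char| c.is_whitespace() || c == ',')
        .filter(|s| !s.is_empty())
        .any(|word| word.eq_ignore_ascii_case("nofollow"))
}

fn main() {
    assert!(has_nofollow("noopener, NOFOLLOW"));
    assert!(has_nofollow("nofollow noreferrer"));
    assert!(!has_nofollow("noopener noreferrer"));
}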

View File

@@ -166,7 +166,11 @@ impl Backend for TantivyBackend {
let reader = self.index.reader()?;
let parser = QueryParser::new(
self.index.schema(),
-vec![self.fields.title, self.fields.article, self.fields.article],
+vec![
+    self.fields.title,
+    self.fields.article,
+    self.fields.nonarticle,
+],
TokenizerManager::default(),
);
@@ -198,14 +202,27 @@ impl Backend for TantivyBackend {
let snippet = article_snippet_generator.snippet(&doc_row.body);
let excerpt = snippet.to_html();
let tags = doc
.get_all(self.fields.tags)
.map(|fv| {
String::from(
*fv.as_facet()
.expect("tags must be facet!")
.to_path()
.last()
.unwrap_or(&""),
)
})
.collect();
out.push(SearchDocument {
score,
title: doc_row.title,
excerpt,
-tags: vec![],
+tags,
url: url.to_owned(),
favicon_url_hash: doc_row.favicon_url_hash,
-})
+});
}

Ok(out)

View File

@@ -30,6 +30,11 @@ patricia_tree = "0.3.1"
# For decompression of emitted packs. 0.11.1+zstd.1.5.2
zstd = "0.11.1"

+# HTTP Requests
+reqwest = { version = "0.11.9", features = ["blocking"] }
+tempfile = "3.3.0"

quickpeep_densedoc = { path = "../quickpeep_densedoc" }
quickpeep_index = { path = "../quickpeep_index" }
quickpeep_structs = { path = "../quickpeep_structs" }

View File

@@ -26,7 +26,7 @@ pub fn main() -> anyhow::Result<()> {
let config_path = opts
.config
-.unwrap_or_else(|| PathBuf::from("qp_indexer.toml"));
+.unwrap_or_else(|| PathBuf::from("quickpeep.ron"));
let config = IndexerConfig::load(&config_path).context("Failed to load config")?;

let indexer_backend = config.open_indexer_backend()?;

View File

@@ -2,9 +2,9 @@ use anyhow::{bail, Context};
use clap::Parser;
use colour::{blue, yellow_ln};
use env_logger::Env;
-use std::collections::HashMap;
-use std::fs::File;
-use std::io::{BufRead, BufReader};
+use std::collections::{BTreeSet, HashMap};
+use std::fs::{File, OpenOptions};
+use std::io::{BufRead, BufReader, Write};

use patricia_tree::PatriciaMap;
use quickpeep_densedoc::DenseTree;
@@ -20,7 +20,8 @@ use quickpeep_structs::rake_entries::{
};
use quickpeep_utils::urls::get_reduced_domain;
use smartstring::alias::CompactString;
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
+use tempfile::NamedTempFile;
use tokio::sync::mpsc::Receiver;
use url::Url;
@@ -30,6 +31,11 @@ pub struct Opts {
#[clap(long = "config")]
config: Option<PathBuf>,
/// If specified, rakepacks from a feed will automatically be fetched and indexed.
/// The rakepacks are tracked as having been processed.
#[clap(long = "feed")]
feed: Option<Url>,
rakepacks: Vec<PathBuf>,
}
@@ -62,45 +68,155 @@ pub async fn main() -> anyhow::Result<()> {
let mut indexer_backend = config.open_indexer_backend()?;
if let Some(feed) = opts.feed {
let processed_rakepack_path = config
.processed_rakepack_path()
.context("can't get a suitable location to track processed rakepacks")?;
handle_pack_feed(
feed,
&mut indexer_backend,
processed_rakepack_path,
&seed_lookup,
&icon_store,
)
.context("failed to handle pack feed")?;
}
for pack in opts.rakepacks {
-blue!("Indexing: ");
-yellow_ln!("{:?}", pack);
-let file = File::open(&pack)?;
-let decompressor = zstd::stream::Decoder::new(file)?;
-// TODO the decompressor has a buffer already, but we need this to see the end
-let mut buf_reader = BufReader::new(decompressor);
-let schema: String = serde_bare::from_reader(&mut buf_reader)?;
-match schema.as_ref() {
-SCHEMA_RAKED_PAGES => {
-// TODO(unstable): this condition is `.has_data_left()` but it's unstable.
-while buf_reader.fill_buf().map(|b| !b.is_empty())? {
-handle_page_pack(&mut buf_reader, &seed_lookup, &mut indexer_backend)
-.context("failed to handle page pack")?;
-}
-}
-SCHEMA_RAKED_ICONS => {
-// TODO(unstable): this condition is `.has_data_left()` but it's unstable.
-while buf_reader.fill_buf().map(|b| !b.is_empty())? {
-handle_icon_pack(&mut buf_reader, &icon_store)
-.context("failed to handle icon pack")?;
-}
-}
-_ => {
-bail!(
-"Wrong schema version: wanted e.g. {:?}, got {:?}",
-SCHEMA_RAKED_PAGES,
-&schema
-);
-}
-}
+handle_pack(&pack, &mut indexer_backend, &seed_lookup, &icon_store)
+.with_context(|| format!("Whilst handling pack: {pack:?}"))?;
}
indexer_backend.flush()?;
Ok(())
}
pub fn handle_pack_feed(
feed_url: Url,
indexer_backend: &mut Box<dyn Backend>,
processed_list_path: PathBuf,
seed_lookup: &SeedLookupTable,
icon_store: &IconStore,
) -> anyhow::Result<()> {
blue!("Scanning feed: ");
yellow_ln!("{:?}", feed_url);
let new_packs =
find_new_packs(feed_url.clone(), &processed_list_path).context("finding new packs")?;
let mut processed_log = OpenOptions::new()
.append(true)
.create(true)
.open(&processed_list_path)
.context("can't open processed list for append")?;
for pack_name in new_packs {
let pack_url = feed_url
.join(&pack_name)
.context("Can't resolve URL of new pack")?;
blue!("Downloading: ");
yellow_ln!("{:?}", pack_url);
let mut temp_file = NamedTempFile::new().context("opening temp file")?;
reqwest::blocking::get(pack_url.clone())
.context("failed to request pack")?
.error_for_status()?
.copy_to(temp_file.as_file_mut())
.context("failed to download pack to temp file")?;
handle_pack(temp_file.path(), indexer_backend, seed_lookup, icon_store).with_context(
|| {
format!(
"Whilst handling pack: {:?} ({:?})",
temp_file.path(),
pack_url
)
},
)?;
processed_log.write(format!("\n{}", &pack_name).as_bytes())?;
processed_log.flush()?;
}
Ok(())
}
const USEFUL_RAKEPACKS_TO_PULL_FROM_FEED: [&'static str; 2] = [".icons.pack", ".pages.pack"];
fn find_new_packs(feed_url: Url, processed_list_path: &Path) -> anyhow::Result<BTreeSet<String>> {
let processed_file = OpenOptions::new()
.read(true)
.create(true)
.open(processed_list_path)?;
let br = BufReader::new(processed_file);
let processed: Result<BTreeSet<String>, _> = br.lines().collect();
let processed = processed.context("failed to read local processed list")?;
let mut unprocessed: BTreeSet<String> = BTreeSet::new();
let feed_lines = BufReader::new(reqwest::blocking::get(feed_url)?.error_for_status()?).lines();
for line in feed_lines {
let line = line?;
if line.is_empty() {
continue;
}
if !USEFUL_RAKEPACKS_TO_PULL_FROM_FEED
.iter()
.any(|ext| line.ends_with(ext))
{
// not a sort of rakepack we care about
continue;
}
if processed.contains(&line) {
continue;
}
unprocessed.insert(line.to_owned());
}
Ok(unprocessed)
}
pub fn handle_pack(
pack: &Path,
indexer_backend: &mut Box<dyn Backend>,
seed_lookup: &SeedLookupTable,
icon_store: &IconStore,
) -> anyhow::Result<()> {
blue!("Indexing: ");
yellow_ln!("{:?}", pack);
let file = File::open(&pack)?;
let decompressor = zstd::stream::Decoder::new(file)?;
// TODO the decompressor has a buffer already, but we need this to see the end
let mut buf_reader = BufReader::new(decompressor);
let schema: String =
serde_bare::from_reader(&mut buf_reader).context("failed to read schema ver")?;
match schema.as_ref() {
SCHEMA_RAKED_PAGES => {
// TODO(unstable): this condition is `.has_data_left()` but it's unstable.
while buf_reader.fill_buf().map(|b| !b.is_empty())? {
handle_page_pack(&mut buf_reader, &seed_lookup, indexer_backend)
.context("failed to handle page pack")?;
}
}
SCHEMA_RAKED_ICONS => {
// TODO(unstable): this condition is `.has_data_left()` but it's unstable.
while buf_reader.fill_buf().map(|b| !b.is_empty())? {
handle_icon_pack(&mut buf_reader, &icon_store)
.context("failed to handle icon pack")?;
}
}
_ => {
bail!(
"Wrong schema version: wanted e.g. {:?}, got {:?}",
SCHEMA_RAKED_PAGES,
&schema
);
}
}
Ok(())
}
pub fn handle_page_pack(
buf_reader: &mut impl BufRead,
seed_lookup: &SeedLookupTable,
@@ -202,7 +318,8 @@ pub async fn build_seed_lookup_table(
match &seed.url {
UrlOrUrlPattern::Url(url_str) => {
let url = Url::parse(url_str)?;
-let reduced_domain = get_reduced_domain(&url)?;
+let reduced_domain = get_reduced_domain(&url)
+.with_context(|| format!("No domain in seed: '{url}'!"))?;
seed_lookup
.by_reduced_domain
.insert(reduced_domain.into(), seed);
@@ -222,7 +339,8 @@ impl SeedLookupTable {
return Ok(Some(seed));
}
-let domain = get_reduced_domain(url)?;
+let domain = get_reduced_domain(url)
+.with_context(|| format!("No domain in looked up URL: '{url}'"))?;
if let Some(seed) = self.by_reduced_domain.get(domain.as_ref()) {
return Ok(Some(seed));
}

View File

@@ -55,4 +55,17 @@ impl IndexerConfig {
}
}
}
/// Returns the path to a text file which can be used for storing a list of processed rakepacks
/// (needed for following rakepack streams over a network).
pub fn processed_rakepack_path(&self) -> anyhow::Result<PathBuf> {
match &self.index.backend {
BackendConfig::Tantivy(tantivy) => {
Ok(tantivy.index_dir.join("processed_rakepacks.lst"))
}
BackendConfig::Meili(_) => {
todo!()
}
}
}
}
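`processed_rakepack_path` points at a plain text file that the feed-following code appends to, one pack name per line, so already-indexed rakepacks can be skipped on the next run. A rough standalone sketch of that append-only pattern (the file name and pack name here are illustrative, not the project's exact values):

use std::collections::BTreeSet;
use std::fs::OpenOptions;
use std::io::{BufRead, BufReader, Write};

fn main() -> std::io::Result<()> {
    let path = "processed_rakepacks.lst";

    // Read what has been processed so far, creating the file if it doesn't exist.
    let file = OpenOptions::new().read(true).write(true).create(true).open(path)?;
    let processed: BTreeSet<String> = BufReader::new(&file)
        .lines()
        .collect::<Result<_, _>>()?;

    // Record a newly processed pack by appending a single line.
    let mut log = OpenOptions::new().append(true).create(true).open(path)?;
    if !processed.contains("2022-11-27.pages.pack") {
        writeln!(log, "2022-11-27.pages.pack")?;
    }
    Ok(())
}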

View File

@@ -59,6 +59,7 @@ smartstring = "1.0.0"
signal-hook = "0.3.13"
nix = "0.23.1"
quickpeep_html_charset_detection = { version = "0.1.0", path = "../quickpeep_html_charset_detection" }
+tikv-jemallocator = "0.5.0"

### Raking helpers
# HTTP Requests

View File

@@ -16,8 +16,8 @@ use quickpeep_raker::config;
use quickpeep_raker::storage::mdbx_helper_types::MdbxBare;
use quickpeep_raker::storage::records::{
-ActiveDomainRecord, AllowedDomainRecord, BackingOffDomainRecord, OnHoldUrlRecord,
-QueueUrlRecord, UrlVisitedRecord, WeedDomainRecord,
+ActiveDomainRecord, BackingOffDomainRecord, DomainRecord, OnHoldUrlRecord, QueueUrlRecord,
+UrlVisitedRecord,
};
use quickpeep_raker::storage::{RakerStore, RakerTxn};
@@ -46,7 +46,7 @@ pub async fn main() -> anyhow::Result<()> {
let config_path = opts
.config
-.unwrap_or_else(|| PathBuf::from("qp_raker.toml"));
+.unwrap_or_else(|| PathBuf::from("quickpeep.ron"));
let config = config::RakerConfig::load(&config_path).context("Failed to load config")?;

if !config.raker.workbench_dir.exists() {
@@ -111,11 +111,11 @@ pub async fn main() -> anyhow::Result<()> {
&txn,
)?;
}
-"allowed_domains" => {
-inspect::<MdbxBare<AllowedDomainRecord>>(
+"domains" => {
+inspect::<MdbxBare<DomainRecord>>(
opts.key_name.as_ref(),
opts.prefix,
-&txn.mdbx.borrow_dbs().allowed_domains,
+&txn.mdbx.borrow_dbs().domains,
&txn,
)?;
}
@@ -127,14 +127,6 @@ pub async fn main() -> anyhow::Result<()> {
&txn,
)?;
}
-"weed_domains" => {
-inspect::<MdbxBare<WeedDomainRecord>>(
-opts.key_name.as_ref(),
-opts.prefix,
-&txn.mdbx.borrow_dbs().weed_domains,
-&txn,
-)?;
-}
other => {
dark_yellow_ln!("Unknown database {:?}", other);
}

View File

@@ -33,7 +33,7 @@ pub async fn main() -> anyhow::Result<()> {
let config_path = opts
.config
-.unwrap_or_else(|| PathBuf::from("qp_raker.toml"));
+.unwrap_or_else(|| PathBuf::from("quickpeep.ron"));
let config = config::RakerConfig::load(&config_path).context("Failed to load config")?;

if !config.raker.workbench_dir.exists() {

View File

@@ -3,8 +3,9 @@ use clap::Parser;
use env_logger::Env;

use adblock::lists::RuleTypes;
-use anyhow::{bail, Context};
-use log::{debug, error, warn};
+use anyhow::{anyhow, bail, ensure, Context};
+use chrono::Utc;
+use log::{debug, error, info, warn};
use lru::LruCache;
use metrics_exporter_prometheus::PrometheusBuilder;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
@@ -14,7 +15,7 @@ use signal_hook::iterator::Signals;
use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex, RwLock};
-use std::time::{Duration, SystemTime};
+use std::time::{Duration, Instant, SystemTime};
use tokio::fs::File;
use tokio::sync::{mpsc, oneshot, Notify, Semaphore};
use tokio::time::MissedTickBehavior;
@@ -26,11 +27,12 @@ use quickpeep_raker::raking::page_extraction::PageExtractionService;
use quickpeep_raker::raking::rakemetrics::describe_raking_metrics;
use quickpeep_raker::raking::task::{TaskContext, TaskResultSubmission};
use quickpeep_raker::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT};
-use quickpeep_raker::storage::RakerStore;
+use quickpeep_raker::storage::{RakerStore, RandomActiveDomainAcquisition};
use quickpeep_structs::rake_entries::{
AnalysisAntifeatures, SCHEMA_RAKED_ICONS, SCHEMA_RAKED_PAGES, SCHEMA_RAKED_REFERENCES,
SCHEMA_RAKED_REJECTIONS,
};
+use quickpeep_utils::dates::date_to_quickpeep_days;

/// The ordering is slightly important on these: more specific things should come first.
/// This means they filter out the troublesome elements before the broader filters do.
@@ -58,6 +60,9 @@ pub struct Opts {
concurrent_sleepers: u32,
}

+#[global_allocator]
+static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

#[tokio::main]
pub async fn main() -> anyhow::Result<()> {
env_logger::Builder::from_env(
@@ -145,8 +150,7 @@ pub async fn main() -> anyhow::Result<()> {
describe_raking_metrics();
}

-let num_tasks = opts.concurrent_jobs + opts.concurrent_sleepers;
-let semaphore = Arc::new(Semaphore::new(opts.concurrent_jobs as usize));
+let active_fetch_semaphore = Arc::new(Semaphore::new(opts.concurrent_jobs as usize));

let (pages_tx, pages_rx) = mpsc::channel(32);
let (refs_tx, refs_rx) = mpsc::channel(32);
@@ -264,33 +268,24 @@ pub async fn main() -> anyhow::Result<()> {
raker: Arc::new(raker),
busy_domains: Arc::new(Mutex::new(Default::default())),
robotstxt_cache: Arc::new(RwLock::new(LruCache::new(64))),
-semaphore,
+semaphore: active_fetch_semaphore,
submission,
graceful_stop,
notify: graceful_stop_notify,
+rerake_timings: Arc::new(config.raker.rerake_timings.clone()),
};

-// Reinstate old backoffs
+// Reinstate old backoffs and re-rakable URLs
store
.async_rw_txn(|txn| {
+let today = date_to_quickpeep_days(&Utc::today())?;
txn.reinstate_backoffs(SystemTime::now())?;
+txn.reinstate_rerakables(today)?;
txn.commit()?;
Ok(())
})
.await?;

-let mut tasks = Vec::with_capacity(num_tasks as usize);
-for task_num in 0..num_tasks {
-let task_context = task_context.clone();
-tasks.push(tokio::spawn(async move {
-if let Err(err) = task_context.run().await {
-error!("Raker task {:?} encountered an error: {:?}", task_num, err);
-}
-}));
-}

let (dsmu_cancel_tx, mut dsmu_cancel_rx) = oneshot::channel();
let datastore_metrics_updater = {
let store = task_context.store.clone();
@@ -315,21 +310,22 @@ pub async fn main() -> anyhow::Result<()> {
})
};

-let TaskContext {
-graceful_stop,
-notify,
-submission,
-..
-} = task_context;
-// Manually drop submission otherwise the senders don't hang up.
-drop(submission);
+let graceful_stop = task_context.graceful_stop.clone();
+let notify = task_context.notify.clone();
+
+let worker_semaphore =
+Semaphore::new((opts.concurrent_jobs + opts.concurrent_sleepers) as usize);
+let orchestrator_handle = tokio::spawn(async move {
+if let Err(err) = orchestrator(task_context, Arc::new(worker_semaphore)).await {
+error!("Error in orchestrator: {err:?}");
+}
+});

// ^C is SIGINT; systemd sends SIGTERM
start_signal_handler(Signals::new([SIGINT, SIGTERM])?, graceful_stop, notify)?;

-for task in tasks {
-task.await?;
+if let Err(panic_err) = orchestrator_handle.await {
+error!("orchestrator panic: {panic_err:?}");
}

for task in emitters {
@@ -343,6 +339,101 @@ pub async fn main() -> anyhow::Result<()> {
Ok(())
}
async fn acquire_active_domain(task_context: &TaskContext) -> anyhow::Result<Option<String>> {
// Acquire a domain for the task to run against
let domain = {
let txn = task_context.store.ro_txn()?;
// TODO: don't clone teh Arc here — conv to ref.
txn.acquire_random_active_domain(task_context.busy_domains.clone())?
};
match domain {
RandomActiveDomainAcquisition::GotOne { domain, record: _ } => Ok(Some(domain)),
RandomActiveDomainAcquisition::AllBusy => Ok(None),
RandomActiveDomainAcquisition::NoneLeft => Ok(None),
}
}
/// Spawns tasks to do the work as necessary.
/// Performs back-off and re-rake reinstatements periodically and spawns up new workers if needed.
async fn orchestrator(task_context: TaskContext, semaphore: Arc<Semaphore>) -> anyhow::Result<()> {
let mut next_reinstate = Instant::now() + Duration::from_secs(1800);
let max_permits = semaphore.available_permits();
while !task_context.graceful_stop.load(Ordering::Relaxed) {
// Spawn up new tasks if there are available worker permits.
let domain_to_process = acquire_active_domain(&task_context)
.await
.context("failed trying to acquire active domain")?;
if domain_to_process.is_none() && semaphore.available_permits() == max_permits {
// There's nothing to do and nothing is being processed.
ensure!(
task_context.busy_domains.lock().unwrap().is_empty(),
"Shutting down orchestrator but set of busy domains is not empty."
);
}
tokio::select! {
_ = tokio::time::sleep_until(next_reinstate.into()) => {
// Reinstate backoffs and rerakables
if let Err(err) = task_context.store.async_rw_txn(|txn| {
txn.reinstate_backoffs(SystemTime::now())?;
let today = date_to_quickpeep_days(&Utc::today())?;
txn.reinstate_rerakables(today)?;
txn.commit()?;
Ok(())
}).await {
error!("Error performing periodic reinstatements: {err:?}");
}
next_reinstate = Instant::now() + Duration::from_secs(1800);
},
_ = task_context.notify.notified() => {
// nop: just wake from the loop
}
Ok(new_permit) = semaphore.clone().acquire_owned(), if domain_to_process.is_some() => {
let domain = domain_to_process.unwrap();
let mut task_context = task_context.clone();
tokio::spawn(async move {
if let Err(err) = task_context.process_domain(domain.clone()).await {
error!("Encountered error processing {:?}: {:?}", domain, err);
}
ensure!(
task_context.busy_domains
.lock()
.map_err(|_| anyhow!("busy domains set poisoned"))?
.remove(&domain),
"Our domain was not busy after processing!"
);
// Release the permit here, within the task.
drop(new_permit);
Ok(())
});
}
};
}
info!("Orchestrator shutting down gracefully...");
// Wind up:
let TaskContext { submission, .. } = task_context;
// Manually drop submission otherwise the senders don't hang up.
drop(submission);
let num_active_tasks = max_permits - semaphore.available_permits();
info!("Waiting for {num_active_tasks} rake tasks to close.");
info!(
"Acquired all remaining permits: {:?}",
semaphore.acquire_many(num_active_tasks as u32).await
);
Ok(())
}
fn start_signal_handler(
mut signals: Signals,
shutdown: Arc<AtomicBool>,
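The orchestrator added above hands each acquired domain to a spawned task only when a semaphore permit is available, the permit is released when the task finishes, and shutdown can wait for in-flight work by re-acquiring every permit. Below is a much-simplified sketch of just that permit pattern with tokio; the names and the domain list are placeholders, not the raker's real types:

use std::sync::Arc;
use tokio::sync::Semaphore;

#[tokio::main]
async fn main() {
    // Hypothetical stand-in for the raker's per-domain work.
    async fn process_domain(domain: String) {
        println!("raking {domain}");
    }

    // Roughly analogous to concurrent_jobs + concurrent_sleepers.
    let semaphore = Arc::new(Semaphore::new(4));
    let domains = vec!["example.org".to_string(), "example.net".to_string()];

    for domain in domains {
        // acquire_owned ties the permit's lifetime to the spawned task,
        // so a worker slot is only freed once the task finishes.
        let permit = semaphore.clone().acquire_owned().await.unwrap();
        tokio::spawn(async move {
            process_domain(domain).await;
            drop(permit);
        });
    }

    // Wait for all workers to finish by re-acquiring every permit.
    let _ = semaphore.acquire_many(4).await.unwrap();
}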

View File

@@ -1,5 +1,5 @@
use clap::Parser;
-use std::borrow::{Borrow, BorrowMut};
+use std::borrow::Borrow;

use env_logger::Env;
@@ -11,15 +11,13 @@ use std::path::PathBuf;
use tokio::sync::mpsc;
use tokio::sync::mpsc::Receiver;

-use quickpeep_raker::config;
use quickpeep_raker::config::RakerConfig;
+use quickpeep_raker::raking::references::SUPPORTED_SCHEMES;
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
-use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord};
use quickpeep_raker::storage::{maintenance, RakerStore};
use quickpeep_seed_parser::loader::{
find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION, WEED_EXTENSION,
};
-use quickpeep_utils::dirty::DirtyTracker;
use quickpeep_utils::urls::get_reduced_domain;

/// Seeds a raker's queue with URLs
@@ -41,8 +39,8 @@ pub async fn main() -> anyhow::Result<()> {
let config_path = opts
.config
-.unwrap_or_else(|| PathBuf::from("qp_raker.toml"));
-let config = config::RakerConfig::load(&config_path).context("Failed to load config")?;
+.unwrap_or_else(|| PathBuf::from("quickpeep.ron"));
+let config = RakerConfig::load(&config_path).context("Failed to load config")?;

if !config.raker.workbench_dir.exists() {
bail!(
@@ -144,48 +142,39 @@ async fn importer(
buf.push(seed);

if buf.len() == BATCH_SIZE {
-if are_weeds {
-import_and_flush_batch_weeds(&store, &mut buf, &mut stats).await?;
-} else {
-import_and_flush_batch_seeds(&store, &mut buf, &mut stats, &client).await?;
-}
+import_and_flush_batch_seeds_or_weeds(
+&store, &mut buf, &mut stats, &client, !are_weeds,
+)
+.await?;
}
}

-if are_weeds {
-import_and_flush_batch_weeds(&store, &mut buf, &mut stats).await?;
-} else {
-import_and_flush_batch_seeds(&store, &mut buf, &mut stats, &client).await?;
-}
+import_and_flush_batch_seeds_or_weeds(&store, &mut buf, &mut stats, &client, !are_weeds)
+.await?;

Ok(stats)
}
-async fn import_and_flush_batch_seeds(
+async fn import_and_flush_batch_seeds_or_weeds(
store: &RakerStore,
buf: &mut Vec<Seed>,
stats: &mut SeedImportStats,
client: &Client,
+is_seed: bool,
) -> anyhow::Result<()> {
let txn = store.rw_txn()?;
for seed in buf.drain(..) {
let as_url = Url::parse(seed.url.as_str())
.with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
-let domain = get_reduced_domain(&as_url)?;
+let domain = get_reduced_domain(&as_url)
+.with_context(|| format!("No domain in seed URL '{as_url}'!"))?;

-let allowed_domain_record = txn.get_allowed_domain_record(domain.borrow())?;
-let is_domain_new = allowed_domain_record.is_none();
+let domain_record = txn.get_domain_record(domain.borrow())?;
+let is_domain_new = domain_record.is_none();
+let mut domain_record = domain_record.unwrap_or_default();
if is_domain_new {
stats.new_domains += 1;
}
+let mut dirty = is_domain_new;

-let mut allowed_domain_record = DirtyTracker::new(
-allowed_domain_record.unwrap_or_else(|| AllowedDomainRecord::default()),
-);
-if is_domain_new {
-// Mark it as dirty
-let _: &mut AllowedDomainRecord = allowed_domain_record.borrow_mut();
-}

// Register the domain. This is a no-op if it's already active or backing off.
txn.insert_active_domain_with_new_raffle_ticket(domain.clone().into_owned())?;
@ -193,41 +182,53 @@ async fn import_and_flush_batch_seeds(
let url_like = match &seed.url { let url_like = match &seed.url {
UrlOrUrlPattern::Url(url_str) => { UrlOrUrlPattern::Url(url_str) => {
let url = Url::parse(url_str.as_str())?; let url = Url::parse(url_str.as_str())?;
if txn.enqueue_url(url.as_str(), None, RakeIntent::Any)? { if is_seed {
stats.new_urls += 1; if txn.enqueue_url(url.as_str(), None, RakeIntent::Any)? {
} else { stats.new_urls += 1;
stats.already_present_urls += 1; } else {
stats.already_present_urls += 1;
}
} }
// Seed/weed with empty prefix
dirty |= domain_record
.rakeable_path_prefixes
.insert(String::new(), is_seed)
!= Some(is_seed);
url url
} }
UrlOrUrlPattern::UrlPrefix(prefix) => { UrlOrUrlPattern::UrlPrefix(prefix) => {
let prefix_as_url = Url::parse(prefix.as_str())?; let prefix_as_url = Url::parse(prefix.as_str())?;
if txn.enqueue_url(prefix_as_url.as_str(), None, RakeIntent::Any)? { if is_seed {
stats.new_urls += 1; if txn.enqueue_url(prefix_as_url.as_str(), None, RakeIntent::Any)? {
} else { stats.new_urls += 1;
stats.already_present_urls += 1; } else {
} stats.already_present_urls += 1;
if is_domain_new { }
let allowed_domain_record: &mut AllowedDomainRecord =
allowed_domain_record.borrow_mut();
allowed_domain_record
.restricted_prefixes
.insert(prefix_as_url.path().to_string());
} }
dirty |= domain_record
.rakeable_path_prefixes
.insert(prefix_as_url.path().to_string(), is_seed)
!= Some(is_seed);
prefix_as_url prefix_as_url
} }
}; };
if allowed_domain_record.is_dirty() { if dirty {
txn.put_allowed_domain_record(domain.borrow(), allowed_domain_record.into_inner())?; txn.put_domain_record(domain.borrow(), domain_record)?;
} }
if is_domain_new { if is_seed {
// look at robots.txt and discover sitemaps! // look at robots.txt and discover sitemaps!
if let Some(robots_txt) = get_robots_txt_for(&url_like, &client).await? { if let Some(robots_txt) = get_robots_txt_for(&url_like, &client).await? {
for sitemap in robots_txt.sitemaps { for sitemap in robots_txt.sitemaps {
txn.enqueue_url(sitemap.url.as_str(), None, RakeIntent::SiteMap)?; if SUPPORTED_SCHEMES.contains(&sitemap.url.scheme()) {
stats.new_sitemaps += 1; txn.enqueue_url(sitemap.url.as_str(), None, RakeIntent::SiteMap)?;
stats.new_sitemaps += 1;
}
} }
} }
} }
@ -235,36 +236,3 @@ async fn import_and_flush_batch_seeds(
txn.commit()?; txn.commit()?;
Ok(()) Ok(())
} }
async fn import_and_flush_batch_weeds(
store: &RakerStore,
buf: &mut Vec<Seed>,
stats: &mut SeedImportStats,
) -> anyhow::Result<()> {
let txn = store.rw_txn()?;
for seed in buf.drain(..) {
let as_url = Url::parse(seed.url.as_str())
.with_context(|| format!("Failed to parse {:?} as URL", seed.url))?;
let domain = get_reduced_domain(&as_url)?;
let weed_domain_record = txn.get_weed_domain_record(domain.borrow())?;
let is_domain_new = weed_domain_record.is_none();
if is_domain_new {
stats.new_domains += 1;
}
let mut weed_domain_record =
DirtyTracker::new(weed_domain_record.unwrap_or_else(|| WeedDomainRecord::default()));
if is_domain_new {
// Mark it as dirty
let _: &mut WeedDomainRecord = weed_domain_record.borrow_mut();
}
if weed_domain_record.is_dirty() {
txn.put_weed_domain_record(domain.borrow(), weed_domain_record.into_inner())?;
}
}
txn.commit()?;
Ok(())
}


@ -28,6 +28,23 @@ pub struct RakerOnlyConfig {
pub metrics: MetricsConfig, pub metrics: MetricsConfig,
pub pack_emitter: PackEmitterSettings, pub pack_emitter: PackEmitterSettings,
pub rerake_timings: RerakeTimings,
}
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct RerakeTimings {
/// How long, in days, between re-rakes of the same page?
/// Suggested: 300
pub page: u16,
/// How long, in days, between re-rakes of feeds?
/// Suggested: 10
pub feed: u16,
/// How long, in days, between re-rakes of icons?
/// Suggested: 365
pub icon: u16,
} }
impl RakerConfig { impl RakerConfig {
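
The new rerake_timings section is just three day counts. As a rough illustration (not code from this changeset), the suggested values from the doc comments would be used like this, with the page timing added to the day-precision visit datestamp to schedule the next rake:

    // Minimal sketch assuming only the RerakeTimings struct shown above.
    #[derive(Debug, Clone)]
    struct RerakeTimings { page: u16, feed: u16, icon: u16 }

    fn main() {
        // The "Suggested" values from the doc comments.
        let timings = RerakeTimings { page: 300, feed: 10, icon: 365 };
        // A page visited on QuickPeep day `today` is queued again `page` days later.
        let today: u16 = 19_000; // hypothetical day-precision datestamp
        let rerake_on = today + timings.page;
        println!("{timings:?}: re-rake the page on day {rerake_on}");
    }
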


@ -57,12 +57,15 @@ pub fn pack_emitter<T: Serialize + Send + 'static>(
Unit::Count, Unit::Count,
"Records emitted into a pack file" "Records emitted into a pack file"
); );
let pack_index_file = directory.join("index");
loop { loop {
let now = Utc::now(); let now = Utc::now();
// 2022-01-01 01:01:01 // 2022-01-01_01:01:01
let new_pack_file_path = loop { let (pack_name, new_pack_file_path) = loop {
let new_pack_file_path = let pack_name = format!("{}.{}.pack", now.format("%F_%T"), name);
directory.join(format!("{}.{}.pack", now.format("%F_%T"), name)); let new_pack_file_path = directory.join(&pack_name);
if new_pack_file_path.exists() { if new_pack_file_path.exists() {
warn!( warn!(
"{:?} already exists; sleeping to generate new timestamp.", "{:?} already exists; sleeping to generate new timestamp.",
@ -70,11 +73,11 @@ pub fn pack_emitter<T: Serialize + Send + 'static>(
); );
std::thread::sleep(Duration::from_secs(2)); std::thread::sleep(Duration::from_secs(2));
} else { } else {
break new_pack_file_path; break (pack_name, new_pack_file_path);
} }
}; };
if !pack_emitter_to_file( let file_cutoff_reached = pack_emitter_to_file(
&new_pack_file_path, &new_pack_file_path,
&mut rx, &mut rx,
name, name,
@ -82,8 +85,19 @@ pub fn pack_emitter<T: Serialize + Send + 'static>(
settings, settings,
shutdown.clone(), shutdown.clone(),
shutdown_notify.clone(), shutdown_notify.clone(),
)? { )?;
// File wasn't filled; the receiver was exhausted (we're shutting down).
// Add an entry to the index. This essentially marks it as 'done' and enables
// a follower to catch up.
let mut index_file = OpenOptions::new()
.create(true)
.append(true)
.open(&pack_index_file)?;
index_file.write(format!("\n{}", pack_name).as_bytes())?;
index_file.flush()?;
if !file_cutoff_reached {
// File wasn't filled; the receiver was exhausted (that means we're shutting down).
break; break;
} }
} }
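
The index file written above simply accumulates one completed pack name per line, which is what lets a follower catch up safely. A hypothetical follower-side reader, shown only to illustrate the format (the path and helper name are made up):

    use std::fs;
    use std::io;

    // Lists packs recorded as complete in the `index` file. Assumes the format
    // appended above: one pack file name per line (a leading blank line may
    // exist because of the "\n{name}" write).
    fn completed_packs(index_path: &str) -> io::Result<Vec<String>> {
        let contents = fs::read_to_string(index_path)?;
        Ok(contents
            .lines()
            .filter(|line| !line.trim().is_empty())
            .map(|line| line.to_owned())
            .collect())
    }

    fn main() -> io::Result<()> {
        // Hypothetical location of a pack directory's index.
        for pack in completed_packs("rakepacks/index")? {
            println!("complete: {pack}");
        }
        Ok(())
    }
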


@ -1,5 +1,10 @@
use crate::raking::analysis::IpSet; use std::collections::{BTreeSet, HashMap, HashSet};
use crate::raking::page_extraction::{ExtractedPage, PageExtractionService}; use std::error::Error;
use std::fmt::{Debug, Display, Formatter};
use std::io::Cursor;
use std::str::FromStr;
use std::time::Duration;
use ::metrics::increment_counter; use ::metrics::increment_counter;
use anyhow::{anyhow, bail, Context}; use anyhow::{anyhow, bail, Context};
use chrono::{DateTime, FixedOffset, Utc}; use chrono::{DateTime, FixedOffset, Utc};
@ -10,20 +15,18 @@ use image::imageops::FilterType;
use image::{GenericImageView, ImageFormat}; use image::{GenericImageView, ImageFormat};
use itertools::Itertools; use itertools::Itertools;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use log::debug; use log::{debug, info, warn};
use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind};
use reqwest::header::HeaderMap; use reqwest::header::HeaderMap;
use reqwest::{Client, Response, Url}; use reqwest::{Client, Response, Url};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use sitemap::reader::SiteMapEntity; use sitemap::reader::SiteMapEntity;
use std::collections::{HashMap, HashSet};
use std::error::Error;
use std::fmt::{Debug, Display, Formatter};
use std::io::Cursor;
use std::str::FromStr;
use std::time::Duration;
use tokio::time::Instant; use tokio::time::Instant;
use quickpeep_structs::rake_entries::{RakedPageEntry, RakedReferrerEntry, ReferenceKind};
use crate::raking::analysis::IpSet;
use crate::raking::page_extraction::{ExtractedPage, PageExtractionService};
pub mod analysis; pub mod analysis;
pub mod page_extraction; pub mod page_extraction;
pub mod rakemetrics; pub mod rakemetrics;
@ -59,6 +62,8 @@ pub enum RedirectReason {
}, },
/// The page was not canonical, and should not be indexed. /// The page was not canonical, and should not be indexed.
NotCanonical, NotCanonical,
/// Upgrade from an HTTP URL to an HTTPS URL (or equivalent).
SecureUpgrade,
} }
#[derive(Clone, Debug, PartialEq, Eq)] #[derive(Clone, Debug, PartialEq, Eq)]
@ -107,6 +112,7 @@ pub enum TemporaryFailureReason {
pub enum PermanentFailureReason { pub enum PermanentFailureReason {
ResourceDenied(u16), ResourceDenied(u16),
DeniedToRobots, DeniedToRobots,
IndexingDenied,
WrongLanguage(String), WrongLanguage(String),
UnknownContentType(String), UnknownContentType(String),
ExceedsSizeLimit, ExceedsSizeLimit,
@ -149,7 +155,7 @@ impl FromStr for RakeIntent {
impl From<ReferenceKind> for RakeIntent { impl From<ReferenceKind> for RakeIntent {
fn from(kind: ReferenceKind) -> Self { fn from(kind: ReferenceKind) -> Self {
match kind { match kind {
ReferenceKind::CanonicalUrl => { ReferenceKind::CanonicalUrl | ReferenceKind::SecureUpgrade => {
// FIXME We don't know what this is a canonical URL for. Suppose it doesn't matter... // FIXME We don't know what this is a canonical URL for. Suppose it doesn't matter...
RakeIntent::Any RakeIntent::Any
} }
@ -295,6 +301,30 @@ pub struct Raker {
} }
impl Raker { impl Raker {
/// Figure out whether we can upgrade a URL to HTTPS.
pub async fn try_upgrade_to_https(
&self,
url: &Url,
client: &Client,
) -> anyhow::Result<Option<Url>> {
if url.scheme().eq_ignore_ascii_case("http") {
// Try to upgrade to HTTPS if we can.
let mut https_url = url.clone();
https_url.set_scheme("https").unwrap();
client
.head(https_url.clone())
.timeout(Duration::from_secs(10))
.send()
.await
.context("failed to make HEAD request")?
.error_for_status()
.context("bad response for HEAD requesst")?;
Ok(Some(https_url))
} else {
Ok(None)
}
}
/// Rakes a resource by URL. /// Rakes a resource by URL.
/// ///
/// `intent` specifies the kind of resource we're expecting. This matters in a few circumstances, /// `intent` specifies the kind of resource we're expecting. This matters in a few circumstances,
@ -305,6 +335,22 @@ impl Raker {
intent: RakeIntent, intent: RakeIntent,
client: &Client, client: &Client,
) -> anyhow::Result<RakeOutcome> { ) -> anyhow::Result<RakeOutcome> {
match self.try_upgrade_to_https(url, client).await {
Ok(Some(upgraded)) => {
return Ok(RakeOutcome::Redirect {
reason: RedirectReason::SecureUpgrade,
new_url: upgraded,
});
}
Ok(None) => {
// continue
}
Err(err) => {
info!("can't upgrade {url} to HTTPS: {err:?}");
// continue
}
}
let response = client.get(url.clone()).send().await?; let response = client.get(url.clone()).send().await?;
let is_cf = if let Some(remote_addr) = response.remote_addr() { let is_cf = if let Some(remote_addr) = response.remote_addr() {
@ -460,8 +506,34 @@ impl Raker {
document, document,
feeds, feeds,
antifeature_flags, antifeature_flags,
no_follow,
no_index,
} => { } => {
let references = references::find_references(&unreadable_document, &feeds, url); if no_index {
return Ok(RakeOutcome::PermanentFailure(PermanentFailure {
reason: PermanentFailureReason::IndexingDenied,
}));
}
let mut references = references::find_references(&unreadable_document, &feeds, url);
if no_follow {
// Remove any link references
for reference in references {
match reference.kind {
ReferenceKind::Link | ReferenceKind::HeaderLinkedFeed => (),
ReferenceKind::CanonicalUrl
| ReferenceKind::FeedEntry
| ReferenceKind::SitemapEntry
| ReferenceKind::SecureUpgrade
| ReferenceKind::Redirect => {
warn!("unexpected: refkind of {:?} being filtered due to meta nofollow. This is a bug.", reference.kind);
}
}
}
references = BTreeSet::new();
}
Ok(RakeOutcome::RakedPage(RakedPage { Ok(RakeOutcome::RakedPage(RakedPage {
page_entry: RakedPageEntry { page_entry: RakedPageEntry {
analysed_antifeatures: antifeature_flags, analysed_antifeatures: antifeature_flags,
@ -560,7 +632,6 @@ pub fn rake_sitemap(content: &[u8]) -> anyhow::Result<Vec<UrlRaked>> {
debug!("Sitemap error {:?}", error); debug!("Sitemap error {:?}", error);
} }
} }
eprintln!("{:?}", entry);
} }
if urls.is_empty() { if urls.is_empty() {


@ -156,6 +156,25 @@ impl PageExtractionServiceInternal {
} }
} }
let mut no_follow = false;
let mut no_index = false;
// Find any restrictions on indexing this page or following any links.
if let Ok(robots_nodes) = root_node.select("meta[name=robots]") {
for node in robots_nodes {
if let Some(content) = node.attributes.borrow().get("content") {
for directive in content
.split(|c: char| c.is_whitespace() || c == ',')
.filter(|s| !s.is_empty())
{
let none = directive.eq_ignore_ascii_case("none");
no_follow |= directive.eq_ignore_ascii_case("nofollow") | none;
no_index |= directive.eq_ignore_ascii_case("noindex") | none;
}
}
}
}
if language.is_none() { if language.is_none() {
// Next fallback: prefer the content-language header baked into the page itself // Next fallback: prefer the content-language header baked into the page itself
if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") { if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") {
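
For clarity, the directive handling added above splits the content attribute on commas and whitespace and matches tokens case-insensitively, with none implying both noindex and nofollow. A standalone sketch of the same parsing (hypothetical helper, not the real extraction code):

    // Sketch of the <meta name=robots> directive handling added above.
    fn parse_robots_meta(content: &str) -> (bool, bool) {
        let mut no_follow = false;
        let mut no_index = false;
        for directive in content
            .split(|c: char| c.is_whitespace() || c == ',')
            .filter(|s| !s.is_empty())
        {
            let none = directive.eq_ignore_ascii_case("none");
            no_follow |= directive.eq_ignore_ascii_case("nofollow") || none;
            no_index |= directive.eq_ignore_ascii_case("noindex") || none;
        }
        (no_follow, no_index)
    }

    fn main() {
        assert_eq!(parse_robots_meta("NOINDEX, follow"), (false, true));
        assert_eq!(parse_robots_meta("none"), (true, true));
        assert_eq!(parse_robots_meta("index follow"), (false, false));
    }
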
@ -311,6 +330,8 @@ impl PageExtractionServiceInternal {
document, document,
feeds, feeds,
antifeature_flags, antifeature_flags,
no_follow,
no_index,
}) })
} }
} }
@ -362,6 +383,8 @@ pub enum ExtractedPage {
document: DenseDocument, document: DenseDocument,
feeds: Vec<Url>, feeds: Vec<Url>,
antifeature_flags: AnalysisAntifeatures, antifeature_flags: AnalysisAntifeatures,
no_follow: bool,
no_index: bool,
}, },
Redirect { Redirect {
reason: RedirectReason, reason: RedirectReason,


@ -6,6 +6,10 @@ use quickpeep_utils::dates::date_to_quickpeep_days;
use reqwest::Url; use reqwest::Url;
use std::collections::BTreeSet; use std::collections::BTreeSet;
/// Supported schemes.
/// References in all other schemes will be ignored.
pub const SUPPORTED_SCHEMES: [&'static str; 2] = ["http", "https"];
pub fn find_references( pub fn find_references(
doc: &Vec<DenseTree>, doc: &Vec<DenseTree>,
feeds: &Vec<Url>, feeds: &Vec<Url>,
@ -41,11 +45,24 @@ pub fn find_references(
} => { } => {
if !nofollow { if !nofollow {
if let Ok(full_url) = page_url.join(&href) { if let Ok(full_url) = page_url.join(&href) {
refs.insert(RakedReference { if full_url.domain().is_none() {
target: clean_url(&full_url).to_string(), // Skip URLs that don't have a domain after being made absolute.
kind: ReferenceKind::Link, // This also skips IP addresses: we probably don't want to bother
last_mod: None, // indexing content from explicit IP addresses.
}); continue;
}
if SUPPORTED_SCHEMES.contains(&full_url.scheme()) {
refs.insert(RakedReference {
target: clean_url(&full_url).to_string(),
kind: ReferenceKind::Link,
last_mod: None,
});
} else {
debug!(
"ignoring reference {:?}: not a supported scheme",
full_url.as_str()
);
}
} else { } else {
debug!("Can't join {:?} + {:?} to get full URL", page_url, href); debug!("Can't join {:?} + {:?} to get full URL", page_url, href);
} }
@ -61,6 +78,10 @@ pub fn find_references(
add_link_refs(&doc, &mut refs, &page_url); add_link_refs(&doc, &mut refs, &page_url);
for feed in feeds { for feed in feeds {
if feed.domain().is_none() {
// same rationale as above.
continue;
}
refs.insert(RakedReference { refs.insert(RakedReference {
target: clean_url(feed).as_str().to_owned(), target: clean_url(feed).as_str().to_owned(),
kind: ReferenceKind::HeaderLinkedFeed, kind: ReferenceKind::HeaderLinkedFeed,
@ -78,7 +99,7 @@ pub fn references_from_urlrakes(
input input
.iter() .iter()
.map(|url_raked| RakedReference { .map(|url_raked| RakedReference {
target: url_raked.url.to_string(), target: clean_url(&url_raked.url).to_string(),
kind: ref_kind, kind: ref_kind,
last_mod: url_raked last_mod: url_raked
.last_changed .last_changed
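
The two new filters on outgoing references are: the joined URL must have a real domain (so bare IP addresses are dropped) and its scheme must be in SUPPORTED_SCHEMES. A self-contained sketch of that predicate, using the url crate and a made-up helper name:

    use url::Url;

    // Mirrors the two checks added in find_references above; not the real signature.
    const SUPPORTED_SCHEMES: [&str; 2] = ["http", "https"];

    fn worth_referencing(url: &Url) -> bool {
        url.domain().is_some() && SUPPORTED_SCHEMES.contains(&url.scheme())
    }

    fn main() {
        let ok = Url::parse("https://example.org/post").unwrap();
        let mail = Url::parse("mailto:someone@example.org").unwrap();
        let ip = Url::parse("http://192.0.2.1/page").unwrap();
        assert!(worth_referencing(&ok));
        assert!(!worth_referencing(&mail)); // unsupported scheme, no domain
        assert!(!worth_referencing(&ip));   // IP address, not a domain
    }
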


@ -1,14 +1,15 @@
use crate::raking::references::references_from_urlrakes; use crate::config::RerakeTimings;
use crate::raking::references::{clean_url, references_from_urlrakes, SUPPORTED_SCHEMES};
use crate::raking::{ use crate::raking::{
get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent, get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent,
RakeOutcome, Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason, RakeOutcome, Raker, RedirectReason, RobotsTxt, TemporaryFailure, TemporaryFailureReason,
}; };
use crate::storage::records::{AllowedDomainRecord, UrlVisitedRecord, WeedDomainRecord}; use crate::storage::records::{DomainRecord, UrlVisitedRecord};
use crate::storage::{RakerStore, RandomActiveDomainAcquisition}; use crate::storage::RakerStore;
use anyhow::{anyhow, ensure, Context}; use anyhow::{anyhow, Context};
use chrono::Utc; use chrono::Utc;
use cylon::Cylon; use cylon::Cylon;
use log::{error, warn}; use log::{debug, warn};
use lru::LruCache; use lru::LruCache;
use metrics::increment_counter; use metrics::increment_counter;
use quickpeep_structs::rake_entries::{ use quickpeep_structs::rake_entries::{
@ -78,56 +79,11 @@ pub struct TaskContext {
/// Notifier used to wake up sleepers (either to stop them gracefully, or because work /// Notifier used to wake up sleepers (either to stop them gracefully, or because work
/// is available (not implemented)) /// is available (not implemented))
pub notify: Arc<Notify>, pub notify: Arc<Notify>,
pub rerake_timings: Arc<RerakeTimings>,
} }
impl TaskContext { impl TaskContext {
pub async fn run(mut self) -> anyhow::Result<()> {
// Get a domain to process
while !self.graceful_stop.load(Ordering::SeqCst) {
let domain = {
let txn = self.store.ro_txn()?;
txn.acquire_random_active_domain(self.busy_domains.clone())?
};
match domain {
RandomActiveDomainAcquisition::GotOne {
domain,
record: _active_record,
} => {
if let Err(err) = self.process_domain(domain.clone()).await {
error!("Encountered error processing {:?}: {:?}", domain, err);
}
ensure!(
self.busy_domains
.lock()
.map_err(|_| anyhow!("busy domains set poisoned"))?
.remove(&domain),
"Our domain was not busy after processing!"
);
}
RandomActiveDomainAcquisition::AllBusy => {
// TODO(perf): notify waiters when new domains are available.
tokio::select! {
_ = tokio::time::sleep(Duration::from_secs(60)) => {
// nop
},
_ = self.notify.notified() => {
// nop (we allow the notifier to wake us up in case we need to gracefully
// stop).
},
};
}
RandomActiveDomainAcquisition::NoneLeft => {
// Nothing left to do, and it's not temporary because there aren't even any
// busy domains left.
break;
}
}
}
Ok(())
}
pub async fn get_robot_rules(&self, url_of_site: &Url) -> anyhow::Result<Option<Cylon>> { pub async fn get_robot_rules(&self, url_of_site: &Url) -> anyhow::Result<Option<Cylon>> {
let robots = get_robots_txt_for(url_of_site, &self.redirect_following_client).await?; let robots = get_robots_txt_for(url_of_site, &self.redirect_following_client).await?;
Ok(robots.map(|robots: RobotsTxt| robots.rules)) Ok(robots.map(|robots: RobotsTxt| robots.rules))
@ -138,11 +94,23 @@ impl TaskContext {
let mut current_robot_rules: Option<Cylon> = None; let mut current_robot_rules: Option<Cylon> = None;
let mut wait_until: Option<Instant> = None; let mut wait_until: Option<Instant> = None;
let domain_record = {
let txn = self.store.ro_txn()?;
let dr = txn.get_domain_record(&domain)?;
match dr {
None => {
return Ok(());
}
Some(dr) => dr,
}
};
while !self.graceful_stop.load(Ordering::Relaxed) { while !self.graceful_stop.load(Ordering::Relaxed) {
// Get a URL to process // Get a URL to process
let url = { let url = {
let txn = self.store.ro_txn()?; let txn = self.store.ro_txn()?;
txn.choose_url_for_domain(&domain)? txn.choose_url_for_domain(&domain)
.context("failed to choose URL for domain")?
}; };
let (url_str, url_record) = if let Some(url) = url { let (url_str, url_record) = if let Some(url) = url {
@ -163,12 +131,14 @@ impl TaskContext {
} }
// Delete the active domain from the store // Delete the active domain from the store
txn.remove_active_domain(&domain)?; txn.remove_active_domain(&domain)
.context("failed to remove active domain")?;
txn.commit()?; txn.commit()?;
Ok(true) Ok(true)
}) })
.await?; .await
.context("failed to check if we're out of URLs")?;
if out_of_urls { if out_of_urls {
break; break;
} else { } else {
@ -176,10 +146,26 @@ impl TaskContext {
} }
}; };
let url = Url::parse(&url_str)?; let url = Url::parse(&url_str)
.with_context(|| format!("failed to parse as URL: {url_str:?}"))?;
if !domain_record.is_url_rakeable(&url).unwrap_or(false) {
// This is now a weed: skip.
let domain = domain.clone();
let url = url.clone();
self.store
.async_rw_txn(move |txn| {
txn.dequeue_url(&domain, url.as_str())?;
txn.commit()?;
Ok(())
})
.await?;
continue;
}
// Check our robot rules are valid for that URL. // Check our robot rules are valid for that URL.
let robot_url = robots_txt_url_for(&url)?; let robot_url = robots_txt_url_for(&url)
.with_context(|| format!("failed to get robots.txt URL for {url_str:?}"))?;
if Some(&robot_url) != current_robot_rules_url.as_ref() { if Some(&robot_url) != current_robot_rules_url.as_ref() {
// We need to update our robot rules! // We need to update our robot rules!
match self.get_robot_rules(&url).await { match self.get_robot_rules(&url).await {
@ -198,7 +184,8 @@ impl TaskContext {
backoff_sec: 86400, backoff_sec: 86400,
}), }),
) )
.await?; .await
.context("failed to handle TemporaryFailure outcome for robots.txt")?;
// Forcefully change domain // Forcefully change domain
return Ok(()); return Ok(());
} }
@ -216,7 +203,8 @@ impl TaskContext {
reason: PermanentFailureReason::DeniedToRobots, reason: PermanentFailureReason::DeniedToRobots,
}), }),
) )
.await?; .await
.context("failed to process PermanentFailure outcome for robots.txt")?;
continue; continue;
} }
} }
@ -252,6 +240,7 @@ impl TaskContext {
} else { } else {
&self.client &self.client
}; };
debug!("Rake: {url}");
let raked = self.raker.rake(&url, url_record.intent, client).await; let raked = self.raker.rake(&url, url_record.intent, client).await;
drop(permit); drop(permit);
@ -320,7 +309,8 @@ impl TaskContext {
txn.commit()?; txn.commit()?;
Ok(()) Ok(())
}) })
.await?; .await
.context("failure whilst turning long crawl delay into backoff")?;
} }
} }
@ -346,10 +336,12 @@ impl TaskContext {
self.as_event_processor() self.as_event_processor()
.process_page(url.clone(), page.page_entry, today) .process_page(url.clone(), page.page_entry, today)
.await?; .await
.context("failure processing page for RakedPage")?;
self.as_event_processor() self.as_event_processor()
.process_refs(url.clone(), page.referrer_entry, today) .process_refs(url.clone(), page.referrer_entry, today, false)
.await?; .await
.context("failure processing refs for RakedPage")?;
Ok(NextAction::Continue) Ok(NextAction::Continue)
} }
@ -365,8 +357,9 @@ impl TaskContext {
.context("Reference processor shut down; can't stream references!")?; .context("Reference processor shut down; can't stream references!")?;
self.as_event_processor() self.as_event_processor()
.process_refs(url.clone(), refs, today) .process_refs(url.clone(), refs, today, true)
.await?; .await
.context("failure processing refs for RakedFeed")?;
Ok(NextAction::Continue) Ok(NextAction::Continue)
} }
@ -382,8 +375,9 @@ impl TaskContext {
.context("Reference processor shut down; can't stream references!")?; .context("Reference processor shut down; can't stream references!")?;
self.as_event_processor() self.as_event_processor()
.process_refs(url.clone(), refs, today) .process_refs(url.clone(), refs, today, true)
.await?; .await
.context("failure processing refs for RakedSitemap")?;
Ok(NextAction::Continue) Ok(NextAction::Continue)
} }
@ -402,17 +396,19 @@ impl TaskContext {
self.as_event_processor() self.as_event_processor()
.process_icon(url.clone(), today) .process_icon(url.clone(), today)
.await?; .await
.context("failure processing icon for RakedIcon")?;
Ok(NextAction::Continue) Ok(NextAction::Continue)
} }
RakeOutcome::Redirect { reason, new_url } => { RakeOutcome::Redirect { reason, new_url } => {
let refs = RakedReferrerEntry { let refs = RakedReferrerEntry {
references: [RakedReference { references: [RakedReference {
target: new_url.to_string(), target: clean_url(&new_url).to_string(),
kind: match reason { kind: match reason {
RedirectReason::Redirected { .. } => ReferenceKind::Redirect, RedirectReason::Redirected { .. } => ReferenceKind::Redirect,
RedirectReason::NotCanonical { .. } => ReferenceKind::CanonicalUrl, RedirectReason::NotCanonical { .. } => ReferenceKind::CanonicalUrl,
RedirectReason::SecureUpgrade => ReferenceKind::SecureUpgrade,
}, },
last_mod: None, last_mod: None,
}] }]
@ -427,8 +423,9 @@ impl TaskContext {
.context("Reference processor shut down; can't stream references!")?; .context("Reference processor shut down; can't stream references!")?;
self.as_event_processor() self.as_event_processor()
.process_refs(url.clone(), refs, today) .process_refs(url.clone(), refs, today, false)
.await?; .await
.context("Failure processing refs for Redirect")?;
Ok(NextAction::Continue) Ok(NextAction::Continue)
} }
@ -436,7 +433,9 @@ impl TaskContext {
// TODO(future) do we want to log this somewhere? // TODO(future) do we want to log this somewhere?
// or at least a metric // or at least a metric
let domain = get_reduced_domain(url)?; let domain = get_reduced_domain(url).with_context(|| {
format!("No domain in URL '{url}' for which we are processing the outcome!")
})?;
let url = url.clone(); let url = url.clone();
// TODO(feature) add 1.1× the previous backoff, if there was one. // TODO(feature) add 1.1× the previous backoff, if there was one.
@ -449,7 +448,8 @@ impl TaskContext {
txn.commit()?; txn.commit()?;
Ok(()) Ok(())
}) })
.await?; .await
.context("failed to store backoff")?;
// Change domain now // Change domain now
Ok(NextAction::ChangeDomain) Ok(NextAction::ChangeDomain)
@ -462,7 +462,8 @@ impl TaskContext {
.context("Rejection processor shut down; can't stream rejection!!")?; .context("Rejection processor shut down; can't stream rejection!!")?;
self.as_event_processor() self.as_event_processor()
.process_rejection(url.clone(), today) .process_rejection(url.clone(), today)
.await?; .await
.context("failed to process rejection for PermanentFailure")?;
// Reasons for permanent rejection aren't our fault or a site-wide fault; // Reasons for permanent rejection aren't our fault or a site-wide fault;
// so don't worry about carrying on. // so don't worry about carrying on.
@ -474,6 +475,7 @@ impl TaskContext {
fn as_event_processor(&self) -> EventProcessor { fn as_event_processor(&self) -> EventProcessor {
EventProcessor { EventProcessor {
store: Cow::Borrowed(&self.store), store: Cow::Borrowed(&self.store),
rerake_timings: &self.rerake_timings,
} }
} }
} }
@ -483,6 +485,7 @@ impl TaskContext {
/// just by replaying the stream of RakePacks and importing seeds. /// just by replaying the stream of RakePacks and importing seeds.
pub struct EventProcessor<'a> { pub struct EventProcessor<'a> {
store: Cow<'a, RakerStore>, store: Cow<'a, RakerStore>,
rerake_timings: &'a RerakeTimings,
} }
impl EventProcessor<'_> { impl EventProcessor<'_> {
@ -492,22 +495,28 @@ impl EventProcessor<'_> {
page: RakedPageEntry, page: RakedPageEntry,
datestamp: u16, datestamp: u16,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let rerake_on = Some(datestamp + self.rerake_timings.page);
self.store self.store
.as_ref() .as_ref()
.async_rw_txn(move |txn| { .async_rw_txn(move |txn| {
let domain = get_reduced_domain(&url)?; let domain = get_reduced_domain(&url).with_context(|| {
format!("No domain for URL '{url}' for which we are processing the page!")
})?;
txn.mark_url_as_visited( txn.mark_url_as_visited(
domain.as_ref(), domain.as_ref(),
url.as_ref(), url.as_ref(),
UrlVisitedRecord { UrlVisitedRecord {
last_visited_days: datestamp, last_visited_days: datestamp,
}, },
rerake_on,
)?; )?;
// If there's a favicon to be tried, add it to the list... // If there's a favicon to be tried, add it to the list...
let favicon_url_rel = page.document.head.effective_favicon_url(); let favicon_url_rel = page.document.head.effective_favicon_url();
if let Ok(favicon_url) = url.join(favicon_url_rel) { if let Ok(favicon_url) = url.join(favicon_url_rel) {
txn.enqueue_url(favicon_url.as_str(), None, RakeIntent::Icon)?; if SUPPORTED_SCHEMES.contains(&favicon_url.scheme()) {
txn.enqueue_url(favicon_url.as_str(), None, RakeIntent::Icon)?;
}
} }
txn.commit()?; txn.commit()?;
@ -517,16 +526,21 @@ impl EventProcessor<'_> {
} }
pub async fn process_icon(&self, url: Url, datestamp: u16) -> anyhow::Result<()> { pub async fn process_icon(&self, url: Url, datestamp: u16) -> anyhow::Result<()> {
let rerake_on = Some(datestamp + self.rerake_timings.icon);
self.store self.store
.as_ref() .as_ref()
.async_rw_txn(move |txn| { .async_rw_txn(move |txn| {
let domain = get_reduced_domain(&url)?; let domain = get_reduced_domain(&url).with_context(|| {
format!("No domain for URL '{url}' for which we are processing an icon!")
})?;
txn.mark_url_as_visited( txn.mark_url_as_visited(
domain.as_ref(), domain.as_ref(),
url.as_ref(), url.as_ref(),
UrlVisitedRecord { UrlVisitedRecord {
last_visited_days: datestamp, last_visited_days: datestamp,
}, },
rerake_on,
)?; )?;
txn.commit()?; txn.commit()?;
@ -540,49 +554,66 @@ impl EventProcessor<'_> {
url: Url, url: Url,
refs: RakedReferrerEntry, refs: RakedReferrerEntry,
datestamp: u16, datestamp: u16,
rerakeable_feed: bool,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let rerake_on = if rerakeable_feed {
Some(self.rerake_timings.feed)
} else {
None
};
self.store self.store
.as_ref() .as_ref()
.async_rw_txn(move |txn| { .async_rw_txn(move |txn| {
let domain = get_reduced_domain(&url)?; let domain = get_reduced_domain(&url).with_context(|| {
format!("No domain for URL '{url}' for which we are processing refs!")
})?;
txn.mark_url_as_visited( txn.mark_url_as_visited(
domain.as_ref(), domain.as_ref(),
url.as_ref(), url.as_ref(),
UrlVisitedRecord { UrlVisitedRecord {
last_visited_days: datestamp, last_visited_days: datestamp,
}, },
)?; rerake_on,
)
.context("failed to mark URL as visited")?;
// track all the referred-to URLs! // track all the referred-to URLs!
for reference in refs.references { for reference in refs.references {
let ref_url = Url::parse(&reference.target)?; let ref_url = Url::parse(&reference.target).with_context(|| {
let domain = get_reduced_domain(&ref_url)?; format!(
"failed to parse target URL of reference: {:?}",
reference.target
)
})?;
let domain = get_reduced_domain(&ref_url).with_context(|| {
format!("failed to reduce domain: {:?}", reference.target)
})?;
// First check if this URL is an allowed URL (hence should be enqueued) // Check if this URL is an allowed URL (hence should be enqueued)
let allowed = txn let allowed = txn
.get_allowed_domain_record(domain.borrow())? .get_domain_record(domain.borrow())?
.map(|record: AllowedDomainRecord| record.applies_to_url(&ref_url)) .map(|record: DomainRecord| record.is_url_rakeable(&ref_url))
.unwrap_or(false); .flatten();
if allowed {
let is_fresh = txn.enqueue_url(
&reference.target,
reference.last_mod,
reference.kind.into(),
)?;
if is_fresh {
increment_counter!("qprake_queue_new_url");
}
continue;
}
// Then check if this URL is a weed (hence should be ignored) match allowed {
let is_weed = txn Some(true) => {
.get_weed_domain_record(domain.borrow())? let is_fresh = txn.enqueue_url(
.map(|record: WeedDomainRecord| record.applies_to_url(&ref_url)) &reference.target,
.unwrap_or(false); reference.last_mod,
if !is_weed { reference.kind.into(),
// It's neither allowed nor weeded, so put it on hold for later inspection )?;
txn.put_url_on_hold(&reference.target, reference.kind.into())?; if is_fresh {
increment_counter!("qprake_queue_new_url");
}
continue;
}
Some(false) => {
// Weed! Do nothing.
}
None => {
// It's neither allowed nor weeded, so put it on hold for later inspection
txn.put_url_on_hold(&reference.target, reference.kind.into())?;
}
} }
} }
@ -596,13 +627,16 @@ impl EventProcessor<'_> {
self.store self.store
.as_ref() .as_ref()
.async_rw_txn(move |txn| { .async_rw_txn(move |txn| {
let domain = get_reduced_domain(&url)?; let domain = get_reduced_domain(&url).with_context(|| {
format!("No domain for URL '{url}' for which we are processing a rejection!")
})?;
txn.mark_url_as_visited( txn.mark_url_as_visited(
domain.as_ref(), domain.as_ref(),
url.as_ref(), url.as_ref(),
UrlVisitedRecord { UrlVisitedRecord {
last_visited_days: datestamp, last_visited_days: datestamp,
}, },
None,
)?; )?;
txn.commit()?; txn.commit()?;
Ok(()) Ok(())


@ -1,9 +1,9 @@
use crate::raking::{RakeIntent, TemporaryFailure}; use crate::raking::{RakeIntent, TemporaryFailure};
use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString, MdbxU32, MdbxU64}; use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString, MdbxU16BE, MdbxU32, MdbxU64};
use crate::storage::migrations::{MIGRATION_KEY, MIGRATION_VERSION}; use crate::storage::migrations::{MIGRATION_KEY, MIGRATION_VERSION};
use crate::storage::records::{ use crate::storage::records::{
ActiveDomainRecord, AllowedDomainRecord, BackingOffDomainRecord, OnHoldUrlRecord, ActiveDomainRecord, BackingOffDomainRecord, DomainRecord, OnHoldUrlRecord, QueueUrlRecord,
QueueUrlRecord, UrlVisitedRecord, WeedDomainRecord, UrlVisitedRecord,
}; };
use anyhow::{anyhow, bail, ensure, Context}; use anyhow::{anyhow, bail, ensure, Context};
use libmdbx::{ use libmdbx::{
@ -16,7 +16,7 @@ use ouroboros::self_referencing;
use quickpeep_utils::urls::get_reduced_domain; use quickpeep_utils::urls::get_reduced_domain;
use reqwest::Url; use reqwest::Url;
use std::borrow::{Borrow, Cow}; use std::borrow::{Borrow, Cow};
use std::collections::HashSet; use std::collections::{BTreeSet, HashSet};
use std::ops::Add; use std::ops::Add;
use std::path::Path; use std::path::Path;
use std::sync::atomic::AtomicU64; use std::sync::atomic::AtomicU64;
@ -32,6 +32,9 @@ pub mod records;
pub struct Databases<'env> { pub struct Databases<'env> {
/// Domain \n URL → QueueUrlRecord /// Domain \n URL → QueueUrlRecord
pub queue_urls: Database<'env>, pub queue_urls: Database<'env>,
/// u16 → URL (MULTI-VALUE; INT16). The u16 is the day-precision QuickPeep timestamp at which
/// the URL should be enqueued again for reraking.
pub rerake_queue: Database<'env>,
/// Domain → ActiveDomainRecord /// Domain → ActiveDomainRecord
pub active_domains: Database<'env>, pub active_domains: Database<'env>,
/// u32 → domain name. Used to try and give some fairness. /// u32 → domain name. Used to try and give some fairness.
@ -42,18 +45,17 @@ pub struct Databases<'env> {
pub backing_off_domains: Database<'env>, pub backing_off_domains: Database<'env>,
/// URL → VisitedDomainRecord /// URL → VisitedDomainRecord
pub visited_urls: Database<'env>, pub visited_urls: Database<'env>,
/// Domain → AllowedDomainRecord /// Domain → DomainRecord
pub allowed_domains: Database<'env>, pub domains: Database<'env>,
/// Domain \n URL → OnHoldUrlRecord Number of refs (INT VALUE) /// Domain \n URL → OnHoldUrlRecord Number of refs (INT VALUE)
pub urls_on_hold: Database<'env>, pub urls_on_hold: Database<'env>,
/// Domain → WeedDomainRecord
pub weed_domains: Database<'env>,
} }
impl<'env> Databases<'env> { impl<'env> Databases<'env> {
pub fn iter_all_databases(&self) -> impl Iterator<Item = (&'static str, &Database<'env>)> { pub fn iter_all_databases(&self) -> impl Iterator<Item = (&'static str, &Database<'env>)> {
[ [
("queue_urls", &self.queue_urls), ("queue_urls", &self.queue_urls),
("rerake_queue", &self.rerake_queue),
("active_domains", &self.active_domains), ("active_domains", &self.active_domains),
("active_domain_raffle", &self.active_domain_raffle), ("active_domain_raffle", &self.active_domain_raffle),
( (
@ -62,9 +64,8 @@ impl<'env> Databases<'env> {
), ),
("backing_off_domains", &self.backing_off_domains), ("backing_off_domains", &self.backing_off_domains),
("visited_urls", &self.visited_urls), ("visited_urls", &self.visited_urls),
("allowed_domains", &self.allowed_domains), ("domains", &self.domains),
("urls_on_hold", &self.urls_on_hold), ("urls_on_hold", &self.urls_on_hold),
("weed_domains", &self.weed_domains),
] ]
.into_iter() .into_iter()
} }
@ -73,6 +74,7 @@ impl<'env> Databases<'env> {
// Must match the order of the Databases struct fields. // Must match the order of the Databases struct fields.
pub const DATABASES: [(&'static str, DatabaseFlags); 9] = [ pub const DATABASES: [(&'static str, DatabaseFlags); 9] = [
("urls_queue", DatabaseFlags::empty()), ("urls_queue", DatabaseFlags::empty()),
("rerake_queue", DatabaseFlags::DUP_SORT),
("active_domains", DatabaseFlags::empty()), ("active_domains", DatabaseFlags::empty()),
("active_domain_raffle", DatabaseFlags::INTEGER_KEY), ("active_domain_raffle", DatabaseFlags::INTEGER_KEY),
( (
@ -81,9 +83,8 @@ pub const DATABASES: [(&'static str, DatabaseFlags); 9] = [
), ),
("backing_off_domains", DatabaseFlags::empty()), ("backing_off_domains", DatabaseFlags::empty()),
("urls_visited", DatabaseFlags::empty()), ("urls_visited", DatabaseFlags::empty()),
("allowed_domains", DatabaseFlags::empty()), ("domains", DatabaseFlags::empty()),
("urls_on_hold", DatabaseFlags::empty()), ("urls_on_hold", DatabaseFlags::empty()),
("weed_domains", DatabaseFlags::empty()),
]; ];
#[self_referencing] #[self_referencing]
@ -176,14 +177,14 @@ impl RakerStore {
// Must match the order of the DATABASES constant and the struct field definitions // Must match the order of the DATABASES constant and the struct field definitions
Databases { Databases {
queue_urls: dbs.next().unwrap(), queue_urls: dbs.next().unwrap(),
rerake_queue: dbs.next().unwrap(),
active_domains: dbs.next().unwrap(), active_domains: dbs.next().unwrap(),
active_domain_raffle: dbs.next().unwrap(), active_domain_raffle: dbs.next().unwrap(),
backing_off_reinstatements: dbs.next().unwrap(), backing_off_reinstatements: dbs.next().unwrap(),
backing_off_domains: dbs.next().unwrap(), backing_off_domains: dbs.next().unwrap(),
visited_urls: dbs.next().unwrap(), visited_urls: dbs.next().unwrap(),
allowed_domains: dbs.next().unwrap(), domains: dbs.next().unwrap(),
urls_on_hold: dbs.next().unwrap(), urls_on_hold: dbs.next().unwrap(),
weed_domains: dbs.next().unwrap(),
} }
}, },
} }
@ -337,9 +338,11 @@ impl<'a> RakerTxn<'a, RW> {
domain: &str, domain: &str,
url_str: &str, url_str: &str,
record: UrlVisitedRecord, record: UrlVisitedRecord,
rerake_on: Option<u16>,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let queue_urls = &self.mdbx.borrow_dbs().queue_urls; let queue_urls = &self.mdbx.borrow_dbs().queue_urls;
let visited_urls = &self.mdbx.borrow_dbs().visited_urls; let visited_urls = &self.mdbx.borrow_dbs().visited_urls;
let rerake_queue = &self.mdbx.borrow_dbs().rerake_queue;
let queue_key = format!("{}\n{}", domain, url_str); let queue_key = format!("{}\n{}", domain, url_str);
@ -358,6 +361,24 @@ impl<'a> RakerTxn<'a, RW> {
WriteFlags::empty(), WriteFlags::empty(),
)?; )?;
if let Some(rerake_on) = rerake_on {
self.mdbx_txn.put(
rerake_queue,
&rerake_on.to_be_bytes(),
url_str.as_bytes(),
WriteFlags::empty(),
)?;
}
Ok(())
}
/// Marks a URL as visited and takes it out of the queue.
pub fn dequeue_url(&self, domain: &str, url_str: &str) -> anyhow::Result<()> {
let queue_urls = &self.mdbx.borrow_dbs().queue_urls;
let queue_key = format!("{}\n{}", domain, url_str);
self.mdbx_txn.del(&queue_urls, queue_key.as_bytes(), None)?;
Ok(()) Ok(())
} }
@ -434,6 +455,54 @@ impl<'a> RakerTxn<'a, RW> {
Ok(None) Ok(None)
} }
/// Reinstates URLs that are now re-rakable.
pub fn reinstate_rerakables(&self, today: u16) -> anyhow::Result<()> {
let queue_urls = &self.mdbx.borrow_dbs().queue_urls;
let rerake_queue = &self.mdbx.borrow_dbs().rerake_queue;
let mut reinstatable_domains: BTreeSet<String> = BTreeSet::new();
let mut cur = self.mdbx_txn.cursor(rerake_queue)?;
cur.first::<MdbxU16BE, MdbxString>()?;
loop {
let (MdbxU16BE(rerake_datestamp), url_to_rerake) =
match cur.get_current::<MdbxU16BE, MdbxString>()? {
Some(x) => x,
None => break,
};
if rerake_datestamp > today {
break;
}
let url_str = url_to_rerake.into_string();
let url = Url::parse(&url_str).context("Failed to parse rerakable URL")?;
let url_domain =
get_reduced_domain(&url).context("Unable to reduce domain for rerakable URL")?;
self.mdbx_txn.put(
queue_urls,
format!("{}\n{}", url_domain, url_str).as_bytes(),
// TODO(correctness): should specify the same intent as before.
&MdbxBare(QueueUrlRecord {
intent: RakeIntent::Any,
})
.as_bytes(),
WriteFlags::NO_OVERWRITE,
)?;
reinstatable_domains.insert(url_domain.into_owned());
cur.del(WriteFlags::empty())?;
}
for domain in reinstatable_domains {
self.insert_active_domain_with_new_raffle_ticket(domain)?;
}
Ok(())
}
/// Enqueues a URL. /// Enqueues a URL.
/// If `only_if_not_visited_since` is specified, then this is a no-op if the page has already been /// If `only_if_not_visited_since` is specified, then this is a no-op if the page has already been
/// visited since then. /// visited since then.
@ -451,7 +520,8 @@ impl<'a> RakerTxn<'a, RW> {
let visited_urls = &self.mdbx.borrow_dbs().visited_urls; let visited_urls = &self.mdbx.borrow_dbs().visited_urls;
let url = Url::parse(url_str)?; let url = Url::parse(url_str)?;
let url_domain = get_reduced_domain(&url)?; let url_domain = get_reduced_domain(&url)
.with_context(|| format!("No domain for to-be-enqueued URL: '{url}'!"))?;
let queue_key = format!("{}\n{}", url_domain, url); let queue_key = format!("{}\n{}", url_domain, url);
@ -503,7 +573,8 @@ impl<'a> RakerTxn<'a, RW> {
let urls_on_hold = &self.mdbx.borrow_dbs().urls_on_hold; let urls_on_hold = &self.mdbx.borrow_dbs().urls_on_hold;
let url = Url::parse(url_str)?; let url = Url::parse(url_str)?;
let url_domain = get_reduced_domain(&url)?; let url_domain = get_reduced_domain(&url)
.with_context(|| format!("No domain for to-be-put-on-hold URL: '{url}'!"))?;
let queue_key = format!("{}\n{}", url_domain, url); let queue_key = format!("{}\n{}", url_domain, url);
@ -536,33 +607,17 @@ impl<'a> RakerTxn<'a, RW> {
Ok(is_new) Ok(is_new)
} }
pub fn put_allowed_domain_record( pub fn put_domain_record(
&self, &self,
domain: &str, domain: &str,
allowed_domain_record: AllowedDomainRecord, domain_record: DomainRecord,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let allowed_domains = &self.mdbx.borrow_dbs().allowed_domains; let domains = &self.mdbx.borrow_dbs().domains;
self.mdbx_txn.put( self.mdbx_txn.put(
allowed_domains, domains,
domain.as_bytes(), domain.as_bytes(),
MdbxBare(allowed_domain_record).as_bytes(), MdbxBare(domain_record).as_bytes(),
WriteFlags::empty(),
)?;
Ok(())
}
pub fn put_weed_domain_record(
&self,
domain: &str,
weed_domain_record: WeedDomainRecord,
) -> anyhow::Result<()> {
let weed_domains = &self.mdbx.borrow_dbs().weed_domains;
self.mdbx_txn.put(
weed_domains,
domain.as_bytes(),
MdbxBare(weed_domain_record).as_bytes(),
WriteFlags::empty(), WriteFlags::empty(),
)?; )?;
Ok(()) Ok(())
@ -712,27 +767,12 @@ impl<'a, K: TransactionKind> RakerTxn<'a, K> {
} }
} }
pub fn get_allowed_domain_record( pub fn get_domain_record(&self, domain: &str) -> anyhow::Result<Option<DomainRecord>> {
&self, let domains = &self.mdbx.borrow_dbs().domains;
domain: &str,
) -> anyhow::Result<Option<AllowedDomainRecord>> {
let allowed_domains = &self.mdbx.borrow_dbs().allowed_domains;
match self match self
.mdbx_txn .mdbx_txn
.get::<MdbxBare<AllowedDomainRecord>>(allowed_domains, domain.as_bytes())? .get::<MdbxBare<DomainRecord>>(domains, domain.as_bytes())?
{
None => Ok(None),
Some(MdbxBare(record)) => Ok(Some(record)),
}
}
pub fn get_weed_domain_record(&self, domain: &str) -> anyhow::Result<Option<WeedDomainRecord>> {
let weed_domains = &self.mdbx.borrow_dbs().weed_domains;
match self
.mdbx_txn
.get::<MdbxBare<WeedDomainRecord>>(weed_domains, domain.as_bytes())?
{ {
None => Ok(None), None => Ok(None),
Some(MdbxBare(record)) => Ok(Some(record)), Some(MdbxBare(record)) => Ok(Some(record)),


@ -1,9 +1,8 @@
use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString}; use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString};
use crate::storage::records::{AllowedDomainRecord, OnHoldUrlRecord, WeedDomainRecord}; use crate::storage::records::{DomainRecord, OnHoldUrlRecord};
use crate::storage::RakerTxn; use crate::storage::RakerTxn;
use anyhow::Context; use anyhow::Context;
use libmdbx::{Database, WriteFlags, RW}; use libmdbx::{Database, WriteFlags, RW};
use log::warn;
use reqwest::Url; use reqwest::Url;
/// Runs one big transaction that: /// Runs one big transaction that:
@ -16,8 +15,7 @@ use reqwest::Url;
pub fn reapply_seeds_and_weeds_to_on_hold_urls(txn: RakerTxn<RW>) -> anyhow::Result<()> { pub fn reapply_seeds_and_weeds_to_on_hold_urls(txn: RakerTxn<RW>) -> anyhow::Result<()> {
struct DomainState { struct DomainState {
pub domain: String, pub domain: String,
pub allowed_domain_record: Option<AllowedDomainRecord>, pub domain_record: Option<DomainRecord>,
pub weed_domain_record: Option<WeedDomainRecord>,
} }
let urls_on_hold: &Database = &txn.mdbx.borrow_dbs().urls_on_hold; let urls_on_hold: &Database = &txn.mdbx.borrow_dbs().urls_on_hold;
@ -47,44 +45,33 @@ pub fn reapply_seeds_and_weeds_to_on_hold_urls(txn: RakerTxn<RW>) -> anyhow::Res
// Then load the relevant records for it. // Then load the relevant records for it.
domain_state = Some(DomainState { domain_state = Some(DomainState {
domain: domain.to_owned(), domain: domain.to_owned(),
allowed_domain_record: txn.get_allowed_domain_record(domain)?, domain_record: txn.get_domain_record(domain)?,
weed_domain_record: txn.get_weed_domain_record(domain)?,
}); });
} }
let url = Url::parse(url_str)?; let url = Url::parse(url_str)?;
let domain_state = domain_state.as_ref().unwrap(); let domain_state = domain_state.as_ref().unwrap();
let is_allowed = domain_state
.allowed_domain_record
.as_ref()
.map(|adr: &AllowedDomainRecord| adr.applies_to_url(&url))
.unwrap_or(false);
let is_weed = domain_state
.weed_domain_record
.as_ref()
.map(|wdr: &WeedDomainRecord| wdr.applies_to_url(&url))
.unwrap_or(false);
match (is_allowed, is_weed) { let is_rakeable = domain_state
(false, false) => { /* nop */ } .domain_record
(true, true) => { .as_ref()
warn!( .map(|dr: &DomainRecord| dr.is_url_rakeable(&url))
"Ambiguous: {:?} is both mentioned by a seed and a weed. Ignoring.", .flatten();
url
); match is_rakeable {
} Some(true) => {
(true, false) => {
// ALLOWED // ALLOWED
// Make it a queued URL // Make it a queued URL
txn.enqueue_url(url_str, None, record.queue_record.intent)?; txn.enqueue_url(url_str, None, record.queue_record.intent)?;
cur.del(WriteFlags::empty())?; cur.del(WriteFlags::empty())?;
} }
(false, true) => { Some(false) => {
// WEED // WEED
// Just delete // Just delete
cur.del(WriteFlags::empty())?; cur.del(WriteFlags::empty())?;
} }
None => { /* nop: neither allowed nor a weed. Keep on hold. */ }
} }
} }


@ -4,6 +4,32 @@ use serde::de::DeserializeOwned;
use serde::Serialize; use serde::Serialize;
use std::borrow::Cow; use std::borrow::Cow;
/// u16 in BIG byte endianness (u16 not supported by INTEGERKEY mode!)
#[derive(Copy, Clone, Debug)]
pub struct MdbxU16BE(pub u16);
impl MdbxU16BE {
pub fn as_bytes(&self) -> Cow<'_, [u8]> {
Cow::Owned(self.0.to_be_bytes().to_vec())
}
}
impl TableObject<'_> for MdbxU16BE {
fn decode(data_val: &[u8]) -> Result<Self, libmdbx::Error>
where
Self: Sized,
{
if data_val.len() != 2 {
return Err(libmdbx::Error::DecodeError(
anyhow!("MDBX Key not 2 bytes; can't be decoded as u16").into(),
));
}
let mut buf = [0u8; 2];
buf.copy_from_slice(&data_val);
Ok(MdbxU16BE(u16::from_be_bytes(buf)))
}
}
/// u32 in native byte endianness (as required by INTEGERKEY mode) /// u32 in native byte endianness (as required by INTEGERKEY mode)
#[derive(Copy, Clone, Debug)] #[derive(Copy, Clone, Debug)]
pub struct MdbxU32(pub u32); pub struct MdbxU32(pub u32);
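
The big-endian encoding is what keeps the rerake queue scannable in date order: MDBX compares ordinary (non-INTEGERKEY) keys bytewise, and big-endian bytes sort the same way as the numbers they encode, so the cursor walk in reinstate_rerakables sees datestamps in ascending order. A quick check of that property:

    fn main() {
        // Byte-wise order of big-endian encodings matches numeric order...
        assert!(255u16.to_be_bytes() < 256u16.to_be_bytes());
        // ...whereas little-endian (the native order on x86) does not.
        assert!(255u16.to_le_bytes() > 256u16.to_le_bytes());
    }
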


@ -1,7 +1,7 @@
use crate::raking::{RakeIntent, TemporaryFailure}; use crate::raking::{RakeIntent, TemporaryFailure};
use reqwest::Url; use reqwest::Url;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::collections::BTreeSet; use std::collections::BTreeMap;
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)] #[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
pub struct ActiveDomainRecord { pub struct ActiveDomainRecord {
@ -11,11 +11,10 @@ pub struct ActiveDomainRecord {
#[derive(Clone, Debug, Deserialize, Serialize)] #[derive(Clone, Debug, Deserialize, Serialize)]
pub struct UrlVisitedRecord { pub struct UrlVisitedRecord {
/// Number of minutes since the QuickPeep Epoch that this page was last raked at. /// Number of days since the QuickPeep Epoch that this page was last raked at.
/// We store minutes to give us 60× the range of times. /// A u16 is fine here, giving 179 years worth of values. This allows compact encoding.
/// We'd really rather stick with 32-bit ints to reduce the space storage requirements. /// We don't really care about a more granular timestamp: sitemaps and feeds usually only
/// We could *possibly* go for a u16 in the future and store number of days (179 years' range): /// give the date of last update anyway.
/// sitemaps and feeds usually only tell you the date the page was last updated.
pub last_visited_days: u16, pub last_visited_days: u16,
} }
@ -47,26 +46,20 @@ pub struct BackingOffDomainRecord {
} }
#[derive(Clone, Debug, Serialize, Deserialize, Default)] #[derive(Clone, Debug, Serialize, Deserialize, Default)]
pub struct AllowedDomainRecord { pub struct DomainRecord {
/// Set of acceptable path prefixes. pub rakeable_path_prefixes: BTreeMap<String, bool>,
/// Empty if ALL path prefixes are permitted.
pub restricted_prefixes: BTreeSet<String>,
} }
impl AllowedDomainRecord { impl DomainRecord {
/// Returns true iff this record applies to this URL. /// Returns whether the URL is rakeable.
/// ///
/// Preconditions: it has been checked that the record applies to the domain /// Preconditions: it has been checked that the record applies to the domain
pub fn applies_to_url(&self, url: &Url) -> bool { pub fn is_url_rakeable(&self, url: &Url) -> Option<bool> {
if self.restricted_prefixes.is_empty() { let mut final_result = None;
return true; // TODO This could be made more efficient.
} for (prefix, &rakeable) in self.rakeable_path_prefixes.iter() {
let mut applies = false;
for prefix in self.restricted_prefixes.iter() {
if url.path().starts_with(prefix) { if url.path().starts_with(prefix) {
applies = true; final_result = Some(rakeable);
break;
} }
if prefix.as_str() > url.path() { if prefix.as_str() > url.path() {
// e.g. /dog > /cat/xyz // e.g. /dog > /cat/xyz
@ -75,39 +68,6 @@ impl AllowedDomainRecord {
break; break;
} }
} }
applies final_result
}
}
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
pub struct WeedDomainRecord {
/// Set of weedy path prefixes.
/// Empty if ALL path prefixes are weedy.
pub restricted_prefixes: BTreeSet<String>,
}
impl WeedDomainRecord {
/// Returns true iff this record applies to this URL.
///
/// Preconditions: it has been checked that the record applies to the domain
pub fn applies_to_url(&self, url: &Url) -> bool {
if self.restricted_prefixes.is_empty() {
return true;
}
let mut applies = false;
for prefix in self.restricted_prefixes.iter() {
if url.path().starts_with(prefix) {
applies = true;
break;
}
if prefix.as_str() > url.path() {
// e.g. /dog > /cat/xyz
// This means we've missed all chances to see our prefix,
// so we break here (efficiency).
break;
}
}
applies
} }
} }
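
The merged DomainRecord replaces the old allowed/weed pair with a single map from path prefix to a rakeable flag: is_url_rakeable returns Some(true) for seeded prefixes, Some(false) for weeded ones, and None when nothing matches (the URL stays on hold). A minimal standalone sketch of that behaviour, using a free function instead of the real types:

    use std::collections::BTreeMap;

    // Standalone sketch of DomainRecord::is_url_rakeable: prefixes are scanned in
    // sorted order, so among matching prefixes the longest (last) one wins.
    fn is_url_rakeable(prefixes: &BTreeMap<String, bool>, path: &str) -> Option<bool> {
        let mut result = None;
        for (prefix, &rakeable) in prefixes {
            if path.starts_with(prefix.as_str()) {
                result = Some(rakeable);
            }
            if prefix.as_str() > path {
                // No later prefix can match any more (efficiency).
                break;
            }
        }
        result
    }

    fn main() {
        let mut prefixes = BTreeMap::new();
        prefixes.insert(String::new(), true);         // seed: the whole domain
        prefixes.insert("/ads/".to_string(), false);  // weed: but not /ads/
        assert_eq!(is_url_rakeable(&prefixes, "/blog/post"), Some(true));
        assert_eq!(is_url_rakeable(&prefixes, "/ads/banner"), Some(false));
        assert_eq!(is_url_rakeable(&BTreeMap::new(), "/anything"), None); // on hold
    }
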


@ -0,0 +1,132 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{{ search_term }} — QuickPeep</title>
<link rel="stylesheet" type="text/css" href="dist/main.css">
</head>
<body>
<!-- Header -->
<div class="container_overall">
<div class="left_side_container">
<header>
<form method="GET" action="search">
<fieldset class="horizontal">
<a href="/" title="QuickPeep"><img src="/static/quickpeep_logo_sml.png" alt="QuickPeep Logo" class="bar_logo"></a>
<input type="search" id="search" name="q" placeholder="..." value="{{ search_term }}" class="grow">
<input type="submit" value="Search" class="shrink">
</fieldset>
</form>
</header><!-- ./ Header -->
<!-- Main -->
<main class="search">
{% if show_spiel %}
<p>
QuickPeep is a hobbyist, open-source and very immature (for now) web search engine. It's intended to help you encounter webpages that are interesting and from a real person, rather than from a 'content mill' or other source of SEO spam. In general, websites that don't respect the reader are unwelcome.
</p>
<p>
QuickPeep's approach to rubbish websites is to 'just' not index them! This also helps with another goal of the project, which is to allow anyone to run an instance of QuickPeep with only modest hardware requirements (especially storage space which could easily be problematic).
</p>
<p>
This is an ambitious project and it is probably not very usable right now. It may never be. With that said, I'm hoping to see how far I can take it.
</p>
<p>
There is an <a href="https://o.librepush.net/blog/2022-07-02-quickpeep-small-scale-web-search-engine">article introducing the project on my personal blog</a>.<br>
The source code is <a href="https://git.emunest.net/reivilibre/quickpeep.git">available on my personal Gitea instance</a>.
</p>
{% endif %}
<ul class="search_results">
<li>
<img src="/icon.webp?b={{ result.favicon_url }}">
<div class="result_title"><a href="{{ result.url }}" rel="nofollow noreferrer">{{ result.title }}</a></div>
<div class="result_excerpt">
{{- result.excerpt|safe -}}
</div>
<ul class="result_tags">
{%- for tag in result.tags -%}
<li>{{ tag }}</li>
{%- endfor -%}
</ul>
<div class="result_url">{{ result.url }}</div>
</li>
<li>
<img src="/icon.webp?b={{ result.favicon_url }}">
<div class="result_title"><a href="{{ result.url }}" rel="nofollow noreferrer">{{ result.title }}</a></div>
<div class="result_excerpt">
{{- result.excerpt|safe -}}
</div>
<ul class="result_tags">
{%- for tag in result.tags -%}
<li>{{ tag }}</li>
{%- endfor -%}
</ul>
<div class="result_url">{{ result.url }}</div>
</li>
<li>
<img src="/icon.webp?b={{ result.favicon_url }}">
<div class="result_title"><a href="{{ result.url }}" rel="nofollow noreferrer">{{ result.title }}</a></div>
<div class="result_excerpt">
{{- result.excerpt|safe -}}
</div>
<ul class="result_tags">
<li>{{ tag }}</li>
<li>{{ tag }}</li>
<li>{{ tag }}</li>
<li>{{ tag }}</li>
<li>{{ tag }}</li>
</ul>
<div class="result_url">{{ result.url }}</div>
</li>
<li>
<img src="/icon.webp?b={{ result.favicon_url }}">
<div class="result_title"><a href="{{ result.url }}" rel="nofollow noreferrer">{{ result.title }}</a></div>
<div class="result_excerpt">
{{- result.excerpt|safe -}}
</div>
<ul class="result_tags">
<li>{{ tag }}</li>
<li>{{ tag }}</li>
<li>{{ tag }}</li>
<li>{{ tag }}</li>
<li>{{ tag }}</li>
</ul>
<div class="result_url">{{ result.url }}</div>
</li>
<li>
<img src="/icon.webp?b={{ result.favicon_url }}">
<div class="result_title"><a href="{{ result.url }}" rel="nofollow noreferrer">{{ result.title }}</a></div>
<div class="result_excerpt">
{{- result.excerpt|safe -}}
</div>
<ul class="result_tags">
<li>{{ tag }}</li>
<li>{{ tag }}</li>
<li>{{ tag }}</li>
<li>{{ tag }}</li>
<li>{{ tag }}</li>
</ul>
<div class="result_url">{{ result.url }}</div>
</li>
</ul>
</main>
</div>
<div class="right_side_container">
<!-- Preview pane -->
</div>
</div>
<footer class="container">
{% for (method, url) in contact %}
<a href="{{ url }}">{{ method }}</a>
{% endfor %}
<a href="/">Return to QuickPeep Root</a>
</footer>
</body>
</html>


@ -0,0 +1,10 @@
.result_title a {
color: palegoldenrod;
}
.result_tags {
> li {
color: palegreen;
}
}


@ -4,6 +4,11 @@
   --typography-spacing-vertical: 1rem;
 }
+// light green theming
+$primary-500: #8bc34a;
+$primary-600: #7cb342;
+$primary-700: #689f38;
 .bar_happy {
   padding: 1em;
   margin: 3em;
@ -16,8 +21,6 @@
   }
 }
 @media only screen and (max-width: 960px) {
   .left_side_container {
     @extends(.container);
@ -91,7 +94,7 @@ ul.search_results {
   }
   .result_title a {
-    color: palegoldenrod;
+    color: brown;
   }
   .result_excerpt {
@ -114,7 +117,7 @@ ul.search_results {
     list-style-type: none;
     display: inline-block;
     //background-color: palegreen;
-    color: palegreen;
+    color: darkgreen;
     //padding: 0.2em;
     //border-radius: 8px;
@ -128,3 +131,14 @@ ul.search_results {
     margin-right: 0.4em;
   }
 }
+@media only screen and (prefers-color-scheme: dark) {
+  :root:not([data-theme=light]) {
+    @import "dark.scss";
+  }
+}
+[data-theme=dark] {
+  @import "dark.scss";
+}

View File

@ -1122,11 +1122,6 @@ mdn-data@2.0.14:
resolved "https://registry.yarnpkg.com/mdn-data/-/mdn-data-2.0.14.tgz#7113fc4281917d63ce29b43446f701e68c25ba50" resolved "https://registry.yarnpkg.com/mdn-data/-/mdn-data-2.0.14.tgz#7113fc4281917d63ce29b43446f701e68c25ba50"
integrity sha512-dn6wd0uw5GsdswPFfsgMp5NSB0/aDe6fK94YJV/AJDYXL6HVLWBsxeq7js7Ad+mU2K9LAlwpk6kN2D5mwCPVow== integrity sha512-dn6wd0uw5GsdswPFfsgMp5NSB0/aDe6fK94YJV/AJDYXL6HVLWBsxeq7js7Ad+mU2K9LAlwpk6kN2D5mwCPVow==
mini.css@^3.0.1:
version "3.0.1"
resolved "https://registry.yarnpkg.com/mini.css/-/mini.css-3.0.1.tgz#f6236e99997bbd19484d5655d087ec96b887af68"
integrity sha512-FmuuBL0wuyDO1UA66TkAo8w2RxxuHmNPaUqUHcYlHtM9CJkrscQaNAJ/ParEahYFwtZOSgfEA7flbMoSPkzrPA==
minimist@^1.2.5: minimist@^1.2.5:
version "1.2.5" version "1.2.5"
resolved "https://registry.yarnpkg.com/minimist/-/minimist-1.2.5.tgz#67d66014b66a6a8aaa0c083c5fd58df4e4e97602" resolved "https://registry.yarnpkg.com/minimist/-/minimist-1.2.5.tgz#67d66014b66a6a8aaa0c083c5fd58df4e4e97602"

1717 quickpeep_static/yarn.nix Normal file

File diff suppressed because it is too large.

View File

@ -60,6 +60,8 @@ pub struct RakedReference {
 pub enum ReferenceKind {
     /// Canonical URL for the same document, as declared in the page.
     CanonicalUrl,
+    /// HTTP -> HTTPS upgrade, automatically caused by QuickPeep
+    SecureUpgrade,
     /// HTTP-level redirect.
     Redirect,
     /// Link in a page (<a>). Could be to another page or to a feed.
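(Not from the repository: a minimal Rust sketch, assuming the url crate, of how an automatic HTTP to HTTPS upgrade could produce a reference tagged with the new SecureUpgrade variant. The secure_upgrade helper and the trimmed-down enum are invented for illustration only.)

// Illustrative only: a trimmed copy of ReferenceKind and an invented helper
// showing how an automatic HTTP -> HTTPS upgrade could be recorded.
use url::Url;

#[allow(dead_code)]
#[derive(Debug)]
enum ReferenceKind {
    CanonicalUrl,
    SecureUpgrade,
    Redirect,
}

/// Hypothetical helper: upgrade an http:// URL to https:// and tag the
/// resulting reference as a SecureUpgrade.
fn secure_upgrade(url: &Url) -> Option<(Url, ReferenceKind)> {
    if url.scheme() != "http" {
        return None;
    }
    let mut upgraded = url.clone();
    // http and https are both 'special' schemes, so this change is accepted.
    upgraded.set_scheme("https").ok()?;
    Some((upgraded, ReferenceKind::SecureUpgrade))
}

fn main() {
    let url = Url::parse("http://example.org/page").unwrap();
    if let Some((https_url, kind)) = secure_upgrade(&url) {
        println!("{https_url} ({kind:?})");
    }
}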

View File

@ -1,40 +0,0 @@
use std::borrow::{Borrow, BorrowMut};

pub struct DirtyTracker<T> {
    inner: T,
    dirty: bool,
}

impl<T> Borrow<T> for DirtyTracker<T> {
    fn borrow(&self) -> &T {
        &self.inner
    }
}

impl<T> BorrowMut<T> for DirtyTracker<T> {
    fn borrow_mut(&mut self) -> &mut T {
        self.dirty = true;
        &mut self.inner
    }
}

impl<T> DirtyTracker<T> {
    pub fn new(inner: T) -> DirtyTracker<T> {
        DirtyTracker {
            inner,
            dirty: false,
        }
    }

    pub fn is_dirty(&self) -> bool {
        self.dirty
    }

    pub fn make_clean(&mut self) {
        self.dirty = false;
    }

    pub fn into_inner(self) -> T {
        self.inner
    }
}
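(For context, a hedged usage sketch of the removed DirtyTracker, not taken from the repository. It assumes the definition shown in the deleted file above; taking a mutable borrow through BorrowMut is what marks the value dirty.)

use std::borrow::BorrowMut;

fn main() {
    // Assumes the DirtyTracker definition shown in the removed file above.
    let mut tracked = DirtyTracker::new(Vec::<u32>::new());
    assert!(!tracked.is_dirty());

    // Taking a mutable borrow flips the dirty flag, even if nothing changes.
    // Fully-qualified call disambiguates from the std blanket BorrowMut impl.
    let inner = BorrowMut::<Vec<u32>>::borrow_mut(&mut tracked);
    inner.push(1);
    assert!(tracked.is_dirty());

    tracked.make_clean();
    assert!(!tracked.is_dirty());
}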

View File

@ -1,4 +1,3 @@
 pub mod dates;
-pub mod dirty;
 pub mod lazy;
 pub mod urls;

View File

@ -1,11 +1,11 @@
-use anyhow::Context;
 use std::borrow::Cow;
 use url::Url;
-pub fn get_reduced_domain(url: &Url) -> anyhow::Result<Cow<'_, str>> {
+pub fn get_reduced_domain(url: &Url) -> Option<Cow<'_, str>> {
-    let domain = url.domain().context("URLs must have domains")?;
+    // If the URL does not have a host or not a domain (e.g. IP address) then exits with None here.
+    let domain = url.domain()?;
-    Ok(Cow::Borrowed(match domain.strip_prefix("www.") {
+    Some(Cow::Borrowed(match domain.strip_prefix("www.") {
         Some(stripped) => stripped,
         None => domain,
     }))
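(A caller-side sketch, not from the repository: with the new Option-returning signature, callers that genuinely need a domain can turn the None case back into an error themselves. The reduced_domain_or_err helper and its error message are assumptions for illustration; the copied get_reduced_domain body matches the diff above.)

use std::borrow::Cow;

use anyhow::Context;
use url::Url;

// Copy of get_reduced_domain as it appears after this change.
pub fn get_reduced_domain(url: &Url) -> Option<Cow<'_, str>> {
    let domain = url.domain()?;
    Some(Cow::Borrowed(match domain.strip_prefix("www.") {
        Some(stripped) => stripped,
        None => domain,
    }))
}

// Assumed caller-side pattern: only where a domain is genuinely required,
// convert the None case (no host, or an IP-address host) back into an error.
fn reduced_domain_or_err(url: &Url) -> anyhow::Result<String> {
    let domain = get_reduced_domain(url)
        .with_context(|| format!("URL has no domain: {url}"))?;
    Ok(domain.into_owned())
}

fn main() -> anyhow::Result<()> {
    let with_domain = Url::parse("https://www.example.org/x")?;
    let ip_only = Url::parse("http://127.0.0.1/x")?;
    println!("{}", reduced_domain_or_err(&with_domain)?); // prints "example.org"
    assert!(reduced_domain_or_err(&ip_only).is_err());
    Ok(())
}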

47 shell.nix Normal file
View File

@ -0,0 +1,47 @@
{ pkgs ? import <nixpkgs> {} }:

let
  # We may need some packages from nixpkgs-unstable
  #unstable = import <nixpkgs-unstable> {};

  rust-toolchain = pkgs.symlinkJoin {
    name = "rust-toolchain";
    paths = [pkgs.rustc pkgs.cargo pkgs.rustfmt pkgs.rustPlatform.rustcSrc];
  };
in
pkgs.mkShell {
  buildInputs = [
    rust-toolchain
    pkgs.pkg-config
    #pkgs.libclang # ??
  ];

  nativeBuildInputs = [
    pkgs.openssl
  ];

  LIBCLANG_PATH="${pkgs.llvmPackages_latest.libclang.lib}/lib";

  # Cargo culted:
  # Add to rustc search path
  # RUSTFLAGS = (builtins.map (a: ''-L ${a}/lib'') [
  # ]);

  # Add to bindgen search path
  BINDGEN_EXTRA_CLANG_ARGS =
    # Includes with normal include path
    (builtins.map (a: ''-I"${a}/include"'') [
      pkgs.glibc.dev
    ])
    # Includes with special directory paths
    ++ [
      ''-I"${pkgs.llvmPackages_latest.libclang.lib}/lib/clang/${pkgs.llvmPackages_latest.libclang.version}/include"''
      #''-I"${pkgs.glib.dev}/include/glib-2.0"''
      #''-I${pkgs.glib.out}/lib/glib-2.0/include/''
    ];
}
}