Compare commits

..

135 Commits

Author SHA1 Message Date
0e3191a0d5 Accept overhaul (v0.7.x): Merge branch 'rei/overhaul' into develop 2025-09-20 20:28:45 +01:00
0873997f1e Track statistics when integrating pointers
Signed-off-by: Olivier <olivier@librepush.net>
2024-09-29 09:53:24 +01:00
9d06016d06 Add TODOs for performance aspects
Signed-off-by: Olivier <olivier@librepush.net>
2024-09-29 00:10:20 +01:00
b410114523 Add log lines with timings
Signed-off-by: Olivier <olivier@librepush.net>
2024-09-29 00:05:01 +01:00
1e029a1826 Reduce logging in SFTP wormfile implementation
Signed-off-by: Olivier <olivier@librepush.net>
2024-07-21 11:50:08 +01:00
0869aa1afb hack: Allow reading Zstd compression level from env
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
Signed-off-by: Olivier <olivier@librepush.net>
2024-06-27 22:36:29 +01:00
565c99cf8c Update flake and fix it 2024-05-08 20:41:28 +01:00
f17ad6fac3 Update flake and fix it 2024-05-08 20:40:01 +01:00
7728e0b0a1 Fix typo in 'Acquired' 2023-11-04 12:59:17 +00:00
c395a50803 Remove obsolete 'label' field from virtual sources 2023-11-04 12:59:11 +00:00
2551c0e641 Make it clear that 'enter keyring password' is for decryption 2023-11-04 10:28:57 +00:00
87b6530aed Allow specifying a connector inline (with custom key path) in Datman configs 2023-10-03 21:44:50 +01:00
1c2d7957ee Remove pretend password support for SFTP 2023-10-03 21:36:57 +01:00
ecda1e5359 Extract the connector loading part of open_pile 2023-10-03 21:31:46 +01:00
eb9d65b918 Add rust analyser to flake 2023-10-03 21:26:53 +01:00
feb05cfecf Add check routine that checks all chunk hashes 2023-08-15 20:13:17 +01:00
6f0e3de350 Don't use mmap for storing due to concurrency bug scare 2023-08-15 19:53:28 +01:00
d07351d465 Add size hints for Datman Backup on dir trees 2023-08-13 22:12:45 +01:00
9c3ea26ea6 nix flake: Don't strip debug symbols 2023-08-13 18:31:24 +01:00
e9c4400ea5 Add some more async_backtrace tracking and remove redundant block_in_places 2023-08-13 17:41:13 +01:00
6434190cf1 Convert store_file to blocking 2023-08-13 17:41:13 +01:00
27c5f09d0d Convert storage_pipeline_worker to blocking 2023-08-13 17:41:03 +01:00
96deadd270 Remove old yama and datman from tree 2023-08-13 17:41:03 +01:00
2c14654d29 Add a small amount of async_backtrace tracking 2023-08-13 16:30:50 +01:00
a9379dba14 Actually add a limit to prevent infinite buffering memory 2023-08-11 22:19:12 +01:00
e306acd196 Update SFTP client to get fix for infinite buffering memory leak 2023-08-11 21:44:14 +01:00
5137ac0640 Fix ignore rules 2023-08-11 20:31:08 +01:00
6b72672d29 Fix bug in path iteration leading to bug in gradual scans 2023-08-11 20:24:59 +01:00
e85c8c372d Don't special-case the root 2023-08-11 20:20:20 +01:00
31ffb1ce86 Add logging for new bug with gradual scans 2023-08-11 20:18:03 +01:00
22beea0c50 Fix bug when using gradual scans 2023-08-11 20:00:31 +01:00
2e08b2df47 Set Zstd level to 16 2023-08-10 21:33:03 +01:00
c812532541 Add --gradual flag to datman backup commands: allows softlimiting the size of a backup round 2023-08-10 20:02:32 +01:00
00fa9d0951 SFTP wormfile: pull SSH key from YAMA_SSH_KEY if set 2023-08-10 20:02:24 +01:00
1ac9bb6d8d Add yama keyring create command 2023-08-08 21:17:27 +01:00
53886aad46 Only produce warnings if files vanish during store
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2023-05-26 23:43:23 +01:00
32e514bd2e Introduce Option<> on file store entries 2023-05-26 23:30:14 +01:00
2b4608e06b Cleanups
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2023-05-26 22:58:41 +01:00
1bd46b934d Avoid use of PatriciaMap.insert_str to avoid causing bug 2023-05-26 22:58:37 +01:00
5ffbf4fc1c Add some debug lines for Bad PMap issue 2023-05-26 21:00:22 +01:00
470420665f Tweaks that tracked down SFTP infinite buffer problem
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2023-05-22 20:44:24 +01:00
a47924dc80 Fix flake and add 7-prefix so we can use it alongside the stable version
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2023-05-20 14:16:13 +01:00
3a2ece31b6 Fix query on empty local cache
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2023-05-20 13:22:06 +01:00
70663ad016 Fix progress bar in datman 2023-05-20 13:21:58 +01:00
dabf7c5cf0 overhaul: datman support 2023-05-20 13:11:30 +01:00
8e5649597b overhaul: streaming extract support
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2023-05-04 23:56:35 +01:00
00dec17da0 overhaul: streaming store support
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2023-05-03 23:50:55 +01:00
a8e1cc45ef CHECKPOINT overhaul 2
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2023-05-03 21:03:33 +01:00
5cd2700396 CHECKPOINT overhaul
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2023-05-03 22:35:41 +01:00
8d5c373abc Add a rather dodgy untested S3 implementation 2023-04-03 21:09:26 +01:00
a5d6bf3085 Add an untested SFTP implementation of Yama Wormfiles 2023-04-02 22:54:24 +01:00
1fe4d9d2f3 Add yama_wormfile crates
These will be a useful component of the new on-disk storage format
2023-04-02 22:54:24 +01:00
7cd71695bc Start of Yama & Datman v0.7.0-alpha.1 2023-04-02 20:46:47 +01:00
b57dbad890 Simplify flake lock
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2023-04-01 16:57:04 +01:00
9001177143 Batch up chunk deletions in an attempt to make vacuuming more performant
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2022-11-28 21:03:07 +00:00
c9d64b2962 Make sure to flush + add some error contexts
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2022-11-21 21:23:38 +00:00
50ff9bb36a Fix including trailing empty line as pointer name
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2022-11-20 22:11:05 +00:00
7e41408815 Add test for incremental backup with mid delete
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
Just for validation that delete does the right thing
2022-11-20 20:58:45 +00:00
4072c5ae82 Fix parent not being integrated before being used to differentiate whilst removing a pointer
Some checks are pending
ci/woodpecker/push/build Pipeline is pending
ci/woodpecker/push/release Pipeline is pending
2022-11-20 20:42:26 +00:00
d3fe111a06 Replace debug rmp with new implementation
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2022-11-20 19:44:21 +00:00
6e1e173cb6 Implement datman prune 2022-11-20 19:43:20 +00:00
fcc79ca95d Hopefully fix descriptors to compare in test
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2022-11-20 10:02:27 +00:00
c1de1341ef Tweak wording 2022-11-20 10:02:13 +00:00
e85c606c95 Make a no-op compaction really a no-op compaction
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2022-11-20 08:57:22 +00:00
34c619ef41 Fix compact thresholds in tests to demonstrate what we need
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2022-11-19 17:43:27 +00:00
b9dce3ddfc rustfmt 2022-11-19 17:42:09 +00:00
52202874f2 Update images to remove deprecated ones
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2022-11-19 16:47:25 +00:00
69656131af Fix linter
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2022-11-19 16:35:36 +00:00
cc93997230 Linting 2022-11-19 16:35:33 +00:00
41248fe396 Add tests for yama compact 2022-11-19 16:35:24 +00:00
e7eb9ef288 Update nix shell to have python 2022-11-19 16:33:14 +00:00
b5e9e55cad Add yama compact command 2022-11-19 15:49:09 +00:00
cf502b7f7e rustfmt 2022-11-19 15:28:36 +00:00
58c5c3f039 Add compaction logic 2022-11-19 15:27:41 +00:00
30b261d172 Add Nix shell for Rust devel 2022-11-19 13:13:19 +00:00
0811c11c48 Add ability to extract subset of files from yama
Some checks are pending
ci/woodpecker/push/build Pipeline is pending
ci/woodpecker/push/release Pipeline is pending
2022-10-04 20:21:23 +01:00
aa2722607e Skip directories with .datmanskip files
All checks were successful
ci/woodpecker/push/build Pipeline was successful
ci/woodpecker/push/release Pipeline was successful
ci/woodpecker/tag/build Pipeline was successful
ci/woodpecker/tag/release Pipeline was successful
2022-07-25 10:35:47 +01:00
8612804298 Do short exclusions for remote backups (also bump version as protocol version incompatible) 2022-07-25 10:35:47 +01:00
080875bfce Add debug log for not descending 2022-07-25 10:35:47 +01:00
098895d913 Do short exclusions for local backups 2022-07-25 10:35:47 +01:00
bd5e18bc9f Extract load_labelling_rules 2022-07-25 10:35:47 +01:00
e25e92b273 Introduce 'exclusions' parameter to scanner 2022-07-25 10:35:47 +01:00
4aa1948350 Sort chunk IDs by hint to make pull more efficient
All checks were successful
ci/woodpecker/push/build Pipeline was successful
ci/woodpecker/push/release Pipeline was successful
2022-07-23 22:43:35 +01:00
ee9ca73224 Put reader bloblogs in an LRU map to prevent hitting open FD limit 2022-07-23 22:38:29 +01:00
05c6d3e662 Ignore non-UTF-8 file names instead of panicking
Some checks failed
ci/woodpecker/push/build Pipeline is pending
ci/woodpecker/push/release Pipeline is pending
ci/woodpecker/tag/build Pipeline failed
ci/woodpecker/tag/release Pipeline was successful
2022-07-23 21:55:04 +01:00
0b84c793bf Flush chunk pointers in one transaction
All checks were successful
ci/woodpecker/push/build Pipeline was successful
ci/woodpecker/push/release Pipeline was successful
ci/woodpecker/tag/build Pipeline was successful
ci/woodpecker/tag/release Pipeline was successful
2022-06-15 21:13:58 +01:00
eef22e7009 Use WAL mode in SQLite bloblogs 2022-06-15 21:13:56 +01:00
332563f5a7 Bump version
All checks were successful
ci/woodpecker/push/build Pipeline was successful
ci/woodpecker/push/release Pipeline was successful
ci/woodpecker/tag/build Pipeline was successful
ci/woodpecker/tag/release Pipeline was successful
2022-06-14 22:55:04 +01:00
14be0ef0a3 Add a version check to the pull protocol
All checks were successful
ci/woodpecker/push/build Pipeline was successful
ci/woodpecker/push/release Pipeline was successful
2022-06-14 22:54:32 +01:00
375d68eb0e Don't forget the terminator
Some checks are pending
ci/woodpecker/push/build Pipeline is pending
ci/woodpecker/push/release Pipeline was successful
2022-06-14 19:57:34 +01:00
fc29c6fca1 Add some writer flushes that are probably necessary 2022-06-14 19:57:14 +01:00
d384b1bcbd Write down basic implementation of datman pull 2022-06-14 19:56:45 +01:00
e357547777 Glue together an implementation for the pull responder 2022-06-14 19:54:21 +01:00
c83e2be66d Flesh out both sides 2022-06-14 08:54:17 +01:00
a24778209e Finish off the basic offering side implementation 2022-06-13 23:27:43 +01:00
bb8fc355f0 Lay down the basic structure of push/pull offerer 2022-06-13 23:15:46 +01:00
14fc925dbc Make existing push/pull legacy 2022-06-13 23:15:34 +01:00
9e51c2428e Report the size used by the pile itself in the report 2022-06-13 23:15:34 +01:00
01c98cb415 Aggregate reports by month and reorder sections 2022-06-04 12:33:14 +01:00
3637b00f38 Add lz4 to path to ensure the backup helpers can work
All checks were successful
ci/woodpecker/push/build Pipeline was successful
ci/woodpecker/push/release Pipeline was successful
2022-06-03 11:42:08 +01:00
25b1e14d84 Make it less strict?
All checks were successful
ci/woodpecker/push/build Pipeline was successful
ci/woodpecker/push/release Pipeline was successful
2022-06-02 22:26:24 +01:00
ef70e0998e Attempt to package up the helpers alongside yama and datman
All checks were successful
ci/woodpecker/push/build Pipeline was successful
ci/woodpecker/push/release Pipeline was successful
2022-06-02 20:30:22 +01:00
5fd9a72de8 Convert helpers to Poetry to make them easier to package for NixOS 2022-06-02 20:29:54 +01:00
4244fb88a7 Update version
All checks were successful
ci/woodpecker/push/build Pipeline was successful
ci/woodpecker/push/release Pipeline was successful
ci/woodpecker/tag/build Pipeline was successful
ci/woodpecker/tag/release Pipeline was successful
2022-06-01 09:51:56 +01:00
d62e864bee Don't crash when backing up the root directory because it has no name 2022-06-01 09:16:05 +01:00
001d626ccd Update version (protocol changed too)
All checks were successful
ci/woodpecker/push/build Pipeline was successful
ci/woodpecker/push/release Pipeline was successful
ci/woodpecker/tag/build Pipeline was successful
ci/woodpecker/tag/release Pipeline was successful
2022-05-31 09:45:24 +01:00
af553d1fed Only scan one filesystem by default (can configure 'cross_filesystems' if needed)
All checks were successful
ci/woodpecker/push/build Pipeline was successful
ci/woodpecker/push/release Pipeline was successful
2022-05-31 09:39:25 +01:00
e8fc448ace Remove needless bare_cnr crate
All checks were successful
ci/woodpecker/push/build Pipeline was successful
ci/woodpecker/push/release Pipeline was successful
ci/woodpecker/tag/build Pipeline was successful
ci/woodpecker/tag/release Pipeline was successful
2022-05-30 23:11:16 +01:00
4216243dcf Remove src input to try and avoid getting told off about relative paths
All checks were successful
ci/woodpecker/push/build Pipeline was successful
ci/woodpecker/push/release Pipeline was successful
2022-05-30 22:46:54 +01:00
1cd0b9887a Add a Nix flake
All checks were successful
ci/woodpecker/push/build Pipeline was successful
ci/woodpecker/push/release Pipeline was successful
2022-05-29 17:43:03 +01:00
ec8c5ff42d Add times and disk space to the report
All checks were successful
ci/woodpecker/push/build Pipeline was successful
ci/woodpecker/push/release Pipeline was successful
2022-05-29 17:24:46 +01:00
948ca3f2b5 Add a command to show a report of the Datman system 2022-05-29 13:35:15 +01:00
438af9164e Guard the Requester so that the Responder can't do whatever it wants 2022-05-29 09:45:02 +01:00
e1c6d31ee3 Fix: remember to flush 2022-05-29 09:36:54 +01:00
ac97957394 Remove obsolete comment
All checks were successful
ci/woodpecker/push/build Pipeline was successful
ci/woodpecker/push/release Pipeline was successful
2022-05-29 09:12:51 +01:00
23e112b1be Automatically pull updates for images in CI
All checks were successful
ci/woodpecker/push/build Pipeline was successful
ci/woodpecker/push/release Pipeline was successful
2022-05-29 08:43:03 +01:00
8692d83510 Push things around so that the chunking process doesn't need to know about pointers
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2022-05-29 00:10:53 +01:00
f1b73b28ee Split up store_fully so that we can avoid pointer ops in a restricted context 2022-05-28 23:53:09 +01:00
db0d9dd493 Add a basic access guard 2022-05-28 23:31:25 +01:00
081a1922c7 Separate out the pointer operations 2022-05-28 23:30:52 +01:00
23b352f936 Remove non-pipelined storage 2022-05-28 23:30:52 +01:00
d82176075a Bump up Rust version
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2022-05-28 22:53:35 +01:00
1803946b4a Disable ARM64 runners for now 2022-05-28 22:47:59 +01:00
760626d01e Add operation to describe the pipeline
Some checks failed
ci/woodpecker/push/build Pipeline failed
ci/woodpecker/push/release Pipeline was successful
2022-05-28 22:44:36 +01:00
f4debbc9fe Fix up clap changes 2022-05-28 22:27:33 +01:00
f9c0d814c2 Update deps 2022-05-28 22:27:33 +01:00
a06b393630 Make some remarks about where I'd like to go 2022-05-28 22:14:43 +01:00
56dafc6b5f Remove bare_cnr_ssh crate since it does depend a little bit 2022-05-28 21:43:20 +01:00
0a9cb559bd Remove transport module as it was obsolete 2022-05-28 21:22:25 +01:00
00b06963d7 Add tests and useful functionality 2022-05-28 21:21:40 +01:00
b659a5ddac Introduce channel handles that can be passed in serde messages 2022-05-28 20:40:04 +01:00
183f365032 Start a half decent Bare CnR crate 2022-05-28 19:47:08 +01:00
d0ed984dca Upgrade some dependencies 2022-05-28 13:21:56 +01:00
675c8884f9 Start the v0.6.0-alpha.1 'next' branch
All checks were successful
ci/woodpecker/push/build Pipeline was successful
ci/woodpecker/push/release Pipeline was successful
2022-05-28 13:08:56 +01:00
60cf81c59c Use plain old Rust image rather than sccache image
All checks were successful
continuous-integration/drone the build was successful
2022-01-14 21:17:51 +00:00
133 changed files with 15513 additions and 9165 deletions

1
.env Normal file
View File

@ -0,0 +1 @@
DATABASE_URL=sqlite:yama_localcache/testdb.sqlite

3
.envrc Normal file
View File

@ -0,0 +1,3 @@
#use nix
use flake .

9
.gitignore vendored
View File

@ -15,3 +15,12 @@
__pycache__
/datman-helper-postgres/datman_helper_postgres.egg-info
/datman-helper-mysql/datman_helper_mysql.egg-info
/result
.direnv
yama7demo
yamaSFTPdemo
yama_localcache/testdb.sqlite
sftp7demo

View File

@ -5,27 +5,19 @@ platform: linux/amd64
pipeline:
unitTests:
image: "docker.bics.ga/rei_ci/rust-sccache:latest-amd64"
image: "rust:1.65.0"
pull: true
commands:
- DEBIAN_FRONTEND=noninteractive apt-get -qq update > /dev/null
- DEBIAN_FRONTEND=noninteractive apt-get -yqq install pkg-config libssl-dev build-essential libsqlite3-dev > /dev/null
- cargo build --all
- cargo test --all
- sccache --show-stats
environment:
RUSTC_WRAPPER: /usr/local/bin/sccache
SCCACHE_S3_USE_SSL: "true"
SCCACHE_ENDPOINT: "richie.m4.tanukitsu.net:443"
secrets:
- sccache_bucket
- aws_access_key_id
- aws_secret_access_key
when:
event: [push, pull_request]
testSuite:
image: "docker.bics.ga/rei_ci/rust-sccache:latest-amd64"
image: "rust:1.65.0"
commands:
- DEBIAN_FRONTEND=noninteractive apt-get -qq update > /dev/null
- DEBIAN_FRONTEND=noninteractive apt-get -yqq -o=Dpkg::Use-Pty=0 install pkg-config libssl-dev build-essential libsqlite3-dev python3.9 python3.9-venv postgresql postgresql-client mariadb-server mariadb-client zstd lz4 > /dev/null
@ -38,22 +30,14 @@ pipeline:
- cargo install -q --path yama
- cargo install -q --path datman
- python3.9 -m venv testsuite/.venv
- ./testsuite/.venv/bin/pip install -e testsuite -e datman-helper-postgres -e datman-helper-mysql
- ./testsuite/.venv/bin/pip install ./testsuite ./datman-helper-postgres ./datman-helper-mysql
- cd testsuite && . .venv/bin/activate && TEST_POSTGRES=$(hostname),testsuitedb,root TEST_MYSQL=$(hostname),testsuitemydb,root green
- sccache --show-stats
environment:
RUSTC_WRAPPER: /usr/local/bin/sccache
SCCACHE_S3_USE_SSL: "true"
SCCACHE_ENDPOINT: "richie.m4.tanukitsu.net:443"
secrets:
- sccache_bucket
- aws_access_key_id
- aws_secret_access_key
when:
event: [push, pull_request]
deployManual:
image: "docker.bics.ga/rei_ci/mdbook:latest-amd64"
image: "docker.emunest.net/rei_ci/mdbook:latest-amd64"
pull: true
when:
branch:
- develop

View File

@ -4,7 +4,8 @@ platform: linux/${ARCH}
matrix:
ARCH:
- arm64
# I don't have an arm64 runner at the moment.
#- arm64
- amd64
.a1: &when
@ -15,7 +16,10 @@ pipeline:
buildRelease:
when: *when
image: "docker.bics.ga/rei_ci/rust-sccache:latest-${ARCH}"
# Disabled for now because I'm trying to get infinite build times to stop :-(.
# Suspect a kernel bug but any workaround will do for now.
#image: "docker.bics.ga/rei_ci/rust-sccache:latest-${ARCH}"
image: "rust:1.61"
commands:
- apt-get -qq update && apt-get -yqq install pkg-config libssl-dev build-essential libolm-dev cmake
- cargo build --release

3978
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,14 @@
[workspace]
members = [
"yama",
"datman"
"datman",
"yama_wormfile",
"yama_wormfile_fs",
"yama_wormfile_sftp",
"yama_wormfile_s3",
"yama_midlevel_crypto",
"yama_pile",
"yama_localcache",
]
[profile.release]
@ -9,3 +16,16 @@ members = [
debug = 2
# When this feature stabilises, it will be possible to split the debug information into a file alongside the binary
#split-debuginfo = "packed"
# A few packages benefit from optimisations in the dev profile; without them, Yama operations are needlessly slow.
[profile.dev.package.fastcdc]
opt-level = 2
[profile.dev.package.blake3]
opt-level = 2
# not so obvious with sqlx:
#[profile.dev.package.sqlx]
#opt-level = 2

5
GLOSSARY.md Normal file
View File

@ -0,0 +1,5 @@
## Internals
* **Chunk**: a piece of a file that is obtained using a Content-Defined Chunking scheme.
* **Chunk ID**: the Blake3 hash of the contents of a chunk.
* **Metachunk**: a chunk that itself contains (part of) a list of chunks.

8
datman-helper-mysql/poetry.lock generated Normal file
View File

@ -0,0 +1,8 @@
package = []
[metadata]
lock-version = "1.1"
python-versions = "^3.8"
content-hash = "fafb334cb038533f851c23d0b63254223abf72ce4f02987e7064b0c95566699a"
[metadata.files]

View File

@ -0,0 +1,19 @@
[tool.poetry]
name = "datman-helper-mysql"
version = "0.1.0"
description = "MySQL integration for Datman"
authors = ["Olivier 'reivilibre' <olivier@librepush.net>"]
license = "GPL-3.0-or-later"
[tool.poetry.dependencies]
python = "^3.8"
[tool.poetry.dev-dependencies]
[tool.poetry.scripts]
datman-helper-mysql-backup="datman_helper_mysql.backup:cli"
datman-helper-mysql-restore="datman_helper_mysql.restore:cli"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

View File

@ -1,119 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import io
import os
import sys
from shutil import rmtree
from setuptools import Command, find_packages, setup
# Package meta-data.
NAME = "datman_helper_mysql"
DESCRIPTION = "MySQL integration for Datman"
URL = "https://bics.ga/reivilibre/yama"
EMAIL = "reivi@librepush.net"
AUTHOR = "Olivier 'reivilibre'"
REQUIRES_PYTHON = ">=3.7.0"
VERSION = "0.1.0"
# What packages are required for this module to be executed?
REQUIRED = []
# What packages are optional?
EXTRAS = {}
# The rest you shouldn't have to touch too much :)
# ------------------------------------------------
# Except, perhaps the License and Trove Classifiers!
# If you do change the License, remember to change the Trove Classifier for that!
here = os.path.abspath(os.path.dirname(__file__))
# Import the README and use it as the long-description.
# Note: this will only work if 'README.md' is present in your MANIFEST.in file!
try:
with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f:
long_description = "\n" + f.read()
except FileNotFoundError:
long_description = DESCRIPTION
# Load the package's __version__.py module as a dictionary.
about = {}
if not VERSION:
project_slug = NAME.lower().replace("-", "_").replace(" ", "_")
with open(os.path.join(here, project_slug, "__version__.py")) as f:
exec(f.read(), about)
else:
about["__version__"] = VERSION
class UploadCommand(Command):
"""Support setup.py upload."""
description = "Build and publish the package."
user_options = []
@staticmethod
def status(s):
"""Prints things in bold."""
print("\033[1m{0}\033[0m".format(s))
def initialize_options(self):
pass
def finalize_options(self):
pass
def run(self):
try:
self.status("Removing previous builds…")
rmtree(os.path.join(here, "dist"))
except OSError:
pass
self.status("Building Source and Wheel (universal) distribution…")
os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable))
self.status("Uploading the package to PyPI via Twine…")
os.system("twine upload dist/*")
self.status("Pushing git tags…")
os.system("git tag v{0}".format(about["__version__"]))
os.system("git push --tags")
sys.exit()
# Where the magic happens:
setup(
name=NAME,
version=about["__version__"],
description=DESCRIPTION,
long_description=long_description,
long_description_content_type="text/markdown",
author=AUTHOR,
author_email=EMAIL,
python_requires=REQUIRES_PYTHON,
url=URL,
packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*"]),
# If your package is a single module, use this instead of 'packages':
# py_modules=['mypackage'],
entry_points={
"console_scripts": [
"datman-helper-mysql-backup=datman_helper_mysql.backup:cli",
"datman-helper-mysql-restore=datman_helper_mysql.restore:cli",
],
},
install_requires=REQUIRED,
extras_require=EXTRAS,
include_package_data=True,
# TODO license='GPL3',
classifiers=[
# Trove classifiers
# Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
"Programming Language :: Python",
"Programming Language :: Python :: 3",
],
)

View File

@ -39,10 +39,7 @@ def cli():
# The process (if any) that is our LZ4 decompressor.
lz4_process = None
dump_command = [
"pg_dump",
database_to_use
]
dump_command = ["pg_dump", database_to_use]
if host_to_use is not None:
if use_lz4:
@ -63,21 +60,19 @@ def cli():
# (rather than lz4 covering it).
command = [
"ssh",
f"{user_to_use}@{host_to_use}" if user_to_use is not None else f"{host_to_use}",
f"{user_to_use}@{host_to_use}"
if user_to_use is not None
else f"{host_to_use}",
"bash",
"-o",
"pipefail",
"-c",
shlex.quote(" ".join(dump_command))
shlex.quote(" ".join(dump_command)),
]
elif user_to_use is not None:
current_username = pwd.getpwuid(os.getuid()).pw_name
if current_username != user_to_use:
command = [
"sudo",
"-u",
user_to_use
] + dump_command
command = ["sudo", "-u", user_to_use] + dump_command
else:
command = dump_command
else:

8
datman-helper-postgres/poetry.lock generated Normal file
View File

@ -0,0 +1,8 @@
package = []
[metadata]
lock-version = "1.1"
python-versions = "^3.8"
content-hash = "fafb334cb038533f851c23d0b63254223abf72ce4f02987e7064b0c95566699a"
[metadata.files]

View File

@ -0,0 +1,19 @@
[tool.poetry]
name = "datman-helper-postgres"
version = "0.1.0"
description = "Postgres integration for Datman"
authors = ["Olivier 'reivilibre' <olivier@librepush.net>"]
license = "GPL-3.0-or-later"
[tool.poetry.dependencies]
python = "^3.8"
[tool.poetry.dev-dependencies]
[tool.poetry.scripts]
datman-helper-postgres-backup="datman_helper_postgres.backup:cli"
datman-helper-postgres-restore="datman_helper_postgres.restore:cli"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

View File

@ -1,119 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import io
import os
import sys
from shutil import rmtree
from setuptools import Command, find_packages, setup
# Package meta-data.
NAME = "datman_helper_postgres"
DESCRIPTION = "Postgres integration for Datman"
URL = "https://bics.ga/reivilibre/yama"
EMAIL = "reivi@librepush.net"
AUTHOR = "Olivier 'reivilibre'"
REQUIRES_PYTHON = ">=3.7.0"
VERSION = "0.1.0"
# What packages are required for this module to be executed?
REQUIRED = []
# What packages are optional?
EXTRAS = {}
# The rest you shouldn't have to touch too much :)
# ------------------------------------------------
# Except, perhaps the License and Trove Classifiers!
# If you do change the License, remember to change the Trove Classifier for that!
here = os.path.abspath(os.path.dirname(__file__))
# Import the README and use it as the long-description.
# Note: this will only work if 'README.md' is present in your MANIFEST.in file!
try:
with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f:
long_description = "\n" + f.read()
except FileNotFoundError:
long_description = DESCRIPTION
# Load the package's __version__.py module as a dictionary.
about = {}
if not VERSION:
project_slug = NAME.lower().replace("-", "_").replace(" ", "_")
with open(os.path.join(here, project_slug, "__version__.py")) as f:
exec(f.read(), about)
else:
about["__version__"] = VERSION
class UploadCommand(Command):
"""Support setup.py upload."""
description = "Build and publish the package."
user_options = []
@staticmethod
def status(s):
"""Prints things in bold."""
print("\033[1m{0}\033[0m".format(s))
def initialize_options(self):
pass
def finalize_options(self):
pass
def run(self):
try:
self.status("Removing previous builds…")
rmtree(os.path.join(here, "dist"))
except OSError:
pass
self.status("Building Source and Wheel (universal) distribution…")
os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable))
self.status("Uploading the package to PyPI via Twine…")
os.system("twine upload dist/*")
self.status("Pushing git tags…")
os.system("git tag v{0}".format(about["__version__"]))
os.system("git push --tags")
sys.exit()
# Where the magic happens:
setup(
name=NAME,
version=about["__version__"],
description=DESCRIPTION,
long_description=long_description,
long_description_content_type="text/markdown",
author=AUTHOR,
author_email=EMAIL,
python_requires=REQUIRES_PYTHON,
url=URL,
packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*"]),
# If your package is a single module, use this instead of 'packages':
# py_modules=['mypackage'],
entry_points={
"console_scripts": [
"datman-helper-postgres-backup=datman_helper_postgres.backup:cli",
"datman-helper-postgres-restore=datman_helper_postgres.restore:cli",
],
},
install_requires=REQUIRED,
extras_require=EXTRAS,
include_package_data=True,
# TODO license='GPL3',
classifiers=[
# Trove classifiers
# Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
"Programming Language :: Python",
"Programming Language :: Python :: 3",
],
)

View File

@ -1,8 +1,8 @@
[package]
name = "datman"
version = "0.5.0-alpha.2"
version = "0.7.0-alpha.1"
authors = ["Olivier 'reivilibre' <olivier@librepush.net>"]
edition = "2018"
edition = "2021"
repository = "https://bics.ga/reivilibre/yama"
license = "GPL-3.0-or-later"
@ -11,25 +11,29 @@ description = "A chunked and deduplicated backup system using Yama"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
clap = "= 3.0.0-beta.5"
crossbeam-channel = "0.5.1"
anyhow = "1.0"
thiserror = "1.0"
serde = { version = "1.0.104", features = ["derive"] }
serde_json = "1.0.64"
toml = "0.5.5"
log = "0.4"
env_logger = "0.7.1"
indicatif = "0.14.0"
arc-interner = "0.5.1"
zstd = "0.6.0" # 0.6.0+zstd.1.4.8
byteorder = "1"
termion = "1.5.6"
glob = "0.3.0"
humansize = "1.1.1"
chrono = "0.4.19"
itertools = "0.10.1"
hostname = "0.3.1"
yama = { path = "../yama", version = "0.5.0-alpha.1" }
metrics = "0.17.1"
bare-metrics-recorder = { version = "0.1.0" }
eyre = "0.6.8"
clap = { version = "4.2.2", features = ["derive", "env"] }
tracing = "0.1.37"
tracing-subscriber = { version = "0.3.16", features = ["tracing-log", "env-filter"] }
tracing-indicatif = "0.3.0"
indicatif = "0.17.3"
serde = { version = "1.0.160", features = ["derive"] }
serde_json = "1.0.96"
toml = "0.7.3"
tokio = { version = "1.28.0", features = ["fs", "macros", "rt-multi-thread"] }
dashmap = "5.4.0"
chrono = "0.4.24"
users = "0.11.0"
bytesize = "1.2.0"
yama = { version = "0.7.0-alpha.1", path = "../yama" }
yama_pile = { path = "../yama_pile" }
#yama_localcache = { path = "../yama_localcache" }
yama_wormfile = { path = "../yama_wormfile" }
#yama_wormfile_fs = { path = "../yama_wormfile_fs" }
#yama_wormfile_s3 = { path = "../yama_wormfile_s3" }
#yama_wormfile_sftp = { path = "../yama_wormfile_sftp" }
yama_midlevel_crypto = { path = "../yama_midlevel_crypto" }
patricia_tree = "0.5.7"
async-backtrace = "0.2.6"

View File

@ -8,5 +8,6 @@ Features:
* (optional) Compression using Zstd and a specifiable dictionary
* (optional) Encryption
* Ability to back up to remote machines over SSH
* Labelling of files in a backup source; different destinations can choose to back up either all or a subset of the labels.
See the documentation for more information.

583
datman/src/backup.rs Normal file
View File

@ -0,0 +1,583 @@
use crate::descriptor_config::{SourceDescriptor, SourceDescriptorInner, VirtualSourceKind};
use crate::pointer_names::{get_pointer_name_at, POINTER_NAME_DATETIME_SPLITTER};
use bytesize::ByteSize;
use chrono::{DateTime, Utc};
use clap::Args;
use dashmap::DashSet;
use eyre::{bail, ensure, eyre, Context, ContextCompat};
use indicatif::ProgressStyle;
use patricia_tree::PatriciaMap;
use std::cmp::max;
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::io::Write;
use std::path::PathBuf;
use std::process::{Child, Command, Stdio};
use std::sync::Arc;
use std::time::{Instant, SystemTime, UNIX_EPOCH};
use tokio::runtime::Handle;
use tokio::task::JoinSet;
use tracing::{debug, info, info_span, Instrument, Span};
use tracing_indicatif::span_ext::IndicatifSpanExt;
use users::{get_current_gid, get_current_uid};
use yama::pile_with_cache::{PileWithCache, PointerIntegrationStatistics};
use yama::scan::{create_uidgid_lookup_tables, limit_scan_entry_map_to_size};
use yama::storing::{
assemble_and_write_indices, StoragePipeline, StoringBloblogWriters, StoringState,
};
use yama::{scan, PROGRESS_BAR_STYLE};
use yama_midlevel_crypto::chunk_id::ChunkId;
use yama_pile::definitions::{BlobLocator, BloblogId, IndexBloblogEntry, RecursiveChunkRef};
use yama_pile::pointers::Pointer;
use yama_pile::tree::unpopulated::ScanEntry;
use yama_pile::tree::{
assemble_tree_from_scan_entries, differentiate_node_in_place, FilesystemOwnership,
FilesystemPermissions, RootTreeNode, TreeNode,
};
use yama_wormfile::boxed::BoxedWormFileProvider;
// Command-line options for a backup run. The struct derives clap's `Args`, and
// the `gradual` value is read by `scan_dir_sources` to cap how much of each
// directory source is included in a run.
// NOTE: the `///` comments on the field below are left untouched on purpose:
// with clap's derive they presumably double as the `--help` text.
#[derive(Args, Clone, Debug)]
pub struct BackupOptions {
/// Number of bytes to back up in one go. Intended for gradually getting a backup started.
/// Supports suffixes like MiB and MB.
/// Applies per-source. Does not apply to virtual sources.
#[clap(long)]
gradual: Option<ByteSize>,
}
/// Backs up every source in `sources_to_backup` into the pile behind `pwc`.
///
/// The work happens in this order:
/// 1. a parent pointer is looked up for each directory source;
/// 2. the directory sources are scanned;
/// 3. directory sources and virtual sources are stored by two spawned tasks
///    that run concurrently and share the `new_unflushed_chunks` set;
/// 4. the chunkmaps of both tasks are merged and written out as indices;
/// 5. one pointer is written per directory source and per virtual source;
/// 6. the pile is closed.
///
/// # Errors
/// Returns an error if any of the steps above fails, if one of the spawned
/// tasks fails to join, or if `pwc` is still referenced elsewhere when the
/// pile is about to be closed.
pub async fn backup(
    pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
    sources_to_backup: BTreeMap<String, SourceDescriptor>,
    options: &BackupOptions,
) -> eyre::Result<()> {
    // Locate suitable parent pointers
    let timestart = Instant::now();
    let parents_to_use = find_suitable_parent_pointers(&pwc, &sources_to_backup)
        .await
        .context("failed to look for suitable parent pointers")?;
    debug!(
        "find_suitable_parent_pointers: {:?}",
        timestart.elapsed()
    );

    // One timestamp is shared by every pointer name produced in this run.
    let now = Utc::now();

    // (dirtrees) Scan
    let timestart = Instant::now();
    let dir_sources = scan_dir_sources(&sources_to_backup, parents_to_use, now, options)
        .await
        .context("failed to scan directory sources")?;
    debug!("scan_dir_sources: {:?}", timestart.elapsed());

    let new_unflushed_chunks: Arc<DashSet<ChunkId>> = Arc::new(Default::default());

    // (dirtrees) Start a storage pipeline and submit jobs to it
    let task_store_dirs = {
        let new_unflushed_chunks = new_unflushed_chunks.clone();
        let pwc = pwc.clone();
        let bds_span = info_span!("storing");
        tokio::spawn(async_backtrace::frame!(async move {
            backup_dir_sources(dir_sources, pwc, new_unflushed_chunks)
                .await
                .context("whilst backing up dir sources")
        }
        .instrument(bds_span)))
    };

    // (virtual source streams) Store to bloblog writers
    let task_store_virtuals = {
        let bvs_span = info_span!("storing_virts");
        let new_unflushed_chunks = new_unflushed_chunks.clone();
        let pwc = pwc.clone();
        tokio::spawn(async_backtrace::frame!(async move {
            backup_virtual_sources(&sources_to_backup, now, pwc, new_unflushed_chunks)
                .await
                .context("whilst backing up virtual sources")
        }
        .instrument(bvs_span)))
    };

    let timestart = Instant::now();
    let (dir_sources_and_chunkmaps, virt_sources) =
        tokio::join!(task_store_dirs, task_store_virtuals);
    debug!(
        "join!(task_store_dirs, task_store_virtuals): {:?}",
        timestart.elapsed()
    );

    // First `?` is the task join result, second `?` is the task's own result.
    let dir_sources_and_chunkmaps: BackupDirSourcesReturn = dir_sources_and_chunkmaps??;
    let mut virt_sources: Vec<VirtualSourceReturn> = virt_sources??;

    // Fold the virtual sources' chunkmaps into the directory sources' chunkmaps
    // so that a single set of indices is written.
    let mut chunkmaps = dir_sources_and_chunkmaps.chunkmaps;
    for source in &mut virt_sources {
        chunkmaps.extend(
            std::mem::take(&mut source.chunkmaps)
                .into_iter()
                .map(|(k, nb)| {
                    (
                        k,
                        IndexBloblogEntry {
                            chunks: nb,
                            forgotten_bytes: 0,
                        },
                    )
                }),
        );
    }

    // Chunkmaps, indices and write pointers
    assemble_and_write_indices(&pwc, chunkmaps)
        .await
        .context("failed to assemble and write indices")?;

    info!("All indices stored, writing pointer...");

    for (dir_source_prep, chunk_file_map) in dir_sources_and_chunkmaps.dir_source_returns {
        // Assemble and write a pointer
        let mut tree = assemble_tree_from_scan_entries(
            dir_source_prep.new_scan_entry_map,
            dir_source_prep.reused_scan_entry_map,
            chunk_file_map,
        )
        .context("failed to assemble tree")?;
        // The lookup tables are built from the full tree, before it is
        // differentiated against the parent.
        let (uids, gids) =
            create_uidgid_lookup_tables(&tree).context("failed to create uid/gid tables")?;

        if let Some(ref parent_node) = dir_source_prep.parent {
            differentiate_node_in_place(&mut tree, &parent_node.root.node)
                .context("failed to differentiate?")?;
        }

        pwc.pile
            .write_pointer(
                &dir_source_prep.new_pointer_name,
                false,
                &Pointer {
                    parent: dir_source_prep.parent_name.clone(),
                    root: RootTreeNode {
                        // Falls back to an empty name when the path has no
                        // final component or it is not valid UTF-8.
                        name: dir_source_prep
                            .path
                            .file_name()
                            .and_then(|oss| oss.to_str())
                            .unwrap_or("")
                            .to_owned(),
                        node: tree,
                    },
                    uids,
                    gids,
                },
            )
            .await
            .context("failed to write pointer")?;
    }

    for virtual_source in virt_sources {
        pwc.pile
            .write_pointer(&virtual_source.pointer_name, false, &virtual_source.pointer)
            .await
            .context("failed to write pointer")?;
    }

    // Closing consumes the PileWithCache, so this must be the last reference.
    Arc::try_unwrap(pwc)
        .map_err(|_| eyre!("pwc still in use; can't close down gracefully"))?
        .close()
        .await?;

    Ok(())
}
/// Given access to a PWC and a map of sources to back up, returns a map from source name to
/// the `(pointer name, pointer)` pair to use as that source's parent.
/// For virtual sources, no parent is chosen.
/// For directory sources, the last listed pointer whose name starts with
/// `{source_name}{POINTER_NAME_DATETIME_SPLITTER}` is chosen as a parent; this is the most
/// recent one provided `list_pointers` returns names in ascending order (TODO confirm).
/// Directory sources without any matching pointer get no entry in the result.
///
/// # Errors
/// Fails if the pointers cannot be listed, or if a chosen parent pointer cannot be read.
async fn find_suitable_parent_pointers(
    pwc: &PileWithCache<BoxedWormFileProvider>,
    sources_to_backup: &BTreeMap<String, SourceDescriptor>,
) -> eyre::Result<BTreeMap<String, (String, Pointer)>> {
    let mut result = BTreeMap::new();

    let pointers = pwc
        .pile
        .list_pointers()
        .await
        .context("failed to list pointers")?;

    for (source_name, source) in sources_to_backup.iter() {
        if !source.is_directory_source() {
            continue;
        }

        let starter = format!("{source_name}{POINTER_NAME_DATETIME_SPLITTER}");
        // Walk the listing backwards and stop at the first pointer of this source.
        let Some(most_recent_pointer) = pointers
            .iter()
            .rev()
            .find(|pn| pn.starts_with(&starter))
        else {
            continue;
        };

        debug!("for {source_name:?}, using parent {most_recent_pointer:?}");
        let mut stats = PointerIntegrationStatistics::default();
        let pointer = pwc
            .read_pointer_fully_integrated(most_recent_pointer, &mut stats)
            .await
            .context("failed to read parent pointer")?
            .context("no parent pointer despite having just listed it")?;
        debug!("when loading parent, stats = {stats:?}");
        result.insert(
            source_name.to_owned(),
            (most_recent_pointer.clone(), pointer),
        );
    }

    Ok(result)
}
/// Everything prepared by `scan_dir_sources` for one directory source, carried through
/// storing and finally used by `backup` to assemble and write that source's pointer.
struct DirSourcePrep {
/// New entries only.
new_scan_entry_map: PatriciaMap<ScanEntry>,
/// Files: Reused entries only. Directories: can be partially changed but there's no chunking to be done.
reused_scan_entry_map: PatriciaMap<ScanEntry>,
/// Name of the parent pointer, if a parent was found for this source.
parent_name: Option<String>,
/// The parent pointer itself, if a parent was found for this source.
parent: Option<Pointer>,
/// Path of the directory source that was scanned.
path: PathBuf,
/// Name under which the new pointer for this source is written.
new_pointer_name: String,
/// Chunk references already known from the parent (empty when there is no parent).
/// `backup_dir_sources` takes this map out of the struct and adds the results of storing.
chunk_file_map: PatriciaMap<Option<(RecursiveChunkRef, u64)>>,
}
/// Scans every directory source in `sources_to_backup` and prepares it for storing.
///
/// Each directory source is scanned on its own blocking task. When a parent pointer is
/// available in `parents`, the scan result is split into entries that still need storing and
/// entries that can be reused from the parent. When `options.gradual` is set, the entries
/// that need storing are additionally limited to that size.
///
/// Virtual sources are skipped. The order of the returned vector follows task completion,
/// not the order of `sources_to_backup`.
///
/// # Errors
/// Fails if a scan fails, if a blocking task fails to join, or if the "to store" and
/// "reused" entry sets unexpectedly overlap.
async fn scan_dir_sources(
    sources_to_backup: &BTreeMap<String, SourceDescriptor>,
    mut parents: BTreeMap<String, (String, Pointer)>,
    now: DateTime<Utc>,
    options: &BackupOptions,
) -> eyre::Result<Vec<DirSourcePrep>> {
    let mut joinset = JoinSet::new();

    for (source_name, source) in sources_to_backup {
        // NOTE(review): `cross_filesystems` is bound here but never used in this function.
        if let SourceDescriptorInner::DirectorySource {
            path,
            cross_filesystems,
            ignore,
        } = &source.inner
        {
            let path = path.to_owned();
            let ignore = ignore.to_owned();
            let (parent_name, parent) = parents.remove(source_name).unzip();
            let new_pointer_name = get_pointer_name_at(source_name, now);
            let options = options.clone();
            joinset.spawn_blocking(move || -> eyre::Result<DirSourcePrep> {
                let timestart = Instant::now();
                let scan_entry_map = scan::scan(&path, &ignore).context("Failed to scan")?;
                debug!("scan: {:?}", timestart.elapsed());
                info!(
                    "size estimate for {:?} (full scan): {}",
                    path,
                    summarise_scan_entry_map_size(&scan_entry_map)
                );

                // TODO This whole section is messy.
                // Maybe we should consider combining prepopulate_unmodified and limit_scan_entry_map_to_size
                // as the latter might benefit from being able to see what is in the parent pointer...

                let (chunk_file_map, pruned_scan_entry_map, prepopulated_scan_entry_map) =
                    if let Some(ref parent_node) = parent {
                        let (cfm, pruned, prepopulated) =
                            scan::prepopulate_unmodified(&parent_node.root.node, &scan_entry_map);

                        // Sanity check: an entry must not be both pruned and prepopulated.
                        // The keys are compared as raw bytes so that a non-UTF-8 file name
                        // cannot make this check panic; they are only converted (lossily)
                        // for the error message.
                        let pruned_keys: BTreeSet<Vec<u8>> = pruned.keys().collect();
                        let prepop_keys: BTreeSet<Vec<u8>> = prepopulated.keys().collect();
                        let ix_keys: BTreeSet<String> = pruned_keys
                            .intersection(&prepop_keys)
                            .map(|key| String::from_utf8_lossy(key).into_owned())
                            .collect();
                        if !ix_keys.is_empty() {
                            bail!("bug: intersecting prepop and prune keys: {ix_keys:?}");
                        }

                        info!(
                            "size estimate for {:?} (differential): {}",
                            path,
                            summarise_scan_entry_map_size(&pruned)
                        );

                        (cfm, pruned, prepopulated)
                    } else {
                        // No parent: everything scanned needs storing, nothing is reused.
                        (
                            PatriciaMap::<Option<(RecursiveChunkRef, u64)>>::new(),
                            scan_entry_map,
                            PatriciaMap::new(),
                        )
                    };

                let pruned_scan_entry_map = match options.gradual {
                    Some(gradual_size_limit) => {
                        let limited = limit_scan_entry_map_to_size(
                            pruned_scan_entry_map,
                            gradual_size_limit.as_u64(),
                        );
                        info!(
                            "size estimate for {:?} (gradual/limited): {}",
                            path,
                            summarise_scan_entry_map_size(&limited)
                        );
                        limited
                    }
                    None => pruned_scan_entry_map,
                };

                Ok(DirSourcePrep {
                    chunk_file_map,
                    new_scan_entry_map: pruned_scan_entry_map,
                    reused_scan_entry_map: prepopulated_scan_entry_map,
                    parent_name,
                    parent,
                    path,
                    new_pointer_name,
                })
            });
        }
    }

    let mut result = Vec::new();
    while let Some(dsp_res_res) = joinset.join_next().await {
        // First `?` is the join result, second `?` is the scan task's own result.
        result.push(dsp_res_res??);
    }
    Ok(result)
}
/// Produces a human-readable size estimate of the form `"<entries> files (<bytes>)"`.
///
/// Every entry is counted as at least 4096 bytes: normal files contribute the larger of
/// their size and 4096, and every other kind of entry contributes exactly 4096.
/// The "files" count is the number of all entries in the map, not just normal files.
fn summarise_scan_entry_map_size(scan_entry_map: &PatriciaMap<ScanEntry>) -> String {
    let total_bytes: u64 = scan_entry_map
        .values()
        .map(|entry| match entry {
            ScanEntry::NormalFile { size, .. } => max(*size, 4096),
            _ => 4096,
        })
        .sum();
    let entry_count = scan_entry_map.len();
    let pretty_bytes = ByteSize(total_bytes).to_string_as(true);

    format!("{entry_count} files ({})", pretty_bytes)
}
/// Result of `backup_dir_sources`.
struct BackupDirSourcesReturn {
/// Chunkmaps produced by finishing the storage pipeline, keyed by bloblog.
pub chunkmaps: BTreeMap<BloblogId, IndexBloblogEntry>,
/// Each prepared directory source paired with its chunk file map
/// (the map taken from the prep, plus the results received from the pipeline).
pub dir_source_returns: Vec<(DirSourcePrep, PatriciaMap<Option<(RecursiveChunkRef, u64)>>)>,
}
/// Stores the new normal files of all prepared directory sources through one storage pipeline.
///
/// A submitter future sends one job per `ScanEntry::NormalFile` in each source's
/// `new_scan_entry_map`, while a receiver future collects the results into that source's
/// chunk file map. Both futures are driven concurrently with `tokio::join!`.
///
/// Returns the pipeline's chunkmaps together with each `DirSourcePrep` paired with its
/// completed chunk file map.
///
/// # Errors
/// Fails if the pipeline cannot be launched or finished, if an entry name is not valid
/// UTF-8, if a job cannot be sent, or if the number of completed jobs differs from the
/// number submitted.
#[async_backtrace::framed]
async fn backup_dir_sources(
mut dir_sources: Vec<DirSourcePrep>,
pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
new_unflushed_chunks: Arc<DashSet<ChunkId>>,
) -> eyre::Result<BackupDirSourcesReturn> {
// Indexed by position in `dir_sources`.
let mut chunk_file_maps = Vec::new();
let mut pruned_scan_entry_maps = Vec::new();
// First collect all that stuff together...
// The chunk file maps are moved out (leaving empty defaults behind) so the receiver can
// mutate them while `dir_sources` is only borrowed immutably.
for dir_source in &mut dir_sources {
chunk_file_maps.push(std::mem::take(&mut dir_source.chunk_file_map));
}
for dir_source in &dir_sources {
pruned_scan_entry_maps.push(&dir_source.new_scan_entry_map);
}
// The progress bar is attached to whatever span this function was instrumented with.
let store_span = Span::current();
// store_span.pb_set_style(&ProgressStyle::default_bar());
store_span.pb_set_style(
&ProgressStyle::default_bar()
.template(PROGRESS_BAR_STYLE)
.unwrap(),
);
store_span.pb_set_message("storing files");
// Progress length = total number of normal files to store across all sources.
store_span.pb_set_length(
pruned_scan_entry_maps
.iter()
.map(|pruned_scan_entry_map| {
pruned_scan_entry_map
.values()
.filter(|v| matches!(v, ScanEntry::NormalFile { .. }))
.count() as u64
})
.sum(),
);
//
// NOTE(review): the literal 4 is presumably the number of pipeline workers — confirm
// against `StoragePipeline::launch_new`.
let (pipeline, pipeline_job_tx) =
StoragePipeline::launch_new(4, pwc.clone(), new_unflushed_chunks).await?;
let dir_sources2 = &dir_sources;
let mut submitted = 0;
let mut completed = 0;
// The submitter is an `async move` block, so it gets a mutable reference to the counter
// rather than the counter itself.
let submitted_mut = &mut submitted;
let (submitter_task, receiver_task) = tokio::join!(
async move {
// Take ownership of the sender so that it is dropped when the submitter is done.
let pipeline_job_tx = pipeline_job_tx;
for (dir_source_idx, dir_source) in dir_sources2.iter().enumerate() {
for (name_bytes, scan_entry) in pruned_scan_entry_maps[dir_source_idx].iter() {
if let ScanEntry::NormalFile { .. } = scan_entry {
let name = std::str::from_utf8(name_bytes.as_slice())
.context("name is not str")?;
// TODO(bug): if source name is a file, this doesn't work (.join(""))
// Job id is (source index, entry name); the second element is the path to read.
pipeline_job_tx
.send_async((
(dir_source_idx, name.to_owned()),
dir_source.path.join(name),
))
.await
.map_err(|_| eyre!("unable to send to pipeline."))?;
*submitted_mut += 1;
}
}
}
drop(pipeline_job_tx);
Ok::<_, eyre::Report>(())
},
async {
// Runs until `next_result` returns an error.
while let Ok(((dir_source_idx, job_id), file_store_opt)) = pipeline.next_result().await
{
chunk_file_maps[dir_source_idx].insert(&job_id, file_store_opt);
completed += 1;
Span::current().pb_inc(1);
}
// eprintln!("fin rec");
Ok::<_, eyre::Report>(())
}
);
submitter_task?;
receiver_task?;
// Every submitted job must have produced exactly one result.
ensure!(
completed == submitted,
"completed: {completed:?} != submitted {submitted:?}"
);
assert_eq!(dir_sources.len(), chunk_file_maps.len());
let chunkmaps = pipeline.finish_into_chunkmaps().await?;
Ok(BackupDirSourcesReturn {
chunkmaps,
dir_source_returns: dir_sources
.into_iter()
.zip(chunk_file_maps.into_iter())
.collect(),
})
}
/// Backs up all virtual sources in `sources`, one spawned task per source.
///
/// Non-virtual sources are ignored. Every task gets its own pointer name derived from the
/// source name and `now`. The order of the returned vector follows task completion.
///
/// # Errors
/// Fails as soon as a task fails to join or a virtual source backup returns an error.
async fn backup_virtual_sources(
    sources: &BTreeMap<String, SourceDescriptor>,
    now: DateTime<Utc>,
    pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
    new_unflushed_chunks: Arc<DashSet<ChunkId>>,
) -> eyre::Result<Vec<VirtualSourceReturn>> {
    let mut tasks: JoinSet<eyre::Result<VirtualSourceReturn>> = JoinSet::new();

    let virtual_sources = sources
        .iter()
        .filter(|(_, descriptor)| descriptor.is_virtual_source());
    for (name, descriptor) in virtual_sources {
        let pointer_name = get_pointer_name_at(name, now);
        tasks.spawn(backup_virtual_source(
            pointer_name,
            descriptor.clone(),
            Arc::clone(&pwc),
            Arc::clone(&new_unflushed_chunks),
        ));
    }

    let mut finished = Vec::new();
    while let Some(joined) = tasks.join_next().await {
        // Outer error: the task failed to join; inner error: the backup itself failed.
        let virtual_source_return = joined??;
        finished.push(virtual_source_return);
    }

    Ok(finished)
}
/// Result of backing up one virtual source (see `backup_virtual_source`).
struct VirtualSourceReturn {
/// Name under which `pointer` is to be written.
pub pointer_name: String,
/// Pointer describing the stored stream as a single normal file; it has no parent.
pub pointer: Pointer,
/// New bloblogs produced while storing, with the location of each chunk inside them.
pub chunkmaps: Vec<(BloblogId, BTreeMap<ChunkId, BlobLocator>)>,
}
async fn backup_virtual_source(
pointer_name: String,
source: SourceDescriptor,
pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
new_unflushed_chunks: Arc<DashSet<ChunkId>>,
) -> eyre::Result<VirtualSourceReturn> {
let SourceDescriptorInner::VirtualSource(virtual_source) = &source.inner else {
bail!("bug: non-VS SDI passed to BVS");
};
let mut storing_state = StoringState::new(pwc.clone(), new_unflushed_chunks, Handle::current())
.await
.context("failed to create storing state")?;
let mut sbw = StoringBloblogWriters::default();
let ((chunkref, size), mut sbw, mut storing_state) = tokio::task::spawn_blocking({
let virtual_source = virtual_source.clone();
move || -> eyre::Result<((RecursiveChunkRef, u64), StoringBloblogWriters, StoringState)> {
let child = open_stdout_backup_process(&virtual_source.extra_args, &virtual_source.helper)?;
Ok((storing_state.store_full_stream(child.stdout.unwrap(), &mut sbw).context("Failed to store stream into Yama pile")?, sbw, storing_state))
}
}).await??;
sbw.finish_bloblogs(&mut storing_state)
.await
.context("Failed to finish bloblogs")?;
let chunkmaps = storing_state.new_bloblogs;
// Assemble and write a pointer
let uid = get_current_uid() as u16;
let gid = get_current_gid() as u16;
let tree = TreeNode::NormalFile {
mtime: SystemTime::now()
.duration_since(UNIX_EPOCH)
.map(|d| d.as_millis() as u64)
.unwrap_or(0),
ownership: FilesystemOwnership { uid, gid },
permissions: FilesystemPermissions { mode: 0o600 },
size,
content: chunkref,
};
let (uids, gids) =
create_uidgid_lookup_tables(&tree).context("failed to create uid/gid tables")?;
let VirtualSourceKind::Stdout { filename } = &virtual_source.kind;
Ok(VirtualSourceReturn {
pointer_name,
pointer: Pointer {
parent: None,
root: RootTreeNode {
name: filename.clone(),
node: tree,
},
uids,
gids,
},
chunkmaps,
})
}
pub fn open_stdout_backup_process(
extra_args: &HashMap<String, toml::Value>,
program_name: &str,
) -> eyre::Result<Child> {
let mut child = Command::new(format!("datman-helper-{}-backup", program_name))
.stdout(Stdio::piped())
.stderr(Stdio::inherit())
.stdin(Stdio::piped())
.spawn()?;
let mut child_stdin = child.stdin.as_mut().unwrap();
serde_json::to_writer(&mut child_stdin, extra_args)?;
child_stdin.flush()?;
// close stdin!
child.stdin = None;
Ok(child)
}

View File

@ -15,298 +15,281 @@ You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::fs::File;
use std::path::{Path, PathBuf};
use clap::Parser;
use env_logger::Env;
use anyhow::bail;
use bare_metrics_recorder::recording::BareMetricsRecorderCore;
use chrono::{DateTime, Local, NaiveDate, NaiveDateTime, TimeZone, Utc};
use datman::commands::backup::{backup_all_sources_to_destination, backup_source_to_destination};
use datman::commands::ilabel::interactive_labelling_session;
use datman::commands::init_descriptor;
use datman::descriptor::{load_descriptor, SourceDescriptor};
use datman::get_hostname;
use datman::remote::backup_source_requester::backup_remote_source_to_destination;
use datman::remote::backup_source_responder;
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use log::info;
use clap::{Parser, Subcommand};
use datman::backup::{backup, BackupOptions};
use datman::descriptor_config::{
load_descriptor, Descriptor, PilePathOrConnector, SourceDescriptor,
};
use datman::extract::{
extract, load_pointers_for_extraction, merge_roots_for_batch_extract, select_to_extract,
};
use eyre::{bail, Context, ContextCompat};
use std::collections::{BTreeMap, BTreeSet};
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;
use tracing::info;
use tracing_indicatif::IndicatifLayer;
use tracing_subscriber::filter::filter_fn;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;
use tracing_subscriber::Layer;
use yama::debugging::register_sigusr1_backtrace_helper;
use yama::get_hostname;
use yama::open::{open_lock_and_update_cache, open_lock_and_update_cache_with_connector};
use yama::pile_with_cache::PileWithCache;
use yama_wormfile::boxed::BoxedWormFileProvider;
pub const FAILURE_SYMBOL_OBNOXIOUS_FLASHING: &str = "\x1b[5m\x1b[31m⚠ \x1b[25m\x1b[22m";
pub const BOLD: &str = "\x1b[1m";
pub const BOLD_OFF: &str = "\x1b[22m";
pub const WHITE: &str = "\x1b[37m";
pub const RED: &str = "\x1b[31m";
pub const GREEN: &str = "\x1b[32m";
/// A pointer name, optionally qualified with the path of the pile it belongs to.
/// Parsed from `pile_path:pointer` or just `pointer` by its `FromStr` implementation.
#[derive(Clone, Debug)]
pub struct PileAndPointer {
/// Path of the pile; `None` when the input contained no `:`.
pub pile_path: Option<PathBuf>,
/// The validated pointer name.
pub pointer: PointerName,
}
#[derive(Parser)]
/// A pointer name. Values built through the `FromStr` implementation contain only
/// alphanumeric characters and `_`, `+`, `-`, `:`.
#[derive(Clone, Debug)]
#[repr(transparent)]
pub struct PointerName(String);
impl FromStr for PointerName {
    type Err = eyre::Error;

    /// Accepts `s` as a pointer name if every character is alphanumeric or one of
    /// `_`, `+`, `-`, `:`. The empty string is accepted.
    ///
    /// # Errors
    /// Fails with "Bad pointer name" if `s` contains any other character.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let is_permitted = |c: char| c.is_alphanumeric() || matches!(c, '_' | '+' | '-' | ':');

        if s.chars().any(|c| !is_permitted(c)) {
            bail!("Bad pointer name: {s:?}");
        }

        Ok(PointerName(s.to_owned()))
    }
}
impl FromStr for PileAndPointer {
    type Err = eyre::Error;

    /// Parses `pile_path:pointer`, or just `pointer` when `s` contains no `:`.
    /// The split happens at the first `:`; everything after it must be a valid
    /// [`PointerName`].
    ///
    /// # Errors
    /// Fails if the pointer part is not a valid pointer name.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let (pile_path, pointer_text) = match s.split_once(":") {
            Some((pile, pointer)) => (Some(PathBuf::from(pile)), pointer),
            None => (None, s),
        };
        let pointer = PointerName::from_str(pointer_text)?;

        Ok(PileAndPointer { pile_path, pointer })
    }
}
/// A pointer name with an optional pile path and a sub-tree inside the pointer.
/// Parsed from `[pile_path:]pointer[/sub_tree]` by its `FromStr` implementation.
#[derive(Clone, Debug)]
pub struct PileAndPointerWithSubTree {
/// Path of the pile; `None` when the input contained no `:`.
pub pile_path: Option<PathBuf>,
/// The validated pointer name (the part before the first `/`).
pub pointer: PointerName,
// TODO how to represent...
/// Everything after the first `/`; empty when the input contained no `/`.
pub sub_tree: String,
}
impl FromStr for PileAndPointerWithSubTree {
    type Err = eyre::Error;

    /// Parses `[pile_path:]pointer[/sub_tree]`.
    ///
    /// The input is first split at the first `:` into an optional pile path and the rest;
    /// the rest is then split at its first `/` into the pointer name and the sub-tree.
    /// Without a `/`, the sub-tree is the empty string.
    ///
    /// # Errors
    /// Fails if the pointer part is not a valid pointer name.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let (pile_path, remainder) = match s.split_once(":") {
            Some((pile, rest)) => (Some(PathBuf::from(pile)), rest),
            None => (None, s),
        };

        let (pointer_text, sub_tree_text) = remainder.split_once('/').unwrap_or((remainder, ""));
        let pointer = PointerName::from_str(pointer_text)?;

        Ok(PileAndPointerWithSubTree {
            pile_path,
            pointer,
            sub_tree: sub_tree_text.to_owned(),
        })
    }
}
// Top-level command-line arguments, parsed with clap's derive API.
// (Plain `//` comments are used so that nothing is added to clap's generated help.)
#[derive(Parser, Clone, Debug)]
pub struct DatmanArgs {
// Path of the descriptor file: `--config`, else the DATMAN_CONFIG environment
// variable, else "datman.toml".
#[arg(long, env = "DATMAN_CONFIG", default_value = "datman.toml")]
config: PathBuf,
// The subcommand to run.
#[command(subcommand)]
command: DatmanCommand,
}
#[derive(Subcommand, Clone, Debug)]
pub enum DatmanCommand {
/// Initialise a datman descriptor in this directory.
Init {},
///
Status {},
#[clap(name = "ilabel")]
InteractiveLabelling {
/// Name of the source to label.
source_name: String,
},
#[clap(name = "ibrowse")]
InteractiveBrowsing {
/// Name of the source to browse.
source_name: String,
},
/// Back up a source locally or over the network.
BackupOne {
/// Name of the source to back up.
source_name: String,
/// Name of the destination to back up to.
destination_name: String,
pile_name: String,
#[clap(flatten)]
options: BackupOptions,
},
BackupAll {
/// Name of the remote to back up.
/// Special value 'self' means 'this host only'.
/// Special value 'all' means 'all hosts'.
remote_name: String,
/// Name of the destination to back up to.
destination_name: String,
},
Extract {
/// Name of the 'source' to extract
/// Omit for 'all'.
#[clap(short)]
source_name: Option<String>,
/// If specified, will get the first backup after this date.
#[clap(long)]
after: Option<HumanDateTime>,
/// If specified, will get the last backup before this date. The default behaviour is to get the latest.
#[clap(long)]
before: Option<HumanDateTime>,
/// If not specified, time-restricted extractions that don't have a pointer for every source
/// will instead lead to an error.
#[clap(long)]
accept_partial: bool, // TODO unimplemented.
/// Name of the pile to extract from
pile_name: String,
/// Place to extract to.
destination: PathBuf,
/// Skip applying metadata. Might be needed to extract without superuser privileges.
#[clap(long)]
skip_metadata: bool,
#[clap(flatten)]
options: BackupOptions,
},
#[clap(name = "_backup_source_responder")]
InternalBackupSourceResponder,
ExtractOne {
pile_name: String,
source_name: String,
destination: PathBuf,
},
ExtractAll {
pile_name: String,
destination: PathBuf,
},
}
pub struct HumanDateTime(pub DateTime<Local>);
const PROGRESS_SPANS: &'static [&'static str] = &[
"store_file",
"storing",
"unpack_files",
"expand_chunkrefs",
"extract_files",
"check_all_chunks",
];
impl FromStr for HumanDateTime {
type Err = anyhow::Error;
#[tokio::main]
pub async fn main() -> eyre::Result<()> {
let indicatif_layer = IndicatifLayer::new();
let stderr_writer = indicatif_layer.get_stderr_writer();
let indicatif_layer = indicatif_layer.with_filter(filter_fn(|span_metadata| {
(span_metadata.target().starts_with("yama") || span_metadata.target().starts_with("datman"))
&& PROGRESS_SPANS.contains(&span_metadata.name())
}));
fn from_str(s: &str) -> Result<Self, Self::Err> {
if let Ok(date_only) = NaiveDate::parse_from_str(s, "%Y-%m-%d") {
let local_date = chrono::offset::Local.from_local_date(&date_only).unwrap();
let local_datetime = local_date.and_hms(0, 0, 0);
Ok(HumanDateTime(local_datetime))
} else if let Ok(date_and_time) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") {
let local_datetime = chrono::offset::Local
.from_local_datetime(&date_and_time)
.unwrap();
Ok(HumanDateTime(local_datetime))
} else if let Ok(date_and_time) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") {
let local_datetime = chrono::offset::Local
.from_local_datetime(&date_and_time)
.unwrap();
Ok(HumanDateTime(local_datetime))
} else {
bail!("Couldn't parse using either format. Use one of: 2021-05-16 OR 2021-05-16T17:42:14 OR 2021-05-16 17:42:14");
}
}
}
tracing_subscriber::registry()
.with(
tracing_subscriber::EnvFilter::try_from_default_env().unwrap_or_else(|_| {
"sqlx=warn,yama=debug,datman=debug,yama_wormfile_sftp=debug,info".into()
}),
)
.with(tracing_subscriber::fmt::layer().with_writer(stderr_writer))
.with(indicatif_layer)
.init();
fn with_obvious_successfail_message<R>(result: anyhow::Result<R>) -> anyhow::Result<R> {
match &result {
Ok(_) => {
eprintln!("Operation {}successful{}.", GREEN, WHITE);
}
Err(error) => {
eprintln!("{:?}", error);
eprintln!(
"{}{}Operation {}{}FAILED{}!{}",
FAILURE_SYMBOL_OBNOXIOUS_FLASHING, WHITE, RED, BOLD, WHITE, BOLD_OFF
);
}
};
result
}
register_sigusr1_backtrace_helper();
fn with_exitcode<R>(result: anyhow::Result<R>) {
match &result {
Ok(_) => {
std::process::exit(0);
}
Err(_) => {
std::process::exit(5);
}
};
}
let args: DatmanArgs = dbg!(DatmanArgs::parse());
fn main() -> anyhow::Result<()> {
env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
let descriptor = load_descriptor(&args.config)
.await
.context("failed to load Datman descriptor")?;
dbg!(&descriptor);
let now = Utc::now();
let (shard, _stopper) = BareMetricsRecorderCore::new(File::create(format!(
"/tmp/datman_{}.baremetrics",
now.format("%F_%H%M%S")
))?)
.start("datman".to_string())?;
shard.install_as_metrics_recorder()?;
let opts: DatmanCommand = DatmanCommand::parse();
match opts {
DatmanCommand::Init {} => {
init_descriptor(Path::new(".")).unwrap();
}
DatmanCommand::Status { .. } => {
unimplemented!();
}
DatmanCommand::InteractiveLabelling { source_name } => {
interactive_labelling_session(Path::new("."), source_name).unwrap();
}
DatmanCommand::InteractiveBrowsing { source_name } => {
datman::commands::ibrowse::session(Path::new("."), source_name).unwrap();
}
match args.command {
DatmanCommand::BackupOne {
source_name,
destination_name,
pile_name,
options,
} => {
let lock_name = format!("{} datman backup {:?}", get_hostname(), source_name);
let pwc = open_destination(&descriptor, &pile_name, lock_name).await?;
let source = descriptor
.sources
.get(&source_name)
.context("no source by that name")?;
let my_hostname = get_hostname();
let descriptor = load_descriptor(Path::new(".")).unwrap();
let source = &descriptor.sources[&source_name];
let destination = &descriptor.piles[&destination_name];
if &source.host != &my_hostname {
bail!(
"Current hostname is {:?}, not {:?} as expected for this source.",
my_hostname,
source.host
);
}
let mut pbar = ProgressBar::with_draw_target(0, ProgressDrawTarget::stdout_with_hz(10));
pbar.set_style(
ProgressStyle::default_bar().template(
"[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}",
),
);
pbar.set_message("storing");
let mut sources_to_backup = BTreeMap::new();
sources_to_backup.insert(source_name.clone(), source.clone());
let is_remote = if let SourceDescriptor::DirectorySource { hostname, .. } = source {
hostname != &my_hostname
} else {
false
};
let result = if is_remote {
backup_remote_source_to_destination(
source,
destination,
&descriptor,
Path::new("."),
&source_name,
&destination_name,
yama::utils::get_number_of_workers("YAMA_CHUNKERS"),
pbar,
)
} else {
backup_source_to_destination(
source,
destination,
&descriptor,
Path::new("."),
&source_name,
&destination_name,
yama::utils::get_number_of_workers("YAMA_CHUNKERS"),
&mut pbar,
)
};
with_exitcode(with_obvious_successfail_message(result))
backup(pwc, sources_to_backup, &options).await?;
}
DatmanCommand::BackupAll {
remote_name,
destination_name,
} => {
let descriptor = load_descriptor(Path::new(".")).unwrap();
let destination = &descriptor.piles[&destination_name];
DatmanCommand::BackupAll { pile_name, options } => {
let lock_name = format!("{} datman backupall", get_hostname());
let pwc = open_destination(&descriptor, &pile_name, lock_name).await?;
let mut pbar = ProgressBar::with_draw_target(0, ProgressDrawTarget::stdout_with_hz(10));
pbar.set_style(
ProgressStyle::default_bar().template(
"[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}",
),
let my_hostname = get_hostname();
let sources_to_backup: BTreeMap<String, SourceDescriptor> = descriptor
.sources
.clone()
.into_iter()
.filter(|(_, source)| &source.host == &my_hostname)
.collect();
if sources_to_backup.len() == 0 {
bail!(
"No sources to back up! The current hostname is {:?}; is it correct?",
my_hostname
);
}
info!(
"Backing up the following {} sources: {:?}",
sources_to_backup.len(),
sources_to_backup.keys().collect::<Vec<_>>()
);
pbar.set_message("storing");
backup_all_sources_to_destination(
destination,
&descriptor,
Path::new("."),
&destination_name,
yama::utils::get_number_of_workers("YAMA_CHUNKERS"),
&mut pbar,
remote_name,
)
.unwrap();
backup(pwc, sources_to_backup, &options).await?;
}
DatmanCommand::Extract {
DatmanCommand::ExtractOne {
pile_name,
source_name,
after,
before,
accept_partial,
destination,
} => {
let lock_name = format!("{} datman extract {:?}", get_hostname(), source_name);
let pwc = open_destination(&descriptor, &pile_name, lock_name).await?;
let mut sources = BTreeSet::new();
sources.insert(source_name.clone());
let selected = select_to_extract(&pwc, sources, None, None, false).await?;
let mut for_extraction = load_pointers_for_extraction(pwc.clone(), selected).await?;
assert_eq!(for_extraction.len(), 1);
let root_node = for_extraction.remove(&source_name).unwrap();
extract(pwc, root_node.node, &destination).await?;
}
DatmanCommand::ExtractAll {
pile_name,
destination,
skip_metadata,
} => {
if !accept_partial {
bail!("Specify --accept-partial until running without it is supported.");
}
let lock_name = format!("{} datman extractall", get_hostname());
let pwc = open_destination(&descriptor, &pile_name, lock_name).await?;
if after.is_some() && before.is_some() {
bail!("Can't specify both before and after!");
}
let before = before.map(|dt| dt.0.with_timezone(&Utc));
let after = after.map(|dt| dt.0.with_timezone(&Utc));
datman::commands::extract::extract(
&destination,
Path::new("."),
source_name.as_ref().map(|x| x.as_ref()),
&pile_name,
before.into(),
after.into(),
!skip_metadata,
!skip_metadata,
!skip_metadata,
yama::utils::get_number_of_workers("YAMA_EXTRACTORS"),
)?;
}
DatmanCommand::InternalBackupSourceResponder => {
info!("Datman responder at {:?}", std::env::current_exe()?);
backup_source_responder::handler_stdio()?;
let sources = descriptor.sources.keys().cloned().collect();
let selected = select_to_extract(&pwc, sources, None, None, false).await?;
let for_extraction = load_pointers_for_extraction(pwc.clone(), selected).await?;
let merged_node = merge_roots_for_batch_extract(for_extraction);
extract(pwc, merged_node, &destination).await?;
}
}
Ok(())
}
/// Opens the pile called `pile_name` from `descriptor`, using `lock_name` for the lock.
///
/// A pile configured as a plain path is opened with `open_lock_and_update_cache`; a pile
/// configured as a connector is opened with `open_lock_and_update_cache_with_connector`.
///
/// # Errors
/// Fails with "no pile by that name" if the descriptor has no such pile, or with whatever
/// error opening the pile produces.
async fn open_destination(
    descriptor: &Descriptor,
    pile_name: &str,
    lock_name: String,
) -> eyre::Result<Arc<PileWithCache<BoxedWormFileProvider>>> {
    let pile_location = descriptor
        .piles
        .get(pile_name)
        .context("no pile by that name")?;

    match pile_location {
        PilePathOrConnector::PileConnector { scheme, yamakey } => {
            open_lock_and_update_cache_with_connector(scheme, pile_name, yamakey, lock_name).await
        }
        PilePathOrConnector::PilePath(pile_path) => {
            open_lock_and_update_cache(pile_path.clone(), lock_name).await
        }
    }
}

View File

@ -1,57 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::collections::HashMap;
use std::fs::File;
use std::io::Write;
use std::path::Path;
use crate::descriptor::{Descriptor, SourceDescriptor};
pub mod backup;
pub mod extract;
pub mod ibrowse;
pub mod ilabel;
/// Initialises a fresh Datman descriptor directory at `path`.
///
/// Creates `path` (and any missing parents), a `labelling` subdirectory, and a `datman.toml`
/// holding a descriptor with the three starter labels and no sources, piles or remote hosts.
///
/// # Errors
/// Returns any I/O error from creating the directories or the file (including `labelling`
/// already existing), and any TOML serialisation error.
pub fn init_descriptor(path: &Path) -> anyhow::Result<()> {
    std::fs::create_dir_all(path)?;
    std::fs::create_dir(path.join("labelling"))?;

    // The file is created before serialising, matching the established order of side effects.
    let mut descriptor_file = File::create(path.join("datman.toml"))?;

    let no_sources: HashMap<String, SourceDescriptor> = HashMap::new();
    let starter_labels: Vec<String> = ["pocket", "precious", "bulky"]
        .iter()
        .map(|label| label.to_string())
        .collect();

    let initial_descriptor = Descriptor {
        labels: starter_labels,
        sources: no_sources,
        piles: Default::default(),
        remote_hosts: Default::default(),
    };
    let serialised = toml::to_vec(&initial_descriptor)?;
    descriptor_file.write_all(&serialised)?;
    Ok(())
}

View File

@ -1,378 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use crate::descriptor::{Descriptor, DestPileDescriptor, SourceDescriptor, VirtualSourceKind};
use crate::get_hostname;
use crate::labelling::{label_node, load_labelling_rules, str_to_label, Label, State};
use crate::tree::{scan, FileTree, FileTree1};
use anyhow::{anyhow, bail};
use arc_interner::ArcIntern;
use chrono::{DateTime, Utc};
use log::{info, warn};
use std::collections::{HashMap, HashSet};
use std::fmt::Debug;
use std::io::Write;
use std::path::Path;
use std::process::{Child, Command, Stdio};
use std::sync::Arc;
use yama::chunking::SENSIBLE_THRESHOLD;
use yama::commands::{load_pile_descriptor, open_pile, store_tree_node};
use yama::definitions::{
FilesystemOwnership, FilesystemPermissions, PointerData, RecursiveChunkRef, RootTreeNode,
TreeNode,
};
use yama::progress::ProgressTracker;
/// `strftime`-style format of the timestamp part of a pointer name
/// (used by `get_pointer_name_at` when formatting the backup time).
pub const POINTER_DATETIME_FORMAT: &'static str = "%F_%T";
/// Character placed between the source name and the timestamp in a pointer name.
pub const POINTER_FIELD_SEPARATOR: char = '+';
/// Builds the pointer name for a backup of `source_name` taken at `datetime`:
/// the source name, then `POINTER_FIELD_SEPARATOR`, then the timestamp rendered with
/// `POINTER_DATETIME_FORMAT`.
pub fn get_pointer_name_at(source_name: &str, datetime: DateTime<Utc>) -> String {
    let timestamp = datetime.format(POINTER_DATETIME_FORMAT).to_string();

    let mut pointer_name = String::from(source_name);
    pointer_name.push(POINTER_FIELD_SEPARATOR);
    pointer_name.push_str(&timestamp);
    pointer_name
}
/// Spawns the helper program `datman-helper-<program_name>-backup` for a stdout-style backup.
///
/// The helper's stdin and stdout are piped and its stderr is inherited. `extra_args` is written
/// to the helper's stdin as JSON, after which stdin is closed. The returned `Child` still has
/// its piped stdout attached for the caller to read.
///
/// # Errors
/// Fails if the helper cannot be spawned, or if serialising or flushing the arguments fails.
pub fn open_stdout_backup_process(
    extra_args: &HashMap<String, toml::Value>,
    program_name: &str,
) -> anyhow::Result<Child> {
    let helper_binary = format!("datman-helper-{}-backup", program_name);
    let mut helper = Command::new(helper_binary)
        .stdout(Stdio::piped())
        .stderr(Stdio::inherit())
        .stdin(Stdio::piped())
        .spawn()?;

    // Take ownership of the pipe so that dropping it below closes the helper's stdin.
    let mut helper_stdin = helper.stdin.take().unwrap();
    serde_json::to_writer(&mut helper_stdin, extra_args)?;
    helper_stdin.flush()?;
    // close stdin!
    drop(helper_stdin);

    Ok(helper)
}
/// Labels a scanned tree with the source's labelling rules, filters it down to what `dest`
/// wants, and converts the remainder into a Yama `TreeNode`.
///
/// * `tree` - the scanned file tree; its metadata is replaced with `None` before labelling.
/// * `descriptor` - supplies the list of known labels.
/// * `desc_path`, `source_name` - passed to `load_labelling_rules` to locate the rules.
/// * `dest` - its `included_labels` decide which labelled nodes are kept.
///
/// Returns `Ok(None)` when `filter_inclusive` reports `false` (logged as "Empty filter"),
/// otherwise `Ok(Some(root))`. A warning is logged if any unlabelled node was kept.
pub fn label_filter_and_convert(
tree: FileTree1<()>,
descriptor: &Descriptor,
desc_path: &Path,
source_name: &str,
dest: &DestPileDescriptor,
) -> anyhow::Result<Option<TreeNode>> {
info!("Labelling.");
let mut tree = tree.replace_meta(&None);
let labels = descriptor
.labels
.iter()
.map(|l| Label(ArcIntern::new(l.clone())))
.collect();
let rules = load_labelling_rules(desc_path, source_name)?;
label_node("".to_owned(), None, &mut tree, &labels, &rules)?;
let included_labels: HashSet<Label> = dest.included_labels.iter().map(str_to_label).collect();
info!("Filtering.");
// Set by the filter predicate below so that a single warning can be emitted afterwards.
let mut unlabelled_included = false;
// filter_inclusive includes directories that must exist for the sake of their children.
if !tree.filter_inclusive(&mut |node| {
match node.get_metadata().unwrap() {
None => {
// unlabelled -- include by default for safety
unlabelled_included = true;
true
}
Some(State::Excluded) => {
// don't include excluded things
false
}
Some(State::Labelled(label)) => {
// include things only if we want the label
included_labels.contains(&label)
}
Some(State::Split) => {
// no point retaining this directory if its children aren't going to be!
assert!(
node.is_dir(),
"Non-directories should not be labelled for Split!"
);
false
}
}
}) {
info!("Empty filter. Stopping.");
return Ok(None);
}
if unlabelled_included {
warn!("Unlabelled nodes. They have been included for safety, but you should consider running\n\t'datman ilabel {}'\nat some point to assign labels.", source_name);
}
let root = convert_filetree_to_yamatree(&tree);
Ok(Some(root))
}
pub fn backup_source_to_destination<PT: ProgressTracker>(
source: &SourceDescriptor,
dest: &DestPileDescriptor,
descriptor: &Descriptor,
desc_path: &Path,
source_name: &str,
dest_name: &str,
num_workers: u8,
progress_bar: &mut PT,
) -> anyhow::Result<()> {
match source {
SourceDescriptor::DirectorySource {
hostname: _,
directory,
} => {
info!("Looking to backup {} to {}", source_name, dest_name);
info!("Scanning.");
let tree = scan(directory)?.ok_or_else(|| anyhow!("Source does not exist."))?;
let absolute_source_path = desc_path.join(directory);
let absolute_dest_path = desc_path.join(&dest.path);
let pile_descriptor = load_pile_descriptor(&absolute_dest_path)?;
let pile = open_pile(&absolute_dest_path, &pile_descriptor)?;
let root = if let Some(root) =
label_filter_and_convert(tree, descriptor, desc_path, source_name, dest)?
{
root
} else {
return Ok(());
};
let pointer_name = get_pointer_name_at(&source_name, Utc::now());
if pile.read_pointer(pointer_name.as_str())?.is_some() {
bail!(
"Pointer by name {:?} already exists; refusing to overwrite.",
pointer_name
);
}
info!("Will write as pointer {:?}.", pointer_name);
info!("Searching for suitable parents.");
let mut parent: Option<String> = None;
let prefix = format!("{}+", source_name);
for pointer in pile.list_pointers()?.iter() {
if pointer.starts_with(&prefix) {
match parent.as_ref() {
None => {
parent = Some(pointer.to_owned());
}
Some(cur_parent) => {
if cur_parent < pointer {
parent = Some(pointer.to_owned());
}
}
}
}
}
match parent.as_ref() {
Some(parent) => {
info!("Using parent: {}", parent);
}
None => {
info!("No suitable parent found.");
}
}
info!("Storing using yama.");
yama::operations::storing::store_fully(
Arc::new(pile),
&absolute_source_path,
&pointer_name,
root,
parent,
num_workers,
progress_bar,
true,
)?;
info!("Stored!");
}
SourceDescriptor::VirtualSource {
helper,
label,
kind: VirtualSourceKind::Stdout { filename },
extra_args,
} => {
if !dest.included_labels.contains(label) {
info!("Skipping because the source's label is not included in this destination!");
return Ok(());
}
info!("Starting up process and writing to yama store.");
let absolute_dest_path = desc_path.join(&dest.path);
let pile_descriptor = load_pile_descriptor(&absolute_dest_path)?;
let pile = open_pile(&absolute_dest_path, &pile_descriptor)?;
let pointer_name = get_pointer_name_at(&source_name, Utc::now());
if pile.read_pointer(pointer_name.as_str())?.is_some() {
bail!(
"Pointer by name {:?} already exists; refusing to overwrite.",
pointer_name
);
}
info!("Will write as pointer {:?}.", pointer_name);
let mut chunker = yama::chunking::RecursiveChunker::new(SENSIBLE_THRESHOLD, &pile);
let mut process = open_stdout_backup_process(extra_args, helper)?;
info!("Storing. No progress bar is available for this style of backup yet.");
// this bit does all the magic.
// TODO(feature): progress bar for
std::io::copy(process.stdout.as_mut().unwrap(), &mut chunker)?;
let exit_status = process.wait()?;
if !exit_status.success() {
bail!(
"The process was not successful (exit code {}). Exiting.",
exit_status.code().unwrap()
);
}
let data_chunk_ref = chunker.finish()?;
eprintln!("Stored data! Now writing a pointer...");
let root = TreeNode::NormalFile {
mtime: Utc::now().timestamp_millis() as u64,
ownership: FilesystemOwnership {
uid: u16::MAX,
gid: u16::MAX,
},
permissions: FilesystemPermissions { mode: 0o600 },
content: data_chunk_ref,
};
// very important: store the pointer!
let pointer_chunk_ref = store_tree_node(
&pile,
&RootTreeNode {
name: filename.to_owned(),
node: root,
},
)?;
let pointer_data = PointerData {
chunk_ref: pointer_chunk_ref,
parent_pointer: None,
uid_lookup: Default::default(),
gid_lookup: Default::default(),
};
pile.write_pointer(&pointer_name, &pointer_data)?;
pile.flush()?;
eprintln!("Pointer saved!");
}
}
Ok(())
}
/// Recursively converts a Datman `FileTree` into the equivalent Yama `TreeNode`, discarding
/// the per-node metadata.
///
/// Normal files get a placeholder `content` reference (default chunk id, depth 0).
///
/// # Panics
/// Panics if the tree contains a `FileTree::Other` node.
pub fn convert_filetree_to_yamatree<A, B, C, D>(
    filetree: &FileTree<A, B, C, D>,
) -> yama::definitions::TreeNode
where
    A: Debug + Clone + Eq + PartialEq,
    B: Debug + Clone + Eq + PartialEq,
    C: Debug + Clone + Eq + PartialEq,
    D: Debug + Clone + Eq + PartialEq,
{
    match filetree {
        FileTree::Other(_) => {
            panic!("Shouldn't be any Others in the tree.");
        }
        FileTree::SymbolicLink {
            ownership: link_owner,
            target: link_target,
            meta: _,
        } => TreeNode::SymbolicLink {
            ownership: *link_owner,
            target: link_target.clone(),
        },
        FileTree::Directory {
            ownership: dir_owner,
            permissions: dir_perms,
            children: entries,
            meta: _,
        } => {
            // Convert every entry first, keeping the entry names as keys.
            let converted_entries = entries
                .iter()
                .map(|(entry_name, entry)| {
                    (entry_name.clone(), convert_filetree_to_yamatree(entry))
                })
                .collect();
            TreeNode::Directory {
                ownership: *dir_owner,
                permissions: *dir_perms,
                children: converted_entries,
            }
        }
        FileTree::NormalFile {
            mtime: modified,
            ownership: file_owner,
            permissions: file_perms,
            meta: _,
        } => {
            let placeholder_content = RecursiveChunkRef {
                chunk_id: Default::default(),
                depth: 0,
            };
            TreeNode::NormalFile {
                mtime: *modified,
                ownership: *file_owner,
                permissions: *file_perms,
                content: placeholder_content,
            }
        }
    }
}
/// Runs `backup_source_to_destination` for every source in the descriptor, optionally limited
/// to the sources of one host.
///
/// `restricted_remote_name` selects the host: `"all"` means no restriction, `"self"`, `"this"`
/// and `"here"` mean the local hostname (`get_hostname`), and anything else is taken as a
/// literal hostname. Sources that report no hostname are never skipped.
///
/// # Errors
/// Stops at, and returns, the first error from backing up a source.
pub fn backup_all_sources_to_destination<PT: ProgressTracker>(
    dest: &DestPileDescriptor,
    descriptor: &Descriptor,
    desc_path: &Path,
    dest_name: &str,
    num_workers: u8,
    progress_bar: &mut PT,
    restricted_remote_name: String,
) -> anyhow::Result<()> {
    let host_filter: Option<String> = if restricted_remote_name == "all" {
        None
    } else if ["self", "this", "here"].contains(&restricted_remote_name.as_str()) {
        Some(get_hostname())
    } else {
        Some(restricted_remote_name.clone())
    };

    for (name, source) in &descriptor.sources {
        let on_other_host = match (source.get_remote_hostname(), host_filter.as_deref()) {
            (Some(actual_host), Some(wanted_host)) => actual_host != wanted_host,
            _ => false,
        };
        if on_other_host {
            // Skip this one, it wasn't requested right now.
            continue;
        }

        backup_source_to_destination(
            source,
            dest,
            descriptor,
            desc_path,
            name.as_str(),
            dest_name,
            num_workers,
            progress_bar,
        )?;
    }
    Ok(())
}

View File

@ -1,182 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use crate::commands::backup::POINTER_DATETIME_FORMAT;
use crate::descriptor::load_descriptor;
use anyhow::bail;
use chrono::{DateTime, NaiveDateTime, Utc};
use itertools::Itertools;
use log::{info, warn};
use std::path::Path;
use yama::commands::{load_pile_descriptor, open_pile};
use yama::pile::{Pile, RawPile};
/// Shorthand for a Yama pile backed by a boxed, dynamically-dispatched `RawPile`.
pub type PileT = Pile<Box<dyn RawPile>>;
/// Extracts backups from the pile `pile_name` into a new directory at `destination`.
///
/// * `descriptor_path` - directory holding the Datman descriptor; the pile path is joined onto it.
/// * `source_name` - extract only this source when `Some`; otherwise every source in the
///   descriptor (sources without a matching pointer are skipped with a warning).
/// * `before` / `after` - time bounds forwarded to `find_pointer_for_source`.
/// * `apply_permissions`, `apply_mtime`, `apply_ownership`, `num_workers` - forwarded to the
///   Yama extraction routine.
///
/// # Errors
/// Fails if `destination` already exists, if `pile_name` is not in the descriptor, if the one
/// explicitly requested source has no matching pointer, or on any pile or I/O error.
pub fn extract(
    destination: &Path,
    descriptor_path: &Path,
    source_name: Option<&str>,
    pile_name: &str,
    before: Option<DateTime<Utc>>,
    after: Option<DateTime<Utc>>,
    apply_permissions: bool,
    apply_mtime: bool,
    apply_ownership: bool,
    num_workers: u8,
) -> anyhow::Result<()> {
    if destination.exists() {
        bail!("For now, the destination is not allowed to exist prior to extraction.");
    }

    let descriptor = load_descriptor(descriptor_path)?;
    // Look the pile up fallibly: indexing the map would panic on an unknown pile name.
    let dest_descriptor = descriptor
        .piles
        .get(pile_name)
        .ok_or_else(|| anyhow::anyhow!("No pile named {:?} in the descriptor.", pile_name))?;
    let dest_pile_path = descriptor_path.join(&dest_descriptor.path);
    let pile_descriptor = load_pile_descriptor(&dest_pile_path)?;
    let pile = open_pile(&dest_pile_path, &pile_descriptor)?;

    let mut pointers_to_extract = Vec::new();
    match source_name {
        Some(source_name) => match find_pointer_for_source(source_name, &pile, &before, &after)? {
            None => {
                bail!(
                    "No pointer found for {:?} and it's the only one requested.",
                    source_name
                );
            }
            Some(pointer) => {
                pointers_to_extract.push(pointer);
            }
        },
        None => {
            for source in descriptor.sources.keys() {
                match find_pointer_for_source(source, &pile, &before, &after)? {
                    None => {
                        warn!("No pointer found for {:?}! Carrying on anyway...", source);
                    }
                    Some(pointer) => {
                        pointers_to_extract.push(pointer);
                    }
                }
            }
        }
    }

    // Create the destination only once the pointers are resolved, so that a failed lookup does
    // not leave behind a directory that would make the next attempt refuse to run.
    std::fs::create_dir_all(destination)?;

    extract_pointers_into_already_created_directory(
        destination,
        pointers_to_extract,
        &pile,
        apply_permissions,
        apply_mtime,
        apply_ownership,
        num_workers,
    )?;
    Ok(())
}
/// Picks the pointer in `pile` that best matches `source_name` and the optional time bounds.
///
/// Only pointer names of the form `<source>+<datetime>` are considered, where the datetime
/// parses with `POINTER_DATETIME_FORMAT` (interpreted as UTC). Names with an unparseable
/// datetime are skipped with a warning.
///
/// * `before` - when set, pointers later than this are rejected.
/// * `after` - when set, pointers earlier than this are rejected. Both bounds are applied when
///   both are given.
///
/// When `after` is set the earliest remaining pointer is chosen; otherwise the latest.
/// Returns `Ok(None)` if no pointer qualifies.
fn find_pointer_for_source(
    source_name: &str,
    pile: &PileT,
    before: &Option<DateTime<Utc>>,
    after: &Option<DateTime<Utc>>,
) -> anyhow::Result<Option<String>> {
    let mut current_choice: Option<(String, DateTime<Utc>)> = None;

    for pointer_name in pile.list_pointers()? {
        let datetime: DateTime<Utc> = match pointer_name.split('+').collect_tuple() {
            Some((pointer_source_name, encoded_datetime)) => {
                if source_name != pointer_source_name {
                    // don't accept pointers for other sources!
                    continue;
                }
                match NaiveDateTime::parse_from_str(encoded_datetime, POINTER_DATETIME_FORMAT) {
                    Ok(decoded_datetime) => DateTime::from_utc(decoded_datetime, Utc),
                    Err(e) => {
                        warn!(
                            "Ignoring {:?} because it seems to have a bad datetime: {:?}",
                            pointer_name, e
                        );
                        continue;
                    }
                }
            }
            // Not of the `<source>+<datetime>` shape: not one of our pointers.
            None => continue,
        };

        // The two bounds are independent; each one that is present must hold.
        if before.as_ref().map_or(false, |before| before < &datetime) {
            // datetime is after the 'before' time
            continue;
        }
        if after.as_ref().map_or(false, |after| &datetime < after) {
            // datetime is before the 'after' time
            continue;
        }

        let should_replace = match current_choice.as_ref() {
            None => true,
            Some((_current_name, current_datetime)) => {
                if after.is_some() {
                    // if we want the first one after a time, we want the earliest option!
                    // so replace if new datetime is earlier than current
                    &datetime < current_datetime
                } else {
                    // replace if new datetime is after current datetime
                    current_datetime < &datetime
                }
            }
        };
        if should_replace {
            current_choice = Some((pointer_name, datetime));
        }
    }

    Ok(current_choice.map(|(name, _)| name))
}
/// Extracts each pointer in `pointers` into its own subdirectory of `target`, named after the
/// pointer.
///
/// `target` must already exist; each subdirectory is created here with `create_dir`. The flags
/// and `num_workers` are forwarded to `yama::operations::extracting::extract_from_pointer_name`.
///
/// # Errors
/// Stops at the first subdirectory that cannot be created or pointer that fails to extract.
fn extract_pointers_into_already_created_directory(
    target: &Path,
    pointers: Vec<String>,
    pile: &PileT,
    apply_permissions: bool,
    apply_mtime: bool,
    apply_ownership: bool,
    num_workers: u8,
) -> anyhow::Result<()> {
    pointers
        .into_iter()
        .try_for_each(|pointer| -> anyhow::Result<()> {
            info!("Extracting {:?} now.", pointer);

            let subdirectory = target.join(&pointer);
            std::fs::create_dir(&subdirectory)?;

            yama::operations::extracting::extract_from_pointer_name(
                &subdirectory,
                &pointer,
                pile,
                true,
                num_workers,
                apply_permissions,
                apply_mtime,
                apply_ownership,
            )?;
            Ok(())
        })
}

View File

@ -1,248 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::path::Path;
use anyhow::{anyhow, bail};
use crate::descriptor::{load_descriptor, SourceDescriptor};
use crate::labelling::{
label_node, load_labelling_rules, save_labelling_rules, GlobRule, Label, State,
};
use crate::tree::{scan, FileTree, FileTree1};
use arc_interner::ArcIntern;
use humansize::FileSize;
use std::io::{stdin, stdout, Write};
/// Fills in the `meta` of every node under `node` with its size in bytes and returns the size
/// of `node` itself.
///
/// * Normal files: the length reported by `std::fs::metadata` for `real_path`.
/// * Directories: the sum of their children's sizes.
/// * Symbolic links: the length of the link target.
/// * `Other` nodes: counted as 0 and left untouched.
///
/// # Errors
/// Fails if the metadata of any normal file cannot be read.
pub fn calculate_sizes(node: &mut FileTree1<u64>, real_path: &Path) -> anyhow::Result<u64> {
    match node {
        FileTree::Other(_) => Ok(0),
        FileTree::SymbolicLink { meta, target, .. } => {
            let link_size = target.len() as u64;
            *meta = link_size;
            Ok(link_size)
        }
        FileTree::Directory { children, meta, .. } => {
            let dir_size = children.iter_mut().try_fold(
                0u64,
                |running_total, (child_name, child)| -> anyhow::Result<u64> {
                    let child_size = calculate_sizes(child, &real_path.join(child_name))?;
                    Ok(running_total + child_size)
                },
            )?;
            *meta = dir_size;
            Ok(dir_size)
        }
        FileTree::NormalFile { meta, .. } => {
            let file_size = std::fs::metadata(real_path)?.len();
            *meta = file_size;
            Ok(file_size)
        }
    }
}
/// Maps a user-typed outcome to a labelling `State`: `"s"` is `Split`, `"x"` is `Excluded`,
/// and any other text is taken as the name of a label.
pub fn string_to_outcome(s: &str) -> State {
    if s == "s" {
        return State::Split;
    }
    if s == "x" {
        return State::Excluded;
    }
    let label_name = String::from(s);
    State::Labelled(Label(ArcIntern::new(label_name)))
}
/// Parses the argument at `position` of a whitespace-split command as a child index.
/// Returns `None` when the argument is missing or is not a valid `usize`.
fn parse_index_arg(args: &[&str], position: usize) -> Option<usize> {
    args.get(position).and_then(|raw| raw.parse::<usize>().ok())
}

/// Runs an interactive, line-based browsing session over a directory source, letting the user
/// inspect sizes and labels and add labelling rules.
///
/// Commands, one per line read from stdin:
/// * `<index>` - descend into the child directory with that index.
/// * `x <index>` - exclude the child.
/// * `s <index>` - mark the child as split.
/// * `p <outcome> <glob>` - add a glob rule; the outcome is parsed by `string_to_outcome`.
/// * `q` - quit.
/// * `<anything else> <label> <index>` - label the child.
///
/// Malformed commands print a message and re-display the listing rather than aborting. The
/// rules are saved with `save_labelling_rules` when the loop ends (on `q`, on end of input, or
/// when the current path is no longer a directory in the scan).
///
/// # Errors
/// Fails if the descriptor cannot be loaded, the source is unknown or virtual, the scan yields
/// nothing, or on any I/O, labelling or rule-saving error.
pub fn session(path: &Path, source_name: String) -> anyhow::Result<()> {
    let mut current_path = String::from("");

    let descriptor = load_descriptor(path)?;
    let source_descriptor = descriptor
        .sources
        .get(&source_name)
        .ok_or_else(|| anyhow!("Could not find source {:?}!", source_name))?;
    let directory = match source_descriptor {
        SourceDescriptor::DirectorySource { directory, .. } => directory,
        SourceDescriptor::VirtualSource { .. } => {
            bail!("Cannot browse virtual source.");
        }
    };

    println!("Scanning source; this might take a little while...");
    let mut dir_scan: FileTree1<Option<State>> = scan(directory)?
        .ok_or_else(|| anyhow!("Empty source."))?
        .replace_meta(&None);
    // A second tree of the same shape carries the sizes instead of the labelling state.
    let mut size_dir_scan: FileTree1<u64> = dir_scan.replace_meta(&0);
    calculate_sizes(&mut size_dir_scan, directory)?;

    let mut rules = load_labelling_rules(path, &source_name)?;
    let labels = descriptor
        .labels
        .iter()
        .map(|l| Label(ArcIntern::new(l.clone())))
        .collect();
    label_node("".to_owned(), None, &mut dir_scan, &labels, &rules)?;

    loop {
        println!("---------------------------------------------------------");
        println!("| {}", current_path);
        println!("----");

        let children = match dir_scan.get_by_path(&current_path) {
            Some(FileTree::Directory { children, .. }) => children,
            _ => break,
        };
        let size_node = size_dir_scan.get_by_path(&current_path).unwrap();

        for (idx, (child_name, child)) in children.iter().enumerate() {
            let size_child = size_node
                .get_by_path(child_name)
                .unwrap()
                .get_metadata()
                .unwrap();
            if child.is_dir() {
                println!("{}/", child_name);
            } else if child.is_symlink() {
                println!("{} (symlink)", child_name);
            } else {
                println!("{}", child_name);
            }
            print!("\t[{:3}] ", idx);
            match child.get_metadata().unwrap() {
                None => {
                    print!("unlabelled ");
                }
                Some(state) => match state {
                    State::Labelled(label) => {
                        print!("l:{} ", label.0.as_ref());
                    }
                    State::Split => {
                        print!("split ");
                    }
                    State::Excluded => {
                        print!("excluded ");
                    }
                },
            }
            println!(
                "({})",
                size_child
                    .file_size(humansize::file_size_opts::BINARY)
                    .unwrap()
            );
        }

        print!("\n> ");
        stdout().flush()?;

        let mut next_command = String::new();
        if stdin().read_line(&mut next_command)? == 0 {
            println!("ending.");
            break;
        }

        // `split` always yields at least one item, so `split[0]` cannot panic; every further
        // argument is fetched with `get` because the user may have omitted it.
        let split: Vec<&str> = next_command.trim_end_matches('\n').split(' ').collect();
        match split[0] {
            "x" => match parse_index_arg(&split, 1) {
                Some(id) => match children.iter().nth(id) {
                    Some((name, _entry)) => {
                        let entry_path = format!("{}/{}", &current_path, name);
                        rules
                            .position_based_rules
                            .insert(entry_path, State::Excluded);
                    }
                    None => eprintln!("not found."),
                },
                None => eprintln!("bad int :("),
            },
            "s" => match parse_index_arg(&split, 1) {
                Some(id) => match children.iter().nth(id) {
                    Some((name, _entry)) => {
                        let entry_path = format!("{}/{}", &current_path, name);
                        rules.position_based_rules.insert(entry_path, State::Split);
                    }
                    None => eprintln!("not found."),
                },
                None => eprintln!("bad int :("),
            },
            "p" => match (split.get(1), split.get(2)) {
                (Some(outcome), Some(pattern)) => match glob::Pattern::new(pattern) {
                    Ok(glob) => {
                        rules.glob_based_rules.push(GlobRule {
                            pattern: (*pattern).to_owned(),
                            glob,
                            outcome: string_to_outcome(outcome),
                        });
                    }
                    Err(e) => {
                        eprintln!("{:?}", e);
                    }
                },
                _ => eprintln!("usage: p <outcome> <pattern>"),
            },
            "q" => {
                break;
            }
            other => {
                // An empty line must not count as a number: `all` is vacuously true on "".
                let looks_numeric =
                    !other.is_empty() && other.chars().all(|c| c.is_ascii_digit());
                if looks_numeric {
                    match other.parse::<usize>() {
                        Ok(id) => {
                            if let Some((name, entry)) = children.iter().nth(id) {
                                if entry.is_dir() {
                                    current_path.push('/');
                                    current_path.push_str(name);
                                } else {
                                    eprintln!("not a dir.");
                                }
                            }
                        }
                        Err(_) => eprintln!("bad int :("),
                    }
                } else {
                    match (split.get(1), parse_index_arg(&split, 2)) {
                        (Some(label), Some(id)) => {
                            if let Some((name, _entry)) = children.iter().nth(id) {
                                let entry_path = format!("{}/{}", &current_path, name);
                                rules.position_based_rules.insert(
                                    entry_path,
                                    State::Labelled(Label(ArcIntern::new((*label).to_owned()))),
                                );
                            }
                        }
                        _ => eprintln!("usage: <command> <label> <index>"),
                    }
                }
            }
        }
    }

    save_labelling_rules(path, &source_name, &rules)?;
    Ok(())
}

View File

@ -1,260 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::io;
use std::io::{StdinLock, Stdout, Write};
use std::path::Path;
use arc_interner::ArcIntern;
use byteorder::ReadBytesExt;
use termion::input::TermRead;
use termion::raw::{IntoRawMode, RawTerminal};
use crate::descriptor::{load_descriptor, Descriptor, SourceDescriptor};
use crate::labelling::State::{Excluded, Labelled, Split};
use crate::labelling::{
load_labelling_rules, save_labelling_rules, GlobRule, Label, LabellingRules, State,
};
use crate::tree::{scan, FileTree, FileTree1};
use log::info;
use crate::get_hostname;
use crate::remote::backup_source_requester;
use crate::remote::backup_source_requester::connect_to_remote;
use anyhow::{anyhow, bail};
/// Interactively labels `node` and, for directories, all of its descendants.
///
/// The state for a node is decided in this order:
/// 1. a rule that `rules.apply(&path)` returns for this path;
/// 2. the state passed down in `current_state`, when its `should_inherit()` is true;
/// 3. otherwise the user is prompted, one key at a time, and the answer is recorded in `rules`.
///
/// Keys at the prompt: `1`-`9` pick the label at that position in `labels`, `x` excludes,
/// `s` splits (directories only), `p` starts pattern mode (choose an outcome, then type a glob
/// that must match the current path), and `q` returns early.
///
/// * `stdin` is read byte-by-byte; `stdout` is the raw-mode terminal, which is temporarily
///   taken out of raw mode while a glob pattern is typed.
///
/// # Errors
/// Returns any I/O error from the terminal, or an error if input ends while reading a pattern.
///
/// # Panics
/// Panics if the tree contains a `FileTree::Other` node.
// NOTE(review): 'q' only returns from the current call with `Ok(())`; a parent directory's
// loop over its children then carries on with the next child — confirm whether quitting is
// meant to stop the whole session.
pub fn interactive_label_node(
path: String,
current_state: Option<State>,
node: &mut FileTree1<Option<State>>,
labels: &Vec<Label>,
rules: &mut LabellingRules,
stdin: &mut StdinLock,
stdout: &mut RawTerminal<Stdout>,
) -> anyhow::Result<()> {
let mut next_state = current_state;
if let Some(rule_state) = rules.apply(&path) {
next_state = Some(rule_state.clone());
} else if !next_state
.as_ref()
.map(|s| s.should_inherit())
.unwrap_or(false)
{
// Nothing decides this node yet: show the path with a `_` placeholder and ask the user.
if node.is_dir() {
stdout.write_all(format!("\r{}/: _", path).as_bytes())?;
} else if node.is_symlink() {
stdout.write_all(format!("\r{} (symlink): _", path).as_bytes())?;
} else {
stdout.write_all(format!("\r{}: _", path).as_bytes())?;
}
stdout.flush()?;
// Keep reading keys until one of them produces a state; unrecognised keys are ignored.
let user_input_state = loop {
let next_char = stdin.read_u8()? as char;
if next_char >= '1' && next_char <= '9' {
// '1' maps to index 0; digits beyond the number of labels fall through and re-prompt.
let index = next_char as usize - '1' as usize;
if let Some(label) = labels.get(index) {
rules
.position_based_rules
.insert(path.clone(), Labelled(label.clone()));
// "\x08" (backspace) moves back over the `_` placeholder before printing the answer.
print!("\x08{}\r\n", label.0);
break Some(Labelled(label.clone()));
}
} else if next_char == 'x' {
rules.position_based_rules.insert(path.clone(), Excluded);
print!("\x08{}\r\n", next_char);
break Some(Excluded);
} else if next_char == 's' {
if node.is_dir() {
rules.position_based_rules.insert(path.clone(), Split);
print!("\x08{}\r\n", next_char);
break Some(Split);
} else {
// Splitting only makes sense for directories: show `!` and keep prompting.
print!("\x08!");
stdout.flush()?;
}
} else if next_char == 'p' {
print!("\x08p\r\n\tPattern mode. Choose a label or other effect to apply to the pattern matches: _");
stdout.flush()?;
// First pick the outcome that the pattern will apply.
let rule_apply_state = loop {
let next_char = stdin.read_u8()? as char;
if next_char >= '1' && next_char <= '9' {
let index = next_char as usize - '1' as usize;
if let Some(label) = labels.get(index) {
print!("\x08{}\r\n", label.0);
break Labelled(label.clone());
}
} else if next_char == 'x' {
print!("\x08{}\r\n", next_char);
break Excluded;
} else if next_char == 's' {
print!("\x08{}\r\n", next_char);
break Split;
}
};
stdout.flush()?;
// Leave raw mode so that the pattern can be typed and edited as a normal line.
stdout.suspend_raw_mode()?;
print!("\tEnter a glob pattern to match on:\n\t");
stdout.flush()?;
// Re-prompt until the pattern is valid and matches the path being asked about.
let (pattern, glob) = loop {
let pattern = stdin
.read_line()?
.ok_or_else(|| anyhow!("EOT? when reading glob pattern"))?;
match glob::Pattern::new(&pattern) {
Ok(glob) => {
if !glob.matches(&path) {
println!("Doesn't match the path in question.");
continue;
}
break (pattern, glob);
}
Err(error) => {
println!("Error: {:?}. Try again.", error);
}
}
};
stdout.activate_raw_mode()?;
rules.glob_based_rules.push(GlobRule {
pattern,
glob,
outcome: rule_apply_state.clone(),
});
break Some(rule_apply_state);
} else if next_char == 'q' {
return Ok(());
}
};
next_state = user_input_state;
}
// Record the decided state on the node and pass it down to the children of a directory.
match node {
FileTree::NormalFile { meta, .. } => {
*meta = next_state;
}
FileTree::Directory { meta, children, .. } => {
*meta = next_state.clone();
for (child_name, child) in children.iter_mut() {
let child_path = format!("{}/{}", path, child_name);
interactive_label_node(
child_path,
next_state.clone(),
child,
labels,
rules,
stdin,
stdout,
)?;
}
}
FileTree::SymbolicLink { meta, .. } => {
*meta = next_state;
}
FileTree::Other(_) => {
panic!("Other() nodes shouldn't be present here.");
}
}
Ok(())
}
/// Runs a key-driven labelling session for the directory source `source_name` and saves the
/// resulting rules.
///
/// The source tree is scanned locally when the source's hostname equals `get_hostname()`;
/// otherwise a scan is requested from the remote host through `backup_source_requester`.
/// The available keys are printed, the terminal is switched to raw mode for the duration of
/// `interactive_label_node`, and the rules are written with `save_labelling_rules`.
///
/// # Errors
/// Fails if the descriptor cannot be loaded, the source is unknown or is not a directory
/// source, the scan yields nothing, the terminal cannot be put into raw mode, or on any
/// I/O, remote or rule-saving error.
pub fn interactive_labelling_session(path: &Path, source_name: String) -> anyhow::Result<()> {
    let descriptor: Descriptor = load_descriptor(path)?;
    let source = descriptor
        .sources
        .get(&source_name)
        .ok_or_else(|| anyhow!("No source found by that name!"))?;

    if let SourceDescriptor::DirectorySource {
        hostname,
        directory,
    } = source
    {
        let my_hostname = get_hostname();
        let mut dir_scan = if &my_hostname == hostname {
            info!("Scanning source; this might take a little while...");
            scan(directory)?
                .ok_or_else(|| anyhow!("Empty source."))?
                .replace_meta(&None)
        } else {
            info!("Requesting scan over network. This might take a while.");
            let connection = connect_to_remote(&descriptor, hostname)?;
            let mut read = connection.stdout.expect("Requested stdout");
            let mut write = connection.stdin.expect("Requested stdin");

            // first start off with an introduction
            info!("Connecting to remote source...");
            backup_source_requester::introduction(&mut read, &mut write)?;

            // then request to scan
            info!("Requesting scan from remote source... (this may take some time)");
            let scan =
                backup_source_requester::scanning(&mut read, &mut write, directory.as_ref())?
                    .ok_or_else(|| anyhow!("Remote scan failed (does the directory exist?)"))?
                    .replace_meta(&None);

            backup_source_requester::quit(&mut read, &mut write)?;
            scan
        };

        let mut rules = load_labelling_rules(path, &source_name)?;
        let labels: Vec<Label> = descriptor
            .labels
            .iter()
            .map(|label| Label(ArcIntern::new(label.clone())))
            .collect();

        println!("The following label mappings are available:");
        for (idx, label) in labels.iter().enumerate() {
            println!("\tFor {:?}, press {}!", label.0.as_ref(), idx + 1);
        }
        println!("\tTo split a directory, press 's'!");
        println!("\tTo exclude an entry, press 'x'!");
        println!("\tTo apply a pattern, press 'p'...");

        // Set terminal to raw mode to allow reading stdin one key at a time.
        // This is an I/O operation that can fail (e.g. stdout is not a terminal), so the
        // error is propagated rather than unwrapped.
        let mut stdout = io::stdout().into_raw_mode()?;
        let stdin_unlocked = io::stdin();
        let mut stdin = stdin_unlocked.lock();

        interactive_label_node(
            "".to_owned(),
            None,
            &mut dir_scan,
            &labels,
            &mut rules,
            &mut stdin,
            &mut stdout,
        )?;

        // Drop the raw-mode handle before printing normally again.
        drop(stdout);
        drop(stdin);

        println!("\nLabelling completed!");

        // save rules
        save_labelling_rules(path, &source_name, &rules)?;
    } else {
        bail!("Can't do interactive labelling on a non-directory source.");
    }
    Ok(())
}

26
datman/src/datetime.rs Normal file
View File

@ -0,0 +1,26 @@
use chrono::{DateTime, Local, NaiveDate, NaiveDateTime, TimeZone};
use eyre::bail;
use std::str::FromStr;
/// A local date-time parsed from one of a few human-friendly formats (see the `FromStr` impl).
pub struct HumanDateTime(pub DateTime<Local>);

impl FromStr for HumanDateTime {
    type Err = eyre::Error;

    /// Parses `s` as a local date or date-time.
    ///
    /// Accepted formats: `2021-05-16` (midnight is assumed), `2021-05-16T17:42:14` and
    /// `2021-05-16 17:42:14`.
    ///
    /// # Errors
    /// Fails if `s` matches none of the formats, or if the resulting time does not exist in
    /// the local timezone. If the time is ambiguous, the earlier of the two instants is used.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let naive = if let Ok(date_only) = NaiveDate::parse_from_str(s, "%Y-%m-%d") {
            date_only
                .and_hms_opt(0, 0, 0)
                .expect("00:00:00 is always a valid time of day")
        } else if let Ok(date_and_time) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") {
            date_and_time
        } else if let Ok(date_and_time) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") {
            date_and_time
        } else {
            bail!("Couldn't parse using any format. Use one of: 2021-05-16 OR 2021-05-16T17:42:14 OR 2021-05-16 17:42:14");
        };

        // A local wall-clock time can map to zero, one or two instants around clock changes.
        // This is user input, so never unwrap the conversion result.
        match Local.from_local_datetime(&naive).earliest() {
            Some(local_datetime) => Ok(HumanDateTime(local_datetime)),
            None => bail!(
                "{:?} does not exist in the local timezone (it may fall in a clock-change gap)",
                s
            ),
        }
    }
}

View File

@ -1,102 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs::File;
use std::io::Read;
use std::path::{Path, PathBuf};
// TODO how do we handle?:
// - (important) yama push of one pile to another
// - backup policy stuff like 'minimum backup frequency' ... show when it's not been done
// - backup policy stuff like 'minimum on two different disks, not powered at the same time...'
/// The Datman descriptor, as deserialised from `datman.toml` by `load_descriptor`.
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct Descriptor {
/// Dataset labels
pub labels: Vec<String>,
/// Sources
pub sources: HashMap<String, SourceDescriptor>,
/// Paths to destination Yama Piles. Remote Piles need a local virtual pile to specify the layers.
pub piles: HashMap<String, DestPileDescriptor>,
/// Remote host definitions.
/// NOTE(review): presumably keyed by the hostname used in directory sources — confirm.
pub remote_hosts: HashMap<String, RemoteHostDescriptor>,
}
/// Connection details for one remote host.
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct RemoteHostDescriptor {
/// NOTE(review): presumably a `user@host` login string for reaching the host — confirm.
pub user_at_host: String,
/// Optional path to the Datman executable on the remote host.
/// NOTE(review): the behaviour when this is `None` is not visible here — confirm.
pub path_to_datman: Option<String>,
}
/// One backup source. The enum is untagged, so the variant is chosen from the fields present
/// in the serialised form.
#[derive(Clone, Serialize, Deserialize, Debug)]
#[serde(untagged)]
pub enum SourceDescriptor {
/// A directory tree on a named host.
DirectorySource {
/// The host the directory lives on (returned by `get_remote_hostname`).
hostname: String,
/// The directory to back up.
directory: PathBuf,
},
/// A source whose data is produced by a helper program.
VirtualSource {
/// The name of the helper program that will be used to do this backup.
helper: String,
/// The label that will be assigned to this source.
label: String,
/// The kind of virtual source (how it operates).
kind: VirtualSourceKind,
/// Any remaining keys of the source's table, collected here by `#[serde(flatten)]`.
#[serde(flatten)]
extra_args: HashMap<String, toml::Value>,
},
}
impl SourceDescriptor {
    /// Returns the hostname of a directory source, or `None` for a virtual source, which
    /// carries no hostname.
    pub fn get_remote_hostname(&self) -> Option<&str> {
        if let SourceDescriptor::DirectorySource { hostname, .. } = self {
            Some(hostname.as_str())
        } else {
            None
        }
    }
}
/// How a virtual source delivers its data. Untagged: the variant is chosen from the keys present.
#[derive(Clone, Serialize, Deserialize, Debug)]
#[serde(untagged)]
pub enum VirtualSourceKind {
/// The data is taken from a stream; the value is serialised under the key `stdout`.
Stdout {
/// File name associated with the captured stream.
#[serde(rename = "stdout")]
filename: String,
},
// TODO(feature) TempDir
}
/// A destination pile and the labels it accepts.
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct DestPileDescriptor {
/// Path to the pile.
/// NOTE(review): callers appear to join this onto the descriptor directory — confirm.
pub path: PathBuf,
/// Names of the labels whose data should be stored in this pile.
pub included_labels: Vec<String>,
}
/// Reads and parses the `datman.toml` file found in the directory `path`.
///
/// # Errors
/// Fails if the file cannot be opened or read, or if its contents are not a valid descriptor.
pub fn load_descriptor(path: &Path) -> anyhow::Result<Descriptor> {
    let mut toml_file = File::open(path.join("datman.toml"))?;

    let mut contents = Vec::new();
    toml_file.read_to_end(&mut contents)?;

    let descriptor: Descriptor = toml::de::from_slice(&contents)?;
    Ok(descriptor)
}

View File

@ -0,0 +1,152 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use eyre::{Context, ContextCompat};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use yama::pile_connector::PileConnectionScheme;
// TODO how do we handle?:
// - (important) yama push of one pile to another
// - backup policy stuff like 'minimum backup frequency' ... show when it's not been done
// - backup policy stuff like 'minimum on two different disks, not powered at the same time...'
/// Top-level contents of a Datman descriptor file.
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct Descriptor {
/// Backup sources, keyed by source name.
pub sources: HashMap<String, SourceDescriptor>,
/// Paths to destination Yama Piles. Remote Piles need a local virtual pile to specify the layers.
pub piles: HashMap<String, PilePathOrConnector>,
/// Optional retention policy; absent when not configured and omitted from the output when `None`.
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub retention: Option<RetentionPolicyConfig>,
}
/// A destination pile, given either as a plain path or as an inline connector.
///
/// The enum is `#[serde(untagged)]`: the variant is recognised from the shape of the value.
#[derive(Clone, Serialize, Deserialize, Debug)]
#[serde(untagged)]
pub enum PilePathOrConnector {
/// A bare path to a pile.
PilePath(PathBuf),
/// An inline connector definition.
PileConnector {
/// The connection scheme; its fields are flattened into the same table as `yamakey`.
#[serde(flatten)]
scheme: PileConnectionScheme,
/// NOTE(review): presumably the path to the key file used to open the pile — confirm.
yamakey: PathBuf,
},
}
/// Retention policy settings, one count per period.
///
/// NOTE(review): each field is presumably the number of backups to keep for that period;
/// the code that interprets these values is not visible here — confirm.
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct RetentionPolicyConfig {
/// Count for the daily period.
pub daily: u32,
/// Count for the weekly period.
pub weekly: u32,
/// Count for the monthly period.
pub monthly: u32,
/// Count for the yearly period.
pub yearly: u32,
}
/// One backup source: the host it runs on plus the source-specific settings.
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct SourceDescriptor {
/// The host to run this backup task on.
pub host: String,
/// Source-specific settings; their fields are flattened into the same table as `host`.
#[serde(flatten)]
pub inner: SourceDescriptorInner,
}
impl SourceDescriptor {
    /// Returns `true` when this source is a directory source.
    pub fn is_directory_source(&self) -> bool {
        match &self.inner {
            SourceDescriptorInner::DirectorySource { .. } => true,
            SourceDescriptorInner::VirtualSource(_) => false,
        }
    }

    /// Returns `true` when this source is a virtual source.
    pub fn is_virtual_source(&self) -> bool {
        match &self.inner {
            SourceDescriptorInner::DirectorySource { .. } => false,
            SourceDescriptorInner::VirtualSource(_) => true,
        }
    }
}
/// The source-specific part of a [`SourceDescriptor`].
///
/// The enum is `#[serde(untagged)]`: the variant is recognised from the fields that are present.
#[derive(Clone, Serialize, Deserialize, Debug)]
#[serde(untagged)]
pub enum SourceDescriptorInner {
/// A source that is a directory on disk.
DirectorySource {
/// The directory of this source.
path: PathBuf,
/// Defaults to `false` when omitted.
/// NOTE(review): presumably allows the scan to descend into other mounted filesystems — confirm.
#[serde(default)]
cross_filesystems: bool,
/// TODO Paths to ignore
/// Defaults to an empty list when omitted.
#[serde(default)]
ignore: Vec<String>,
},
/// A source whose data is produced by a helper program.
VirtualSource(VirtualSource),
}
/// Settings for a source whose data comes from a helper program.
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct VirtualSource {
/// The name of the helper program that will be used to do this backup.
pub helper: String,
/// The kind of virtual source (how it operates).
pub kind: VirtualSourceKind,
/// Every remaining key of the table, collected verbatim.
/// NOTE(review): presumably handed on to the helper — confirm.
#[serde(flatten)]
pub extra_args: HashMap<String, toml::Value>,
}
/// The way a virtual source operates.
///
/// The enum is `#[serde(untagged)]`: the variant is recognised from the fields that are present.
#[derive(Clone, Serialize, Deserialize, Debug)]
#[serde(untagged)]
pub enum VirtualSourceKind {
/// Recognised by the presence of a `stdout` key.
Stdout {
/// (De)serialised under the key `stdout`.
/// NOTE(review): presumably the file name given to the helper's captured standard output — confirm.
#[serde(rename = "stdout")]
filename: String,
},
// TODO(feature) TempDir
}
/// Loads a descriptor and resolves relative paths contained within.
pub async fn load_descriptor(path: &Path) -> eyre::Result<Descriptor> {
let text = tokio::fs::read_to_string(path).await?;
let mut descriptor: Descriptor = toml::de::from_str(&text)?;
let dir = path
.parent()
.context("there must be a parent path for the descriptor file")?;
// Absolutise pile paths
for (_, pile_path_or_connector) in descriptor.piles.iter_mut() {
match pile_path_or_connector {
PilePathOrConnector::PilePath(pile_path) => {
*pile_path = dir
.join(&*pile_path)
.canonicalize()
.context("Failed to canonicalise path in descriptor")?;
}
PilePathOrConnector::PileConnector {
scheme:
PileConnectionScheme::Local {
directory: pile_path,
},
..
} => {
*pile_path = dir
.join(&*pile_path)
.canonicalize()
.context("Failed to canonicalise path in descriptor")?;
}
PilePathOrConnector::PileConnector { .. } => { /* nop */ }
}
}
Ok(descriptor)
}

183
datman/src/extract.rs Normal file
View File

@ -0,0 +1,183 @@
use crate::datetime::HumanDateTime;
use crate::pointer_names::split_pointer_name;
use chrono::{DateTime, Utc};
use eyre::{bail, eyre, Context, ContextCompat};
use std::collections::btree_map::Entry;
use std::collections::{BTreeMap, BTreeSet};
use std::path::Path;
use std::sync::Arc;
use tracing::{info_span, warn, Instrument};
use yama::extract;
use yama::extract::flatten_treenode;
use yama::pile_with_cache::{PileWithCache, PointerIntegrationStatistics};
use yama_pile::tree::{FilesystemOwnership, FilesystemPermissions, RootTreeNode, TreeNode};
use yama_wormfile::boxed::BoxedWormFileProvider;
/// Chooses, for each requested source, which pointer in the pile should be extracted.
///
/// The `before`/`after` bounds are converted to UTC, the pile's pointers are listed, and the
/// selection itself is delegated to `select_to_extract_impl`.
/// Returns a mapping of source names to pointer names.
pub async fn select_to_extract(
    pwc: &PileWithCache<BoxedWormFileProvider>,
    sources: BTreeSet<String>,
    before: Option<HumanDateTime>,
    after: Option<HumanDateTime>,
    accept_partial: bool,
) -> eyre::Result<BTreeMap<String, String>> {
    let to_utc = |human: HumanDateTime| human.0.with_timezone(&Utc);
    let before_utc = before.map(to_utc);
    let after_utc = after.map(to_utc);

    let all_pointers = pwc
        .pile
        .list_pointers()
        .await
        .context("failed to list pointers")?;

    select_to_extract_impl(all_pointers, sources, before_utc, after_utc, accept_partial)
}
/// Picks one pointer per requested source out of `pointers_list`.
///
/// * Pointers whose name cannot be split into a source name and a datetime are ignored, as
///   are pointers of sources not listed in `sources`.
/// * With `before`, pointers later than that time are ignored and the latest remaining
///   pointer wins. With `after`, pointers earlier than that time are ignored and the earliest
///   remaining pointer wins. With neither, the latest pointer wins.
///
/// # Errors
/// Fails if both `before` and `after` are given, if nothing at all was selected, or if some
/// source has no pointer selected and `accept_partial` is `false`.
fn select_to_extract_impl(
    pointers_list: Vec<String>,
    sources: BTreeSet<String>,
    before: Option<DateTime<Utc>>,
    after: Option<DateTime<Utc>>,
    accept_partial: bool,
) -> eyre::Result<BTreeMap<String, String>> {
    if before.is_some() && after.is_some() {
        bail!("Can't specify both before and after!");
    }

    // If we want the first one after a time, we want the earliest option;
    // in every other case we want the most recent one.
    let prefer_earliest = after.is_some();
    let mut chosen: BTreeMap<String, String> = BTreeMap::new();

    for pointer in pointers_list {
        let (source_name, pointer_datetime) = match split_pointer_name(&pointer) {
            Some(parts) => parts,
            None => continue,
        };

        if !sources.contains(&source_name) {
            // Not a source that we're interested in.
            continue;
        }

        let outside_window = match (before, after) {
            // datetime is after the 'before' time
            (Some(limit), _) => limit < pointer_datetime,
            // datetime is before the 'after' time
            (None, Some(limit)) => pointer_datetime < limit,
            (None, None) => false,
        };
        if outside_window {
            continue;
        }

        match chosen.entry(source_name) {
            Entry::Vacant(slot) => {
                slot.insert(pointer);
            }
            Entry::Occupied(mut slot) => {
                let (_, current_datetime) = split_pointer_name(slot.get()).unwrap();
                let should_replace = if prefer_earliest {
                    pointer_datetime < current_datetime
                } else {
                    current_datetime < pointer_datetime
                };
                if should_replace {
                    slot.insert(pointer);
                }
            }
        }
    }

    if chosen.is_empty() {
        bail!("No pointers selected for ANY of the sources: {sources:?}");
    }

    let missing: Vec<&String> = sources
        .iter()
        .filter(|src| !chosen.contains_key(*src))
        .collect();
    if missing.is_empty() {
        return Ok(chosen);
    }

    if accept_partial {
        warn!("Some sources didn't have any pointers selected: {missing:?}. Continuing because --accept-partial passed.");
    } else {
        bail!("Some sources didn't have any pointers selected: {missing:?}. Pass --accept-partial if this is intended anyway.");
    }
    Ok(chosen)
}
/// Reads every selected pointer from the pile and returns its root tree node, keyed by
/// source name.
///
/// `what_to_extract` maps source names to pointer names.
///
/// # Errors
/// Fails if a pointer cannot be read or does not exist.
pub async fn load_pointers_for_extraction(
    pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
    what_to_extract: BTreeMap<String, String>,
) -> eyre::Result<BTreeMap<String, RootTreeNode>> {
    let mut roots_by_source = BTreeMap::new();

    for (source_name, pointer_name) in what_to_extract.iter() {
        let mut integration_stats = PointerIntegrationStatistics::default();
        let integrated = pwc
            .read_pointer_fully_integrated(pointer_name, &mut integration_stats)
            .await?;
        let pointer = integrated.context("pointer doesn't exist??")?;
        // TODO(ownership): adapt uid/gids here
        roots_by_source.insert(source_name.clone(), pointer.root);
    }

    Ok(roots_by_source)
}
/// Combines the roots of several sources into one directory node with one child per source.
///
/// A root that is a normal file is first wrapped in its own directory (named after the
/// source) so that the file keeps its original name inside it. Any other root becomes the
/// child directly. The synthetic directories use uid/gid 0 and mode `0o700`.
pub fn merge_roots_for_batch_extract(extracts: BTreeMap<String, RootTreeNode>) -> TreeNode {
    /// Builds one of the synthetic directories that hold the merged entries.
    fn synthetic_directory(children: BTreeMap<String, TreeNode>) -> TreeNode {
        TreeNode::Directory {
            ownership: FilesystemOwnership {
                // TODO(ownership): populate this correctly (current user?)
                uid: 0,
                gid: 0,
            },
            permissions: FilesystemPermissions { mode: 0o700 },
            children,
        }
    }

    let merged = extracts
        .into_iter()
        .map(|(name, entry)| {
            let node = match entry.node {
                file @ TreeNode::NormalFile { .. } => {
                    let mut single = BTreeMap::new();
                    single.insert(entry.name, file);
                    synthetic_directory(single)
                }
                other => other,
            };
            (name, node)
        })
        .collect();

    synthetic_directory(merged)
}
/// Extracts `node` into `destination` and then closes the pile.
///
/// The tree is flattened, the non-file entries are unpacked first, then the files (inside an
/// `extract_files` tracing span).
///
/// # Errors
/// Fails if flattening or unpacking fails, or if `pwc` is still shared elsewhere when it
/// should be closed.
pub async fn extract(
    pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
    node: TreeNode,
    destination: &Path,
) -> eyre::Result<()> {
    let flattened = flatten_treenode(&node)?;
    // The tree itself is no longer needed once it has been flattened.
    drop(node);

    extract::unpack_nonfiles(destination, &flattened.nonfiles, false, true).await?;

    let files_span = info_span!("extract_files");
    extract::unpack_files(&pwc, destination, &flattened.files, false, true)
        .instrument(files_span)
        .await?;

    // Closing needs sole ownership of the pile.
    let sole_owner = match Arc::try_unwrap(pwc) {
        Ok(pile) => pile,
        Err(_) => return Err(eyre!("pwc still in use; can't close down gracefully")),
    };
    sole_owner.close().await?;
    Ok(())
}

View File

@ -1,271 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufRead, BufReader, Write};
use std::path::Path;
use anyhow::anyhow;
use anyhow::Context;
use arc_interner::ArcIntern;
use byteorder::WriteBytesExt;
use glob::Pattern;
use log::warn;
use serde::{Deserialize, Serialize};
use crate::labelling::State::{Excluded, Labelled, Split};
use crate::tree::{FileTree, FileTree1};
/// Loads the labelling rules of `source_name` from `<path>/labelling/<source_name>.zst`.
///
/// Returns the default (empty) rule set when that file does not exist.
///
/// # Errors
/// Fails if the file cannot be opened, decompressed or parsed.
pub fn load_labelling_rules(path: &Path, source_name: &str) -> anyhow::Result<LabellingRules> {
    let rule_path = path.join("labelling").join(format!("{}.zst", source_name));
    if !rule_path.exists() {
        return Ok(LabellingRules::default());
    }

    let decoder = zstd::stream::read::Decoder::new(File::open(&rule_path)?)?;
    LabellingRules::load(BufReader::new(decoder))
}
/// Saves `rules` to `<path>/labelling/<source_name>.zst`, compressed with Zstd level 18.
///
/// An existing rule file is first renamed to `<source_name>.zst~` as a backup.
///
/// # Errors
/// Fails if the rename, the file creation, or the writing fails.
pub fn save_labelling_rules(
    path: &Path,
    source_name: &str,
    rules: &LabellingRules,
) -> anyhow::Result<()> {
    let labelling_dir = path.join("labelling");
    let rule_path = labelling_dir.join(format!("{}.zst", source_name));

    if rule_path.exists() {
        let backup_path = labelling_dir.join(format!("{}.zst~", source_name));
        std::fs::rename(&rule_path, &backup_path)?;
    }

    let mut encoder = zstd::stream::write::Encoder::new(File::create(rule_path)?, 18)?;
    rules.save(&mut encoder)?;
    // The encoder MUST be finished explicitly, as in the original implementation.
    encoder.finish()?;
    Ok(())
}
/// A label name, stored as an interned string.
#[derive(Clone, Serialize, Deserialize, Debug, Eq, PartialOrd, PartialEq, Hash)]
pub struct Label(pub ArcIntern<String>);
/// The outcome a labelling rule assigns to a path.
#[derive(Clone, Serialize, Deserialize, Debug, Eq, PartialOrd, PartialEq)]
pub enum State {
/// The path carries the given label. Written as the label text in rule files.
Labelled(Label),
/// Written as `?` in rule files.
/// NOTE(review): presumably means the children of the path are decided individually — confirm.
Split,
/// The path is excluded. Written as `!` in rule files.
Excluded,
}
impl State {
pub fn should_inherit(&self) -> bool {
match self {
Labelled(_) => true,
Split => false,
Excluded => true,
}
}
}
/// A labelling rule that applies to every path matching a glob pattern.
#[derive(Clone, Debug)]
pub struct GlobRule {
/// The pattern text, as written in the rule file.
pub pattern: String,
/// The compiled form of `pattern`.
pub glob: Pattern,
/// The state assigned to matching paths.
pub outcome: State,
}
/// The full set of labelling rules for one source.
#[derive(Clone, Debug, Default)]
pub struct LabellingRules {
/// Rules keyed by exact path; these are consulted before the glob rules.
pub position_based_rules: HashMap<String, State>,
/// Glob rules, tried in order; the first matching one wins.
pub glob_based_rules: Vec<GlobRule>,
}
impl LabellingRules {
pub fn load<R: BufRead>(mut input: R) -> anyhow::Result<Self> {
let mut result = LabellingRules {
position_based_rules: Default::default(),
glob_based_rules: Default::default(),
};
let mut str = String::new();
loop {
str.clear();
let line_len = input.read_line(&mut str)?;
if line_len == 0 {
break;
}
if &str == "---\n" {
// start reading glob patterns now.
break;
}
let pieces: Vec<&str> = str.trim_end_matches('\n').split('\t').collect();
if pieces.len() == 2 {
match pieces[1] {
"?" => {
result
.position_based_rules
.insert(pieces[0].to_owned(), Split);
}
"!" => {
result
.position_based_rules
.insert(pieces[0].to_owned(), Excluded);
}
label_str => {
result.position_based_rules.insert(
pieces[0].to_owned(),
Labelled(Label(ArcIntern::new(label_str.to_owned()))),
);
}
}
} else {
warn!("not 2 pieces: {:?}", str);
}
}
loop {
str.clear();
let line_len = input.read_line(&mut str)?;
if line_len == 0 {
break;
}
let pieces: Vec<&str> = str.trim().split('\t').collect();
if pieces.len() == 2 {
let outcome = match pieces[1] {
"?" => Split,
"!" => Excluded,
label_str => Labelled(Label(ArcIntern::new(label_str.to_owned()))),
};
let pattern = pieces[0].to_owned();
let glob = Pattern::new(&pattern)
.with_context(|| anyhow!("Whilst compiling glob: {:?}", pattern))?;
result.glob_based_rules.push(GlobRule {
pattern,
glob,
outcome,
});
} else {
warn!("not 2 pieces: {:?}", str);
}
}
Ok(result)
}
pub fn save<W: Write>(&self, mut output: W) -> anyhow::Result<()> {
for (path, rule) in self.position_based_rules.iter() {
output.write_all(path.as_bytes())?;
output.write_u8('\t' as u8)?;
match rule {
Labelled(label) => {
output.write_all(label.0.as_bytes())?;
}
Split => {
output.write_u8('?' as u8)?;
}
Excluded => {
output.write_u8('!' as u8)?;
}
}
output.write_u8('\n' as u8)?;
}
output.write_all("---\n".as_bytes())?;
for glob_rule in self.glob_based_rules.iter() {
output.write_all(glob_rule.pattern.as_bytes())?;
output.write_u8('\t' as u8)?;
match &glob_rule.outcome {
Labelled(label) => {
output.write_all(label.0.as_bytes())?;
}
Split => {
output.write_u8('?' as u8)?;
}
Excluded => {
output.write_u8('!' as u8)?;
}
}
output.write_u8('\n' as u8)?;
}
output.flush()?;
Ok(())
}
pub fn apply(&self, path: &str) -> Option<State> {
if let Some(rule_state) = self.position_based_rules.get(path) {
return Some(rule_state.clone());
}
for glob_rule in self.glob_based_rules.iter() {
if glob_rule.glob.matches(path) {
return Some(glob_rule.outcome.clone());
}
}
None
}
}
/// Labels `node` and, recursively, all of its descendants without user interaction.
///
/// The state of a node is the one the rules assign to `path`, if any. Otherwise it is the
/// state handed down in `current_state`, provided that state is inheritable. Otherwise it is
/// `None`. Directories hand their resulting state down to each child, whose path is
/// `<path>/<child name>`.
///
/// # Panics
/// Panics if the tree contains an `Other` node.
pub fn label_node(
    path: String,
    current_state: Option<State>,
    node: &mut FileTree1<Option<State>>,
    labels: &Vec<Label>,
    rules: &LabellingRules,
) -> anyhow::Result<()> {
    let next_state = match rules.apply(&path) {
        Some(rule_state) => Some(rule_state),
        None => match current_state {
            Some(inherited) if inherited.should_inherit() => Some(inherited),
            _ => None,
        },
    };

    match node {
        FileTree::NormalFile { meta, .. } | FileTree::SymbolicLink { meta, .. } => {
            *meta = next_state;
        }
        FileTree::Directory { meta, children, .. } => {
            *meta = next_state.clone();
            for (child_name, child) in children.iter_mut() {
                label_node(
                    format!("{}/{}", path, child_name),
                    next_state.clone(),
                    child,
                    labels,
                    rules,
                )?;
            }
        }
        FileTree::Other(_) => {
            panic!("Other() nodes shouldn't be present here.");
        }
    }

    Ok(())
}
/// Builds a [`Label`] from anything that can be viewed as a string slice.
pub fn str_to_label<I: AsRef<str>>(input: I) -> Label {
    let text: &str = input.as_ref();
    Label(ArcIntern::new(String::from(text)))
}

View File

@ -1,12 +1,6 @@
pub mod commands;
pub mod descriptor;
pub mod labelling;
pub mod remote;
pub mod tree;
pub mod backup;
pub mod descriptor_config;
pub mod extract;
/// Returns the hostname of this machine as a `String`.
///
/// # Panics
/// Panics if the hostname cannot be obtained or cannot be converted into a `String`.
pub fn get_hostname() -> String {
    let raw_name = hostname::get().expect("No hostname");
    raw_name
        .into_string()
        .expect("Hostname string must be sensible.")
}
pub mod datetime;
pub mod pointer_names;

View File

@ -0,0 +1,20 @@
use chrono::{DateTime, NaiveDateTime, TimeZone, Utc};
/// strftime-style format of the datetime part of a pointer name, used both to format and to
/// parse it. In chrono's specifiers `%F` is `YYYY-MM-DD` and `%T` is `HH:MM:SS`.
pub const POINTER_DATETIME_FORMAT: &'static str = "%F_%T";
/// Separator placed between the source name and the datetime in a pointer name.
pub const POINTER_NAME_DATETIME_SPLITTER: &'static str = "+";
/// Builds the pointer name for a backup of `source_name` taken at `datetime`:
/// the source name, the splitter, then the datetime in `POINTER_DATETIME_FORMAT`.
pub fn get_pointer_name_at(source_name: &str, datetime: DateTime<Utc>) -> String {
    let timestamp = datetime.format(POINTER_DATETIME_FORMAT);
    format!(
        "{}{}{}",
        source_name, POINTER_NAME_DATETIME_SPLITTER, timestamp
    )
}
/// Splits a pointer name into its source name and its datetime (interpreted as UTC).
///
/// The name is split at the last occurrence of the splitter. Returns `None` when there is
/// no splitter or when the datetime part does not match `POINTER_DATETIME_FORMAT`.
pub fn split_pointer_name(pointer_name: &str) -> Option<(String, DateTime<Utc>)> {
    pointer_name
        .rsplit_once(POINTER_NAME_DATETIME_SPLITTER)
        .and_then(|(source_name, timestamp)| {
            NaiveDateTime::parse_from_str(timestamp, POINTER_DATETIME_FORMAT)
                .ok()
                .map(|naive| (source_name.to_owned(), Utc.from_utc_datetime(&naive)))
        })
}

View File

@ -1,2 +0,0 @@
pub mod backup_source_requester;
pub mod backup_source_responder;

View File

@ -1,278 +0,0 @@
use crate::commands::backup::{get_pointer_name_at, label_filter_and_convert};
use crate::descriptor::{Descriptor, DestPileDescriptor, SourceDescriptor};
use crate::tree::FileTree;
use anyhow::{anyhow, bail};
use chrono::Utc;
use log::info;
use std::io::{Read, Write};
use std::path::Path;
use std::process::{Child, Command, Stdio};
use std::sync::Arc;
use yama::commands::{load_pile_descriptor, open_pile};
use yama::definitions::TreeNode;
use yama::pile::{Pile, RawPile, StoragePipelineSettings};
use yama::progress::ProgressTracker;
use yama::remote::responder::{Responder, ResponderWritingPipeline};
use yama::remote::{read_message, write_message};
use yama::utils::get_number_of_workers;
// SECURITY WARNING: the system you connect to using this mechanism will receive full access to
// your Yama pile. Do NOT connect to untrusted or compromised systems using this mechanism (yet).
/// Performs the requester side of the handshake.
///
/// Sends this side's banner (which embeds the crate version), then reads the peer's banner
/// and checks that it is the responder banner of exactly the same version.
///
/// # Errors
/// Fails on I/O errors or when the peer's banner is not the expected one.
pub fn introduction<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::Result<()> {
    info!("Introduction.");
    let version = env!("CARGO_PKG_VERSION");

    let own_banner = format!("Datman v{} Backup Source Requester", version);
    write_message(write, &own_banner)?;
    write.flush()?;

    let foreign_side: String = read_message(read)?;
    let expected_foreign_side = format!("Datman v{} Backup Source Responder", version);
    if foreign_side == expected_foreign_side {
        return Ok(());
    }

    bail!(
        "Datman version mismatch. Expected {:?}, got {:?}",
        expected_foreign_side,
        foreign_side
    )
}
/// Asks the remote side to scan `path` and returns the tree it reports.
///
/// Sends the `scan` command followed by the path, then reads the reply, which is `None`
/// when the remote produced no tree.
pub fn scanning<R: Read, W: Write>(
    read: &mut R,
    write: &mut W,
    path: &Path,
) -> anyhow::Result<Option<FileTree<(), (), (), ()>>> {
    info!("Scanning.");

    // Request: command name, then its single argument.
    write_message(write, &"scan")?;
    write_message(write, &path)?;
    write.flush()?;

    let remote_tree = read_message(read)?;
    Ok(remote_tree)
}
/// Asks the remote side to chunk and store `path`, serving its pile requests from `raw_pile`.
///
/// Sends the `chunk` command with its arguments, starts a `Responder` on the connection,
/// waits for all of its threads, and finally hands the reader and writer back to the caller.
/// When `use_writing_pipeline` is set, a storage pipeline is built on `raw_pile` and given
/// to the responder.
pub fn chunking<
R: Read + Send + 'static,
W: Write + Send + 'static,
RP: RawPile + 'static,
PT: ProgressTracker + Send + 'static,
>(
read: R,
mut write: W,
path: &Path,
pointer_name: String,
tree_node: &TreeNode,
raw_pile: Arc<RP>,
parent: Option<String>,
progress_bar: PT,
use_writing_pipeline: bool,
) -> anyhow::Result<(R, W)> {
info!("Chunking.");
// Request: command name followed by its four arguments, in this exact order.
write_message(&mut write, &"chunk")?;
write_message(&mut write, &path)?;
write_message(&mut write, &pointer_name)?;
write_message(&mut write, tree_node)?;
write_message(&mut write, &parent)?;
write.flush()?;
// Optionally build the storage pipeline; `control_rx` receives its control messages.
let (writing_pipeline, control_rx) = if use_writing_pipeline {
let sps = StoragePipelineSettings {
num_compressors: get_number_of_workers("YAMA_PL_COMPRESSORS") as u32,
compressor_input_bound: 32,
writer_input_bound: 32,
};
let (control_tx, control_rx) = crossbeam_channel::unbounded();
let pipeline = raw_pile.build_storage_pipeline(sps, control_tx)?;
(
Some(ResponderWritingPipeline {
pipeline_submission: pipeline,
}),
Some(control_rx),
)
} else {
(None, None)
};
// The responder takes ownership of the reader and writer for the duration of the chunking.
let (r_handle, w_handle, join_handles) = Responder::start(
read,
write,
get_number_of_workers("YAMA_RESPONDERS") as u16,
raw_pile,
writing_pipeline,
progress_bar,
);
info!("Waiting for remote to finish chunking.");
for handle in join_handles {
handle.join().expect("Join handle should not fail");
}
// Joining the reader/writer threads gives the reader and writer back.
let read = r_handle.join().unwrap();
let write = w_handle.join().unwrap();
// Drain the control channel until it is closed; the messages themselves are ignored.
if let Some(control_rx) = control_rx {
while let Ok(_) = control_rx.recv() {
// TODO nop
}
}
info!("Remote finished chunking.");
Ok((read, write))
}
/// Ends the session: sends the `exit` command and expects `exit` to be echoed back.
///
/// # Errors
/// Fails on I/O errors or when the reply is anything other than `exit`.
pub fn quit<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::Result<()> {
    write_message(write, &"exit")?;
    write.flush()?;

    let reply: String = read_message(read)?;
    match reply.as_str() {
        "exit" => Ok(()),
        _ => bail!("Exit failed"),
    }
}
/// Starts the backup source responder on the remote host `hostname` through `ssh`.
///
/// The remote program is the configured path to datman, or `datman` when none is configured.
/// The child's stdin and stdout are piped and its stderr is inherited.
///
/// # Errors
/// Fails if the descriptor has no remote host named `hostname` or if `ssh` cannot be spawned.
pub fn connect_to_remote(descriptor: &Descriptor, hostname: &str) -> anyhow::Result<Child> {
    let remote_host_descriptor = descriptor
        .remote_hosts
        .get(hostname)
        .ok_or_else(|| anyhow::anyhow!("No remote host by that name: {:?}.", hostname))?;

    let datman_program = remote_host_descriptor
        .path_to_datman
        .as_ref()
        .map(|configured| configured.as_str())
        .unwrap_or("datman");

    let mut ssh = Command::new("ssh");
    ssh.arg(&remote_host_descriptor.user_at_host)
        .arg("--")
        .arg(datman_program)
        .arg("_backup_source_responder")
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::inherit());

    let connection = ssh.spawn()?;
    Ok(connection)
}
/// Backs up a directory source that lives on a remote host into a local destination pile.
///
/// Steps: connect over SSH, handshake, request a remote scan, label/filter the scanned tree,
/// open the destination pile, pick a pointer name and a parent pointer, let the remote chunk
/// the data into the pile, then quit, flush and verify that the pointer now exists.
///
/// # Errors
/// Fails if the remote host is unknown, if any protocol step fails, if the pointer name
/// already exists, or if the pointer is missing after the backup.
///
/// # Panics
/// Virtual sources are not implemented for remotes and hit `unimplemented!`.
pub fn backup_remote_source_to_destination<PT: ProgressTracker + Send + 'static>(
source: &SourceDescriptor,
dest: &DestPileDescriptor,
descriptor: &Descriptor,
desc_path: &Path,
source_name: &str,
dest_name: &str,
_num_workers: u8,
progress_bar: PT,
) -> anyhow::Result<()> {
match source {
SourceDescriptor::DirectorySource {
hostname,
directory,
} => {
let remote_host_descriptor = descriptor
.remote_hosts
.get(hostname)
.ok_or_else(|| anyhow::anyhow!("No remote host by that name: {:?}.", hostname))?;
info!(
"Looking to backup {} (from {}) to {}",
source_name, remote_host_descriptor.user_at_host, dest_name
);
// The child's pipes were requested as `piped` by `connect_to_remote`.
let connection = connect_to_remote(descriptor, hostname)?;
let mut read = connection.stdout.expect("Requested stdout");
let mut write = connection.stdin.expect("Requested stdin");
// first start off with an introduction
info!("Connecting...");
introduction(&mut read, &mut write)?;
// then request to scan
info!("Requesting scan... (this may take some time)");
let scan_result = scanning(&mut read, &mut write, directory.as_ref())?
.ok_or_else(|| anyhow!("Remote scan failed (does the directory exist?)"))?;
let root =
label_filter_and_convert(scan_result, descriptor, desc_path, source_name, dest)?
.ok_or_else(|| anyhow!("Empty filter..."))?;
// The pile path in the descriptor is joined onto `desc_path`.
let absolute_dest_path = desc_path.join(&dest.path);
let pile_descriptor = load_pile_descriptor(&absolute_dest_path)?;
let pile = open_pile(&absolute_dest_path, &pile_descriptor)?;
// Never overwrite an existing pointer of the same name.
let pointer_name = get_pointer_name_at(&source_name, Utc::now());
if pile.read_pointer(pointer_name.as_str())?.is_some() {
bail!(
"Pointer by name {:?} already exists; refusing to overwrite.",
pointer_name
);
}
info!("Will write as pointer {:?}.", pointer_name);
info!("Searching for suitable parents.");
// The parent is the pointer of this source whose name compares greatest as a string.
let mut parent: Option<String> = None;
let prefix = format!("{}+", source_name);
for pointer in pile.list_pointers()?.iter() {
if pointer.starts_with(&prefix) {
match parent.as_ref() {
None => {
parent = Some(pointer.to_owned());
}
Some(cur_parent) => {
if cur_parent < pointer {
parent = Some(pointer.to_owned());
}
}
}
}
}
match parent.as_ref() {
Some(parent) => {
info!("Using parent: {}", parent);
}
None => {
info!("No suitable parent found.");
}
}
info!("Storing remote using Yama (this may take some time)...");
// The raw pile is shared between the responder (inside `chunking`) and a fresh `Pile`
// that is used afterwards for flushing and verification.
let raw_pile = Arc::new(pile.raw_pile);
let pile = Pile::new(raw_pile.clone());
let (mut read, mut write) = chunking(
read,
write,
directory.as_ref(),
pointer_name.clone(),
&root,
raw_pile,
parent,
progress_bar,
true,
)?;
quit(&mut read, &mut write)?;
pile.flush()?;
info!("Stored! Checking for existence...");
if pile.list_pointers()?.contains(&pointer_name) {
info!("Exists!");
} else {
bail!("Pointer {:?} does not exist...", &pointer_name);
}
}
SourceDescriptor::VirtualSource { .. } => {
unimplemented!("Can't currently back up virtualsources on remotes...")
}
}
Ok(())
}

View File

@ -1,238 +0,0 @@
// This file implements the responder side of the backup source protocol -- the protocol used
// to connect to remote backup sources.
use crate::tree::scan;
use anyhow::bail;
use crossbeam_channel::Sender;
use log::info;
use std::io::{stdin, stdout, Read, Write};
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Instant;
use yama::definitions::TreeNode;
use yama::pile::{Pile, RawPile};
use yama::progress::ProgressTracker;
use yama::remote::requester::Requester;
use yama::remote::{read_message, write_message, RequestBody, ResponseBody};
use yama::utils::get_number_of_workers;
/// Performs the responder side of the handshake.
///
/// Sends this side's banner (which embeds the crate version), then reads the peer's banner
/// and checks that it is the requester banner of exactly the same version.
///
/// # Errors
/// Fails on I/O errors or when the peer's banner is not the expected one.
pub fn introduction<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::Result<()> {
    let version = env!("CARGO_PKG_VERSION");

    let own_banner = format!("Datman v{} Backup Source Responder", version);
    write_message(write, &own_banner)?;
    write.flush()?;

    let foreign_side: String = read_message(read)?;
    let expected_foreign_side = format!("Datman v{} Backup Source Requester", version);
    if foreign_side == expected_foreign_side {
        return Ok(());
    }

    bail!(
        "Datman version mismatch. Expected {:?}, got {:?}",
        expected_foreign_side,
        foreign_side
    )
}
/// Serves one scan request: reads the path to scan, scans it, and sends the result back.
pub fn scanning<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::Result<()> {
    let requested_path: PathBuf = read_message(read)?;
    let tree = scan(&requested_path)?;

    write_message(write, &tree)?;
    write.flush()?;
    Ok(())
}
/// Serves one chunk request over an arbitrary reader/writer pair.
///
/// Reads the request arguments (path, pointer name, tree node, parent), wraps the connection
/// in a `Requester` that acts as the pile, stores the tree through it, and then waits for
/// the requester's threads to finish. No progress is reported on this path.
pub fn chunking<R: Read + Send + 'static, W: Write + Send + 'static>(
    mut read: R,
    write: W,
) -> anyhow::Result<()> {
    // Request arguments, in the order the requester sends them.
    let path: PathBuf = read_message(&mut read)?;
    let pointer_name: String = read_message(&mut read)?;
    let tree_node: TreeNode = read_message(&mut read)?;
    let parent: Option<String> = read_message(&mut read)?;

    let (yama_requester, worker_handles) = Requester::new(read, write);
    let remote_backed_pile: Box<dyn RawPile> = Box::new(yama_requester);
    let pile = Arc::new(Pile::new(remote_backed_pile));

    // TODO TODO progress
    yama::operations::storing::store_fully(
        pile,
        &path,
        &pointer_name,
        tree_node,
        parent,
        get_number_of_workers("YAMA_CHUNKERS"),
        &mut (),
        true,
    )?;

    for worker in worker_handles {
        worker.join().expect("Expected to join handle");
    }
    Ok(())
}
/// A progress tracker that forwards progress to the other side as `RequestBody::Progress`
/// messages.
pub struct ProgressSender {
/// When progress was last sent; used to rate-limit updates.
pub last_sent: Instant,
/// Current progress value.
pub current_progress: u64,
/// Current maximum progress value.
pub current_max: u64,
// TODO actually propagate this
/// Current message; not included in the progress messages that are sent.
pub current_message: String,
/// Channel on which the progress requests are submitted.
pub sender: Sender<(RequestBody, Option<Sender<ResponseBody>>)>,
}
impl ProgressSender {
    /// Sends the current progress and maximum right away and records the time of sending.
    ///
    /// `_include_message` is currently ignored.
    ///
    /// # Panics
    /// Panics if the progress message cannot be submitted to the channel.
    pub fn send_now(&mut self, _include_message: bool) {
        let update = RequestBody::Progress {
            current: self.current_progress,
            max: self.current_max,
        };
        self.sender
            .send((update, None))
            .expect("Progress sender failed");
        self.last_sent = Instant::now();
    }

    /// Sends the progress only if at least 1024 ms have passed since it was last sent.
    pub fn send_if_overdue(&mut self) {
        let since_last_send = self.last_sent.elapsed();
        if since_last_send.as_millis() >= 1024 {
            self.send_now(false);
        }
    }
}
/// Each update is recorded locally and forwarded only when a send is overdue
/// (see `send_if_overdue`), so frequent updates do not flood the channel.
impl ProgressTracker for ProgressSender {
/// Adds `delta_progress` to the current progress.
fn inc_progress(&mut self, delta_progress: u64) {
self.current_progress += delta_progress;
self.send_if_overdue();
}
/// Replaces the current progress with `current_progress`.
fn set_current(&mut self, current_progress: u64) {
self.current_progress = current_progress;
self.send_if_overdue();
}
/// Replaces the current maximum with `max_size`.
fn set_max_size(&mut self, max_size: u64) {
self.current_max = max_size;
self.send_if_overdue();
}
}
/// Serves one chunk request over the process's stdin/stdout.
///
/// Reads the request arguments from stdin, then stores the tree through a `Requester` built
/// on stdio while reporting progress with a `ProgressSender`, and finally waits for the
/// requester's threads to finish.
pub fn chunking_stdio() -> anyhow::Result<()> {
// The stdin lock is held only inside this block, while the request arguments are read.
let (path, pointer_name, tree_node, parent) = {
let stdin = stdin();
let mut read = stdin.lock();
let path: PathBuf = read_message(&mut read)?;
let pointer_name: String = read_message(&mut read)?;
let tree_node: TreeNode = read_message(&mut read)?;
let parent: Option<String> = read_message(&mut read)?;
(path, pointer_name, tree_node, parent)
};
info!(
"Have pointer_name = {:?}, parent = {:?}",
pointer_name, parent
);
let requester_join_handles = {
let (yama_requester, requester_join_handles) = Requester::new_from_stdio();
// Progress messages are submitted on the requester's own command channel.
let command_sender = yama_requester.clone_command_sender();
info!("progress sender in use");
let mut progress_bar = ProgressSender {
last_sent: Instant::now(),
current_progress: 0,
current_max: 0,
current_message: "".to_string(),
sender: command_sender,
};
let raw_pile: Box<dyn RawPile> = Box::new(yama_requester);
let pile = Pile::new(raw_pile);
yama::operations::storing::store_fully(
Arc::new(pile),
&path,
&pointer_name,
tree_node,
parent,
get_number_of_workers("YAMA_CHUNKERS"),
&mut progress_bar,
true,
)?;
// Only the join handles leave this block; the pile and the progress sender are dropped here.
requester_join_handles
};
info!("Waiting to join.");
for join_handle in requester_join_handles {
join_handle.join().expect("Expected to join handle");
}
info!("Chunking completed.");
Ok(())
}
/// Runs a fixed responder session over an arbitrary reader/writer pair:
/// the handshake, then one scan, then one chunking.
pub fn handler<R: Read + Send + 'static, W: Write + Send + 'static>(
    mut read: R,
    mut write: W,
) -> anyhow::Result<()> {
    introduction(&mut read, &mut write)?;
    scanning(&mut read, &mut write)?;
    // Chunking takes ownership of both ends of the connection.
    chunking(read, write)
}
/// Runs the responder over stdin/stdout: the handshake, then a command loop.
///
/// Understood commands are `scan`, `chunk` and `exit`; `exit` is echoed back before the loop
/// ends.
///
/// # Errors
/// Fails on I/O or protocol errors and on any unknown command.
pub fn handler_stdio() -> anyhow::Result<()> {
let stdin = stdin();
let stdout = stdout();
let mut read = stdin.lock();
let mut write = stdout.lock();
info!("Introduction.");
introduction(&mut read, &mut write)?;
loop {
let command: String = read_message(&mut read)?;
match command.as_str() {
"scan" => {
info!("Scanning.");
scanning(&mut read, &mut write)?;
}
"chunk" => {
info!("Chunking.");
// Release both locks first: `chunking_stdio` takes its own lock on stdin and builds
// its requester on stdio. The locks are re-acquired once it returns.
drop(read);
drop(write);
chunking_stdio()?;
read = stdin.lock();
write = stdout.lock();
}
"exit" => {
write_message(&mut write, &"exit")?;
write.flush()?;
break;
}
othercommand => {
bail!("Don't understand {:?}", othercommand);
}
}
}
Ok(())
}

View File

@ -1,329 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::collections::BTreeMap;
use std::fmt::Debug;
use std::fs::{read_link, symlink_metadata, DirEntry, Metadata};
use std::io::ErrorKind;
use std::os::unix::fs::MetadataExt;
use std::path::Path;
use anyhow::anyhow;
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use log::warn;
use serde::{Deserialize, Serialize};
pub use yama::definitions::FilesystemOwnership;
pub use yama::definitions::FilesystemPermissions;
/// A tree of filesystem entries in which every kind of node carries its own type of
/// caller-defined metadata.
///
/// `NMeta`, `DMeta` and `SMeta` are the metadata types of normal files, directories and
/// symbolic links respectively; `Other` is the payload of nodes of any other kind.
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub enum FileTree<NMeta, DMeta, SMeta, Other>
where
NMeta: Debug + Clone + Eq + PartialEq,
DMeta: Debug + Clone + Eq + PartialEq,
SMeta: Debug + Clone + Eq + PartialEq,
Other: Debug + Clone + Eq + PartialEq,
{
/// A regular file.
NormalFile {
/// modification time in ms
mtime: u64,
/// Owner of the file.
ownership: FilesystemOwnership,
/// Permissions of the file.
permissions: FilesystemPermissions,
/// Caller-defined metadata.
meta: NMeta,
},
/// A directory together with its entries.
Directory {
/// Owner of the directory.
ownership: FilesystemOwnership,
/// Permissions of the directory.
permissions: FilesystemPermissions,
/// Entries of the directory, keyed by name.
children: BTreeMap<String, FileTree<NMeta, DMeta, SMeta, Other>>,
/// Caller-defined metadata.
meta: DMeta,
},
/// A symbolic link.
SymbolicLink {
/// Owner of the link.
ownership: FilesystemOwnership,
/// The target the link points at.
target: String,
/// Caller-defined metadata.
meta: SMeta,
},
/// Any other kind of node.
Other(Other),
}
/// A [`FileTree`] whose files, directories and symlinks all carry the same metadata type `A`
/// and whose `Other` payload is `()`.
pub type FileTree1<A> = FileTree<A, A, A, ()>;
impl<NMeta, DMeta, SMeta, Other> FileTree<NMeta, DMeta, SMeta, Other>
where
NMeta: Debug + Clone + Eq + PartialEq,
DMeta: Debug + Clone + Eq + PartialEq,
SMeta: Debug + Clone + Eq + PartialEq,
Other: Debug + Clone + Eq + PartialEq,
{
pub fn is_dir(&self) -> bool {
match self {
FileTree::NormalFile { .. } => false,
FileTree::Directory { .. } => true,
FileTree::SymbolicLink { .. } => false,
FileTree::Other(_) => false,
}
}
pub fn is_symlink(&self) -> bool {
match self {
FileTree::NormalFile { .. } => false,
FileTree::Directory { .. } => false,
FileTree::SymbolicLink { .. } => true,
FileTree::Other(_) => false,
}
}
pub fn get_by_path(&self, path: &String) -> Option<&FileTree<NMeta, DMeta, SMeta, Other>> {
let mut node = self;
for piece in path.split('/') {
if piece.is_empty() {
continue;
}
match node {
FileTree::Directory { children, .. } => match children.get(piece) {
None => {
return None;
}
Some(new_node) => {
node = new_node;
}
},
_ => {
return None;
}
}
}
Some(node)
}
pub fn replace_meta<Replacement: Clone + Debug + Eq + PartialEq>(
&self,
replacement: &Replacement,
) -> FileTree<Replacement, Replacement, Replacement, Other> {
match self {
FileTree::NormalFile {
mtime,
ownership,
permissions,
..
} => FileTree::NormalFile {
mtime: *mtime,
ownership: *ownership,
permissions: *permissions,
meta: replacement.clone(),
},
FileTree::Directory {
ownership,
permissions,
children,
..
} => {
let children = children
.iter()
.map(|(str, ft)| (str.clone(), ft.replace_meta(replacement)))
.collect();
FileTree::Directory {
ownership: ownership.clone(),
permissions: permissions.clone(),
children,
meta: replacement.clone(),
}
}
FileTree::SymbolicLink {
ownership, target, ..
} => FileTree::SymbolicLink {
ownership: ownership.clone(),
target: target.clone(),
meta: replacement.clone(),
},
FileTree::Other(other) => FileTree::Other(other.clone()),
}
}
/// Filters the tree in-place by removing nodes that do not satisfy the predicate.
///
/// 'Inclusive' in the sense that if a directory does not satisfy the predicate but one of its
/// descendants does, then the directory will be included anyway.
/// (So nodes that satisfy the predicate will never be excluded because of a parent not doing so.)
///
/// Children are filtered before their parent is evaluated, so the predicate sees each
/// directory only after its rejected descendants have been removed. The predicate is not
/// called at all for a directory that still has children after filtering.
///
/// Returns true if this node should be included, and false if it should not be.
pub fn filter_inclusive<F>(&mut self, predicate: &mut F) -> bool
where
    F: FnMut(&Self) -> bool,
{
    match self {
        FileTree::Directory { children, .. } => {
            // Drop rejected children in a single pass, without cloning their names
            // into a side list and looking each one up again.
            children.retain(|_, child| child.filter_inclusive(&mut *predicate));
            !children.is_empty() || predicate(&*self)
        }
        _ => predicate(&*self),
    }
}
}
impl<X: Debug + Clone + Eq, YAny: Debug + Clone + Eq> FileTree<X, X, X, YAny> {
    /// Borrows the metadata attached to this node.
    ///
    /// Files, directories and symbolic links all carry a value of type `X`;
    /// `Other` nodes have no metadata slot, so `None` is returned for them.
    pub fn get_metadata(&self) -> Option<&X> {
        match self {
            FileTree::Other(_) => None,
            FileTree::NormalFile { meta, .. }
            | FileTree::Directory { meta, .. }
            | FileTree::SymbolicLink { meta, .. } => Some(meta),
        }
    }

    /// Overwrites the metadata attached to this node with `new_meta`.
    ///
    /// For `Other` nodes, which have no metadata slot, this does nothing and
    /// `new_meta` is simply dropped.
    pub fn set_metadata(&mut self, new_meta: X) {
        let slot = match self {
            FileTree::NormalFile { meta, .. }
            | FileTree::Directory { meta, .. }
            | FileTree::SymbolicLink { meta, .. } => meta,
            // Nothing to update.
            FileTree::Other(_) => return,
        };
        *slot = new_meta;
    }
}
/// Given a file's metadata, returns the mtime in milliseconds.
///
/// The whole-second part and the nanosecond remainder (truncated to whole
/// milliseconds) are combined as signed integers and then cast to `u64`.
pub fn mtime_msec(metadata: &Metadata) -> u64 {
    let whole_seconds_as_ms = metadata.mtime() * 1000;
    let subsecond_ms = metadata.mtime_nsec() / 1_000_000;
    (whole_seconds_as_ms + subsecond_ms) as u64
}
/// Scan the filesystem to produce a Tree, using a default progress bar.
///
/// The default bar is a spinner drawn to stdout at 2 Hz with the message
/// "dir scan". It is finished at its current position once the scan returns,
/// whether the scan succeeded or failed.
pub fn scan(path: &Path) -> anyhow::Result<Option<FileTree<(), (), (), ()>>> {
    let draw_target = ProgressDrawTarget::stdout_with_hz(2);
    let spinner = ProgressBar::with_draw_target(0, draw_target);
    let spinner_style = ProgressStyle::default_spinner().template("{spinner} {pos:7} {msg}");
    spinner.set_style(spinner_style);
    spinner.set_message("dir scan");

    let outcome = scan_with_progress_bar(path, &spinner);
    spinner.finish_at_current_pos();
    outcome
}
/// Scan the filesystem to produce a Tree, using the specified progress bar.
///
/// The progress bar is incremented once per visited path, and its message is
/// set to the path of each directory as it is entered.
///
/// Returns `Ok(None)` when `path` should be left out of the tree:
/// * it vanished or is not readable (a warning is logged), or
/// * it is neither a regular file, a directory nor a symbolic link.
///
/// # Errors
///
/// Any other I/O error aborts the scan. A directory entry whose name is not
/// valid UTF-8, or a symlink whose target is not valid UTF-8, is reported as an
/// error rather than panicking.
///
/// Note that uid and gid are narrowed with `as u16`, so IDs above 65535 are
/// truncated.
pub fn scan_with_progress_bar(
    path: &Path,
    progress_bar: &ProgressBar,
) -> anyhow::Result<Option<FileTree<(), (), (), ()>>> {
    let metadata_res = symlink_metadata(path);
    progress_bar.inc(1);
    let metadata = match metadata_res {
        Ok(metadata) => metadata,
        Err(e) if e.kind() == ErrorKind::NotFound => {
            warn!("vanished: {:?}", path);
            return Ok(None);
        }
        Err(e) if e.kind() == ErrorKind::PermissionDenied => {
            warn!("permission denied: {:?}", path);
            return Ok(None);
        }
        Err(e) => return Err(e.into()),
    };

    let filetype = metadata.file_type();

    let ownership = FilesystemOwnership {
        uid: metadata.uid() as u16,
        gid: metadata.gid() as u16,
    };

    let permissions = FilesystemPermissions {
        mode: metadata.mode(),
    };

    if filetype.is_file() {
        // Leave an unpopulated file node. It's not my responsibility to chunk it right now.
        Ok(Some(FileTree::NormalFile {
            mtime: mtime_msec(&metadata),
            ownership,
            permissions,
            meta: (),
        }))
    } else if filetype.is_dir() {
        let mut children = BTreeMap::new();
        progress_bar.set_message(&format!("{:?}", path));

        // The directory may have vanished or become unreadable since the metadata call.
        let dir_read = match path.read_dir() {
            Ok(dir_read) => dir_read,
            Err(e) if e.kind() == ErrorKind::NotFound => {
                warn!("vanished/: {:?}", path);
                return Ok(None);
            }
            Err(e) if e.kind() == ErrorKind::PermissionDenied => {
                warn!("permission denied/: {:?}", path);
                return Ok(None);
            }
            Err(e) => return Err(e.into()),
        };

        for entry in dir_read {
            let entry: DirEntry = entry?;
            let scanned = scan_with_progress_bar(&entry.path(), progress_bar)?;
            if let Some(scanned) = scanned {
                // Only entries that are actually kept need a UTF-8 name.
                let name = entry
                    .file_name()
                    .into_string()
                    .map_err(|raw| anyhow!("file name is not valid UTF-8: {:?}", raw))?;
                children.insert(name, scanned);
            }
        }

        Ok(Some(FileTree::Directory {
            ownership,
            permissions,
            children,
            meta: (),
        }))
    } else if filetype.is_symlink() {
        let target = read_link(path)?
            .to_str()
            .ok_or_else(|| anyhow!("target path cannot be to_str()d"))?
            .to_owned();

        Ok(Some(FileTree::SymbolicLink {
            ownership,
            target,
            meta: (),
        }))
    } else {
        // Sockets, FIFOs, device nodes etc. are not represented in the tree.
        Ok(None)
    }
}

12
datman_cli_readme.txt Normal file
View File

@ -0,0 +1,12 @@
`datman backup-one <sourceName> <destName>`
`datman backup-all <destName>`
Backs up now (either just one source or all sources) to a destination.
With `--config <path>`, use the specified Datman config file; otherwise look for one in the current directory.
..
`datman extract <sourceName>` with same filtering options as now.
(unimportant: Yama tools should be decent for this right now.)

11
docs.old/SUMMARY.md Normal file
View File

@ -0,0 +1,11 @@
# Summary
- [Yama](./yama/index.md)
- [Getting Started](./yama/getting_started.md)
- [Internals](./yama/internals.md)
- [Raw Piles](./yama/internals/raw-piles.md)
- [Pointers and Nodes](./yama/internals/pointers-and-nodes.md)
- [Datman](./datman/index.md)
- [Getting Started](./datman/getting_started.md)
- [Remote Backups](./datman/remote_backups.md)

View File

@ -1,11 +0,0 @@
# Summary
- [Yama](./yama/index.md)
- [Getting Started](./yama/getting_started.md)
- [Internals](./yama/internals.md)
- [Raw Piles](./yama/internals/raw-piles.md)
- [Pointers and Nodes](./yama/internals/pointers-and-nodes.md)
- [Datman](./datman/index.md)
- [Getting Started](./datman/getting_started.md)
- [Remote Backups](./datman/remote_backups.md)

5
docs/yama/zstd.md Normal file
View File

@ -0,0 +1,5 @@
# Using a Zstd dictionary with Yama
## Creating a Zstd dictionary

216
flake.lock generated Normal file
View File

@ -0,0 +1,216 @@
{
"nodes": {
"fenix": {
"inputs": {
"nixpkgs": [
"nixpkgs"
],
"rust-analyzer-src": "rust-analyzer-src"
},
"locked": {
"lastModified": 1682230876,
"narHash": "sha256-vCnd1pZRQKCdNvivQBD7WzaOlU1GcN91OCAz1rnoe5M=",
"owner": "nix-community",
"repo": "fenix",
"rev": "378f052d9f1cd90060ec4329f81782fee80490a4",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "fenix",
"type": "github"
}
},
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1710146030,
"narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"naersk": {
"inputs": {
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1662220400,
"narHash": "sha256-9o2OGQqu4xyLZP9K6kNe1pTHnyPz0Wr3raGYnr9AIgY=",
"owner": "nix-community",
"repo": "naersk",
"rev": "6944160c19cb591eb85bbf9b2f2768a935623ed3",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "naersk",
"type": "github"
}
},
"nix-github-actions": {
"inputs": {
"nixpkgs": [
"poetry2nix",
"nixpkgs"
]
},
"locked": {
"lastModified": 1703863825,
"narHash": "sha256-rXwqjtwiGKJheXB43ybM8NwWB8rO2dSRrEqes0S7F5Y=",
"owner": "nix-community",
"repo": "nix-github-actions",
"rev": "5163432afc817cf8bd1f031418d1869e4c9d5547",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "nix-github-actions",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1714971268,
"narHash": "sha256-IKwMSwHj9+ec660l+I4tki/1NRoeGpyA2GdtdYpAgEw=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "27c13997bf450a01219899f5a83bd6ffbfc70d3c",
"type": "github"
},
"original": {
"id": "nixpkgs",
"ref": "nixos-23.11",
"type": "indirect"
}
},
"poetry2nix": {
"inputs": {
"flake-utils": "flake-utils",
"nix-github-actions": "nix-github-actions",
"nixpkgs": [
"nixpkgs"
],
"systems": "systems_2",
"treefmt-nix": "treefmt-nix"
},
"locked": {
"lastModified": 1715017507,
"narHash": "sha256-RN2Vsba56PfX02DunWcZYkMLsipp928h+LVAWMYmbZg=",
"owner": "nix-community",
"repo": "poetry2nix",
"rev": "e6b36523407ae6a7a4dfe29770c30b3a3563b43a",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "poetry2nix",
"type": "github"
}
},
"root": {
"inputs": {
"fenix": "fenix",
"naersk": "naersk",
"nixpkgs": "nixpkgs",
"poetry2nix": "poetry2nix",
"utils": "utils"
}
},
"rust-analyzer-src": {
"flake": false,
"locked": {
"lastModified": 1682163822,
"narHash": "sha256-u7vaRlI6rYiutytoTk8lyOtNKO/rz5Q63Z6S6QzYCtU=",
"owner": "rust-lang",
"repo": "rust-analyzer",
"rev": "2feabc4dc462644287372922928110eea4c60ca7",
"type": "github"
},
"original": {
"owner": "rust-lang",
"ref": "nightly",
"repo": "rust-analyzer",
"type": "github"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
},
"systems_2": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"id": "systems",
"type": "indirect"
}
},
"treefmt-nix": {
"inputs": {
"nixpkgs": [
"poetry2nix",
"nixpkgs"
]
},
"locked": {
"lastModified": 1714058656,
"narHash": "sha256-Qv4RBm4LKuO4fNOfx9wl40W2rBbv5u5m+whxRYUMiaA=",
"owner": "numtide",
"repo": "treefmt-nix",
"rev": "c6aaf729f34a36c445618580a9f95a48f5e4e03f",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "treefmt-nix",
"type": "github"
}
},
"utils": {
"locked": {
"lastModified": 1659877975,
"narHash": "sha256-zllb8aq3YO3h8B/U0/J1WBgAL8EX5yWf5pMj3G0NAmc=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "c0e246b9b83f637f4681389ecabcb2681b4f3af0",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

181
flake.nix Normal file
View File

@ -0,0 +1,181 @@
{
description = "Yama and Datman";
inputs = {
utils.url = "github:numtide/flake-utils";
naersk = {
url = "github:nix-community/naersk";
inputs.nixpkgs.follows = "nixpkgs";
};
# Current Rust in nixpkgs is too old unfortunately — let's use the Fenix overlay's packages...
fenix = {
url = "github:nix-community/fenix";
inputs.nixpkgs.follows = "nixpkgs";
};
nixpkgs.url = "nixpkgs/nixos-23.11";
poetry2nix = {
url = "github:nix-community/poetry2nix";
inputs.nixpkgs.follows = "nixpkgs";
};
};
outputs = { self, nixpkgs, utils, naersk, fenix, poetry2nix }:
utils.lib.eachDefaultSystem (system: let
pkgs = nixpkgs.legacyPackages."${system}";
inherit (poetry2nix.lib.mkPoetry2Nix { inherit pkgs; }) mkPoetryApplication;
#fenixRustToolchain = fenix.packages."${system}".minimal.toolchain
# fenixRustToolchain =
# fenix."${system}".complete.withComponents [
# "cargo"
# "clippy"
# "rust-src"
# "rustc"
# "rustfmt"
# ];
# fenixRustToolchain = fenix.packages."${system}".stable.toolchain;
fenixRustToolchain =
fenix.packages."${system}".stable.withComponents [
"cargo"
"clippy"
"rust-src"
"rustc"
"rustfmt"
"rust-analyzer"
];
# rust-toolchain = pkgs.symlinkJoin {
# name = "rust-toolchain";
# paths = [fenixRustToolchain.rustc fenixRustToolchain.cargo fenixRustToolchain.clippy fenixRustToolchain.rustfmt fenixRustToolchain.rustPlatform.rustcSrc];
# };
#naersk-lib = naersk.lib."${system}";
naersk-lib = pkgs.callPackage naersk {
cargo = fenixRustToolchain;
rustc = fenixRustToolchain;
};
rustComponents = naersk-lib.buildPackage {
pname = "yama";
root = ./.;
overrideMain = attrs: {
# Set up the dev database, needed for compile-time query checking.
preConfigure = ''
export PATH="${pkgs.sqlx-cli}/bin:$PATH"
pushd yama_localcache
bash dev_db.sh
popd
'';
# Temporary, whilst we still need to occasionally rely on a debugger:
# don't strip debug symbols, at the cost of a much larger binary!
dontStrip = true;
};
buildInputs = with pkgs; [
openssl
pkg-config
sqlite
];
};
mysqlHelper = mkPoetryApplication {
projectDir = ./datman-helper-mysql;
};
postgresHelper = mkPoetryApplication {
projectDir = ./datman-helper-postgres;
};
# We want to produce a package with all of these together, with wrappers that let them
# refer to each other by name (i.e. have each other on the path).
# Datman needs the helpers on the path.
# The helpers need lz4 on the path.
allInOne = pkgs.stdenv.mkDerivation {
name = "datman-aio";
src = "${pkgs.emptyDirectory}";
installPhase = ''
# set -eu
mkdir $out $out/bin
ln -s ${rustComponents}/bin/{yama,datman,yamascan} $out/bin
ln -s ${mysqlHelper}/bin/datman-helper-mysql-{backup,restore} $out/bin
ln -s ${postgresHelper}/bin/datman-helper-postgres-{backup,restore} $out/bin
ln -s ${pkgs.lz4}/bin/lz4 $out/bin/
runHook postInstall
'';
buildInputs = [ pkgs.makeWrapper ];
postInstall = ''
# set -eu
for fnbase in {datman,yama,yamascan,datman-helper-{mysql,postgres}-{backup,restore}}; do
fn="$out/bin/$fnbase"
wrapProgram $fn --suffix PATH : $out/bin
mv "$out/bin/$fnbase" "$out/bin/7$fnbase"
done
'';
};
in rec {
# `nix build`
packages.yama = allInOne;
defaultPackage = packages.yama;
# NixOS Modules
# nixosModules = {
# yama = import ./nixos_modules/yama.nix self;
# };
# `nix run`
apps.yama = utils.lib.mkApp {
drv = rustComponents;
};
defaultApp = apps.yama;
# `nix develop`
devShell = pkgs.mkShell {
buildInputs = [
fenixRustToolchain
#rust-toolchain
pkgs.pkg-config
pkgs.alsa-lib
pkgs.sqlite
pkgs.sqlx-cli
#pkgs.libclang # ??
];
nativeBuildInputs = [
pkgs.openssl
pkgs.python3
];
# Needed for bindgen when binding to avahi
LIBCLANG_PATH="${pkgs.llvmPackages_latest.libclang.lib}/lib";
# Don't know if this var does anything by itself, but you need to feed this value in to IntelliJ IDEA and it's probably easier to pull out of an env var than look it up each time.
RUST_SRC_PATH = "${fenixRustToolchain}/lib/rustlib/src/rust/library";
# Cargo culted:
# Add to rustc search path
RUSTFLAGS = (builtins.map (a: ''-L ${a}/lib'') [
]);
# Add to bindgen search path
BINDGEN_EXTRA_CLANG_ARGS =
# Includes with normal include path
(builtins.map (a: ''-I"${a}/include"'') [
])
# Includes with special directory paths
++ [
''-I"${pkgs.llvmPackages_latest.libclang.lib}/lib/clang/${pkgs.llvmPackages_latest.libclang.version}/include"''
#''-I"${pkgs.glib.dev}/include/glib-2.0"''
#''-I${pkgs.glib.out}/lib/glib-2.0/include/''
];
#nativeBuildInputs = with pkgs; [ rustc cargo ];
};
});
}

View File

@ -4,7 +4,7 @@ if [ $# -ge 1 ]
then
files=$*
else
files="testsuite/setup.py testsuite/datmantests testsuite/helpers testsuite/yamatests datman-helper-postgres/datman_helper_postgres datman-helper-postgres/setup.py datman-helper-mysql/datman_helper_mysql datman-helper-mysql/setup.py"
files="testsuite/setup.py testsuite/datmantests testsuite/helpers testsuite/yamatests datman-helper-postgres/datman_helper_postgres datman-helper-mysql/datman_helper_mysql"
fi
echo "Linting these locations: $files"

51
shell.nix Normal file
View File

@ -0,0 +1,51 @@
{ pkgs ? import <nixpkgs> {} }:
let
# We may need some packages from nixpkgs-unstable
#unstable = import <nixpkgs-unstable> {};
rust-toolchain = pkgs.symlinkJoin {
name = "rust-toolchain";
paths = [pkgs.rustc pkgs.cargo pkgs.clippy pkgs.rustfmt pkgs.rustPlatform.rustcSrc];
};
in
pkgs.mkShell {
buildInputs = [
rust-toolchain
pkgs.pkg-config
pkgs.alsa-lib
pkgs.sqlite
pkgs.sqlx-cli
#pkgs.libclang # ??
];
nativeBuildInputs = [
pkgs.openssl
pkgs.python3
];
# Needed for bindgen when binding to avahi
LIBCLANG_PATH="${pkgs.llvmPackages_latest.libclang.lib}/lib";
# Cargo culted:
# Add to rustc search path
RUSTFLAGS = (builtins.map (a: ''-L ${a}/lib'') [
]);
# Add to bindgen search path
BINDGEN_EXTRA_CLANG_ARGS =
# Includes with normal include path
(builtins.map (a: ''-I"${a}/include"'') [
])
# Includes with special directory paths
++ [
''-I"${pkgs.llvmPackages_latest.libclang.lib}/lib/clang/${pkgs.llvmPackages_latest.libclang.version}/include"''
#''-I"${pkgs.glib.dev}/include/glib-2.0"''
#''-I${pkgs.glib.out}/lib/glib-2.0/include/''
];
}

View File

@ -251,7 +251,8 @@ kind = {{ stdout = "blahblah.txt" }}
seed = 7555
print(f"seed: {seed}")
rng.seed(seed)
# min_files is 8 because we need enough files to use each label for this test to succeed.
# min_files is 8 because we need enough files to use each label for this
# test to succeed.
initial_descriptor, _ = generate_random_dir(rng, src_path, 32, min_files=8)
labellings = generate_labels(initial_descriptor, rng)
save_labelling_rules(labelling_path.joinpath("srca.zst"), labellings)
@ -298,3 +299,81 @@ kind = {{ stdout = "blahblah.txt" }}
)
td.cleanup()
# Integration test: make three incremental backups, delete the middle pointer,
# then check that the newest pointer can still be extracted and matches the
# source directory as it was at the time of the last backup.
def test_backup_incremental_with_mid_delete(self):
td = TemporaryDirectory("test_backup_incremental_with_mid_delete")
tdpath = Path(td.name)
datman_path = tdpath.joinpath("datman")
src_path = datman_path.joinpath("srca")
yama_path = datman_path.joinpath("main")
set_up_simple_datman(datman_path)
set_up_simple_yama(yama_path)
# The seed is printed so that a failing run can be reproduced.
rng = Random()
seed = rng.randint(0, 9001)
print(f"seed: {seed}")
rng.seed(seed)
initial_descriptor, _ = generate_random_dir(rng, src_path, 32)
print("storing")
subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)
# now mutate and store incremental
randomly_mutate_directory_in_descriptor(initial_descriptor, src_path, rng)
# NOTE(review): the sleep presumably keeps the pointer names of successive
# backups distinct and ordered (see the assertLess checks below) -- confirm.
time.sleep(2)
subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)
# now mutate and store incremental again!
randomly_mutate_directory_in_descriptor(initial_descriptor, src_path, rng)
# Snapshot the final state of the source; this is what extraction must reproduce.
mutated_descriptor = scan_dir(src_path)
time.sleep(2)
subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)
# One non-empty output line per pointer.
pointer_names = [
line
for line in subprocess.check_output(("yama", "debug", "lsp"), cwd=yama_path)
.decode()
.split("\n")
if line
]
self.assertEqual(len(pointer_names), 3)
self.assertLess(pointer_names[0], pointer_names[1])
self.assertLess(pointer_names[1], pointer_names[2])
print(f"removing mid pointer {pointer_names[1]}")
subprocess.check_call(
("yama", "debug", "rmp", pointer_names[1]),
cwd=yama_path,
)
print("extracting last pointer to check still valid")
dest_path = tdpath.joinpath("desta")
subprocess.check_call(
(
"datman",
"extract",
"--skip-metadata",
"--accept-partial",
"main",
"../desta",
),
cwd=datman_path,
)
# this will be wrapped in a directory that starts with the name srca+
extracted_dir_descriptor_wrapper = scan_dir(dest_path)
contents = extracted_dir_descriptor_wrapper.contents
self.assertEqual(len(contents), 1)
key, value = next(iter(contents.items()))
self.assertTrue(key.startswith("srca+"))
self.assertIsInstance(value, DirectoryDescriptor)
# Inside the wrapper there should be the extracted "srca" directory itself.
key, value = next(iter(value.contents.items()))
self.assertEqual(key, "srca")
self.assertEqual(value.ignore_metadata(), mutated_descriptor.ignore_metadata())
td.cleanup()

View File

@ -1,6 +1,7 @@
import shutil
import subprocess
from pathlib import Path
from typing import Set
def set_up_simple_yama(path: Path):
@ -10,3 +11,13 @@ def set_up_simple_yama(path: Path):
"example_zstd.dict"
)
shutil.copyfile(example_zstd_path, path.joinpath("important_zstd.dict"))
def list_bloblog_ids(pile: Path) -> Set[int]:
    """Collect the integer IDs of the entries in a pile's ``bloblog`` directory.

    Every entry of ``<pile>/bloblog`` whose name parses as an ``int`` contributes
    that integer; entries with any other name are ignored.
    """
    ids: Set[int] = set()
    for entry in pile.joinpath("bloblog").iterdir():
        try:
            bloblog_id = int(entry.name)
        except ValueError:
            # Not a numerically-named entry; skip it.
            continue
        ids.add(bloblog_id)
    return ids

View File

@ -22,7 +22,7 @@ REQUIRED = ["green", "attrs", "immutabledict"]
# What packages are optional?
EXTRAS = {"dev": ["black==21.7b0", "flake8==3.9.2", "isort==5.9.2"]}
EXTRAS = {"dev": ["black==22.10.0", "flake8==3.9.2", "isort==5.9.2"]}
# The rest you shouldn't have to touch too much :)
# ------------------------------------------------

View File

@ -0,0 +1,175 @@
import subprocess
from pathlib import Path
from random import Random
from tempfile import TemporaryDirectory
from unittest import TestCase
from helpers import (
DirectoryDescriptor,
generate_random_dir,
randomly_mutate_directory_in_descriptor,
scan_dir,
)
from helpers.datman_helpers import set_up_simple_datman
from helpers.yama_helpers import list_bloblog_ids, set_up_simple_yama
class TestYamaCompact(TestCase):
    """Integration tests for `yama compact`, driven through the real CLI binaries."""

    def test_compaction_merge_two_small_bloblogs(self):
        """Two backups yield several bloblogs; compaction merges them into one."""
        # Use this test's own name as the temp-dir suffix (it previously carried
        # the name of an unrelated test).
        td = TemporaryDirectory("test_compaction_merge_two_small_bloblogs")
        tdpath = Path(td.name)

        datman_path = tdpath.joinpath("datman")
        src_path = datman_path.joinpath("srca")
        yama_path = datman_path.joinpath("main")

        set_up_simple_datman(datman_path)
        set_up_simple_yama(yama_path)

        # The seed is printed so that a failing run can be reproduced.
        rng = Random()
        seed = rng.randint(0, 9001)
        print(f"seed: {seed}")
        rng.seed(seed)
        generate_random_dir(rng, src_path, 32)

        # Back up twice: that way we should get at least two bloblogs!
        subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)
        subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)

        old_bloblog_ids = list_bloblog_ids(yama_path)

        self.assertGreater(
            len(old_bloblog_ids), 1, "Should be many bloblogs at this point"
        )

        subprocess.check_call(
            (
                "yama",
                "compact",
                "--mergeable",
                "2",
                "--small",
                str(2 * 1024 * 1024 * 1024),
            ),
            cwd=yama_path,
        )

        new_bloblog_ids = list_bloblog_ids(yama_path)
        self.assertEqual(
            len(new_bloblog_ids), 1, "Should only be 1 bloblog at this point."
        )
        self.assertEqual(
            list(new_bloblog_ids)[0],
            max(old_bloblog_ids) + 1,
            "New bloblog ID should be 1 greater than the max old one.",
        )

        # Remove the temporary pile explicitly, as the sibling test does.
        td.cleanup()

    def test_gc_then_compact(self):
        """Compaction is a no-op until a pointer is removed and GC has been applied."""
        td = TemporaryDirectory("test_gc_then_compact")
        tdpath = Path(td.name)

        datman_path = tdpath.joinpath("datman")
        src_path = datman_path.joinpath("srca")
        yama_path = datman_path.joinpath("main")

        set_up_simple_datman(datman_path)
        set_up_simple_yama(yama_path)

        # The seed is printed so that a failing run can be reproduced.
        rng = Random()
        seed = rng.randint(0, 9001)
        print(f"seed: {seed}")
        rng.seed(seed)
        initial_descriptor, _ = generate_random_dir(rng, src_path, 32)

        subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)

        # Remember the first pointer so it can be deleted later on.
        orig_pointer_name = (
            subprocess.check_output(("yama", "debug", "lsp"), cwd=yama_path)
            .decode()
            .split("\n")[0]
        )

        randomly_mutate_directory_in_descriptor(initial_descriptor, src_path, rng)
        mutated_descriptor = scan_dir(src_path)

        subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)

        old_bloblog_ids = list_bloblog_ids(yama_path)

        # Try a GC and check that it's a no-op
        subprocess.check_call(
            ("yama", "check", "--shallow", "--apply-gc"), cwd=yama_path
        )
        subprocess.check_call(
            (
                "yama",
                "compact",
                "--mergeable",
                "2000",
                "--reclaim",
                "1",
                "--max-dealloc",
                "1",
            ),
            cwd=yama_path,
        )

        unchanged_bloblog_ids = list_bloblog_ids(yama_path)
        self.assertEqual(
            old_bloblog_ids,
            unchanged_bloblog_ids,
            "No GC: no compaction should have happened.",
        )

        subprocess.check_call(
            ("yama", "debug", "rmp", orig_pointer_name), cwd=yama_path
        )

        # Try a GC and check that it did something
        subprocess.check_call(
            ("yama", "check", "--shallow", "--apply-gc"), cwd=yama_path
        )
        subprocess.check_call(
            (
                "yama",
                "compact",
                "--mergeable",
                "2000",
                "--reclaim",
                "1",
                "--max-dealloc",
                "1",
            ),
            cwd=yama_path,
        )

        new_bloblog_ids = list_bloblog_ids(yama_path)
        self.assertNotEqual(
            old_bloblog_ids, new_bloblog_ids, "GC: compaction should have happened."
        )

        # Check that we can still extract the files!
        dest_path = tdpath.joinpath("desta")
        subprocess.check_call(
            (
                "datman",
                "extract",
                "--skip-metadata",
                "--accept-partial",
                "main",
                "../desta",
            ),
            cwd=datman_path,
        )

        # The extraction is wrapped in a directory whose name starts with "srca+".
        extracted_dir_descriptor_wrapper = scan_dir(dest_path)

        contents = extracted_dir_descriptor_wrapper.contents
        self.assertEqual(len(contents), 1)
        key, value = next(iter(contents.items()))
        self.assertTrue(key.startswith("srca+"))

        self.assertIsInstance(value, DirectoryDescriptor)
        key, value = next(iter(value.contents.items()))
        self.assertEqual(key, "srca")

        self.assertEqual(value.ignore_metadata(), mutated_descriptor.ignore_metadata())

        td.cleanup()

View File

@ -1,6 +1,6 @@
[package]
name = "yama"
version = "0.5.0-alpha.2"
version = "0.7.0-alpha.1"
authors = ["Olivier 'reivilibre' <olivier@librepush.net>"]
edition = "2018"
description = "Deduplicated, compressed and encrypted content pile manager"
@ -11,35 +11,48 @@ license = "GPL-3.0-or-later"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
fastcdc = "1.0.2"
zstd = "0.6.0" # 0.6.0+zstd.1.4.8
sshish = "0.1.0"
clap = "= 3.0.0-beta.5"
blake = "2.0.0"
twox-hash = "1.5.0"
serde = { version = "1.0.104", features = ["derive"] }
serde_bare = "0.3.0"
users = "0.9.1"
crossbeam-channel = "0.5.1"
crossbeam-utils = "0.8.5"
toml = "0.5.5"
glob = "0.3.0"
nix = "0.17.0"
log = "0.4"
env_logger = "0.7.1"
indicatif = "0.14.0"
num_cpus = "1"
anyhow = "1.0"
thiserror = "1.0"
sodiumoxide = "0.2.6"
byteorder = "1"
itertools = "0.9.0"
rayon = "1.5.0"
rusqlite = "0.24.2"
chrono = "0.4.19"
rustyline = "7.1.0"
derivative = "2.2.0"
metrics = "0.17.1"
eyre = "0.6.8"
tracing = "0.1.37"
ignore = "0.4.20"
patricia_tree = "0.5.7"
users = "0.11.0"
serde = { version = "1.0.160", features = ["derive"] }
yama_pile = { path = "../yama_pile" }
yama_localcache = { path = "../yama_localcache" }
yama_wormfile = { path = "../yama_wormfile" }
yama_wormfile_fs = { path = "../yama_wormfile_fs" }
yama_wormfile_s3 = { path = "../yama_wormfile_s3" }
yama_wormfile_sftp = { path = "../yama_wormfile_sftp" }
yama_midlevel_crypto = { path = "../yama_midlevel_crypto" }
clap = { version = "4.2.2", features = ["derive"] }
tokio = { version = "1.28.1", features = ["full"] }
appdirs = "0.2.0"
twox-hash = "1.6.3"
hostname = "0.3.1"
tracing-subscriber = { version = "0.3.16", features = ["tracing-log", "env-filter"] }
tracing-indicatif = "0.3.0"
indicatif = "0.17.3"
dashmap = "5.4.0"
fastcdc = "3.0.3"
zstd = "0.12.3"
memmap2 = "0.5.10"
flume = "0.10.14"
async-recursion = "1.0.4"
toml = "0.7.3"
io-streams = "0.14.3"
dust_style_filetree_display = "0.8.5"
async-backtrace = "0.2.6"
signal-hook = "0.3.17"
[dev-dependencies]
temp-dir = "0.1.11"
maplit = "1.0.2"

File diff suppressed because it is too large Load Diff

237
yama/src/bin/yamascan.rs Normal file
View File

@ -0,0 +1,237 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::collections::BTreeMap;
use std::io::SeekFrom;
use std::path::{Path, PathBuf};
use clap::Parser;
use dust_style_filetree_display::display::{draw_it, InitialDisplayData};
use dust_style_filetree_display::filter::AggregateData;
use dust_style_filetree_display::node::Node;
use dust_style_filetree_display::{get_height_of_terminal, get_width_of_terminal, init_color};
use eyre::{bail, Context, ContextCompat};
use patricia_tree::PatriciaMap;
use tokio::fs::OpenOptions;
use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
use yama::scan;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;
use yama_pile::tree::unpopulated::ScanEntry;
// Subcommands accepted by the `yamascan` binary, parsed with clap's derive API.
//
// NOTE: the `///` doc comments on the variants and fields below are picked up by
// clap as help text, so editing them changes user-visible output. Maintainer
// notes are therefore written as plain `//` comments.
#[derive(Parser, Clone, Debug)]
pub enum YamaScanCommand {
/// Add an entry to an ignore file
#[command(alias = "i")]
Ignore {
/// What to ignore
path: String,
/// Don't anchor the match to this directory.
#[arg(short = 'a')]
// When false, `main` writes the rule with a leading '/'.
unanchored: bool,
},
/// Show dust-style usage graph of the current directory, excluding excluded files.
#[command(alias = "du")]
Usage {
/// Specify an ignore rule. Can use multiple times.
#[arg(short = 'I', long = "ignore")]
// Passed straight through to `scan::scan` by `main`.
ignore: Vec<String>,
},
}
/// Entry point of `yamascan`.
///
/// Installs a tracing subscriber (filter taken from the environment, falling back
/// to `sqlx=warn,yama=debug,info`) and then dispatches on the parsed subcommand:
///
/// * `Usage` scans the current directory with the given ignore rules and draws a
///   dust-style usage tree sized to the terminal.
/// * `Ignore` appends a rule to `.yamaignore` in the current directory, creating
///   the file if needed. Unless `unanchored` is set, the rule is written with a
///   leading `/`.
#[tokio::main]
async fn main() -> eyre::Result<()> {
    tracing_subscriber::registry()
        .with(
            tracing_subscriber::EnvFilter::try_from_default_env()
                .unwrap_or_else(|_| "sqlx=warn,yama=debug,info".into()),
        )
        .with(tracing_subscriber::fmt::layer())
        .init();

    // Both variants are handled, so no catch-all arm is needed; adding a variant
    // will now produce a compile error here instead of a runtime `todo!()`.
    match YamaScanCommand::parse() {
        YamaScanCommand::Usage { ignore } => {
            let idd = InitialDisplayData {
                short_paths: true,
                is_reversed: false,
                colors_on: !init_color(false),
                by_filecount: false,
                is_screen_reader: false,
                iso: false,
            };

            let scan = scan::scan(Path::new("."), &ignore).context("Couldn't scan")?;
            let top_nodes = assemble_display_tree_from_scan_entries(scan)?.children;

            let root_display_node = dust_style_filetree_display::filter::get_biggest(
                top_nodes,
                AggregateData {
                    min_size: None,
                    only_dir: false,
                    only_file: false,
                    number_of_lines: get_height_of_terminal(),
                    depth: usize::MAX,
                    using_a_filter: false,
                },
            )
            .expect("no root?");

            draw_it(
                idd,
                false,
                get_width_of_terminal(),
                &root_display_node,
                false,
            )
        }
        YamaScanCommand::Ignore { path, unanchored } => {
            let mut ignore_file = OpenOptions::new()
                .read(true)
                .write(true)
                .create(true)
                .truncate(false)
                .open(".yamaignore")
                .await
                .context("failed to open .yamaignore for r/w")?;

            // Make sure the existing content ends with a newline, so the new rule
            // starts on a line of its own. Any non-empty file is checked, including
            // one that is a single byte long.
            let pos = ignore_file.seek(SeekFrom::End(0)).await?;
            if pos > 0 {
                ignore_file.seek(SeekFrom::End(-1)).await?;
                let last_byte = ignore_file.read_u8().await?;
                if last_byte != b'\n' {
                    ignore_file.write_u8(b'\n').await?;
                }
            }

            let rule = if unanchored {
                format!("{}\n", path)
            } else {
                format!("/{}\n", path)
            };
            ignore_file.write_all(rule.as_bytes()).await?;

            ignore_file.flush().await?;
            drop(ignore_file);
        }
    }
    Ok(())
}
/// Converts a flat map of scan entries, keyed by `/`-separated path, into the
/// nested `Node` tree used for display.
///
/// Works in two phases:
/// 1. Every entry becomes a childless `Node` filed under its parent directory's
///    path. Directories and symlinks are given a nominal size of 4096; files use
///    their recorded size.
/// 2. Directories are rolled up from the greatest key downwards, attaching each
///    directory's children and adding their sizes to the directory's own.
///
/// The root entry has the empty path. It is first filed as a child named `""`
/// inside a pseudo-root, and is pulled back out when the pseudo-root is rolled up.
///
/// Fails if a key is not valid UTF-8, if an entry appears before its parent
/// directory, or if no root is found.
pub fn assemble_display_tree_from_scan_entries(scan: PatriciaMap<ScanEntry>) -> eyre::Result<Node> {
    let mut dirs: BTreeMap<String, BTreeMap<String, Node>> = BTreeMap::new();
    // special-case the root ("")
    dirs.insert(String::new(), BTreeMap::new());

    for (raw_key, entry) in scan.into_iter() {
        let full_path = String::from_utf8(raw_key).context("bad UTF-8 in PMap")?;
        let (parent_path, leaf_name) = match full_path.rsplit_once('/') {
            Some(split) => split,
            None => ("", full_path.as_str()),
        };

        // Work out the node's own size; a directory also registers itself as a
        // container for the entries that follow.
        let own_size = match entry {
            ScanEntry::NormalFile { size, .. } => size,
            ScanEntry::Directory { .. } => {
                dirs.insert(full_path.clone(), BTreeMap::new());
                4096
            }
            ScanEntry::SymbolicLink { .. } => 4096,
        };

        // note: for the root, this inserts the root entry as a child called "" within a
        // fake root 'directory'. That's fine. We'll patch this up later.
        let siblings = dirs
            .get_mut(parent_path)
            .context("bad PMap: parent not seen first")?;
        siblings.insert(
            leaf_name.to_owned(),
            Node {
                name: PathBuf::from(&full_path),
                size: own_size,
                children: vec![],
                inode_device: None,
                depth: 0,
            },
        );
    }

    // Now roll up the directories, greatest key first, so that every directory is
    // processed before its parent.
    while let Some(deepest) = dirs.keys().next_back().cloned() {
        let mut members = dirs.remove(&deepest).unwrap();

        if deepest.is_empty() {
            assert!(
                dirs.is_empty(),
                "when pulling out root pseudo-dir, dirs must be empty for roll-up."
            );
            let mut root = members.remove("").unwrap();
            root.children = members.into_values().collect();
            root.size += root.children.iter().map(|c| c.size).sum::<u64>();
            return Ok(root);
        }

        // We want to roll up the directory last/key -> {child -> ...}
        // so last -> {key -> {child -> ...}}
        let (parent_path, leaf_name) = match deepest.rsplit_once('/') {
            Some(split) => split,
            None => ("", deepest.as_str()),
        };
        let dir_node = dirs
            .get_mut(parent_path)
            .context("bad PMap? no parent in rollup")?
            .get_mut(leaf_name)
            .context("dir child not populated")?;
        dir_node.children = members.into_values().collect();
        dir_node.size += dir_node.children.iter().map(|c| c.size).sum::<u64>();
    }

    bail!("no root found; bad PMap or bad roll-up???");
}

223
yama/src/check.rs Normal file
View File

@ -0,0 +1,223 @@
use crate::extract::expand_chunkrefs;
use crate::pile_with_cache::PileWithCache;
use crate::retriever::decompressor::DECOMPRESS_CAPACITY;
use crate::PROGRESS_BAR_STYLE;
use eyre::{bail, eyre, Context, ContextCompat};
use flume::{Receiver, Sender};
use indicatif::ProgressStyle;
use std::collections::{BTreeMap, BTreeSet};
use std::sync::Arc;
use tokio::task::JoinSet;
use tracing::{error, info, info_span, Instrument, Span};
use tracing_indicatif::span_ext::IndicatifSpanExt;
use yama_midlevel_crypto::chunk_id::ChunkId;
use yama_pile::definitions::BloblogId;
use yama_pile::tree::TreeNode;
use yama_wormfile::boxed::BoxedWormFileProvider;
use zstd::bulk::Decompressor;
/// Check that all pointers point to chunks that exist **in our local cache**.
///
/// For every pointer in the pile this:
/// 1. verifies that the pointer's parent (if it names one) is itself a listed pointer;
/// 2. collects the chunk reference of every `TreeNode::NormalFile` in the pointer's tree.
///
/// The collected references are then expanded into individual chunk IDs, which are looked
/// up in the local cache.
///
/// # Errors
/// Fails if the pointers cannot be listed or read, if a pointer names a parent that is not
/// in the pointer list, if the chunk references cannot be expanded, or if the number of
/// located chunks differs from the number of chunk IDs requested.
pub async fn check_pointers_point_to_indexed_chunks(
    pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
) -> eyre::Result<()> {
    let pointer_names = pwc.pile.list_pointers().await?;
    let mut rcrs_to_check = BTreeSet::new();
    for pointer_name in &pointer_names {
        let pointer = pwc
            .pile
            .read_pointer(pointer_name)
            .await?
            .context("pointer vanished")?;
        if let Some(parent_name) = &pointer.parent {
            // It is the *parent's* name that must be looked up here: `pointer_name` came
            // from `pointer_names` itself, so testing it could never fail.
            if !pointer_names.contains(parent_name) {
                bail!("{parent_name:?}, the parent of {pointer_name:?}, does not exist");
            }
        }
        pointer
            .root
            .node
            .visit(
                &mut |node, _| {
                    // Only normal files carry content chunk references.
                    if let TreeNode::NormalFile { content, .. } = node {
                        rcrs_to_check.insert(*content);
                    }
                    Ok(())
                },
                String::new(),
            )
            // The visitor closure above never returns an error.
            .unwrap();
    }
    let chunk_ids: BTreeSet<ChunkId> =
        expand_chunkrefs(pwc, rcrs_to_check.into_iter().map(|x| ((), x)))
            .await?
            .into_iter()
            .flat_map(|(_, expanded)| expanded)
            .collect();
    info!("{} chunks to check for existence", chunk_ids.len());
    let mut cache = pwc.localcache.read().await?;
    let resolved_chunks = cache.locate_chunks(&chunk_ids).await?;
    if chunk_ids.len() != resolved_chunks.len() {
        bail!("Not all chunk IDs could be resolved. TODO: this check error is currently not granular enough.");
    }
    info!("All {} chunks accounted for!", resolved_chunks.len());
    Ok(())
}
/// Verifies every chunk listed in the footer of a single bloblog.
///
/// Chunks are visited in ascending order of `locator.offset`. Each one is read from the
/// bloblog, decompressed on a blocking thread (using the pile's Zstd dictionary when the
/// pile config has one) and then verified against its chunk ID with the pile's chunk ID key.
///
/// Returns the number of chunks that were checked.
///
/// # Errors
/// Fails if the bloblog cannot be opened, or on the first chunk that is missing, cannot be
/// decompressed, or does not verify.
pub async fn check_all_chunk_hashes_in_bloblog(
    pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
    bloblog_id: BloblogId,
) -> eyre::Result<u64> {
    let pile_config = &pwc.pile.pile_config;
    let mut decompressor = if let Some(dict) = &pile_config.zstd_dict {
        Decompressor::with_dictionary(dict)?
    } else {
        Decompressor::new()?
    };
    let chunk_id_key = &pile_config.chunk_id_key;

    let mut bloblog = pwc
        .pile
        .read_bloblog(bloblog_id)
        .await
        .with_context(|| format!("could not open bloblog for checking: {bloblog_id:?}"))?;

    // Keying by offset makes the iteration below follow ascending offsets.
    let chunks_by_offset: BTreeMap<u64, ChunkId> = bloblog
        .footer()
        .chunks
        .iter()
        .map(|(chunk_id, locator)| (locator.offset, *chunk_id))
        .collect();

    let mut buf = Vec::with_capacity(DECOMPRESS_CAPACITY);
    let mut checked = 0;
    for chunk_id in chunks_by_offset.into_values() {
        let blob = bloblog
            .read_chunk(chunk_id)
            .await?
            .context("missing chunk")?;

        // The decompressor and the scratch buffer are moved into the blocking task and handed
        // back afterwards so that both can be reused for the next chunk.
        let decompress_job = tokio::task::spawn_blocking(move || {
            decompressor
                .decompress_to_buffer(&blob, &mut buf)
                .with_context(|| format!("failed to decompress {chunk_id:?} in {bloblog_id:?}"))?;
            Ok::<_, eyre::Error>((decompressor, buf))
        });
        (decompressor, buf) = decompress_job.await??;

        if !chunk_id.verify(&buf, chunk_id_key) {
            bail!("verification failure: chunk {chunk_id:?} in bloblog {bloblog_id:?} is corrupt!");
        }
        checked += 1;
    }
    Ok(checked)
}
/// Checks every chunk of each of the given bloblogs using a pool of worker tasks.
///
/// Bloblog IDs are handed to the workers over one bounded channel; each worker sends back
/// one result per bloblog (the number of chunks checked, or the error) over a second bounded
/// channel. Progress is reported on the `check_all_chunks` span.
///
/// # Errors
/// Fails if fewer progress reports than bloblogs were received, or if any bloblog reported
/// a check failure (each failure is logged as it arrives).
///
/// # Panics
/// Panics if the progress bar template is rejected, if a bloblog ID can no longer be
/// submitted to the workers, or if more progress reports arrive than there are bloblogs.
pub async fn check_all_chunks_in_bloblogs(
    pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
    bloblogs: &BTreeSet<BloblogId>,
) -> eyre::Result<()> {
    let prog_span = info_span!("check_all_chunks");
    async {
        // This block is instrumented with `prog_span` below, so `Span::current()` is that span.
        let prog_span = Span::current();
        prog_span.pb_set_style(
            &ProgressStyle::default_bar()
                .template(PROGRESS_BAR_STYLE)
                .unwrap(),
        );
        prog_span.pb_set_length(bloblogs.len() as u64);
        prog_span.pb_set_message("checking all bloblogs");
        let mut workers = JoinSet::new();
        let (bloblog_id_tx, bloblog_id_rx) = flume::bounded(16);
        let (progress_tx, progress_rx) = flume::bounded(16);
        // Spawn a fixed pool of 4 workers, each with its own clone of the channel ends.
        for _ in 0..4 {
            let pwc = pwc.clone();
            let bloblog_id_rx = bloblog_id_rx.clone();
            let progress_tx = progress_tx.clone();
            workers.spawn(async {
                if let Err(err) =
                    check_all_chunks_in_bloblogs_worker(pwc, bloblog_id_rx, progress_tx).await
                {
                    error!("Error in chunk checker worker: {err:?}")
                }
            });
        }
        // Drop our own copies so that only the workers hold these channel ends; the progress
        // loop below relies on `progress_rx` reporting disconnection once the workers are done.
        drop(progress_tx);
        drop(bloblog_id_rx);
        let mut success = true;
        let mut num_bloblogs_outstanding = bloblogs.len();
        let mut total_chunks_checked = 0u64;
        // Feed bloblog IDs and drain progress reports concurrently.
        tokio::join!(
            async move {
                for bloblog_id in bloblogs {
                    bloblog_id_tx
                        .send_async(*bloblog_id)
                        .await
                        .expect("can no longer submit new bloblogs to be checked; fault?");
                }
                // Dropping the sender lets the workers' receive loops end.
                drop(bloblog_id_tx);
            },
            async {
                while let Ok(res) = progress_rx.recv_async().await {
                    match res {
                        Ok(chunks_checked) => {
                            total_chunks_checked += chunks_checked;
                        }
                        Err(err) => {
                            // Record the failure but keep going so that every bloblog is checked.
                            error!("check failure: {err:?}");
                            success = false;
                        }
                    }
                    prog_span.pb_inc(1);
                    num_bloblogs_outstanding = num_bloblogs_outstanding
                        .checked_sub(1)
                        .expect("more bloblogs progress reports than expected?");
                }
            }
        );
        // Every bloblog should have produced exactly one report by now.
        if num_bloblogs_outstanding > 0 {
            bail!("{num_bloblogs_outstanding} bloblogs outstanding somehow");
        }
        info!("{total_chunks_checked} chunks checked!");
        if !success {
            bail!("There were chunk check failures.");
        }
        Ok(())
    }
    .instrument(prog_span)
    .await
}
/// Worker loop: receives bloblog IDs from `bloblogs_rx`, checks every chunk hash in each
/// bloblog and sends the outcome (chunk count or error) to `progress_tx`.
///
/// Returns `Ok(())` once `bloblogs_rx` can yield no more IDs.
///
/// # Errors
/// Fails if an outcome can no longer be sent on `progress_tx`.
pub async fn check_all_chunks_in_bloblogs_worker(
    pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
    bloblogs_rx: Receiver<BloblogId>,
    progress_tx: Sender<eyre::Result<u64>>,
) -> eyre::Result<()> {
    loop {
        let bloblog_id = match bloblogs_rx.recv_async().await {
            Ok(id) => id,
            // No more work will arrive.
            Err(_) => return Ok(()),
        };
        let outcome = check_all_chunk_hashes_in_bloblog(&pwc, bloblog_id).await;
        if progress_tx.send_async(outcome).await.is_err() {
            return Err(eyre!("check progress tx shut down"));
        }
    }
}

View File

@ -1,239 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::cmp::min;
use std::io;
use std::io::{Cursor, Read, Write};
use anyhow::anyhow;
use fastcdc::FastCDC;
use crate::definitions::{ChunkId, RecursiveChunkRef};
use crate::pile::{Pile, RawPile};
use crate::utils::bytes_to_hexstring;
/// 1 MiB. Presumably intended as the `threshold` argument of `RecursiveChunker::new`
/// (TODO confirm against callers).
pub const SENSIBLE_THRESHOLD: usize = 1024 * 1024;
/// Minimum chunk size passed to `FastCDC::new`: 256 kiB.
// 256 kiB
pub const FASTCDC_MIN: usize = 256 * 1024;
/// Average chunk size passed to `FastCDC::new`: 1 MiB.
// 1 MiB
pub const FASTCDC_AVG: usize = 1024 * 1024;
/// Maximum chunk size passed to `FastCDC::new`: 8 MiB.
// 8 MiB
pub const FASTCDC_MAX: usize = 8 * 1024 * 1024;
/// Something that finished chunks can be handed to.
pub trait ChunkSubmissionTarget: Sync {
    /// Hands over the chunk `chunk_data`, identified by `chunk_id`.
    fn submit(&self, chunk_id: ChunkId, chunk_data: &[u8]) -> anyhow::Result<()>;
}

/// A pile accepts chunks by forwarding them to `submit_chunk`.
impl<RP: RawPile> ChunkSubmissionTarget for Pile<RP> {
    fn submit(&self, chunk_id: ChunkId, chunk_data: &[u8]) -> anyhow::Result<()> {
        self.submit_chunk(chunk_id, chunk_data)
    }
}

/// A channel sender accepts chunks by sending an owned copy of the data down the channel.
impl ChunkSubmissionTarget for crossbeam_channel::Sender<(ChunkId, Vec<u8>)> {
    fn submit(&self, chunk_id: ChunkId, chunk_data: &[u8]) -> anyhow::Result<()> {
        let owned_chunk = chunk_data.to_vec();
        match self.send((chunk_id, owned_chunk)) {
            Ok(()) => Ok(()),
            Err(_) => Err(anyhow::anyhow!("Failed to send to pipeline.")),
        }
    }
}
/// A chunker that will generate nested chunks of chunk references if there is that much data
/// to store.
/// The root RecursiveChunker is fed data bytes.
/// If it exceeds the nominated threshold, it grows a child RecursiveChunker (which may do the same).
/// When done, `finish` should be called to flush the buffers and obtain a `RecursiveChunkRef`.
///
/// Data is fed in through the `Write` implementation.
pub struct RecursiveChunker<'cst, CST: ChunkSubmissionTarget> {
    /// The pile to submit chunks to.
    /// (More generally: any `ChunkSubmissionTarget`.)
    target: &'cst CST,
    /// Buffer of data at this level.
    /// Holds bytes that have been written but not yet emitted as a chunk.
    buffer: Vec<u8>,
    /// The next-layer recursive chunker. This is where this chunker will submit chunk IDs to for
    /// recursive chunking.
    /// `None` until the buffer first exceeds `threshold`.
    next_layer: Option<Box<RecursiveChunker<'cst, CST>>>,
    /// The size at which this chunker will perform recursive chunking.
    /// Compared against the buffer length in bytes.
    threshold: usize,
}
impl<'cst, CST: ChunkSubmissionTarget> RecursiveChunker<'cst, CST> {
    /// Creates an empty chunker that submits to `target` and begins recursive chunking once
    /// its buffer grows beyond `threshold` bytes.
    pub fn new(threshold: usize, target: &'cst CST) -> Self {
        Self {
            buffer: Vec::new(),
            next_layer: None,
            target,
            threshold,
        }
    }

    /// Cuts the buffer into chunks, submits them to the target and removes the submitted
    /// bytes from the front of the buffer.
    ///
    /// The chunk that ends exactly at the end of the buffer is only emitted when `finalise`
    /// is true; otherwise it stays buffered, since more data may still extend it.
    ///
    /// Returns the concatenated IDs of the submitted chunks.
    fn do_chunking(&mut self, finalise: bool) -> anyhow::Result<Vec<u8>> {
        let buffered_len = self.buffer.len();
        let mut chunk_id_bytes: Vec<u8> = Vec::new();
        let mut consumed = 0usize;

        for chunk in FastCDC::new(&self.buffer, FASTCDC_MIN, FASTCDC_AVG, FASTCDC_MAX) {
            let end = chunk.offset + chunk.length;
            if end == buffered_len && !finalise {
                // Possibly truncated trailing chunk: keep it for later.
                continue;
            }
            consumed = end;
            let chunk_data = &self.buffer[chunk.offset..end];
            let chunk_id = calculate_chunkid(chunk_data);
            chunk_id_bytes.extend_from_slice(&chunk_id);
            self.target.submit(chunk_id, chunk_data)?;
        }

        if consumed > 0 {
            self.buffer.drain(..consumed);
        }
        Ok(chunk_id_bytes)
    }

    /// Flushes everything that is still buffered and returns the reference to the stored data.
    ///
    /// Without a next layer, the whole buffer is submitted as a single chunk of depth 0.
    /// With one, the remaining buffer is chunked, the chunk IDs are passed to the next layer
    /// and that layer's reference is returned with its depth increased by one.
    pub fn finish(mut self) -> anyhow::Result<RecursiveChunkRef> {
        match self.next_layer.take() {
            None => {
                // No chunking happened at this level: emit the raw data as one chunk.
                let chunk_id = calculate_chunkid(&self.buffer);
                self.target.submit(chunk_id, &self.buffer)?;
                Ok(RecursiveChunkRef { chunk_id, depth: 0 })
            }
            Some(mut subchunker) => {
                // This is the last round of chunking, so the trailing chunk is emitted too.
                let new_chunks = self.do_chunking(true)?;
                subchunker.write(&new_chunks)?;
                let mut rcr = subchunker.finish()?;
                // One more level of indirection sits above the sub-chunker's data.
                rcr.depth += 1;
                Ok(rcr)
            }
        }
    }

    /// Appends `buf` to the buffer and, once the buffer is larger than the threshold, chunks
    /// it and forwards the resulting chunk IDs to the (lazily created) next layer.
    ///
    /// Always reports the whole of `buf` as written.
    fn write_impl(&mut self, buf: &[u8]) -> anyhow::Result<usize> {
        self.buffer.extend_from_slice(buf);
        if self.buffer.len() <= self.threshold {
            return Ok(buf.len());
        }

        if self.next_layer.is_none() {
            // First time over the threshold: start chunking.
            let child = RecursiveChunker::new(self.threshold, self.target);
            self.next_layer = Some(Box::new(child));
        }
        let new_chunks = self.do_chunking(false)?;
        if let Some(next_layer) = self.next_layer.as_mut() {
            next_layer.write(&new_chunks)?;
        }
        Ok(buf.len())
    }
}
impl<'cst, CST: ChunkSubmissionTarget> Write for RecursiveChunker<'cst, CST> {
    /// Feeds `buf` to the chunker; any chunking error is wrapped as an `io::Error` of kind
    /// `Other`.
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        self.write_impl(buf)
            .map_err(|err| io::Error::new(io::ErrorKind::Other, err))
    }

    /// Does nothing: buffered data is only flushed by `finish`.
    fn flush(&mut self) -> io::Result<()> {
        Ok(())
    }
}
/// Computes the ID of a chunk: the 256-bit BLAKE hash of its bytes.
///
/// # Panics
/// Panics if the BLAKE implementation reports an error.
#[inline]
pub fn calculate_chunkid(chunk: &[u8]) -> ChunkId {
    let mut digest: ChunkId = Default::default();
    blake::hash(256, chunk, &mut digest).expect("BLAKE problem");
    digest
}
/// A reader that reassembles the data behind a `RecursiveChunkRef` from a pile.
///
/// It reads chunk IDs from `sub_reader`, loads each referenced chunk from `pile` and serves
/// the chunk contents through its `Read` implementation.
pub struct RecursiveUnchunker<'pile, RP: RawPile> {
    /// Source of the chunk IDs to load: either a cursor over a single chunk ID or another
    /// unchunker one level down.
    sub_reader: Box<dyn Read + 'pile>,
    /// Bytes of loaded chunks that have not been handed to the reader yet.
    buffer: Vec<u8>,
    /// The pile that chunks are read from.
    pile: &'pile Pile<RP>,
}
impl<'pile, RP: RawPile> RecursiveUnchunker<'pile, RP> {
/// Create a new recursive unchunker.
/// This will automatically create sub-unchunkers as needed.
pub fn new(pile: &'pile Pile<RP>, reference: RecursiveChunkRef) -> Self {
if reference.depth == 0 {
// this unchunker only needs to unchunk the one chunk
RecursiveUnchunker {
sub_reader: Box::new(Cursor::new(reference.chunk_id.to_vec())),
buffer: vec![],
pile: &pile,
}
} else {
let next_ref = RecursiveChunkRef {
depth: reference.depth - 1,
..reference
};
let sub_unchunker = RecursiveUnchunker::new(pile, next_ref);
RecursiveUnchunker {
sub_reader: Box::new(sub_unchunker),
buffer: vec![],
pile: &pile,
}
}
}
}
impl<'pile, RP: RawPile> Read for RecursiveUnchunker<'pile, RP> {
    /// Copies as much buffered chunk data into `buf` as fits, loading further chunks first
    /// whenever the internal buffer is empty.
    ///
    /// Returns `Ok(0)` when the chunk ID source is exhausted. A partially readable chunk ID
    /// or a failing pile read is an error; a chunk absent from the pile is reported with
    /// kind `NotFound`.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        // Loop rather than branch: a loaded chunk may itself be empty.
        while self.buffer.is_empty() {
            let mut next_chunk_id: ChunkId = Default::default();
            match self.sub_reader.read(&mut next_chunk_id)? {
                // Zero bytes at an ID boundary means there are no more chunks.
                0 => return Ok(0),
                // Short read: the remainder of the ID must follow, EOF included as an error.
                got if got < next_chunk_id.len() => {
                    self.sub_reader.read_exact(&mut next_chunk_id[got..])?
                }
                _ => {}
            }

            let chunk = self
                .pile
                .read_chunk(&next_chunk_id)
                .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?
                .ok_or_else(|| {
                    io::Error::new(
                        io::ErrorKind::NotFound,
                        anyhow!("Chunk {} not found", bytes_to_hexstring(&next_chunk_id)),
                    )
                })?;
            self.buffer.extend_from_slice(&chunk);
        }

        let count = min(self.buffer.len(), buf.len());
        buf[..count].copy_from_slice(&self.buffer[..count]);
        self.buffer.drain(..count);
        Ok(count)
    }
}

View File

@ -1,184 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::fs::File;
use std::io;
use std::io::{Read, Write};
use std::path::Path;
use std::sync::Arc;
use anyhow::{anyhow, bail, Context};
use clap::crate_version;
use log::warn;
use crate::chunking::{RecursiveChunker, RecursiveUnchunker, SENSIBLE_THRESHOLD};
use crate::definitions::{PointerData, RecursiveChunkRef, RootTreeNode, TreeNode};
use crate::pile::compression::{CompressionSettings, RawPileCompressor};
use crate::pile::integrity::RawPileIntegrityChecker;
use crate::pile::local_sqlitebloblogs::SqliteBloblogPile;
use crate::pile::{Pile, PileDescriptor, PileStorage, RawPile};
use crate::tree::{integrate_node_in_place, merge_uid_or_gid_tables};
use crate::utils::get_number_of_workers;
/// Initialises a new yama pile in `dir` by writing its `yama.toml` descriptor.
///
/// The descriptor records the current crate version, SQLite-indexed bloblog storage and
/// compression level 12.
///
/// # Errors
/// Fails if `yama.toml` already exists in `dir`, or if it cannot be created or written.
pub fn init(dir: &Path) -> anyhow::Result<()> {
    let yama_toml = dir.join("yama.toml");
    if yama_toml.exists() {
        bail!("yama.toml already exists. Cannot create yama pile here.");
    }

    let desc = PileDescriptor {
        yama_version: crate_version!().to_owned(),
        storage: PileStorage::SqliteIndexedBloblog,
        compression: Some(12),
    };

    let mut file = File::create(yama_toml)?;
    let serialised = toml::to_vec(&desc)?;
    file.write_all(&serialised)?;
    Ok(())
}
/// Reads and parses the `yama.toml` pile descriptor found in `dir`.
///
/// # Errors
/// Fails if `yama.toml` does not exist in `dir`, cannot be read, or is not a valid descriptor.
pub fn load_pile_descriptor(dir: &Path) -> anyhow::Result<PileDescriptor> {
    let yama_toml = dir.join("yama.toml");
    if !yama_toml.exists() {
        bail!("yama.toml does not exist here. Is this an existing pile?");
    }

    let mut raw = Vec::new();
    File::open(yama_toml)?.read_to_end(&mut raw)?;
    let descriptor = toml::from_slice(&raw)?;
    Ok(descriptor)
}
/// Opens the pile stored in `dir` as described by `desc`.
///
/// For SQLite-indexed bloblog storage the raw pile is wrapped in an integrity checker and,
/// when the descriptor specifies a compression level, additionally in a compressor that
/// uses the dictionary read from `important_zstd.dict` in `dir`. The numbers of compressor
/// and decompressor workers come from `get_number_of_workers("YAMA_COMPRESSORS")` and
/// `get_number_of_workers("YAMA_DECOMPRESSORS")`.
///
/// # Errors
/// Fails for remote-only piles, if the bloblog pile cannot be opened, if the dictionary is
/// missing or unreadable, or if the compressor cannot be set up.
pub fn open_pile(dir: &Path, desc: &PileDescriptor) -> anyhow::Result<Pile<Box<dyn RawPile>>> {
    let num_compressors = get_number_of_workers("YAMA_COMPRESSORS");
    let num_decompressors = get_number_of_workers("YAMA_DECOMPRESSORS");

    match desc.storage {
        PileStorage::SqliteIndexedBloblog => {
            let blob_raw_pile = RawPileIntegrityChecker::new(SqliteBloblogPile::open(dir)?);

            let raw_pile: Box<dyn RawPile> = if let Some(comp_level) = desc.compression {
                let mut dictionary = Vec::new();
                File::open(dir.join("important_zstd.dict"))
                    .context("You need important_zstd.dict in your pile folder.")?
                    .read_to_end(&mut dictionary)?;

                let settings = CompressionSettings {
                    dictionary: Arc::new(dictionary),
                    level: comp_level as i32,
                    num_compressors: num_compressors as u32,
                    num_decompressors: num_decompressors as u32,
                };
                let (compressed_pile, _handles) = RawPileCompressor::new(blob_raw_pile, settings)?;
                Box::new(compressed_pile)
            } else {
                Box::new(blob_raw_pile)
            };

            Ok(Pile::new(raw_pile))
        }
        PileStorage::RemoteOnly => {
            bail!("This is a remote-only pile. No local storage allowed.");
        }
    }
}
/// Serialises `root_tree_node` (BARE encoding) and stores it in `pile` through a
/// `RecursiveChunker` with `SENSIBLE_THRESHOLD`, returning the reference to the stored data.
pub fn store_tree_node<RP: RawPile>(
    pile: &Pile<RP>,
    root_tree_node: &RootTreeNode,
) -> anyhow::Result<RecursiveChunkRef> {
    let serialised = serde_bare::to_vec(root_tree_node)?;
    let mut source: &[u8] = &serialised;

    let mut chunker = RecursiveChunker::new(SENSIBLE_THRESHOLD, pile);
    io::copy(&mut source, &mut chunker)?;
    chunker.finish()
}
/// Loads the data referenced by `chunk_ref` from `pile` into memory and deserialises it
/// (BARE encoding) into a `RootTreeNode`.
pub fn retrieve_tree_node<RP: RawPile>(
    pile: &Pile<RP>,
    chunk_ref: RecursiveChunkRef,
) -> anyhow::Result<RootTreeNode> {
    let mut unchunker = RecursiveUnchunker::new(pile, chunk_ref);
    let mut serialised = Vec::new();
    io::copy(&mut unchunker, &mut serialised)?;

    let root_tree_node = serde_bare::from_slice(&serialised)?;
    Ok(root_tree_node)
}
/// Fully integrates a pointer in place, so that it no longer depends on a parent pointer.
///
/// If `pointer` has a parent, the parent is loaded and (recursively) fully integrated
/// itself; `tree_node` is then integrated against the parent's tree and the parent's UID and
/// GID tables are merged into the pointer's, with a warning logged when a merge reports an
/// overlap. Finally the pointer's parent is cleared. A pointer without a parent is left
/// untouched.
///
/// # Errors
/// Fails if the parent pointer is missing or cannot be read, or if integration fails.
pub fn fully_integrate_pointer_node<RP: RawPile>(
    pile: &Pile<RP>,
    tree_node: &mut TreeNode,
    pointer: &mut PointerData,
) -> anyhow::Result<()> {
    let parent_name = match &pointer.parent_pointer {
        Some(name) => name,
        // Nothing to integrate.
        None => return Ok(()),
    };

    let mut parent = match pile.read_pointer(parent_name.as_str())? {
        Some(parent) => parent,
        None => return Err(anyhow!("Parent pointer {:?} not found.", parent_name)),
    };
    let mut parent_node = retrieve_tree_node(pile, parent.chunk_ref.clone())?.node;

    // The parent may have ancestors of its own; flatten those first.
    fully_integrate_pointer_node(pile, &mut parent_node, &mut parent)?;
    integrate_node_in_place(tree_node, &mut parent_node)?;

    // The UID and GID tables have to be carried over along with the tree.
    let uid_merge_clean = merge_uid_or_gid_tables(&mut pointer.uid_lookup, &parent.uid_lookup);
    if !uid_merge_clean {
        warn!(
            "Overlap when merging parent:{:?}'s UID table into child.",
            parent_name
        );
    }
    let gid_merge_clean = merge_uid_or_gid_tables(&mut pointer.gid_lookup, &parent.gid_lookup);
    if !gid_merge_clean {
        warn!(
            "Overlap when merging parent:{:?}'s GID table into child.",
            parent_name
        );
    }

    pointer.parent_pointer = None;
    Ok(())
}
/// Reads the pointer called `pointer_name`, loads its tree and fully integrates both.
///
/// The returned pointer data has no parent any more.
///
/// # Errors
/// Fails if the pointer does not exist, or if loading or integrating it fails.
pub fn fully_load_pointer<RP: RawPile>(
    pile: &Pile<RP>,
    pointer_name: &str,
) -> anyhow::Result<(PointerData, RootTreeNode)> {
    let mut pointer_data = match pile.read_pointer(pointer_name)? {
        Some(data) => data,
        None => return Err(anyhow!("Pointer {:?} not found.", pointer_name)),
    };

    let mut root_node = retrieve_tree_node(pile, pointer_data.chunk_ref.clone())?;
    fully_integrate_pointer_node(pile, &mut root_node.node, &mut pointer_data)?;

    Ok((pointer_data, root_node))
}

View File

@ -1,257 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use crate::commands::{fully_integrate_pointer_node, retrieve_tree_node, store_tree_node};
use crate::definitions::{FilesystemOwnership, FilesystemPermissions, TreeNode};
use crate::pile::{Pile, PileDescriptor, RawPile};
use crate::tree::integrate_node_in_place;
use anyhow::anyhow;
use clap::Parser;
use log::info;
use rustyline::error::ReadlineError;
use rustyline::Editor;
// Commands understood by the interactive debug prompt; each input line is parsed into one
// of these variants and then executed by `debug_command`.
// NOTE(review): with clap's `Parser` derive the `///` comments below presumably double as
// the help text shown to the user, so treat them as user-facing strings.
#[derive(Parser)]
pub enum DebugCommand {
    /// List the pointers that are stored in this yama pile.
    #[clap(name = "lsp")]
    ListPointers {
        /// List details about each pointer.
        #[clap(short)]
        verbose: bool,
    },
    /// Delete a pointer from the yama pile.
    #[clap(name = "rmp")]
    DeletePointer {
        /// Name of the pointer to delete.
        name: String,
    },
    /// Reads the information on a pointer.
    #[clap(name = "infop")]
    PointerInfo {
        /// Name of the pointer to read.
        name: String,
    },
    /// Reads statistics from the Pile
    #[clap(name = "stats")]
    Statistics {},
}
/// Runs the interactive debug prompt on `pile` until the user interrupts it (^C), sends
/// end-of-file (^D) or reading a line fails.
///
/// Each line is split on ASCII whitespace, parsed as a [`DebugCommand`] and executed; parse
/// and execution failures are printed and the prompt continues. Line history is loaded from
/// and saved to `.yama-history` on a best-effort basis: failing to load or save it never
/// aborts the session.
///
/// # Errors
/// Fails if flushing the pile fails once the prompt has ended.
pub fn debug_prompt<RP: RawPile>(pdesc: PileDescriptor, pile: Pile<RP>) -> anyhow::Result<()> {
    let mut rl = Editor::<()>::new();
    // A missing or unreadable history file just means we start without history.
    let _ = rl.load_history(".yama-history");

    loop {
        let readline = rl.readline("debug 山 ");
        match readline {
            Ok(line) => {
                rl.add_history_entry(line.as_str());
                // The parser expects the program name as the first argument; use a dummy.
                let mut args = vec![""];
                args.extend(line.split_ascii_whitespace());
                match DebugCommand::try_parse_from(args) {
                    Ok(command) => {
                        if let Err(e) = debug_command(&pdesc, &pile, command) {
                            eprintln!("Failed {:?}", e);
                        }
                    }
                    Err(e) => {
                        eprintln!("{}", e);
                    }
                }
            }
            Err(ReadlineError::Interrupted) => {
                eprintln!("^C");
                break;
            }
            Err(ReadlineError::Eof) => {
                eprintln!("^D");
                break;
            }
            Err(err) => {
                eprintln!("Error: {:?}", err);
                break;
            }
        }
    }

    let flush_result = pile.flush();
    // Saving the history is a convenience: report a failure instead of panicking, and do it
    // even if the flush failed so that the session's history is not lost.
    if let Err(err) = rl.save_history(".yama-history") {
        eprintln!("Could not save .yama-history: {:?}", err);
    }
    flush_result?;
    Ok(())
}
/// Executes a single debug command against `pile`.
///
/// * `ListPointers` prints every pointer name (with chunk ref and parent when verbose).
/// * `DeletePointer` first rewrites every pointer whose parent is the doomed pointer so that
///   it no longer has a parent, then deletes the pointer.
/// * `PointerInfo` prints the pointer's data and its tree to stderr.
/// * `Statistics` prints chunk count and size statistics if the raw pile provides them.
///
/// `_pdesc` is currently unused.
///
/// # Errors
/// Fails if the pile cannot be read or written, or if a named pointer does not exist.
pub fn debug_command<RP: RawPile>(
    _pdesc: &PileDescriptor,
    pile: &Pile<RP>,
    command: DebugCommand,
) -> anyhow::Result<()> {
    match &command {
        DebugCommand::ListPointers { verbose } => {
            for pointer in pile.list_pointers()?.iter() {
                if *verbose {
                    let pointer_data = pile.read_pointer(pointer.as_str())?;
                    match pointer_data {
                        None => {
                            // Listed, but its data could not be found.
                            println!("{} → ??? MISSING DATA", pointer);
                        }
                        Some(data) => {
                            println!(
                                "{} → {:?} [parent={:?}]",
                                pointer, data.chunk_ref, data.parent_pointer
                            );
                        }
                    }
                } else {
                    println!("{}", pointer);
                }
            }
        }
        DebugCommand::DeletePointer { name } => {
            // retrieve this pointer
            let mut this_pointer = pile.read_pointer(name.as_str())?.ok_or_else(|| {
                anyhow!("Pointer {:?} does not exist so can not be deleted.", name)
            })?;
            let mut this_node = retrieve_tree_node(&pile, this_pointer.chunk_ref.clone())?;
            // fully integrate the pointer
            // (so that `this_node` is a complete tree that children can be integrated against)
            fully_integrate_pointer_node(&pile, &mut this_node.node, &mut this_pointer)?;
            assert!(this_pointer.parent_pointer.is_none());
            // now integrate any pointers that rely on this one
            // so that they no longer rely on this one.
            for pointer in pile.list_pointers()?.iter() {
                if pointer == name {
                    continue;
                }
                if let Some(mut pointer_data) = pile.read_pointer(pointer.as_str())? {
                    if let Some(parent_pointer) = pointer_data.parent_pointer.as_ref() {
                        if parent_pointer == name {
                            info!("Pointer is now an orphan: {:?}", pointer);
                            // need to integrate this node, so retrieve it
                            let mut node = retrieve_tree_node(&pile, pointer_data.chunk_ref)?;
                            // integrate it in-place
                            integrate_node_in_place(&mut node.node, &this_node.node)?;
                            // NOTE(review): unlike `fully_integrate_pointer_node`, the deleted
                            // pointer's UID/GID tables are not merged into the orphan here —
                            // confirm whether that is intended.
                            // mark it as orphaned (no parent)
                            pointer_data.parent_pointer = None;
                            // store the orphaned node
                            let new_chunk_ref = store_tree_node(&pile, &node)?;
                            // associate the orphaned node with the orphaned pointer
                            pointer_data.chunk_ref = new_chunk_ref;
                            // write the pointer back.
                            pile.write_pointer(pointer.as_str(), &pointer_data)?;
                        }
                    }
                }
            }
            // then delete the pointer
            pile.delete_pointer(name)?;
            info!("Deleted pointer: {:?}", name);
        }
        DebugCommand::PointerInfo { name } => {
            let this_pointer = pile
                .read_pointer(name.as_str())?
                .ok_or_else(|| anyhow!("Pointer {:?} does not exist.", name))?;
            // The tree is printed as stored, i.e. without integrating any parent.
            let this_node = retrieve_tree_node(&pile, this_pointer.chunk_ref.clone())?;
            eprintln!(" --- Pointer data --- ");
            eprintln!("{:#?}", this_pointer);
            eprintln!(" --- Tree node --- ");
            //eprintln!("{:#?}", this_node.node);
            tree_node_printer(&this_node.name, &this_node.node, 0);
        }
        DebugCommand::Statistics { .. } => {
            if let Some(stats) = pile.raw_pile.debug_statistics()? {
                println!("Statistics for this pile");
                println!(" chunk count: {} chunks", stats.number_of_chunks);
                println!(
                    " total chunk stored space: {} bytes (may exclude deleted chunks)",
                    stats.total_chunk_size
                );
                // Computed in floating point, so a pile with zero chunks does not trap: a NaN
                // quotient is converted to 0 by the `as u64` cast.
                let average_size =
                    ((stats.total_chunk_size as f64) / (stats.number_of_chunks as f64)) as u64;
                println!(" (average chunk size: {} bytes)", average_size);
            } else {
                eprintln!("{:?}", pile);
                eprintln!("Statistics appear not to be supported on this kind of pile?");
            }
        }
    }
    Ok(())
}
pub fn compact_ownership(ownership: &FilesystemOwnership) -> String {
format!("uid={}, gid={}", ownership.uid, ownership.gid)
}
/// Renders a permission mode in octal, padded to a width of at least four characters.
pub fn compact_permissions(perms: &FilesystemPermissions) -> String {
    let mode = perms.mode;
    format!("{:4o}", mode)
}
/// Prints `node` (called `name`) and, for directories, all of its descendants to stderr,
/// one entry per line, indented by one space per level of `depth`.
pub fn tree_node_printer(name: &str, node: &TreeNode, depth: usize) {
    let indent = " ".repeat(depth);
    match node {
        TreeNode::Deleted => {
            eprintln!("{}{} DELETED", indent, name);
        }
        TreeNode::SymbolicLink { ownership, target } => {
            eprintln!(
                "{}{} → {} ({})",
                indent,
                name,
                target,
                compact_ownership(ownership)
            );
        }
        TreeNode::NormalFile {
            mtime,
            ownership,
            permissions,
            content,
        } => {
            eprintln!(
                "{}{} = {:?} ({}, {}, mtime={})",
                indent,
                name,
                content,
                compact_ownership(ownership),
                compact_permissions(permissions),
                mtime
            );
        }
        TreeNode::Directory {
            ownership,
            permissions,
            children,
        } => {
            eprintln!(
                "{}{}/ ({}, {})",
                indent,
                name,
                compact_ownership(ownership),
                compact_permissions(permissions)
            );
            // Children go one level deeper.
            for (child_name, child) in children {
                tree_node_printer(child_name, child, depth + 1);
            }
        }
    }
}

20
yama/src/debugging.rs Normal file
View File

@ -0,0 +1,20 @@
use tokio::signal::unix::SignalKind;
use tracing::warn;
/// Registers a signal handler on SIGUSR1 that dumps a backtrace of the tokio task tree.
///
/// May be useful for debugging deadlocks etc.
///
/// The handler runs as a spawned tokio task, so this must be called from within a tokio
/// runtime. If the signal listener cannot be registered, a warning is logged and no handler
/// is installed.
pub fn register_sigusr1_backtrace_helper() {
    tokio::spawn(async {
        // Register the listener once, up front. Re-creating it for every loop iteration
        // would leave a gap between two listeners in which a signal could go unnoticed.
        let mut sigusr1 = match tokio::signal::unix::signal(SignalKind::user_defined1()) {
            Ok(listener) => listener,
            Err(err) => {
                warn!("could not register SIGUSR1 backtrace helper: {err:?}");
                return;
            }
        };
        while let Some(()) = sigusr1.recv().await {
            warn!(
                "SIGUSR1 received; debug task backtrace:\n{}",
                async_backtrace::taskdump_tree(false)
            );
        }
    });
}

View File

@ -1,326 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::collections::BTreeMap;
use std::fmt::{Debug, Formatter};
use anyhow::bail;
use serde::{Deserialize, Serialize};
use crate::utils::bytes_to_hexstring;
/// Identifier of a chunk: 32 raw bytes.
pub type ChunkId = [u8; 32];
/// A 64-bit hash value (presumably an XXH64 digest, going by the name — TODO confirm).
pub type XXHash = u64;
/// Presumably the seed used wherever an `XXHash` is computed — TODO confirm at the use sites.
pub const XXH64_SEED: u64 = 424242;
/// The data stored for a named pointer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PointerData {
    /// Reference to the chunked, serialised tree that this pointer refers to.
    pub chunk_ref: RecursiveChunkRef,
    /// Name of the pointer that this one builds upon, if any.
    pub parent_pointer: Option<String>,
    /// Lookup table keyed by numeric UID (presumably mapping to user names — TODO confirm).
    pub uid_lookup: BTreeMap<u16, Option<String>>,
    /// Lookup table keyed by numeric GID (presumably mapping to group names — TODO confirm).
    pub gid_lookup: BTreeMap<u16, Option<String>>,
}
/// A reference to data stored as a (possibly nested) tree of chunks.
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct RecursiveChunkRef {
    /// The root Chunk ID.
    pub chunk_id: ChunkId,
    /// The depth of the data bytes.
    /// 0 means that the chunk addressed by `chunk_id` contains data bytes.
    /// 1 means that the chunk addressed by `chunk_id` contains references to chunks that contain
    /// data bytes.
    /// (and so on)
    pub depth: u32,
}
/// Compact debug form: the chunk ID as a hex string followed by the depth in angle brackets.
impl Debug for RecursiveChunkRef {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let hex_id = bytes_to_hexstring(&self.chunk_id);
        f.write_fmt(format_args!("{}<{}>", hex_id, self.depth))
    }
}
/// The root of a stored tree: a tree node together with its own name.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RootTreeNode {
    /// Name of the root node (child nodes are named by their key in the parent directory).
    pub name: String,
    /// The root node itself.
    pub node: TreeNode,
}
/// One entry of a stored filesystem tree.
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub enum TreeNode {
    /// A regular file.
    NormalFile {
        /// modification time in ms
        mtime: u64,
        /// Owner of the file.
        ownership: FilesystemOwnership,
        /// Permissions of the file.
        permissions: FilesystemPermissions,
        // TODO size: u64 or not
        // can perhaps cache chunk-wise (but not sure.)
        /// Reference to the chunks holding the file's content.
        content: RecursiveChunkRef,
    },
    /// A directory.
    Directory {
        /// Owner of the directory.
        ownership: FilesystemOwnership,
        /// Permissions of the directory.
        permissions: FilesystemPermissions,
        /// The directory's entries, keyed by name.
        children: BTreeMap<String, TreeNode>,
    },
    /// A symbolic link.
    SymbolicLink {
        /// Owner of the link.
        ownership: FilesystemOwnership,
        /// The link's target.
        target: String,
    },
    // TODO is there any other kind of file we need to store?
    /// A deletion marker (presumably for an entry that exists in a parent pointer but has
    /// since been removed — TODO confirm).
    Deleted,
}
impl TreeNode {
pub fn metadata_invalidates(&self, other: &TreeNode) -> anyhow::Result<bool> {
Ok(match self {
TreeNode::NormalFile {
mtime,
ownership,
permissions,
..
} => {
if let TreeNode::NormalFile {
mtime: other_mtime,
ownership: other_ownership,
permissions: other_permissions,
..
} = other
{
mtime != other_mtime
|| ownership != other_ownership
|| permissions != other_permissions
} else {
true
}
}
TreeNode::Directory {
ownership,
permissions,
children,
} => {
if let TreeNode::Directory {
ownership: other_ownership,
permissions: other_permissions,
children: other_children,
} = other
{
if ownership != other_ownership || permissions != other_permissions {
return Ok(true);
}
if children.len() != other_children.len() {
return Ok(true);
}
for ((left_name, left_node), (right_name, right_node)) in
children.iter().zip(other_children.iter())
{
if left_name != right_name || left_node.metadata_invalidates(right_node)? {
return Ok(true);
}
}
false
} else {
true
}
}
TreeNode::SymbolicLink { ownership, target } => {
if let TreeNode::SymbolicLink {
ownership: other_ownership,
target: other_target,
} = other
{
ownership != other_ownership || target != other_target
} else {
true
}
}
TreeNode::Deleted => {
// unreachable
bail!("Why is Deleted here?");
}
})
}
//
///// Guarantees consistent visit order.
// pub fn visit_mut<F>(
// &mut self,
// visitor: &mut F,
// path_prefix: &str,
// skip_components: u32,
// ) -> anyhow::Result<()>
// where
// F: FnMut(&mut Self, &str) -> anyhow::Result<()>,
// {
// let mut my_path_buf = String::new();
// my_path_buf.push_str(path_prefix);
//
// if skip_components == 0 {
// if !my_path_buf.is_empty() {
// my_path_buf.push('/');
// }
// my_path_buf.push_str(&self.name);
// }
//
// visitor(self, &my_path_buf)?;
//
// if let TreeNode::Directory { children, .. } = &mut self.content {
// for child in children.iter_mut() {
// let new_skip = if skip_components > 0 {
// skip_components - 1
// } else {
// 0
// };
// child.visit_mut(visitor, &my_path_buf, new_skip)?;
// }
// }
// Ok(())
// }
pub fn count_normal_files(&self) -> u32 {
match self {
TreeNode::NormalFile { .. } => 1,
TreeNode::Directory { children, .. } => children
.values()
.map(|child| child.count_normal_files())
.sum(),
_ => 0,
}
}
pub fn visit<F: FnMut(&TreeNode, &str) -> anyhow::Result<()>>(
&self,
func: &mut F,
prefix: String,
) -> anyhow::Result<()> {
func(self, &prefix)?;
if let TreeNode::Directory { children, .. } = &self {
for (name, child) in children.iter() {
if prefix.is_empty() {
// don't want a slash prefix
child.visit(func, name.clone())?;
} else {
child.visit(func, format!("{}/{}", prefix, name))?;
}
}
}
Ok(())
}
pub fn visit_mut<F: FnMut(&mut TreeNode, &str) -> anyhow::Result<()>>(
&mut self,
func: &mut F,
prefix: String,
) -> anyhow::Result<()> {
func(self, &prefix)?;
if let TreeNode::Directory { children, .. } = self {
for (name, child) in children.iter_mut() {
if prefix.is_empty() {
// don't want a slash prefix
child.visit_mut(func, name.clone())?;
} else {
child.visit_mut(func, format!("{}/{}", prefix, name))?;
}
}
}
Ok(())
}
pub fn exists(&self, pieces: &[&str]) -> bool {
match pieces.first() {
None => true,
Some(subpath) => {
if let TreeNode::Directory { children, .. } = self {
match children.get(*subpath) {
None => false,
Some(child) => child.exists(&pieces[1..]),
}
} else {
false
}
}
}
}
}
/// Owner of a filesystem entry, as a numeric user ID and group ID.
#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct FilesystemOwnership {
    /// Numeric user ID.
    pub uid: u16,
    /// Numeric group ID.
    pub gid: u16,
}
/// Permissions of a filesystem entry.
#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct FilesystemPermissions {
    /// The permission mode bits (written in octal elsewhere in this file, e.g. `0o760`).
    pub mode: u32,
}
#[cfg(test)]
pub mod tests {
    use crate::definitions::{
        FilesystemOwnership, FilesystemPermissions, RecursiveChunkRef, TreeNode,
    };
    use std::collections::BTreeMap;

    /// Builds a sample normal file node with fixed metadata and an all-default chunk ID.
    pub fn example_file() -> TreeNode {
        let ownership = FilesystemOwnership {
            uid: 1042,
            gid: 1043,
        };
        let permissions = FilesystemPermissions { mode: 0o760 };
        let content = RecursiveChunkRef {
            chunk_id: Default::default(),
            depth: 0,
        };
        TreeNode::NormalFile {
            mtime: 424242,
            ownership,
            permissions,
            content,
        }
    }

    /// Builds a sample directory node containing up to two named children.
    pub fn example_dir(
        file1: Option<(&str, TreeNode)>,
        file2: Option<(&str, TreeNode)>,
    ) -> TreeNode {
        // `file1` is inserted before `file2`, so `file2` wins if both use the same name.
        let children: BTreeMap<String, TreeNode> = file1
            .into_iter()
            .chain(file2)
            .map(|(name, node)| (name.to_owned(), node))
            .collect();
        TreeNode::Directory {
            ownership: FilesystemOwnership {
                uid: 1042,
                gid: 1043,
            },
            permissions: FilesystemPermissions { mode: 0o770 },
            children,
        }
    }

    #[test]
    pub fn test_exists() {
        // A file only "contains" the empty path.
        let file = example_file();
        assert!(file.exists(&[]));
        assert!(!file.exists(&["anything"]));

        let subdir = example_dir(Some(("fetchmailrc", example_file())), None);
        let dir = example_dir(Some(("boot.img", example_file())), Some(("etc", subdir)));

        // Paths that exist.
        assert!(dir.exists(&[]));
        assert!(dir.exists(&["boot.img"]));
        assert!(dir.exists(&["etc", "fetchmailrc"]));

        // Paths that do not: unknown names, and paths that go through a file.
        assert!(!dir.exists(&["bin"]));
        assert!(!dir.exists(&["etc", "resolv.conf"]));
        assert!(!dir.exists(&["boot.img", "hehe"]));
    }
}

483
yama/src/extract.rs Normal file
View File

@ -0,0 +1,483 @@
use crate::pile_with_cache::PileWithCache;
use crate::retriever::decompressor::PipelineDecompressor;
use crate::retriever::{create_fixed_retriever, FileId, JobChunkReq, JobId, RetrieverResp};
use crate::PROGRESS_BAR_STYLE;
use eyre::{bail, ensure, eyre, Context, ContextCompat};
use flume::Receiver;
use indicatif::ProgressStyle;
use patricia_tree::PatriciaMap;
use std::cmp::Reverse;
use std::collections::{BTreeMap, BTreeSet};
use std::fs::Permissions;
use std::io::Write;
use std::os::unix::fs::PermissionsExt;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tokio::fs::OpenOptions;
use tokio::io::AsyncWriteExt;
use tokio::task::JoinSet;
use tracing::{info_span, Instrument, Span};
use tracing_indicatif::span_ext::IndicatifSpanExt;
use yama_midlevel_crypto::chunk_id::ChunkId;
use yama_pile::definitions::{BloblogId, RecursiveChunkRef};
use yama_pile::tree::unpopulated::ScanEntry;
use yama_pile::tree::{FilesystemPermissions, TreeNode};
use yama_wormfile::boxed::BoxedWormFileProvider;
/// A tree flattened into two path-keyed maps, as produced by `flatten_treenode`.
#[derive(Clone, Debug, Default)]
pub struct FlattenedTree {
/// Regular files: path -> (file metadata, reference to the file's content chunks).
pub files: PatriciaMap<(ScanEntry, RecursiveChunkRef)>,
/// Everything that is not a regular file (directories and symbolic links): path -> metadata.
pub nonfiles: PatriciaMap<ScanEntry>,
}
/// Walks `root_node` and sorts every visited entry into a [`FlattenedTree`]:
/// regular files (with their content reference) go into `files`, directories
/// and symbolic links go into `nonfiles`.
///
/// # Errors
/// Fails if a `TreeNode::Deleted` marker is encountered, or if the visit itself fails.
pub fn flatten_treenode(root_node: &TreeNode) -> eyre::Result<FlattenedTree> {
    let mut flattened = FlattenedTree::default();
    root_node.visit(
        &mut |node, path| {
            // Regular files are stored straight away; every other kind yields a
            // metadata-only entry that is stored after the match.
            let nonfile_entry = match node {
                TreeNode::NormalFile {
                    mtime,
                    ownership,
                    permissions,
                    size,
                    content,
                } => {
                    let file_entry = ScanEntry::NormalFile {
                        mtime: *mtime,
                        ownership: *ownership,
                        permissions: *permissions,
                        size: *size,
                    };
                    flattened.files.insert(path, (file_entry, *content));
                    return Ok(());
                }
                TreeNode::Directory {
                    ownership,
                    permissions,
                    children: _,
                } => ScanEntry::Directory {
                    ownership: *ownership,
                    permissions: *permissions,
                },
                TreeNode::SymbolicLink { ownership, target } => ScanEntry::SymbolicLink {
                    ownership: *ownership,
                    target: target.clone(),
                },
                TreeNode::Deleted => {
                    bail!("unexpected TreeNode::Deleted in flatten_treenode");
                }
            };
            flattened.nonfiles.insert(path, nonfile_entry);
            Ok(())
        },
        String::new(),
    )?;
    Ok(flattened)
}
/// Create directories and symbolic links.
///
/// Every entry of `nonfiles` is materialised beneath `root`, in the map's
/// iteration order. Directories optionally receive their recorded mode when
/// `restore_permissions` is set.
///
/// # Errors
/// - `restore_ownership` is requested (not supported yet);
/// - a key of the map is not valid UTF-8;
/// - the map contains a `NormalFile` entry;
/// - any filesystem operation fails (the error names the affected path).
pub async fn unpack_nonfiles(
    root: &Path,
    nonfiles: &PatriciaMap<ScanEntry>,
    restore_ownership: bool,
    restore_permissions: bool,
) -> eyre::Result<()> {
    if restore_ownership {
        bail!("restoring ownership is not yet supported...");
    }
    for (rel_path, scan_entry) in nonfiles.iter() {
        let path = root
            .join(String::from_utf8(rel_path).context("nonfiles map contains non-string keys?")?);
        match scan_entry {
            ScanEntry::NormalFile { .. } => {
                bail!("found NormalFile in unpack_nonfiles()");
            }
            ScanEntry::Directory {
                ownership: _,
                permissions,
            } => {
                tokio::fs::create_dir(&path)
                    .await
                    .with_context(|| format!("can't create directory {path:?}"))?;
                if restore_permissions {
                    // NOTE(review): the mode is applied as soon as the directory is
                    // created; if it lacks owner write permission, creating entries
                    // inside it later may fail — confirm this is acceptable.
                    tokio::fs::set_permissions(&path, Permissions::from_mode(permissions.mode))
                        .await
                        .with_context(|| format!("can't set permissions on directory {path:?}"))?;
                }
            }
            ScanEntry::SymbolicLink {
                ownership: _,
                target,
            } => {
                tokio::fs::symlink(target, &path)
                    .await
                    .with_context(|| format!("can't create symbolic link {path:?}"))?;
            }
        }
    }
    Ok(())
}
/// Extracts every regular file in `files` beneath `root`.
///
/// The content reference of each file is first expanded into a list of chunk
/// IDs, then a retriever streams the blobs back. The first blob of a job
/// (`subjob == 0`) spawns a `file_unpacker_writer` task for that file; blobs are
/// forwarded to the task over a bounded channel and `None` is sent once the
/// retriever reports `JobComplete`.
///
/// # Errors
/// Fails if `restore_ownership` is requested (unsupported), if chunk lookup or
/// retrieval fails, if a writer task fails, or if jobs or open files are left
/// over once the retriever has finished.
///
/// NOTE(review): a job that completes without ever delivering a blob has no
/// entry in `open_files`, so `JobComplete` would fail with "bad job to finish
/// file" — confirm that such jobs cannot occur.
// TODO(perf): move out file writes into separate tasks...
pub async fn unpack_files(
pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
root: &Path,
files: &PatriciaMap<(ScanEntry, RecursiveChunkRef)>,
restore_ownership: bool,
restore_permissions: bool,
) -> eyre::Result<()> {
if restore_ownership {
bail!("restoring ownership is not yet supported...");
}
// Resolve each file's recursive chunk reference into its flat list of chunk IDs.
let expanded_chunkrefs = expand_chunkrefs(
pwc,
files
.iter()
.map(|(path_bytes, (scan_entry, rcr))| ((path_bytes, scan_entry), *rcr)),
)
.await?;
// The progress bar counts chunks, not files.
let total_chunks = expanded_chunkrefs
.iter()
.map(|(_, cs)| cs.len() as u64)
.sum::<u64>();
let unpack_span = info_span!("unpack_files");
async move {
let unpack_span = Span::current();
unpack_span.pb_set_style(&ProgressStyle::default_bar().template(
PROGRESS_BAR_STYLE,
).unwrap());
unpack_span.pb_set_message("unpack");
unpack_span.pb_set_length(total_chunks);
let mut join_set = JoinSet::new();
// `jobs` maps each job ID to the (path, metadata) pair it belongs to.
let (file_part_retriever, mut jobs) =
lookup_chunkrefs_and_create_retriever(pwc, expanded_chunkrefs).await?;
// Channels to the writer tasks of files that are currently being written.
let mut open_files = BTreeMap::new();
loop {
tokio::select! {
Ok(next_part) = file_part_retriever.recv_async() => {
match next_part {
RetrieverResp::Blob { job, subjob, blob } => {
if subjob == 0 {
// First blob of this job: create the output file's writer task.
// eprintln!("subjob 0 for job {job:?}");
let (path_bytes, scan_entry) = jobs
.remove(&job)
.with_context(|| format!("bad job {job:?} to extract"))?;
let (permissions, _ownership) = if let ScanEntry::NormalFile {
permissions,
ownership,
..
} = scan_entry
{
(permissions, ownership)
} else {
bail!("not a Normal File in unpack_files()");
};
let path = root.join(String::from_utf8(path_bytes).context("bad utf-8 in PM")?);
let (tx, rx) = flume::bounded(16);
join_set.spawn(file_unpacker_writer(path, *permissions, restore_permissions, rx));
open_files.insert(job, tx);
}
open_files
.get_mut(&job)
.context("bad job to write file")?
.send_async(Some(blob))
.await
.map_err(|_| eyre!("file tx shutdown"))?;
unpack_span.pb_inc(1);
}
RetrieverResp::JobComplete(job) => {
// `None` tells the writer task that the file is complete.
open_files
.remove(&job)
.context("bad job to finish file")?
.send_async(None)
.await
.map_err(|_| eyre!("file tx shutdown"))?;
}
}
},
Some(join_result) = join_set.join_next() => {
// Surface both task-level failures (a) and errors returned by the writer (b).
join_result
.context("failed file unpacker writer (a)")?
.context("failed file unpacker writer (b)")?;
},
// Reached once neither branch above can match any more.
else => {
break;
}
}
}
// we should have already drained the join set, but check...
assert!(join_set.join_next().await.is_none());
if !open_files.is_empty() || !jobs.is_empty() {
bail!("There were errors extracting.");
}
Ok(())
}.instrument(unpack_span).await
}
pub async fn unpack_sync_stream(
pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
chunkref: RecursiveChunkRef,
mut stream: impl Write,
) -> eyre::Result<()> {
let expanded_chunkrefs = expand_chunkrefs(pwc, vec![((), chunkref)].into_iter()).await?;
let total_chunks = expanded_chunkrefs
.iter()
.map(|(_, cs)| cs.len() as u64)
.sum::<u64>();
let unpack_span = info_span!("unpack_files");
async move {
let unpack_span = Span::current();
unpack_span.pb_set_style(
&ProgressStyle::default_bar()
.template(PROGRESS_BAR_STYLE)
.unwrap(),
);
unpack_span.pb_set_message("unpack");
unpack_span.pb_set_length(total_chunks);
let (file_part_retriever, _) =
lookup_chunkrefs_and_create_retriever(pwc, expanded_chunkrefs).await?;
let mut done = false;
while let Ok(next_part) = file_part_retriever.recv_async().await {
match next_part {
RetrieverResp::Blob { blob, .. } => {
tokio::task::block_in_place(|| stream.write_all(&blob))
.context("Failed to write to output stream on Blob")?;
unpack_span.pb_inc(1);
}
RetrieverResp::JobComplete(_) => {
tokio::task::block_in_place(|| stream.flush())
.context("Failed to flush output stream on JobComplete")?;
done = true;
}
}
}
if !done {
bail!("There were errors extracting.");
}
Ok(())
}
.instrument(unpack_span)
.await
}
/// Writer task for a single extracted file.
///
/// Creates `path` (which must not exist yet) and appends every block received
/// on `rx` until `None` arrives, at which point the file is flushed and the
/// task finishes successfully.
///
/// When `restore_permissions` is set, the recorded mode is passed to the
/// `open` call.
/// NOTE(review): a mode passed at creation time is presumably still filtered
/// by the process umask — confirm whether an explicit `set_permissions` is
/// wanted, as is done for directories.
///
/// # Errors
/// Fails if the file cannot be created, written or flushed (the error names
/// the file), or if the sending side disconnects before sending `None`.
async fn file_unpacker_writer(
    path: PathBuf,
    permissions: FilesystemPermissions,
    restore_permissions: bool,
    rx: Receiver<Option<Vec<u8>>>,
) -> eyre::Result<()> {
    let mut oo = OpenOptions::new();
    // `create_new` refuses to overwrite an existing file.
    oo.write(true).create_new(true);
    if restore_permissions {
        oo.mode(permissions.mode);
    }
    let mut file = oo
        .open(&path)
        .await
        .with_context(|| format!("can't create {path:?}"))?;
    loop {
        match rx.recv_async().await {
            Ok(Some(next_block)) => {
                file.write_all(&next_block)
                    .await
                    .with_context(|| format!("failed to write to {path:?}"))?;
            }
            Ok(None) => {
                file.flush()
                    .await
                    .with_context(|| format!("failed to flush {path:?}"))?;
                return Ok(());
            }
            Err(_) => {
                bail!("rx for file unpacking into {path:?} disconnected unexpectedly");
            }
        }
    }
}
/// Resolves recursive chunk references into flat lists of chunk IDs.
///
/// Entries are grouped by depth. The deepest group is expanded one layer at a
/// time and merged into the next shallower group until everything sits at
/// depth 0, which is then returned. Each `T` travels with its chunk list.
/// An empty input yields an empty vector.
pub(crate) async fn expand_chunkrefs<T>(
    pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
    chunkrefs: impl Iterator<Item = (T, RecursiveChunkRef)>,
) -> eyre::Result<Vec<(T, Vec<ChunkId>)>> {
    // Keyed by `Reverse(depth)` so that the first key is always the deepest layer.
    let mut pending = BTreeMap::<Reverse<u32>, Vec<(T, Vec<ChunkId>)>>::new();
    for (tag, chunkref) in chunkrefs {
        pending
            .entry(Reverse(chunkref.depth))
            .or_default()
            .push((tag, vec![chunkref.chunk_id]));
    }
    loop {
        let next_depth = match pending.keys().next() {
            Some(Reverse(depth)) => *depth,
            None => return Ok(Vec::new()),
        };
        let layer = pending.remove(&Reverse(next_depth)).unwrap();
        if next_depth == 0 {
            return Ok(layer);
        }
        let chunks_in_layer: u64 = layer.iter().map(|(_, ids)| ids.len() as u64).sum();
        let layer_span = info_span!("expand_chunkrefs");
        layer_span.pb_set_style(
            &ProgressStyle::default_bar()
                .template(PROGRESS_BAR_STYLE)
                .unwrap(),
        );
        layer_span.pb_set_length(chunks_in_layer);
        layer_span.pb_set_message(&format!("resolve (d={next_depth})"));
        let expanded_layer = expand_chunkrefs_one_layer(pwc, layer)
            .instrument(layer_span)
            .await?;
        pending
            .entry(Reverse(next_depth - 1))
            .or_default()
            .extend(expanded_layer);
    }
}
/// Locates every requested chunk in the local cache and starts a retriever
/// (followed by a decompressor stage) that streams the blobs back.
///
/// Returns the response receiver together with a map from job ID to the `T`
/// that was supplied alongside that job's chunk list. Jobs are numbered from
/// zero in input order.
///
/// # Errors
/// Fails with "chunks are missing" if the cache cannot locate every chunk, or
/// if the cache, retriever or decompressor cannot be set up.
async fn lookup_chunkrefs_and_create_retriever<T>(
    pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
    input: Vec<(T, Vec<ChunkId>)>,
) -> eyre::Result<(Receiver<RetrieverResp>, BTreeMap<JobId, T>)> {
    let wanted: BTreeSet<ChunkId> = input
        .iter()
        .flat_map(|(_, chunk_ids)| chunk_ids.iter().copied())
        .collect();
    let located = pwc
        .localcache
        .read()
        .await?
        .locate_chunks(&wanted)
        .await?;
    ensure!(wanted.len() == located.len(), "chunks are missing");

    // Give each distinct bloblog a dense file ID, and keep the mapping both ways.
    let bloblog_ids: BTreeSet<BloblogId> = located.values().map(|(bloblog, _)| *bloblog).collect();
    let num_bloblogs = bloblog_ids.len();
    let file_ids = (0..num_bloblogs as u32).map(FileId);
    let mut bloblog_to_file_ids = BTreeMap::<BloblogId, FileId>::new();
    let mut files = BTreeMap::<FileId, BloblogId>::new();
    for (bloblog_id, file_id) in bloblog_ids.into_iter().zip(file_ids) {
        bloblog_to_file_ids.insert(bloblog_id, file_id);
        files.insert(file_id, bloblog_id);
    }

    let mut next_job_id = JobId(0);
    let mut out_by_job = BTreeMap::<JobId, T>::new();
    let mut jobs = BTreeMap::<JobId, Vec<JobChunkReq>>::new();
    for (t, chunks) in input {
        let job_id = next_job_id;
        next_job_id.0 += 1;
        let requests: Vec<JobChunkReq> = chunks
            .into_iter()
            .map(|chunk_id| {
                let (bloblog_id, blob_locator) = &located[&chunk_id];
                JobChunkReq {
                    file: bloblog_to_file_ids[bloblog_id],
                    offset: blob_locator.offset,
                    length: blob_locator.length,
                }
            })
            .collect();
        out_by_job.insert(job_id, t);
        jobs.insert(job_id, requests);
    }

    let raw_retriever = create_fixed_retriever(pwc.clone(), jobs, files, 8)?;
    let decompressed =
        PipelineDecompressor::start(pwc.pile.pile_config.zstd_dict.clone(), 2, raw_retriever)?;
    Ok((decompressed, out_by_job))
}
/// Expands one layer of indirection: the blobs behind each input chunk list
/// are retrieved, concatenated per job, and reinterpreted as a sequence of
/// 32-byte chunk IDs.
///
/// # Errors
/// Fails if retrieval fails, if a response names an unknown job, if not every
/// job completed, or if a job's data is not a whole number of 32-byte IDs.
pub(crate) async fn expand_chunkrefs_one_layer<T>(
    pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
    input: Vec<(T, Vec<ChunkId>)>,
) -> eyre::Result<Vec<(T, Vec<ChunkId>)>> {
    let (retriever, jobs_to_ts) = lookup_chunkrefs_and_create_retriever(pwc, input).await?;

    // Per-job accumulation buffer for the raw bytes of the next layer.
    let mut buffers = BTreeMap::<JobId, (T, Vec<u8>)>::new();
    for (job_id, t) in jobs_to_ts {
        buffers.insert(job_id, (t, Vec::new()));
    }
    let mut num_jobs_left = buffers.len();

    while let Ok(result) = retriever.recv_async().await {
        match result {
            RetrieverResp::JobComplete(_) => {
                num_jobs_left -= 1;
            }
            RetrieverResp::Blob { job, blob, .. } => {
                let (_, bytes) = buffers.get_mut(&job).context("bad job gm")?;
                bytes.extend_from_slice(&blob);
                Span::current().pb_inc(1);
            }
        }
    }
    ensure!(num_jobs_left == 0, "jobs left over, recovery not complete");

    let mut expanded = Vec::with_capacity(buffers.len());
    for (t, bytes) in buffers.into_values() {
        let mut chunk_ids = Vec::with_capacity(bytes.len() / 32);
        for raw_id in bytes.chunks(32) {
            // Only the final piece can be short; that means the data was truncated.
            if raw_id.len() != 32 {
                bail!("wrong number of bytes for chunk refs");
            }
            let mut b32 = [0u8; 32];
            b32.copy_from_slice(raw_id);
            chunk_ids.push(ChunkId::from(b32));
        }
        expanded.push((t, chunk_ids));
    }
    Ok(expanded)
}

90
yama/src/init.rs Normal file
View File

@ -0,0 +1,90 @@
use eyre::{bail, Context, ContextCompat};
use std::path::Path;
use tokio::io::AsyncWriteExt;
use yama_midlevel_crypto::byte_layer::{ByteLayer, CborSerde};
use yama_midlevel_crypto::key_derivation::KeyDerivationParameters;
use yama_midlevel_crypto::sym_box::SymBox;
use yama_pile::definitions::{PackedKeyring, PackedPileConfig, UnlockedOrLockedKeyring};
use yama_pile::keyring::Keyring;
use yama_pile::{DIR_BLOBLOGS, DIR_INDICES, DIR_LOCKS, FILE_MASTER_KEYRING, FILE_YAMA_CONFIG};
use yama_wormfile::paths::WormPath;
use yama_wormfile::{WormFileProvider, WormFileWriter};
/// Perform checks before we init a pile in the given directory.
///
/// The target may be missing, but if it exists it must be a directory, and it
/// must not already contain any file or directory that a pile or connector uses.
pub async fn pre_init_check(path: &Path) -> eyre::Result<()> {
    if path.exists() && !path.is_dir() {
        bail!("{path:?} is not a directory; cannot create pile or connector here.");
    }
    let reserved_names = [
        "yama.toml",
        DIR_BLOBLOGS,
        DIR_LOCKS,
        FILE_YAMA_CONFIG,
        DIR_INDICES,
    ];
    // Report the first reserved name that is already taken.
    let clash = reserved_names
        .iter()
        .map(|name| path.join(name))
        .find(|candidate| candidate.exists());
    if let Some(important_path) = clash {
        bail!("{important_path:?} already exists: can't create pile or connector here.");
    }
    Ok(())
}
/// Perform checks before we init a pile in the given WormFileProvider.
///
/// Fails if either configuration file is already present as a regular file.
pub async fn pre_init_check_wfp(wfp: &impl WormFileProvider) -> eyre::Result<()> {
    for name in ["yama.toml", FILE_YAMA_CONFIG] {
        let important_path = WormPath::new(name).unwrap();
        let already_present = wfp.is_regular_file(&important_path).await?;
        if already_present {
            bail!("{important_path:?} already exists: can't create pile.");
        }
    }
    Ok(())
}
/// Initialise a pile.
/// Should be run after `pre_init_check_wfp`.
///
/// Writes the packed pile configuration and, when one is supplied, a copy of
/// the packed master keyring.
pub async fn init_pile(
    wfp: &impl WormFileProvider,
    pile_config: PackedPileConfig,
    master_keyring_copy: Option<PackedKeyring>,
) -> eyre::Result<()> {
    let config_bytes = pile_config.into_byte_vec();
    let mut config_writer = wfp.write().await?;
    config_writer.write_all(&config_bytes).await?;
    config_writer.flush().await?;
    config_writer
        .finalise(WormPath::new(FILE_YAMA_CONFIG).unwrap(), false)
        .await?;

    // The keyring copy is optional; without one there is nothing more to write.
    let keyring_copy = match master_keyring_copy {
        Some(keyring_copy) => keyring_copy,
        None => return Ok(()),
    };
    let keyring_bytes = keyring_copy.into_byte_vec();
    let mut keyring_writer = wfp.write().await?;
    keyring_writer.write_all(&keyring_bytes).await?;
    keyring_writer.flush().await?;
    keyring_writer
        .finalise(WormPath::new(FILE_MASTER_KEYRING).unwrap(), false)
        .await?;
    Ok(())
}
// todo move this
/// Packs a keyring for storage. With a password, the keyring is serialised and
/// encrypted under a key derived from that password; without one it is stored
/// unlocked.
pub fn pack_keyring(unpacked: Keyring, password: Option<&str>) -> eyre::Result<PackedKeyring> {
    let packed = match password {
        None => UnlockedOrLockedKeyring::Unlocked(unpacked),
        Some(password) => {
            let deriver = KeyDerivationParameters::new_recommended();
            let symkey = deriver
                .derive(password)
                .context("Failed to derive key from password")?
                .into_symkey();
            let serialised = CborSerde::serialise(&unpacked).unwrap();
            let lockbox = SymBox::new(serialised, &symkey).context("Failed to encrypt keyring")?;
            UnlockedOrLockedKeyring::Locked { deriver, lockbox }
        }
    };
    Ok(PackedKeyring::serialise(&packed).unwrap())
}

View File

@ -1,10 +1,25 @@
pub mod chunking;
pub mod commands;
pub mod debug;
pub mod definitions;
pub mod operations;
pub mod pile;
pub mod progress;
pub mod remote;
pub mod tree;
pub mod utils;
pub mod init;
pub mod open;
pub mod check;
pub mod extract;
pub mod scan;
pub mod storing;
pub mod vacuum;
pub mod pile_connector;
pub mod pile_with_cache;
pub mod retriever;
/// Progress bar template shared by the progress spans in this crate:
/// elapsed time, ETA, a full-width bar, position/length and a message.
pub const PROGRESS_BAR_STYLE: &str =
    "[{elapsed_precise}]/[{eta}] {wide_bar:.cyan/blue} {pos:>7}/{len:7} {msg}";
/// Returns this machine's hostname as a `String`.
///
/// # Panics
/// Panics if the hostname cannot be obtained or cannot be converted to a `String`.
pub fn get_hostname() -> String {
    let raw_hostname = hostname::get().expect("No hostname");
    raw_hostname
        .into_string()
        .expect("Hostname string must be sensible.")
}
pub mod debugging;

228
yama/src/open.rs Normal file
View File

@ -0,0 +1,228 @@
use crate::pile_connector::PileConnectionScheme;
use crate::pile_with_cache::PileWithCache;
use eyre::{bail, Context, ContextCompat};
use std::borrow::Cow;
use std::collections::BTreeSet;
use std::hash::{Hash, Hasher};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tokio::io::{AsyncBufReadExt, BufReader};
use tracing::debug;
use twox_hash::XxHash64;
use yama_midlevel_crypto::byte_layer::ByteLayer;
use yama_pile::definitions::{IndexId, PackedKeyring, UnlockedOrLockedKeyring};
use yama_pile::keyring::Keyring;
use yama_pile::locks::LockKind;
use yama_pile::{Pile, FILE_YAMA_CONFIG, FILE_YAMA_CONNECTOR};
use yama_wormfile::boxed::BoxedWormFileProvider;
/// Keyring file names that are tried, in this order, when opening a pile.
pub const KEYRING_LOOKUP_SEQ: [&str; 2] = ["access.yamakeyring", "master.yamakeyring"];
/// Loads the first keyring file found in `connector_in_dir`, trying the names
/// in `KEYRING_LOOKUP_SEQ` in order. The keyring is returned as stored, which
/// may still be locked.
pub async fn pre_open_keyring(connector_in_dir: &Path) -> eyre::Result<UnlockedOrLockedKeyring> {
    let first_existing = KEYRING_LOOKUP_SEQ
        .iter()
        .map(|lookup| connector_in_dir.join(lookup))
        .find(|candidate| candidate.exists());
    if let Some(keyring_path) = first_existing {
        return pre_open_keyring_at_path(&keyring_path).await;
    }
    bail!(
        "No keyring found in {:?}. Expected to see one at one of: {:?}",
        connector_in_dir,
        KEYRING_LOOKUP_SEQ
    );
}
/// Reads the keyring file at `keyring_path` and deserialises it, without
/// unlocking it.
///
/// # Errors
/// Fails if the file cannot be read or its contents cannot be deserialised;
/// both errors name the offending path.
pub async fn pre_open_keyring_at_path(
    keyring_path: &Path,
) -> eyre::Result<UnlockedOrLockedKeyring> {
    let packed_keyring_bytes = tokio::fs::read(keyring_path)
        .await
        .with_context(|| format!("failed to read keyring file at {:?}", keyring_path))?;
    PackedKeyring::from_byte_vec(packed_keyring_bytes)
        .deserialise()
        .with_context(|| format!("failed to deserialise keyring file at {:?}", keyring_path))
}
/// Turns a possibly locked keyring into a usable one.
///
/// An unlocked keyring is returned as is. For a locked keyring, a prompt is
/// printed, one line is read from standard input, and the trimmed line is used
/// as the password to derive the decryption key.
pub async fn open_keyring_interactive(input: UnlockedOrLockedKeyring) -> eyre::Result<Keyring> {
    let (deriver, lockbox) = match input {
        UnlockedOrLockedKeyring::Unlocked(keyring) => return Ok(keyring),
        UnlockedOrLockedKeyring::Locked { deriver, lockbox } => (deriver, lockbox),
    };

    println!("enter keyring password to decrypt:");
    let mut stdin_br = BufReader::new(tokio::io::stdin());
    let mut line = String::new();
    stdin_br.read_line(&mut line).await?;

    let password = line.trim();
    let symkey = deriver
        .derive(password)
        .context("failed to derive key from password")?
        .into_symkey();
    let keyring = lockbox
        .unlock(&symkey)
        .context("failed to decrypt keyring")?
        .deserialise()
        .context("failed to deserialise keyring")?;
    Ok(keyring)
}
/// Opens the pile found in, or referred to by a connector in, `connector_in_dir`.
///
/// If the directory contains the pile configuration file it is treated as a
/// local pile; otherwise the connector file is read and deserialised as TOML.
/// The final component of the (canonicalised, where possible) directory path
/// becomes the prefix of the local cache name.
///
/// # Errors
/// Fails if neither file exists, if the connector cannot be read or parsed, if
/// the local path cannot be canonicalised, or if opening the pile fails.
pub async fn open_pile(
    connector_in_dir: &Path,
    keyring: Keyring,
    lock_kind: LockKind,
    lock_holder: String,
) -> eyre::Result<PileWithCache<BoxedWormFileProvider>> {
    let connector_file = connector_in_dir.join(FILE_YAMA_CONNECTOR);
    let connection_scheme = if connector_in_dir.join(FILE_YAMA_CONFIG).exists() {
        PileConnectionScheme::Local {
            directory: connector_in_dir
                .canonicalize()
                .context("can't canonicalise local pile path")?,
        }
    } else if connector_file.exists() {
        let connector_toml = tokio::fs::read_to_string(&connector_file)
            .await
            .context("failed to read connector")?;
        let connector: PileConnectionScheme =
            toml::from_str(&connector_toml).context("failed to deserialise connector")?;
        connector
    } else {
        bail!("Neither yama.cfg nor yama.toml exists; doesn't look like a Yama pile or pile connector.");
    };
    // Calculate a prefix for the cache name
    let canon_connector_in_dir = connector_in_dir
        .canonicalize()
        .unwrap_or_else(|_| connector_in_dir.to_owned());
    let cache_base_name = canon_connector_in_dir
        .file_name()
        .map(|f| f.to_string_lossy())
        .unwrap_or(Cow::Borrowed("_"));
    open_pile_using_connector(
        &connection_scheme,
        cache_base_name.as_ref(),
        keyring,
        lock_kind,
        lock_holder,
    )
    .await
}
/// Connects to a pile through `connection_scheme`, opens it with the given
/// lock, and opens the matching local cache database.
///
/// The cache file lives in the user cache directory for "yama" and is named
/// `<cache_base_name>-<hash>.sqlite3`, where the hash is the 64-bit XxHash of
/// the connection scheme.
///
/// # Panics
/// Panics if the user cache directory cannot be determined.
///
/// # Errors
/// Fails if connecting, opening the pile, creating the cache directory or
/// opening the cache fails.
pub async fn open_pile_using_connector(
    connection_scheme: &PileConnectionScheme,
    cache_base_name: &str,
    keyring: Keyring,
    lock_kind: LockKind,
    lock_holder: String,
) -> eyre::Result<PileWithCache<BoxedWormFileProvider>> {
    let wormfileprovider = Arc::new(connection_scheme.connect_to_wormfileprovider().await?);
    let pile = Pile::open_manual(wormfileprovider, lock_kind, lock_holder, keyring).await?;
    let cache_dir = appdirs::user_cache_dir(Some("yama"), None).expect("can't obtain cache dir!");
    // Hash the connection scheme so that different piles get different cache files.
    let mut hasher = XxHash64::default();
    connection_scheme.hash(&mut hasher);
    let u64_hash = hasher.finish();
    let cache_key = format!("{}-{:016x}.sqlite3", cache_base_name, u64_hash);
    tokio::fs::create_dir_all(&cache_dir).await?;
    let cache_file = cache_dir.join(&cache_key);
    let localcache = yama_localcache::Store::new(&cache_file)
        .await
        .context("failed to open local cache")?;
    Ok(PileWithCache { pile, localcache })
}
/// Brings the local cache in line with the indices available in the pile:
/// indices missing from the cache are downloaded and applied, and indices that
/// no longer exist in the pile are backed out of the cache.
pub async fn update_cache(pwc: &PileWithCache<BoxedWormFileProvider>) -> eyre::Result<()> {
    debug!("updating cache");
    let in_pile = pwc
        .pile
        .list_indices()
        .await
        .context("can't list available indices")?;
    let in_cache = pwc
        .localcache
        .read()
        .await?
        .list_indices()
        .await
        .context("can't list cached indices")?;

    let missing_indices: BTreeSet<IndexId> = in_pile.difference(&in_cache).cloned().collect();
    let deleted_indices: BTreeSet<IndexId> = in_cache.difference(&in_pile).cloned().collect();
    debug!(
        "{} new indices to cache, {} deleted indices to back out",
        missing_indices.len(),
        deleted_indices.len()
    );

    // Download everything first; the cache write handle is only taken afterwards.
    let mut downloaded_indices = Vec::new();
    for missing_index in missing_indices {
        debug!("downloading index {missing_index:?}");
        let index = pwc.pile.read_index(missing_index).await?;
        downloaded_indices.push((missing_index, index));
    }

    let mut txn = pwc.localcache.write().await?;
    for deleted_index in deleted_indices {
        debug!("backing out index {deleted_index:?}");
        txn.delete_index(deleted_index).await?;
    }
    for (index_id, index) in downloaded_indices {
        debug!("applying index {index_id:?}");
        txn.apply_index(index_id, Arc::new(index)).await?;
    }
    debug!("finished updating cache");
    Ok(())
}
/// Convenience wrapper: loads and (interactively) unlocks the keyring found in
/// `pile_connector_path`, opens the pile with a shared lock held under
/// `lock_name`, and refreshes the local cache.
pub async fn open_lock_and_update_cache(
    pile_connector_path: PathBuf,
    lock_name: String,
) -> eyre::Result<Arc<PileWithCache<BoxedWormFileProvider>>> {
    let stored_keyring = pre_open_keyring(&pile_connector_path).await?;
    let keyring = open_keyring_interactive(stored_keyring).await?;
    let pwc = Arc::new(
        open_pile(&pile_connector_path, keyring, LockKind::Shared, lock_name).await?,
    );
    update_cache(&pwc).await?;
    Ok(pwc)
}
/// Like `open_lock_and_update_cache`, but for an explicitly supplied
/// connection scheme and keyring path instead of a connector directory.
pub async fn open_lock_and_update_cache_with_connector(
    pile_connection_scheme: &PileConnectionScheme,
    cache_base_name: &str,
    keyring_path: &Path,
    lock_name: String,
) -> eyre::Result<Arc<PileWithCache<BoxedWormFileProvider>>> {
    let stored_keyring = pre_open_keyring_at_path(keyring_path).await?;
    let keyring = open_keyring_interactive(stored_keyring).await?;
    let opened = open_pile_using_connector(
        pile_connection_scheme,
        cache_base_name,
        keyring,
        LockKind::Shared,
        lock_name,
    )
    .await?;
    let pwc = Arc::new(opened);
    update_cache(&pwc).await?;
    Ok(pwc)
}

View File

@ -1,4 +0,0 @@
pub mod checking;
pub mod extracting;
pub mod pushpull;
pub mod storing;

View File

@ -1,410 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use crate::chunking::RecursiveUnchunker;
use crate::commands::retrieve_tree_node;
use crate::definitions::{ChunkId, TreeNode};
use crate::pile::{ControllerMessage, Keyspace, Pile, RawPile, StoragePipelineSettings};
use anyhow::bail;
use crossbeam_channel::Sender;
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use log::{error, info, warn};
use std::collections::HashSet;
use std::io::{Read, Write};
use std::sync::Mutex;
/// Selects what the check routines do about unreferenced chunks.
#[derive(PartialEq, Eq, Copy, Clone, Debug)]
pub enum VacuumMode {
/// Do not track retrieved chunks and do not vacuum.
NoVacuum,
/// Track chunks and report how many could be vacuumed, without deleting anything.
DryRunVacuum,
/// Track chunks and delete those that were found to be unreferenced.
Vacuum,
}
/// A writer that discards everything written to it.
pub struct NullWriter {}

impl Write for NullWriter {
    /// Accepts the whole buffer and throws it away.
    fn write(&mut self, data: &[u8]) -> std::io::Result<usize> {
        let accepted = data.len();
        Ok(accepted)
    }

    /// Nothing is ever buffered, so flushing always succeeds.
    fn flush(&mut self) -> std::io::Result<()> {
        std::io::Result::Ok(())
    }
}
/// Mark-and-sweep style vacuuming system.
/// We mark all the chunks that we run into (following the structure of all the pointers and
/// recursive chunk references) and sweep the chunks that have not been read.
#[derive(Debug)]
pub struct VacuumRawPile<RP: RawPile> {
/// The raw pile that all operations are forwarded to.
underlying: RP,
/// When false, reads are not recorded and the sweep set cannot be calculated.
vacuum_tracking_enabled: bool,
/// IDs of all chunks read through this wrapper while tracking was enabled (the "marked" set).
pub retrieved_chunks: Mutex<HashSet<ChunkId>>,
}
impl<RP: RawPile> VacuumRawPile<RP> {
    /// Wraps `underlying`; reads of chunks are recorded only when
    /// `vacuum_tracking_enabled` is true.
    pub fn new(underlying: RP, vacuum_tracking_enabled: bool) -> Self {
        VacuumRawPile {
            underlying,
            vacuum_tracking_enabled,
            retrieved_chunks: Default::default(),
        }
    }

    /// Calculates the sweep set: every chunk key present in the pile that has
    /// not been read through this wrapper.
    ///
    /// # Errors
    /// Fails if tracking is disabled, if listing the keys fails, or if a listed
    /// key does not have the length of a chunk ID (previously this panicked).
    pub fn calculate_vacuum_for_sweeping(&self) -> anyhow::Result<HashSet<ChunkId>> {
        if !self.vacuum_tracking_enabled {
            bail!("Vacuum tracking not enabled, you can't calculate the vacuum set!");
        }
        let mut to_sweep = HashSet::new();
        let retrieved_chunks = self.retrieved_chunks.lock().unwrap();
        let mut chunk_id: ChunkId = Default::default();
        for key in self.list_keys(Keyspace::Chunk)? {
            let key = key?;
            // `clone_from_slice` panics on a length mismatch, so reject bad keys first.
            if key.len() != chunk_id.len() {
                bail!(
                    "Chunk key has unexpected length {} (expected {})",
                    key.len(),
                    chunk_id.len()
                );
            }
            chunk_id.clone_from_slice(&key);
            if !retrieved_chunks.contains(&chunk_id) {
                to_sweep.insert(chunk_id.clone());
            }
        }
        Ok(to_sweep)
    }
}
/// Forwards every operation to the underlying raw pile. The only addition is
/// in `read`, which records the IDs of chunks that are read while tracking is
/// enabled.
impl<RP: RawPile> RawPile for VacuumRawPile<RP> {
fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool> {
self.underlying.exists(kind, key)
}
fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
// Mark the chunk as seen before delegating the read itself.
if self.vacuum_tracking_enabled && kind == Keyspace::Chunk {
let mut chunk_id: ChunkId = Default::default();
// NOTE(review): `clone_from_slice` panics if `key` does not have the length of a `ChunkId`.
chunk_id.clone_from_slice(key);
self.retrieved_chunks.lock().unwrap().insert(chunk_id);
}
self.underlying.read(kind, key)
}
fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
self.underlying.write(kind, key, value)
}
fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
self.underlying.delete(kind, key)
}
fn list_keys(
&self,
kind: Keyspace,
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>> {
self.underlying.list_keys(kind)
}
fn flush(&self) -> anyhow::Result<()> {
self.underlying.flush()
}
fn check_lowlevel(&self) -> anyhow::Result<bool> {
self.underlying.check_lowlevel()
}
fn build_storage_pipeline(
&self,
settings: StoragePipelineSettings,
controller_send: Sender<ControllerMessage>,
) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
self.underlying
.build_storage_pipeline(settings, controller_send)
}
}
/// Runs a full check of a Yama pile. This reads ALL the chunks, which can take a long time.
/// This is also capable of finding and vacuuming unused chunks.
/// This checks:
/// - the integrity of each chunk (assuming an integrity-aware raw pile is used)
/// - the structure of pointers and multi-level chunk references
///
/// Returns the number of errors found. Vacuuming is only attempted when no
/// errors were found and `vacuum` is not `VacuumMode::NoVacuum`; chunks are only
/// deleted in `VacuumMode::Vacuum`.
pub fn check_deep<RP: RawPile>(
pile: Pile<RP>,
vacuum: VacuumMode,
make_progress_bar: bool,
) -> anyhow::Result<u32> {
// Re-wrap the raw pile so that every chunk read during the check is recorded.
let pile = Pile::new(VacuumRawPile::new(
pile.raw_pile,
vacuum != VacuumMode::NoVacuum,
));
let mut errors = 0;
let mut to_check = Vec::new();
let pointer_list = pile.list_pointers()?;
// Phase 1: walk every pointer's tree and collect the content references of all normal files.
for pointer in pointer_list.iter() {
info!("Checking pointer {:?}", pointer);
match pile.read_pointer(&pointer)? {
Some(pointer_data) => {
if let Some(parent) = pointer_data.parent_pointer {
if !pointer_list.contains(&parent) {
errors += 1;
error!(
"Pointer {:?} has a parent {:?} which does not exist.",
pointer, parent
);
}
}
let tree_node = retrieve_tree_node(&pile, pointer_data.chunk_ref.clone())?;
tree_node.node.visit(
&mut |node, _| {
if let TreeNode::NormalFile { content, .. } = node {
to_check.push(content.clone());
}
Ok(())
},
"".to_owned(),
)?;
}
None => {
errors += 1;
error!("Pointer {:?} does not seem to exist.", pointer);
}
}
}
let pbar = if make_progress_bar {
ProgressBar::with_draw_target(1000 as u64, ProgressDrawTarget::stdout_with_hz(10))
} else {
ProgressBar::hidden()
};
pbar.set_style(
ProgressStyle::default_bar()
.template("[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}"),
);
pbar.set_message("checking");
let mut done = 0;
// Phase 2: read every file's content in full, discarding the data; read errors are counted.
while let Some(next_to_check) = to_check.pop() {
done += 1;
pbar.set_length(done + to_check.len() as u64);
pbar.set_position(done);
let mut unchunker = RecursiveUnchunker::new(&pile, next_to_check.clone());
match std::io::copy(&mut unchunker, &mut NullWriter {}) {
Ok(_) => {}
Err(err) => {
errors += 1;
warn!(
"Error occurred when reading {:?}: {:?}.",
next_to_check, err
);
}
}
}
pbar.finish_and_clear();
if errors > 0 {
// NOTE(review): this message prints only the count ("There were 3"); it looks truncated.
error!("There were {:?}", errors);
} else {
info!("No errors.");
}
// Phase 3: sweep. Skipped entirely if anything went wrong above.
if errors == 0 && vacuum != VacuumMode::NoVacuum {
info!("Calculating sweep set for vacuuming.");
let to_vacuum = pile.raw_pile.calculate_vacuum_for_sweeping()?;
info!("{} chunks are ready to be vacuumed.", to_vacuum.len());
if vacuum == VacuumMode::Vacuum {
let pbar = if make_progress_bar {
ProgressBar::with_draw_target(
to_vacuum.len() as u64,
ProgressDrawTarget::stdout_with_hz(10),
)
} else {
ProgressBar::hidden()
};
pbar.set_style(
ProgressStyle::default_bar().template(
"[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}",
),
);
pbar.set_message("vacuuming");
// actually do the vacuum!
info!("Going to vacuum them up.");
for vacuum_id in to_vacuum {
pile.raw_pile.delete(Keyspace::Chunk, &vacuum_id)?;
pbar.inc(1);
}
pile.flush()?;
pbar.finish_and_clear();
}
}
Ok(errors)
}
/// A shallower check than the deep one. This avoids reading the last layer of chunks.
/// (they are simply assumed to be OK.).
/// This leads to much faster performance and is mostly intended for GC.
/// We can check existence for those leaf chunks if desired. This still avoids the
/// overhead of decryption, decompression and reading from disk/network.
///
/// Returns the number of errors found. As the leaf chunks are never read, their
/// IDs are collected separately and excluded from the sweep set before vacuuming.
pub fn check_shallow<RP: RawPile>(
pile: Pile<RP>,
vacuum: VacuumMode,
make_progress_bar: bool,
check_existence: bool,
) -> anyhow::Result<u32> {
// Re-wrap the raw pile so that every chunk read during the check is recorded.
let pile = Pile::new(VacuumRawPile::new(
pile.raw_pile,
vacuum != VacuumMode::NoVacuum,
));
// Leaf chunks that are referenced but deliberately not read.
let mut additional_seen: HashSet<ChunkId> = HashSet::new();
let mut errors = 0;
let mut to_check = Vec::new();
let pointer_list = pile.list_pointers()?;
// Phase 1: walk every pointer's tree and collect the content references of all normal files.
for pointer in pointer_list.iter() {
info!("Checking pointer {:?}", pointer);
match pile.read_pointer(&pointer)? {
Some(pointer_data) => {
if let Some(parent) = pointer_data.parent_pointer {
if !pointer_list.contains(&parent) {
errors += 1;
error!(
"Pointer {:?} has a parent {:?} which does not exist.",
pointer, parent
);
}
}
let tree_node = retrieve_tree_node(&pile, pointer_data.chunk_ref.clone())?;
tree_node.node.visit(
&mut |node, _| {
if let TreeNode::NormalFile { content, .. } = node {
to_check.push(content.clone());
}
Ok(())
},
"".to_owned(),
)?;
}
None => {
errors += 1;
error!("Pointer {:?} does not seem to exist.", pointer);
}
}
}
let pbar = if make_progress_bar {
ProgressBar::with_draw_target(1000 as u64, ProgressDrawTarget::stdout_with_hz(10))
} else {
ProgressBar::hidden()
};
pbar.set_style(
ProgressStyle::default_bar()
.template("[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}"),
);
pbar.set_message("checking");
let mut done = 0;
// Phase 2: resolve each reference down to, but not including, its leaf chunks.
while let Some(next_to_check) = to_check.pop() {
done += 1;
pbar.set_length(done + to_check.len() as u64);
pbar.set_position(done);
if next_to_check.depth > 0 {
// Reading at one level less yields the list of leaf chunk IDs instead of file data.
let mut reduced_height = next_to_check.clone();
reduced_height.depth -= 1;
let mut chunk_id_buf: ChunkId = Default::default();
let mut unchunker = RecursiveUnchunker::new(&pile, reduced_height);
loop {
let read_bytes = unchunker.read(&mut chunk_id_buf)?;
if read_bytes == 0 {
// end of chunks, because return of zero here means EOF
break;
}
if read_bytes < chunk_id_buf.len() {
// any error, including EOF at this point, is an error
unchunker.read_exact(&mut chunk_id_buf[read_bytes..])?;
}
if check_existence && !pile.chunk_exists(&chunk_id_buf)? {
errors += 1;
warn!("Chunk missing: {:?}", &chunk_id_buf);
}
additional_seen.insert(chunk_id_buf.clone());
}
} else {
// already shallowest, just add the reference to the seen list.
additional_seen.insert(next_to_check.chunk_id);
}
}
pbar.finish_and_clear();
if errors > 0 {
// NOTE(review): this message prints only the count ("There were 3"); it looks truncated.
error!("There were {:?}", errors);
} else {
info!("No errors.");
}
// Phase 3: sweep. Skipped entirely if anything went wrong above.
if errors == 0 && vacuum != VacuumMode::NoVacuum {
info!("Calculating sweep set for vacuuming.");
let mut to_vacuum = pile.raw_pile.calculate_vacuum_for_sweeping()?;
// don't forget to include the leaves that we didn't actually visit!
for element in additional_seen {
to_vacuum.remove(&element);
}
info!("{} chunks are ready to be vacuumed.", to_vacuum.len());
if vacuum == VacuumMode::Vacuum {
let pbar = if make_progress_bar {
ProgressBar::with_draw_target(
to_vacuum.len() as u64,
ProgressDrawTarget::stdout_with_hz(10),
)
} else {
ProgressBar::hidden()
};
pbar.set_style(
ProgressStyle::default_bar().template(
"[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}",
),
);
pbar.set_message("vacuuming");
// actually do the vacuum!
info!("Going to vacuum them up.");
for vacuum_id in to_vacuum {
pile.raw_pile.delete(Keyspace::Chunk, &vacuum_id)?;
pbar.inc(1);
}
pile.flush()?;
pbar.finish_and_clear();
}
}
Ok(errors)
}

View File

@ -1,370 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::convert::TryInto;
use std::fs::OpenOptions;
use std::os::unix::fs::PermissionsExt;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicU32, Ordering};
use std::{fs, io};
use anyhow::{anyhow, Context};
use crossbeam_channel::{Receiver, Sender};
use crossbeam_utils::thread;
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use log::error;
use nix::sys::time::{TimeVal, TimeValLike};
use nix::unistd::{Gid, Uid};
use crate::chunking::RecursiveUnchunker;
use crate::commands::fully_load_pointer;
use crate::definitions::{FilesystemOwnership, RecursiveChunkRef, TreeNode};
use crate::pile::{Pile, RawPile};
use std::collections::{BTreeMap, HashMap};
/// Given a fully-integrated root node, extracts the files from the pile.
///
/// File contents are written by `num_workers` worker threads. A separate manager thread walks
/// `root`, creates directories and symbolic links, and queues normal files for the workers.
/// Once all threads have finished, metadata is applied to the extracted tree according to
/// `apply_permissions`, `apply_mtime` and `apply_ownership`.
///
/// # Errors
///
/// Returns an error if any worker or the manager reported a failure, or if applying metadata
/// fails. Metadata is not applied when the extraction itself failed.
pub fn extract<RP: RawPile>(
    target_path: &Path,
    root: &mut TreeNode,
    pile: &Pile<RP>,
    make_progress_bar: bool,
    num_workers: u8,
    apply_permissions: bool,
    apply_mtime: bool,
    apply_ownership: bool,
) -> anyhow::Result<()> {
    let pbar = if make_progress_bar {
        ProgressBar::with_draw_target(
            root.count_normal_files() as u64,
            ProgressDrawTarget::stdout_with_hz(10),
        )
    } else {
        ProgressBar::hidden()
    };
    pbar.set_style(
        ProgressStyle::default_bar()
            .template("[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}"),
    );
    pbar.set_message("extracting");

    let (paths_send, paths_recv) = crossbeam_channel::unbounded();
    let (results_send, results_recv) = crossbeam_channel::bounded(16);

    let failures = AtomicU32::new(0);

    thread::scope(|s| {
        for worker in 0..num_workers {
            let paths_recv = paths_recv.clone();
            let results_send = results_send.clone();
            let failures = &failures; // needed because of move
            s.builder()
                .name(format!("yama unchunker {}", worker))
                .spawn(move |_| {
                    if let Err(e) = extract_worker(pile, paths_recv, results_send) {
                        error!("Extraction worker {} failed: {:?}!", worker, e);
                        failures.fetch_add(1, Ordering::Relaxed);
                    }
                })
                .expect("Failed to start thread");
        }

        // Needed to allow the manager to join once the workers finish and drop their senders.
        drop(results_send);
        drop(paths_recv);

        s.spawn(|_| {
            if let Err(e) = manager(root, target_path, paths_send, results_recv, &pbar) {
                error!("Extraction manager failed: {:?}!", e);
                failures.fetch_add(1, Ordering::Relaxed);
            }
        });
    })
    .expect("join issue");

    // The threads only log their errors and bump the counter. Without this check a failed
    // extraction would be reported to the caller as a success.
    let failures = failures.load(Ordering::SeqCst);
    if failures > 0 {
        anyhow::bail!("There were {} failures during extraction.", failures);
    }

    pbar.set_message("applying metadata");
    apply_metadata(
        root,
        target_path,
        apply_permissions,
        apply_mtime,
        apply_ownership,
    )?;

    Ok(())
}
/// Loads the pointer called `pointer_name` from the pile and extracts its tree.
///
/// Before extraction, the UIDs and GIDs stored in the tree are remapped using the pointer's
/// name lookup tables. The tree is written to `target_path` joined with the root node's name.
/// All remaining arguments are handed to [`extract`] unchanged.
pub fn extract_from_pointer_name<RP: RawPile>(
    target_path: &Path,
    pointer_name: &str,
    pile: &Pile<RP>,
    make_progress_bar: bool,
    num_workers: u8,
    apply_permissions: bool,
    apply_mtime: bool,
    apply_ownership: bool,
) -> anyhow::Result<()> {
    let (pointer_data, mut root_node) = fully_load_pointer(pile, pointer_name.as_ref())?;

    // convert the UIDs and GIDs to match this system, which may be different from the usual.
    let uid_translation_table = build_uid_translation_table(&pointer_data.uid_lookup);
    let gid_translation_table = build_gid_translation_table(&pointer_data.gid_lookup);
    apply_uid_and_gid_translation_tables(
        &mut root_node.node,
        &uid_translation_table,
        &gid_translation_table,
    );

    let destination = target_path.join(&root_node.name);
    extract(
        &destination,
        &mut root_node.node,
        pile,
        make_progress_bar,
        num_workers,
        apply_permissions,
        apply_mtime,
        apply_ownership,
    )
}
pub fn build_uid_translation_table(
uid_lookup: &BTreeMap<u16, Option<String>>,
) -> HashMap<u16, u16> {
let mut result: HashMap<u16, u16> = Default::default();
for (old_uid, name) in uid_lookup.iter() {
if let Some(name) = name {
if let Some(user) = users::get_user_by_name(name) {
let new_uid = user.uid() as u16;
if new_uid != *old_uid {
result.insert(*old_uid, new_uid);
}
}
}
}
result
}
pub fn build_gid_translation_table(
gid_lookup: &BTreeMap<u16, Option<String>>,
) -> HashMap<u16, u16> {
let mut result: HashMap<u16, u16> = Default::default();
for (old_gid, name) in gid_lookup.iter() {
if let Some(name) = name {
if let Some(group) = users::get_group_by_name(name) {
let new_gid = group.gid() as u16;
if new_gid != *old_gid {
result.insert(*old_gid, new_gid);
}
}
}
}
result
}
/// Rewrites the ownership of every node in the tree using the given translation tables.
///
/// A UID or GID with no entry in its table is left as it is. `Deleted` nodes carry no ownership
/// and are skipped.
pub fn apply_uid_and_gid_translation_tables(
    node: &mut TreeNode,
    uid_translation: &HashMap<u16, u16>,
    gid_translation: &HashMap<u16, u16>,
) {
    // With both tables empty every lookup would miss, so skip walking the tree altogether.
    if uid_translation.is_empty() && gid_translation.is_empty() {
        return;
    }

    let translate = |owner: &mut FilesystemOwnership| {
        if let Some(&new_uid) = uid_translation.get(&owner.uid) {
            owner.uid = new_uid;
        }
        if let Some(&new_gid) = gid_translation.get(&owner.gid) {
            owner.gid = new_gid;
        }
    };

    node.visit_mut(
        &mut |current, _| {
            match current {
                TreeNode::NormalFile { ownership, .. }
                | TreeNode::Directory { ownership, .. }
                | TreeNode::SymbolicLink { ownership, .. } => translate(ownership),
                TreeNode::Deleted => {}
            }
            Ok(())
        },
        "".to_owned(),
    )
    .expect("Can't fail since we don't fail.");
}
/// Body of one extraction worker thread.
///
/// Takes `(path, chunk reference)` jobs from `paths` until that channel is closed. For each job
/// it creates the file at `path` (failing if it already exists), streams the unchunked content
/// into it, and then sends `()` on `results`.
pub fn extract_worker<RP: RawPile>(
    pile: &Pile<RP>,
    paths: Receiver<(PathBuf, RecursiveChunkRef)>,
    results: Sender<()>,
) -> anyhow::Result<()> {
    for (path, chunk_ref) in paths.iter() {
        let mut extractor = RecursiveUnchunker::new(pile, chunk_ref);

        let mut open_options = OpenOptions::new();
        open_options.write(true).create_new(true);
        let mut file = open_options
            .open(&path)
            .with_context(|| format!("Failed to open {:?}", path))?;

        io::copy(&mut extractor, &mut file)?;

        results
            .send(())
            .map_err(|_| anyhow!("Failed to send result"))?;
    }
    Ok(())
}
/// A single thread that manages the workers
pub fn manager(
root: &mut TreeNode,
target_path: &Path,
paths_sender: Sender<(PathBuf, RecursiveChunkRef)>,
results_receiver: Receiver<()>,
progress_bar: &ProgressBar,
) -> anyhow::Result<()> {
root.visit(
&mut |tree_node, name| {
let final_path = if name.is_empty() {
target_path.to_path_buf()
} else {
target_path.join(name)
};
match tree_node {
TreeNode::NormalFile { content, .. } => {
paths_sender
.send((final_path, content.clone()))
.expect("Unable to send to should-be unbounded channel");
}
TreeNode::Directory { .. } => {
fs::create_dir(&final_path)?;
}
TreeNode::SymbolicLink { target, .. } => {
// TODO may want to perform rewrites ...?
std::os::unix::fs::symlink(target, &final_path)?;
}
TreeNode::Deleted => {
panic!("should not be extracting 'Deleted!' --- BUG.");
}
};
Ok(())
},
"".to_string(),
)?;
// Needed to allow the workers to finish; otherwise we never join.
drop(paths_sender);
while let Ok(()) = results_receiver.recv() {
progress_bar.inc(1);
}
Ok(())
}
/// Applies metadata (permissions, mtime, ownership) to files from a tree node.
///
/// `target` is the path on disk that corresponds to `root`. Directories are processed
/// recursively.
///
/// The order of operations is deliberate:
/// * ownership is set before permissions, because changing the owner can clear the
///   set-user-ID and set-group-ID bits of a file;
/// * a directory's own ownership and permissions are set only after its children have been
///   processed, so a restrictive directory mode cannot block access to its contents;
/// * the ownership of a symbolic link is set on the link itself, never on the file it points to.
///
/// # Errors
///
/// Returns an error as soon as any ownership, permission or mtime change fails.
///
/// # Panics
///
/// Panics if the tree contains a `Deleted` node.
pub fn apply_metadata(
    root: &TreeNode,
    target: &Path,
    apply_permissions: bool,
    apply_mtime: bool,
    apply_owner: bool,
) -> anyhow::Result<()> {
    match root {
        TreeNode::NormalFile {
            mtime,
            ownership,
            permissions,
            ..
        } => {
            if apply_owner {
                set_owner_of_non_link(target, ownership.uid as u32, ownership.gid as u32)?;
            }
            if apply_permissions {
                set_mode(target, permissions.mode)?;
            }
            if apply_mtime {
                // An mtime that does not convert is skipped rather than treated as an error.
                if let Ok(mtime) = (*mtime).try_into() {
                    let tv = TimeVal::milliseconds(mtime);
                    nix::sys::stat::lutimes(target, &tv, &tv)?;
                }
            }
        }
        TreeNode::Directory {
            ownership,
            permissions,
            children,
        } => {
            // Children first: the directory still has the mode it was created with.
            for (name, child) in children.iter() {
                let child_path = target.join(name);
                apply_metadata(
                    child,
                    &child_path,
                    apply_permissions,
                    apply_mtime,
                    apply_owner,
                )?;
            }
            if apply_owner {
                set_owner_of_non_link(target, ownership.uid as u32, ownership.gid as u32)?;
            }
            if apply_permissions {
                set_mode(target, permissions.mode)?;
            }
        }
        TreeNode::SymbolicLink { ownership, .. } => {
            if apply_owner {
                // `lchown` does not follow the link. A plain `chown` here would change the
                // owner of whatever the link points at, which may be outside the extracted
                // tree or may not exist at all.
                std::os::unix::fs::lchown(
                    target,
                    Some(ownership.uid as u32),
                    Some(ownership.gid as u32),
                )?;
            }
        }
        TreeNode::Deleted => {
            panic!("Deleted is not meant to be reachable here.");
        }
    }

    Ok(())
}

/// Sets the owner and group of a regular file or directory.
fn set_owner_of_non_link(target: &Path, uid: u32, gid: u32) -> anyhow::Result<()> {
    nix::unistd::chown(target, Some(Uid::from_raw(uid)), Some(Gid::from_raw(gid)))?;
    Ok(())
}

/// Replaces the permission bits of `target` with `mode`.
fn set_mode(target: &Path, mode: u32) -> anyhow::Result<()> {
    let mut perms = fs::metadata(target)?.permissions();
    perms.set_mode(mode);
    fs::set_permissions(target, perms)?;
    Ok(())
}

View File

@ -1,333 +0,0 @@
use crate::chunking::RecursiveUnchunker;
use crate::commands::fully_load_pointer;
use crate::definitions::{ChunkId, RecursiveChunkRef, TreeNode};
use crate::operations::checking::VacuumRawPile;
use crate::operations::pushpull::PushWorkerToManagerMessage::{NewTask, TaskDone};
use crate::pile::compression::{CompressionSettings, RawPileCompressor};
use crate::pile::integrity::RawPileIntegrityChecker;
use crate::pile::local_sqlitebloblogs::SqliteBloblogPile;
use crate::pile::{Keyspace, Pile, PileDescriptor, PileStorage, RawPile};
use crate::utils::get_number_of_workers;
use anyhow::{bail, Context};
use crossbeam_channel::{Receiver, Sender};
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use log::error;
use std::collections::HashSet;
use std::fs::File;
use std::io::Read;
use std::path::Path;
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::Arc;
/// Pushes chunks (and pointers) from one pile to another.
/// This is a thorough implementation that could be slow but at least should give good confidence.
/// (Presumably we could do better by looking at the pointers that already exist on the destination
/// and only integrating as much as we need to.)
///
/// For every pointer in `pointers`, the pointer data is written to `to_pile`, and the pointer's
/// own chunk reference plus the content of every normal file in its tree are scheduled for
/// copying. `num_workers` threads run [`pusher_worker`] while this thread runs
/// [`pusher_manager`].
///
/// # Errors
///
/// Returns an error if `num_workers` is zero, if a pointer cannot be loaded or written, if a
/// pointer of the same name already exists in the target pile, or if any worker or the manager
/// reported a critical failure.
///
/// NOTE(review): if only some of the workers fail, the jobs they were handling are never
/// reported as done. The manager then appears to wait indefinitely; this is not addressed here.
pub fn push_to(
    from_pile: Arc<Pile<Arc<Box<dyn RawPile>>>>,
    from_rp_bypass: Arc<Box<dyn RawPile>>,
    to_pile: Arc<Pile<Arc<Box<dyn RawPile>>>>,
    to_rp_bypass: Arc<Box<dyn RawPile>>,
    pointers: Vec<String>,
    make_progress_bar: bool,
    num_workers: u32,
) -> anyhow::Result<()> {
    if num_workers == 0 {
        // With no workers nothing would be copied, yet the manager would still return Ok.
        anyhow::bail!("At least one push worker is required.");
    }

    let pbar = if make_progress_bar {
        ProgressBar::with_draw_target(
            1, // TODO
            ProgressDrawTarget::stdout_with_hz(10),
        )
    } else {
        ProgressBar::hidden()
    };
    pbar.set_style(
        ProgressStyle::default_bar()
            .template("[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}"),
    );
    pbar.set_message("push/pull");

    let (jobs_tx, jobs_rx) = crossbeam_channel::unbounded();
    // This channel must be unbounded: the initial tasks are all sent below before the manager
    // starts receiving, so a bounded channel would block forever once it filled up.
    let (stat_tx, stat_rx) = crossbeam_channel::unbounded();

    let mut to_process = Vec::new();

    for pointer in pointers {
        let (pointer_data, root_node) = fully_load_pointer(&from_pile, &pointer)?;

        // schedule storing the pointer chunks
        to_process.push(pointer_data.chunk_ref.clone());

        if to_pile.read_pointer(&pointer)?.is_some() {
            anyhow::bail!(
                "Pointer {:?} already exists in the target pile; pushing over an existing pointer is not implemented.",
                pointer
            );
        }

        // copy across the pointer data
        to_pile.write_pointer(&pointer, &pointer_data)?;

        root_node
            .node
            .visit(
                &mut |node, _path| {
                    match node {
                        TreeNode::NormalFile { content, .. } => {
                            to_process.push(content.clone());
                        }
                        _ => {} // nop
                    }
                    Ok(())
                },
                String::new(),
            )
            .expect("No fail");
    }

    if to_process.is_empty() {
        // No pointers were given. Starting the workers would leave them and the manager
        // waiting on each other with no task ever arriving.
        return Ok(());
    }

    // start the work
    let critical_failures = Arc::new(AtomicU32::new(0));

    for worker_num in 0..num_workers {
        let jobs_rx = jobs_rx.clone();
        let stat_tx = stat_tx.clone();
        let critical_failures = critical_failures.clone();
        let from_pile = from_pile.clone();
        let from_rp_bypass = from_rp_bypass.clone();
        let to_pile = to_pile.clone();
        let to_rp_bypass = to_rp_bypass.clone();
        std::thread::Builder::new()
            .name(format!("yama pusher {}", worker_num))
            .spawn(move || {
                if let Err(e) = pusher_worker(
                    from_pile,
                    from_rp_bypass,
                    to_pile,
                    to_rp_bypass,
                    jobs_rx,
                    stat_tx,
                ) {
                    error!("[critical!] Push worker {} FAILED: {:?}", worker_num, e);
                    critical_failures.fetch_add(1, Ordering::Relaxed);
                }
            })
            .expect("Failed to start thread");
    }

    for task in to_process {
        stat_tx
            .send(NewTask(task))
            .expect("unbounded so should be able to send");
    }

    // must drop here for ending to happen
    drop(jobs_rx);
    drop(stat_tx);

    pbar.set_length(0);

    if let Err(e) = pusher_manager(&pbar, stat_rx, jobs_tx) {
        error!("[critical!] Push manager FAILED: {:?}", e);
        critical_failures.fetch_add(1, Ordering::Relaxed);
    }

    // Failures are only logged and counted above; surface them to the caller.
    let critical_failures = critical_failures.load(Ordering::SeqCst);
    if critical_failures > 0 {
        anyhow::bail!("There were {} critical failures.", critical_failures);
    }

    Ok(())
}
/// Messages received by `pusher_manager` on its update channel.
enum PushWorkerToManagerMessage {
/// A chunk reference that should be processed; the manager queues it as a job unless it has
/// already seen an equal reference.
NewTask(RecursiveChunkRef),
/// One previously queued job has been completed.
TaskDone,
}
/// Deduplicates incoming tasks and hands them out as jobs until all of them are done.
///
/// Each `NewTask` that has not been seen before is sent on `job_queue`, lengthens the progress
/// bar by one and counts as outstanding. Each `TaskDone` advances the progress bar and reduces
/// the outstanding count. The function returns when that count reaches zero, or when
/// `update_receiver` is disconnected.
fn pusher_manager(
    pbar: &ProgressBar,
    update_receiver: Receiver<PushWorkerToManagerMessage>,
    job_queue: Sender<RecursiveChunkRef>,
) -> anyhow::Result<()> {
    let mut in_flight = 0;
    let mut seen = HashSet::new();

    for message in update_receiver.iter() {
        match message {
            PushWorkerToManagerMessage::NewTask(task) => {
                if !seen.insert(task.clone()) {
                    // duplicate: already queued at some point
                    continue;
                }
                job_queue.send(task)?;
                pbar.inc_length(1);
                in_flight += 1;
            }
            PushWorkerToManagerMessage::TaskDone => {
                pbar.inc(1);
                in_flight -= 1;
                if in_flight == 0 {
                    break;
                }
            }
        }
    }

    Ok(())
}
/// Body of one push worker thread.
///
/// For every job taken from `jobs_rx`:
/// 1. if the target pile lacks the chunk, it is copied from `from_rp_bypass` to `to_rp_bypass`;
/// 2. if the job has a depth above zero, the reference is unchunked one level less deeply to
///    read the chunk IDs it contains, and each is announced to the manager as a depth-0 task.
///    Every chunk that had to be retrieved while doing so (other than the job's own chunk) is
///    announced as well;
/// 3. `TaskDone` is sent to the manager.
///
/// Returns an error if a chunk to be copied is missing from the source bypass pile, or if any
/// pile operation fails.
fn pusher_worker(
    from_pile: Arc<Pile<Arc<Box<dyn RawPile>>>>,
    from_rp_bypass: Arc<Box<dyn RawPile>>,
    to_pile: Arc<Pile<Arc<Box<dyn RawPile>>>>,
    to_rp_bypass: Arc<Box<dyn RawPile>>,
    jobs_rx: Receiver<RecursiveChunkRef>,
    stat_tx: Sender<PushWorkerToManagerMessage>,
) -> anyhow::Result<()> {
    // Announces a single chunk to the manager as a new depth-0 task.
    let announce = |chunk_id: ChunkId| {
        stat_tx
            .send(NewTask(RecursiveChunkRef { chunk_id, depth: 0 }))
            .expect("Should be able to send");
    };

    for job in jobs_rx.iter() {
        let already_present = to_pile.chunk_exists(&job.chunk_id)?;
        if !already_present {
            match from_rp_bypass.read(Keyspace::Chunk, &job.chunk_id)? {
                Some(bypass_chunk_data) => {
                    to_rp_bypass.write(Keyspace::Chunk, &job.chunk_id, &bypass_chunk_data)?;
                }
                None => {
                    bail!("Chunk cannot be copied because doesn't exist (in bypass pile).");
                }
            }
        }

        if job.depth > 0 {
            // we want to (partially) unchunk this and submit all subchunks.
            let vacuum_pile = Pile::new(VacuumRawPile::new(from_pile.raw_pile.clone(), true));

            // First read the bottom-level chunk IDs
            let mut reduced_height = job.clone();
            reduced_height.depth -= 1;

            let mut unchunker = RecursiveUnchunker::new(&vacuum_pile, reduced_height);
            let mut chunk_id_buf: ChunkId = Default::default();
            loop {
                let read_bytes = unchunker.read(&mut chunk_id_buf)?;
                if read_bytes == 0 {
                    // end of chunks, because return of zero here means EOF
                    break;
                }
                if read_bytes < chunk_id_buf.len() {
                    // any error, including EOF at this point, is an error
                    unchunker.read_exact(&mut chunk_id_buf[read_bytes..])?;
                }
                announce(chunk_id_buf.clone());
            }

            // Then track the chunks that we read whilst doing the above
            let retrieved = vacuum_pile
                .raw_pile
                .retrieved_chunks
                .lock()
                .expect("Should be able to lock");
            for needed_chunk_id in retrieved.iter() {
                // only track them if they're not the same as the one on this job.
                if needed_chunk_id != &job.chunk_id {
                    announce(needed_chunk_id.clone());
                }
            }
        }

        stat_tx.send(TaskDone)?;
    }
    Ok(())
}
/// How much per-chunk processing may be skipped when copying chunks between two piles.
#[derive(Copy, Clone, Debug)]
pub enum BypassLevel {
/// No processing may be skipped.
NoBypass,
/// Compression may be skipped; `determine_bypass_level` selects this only when both piles
/// are compressed with identical dictionaries.
CompressionBypass,
}
/// Decides how much work can be bypassed when copying between the two described piles.
///
/// Compression can only be bypassed when both piles use compression and their
/// `important_zstd.dict` files (in `dir1` and `dir2`) have identical contents. The dictionaries
/// are only read when both piles are compressed.
///
/// # Errors
///
/// Returns an error if either dictionary cannot be opened or read.
pub fn determine_bypass_level(
    desc1: &PileDescriptor,
    dir1: &Path,
    desc2: &PileDescriptor,
    dir2: &Path,
) -> anyhow::Result<BypassLevel> {
    if desc1.compression.is_none() || desc2.compression.is_none() {
        return Ok(BypassLevel::NoBypass);
    }

    let load_dictionary = |dir: &Path| -> anyhow::Result<Vec<u8>> {
        let mut dictionary = Vec::new();
        File::open(dir.join("important_zstd.dict"))
            .context("You need important_zstd.dict in your pile folder.")?
            .read_to_end(&mut dictionary)?;
        Ok(dictionary)
    };

    let dictionary1 = load_dictionary(dir1)?;
    let dictionary2 = load_dictionary(dir2)?;

    // we can only bypass if both dictionaries are the same
    let level = if dictionary1 == dictionary2 {
        BypassLevel::CompressionBypass
    } else {
        BypassLevel::NoBypass
    };
    Ok(level)
}
/// Opens a pile with potential for returning a 'complete' pile as well as a lower-level 'bypass'
/// pile, which, for example, skips performing compression operations.
///
/// Return tuple: (actual pile, bypass raw pile)
///
/// The bypass raw pile is the integrity-checked SQLite bloblog pile. The actual pile wraps that
/// same raw pile in a compressor when the descriptor specifies a compression level, and is the
/// same raw pile otherwise.
///
/// # Errors
///
/// Returns an error for remote-only piles, or if the pile, its dictionary or the compressor
/// cannot be opened.
///
/// # Panics
///
/// `BypassLevel::NoBypass` is not implemented and panics.
pub fn open_pile_with_work_bypass(
    dir: &Path,
    desc: &PileDescriptor,
    bypass_level: BypassLevel,
) -> anyhow::Result<(Pile<Arc<Box<dyn RawPile>>>, Arc<Box<dyn RawPile>>)> {
    let num_compressors = get_number_of_workers("YAMA_COMPRESSORS");
    let num_decompressors = get_number_of_workers("YAMA_DECOMPRESSORS");

    match desc.storage {
        PileStorage::RemoteOnly => {
            bail!("This is a remote-only pile. No local storage allowed.");
        }
        PileStorage::SqliteIndexedBloblog => {}
    }

    let blob_raw_pile = RawPileIntegrityChecker::new(SqliteBloblogPile::open(dir)?);

    match bypass_level {
        BypassLevel::NoBypass => {
            unimplemented!()
        }
        BypassLevel::CompressionBypass => {}
    }

    let common_raw_pile: Arc<Box<dyn RawPile>> = Arc::new(Box::new(blob_raw_pile));
    let raw_pile: Arc<Box<dyn RawPile>> = if let Some(comp_level) = desc.compression {
        let mut dictionary = Vec::new();
        File::open(dir.join("important_zstd.dict"))
            .context("You need important_zstd.dict in your pile folder.")?
            .read_to_end(&mut dictionary)?;

        let (compressed_pile, _handles) = RawPileCompressor::new(
            common_raw_pile.clone(),
            CompressionSettings {
                dictionary: Arc::new(dictionary),
                level: comp_level as i32,
                num_compressors: num_compressors as u32,
                num_decompressors: num_decompressors as u32,
            },
        )?;
        Arc::new(Box::new(compressed_pile))
    } else {
        common_raw_pile.clone()
    };

    Ok((Pile::new(raw_pile), common_raw_pile))
}

View File

@ -1,315 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::fs::File;
use std::io;
use std::io::ErrorKind;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicU32, Ordering};
use anyhow::{anyhow, bail, Context};
use crossbeam_channel::{Receiver, Sender};
use crossbeam_utils::thread;
use log::{error, warn};
use crate::chunking::{ChunkSubmissionTarget, RecursiveChunker, SENSIBLE_THRESHOLD};
use crate::commands;
use crate::commands::{fully_integrate_pointer_node, retrieve_tree_node};
use crate::definitions::{PointerData, RecursiveChunkRef, RootTreeNode, TreeNode};
use crate::pile::{existence_checker_stage, Pile, RawPile, StoragePipelineSettings};
use crate::progress::ProgressTracker;
use crate::tree::{create_uidgid_lookup_tables, differentiate_node_in_place};
use crate::utils::get_number_of_workers;
use std::collections::BTreeMap;
use std::sync::Arc;
/// Chunks the normal files of `root` (found under `root_path`) into `target`.
///
/// `num_workers` threads run [`store_worker`], while the calling thread runs [`manager`], which
/// feeds them paths and writes the resulting chunk references back into `root`. The maximum of
/// `progress_bar` is set to the number of normal files in `root`.
///
/// # Errors
///
/// Returns an error if any worker or the manager failed.
pub fn store<CST: ChunkSubmissionTarget, PT: ProgressTracker>(
    root_path: &Path,
    root: &mut TreeNode,
    target: &CST,
    progress_bar: &mut PT,
    num_workers: u8,
) -> anyhow::Result<()> {
    let (paths_send, paths_recv) = crossbeam_channel::unbounded();
    let (results_send, results_recv) = crossbeam_channel::bounded(16);

    progress_bar.set_max_size(root.count_normal_files() as u64);

    let failure_count = AtomicU32::new(0);

    thread::scope(|scope| {
        for worker_num in 0..num_workers {
            let worker_paths = paths_recv.clone();
            let worker_results = results_send.clone();
            // shared reference, so that the `move` closure below does not take the counter
            let failure_count = &failure_count;
            scope
                .builder()
                .name(format!("yama chunker {}", worker_num))
                .spawn(move |_| {
                    let outcome = store_worker(root_path, target, worker_paths, worker_results);
                    if let Err(e) = outcome {
                        error!("[critical!] Storage worker {} FAILED: {:?}", worker_num, e);
                        failure_count.fetch_add(1, Ordering::Relaxed);
                    }
                })
                .expect("Failed to start thread");
        }

        // Only the workers' clones may remain, otherwise the channels never disconnect.
        drop(results_send);
        drop(paths_recv);

        let outcome = manager(root, paths_send, results_recv, progress_bar);
        if let Err(e) = outcome {
            error!("[critical!] Storage manager FAILED: {:?}", e);
            failure_count.fetch_add(1, Ordering::Relaxed);
        }
    })
    .expect("thread scope failed");

    match failure_count.load(Ordering::SeqCst) {
        0 => Ok(()),
        critical_failures => bail!("There were {} critical failures.", critical_failures),
    }
}
/// Body of one storage worker thread.
///
/// Takes relative paths from `paths` until that channel is closed. Each file is opened relative
/// to `root` and streamed through a recursive chunker into `target`. The path and the resulting
/// chunk reference are then sent on `results`.
///
/// A file that no longer exists is reported as `(path, None)` and does not count as an error.
/// Any other failure to open a file ends the worker with that error.
pub fn store_worker<CST: ChunkSubmissionTarget>(
    root: &Path,
    target: &CST,
    paths: Receiver<String>,
    results: Sender<(String, Option<RecursiveChunkRef>)>,
) -> anyhow::Result<()> {
    for path in paths.iter() {
        let full_path = root.join(&path);

        let mut file = match File::open(&full_path) {
            Ok(file) => file,
            Err(err) if err.kind() == ErrorKind::NotFound => {
                warn!("File vanished: {:?}. Will ignore.", full_path);
                // send None so the manager knows to remove this from the tree.
                results
                    .send((path, None))
                    .map_err(|_| anyhow!("Failed to send result."))?;
                continue;
            }
            Err(err) => {
                if err.kind() == ErrorKind::PermissionDenied {
                    // TODO think about if we want a 'skip failed permissions' mode ...
                    error!(
                        "Permission denied to read {:?}; do you need to change user?",
                        full_path
                    );
                }
                return Err(err.into());
            }
        };

        let mut chunker = RecursiveChunker::new(SENSIBLE_THRESHOLD, target);
        // streaming copy from file to chunker, really cool :)
        io::copy(&mut file, &mut chunker)?;
        let chunk_ref = chunker.finish()?;

        results
            .send((path, Some(chunk_ref)))
            .map_err(|_| anyhow!("Failed to send result."))?;
    }
    Ok(())
}
/// Removes the node at the `/`-separated `child_path` from the tree rooted at `root`.
///
/// Every component before the last must name an existing directory. The last component is
/// removed from its parent's children; nothing happens if the parent has no such child.
fn delete_node(root: &mut TreeNode, child_path: &str) -> anyhow::Result<()> {
    let pieces: Vec<&str> = child_path.split('/').collect();
    // `split` always yields at least one piece, so there is always a last component.
    let (leaf, ancestors) = pieces.split_last().unwrap();

    let mut cursor = root;
    for &piece in ancestors {
        let children = match cursor {
            TreeNode::Directory { children, .. } => children,
            _ => bail!(
                "Tried to delete {} from tree node but '{}' not a directory.",
                child_path,
                piece
            ),
        };
        cursor = match children.get_mut(piece) {
            Some(next) => next,
            None => bail!(
                "Tried to delete {} but {} does not exist.",
                child_path,
                piece
            ),
        };
    }

    match cursor {
        TreeNode::Directory { children, .. } => {
            children.remove(*leaf);
        }
        _ => bail!(
            "Tried to delete {} from tree node but parent not a directory.",
            child_path
        ),
    }
    Ok(())
}
/// Replaces the content reference of the normal file at the `/`-separated `child_path`.
///
/// Fails if a component of the path is missing, if a component is looked up in something that
/// is not a directory, or if the node found at the end is not a `NormalFile`.
fn update_node(
    root: &mut TreeNode,
    child_path: &str,
    new_ref: RecursiveChunkRef,
) -> anyhow::Result<()> {
    let mut cursor = root;
    for piece in child_path.split('/') {
        let children = match cursor {
            TreeNode::Directory { children, .. } => children,
            _ => bail!(
                "Tried to update {} but {} not a directory.",
                child_path,
                piece
            ),
        };
        cursor = match children.get_mut(piece) {
            Some(child) => child,
            None => {
                return Err(anyhow!(
                    "Tried to update {} but {} not found",
                    child_path,
                    piece
                ))
            }
        };
    }

    match cursor {
        TreeNode::NormalFile { content, .. } => {
            *content = new_ref;
            Ok(())
        }
        _ => bail!("Tried to update {} but it's not a NormalFile.", child_path),
    }
}
/// Feeds the storage workers and folds their results back into the tree.
///
/// The path of every normal file in `root` is sent on `paths_sender`, which is then dropped so
/// the workers can finish. For each result received, the progress bar advances by one. The
/// node is given its new chunk reference, or removed from the tree when the worker reported
/// `None` for it.
pub fn manager<PT: ProgressTracker>(
    root: &mut TreeNode,
    paths_sender: Sender<String>,
    results_receiver: Receiver<(String, Option<RecursiveChunkRef>)>,
    progress_bar: &mut PT,
) -> anyhow::Result<()> {
    root.visit(
        &mut |tree_node, name| {
            if matches!(tree_node, TreeNode::NormalFile { .. }) {
                paths_sender
                    .send(name.to_string())
                    .map_err(|_| anyhow!("Unable to send to should-be unbounded channel"))?;
            }
            Ok(())
        },
        "".to_string(),
    )?;

    // Dropping the sender disconnects the channel, which is how the workers learn that no
    // more paths are coming.
    drop(paths_sender);

    for (path, opt_chunk_ref) in results_receiver.iter() {
        progress_bar.inc_progress(1);
        if let Some(new_chunk_ref) = opt_chunk_ref {
            update_node(root, &path, new_chunk_ref)?;
        } else {
            delete_node(root, &path)?;
        }
    }

    Ok(())
}
/// Stores files into the pile, potentially differentiating using a parent pointer (which will be
/// loaded and fully-integrated).
/// This also creates a pointer (which is why this is called `store_fully`).
///
/// The final component of `root_dir` becomes the name of the stored root node. When
/// `use_pipelined_storage` is set, chunks are submitted through the raw pile's storage pipeline
/// behind an existence-checking stage; otherwise they are submitted to the pile directly.
///
/// # Errors
///
/// Returns an error if `root_dir` has no final component or that component is not valid UTF-8
/// (checked before any data is stored), if the parent pointer does not exist, or if storing,
/// building the UID/GID lookup tables, or writing the tree node or pointer fails.
pub fn store_fully<PT: ProgressTracker>(
    pile: Arc<Pile<Box<dyn RawPile>>>,
    root_dir: &PathBuf,
    new_pointer_name: &String,
    mut root_node: TreeNode,
    parent: Option<String>,
    num_workers: u8,
    progress_bar: &mut PT,
    use_pipelined_storage: bool,
) -> anyhow::Result<()> {
    // Work out the root name up front. The path comes from the user, so a path such as `/`
    // or one ending in `..` must give an error, and must do so before the store has run.
    let root_name = root_dir
        .file_name()
        .ok_or_else(|| {
            anyhow!(
                "Root directory {:?} has no final path component to use as its name.",
                root_dir
            )
        })?
        .to_str()
        .ok_or_else(|| {
            anyhow!(
                "The name of root directory {:?} is not valid UTF-8.",
                root_dir
            )
        })?
        .to_owned();

    if let Some(parent) = parent.as_ref() {
        let mut parent_pointer = pile.read_pointer(parent)?.ok_or_else(|| {
            anyhow!(
                "Selected parent pointer {:?} didn't exist when tried to retrieve it.",
                parent
            )
        })?;
        let mut parent_node = retrieve_tree_node(&pile, parent_pointer.chunk_ref.clone())?;
        fully_integrate_pointer_node(&pile, &mut parent_node.node, &mut parent_pointer)?;

        differentiate_node_in_place(&mut root_node, &parent_node.node)?;
    }

    if use_pipelined_storage {
        // TODO make these configurable
        let sps = StoragePipelineSettings {
            num_compressors: get_number_of_workers("YAMA_PL_COMPRESSORS") as u32,
            compressor_input_bound: 64,
            writer_input_bound: 64,
        };
        let (control_tx, control_rx) = crossbeam_channel::unbounded();
        let pile2 = pile.clone();
        let pipeline = pile.raw_pile.build_storage_pipeline(sps, control_tx)?;
        let pipeline = existence_checker_stage(pile2, pipeline);

        store(
            &root_dir,
            &mut root_node,
            &pipeline,
            progress_bar,
            num_workers,
        )?;

        // must drop the pipeline to allow the threads to close
        drop(pipeline);
        while let Ok(_) = control_rx.recv() {
            // TODO nothing for now.
        }
    } else {
        store(
            &root_dir,
            &mut root_node,
            pile.as_ref(),
            progress_bar,
            num_workers,
        )?;
    }

    let mut uid_lookup = BTreeMap::new();
    let mut gid_lookup = BTreeMap::new();
    create_uidgid_lookup_tables(&root_node, &mut uid_lookup, &mut gid_lookup)
        .context("Failed to build UID and GID lookup tables :(.")?;

    let chunk_ref = commands::store_tree_node(
        &pile,
        &RootTreeNode {
            name: root_name,
            node: root_node,
        },
    )?;

    let pointer_data = PointerData {
        chunk_ref,
        parent_pointer: parent,
        uid_lookup,
        gid_lookup,
    };

    pile.write_pointer(&new_pointer_name, &pointer_data)?;
    pile.flush()?;
    Ok(())
}

View File

@ -1,353 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::path::PathBuf;
use serde::{Deserialize, Serialize};
use crate::definitions::{ChunkId, PointerData};
use crate::utils::get_number_of_workers;
use crossbeam_channel::Sender;
use std::collections::HashSet;
use std::fmt::Debug;
use std::sync::{Arc, Condvar, Mutex};
pub mod compression;
pub mod encryption;
pub mod integrity;
pub mod local_sqlitebloblogs;
/// Describes a pile: the yama version that last used it, its storage backend and its
/// compression level. Serializable and deserializable via serde.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct PileDescriptor {
/// The last version of yama that was used with this pile.
pub yama_version: String,
/// The storage backend to use.
pub storage: PileStorage,
/// If specified, the compression level of the pile.
pub compression: Option<u16>,
}
/// The storage backends a pile can be configured with (see `PileDescriptor::storage`).
#[derive(Serialize, Deserialize, Debug, Copy, Clone)]
pub enum PileStorage {
/// No local storage. Pile is only usable for remotes.
RemoteOnly,
/// Local storage backed by bloblogs that are indexed by a SQLite database.
SqliteIndexedBloblog,
// Local temporary storage in which chunks are only kept for long enough to send them to
// remotes. Unimplemented at present.
// TODO THIS IS NOT THE CORRECT NAME ANYWAY BarePushSled,
}
/// Describes a remote. Serializable and deserializable via serde.
// NOTE(review): the field meanings below are inferred from their names only; nothing in the
// visible code reads them — confirm against the code that consumes this descriptor.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct RemoteDescriptor {
/// Presumably whether the remote is encrypted.
pub encrypted: bool,
/// Optional host; presumably absent for a remote on the local machine.
pub host: Option<String>,
/// Optional user name; presumably the user to connect to `host` as.
pub user: Option<String>,
/// Path of the remote.
pub path: PathBuf,
}
/// Selects which kind of entry a `RawPile` operation refers to; every `RawPile` read, write,
/// delete, existence check and key listing takes one.
#[derive(PartialOrd, PartialEq, Copy, Clone, Serialize, Deserialize, Eq)]
pub enum Keyspace {
/// Chunk data, keyed by chunk ID (used by `Pile::read_chunk`, `write_chunk`, `chunk_exists`).
Chunk,
/// Presumably a hash stored per chunk for verification; no live code in this file uses it
/// (only commented-out code in `Pile::read_chunk`) — confirm.
ChunkHash,
/// Serialized pointer data, keyed by the bytes of the pointer name (used by
/// `Pile::read_pointer`, `write_pointer`, `delete_pointer`, `list_pointers`).
Pointer,
}
/// Useful information for humans. Doesn't need to be spot on, but kind of interesting.
///
/// Returned (optionally) by `RawPile::debug_statistics`.
#[derive(Debug, Clone)]
pub struct DebugStatistics {
/// Number of chunks.
pub number_of_chunks: u64,
/// Size of the smallest chunk, if known (unit presumably bytes — confirm).
pub minimum_chunk_size: Option<u32>,
/// Size of the largest chunk, if known (unit presumably bytes — confirm).
pub maximum_chunk_size: Option<u32>,
/// Combined size of all chunks (unit presumably bytes — confirm).
pub total_chunk_size: u64,
}
/// Settings passed to `RawPile::build_storage_pipeline`.
#[derive(Debug, Clone)]
pub struct StoragePipelineSettings {
/// Number of compressors to use.
pub num_compressors: u32,
/// Presumably the capacity of the queue feeding the compressors — confirm against the
/// pipeline implementations.
pub compressor_input_bound: u32,
/// Presumably the capacity of the queue feeding the writer — confirm against the pipeline
/// implementations.
pub writer_input_bound: u32,
}
/// Puts an existence-checking stage in front of `next_stage` and returns the sender feeding it.
///
/// A number of threads (given by `get_number_of_workers("YAMA_EXISTENCE_CHECKERS")`) receive
/// `(chunk ID, chunk data)` pairs. A pair is forwarded to `next_stage` only the first time its
/// chunk ID is seen by this stage, and only if the pile does not already contain that chunk.
///
/// # Panics
///
/// Panics if a thread cannot be spawned. The spawned threads themselves panic if the existence
/// check fails or if forwarding to `next_stage` fails.
pub fn existence_checker_stage<RP: RawPile>(
    pile: Arc<Pile<RP>>,
    next_stage: Sender<(ChunkId, Vec<u8>)>,
) -> Sender<(ChunkId, Vec<u8>)> {
    let shared_seen_set: Arc<Mutex<HashSet<ChunkId>>> = Default::default();
    let (tx, rx) = crossbeam_channel::bounded::<(ChunkId, Vec<u8>)>(32);

    // TODO would like something better for the networked case
    for _ in 0..get_number_of_workers("YAMA_EXISTENCE_CHECKERS") {
        let seen = Arc::clone(&shared_seen_set);
        let forward_to = next_stage.clone();
        let incoming = rx.clone();
        let pile = Arc::clone(&pile);

        std::thread::Builder::new()
            .name("yama exist?er".to_string())
            .spawn(move || {
                for (chunk_id, chunk) in incoming.iter() {
                    // TODO handle errors properly
                    // The lock is only held for this one statement.
                    let first_sighting = seen.lock().unwrap().insert(chunk_id);
                    if first_sighting && !pile.chunk_exists(&chunk_id).unwrap() {
                        forward_to.send((chunk_id, chunk)).unwrap();
                    }
                }
            })
            .unwrap();
    }

    tx
}
/// Messages sent on the controller channel that is handed to
/// `RawPile::build_storage_pipeline`.
pub enum ControllerMessage {
/// Reports a failure; presumably sent by a pipeline worker when it hits an error — confirm
/// against the pipeline implementations.
Failure {
/// Identifier of the worker the failure relates to.
worker_id: Arc<String>,
/// Description of the error.
error_message: String,
},
}
/// Low-level key-value storage interface underlying a `Pile`.
///
/// Keys and values are raw bytes, and every operation is scoped to a `Keyspace`. Implementors
/// must be `Send + Sync + Debug + 'static`.
pub trait RawPile: Send + Sync + Debug + 'static {
// TODO expose verification errors?
/// Returns whether `key` is present in the `kind` keyspace.
fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool>;
/// Reads the value stored under `key` in the `kind` keyspace. `Pile::read_pointer` treats
/// `None` as "no such entry".
fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>>;
/// Stores `value` under `key` in the `kind` keyspace.
fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()>;
/// Deletes `key` from the `kind` keyspace.
fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()>;
/// Returns an iterator over the keys of the `kind` keyspace; each item may itself be an error.
fn list_keys(
&self,
kind: Keyspace,
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>>;
/*
fn list_keyvalue_pairs(
&self,
kind: Keyspace,
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<(Vec<u8>, Vec<u8>)>>>>;
*/
/// Flushes the pile (presumably persisting any buffered writes — confirm per implementation).
fn flush(&self) -> anyhow::Result<()>;
// TODO return a progress Receiver
/// Runs a low-level check of the pile. NOTE(review): the meaning of the returned `bool` is
/// not visible here — confirm against the implementations.
fn check_lowlevel(&self) -> anyhow::Result<bool>;
/// Return a few statistics, if possible.
///
/// The default implementation returns `Ok(None)`.
fn debug_statistics(&self) -> anyhow::Result<Option<DebugStatistics>> {
Ok(None)
}
/// Builds a storage pipeline and returns the sender on which `(chunk ID, chunk data)` pairs
/// are submitted to it. `controller_send` is the channel for `ControllerMessage`s.
fn build_storage_pipeline(
&self,
settings: StoragePipelineSettings,
controller_send: Sender<ControllerMessage>,
) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>>;
}
/// Lets a boxed `RawPile` trait object be used wherever a `RawPile` is required, by forwarding
/// every method to the boxed pile.
impl RawPile for Box<dyn RawPile> {
    fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool> {
        (**self).exists(kind, key)
    }

    fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
        (**self).read(kind, key)
    }

    fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
        (**self).write(kind, key, value)
    }

    fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
        (**self).delete(kind, key)
    }

    fn list_keys(
        &self,
        kind: Keyspace,
    ) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>> {
        (**self).list_keys(kind)
    }

    fn flush(&self) -> anyhow::Result<()> {
        (**self).flush()
    }

    fn check_lowlevel(&self) -> anyhow::Result<bool> {
        (**self).check_lowlevel()
    }

    fn debug_statistics(&self) -> anyhow::Result<Option<DebugStatistics>> {
        (**self).debug_statistics()
    }

    fn build_storage_pipeline(
        &self,
        settings: StoragePipelineSettings,
        controller_send: Sender<ControllerMessage>,
    ) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
        (**self).build_storage_pipeline(settings, controller_send)
    }
}
/// Lets a reference-counted `RawPile` be used wherever a `RawPile` is required, by forwarding
/// every method to the shared pile.
impl<RP: RawPile> RawPile for Arc<RP> {
    fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool> {
        (**self).exists(kind, key)
    }

    fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
        (**self).read(kind, key)
    }

    fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
        (**self).write(kind, key, value)
    }

    fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
        (**self).delete(kind, key)
    }

    fn list_keys(
        &self,
        kind: Keyspace,
    ) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>> {
        (**self).list_keys(kind)
    }

    fn flush(&self) -> anyhow::Result<()> {
        (**self).flush()
    }

    fn check_lowlevel(&self) -> anyhow::Result<bool> {
        (**self).check_lowlevel()
    }

    fn debug_statistics(&self) -> anyhow::Result<Option<DebugStatistics>> {
        (**self).debug_statistics()
    }

    fn build_storage_pipeline(
        &self,
        settings: StoragePipelineSettings,
        controller_send: Sender<ControllerMessage>,
    ) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
        (**self).build_storage_pipeline(settings, controller_send)
    }
}
/// A pile: typed chunk and pointer operations layered over a `RawPile`.
#[derive(Debug)]
pub struct Pile<R: RawPile> {
/// The underlying raw storage that all operations are forwarded to.
pub raw_pile: R,
/// Set of chunk IDs that `submit_chunk` inserts into before checking whether the chunk
/// exists; presumably tracks submissions that are in progress — confirm against
/// `submit_chunk`.
pub racy_submission_mutex: Mutex<HashSet<ChunkId>>,
/// Condition variable, presumably paired with `racy_submission_mutex` — confirm against
/// `submit_chunk`.
pub racy_submission_condvar: Condvar,
}
impl<R: RawPile> Pile<R> {
    /// Wraps `raw_pile`, starting with no chunk submissions in flight.
    pub fn new(raw_pile: R) -> Self {
        Pile {
            raw_pile,
            racy_submission_mutex: Mutex::new(Default::default()),
            racy_submission_condvar: Default::default(),
        }
    }

    // TODO(clarity, features): have a special kind of error for verification failures
    //  may be wanted for best-effort restores
    /// Reads the chunk stored under `key`, or `None` if the pile has no such chunk.
    ///
    /// No verification is performed here; the bytes are returned exactly as the underlying
    /// pile yields them.
    pub fn read_chunk(&self, key: &ChunkId) -> anyhow::Result<Option<Vec<u8>>> {
        self.raw_pile.read(Keyspace::Chunk, key)
    }

    /// Writes `value` as the chunk stored under `key`.
    pub fn write_chunk(&self, key: &ChunkId, value: &[u8]) -> anyhow::Result<()> {
        self.raw_pile.write(Keyspace::Chunk, key, value)
    }

    /// Returns whether a chunk is stored under `key`.
    pub fn chunk_exists(&self, key: &ChunkId) -> anyhow::Result<bool> {
        self.raw_pile.exists(Keyspace::Chunk, key)
    }

    /// Reads and deserialises the pointer named `key`, or `None` if there is no such pointer.
    ///
    /// # Errors
    /// Fails if the underlying read fails or the stored bytes are not valid BARE-encoded
    /// pointer data.
    pub fn read_pointer(&self, key: &str) -> anyhow::Result<Option<PointerData>> {
        match self.raw_pile.read(Keyspace::Pointer, key.as_bytes())? {
            Some(pointer_data_raw) => Ok(Some(serde_bare::from_slice(&pointer_data_raw)?)),
            None => Ok(None),
        }
    }

    /// Serialises `pointer` and stores it under the name `key`.
    pub fn write_pointer(&self, key: &str, pointer: &PointerData) -> anyhow::Result<()> {
        self.raw_pile.write(
            Keyspace::Pointer,
            key.as_bytes(),
            &serde_bare::to_vec(pointer)?,
        )
    }

    /// Deletes the pointer named `key`.
    pub fn delete_pointer(&self, key: &str) -> anyhow::Result<()> {
        self.raw_pile.delete(Keyspace::Pointer, key.as_bytes())
    }

    /// Lists the names of all pointers in the pile.
    ///
    /// # Errors
    /// Fails if listing fails or if any pointer key is not valid UTF-8.
    pub fn list_pointers(&self) -> anyhow::Result<Vec<String>> {
        self.raw_pile
            .list_keys(Keyspace::Pointer)?
            .map(|key| -> anyhow::Result<String> { Ok(String::from_utf8(key?)?) })
            .collect()
    }

    /// Stores `chunk_data` under `chunk_id` unless the chunk already exists.
    ///
    /// If another thread is already submitting the same chunk ID, this call does not write
    /// anything; it blocks until that other submission has finished and then returns `Ok`.
    ///
    /// # Errors
    /// Returns the error from the existence check or the write. The chunk ID is removed from
    /// the in-flight set and waiters are woken even on failure, so a failed submission can
    /// never leave other threads blocked forever.
    /// NOTE(review): a waiter still returns `Ok` when the submission it waited for failed;
    /// callers presumably treat the submitter's error as fatal — confirm.
    pub fn submit_chunk(&self, chunk_id: ChunkId, chunk_data: &[u8]) -> anyhow::Result<()> {
        let mut in_flight = self.racy_submission_mutex.lock().unwrap();
        if !in_flight.insert(chunk_id) {
            // Somebody else is submitting this very chunk: wait until they are done.
            while in_flight.contains(&chunk_id) {
                in_flight = self.racy_submission_condvar.wait(in_flight).unwrap();
            }
            return Ok(());
        }
        // Don't hold the lock across the (slow) storage operations.
        drop(in_flight);

        let outcome = self.chunk_exists(&chunk_id).and_then(|exists| {
            if exists {
                Ok(())
            } else {
                self.write_chunk(&chunk_id, chunk_data)
            }
        });

        // Always clear the in-flight marker, whether or not the write succeeded, and
        // wake up anyone who might be waiting for this chunk.
        let mut in_flight = self.racy_submission_mutex.lock().unwrap();
        in_flight.remove(&chunk_id);
        self.racy_submission_condvar.notify_all();
        drop(in_flight);

        outcome
    }

    /// Flushes buffered writes. Should really run this before exiting, so I can sleep better at
    /// night (rather than relying on the destructor).
    pub fn flush(&self) -> anyhow::Result<()> {
        self.raw_pile.flush()
    }
}

View File

@ -1,333 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::sync::Arc;
use std::thread;
use std::thread::JoinHandle;
use anyhow::anyhow;
use crossbeam_channel::{Receiver, Sender};
use derivative::Derivative;
use log::error;
use metrics::{register_counter, Unit};
use zstd::block::{Compressor, Decompressor};
use crate::definitions::ChunkId;
use crate::pile::{ControllerMessage, DebugStatistics, Keyspace, RawPile, StoragePipelineSettings};
/// Output capacity, in bytes (32 MiB), handed to the Zstd decompressor for every value that is
/// decompressed.
/// NOTE(review): presumably values that would decompress to more than this fail to
/// decompress — confirm against the zstd crate.
pub const DECOMPRESS_CAPACITY: usize = 32 * 1024 * 1024;
#[derive(Clone, Debug)]
pub struct CompressionSettings {
/// Raw dictionary to pass to Zstd for compression and decompression
pub dictionary: Arc<Vec<u8>>,
/// The compression level, passed to Zstd.
pub level: i32,
/// The number of compressor threads to use.
pub num_compressors: u32,
/// The number of decompressor threads to use.
pub num_decompressors: u32,
}
/// A [`RawPile`] layer that Zstd-compresses values before writing them to `underlying` and
/// decompresses them again when they are read back.
#[derive(Debug, Derivative)]
#[derivative(Clone(bound = ""))]
// we need to use derivative's Clone impl because Arc<R> causes R to have a bound on Clone
// even though that's not needed. https://github.com/rust-lang/rust/issues/26925
pub struct RawPileCompressor<R: RawPile> {
/// The pile that stores the compressed values.
underlying: Arc<R>,
/// Job queue of the compressor worker threads: each job is the input bytes plus a channel on
/// which the result is sent back. `None` when the pile was built without any workers.
compressor: Option<Sender<(Vec<u8>, Sender<Vec<u8>>)>>,
/// Job queue of the decompressor worker threads, in the same format as `compressor`.
/// `None` when the pile was built without any workers.
decompressor: Option<Sender<(Vec<u8>, Sender<Vec<u8>>)>>,
/// Dictionary, level and thread counts in use.
settings: Arc<CompressionSettings>,
}
impl<R: RawPile> RawPileCompressor<R> {
/// Builds a compressing pile on top of `underlying` and starts its worker threads.
///
/// Registers the global compressor/decompressor metrics, then spawns
/// `settings.num_compressors` compressor threads and `settings.num_decompressors`
/// decompressor threads. Returns the pile together with the join handles of all spawned
/// threads.
///
/// When both thread counts are zero, no channels or threads are created at all and the
/// returned handle list is empty.
///
/// # Errors
/// Fails if a worker thread cannot be spawned.
pub fn new(
    underlying: R,
    settings: CompressionSettings,
) -> anyhow::Result<(Self, Vec<JoinHandle<()>>)> {
    register_counter!(
        "compressor_in_bytes",
        Unit::Bytes,
        "Number of bytes that have been fed into the compressor"
    );
    register_counter!(
        "compressor_out_bytes",
        Unit::Bytes,
        "Number of bytes that have come out of the compressor"
    );
    register_counter!(
        "compressor_chunks",
        Unit::Count,
        "Number of chunks that have been compressed"
    );
    register_counter!(
        "decompressor_in_bytes",
        Unit::Bytes,
        "Number of bytes that have been fed into the decompressor"
    );
    register_counter!(
        "decompressor_out_bytes",
        Unit::Bytes,
        "Number of bytes that have come out of the decompressor"
    );
    register_counter!(
        "decompressor_chunks",
        Unit::Count,
        "Number of chunks that have been decompressed"
    );

    let wants_no_workers = settings.num_compressors == 0 && settings.num_decompressors == 0;
    if wants_no_workers {
        // Optimisation for when we're only building a pipeline: we don't want to set up
        // job channels or spawn any worker threads in that case.
        let pile = RawPileCompressor {
            underlying: Arc::new(underlying),
            compressor: None,
            decompressor: None,
            settings: Arc::new(settings),
        };
        return Ok((pile, Vec::new()));
    }

    let (com_s, com_r) = crossbeam_channel::bounded(4);
    let (dec_s, dec_r) = crossbeam_channel::bounded(4);
    let mut handles = Vec::new();

    for worker in 0..settings.num_compressors {
        let worker_settings = settings.clone();
        let jobs = com_r.clone();
        let handle = thread::Builder::new()
            .name(format!("yama compressor {}", worker))
            .spawn(move || {
                if let Err(e) = Self::compressor_worker(jobs, worker_settings) {
                    error!("compressor worker failed: {:?}", e);
                }
            })?;
        handles.push(handle);
    }

    for worker in 0..settings.num_decompressors {
        let worker_settings = settings.clone();
        let jobs = dec_r.clone();
        let handle = thread::Builder::new()
            .name(format!("yama decompressor {}", worker))
            .spawn(move || {
                if let Err(e) = Self::decompressor_worker(jobs, worker_settings) {
                    error!("decompressor worker failed: {:?}", e);
                }
            })?;
        handles.push(handle);
    }

    let pile = RawPileCompressor {
        underlying: Arc::new(underlying),
        compressor: Some(com_s),
        decompressor: Some(dec_s),
        settings: Arc::new(settings),
    };
    Ok((pile, handles))
}
/// Body of a compressor thread: takes jobs off `queue` until the queue is disconnected,
/// compresses each one with the configured dictionary and level, and sends the result back
/// on the job's response channel.
///
/// # Errors
/// Stops at the first compression failure or the first response that cannot be delivered.
fn compressor_worker(
    queue: Receiver<(Vec<u8>, Sender<Vec<u8>>)>,
    settings: CompressionSettings,
) -> anyhow::Result<()> {
    let dictionary = settings.dictionary.as_ref().clone();
    let mut compressor = Compressor::with_dict(dictionary);
    for (job, response_sender) in queue.iter() {
        let compressed = compressor.compress(&job, settings.level)?;
        if response_sender.send(compressed).is_err() {
            return Err(anyhow!("Couldn't send compression result"));
        }
    }
    Ok(())
}
/// Body of a decompressor thread: takes jobs off `queue` until the queue is disconnected,
/// decompresses each one with the configured dictionary (output capacity
/// `DECOMPRESS_CAPACITY`), and sends the result back on the job's response channel.
///
/// # Errors
/// Stops at the first decompression failure or the first response that cannot be delivered.
fn decompressor_worker(
    queue: Receiver<(Vec<u8>, Sender<Vec<u8>>)>,
    settings: CompressionSettings,
) -> anyhow::Result<()> {
    let dictionary = settings.dictionary.as_ref().clone();
    let mut decompressor = Decompressor::with_dict(dictionary);
    for (job, response_sender) in queue.iter() {
        let decompressed = decompressor.decompress(&job, DECOMPRESS_CAPACITY)?;
        if response_sender.send(decompressed).is_err() {
            return Err(anyhow!("Couldn't send decompression result"));
        }
    }
    Ok(())
}
/// Decompresses `data` by handing it to one of the decompressor worker threads and waiting
/// for the result.
///
/// # Errors
/// Fails (rather than panicking) if this pile was built without decompressor workers, if the
/// job cannot be queued, or if no result comes back (e.g. the worker failed on this input).
fn decompress(&self, data: &[u8]) -> anyhow::Result<Vec<u8>> {
    let decompressor = self
        .decompressor
        .as_ref()
        .ok_or_else(|| anyhow!("No decompressors configured"))?;
    // Rendezvous channel: the worker hands the result straight back to us.
    let (ret_s, ret_r) = crossbeam_channel::bounded(0);
    decompressor
        .send((data.to_vec(), ret_s))
        .map_err(|_| anyhow!("couldn't send to decompressor"))?;
    ret_r
        .recv()
        .map_err(|_| anyhow!("couldn't receive result"))
}
/// Compresses the given bytes by handing them to one of the compressor worker threads and
/// waiting for the result.
///
/// Despite its name, `compressed_data` is the *uncompressed* input; the compressed bytes are
/// the return value.
///
/// # Errors
/// Fails (rather than panicking) if this pile was built without compressor workers, if the
/// job cannot be queued, or if no result comes back (e.g. the worker failed on this input).
fn compress(&self, compressed_data: &[u8]) -> anyhow::Result<Vec<u8>> {
    let compressor = self
        .compressor
        .as_ref()
        .ok_or_else(|| anyhow!("No compressors configured"))?;
    // Rendezvous channel: the worker hands the result straight back to us.
    let (ret_s, ret_r) = crossbeam_channel::bounded(0);
    compressor
        .send((compressed_data.to_vec(), ret_s))
        .map_err(|_| anyhow!("couldn't send to compressor"))?;
    ret_r
        .recv()
        .map_err(|_| anyhow!("couldn't receive result"))
}
/// Body of a storage-pipeline compressor thread.
///
/// Receives `(chunk ID, bytes)` pairs from `input` until it is disconnected, compresses the
/// bytes with this pile's dictionary and level, and forwards the pair to `next_stage`.
/// Per-worker metrics (labelled `id => worker_id`) and the global compressor metrics are
/// updated for every chunk.
///
/// # Errors
/// Stops at the first compression failure or when `next_stage` has been disconnected.
fn storage_pipeline_worker(
    &self,
    next_stage: Sender<(ChunkId, Vec<u8>)>,
    input: Receiver<(ChunkId, Vec<u8>)>,
    worker_id: String,
) -> anyhow::Result<()> {
    // the worker ID has to live forever, so we leak it :/
    let worker_id: &'static str = Box::leak(worker_id.into_boxed_str());
    // NOTE(review): this histogram is registered but nothing in this function records it.
    metrics::register_histogram!(
        "compressor_idle_time",
        metrics::Unit::Seconds,
        "Time spent waiting between chunks",
        "id" => worker_id
    );
    metrics::register_counter!(
        "compressor_bytes_input",
        metrics::Unit::Bytes,
        "Number of bytes input into the compressor.",
        "id" => worker_id
    );
    metrics::register_counter!(
        "compressor_bytes_output",
        metrics::Unit::Bytes,
        "Number of bytes output from the compressor.",
        "id" => worker_id
    );
    metrics::register_counter!(
        "compressor_chunks_processed",
        metrics::Unit::Count,
        "Number of chunks processed by the compressor.",
        "id" => worker_id
    );

    let mut compressor = Compressor::with_dict(self.settings.dictionary.as_ref().clone());
    let level = self.settings.level;

    while let Ok((chunk_id, bytes)) = input.recv() {
        let in_bytes = bytes.len();
        let bytes = compressor.compress(&bytes, level)?;
        let out_bytes = bytes.len();

        next_stage.send((chunk_id, bytes))?;

        // Per-worker metrics
        // TODO rename
        metrics::counter!("compressor_bytes_input", in_bytes as u64, "id" => worker_id);
        metrics::counter!("compressor_bytes_output", out_bytes as u64, "id" => worker_id);
        metrics::counter!("compressor_chunks_processed", 1, "id" => worker_id);

        // Global metrics
        metrics::counter!("compressor_in_bytes", in_bytes as u64);
        metrics::counter!("compressor_out_bytes", out_bytes as u64);
        metrics::increment_counter!("compressor_chunks");
    }
    Ok(())
}
}
/// Values are compressed on `write` and decompressed on `read`, in every keyspace; all other
/// operations are forwarded untouched to the underlying pile.
impl<R: RawPile> RawPile for RawPileCompressor<R> {
    fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool> {
        self.underlying.exists(kind, key)
    }

    fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
        match self.underlying.read(kind, key)? {
            Some(data) => Ok(Some(self.decompress(&data)?)),
            None => Ok(None),
        }
    }

    fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
        let compressed = self.compress(value)?;
        self.underlying.write(kind, key, &compressed)
    }

    fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
        self.underlying.delete(kind, key)
    }

    fn list_keys(
        &self,
        kind: Keyspace,
    ) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>> {
        self.underlying.list_keys(kind)
    }

    fn flush(&self) -> anyhow::Result<()> {
        self.underlying.flush()
    }

    fn check_lowlevel(&self) -> anyhow::Result<bool> {
        self.underlying.check_lowlevel()
    }

    fn debug_statistics(&self) -> anyhow::Result<Option<DebugStatistics>> {
        self.underlying.debug_statistics()
    }

    /// Builds the underlying pipeline, then puts `settings.num_compressors` compressor threads
    /// in front of it. The threads share one bounded input queue of
    /// `settings.compressor_input_bound` entries, whose sending end is returned.
    ///
    /// A worker that fails reports a `ControllerMessage::Failure` on `controller_send`.
    ///
    /// # Errors
    /// Fails if the underlying pipeline cannot be built or a worker thread cannot be spawned.
    /// In the latter case the input queue is dropped on return, which disconnects it and lets
    /// any workers that were already started finish.
    fn build_storage_pipeline(
        &self,
        settings: StoragePipelineSettings,
        controller_send: Sender<ControllerMessage>,
    ) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
        // this one should have a few threads behind it! yarr!
        let subsequent_pipeline = self
            .underlying
            .build_storage_pipeline(settings.clone(), controller_send.clone())?;

        let (input_to_this_stage, receiver) =
            crossbeam_channel::bounded(settings.compressor_input_bound as usize);

        for compressor_number in 0..settings.num_compressors {
            let subsequent_pipeline = subsequent_pipeline.clone();
            let receiver = receiver.clone();
            let controller_send = controller_send.clone();
            let this = self.clone();
            thread::Builder::new()
                .name(format!("yama Pcomp{}", compressor_number))
                .spawn(move || {
                    let worker_id = Arc::new(format!("compressor-{}", compressor_number));
                    if let Err(err) = this.storage_pipeline_worker(
                        subsequent_pipeline,
                        receiver,
                        worker_id.to_string(),
                    ) {
                        controller_send
                            .send(ControllerMessage::Failure {
                                worker_id,
                                error_message: format!("err {:?}", err),
                            })
                            .expect("This is BAD: failed to send failure message to controller.");
                    }
                })?;
        }

        Ok(input_to_this_stage)
    }
}

View File

@ -1,122 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use anyhow::anyhow;
use log::warn;
use sodiumoxide::crypto::secretbox;
use sodiumoxide::crypto::secretbox::{Key, Nonce, NONCEBYTES};
use crate::definitions::ChunkId;
use crate::pile::{ControllerMessage, Keyspace, RawPile, StoragePipelineSettings};
use crossbeam_channel::Sender;
/// A RawPile that provides encryption of chunk contents.
/// Please note that keys are not currently encrypted, so this scheme is not CPA-secure.
/// It seems easily possible to test the pile for inclusion of a known file (by first chunking it and
/// looking for matching chunk IDs).
/// Use of compression with a custom Zstd dictionary may make that harder but in general it seems
/// dubious to rely on that.
/// This feature will be revisited soon...
/// Notably, keys should be passed through a secure permutation first.
#[derive(Debug)]
pub struct RawPileEncryptor<R: RawPile> {
/// The pile that stores the encrypted values (under their unencrypted keys).
underlying: R,
/// The secretbox key used to seal and open every value.
secret_key: Key,
}
impl<R: RawPile> RawPileEncryptor<R> {
pub fn new(underlying: R, key: Key) -> Self {
warn!(
"WARNING! Encrypted RawPiles are not CPA secure. Do not rely on them for security yet!"
);
RawPileEncryptor {
underlying,
secret_key: key,
}
}
fn decrypt(&self, kind: Keyspace, key: &[u8], data: &[u8]) -> anyhow::Result<Vec<u8>> {
Ok(if kind == Keyspace::Chunk {
let mut nonce = [0u8; NONCEBYTES];
nonce[0..key.len()].copy_from_slice(key);
secretbox::open(data, &Nonce(nonce), &self.secret_key)
.or(Err(anyhow!("Failed to decrypt")))?
} else {
let mut nonce = [0u8; NONCEBYTES];
nonce.copy_from_slice(&data[0..NONCEBYTES]);
secretbox::open(&data[NONCEBYTES..], &Nonce(nonce), &self.secret_key)
.or(Err(anyhow!("Failed to decrypt")))?
})
}
fn encrypt(&self, kind: Keyspace, key: &[u8], data: &[u8]) -> Vec<u8> {
if kind == Keyspace::Chunk {
let mut nonce = [0u8; NONCEBYTES];
nonce[0..key.len()].copy_from_slice(key);
secretbox::seal(data, &Nonce(nonce), &self.secret_key)
} else {
let nonce = secretbox::gen_nonce();
let mut out = Vec::new();
out.extend_from_slice(&nonce.0);
out.extend_from_slice(&secretbox::seal(data, &nonce, &self.secret_key));
out
}
}
}
/// Values are encrypted on `write` and decrypted on `read`; keys are passed through as they
/// are, and every other operation is forwarded to the underlying pile.
impl<R: RawPile> RawPile for RawPileEncryptor<R> {
    fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool> {
        self.underlying.exists(kind, key)
    }

    fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
        let stored = self.underlying.read(kind, key)?;
        match stored {
            None => Ok(None),
            Some(ciphertext) => {
                let plaintext = self.decrypt(kind, key, &ciphertext)?;
                Ok(Some(plaintext))
            }
        }
    }

    fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
        self.underlying
            .write(kind, key, &self.encrypt(kind, key, value))
    }

    fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
        self.underlying.delete(kind, key)
    }

    fn list_keys(
        &self,
        kind: Keyspace,
    ) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>> {
        self.underlying.list_keys(kind)
    }

    fn flush(&self) -> anyhow::Result<()> {
        self.underlying.flush()
    }

    fn check_lowlevel(&self) -> anyhow::Result<bool> {
        self.underlying.check_lowlevel()
    }

    /// Not implemented yet: calling this panics.
    fn build_storage_pipeline(
        &self,
        _settings: StoragePipelineSettings,
        _controller_send: Sender<ControllerMessage>,
    ) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
        todo!()
    }
}

View File

@ -1,143 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::hash::Hasher;
use thiserror::Error;
use crate::definitions::{ChunkId, XXH64_SEED};
use crate::pile::{ControllerMessage, DebugStatistics, Keyspace, RawPile, StoragePipelineSettings};
use crate::utils::bytes_to_hexstring;
use crossbeam_channel::Sender;
/// This RawPile enables checking the integrity of stored chunks.
/// This is done by storing a hash along with the chunk contents, which can later be verified.
#[derive(Debug)]
pub struct RawPileIntegrityChecker<RP: RawPile> {
/// The pile that stores each value with its hash appended.
underlying: RP,
}
impl<RP: RawPile> RawPileIntegrityChecker<RP> {
    /// Wraps `underlying` so that values are stored with a hash that is checked on read.
    pub fn new(underlying: RP) -> Self {
        Self { underlying }
    }
}
/// Raised when the hash stored alongside a value does not match the hash computed over the
/// value's contents. All three fields hold hexadecimal strings.
#[derive(Error, Debug)]
#[error("Integrity error for chunk {chunk_id}; expected XXHash {expected_hash} but computed {computed_hash}!")]
pub struct IntegrityError {
/// Key of the affected value.
pub chunk_id: String,
/// The hash that was stored with the value.
pub expected_hash: String,
/// The hash computed over the data that was actually read.
pub computed_hash: String,
}
/// Every stored value is laid out as `<data><8-byte big-endian XXH64 of data>`. The hash is
/// appended on `write` (and in the storage pipeline) and verified and stripped on `read`.
impl<RP: RawPile> RawPile for RawPileIntegrityChecker<RP> {
    fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool> {
        self.underlying.exists(kind, key)
    }

    /// Reads the value under `key`, verifies its trailing hash and returns the data without
    /// the hash.
    ///
    /// # Errors
    /// Returns an [`IntegrityError`] if the hash does not match, and a plain error if the
    /// stored value is too short to even contain a hash (previously this case panicked).
    fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
        /// Length in bytes of the XXH64 trailer.
        const XXHASH_LEN: usize = 8;

        let mut data_then_hash = match self.underlying.read(kind, key)? {
            None => return Ok(None),
            Some(stored) => stored,
        };

        let data_len = match data_then_hash.len().checked_sub(XXHASH_LEN) {
            Some(data_len) => data_len,
            None => anyhow::bail!(
                "Integrity error for chunk {}; stored value is only {} bytes long, too short to contain an XXHash",
                bytes_to_hexstring(key),
                data_then_hash.len()
            ),
        };

        let (data_only, xxhash) = data_then_hash.split_at(data_len);

        let mut hasher = twox_hash::XxHash64::with_seed(XXH64_SEED);
        hasher.write(data_only);
        let computed_hash = hasher.finish().to_be_bytes();

        if computed_hash != xxhash {
            return Err(IntegrityError {
                chunk_id: bytes_to_hexstring(key),
                expected_hash: bytes_to_hexstring(xxhash),
                computed_hash: bytes_to_hexstring(&computed_hash),
            }
            .into());
        }

        // remove hash from end
        data_then_hash.truncate(data_len);
        Ok(Some(data_then_hash))
    }

    /// Stores `value` followed by its hash.
    fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
        let mut hasher = twox_hash::XxHash64::with_seed(XXH64_SEED);
        hasher.write(value);
        let computed_hash = hasher.finish().to_be_bytes();

        // start with the data, then append the hash
        let mut buf = Vec::with_capacity(value.len() + computed_hash.len());
        buf.extend_from_slice(value);
        buf.extend_from_slice(&computed_hash);

        self.underlying.write(kind, key, &buf)
    }

    fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
        self.underlying.delete(kind, key)
    }

    fn list_keys(
        &self,
        kind: Keyspace,
    ) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>> {
        self.underlying.list_keys(kind)
    }

    fn flush(&self) -> anyhow::Result<()> {
        self.underlying.flush()
    }

    fn check_lowlevel(&self) -> anyhow::Result<bool> {
        // TODO integrity check ...?
        self.underlying.check_lowlevel()
    }

    fn debug_statistics(&self) -> anyhow::Result<Option<DebugStatistics>> {
        self.underlying.debug_statistics()
    }

    /// Builds the underlying pipeline and puts a single hashing thread in front of it, fed by
    /// a bounded queue of 64 entries whose sending end is returned.
    ///
    /// If the next stage goes away, the thread reports a `ControllerMessage::Failure` on
    /// `controller_send` and stops, in the same way the compressor stage reports failures.
    ///
    /// # Errors
    /// Fails if the underlying pipeline cannot be built or the thread cannot be spawned.
    fn build_storage_pipeline(
        &self,
        settings: StoragePipelineSettings,
        controller_send: Sender<ControllerMessage>,
    ) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
        // TODO primitive implementation but good enough for now.
        //  May want metrics later?
        let next_stage = self
            .underlying
            .build_storage_pipeline(settings, controller_send.clone())?;
        let (input, receiver) = crossbeam_channel::bounded::<(ChunkId, Vec<u8>)>(64);
        std::thread::Builder::new()
            .name("yama integrity".to_string())
            .spawn(move || {
                while let Ok((chunk_id, mut chunk)) = receiver.recv() {
                    let mut hasher = twox_hash::XxHash64::with_seed(XXH64_SEED);
                    hasher.write(&chunk);
                    let computed_hash = hasher.finish().to_be_bytes();
                    chunk.extend_from_slice(&computed_hash);
                    if let Err(err) = next_stage.send((chunk_id, chunk)) {
                        controller_send
                            .send(ControllerMessage::Failure {
                                worker_id: std::sync::Arc::new("integrity".to_string()),
                                error_message: format!("err {:?}", err),
                            })
                            .expect("This is BAD: failed to send failure message to controller.");
                        break;
                    }
                }
            })?;
        Ok(input)
    }
}

View File

@ -1,796 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::collections::hash_map::Entry;
use std::collections::{HashMap, VecDeque};
use std::convert::{TryFrom, TryInto};
use std::fs::{read_dir, File, OpenOptions};
use std::io::{Read, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf};
use std::sync::{Arc, Condvar, Mutex};
use std::{fs, thread};
use anyhow::{bail, Context};
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use log::{info, warn};
use nix::unistd::sync;
use rusqlite::{params, Error, ErrorCode};
use rusqlite::{Connection, OptionalExtension};
use crate::definitions::ChunkId;
use crate::pile::{ControllerMessage, DebugStatistics, Keyspace, RawPile, StoragePipelineSettings};
use crate::utils::bytes_to_hexstring;
use crossbeam_channel::{Receiver, Sender};
use rusqlite::ffi::ErrorCode::ConstraintViolation;
use std::time::Duration;
/// Bloblogs will not be reused if they are already 2 GiB large.
pub const MAX_BLOBLOG_REUSE_SIZE: u64 = 2 * 1024 * 1024 * 1024;
/// This many pointers will be batched up for writing.
pub const POINTER_WRITE_BATCHES: usize = 2048;
/// A file storing a log of blobs.
/// Format:
/// Repeated:
/// <32 byte ChunkId><u32: length><length × u8: data>
#[derive(Debug)]
pub struct Bloblog {
pub file: File,
}
impl Bloblog {
pub fn open(path: &Path) -> anyhow::Result<Bloblog> {
let file = OpenOptions::new()
.read(true)
.write(true)
.create(true)
.truncate(false)
.open(path)
.with_context(|| format!("Trying to open bloblog: {:?}", path))?;
Ok(Bloblog { file })
}
pub fn blob_len(&mut self, offset: u64, chunk_id: &ChunkId) -> anyhow::Result<u32> {
self.file.seek(SeekFrom::Start(offset))?;
let mut chunk_id_verif_buf: ChunkId = Default::default();
self.file.read_exact(&mut chunk_id_verif_buf)?;
if &chunk_id_verif_buf != chunk_id {
bail!(
"ChunkId of blob did not match; read {} wanted {}",
bytes_to_hexstring(&chunk_id_verif_buf),
bytes_to_hexstring(chunk_id)
);
}
Ok(self.file.read_u32::<BigEndian>()?)
}
pub fn read_blob(
&mut self,
offset: u64,
chunk_id: &ChunkId,
buf: &mut Vec<u8>,
) -> anyhow::Result<()> {
buf.clear();
self.file.seek(SeekFrom::Start(offset))?;
let mut chunk_id_verif_buf: ChunkId = Default::default();
self.file.read_exact(&mut chunk_id_verif_buf)?;
if &chunk_id_verif_buf != chunk_id {
bail!(
"ChunkId of blob did not match; read {} wanted {}",
bytes_to_hexstring(&chunk_id_verif_buf),
bytes_to_hexstring(chunk_id)
);
}
let blob_length = self.file.read_u32::<BigEndian>()?;
let start = buf.len();
for _ in 0..blob_length {
// add enough space on the end of the buffer to read the data
buf.push(0);
}
self.file.read_exact(&mut buf[start..])?;
Ok(())
}
/// Writes a blob to the end of this bloblog. Returns the offset of the blob, as needed to read
/// it with read_blob.
pub fn write_blob(&mut self, chunk_id: &ChunkId, data: &[u8]) -> anyhow::Result<u64> {
let position = self.file.seek(SeekFrom::End(0))?;
self.file.write_all(chunk_id)?;
self.file.write_u32::<BigEndian>(data.len().try_into()?)?;
self.file.write_all(data)?;
Ok(position)
}
pub fn filesize(&mut self) -> anyhow::Result<u64> {
Ok(self.file.seek(SeekFrom::End(0))?)
}
}
pub type BloblogId = u32;
/// The mutable state of a `SqliteBloblogPile`, which the pile keeps behind a mutex.
#[derive(Debug)]
pub struct Inner {
/// The next bloblog ID to try when a writer needs a fresh bloblog.
next_bloblog_id: BloblogId,
/// Pool of bloblog IDs that writers can pick up to write to.
writer_bloblogs: Vec<BloblogId>,
/// Bloblogs that have been opened so far, by ID.
open_bloblogs: HashMap<BloblogId, Arc<Mutex<Bloblog>>>, // TODO want an LRU cache with a weak hashmap...?
/// Connection to the SQLite index database.
connection: Connection,
/// Number of writers that have taken a bloblog and not yet returned it.
writers_in_progress: u16,
// We batch up pointer writes because sync() performance really hurts us if we do them one by
// one.
/// Chunk pointers waiting to be inserted into the index by `flush`.
queued_pointer_writes: HashMap<ChunkId, BloblogPointer>,
}
impl Inner {
pub fn raw_put_chunk_pointer(
&self,
chunk_id: &ChunkId,
bloblog: BloblogId,
offset_i64: i64,
) -> anyhow::Result<()> {
match self.connection.execute(
"INSERT INTO chunks (chunk_id, bloblog, offset) VALUES (?1, ?2, ?3)",
params![&chunk_id[..], bloblog, offset_i64],
) {
Ok(_) => Ok(()),
Err(Error::SqliteFailure(e, str)) => {
if e.code == ConstraintViolation {
warn!(
"(ignoring) SQLite constraint violation on insertion... {:?}",
str
);
Ok(())
} else {
Err(Error::SqliteFailure(e, str))?;
unreachable!();
}
}
other => {
other?;
unreachable!();
}
}
}
pub fn flush(&mut self) -> anyhow::Result<()> {
// Create a non-allocated hashmap to satisfy borrow checker, then swap it in and out
let mut queued_pointer_writes = HashMap::with_capacity(0);
std::mem::swap(&mut self.queued_pointer_writes, &mut queued_pointer_writes);
for (chunk_id, pointer) in queued_pointer_writes.drain() {
let offset_i64 =
i64::try_from(pointer.offset).expect("ouch! can't turn u64 into i64...");
self.raw_put_chunk_pointer(&chunk_id, pointer.bloblog, offset_i64)?;
}
std::mem::swap(&mut self.queued_pointer_writes, &mut queued_pointer_writes);
Ok(())
}
}
/// A Pile built on the idea of SQLite-indexed 'blob logs'.
/// 'Blob logs' are append-only binary files which contain simple concatenations of chunks (with a
/// small header). This format is very dense but does not inherently provide random access.
/// Granularity of deletes is also impacted by this structure, so vacuuming steps may involve needing
/// to re-write bloblogs to remove deleted chunks.
/// Because random access is important for performance, an additional SQLite database is used
/// as a map from chunk IDs to their positions in the blob logs, allowing readers to seek to the
/// appropriate place and read a chunk randomly.
#[derive(Clone, Debug)]
pub struct SqliteBloblogPile {
/// Shared mutable state: the SQLite connection, the open bloblogs and the write queue.
inner: Arc<Mutex<Inner>>,
/// The `bloblog` directory, which holds `index.db` and the bloblog files (named by their ID).
path: PathBuf,
/// Notified when the number of writers in progress drops to zero.
writers_reach_zero: Arc<Condvar>,
/// Set to `true` by `open`.
/// NOTE(review): presumably selects between batched and immediate pointer writes; its use is
/// not visible here — confirm.
should_batch_pointer_writes: bool,
}
/// A pointer to a blob in a 'blob log'.
#[derive(Debug)]
pub struct BloblogPointer {
/// Which blob log the blob is stored in.
bloblog: BloblogId,
/// The seek offset at which the blob is located in the log.
offset: u64,
}
impl SqliteBloblogPile {
/// Opens the bloblog pile kept in the `bloblog` subdirectory of `path`.
///
/// If that subdirectory does not exist yet, the pile is treated as new: the directory is
/// created and the `chunks`, `deleted` and `pointers` tables are created in `index.db`.
///
/// # Errors
/// Fails if the directory cannot be created, or the database cannot be opened or initialised.
pub fn open(path: &Path) -> anyhow::Result<SqliteBloblogPile> {
let path = &path.join("bloblog");
// The directory's existence is the only thing used to decide whether the schema must be
// created; the database file itself is not inspected.
let is_new = !path.is_dir();
if is_new {
fs::create_dir(path)?;
}
let connection = Connection::open(path.join("index.db"))?;
if is_new {
connection.execute_batch(
"CREATE TABLE chunks (
chunk_id BLOB PRIMARY KEY,
bloblog INT NOT NULL,
offset INT NOT NULL
) WITHOUT ROWID;
CREATE TABLE deleted (
bloblog INT NOT NULL,
offset INT NOT NULL,
size INT NOT NULL,
PRIMARY KEY (bloblog, offset)
) WITHOUT ROWID;
CREATE TABLE pointers (
key BLOB PRIMARY KEY,
data BLOB NOT NULL
);",
)?;
}
Ok(SqliteBloblogPile {
inner: Arc::new(Mutex::new(Inner {
// Bloblog IDs are probed upwards from 0; IDs whose file already exists are skipped when
// a fresh bloblog is picked for writing.
next_bloblog_id: 0,
writer_bloblogs: Vec::new(),
open_bloblogs: HashMap::new(),
connection,
writers_in_progress: 0,
queued_pointer_writes: Default::default(),
})),
path: path.to_owned(),
writers_reach_zero: Default::default(),
should_batch_pointer_writes: true,
})
}
/// Returns the bloblog with the given ID, opening its file (named after the ID, inside the
/// pile directory) and remembering it if it has not been opened before.
fn open_bloblog(&self, bloblog_id: BloblogId) -> anyhow::Result<Arc<Mutex<Bloblog>>> {
    let mut inner = self.inner.lock().unwrap();

    let already_open = inner.open_bloblogs.get(&bloblog_id).cloned();
    if let Some(bloblog) = already_open {
        return Ok(bloblog);
    }

    let bloblog_path = self.path.join(bloblog_id.to_string());
    let bloblog = Arc::new(Mutex::new(Bloblog::open(&bloblog_path)?));
    inner.open_bloblogs.insert(bloblog_id, bloblog.clone());
    Ok(bloblog)
}
/// Hands out a bloblog to write to and counts the caller as a writer in progress.
///
/// A bloblog ID is taken from the pool of returned writer bloblogs if there is one;
/// otherwise the ID counter is advanced until it reaches an ID for which no file exists yet.
/// The bloblog is opened (and remembered) if it is not open already.
///
/// The bloblog should be given back with `return_writing_bloblog` when the caller is done
/// with it.
fn get_writing_bloblog(&self) -> anyhow::Result<(BloblogId, Arc<Mutex<Bloblog>>)> {
    let mut inner = self.inner.lock().unwrap();

    let pooled = inner.writer_bloblogs.pop();
    let writing_bloblog_id: BloblogId = match pooled {
        Some(id) => id,
        None => loop {
            let candidate = inner.next_bloblog_id;
            inner.next_bloblog_id += 1;
            // only start writing here if it doesn't already exist!
            if !self.path.join(candidate.to_string()).exists() {
                break candidate;
            }
        },
    };

    let already_open = inner.open_bloblogs.get(&writing_bloblog_id).cloned();
    let bloblog = match already_open {
        Some(bloblog) => bloblog,
        None => {
            let bloblog_path = self.path.join(writing_bloblog_id.to_string());
            let bloblog = Arc::new(Mutex::new(Bloblog::open(&bloblog_path)?));
            inner
                .open_bloblogs
                .insert(writing_bloblog_id, bloblog.clone());
            bloblog
        }
    };

    // Only count the writer once the bloblog has actually been obtained.
    inner.writers_in_progress += 1;
    Ok((writing_bloblog_id, bloblog))
}
/// Should be called once the bloblog has been finished writing to for the moment.
///
/// The bloblog is put back in the pool of writer bloblogs if it is still smaller than
/// `MAX_BLOBLOG_REUSE_SIZE`; otherwise it is simply not offered to writers again. Either
/// way the caller stops counting as a writer in progress, and `writers_reach_zero` is
/// notified if it was the last one.
///
/// # Errors
/// Fails if the bloblog's size cannot be determined. The writer count is still decremented
/// in that case (and the bloblog is not returned to the pool), so a failure here cannot
/// leave the count stuck above zero.
fn return_writing_bloblog(
    &self,
    id: BloblogId,
    bloblog: Arc<Mutex<Bloblog>>,
) -> anyhow::Result<()> {
    let size = bloblog.lock().unwrap().filesize();

    let mut inner = self.inner.lock().unwrap();
    if matches!(size, Ok(size) if size < MAX_BLOBLOG_REUSE_SIZE) {
        inner.writer_bloblogs.push(id);
    }
    inner.writers_in_progress -= 1;
    if inner.writers_in_progress == 0 {
        self.writers_reach_zero.notify_all();
    }
    drop(inner);

    size.map(|_| ())
}
fn get_chunk_pointer(&self, chunk_id: &ChunkId) -> anyhow::Result<Option<BloblogPointer>> {
let inner = self.inner.lock().unwrap();
Ok(inner
.connection
.query_row(
"SELECT bloblog, offset FROM chunks WHERE chunk_id = ?1",
params![&chunk_id[..]],
|row| {
Ok(BloblogPointer {
bloblog: row.get(0)?,
offset: row.get::<_, i64>(1)? as u64,
})
},
)
.optional()?)
}
fn put_chunk_pointer(&self, chunk_id: &ChunkId, pointer: BloblogPointer) -> anyhow::Result<()> {
let inner = self.inner.lock().unwrap();
let offset_i64 = i64::try_from(pointer.offset).expect("ouch! can't turn u64 into i64...");
inner.raw_put_chunk_pointer(chunk_id, pointer.bloblog, offset_i64)
}
/// Queues the index row for `chunk_id` instead of writing it straight away. Once
/// `POINTER_WRITE_BATCHES` pointers are queued, the whole queue is flushed to the index.
fn batched_put_chunk_pointer(
    &self,
    chunk_id: &ChunkId,
    pointer: BloblogPointer,
) -> anyhow::Result<()> {
    let mut inner = self.inner.lock().unwrap();
    inner.queued_pointer_writes.insert(*chunk_id, pointer);
    let batch_is_full = inner.queued_pointer_writes.len() >= POINTER_WRITE_BATCHES;
    if batch_is_full {
        inner.flush()?;
    }
    Ok(())
}
/// Writes all queued chunk pointers to the index now.
fn flush_queued_pointer_writes(&self) -> anyhow::Result<()> {
    self.inner.lock().unwrap().flush()
}
/// Body of a storage-pipeline writer thread: appends every `(chunk ID, data)` pair received
/// on `incoming` to a bloblog and records where it was written in the `chunks` index table.
///
/// Index rows are buffered and written in transactions of up to `POINTERS_BUFFER_SIZE` rows;
/// whatever is still buffered is written when `incoming` is disconnected.
///
/// # Errors
/// Returns on the first bloblog or database error.
/// NOTE(review): on such an early return the bloblog in use is not handed back via
/// `return_writing_bloblog` and the buffered index rows are not written — confirm that
/// callers treat an error here as fatal.
fn storage_pipeline_worker(
&self,
incoming: Receiver<(ChunkId, Vec<u8>)>,
) -> anyhow::Result<()> {
// can hold on to the same bloblog as long as we'd like!
const POINTERS_BUFFER_SIZE: usize = 256;
let mut pointers_buffered = Vec::with_capacity(POINTERS_BUFFER_SIZE);
// Writes all buffered index rows in a single transaction, leaving the buffer empty.
// Rows that violate a constraint are logged and skipped.
fn flush_pointers(
this: &SqliteBloblogPile,
pointers_buffered: &mut Vec<(ChunkId, BloblogPointer)>,
) -> anyhow::Result<()> {
let mut inner = this.inner.lock().unwrap();
let txn = inner.connection.transaction()?;
{
let mut stmt = txn.prepare(
"INSERT OR FAIL INTO chunks (chunk_id, bloblog, offset) VALUES (?1, ?2, ?3)",
)?;
for (chunk_id, pointer) in pointers_buffered.drain(..) {
match stmt.execute(params![
&chunk_id[..],
pointer.bloblog,
pointer.offset as i64
]) {
Err(Error::SqliteFailure(e, str))
if e.code == ErrorCode::ConstraintViolation =>
{
warn!(
"(ignoring) SQLite constraint violation on insertion... {:?}",
str
);
}
other => {
other?;
}
}
}
}
txn.commit()?;
Ok(())
}
// Appends one chunk to `bloblog`, buffers its index row, and flushes the buffer once it
// holds `POINTERS_BUFFER_SIZE` rows.
fn write_blob(
this: &SqliteBloblogPile,
bloblog_id: BloblogId,
bloblog: &mut Bloblog,
pointers_buffered: &mut Vec<(ChunkId, BloblogPointer)>,
(chunk_id, chunk): (ChunkId, Vec<u8>),
) -> anyhow::Result<()> {
let offset = bloblog.write_blob(&chunk_id, &chunk)?;
let pointer = BloblogPointer {
bloblog: bloblog_id,
offset,
};
pointers_buffered.push((chunk_id, pointer));
if pointers_buffered.len() >= POINTERS_BUFFER_SIZE {
flush_pointers(this, pointers_buffered)?;
}
Ok(())
}
// Outer loop: block until there is a chunk to write, then take a bloblog for it.
while let Ok(chunk) = incoming.recv() {
let (bloblog_id, bloglog_mutex) = self.get_writing_bloblog()?;
let mut bloblog = bloglog_mutex.lock().expect("Failed to lock bloblog?");
write_blob(
self,
bloblog_id,
&mut bloblog,
&mut pointers_buffered,
chunk,
)?;
// Inner loop: keep writing to the same bloblog for as long as chunks keep arriving
// within 5 seconds of each other; a timeout or a disconnect ends the loop.
while let Ok(chunk) = incoming.recv_timeout(Duration::from_secs(5)) {
write_blob(
self,
bloblog_id,
&mut bloblog,
&mut pointers_buffered,
chunk,
)?;
if bloblog.filesize()? > MAX_BLOBLOG_REUSE_SIZE {
// get a new bloblog to write with.
break;
}
}
// Release the bloblog's lock before handing it back, since returning it locks it again
// to check its size.
drop(bloblog);
self.return_writing_bloblog(bloblog_id, bloglog_mutex)?;
}
info!("Flushing pointers (storage pipeline shutdown).");
flush_pointers(self, &mut pointers_buffered)?;
// we MUST have flushed ALL the pointers by now.
assert!(pointers_buffered.is_empty());
Ok(())
}
}
impl Drop for SqliteBloblogPile {
    /// Warns on stderr if pointer writes are still queued, then flushes them.
    ///
    /// Panics if the final flush fails.
    fn drop(&mut self) {
        // Inspect the queue in its own scope so the lock is released before flushing.
        let unflushed = {
            let guard = self.inner.lock().unwrap();
            if guard.queued_pointer_writes.is_empty() {
                None
            } else {
                Some(guard.queued_pointer_writes.len())
            }
        };
        if let Some(count) = unflushed {
            eprintln!(
                "WARNING: DROPPING SQLITE BLOBLOG PILE with {} unflushed writes.",
                count
            )
        }
        self.flush_queued_pointer_writes()
            .expect("POSSIBILITY OF LOSS OF NEW DATA: failed to flush queued pointer writes!");
    }
}
impl RawPile for SqliteBloblogPile {
fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool> {
match kind {
Keyspace::Chunk => {
let mut chunk_id: ChunkId = Default::default();
chunk_id.copy_from_slice(key);
Ok(self.get_chunk_pointer(&chunk_id)?.is_some())
}
Keyspace::ChunkHash => unimplemented!(),
Keyspace::Pointer => {
let inner = self.inner.lock().unwrap();
let pointer_exists = inner
.connection
.query_row(
"SELECT data FROM pointers WHERE key=?1",
params![key],
|_row| Ok(()),
)
.optional()?
.is_some();
Ok(pointer_exists)
}
}
}
fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
match kind {
Keyspace::Chunk => {
let mut chunk_id: ChunkId = Default::default();
chunk_id.copy_from_slice(key);
let chunk_pointer = self.get_chunk_pointer(&chunk_id)?;
if let Some(pointer) = chunk_pointer {
let bloblog_mutex = self.open_bloblog(pointer.bloblog)?;
let mut bloblog = bloblog_mutex.lock().unwrap();
let mut buf = Vec::new();
bloblog.read_blob(pointer.offset, &chunk_id, &mut buf)?;
Ok(Some(buf))
} else {
Ok(None)
}
}
Keyspace::ChunkHash => unimplemented!(),
Keyspace::Pointer => {
let inner = self.inner.lock().unwrap();
let pointer_data = inner
.connection
.query_row(
"SELECT data FROM pointers WHERE key=?1",
params![key],
|row| row.get::<_, Vec<u8>>(0),
)
.optional()?;
Ok(pointer_data)
}
}
}
fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
match kind {
Keyspace::Chunk => {
// need to handle deletion of existing, same-named chunks
self.delete(kind, key)?;
let mut chunk_id: ChunkId = Default::default();
chunk_id.copy_from_slice(key);
let (bloblog_id, bloblog) = self.get_writing_bloblog()?;
let offset = bloblog.lock().unwrap().write_blob(&chunk_id, value)?;
self.return_writing_bloblog(bloblog_id, bloblog)?;
let pointer = BloblogPointer {
bloblog: bloblog_id,
offset,
};
if self.should_batch_pointer_writes {
self.batched_put_chunk_pointer(&chunk_id, pointer)?;
} else {
self.put_chunk_pointer(&chunk_id, pointer)?;
}
Ok(())
}
Keyspace::ChunkHash => unimplemented!(),
Keyspace::Pointer => {
let inner = self.inner.lock().unwrap();
inner.connection.execute(
"INSERT OR REPLACE INTO pointers (key, data) VALUES (?1, ?2)",
params![key, value],
)?;
Ok(())
}
}
}
fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
match kind {
Keyspace::Chunk => {
let mut chunk_id: ChunkId = Default::default();
chunk_id.copy_from_slice(key);
let chunk_pointer = self.get_chunk_pointer(&chunk_id)?;
if let Some(pointer) = chunk_pointer {
let bloblog_mutex = self.open_bloblog(pointer.bloblog)?;
let mut bloblog = bloblog_mutex.lock().unwrap();
let size = bloblog.blob_len(pointer.offset, &chunk_id)?;
let inner = self.inner.lock().unwrap();
let offset_i64 =
i64::try_from(pointer.offset).expect("ouch! can't turn u64 into i64...");
inner.connection.execute(
"INSERT OR IGNORE INTO deleted (bloblog, offset, size)
VALUES (?1, ?2, ?3)",
params![pointer.bloblog, offset_i64, size],
)?;
}
Ok(())
}
Keyspace::ChunkHash => unimplemented!(),
Keyspace::Pointer => {
let inner = self.inner.lock().unwrap();
inner
.connection
.execute("DELETE FROM pointers WHERE key=?1", params![key])?;
Ok(())
}
}
}
/// Returns an iterator over every key in the given keyspace.
///
/// The iterator is a `KeyIterator` that shares this pile's connection and
/// starts from an empty lower bound. `Keyspace::ChunkHash` is not implemented.
fn list_keys(
    &self,
    kind: Keyspace,
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>> {
    let query = match kind {
        Keyspace::Pointer => "SELECT key FROM pointers WHERE key > ?1",
        Keyspace::Chunk => "SELECT chunk_id FROM chunks WHERE chunk_id > ?1",
        Keyspace::ChunkHash => unimplemented!(),
    };
    Ok(Box::new(KeyIterator {
        next_limit: Vec::with_capacity(0),
        query,
        buffer: Default::default(),
        connection_inner: self.inner.clone(),
    }))
}
/// Flushes queued pointer writes, waits until no writers are in progress,
/// then calls `sync()`.
fn flush(&self) -> anyhow::Result<()> {
// must do this before we lock inner.
self.flush_queued_pointer_writes()?;
let inner = self.inner.lock().unwrap();
if inner.writers_in_progress > 0 {
// Wait on `writers_reach_zero` (presumably a Condvar — confirm) until the
// writer count drops to zero. The guard returned by the wait is released
// at the end of this `if`, so on this path `sync()` runs without the lock.
let _inner = self
.writers_reach_zero
.wait_while(inner, |inner| inner.writers_in_progress != 0)
.unwrap();
}
// TODO sync all files we have open first!
sync();
Ok(())
}
/// Low-level consistency check. Not implemented for this pile: calling it panics.
fn check_lowlevel(&self) -> anyhow::Result<bool> {
unimplemented!()
}
/// Gathers rough statistics about this pile.
///
/// The chunk count comes from the `chunks` table. The total chunk size is
/// estimated as the combined size of all numerically-named files in the pile
/// directory, minus the per-chunk header overhead (for live and deleted
/// chunks) and minus the space recorded in the `deleted` table.
///
/// Minimum and maximum chunk sizes are not computed.
fn debug_statistics(&self) -> anyhow::Result<Option<DebugStatistics>> {
    let inner = self.inner.lock().unwrap();
    let chunk_count: i64 =
        inner
            .connection
            .query_row("SELECT COUNT(1) FROM chunks", params![], |row| row.get(0))?;
    let (deleted_chunk_count, deleted_chunk_space): (i64, i64) = inner.connection.query_row(
        "SELECT COUNT(1), COALESCE(SUM(size), 0) FROM deleted",
        params![],
        |row| Ok((row.get(0)?, row.get(1)?)),
    )?;

    let mut total_on_disk_size: u64 = 0;
    for dir_entry in read_dir(&self.path)? {
        let dir_entry = dir_entry?;
        if !dir_entry.file_type()?.is_file() {
            continue;
        }
        if let Some(name) = dir_entry.file_name().to_str() {
            if !name.chars().all(|c| c.is_numeric()) {
                // bloblogs have numeric names.
                continue;
            }
            total_on_disk_size += dir_entry.metadata()?.len();
        }
    }

    // 32 bytes for the chunk ID.
    // 4 bytes for the chunk length.
    let chunk_overhead_per_chunk: u64 = 32 + 4;
    let total_overhead =
        chunk_overhead_per_chunk.saturating_mul((deleted_chunk_count + chunk_count) as u64);
    // The database bookkeeping and the files on disk can disagree (for example
    // when pointers refer to bloblogs that are missing from the directory).
    // Saturate at zero instead of underflowing, which would panic in debug
    // builds and report a huge bogus size in release builds.
    let total_chunk_size = total_on_disk_size
        .saturating_sub(total_overhead)
        .saturating_sub(deleted_chunk_space as u64);

    Ok(Some(DebugStatistics {
        number_of_chunks: chunk_count.try_into().unwrap(),
        minimum_chunk_size: None,
        maximum_chunk_size: None,
        total_chunk_size,
    }))
}
/// Spawns the background writer thread for this pile and returns the sender
/// used to feed it `(chunk id, chunk data)` pairs.
///
/// The input channel is bounded by `settings.writer_input_bound`. If the
/// worker fails, a `ControllerMessage::Failure` is sent on `controller_send`.
///
/// # Errors
/// Returns an error if the operating system refuses to spawn the thread.
fn build_storage_pipeline(
    &self,
    settings: StoragePipelineSettings,
    controller_send: Sender<ControllerMessage>,
) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
    let (sender, incoming) = crossbeam_channel::bounded(settings.writer_input_bound as usize);
    let this = self.clone();
    thread::Builder::new()
        .name("SQLBloblogStPpln".to_string())
        .spawn(move || {
            if let Err(err) = this.storage_pipeline_worker(incoming) {
                controller_send
                    .send(ControllerMessage::Failure {
                        // Only built on the failure path; it is not needed otherwise.
                        worker_id: Arc::new("bloblogwriter".to_string()),
                        error_message: format!("err {:?}", err),
                    })
                    .expect("This is BAD: failed to send failure message to controller.");
            }
        })?;
    Ok(sender)
}
}
/// Iterator over the keys of one keyspace, backed by SQL queries on the shared connection.
struct KeyIterator {
// Exclusive lower bound bound to `?1` in `query`; set to the last key fetched so far.
next_limit: Vec<u8>,
// SQL statement that selects keys greater than `?1`.
query: &'static str,
// Keys already fetched from the database but not yet yielded.
buffer: VecDeque<Vec<u8>>,
// Shared handle to the pile's state; locked while querying.
connection_inner: Arc<Mutex<Inner>>,
}
impl KeyIterator {
    /// Returns the next key, refilling the buffer from the database when it
    /// runs dry. `Ok(None)` means there are no more keys.
    ///
    /// # Errors
    /// Returns an error if the query cannot be prepared or run, or if any row
    /// cannot be decoded. In that case the iterator's state is left untouched.
    fn next_result(&mut self) -> anyhow::Result<Option<Vec<u8>>> {
        if let Some(key) = self.buffer.pop_front() {
            return Ok(Some(key));
        }

        let guard = self.connection_inner.lock().unwrap();
        let mut stmt = guard.connection.prepare(self.query)?;
        // Collect into a temporary first, so that a failing row is reported as
        // an error (rather than a panic) and never leaves a partial refill.
        let fetched = stmt
            .query_map(params![&self.next_limit], |row| row.get::<_, Vec<u8>>(0))?
            .collect::<Result<Vec<Vec<u8>>, _>>()?;

        // The next refill resumes after the last key of this one.
        if let Some(last) = fetched.last() {
            self.next_limit = last.clone();
        }
        self.buffer.extend(fetched);
        Ok(self.buffer.pop_front())
    }
}
impl Iterator for KeyIterator {
    type Item = anyhow::Result<Vec<u8>>;

    /// Yields the next key, or an `Err` item if fetching it failed.
    fn next(&mut self) -> Option<Self::Item> {
        // Result<Option<T>, E> -> Option<Result<T, E>>
        self.next_result().transpose()
    }
}
#[cfg(test)]
mod tests {
    use crate::pile::local_sqlitebloblogs::Bloblog;
    use temp_dir::TempDir;

    /// Writes two blobs, reopens the bloblog and reads both back (in reverse order).
    #[test]
    pub fn bloblog_read_write_test() {
        let td = TempDir::new().unwrap();
        let log_path = td.path().join("bloblog1.log");

        // Write phase: the bloblog is dropped at the end of this scope.
        let (first_offset, second_offset) = {
            let mut bloblog = Bloblog::open(log_path.as_ref()).unwrap();
            let first = bloblog.write_blob(&Default::default(), &[1, 2, 3]).unwrap();
            let second = bloblog.write_blob(&Default::default(), &[4, 5]).unwrap();
            (first, second)
        };

        // Read phase, against a freshly opened handle.
        let mut bloblog = Bloblog::open(log_path.as_ref()).unwrap();
        let mut vec = Vec::new();
        bloblog
            .read_blob(second_offset, &Default::default(), &mut vec)
            .unwrap();
        assert_eq!(&vec, &[4, 5]);
        bloblog
            .read_blob(first_offset, &Default::default(), &mut vec)
            .unwrap();
        assert_eq!(&vec, &[1, 2, 3]);
    }
}

View File

@ -0,0 +1,68 @@
use eyre::{bail, Context};
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use yama_wormfile::boxed::BoxedWormFileProvider;
use yama_wormfile_fs::LocalWormFilesystem;
use yama_wormfile_sftp::SftpWormFilesystem;
/// How to reach the storage backing a pile.
///
/// (De)serialised as an internally tagged enum: the `scheme` field selects the variant.
#[derive(Clone, Serialize, Deserialize, Debug, Hash)]
#[serde(tag = "scheme")]
pub enum PileConnectionScheme {
/// A directory on the local filesystem.
#[serde(rename = "local")]
Local { directory: PathBuf },
/// A directory reached over SFTP.
#[serde(rename = "sftp")]
Sftp {
user_at_host: String,
directory: String,
},
/// S3 storage. Has no settings yet and connecting is not implemented.
#[serde(rename = "s3")]
S3 {},
}
impl PileConnectionScheme {
    /// Opens the WORM file provider described by this scheme.
    ///
    /// * `Local`: creates the directory if it does not exist; fails if the
    ///   path exists but is not a directory.
    /// * `Sftp`: connects to `user_at_host` and uses `directory` there.
    /// * `S3`: not implemented yet (panics via `todo!()`).
    pub async fn connect_to_wormfileprovider(&self) -> eyre::Result<BoxedWormFileProvider> {
        let provider = match self {
            PileConnectionScheme::Local { directory } => {
                if !directory.exists() {
                    tokio::fs::create_dir(directory)
                        .await
                        .context("Can't connect to local pile; can't create directory.")?;
                } else if !directory.is_dir() {
                    bail!("Can't connect to local pile {directory:?}: not a directory.");
                }
                BoxedWormFileProvider::new(LocalWormFilesystem::new(directory)?)
            }
            PileConnectionScheme::Sftp {
                user_at_host,
                directory,
            } => {
                let sftp = SftpWormFilesystem::new(user_at_host, directory)
                    .await
                    .context("Failed SFTP connection")?;
                BoxedWormFileProvider::new(sftp)
            }
            PileConnectionScheme::S3 { .. } => {
                //S3WormFilesystem::new()
                todo!()
            }
        };
        Ok(provider)
    }
}
/// A connection scheme together with a keyring path.
#[derive(Clone, Serialize, Deserialize)]
pub struct PileConnectionDetails {
/// Where the pile lives. Flattened, so the scheme's fields sit alongside `keyring`
/// in the serialised form.
#[serde(flatten)]
pub scheme: PileConnectionScheme,
/// Path of the keyring (presumably the keyring file used for this pile — confirm).
pub keyring: PathBuf,
}
impl PileConnectionDetails {
/// Placeholder: currently does nothing and always returns `Ok(())`.
pub async fn connect(self) -> eyre::Result<()> {
// TODO
Ok(())
}
}

View File

@ -0,0 +1,72 @@
use yama_localcache::Store;
use yama_pile::Pile;
use yama_pile::{pointers::Pointer, tree::IntegrationStats};
use yama_wormfile::WormFileProvider;
use crate::scan::integrate_uid_or_gid_map;
use async_recursion::async_recursion;
use eyre::{Context, ContextCompat};
use yama_pile::tree::integrate_node_in_place;
/// A pile paired with a local cache store.
pub struct PileWithCache<WFP: WormFileProvider> {
/// The pile itself, stored on the WORM file provider `WFP`.
pub pile: Pile<WFP>,
/// The local cache store that accompanies the pile.
pub localcache: Store,
}
/// Statistics accumulated while integrating a pointer with its chain of parents.
#[derive(Clone, Debug, Default)]
pub struct PointerIntegrationStatistics {
/// Accumulated statistics returned by the tree integration steps.
pub integration: IntegrationStats,
/// Number of pointers that were integrated to get here.
pub depth: u64,
}
impl<WFP: WormFileProvider + 'static> PileWithCache<WFP> {
/// Merges `pointer` with its (fully integrated) parent, if it has one, so
/// that it no longer depends on a parent.
///
/// On success `pointer.parent` is `None`, and `stats` has been updated with
/// the integration statistics and one extra level of depth. A pointer
/// without a parent is left untouched.
///
/// Fails if the parent cannot be read or does not exist.
pub async fn fully_integrate_pointer_in_place(
    &self,
    pointer: &mut Pointer,
    stats: &mut PointerIntegrationStatistics,
) -> eyre::Result<()> {
    let parent_pointer_name = match pointer.parent.as_ref() {
        Some(name) => name,
        // Already standalone.
        None => return Ok(()),
    };

    let lookup = self
        .read_pointer_fully_integrated(parent_pointer_name, stats)
        .await;
    let parent_pointer = lookup
        .with_context(|| {
            format!("failed to read pointer {parent_pointer_name} whilst integrating")
        })?
        .with_context(|| {
            format!("whilst integrating, expected pointer {parent_pointer_name} to exist")
        })?;

    stats.integration +=
        integrate_node_in_place(&mut pointer.root.node, &parent_pointer.root.node);
    integrate_uid_or_gid_map(&mut pointer.uids, &parent_pointer.uids);
    integrate_uid_or_gid_map(&mut pointer.gids, &parent_pointer.gids);

    pointer.parent = None;
    stats.depth += 1;
    Ok(())
}
/// Reads the pointer called `name` and integrates it with its chain of
/// parents. Returns `Ok(None)` if the pile has no pointer with that name.
#[async_recursion]
pub async fn read_pointer_fully_integrated(
    &self,
    name: &str,
    stats: &mut PointerIntegrationStatistics,
) -> eyre::Result<Option<Pointer>> {
    let mut pointer = match self.pile.read_pointer(name).await? {
        Some(pointer) => pointer,
        None => return Ok(None),
    };
    self.fully_integrate_pointer_in_place(&mut pointer, stats)
        .await?;
    Ok(Some(pointer))
}
/// Gracefully close this pile + local cache.
///
/// Consumes `self`; the only explicit step taken here is closing the pile.
pub async fn close(self) -> eyre::Result<()> {
self.pile.close().await?;
Ok(())
}
}

View File

@ -1,42 +0,0 @@
use indicatif::ProgressBar;
/// Something that can display or record the progress of a long-running operation.
pub trait ProgressTracker {
/// Advances the current progress by `delta_progress`.
fn inc_progress(&mut self, delta_progress: u64);
/// Sets the current progress to the absolute value `current_progress`.
fn set_current(&mut self, current_progress: u64);
/// Sets the value that represents completion.
fn set_max_size(&mut self, max_size: u64);
}
impl ProgressTracker for ProgressBar {
#[inline]
fn set_max_size(&mut self, max_size: u64) {
self.set_length(max_size);
}
#[inline]
fn inc_progress(&mut self, delta_progress: u64) {
self.inc(delta_progress);
}
#[inline]
fn set_current(&mut self, current_progress: u64) {
self.set_position(current_progress);
}
}
/// No-operation progress tracker: every update is discarded.
impl ProgressTracker for () {
    #[inline]
    fn inc_progress(&mut self, _delta_progress: u64) {
        // intentionally empty
    }

    #[inline]
    fn set_current(&mut self, _current_progress: u64) {
        // intentionally empty
    }

    #[inline]
    fn set_max_size(&mut self, _max_size: u64) {
        // intentionally empty
    }
}

View File

@ -1,100 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::convert::TryInto;
use std::io::{Read, Write};
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use serde::de::DeserializeOwned;
use serde::{Deserialize, Serialize};
use crate::pile::Keyspace;
pub mod requester;
pub mod responder;
/// A request sent from a requester to a responder.
#[derive(Serialize, Deserialize, Clone)]
pub struct Request {
// Identifier echoed back in `Response::response_to`. The requester uses 0 for
// requests that do not expect a response.
id: u16,
// What is being asked for.
body: RequestBody,
}
/// The operations that can be requested over the remote protocol.
#[derive(Serialize, Deserialize, Clone, Eq, PartialEq)]
pub enum RequestBody {
/// Read the value stored under `key` in keyspace `kind`.
Read {
kind: Keyspace,
key: Vec<u8>,
},
/// Check whether `key` exists in keyspace `kind`.
CheckExists {
kind: Keyspace,
key: Vec<u8>,
},
/// Store `value` under `key` in keyspace `kind`.
Write {
kind: Keyspace,
key: Vec<u8>,
value: Vec<u8>,
},
/// Delete `key` from keyspace `kind`.
Delete {
kind: Keyspace,
key: Vec<u8>,
},
/// Start listing the keys of keyspace `kind`.
List {
kind: Keyspace,
},
/// Continue a listing, using the token from a previous `BatchData` response.
NextBatch {
token: u16,
},
/// Flush the pile.
Flush,
/// Run the pile's low-level check.
LowLevelCheck,
/// Shut the connection down.
Shutdown,
/// Progress report: `current` out of `max`.
Progress {
current: u64,
max: u64,
},
}
/// A response sent from a responder back to a requester.
#[derive(Serialize, Deserialize, Clone)]
pub struct Response {
// The `id` of the request that this responds to.
response_to: u16,
// The outcome of the request.
body: ResponseBody,
}
/// The possible outcomes of a request.
#[derive(Serialize, Deserialize, Clone)]
pub enum ResponseBody {
/// The operation succeeded (for existence checks: the key exists).
Success,
/// The operation failed; carries an error message.
Failed(String),
/// The requested key does not exist.
NotExists,
/// The data that was read.
Data(Vec<u8>),
/// One batch of listed keys, plus the token with which to request the next batch.
BatchData {
batch: Vec<Vec<u8>>,
next_token: u16,
},
}
/// Reads one length-prefixed message from `read` and deserialises it.
///
/// The frame is a big-endian `u32` byte count followed by that many bytes of
/// `serde_bare`-encoded payload.
pub fn read_message<R: Read, D: DeserializeOwned>(read: &mut R) -> anyhow::Result<D> {
    let payload_len = read.read_u32::<BigEndian>()? as usize;
    let mut payload = vec![0u8; payload_len];
    read.read_exact(&mut payload)?;
    let message = serde_bare::from_slice(&payload)?;
    Ok(message)
}
/// Serialises `message` and writes it to `write` as one length-prefixed frame.
///
/// The frame is a big-endian `u32` byte count followed by the
/// `serde_bare`-encoded payload. Fails if the payload length does not fit in
/// a `u32`. The writer is not flushed.
pub fn write_message<W: Write, S: Serialize>(write: &mut W, message: &S) -> anyhow::Result<()> {
    let payload = serde_bare::to_vec(&message)?;
    let payload_len: u32 = payload.len().try_into()?;
    write.write_u32::<BigEndian>(payload_len)?;
    write.write_all(&payload)?;
    Ok(())
}

View File

@ -1,499 +0,0 @@
use std::collections::HashMap;
use std::io::{stdin, stdout, Read, Write};
use std::sync::{Arc, Mutex};
use std::thread;
use std::thread::JoinHandle;
use anyhow::anyhow;
use crossbeam_channel::{Receiver, Sender};
use log::{error, info};
use crate::definitions::ChunkId;
use crate::pile::{ControllerMessage, Keyspace, RawPile, StoragePipelineSettings};
use crate::remote::{read_message, write_message, Request, RequestBody, Response, ResponseBody};
use metrics::{
gauge, histogram, increment_counter, register_counter, register_gauge, register_histogram, Unit,
};
use std::sync::atomic::{AtomicBool, AtomicU16, Ordering};
use std::time::Instant;
/// A kind of RawPile which can make requests to a RawPile over a pipe (e.g. TCP socket or an
/// SSH connection).
/// The requests are handled by a `Responder` on the other end of the pipe.
#[derive(Debug)]
pub struct Requester {
// Queue feeding the writer thread: each entry is a request body plus, when a reply
// is expected, the channel on which the response body should be delivered.
commands: Sender<(RequestBody, Option<Sender<ResponseBody>>)>,
}
impl Requester {
/// Creates a requester that talks over the given reader/writer pair.
///
/// Spawns one thread that reads responses and one that writes requests, and
/// returns the requester together with the handles of both threads (reader
/// first). Panics if a thread cannot be spawned.
pub fn new<R: Read + Send + 'static, W: Write + Send + 'static>(
    read: R,
    write: W,
) -> (Self, Vec<JoinHandle<()>>) {
    register_histogram!(
        "requester_cmd_response_time_ms",
        Unit::Milliseconds,
        "Time between request being issued and a response being received"
    );

    let (command_sender, command_receiver) = crossbeam_channel::bounded(16);
    let in_flight: Arc<Mutex<HashMap<u16, (Sender<ResponseBody>, Instant)>>> =
        Arc::new(Mutex::new(HashMap::new()));
    let shutdown_signal: Arc<(AtomicU16, AtomicBool)> = Default::default();

    // Reader thread: delivers incoming responses to whoever is waiting for them.
    let reader_handle = {
        let in_flight = Arc::clone(&in_flight);
        let shutdown_signal = Arc::clone(&shutdown_signal);
        thread::Builder::new()
            .name("ReqstrReader".to_string())
            .spawn(move || {
                if let Err(e) = Self::reader(read, in_flight, shutdown_signal) {
                    error!("reader failed: {:?}", e);
                }
            })
            .unwrap()
    };

    // Writer thread: takes queued commands and sends them down the pipe.
    let writer_handle = thread::Builder::new()
        .name("ReqstrWriter".to_string())
        .spawn(move || {
            if let Err(e) = Self::writer(write, in_flight, command_receiver, shutdown_signal) {
                error!("writer failed: {:?}", e);
            }
        })
        .unwrap();

    let requester = Requester {
        commands: command_sender,
    };
    (requester, vec![reader_handle, writer_handle])
}
/// Creates a requester that reads responses from stdin and writes requests
/// to stdout.
///
/// Each worker thread locks its standard stream for as long as it runs.
/// Returns the requester together with the handles of both threads (reader
/// first). Panics if a thread cannot be spawned.
pub fn new_from_stdio() -> (Self, Vec<JoinHandle<()>>) {
    let (command_sender, command_receiver) = crossbeam_channel::bounded(16);
    let in_flight: Arc<Mutex<HashMap<u16, (Sender<ResponseBody>, Instant)>>> =
        Arc::new(Mutex::new(HashMap::new()));
    let shutdown_signal: Arc<(AtomicU16, AtomicBool)> = Default::default();

    // Reader thread, on locked stdin.
    let reader_handle = {
        let in_flight = Arc::clone(&in_flight);
        let shutdown_signal = Arc::clone(&shutdown_signal);
        thread::Builder::new()
            .name("ReqstrReaderSI".to_string())
            .spawn(move || {
                let stdin = stdin();
                let read = stdin.lock();
                if let Err(e) = Self::reader(read, in_flight, shutdown_signal) {
                    error!("reader failed: {:?}", e);
                }
            })
            .unwrap()
    };

    // Writer thread, on locked stdout.
    let writer_handle = thread::Builder::new()
        .name("ReqstrWriterSO".to_string())
        .spawn(move || {
            let stdout = stdout();
            let write = stdout.lock();
            if let Err(e) = Self::writer(write, in_flight, command_receiver, shutdown_signal) {
                error!("writer failed: {:?}", e);
            }
        })
        .unwrap();

    let requester = Requester {
        commands: command_sender,
    };
    (requester, vec![reader_handle, writer_handle])
}
pub fn clone_command_sender(&self) -> Sender<(RequestBody, Option<Sender<ResponseBody>>)> {
self.commands.clone()
}
/// Thread that reads messages and sends them along.
fn reader<R: Read>(
mut read: R,
in_flight: Arc<Mutex<HashMap<u16, (Sender<ResponseBody>, Instant)>>>,
shutdown_request_channel: Arc<(AtomicU16, AtomicBool)>,
) -> anyhow::Result<()> {
loop {
let response: Response = read_message(&mut read)?;
if shutdown_request_channel.1.load(Ordering::Relaxed)
&& response.response_to == shutdown_request_channel.0.load(Ordering::Relaxed)
{
return Ok(());
}
let mut map = in_flight.lock().or(Err(anyhow!("Mutex poisoned")))?;
// We free up the ID as we get the sender out of the map.
let (resp_sender, req_instant) = map
.remove(&response.response_to)
.ok_or(anyhow!("Didn't find response channel..."))?;
let req_resp_time_in_millis =
Instant::now().duration_since(req_instant).as_millis() as f64;
histogram!("requester_cmd_response_time_ms", req_resp_time_in_millis);
resp_sender
.send(response.body)
.or(Err(anyhow!("Failed to send response to channel")))?;
}
}
/// Thread that writes messages.
///
/// Takes commands from `command_receiver`, assigns each one a request ID and writes it to
/// `write`. Returns after writing a `Shutdown` request — either one received from the queue,
/// or one it issues itself once the queue has been closed.
fn writer<W: Write>(
mut write: W,
in_flight: Arc<Mutex<HashMap<u16, (Sender<ResponseBody>, Instant)>>>,
command_receiver: Receiver<(RequestBody, Option<Sender<ResponseBody>>)>,
shutdown_request_channel: Arc<(AtomicU16, AtomicBool)>,
) -> anyhow::Result<()> {
while let Ok((req_body, response_channel)) = command_receiver.recv() {
// Requests that expect a response get the lowest free non-zero ID and are registered
// as in flight; requests without a response channel all use ID 0.
let request_id = if let Some(response_channel) = response_channel {
let mut map = in_flight.lock().or(Err(anyhow!("Mutex poisoned")))?;
let request_id = (1u16..u16::MAX)
.into_iter()
.find(|id| !map.contains_key(&id))
.expect("No ID found");
let now = Instant::now();
map.insert(request_id, (response_channel, now));
request_id
} else {
0
};
let shutting_down = &req_body == &RequestBody::Shutdown;
if shutting_down {
// Publish the shutdown request's ID before the request is written, so the reader
// can recognise the matching response.
shutdown_request_channel
.0
.store(request_id, Ordering::SeqCst);
shutdown_request_channel.1.store(true, Ordering::SeqCst);
}
write_message(
&mut write,
&Request {
id: request_id,
body: req_body,
},
)?;
write.flush()?;
if shutting_down {
return Ok(());
}
}
info!("Exited send loop without shutdown message, issuing Shutdown.");
// shutdown ourselves
// Pick an ID that is not in flight. It is not registered in the map: the reader returns
// when it sees the response to this ID.
let request_id = {
let map = in_flight.lock().or(Err(anyhow!("Mutex poisoned")))?;
let request_id = (0u16..u16::MAX)
.into_iter()
.find(|id| !map.contains_key(&id))
.expect("No ID found");
request_id
};
shutdown_request_channel
.0
.store(request_id, Ordering::SeqCst);
shutdown_request_channel.1.store(true, Ordering::SeqCst);
write_message(
&mut write,
&Request {
id: request_id,
body: RequestBody::Shutdown,
},
)?;
write.flush()?;
Ok(())
}
/// Queues `req` for sending and blocks until its response arrives.
fn request(&self, req: RequestBody) -> anyhow::Result<ResponseBody> {
    // Zero-capacity channel: used once, to hand over the single response.
    let (reply_tx, reply_rx) = crossbeam_channel::bounded(0);
    if self.commands.send((req, Some(reply_tx))).is_err() {
        return Err(anyhow!("Failed to queue request"));
    }
    reply_rx
        .recv()
        .map_err(|_| anyhow!("Failed to receive response"))
}
}
impl RawPile for Requester {
/// Asks the remote side whether `key` exists in keyspace `kind`.
fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool> {
    let query = RequestBody::CheckExists {
        kind,
        key: key.to_vec(),
    };
    match self.request(query)? {
        ResponseBody::Success => Ok(true),
        ResponseBody::NotExists => Ok(false),
        ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
        ResponseBody::Data(_) => Err(anyhow!("Received Data for exists.")),
        ResponseBody::BatchData { .. } => Err(anyhow!("Received BatchData for exists.")),
    }
}
/// Reads the value under `key` from the remote pile; `None` if it does not exist.
fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
    let query = RequestBody::Read {
        kind,
        key: key.to_vec(),
    };
    match self.request(query)? {
        ResponseBody::Data(data) => Ok(Some(data)),
        ResponseBody::NotExists => Ok(None),
        ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
        ResponseBody::Success => Err(anyhow!("Received Success for read.")),
        ResponseBody::BatchData { .. } => Err(anyhow!("Received BatchData for read.")),
    }
}
/// Writes `value` under `key` on the remote pile and waits for the acknowledgement.
fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
    let command = RequestBody::Write {
        kind,
        key: key.to_vec(),
        value: value.to_vec(),
    };
    match self.request(command)? {
        ResponseBody::Success => Ok(()),
        ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
        ResponseBody::NotExists => Err(anyhow!("Received NotExists for write.")),
        ResponseBody::Data(_) => Err(anyhow!("Received Data for write.")),
        ResponseBody::BatchData { .. } => Err(anyhow!("Received BatchData for write.")),
    }
}
/// Deletes `key` on the remote pile and waits for the acknowledgement.
fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
    let command = RequestBody::Delete {
        kind,
        key: key.to_vec(),
    };
    match self.request(command)? {
        ResponseBody::Success => Ok(()),
        ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
        ResponseBody::NotExists => Err(anyhow!("Received NotExists for delete.")),
        ResponseBody::Data(_) => Err(anyhow!("Received Data for delete.")),
        ResponseBody::BatchData { .. } => Err(anyhow!("Received BatchData for delete.")),
    }
}
/// Starts listing the keys of keyspace `kind` on the remote pile.
///
/// Returns an iterator that yields the first batch (if any) and then fetches
/// further batches on demand with `NextBatch` requests. A plain `Success`
/// response produces an empty iterator.
fn list_keys(
    &self,
    kind: Keyspace,
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>> {
    match self.request(RequestBody::List { kind })? {
        ResponseBody::Success => Ok(Box::new(ListKeyIterator {
            command_sender: self.commands.clone(),
            batch_token: None,
            buffer: Vec::with_capacity(0),
        })),
        ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
        ResponseBody::NotExists => Err(anyhow!("Received NotExists for list_keys.")),
        ResponseBody::Data(_) => Err(anyhow!("Received Data for list_keys.")),
        ResponseBody::BatchData {
            mut batch,
            next_token,
        } => {
            // `ListKeyIterator` keeps its buffer in reverse order and pops from
            // the back. Reverse the first batch just as later batches are, so
            // that keys come out in the order in which they were sent.
            batch.reverse();
            Ok(Box::new(ListKeyIterator {
                command_sender: self.commands.clone(),
                batch_token: Some(next_token),
                buffer: batch,
            }))
        }
    }
}
/// Asks the remote pile to flush and waits for the acknowledgement.
fn flush(&self) -> anyhow::Result<()> {
    let reply = self.request(RequestBody::Flush)?;
    match reply {
        ResponseBody::Success => Ok(()),
        ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
        ResponseBody::NotExists => Err(anyhow!("Received NotExists for Flush.")),
        ResponseBody::Data(_) => Err(anyhow!("Received Data for Flush.")),
        ResponseBody::BatchData { .. } => Err(anyhow!("Received BatchData for Flush.")),
    }
}
/// Asks the remote pile to run its low-level check.
///
/// Returns `Ok(true)` when the remote side reports success.
fn check_lowlevel(&self) -> anyhow::Result<bool> {
    let reply = self.request(RequestBody::LowLevelCheck)?;
    match reply {
        ResponseBody::Success => Ok(true),
        ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
        ResponseBody::NotExists => Err(anyhow!("Received NotExists for LowLevelCheck.")),
        ResponseBody::Data(_) => Err(anyhow!("Received Data for LowLevelCheck.")),
        ResponseBody::BatchData { .. } => Err(anyhow!("Received BatchData for LowLevelCheck.")),
    }
}
/// Spawns a thread that turns submitted chunks into remote `Write` requests
/// and returns the sender used to submit them.
///
/// At most `MAX_IN_FLIGHT_WRITES` writes are left unacknowledged at a time,
/// which applies backpressure to the submitting side. Once the input is
/// closed, the thread waits for the outstanding acknowledgements and exits.
/// The thread panics if any write is not acknowledged with `Success`.
///
/// # Errors
/// Returns an error if the operating system refuses to spawn the thread.
fn build_storage_pipeline(
    &self,
    _settings: StoragePipelineSettings,
    _controller_send: Sender<ControllerMessage>,
) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
    /// Panics unless `resp` acknowledges a write with `Success`.
    fn expect_write_success(resp: ResponseBody) {
        match resp {
            ResponseBody::Success => {
                // nop
            }
            ResponseBody::Failed(string) => {
                panic!("Requester pipeline fail {}", string);
            }
            ResponseBody::BatchData { .. } => {
                panic!("wtf BatchData");
            }
            ResponseBody::NotExists => {
                panic!("wtf NotExists");
            }
            ResponseBody::Data(_) => {
                panic!("wtf Data");
            }
        }
    }

    const MAX_IN_FLIGHT_WRITES: u32 = 32;

    // this one is a little bit more complex.
    // We want to be able to send off multiple write requests at once, but not too many, so we
    // need to be able to apply backpressure.
    let (input, receiver) = crossbeam_channel::bounded::<(ChunkId, Vec<u8>)>(128);
    let command_sender = self.commands.clone();
    register_counter!(
        "requester_pipeline_cmds_issued",
        Unit::Count,
        "Number of write commands issued by the Requester's storage pipeline"
    );
    register_gauge!(
        "requester_pipeline_writes_inflight",
        Unit::Count,
        "Number of write commands in-flight"
    );
    std::thread::Builder::new()
        .name("ReqStPpln".to_string())
        .spawn(move || {
            let (response_tx, response_rx) = crossbeam_channel::bounded::<ResponseBody>(32);
            let mut in_flight_writes: u32 = 0;
            let mut pipeline_still_going = true;
            while pipeline_still_going || in_flight_writes > 0 {
                gauge!(
                    "requester_pipeline_writes_inflight",
                    in_flight_writes as f64
                );
                // TODO this won't handle channel closure properly.
                if in_flight_writes < MAX_IN_FLIGHT_WRITES && pipeline_still_going {
                    crossbeam_channel::select! {
                        recv(response_rx) -> resp => {
                            in_flight_writes -= 1;
                            expect_write_success(resp.unwrap());
                        }
                        recv(receiver) -> resp => {
                            if let Ok((chunk_id, write)) = resp {
                                in_flight_writes += 1;
                                increment_counter!("requester_pipeline_cmds_issued");
                                command_sender.send((RequestBody::Write {
                                    kind: Keyspace::Chunk,
                                    key: chunk_id.to_vec(),
                                    value: write
                                }, Some(response_tx.clone()))).unwrap();
                            } else {
                                // the input has stopped
                                pipeline_still_going = false;
                            }
                        }
                    }
                } else {
                    // Either the pipeline is stopping or we are too busy to accept new chunks,
                    // so only process responses.
                    let resp = response_rx.recv().unwrap();
                    in_flight_writes -= 1;
                    expect_write_success(resp);
                }
            }
        })?;
    Ok(input)
}
}
/// Iterator over the keys listed by a remote pile, fetching further batches on demand.
pub struct ListKeyIterator {
/// Queue on which `NextBatch` requests are sent.
pub(crate) command_sender: Sender<(RequestBody, Option<Sender<ResponseBody>>)>,
/// Token with which to request the next batch; `None` once there is nothing more to fetch.
pub(crate) batch_token: Option<u16>,
/// in reverse order
pub(crate) buffer: Vec<Vec<u8>>,
}
impl Iterator for ListKeyIterator {
    type Item = anyhow::Result<Vec<u8>>;

    /// Yields the next key, requesting a further batch from the remote side
    /// when the local buffer is empty.
    ///
    /// Any failure (the remote side reporting an error, an unexpected
    /// response, or the connection having gone away) is yielded as a single
    /// `Err` item, after which the iterator is finished.
    fn next(&mut self) -> Option<Self::Item> {
        if let Some(key) = self.buffer.pop() {
            return Some(Ok(key));
        }

        // Take the token: it is only put back when a new batch arrives, so
        // every error path below leaves the iterator finished instead of
        // re-sending a stale token on the next call.
        let batch_token = self.batch_token.take()?;

        let (send, recv) = crossbeam_channel::bounded(0);
        let request = RequestBody::NextBatch { token: batch_token };
        if self.command_sender.send((request, Some(send))).is_err() {
            return Some(Err(anyhow!("Unable to send NextBatch request.")));
        }
        let resp = match recv.recv() {
            Ok(resp) => resp,
            Err(_) => return Some(Err(anyhow!("Unable to receive NextBatch response."))),
        };

        match resp {
            // The listing is complete.
            ResponseBody::Success => None,
            ResponseBody::Failed(err_msg) => Some(Err(anyhow!("Remote failure: {}", err_msg))),
            ResponseBody::NotExists => Some(Err(anyhow!("Received NotExists for NextBatch."))),
            ResponseBody::Data(_) => Some(Err(anyhow!("Received Data for NextBatch."))),
            ResponseBody::BatchData {
                mut batch,
                next_token,
            } => {
                self.batch_token = Some(next_token);
                // The buffer is kept reversed so that `pop` yields keys in order.
                batch.reverse();
                self.buffer = batch;
                self.buffer.pop().map(Ok)
            }
        }
    }
}

View File

@ -1,360 +0,0 @@
use std::collections::HashMap;
use std::io::{Read, Write};
use std::sync::{Arc, Mutex};
use std::thread;
use std::thread::JoinHandle;
use anyhow::anyhow;
use crossbeam_channel::{Receiver, Sender};
use itertools::Itertools;
use log::{error, info, warn};
use crate::definitions::ChunkId;
use crate::pile::{Keyspace, RawPile};
use crate::progress::ProgressTracker;
use crate::remote::{read_message, write_message, Request, RequestBody, Response, ResponseBody};
/// Handle to a storage pipeline to which a responder can submit chunk writes.
#[derive(Clone)]
pub struct ResponderWritingPipeline {
/// Sender on which `(chunk id, chunk data)` pairs are submitted.
pub pipeline_submission: Sender<(ChunkId, Vec<u8>)>,
}
#[derive(Clone)]
/// A wrapper for a RawPile which allows a `Requester` to access it over a pipe (e.g. TCP socket or
/// an SSH connection).
pub struct Responder {
// Maps a continuation token to a channel; when a `NextBatch` request names a token, the
// reader sends the request's ID down the corresponding channel.
continuation_tokens: Arc<Mutex<HashMap<u16, Sender<u16>>>>,
// When present, chunk writes are submitted to this pipeline rather than written to the
// pile directly.
writing_pipeline: Option<ResponderWritingPipeline>,
}
impl Responder {
/// Starts a responder: one thread reading requests from `read`, one thread
/// writing responses to `write`, and `num_workers` threads that carry out
/// the requests against `pile`.
///
/// Returns the reader's handle (which gives back `read`), the writer's
/// handle (which gives back `write`) and the worker handles. Panics if a
/// thread cannot be spawned.
pub fn start<RP, R, W, PT>(
    read: R,
    write: W,
    num_workers: u16,
    pile: Arc<RP>,
    writing_pipeline: Option<ResponderWritingPipeline>,
    mut progress_bar: PT,
) -> (JoinHandle<R>, JoinHandle<W>, Vec<JoinHandle<()>>)
where
    RP: RawPile + 'static,
    R: Read + Send + 'static,
    W: Write + Send + 'static,
    PT: ProgressTracker + Send + 'static,
{
    let (work_queue_send, work_queue_recv) = crossbeam_channel::bounded::<Request>(16);
    let (resp_send, resp_recv) = crossbeam_channel::bounded::<Response>(4);
    let responder = Responder {
        continuation_tokens: Arc::new(Mutex::new(Default::default())),
        writing_pipeline,
    };

    // Reader: pulls requests off the pipe and queues them for the workers.
    let r_handle = {
        let responder = responder.clone();
        thread::Builder::new()
            .name("RespdrReader".to_string())
            .spawn(move || {
                let mut read = read;
                let outcome = responder.reader(&mut read, work_queue_send, &mut progress_bar);
                if let Err(e) = outcome {
                    error!("reader failed: {:?}", e);
                }
                read
            })
            .unwrap()
    };

    // Writer: sends the workers' responses back down the pipe.
    let w_handle = {
        let responder = responder.clone();
        thread::Builder::new()
            .name("RespdrWriter".to_string())
            .spawn(move || {
                let mut write = write;
                if let Err(e) = responder.writer(&mut write, resp_recv) {
                    error!("writer failed: {:?}", e);
                }
                write
            })
            .unwrap()
    };

    // Workers: each one serves requests from the shared work queue.
    let handles: Vec<JoinHandle<()>> = (0..num_workers)
        .map(|worker_num| {
            let responder = responder.clone();
            let work_queue_recv = work_queue_recv.clone();
            let resp_send = resp_send.clone();
            let pile = pile.clone();
            thread::Builder::new()
                .name("RespdrWorker".to_string())
                .spawn(move || {
                    if let Err(e) = responder.worker(pile.as_ref(), work_queue_recv, resp_send) {
                        error!("worker {} failed: {:?}", worker_num, e);
                    }
                })
                .unwrap()
        })
        .collect();

    (r_handle, w_handle, handles)
}
/// Body of the reader thread: reads requests and routes each one.
///
/// * `Progress` updates the progress tracker directly.
/// * `NextBatch` sends the request's ID to the channel registered for its
///   continuation token.
/// * `Shutdown` is forwarded to the work queue, after which the reader returns.
/// * Everything else is forwarded to the work queue.
fn reader<R: Read + Send + 'static, PT: ProgressTracker>(
    &self,
    read: &mut R,
    worker_queue_send: Sender<Request>,
    progress_tracker: &mut PT,
) -> anyhow::Result<()> {
    loop {
        let request: Request = read_message(read)?;
        match request.body {
            RequestBody::Progress { current, max } => {
                progress_tracker.set_max_size(max);
                progress_tracker.set_current(current);
            }
            RequestBody::NextBatch { token } => {
                let tokens = self
                    .continuation_tokens
                    .lock()
                    .map_err(|_| anyhow!("Mutex poisoned"))?;
                let continuer = tokens
                    .get(&token)
                    .ok_or_else(|| anyhow!("Could not find that token."))?;
                if continuer.send(request.id).is_err() {
                    return Err(anyhow!("Failed to send continuation token to continuer"));
                }
            }
            RequestBody::Shutdown => {
                worker_queue_send.send(request)?;
                info!("Shutting down responder");
                return Ok(());
            }
            _ => {
                worker_queue_send.send(request)?;
            }
        }
    }
}
/// Body of the writer thread: writes each response to `write`, flushing
/// after every message, until the response channel is closed.
fn writer<W: Write + Send>(
    &self,
    mut write: W,
    responses: Receiver<Response>,
) -> anyhow::Result<()> {
    // `iter()` blocks for the next message and ends once the channel is
    // empty and disconnected.
    for response in responses.iter() {
        write_message(&mut write, &response)?;
        write.flush()?;
    }
    Ok(())
}
/// Thread which performs the actual work using the pile.
///
/// Takes requests from `requests` until `recv()` fails, runs each of them against `pile`
/// and queues a `Response` on `responses`. Failures of the pile operations are logged and
/// reported to the peer as `ResponseBody::Failed`; failures of the channels or of the
/// continuation-token mutex end the worker with an error.
///
/// `NextBatch` and `Progress` requests must never reach this function: the reader thread
/// deals with them, so those arms are `unreachable!`.
fn worker<RP: RawPile>(
&self,
pile: &RP,
requests: Receiver<Request>,
responses: Sender<Response>,
) -> anyhow::Result<()> {
while let Ok(request) = requests.recv() {
let response = match request.body {
RequestBody::Read { kind, key } => match pile.read(kind, &key) {
Ok(Some(data)) => Response {
response_to: request.id,
body: ResponseBody::Data(data),
},
Ok(None) => Response {
response_to: request.id,
body: ResponseBody::NotExists,
},
Err(err) => {
warn!("Error whilst doing a raw read: {:?}", err);
let err = format!("{:?}", err);
Response {
response_to: request.id,
body: ResponseBody::Failed(err),
}
}
},
RequestBody::CheckExists { kind, key } => match pile.exists(kind, &key) {
Ok(true) => Response {
response_to: request.id,
body: ResponseBody::Success,
},
Ok(false) => Response {
response_to: request.id,
body: ResponseBody::NotExists,
},
Err(err) => {
warn!("Error whilst doing a raw exists: {:?}", err);
let err = format!("{:?}", err);
Response {
response_to: request.id,
body: ResponseBody::Failed(err),
}
}
},
RequestBody::Write { kind, key, value } => {
// Chunk writes go through the writing pipeline when one is configured;
// everything else is written straight to the pile.
if let Some(writing_pipeline) = self
.writing_pipeline
.as_ref()
.filter(|_| kind == Keyspace::Chunk)
{
let mut chunk_id = ChunkId::default();
// NOTE(review): `copy_from_slice` panics when the lengths differ, so `key` is
// presumably expected to be exactly as long as a `ChunkId` — confirm.
chunk_id.copy_from_slice(&key[..]);
writing_pipeline
.pipeline_submission
.send((chunk_id, value))?;
// We lie and say it was successful once we submit.
// We'll complain on our side if anything goes wrong, anyway.
Response {
response_to: request.id,
body: ResponseBody::Success,
}
} else {
match pile.write(kind, &key, &value) {
Ok(_) => Response {
response_to: request.id,
body: ResponseBody::Success,
},
Err(err) => {
warn!("Error whilst doing a raw write: {:?}", err);
let err = format!("{:?}", err);
Response {
response_to: request.id,
body: ResponseBody::Failed(err),
}
}
}
}
}
RequestBody::Delete { kind, key } => match pile.delete(kind, &key) {
Ok(_) => Response {
response_to: request.id,
body: ResponseBody::Success,
},
Err(err) => {
warn!("Error whilst doing a raw delete: {:?}", err);
let err = format!("{:?}", err);
Response {
response_to: request.id,
body: ResponseBody::Failed(err),
}
}
},
RequestBody::List { kind } => match pile.list_keys(kind) {
Ok(iterator) => {
// Keys are streamed in batches of 32. Each batch answers the most recent
// request ID: first the List request itself, then the ID handed over by the
// reader for each NextBatch that carries our token.
let mut resp_id = request.id;
let (cont_sender, cont_receiver) = crossbeam_channel::bounded(1);
let batch_token = {
let mut map = self
.continuation_tokens
.lock()
.or(Err(anyhow!("Mutex poisoned")))?;
// Pick the lowest token that is not currently registered.
let batch_token = (0u16..u16::MAX)
.into_iter()
.find(|id| !map.contains_key(&id))
.expect("No ID found");
map.insert(batch_token, cont_sender);
batch_token
};
// NOTE(review): every `?` inside this loop leaves the worker without removing
// `batch_token` from the map and without sending a `Failed` response.
for chunk in &iterator.chunks(32) {
let mut results = Vec::with_capacity(32);
for result in chunk {
results.push(result?);
}
responses
.send(Response {
response_to: resp_id,
body: ResponseBody::BatchData {
batch: results,
next_token: batch_token,
},
})
.or(Err(anyhow!("Failed to queue response")))?;
// Block until the peer asks for the next batch.
resp_id = cont_receiver
.recv()
.or(Err(anyhow!("Failed to receive continuation response ID")))?;
}
let mut map = self
.continuation_tokens
.lock()
.or(Err(anyhow!("Mutex poisoned")))?;
map.remove(&batch_token);
// The last continuation request is answered with Success to mark the end.
Response {
response_to: resp_id,
body: ResponseBody::Success,
}
}
Err(err) => {
warn!("Error whilst doing a raw list_keys: {:?}", err);
let err = format!("{:?}", err);
Response {
response_to: request.id,
body: ResponseBody::Failed(err),
}
}
},
RequestBody::NextBatch { .. } => {
unreachable!("because this is handled by the reader")
}
RequestBody::Flush => match pile.flush() {
Ok(_) => Response {
response_to: request.id,
body: ResponseBody::Success,
},
Err(err) => {
warn!("Error whilst doing a raw flush: {:?}", err);
let err = format!("{:?}", err);
Response {
response_to: request.id,
body: ResponseBody::Failed(err),
}
}
},
RequestBody::LowLevelCheck => match pile.check_lowlevel() {
Ok(_) => Response {
response_to: request.id,
body: ResponseBody::Success,
},
Err(err) => {
warn!("Error whilst doing a raw check_lowlevel: {:?}", err);
let err = format!("{:?}", err);
Response {
response_to: request.id,
body: ResponseBody::Failed(err),
}
}
},
// Shutdown is only acknowledged here; the loop ends when `requests.recv()` fails.
RequestBody::Shutdown => Response {
response_to: request.id,
body: ResponseBody::Success,
},
RequestBody::Progress { .. } => {
unreachable!("handled by readea")
}
};
responses
.send(response)
.or(Err(anyhow!("Failed to queue response")))?;
}
Ok(())
}
}

413
yama/src/retriever.rs Normal file
View File

@ -0,0 +1,413 @@
// TODO The retriever should possibly live somewhere else
use crate::pile_with_cache::PileWithCache;
use eyre::{bail, ensure, eyre, ContextCompat};
use flume::{Receiver, Sender};
use std::collections::{BTreeMap, BTreeSet};
use std::pin::Pin;
use std::sync::Arc;
use tracing::error;
use yama_pile::bloblogs::BloblogReader;
use yama_pile::definitions::BloblogId;
use yama_wormfile::boxed::BoxedWormFileProvider;
use yama_wormfile::WormFileReader;
pub mod decompressor;
/// Identifier of a retrieval job; used as the key of the job maps in this module and
/// echoed back in every [`RetrieverResp`].
#[derive(Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq)]
#[repr(transparent)]
pub struct JobId(pub u32);
/// Identifier of a file that jobs read from; the retriever resolves it to a `BloblogId`
/// through the `files` map it is given.
#[derive(Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq)]
#[repr(transparent)]
pub struct FileId(pub u32);
/// One read request belonging to a job; its position in the job's list is its subjob number.
#[derive(Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq)]
pub struct JobChunkReq {
// File to read from.
pub file: FileId,
// Offset and length of the region, passed unchanged to the bloblog reader.
pub offset: u64,
pub length: u64,
}
/// Message produced by the retriever (and passed on by the pipeline decompressor).
#[derive(Clone, Debug)]
pub enum RetrieverResp {
/// The data read for subjob number `subjob` of job `job`.
Blob {
job: JobId,
subjob: u32,
blob: Vec<u8>,
},
/// Sent by the retriever once all subjobs of the job have been requested and none of
/// them is still in flight.
JobComplete(JobId),
}
/// Index entry for a subjob that has not been requested yet.
///
/// The derived ordering compares the fields in declaration order (file, offset, length,
/// job, subjob), which is what lets the retriever range-query the set for regions that
/// start at a given offset of a given file.
#[derive(Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq)]
struct FileRegionMarker {
pub file: FileId,
pub offset: u64,
pub length: u64,
pub job: JobId,
pub subjob: u32,
}
/// Bookkeeping for a file that has a reader task running.
#[derive(Debug)]
struct OpenFileState {
// Channel on which read requests are queued for the file's reader task.
pub req_tx: Sender<OpenFileReq>,
// End offset (offset + length) of the most recently queued request; 0 for a fresh file.
pub offset: u64,
}
/// Read request sent to a file's reader task: the region to read plus the job and subjob
/// the resulting blob is reported under.
#[derive(Debug)]
struct OpenFileReq {
pub offset: u64,
pub length: u64,
pub job: JobId,
pub subjob: u32,
}
/// Progress of a job that has been taken out of the queue.
#[derive(Debug)]
struct ActiveJobState {
// All chunk requests of the job, in subjob order.
pub subjobs: Vec<JobChunkReq>,
// Index into `subjobs` of the next request that has not been queued yet.
pub next_subjob: u32,
// Number of queued requests for which no ack has been received yet.
pub inflight: u32,
}
/// State owned by the retrieval task.
struct RetrieverInternals {
// Pile (with cache) that bloblogs are opened from.
pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
// Jobs that have been set up but not activated yet.
jobs_queue: BTreeMap<JobId, Vec<JobChunkReq>>,
// One marker per subjob that has not been requested yet.
file_regions: BTreeSet<FileRegionMarker>,
// Which bloblog each file ID refers to.
files: BTreeMap<FileId, BloblogId>,
// Files that currently have a reader task.
open_files: BTreeMap<FileId, OpenFileState>,
// Output channel for blobs and job completions.
results_tx: Sender<RetrieverResp>,
// Jobs currently being worked on.
active_jobs: BTreeMap<JobId, ActiveJobState>,
// Acks from reader tasks (one per finished read) and the sender that is cloned for them.
ack_rx: Receiver<JobId>,
self_ack_tx: Sender<JobId>,
// Number of active jobs below which new jobs are started.
rec_active_jobs: u16,
}
pub fn create_fixed_retriever(
pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
jobs: BTreeMap<JobId, Vec<JobChunkReq>>,
files: BTreeMap<FileId, BloblogId>,
rec_active_jobs: u16,
) -> eyre::Result<Receiver<RetrieverResp>> {
let (results_tx, results_rx) = flume::bounded(4);
let (self_ack_tx, ack_rx) = flume::bounded(4);
let mut rint = RetrieverInternals {
pwc,
jobs_queue: Default::default(),
file_regions: Default::default(),
files,
open_files: Default::default(),
results_tx,
active_jobs: Default::default(),
ack_rx,
self_ack_tx,
rec_active_jobs,
};
for (job_id, job) in jobs {
rint.set_up_job(job_id, job);
}
tokio::spawn(async_backtrace::frame!(async move {
if let Err(e) = rint.retrieval_task().await {
error!("retriever failed: {e:?}");
}
}));
Ok(results_rx)
}
impl RetrieverInternals {
/// Registers a job: indexes each of its chunk requests as a [`FileRegionMarker`] (the
/// subjob number is the request's position in `job`) and then puts the job in the queue.
fn set_up_job(&mut self, job_id: JobId, job: Vec<JobChunkReq>) {
    let markers = job
        .iter()
        .enumerate()
        .map(|(index, request)| FileRegionMarker {
            file: request.file,
            offset: request.offset,
            length: request.length,
            job: job_id,
            subjob: index as u32,
        });
    self.file_regions.extend(markers);
    self.jobs_queue.insert(job_id, job);
}
/// Queues a read of `length` at `offset` on the file's reader task, tagged with `job` and
/// `subjob`, and records the end of that region as the file's new offset.
///
/// Fails if the reader task's request channel is closed.
async fn file_request(
    open_file: &mut OpenFileState,
    job: JobId,
    subjob: u32,
    offset: u64,
    length: u64,
) -> eyre::Result<()> {
    let request = OpenFileReq {
        offset,
        length,
        job,
        subjob,
    };
    if open_file.req_tx.send_async(request).await.is_err() {
        return Err(eyre!("open file shut down :/"));
    }
    open_file.offset = offset + length;
    Ok(())
}
/// Opens the bloblog behind `file_id`, spawns a reader task for it and records the file
/// as open with offset 0.
///
/// Panics if the file is already open; fails if the ID is unknown or the bloblog cannot
/// be opened. Errors of the spawned reader task are only logged.
async fn open_file(&mut self, file_id: FileId) -> eyre::Result<()> {
    assert!(!self.open_files.contains_key(&file_id));

    let bloblog_id = *self.files.get(&file_id).context("no file by that ID")?;
    let reader = self.pwc.pile.read_bloblog(bloblog_id).await?;

    let results = self.results_tx.clone();
    let acks = self.self_ack_tx.clone();
    let (req_tx, req_rx) = flume::unbounded();

    tokio::spawn(async_backtrace::frame!(async move {
        match Self::reader_task(reader, req_rx, acks, results).await {
            Ok(()) => {}
            Err(e) => {
                error!("error in reader for {bloblog_id:?}: {e:?}");
            }
        }
    }));

    let state = OpenFileState { req_tx, offset: 0 };
    self.open_files.insert(file_id, state);
    Ok(())
}
async fn reader_task(
mut bloblog_reader: BloblogReader<Pin<Box<dyn WormFileReader>>>,
subjob_rx: Receiver<OpenFileReq>,
ack_tx: Sender<JobId>,
completion_tx: Sender<RetrieverResp>,
) -> eyre::Result<()> {
while let Ok(next_job) = subjob_rx.recv_async().await {
let mut blob = Vec::with_capacity(next_job.length as usize);
bloblog_reader
.read_to_buf(&mut blob, next_job.offset, next_job.length)
.await?;
completion_tx
.send_async(RetrieverResp::Blob {
job: next_job.job,
subjob: next_job.subjob,
blob,
})
.await
.expect("completions shut");
// debug!("read,acking! {:?}", next_job);
ack_tx.send_async(next_job.job).await?;
}
Ok(())
}
/// Scheduling loop of the retriever; returns once there is no active job left.
///
/// Every iteration tries the following steps in order and restarts from the top whenever
/// step 1, 2 (when files had to be opened) or 3 made progress:
///
/// 0. For each idle active job, queue the longest run of subjobs that continues exactly
///    where its file's reader currently is; jobs with nothing left are completed.
/// 1. If there is room for more active jobs, activate queued jobs whose first subjob
///    starts exactly at the current offset of an open file.
/// 2. Queue the next subjob of every idle active job, opening files as needed.
/// 3. If there is still room, activate the first queued job.
/// 4. Wait for one ack from a reader task.
///
/// A job is "idle" when it has no subjob in flight; busy jobs are skipped so that blobs
/// of one job are never produced out of order.
///
/// # Errors
///
/// Fails if a channel is closed, a file cannot be opened, or the bookkeeping is
/// inconsistent (missing region marker, ack for an unknown or idle job).
///
/// NOTE(review): `self` keeps `self_ack_tx` alive, so the ack channel never closes; if a
/// reader task ends with an error, step 4 appears to wait forever for the missing ack.
async fn retrieval_task(&mut self) -> eyre::Result<()> {
    loop {
        // 0. Try to progress open jobs if they are staring right at the bytes they need...
        let mut to_remove = Vec::new();
        for (active_job_id, active_job) in &mut self.active_jobs {
            if active_job.inflight > 0 {
                // skip if it's busy, we don't want to send blobs out of order...
                continue;
            }
            if active_job.next_subjob as usize >= active_job.subjobs.len() {
                // this job is to be finished!
                to_remove.push(*active_job_id);
                continue;
            }
            // Which file we are 'staring at' and requesting a run of chunks from
            let mut stare_file = None;
            'single_job_staring: loop {
                let desired_blob = &active_job.subjobs[active_job.next_subjob as usize];
                if stare_file.is_some() && stare_file != Some(desired_blob.file) {
                    // We have changed which file we are looking at, we can't request any further
                    // because they might get retrieved out of order.
                    break 'single_job_staring;
                }
                if let Some(open_file) = self.open_files.get_mut(&desired_blob.file) {
                    stare_file = Some(desired_blob.file);
                    if open_file.offset == desired_blob.offset {
                        Self::file_request(
                            open_file,
                            *active_job_id,
                            active_job.next_subjob,
                            desired_blob.offset,
                            desired_blob.length,
                        )
                        .await?;
                        ensure!(
                            self.file_regions.remove(&FileRegionMarker {
                                file: desired_blob.file,
                                offset: desired_blob.offset,
                                length: desired_blob.length,
                                job: *active_job_id,
                                subjob: active_job.next_subjob,
                            }),
                            "no FRM to remove (0)"
                        );
                        active_job.next_subjob += 1;
                        active_job.inflight += 1;
                        if active_job.next_subjob as usize >= active_job.subjobs.len() {
                            // this job is to be finished!
                            break 'single_job_staring;
                        }
                    } else {
                        break 'single_job_staring;
                    }
                } else {
                    break 'single_job_staring;
                }
            }
        }
        for remove in to_remove {
            self.active_jobs.remove(&remove);
            self.results_tx
                .send_async(RetrieverResp::JobComplete(remove))
                .await
                .map_err(|_| eyre!("results_tx shutdown"))?;
        }
        // 1. Try to make the most of open files by opening new jobs in convenient locations.
        // Basically: if we have slots for new active jobs, then look to see if we have any
        // jobs that begin at the offset in question...
        if self.active_jobs.len() < self.rec_active_jobs as usize {
            let mut allowed = self.rec_active_jobs as usize - self.active_jobs.len();
            let mut progress = false;
            for (open_file_id, open_file_state) in &self.open_files {
                // All markers of this file whose offset equals the file's current offset.
                for region in self.file_regions.range(
                    FileRegionMarker {
                        file: *open_file_id,
                        offset: open_file_state.offset,
                        length: 0,
                        job: JobId(0),
                        subjob: 0,
                    }..FileRegionMarker {
                        file: *open_file_id,
                        offset: open_file_state.offset + 1,
                        length: 0,
                        job: JobId(0),
                        subjob: 0,
                    },
                ) {
                    if region.subjob != 0 {
                        // only accept this region if it's the start of a job
                        continue;
                    }
                    if let Some(subjobs) = self.jobs_queue.remove(&region.job) {
                        self.active_jobs.insert(
                            region.job,
                            ActiveJobState {
                                subjobs,
                                next_subjob: 0,
                                inflight: 0,
                            },
                        );
                        allowed -= 1;
                        progress = true;
                        break;
                    }
                }
                if allowed == 0 {
                    break;
                }
            }
            if progress {
                continue;
            }
        }
        // 2. Try to progress active jobs, even if we have to open new files or seek.
        let mut files_to_open = BTreeSet::new();
        for (active_job_id, active_job) in &mut self.active_jobs {
            if active_job.inflight > 0 {
                // skip if it's busy, we don't want to send blobs out of order...
                continue;
            }
            // Idle jobs with nothing left were removed in step 0, so this index is valid.
            let desired_blob = &active_job.subjobs[active_job.next_subjob as usize];
            if let Some(open_file) = self.open_files.get_mut(&desired_blob.file) {
                Self::file_request(
                    open_file,
                    *active_job_id,
                    active_job.next_subjob,
                    desired_blob.offset,
                    desired_blob.length,
                )
                .await?;
                ensure!(
                    self.file_regions.remove(&FileRegionMarker {
                        file: desired_blob.file,
                        offset: desired_blob.offset,
                        length: desired_blob.length,
                        job: *active_job_id,
                        subjob: active_job.next_subjob,
                    }),
                    "no FRM to remove (2)"
                );
                active_job.next_subjob += 1;
                active_job.inflight += 1;
            } else {
                // can't open immediately here due to mut borrow.
                files_to_open.insert(desired_blob.file);
            }
        }
        if !files_to_open.is_empty() {
            for file in files_to_open {
                self.open_file(file).await?;
            }
            continue;
        }
        // 3. Start new jobs
        if self.active_jobs.len() < self.rec_active_jobs as usize {
            // spawn a new job...
            if let Some(activate_job_id) = self.jobs_queue.keys().next().cloned() {
                let new_job = self.jobs_queue.remove(&activate_job_id).unwrap();
                self.active_jobs.insert(
                    activate_job_id,
                    ActiveJobState {
                        subjobs: new_job,
                        next_subjob: 0,
                        inflight: 0,
                    },
                );
                continue;
            }
        }
        // 4. Block for acks, unless there are no jobs in which case we should just finish!
        if self.active_jobs.is_empty() {
            break;
        }
        if let Ok(ack) = self.ack_rx.recv_async().await {
            if let Some(job) = self.active_jobs.get_mut(&ack) {
                ensure!(job.inflight > 0, "recv'd ack for job that has 0 inflight");
                job.inflight -= 1;
            } else {
                bail!("recv'd ack for bad job {ack:?}");
            }
        }
    }
    Ok(())
}
}

View File

@ -0,0 +1,213 @@
use crate::retriever::{JobId, RetrieverResp};
use eyre::{bail, ensure, eyre, Context, ContextCompat};
use flume::{Receiver, Sender};
use std::collections::BTreeMap;
use std::sync::Arc;
use tracing::error;
use zstd::bulk::Decompressor;
/// 32 MiB: the capacity handed to `Decompressor::decompress` for every blob.
pub const DECOMPRESS_CAPACITY: usize = 32 * 1024 * 1024;
/// Manager state of a decompression stage that sits between a retriever and its consumer:
/// blobs are farmed out to worker threads and forwarded in subjob order per job.
pub struct PipelineDecompressor {
// Incoming (compressed) responses.
rx: Receiver<RetrieverResp>,
// Outgoing (decompressed) responses.
tx: Sender<RetrieverResp>,
// Work queue to the decompressor threads: (job, subjob, compressed bytes).
job_pool_tx: Sender<(JobId, u32, Vec<u8>)>,
// Results from the decompressor threads: (job, subjob, decompressed bytes).
complete_rx: Receiver<(JobId, u32, Vec<u8>)>,
// Per-job ordering state for jobs that have not been fully forwarded yet.
processing: BTreeMap<JobId, JobState>,
}
/// Ordering state of one job inside the pipeline decompressor.
struct JobState {
// Subjob number of the next blob to forward downstream.
pub next_submit_subjob: u32,
// Subjob number expected for the next incoming blob.
pub next_enqueue_subjob: u32,
// Decompressed blobs that are waiting for their turn, keyed by subjob number.
pub queued: BTreeMap<u32, Vec<u8>>,
// Set once the job's `JobComplete` has been received.
pub complete: bool,
}
impl PipelineDecompressor {
/// Starts a decompression stage behind `rx` and returns the channel its output arrives on.
///
/// Spawns `num_decom` worker threads (named `decomp <n>`), each with its own
/// decompressor that uses `decom_dict` when one is given, plus a manager task (via
/// `tokio::spawn`) that distributes blobs to the workers and forwards the results.
///
/// # Errors
///
/// Fails if a worker thread cannot be spawned. Failures of the workers or of the manager
/// after start-up are logged with `error!`; the manager used to print to stderr instead,
/// which bypassed the logging that every other task here uses.
pub fn start(
    decom_dict: Option<Arc<Vec<u8>>>,
    num_decom: u8,
    rx: Receiver<RetrieverResp>,
) -> eyre::Result<Receiver<RetrieverResp>> {
    let (out_tx, out_rx) = flume::bounded(4);
    // Zero capacity: a blob is only handed over once a worker is ready to take it.
    let (job_pool_tx, job_pool_rx) = flume::bounded(0);
    let (complete_tx, complete_rx) = flume::unbounded();
    for num in 0..num_decom {
        let decom_dict = decom_dict.clone();
        let job_pool_rx = job_pool_rx.clone();
        let complete_tx = complete_tx.clone();
        std::thread::Builder::new()
            .name(format!("decomp {num}"))
            .spawn(move || {
                if let Err(err) =
                    Self::decompressor_worker(decom_dict, job_pool_rx, complete_tx)
                {
                    error!("error in decompressor worker: {err:?}");
                }
            })?;
    }
    let mut pd = PipelineDecompressor {
        rx,
        tx: out_tx,
        job_pool_tx,
        complete_rx,
        processing: Default::default(),
    };
    tokio::spawn(async_backtrace::frame!(async move {
        if let Err(e) = pd.decompressor_manager().await {
            error!("pipeline decompressor error: {e:?}");
        }
    }));
    Ok(out_rx)
}
/// Worker thread body: decompresses blobs from `job_pool_rx` and reports each result on
/// `complete_tx` under the same job and subjob numbers.
///
/// Ends with `Ok(())` once `job_pool_rx.recv()` fails. Fails if the decompressor cannot be
/// created, a blob cannot be decompressed or `complete_tx` is closed.
fn decompressor_worker(
    decom_dict: Option<Arc<Vec<u8>>>,
    job_pool_rx: Receiver<(JobId, u32, Vec<u8>)>,
    complete_tx: Sender<(JobId, u32, Vec<u8>)>,
) -> eyre::Result<()> {
    let mut decompressor = if let Some(dict) = decom_dict {
        Decompressor::with_dictionary(&dict)?
    } else {
        Decompressor::new()?
    };
    loop {
        let (job_id, subjob, compressed_bytes) = match job_pool_rx.recv() {
            Ok(work) => work,
            Err(_) => return Ok(()),
        };
        let decompressed_bytes = decompressor
            .decompress(&compressed_bytes, DECOMPRESS_CAPACITY)
            .context("failed to decompress")?;
        if complete_tx
            .send((job_id, subjob, decompressed_bytes))
            .is_err()
        {
            return Err(eyre!("complete_tx shutdown"));
        }
    }
}
/// Manager loop: feeds incoming responses to the workers and forwards their results.
///
/// Returns `Ok(())` once the incoming channel has closed and no job is being processed
/// any more; fails if a handler fails or if shutdown is reached while jobs are still
/// being processed.
async fn decompressor_manager(&mut self) -> eyre::Result<()> {
// Cleared once `self.rx` reports that it is closed, so that branch is not polled again.
let mut incoming_open = true;
loop {
// Always process completed jobs as top priority
while let Ok(completion) = self.complete_rx.try_recv() {
self.handle_completion(completion).await?;
}
// Then it doesn't matter so much what we process after that
tokio::select! {
// Only wait for completions while some job is being processed.
Ok(completion) = self.complete_rx.recv_async(), if !self.processing.is_empty() => {
self.handle_completion(completion).await?;
},
incoming_res = self.rx.recv_async(), if incoming_open => {
if let Ok(incoming) = incoming_res {
self.handle_incoming(incoming).await?;
} else {
incoming_open = false;
}
}
// Reached when no branch above is enabled (or the completion pattern did not match).
else => {
if !self.processing.is_empty() {
bail!("decompressor still procesing but shutting down?");
}
// eprintln!("D shutdown");
break Ok(());
}
};
}
}
/// Records a decompressed blob and forwards every blob of that job that is now in order.
///
/// The blob is parked under its subjob number; then blobs are sent downstream for as long
/// as the next expected subjob is available. If the job has been marked complete and
/// everything that was enqueued has been forwarded, `JobComplete` is sent and the job is
/// dropped from `processing`.
///
/// Sends use `send_async` so that a full output channel suspends this task instead of
/// blocking the executor thread, as the plain `send` did.
///
/// # Errors
///
/// Fails if the job is unknown, the subjob was already parked, or the output channel is
/// closed.
async fn handle_completion(
    &mut self,
    (job_id, subjob, decompressed): (JobId, u32, Vec<u8>),
) -> eyre::Result<()> {
    let state = self
        .processing
        .get_mut(&job_id)
        .context("bad job when recv complete decomp")?;
    ensure!(
        state.queued.insert(subjob, decompressed).is_none(),
        "overwrote decompressed block??"
    );
    while let Some(send_off) = state.queued.remove(&state.next_submit_subjob) {
        self.tx
            .send_async(RetrieverResp::Blob {
                job: job_id,
                subjob: state.next_submit_subjob,
                blob: send_off,
            })
            .await
            .map_err(|_| eyre!("tx shutdown"))?;
        state.next_submit_subjob += 1;
    }
    if state.queued.is_empty()
        && state.complete
        && state.next_submit_subjob == state.next_enqueue_subjob
    {
        // This job is done now
        self.tx
            .send_async(RetrieverResp::JobComplete(job_id))
            .await
            .map_err(|_| eyre!("tx shutdown"))?;
        self.processing.remove(&job_id);
    }
    Ok(())
}
/// Handles one response from upstream.
///
/// * `Blob`: subjob 0 creates the job's state; the blob must carry the next expected
///   subjob number and is then handed to the worker pool.
/// * `JobComplete`: marks the job complete and, if every enqueued blob has already been
///   forwarded, sends `JobComplete` downstream and drops the job's state. Otherwise the
///   completion is sent later by `handle_completion`.
///
/// The downstream `JobComplete` uses `send_async` so that a full output channel suspends
/// this task instead of blocking the executor thread, as the plain `send` did.
///
/// # Errors
///
/// Fails on a duplicate job start, a blob for an unknown job, an out-of-order blob, a
/// completion for an unknown job, or a closed channel.
async fn handle_incoming(&mut self, incoming: RetrieverResp) -> eyre::Result<()> {
    match incoming {
        RetrieverResp::Blob { job, subjob, blob } => {
            if subjob == 0 {
                ensure!(
                    self.processing
                        .insert(
                            job,
                            JobState {
                                next_submit_subjob: 0,
                                next_enqueue_subjob: 0,
                                queued: Default::default(),
                                complete: false,
                            }
                        )
                        .is_none(),
                    "job was overwritten"
                );
            }
            let state = self.processing.get_mut(&job).with_context(|| {
                format!("bad job/not starting at 0 for job {job:?} (subjob={subjob:?})")
            })?;
            ensure!(
                state.next_enqueue_subjob == subjob,
                "out of order Blob commands"
            );
            state.next_enqueue_subjob += 1;
            self.job_pool_tx
                .send_async((job, subjob, blob))
                .await
                .map_err(|_| eyre!("job_pool_tx shutdown"))?;
        }
        RetrieverResp::JobComplete(job) => {
            let state = self
                .processing
                .get_mut(&job)
                .context("bad job to complete")?;
            state.complete = true;
            let can_remove = state.next_submit_subjob == state.next_enqueue_subjob;
            if can_remove {
                self.tx
                    .send_async(RetrieverResp::JobComplete(job))
                    .await
                    .map_err(|_| eyre!("tx shutdown"))?;
                self.processing.remove(&job);
            }
        }
    }
    Ok(())
}
}

466
yama/src/scan.rs Normal file
View File

@ -0,0 +1,466 @@
use eyre::{bail, eyre, Context, ContextCompat};
use ignore::overrides::OverrideBuilder;
use ignore::WalkBuilder;
use patricia_tree::PatriciaMap;
use std::cmp::max;
use std::collections::{BTreeMap, BTreeSet};
use std::fs::{read_link, Metadata};
use std::io::ErrorKind;
use std::os::unix::fs::MetadataExt;
use std::path::{Component, Path};
use tracing::warn;
use yama_pile::definitions::RecursiveChunkRef;
use yama_pile::tree::unpopulated::ScanEntry;
use yama_pile::tree::{mtime_msec, FilesystemOwnership, FilesystemPermissions, TreeNode};
/// Builds UID → user name and GID → group name tables for every owner that occurs in the
/// tree under `node`, resolved against THIS system's users and groups.
///
/// IDs that this system does not know are left out of the tables. Fails if a resolved
/// name is not valid UTF-8.
///
/// Returns UIDs then GIDs.
pub fn create_uidgid_lookup_tables(
    node: &TreeNode,
) -> eyre::Result<(BTreeMap<u16, String>, BTreeMap<u16, String>)> {
    let mut used_uids = BTreeSet::new();
    let mut used_gids = BTreeSet::new();
    find_used_uidsgids(node, &mut used_uids, &mut used_gids);

    let mut uids = BTreeMap::<u16, String>::new();
    for uid in used_uids {
        let user = match users::get_user_by_uid(uid.into()) {
            Some(user) => user,
            None => continue,
        };
        let name = user
            .name()
            .to_str()
            .ok_or_else(|| eyre!("uid leads to non-String name"))?;
        uids.insert(uid, name.to_owned());
    }

    let mut gids = BTreeMap::<u16, String>::new();
    for gid in used_gids {
        let group = match users::get_group_by_gid(gid.into()) {
            Some(group) => group,
            None => continue,
        };
        let name = group
            .name()
            .to_str()
            .ok_or_else(|| eyre!("gid leads to non-String name"))?;
        gids.insert(gid, name.to_owned());
    }

    Ok((uids, gids))
}
/// Collects the UID and GID of `node` and, recursively, of everything below it.
/// Deleted nodes contribute nothing.
fn find_used_uidsgids(node: &TreeNode, uids: &mut BTreeSet<u16>, gids: &mut BTreeSet<u16>) {
    let owner = match node {
        TreeNode::NormalFile { ownership, .. }
        | TreeNode::Directory { ownership, .. }
        | TreeNode::SymbolicLink { ownership, .. } => Some(ownership),
        TreeNode::Deleted => None,
    };
    if let Some(owner) = owner {
        uids.insert(owner.uid);
        gids.insert(owner.gid);
    }
    // Only directories have children to descend into.
    if let TreeNode::Directory { children, .. } = node {
        for (_name, child) in children {
            find_used_uidsgids(child, uids, gids);
        }
    }
}
/// Computes the path of `leaf` relative to `base` as a /-separated string.
///
/// Returns `Some("")` when both paths are the same. Returns `None` when `leaf` is not
/// under `base`, when the remainder contains a `.`, `..` or root component, or when it is
/// not valid UTF-8.
///
/// Panics on platforms whose path separator is not `/`.
pub fn relative_path(base: &Path, leaf: &Path) -> Option<String> {
    assert_eq!(std::path::MAIN_SEPARATOR, '/');
    let remainder = leaf.strip_prefix(base).ok()?;
    let is_plain = remainder.components().all(|part| {
        !matches!(
            part,
            Component::CurDir | Component::ParentDir | Component::RootDir
        )
    });
    if !is_plain {
        return None;
    }
    remainder.to_str().map(str::to_owned)
}
/// Scans a directory tree.
///
/// Returns a map from each entry's path relative to `root` (the root itself is stored
/// under the empty string) to its scan entry. If `root` is not a directory, the map holds
/// just that one entry. Each string in `ignores` is added as an override rule with a `!`
/// prefix; `.yamaignore` files found during the walk are honoured as well. Symlinks are
/// not followed and the walk stays on one file system.
///
/// Aborts if any errors (permission, bad .yamaignore files, etc) are encountered.
/// NOTE(review): the original doc continued with "In the future, we possibly want to
/// consider allowing" and was left unfinished.
///
/// Panics if an entry is reached before its parent directory has been recorded.
pub fn scan(root: &Path, ignores: &Vec<String>) -> eyre::Result<PatriciaMap<ScanEntry>> {
let mut entries: PatriciaMap<ScanEntry> = PatriciaMap::new();
if !root.is_dir() {
// Single non-directory root: no walk needed.
let metadata = std::fs::symlink_metadata(root).context("reading metadata of root")?;
entries.insert(
"",
scan_one_no_recurse(root, metadata)
.context("failed to generate scan entry for root")?
.context("root probably doesn't exist, or is ignored?")?,
);
return Ok(entries);
}
let mut walker = WalkBuilder::new(root);
walker
.standard_filters(false)
.add_custom_ignore_filename(".yamaignore")
.parents(false)
.follow_links(false)
.same_file_system(true);
let mut overrides = OverrideBuilder::new(root);
for ign in ignores {
overrides
.add(&("!".to_owned() + ign))
.with_context(|| format!("failed to add ignore rule: {ign:?}"))?;
}
walker.overrides(
overrides
.build()
.context("failed to create overrides with ignore entries")?,
);
// TODO(performance): We could use `WalkParallel` if we restructure this code.
let walker = walker.build();
for entry in walker {
let entry = entry?;
if !entry.path().starts_with(root) {
bail!(
"Scanned entry {:?} does not start with search path {:?}",
entry.path(),
root
);
}
// Entries whose relative path cannot be expressed (see `relative_path`) are skipped.
let rel_path = if let Some(rel_path) = relative_path(root, entry.path()) {
rel_path
} else {
continue;
};
if !rel_path.is_empty() {
// Everything before the last '/' is the parent; top-level entries have parent "".
let parent_relpath = rel_path
.rsplit_once('/')
.map(|(parent, _child)| parent)
.unwrap_or("");
assert!(
entries.contains_key(parent_relpath),
"have not scanned parent for {}",
rel_path
);
}
if let Some(single_scan) = scan_one_no_recurse(
entry.path(),
entry
.metadata()
.with_context(|| format!("Failed to read metadata for {:?}", rel_path))?,
)
.with_context(|| format!("Failed to scan {:?}", rel_path))?
{
entries.insert(&rel_path, single_scan);
}
}
Ok(entries)
}
fn scan_one_no_recurse(path: &Path, metadata: Metadata) -> eyre::Result<Option<ScanEntry>> {
let filetype = metadata.file_type();
let ownership = FilesystemOwnership {
uid: metadata.uid() as u16,
gid: metadata.gid() as u16,
};
let permissions = FilesystemPermissions {
mode: metadata.mode(),
};
if filetype.is_file() {
// Leave an unpopulated file node. It's not my responsibility to chunk it right now.
Ok(Some(ScanEntry::NormalFile {
mtime: mtime_msec(&metadata),
ownership,
permissions,
size: metadata.size(),
}))
} else if filetype.is_dir() {
// TODO(perforance): this call is very likely wasteful
let dir_read = path.read_dir();
if let Err(e) = &dir_read {
match e.kind() {
ErrorKind::NotFound => {
warn!("vanished/: {:?}", path);
return Ok(None);
}
ErrorKind::PermissionDenied => {
warn!("permission denied/: {:?}", path);
return Ok(None);
}
_ => { /* nop */ }
}
}
Ok(Some(ScanEntry::Directory {
ownership,
permissions,
}))
} else if filetype.is_symlink() {
let target = read_link(path)?
.to_str()
.ok_or(eyre!("target path cannot be to_str()d"))?
.to_owned();
Ok(Some(ScanEntry::SymbolicLink { ownership, target }))
} else {
Ok(None)
}
}
/// Given the parent pointer's root TreeNode and a scan entry map of the current pointer,
/// works out which files can reuse their chunking from the parent.
///
/// A file is reusable when it is a normal file in both trees and its mtime, size,
/// ownership and permissions are all unchanged.
///
/// Returns, in this order:
/// 1. a chunkings map prepopulated with the reusable entries (content reference and size),
/// 2. the pruned scan entry map: a copy of `scan_entry_map` without the reusable files and
///    without the directories above them,
/// 3. the prepopulated scan entry map: the reusable files plus the directories above them.
pub fn prepopulate_unmodified(
parent_tree: &TreeNode,
scan_entry_map: &PatriciaMap<ScanEntry>,
) -> (
PatriciaMap<Option<(RecursiveChunkRef, u64)>>,
PatriciaMap<ScanEntry>,
PatriciaMap<ScanEntry>,
) {
let mut reusable_chunkings = PatriciaMap::new();
let mut prepopulated_scan_entry_map = PatriciaMap::new();
let mut pruned_scan_entry_map = scan_entry_map.clone();
parent_tree
.visit(
&mut |tree_node, path| {
// TODO We should consider prepopulating symlinks and empty dirs too, if they're
// included in the parent.
if let TreeNode::NormalFile {
mtime: prev_mtime,
ownership: prev_ownership,
permissions: prev_permissions,
size: prev_size,
content: prev_content,
} = tree_node
{
if let Some(ScanEntry::NormalFile {
mtime,
ownership,
permissions,
size,
}) = scan_entry_map.get(path)
{
if mtime == prev_mtime
&& size == prev_size
&& ownership == prev_ownership
&& prev_permissions == permissions
{
// Nothing seems to have changed about this file, let's just reuse the `content` from last time.
reusable_chunkings.insert(path, Some((*prev_content, *size)));
// The entry was just found in `scan_entry_map`, which the pruned map was cloned from.
prepopulated_scan_entry_map.insert(
path,
pruned_scan_entry_map.remove(path).expect("checked removal"),
);
// Pull out parent directories so our subset always contains the parents for their children.
for path_fragment in iterate_dirs_upwards(path.as_bytes()) {
if let Some(directory) = pruned_scan_entry_map.remove(path_fragment)
{
prepopulated_scan_entry_map.insert(path_fragment, directory);
}
}
}
}
}
Ok(())
},
String::new(),
)
.expect("no reason to fail");
(
reusable_chunkings,
pruned_scan_entry_map,
prepopulated_scan_entry_map,
)
}
/// Copies every ID → name mapping from `old` into `new`, keeping the entry already in
/// `new` whenever both maps know the same ID.
pub fn integrate_uid_or_gid_map(new: &mut BTreeMap<u16, String>, old: &BTreeMap<u16, String>) {
    for (&id, name) in old {
        if !new.contains_key(&id) {
            new.insert(id, name.clone());
        }
    }
}
/// Takes entries from `scan_entry_map`, in its iteration order, until the accumulated
/// size reaches `soft_max_size`, and returns the selected subset.
///
/// Files count as their size but at least 4096 bytes; directories and symlinks count as
/// 4096 bytes each. An entry that would exceed the limit is left out, with one exception:
/// the first file encountered is always taken, so the result can exceed the limit by that
/// one file. Directories that did not fit are remembered and added (and counted) later
/// if something beneath them is selected, so parents are always present for their
/// children.
///
/// Useful for gradually starting backups without having to do the first in one shot.
pub fn limit_scan_entry_map_to_size(
    scan_entry_map: PatriciaMap<ScanEntry>,
    soft_max_size: u64,
) -> PatriciaMap<ScanEntry> {
    // slightly made up number, but typical size of an inode?
    const NOMINAL_ENTRY_SIZE: u64 = 4096;

    let mut result = PatriciaMap::new();
    let mut unincluded_directories = PatriciaMap::new();
    let mut accum_size: u64 = 0;
    let mut have_file = false;

    for (path_bytes, entry) in scan_entry_map {
        if accum_size >= soft_max_size {
            // we're already full!
            break;
        }

        let is_file = matches!(entry, ScanEntry::NormalFile { .. });
        let is_directory = matches!(entry, ScanEntry::Directory { .. });
        let size_of_entry = match &entry {
            // even zero-byte files are not for free, so don't let them be.
            ScanEntry::NormalFile { size, .. } => max(*size, NOMINAL_ENTRY_SIZE),
            ScanEntry::Directory { .. } | ScanEntry::SymbolicLink { .. } => NOMINAL_ENTRY_SIZE,
        };

        let fits = accum_size + size_of_entry <= soft_max_size;
        let first_file = is_file && !have_file;
        if fits || first_file {
            have_file = have_file || is_file;
            result.insert(&path_bytes, entry);
            accum_size += size_of_entry;
            // Pull out parent directories so our subset always contains the parents for their children.
            for path_fragment in iterate_dirs_upwards(&path_bytes) {
                if let Some(directory) = unincluded_directories.remove(path_fragment) {
                    result.insert(path_fragment, directory);
                    accum_size += NOMINAL_ENTRY_SIZE;
                }
            }
        } else if is_directory {
            // put the directory to one side in case we need it...
            unincluded_directories.insert(path_bytes, entry);
        }
    }

    result
}
/// Lists every ancestor of a /-separated path (given as bytes), nearest first and ending
/// with the root, which is the empty slice.
///
/// All returned slices borrow from `path_bytes`.
pub fn iterate_dirs_upwards(path_bytes: &[u8]) -> Vec<&[u8]> {
    let mut ancestors = Vec::new();
    let mut remaining = path_bytes;
    // Cut at the last '/' again and again; each prefix is one ancestor.
    while let Some(slash_at) = remaining.iter().rposition(|&byte| byte == b'/') {
        remaining = &path_bytes[..slash_at];
        ancestors.push(remaining);
    }
    ancestors.push(&path_bytes[..0]);
    ancestors
}
#[cfg(test)]
mod tests {
use crate::scan::limit_scan_entry_map_to_size;
use maplit::btreeset;
use patricia_tree::PatriciaMap;
use std::collections::BTreeSet;
use yama_pile::tree::unpopulated::ScanEntry;
use yama_pile::tree::{FilesystemOwnership, FilesystemPermissions};
/// Checks which paths survive `limit_scan_entry_map_to_size` for a small tree (one
/// directory, a 4-byte file and an 8192-byte file) under three different size limits.
#[test]
fn test_limit_scan_entry_map_to_size() {
let mut orig = PatriciaMap::new();
orig.insert(
"somedir".as_bytes(),
ScanEntry::Directory {
ownership: FilesystemOwnership { uid: 0, gid: 0 },
permissions: FilesystemPermissions { mode: 0 },
},
);
orig.insert(
"somedir/a_small_file".as_bytes(),
ScanEntry::NormalFile {
mtime: 0,
ownership: FilesystemOwnership { uid: 0, gid: 0 },
permissions: FilesystemPermissions { mode: 0 },
size: 4,
},
);
orig.insert(
"somedir/somefile".as_bytes(),
ScanEntry::NormalFile {
mtime: 0,
ownership: FilesystemOwnership { uid: 0, gid: 0 },
permissions: FilesystemPermissions { mode: 0 },
size: 8192,
},
);
// 16k = 4k (dir) + 8k (somefile) + 4k (small file; minimum)
assert_eq!(
limit_scan_entry_map_to_size(orig.clone(), 16384)
.keys()
.collect::<BTreeSet<_>>(),
btreeset! {
b"somedir".to_vec(),
b"somedir/a_small_file".to_vec(),
b"somedir/somefile".to_vec(),
}
);
// now we don't have room for the big file.
assert_eq!(
limit_scan_entry_map_to_size(orig.clone(), 16383)
.keys()
.collect::<BTreeSet<_>>(),
btreeset! {
b"somedir".to_vec(),
b"somedir/a_small_file".to_vec(),
}
);
// because we must always include at least one file so we make forward progress, it doesn't
// matter that this violates the size limit.
// The directory is pulled in together with the first file beneath it.
assert_eq!(
limit_scan_entry_map_to_size(orig.clone(), 1)
.keys()
.collect::<BTreeSet<_>>(),
btreeset! {
b"somedir".to_vec(),
b"somedir/a_small_file".to_vec(),
}
);
}
}

543
yama/src/storing.rs Normal file
View File

@ -0,0 +1,543 @@
use crate::pile_with_cache::PileWithCache;
use dashmap::DashSet;
use eyre::{bail, eyre, Context};
use fastcdc::v2020::{FastCDC, StreamCDC};
use flume::{Receiver, RecvError, SendError, Sender};
use std::cmp::Reverse;
use std::collections::{BTreeMap, BTreeSet};
use std::fmt::Debug;
use std::fs::File;
use std::io;
use std::io::Read;
use std::path::{Path, PathBuf};
use std::pin::Pin;
use std::sync::Arc;
use std::thread::JoinHandle;
use tokio::runtime::Handle;
use tracing::{debug, error, info, info_span, warn};
use yama_localcache::StoreConnection;
use yama_midlevel_crypto::chunk_id::{ChunkId, ChunkIdKey};
use yama_pile::bloblogs::BloblogWriter;
use yama_pile::definitions::{BlobLocator, BloblogId, Index, IndexBloblogEntry, RecursiveChunkRef};
use yama_wormfile::boxed::BoxedWormFileProvider;
use yama_wormfile::WormFileWriter;
use zstd::bulk::Compressor;
// NOTE(review): presumably the number of entries aimed for per index; the code that uses
// this constant is not visible here — confirm.
pub const DESIRED_INDEX_SIZE_ENTRIES: usize = 32768;
// FastCDC minimum size: 256 KiB
pub const FASTCDC_MIN: u32 = 256 * 1024;
// FastCDC average size: 1 MiB
pub const FASTCDC_AVG: u32 = 1024 * 1024;
// FastCDC maximum size: 8 MiB
pub const FASTCDC_MAX: u32 = 8 * 1024 * 1024;
/// Everything needed while storing chunks into a pile.
pub struct StoringState {
/// A connection to the local cache.
/// NOTE(review): the original sentence ("for checking whether") was left unfinished;
/// presumably it is used to check whether a chunk is already known — confirm.
pub cache_conn: StoreConnection<false>,
/// Set of unflushed chunks, not present in any index, which we can assume have been created in this session.
pub new_unflushed_chunks: Arc<DashSet<ChunkId>>,
/// New bloblogs that we have created but not yet written out indices for.
pub new_bloblogs: Vec<(BloblogId, BTreeMap<ChunkId, BlobLocator>)>,
/// The pile (with its local cache) that is being stored into.
pub pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
/// Chunk ID key, copied from the pile's configuration.
pub chunk_id_key: ChunkIdKey,
/// Zstd compressor, created with the pile's dictionary when one is configured.
pub compressor: zstd::bulk::Compressor<'static>,
/// Handle to the Tokio runtime.
pub tokio_handle: Handle,
}
impl StoringState {
    /// Builds a fresh storing state on top of the given pile.
    ///
    /// The Zstd compressor uses the level from `get_zstd_level()` and, if the pile config carries
    /// a dictionary, that dictionary. A read connection to the local cache is opened as well.
    ///
    /// # Errors
    /// Fails if the compressor cannot be created or the local cache connection cannot be opened.
    pub async fn new(
        pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
        new_unflushed_chunks: Arc<DashSet<ChunkId>>,
        tokio_handle: Handle,
    ) -> eyre::Result<Self> {
        let level = get_zstd_level();
        let compressor = if let Some(dict_bytes) = pwc.pile.pile_config.zstd_dict.as_ref() {
            Compressor::with_dictionary(level, dict_bytes)
                .context("can't create dictful compressor")?
        } else {
            Compressor::new(level).context("can't create dictless compressor")?
        };
        let chunk_id_key = pwc.pile.pile_config.chunk_id_key;
        let cache_conn = pwc.localcache.read().await?;

        Ok(StoringState {
            cache_conn,
            new_unflushed_chunks,
            new_bloblogs: Vec::new(),
            pwc,
            chunk_id_key,
            compressor,
            tokio_handle,
        })
    }
}
/// What remains of a `StoringState` once a worker is done: just the bloblogs it created.
struct StoringIntermediate {
    /// New bloblogs that we have created but not yet written out indices for.
    pub new_bloblogs: Vec<(BloblogId, BTreeMap<ChunkId, BlobLocator>)>,
}
impl From<StoringState> for StoringIntermediate {
fn from(ss: StoringState) -> Self {
StoringIntermediate {
new_bloblogs: ss.new_bloblogs,
}
}
}
/// The (lazily-opened) bloblog writers used while storing. Both slots start out empty.
#[derive(Default)]
pub struct StoringBloblogWriters {
    /// Bloblog writer for actual file contents (we try to keep file contents sequential in the
    /// common case)
    pub file_contents: Option<BloblogWriter<Pin<Box<dyn WormFileWriter>>>>,
    /// Bloblog writer for chunks of chunks
    pub metachunks: Option<BloblogWriter<Pin<Box<dyn WormFileWriter>>>>,
}
impl StoringBloblogWriters {
    /// Finishes whichever bloblog writers are currently open (file contents first, then
    /// metachunks), leaving both slots empty and recording each finished bloblog's ID and chunk
    /// map in `ss.new_bloblogs`.
    pub async fn finish_bloblogs(&mut self, ss: &mut StoringState) -> eyre::Result<()> {
        for slot in [&mut self.file_contents, &mut self.metachunks] {
            let Some(open_writer) = slot.take() else {
                continue;
            };
            let (_bloblog_path, bloblog_id, chunkmap) = open_writer.finish().await?;
            ss.new_bloblogs.push((bloblog_id, chunkmap));
        }
        Ok(())
    }
}
impl StoringState {
/// Acquire a bloblog writer handle, reusing the existing one in the slot if suitable.
///
/// If the slot holds a writer whose `should_finish()` is false, that writer is returned as-is.
/// Otherwise any writer in the slot is finished (its bloblog ID and chunk map are appended to
/// `self.new_bloblogs`) and a newly-created bloblog writer is put in the slot and returned.
#[async_backtrace::framed]
async fn obtain_bloblog_writer<'a>(
    &mut self,
    slot: &'a mut Option<BloblogWriter<Pin<Box<dyn WormFileWriter>>>>,
) -> eyre::Result<&'a mut BloblogWriter<Pin<Box<dyn WormFileWriter>>>> {
    // if let Some(ref mut writer) = slot {
    //     if !writer.should_finish() {
    //         return Ok(writer);
    //     }
    // }
    // awkward avoidance of strange borrow issues that I don't fully grok
    // (the commented-out code above is the direct form of the check below)
    if slot.as_ref().map(|w| w.should_finish()) == Some(false) {
        return Ok(slot.as_mut().unwrap());
    }
    // Either the slot is empty or its writer wants finishing: finish it and remember the result.
    if let Some(writer_to_finish) = slot.take() {
        let (_bloblog_path, bloblog_id, chunkmap) = writer_to_finish.finish().await?;
        self.new_bloblogs.push((bloblog_id, chunkmap));
    }
    *slot = Some(self.pwc.pile.create_bloblog().await?);
    // The slot was filled on the line above, so this unwrap cannot fail.
    Ok(slot.as_mut().unwrap())
}
/// For internal use only.
///
/// Computes the ID of `chunk_bytes` and appends it to `result`. If the chunk is new according to
/// the local cache and has not already been claimed in `new_unflushed_chunks`, it is compressed
/// and written to the bloblog writer obtained for `slot`.
///
/// Blocks on the stored Tokio handle for the async parts.
fn process_chunk(
    &mut self,
    chunk_bytes: &[u8],
    result: &mut Vec<ChunkId>,
    slot: &mut Option<BloblogWriter<Pin<Box<dyn WormFileWriter>>>>,
) -> eyre::Result<()> {
    let chunk_id = ChunkId::compute(chunk_bytes, &self.chunk_id_key);
    result.push(chunk_id);

    let tokio_handle = self.tokio_handle.clone();
    let is_new = tokio_handle.block_on(async_backtrace::frame!(async {
        let unknown_to_cache = self.cache_conn.is_chunk_new(chunk_id).await?;
        // Only claim the chunk in the shared set if the cache doesn't know it.
        Ok::<bool, eyre::Report>(unknown_to_cache && self.new_unflushed_chunks.insert(chunk_id))
    }))?;
    if !is_new {
        return Ok(());
    }

    let compressed_bytes = self.compressor.compress(chunk_bytes)?;
    tokio_handle.block_on(async_backtrace::frame!(async {
        self.obtain_bloblog_writer(slot)
            .await?
            .write_chunk(chunk_id, &compressed_bytes)
            .await?;
        Ok::<(), eyre::Report>(())
    }))
}
/// Splits `store_slice` into content-defined chunks with FastCDC, processes each chunk and
/// returns the chunk IDs in order.
///
/// An empty slice still yields exactly one (empty) chunk, so the result is never empty.
fn store_full_slice_returning_chunks(
    &mut self,
    store_slice: &[u8],
    slot: &mut Option<BloblogWriter<Pin<Box<dyn WormFileWriter>>>>,
) -> eyre::Result<Vec<ChunkId>> {
    let mut chunk_ids = Vec::new();
    let chunker = FastCDC::new(store_slice, FASTCDC_MIN, FASTCDC_AVG, FASTCDC_MAX);
    for cut in chunker {
        let end = cut.offset + cut.length;
        self.process_chunk(&store_slice[cut.offset..end], &mut chunk_ids, slot)?;
    }
    if chunk_ids.is_empty() {
        self.process_chunk(&[], &mut chunk_ids, slot)?;
    }
    Ok(chunk_ids)
}
/// Splits everything readable from `store_stream` into content-defined chunks with StreamCDC,
/// processes each chunk and returns the chunk IDs in order plus the total number of bytes read.
///
/// An empty stream still yields exactly one (empty) chunk, so the ID list is never empty.
fn store_full_stream_returning_chunks(
    &mut self,
    store_stream: impl Read,
    slot: &mut Option<BloblogWriter<Pin<Box<dyn WormFileWriter>>>>,
) -> eyre::Result<(Vec<ChunkId>, u64)> {
    let mut chunk_ids = Vec::new();
    let mut total_bytes = 0u64;
    let chunker = StreamCDC::new(store_stream, FASTCDC_MIN, FASTCDC_AVG, FASTCDC_MAX);
    for read_result in chunker {
        let piece = read_result.context("failed to read in for StreamCDC")?;
        let piece_bytes = piece.data.as_slice();
        total_bytes += piece_bytes.len() as u64;
        self.process_chunk(piece_bytes, &mut chunk_ids, slot)?;
    }
    if chunk_ids.is_empty() {
        self.process_chunk(&[], &mut chunk_ids, slot)?;
    }
    Ok((chunk_ids, total_bytes))
}
/// Stores a full byte slice and returns the recursive chunk ref that addresses it.
///
/// The data is chunked into the file-contents bloblog. While more than one chunk ID remains, the
/// list of chunk IDs is itself serialised and chunked (into the metachunks bloblog), increasing
/// the depth by one each time.
pub fn store_full_slice(
    &mut self,
    store_slice: &[u8],
    sbw: &mut StoringBloblogWriters,
) -> eyre::Result<RecursiveChunkRef> {
    // First calculate all the chunk IDs needed to be written here.
    let mut level_ids =
        self.store_full_slice_returning_chunks(store_slice, &mut sbw.file_contents)?;
    let mut depth = 0;
    loop {
        // Exactly one chunk ID left: that is the root of the recursive reference.
        if let [root] = level_ids.as_slice() {
            return Ok(RecursiveChunkRef {
                chunk_id: *root,
                depth,
            });
        }
        // Otherwise we have the wrong number of chunks, so chunk the chunk list...
        let mut metachunks_list_bytes: Vec<u8> = Vec::with_capacity(level_ids.len() * 32);
        for id in level_ids {
            metachunks_list_bytes.extend_from_slice(&id.to_bytes());
        }
        // TODO It might be nice to store these in opposite order, so a read is a true sequential
        //      scan.
        //      i.e. (depth=3) (depth=2) (depth=1) (depth=0) ...
        level_ids = self
            .store_full_slice_returning_chunks(&metachunks_list_bytes, &mut sbw.metachunks)?;
        depth += 1;
    }
}
/// Stores a full stream (`Read`) and returns the recursive chunk ref plus the length of the
/// stream.
///
/// The stream is chunked into the file-contents bloblog. While more than one chunk ID remains,
/// the list of chunk IDs is itself serialised and chunked (into the metachunks bloblog),
/// increasing the depth by one each time.
#[async_backtrace::framed]
pub fn store_full_stream(
    &mut self,
    store_stream: impl Read,
    sbw: &mut StoringBloblogWriters,
) -> eyre::Result<(RecursiveChunkRef, u64)> {
    // First calculate all the chunk IDs needed to be written here.
    let (mut level_ids, stream_length) =
        self.store_full_stream_returning_chunks(store_stream, &mut sbw.file_contents)?;
    let mut depth = 0;
    loop {
        // Exactly one chunk ID left: that is the root of the recursive reference.
        if let [root] = level_ids.as_slice() {
            let chunkref = RecursiveChunkRef {
                chunk_id: *root,
                depth,
            };
            return Ok((chunkref, stream_length));
        }
        // Otherwise we have the wrong number of chunks, so chunk the chunk list...
        let mut metachunks_list_bytes: Vec<u8> = Vec::with_capacity(level_ids.len() * 32);
        for id in level_ids {
            metachunks_list_bytes.extend_from_slice(&id.to_bytes());
        }
        // TODO It might be nice to store these in opposite order, so a read is a true sequential
        //      scan.
        //      i.e. (depth=3) (depth=2) (depth=1) (depth=0) ...
        level_ids = self
            .store_full_slice_returning_chunks(&metachunks_list_bytes, &mut sbw.metachunks)?;
        depth += 1;
    }
}
}
/// Stores a file, returning Ok(Some(...)) if fine, Ok(None) if the file doesn't exist (vanished)
/// or Err(...) for any other error.
///
/// WARNING! This memory-maps the file and should NOT be used on files that are being written to
/// by other applications. If the underlying data changes during storage, it can cause issues with
/// Zstd (and presumably can also cause the chunk hashes to be invalid).
///
/// Further, I have had issues with this seeming to 'use' a lot of memory. Whilst it should only
/// be virtual memory, for some reason it seems to cause swap to be used and it makes diagnosis
/// of REAL memory issues much harder.
/// For that reason it is hard to recommend this approach for now.
#[allow(dead_code)]
fn store_file_mmap_blocking(
file_path: &Path,
storing_state: &mut StoringState,
sbw: &mut StoringBloblogWriters,
) -> eyre::Result<Option<(RecursiveChunkRef, u64)>> {
let file = match File::open(file_path) {
Ok(file) => file,
Err(err) if err.kind() == io::ErrorKind::NotFound => {
warn!("file vanished: {file_path:?}");
return Ok(None);
}
Err(other) => {
bail!("error storing {file_path:?}: {other:?}");
}
};
let mapped = unsafe { memmap2::Mmap::map(&file) }?;
let size_of_file = mapped.as_ref().len();
let chunkref = storing_state.store_full_slice(mapped.as_ref(), sbw)?;
Ok(Some((chunkref, size_of_file as u64)))
}
fn store_file_non_mmap_blocking(
file_path: &Path,
storing_state: &mut StoringState,
sbw: &mut StoringBloblogWriters,
) -> eyre::Result<Option<(RecursiveChunkRef, u64)>> {
let file = match File::open(file_path) {
Ok(file) => file,
Err(err) if err.kind() == io::ErrorKind::NotFound => {
warn!("file vanished: {file_path:?}");
return Ok(None);
}
Err(other) => {
bail!("error storing {file_path:?}: {other:?}");
}
};
storing_state.store_full_stream(file, sbw).map(Some)
}
/// A set of storage worker threads together with the channel their results arrive on.
pub struct StoragePipeline<JobName> {
    /// Receives one `(job name, result)` pair per processed job; the result is `None` when the
    /// file had vanished.
    result_rx: Receiver<(JobName, Option<(RecursiveChunkRef, u64)>)>,
    /// Join handles of the spawned worker threads; each thread returns the bloblogs it created.
    join_set: Vec<JoinHandle<eyre::Result<StoringIntermediate>>>,
}
/// Body of one storage pipeline worker thread.
///
/// Receives `(job, path)` pairs until the job channel is closed, stores each file and sends the
/// outcome back on `result_tx`. Once there are no more jobs, the open bloblogs are finished and
/// the bloblogs created by this worker are returned.
///
/// # Errors
/// Stops at the first file that fails to store, or when the result channel has shut down.
fn storage_pipeline_worker_blocking<JobName: Debug>(
    job_rx: Receiver<(JobName, PathBuf)>,
    result_tx: Sender<(JobName, Option<(RecursiveChunkRef, u64)>)>,
    mut storing_state: StoringState,
    tokio_handle: Handle,
) -> eyre::Result<StoringIntermediate> {
    let mut bloblog_writers = StoringBloblogWriters::default();

    debug!("SPW startup");

    loop {
        // A receive error means every job sender is gone: no more work.
        let Ok((job_id, file_path)) = job_rx.recv() else {
            break;
        };

        // TODO(span): is this correctly a child of the parent span?
        let span = info_span!("store_file", file=?file_path);
        let _span_entered = span.enter();

        let file_store_opt =
            store_file_non_mmap_blocking(&file_path, &mut storing_state, &mut bloblog_writers)
                .with_context(|| format!("failed to store {file_path:?}"))?;

        match result_tx.send((job_id, file_store_opt)) {
            Ok(()) => {}
            Err(SendError(to_be_sent)) => {
                bail!("Can't return result for {to_be_sent:?} — result_tx shut down.");
            }
        }
    }

    debug!("finishing bloblogs");
    tokio_handle.block_on(bloblog_writers.finish_bloblogs(&mut storing_state))?;
    debug!("finished bloblogs!");

    Ok(StoringIntermediate::from(storing_state))
}
fn get_zstd_level() -> i32 {
// TODO Do something more proper
if let Ok(var) = std::env::var("YAMA_HACK_ZSTD_LEVEL") {
if let Ok(level) = var.parse() {
info!("YAMA_HACK_ZSTD_LEVEL: using {level}");
return level;
} else {
error!("YAMA_HACK_ZSTD_LEVEL was not set to a valid i32: {var:?}")
}
}
return 16;
}
impl<JobName: Debug + Send + 'static> StoragePipeline<JobName> {
/// Launches `workers` storage pipeline worker threads (named `spw-N`).
///
/// Returns the pipeline (through which results are collected) and the sender on which
/// `(job name, path)` jobs are submitted. Dropping every clone of that sender lets the workers
/// finish.
///
/// # Errors
/// Fails if a worker's storing state cannot be created or if a worker thread cannot be spawned.
#[async_backtrace::framed]
pub async fn launch_new(
    workers: u32,
    pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
    new_unflushed_chunks: Arc<DashSet<ChunkId>>,
) -> eyre::Result<(StoragePipeline<JobName>, Sender<(JobName, PathBuf)>)> {
    let (job_tx, job_rx) = flume::bounded(16);
    let (result_tx, result_rx) = flume::bounded(4);

    let mut join_set = Vec::new();
    for spw_num in 0..workers {
        let job_rx = job_rx.clone();
        let result_tx = result_tx.clone();
        let tokio_handle = Handle::current();

        let storing_state = StoringState::new(
            pwc.clone(),
            new_unflushed_chunks.clone(),
            tokio_handle.clone(),
        )
        .await
        .context("failed to create storing state")?;
        // make a logging span for the Storage Pipeline Workers
        let spw_span = info_span!("spw", n = spw_num);
        let thread = std::thread::Builder::new()
            .name(format!("spw-{spw_num}"))
            .spawn(move || {
                let _spw_span_entered = spw_span.enter();
                let result = storage_pipeline_worker_blocking(
                    job_rx,
                    result_tx,
                    storing_state,
                    tokio_handle,
                );
                if let Err(ref err) = result {
                    error!("Error in SPW {err:?}");
                }
                result
            })
            // Spawning can fail at runtime (e.g. resource exhaustion); report it to the caller
            // as an error rather than panicking.
            .context("failed to spawn SPW thread!")?;
        join_set.push(thread);
    }

    Ok((
        StoragePipeline {
            result_rx,
            join_set,
        },
        job_tx,
    ))
}
/// Waits for the next job result from any worker.
///
/// Returns the job name together with the stored chunk ref and length (`None` if the file had
/// vanished), or a `RecvError` when the result channel can yield nothing more.
#[inline]
pub async fn next_result(
    &self,
) -> Result<(JobName, Option<(RecursiveChunkRef, u64)>), RecvError> {
    self.result_rx.recv_async().await
}
/// Joins all worker threads and gathers the bloblogs they created into index entries.
///
/// Must be sure that all results have been collected first.
///
/// # Errors
/// Fails if a result is still pending on the result channel, if a worker panicked, or if a
/// worker returned an error.
#[async_backtrace::framed]
pub async fn finish_into_chunkmaps(
    mut self,
) -> eyre::Result<BTreeMap<BloblogId, IndexBloblogEntry>> {
    if let Ok(msg) = self.result_rx.recv_async().await {
        bail!("Haven't processed all results yet! {msg:?}");
    }

    let mut chunkmap = BTreeMap::new();

    while let Some(thread) = self.join_set.pop() {
        // TODO(blocking on async thread): do this differently.
        // Outer `?`: the thread panicked. Inner `?`: the worker itself returned an error.
        let intermediate = thread
            .join()
            .map_err(|panic_err| eyre!("{panic_err:?}"))??;
        for (bloblog_id, chunks) in intermediate.new_bloblogs {
            chunkmap.insert(
                bloblog_id,
                IndexBloblogEntry {
                    chunks,
                    forgotten_bytes: 0,
                },
            );
        }
    }

    Ok(chunkmap)
}
}
/// Groups bloblog entries into indices, aiming for about `DESIRED_INDEX_SIZE_ENTRIES` entries
/// per index.
///
/// Greedy packing: each new index starts with the largest remaining bloblog, then repeatedly
/// takes the largest remaining bloblog that still fits in the leftover budget. A bloblog's size
/// is counted as its number of chunks plus one. A single bloblog larger than the budget still
/// gets an index of its own.
fn assemble_indices(chunkmap: BTreeMap<BloblogId, IndexBloblogEntry>) -> Vec<Index> {
    // Keyed by `Reverse((size, idx))` so that iteration order is largest-first; `idx` only
    // serves to keep keys unique when sizes are equal.
    let mut sorted_map = BTreeMap::new();
    for (idx, chunkmap) in chunkmap.into_iter().enumerate() {
        let size_of_chunkmap = chunkmap.1.chunks.len() + 1;
        sorted_map.insert(Reverse((size_of_chunkmap, idx)), chunkmap);
    }

    let mut indices = Vec::new();

    // `pop_first` on the reversed keys yields the largest remaining bloblog.
    while let Some((Reverse((size, _)), (bloblog_id, bloblog_chunks))) = sorted_map.pop_first() {
        let mut new_index_contents = BTreeMap::new();
        new_index_contents.insert(bloblog_id, bloblog_chunks);
        let mut new_index_size_so_far = size;

        while new_index_size_so_far < DESIRED_INDEX_SIZE_ENTRIES && !sorted_map.is_empty() {
            // Because of the `Reverse`, this range covers every key whose size is at most the
            // remaining budget, and its first element is the largest such bloblog.
            if let Some((k, _)) = sorted_map
                .range(
                    Reverse((
                        DESIRED_INDEX_SIZE_ENTRIES - new_index_size_so_far,
                        usize::MAX,
                    ))..,
                )
                .next()
            {
                let k = k.clone();
                let (Reverse((add_size, _)), (bloblog_id, bloblog_chunks)) =
                    sorted_map.remove_entry(&k).unwrap();
                new_index_size_so_far += add_size;
                new_index_contents.insert(bloblog_id, bloblog_chunks);
            } else {
                // Nothing left that fits into this index.
                break;
            }
        }

        indices.push(Index {
            supersedes: BTreeSet::new(),
            bloblogs: new_index_contents,
        });
    }

    indices
}
/// Writes each index to the pile and applies it to the local cache.
///
/// Logs an error (but carries on) if the local cache reports that a freshly-created index was
/// not new.
async fn write_indices(
    pwc: &PileWithCache<BoxedWormFileProvider>,
    indices: Vec<Index>,
) -> eyre::Result<()> {
    for index in indices {
        let index_id = pwc.pile.create_index(&index).await?;
        let was_new = pwc
            .localcache
            .write()
            .await?
            .apply_index(index_id, Arc::new(index))
            .await?;
        if !was_new {
            error!("freshly-created index wasn't new. This is suspicious.");
        }
    }
    Ok(())
}
/// Packs the given bloblog entries into indices, then writes those indices to the pile and
/// applies them to the local cache.
#[async_backtrace::framed]
pub async fn assemble_and_write_indices(
    pwc: &PileWithCache<BoxedWormFileProvider>,
    chunkmap: BTreeMap<BloblogId, IndexBloblogEntry>,
) -> eyre::Result<()> {
    write_indices(pwc, assemble_indices(chunkmap)).await
}

View File

@ -1,44 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::fmt::Write;
/// Renders a byte slice as lowercase hexadecimal, two digits per byte.
pub fn bytes_to_hexstring(chunkid: &[u8]) -> String {
    chunkid.iter().fold(
        String::with_capacity(chunkid.len() * 2),
        |mut hex, byte| {
            write!(hex, "{:02x}", byte).expect("Unable to write");
            hex
        },
    )
}
/// Determines how many workers to use.
///
/// Looks at the environment variable named by `first_try_env_name`, then at `YAMA_WORKERS`.
/// If neither is usable, falls back to the number of CPUs, capped at `u8::MAX`.
///
/// # Panics
/// Panics if the chosen environment variable does not parse as a `u8`.
pub fn get_number_of_workers(first_try_env_name: &str) -> u8 {
    let configured =
        std::env::var(first_try_env_name).or_else(|_| std::env::var("YAMA_WORKERS"));

    match configured {
        Ok(raw) => raw
            .parse()
            .expect("Number of workers should be an integer from 1 to 255."),
        Err(_) => u8::try_from(num_cpus::get()).unwrap_or(u8::MAX),
    }
}

4
yama/src/vacuum.rs Normal file
View File

@ -0,0 +1,4 @@
pub mod delete_unrefd_bloblogs;
pub mod forget_chunks;
pub mod merge_indices;
pub mod repack_bloblogs_and_indices;

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,171 @@
use crate::extract::expand_chunkrefs_one_layer;
use crate::pile_with_cache::PileWithCache;
use eyre::{bail, ensure, Context, ContextCompat};
use std::collections::{BTreeMap, BTreeSet};
use std::sync::Arc;
use tracing::info;
use yama_midlevel_crypto::chunk_id::ChunkId;
use yama_pile::definitions::IndexId;
use yama_pile::tree::TreeNode;
use yama_wormfile::boxed::BoxedWormFileProvider;
/// Finds the chunks listed in the given indices that are not reachable from any pointer.
///
/// Starts with every chunk of the given indices (according to the local cache) and removes each
/// chunk encountered while scanning the chunk refs of all pointers; what is left is returned.
pub async fn find_forgettable_chunks(
    pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
    indices: BTreeSet<IndexId>,
) -> eyre::Result<BTreeSet<ChunkId>> {
    // Find all chunks in the given indices
    let mut unseen_chunk_ids = BTreeSet::new();
    {
        let mut cache_conn = pwc.localcache.read().await?;
        for index_id in indices {
            let chunks_of_index = cache_conn.list_chunks_in_index(index_id).await?;
            unseen_chunk_ids.extend(chunks_of_index);
        }
    }

    let chunks_to_scan = prepare_chunkrefs_to_scan(pwc).await?;

    scan_chunks(pwc, &mut unseen_chunk_ids, chunks_to_scan)
        .await
        .context("failed to do a sweep")?;

    Ok(unseen_chunk_ids)
}
/// Collects the root chunk refs of every normal file in every pointer, grouped by depth.
///
/// # Errors
/// Fails if the pointers cannot be listed or read, or if a pointer names a parent that is not
/// among the listed pointers.
async fn prepare_chunkrefs_to_scan(
    pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
) -> eyre::Result<BTreeMap<u32, BTreeSet<ChunkId>>> {
    let pointer_names = pwc
        .pile
        .list_pointers()
        .await
        .context("failed to list pointers")?;
    let mut chunks_to_scan_by_depth: BTreeMap<u32, BTreeSet<ChunkId>> = BTreeMap::new();

    for pointer_name in &pointer_names {
        let pointer = pwc
            .pile
            .read_pointer(pointer_name)
            .await?
            .context("pointer vanished")?;
        if let Some(parent_name) = &pointer.parent {
            // Check the *parent's* name: checking `pointer_name` would always succeed, since it
            // was taken from `pointer_names` in the first place.
            if !pointer_names.contains(parent_name) {
                bail!("{parent_name:?}, the parent of {pointer_name:?}, does not exist");
            }
        }

        pointer
            .root
            .node
            .visit(
                &mut |node, _| {
                    // Only normal files carry content chunk refs.
                    if let TreeNode::NormalFile { content, .. } = node {
                        chunks_to_scan_by_depth
                            .entry(content.depth)
                            .or_default()
                            .insert(content.chunk_id);
                    }
                    Ok(())
                },
                String::new(),
            )
            .unwrap();
    }

    Ok(chunks_to_scan_by_depth)
}
/// Scans the recursive chunkrefs that are passed in, ticking off chunks from the `unseen` set as
/// we go.
///
/// Each round marks the current chunks as seen, drops the entries that have reached depth zero,
/// and expands the rest by one layer (lowering their depth by one).
async fn scan_chunks(
    pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
    unseen: &mut BTreeSet<ChunkId>,
    chunks_to_scan_by_depth: BTreeMap<u32, BTreeSet<ChunkId>>,
) -> eyre::Result<()> {
    let mut to_scan: Vec<(u32, Vec<ChunkId>)> = Vec::new();
    for (depth, chunkset) in chunks_to_scan_by_depth {
        for chunk_id in chunkset {
            to_scan.push((depth, vec![chunk_id]));
        }
    }

    while !to_scan.is_empty() {
        // Mark as seen.
        for chunk_id in to_scan.iter().flat_map(|(_, chunk_ids)| chunk_ids) {
            unseen.remove(chunk_id);
        }

        // Don't descend further into zero-depth elements.
        to_scan.retain(|(depth, _)| *depth > 0);

        // Expand by one layer, decrementing the depth counters.
        let expanded = expand_chunkrefs_one_layer(pwc, to_scan).await?;
        to_scan = expanded
            .into_iter()
            .map(|(old_depth, chunkids)| (old_depth - 1, chunkids))
            .collect();
    }

    Ok(())
}
/// Removes the `forgettable` chunks from the given indices.
///
/// Each affected index is read, stripped of the forgettable chunks (whose lengths are added to
/// the bloblog entry's `forgotten_bytes`), written out as a new index that supersedes the old
/// one, and then the old index is deleted.
///
/// The function name of the deletion call indicates that it needs exclusive access to the pile —
/// NOTE(review): confirm the locking requirement against the pile implementation.
pub async fn forget_chunks(
    pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
    indices: BTreeSet<IndexId>,
    forgettable: BTreeSet<ChunkId>,
) -> eyre::Result<()> {
    let mut indices_to_rewrite = Vec::new();
    // First do a cache-only check to see which indices need rewriting.
    {
        let mut cache_conn = pwc.localcache.read().await?;
        for index_id in &indices {
            let chunks_in_this_index = cache_conn.list_chunks_in_index(*index_id).await?;
            if !chunks_in_this_index.is_disjoint(&forgettable) {
                indices_to_rewrite.push(index_id);
            }
        }
    }

    info!(
        "{} indices to rewrite in order to forget chunks",
        indices_to_rewrite.len()
    );

    // Go through each index and clean out whatever needs forgetting (then re-create the index and
    // remove the old one).
    for index_id in indices_to_rewrite {
        let mut index = pwc.pile.read_index(*index_id).await?;
        let mut changed = false;
        for bloblog_entry in index.bloblogs.values_mut() {
            // Collect first: we can't remove from `chunks` while iterating over its keys.
            let removable: Vec<ChunkId> = bloblog_entry
                .chunks
                .keys()
                .filter(|ci| forgettable.contains(ci))
                .cloned()
                .collect();
            changed |= !removable.is_empty();
            for chunk_id in removable {
                bloblog_entry.forgotten_bytes +=
                    bloblog_entry.chunks.remove(&chunk_id).unwrap().length;
            }
        }

        // The cache said this index contains forgettable chunks; if the index as read from the
        // pile didn't change, the cache and the pile disagree.
        ensure!(changed, "no change to index {index_id:?}");

        // The rewritten index supersedes exactly the index it was derived from.
        index.supersedes.clear();
        index.supersedes.insert(*index_id);

        // TODO APPLY THE NEW INDEX DIRECTLY (how do we do that again?)
        let new_index_id = pwc.pile.create_index(&index).await?;
        // Guard against deleting the index we have just written.
        ensure!(new_index_id != *index_id, "index ID bounce");
        pwc.pile.delete_index_dangerous_exclusive(*index_id).await?;
    }

    Ok(())
}

View File

@ -0,0 +1,127 @@
use crate::pile_with_cache::PileWithCache;
use eyre::{bail, Context};
use std::collections::btree_map::Entry;
use std::collections::BTreeSet;
use std::sync::Arc;
use tracing::{debug, warn};
use yama_pile::definitions::{Index, IndexId};
use yama_wormfile::boxed::BoxedWormFileProvider;
/// Size in bytes (2 MiB). NOTE(review): presumably the default `threshold_size` for
/// `select_indices_for_merge`; the caller is not visible here — confirm.
pub const MERGE_THRESHOLD_SIZE: u32 = 2 * 1024 * 1024;
/// Size in bytes (16 MiB). NOTE(review): presumably the default `target_size` for
/// `select_indices_for_merge`; the caller is not visible here — confirm.
pub const MERGE_TARGET_SIZE: u32 = 16 * 1024 * 1024;
/// Selects indices for merge.
///
/// Criteria:
/// - size is less than the `threshold_size`
/// - (FUTURE; TODO) two indices that cover the same bloblog should be merged
///
/// Candidates are taken smallest-first. Each merge set starts with the smallest remaining index
/// and keeps absorbing the next-smallest one while the combined size stays below `target_size`.
/// Sets that end up with a single index are discarded.
pub async fn select_indices_for_merge(
    pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
    target_size: u32,
    threshold_size: u32,
) -> eyre::Result<Vec<BTreeSet<IndexId>>> {
    let threshold = u64::from(threshold_size);
    let target = u64::from(target_size);

    // Ordered by (size, id), so the smallest candidate is always first.
    let mut candidates: BTreeSet<(u64, IndexId)> = BTreeSet::new();
    for (index_id, meta) in pwc.pile.list_indices_with_meta().await? {
        if meta.file_size < threshold {
            candidates.insert((meta.file_size, index_id));
        }
    }

    let mut merge_sets = Vec::new();
    while candidates.len() >= 2 {
        let (first_size, first_index) = candidates.pop_first().unwrap();
        let mut merge_size = first_size;
        let mut merge_set = BTreeSet::new();
        merge_set.insert(first_index);

        while let Some(&(size, index)) = candidates.first() {
            if merge_size + size >= target {
                break;
            }
            merge_size += size;
            merge_set.insert(index);
            candidates.pop_first();
        }

        if merge_set.len() > 1 {
            merge_sets.push(merge_set);
        }
    }

    Ok(merge_sets)
}
/// Merges some indices, deleting them in the process.
/// Requires exclusive lock.
/// (Note: in the future we could only supersede the indices, which only needs a shared lock.
/// However you need an exclusive lock to eventually delete superseded indices...).
///
/// For each merge set, a single new index is built that supersedes every index of the set and
/// contains the union of their bloblog entries. The new index is written before any of the
/// merged indices is deleted.
///
/// # Errors
/// Fails if two indices of a set give different locators for the same chunk of the same bloblog,
/// or if reading, creating or deleting an index fails.
pub async fn merge_indices(
    pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
    merge_sets: Vec<BTreeSet<IndexId>>,
) -> eyre::Result<()> {
    for merge_set in merge_sets {
        let mut final_index = Index {
            supersedes: merge_set.clone(),
            bloblogs: Default::default(),
        };

        for index_id in &merge_set {
            let index_being_subsumed = pwc.pile.read_index(*index_id).await?;
            // TODO: do we need to worry about the 'supersedes' property on the index here?
            //       I think not, or at least not if the superseded indices don't exist,
            //       but worth thinking about in the future if we don't immediately delete
            //       superseded indices...
            for (bloblog_id, bloblog_entry) in index_being_subsumed.bloblogs {
                match final_index.bloblogs.entry(bloblog_id) {
                    Entry::Vacant(ve) => {
                        // First time we see this bloblog: take the entry over wholesale.
                        ve.insert(bloblog_entry);
                    }
                    Entry::Occupied(mut oe) => {
                        // Bloblog already present: merge chunk by chunk. Note that the incoming
                        // entry's own `forgotten_bytes` is not added to the merged entry.
                        let new_entry = oe.get_mut();
                        let (existing_chunks, new_chunks): (Vec<_>, Vec<_>) = bloblog_entry
                            .chunks
                            .into_iter()
                            .partition(|(chunk_id, _)| new_entry.chunks.contains_key(chunk_id));
                        for (chunk_id, locator) in new_chunks {
                            // Subtract from the forgotten byte count, since this may be us re-remembering bytes out of safety...
                            new_entry.forgotten_bytes =
                                new_entry.forgotten_bytes.saturating_sub(locator.length);
                            let is_new = new_entry.chunks.insert(chunk_id, locator).is_none();
                            assert!(is_new);
                        }
                        // Chunks known on both sides must agree on where the chunk lives.
                        for (chunk_id, locator) in existing_chunks {
                            if &new_entry.chunks[&chunk_id] != &locator {
                                bail!("Attempted to merge indices that disagree about {bloblog_id:?}/{chunk_id:?}");
                            }
                        }
                    }
                }
            }
        }

        let merged_index_id = pwc
            .pile
            .create_index(&final_index)
            .await
            .context("failed to create merged index")?;

        if merge_set.contains(&merged_index_id) {
            // I don't see how this could be possible, but let's avoid deleting the new index if it somehow is a merge of itself...
            warn!("strange: created index ID is one of its own merges...");
            continue;
        }

        debug!("merged indices {merge_set:?} into {merged_index_id:?}; deleting mergees");
        for index_to_delete in merge_set {
            pwc.pile
                .delete_index_dangerous_exclusive(index_to_delete)
                .await?;
        }
    }
    Ok(())
}

View File

@ -0,0 +1,191 @@
use crate::pile_with_cache::PileWithCache;
use crate::storing::assemble_and_write_indices;
use eyre::ContextCompat;
use std::collections::btree_map::Entry;
use std::collections::{BTreeMap, BTreeSet};
use std::sync::Arc;
use yama_localcache::BloblogStats;
use yama_midlevel_crypto::chunk_id::ChunkId;
use yama_pile::definitions::{BloblogId, IndexBloblogEntry};
use yama_wormfile::boxed::BoxedWormFileProvider;
/// Repack bloblogs that have this much forgotten space in them (32 MiB).
pub const REPACK_BLOBLOGS_TO_RECLAIM_SPACE_BYTES: u64 = 32 * 1024 * 1024;
/// Defines what a 'small bloblog' is (one that is below a certain size, excluding forgotten bytes).
/// Currently 64 MiB.
pub const SMALL_BLOBLOG_THRESHOLD: u64 = 64 * 1024 * 1024;
/// Clump together small bloblogs when together they would hit or exceed this size (2 GiB).
pub const REPACK_BLOBLOGS_TO_CLUMP_TOGETHER_SMALL_BLOBLOGS_BYTES: u64 = 2 * 1024 * 1024 * 1024;
/// The target size to reach when repacking, in terms of blob bytes.
// NOTE(review): this is 4 MiB, far below SMALL_BLOBLOG_THRESHOLD (64 MiB) and the 2 GiB clumping
// threshold above — possibly meant to be 4 GiB; confirm.
pub const REPACK_TARGET_SIZE: u64 = 4 * 1024 * 1024;
/// The limit size to use when repacking, in terms of blob bytes.
// NOTE(review): 5 MiB; same doubt as for REPACK_TARGET_SIZE — confirm.
pub const REPACK_TARGET_LIMIT: u64 = 5 * 1024 * 1024;
/// Gets bloblogs' stats. Only considers bloblogs referenced by exactly one index, so we don't
/// have to deal with unifying indices.
///
/// The data comes from the local cache; bloblogs that appear in more than one index are left
/// out of the result.
pub async fn get_bloblogs_stats(
    pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
) -> eyre::Result<BTreeMap<BloblogId, BloblogStats>> {
    let mut cache_conn = pwc.localcache.read().await?;
    let indices = cache_conn.list_indices().await?;

    // `Some(stats)`: seen in exactly one index so far; `None`: seen in several.
    let mut seen: BTreeMap<BloblogId, Option<BloblogStats>> = BTreeMap::new();
    for index in indices {
        let stats_in_index = cache_conn.index_bloblog_stats(index).await?;
        for (bloblog, stats) in stats_in_index {
            match seen.entry(bloblog) {
                Entry::Occupied(mut already_seen) => {
                    // only allow one stats per bloblog, then replace with None.
                    already_seen.insert(None);
                }
                Entry::Vacant(first_sighting) => {
                    first_sighting.insert(Some(stats));
                }
            }
        }
    }

    let mut unique = BTreeMap::new();
    for (bloblog, maybe_stats) in seen {
        if let Some(stats) = maybe_stats {
            unique.insert(bloblog, stats);
        }
    }
    Ok(unique)
}
/// Choose some bloblogs to repack. Assumes an updated local cache.
///
/// Only bloblogs referenced by exactly one index will be considered for repacking.
///
/// Two kinds of bloblog are candidates:
/// - those with at least `REPACK_BLOBLOGS_TO_RECLAIM_SPACE_BYTES` of forgotten bytes, and
/// - 'small' bloblogs (at most `SMALL_BLOBLOG_THRESHOLD`), but only when there is more than one
///   of them and together they exceed `REPACK_BLOBLOGS_TO_CLUMP_TOGETHER_SMALL_BLOBLOGS_BYTES`.
///
/// Candidates are grouped greedily: each group starts with the largest remaining candidate and
/// is topped up with the smallest ones until `REPACK_TARGET_SIZE` is reached, never going past
/// `REPACK_TARGET_LIMIT` when adding. A group is kept only if it contains a bloblog that needs
/// space reclaimed, or if the group is larger than the clumping threshold.
///
/// NOTE(review): with the current constants (`REPACK_TARGET_LIMIT` of 5 MiB against a 2 GiB
/// clumping threshold) a group made up only of small bloblogs can hardly ever pass that final
/// check; the repack target constants may be mis-sized — confirm.
pub async fn select_bloblogs_for_repack(
    stats: BTreeMap<BloblogId, BloblogStats>,
) -> eyre::Result<Vec<BTreeMap<BloblogId, BloblogStats>>> {
    let repack_for_space: BTreeSet<BloblogId> = stats
        .iter()
        .filter(|(_, v)| v.forgotten_bytes >= REPACK_BLOBLOGS_TO_RECLAIM_SPACE_BYTES)
        .map(|(&k, _)| k)
        .collect();
    let maybe_repack_for_clumping: BTreeSet<BloblogId> = stats
        .iter()
        .filter(|(_, v)| v.blob_size <= SMALL_BLOBLOG_THRESHOLD)
        .map(|(&k, _)| k)
        .collect();

    let should_repack_for_clumping = maybe_repack_for_clumping.len() > 1
        && maybe_repack_for_clumping
            .iter()
            .map(|bi| stats[bi].blob_size)
            .sum::<u64>()
            > REPACK_BLOBLOGS_TO_CLUMP_TOGETHER_SMALL_BLOBLOGS_BYTES;

    // The clumping candidates go into the work list only: `repack_for_space` must stay limited
    // to the bloblogs that need space reclaimed, because the group check below relies on it.
    let mut to_repack = repack_for_space.clone();
    if should_repack_for_clumping {
        to_repack.extend(maybe_repack_for_clumping);
    }

    // Order the work list by size so we can pop from either end.
    let mut to_repack: BTreeSet<(u64, BloblogId)> = to_repack
        .into_iter()
        .map(|bi| (stats[&bi].blob_size, bi))
        .collect();

    let mut repack_sets = Vec::new();

    while !to_repack.is_empty() {
        let mut new_repack_group = BTreeMap::new();
        let mut new_repack_group_size = 0u64;

        // Start the group with the largest remaining bloblog...
        let (first_sz, first_to_repack) = to_repack.pop_last().unwrap();
        new_repack_group_size += first_sz;
        new_repack_group.insert(first_to_repack, stats[&first_to_repack].clone());

        // ...then top it up with the smallest ones, without going over the limit.
        while new_repack_group_size < REPACK_TARGET_SIZE {
            let Some((first_size, _)) = to_repack.first() else {
                break;
            };
            if new_repack_group_size + *first_size > REPACK_TARGET_LIMIT {
                break;
            }
            let (extra_size, extra_bloblog_id) = to_repack.pop_first().unwrap();
            new_repack_group_size += extra_size;
            new_repack_group.insert(extra_bloblog_id, stats[&extra_bloblog_id].clone());
        }

        // now check the repack group is good
        if new_repack_group
            .keys()
            .any(|bi| repack_for_space.contains(bi))
            || new_repack_group_size > REPACK_BLOBLOGS_TO_CLUMP_TOGETHER_SMALL_BLOBLOGS_BYTES
        {
            repack_sets.push(new_repack_group);
        }
    }

    Ok(repack_sets)
}
/// Repacks the given groups of bloblogs: each group is rewritten into one new bloblog.
///
/// The steps run strictly in this order: new bloblogs are written, new indices are written
/// (covering the new bloblogs as well as the untouched bloblogs of every index involved), and
/// only then are the old indices and finally the old bloblogs deleted.
///
/// Only chunks still listed in the owning index are copied, so forgotten chunks are not carried
/// over into the new bloblog.
///
/// The names of the deletion calls indicate that they need exclusive access to the pile —
/// NOTE(review): confirm the locking requirement with the caller.
pub async fn perform_repack(
    pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
    repack_sets: Vec<BTreeMap<BloblogId, BloblogStats>>,
) -> eyre::Result<()> {
    // 1. Write new bloblogs
    // Indices read so far, keyed by index ID; entries of repacked bloblogs get removed from them.
    let mut indices_buffer = BTreeMap::new();
    // Bloblog entries that will make up the replacement indices.
    let mut index_parts: BTreeMap<BloblogId, IndexBloblogEntry> = BTreeMap::new();
    for repack_set in &repack_sets {
        let mut new_bloblog = pwc.pile.create_bloblog().await?;

        for (old_bloblog_id, old_bloblog_stats) in repack_set {
            let index_id = old_bloblog_stats.in_index;
            // Read each index from the pile at most once.
            if !indices_buffer.contains_key(&index_id) {
                indices_buffer.insert(index_id, pwc.pile.read_index(index_id).await?);
            }
            let index_bloblog_entry = indices_buffer
                .get_mut(&index_id)
                .unwrap()
                .bloblogs
                .remove(&old_bloblog_id)
                .context("bug: no IBE despite rewrite from context of this index")?;
            let mut old_bloblog = pwc.pile.read_bloblog(*old_bloblog_id).await?;
            // Key by offset so the chunks are copied in the order they appear in the old bloblog.
            let locators: BTreeMap<u64, ChunkId> = index_bloblog_entry
                .chunks
                .into_iter()
                .map(|(blob, locator)| (locator.offset, blob))
                .collect();
            for chunk_id in locators.into_values() {
                let chunk = old_bloblog
                    .read_chunk(chunk_id)
                    .await?
                    .context("bug or corrupt bloblog: promised chunk missing")?;
                new_bloblog.write_chunk(chunk_id, &chunk).await?;
            }
        }

        let (_wormpath, new_bloblog_id, new_bloblog_index_info) = new_bloblog.finish().await?;
        index_parts.insert(
            new_bloblog_id,
            IndexBloblogEntry {
                chunks: new_bloblog_index_info,
                forgotten_bytes: 0,
            },
        );
    }

    // 2. Write new indices, but make sure to also write out index entries for unaffected bloblogs
    // that appear in the indices we want to replace shortly.
    for (_, index) in indices_buffer.iter_mut() {
        index_parts.extend(std::mem::take(&mut index.bloblogs));
    }
    assemble_and_write_indices(&pwc, index_parts).await?;

    // 3. Delete old indices
    for index_id in indices_buffer.into_keys() {
        pwc.pile.delete_index_dangerous_exclusive(index_id).await?;
    }

    // 4. Delete old bloblogs
    for repack_group in repack_sets {
        for bloblog_id in repack_group.into_keys() {
            pwc.pile
                .delete_bloblog_dangerous_exclusive(bloblog_id)
                .await?;
        }
    }
    Ok(())
}

91
yama_cli_readme.txt Normal file
View File

@ -0,0 +1,91 @@
`yama init` → init a yama pile right here, right now
creates:
* config
* directory structure
* master keyring (prompts for password)
`--zstd-dict <dict> | --no-zstd-dict`: choose a Zstd dictionary (or lack thereof)
OR
`yama init --sftp` → interactively create SFTP pile
`yama init --s3` → interactively create S3 pile
creates:
* config (remote)
* directory structure (remote)
* master keyring (local + optionally remote too, prompts for password)
* connection information file (local)
`yama keyring`
`inspect <file>.yamakeyring` → print contents of keyring, ask for password if needed
`new|create <new> [--from <src>] [--with <keylist>] [--no-password]`
create a new keyring based on another one. If `--from` not specified, then defaults to the master key in this directory (`master.yamakeyring`).
if `--no-password` is specified, then the new keyring will be unprotected
if `--with` is specified, then it's either a list of keynames to include (e.g. `r_bloblog_contents`, etc)
or a list of opinionated roles (ALL, config, backup, restore, ...?)
e.g. you might give your server a keyring with:
`yama keyring new myserver.yamakeyring --from master.yamakeyring --with backup --no-password` to allow it to create backups but not read from them
`yama store <source file/dir> [<dest pile/pileconnector dir>:]<pointer name>`
Stores a file/directory into Yama, with the given pointer.
If `--stdin` is passed, then the contents to store are actually read from stdin instead and the provided filename is a fake filename for informational purposes only.
Would be suitable for `pg_dump | yama store --stdin mydbname.sql`.
If `--force` is passed, this can overwrite a pointer name.
I expect we will also have `--exclude` and `--exclude-list` options.
I expect we will also have a `--dry-run` option.
`yama extract [<dest pile/pileconnector dir>:]<pointer name>[/path/to/subtree] (--stdout | <target file/dir>)`
Extracts a file/directory from Yama, from the given pointer.
If `--stdout` is passed, writes to stdout, in which case the input must be just one file.
I expect we will also have `--exclude` and `--exclude-list` options.
I expect we will also have a `--dry-run` option.
`yama mount [<dest pile/pileconnector dir>:]<pointer name>[/path/to/subtree] <target file/dir>`
Mount a pointer as a read-only FUSE filesystem.
`yama check`
Checks consistency of the pile. One of the levels must be specified:
`--pointers`|`-1`: checks that all pointers are valid
`--shallow`|`-2`: checks that all pointers' tree nodes point to chunks that exist.
`--intensive`|`-9`: checks that all chunks have the correct hash, that all indices correctly represent the bloblogs, that all pointers point to valid files in the end, ... as much as possible
`yama lsp [[<dest pile/pileconnector dir>:]<glob>]`
(glob defaults to `.:*`)
Lists pointers in the pile.
If `--deleted` is specified, includes deleted pointers that have yet to be vacuumed.
`yama rmp [<dest pile/pileconnector dir>:]<pointer>`
Deletes pointers, or marks them as deleted.
If `--glob` specified, then `<pointer>` is a glob.
If `--now` is specified, an exclusive lock is required to actually delete the pointer.
If `--now` is *not* specified, then the pointer is merely marked as deleted and this only requires a shared lock.
`yama vacuum`
Vacuums the pile, reclaiming disk space. Holds an exclusive lock over the pile.
Does things like:
- (--pointers) clean up deleted pointers that need to be actually deleted
- (--sweep) scans all pointers to discover all the chunks that are present in bloblogs but not used, then removes them from the indices (possibly slow, but necessary to actually make bloblog repacking possible)
- (--indices) writes new indices to replace existing indices, if the existing indices are not space-efficient
- (--bloblogs) repacks bloblogs that aren't space-efficient, removing unindexed blobs in the process
`--all` for everything.

View File

@ -0,0 +1,16 @@
[package]
name = "yama_localcache"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
sqlx = { version = "0.6.3", features = ["sqlite", "runtime-tokio-rustls"] }
tracing = "0.1.37"
eyre = "0.6.8"
tokio = "1.27.0"
yama_pile = { path = "../yama_pile" }
yama_midlevel_crypto = { path = "../yama_midlevel_crypto" }
itertools = "0.10.5"
async-backtrace = "0.2.6"

7
yama_localcache/dev_db.sh Executable file
View File

@ -0,0 +1,7 @@
#!/bin/bash
# Set up the development SQLite database (testdb.sqlite, stored next to this
# script) and run the sqlx migrations against it.
set -eu

script_dir="$(dirname "$0")"
database_url="sqlite:${script_dir}/testdb.sqlite"

sqlx db create --database-url "${database_url}"
sqlx migrate run --database-url "${database_url}"

View File

@ -0,0 +1,39 @@
-- Create a local cache of indices.
CREATE TABLE indices (
index_short_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
index_sha256 TEXT NOT NULL
);
CREATE UNIQUE INDEX indices_index_sha256 ON indices(index_sha256);
CREATE TABLE bloblogs (
bloblog_short_id INTEGER PRIMARY KEY NOT NULL,
bloblog_sha256 TEXT NOT NULL
);
CREATE UNIQUE INDEX bloblogs_bloblog_sha256 ON bloblogs(bloblog_sha256);
-- Track the relationship between indices and bloblogs
CREATE TABLE indices_bloblogs (
index_short_id INTEGER NOT NULL REFERENCES indices(index_short_id),
bloblog_short_id INTEGER NOT NULL REFERENCES bloblogs(bloblog_short_id),
forgotten_bytes INTEGER NOT NULL,
PRIMARY KEY (index_short_id, bloblog_short_id)
);
CREATE TABLE blobs (
chunk_id TEXT NOT NULL,
bloblog_short_id INTEGER NOT NULL,
index_short_id INTEGER NOT NULL,
offset INTEGER NOT NULL,
size INTEGER NOT NULL,
PRIMARY KEY (chunk_id, bloblog_short_id, index_short_id),
FOREIGN KEY (index_short_id, bloblog_short_id) REFERENCES indices_bloblogs(index_short_id, bloblog_short_id)
);
CREATE INDEX blobs_bloblog_short_id ON blobs(bloblog_short_id);
CREATE INDEX blobs_index_short_id ON blobs(index_short_id);
CREATE TABLE indices_supersede (
superseded_sha256 TEXT NOT NULL,
successor_sha256 TEXT NOT NULL REFERENCES indices(index_sha256),
PRIMARY KEY (superseded_sha256, successor_sha256)
);

423
yama_localcache/src/lib.rs Normal file
View File

@ -0,0 +1,423 @@
use eyre::Context;
use itertools::Itertools;
use sqlx::pool::PoolConnection;
use sqlx::sqlite::{
SqliteConnectOptions, SqliteJournalMode, SqlitePoolOptions, SqliteRow, SqliteSynchronous,
};
use sqlx::{query, Connection, Row, Sqlite, SqlitePool};
use std::collections::{BTreeMap, BTreeSet};
use std::path::Path;
use std::str::FromStr;
use std::sync::Arc;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use yama_midlevel_crypto::chunk_id::ChunkId;
use yama_pile::definitions::{BlobLocator, BloblogId, Index, IndexId};
/// Handle to the local SQLite-backed cache of pile indices.
///
/// Both fields are `Arc`s, so clones of a `Store` share the same connection pool and the
/// same writer semaphore.
#[derive(Clone)]
pub struct Store {
/// Pool of SQLite connections to the cache database.
pool: Arc<SqlitePool>,
/// Semaphore created with a single permit; a permit is taken for every read-write
/// connection that is handed out.
writer_semaphore: Arc<Semaphore>,
}
/// A connection checked out of a [`Store`].
///
/// The `RW` const parameter records in the type whether the connection may write:
/// the mutating methods are only implemented for `StoreConnection<true>`, whereas the
/// query methods are implemented for every `RW`.
pub struct StoreConnection<const RW: bool> {
/// The underlying 'connection'.
conn: PoolConnection<Sqlite>,
/// Permit to write. Only here so that it is dropped at the correct time.
/// `Some` when the connection was obtained for writing, `None` otherwise.
#[allow(dead_code)]
writer_permit: Option<OwnedSemaphorePermit>,
}
/// Maximum number of connections the SQLite pool is configured with (see `Store::new`).
const MAX_SQLITE_CONNECTIONS: u32 = 16;
impl Store {
pub async fn new(path: &Path) -> eyre::Result<Store> {
let pool = SqlitePoolOptions::new()
.max_connections(MAX_SQLITE_CONNECTIONS)
.connect_with(
SqliteConnectOptions::new()
.create_if_missing(true)
.journal_mode(SqliteJournalMode::Wal)
.synchronous(SqliteSynchronous::Normal)
.foreign_keys(true)
.filename(path),
)
.await?;
let store = Store {
pool: Arc::new(pool),
writer_semaphore: Arc::new(Semaphore::new(1)),
};
let mut conn = store.pool.acquire().await?;
// This will run the necessary migrations.
sqlx::migrate!("./migrations").run(&mut conn).await?;
Ok(store)
}
async fn txn<const RW: bool>(&self) -> eyre::Result<StoreConnection<RW>> {
let writer_permit = if RW {
Some(self.writer_semaphore.clone().acquire_owned().await?)
} else {
None
};
let conn = self.pool.acquire().await?;
Ok(StoreConnection {
conn,
writer_permit,
})
}
pub async fn read(&self) -> eyre::Result<StoreConnection<false>> {
self.txn().await
}
pub async fn write(&self) -> eyre::Result<StoreConnection<true>> {
self.txn().await
}
}
impl StoreConnection<true> {
/// Record an index in the local index cache.
///
/// Everything happens inside a single database transaction:
/// 1. if the index is already cached, nothing is changed;
/// 2. otherwise the index is registered, each index it supersedes is noted, and any blob
///    entries cached for those superseded indices are removed;
/// 3. unless this index has itself already been recorded as superseded, its bloblogs and
///    blob locations are inserted.
///
/// Returns true iff the index was new.
pub async fn apply_index(
    &mut self,
    index_id: IndexId,
    index: Arc<Index>,
) -> eyre::Result<bool> {
    let index_id_txt = index_id.to_string();
    self.conn.transaction(|txn| Box::pin(async move {
        // An index that is already cached needs no further work.
        let already_cached = query!("
SELECT 1 AS one FROM indices WHERE index_sha256 = ?
", index_id_txt).fetch_optional(&mut *txn).await?.is_some();
        if already_cached {
            return Ok(false);
        }

        let inserted_index = query!("
INSERT INTO indices (index_sha256)
VALUES (?)
RETURNING index_short_id
", index_id_txt).fetch_one(&mut *txn).await?;
        let index_short_id = inserted_index.index_short_id;

        for superseded_id in &index.supersedes {
            let superseded_txt = superseded_id.to_string();
            query!("
INSERT INTO indices_supersede (superseded_sha256, successor_sha256)
VALUES (?, ?)
", superseded_txt, index_id_txt).execute(&mut *txn).await?;

            let cached_superseded = query!("
SELECT index_short_id FROM indices WHERE index_sha256 = ?
", superseded_txt).fetch_optional(&mut *txn).await?;
            if let Some(row) = cached_superseded {
                // Drop the blob entries of the superseded index: they must no longer be
                // relied upon, and removing them also frees space.
                query!("
DELETE FROM blobs WHERE index_short_id = ?
", row.index_short_id).execute(&mut *txn).await?;
            }
        }

        // If a previously-applied index already supersedes this one, stop before adding blobs.
        let is_superseded = query!("
SELECT 1 as _yes FROM indices_supersede WHERE superseded_sha256 = ?",
            index_id_txt
        ).fetch_optional(&mut *txn).await?.is_some();
        if is_superseded {
            return Ok(true);
        }

        for (bloblog_sha256, index_bloblog_entry) in &index.bloblogs {
            let bloblog_sha256_txt = bloblog_sha256.to_string();

            // Reuse the short ID of a known bloblog, or allocate one for a new bloblog.
            let known_bloblog = query!("
SELECT bloblog_short_id FROM bloblogs WHERE bloblog_sha256 = ?
", bloblog_sha256_txt).fetch_optional(&mut *txn).await?;
            let bloblog_short_id = if let Some(row) = known_bloblog {
                row.bloblog_short_id
            } else {
                query!("
INSERT INTO bloblogs (bloblog_sha256)
VALUES (?)
RETURNING bloblog_short_id
", bloblog_sha256_txt).fetch_one(&mut *txn).await?.bloblog_short_id
            };

            let forgotten_bytes = index_bloblog_entry.forgotten_bytes as i64;
            query!("
INSERT INTO indices_bloblogs (index_short_id, bloblog_short_id, forgotten_bytes)
VALUES (?, ?, ?)
", index_short_id, bloblog_short_id, forgotten_bytes)
                .execute(&mut *txn)
                .await?;

            for (chunk_id, chunk_locator) in index_bloblog_entry.chunks.iter() {
                let chunk_id_txt = chunk_id.to_string();
                let blob_offset = chunk_locator.offset as i64;
                let blob_len = chunk_locator.length as i64;
                query!("
INSERT INTO blobs (index_short_id, bloblog_short_id, chunk_id, offset, size)
VALUES (?, ?, ?, ?, ?)
", index_short_id, bloblog_short_id, chunk_id_txt, blob_offset, blob_len).execute(&mut *txn).await?;
            }
        }

        Ok(true)
    })).await
}
/// Delete an index from the cache, e.g. because the index was deleted from the pile.
///
/// Within one transaction this removes, in order: the supersede records in which this index
/// is the successor, the index's blob entries, its index-to-bloblog links, and finally the
/// index row itself. Rows in `bloblogs` are left in place.
///
/// # Errors
/// Fails if the index is not present in the cache, or if any of the statements fails.
pub async fn delete_index(&mut self, index_id: IndexId) -> eyre::Result<()> {
    self.conn
        .transaction(|txn| {
            Box::pin(async move {
                let index_id_txt = index_id.to_string();

                // Supersede records refer to the index by its SHA-256, so they are removed
                // before the index row is.
                query!(
                    "
DELETE FROM indices_supersede WHERE successor_sha256 = ?
",
                    index_id_txt
                )
                .execute(&mut *txn)
                .await?;

                // `fetch_one` errors when there is no such row; attach the index ID so the
                // failure is attributable rather than a bare 'row not found'.
                let index_short_id = query!(
                    "
SELECT index_short_id FROM indices WHERE index_sha256 = ?
",
                    index_id_txt
                )
                .fetch_one(&mut *txn)
                .await
                .with_context(|| {
                    format!("failed to find index {index_id_txt} in the local cache")
                })?
                .index_short_id;

                query!(
                    "
DELETE FROM blobs WHERE index_short_id = ?
",
                    index_short_id
                )
                .execute(&mut *txn)
                .await?;

                query!(
                    "
DELETE FROM indices_bloblogs WHERE index_short_id = ?
",
                    index_short_id
                )
                .execute(&mut *txn)
                .await?;

                query!(
                    "
DELETE FROM indices WHERE index_short_id = ?
",
                    index_short_id
                )
                .execute(&mut *txn)
                .await?;

                Ok::<_, eyre::Report>(())
            })
        })
        .await?;
    Ok(())
}
}
impl<const RW: bool> StoreConnection<RW> {
/// Looks up where a single chunk is stored.
///
/// Returns `Ok(None)` when the cache has no blob entry for `chunk_id`. Otherwise returns
/// the bloblog together with the offset and length of the blob inside it. If several
/// entries exist, only one is returned (`LIMIT 1`).
pub async fn locate_chunk(
    &mut self,
    chunk_id: ChunkId,
) -> eyre::Result<Option<(BloblogId, BlobLocator)>> {
    let chunk_id_text = chunk_id.to_string();
    let found = query!(
        "
SELECT bl.bloblog_sha256, b.offset, b.size
FROM blobs b
JOIN bloblogs bl USING (bloblog_short_id)
WHERE b.chunk_id = ?
LIMIT 1
",
        chunk_id_text
    )
    .fetch_optional(&mut *self.conn)
    .await?;

    let row = match found {
        Some(row) => row,
        None => return Ok(None),
    };

    let bloblog_id = BloblogId::try_from(row.bloblog_sha256.as_str())
        .with_context(|| format!("failed to decode bloblog ID: {:?}", row.bloblog_sha256))?;
    let locator = BlobLocator {
        offset: row.offset.try_into().context("offset too big")?,
        length: row.size.try_into().context("size too big")?,
    };
    Ok(Some((bloblog_id, locator)))
}
/// Returns all chunk locations.
/// If a chunk does not exist, it is just not returned in the output map.
///
/// The chunk IDs are looked up in batches of 64, each batch being a single
/// `... WHERE b.chunk_id IN (?, ?, ...)` query with one bound parameter per chunk ID.
///
/// # Errors
/// Fails if a query fails, if a column cannot be read from a row, if a stored chunk or
/// bloblog ID cannot be decoded, or if a stored offset/size does not fit the locator's
/// field type.
pub async fn locate_chunks(
    &mut self,
    chunk_ids: &BTreeSet<ChunkId>,
) -> eyre::Result<BTreeMap<ChunkId, (BloblogId, BlobLocator)>> {
    let mut out = BTreeMap::new();
    for batch in &chunk_ids.iter().chunks(64) {
        let chunk_id_texts: Vec<String> = batch.map(|ci| ci.to_string()).collect();
        // One `?` placeholder per chunk ID, comma-separated. Batches are never empty, so
        // slicing off the leading comma cannot go out of bounds.
        let query_param_str = format!("({})", &",?".repeat(chunk_id_texts.len())[1..]);
        let sql = format!(
            "
SELECT b.chunk_id, bl.bloblog_sha256, b.offset, b.size
FROM blobs b
JOIN bloblogs bl USING (bloblog_short_id)
WHERE b.chunk_id IN {query_param_str}
"
        );
        let mut q = query(&sql);
        for chunk_id in &chunk_id_texts {
            q = q.bind(chunk_id);
        }
        let rows = q
            .map(|row: SqliteRow| {
                // `try_get` reports a column that cannot be read as an error instead of
                // panicking.
                Ok::<_, eyre::Report>((
                    ChunkId::from_str(row.try_get::<&str, _>(0)?)
                        .context("failed to decode ChunkId in local cache")?,
                    BloblogId::try_from(row.try_get::<&str, _>(1)?)
                        .context("failed to decode BloblogId in local cache")?,
                    row.try_get::<i64, _>(2)?,
                    row.try_get::<i64, _>(3)?,
                ))
            })
            .fetch_all(&mut *self.conn)
            .await?;
        for row in rows {
            let (chunk_id, bloblog_id, offset, size) = row?;
            out.insert(
                chunk_id,
                (
                    bloblog_id,
                    // Checked conversions, matching `locate_chunk`.
                    BlobLocator {
                        offset: offset.try_into().context("offset too big")?,
                        length: size.try_into().context("size too big")?,
                    },
                ),
            );
        }
    }
    Ok(out)
}
/// Returns the IDs of all indices currently recorded in the cache.
///
/// Fails if the query fails or if any stored ID cannot be decoded as an `IndexId`.
pub async fn list_indices(&mut self) -> eyre::Result<BTreeSet<IndexId>> {
    let decoded_rows = query!(
        "
SELECT index_sha256 FROM indices
"
    )
    .map(|row| {
        IndexId::try_from(row.index_sha256.as_ref())
            .context("failed to decode IndexId in local cache")
    })
    .fetch_all(&mut *self.conn)
    .await?;

    // Stop at the first ID that failed to decode.
    let mut index_ids = BTreeSet::new();
    for decoded in decoded_rows {
        index_ids.insert(decoded?);
    }
    Ok(index_ids)
}
/// Returns the IDs of all bloblogs currently recorded in the cache.
///
/// Fails if the query fails or if any stored ID cannot be decoded as a `BloblogId`.
pub async fn list_bloblogs(&mut self) -> eyre::Result<BTreeSet<BloblogId>> {
    let decoded_rows = query!(
        "
SELECT bloblog_sha256 FROM bloblogs
"
    )
    .map(|row| {
        BloblogId::try_from(row.bloblog_sha256.as_str())
            .context("failed to decode BloblogId in local cache")
    })
    .fetch_all(&mut *self.conn)
    .await?;

    // Stop at the first ID that failed to decode.
    let mut bloblog_ids = BTreeSet::new();
    for decoded in decoded_rows {
        bloblog_ids.insert(decoded?);
    }
    Ok(bloblog_ids)
}
/// Returns true iff the cache holds no blob entry for `chunk_id`.
pub async fn is_chunk_new(&mut self, chunk_id: ChunkId) -> eyre::Result<bool> {
    let chunk_id_text = chunk_id.to_string();
    let existing_entry = query!(
        "
SELECT 1 AS _yes FROM blobs WHERE chunk_id = ?
",
        chunk_id_text
    )
    .fetch_optional(&mut *self.conn)
    .await?;
    Ok(existing_entry.is_none())
}
/// Returns the IDs of all chunks that have a blob entry cached under the given index.
///
/// Fails if the query fails or if any stored ID cannot be decoded as a `ChunkId`.
pub async fn list_chunks_in_index(
    &mut self,
    index_id: IndexId,
) -> eyre::Result<BTreeSet<ChunkId>> {
    let index_id_text = index_id.to_string();
    let decoded_rows = query!(
        "
SELECT chunk_id AS \"chunk_id!\" FROM indices i
JOIN blobs b USING (index_short_id)
WHERE index_sha256 = ?
",
        index_id_text
    )
    .map(|row| {
        ChunkId::from_str(&row.chunk_id).context("failed to decode ChunkId in local cache")
    })
    .fetch_all(&mut *self.conn)
    .await?;

    // Stop at the first ID that failed to decode.
    let mut chunk_ids = BTreeSet::new();
    for decoded in decoded_rows {
        chunk_ids.insert(decoded?);
    }
    Ok(chunk_ids)
}
/// Summarises, per bloblog, what the given cached index records about that bloblog.
///
/// The result maps each bloblog linked to `index_id` to a [`BloblogStats`] holding the
/// number of cached blob entries (`COUNT(size)`), their total size (`SUM(size)`) and the
/// `forgotten_bytes` value stored for that (index, bloblog) pair.
///
/// Fails if the query fails or if a stored bloblog ID cannot be decoded.
///
/// NOTE(review): every join here is a `LEFT JOIN`, so an index without any
/// `indices_bloblogs` rows presumably still yields one row whose bloblog columns are NULL;
/// confirm how that row decodes.
/// NOTE(review): the `as` casts below are unchecked (`num_chunks` is narrowed to `u32`).
pub async fn index_bloblog_stats(
&mut self,
index_id: IndexId,
) -> eyre::Result<BTreeMap<BloblogId, BloblogStats>> {
let index_id_text = index_id.to_string();
// The `!: i64` suffixes are sqlx column overrides forcing non-null `i64` for the aggregates.
let row_results = query!("
SELECT bloblog_sha256 AS bloblog_id, ib.forgotten_bytes AS forgotten_bytes, COUNT(size) AS \"num_chunks!: i64\", SUM(size) AS \"num_bytes!: i64\" FROM indices i
LEFT JOIN indices_bloblogs ib USING (index_short_id)
LEFT JOIN bloblogs b USING (bloblog_short_id)
LEFT JOIN blobs USING (index_short_id, bloblog_short_id)
WHERE index_sha256 = ?
GROUP BY bloblog_sha256
", index_id_text)
// Turn each aggregate row into a (bloblog ID, stats) pair; decoding the ID may fail.
.map(|row| {
Ok((BloblogId::try_from(row.bloblog_id.as_ref())?, BloblogStats {
in_index: index_id,
blob_size: row.num_bytes as u64,
forgotten_bytes: row.forgotten_bytes as u64,
num_chunks: row.num_chunks as u32,
}))
})
.fetch_all(&mut *self.conn)
.await?;
// Collecting into a `Result` stops at the first row that failed to decode.
row_results.into_iter().collect()
}
}
/// Figures about one bloblog as recorded by one index in the local cache, as produced by
/// `StoreConnection::index_bloblog_stats`.
#[derive(Clone, Debug)]
pub struct BloblogStats {
/// The index these figures were taken from.
pub in_index: IndexId,
/// Sum of the sizes of the blob entries cached for this (index, bloblog) pair.
pub blob_size: u64,
/// The `forgotten_bytes` value stored for this (index, bloblog) pair.
pub forgotten_bytes: u64,
/// Number of blob entries cached for this (index, bloblog) pair.
pub num_chunks: u32,
}

View File

@ -0,0 +1,35 @@
[package]
name = "yama_midlevel_crypto"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
serde = { version = "1.0.159", features = ["derive"] }
ciborium = "0.2.0"
blake3 = "1.3.3"
# Unauthenticated symmetric seekable stream constructions
chacha20 = "0.9.1"
x25519-dalek = { version = "2.0.0-rc.2", features = ["serde", "static_secrets"] }
poly1305 = "0.8.0"
ed25519-dalek = { version = "2.0.0-rc.2", features = ["serde", "rand_core"] }
# Hybrid quantum-resistant asymmetric 'key encapsulation' mechanisms
pqc_kyber = { version = "0.5.0", features = ["kyber1024"] }
#alkali = "0.3.0"
rand = "0.8.5"
eyre = "0.6.8"
# 0.12.3+zstd.1.5.2
zstd = "0.12.3"
hex = "0.4.3"
argon2 = { version = "0.4.1", default-features = false, features = ["alloc", "std"] }

Some files were not shown because too many files have changed in this diff Show More