Compare commits

...

83 Commits

Author SHA1 Message Date
Olivier 'reivilibre' 565c99cf8c Update flake and fix it 2024-05-08 20:41:28 +01:00
Olivier 'reivilibre' b57dbad890 Simplify flake lock
ci/woodpecker/push/build Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details
2023-04-01 16:57:04 +01:00
Olivier 'reivilibre' 9001177143 Batch up chunk deletions in an attempt to make vacuuming more performant
ci/woodpecker/push/build Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details
2022-11-28 21:03:07 +00:00
Olivier 'reivilibre' c9d64b2962 Make sure to flush + add some error contexts
ci/woodpecker/push/build Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details
2022-11-21 21:23:38 +00:00
Olivier 'reivilibre' 50ff9bb36a Fix including trailing empty line as pointer name
ci/woodpecker/push/build Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details
2022-11-20 22:11:05 +00:00
Olivier 'reivilibre' 7e41408815 Add test for incremental backup with mid delete
ci/woodpecker/push/build Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details
Just for validation that delete does the right thing
2022-11-20 20:58:45 +00:00
Olivier 'reivilibre' 4072c5ae82 Fix parent not being integrated before being used to differentiate whilst removing a pointer
ci/woodpecker/push/build Pipeline is pending Details
ci/woodpecker/push/release Pipeline is pending Details
2022-11-20 20:42:26 +00:00
Olivier 'reivilibre' d3fe111a06 Replace debug rmp with new implementation
ci/woodpecker/push/build Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details
2022-11-20 19:44:21 +00:00
Olivier 'reivilibre' 6e1e173cb6 Implement datman prune 2022-11-20 19:43:20 +00:00
Olivier 'reivilibre' fcc79ca95d Hopefully fix descriptors to compare in test
ci/woodpecker/push/build Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details
2022-11-20 10:02:27 +00:00
Olivier 'reivilibre' c1de1341ef Tweak wording 2022-11-20 10:02:13 +00:00
Olivier 'reivilibre' e85c606c95 Make a no-op compaction really a no-op compaction
ci/woodpecker/push/build Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details
2022-11-20 08:57:22 +00:00
Olivier 'reivilibre' 34c619ef41 Fix compact thresholds in tests to demonstrate what we need
ci/woodpecker/push/build Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details
2022-11-19 17:43:27 +00:00
Olivier 'reivilibre' b9dce3ddfc rustfmt 2022-11-19 17:42:09 +00:00
Olivier 'reivilibre' 52202874f2 Update images to remove deprecated ones
ci/woodpecker/push/build Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details
2022-11-19 16:47:25 +00:00
Olivier 'reivilibre' 69656131af Fix linter
ci/woodpecker/push/build Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details
2022-11-19 16:35:36 +00:00
Olivier 'reivilibre' cc93997230 Linting 2022-11-19 16:35:33 +00:00
Olivier 'reivilibre' 41248fe396 Add tests for yama compact 2022-11-19 16:35:24 +00:00
Olivier 'reivilibre' e7eb9ef288 Update nix shell to have python 2022-11-19 16:33:14 +00:00
Olivier 'reivilibre' b5e9e55cad Add yama compact command 2022-11-19 15:49:09 +00:00
Olivier 'reivilibre' cf502b7f7e rustfmt 2022-11-19 15:28:36 +00:00
Olivier 'reivilibre' 58c5c3f039 Add compaction logic 2022-11-19 15:27:41 +00:00
Olivier 'reivilibre' 30b261d172 Add Nix shell for Rust devel 2022-11-19 13:13:19 +00:00
Olivier 'reivilibre' 0811c11c48 Add ability to extract subset of files from yama
ci/woodpecker/push/build Pipeline is pending Details
ci/woodpecker/push/release Pipeline is pending Details
2022-10-04 20:21:23 +01:00
Olivier 'reivilibre' aa2722607e Skip directories with .datmanskip files
ci/woodpecker/push/build Pipeline was successful Details
ci/woodpecker/push/release Pipeline was successful Details
ci/woodpecker/tag/build Pipeline was successful Details
ci/woodpecker/tag/release Pipeline was successful Details
2022-07-25 10:35:47 +01:00
Olivier 'reivilibre' 8612804298 Do short exclusions for remote backups (also bump version as protocol version incompatible) 2022-07-25 10:35:47 +01:00
Olivier 'reivilibre' 080875bfce Add debug log for not descending 2022-07-25 10:35:47 +01:00
Olivier 'reivilibre' 098895d913 Do short exclusions for local backups 2022-07-25 10:35:47 +01:00
Olivier 'reivilibre' bd5e18bc9f Extract load_labelling_rules 2022-07-25 10:35:47 +01:00
Olivier 'reivilibre' e25e92b273 Introduce 'exclusions' parameter to scanner 2022-07-25 10:35:47 +01:00
Olivier 'reivilibre' 4aa1948350 Sort chunk IDs by hint to make pull more efficient
ci/woodpecker/push/build Pipeline was successful Details
ci/woodpecker/push/release Pipeline was successful Details
2022-07-23 22:43:35 +01:00
Olivier 'reivilibre' ee9ca73224 Put reader bloblogs in an LRU map to prevent hitting open FD limit 2022-07-23 22:38:29 +01:00
Olivier 'reivilibre' 05c6d3e662 Ignore non-UTF-8 file names instead of panicking
ci/woodpecker/push/build Pipeline is pending Details
ci/woodpecker/push/release Pipeline is pending Details
ci/woodpecker/tag/build Pipeline failed Details
ci/woodpecker/tag/release Pipeline was successful Details
2022-07-23 21:55:04 +01:00
Olivier 'reivilibre' 0b84c793bf Flush chunk pointers in one transaction
ci/woodpecker/push/build Pipeline was successful Details
ci/woodpecker/push/release Pipeline was successful Details
ci/woodpecker/tag/build Pipeline was successful Details
ci/woodpecker/tag/release Pipeline was successful Details
2022-06-15 21:13:58 +01:00
Olivier 'reivilibre' eef22e7009 Use WAL mode in SQLite bloblogs 2022-06-15 21:13:56 +01:00
Olivier 'reivilibre' 332563f5a7 Bump version
ci/woodpecker/push/build Pipeline was successful Details
ci/woodpecker/push/release Pipeline was successful Details
ci/woodpecker/tag/build Pipeline was successful Details
ci/woodpecker/tag/release Pipeline was successful Details
2022-06-14 22:55:04 +01:00
Olivier 'reivilibre' 14be0ef0a3 Add a version check to the pull protocol
ci/woodpecker/push/build Pipeline was successful Details
ci/woodpecker/push/release Pipeline was successful Details
2022-06-14 22:54:32 +01:00
Olivier 'reivilibre' 375d68eb0e Don't forget the terminator
ci/woodpecker/push/build Pipeline is pending Details
ci/woodpecker/push/release Pipeline was successful Details
2022-06-14 19:57:34 +01:00
Olivier 'reivilibre' fc29c6fca1 Add some writer flushes that are probably necessary 2022-06-14 19:57:14 +01:00
Olivier 'reivilibre' d384b1bcbd Write down basic implementation of datman pull 2022-06-14 19:56:45 +01:00
Olivier 'reivilibre' e357547777 Glue together an implementation for the pull responder 2022-06-14 19:54:21 +01:00
Olivier 'reivilibre' c83e2be66d Flesh out both sides 2022-06-14 08:54:17 +01:00
Olivier 'reivilibre' a24778209e Finish off the basic offering side implementation 2022-06-13 23:27:43 +01:00
Olivier 'reivilibre' bb8fc355f0 Lay down the basic structure of push/pull offerer 2022-06-13 23:15:46 +01:00
Olivier 'reivilibre' 14fc925dbc Make existing push/pull legacy 2022-06-13 23:15:34 +01:00
Olivier 'reivilibre' 9e51c2428e Report the size used by the pile itself in the report 2022-06-13 23:15:34 +01:00
Olivier 'reivilibre' 01c98cb415 Aggregate reports by month and reorder sections 2022-06-04 12:33:14 +01:00
Olivier 'reivilibre' 3637b00f38 Add lz4 to path to ensure the backup helpers can work
ci/woodpecker/push/build Pipeline was successful Details
ci/woodpecker/push/release Pipeline was successful Details
2022-06-03 11:42:08 +01:00
Olivier 'reivilibre' 25b1e14d84 Make it less strict?
ci/woodpecker/push/build Pipeline was successful Details
ci/woodpecker/push/release Pipeline was successful Details
2022-06-02 22:26:24 +01:00
Olivier 'reivilibre' ef70e0998e Attempt to package up the helpers alongside yama and datman
ci/woodpecker/push/build Pipeline was successful Details
ci/woodpecker/push/release Pipeline was successful Details
2022-06-02 20:30:22 +01:00
Olivier 'reivilibre' 5fd9a72de8 Convert helpers to Poetry to make them easier to package for NixOS 2022-06-02 20:29:54 +01:00
Olivier 'reivilibre' 4244fb88a7 Update version
ci/woodpecker/push/build Pipeline was successful Details
ci/woodpecker/push/release Pipeline was successful Details
ci/woodpecker/tag/build Pipeline was successful Details
ci/woodpecker/tag/release Pipeline was successful Details
2022-06-01 09:51:56 +01:00
Olivier 'reivilibre' d62e864bee Don't crash when backing up the root directory because it has no name 2022-06-01 09:16:05 +01:00
Olivier 'reivilibre' 001d626ccd Update version (protocol changed too)
ci/woodpecker/push/build Pipeline was successful Details
ci/woodpecker/push/release Pipeline was successful Details
ci/woodpecker/tag/build Pipeline was successful Details
ci/woodpecker/tag/release Pipeline was successful Details
2022-05-31 09:45:24 +01:00
Olivier 'reivilibre' af553d1fed Only scan one filesystem by default (can configure 'cross_filesystems' if needed)
ci/woodpecker/push/build Pipeline was successful Details
ci/woodpecker/push/release Pipeline was successful Details
2022-05-31 09:39:25 +01:00
Olivier 'reivilibre' e8fc448ace Remove needless bare_cnr crate
ci/woodpecker/push/build Pipeline was successful Details
ci/woodpecker/push/release Pipeline was successful Details
ci/woodpecker/tag/build Pipeline was successful Details
ci/woodpecker/tag/release Pipeline was successful Details
2022-05-30 23:11:16 +01:00
Olivier 'reivilibre' 4216243dcf Remove src input to try and avoid getting told off about relative paths
ci/woodpecker/push/build Pipeline was successful Details
ci/woodpecker/push/release Pipeline was successful Details
2022-05-30 22:46:54 +01:00
Olivier 'reivilibre' 1cd0b9887a Add a Nix flake
ci/woodpecker/push/build Pipeline was successful Details
ci/woodpecker/push/release Pipeline was successful Details
2022-05-29 17:43:03 +01:00
Olivier 'reivilibre' ec8c5ff42d Add times and disk space to the report
ci/woodpecker/push/build Pipeline was successful Details
ci/woodpecker/push/release Pipeline was successful Details
2022-05-29 17:24:46 +01:00
Olivier 'reivilibre' 948ca3f2b5 Add a command to show a report of the Datman system 2022-05-29 13:35:15 +01:00
Olivier 'reivilibre' 438af9164e Guard the Requester so that the Responder can't do whatever it wants 2022-05-29 09:45:02 +01:00
Olivier 'reivilibre' e1c6d31ee3 Fix: remember to flush 2022-05-29 09:36:54 +01:00
Olivier 'reivilibre' ac97957394 Remove obsolete comment
ci/woodpecker/push/build Pipeline was successful Details
ci/woodpecker/push/release Pipeline was successful Details
2022-05-29 09:12:51 +01:00
Olivier 'reivilibre' 23e112b1be Automatically pull updates for images in CI
ci/woodpecker/push/build Pipeline was successful Details
ci/woodpecker/push/release Pipeline was successful Details
2022-05-29 08:43:03 +01:00
Olivier 'reivilibre' 8692d83510 Push things around so that the chunking process doesn't need to know about pointers
ci/woodpecker/push/build Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details
2022-05-29 00:10:53 +01:00
Olivier 'reivilibre' f1b73b28ee Split up store_fully so that we can avoid pointer ops in a restricted context 2022-05-28 23:53:09 +01:00
Olivier 'reivilibre' db0d9dd493 Add a basic access guard 2022-05-28 23:31:25 +01:00
Olivier 'reivilibre' 081a1922c7 Separate out the pointer operations 2022-05-28 23:30:52 +01:00
Olivier 'reivilibre' 23b352f936 Remove non-pipelined storage 2022-05-28 23:30:52 +01:00
Olivier 'reivilibre' d82176075a Bump up Rust version
ci/woodpecker/push/build Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details
2022-05-28 22:53:35 +01:00
Olivier 'reivilibre' 1803946b4a Disable ARM64 runners for now 2022-05-28 22:47:59 +01:00
Olivier 'reivilibre' 760626d01e Add operation to describe the pipeline
ci/woodpecker/push/build Pipeline failed Details
ci/woodpecker/push/release Pipeline was successful Details
2022-05-28 22:44:36 +01:00
Olivier 'reivilibre' f4debbc9fe Fix up clap changes 2022-05-28 22:27:33 +01:00
Olivier 'reivilibre' f9c0d814c2 Update deps 2022-05-28 22:27:33 +01:00
Olivier 'reivilibre' a06b393630 Make some remarks about where I'd like to go 2022-05-28 22:14:43 +01:00
Olivier 'reivilibre' 56dafc6b5f Remove bare_cnr_ssh crate since it does depend a little bit 2022-05-28 21:43:20 +01:00
Olivier 'reivilibre' 0a9cb559bd Remove transport module as it was obsolete 2022-05-28 21:22:25 +01:00
Olivier 'reivilibre' 00b06963d7 Add tests and useful functionality 2022-05-28 21:21:40 +01:00
Olivier 'reivilibre' b659a5ddac Introduce channel handles that can be passed in serde messages 2022-05-28 20:40:04 +01:00
Olivier 'reivilibre' 183f365032 Start a half decent Bare CnR crate 2022-05-28 19:47:08 +01:00
Olivier 'reivilibre' d0ed984dca Upgrade some dependencies 2022-05-28 13:21:56 +01:00
Olivier 'reivilibre' 675c8884f9 Start the v0.6.0-alpha.1 'next' branch
ci/woodpecker/push/build Pipeline was successful Details
ci/woodpecker/push/release Pipeline was successful Details
2022-05-28 13:08:56 +01:00
Olivier 'reivilibre' 60cf81c59c Use plain old Rust image rather than sccache image
continuous-integration/drone the build was successful Details
2022-01-14 21:17:51 +00:00
58 changed files with 3930 additions and 880 deletions

2
.envrc Normal file
View File

@ -0,0 +1,2 @@
use nix

2
.gitignore vendored
View File

@ -15,3 +15,5 @@
__pycache__ __pycache__
/datman-helper-postgres/datman_helper_postgres.egg-info /datman-helper-postgres/datman_helper_postgres.egg-info
/datman-helper-mysql/datman_helper_mysql.egg-info /datman-helper-mysql/datman_helper_mysql.egg-info
/result

View File

@ -5,27 +5,19 @@ platform: linux/amd64
pipeline: pipeline:
unitTests: unitTests:
image: "docker.bics.ga/rei_ci/rust-sccache:latest-amd64" image: "rust:1.65.0"
pull: true
commands: commands:
- DEBIAN_FRONTEND=noninteractive apt-get -qq update > /dev/null - DEBIAN_FRONTEND=noninteractive apt-get -qq update > /dev/null
- DEBIAN_FRONTEND=noninteractive apt-get -yqq install pkg-config libssl-dev build-essential libsqlite3-dev > /dev/null - DEBIAN_FRONTEND=noninteractive apt-get -yqq install pkg-config libssl-dev build-essential libsqlite3-dev > /dev/null
- cargo build --all - cargo build --all
- cargo test --all - cargo test --all
- sccache --show-stats
environment:
RUSTC_WRAPPER: /usr/local/bin/sccache
SCCACHE_S3_USE_SSL: "true"
SCCACHE_ENDPOINT: "richie.m4.tanukitsu.net:443"
secrets:
- sccache_bucket
- aws_access_key_id
- aws_secret_access_key
when: when:
event: [push, pull_request] event: [push, pull_request]
testSuite: testSuite:
image: "docker.bics.ga/rei_ci/rust-sccache:latest-amd64" image: "rust:1.65.0"
commands: commands:
- DEBIAN_FRONTEND=noninteractive apt-get -qq update > /dev/null - DEBIAN_FRONTEND=noninteractive apt-get -qq update > /dev/null
- DEBIAN_FRONTEND=noninteractive apt-get -yqq -o=Dpkg::Use-Pty=0 install pkg-config libssl-dev build-essential libsqlite3-dev python3.9 python3.9-venv postgresql postgresql-client mariadb-server mariadb-client zstd lz4 > /dev/null - DEBIAN_FRONTEND=noninteractive apt-get -yqq -o=Dpkg::Use-Pty=0 install pkg-config libssl-dev build-essential libsqlite3-dev python3.9 python3.9-venv postgresql postgresql-client mariadb-server mariadb-client zstd lz4 > /dev/null
@ -38,22 +30,14 @@ pipeline:
- cargo install -q --path yama - cargo install -q --path yama
- cargo install -q --path datman - cargo install -q --path datman
- python3.9 -m venv testsuite/.venv - python3.9 -m venv testsuite/.venv
- ./testsuite/.venv/bin/pip install -e testsuite -e datman-helper-postgres -e datman-helper-mysql - ./testsuite/.venv/bin/pip install ./testsuite ./datman-helper-postgres ./datman-helper-mysql
- cd testsuite && . .venv/bin/activate && TEST_POSTGRES=$(hostname),testsuitedb,root TEST_MYSQL=$(hostname),testsuitemydb,root green - cd testsuite && . .venv/bin/activate && TEST_POSTGRES=$(hostname),testsuitedb,root TEST_MYSQL=$(hostname),testsuitemydb,root green
- sccache --show-stats
environment:
RUSTC_WRAPPER: /usr/local/bin/sccache
SCCACHE_S3_USE_SSL: "true"
SCCACHE_ENDPOINT: "richie.m4.tanukitsu.net:443"
secrets:
- sccache_bucket
- aws_access_key_id
- aws_secret_access_key
when: when:
event: [push, pull_request] event: [push, pull_request]
deployManual: deployManual:
image: "docker.bics.ga/rei_ci/mdbook:latest-amd64" image: "docker.emunest.net/rei_ci/mdbook:latest-amd64"
pull: true
when: when:
branch: branch:
- develop - develop

View File

@ -4,7 +4,8 @@ platform: linux/${ARCH}
matrix: matrix:
ARCH: ARCH:
- arm64 # I don't have an arm64 runner at the moment.
#- arm64
- amd64 - amd64
.a1: &when .a1: &when
@ -15,7 +16,10 @@ pipeline:
buildRelease: buildRelease:
when: *when when: *when
image: "docker.bics.ga/rei_ci/rust-sccache:latest-${ARCH}" # Disabled for now because I'm trying to get infinite build times to stop :-(.
# Suspect a kernel bug but any workaround will do for now.
#image: "docker.bics.ga/rei_ci/rust-sccache:latest-${ARCH}"
image: "rust:1.61"
commands: commands:
- apt-get -qq update && apt-get -yqq install pkg-config libssl-dev build-essential libolm-dev cmake - apt-get -qq update && apt-get -yqq install pkg-config libssl-dev build-essential libolm-dev cmake
- cargo build --release - cargo build --release

866
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,7 @@
[workspace] [workspace]
members = [ members = [
"yama", "yama",
"datman" "datman",
] ]
[profile.release] [profile.release]

8
datman-helper-mysql/poetry.lock generated Normal file
View File

@ -0,0 +1,8 @@
package = []
[metadata]
lock-version = "1.1"
python-versions = "^3.8"
content-hash = "fafb334cb038533f851c23d0b63254223abf72ce4f02987e7064b0c95566699a"
[metadata.files]

View File

@ -0,0 +1,19 @@
[tool.poetry]
name = "datman-helper-mysql"
version = "0.1.0"
description = "MySQL integration for Datman"
authors = ["Olivier 'reivilibre' <olivier@librepush.net>"]
license = "GPL-3.0-or-later"
[tool.poetry.dependencies]
python = "^3.8"
[tool.poetry.dev-dependencies]
[tool.poetry.scripts]
datman-helper-mysql-backup="datman_helper_mysql.backup:cli"
datman-helper-mysql-restore="datman_helper_mysql.restore:cli"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

View File

@ -1,119 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import io
import os
import sys
from shutil import rmtree
from setuptools import Command, find_packages, setup
# Package meta-data.
NAME = "datman_helper_mysql"
DESCRIPTION = "MySQL integration for Datman"
URL = "https://bics.ga/reivilibre/yama"
EMAIL = "reivi@librepush.net"
AUTHOR = "Olivier 'reivilibre'"
REQUIRES_PYTHON = ">=3.7.0"
VERSION = "0.1.0"
# What packages are required for this module to be executed?
REQUIRED = []
# What packages are optional?
EXTRAS = {}
# The rest you shouldn't have to touch too much :)
# ------------------------------------------------
# Except, perhaps the License and Trove Classifiers!
# If you do change the License, remember to change the Trove Classifier for that!
here = os.path.abspath(os.path.dirname(__file__))
# Import the README and use it as the long-description.
# Note: this will only work if 'README.md' is present in your MANIFEST.in file!
try:
with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f:
long_description = "\n" + f.read()
except FileNotFoundError:
long_description = DESCRIPTION
# Load the package's __version__.py module as a dictionary.
about = {}
if not VERSION:
project_slug = NAME.lower().replace("-", "_").replace(" ", "_")
with open(os.path.join(here, project_slug, "__version__.py")) as f:
exec(f.read(), about)
else:
about["__version__"] = VERSION
class UploadCommand(Command):
"""Support setup.py upload."""
description = "Build and publish the package."
user_options = []
@staticmethod
def status(s):
"""Prints things in bold."""
print("\033[1m{0}\033[0m".format(s))
def initialize_options(self):
pass
def finalize_options(self):
pass
def run(self):
try:
self.status("Removing previous builds…")
rmtree(os.path.join(here, "dist"))
except OSError:
pass
self.status("Building Source and Wheel (universal) distribution…")
os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable))
self.status("Uploading the package to PyPI via Twine…")
os.system("twine upload dist/*")
self.status("Pushing git tags…")
os.system("git tag v{0}".format(about["__version__"]))
os.system("git push --tags")
sys.exit()
# Where the magic happens:
setup(
name=NAME,
version=about["__version__"],
description=DESCRIPTION,
long_description=long_description,
long_description_content_type="text/markdown",
author=AUTHOR,
author_email=EMAIL,
python_requires=REQUIRES_PYTHON,
url=URL,
packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*"]),
# If your package is a single module, use this instead of 'packages':
# py_modules=['mypackage'],
entry_points={
"console_scripts": [
"datman-helper-mysql-backup=datman_helper_mysql.backup:cli",
"datman-helper-mysql-restore=datman_helper_mysql.restore:cli",
],
},
install_requires=REQUIRED,
extras_require=EXTRAS,
include_package_data=True,
# TODO license='GPL3',
classifiers=[
# Trove classifiers
# Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
"Programming Language :: Python",
"Programming Language :: Python :: 3",
],
)

View File

@ -39,10 +39,7 @@ def cli():
# The process (if any) that is our LZ4 decompressor. # The process (if any) that is our LZ4 decompressor.
lz4_process = None lz4_process = None
dump_command = [ dump_command = ["pg_dump", database_to_use]
"pg_dump",
database_to_use
]
if host_to_use is not None: if host_to_use is not None:
if use_lz4: if use_lz4:
@ -63,21 +60,19 @@ def cli():
# (rather than lz4 covering it). # (rather than lz4 covering it).
command = [ command = [
"ssh", "ssh",
f"{user_to_use}@{host_to_use}" if user_to_use is not None else f"{host_to_use}", f"{user_to_use}@{host_to_use}"
if user_to_use is not None
else f"{host_to_use}",
"bash", "bash",
"-o", "-o",
"pipefail", "pipefail",
"-c", "-c",
shlex.quote(" ".join(dump_command)) shlex.quote(" ".join(dump_command)),
] ]
elif user_to_use is not None: elif user_to_use is not None:
current_username = pwd.getpwuid(os.getuid()).pw_name current_username = pwd.getpwuid(os.getuid()).pw_name
if current_username != user_to_use: if current_username != user_to_use:
command = [ command = ["sudo", "-u", user_to_use] + dump_command
"sudo",
"-u",
user_to_use
] + dump_command
else: else:
command = dump_command command = dump_command
else: else:

8
datman-helper-postgres/poetry.lock generated Normal file
View File

@ -0,0 +1,8 @@
package = []
[metadata]
lock-version = "1.1"
python-versions = "^3.8"
content-hash = "fafb334cb038533f851c23d0b63254223abf72ce4f02987e7064b0c95566699a"
[metadata.files]

View File

@ -0,0 +1,19 @@
[tool.poetry]
name = "datman-helper-postgres"
version = "0.1.0"
description = "Postgres integration for Datman"
authors = ["Olivier 'reivilibre' <olivier@librepush.net>"]
license = "GPL-3.0-or-later"
[tool.poetry.dependencies]
python = "^3.8"
[tool.poetry.dev-dependencies]
[tool.poetry.scripts]
datman-helper-postgres-backup="datman_helper_postgres.backup:cli"
datman-helper-postgres-restore="datman_helper_postgres.restore:cli"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

View File

@ -1,119 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import io
import os
import sys
from shutil import rmtree
from setuptools import Command, find_packages, setup
# Package meta-data.
NAME = "datman_helper_postgres"
DESCRIPTION = "Postgres integration for Datman"
URL = "https://bics.ga/reivilibre/yama"
EMAIL = "reivi@librepush.net"
AUTHOR = "Olivier 'reivilibre'"
REQUIRES_PYTHON = ">=3.7.0"
VERSION = "0.1.0"
# What packages are required for this module to be executed?
REQUIRED = []
# What packages are optional?
EXTRAS = {}
# The rest you shouldn't have to touch too much :)
# ------------------------------------------------
# Except, perhaps the License and Trove Classifiers!
# If you do change the License, remember to change the Trove Classifier for that!
here = os.path.abspath(os.path.dirname(__file__))
# Import the README and use it as the long-description.
# Note: this will only work if 'README.md' is present in your MANIFEST.in file!
try:
with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f:
long_description = "\n" + f.read()
except FileNotFoundError:
long_description = DESCRIPTION
# Load the package's __version__.py module as a dictionary.
about = {}
if not VERSION:
project_slug = NAME.lower().replace("-", "_").replace(" ", "_")
with open(os.path.join(here, project_slug, "__version__.py")) as f:
exec(f.read(), about)
else:
about["__version__"] = VERSION
class UploadCommand(Command):
"""Support setup.py upload."""
description = "Build and publish the package."
user_options = []
@staticmethod
def status(s):
"""Prints things in bold."""
print("\033[1m{0}\033[0m".format(s))
def initialize_options(self):
pass
def finalize_options(self):
pass
def run(self):
try:
self.status("Removing previous builds…")
rmtree(os.path.join(here, "dist"))
except OSError:
pass
self.status("Building Source and Wheel (universal) distribution…")
os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable))
self.status("Uploading the package to PyPI via Twine…")
os.system("twine upload dist/*")
self.status("Pushing git tags…")
os.system("git tag v{0}".format(about["__version__"]))
os.system("git push --tags")
sys.exit()
# Where the magic happens:
setup(
name=NAME,
version=about["__version__"],
description=DESCRIPTION,
long_description=long_description,
long_description_content_type="text/markdown",
author=AUTHOR,
author_email=EMAIL,
python_requires=REQUIRES_PYTHON,
url=URL,
packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*"]),
# If your package is a single module, use this instead of 'packages':
# py_modules=['mypackage'],
entry_points={
"console_scripts": [
"datman-helper-postgres-backup=datman_helper_postgres.backup:cli",
"datman-helper-postgres-restore=datman_helper_postgres.restore:cli",
],
},
install_requires=REQUIRED,
extras_require=EXTRAS,
include_package_data=True,
# TODO license='GPL3',
classifiers=[
# Trove classifiers
# Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
"Programming Language :: Python",
"Programming Language :: Python :: 3",
],
)

View File

@ -1,8 +1,8 @@
[package] [package]
name = "datman" name = "datman"
version = "0.5.0-alpha.2" version = "0.6.0-alpha.5"
authors = ["Olivier 'reivilibre' <olivier@librepush.net>"] authors = ["Olivier 'reivilibre' <olivier@librepush.net>"]
edition = "2018" edition = "2021"
repository = "https://bics.ga/reivilibre/yama" repository = "https://bics.ga/reivilibre/yama"
license = "GPL-3.0-or-later" license = "GPL-3.0-or-later"
@ -11,7 +11,7 @@ description = "A chunked and deduplicated backup system using Yama"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
clap = "= 3.0.0-beta.5" clap = { version = "3.1.18", features = ["derive"] }
crossbeam-channel = "0.5.1" crossbeam-channel = "0.5.1"
anyhow = "1.0" anyhow = "1.0"
thiserror = "1.0" thiserror = "1.0"
@ -22,7 +22,7 @@ log = "0.4"
env_logger = "0.7.1" env_logger = "0.7.1"
indicatif = "0.14.0" indicatif = "0.14.0"
arc-interner = "0.5.1" arc-interner = "0.5.1"
zstd = "0.6.0" # 0.6.0+zstd.1.4.8 zstd = "0.11.2" # 0.11.2+zstd.1.5.2
byteorder = "1" byteorder = "1"
termion = "1.5.6" termion = "1.5.6"
glob = "0.3.0" glob = "0.3.0"
@ -30,6 +30,9 @@ humansize = "1.1.1"
chrono = "0.4.19" chrono = "0.4.19"
itertools = "0.10.1" itertools = "0.10.1"
hostname = "0.3.1" hostname = "0.3.1"
yama = { path = "../yama", version = "0.5.0-alpha.1" } yama = { path = "../yama", version = "0.6.0-alpha.5" }
metrics = "0.17.1" metrics = "0.17.1"
bare-metrics-recorder = { version = "0.1.0" } bare-metrics-recorder = { version = "0.1.0" }
comfy-table = "6.0.0-rc.1"
libc = "0.2.126"
io-streams = "0.11.0"

View File

@ -8,5 +8,6 @@ Features:
* (optional) Compression using Zstd and a specifiable dictionary * (optional) Compression using Zstd and a specifiable dictionary
* (optional) Encryption * (optional) Encryption
* Ability to back up to remote machines over SSH * Ability to back up to remote machines over SSH
* Labelling of files in a backup source; different destinations can choose to back up either all or a subset of the labels.
See the documentation for more information. See the documentation for more information.

View File

@ -16,24 +16,30 @@ along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/ */
use std::fs::File; use std::fs::File;
use std::io::{BufReader, BufWriter, Write};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use clap::Parser; use clap::Parser;
use env_logger::Env; use env_logger::Env;
use anyhow::bail; use anyhow::{bail, Context};
use bare_metrics_recorder::recording::BareMetricsRecorderCore; use bare_metrics_recorder::recording::BareMetricsRecorderCore;
use chrono::{DateTime, Local, NaiveDate, NaiveDateTime, TimeZone, Utc}; use chrono::{DateTime, Local, NaiveDate, NaiveDateTime, TimeZone, Utc};
use datman::commands::backup::{backup_all_sources_to_destination, backup_source_to_destination}; use datman::commands::backup::{backup_all_sources_to_destination, backup_source_to_destination};
use datman::commands::ilabel::interactive_labelling_session; use datman::commands::ilabel::interactive_labelling_session;
use datman::commands::init_descriptor; use datman::commands::prune::{prune_with_retention_policy, RetentionPolicy};
use datman::commands::{init_descriptor, pushpull};
use datman::descriptor::{load_descriptor, SourceDescriptor}; use datman::descriptor::{load_descriptor, SourceDescriptor};
use datman::get_hostname; use datman::get_hostname;
use datman::remote::backup_source_requester::backup_remote_source_to_destination; use datman::remote::backup_source_requester::backup_remote_source_to_destination;
use datman::remote::backup_source_responder; use datman::remote::backup_source_responder;
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle}; use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use itertools::Itertools;
use log::info; use log::info;
use std::str::FromStr; use std::str::FromStr;
use yama::commands::load_pile_descriptor;
use yama::operations::legacy_pushpull::{open_pile_with_work_bypass, BypassLevel};
pub const FAILURE_SYMBOL_OBNOXIOUS_FLASHING: &str = "\x1b[5m\x1b[31m⚠ \x1b[25m\x1b[22m"; pub const FAILURE_SYMBOL_OBNOXIOUS_FLASHING: &str = "\x1b[5m\x1b[31m⚠ \x1b[25m\x1b[22m";
pub const BOLD: &str = "\x1b[1m"; pub const BOLD: &str = "\x1b[1m";
@ -111,8 +117,39 @@ pub enum DatmanCommand {
skip_metadata: bool, skip_metadata: bool,
}, },
Report {
/// Name of the pile to report on.
pile_name: String,
/// Don't summarise months.
#[clap(long)]
individual: bool,
},
#[clap(name = "_backup_source_responder")] #[clap(name = "_backup_source_responder")]
InternalBackupSourceResponder, InternalBackupSourceResponder,
/// Pulls all pointers from a remote pile to a local pile.
/// Does not yet support label filtering, but will do in the future.
Pull {
/// Remote specified as `remote:path:pile`, e.g. 'myserver:/path/to/datman:main'
remote_and_remote_pile: String,
pile_name: String,
},
/// Applies a retention policy by removing unnecessary backups.
/// Does not reclaim space by itself: use
/// `yama check --apply-gc --shallow`
/// & `yama compact`
/// to do that.
Prune { pile_name: String },
#[clap(name = "_pull_responder_offerer")]
InternalPullResponderOfferer {
datman_path: PathBuf,
pile_name: String,
},
} }
pub struct HumanDateTime(pub DateTime<Local>); pub struct HumanDateTime(pub DateTime<Local>);
@ -190,10 +227,10 @@ fn main() -> anyhow::Result<()> {
unimplemented!(); unimplemented!();
} }
DatmanCommand::InteractiveLabelling { source_name } => { DatmanCommand::InteractiveLabelling { source_name } => {
interactive_labelling_session(Path::new("."), source_name).unwrap(); interactive_labelling_session(Path::new("."), source_name)?;
} }
DatmanCommand::InteractiveBrowsing { source_name } => { DatmanCommand::InteractiveBrowsing { source_name } => {
datman::commands::ibrowse::session(Path::new("."), source_name).unwrap(); datman::commands::ibrowse::session(Path::new("."), source_name)?;
} }
DatmanCommand::BackupOne { DatmanCommand::BackupOne {
source_name, source_name,
@ -307,6 +344,125 @@ fn main() -> anyhow::Result<()> {
info!("Datman responder at {:?}", std::env::current_exe()?); info!("Datman responder at {:?}", std::env::current_exe()?);
backup_source_responder::handler_stdio()?; backup_source_responder::handler_stdio()?;
} }
DatmanCommand::Report {
pile_name,
individual,
} => {
let descriptor = load_descriptor(Path::new(".")).unwrap();
let destination = &descriptor.piles[&pile_name];
let report =
datman::commands::report::generate_report(destination, &descriptor, !individual)?;
datman::commands::report::print_filesystem_space(&destination.path)?;
datman::commands::report::print_report(&report)?;
}
DatmanCommand::Pull {
remote_and_remote_pile,
pile_name,
} => {
let (hostname, remote_datman_path, remote_pile_name) = remote_and_remote_pile
.split(':')
.collect_tuple()
.context("You must pull from a remote pile specified as remote:path:pile.")?;
let descriptor = load_descriptor(Path::new(".")).unwrap();
let source = &descriptor.piles[&pile_name];
let pile_desc = load_pile_descriptor(&source.path)?;
let (pile, bypass_raw_pile) = open_pile_with_work_bypass(
&source.path,
&pile_desc,
BypassLevel::CompressionBypass,
)?;
let pbar = ProgressBar::with_draw_target(0, ProgressDrawTarget::stdout_with_hz(10));
pbar.set_style(
ProgressStyle::default_bar().template(
"[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}",
),
);
pbar.set_message("pulling");
let remote_host_descriptor = descriptor
.remote_hosts
.get(hostname)
.ok_or_else(|| anyhow::anyhow!("No remote host by that name: {:?}.", hostname))?;
let mut connection = Command::new("ssh")
.arg(&remote_host_descriptor.user_at_host)
.arg("--")
.arg(
&remote_host_descriptor
.path_to_datman
.as_ref()
.map(|x| x.as_str())
.unwrap_or("datman"),
)
.arg("_pull_responder_offerer")
.arg(remote_datman_path)
.arg(remote_pile_name)
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.stderr(Stdio::inherit())
.spawn()?;
let mut reader = BufReader::new(connection.stdout.take().unwrap());
let mut writer = BufWriter::new(connection.stdin.take().unwrap());
pushpull::accepting_side(
&pile,
&bypass_raw_pile,
&mut reader,
&mut writer,
Box::new(pbar),
)?;
}
DatmanCommand::Prune { pile_name } => {
let descriptor = load_descriptor(Path::new(".")).unwrap();
let retention_policy = descriptor
.retention
.context("No retention policy set in descriptor")?;
let dest_desc = &descriptor.piles[&pile_name];
let pile_desc = load_pile_descriptor(&dest_desc.path)?;
prune_with_retention_policy(
&dest_desc.path,
&pile_desc,
&RetentionPolicy::from_config(retention_policy),
true,
)?;
}
DatmanCommand::InternalPullResponderOfferer {
datman_path,
pile_name,
} => {
let descriptor = load_descriptor(&datman_path).unwrap();
let source = &descriptor.piles[&pile_name];
let pile_desc = load_pile_descriptor(&source.path)?;
let (pile, bypass_raw_pile) = open_pile_with_work_bypass(
&source.path,
&pile_desc,
BypassLevel::CompressionBypass,
)?;
let mut stdin = BufReader::new(io_streams::StreamReader::stdin()?);
let mut stdout = BufWriter::new(io_streams::StreamWriter::stdout()?);
pushpull::offering_side(
&pile,
&bypass_raw_pile,
&mut stdin,
&mut stdout,
Box::new(()),
)?;
stdout.flush()?;
}
} }
Ok(()) Ok(())
} }

View File

@ -20,12 +20,15 @@ use std::fs::File;
use std::io::Write; use std::io::Write;
use std::path::Path; use std::path::Path;
use crate::descriptor::{Descriptor, SourceDescriptor}; use crate::descriptor::{Descriptor, RetentionPolicyConfig, SourceDescriptor};
pub mod backup; pub mod backup;
pub mod extract; pub mod extract;
pub mod ibrowse; pub mod ibrowse;
pub mod ilabel; pub mod ilabel;
pub mod prune;
pub mod pushpull;
pub mod report;
pub fn init_descriptor(path: &Path) -> anyhow::Result<()> { pub fn init_descriptor(path: &Path) -> anyhow::Result<()> {
std::fs::create_dir_all(path)?; std::fs::create_dir_all(path)?;
@ -49,6 +52,12 @@ pub fn init_descriptor(path: &Path) -> anyhow::Result<()> {
sources: source, sources: source,
piles: Default::default(), piles: Default::default(),
remote_hosts: Default::default(), remote_hosts: Default::default(),
retention: Some(RetentionPolicyConfig {
daily: 14,
weekly: 12,
monthly: 24,
yearly: 9001,
}),
})?; })?;
datman_toml_file.write_all(&bytes)?; datman_toml_file.write_all(&bytes)?;

View File

@ -17,11 +17,13 @@ along with Yama. If not, see <https://www.gnu.org/licenses/>.
use crate::descriptor::{Descriptor, DestPileDescriptor, SourceDescriptor, VirtualSourceKind}; use crate::descriptor::{Descriptor, DestPileDescriptor, SourceDescriptor, VirtualSourceKind};
use crate::get_hostname; use crate::get_hostname;
use crate::labelling::{label_node, load_labelling_rules, str_to_label, Label, State}; use crate::labelling::{
label_node, load_labelling_rules, str_to_label, Label, LabellingRules, State,
};
use crate::tree::{scan, FileTree, FileTree1}; use crate::tree::{scan, FileTree, FileTree1};
use anyhow::{anyhow, bail}; use anyhow::{anyhow, bail};
use arc_interner::ArcIntern; use arc_interner::ArcIntern;
use chrono::{DateTime, Utc}; use chrono::{DateTime, NaiveDateTime, TimeZone, Utc};
use log::{info, warn}; use log::{info, warn};
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use std::fmt::Debug; use std::fmt::Debug;
@ -49,6 +51,13 @@ pub fn get_pointer_name_at(source_name: &str, datetime: DateTime<Utc>) -> String
) )
} }
/// Splits a pointer name of the form `<source name>+<timestamp>` into its two parts.
///
/// The split happens at the *last* `+`, so the source name itself may contain `+`.
/// The timestamp part is parsed with `POINTER_DATETIME_FORMAT` and interpreted as UTC.
///
/// Returns `None` when there is no `+` in the name or the timestamp part does not parse.
pub fn split_pointer_name(pointer_name: &str) -> Option<(String, DateTime<Utc>)> {
    let (source_part, timestamp_part) = pointer_name.rsplit_once("+")?;
    NaiveDateTime::parse_from_str(timestamp_part, POINTER_DATETIME_FORMAT)
        .ok()
        .map(|naive| (source_part.to_owned(), Utc.from_utc_datetime(&naive)))
}
pub fn open_stdout_backup_process( pub fn open_stdout_backup_process(
extra_args: &HashMap<String, toml::Value>, extra_args: &HashMap<String, toml::Value>,
program_name: &str, program_name: &str,
@ -69,8 +78,8 @@ pub fn open_stdout_backup_process(
pub fn label_filter_and_convert( pub fn label_filter_and_convert(
tree: FileTree1<()>, tree: FileTree1<()>,
descriptor: &Descriptor, descriptor: &Descriptor,
desc_path: &Path,
source_name: &str, source_name: &str,
rules: &LabellingRules,
dest: &DestPileDescriptor, dest: &DestPileDescriptor,
) -> anyhow::Result<Option<TreeNode>> { ) -> anyhow::Result<Option<TreeNode>> {
info!("Labelling."); info!("Labelling.");
@ -80,8 +89,7 @@ pub fn label_filter_and_convert(
.iter() .iter()
.map(|l| Label(ArcIntern::new(l.clone()))) .map(|l| Label(ArcIntern::new(l.clone())))
.collect(); .collect();
let rules = load_labelling_rules(desc_path, source_name)?; label_node("".to_owned(), None, &mut tree, &labels, rules)?;
label_node("".to_owned(), None, &mut tree, &labels, &rules)?;
let included_labels: HashSet<Label> = dest.included_labels.iter().map(str_to_label).collect(); let included_labels: HashSet<Label> = dest.included_labels.iter().map(str_to_label).collect();
@ -140,17 +148,23 @@ pub fn backup_source_to_destination<PT: ProgressTracker>(
SourceDescriptor::DirectorySource { SourceDescriptor::DirectorySource {
hostname: _, hostname: _,
directory, directory,
cross_filesystems,
} => { } => {
info!("Looking to backup {} to {}", source_name, dest_name); info!("Looking to backup {} to {}", source_name, dest_name);
let rules = load_labelling_rules(desc_path, source_name)?;
let exclusions = rules.get_exclusions_set(directory);
info!("Scanning."); info!("Scanning.");
let tree = scan(directory)?.ok_or_else(|| anyhow!("Source does not exist."))?; let tree = scan(directory, !*cross_filesystems, &exclusions)?
.ok_or_else(|| anyhow!("Source does not exist."))?;
let absolute_source_path = desc_path.join(directory); let absolute_source_path = desc_path.join(directory);
let absolute_dest_path = desc_path.join(&dest.path); let absolute_dest_path = desc_path.join(&dest.path);
let pile_descriptor = load_pile_descriptor(&absolute_dest_path)?; let pile_descriptor = load_pile_descriptor(&absolute_dest_path)?;
let pile = open_pile(&absolute_dest_path, &pile_descriptor)?; let pile = open_pile(&absolute_dest_path, &pile_descriptor)?;
let root = if let Some(root) = let root = if let Some(root) =
label_filter_and_convert(tree, descriptor, desc_path, source_name, dest)? label_filter_and_convert(tree, descriptor, source_name, &rules, dest)?
{ {
root root
} else { } else {
@ -202,7 +216,6 @@ pub fn backup_source_to_destination<PT: ProgressTracker>(
parent, parent,
num_workers, num_workers,
progress_bar, progress_bar,
true,
)?; )?;
info!("Stored!"); info!("Stored!");

View File

@ -15,6 +15,7 @@ You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>. along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/ */
use std::collections::BTreeSet;
use std::path::Path; use std::path::Path;
use anyhow::{anyhow, bail}; use anyhow::{anyhow, bail};
@ -68,15 +69,19 @@ pub fn session(path: &Path, source_name: String) -> anyhow::Result<()> {
.get(&source_name) .get(&source_name)
.ok_or_else(|| anyhow!("Could not find source {:?}!", source_name))?; .ok_or_else(|| anyhow!("Could not find source {:?}!", source_name))?;
let directory = match source_descriptor { let (directory, one_filesystem) = match source_descriptor {
SourceDescriptor::DirectorySource { directory, .. } => directory, SourceDescriptor::DirectorySource {
directory,
cross_filesystems,
..
} => (directory, !*cross_filesystems),
SourceDescriptor::VirtualSource { .. } => { SourceDescriptor::VirtualSource { .. } => {
bail!("Cannot browse virtual source."); bail!("Cannot browse virtual source.");
} }
}; };
println!("Scanning source; this might take a little while..."); println!("Scanning source; this might take a little while...");
let mut dir_scan: FileTree1<Option<State>> = scan(directory)? let mut dir_scan: FileTree1<Option<State>> = scan(directory, one_filesystem, &BTreeSet::new())?
.ok_or_else(|| anyhow!("Empty source."))? .ok_or_else(|| anyhow!("Empty source."))?
.replace_meta(&None); .replace_meta(&None);

View File

@ -15,6 +15,7 @@ You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>. along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/ */
use std::collections::BTreeSet;
use std::io; use std::io;
use std::io::{StdinLock, Stdout, Write}; use std::io::{StdinLock, Stdout, Write};
use std::path::Path; use std::path::Path;
@ -186,12 +187,13 @@ pub fn interactive_labelling_session(path: &Path, source_name: String) -> anyhow
if let SourceDescriptor::DirectorySource { if let SourceDescriptor::DirectorySource {
hostname, hostname,
directory, directory,
cross_filesystems,
} = source } = source
{ {
let my_hostname = get_hostname(); let my_hostname = get_hostname();
let mut dir_scan = if &my_hostname == hostname { let mut dir_scan = if &my_hostname == hostname {
info!("Scanning source; this might take a little while..."); info!("Scanning source; this might take a little while...");
scan(directory)? scan(directory, !*cross_filesystems, &BTreeSet::new())?
.ok_or_else(|| anyhow!("Empty source."))? .ok_or_else(|| anyhow!("Empty source."))?
.replace_meta(&None) .replace_meta(&None)
} else { } else {
@ -206,8 +208,13 @@ pub fn interactive_labelling_session(path: &Path, source_name: String) -> anyhow
// then request to scan // then request to scan
info!("Requesting scan from remote source... (this may take some time)"); info!("Requesting scan from remote source... (this may take some time)");
let scan = let scan = backup_source_requester::scanning(
backup_source_requester::scanning(&mut read, &mut write, directory.as_ref())? &mut read,
&mut write,
directory.as_ref(),
!*cross_filesystems,
&BTreeSet::new(),
)?
.ok_or_else(|| anyhow!("Remote scan failed (does the directory exist?)"))? .ok_or_else(|| anyhow!("Remote scan failed (does the directory exist?)"))?
.replace_meta(&None); .replace_meta(&None);

View File

@ -0,0 +1,220 @@
use crate::commands::backup::split_pointer_name;
use crate::descriptor::RetentionPolicyConfig;
use anyhow::{bail, Context};
use log::info;
use std::collections::{BTreeMap, BTreeSet};
use std::io;
use std::path::Path;
use yama::commands::open_pile;
use yama::operations::remove_pointer_safely;
use yama::pile::PileDescriptor;
/// One tier of a retention policy: retention targets are placed every `interval_s` seconds
/// before the newest snapshot, `number_to_retain` times.
pub struct RetentionBand {
    /// Spacing between consecutive retention targets, in seconds.
    pub interval_s: u64,
    /// Number of retention targets (multiples of `interval_s`) to consider.
    pub number_to_retain: u32,
}

/// A retention policy: the union of what each of its bands wants to keep.
pub struct RetentionPolicy {
    pub retention_bands: Vec<RetentionBand>,
}

const DAY: u64 = 86400;
const WEEK: u64 = 7 * DAY;
// Months and years are fixed-length approximations, not calendar-aware.
const MONTH: u64 = 31 * DAY;
const YEAR: u64 = 365 * DAY;

impl RetentionPolicy {
    /// Builds a policy from its configuration.
    ///
    /// Each non-zero count in the config becomes one band (daily, weekly, monthly, yearly,
    /// in that order); zero counts produce no band.
    pub fn from_config(descriptor: RetentionPolicyConfig) -> RetentionPolicy {
        let retention_bands = [
            (DAY, descriptor.daily),
            (WEEK, descriptor.weekly),
            (MONTH, descriptor.monthly),
            (YEAR, descriptor.yearly),
        ]
        .iter()
        .filter(|band| band.1 != 0)
        .map(|band| RetentionBand {
            interval_s: band.0,
            number_to_retain: band.1,
        })
        .collect();

        RetentionPolicy { retention_bands }
    }

    /// Returns the set of snapshots to remove.
    ///
    /// `snapshots_by_unix_time` maps each snapshot's Unix timestamp to its name. The newest
    /// snapshot is treated as "now" and is always retained. For every band and every multiple
    /// `1..=number_to_retain`, the newest snapshot that is at least `multiple * interval_s`
    /// seconds older than "now" is retained as well. Everything else is returned as prunable.
    ///
    /// An empty input yields an empty set.
    pub fn apply_returning_prunable(
        &self,
        snapshots_by_unix_time: BTreeMap<u64, String>,
    ) -> BTreeSet<String> {
        // Always mark the most recent snapshot as retained!
        let now_time = match snapshots_by_unix_time.keys().next_back() {
            Some(latest) => *latest,
            None => return BTreeSet::new(),
        };

        let mut snapshots_included: BTreeSet<u64> = BTreeSet::new();
        snapshots_included.insert(now_time);

        for band in &self.retention_bands {
            for multiple in 1..=band.number_to_retain {
                // A large band (e.g. thousands of years) reaches back before the Unix epoch.
                // Plain subtraction would underflow (panic in debug, wrap in release); once one
                // multiple is out of range every later one is too, so stop this band.
                let target_time = match u64::from(multiple)
                    .checked_mul(band.interval_s)
                    .and_then(|offset| now_time.checked_sub(offset))
                {
                    Some(target) => target,
                    None => break,
                };

                if let Some((unix_time, _)) =
                    snapshots_by_unix_time.range(..=target_time).next_back()
                {
                    snapshots_included.insert(*unix_time);
                }
            }
        }

        // Find all prunable (unincluded) snapshots.
        snapshots_by_unix_time
            .into_iter()
            .filter(|(unix_time, _)| !snapshots_included.contains(unix_time))
            .map(|(_, name)| name)
            .collect()
    }
}
/// Applies `policy` to the pile at `pile_path`, removing the pointers it deems prunable.
///
/// The keep/remove sets are logged first. When `prompt_first` is set, the user is asked on
/// stdin to confirm; anything other than `y`/`Y` aborts with an error. The pile is flushed
/// after every single removal, including a failed one, before that failure is reported.
pub fn prune_with_retention_policy(
    pile_path: &Path,
    pile_desc: &PileDescriptor,
    policy: &RetentionPolicy,
    prompt_first: bool,
) -> anyhow::Result<()> {
    let pile = open_pile(pile_path, pile_desc).context("Failed to open pile")?;

    let all_pointers = pile
        .list_pointers()
        .context("Failed to list pointers in pile")?;

    // Snapshot the full set of names before handing the list over to the policy.
    let mut pointers_to_keep: BTreeSet<String> = all_pointers.iter().cloned().collect();
    let pointers_to_remove = get_prunable_pointers(policy, all_pointers);
    pointers_to_keep.retain(|name| !pointers_to_remove.contains(name));

    info!("Gory details:\n---\nKeep: {pointers_to_keep:?}\n---\nRemove: {pointers_to_remove:?}");
    info!(
        "{} pointers to remove ({} to keep) based on retention policy.",
        pointers_to_remove.len(),
        pointers_to_keep.len()
    );

    if prompt_first {
        println!("Would you like to proceed? [y/N]: ");
        let mut answer = String::new();
        io::stdin().read_line(&mut answer)?;
        if !answer.trim().eq_ignore_ascii_case("y") {
            bail!("Aborted by user.");
        }
    }

    for to_remove in &pointers_to_remove {
        let removal = remove_pointer_safely(&pile, to_remove).context("removing prunable pointers");
        // Flush first so earlier work is persisted even if this removal failed.
        pile.flush()
            .context("flushing pile after removing pointers")?;
        removal?;
    }

    Ok(())
}
fn get_prunable_pointers(policy: &RetentionPolicy, pointers: Vec<String>) -> BTreeSet<String> {
let mut split_pointers_by_name: BTreeMap<String, BTreeMap<u64, String>> = BTreeMap::new();
for pointer in pointers {
let (name, datetime) = if let Some(x) = split_pointer_name(&pointer) {
x
} else {
continue;
};
split_pointers_by_name
.entry(name)
.or_default()
.insert(datetime.timestamp().try_into().unwrap(), pointer);
}
let mut pointers_to_remove = BTreeSet::new();
for (_pointer_base_name, ts_to_pointer) in split_pointers_by_name {
let to_remove = policy.apply_returning_prunable(ts_to_pointer);
pointers_to_remove.extend(to_remove);
}
pointers_to_remove
}
#[cfg(test)]
mod test {
    use super::{get_prunable_pointers, RetentionPolicy};
    use crate::descriptor::RetentionPolicyConfig;

    /// With a weekly-only policy (3 weeks), each source keeps its newest snapshot plus the
    /// newest snapshot at least 1, 2 and 3 weeks older than it; the rest is prunable.
    #[test]
    fn test_prunable_pointers() {
        let pointers: Vec<String> = [
            "alice+2022-09-28_05:00:00",
            "alice+2022-09-28_02:00:00",
            "alice+2022-09-21_05:00:00",
            "alice+2022-09-14_05:00:00",
            "alice+2022-09-08_05:00:00",
            "alice+2022-09-07_05:00:00",
            "alice+2022-09-01_05:00:00",
            "bob+2022-09-28_06:00:00",
            "bob+2022-09-28_03:00:00",
            "bob+2022-09-21_06:00:00",
            "bob+2022-09-14_06:00:00",
            "bob+2022-09-08_06:00:00",
            "bob+2022-09-07_06:00:00",
            "bob+2022-09-01_06:00:00",
        ]
        .iter()
        .map(|name| name.to_string())
        .collect();

        let weekly_only = RetentionPolicyConfig {
            daily: 0,
            weekly: 3,
            monthly: 0,
            yearly: 0,
        };
        let policy = RetentionPolicy::from_config(weekly_only);

        let prunable: Vec<String> = get_prunable_pointers(&policy, pointers)
            .into_iter()
            .collect();

        assert_eq!(
            prunable,
            vec![
                "alice+2022-09-01_05:00:00",
                "alice+2022-09-08_05:00:00",
                "alice+2022-09-28_02:00:00",
                "bob+2022-09-01_06:00:00",
                "bob+2022-09-08_06:00:00",
                "bob+2022-09-28_03:00:00",
            ]
        );
    }
}

View File

@ -0,0 +1,306 @@
// Push and Pull support for Datman
use anyhow::{bail, ensure, Context};
use log::info;
use std::collections::{BTreeMap, BTreeSet};
use std::io::{Read, Write};
use std::sync::Arc;
use std::time::Instant;
use yama::chunking::RecursiveUnchunker;
use yama::commands::retrieve_tree_node;
use yama::definitions::{ChunkId, PointerData, RecursiveChunkRef, TreeNode};
use yama::pile::{Keyspace, Pile, PipelineDescription, RawPile};
use yama::progress::ProgressTracker;
use yama::remote::{read_message, write_message};
/// Reads every pointer listed by `pile`, sends the resulting name → data map to `writer`
/// as a single message, and returns that same map.
///
/// Fails if listing or reading fails, or if a listed pointer turns out not to be readable.
/// The writer is not flushed here.
pub fn offer_pointers<W: Write, RP: RawPile>(
    pile: &Pile<RP>,
    writer: &mut W,
) -> anyhow::Result<BTreeMap<String, PointerData>> {
    let pointers_to_offer = pile
        .list_pointers()?
        .into_iter()
        .map(|pointer_name| -> anyhow::Result<(String, PointerData)> {
            let pointer_data = pile
                .read_pointer(&pointer_name)?
                .context("Listed pointer not present")?;
            Ok((pointer_name, pointer_data))
        })
        .collect::<anyhow::Result<BTreeMap<String, PointerData>>>()?;

    write_message(writer, &pointers_to_offer)?;
    Ok(pointers_to_offer)
}
pub fn ensure_compatible_bypasses(
my_full: &Vec<PipelineDescription>,
my_bypass: &Vec<PipelineDescription>,
their_full: &Vec<PipelineDescription>,
their_bypass: &Vec<PipelineDescription>,
) -> anyhow::Result<()> {
ensure!(
my_full.starts_with(&my_bypass),
"Our full pipeline is not an extension of the bypass pipeline."
);
ensure!(
their_full.starts_with(&their_bypass),
"Their full pipeline is not an extension of their bypass pipeline."
);
let my_bypassed_parts = &my_full[my_bypass.len()..];
let their_bypassed_parts = &their_full[their_bypass.len()..];
ensure!(
my_bypassed_parts == their_bypassed_parts,
"Our bypassed parts and their bypassed parts are not the same.\nOurs: {:?}\nTheirs: {:?}",
my_bypassed_parts,
their_bypassed_parts
);
Ok(())
}
/// Exchanges pipeline descriptions with the peer and verifies the bypasses are compatible.
///
/// Sends our full and bypass pipeline descriptions (in that order), flushes, then reads the
/// peer's two descriptions in the same order and checks all four with
/// `ensure_compatible_bypasses`.
pub fn negotiate_bypassed_pile<R: Read, W: Write>(
    pile: &Pile<Arc<Box<dyn RawPile>>>,
    bypass_pile: &Box<dyn RawPile>,
    reader: &mut R,
    writer: &mut W,
) -> anyhow::Result<()> {
    let ours_full = pile.raw_pile.describe_pipeline()?;
    let ours_bypass = bypass_pile.describe_pipeline()?;

    // Both sides send before they read, so flush to make sure the peer sees our half.
    write_message(writer, &ours_full)?;
    write_message(writer, &ours_bypass)?;
    writer.flush()?;

    let theirs_full: Vec<PipelineDescription> = read_message(reader)?;
    let theirs_bypass: Vec<PipelineDescription> = read_message(reader)?;

    ensure_compatible_bypasses(&ours_full, &ours_bypass, &theirs_full, &theirs_bypass)
}
/// Walks the tree under `root` and adds the chunk IDs referenced by every normal file's
/// content to `chunk_ids`. Other node kinds contribute nothing.
fn collect_chunk_ids(
    pile: &Pile<Arc<Box<dyn RawPile>>>,
    root: &TreeNode,
    chunk_ids: &mut BTreeSet<ChunkId>,
) -> anyhow::Result<()> {
    root.visit(
        &mut |tree_node, _| {
            if let TreeNode::NormalFile { content, .. } = tree_node {
                collect_chunk_ids_from_chunkref(pile, content, chunk_ids)?;
            }
            Ok(())
        },
        String::new(),
    )?;
    Ok(())
}
/// Adds the chunk IDs reachable from `chunk_ref` to `collection`.
///
/// At depth 0 the reference names a single chunk, which is added directly. At greater depth
/// the reference is unchunked one level shallower and the resulting byte stream is read as a
/// sequence of chunk IDs, each of which is added.
fn collect_chunk_ids_from_chunkref(
    pile: &Pile<Arc<Box<dyn RawPile>>>,
    chunk_ref: &RecursiveChunkRef,
    collection: &mut BTreeSet<ChunkId>,
) -> anyhow::Result<()> {
    if chunk_ref.depth == 0 {
        collection.insert(chunk_ref.chunk_id);
        return Ok(());
    }

    let one_level_up = RecursiveChunkRef {
        chunk_id: chunk_ref.chunk_id,
        depth: chunk_ref.depth - 1,
    };
    let mut unchunker = RecursiveUnchunker::new(pile, one_level_up);
    let mut id_buf: ChunkId = Default::default();

    loop {
        match unchunker.read(&mut id_buf[..])? {
            // End of stream.
            0 => break,
            // Short read: pull in the remainder of this ID before using it.
            got if got < id_buf.len() => unchunker.read_exact(&mut id_buf[got..])?,
            _ => {}
        }
        collection.insert(id_buf);
    }

    Ok(())
}
/// Runs the offering (sending) half of the Datman pull protocol over `reader`/`writer`.
///
/// Steps, in wire order:
/// 1. Exchange version greetings; fail on mismatch.
/// 2. Negotiate (for now: assert) compatible pile bypasses.
/// 3. Offer all local pointers and read back the set the peer wants.
/// 4. Offer the chunk IDs needed for the wanted pointers and read back those to skip.
/// 5. Stream the remaining chunks as `Some((id, data))` messages read from `bypass_pile`,
///    ordered by the raw pile's transfer-ordering hint, then send a `None` terminator.
///
/// `progress` is sized to the number of chunks sent and incremented once per chunk.
///
/// Fails if the peer requests a pointer that was not offered, if a chunk cannot be read,
/// or on any I/O or protocol error.
pub fn offering_side<R: Read, W: Write>(
    pile: &Pile<Arc<Box<dyn RawPile>>>,
    bypass_pile: &Box<dyn RawPile>,
    reader: &mut R,
    writer: &mut W,
    mut progress: Box<dyn ProgressTracker>,
) -> anyhow::Result<()> {
    let version = env!("CARGO_PKG_VERSION");
    let expecting = format!("Datman Pull Accepter {}", version);
    write_message(writer, &format!("Datman Pull Offerer {}", version))?;
    writer.flush()?;

    let found: String = read_message(reader)?;
    ensure!(
        found == expecting,
        "Version mismatch. Expecting {:?} got {:?}",
        expecting,
        found
    );

    // First 'negotiate' (for now: assert) a pile bypass.
    // This lets us avoid decompressing things before recompressing them at the other end,
    // assuming both ends use the same dictionary.
    negotiate_bypassed_pile(pile, bypass_pile, reader, writer)?;

    let offered_pointers = offer_pointers(pile, writer)?;
    let wanted_pointers = read_message::<_, BTreeSet<String>>(reader)?;

    let mut chunks_to_offer: BTreeSet<ChunkId> = BTreeSet::new();
    for pointer_name in &wanted_pointers {
        let pointer_data = offered_pointers
            .get(pointer_name)
            .with_context(|| format!("Requested pointer {:?} was not offered", pointer_name))?;
        // The chunks holding the tree itself...
        collect_chunk_ids_from_chunkref(pile, &pointer_data.chunk_ref, &mut chunks_to_offer)?;
        // ...and the chunks holding the content of every file in the tree.
        let root_node = retrieve_tree_node(pile, pointer_data.chunk_ref.clone())?;
        collect_chunk_ids(pile, &root_node.node, &mut chunks_to_offer)?;
    }

    write_message(writer, &chunks_to_offer)?;
    writer.flush()?;

    let chunks_to_skip: BTreeSet<ChunkId> = read_message(reader)?;
    let chunks_to_send: Vec<ChunkId> = chunks_to_offer
        .difference(&chunks_to_skip)
        .cloned()
        .collect();

    // These sets can be large; release them before building the next one.
    drop(chunks_to_offer);
    drop(chunks_to_skip);

    let start_sort_by_hints = Instant::now();
    // Keying the set by (hint, id) makes iteration follow the pile's preferred order.
    let chunks_to_send_with_hints: BTreeSet<(u64, ChunkId)> = chunks_to_send
        .into_iter()
        .map(|chunk_id| {
            pile.raw_pile
                .chunk_id_transfer_ordering_hint(&chunk_id)
                .map(|hint| (hint, chunk_id))
        })
        .collect::<anyhow::Result<_>>()?;
    let time_to_sort_by_hints = start_sort_by_hints.elapsed();

    info!(
        "{} s to sort {} chunks by their hints",
        time_to_sort_by_hints.as_secs_f32(),
        chunks_to_send_with_hints.len()
    );

    progress.set_max_size(chunks_to_send_with_hints.len() as u64);
    progress.set_current(0);
    for (_hint, chunk_id) in chunks_to_send_with_hints {
        let chunk_data = bypass_pile
            .read(Keyspace::Chunk, &chunk_id)?
            .context("Chunk vanished")?;
        write_message(writer, &Some((chunk_id, chunk_data)))?;
        progress.inc_progress(1);
    }

    // End-of-stream marker. It has the same `Option<(ChunkId, Vec<u8>)>` type as the chunk
    // messages above, which is the type the accepting side reads.
    write_message(writer, &None::<(ChunkId, Vec<u8>)>)?;
    writer.flush()?;

    Ok(())
}
/// Runs the accepting (receiving) half of the Datman pull protocol over `reader`/`writer`.
///
/// Steps, in wire order:
/// 1. Exchange version greetings; fail on mismatch.
/// 2. Negotiate (for now: assert) compatible pile bypasses.
/// 3. Read the offered pointers and request every one that is not present locally.
///    A wanted pointer whose parent is neither present locally nor offered is an error.
/// 4. Read the offered chunk IDs and reply with those already present (to be skipped).
/// 5. Receive chunks until the `None` terminator, writing each into `bypass_pile`.
///    An unexpected chunk, or a missing expected chunk, is an error.
/// 6. Write the wanted pointers into `pile` and flush it.
///
/// Only the pointers that were requested are written. Pointers that already exist locally
/// are left untouched: their chunks were never requested, so overwriting them with the
/// peer's data could leave a pointer that references chunks this pile does not have.
pub fn accepting_side<R: Read, W: Write>(
    pile: &Pile<Arc<Box<dyn RawPile>>>,
    bypass_pile: &Box<dyn RawPile>,
    reader: &mut R,
    writer: &mut W,
    mut progress: Box<dyn ProgressTracker>,
) -> anyhow::Result<()> {
    let version = env!("CARGO_PKG_VERSION");
    let expecting = format!("Datman Pull Offerer {}", version);
    write_message(writer, &format!("Datman Pull Accepter {}", version))?;
    writer.flush()?;

    let found: String = read_message(reader)?;
    ensure!(
        found == expecting,
        "Version mismatch. Expecting {:?} got {:?}",
        expecting,
        found
    );

    // First 'negotiate' (for now: assert) a pile bypass.
    // This lets us avoid decompressing things before recompressing them at the other end,
    // assuming both ends use the same dictionary.
    negotiate_bypassed_pile(pile, bypass_pile, reader, writer)?;

    let offered_pointers: BTreeMap<String, PointerData> = read_message(reader)?;
    let mut wanted_pointers: BTreeSet<String> = BTreeSet::new();

    for (pointer_name, pointer_data) in &offered_pointers {
        if pile.read_pointer(pointer_name)?.is_none() {
            wanted_pointers.insert(pointer_name.clone());
            if let Some(parent) = &pointer_data.parent_pointer {
                if pile.read_pointer(parent)?.is_none() && !offered_pointers.contains_key(parent) {
                    bail!("Offered pointer {:?} requires parent {:?} which we don't have and isn't offered.", pointer_name, parent);
                }
            }
        }
    }

    write_message(writer, &wanted_pointers)?;
    writer.flush()?;

    let offered_chunks: BTreeSet<ChunkId> = read_message(reader)?;
    let mut chunks_to_skip: BTreeSet<ChunkId> = BTreeSet::new();
    for chunk_id in &offered_chunks {
        if pile.chunk_exists(chunk_id)? {
            chunks_to_skip.insert(*chunk_id);
        }
    }

    write_message(writer, &chunks_to_skip)?;
    writer.flush()?;

    // `chunks_to_skip` is a subset of `offered_chunks`, so this cannot underflow.
    let num_chunks_to_recv = offered_chunks.len() - chunks_to_skip.len();

    let mut chunks_to_recv: BTreeSet<ChunkId> = offered_chunks
        .difference(&chunks_to_skip)
        .cloned()
        .collect();
    // These sets can be large; release them before receiving chunk data.
    drop(offered_chunks);
    drop(chunks_to_skip);

    progress.set_max_size(num_chunks_to_recv as u64);
    progress.set_current(0);
    while let Some((chunk_id, chunk_data)) = read_message::<_, Option<(ChunkId, Vec<u8>)>>(reader)?
    {
        ensure!(
            chunks_to_recv.remove(&chunk_id),
            "Received unexpected chunk"
        );
        bypass_pile.write(Keyspace::Chunk, &chunk_id, &chunk_data)?;
        progress.inc_progress(1);
    }

    ensure!(chunks_to_recv.is_empty(), "Unreceived chunks.");

    // Chunks are in place; now it is safe to publish the pointers we actually pulled.
    for (pointer_name, pointer_data) in &offered_pointers {
        if wanted_pointers.contains(pointer_name) {
            pile.write_pointer(pointer_name, pointer_data)?;
        }
    }

    pile.flush()?;

    Ok(())
}

View File

@ -0,0 +1,456 @@
use crate::commands::backup::split_pointer_name;
use crate::descriptor::{Descriptor, DestPileDescriptor};
use anyhow::Context;
use chrono::{Date, DateTime, Utc};
use comfy_table::presets::UTF8_FULL;
use comfy_table::{Attribute, Cell, Color, ContentArrangement, Table};
use humansize::FileSize;
use itertools::Itertools;
use log::info;
use std::collections::{BTreeMap, BTreeSet};
use std::ffi::CString;
use std::io::Read;
use std::mem;
use std::mem::size_of;
use std::os::unix::ffi::OsStrExt;
use std::os::unix::fs::MetadataExt;
use std::path::Path;
use yama::chunking::RecursiveUnchunker;
use yama::commands::{load_pile_descriptor, open_pile, retrieve_tree_node};
use yama::definitions::{ChunkId, RecursiveChunkRef, TreeNode};
use yama::pile::{DebugStatistics, Pile, RawPile};
// This module generates reports for a Datman system.
// Referenced Chunk IDs are counted and used to give an indication of size.
// Chunk IDs are summarised into u32s to reduce memory usage. Since the report is approximate,
// it doesn't matter if there are a few collisions (although they are still fairly unlikely to
// affect much).
/// The result of `generate_report`: backup recency per source plus approximate chunk usage.
#[derive(Clone)]
pub struct Report {
/// For every source named in the descriptor (and every source name parsed out of a pointer
/// name), the timestamp parsed from one of its pointers, or `None` if no pointer was found.
pub last_source_backups: BTreeMap<String, Option<DateTime<Utc>>>,
/// `true` when `chunk_usage` is keyed by `<source>+<YYYY-MM>` groups rather than by
/// individual pointer names.
pub chunk_usages_aggregated: bool,
/// Chunk-count statistics per pointer group (see `chunk_usages_aggregated` for the key form).
pub chunk_usage: BTreeMap<String, Sizes>,
/// Whatever `debug_statistics()` returned for the pile's raw pile.
pub debug_stats: Option<DebugStatistics>,
}
/// Chunk-count statistics for one pointer group. All values are counts of (condensed)
/// chunk IDs, not bytes.
#[derive(Clone, Default)]
pub struct Sizes {
/// Total number of chunks that we refer to.
pub total: u32,
/// Each referred chunk is counted once here, but divided by the number of sharers.
/// We are 'morally responsible' for this many chunks.
pub moral: u32,
/// Number of chunks that only we point to.
pub unique: u32,
/// Number of chunks for which we are the oldest (lexicographically earliest) pointer to point
/// to those chunks.
// NOTE(review): `generate_report` visits pointer groups in *reverse* key order and credits a
// chunk to the first group it is seen in, i.e. the lexicographically last group that refers
// to it. That appears to contradict "oldest" above; confirm which one is intended.
pub rollup: u32,
}
type CondensedChunkId = u32;
fn condense_chunk_id(chunk_id: ChunkId) -> CondensedChunkId {
CondensedChunkId::from_be_bytes(
chunk_id[0..size_of::<CondensedChunkId>()]
.try_into()
.unwrap(),
)
}
/// Builds a `Report` for the pile described by `dest_pile_descriptor`.
///
/// Every pointer in the pile is read and the (condensed) chunk IDs of its files are
/// collected. Pointers are then put into groups: one group per pointer, or, when
/// `aggregate_chunk_usage_by_month` is set, one group per `<source>+<YYYY-MM>`. In the
/// aggregated mode a pointer whose name cannot be split is an error.
///
/// Per group the `Sizes` counters are computed over the de-duplicated chunk IDs of all its
/// pointers *including their parent chains*. `descriptor` supplies the list of source names
/// so that sources without any backup still show up in `last_source_backups`.
pub fn generate_report(
dest_pile_descriptor: &DestPileDescriptor,
descriptor: &Descriptor,
aggregate_chunk_usage_by_month: bool,
) -> anyhow::Result<Report> {
let pile_descriptor = load_pile_descriptor(&dest_pile_descriptor.path)?;
let pile = open_pile(&dest_pile_descriptor.path, &pile_descriptor)?;
let debug_stats = pile.raw_pile.debug_statistics()?;
// pointer name -> (parent pointer name, condensed chunk IDs of that pointer's own tree)
let mut pointers_to_parent_and_chunkids = BTreeMap::new();
// group name -> names of the pointers belonging to that group
let mut pointergroups_to_pointers: BTreeMap<String, Vec<String>> = BTreeMap::new();
info!("Collecting chunk IDs... This will probably be slow.");
for pointer_name in pile.list_pointers()? {
let pointer = pile
.read_pointer(&pointer_name)?
.context("listed pointer doesn't exist")?;
let root_node = retrieve_tree_node(&pile, pointer.chunk_ref)?;
let pointer_chunk_ids = collect_chunk_ids(&pile, &root_node.node)?;
// Group key: either the month the pointer falls in, or the pointer itself.
let pointergroup = if aggregate_chunk_usage_by_month {
let (base, date_time) =
split_pointer_name(&pointer_name).context("Can't split pointer name")?;
format!("{}+{}", base, date_time.format("%Y-%m"))
} else {
pointer_name.clone()
};
pointergroups_to_pointers
.entry(pointergroup)
.or_default()
.push(pointer_name.clone());
pointers_to_parent_and_chunkids
.insert(pointer_name, (pointer.parent_pointer, pointer_chunk_ids));
}
// Now we iterate in reverse order, making a list of count of Chunk IDs.
// At the same time, we can also calculate 'rollup' sizes.
// chunk -> number of groups that refer to it
let mut chunk_sharer_counts: BTreeMap<CondensedChunkId, u16> = BTreeMap::new();
let mut pointergroup_stats: BTreeMap<String, Sizes> = BTreeMap::new();
for (pointergroup_name, pointers_in_group) in pointergroups_to_pointers.iter().rev() {
// De-duplicate within the group so a group counts as one sharer per chunk.
let mut deduped_chunks = BTreeSet::new();
for pointer_name in pointers_in_group {
deduped_chunks.extend(iter_over_all_chunkids_incl_parents(
&pointers_to_parent_and_chunkids,
&pointer_name,
))
}
let mut rollup_count = 0;
for chunk in deduped_chunks {
let count = chunk_sharer_counts.entry(chunk).or_default();
*count += 1;
// First time this chunk is seen in this (reverse) pass: credit it to this group.
if *count == 1 {
rollup_count += 1;
}
}
let entry = pointergroup_stats
.entry(pointergroup_name.to_owned())
.or_default();
entry.rollup = rollup_count;
}
// Now go through again and update all the stats!
for (pointergroup_name, pointers_in_group) in &pointergroups_to_pointers {
let mut deduped_chunks = BTreeSet::new();
for pointer_name in pointers_in_group {
deduped_chunks.extend(iter_over_all_chunkids_incl_parents(
&pointers_to_parent_and_chunkids,
&pointer_name,
))
}
let mut unique_count = 0;
// Index i holds the number of this group's chunks shared by i + 1 groups
// (sharer counts above 256 are clamped into the last slot).
let mut shared_count_by_sharers = [0u32; 256];
let total_count = deduped_chunks.len();
for chunk in deduped_chunks {
let count = chunk_sharer_counts[&chunk];
if count == 1 {
unique_count += 1;
} else {
let num_sharers = (count as usize).min(256);
shared_count_by_sharers[num_sharers - 1] += 1;
}
}
// Each shared chunk contributes 1 / (number of sharers) to the 'moral' size.
let mut sharers_sum: f64 = 0.0;
for (sharers_minus_one, count) in shared_count_by_sharers.into_iter().enumerate() {
sharers_sum += (count as f64) / (sharers_minus_one + 1) as f64;
}
let entry = pointergroup_stats
.entry(pointergroup_name.to_owned())
.or_default();
entry.moral = (sharers_sum.ceil() as u32) + unique_count;
entry.unique = unique_count;
entry.total = total_count as u32;
}
let mut last_backed_up = BTreeMap::new();
// Start every configured source at "never backed up".
for source_name in descriptor.sources.keys().cloned() {
last_backed_up.insert(source_name, None);
}
// Keys are visited in ascending name order, so for each source the entry that remains is
// the one from its lexicographically greatest pointer name.
for pointer_name in pointers_to_parent_and_chunkids.keys() {
if let Some((source_name, date_time)) = split_pointer_name(&pointer_name) {
last_backed_up.insert(source_name, Some(date_time));
}
}
Ok(Report {
last_source_backups: last_backed_up,
chunk_usage: pointergroup_stats,
chunk_usages_aggregated: aggregate_chunk_usage_by_month,
debug_stats,
})
}
// Does not filter duplicates...
fn iter_over_all_chunkids_incl_parents<'a>(
pointers_to_parent_and_chunkids: &'a BTreeMap<
String,
(Option<String>, BTreeSet<CondensedChunkId>),
>,
pointer_name: &'a str,
) -> Box<dyn Iterator<Item = CondensedChunkId> + 'a> {
let (parent, chunks) = &pointers_to_parent_and_chunkids[pointer_name];
match parent {
None => Box::new(chunks.iter().copied()),
Some(parent) => Box::new(chunks.iter().copied().chain(
iter_over_all_chunkids_incl_parents(pointers_to_parent_and_chunkids, &parent),
)),
}
}
/// Gathers the condensed IDs of every chunk referenced by the normal files in the tree
/// rooted at `root`.
///
/// Only `TreeNode::NormalFile` nodes contribute; all other node kinds are visited but add
/// nothing. An error from the tree visit or from resolving a chunk reference is propagated.
fn collect_chunk_ids<RP: RawPile>(
    pile: &Pile<RP>,
    root: &TreeNode,
) -> anyhow::Result<BTreeSet<CondensedChunkId>> {
    let mut found = BTreeSet::new();
    root.visit(
        &mut |tree_node, _| {
            if let TreeNode::NormalFile { content, .. } = tree_node {
                collect_chunk_ids_from_chunkref(pile, content, &mut found)?;
            }
            Ok(())
        },
        String::new(),
    )?;
    Ok(found)
}
/// Adds to `collection` the condensed ID of every chunk that `chunk_ref` refers to.
///
/// * At depth 0 the reference names a single chunk, which is inserted directly.
/// * At any greater depth, the reference is unchunked one level shallower and the
///   resulting byte stream is read as a sequence of `ChunkId`-sized records, each of
///   which is condensed and inserted.
///
/// Read errors from the unchunker are propagated, including an unexpected end of stream
/// in the middle of a chunk ID.
fn collect_chunk_ids_from_chunkref<RP: RawPile>(
    pile: &Pile<RP>,
    chunk_ref: &RecursiveChunkRef,
    collection: &mut BTreeSet<CondensedChunkId>,
) -> anyhow::Result<()> {
    if chunk_ref.depth == 0 {
        collection.insert(condense_chunk_id(chunk_ref.chunk_id));
        return Ok(());
    }

    let one_level_down = RecursiveChunkRef {
        chunk_id: chunk_ref.chunk_id,
        depth: chunk_ref.depth - 1,
    };
    let mut unchunker = RecursiveUnchunker::new(pile, one_level_down);
    let mut id_buf: ChunkId = Default::default();
    loop {
        match unchunker.read(&mut id_buf[..])? {
            // A zero-length read at a record boundary means the stream is exhausted.
            0 => break,
            // Short read: top the buffer up so that it holds one whole chunk ID.
            filled if filled < id_buf.len() => {
                unchunker.read_exact(&mut id_buf[filled..])?;
            }
            _ => {}
        }
        collection.insert(condense_chunk_id(id_buf));
    }
    Ok(())
}
/// Prints the whole report to stdout: the backup-times table first, then the pile-size
/// table. Stops at, and returns, the first error from either printer.
pub fn print_report(report: &Report) -> anyhow::Result<()> {
    print_time_report(report)?;
    print_size_report(report)
}
/// Prints a table to stdout showing, for each backup source, when it was last backed up.
///
/// Rows are sorted by `(date, source name)`. Sources that have never been backed up have
/// no date and therefore sort first; they are shown as a bold, blinking red `NEVER`.
///
/// For sources with a date, the cell shows the date (`%F`) and a human-readable age, and
/// is coloured by age: green for under 2 days, yellow for under 14 days, red otherwise.
/// Backups more than 28 days old additionally blink slowly.
///
/// This function currently always returns `Ok(())`.
pub fn print_time_report(report: &Report) -> anyhow::Result<()> {
    println!("\nBackup times");

    let mut table = Table::new();
    table
        .load_preset(UTF8_FULL)
        .set_content_arrangement(ContentArrangement::DynamicFullWidth)
        .enforce_styling();
    table.set_header(vec![
        Cell::new("Source name").fg(Color::Cyan),
        Cell::new("Last backed up").fg(Color::Cyan),
    ]);

    let today = Utc::today();

    // Put the date first in the tuple so that sorting orders rows by date (`None` first).
    let sort_by_dates: Vec<(Option<Date<Utc>>, String)> = report
        .last_source_backups
        .iter()
        .map(|(name, datetime)| (datetime.map(|dt| dt.date()), name.to_owned()))
        .sorted()
        .collect();

    for (date, source_name) in sort_by_dates {
        match date {
            None => {
                table.add_row(vec![
                    Cell::new(source_name).fg(Color::Magenta),
                    Cell::new("NEVER").fg(Color::Red).add_attributes(vec![
                        Attribute::SlowBlink,
                        Attribute::RapidBlink,
                        Attribute::Bold,
                    ]),
                ]);
            }
            Some(date) => {
                let number_of_days = today.signed_duration_since(date).num_days();

                // Use the singular for exactly one day ("1 day ago", not "1 days ago").
                // Zero or negative ages (a date of today or in the future) read "today".
                let num_days_human = match number_of_days {
                    1 => "1 day ago".to_owned(),
                    n if n > 0 => format!("{n} days ago"),
                    _ => "today".to_owned(),
                };

                let colour = if number_of_days < 2 {
                    Color::Green
                } else if number_of_days < 14 {
                    Color::Yellow
                } else {
                    Color::Red
                };

                let formatted_date = date.format("%F");
                let mut val_cell =
                    Cell::new(format!("{formatted_date} {num_days_human}")).fg(colour);
                if number_of_days > 28 {
                    val_cell = val_cell.add_attribute(Attribute::SlowBlink);
                }

                table.add_row(vec![Cell::new(source_name).fg(Color::Magenta), val_cell]);
            }
        }
    }

    println!("{table}");
    Ok(())
}
/// Prints a table to stdout with one row per entry of `report.chunk_usage`, giving its
/// rollup, unique, moral and total sizes as chunk counts.
///
/// When `report.debug_stats` is present, the average chunk size (total chunk size divided
/// by number of chunks) is passed to `format_size` so that each count is also shown with
/// an estimated byte size.
///
/// This function currently always returns `Ok(())`.
pub fn print_size_report(report: &Report) -> anyhow::Result<()> {
    println!("\nPile size");

    let mut table = Table::new();
    table
        .load_preset(UTF8_FULL)
        .set_content_arrangement(ContentArrangement::DynamicFullWidth)
        .enforce_styling();

    let header = vec![
        Cell::new("Pointer name").fg(Color::Cyan),
        Cell::new("Rollup size").fg(Color::Magenta),
        Cell::new("Unique size").fg(Color::Magenta),
        Cell::new("Moral size").fg(Color::Magenta),
        Cell::new("Total size").fg(Color::Magenta),
    ];
    table.set_header(header);

    let average_chunk_size = report
        .debug_stats
        .as_ref()
        .map(|stats| stats.total_chunk_size as f64 / stats.number_of_chunks as f64);

    // Every size column is rendered the same way; only the chunk count differs.
    let size_cell =
        |chunks: u32| Cell::new(format_size(chunks, average_chunk_size)).fg(Color::Yellow);

    for (pointer_name, sizes) in &report.chunk_usage {
        let row = vec![
            Cell::new(pointer_name).fg(Color::Blue),
            size_cell(sizes.rollup),
            size_cell(sizes.unique),
            size_cell(sizes.moral),
            size_cell(sizes.total),
        ];
        table.add_row(row);
    }

    println!("{table}");
    Ok(())
}
/// Formats a chunk count as `"<chunks> c"`.
///
/// If `average_chunk_size` (in bytes per chunk) is given, an estimated size is appended
/// as `" ~<size>"`, using binary units with one decimal place.
fn format_size(chunks: u32, average_chunk_size: Option<f64>) -> String {
    let est_size_suffix = match average_chunk_size {
        Some(bytes_per_chunk) => {
            let mut format = humansize::file_size_opts::BINARY;
            format.decimal_places = 1;
            let num_bytes = (chunks as f64 * bytes_per_chunk) as u64;
            format!(" ~{}", num_bytes.file_size(format).unwrap())
        }
        None => String::new(),
    };
    format!("{} c{}", chunks, est_size_suffix)
}
fn calculate_total_filesize_of_dir(dir: &Path) -> anyhow::Result<u64> {
let mut total = 0;
for file in std::fs::read_dir(dir)? {
let file = file?;
let metadata = file.metadata()?;
total += metadata.size();
if metadata.is_dir() {
total += calculate_total_filesize_of_dir(&file.path())?;
}
}
Ok(total)
}
/// Prints a one-row table to stdout describing the filesystem that holds `pile_path`:
/// its theoretical size, usable size, used space, the space taken by the pile directory
/// itself, and the available space.
///
/// The "Available" cell is red below 8 GiB, yellow below 64 GiB and green otherwise.
///
/// # Errors
/// Returns an error if the pile directory cannot be walked, if `pile_path` contains an
/// interior NUL byte (so cannot be passed to the C API), or if `statfs` fails. In the
/// last case the error carries the OS error (`errno`) set by the call.
pub fn print_filesystem_space(pile_path: &Path) -> anyhow::Result<()> {
    let usage_for_pile = calculate_total_filesize_of_dir(pile_path)?;

    // A path with an interior NUL cannot be represented as a C string; report it as an
    // error rather than panicking.
    let path_c = CString::new(pile_path.as_os_str().as_bytes())?;

    // SAFETY: `libc::statfs` is a plain C struct of integer fields and arrays thereof,
    // for which the all-zeroes bit pattern is a valid value.
    let mut stats: libc::statfs = unsafe { mem::zeroed() };
    // SAFETY: `path_c` is a valid NUL-terminated string that outlives the call, and
    // `stats` is a valid, writable `statfs` structure.
    let return_code = unsafe { libc::statfs(path_c.as_ptr(), &mut stats) };
    if return_code != 0 {
        // On failure statfs returns -1 and reports the reason through errno, so the
        // return value itself must not be interpreted as an OS error code.
        return Err(std::io::Error::last_os_error().into());
    }

    // On a BTRFS system with 2 disks in RAID1, note (about df -h):
    // - 'Size' shows the average size of the two disks. I think of it as 'ideal size'.
    // - 'Avail' seems to show the actual number of bytes usable.
    // - 'Used' seems to show the actual number of bytes used.
    // In short: probably avoid relying on 'size'.
    let block_size = stats.f_bsize as i64;
    let used_bytes = (stats.f_blocks - stats.f_bfree) as i64 * block_size;
    let avail_bytes = stats.f_bavail as i64 * block_size;
    let usable_bytes = used_bytes + avail_bytes;
    let theoretical_size = stats.f_blocks as i64 * block_size;

    let mut format = humansize::file_size_opts::BINARY;
    format.decimal_places = 1;
    format.decimal_zeroes = 1;

    println!("\nFilesystem Information");
    let mut table = Table::new();
    table
        .load_preset(UTF8_FULL)
        .set_content_arrangement(ContentArrangement::DynamicFullWidth)
        .enforce_styling();
    table.set_header(vec![
        Cell::new("Theoretical Size").fg(Color::Cyan),
        Cell::new("Usable Size").fg(Color::Cyan),
        Cell::new("Used").fg(Color::Cyan),
        Cell::new("Used for Pile").fg(Color::Cyan),
        Cell::new("Available").fg(Color::Cyan),
    ]);

    let available_space_colour = if avail_bytes < 8 * 1024 * 1024 * 1024 {
        Color::Red
    } else if avail_bytes < 64 * 1024 * 1024 * 1024 {
        Color::Yellow
    } else {
        Color::Green
    };

    table.add_row(vec![
        Cell::new(format!(
            "{:>9}",
            theoretical_size.file_size(&format).unwrap()
        ))
        .fg(Color::Blue),
        Cell::new(format!("{:>9}", usable_bytes.file_size(&format).unwrap())).fg(Color::Blue),
        Cell::new(format!("{:>9}", used_bytes.file_size(&format).unwrap())).fg(Color::Blue),
        Cell::new(format!("{:>9}", usage_for_pile.file_size(&format).unwrap())).fg(Color::Blue),
        Cell::new(format!("{:>9}", avail_bytes.file_size(&format).unwrap()))
            .fg(available_space_colour),
    ]);

    // End with a newline, like the other report printers, so that whatever is printed
    // next does not continue on the table's last line.
    println!("{table}");
    Ok(())
}

View File

@ -38,6 +38,10 @@ pub struct Descriptor {
pub piles: HashMap<String, DestPileDescriptor>, pub piles: HashMap<String, DestPileDescriptor>,
pub remote_hosts: HashMap<String, RemoteHostDescriptor>, pub remote_hosts: HashMap<String, RemoteHostDescriptor>,
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub retention: Option<RetentionPolicyConfig>,
} }
#[derive(Clone, Serialize, Deserialize, Debug)] #[derive(Clone, Serialize, Deserialize, Debug)]
@ -46,12 +50,22 @@ pub struct RemoteHostDescriptor {
pub path_to_datman: Option<String>, pub path_to_datman: Option<String>,
} }
/// Retention policy settings; this is the type of the optional `retention` field of
/// `Descriptor`.
// NOTE(review): the fields presumably give how many daily / weekly / monthly / yearly
// backups to keep when pruning — confirm against the prune implementation.
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct RetentionPolicyConfig {
pub daily: u32,
pub weekly: u32,
pub monthly: u32,
pub yearly: u32,
}
#[derive(Clone, Serialize, Deserialize, Debug)] #[derive(Clone, Serialize, Deserialize, Debug)]
#[serde(untagged)] #[serde(untagged)]
pub enum SourceDescriptor { pub enum SourceDescriptor {
DirectorySource { DirectorySource {
hostname: String, hostname: String,
directory: PathBuf, directory: PathBuf,
#[serde(default)]
cross_filesystems: bool,
}, },
VirtualSource { VirtualSource {
/// The name of the helper program that will be used to do this backup. /// The name of the helper program that will be used to do this backup.

View File

@ -15,10 +15,10 @@ You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>. along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/ */
use std::collections::HashMap; use std::collections::{BTreeSet, HashMap};
use std::fs::File; use std::fs::File;
use std::io::{BufRead, BufReader, Write}; use std::io::{BufRead, BufReader, Write};
use std::path::Path; use std::path::{Path, PathBuf};
use anyhow::anyhow; use anyhow::anyhow;
use anyhow::Context; use anyhow::Context;
@ -222,6 +222,23 @@ impl LabellingRules {
} }
None None
} }
/// Builds the set of full paths that the position-based rules mark as `Excluded`.
///
/// Each full path is formed by appending the rule's path textually to `base`.
///
/// # Panics
/// Panics if any rule path is neither empty nor starts with `/`, or (when there is at
/// least one rule) if `base` is not valid UTF-8.
pub fn get_exclusions_set(&self, base: &Path) -> BTreeSet<PathBuf> {
    self.position_based_rules
        .iter()
        .filter_map(|(ext_path, state)| {
            assert!(ext_path.is_empty() || ext_path.starts_with('/'));
            let base_str = base.to_str().expect("base path must always be utf-8");
            let full_path = PathBuf::from(format!("{}{ext_path}", base_str));
            if state == &Excluded {
                Some(full_path)
            } else {
                None
            }
        })
        .collect()
}
} }
/// Uninteractively label the nodes. /// Uninteractively label the nodes.

View File

@ -1,15 +1,19 @@
use crate::commands::backup::{get_pointer_name_at, label_filter_and_convert}; use crate::commands::backup::{get_pointer_name_at, label_filter_and_convert};
use crate::descriptor::{Descriptor, DestPileDescriptor, SourceDescriptor}; use crate::descriptor::{Descriptor, DestPileDescriptor, SourceDescriptor};
use crate::labelling::load_labelling_rules;
use crate::tree::FileTree; use crate::tree::FileTree;
use anyhow::{anyhow, bail}; use anyhow::{anyhow, bail};
use chrono::Utc; use chrono::Utc;
use log::info; use log::info;
use std::collections::BTreeSet;
use std::io::{Read, Write}; use std::io::{Read, Write};
use std::path::Path; use std::path::{Path, PathBuf};
use std::process::{Child, Command, Stdio}; use std::process::{Child, Command, Stdio};
use std::sync::Arc; use std::sync::Arc;
use yama::commands::{load_pile_descriptor, open_pile}; use yama::commands::{load_pile_descriptor, open_pile};
use yama::definitions::TreeNode; use yama::definitions::{PartialPointerData, TreeNode};
use yama::operations::storing::{pointer_ops_prepare_to_store, pointers_ops_after_store};
use yama::pile::access_guard::PileGuard;
use yama::pile::{Pile, RawPile, StoragePipelineSettings}; use yama::pile::{Pile, RawPile, StoragePipelineSettings};
use yama::progress::ProgressTracker; use yama::progress::ProgressTracker;
use yama::remote::responder::{Responder, ResponderWritingPipeline}; use yama::remote::responder::{Responder, ResponderWritingPipeline};
@ -45,10 +49,14 @@ pub fn scanning<R: Read, W: Write>(
read: &mut R, read: &mut R,
write: &mut W, write: &mut W,
path: &Path, path: &Path,
one_filesystem: bool,
exclusions: &BTreeSet<PathBuf>,
) -> anyhow::Result<Option<FileTree<(), (), (), ()>>> { ) -> anyhow::Result<Option<FileTree<(), (), (), ()>>> {
info!("Scanning."); info!("Scanning.");
write_message(write, &"scan")?; write_message(write, &"scan")?;
write_message(write, &path)?; write_message(write, &path)?;
write_message(write, &one_filesystem)?;
write_message(write, exclusions)?;
write.flush()?; write.flush()?;
let scan_result: Option<FileTree<(), (), (), ()>> = read_message(read)?; let scan_result: Option<FileTree<(), (), (), ()>> = read_message(read)?;
@ -64,19 +72,15 @@ pub fn chunking<
read: R, read: R,
mut write: W, mut write: W,
path: &Path, path: &Path,
pointer_name: String,
tree_node: &TreeNode, tree_node: &TreeNode,
raw_pile: Arc<RP>, raw_pile: Arc<RP>,
parent: Option<String>,
progress_bar: PT, progress_bar: PT,
use_writing_pipeline: bool, use_writing_pipeline: bool,
) -> anyhow::Result<(R, W)> { ) -> anyhow::Result<(R, W, PartialPointerData)> {
info!("Chunking."); info!("Chunking.");
write_message(&mut write, &"chunk")?; write_message(&mut write, &"chunk")?;
write_message(&mut write, &path)?; write_message(&mut write, &path)?;
write_message(&mut write, &pointer_name)?;
write_message(&mut write, tree_node)?; write_message(&mut write, tree_node)?;
write_message(&mut write, &parent)?;
write.flush()?; write.flush()?;
let (writing_pipeline, control_rx) = if use_writing_pipeline { let (writing_pipeline, control_rx) = if use_writing_pipeline {
@ -97,11 +101,13 @@ pub fn chunking<
(None, None) (None, None)
}; };
let guarded_pile = PileGuard::new(Arc::clone(&raw_pile), true);
let (r_handle, w_handle, join_handles) = Responder::start( let (r_handle, w_handle, join_handles) = Responder::start(
read, read,
write, write,
get_number_of_workers("YAMA_RESPONDERS") as u16, get_number_of_workers("YAMA_RESPONDERS") as u16,
raw_pile, Arc::new(guarded_pile),
writing_pipeline, writing_pipeline,
progress_bar, progress_bar,
); );
@ -111,7 +117,7 @@ pub fn chunking<
for handle in join_handles { for handle in join_handles {
handle.join().expect("Join handle should not fail"); handle.join().expect("Join handle should not fail");
} }
let read = r_handle.join().unwrap(); let mut read = r_handle.join().unwrap();
let write = w_handle.join().unwrap(); let write = w_handle.join().unwrap();
if let Some(control_rx) = control_rx { if let Some(control_rx) = control_rx {
@ -122,7 +128,9 @@ pub fn chunking<
info!("Remote finished chunking."); info!("Remote finished chunking.");
Ok((read, write)) let pointer_data: PartialPointerData = read_message(&mut read)?;
Ok((read, write, pointer_data))
} }
pub fn quit<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::Result<()> { pub fn quit<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::Result<()> {
@ -175,6 +183,7 @@ pub fn backup_remote_source_to_destination<PT: ProgressTracker + Send + 'static>
SourceDescriptor::DirectorySource { SourceDescriptor::DirectorySource {
hostname, hostname,
directory, directory,
cross_filesystems,
} => { } => {
let remote_host_descriptor = descriptor let remote_host_descriptor = descriptor
.remote_hosts .remote_hosts
@ -194,13 +203,22 @@ pub fn backup_remote_source_to_destination<PT: ProgressTracker + Send + 'static>
info!("Connecting..."); info!("Connecting...");
introduction(&mut read, &mut write)?; introduction(&mut read, &mut write)?;
let rules = load_labelling_rules(desc_path, source_name)?;
let exclusions = rules.get_exclusions_set(directory);
// then request to scan // then request to scan
info!("Requesting scan... (this may take some time)"); info!("Requesting scan... (this may take some time)");
let scan_result = scanning(&mut read, &mut write, directory.as_ref())? let scan_result = scanning(
&mut read,
&mut write,
directory.as_ref(),
!*cross_filesystems,
&exclusions,
)?
.ok_or_else(|| anyhow!("Remote scan failed (does the directory exist?)"))?; .ok_or_else(|| anyhow!("Remote scan failed (does the directory exist?)"))?;
let root = let mut root =
label_filter_and_convert(scan_result, descriptor, desc_path, source_name, dest)? label_filter_and_convert(scan_result, descriptor, source_name, &rules, dest)?
.ok_or_else(|| anyhow!("Empty filter..."))?; .ok_or_else(|| anyhow!("Empty filter..."))?;
let absolute_dest_path = desc_path.join(&dest.path); let absolute_dest_path = desc_path.join(&dest.path);
@ -248,19 +266,27 @@ pub fn backup_remote_source_to_destination<PT: ProgressTracker + Send + 'static>
let raw_pile = Arc::new(pile.raw_pile); let raw_pile = Arc::new(pile.raw_pile);
let pile = Pile::new(raw_pile.clone()); let pile = Pile::new(raw_pile.clone());
let (mut read, mut write) = chunking( pointer_ops_prepare_to_store(&pile, &mut root, &parent)?;
info!(
"Have pointer_name = {:?}, parent = {:?}",
pointer_name, parent
);
let (mut read, mut write, pointer_data) = chunking(
read, read,
write, write,
directory.as_ref(), directory.as_ref(),
pointer_name.clone(),
&root, &root,
raw_pile, raw_pile,
parent,
progress_bar, progress_bar,
true, true,
)?; )?;
quit(&mut read, &mut write)?; quit(&mut read, &mut write)?;
pointers_ops_after_store(&pile, &pointer_name, &pointer_data.complete(parent))?;
pile.flush()?; pile.flush()?;
info!("Stored! Checking for existence..."); info!("Stored! Checking for existence...");

View File

@ -1,21 +1,25 @@
// This file implements the responder side of the backup source protocol -- the protocol used // This file implements the responder side of the backup source protocol -- the protocol used
// to connect to remote backup sources. // to connect to remote backup sources.
use crate::tree::scan; use std::collections::BTreeSet;
use anyhow::bail;
use crossbeam_channel::Sender;
use log::info;
use std::io::{stdin, stdout, Read, Write}; use std::io::{stdin, stdout, Read, Write};
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::Arc; use std::sync::Arc;
use std::time::Instant; use std::time::Instant;
use yama::definitions::TreeNode;
use anyhow::bail;
use crossbeam_channel::Sender;
use log::info;
use yama::definitions::{PartialPointerData, TreeNode};
use yama::pile::{Pile, RawPile}; use yama::pile::{Pile, RawPile};
use yama::progress::ProgressTracker; use yama::progress::ProgressTracker;
use yama::remote::requester::Requester; use yama::remote::requester::Requester;
use yama::remote::{read_message, write_message, RequestBody, ResponseBody}; use yama::remote::{read_message, write_message, RequestBody, ResponseBody};
use yama::utils::get_number_of_workers; use yama::utils::get_number_of_workers;
use crate::tree::scan;
pub fn introduction<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::Result<()> { pub fn introduction<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::Result<()> {
let version = env!("CARGO_PKG_VERSION"); let version = env!("CARGO_PKG_VERSION");
write_message( write_message(
@ -39,48 +43,14 @@ pub fn introduction<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::R
pub fn scanning<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::Result<()> { pub fn scanning<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::Result<()> {
let path: PathBuf = read_message(read)?; let path: PathBuf = read_message(read)?;
let scan_result = scan(&path)?; let one_filesystem: bool = read_message(read)?;
let exclusions: BTreeSet<PathBuf> = read_message(read)?;
let scan_result = scan(&path, one_filesystem, &exclusions)?;
write_message(write, &scan_result)?; write_message(write, &scan_result)?;
write.flush()?; write.flush()?;
Ok(()) Ok(())
} }
pub fn chunking<R: Read + Send + 'static, W: Write + Send + 'static>(
mut read: R,
write: W,
) -> anyhow::Result<()> {
let path: PathBuf = read_message(&mut read)?;
let pointer_name: String = read_message(&mut read)?;
let tree_node: TreeNode = read_message(&mut read)?;
let parent: Option<String> = read_message(&mut read)?;
let (yama_requester, requester_join_handles) = Requester::new(read, write);
let raw_pile: Box<dyn RawPile> = Box::new(yama_requester);
let pile = Pile::new(raw_pile);
// TODO TODO progress
let progress_bar = &mut ();
yama::operations::storing::store_fully(
Arc::new(pile),
&path,
&pointer_name,
tree_node,
parent,
get_number_of_workers("YAMA_CHUNKERS"),
progress_bar,
true,
)?;
for join_handle in requester_join_handles {
join_handle.join().expect("Expected to join handle");
}
Ok(())
}
pub struct ProgressSender { pub struct ProgressSender {
pub last_sent: Instant, pub last_sent: Instant,
pub current_progress: u64, pub current_progress: u64,
@ -105,7 +75,6 @@ impl ProgressSender {
} }
pub fn send_if_overdue(&mut self) { pub fn send_if_overdue(&mut self) {
//info!("send if overdue...");
if Instant::now().duration_since(self.last_sent).as_millis() >= 1024 { if Instant::now().duration_since(self.last_sent).as_millis() >= 1024 {
self.send_now(false); self.send_now(false);
} }
@ -129,23 +98,17 @@ impl ProgressTracker for ProgressSender {
} }
} }
pub fn chunking_stdio() -> anyhow::Result<()> { // TODO use io-streams crate and get rid of the duplication!!
let (path, pointer_name, tree_node, parent) = { pub fn chunking_stdio() -> anyhow::Result<PartialPointerData> {
let (path, tree_node) = {
let stdin = stdin(); let stdin = stdin();
let mut read = stdin.lock(); let mut read = stdin.lock();
let path: PathBuf = read_message(&mut read)?; let path: PathBuf = read_message(&mut read)?;
let pointer_name: String = read_message(&mut read)?;
let tree_node: TreeNode = read_message(&mut read)?; let tree_node: TreeNode = read_message(&mut read)?;
let parent: Option<String> = read_message(&mut read)?; (path, tree_node)
(path, pointer_name, tree_node, parent)
}; };
info!( let (pointer_data, requester_join_handles) = {
"Have pointer_name = {:?}, parent = {:?}",
pointer_name, parent
);
let requester_join_handles = {
let (yama_requester, requester_join_handles) = Requester::new_from_stdio(); let (yama_requester, requester_join_handles) = Requester::new_from_stdio();
let command_sender = yama_requester.clone_command_sender(); let command_sender = yama_requester.clone_command_sender();
info!("progress sender in use"); info!("progress sender in use");
@ -161,18 +124,15 @@ pub fn chunking_stdio() -> anyhow::Result<()> {
let pile = Pile::new(raw_pile); let pile = Pile::new(raw_pile);
yama::operations::storing::store_fully( let pointer_data = yama::operations::storing::store_without_pointer_ops(
Arc::new(pile), &Arc::new(pile),
&path, &path,
&pointer_name,
tree_node, tree_node,
parent,
get_number_of_workers("YAMA_CHUNKERS"), get_number_of_workers("YAMA_CHUNKERS"),
&mut progress_bar, &mut progress_bar,
true,
)?; )?;
requester_join_handles (pointer_data, requester_join_handles)
}; };
info!("Waiting to join."); info!("Waiting to join.");
@ -183,20 +143,7 @@ pub fn chunking_stdio() -> anyhow::Result<()> {
info!("Chunking completed."); info!("Chunking completed.");
Ok(()) Ok(pointer_data)
}
pub fn handler<R: Read + Send + 'static, W: Write + Send + 'static>(
mut read: R,
mut write: W,
) -> anyhow::Result<()> {
introduction(&mut read, &mut write)?;
scanning(&mut read, &mut write)?;
chunking(read, write)?;
Ok(())
} }
pub fn handler_stdio() -> anyhow::Result<()> { pub fn handler_stdio() -> anyhow::Result<()> {
@ -219,9 +166,11 @@ pub fn handler_stdio() -> anyhow::Result<()> {
info!("Chunking."); info!("Chunking.");
drop(read); drop(read);
drop(write); drop(write);
chunking_stdio()?; let pointer_data = chunking_stdio()?;
read = stdin.lock(); read = stdin.lock();
write = stdout.lock(); write = stdout.lock();
write_message(&mut write, &pointer_data)?;
write.flush()?;
} }
"exit" => { "exit" => {
write_message(&mut write, &"exit")?; write_message(&mut write, &"exit")?;

View File

@ -15,16 +15,16 @@ You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>. along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/ */
use std::collections::BTreeMap; use std::collections::{BTreeMap, BTreeSet};
use std::fmt::Debug; use std::fmt::Debug;
use std::fs::{read_link, symlink_metadata, DirEntry, Metadata}; use std::fs::{read_link, symlink_metadata, DirEntry, Metadata};
use std::io::ErrorKind; use std::io::ErrorKind;
use std::os::unix::fs::MetadataExt; use std::os::unix::fs::MetadataExt;
use std::path::Path; use std::path::{Path, PathBuf};
use anyhow::anyhow; use anyhow::anyhow;
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle}; use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use log::warn; use log::{debug, info, warn};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
pub use yama::definitions::FilesystemOwnership; pub use yama::definitions::FilesystemOwnership;
@ -216,12 +216,18 @@ pub fn mtime_msec(metadata: &Metadata) -> u64 {
} }
/// Scan the filesystem to produce a Tree, using a default progress bar. /// Scan the filesystem to produce a Tree, using a default progress bar.
pub fn scan(path: &Path) -> anyhow::Result<Option<FileTree<(), (), (), ()>>> { pub fn scan(
path: &Path,
one_filesystem: bool,
exclusions: &BTreeSet<PathBuf>,
) -> anyhow::Result<Option<FileTree<(), (), (), ()>>> {
let pbar = ProgressBar::with_draw_target(0, ProgressDrawTarget::stdout_with_hz(2)); let pbar = ProgressBar::with_draw_target(0, ProgressDrawTarget::stdout_with_hz(2));
pbar.set_style(ProgressStyle::default_spinner().template("{spinner} {pos:7} {msg}")); pbar.set_style(ProgressStyle::default_spinner().template("{spinner} {pos:7} {msg}"));
pbar.set_message("dir scan"); pbar.set_message("dir scan");
let result = scan_with_progress_bar(path, &pbar); let one_filesystem = if one_filesystem { Some(None) } else { None };
let result = scan_with_progress_bar(path, &pbar, one_filesystem, exclusions);
pbar.finish_at_current_pos(); pbar.finish_at_current_pos();
result result
} }
@ -230,7 +236,15 @@ pub fn scan(path: &Path) -> anyhow::Result<Option<FileTree<(), (), (), ()>>> {
pub fn scan_with_progress_bar( pub fn scan_with_progress_bar(
path: &Path, path: &Path,
progress_bar: &ProgressBar, progress_bar: &ProgressBar,
mut one_filesystem: Option<Option<u64>>,
exclusions: &BTreeSet<PathBuf>,
) -> anyhow::Result<Option<FileTree<(), (), (), ()>>> { ) -> anyhow::Result<Option<FileTree<(), (), (), ()>>> {
if exclusions.contains(path) {
// Don't enter excluded paths.
debug!("Not descending into excluded path: {:?}", path);
return Ok(None);
}
let metadata_res = symlink_metadata(path); let metadata_res = symlink_metadata(path);
progress_bar.inc(1); progress_bar.inc(1);
if let Err(e) = &metadata_res { if let Err(e) = &metadata_res {
@ -249,6 +263,14 @@ pub fn scan_with_progress_bar(
let metadata = metadata_res?; let metadata = metadata_res?;
let filetype = metadata.file_type(); let filetype = metadata.file_type();
if let Some(one_filesystem) = one_filesystem.as_mut() {
let this_fs = metadata.dev();
if *one_filesystem.get_or_insert(this_fs) != this_fs {
info!("Stopping at filesystem boundary: {:?}", path);
return Ok(None);
}
}
/*let name = path /*let name = path
.file_name() .file_name()
.ok_or(anyhow!("No filename, wat"))? .ok_or(anyhow!("No filename, wat"))?
@ -294,15 +316,23 @@ pub fn scan_with_progress_bar(
for entry in dir_read? { for entry in dir_read? {
let entry: DirEntry = entry?; let entry: DirEntry = entry?;
let scanned = scan_with_progress_bar(&entry.path(), progress_bar)?;
if entry.file_name() == ".datmanskip" {
// Directories with .datmanskip in them are to be skipped entirely.
// TODO(perf): should this be checked upfront before some children may already
// have been scanned?
debug!("Skipping {path:?} because it has a .datmanskip file.");
return Ok(None);
}
let scanned =
scan_with_progress_bar(&entry.path(), progress_bar, one_filesystem, exclusions)?;
if let Some(scanned) = scanned { if let Some(scanned) = scanned {
children.insert( if let Ok(filename) = entry.file_name().into_string() {
entry children.insert(filename, scanned);
.file_name() } else {
.into_string() warn!("Non-UTF-8 filename; ignoring: {:?}", entry.file_name())
.expect("OsString not String"), }
scanned,
);
} }
} }

177
flake.lock Normal file
View File

@ -0,0 +1,177 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1710146030,
"narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"naersk": {
"inputs": {
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1662220400,
"narHash": "sha256-9o2OGQqu4xyLZP9K6kNe1pTHnyPz0Wr3raGYnr9AIgY=",
"owner": "nix-community",
"repo": "naersk",
"rev": "6944160c19cb591eb85bbf9b2f2768a935623ed3",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "naersk",
"type": "github"
}
},
"nix-github-actions": {
"inputs": {
"nixpkgs": [
"poetry2nix",
"nixpkgs"
]
},
"locked": {
"lastModified": 1703863825,
"narHash": "sha256-rXwqjtwiGKJheXB43ybM8NwWB8rO2dSRrEqes0S7F5Y=",
"owner": "nix-community",
"repo": "nix-github-actions",
"rev": "5163432afc817cf8bd1f031418d1869e4c9d5547",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "nix-github-actions",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1714971268,
"narHash": "sha256-IKwMSwHj9+ec660l+I4tki/1NRoeGpyA2GdtdYpAgEw=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "27c13997bf450a01219899f5a83bd6ffbfc70d3c",
"type": "github"
},
"original": {
"id": "nixpkgs",
"ref": "nixos-23.11",
"type": "indirect"
}
},
"poetry2nix": {
"inputs": {
"flake-utils": "flake-utils",
"nix-github-actions": "nix-github-actions",
"nixpkgs": [
"nixpkgs"
],
"systems": "systems_2",
"treefmt-nix": "treefmt-nix"
},
"locked": {
"lastModified": 1715017507,
"narHash": "sha256-RN2Vsba56PfX02DunWcZYkMLsipp928h+LVAWMYmbZg=",
"owner": "nix-community",
"repo": "poetry2nix",
"rev": "e6b36523407ae6a7a4dfe29770c30b3a3563b43a",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "poetry2nix",
"type": "github"
}
},
"root": {
"inputs": {
"naersk": "naersk",
"nixpkgs": "nixpkgs",
"poetry2nix": "poetry2nix",
"utils": "utils"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
},
"systems_2": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"id": "systems",
"type": "indirect"
}
},
"treefmt-nix": {
"inputs": {
"nixpkgs": [
"poetry2nix",
"nixpkgs"
]
},
"locked": {
"lastModified": 1714058656,
"narHash": "sha256-Qv4RBm4LKuO4fNOfx9wl40W2rBbv5u5m+whxRYUMiaA=",
"owner": "numtide",
"repo": "treefmt-nix",
"rev": "c6aaf729f34a36c445618580a9f95a48f5e4e03f",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "treefmt-nix",
"type": "github"
}
},
"utils": {
"locked": {
"lastModified": 1659877975,
"narHash": "sha256-zllb8aq3YO3h8B/U0/J1WBgAL8EX5yWf5pMj3G0NAmc=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "c0e246b9b83f637f4681389ecabcb2681b4f3af0",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

92
flake.nix Normal file
View File

@ -0,0 +1,92 @@
# Nix flake for Yama and Datman.
# Builds the Rust components and the two Python helper applications, then bundles them
# (together with lz4) into a single package in which every program is wrapped so that the
# others can be found on PATH.
{
description = "Yama and Datman";
inputs = {
utils.url = "github:numtide/flake-utils";
naersk = {
url = "github:nix-community/naersk";
# Make naersk use the same nixpkgs as this flake.
inputs.nixpkgs.follows = "nixpkgs";
};
nixpkgs.url = "nixpkgs/nixos-23.11";
poetry2nix = {
url = "github:nix-community/poetry2nix";
# Make poetry2nix use the same nixpkgs as this flake.
inputs.nixpkgs.follows = "nixpkgs";
};
};
outputs = { self, nixpkgs, utils, naersk, poetry2nix }:
# Produce the outputs below once for each of flake-utils' default systems.
utils.lib.eachDefaultSystem (system: let
pkgs = nixpkgs.legacyPackages."${system}";
inherit (poetry2nix.lib.mkPoetry2Nix { inherit pkgs; }) mkPoetryApplication;
naersk-lib = naersk.lib."${system}";
# The Rust code at the repository root, built with naersk.
rustComponents = naersk-lib.buildPackage {
pname = "yama";
root = ./.;
buildInputs = with pkgs; [
openssl
pkg-config
sqlite
];
};
# The Python helper applications, each built from its own Poetry project directory.
mysqlHelper = mkPoetryApplication {
projectDir = ./datman-helper-mysql;
};
postgresHelper = mkPoetryApplication {
projectDir = ./datman-helper-postgres;
};
# We want to produce a package with all of these together, with wrappers that let them
# refer to each other by name (i.e. have each other on the path).
# Datman needs the helpers on the path.
# The helpers need lz4 on the path.
allInOne = pkgs.stdenv.mkDerivation {
name = "datman-aio";
# There is nothing to unpack or compile; the install phase only creates symlinks.
src = "${pkgs.emptyDirectory}";
installPhase = ''
# set -eu
mkdir $out $out/bin
ln -s ${rustComponents}/bin/{yama,datman} $out/bin
ln -s ${mysqlHelper}/bin/datman-helper-mysql-{backup,restore} $out/bin
ln -s ${postgresHelper}/bin/datman-helper-postgres-{backup,restore} $out/bin
ln -s ${pkgs.lz4}/bin/lz4 $out/bin/
runHook postInstall
'';
buildInputs = [ pkgs.makeWrapper ];
# Wrap every linked program (but not lz4) so that $out/bin is appended to its PATH.
postInstall = ''
# set -eu
for fn in $out/bin/{datman,yama,datman-helper-{mysql,postgres}-{backup,restore}}; do
wrapProgram $fn --suffix PATH : $out/bin
done
'';
};
in rec {
# NOTE(review): `defaultPackage`, `defaultApp` and `devShell` are the older flake output
# names; newer Nix releases prefer `packages.default`, `apps.default` and
# `devShells.default` — consider migrating once the minimum supported Nix is confirmed.
# `nix build`
packages.yama = allInOne;
defaultPackage = packages.yama;
# NixOS Modules
# nixosModules = {
# yama = import ./nixos_modules/yama.nix self;
# };
# `nix run`
# Note: this runs the bare Rust build (`rustComponents`), not the wrapped all-in-one package.
apps.yama = utils.lib.mkApp {
drv = rustComponents;
};
defaultApp = apps.yama;
# `nix develop`
devShell = pkgs.mkShell {
nativeBuildInputs = with pkgs; [ rustc cargo ];
};
});
}

View File

@ -4,7 +4,7 @@ if [ $# -ge 1 ]
then then
files=$* files=$*
else else
files="testsuite/setup.py testsuite/datmantests testsuite/helpers testsuite/yamatests datman-helper-postgres/datman_helper_postgres datman-helper-postgres/setup.py datman-helper-mysql/datman_helper_mysql datman-helper-mysql/setup.py" files="testsuite/setup.py testsuite/datmantests testsuite/helpers testsuite/yamatests datman-helper-postgres/datman_helper_postgres datman-helper-mysql/datman_helper_mysql"
fi fi
echo "Linting these locations: $files" echo "Linting these locations: $files"

50
shell.nix Normal file
View File

@ -0,0 +1,50 @@
# Non-flake development shell (for use with `nix-shell`).
# `pkgs` defaults to whatever <nixpkgs> resolves to; pass `pkgs` explicitly to pin it.
{ pkgs ? import <nixpkgs> {} }:
let
# We may need some packages from nixpkgs-unstable
#unstable = import <nixpkgs-unstable> {};
# rustc, cargo, rustfmt and the compiler sources merged into a single store path.
# NOTE(review): presumably so that tooling can find the whole toolchain under one prefix - confirm.
rust-toolchain = pkgs.symlinkJoin {
name = "rust-toolchain";
paths = [pkgs.rustc pkgs.cargo pkgs.rustfmt pkgs.rustPlatform.rustcSrc];
};
in
pkgs.mkShell {
buildInputs = [
rust-toolchain
pkgs.pkg-config
pkgs.alsa-lib
pkgs.sqlite
#pkgs.libclang # ??
];
nativeBuildInputs = [
pkgs.openssl
pkgs.python3
];
# Needed for bindgen when binding to avahi
LIBCLANG_PATH="${pkgs.llvmPackages_latest.libclang.lib}/lib";
# Cargo culted:
# Add to rustc search path
# (the package list below is empty, so this currently evaluates to an empty list)
RUSTFLAGS = (builtins.map (a: ''-L ${a}/lib'') [
]);
# Add to bindgen search path
BINDGEN_EXTRA_CLANG_ARGS =
# Includes with normal include path
# (the package list below is empty, so only the special paths further down contribute)
(builtins.map (a: ''-I"${a}/include"'') [
])
# Includes with special directory paths
++ [
''-I"${pkgs.llvmPackages_latest.libclang.lib}/lib/clang/${pkgs.llvmPackages_latest.libclang.version}/include"''
#''-I${pkgs.glib.dev}/include/glib-2.0"''
#''-I${pkgs.glib.out}/lib/glib-2.0/include/''
];
}

View File

@ -251,7 +251,8 @@ kind = {{ stdout = "blahblah.txt" }}
seed = 7555 seed = 7555
print(f"seed: {seed}") print(f"seed: {seed}")
rng.seed(seed) rng.seed(seed)
# min_files is 8 because we need enough files to use each label for this test to succeed. # min_files is 8 because we need enough files to use each label for this
# test to succeed.
initial_descriptor, _ = generate_random_dir(rng, src_path, 32, min_files=8) initial_descriptor, _ = generate_random_dir(rng, src_path, 32, min_files=8)
labellings = generate_labels(initial_descriptor, rng) labellings = generate_labels(initial_descriptor, rng)
save_labelling_rules(labelling_path.joinpath("srca.zst"), labellings) save_labelling_rules(labelling_path.joinpath("srca.zst"), labellings)
@ -298,3 +299,81 @@ kind = {{ stdout = "blahblah.txt" }}
) )
td.cleanup() td.cleanup()
def test_backup_incremental_with_mid_delete(self):
    """
    Take three incremental backups, delete the MIDDLE pointer, then check
    that the newest pointer can still be extracted and matches the source.

    This validates that removing a pointer which another pointer uses as
    its parent leaves the dependent pointer intact.
    """
    td = TemporaryDirectory("test_backup_incremental_with_mid_delete")
    # Register the cleanup immediately so the temporary directory is removed
    # even if an assertion or a subprocess call below fails.
    self.addCleanup(td.cleanup)
    tdpath = Path(td.name)

    datman_path = tdpath.joinpath("datman")
    src_path = datman_path.joinpath("srca")
    yama_path = datman_path.joinpath("main")

    set_up_simple_datman(datman_path)
    set_up_simple_yama(yama_path)

    # Pick a random seed but print it, so that a failing run can be reproduced.
    rng = Random()
    seed = rng.randint(0, 9001)
    print(f"seed: {seed}")
    rng.seed(seed)
    initial_descriptor, _ = generate_random_dir(rng, src_path, 32)

    print("storing")
    subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)

    # now mutate and store incremental
    randomly_mutate_directory_in_descriptor(initial_descriptor, src_path, rng)
    # NOTE(review): presumably the pause ensures each backup gets a distinct,
    # later pointer name - confirm.
    time.sleep(2)
    subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)

    # now mutate and store incremental again!
    randomly_mutate_directory_in_descriptor(initial_descriptor, src_path, rng)
    mutated_descriptor = scan_dir(src_path)
    time.sleep(2)
    subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)

    # One pointer name per non-empty output line.
    pointer_names = [
        line
        for line in subprocess.check_output(("yama", "debug", "lsp"), cwd=yama_path)
        .decode()
        .split("\n")
        if line
    ]

    self.assertEqual(len(pointer_names), 3)
    self.assertLess(pointer_names[0], pointer_names[1])
    self.assertLess(pointer_names[1], pointer_names[2])

    print(f"removing mid pointer {pointer_names[1]}")
    subprocess.check_call(
        ("yama", "debug", "rmp", pointer_names[1]),
        cwd=yama_path,
    )

    print("extracting last pointer to check still valid")
    dest_path = tdpath.joinpath("desta")
    subprocess.check_call(
        (
            "datman",
            "extract",
            "--skip-metadata",
            "--accept-partial",
            "main",
            "../desta",
        ),
        cwd=datman_path,
    )

    # this will be wrapped in a directory that starts with the name srca+
    extracted_dir_descriptor_wrapper = scan_dir(dest_path)

    contents = extracted_dir_descriptor_wrapper.contents
    self.assertEqual(len(contents), 1)
    key, value = next(iter(contents.items()))
    self.assertTrue(key.startswith("srca+"))

    self.assertIsInstance(value, DirectoryDescriptor)
    key, value = next(iter(value.contents.items()))
    self.assertEqual(key, "srca")

    self.assertEqual(value.ignore_metadata(), mutated_descriptor.ignore_metadata())

View File

@ -1,6 +1,7 @@
import shutil import shutil
import subprocess import subprocess
from pathlib import Path from pathlib import Path
from typing import Set
def set_up_simple_yama(path: Path): def set_up_simple_yama(path: Path):
@ -10,3 +11,13 @@ def set_up_simple_yama(path: Path):
"example_zstd.dict" "example_zstd.dict"
) )
shutil.copyfile(example_zstd_path, path.joinpath("important_zstd.dict")) shutil.copyfile(example_zstd_path, path.joinpath("important_zstd.dict"))
def list_bloblog_ids(pile: Path) -> Set[int]:
    """
    Collect the integer IDs of the bloblogs stored in a pile.

    Looks at every entry of ``<pile>/bloblog`` and keeps those whose file name
    parses as an integer; entries with any other name are ignored.
    """
    bloblog_dir = pile.joinpath("bloblog")
    ids: Set[int] = set()
    for entry in bloblog_dir.iterdir():
        try:
            bloblog_id = int(entry.name)
        except ValueError:
            # Not a bloblog file (name is not a number): skip it.
            continue
        ids.add(bloblog_id)
    return ids

View File

@ -22,7 +22,7 @@ REQUIRED = ["green", "attrs", "immutabledict"]
# What packages are optional? # What packages are optional?
EXTRAS = {"dev": ["black==21.7b0", "flake8==3.9.2", "isort==5.9.2"]} EXTRAS = {"dev": ["black==22.10.0", "flake8==3.9.2", "isort==5.9.2"]}
# The rest you shouldn't have to touch too much :) # The rest you shouldn't have to touch too much :)
# ------------------------------------------------ # ------------------------------------------------

View File

@ -0,0 +1,175 @@
import subprocess
from pathlib import Path
from random import Random
from tempfile import TemporaryDirectory
from unittest import TestCase
from helpers import (
DirectoryDescriptor,
generate_random_dir,
randomly_mutate_directory_in_descriptor,
scan_dir,
)
from helpers.datman_helpers import set_up_simple_datman
from helpers.yama_helpers import list_bloblog_ids, set_up_simple_yama
class TestYamaCompact(TestCase):
    """
    End-to-end tests for ``yama compact``, driven through the ``datman`` and
    ``yama`` command-line tools against a pile in a temporary directory.
    """

    def _gc_then_reclaim_compact(self, yama_path):
        """
        Run a shallow check with GC applied, then a compaction tuned so that
        only space reclamation can trigger it.

        ``--mergeable 2000`` sets the minimum number of small bloblogs needed
        for a merge far above what these tests create, while ``--reclaim 1``
        and ``--max-dealloc 1`` set the reclaim thresholds to a single byte.
        """
        subprocess.check_call(
            ("yama", "check", "--shallow", "--apply-gc"), cwd=yama_path
        )
        subprocess.check_call(
            (
                "yama",
                "compact",
                "--mergeable",
                "2000",
                "--reclaim",
                "1",
                "--max-dealloc",
                "1",
            ),
            cwd=yama_path,
        )

    def test_compaction_merge_two_small_bloblogs(self):
        """Two small bloblogs must be merged into one new bloblog."""
        td = TemporaryDirectory("test_compaction_merge_two_small_bloblogs")
        # Registered up-front so the directory is removed even on failure.
        self.addCleanup(td.cleanup)
        tdpath = Path(td.name)

        datman_path = tdpath.joinpath("datman")
        src_path = datman_path.joinpath("srca")
        yama_path = datman_path.joinpath("main")

        set_up_simple_datman(datman_path)
        set_up_simple_yama(yama_path)

        # Pick a random seed but print it, so that a failing run can be reproduced.
        rng = Random()
        seed = rng.randint(0, 9001)
        print(f"seed: {seed}")
        rng.seed(seed)
        generate_random_dir(rng, src_path, 32)

        # Back up twice: that way we should get at least two bloblogs!
        subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)
        subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)

        old_bloblog_ids = list_bloblog_ids(yama_path)

        self.assertGreater(
            len(old_bloblog_ids), 1, "Should be many bloblogs at this point"
        )

        # Merge as soon as 2 small bloblogs exist, with the 'small' threshold
        # raised to 2 GiB.
        subprocess.check_call(
            (
                "yama",
                "compact",
                "--mergeable",
                "2",
                "--small",
                str(2 * 1024 * 1024 * 1024),
            ),
            cwd=yama_path,
        )

        new_bloblog_ids = list_bloblog_ids(yama_path)
        self.assertEqual(
            len(new_bloblog_ids), 1, "Should only be 1 bloblog at this point."
        )
        self.assertEqual(
            list(new_bloblog_ids)[0],
            max(old_bloblog_ids) + 1,
            "New bloblog ID should be 1 greater than the max old one.",
        )

    def test_gc_then_compact(self):
        """
        Compaction must be a no-op while nothing is garbage, must rewrite
        bloblogs once a pointer has been removed and GC'd, and must leave the
        remaining pointer extractable.
        """
        td = TemporaryDirectory("test_gc_then_compact")
        # Registered up-front so the directory is removed even on failure.
        self.addCleanup(td.cleanup)
        tdpath = Path(td.name)

        datman_path = tdpath.joinpath("datman")
        src_path = datman_path.joinpath("srca")
        yama_path = datman_path.joinpath("main")

        set_up_simple_datman(datman_path)
        set_up_simple_yama(yama_path)

        # Pick a random seed but print it, so that a failing run can be reproduced.
        rng = Random()
        seed = rng.randint(0, 9001)
        print(f"seed: {seed}")
        rng.seed(seed)
        initial_descriptor, _ = generate_random_dir(rng, src_path, 32)

        subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)

        # Only one pointer exists so far: it is the first line of the listing.
        orig_pointer_name = (
            subprocess.check_output(("yama", "debug", "lsp"), cwd=yama_path)
            .decode()
            .split("\n")[0]
        )

        randomly_mutate_directory_in_descriptor(initial_descriptor, src_path, rng)
        mutated_descriptor = scan_dir(src_path)

        subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)

        old_bloblog_ids = list_bloblog_ids(yama_path)

        # Try a GC and check that it's a no-op
        self._gc_then_reclaim_compact(yama_path)

        unchanged_bloblog_ids = list_bloblog_ids(yama_path)
        self.assertEqual(
            old_bloblog_ids,
            unchanged_bloblog_ids,
            "No GC: no compaction should have happened.",
        )

        subprocess.check_call(
            ("yama", "debug", "rmp", orig_pointer_name), cwd=yama_path
        )

        # Try a GC and check that it did something
        self._gc_then_reclaim_compact(yama_path)

        new_bloblog_ids = list_bloblog_ids(yama_path)
        self.assertNotEqual(
            old_bloblog_ids, new_bloblog_ids, "GC: compaction should have happened."
        )

        # Check that we can still extract the files!
        dest_path = tdpath.joinpath("desta")
        subprocess.check_call(
            (
                "datman",
                "extract",
                "--skip-metadata",
                "--accept-partial",
                "main",
                "../desta",
            ),
            cwd=datman_path,
        )

        # The extraction is wrapped in a directory whose name starts with srca+
        extracted_dir_descriptor_wrapper = scan_dir(dest_path)

        contents = extracted_dir_descriptor_wrapper.contents
        self.assertEqual(len(contents), 1)
        key, value = next(iter(contents.items()))
        self.assertTrue(key.startswith("srca+"))

        self.assertIsInstance(value, DirectoryDescriptor)
        key, value = next(iter(value.contents.items()))
        self.assertEqual(key, "srca")

        self.assertEqual(value.ignore_metadata(), mutated_descriptor.ignore_metadata())

View File

@ -1,6 +1,6 @@
[package] [package]
name = "yama" name = "yama"
version = "0.5.0-alpha.2" version = "0.6.0-alpha.5"
authors = ["Olivier 'reivilibre' <olivier@librepush.net>"] authors = ["Olivier 'reivilibre' <olivier@librepush.net>"]
edition = "2018" edition = "2018"
description = "Deduplicated, compressed and encrypted content pile manager" description = "Deduplicated, compressed and encrypted content pile manager"
@ -11,11 +11,10 @@ license = "GPL-3.0-or-later"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
fastcdc = "1.0.2" fastcdc = "1.0.6"
zstd = "0.6.0" # 0.6.0+zstd.1.4.8 zstd = "0.11.2" # 0.11.2+zstd.1.5.2
sshish = "0.1.0" clap = { version = "3.1.18", features = ["derive"] }
clap = "= 3.0.0-beta.5" blake = "2.0.2"
blake = "2.0.0"
twox-hash = "1.5.0" twox-hash = "1.5.0"
serde = { version = "1.0.104", features = ["derive"] } serde = { version = "1.0.104", features = ["derive"] }
serde_bare = "0.3.0" serde_bare = "0.3.0"
@ -41,5 +40,6 @@ rustyline = "7.1.0"
derivative = "2.2.0" derivative = "2.2.0"
metrics = "0.17.1" metrics = "0.17.1"
[dev-dependencies] [dev-dependencies]
temp-dir = "0.1.11" temp-dir = "0.1.11"

View File

@ -18,21 +18,24 @@ along with Yama. If not, see <https://www.gnu.org/licenses/>.
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use anyhow::{bail, Context}; use anyhow::{bail, Context};
use clap::{crate_authors, crate_description, crate_version, Parser};
use log::info; use log::info;
use clap::Parser;
use env_logger::Env; use env_logger::Env;
use std::sync::Arc; use std::sync::Arc;
use yama::commands::{fully_integrate_pointer_node, load_pile_descriptor, open_pile}; use yama::commands::{fully_integrate_pointer_node, load_pile_descriptor, open_pile};
use yama::debug::{debug_command, DebugCommand}; use yama::debug::{debug_command, DebugCommand};
use yama::operations::checking::VacuumMode; use yama::operations::checking::VacuumMode;
use yama::operations::pushpull::{determine_bypass_level, open_pile_with_work_bypass, push_to}; use yama::operations::legacy_pushpull::{
use yama::operations::{checking, extracting}; determine_bypass_level, open_pile_with_work_bypass, push_to,
};
use yama::operations::{checking, cleanup, extracting};
use yama::pile::local_sqlitebloblogs::CompactionThresholds;
use yama::pile::{Pile, PileDescriptor, RawPile}; use yama::pile::{Pile, PileDescriptor, RawPile};
use yama::{commands, debug}; use yama::{commands, debug};
#[derive(Parser)] #[derive(Parser)]
#[clap(version = crate_version!(), author = crate_authors!(), about = crate_description!())] #[clap(version = env!("CARGO_PKG_VERSION"), author = env!("CARGO_PKG_AUTHORS"), about = env!("CARGO_PKG_DESCRIPTION"))]
struct Opts { struct Opts {
/// Chooses a different pile to be the working pile. /// Chooses a different pile to be the working pile.
/// If specified, must be the name of a remote in yama.toml. /// If specified, must be the name of a remote in yama.toml.
@ -54,8 +57,9 @@ enum PileCommand {
pointer_name: String, pointer_name: String,
/// Limited expression(s) of files to retrieve. /// Limited expression(s) of files to retrieve.
/// LIMITATION OF CURRENT VERSION: ONLY ONE EXACT PATH ALLOWED, PLEASE.
#[clap(short, long)] #[clap(short, long)]
subset: Vec<PathBuf>, subset: Option<String>,
destination: PathBuf, destination: PathBuf,
@ -80,6 +84,29 @@ enum PileCommand {
shallow: bool, shallow: bool,
}, },
Compact {
/// Don't actually perform any compaction; just plan it out.
#[clap(long)]
dry_run: bool,
/// Allocated size under which a bloblog is considered small.
#[clap(long = "small")]
small_thresh: Option<u64>,
/// Minimum amount of space to reclaim in order to run compaction for reclaim.
#[clap(long = "reclaim")]
min_reclaim: Option<u64>,
/// Maximum amount of space that can be deallocated in a bloblog before we consider it
/// worthwhile to replace.
#[clap(long = "max-dealloc")]
max_deallocated: Option<u64>,
/// Minimum number of mergeable small bloblogs in order to run compaction for merge.
#[clap(long)]
mergeable: Option<u32>,
},
/// Enter a debug prompt for manually operating on the yama pile. /// Enter a debug prompt for manually operating on the yama pile.
Debug { supplied_command: Vec<String> }, Debug { supplied_command: Vec<String> },
@ -116,7 +143,7 @@ fn wrapped_main() -> anyhow::Result<i32> {
match &opts.command { match &opts.command {
PileCommand::Retrieve { PileCommand::Retrieve {
pointer_name, pointer_name,
subset: _, subset,
destination, destination,
num_workers: workers, num_workers: workers,
} => { } => {
@ -134,10 +161,25 @@ fn wrapped_main() -> anyhow::Result<i32> {
fully_integrate_pointer_node(&pile, &mut root_tree_node.node, &mut pointer)?; fully_integrate_pointer_node(&pile, &mut root_tree_node.node, &mut pointer)?;
let mut node_to_extract = &mut root_tree_node.node;
if let Some(subset) = subset {
for path_to_descend in subset.split('/').filter(|s| !s.is_empty()) {
match node_to_extract.child(path_to_descend) {
Ok(new_node) => {
node_to_extract = new_node;
}
Err(msg) => {
bail!("Can't descend into {path_to_descend:?}: {msg}");
}
}
}
}
// todo allow disabling apply metadata // todo allow disabling apply metadata
extracting::extract( extracting::extract(
destination, destination,
&mut root_tree_node.node, node_to_extract,
&pile, &pile,
true, true,
workers.unwrap_or(2), workers.unwrap_or(2),
@ -173,6 +215,29 @@ fn wrapped_main() -> anyhow::Result<i32> {
return Ok(1); return Ok(1);
} }
} }
PileCommand::Compact {
dry_run,
small_thresh,
min_reclaim,
max_deallocated,
mergeable,
} => {
let this_dir = Path::new(".");
let descriptor =
load_pile_descriptor(this_dir).context("Failed to load pile descriptor")?;
cleanup::compact(
this_dir,
&descriptor,
!*dry_run,
true,
CompactionThresholds {
minimum_to_reclaim: min_reclaim.unwrap_or(2 * 1024 * 1024 * 1024),
minimum_small_bloblogs_to_merge: mergeable.unwrap_or(64),
cond_if_more_deallocated_than: max_deallocated.unwrap_or(256 * 1024 * 1024),
cond_if_less_allocated_than: small_thresh.unwrap_or(64 * 1024 * 1024),
},
)?;
}
PileCommand::Init {} => { PileCommand::Init {} => {
commands::init(".".as_ref())?; commands::init(".".as_ref())?;
} }

View File

@ -161,6 +161,8 @@ impl<'cst, CST: ChunkSubmissionTarget> Write for RecursiveChunker<'cst, CST> {
#[inline] #[inline]
pub fn calculate_chunkid(chunk: &[u8]) -> ChunkId { pub fn calculate_chunkid(chunk: &[u8]) -> ChunkId {
// TODO(newver) Allow pluggable chunkID calculations so that encrypted storage can work without
// leaking contents.
let mut chunk_id: ChunkId = Default::default(); let mut chunk_id: ChunkId = Default::default();
blake::hash(256, &chunk, &mut chunk_id).expect("BLAKE problem"); blake::hash(256, &chunk, &mut chunk_id).expect("BLAKE problem");
chunk_id chunk_id

View File

@ -22,7 +22,6 @@ use std::path::Path;
use std::sync::Arc; use std::sync::Arc;
use anyhow::{anyhow, bail, Context}; use anyhow::{anyhow, bail, Context};
use clap::crate_version;
use log::warn; use log::warn;
use crate::chunking::{RecursiveChunker, RecursiveUnchunker, SENSIBLE_THRESHOLD}; use crate::chunking::{RecursiveChunker, RecursiveUnchunker, SENSIBLE_THRESHOLD};
@ -48,7 +47,7 @@ pub fn init(dir: &Path) -> anyhow::Result<()> {
let mut file = File::create(yama_toml)?; let mut file = File::create(yama_toml)?;
let desc = PileDescriptor { let desc = PileDescriptor {
yama_version: crate_version!().to_owned(), yama_version: env!("CARGO_PKG_VERSION").to_owned(),
storage: PileStorage::SqliteIndexedBloblog, storage: PileStorage::SqliteIndexedBloblog,
compression: Some(12), compression: Some(12),
}; };

View File

@ -15,13 +15,12 @@ You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>. along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/ */
use crate::commands::{fully_integrate_pointer_node, retrieve_tree_node, store_tree_node}; use crate::commands::retrieve_tree_node;
use crate::definitions::{FilesystemOwnership, FilesystemPermissions, TreeNode}; use crate::definitions::{FilesystemOwnership, FilesystemPermissions, TreeNode};
use crate::operations::remove_pointer_safely;
use crate::pile::{Pile, PileDescriptor, RawPile}; use crate::pile::{Pile, PileDescriptor, RawPile};
use crate::tree::integrate_node_in_place;
use anyhow::anyhow; use anyhow::anyhow;
use clap::Parser; use clap::Parser;
use log::info;
use rustyline::error::ReadlineError; use rustyline::error::ReadlineError;
use rustyline::Editor; use rustyline::Editor;
@ -123,50 +122,7 @@ pub fn debug_command<RP: RawPile>(
} }
} }
DebugCommand::DeletePointer { name } => { DebugCommand::DeletePointer { name } => {
// retrieve this pointer remove_pointer_safely(pile, name)?;
let mut this_pointer = pile.read_pointer(name.as_str())?.ok_or_else(|| {
anyhow!("Pointer {:?} does not exist so can not be deleted.", name)
})?;
let mut this_node = retrieve_tree_node(&pile, this_pointer.chunk_ref.clone())?;
// fully integrate the pointer
fully_integrate_pointer_node(&pile, &mut this_node.node, &mut this_pointer)?;
assert!(this_pointer.parent_pointer.is_none());
// now integrate any pointers that rely on this one
// so that they no longer rely on this one.
for pointer in pile.list_pointers()?.iter() {
if pointer == name {
continue;
}
if let Some(mut pointer_data) = pile.read_pointer(pointer.as_str())? {
if let Some(parent_pointer) = pointer_data.parent_pointer.as_ref() {
if parent_pointer == name {
info!("Pointer is now an orphan: {:?}", pointer);
// need to integrate this node, so retrieve it
let mut node = retrieve_tree_node(&pile, pointer_data.chunk_ref)?;
// integrate it in-place
integrate_node_in_place(&mut node.node, &this_node.node)?;
// mark it as orphaned (no parent)
pointer_data.parent_pointer = None;
// store the orphaned node
let new_chunk_ref = store_tree_node(&pile, &node)?;
// associate the orphaned node with the orphaned pointer
pointer_data.chunk_ref = new_chunk_ref;
// write the pointer back.
pile.write_pointer(pointer.as_str(), &pointer_data)?;
}
}
}
}
// then delete the pointer
pile.delete_pointer(name)?;
info!("Deleted pointer: {:?}", name);
} }
DebugCommand::PointerInfo { name } => { DebugCommand::PointerInfo { name } => {
let this_pointer = pile let this_pointer = pile

View File

@ -35,6 +35,24 @@ pub struct PointerData {
pub gid_lookup: BTreeMap<u16, Option<String>>, pub gid_lookup: BTreeMap<u16, Option<String>>,
} }
/// Pointer data that has not yet been given a parent pointer.
///
/// Carries the same fields as [`PointerData`] except `parent_pointer`; call
/// [`PartialPointerData::complete`] to supply the parent and obtain a full [`PointerData`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PartialPointerData {
/// Reference to the chunk(s) this pointer refers to.
pub chunk_ref: RecursiveChunkRef,
/// NOTE(review): presumably maps numeric user IDs to user names - confirm.
pub uid_lookup: BTreeMap<u16, Option<String>>,
/// NOTE(review): presumably maps numeric group IDs to group names - confirm.
pub gid_lookup: BTreeMap<u16, Option<String>>,
}
impl PartialPointerData {
pub fn complete(self, parent_pointer: Option<String>) -> PointerData {
PointerData {
chunk_ref: self.chunk_ref,
parent_pointer,
uid_lookup: self.uid_lookup,
gid_lookup: self.gid_lookup,
}
}
}
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] #[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct RecursiveChunkRef { pub struct RecursiveChunkRef {
/// The root Chunk ID. /// The root Chunk ID.
@ -252,6 +270,19 @@ impl TreeNode {
} }
} }
} }
/// Looks up the direct child called `name` and returns a mutable reference to it.
///
/// # Errors
/// Returns a static reason string when this node is not a directory (normal file,
/// symlink or deleted entry) or when the directory has no child with that name.
pub fn child(&mut self, name: &str) -> Result<&mut TreeNode, &'static str> {
    // Only directories have children; every other variant is an immediate error.
    let children = match self {
        TreeNode::Directory { children, .. } => children,
        TreeNode::NormalFile { .. } => return Err("not a directory: normal file"),
        TreeNode::SymbolicLink { .. } => return Err("not a directory: symlink"),
        TreeNode::Deleted => return Err("not a directory: deleted"),
    };

    children.get_mut(name).ok_or("child not in directory")
}
} }
#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]

View File

@ -1,4 +1,80 @@
use crate::commands::{fully_integrate_pointer_node, retrieve_tree_node, store_tree_node};
use crate::pile::{Pile, RawPile};
use crate::tree::{differentiate_node_in_place, integrate_node_in_place};
use anyhow::{anyhow, Context};
use log::info;
pub mod checking; pub mod checking;
pub mod cleanup;
pub mod extracting; pub mod extracting;
pub mod pushpull; pub mod legacy_pushpull;
pub mod storing; pub mod storing;
/// Removes the pointer `name` from the pile without breaking pointers that depend on it.
///
/// Every pointer whose parent is `name` is first integrated against the removed pointer's
/// (fully integrated) tree, then re-differentiated against the removed pointer's own parent
/// (if it had one) and re-parented onto that parent. Only afterwards is `name` deleted.
///
/// # Errors
/// Fails if `name` does not exist, if its parent pointer does not exist, or if any read,
/// write, flush or tree operation on the pile fails.
pub fn remove_pointer_safely<P: RawPile>(pile: &Pile<P>, name: &str) -> anyhow::Result<()> {
    // retrieve this pointer
    let mut this_pointer = pile
        .read_pointer(name)?
        .ok_or_else(|| anyhow!("Pointer {:?} does not exist so can not be deleted.", name))?;
    let mut this_node = retrieve_tree_node(&pile, this_pointer.chunk_ref.clone())
        .context("retrieving 'this' node")?;

    // Capture the parent name BEFORE integrating: the call below receives `this_pointer`
    // mutably and may rewrite its parent link.
    let new_parent_name = this_pointer.parent_pointer.clone();
    fully_integrate_pointer_node(pile, &mut this_node.node, &mut this_pointer)
        .context("integrating 'this' node")?;

    // The removed pointer's parent becomes the new parent of its dependants, so it must be
    // fully integrated before anything is differentiated against it.
    let new_parent_node = if let Some(ref new_parent_name) = new_parent_name {
        let mut new_parent_pointer = pile
            .read_pointer(new_parent_name.as_str())?
            .ok_or_else(|| anyhow!("Parent pointer {:?} does not exist.", new_parent_name))?;
        let mut new_parent_node = retrieve_tree_node(&pile, new_parent_pointer.chunk_ref.clone())
            .context("retrieving new parent node")?;
        fully_integrate_pointer_node(pile, &mut new_parent_node.node, &mut new_parent_pointer)
            .context("integrating new parent node")?;
        Some(new_parent_node)
    } else {
        None
    };

    // now integrate any pointers that rely on this one
    // so that they no longer rely on this one.
    for pointer in pile.list_pointers()?.iter() {
        if pointer == name {
            continue;
        }
        if let Some(mut pointer_data) = pile.read_pointer(pointer.as_str())? {
            if let Some(parent_pointer) = pointer_data.parent_pointer.as_ref() {
                if parent_pointer == name {
                    info!("Pointer would be orphaned: {:?}; integrating", pointer);

                    // need to integrate this node, so retrieve it
                    let mut node = retrieve_tree_node(&pile, pointer_data.chunk_ref)?;

                    // integrate it in-place
                    integrate_node_in_place(&mut node.node, &this_node.node)?;

                    if let Some(ref new_parent_node) = new_parent_node {
                        // then differentiate with respect to the NEW parent
                        differentiate_node_in_place(&mut node.node, &new_parent_node.node)?;
                    }

                    // pass through the parent
                    pointer_data.parent_pointer = new_parent_name.clone();

                    // store the updated version of the pointer
                    let new_chunk_ref = store_tree_node(&pile, &node)?;
                    // associate the new node with the new version of the pointer
                    pointer_data.chunk_ref = new_chunk_ref;
                    // write the pointer back.
                    pile.write_pointer(pointer.as_str(), &pointer_data)?;

                    // we must flush chunks before deleting the pointer
                    pile.flush()
                        .context("flushing after writing pointer back")?;
                }
            }
        }
    }

    // then delete the pointer
    pile.delete_pointer(name)?;
    info!("Deleted pointer: {:?}", name);
    Ok(())
}

View File

@ -18,12 +18,16 @@ along with Yama. If not, see <https://www.gnu.org/licenses/>.
use crate::chunking::RecursiveUnchunker; use crate::chunking::RecursiveUnchunker;
use crate::commands::retrieve_tree_node; use crate::commands::retrieve_tree_node;
use crate::definitions::{ChunkId, TreeNode}; use crate::definitions::{ChunkId, TreeNode};
use crate::pile::{ControllerMessage, Keyspace, Pile, RawPile, StoragePipelineSettings}; use crate::pile::{
ControllerMessage, Keyspace, Pile, PipelineDescription, RawPile, StoragePipelineSettings,
};
use anyhow::bail; use anyhow::bail;
use crossbeam_channel::Sender; use crossbeam_channel::Sender;
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle}; use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use itertools::Itertools;
use log::{error, info, warn}; use log::{error, info, warn};
use std::collections::HashSet; use std::collections::HashSet;
use std::convert::TryInto;
use std::io::{Read, Write}; use std::io::{Read, Write};
use std::sync::Mutex; use std::sync::Mutex;
@ -108,6 +112,10 @@ impl<RP: RawPile> RawPile for VacuumRawPile<RP> {
self.underlying.delete(kind, key) self.underlying.delete(kind, key)
} }
/// Deletes a batch of keys from `kind` by forwarding the call unchanged to the underlying pile.
fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
self.underlying.delete_many(kind, keys)
}
fn list_keys( fn list_keys(
&self, &self,
kind: Keyspace, kind: Keyspace,
@ -131,6 +139,14 @@ impl<RP: RawPile> RawPile for VacuumRawPile<RP> {
self.underlying self.underlying
.build_storage_pipeline(settings, controller_send) .build_storage_pipeline(settings, controller_send)
} }
/// Returns the underlying pile's pipeline description as-is; this wrapper adds no entry of its own.
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
self.underlying.describe_pipeline()
}
/// Forwards the ordering-hint query for `chunk_id` unchanged to the underlying pile.
fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
self.underlying.chunk_id_transfer_ordering_hint(chunk_id)
}
} }
/// Runs a full check of a Yama pile. This reads ALL the chunks, which can take a long time. /// Runs a full check of a Yama pile. This reads ALL the chunks, which can take a long time.
@ -397,9 +413,21 @@ pub fn check_shallow<RP: RawPile>(
// actually do the vacuum! // actually do the vacuum!
info!("Going to vacuum them up."); info!("Going to vacuum them up.");
for vacuum_id in to_vacuum { for vacuum_ids_chunk in to_vacuum
pile.raw_pile.delete(Keyspace::Chunk, &vacuum_id)?; .into_iter()
pbar.inc(1); .chunks(512)
.into_iter()
.map(|c| c.collect::<Vec<ChunkId>>())
{
pile.raw_pile.delete_many(
Keyspace::Chunk,
vacuum_ids_chunk
.iter()
.map(|ci| ci.as_slice())
.collect::<Vec<&[u8]>>()
.as_slice(),
)?;
pbar.inc(vacuum_ids_chunk.len().try_into().unwrap());
} }
pile.flush()?; pile.flush()?;
pbar.finish_and_clear(); pbar.finish_and_clear();

View File

@ -0,0 +1,64 @@
use crate::pile::local_sqlitebloblogs::{CompactionThresholds, SqliteBloblogPile};
use crate::pile::{PileDescriptor, PileStorage};
use anyhow::{bail, Context};
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use log::info;
use std::path::Path;
/// Analyses, plans and (optionally) performs compaction of the pile at `pile_path`.
///
/// * `pile_desc` - descriptor of the pile; its `storage` kind decides whether compaction applies.
/// * `actually_run` - when `false`, compaction is only analysed and planned, not performed.
/// * `make_progress_bar` - when `true`, progress is drawn to stdout; otherwise the bar is hidden.
/// * `thresholds` - thresholds handed to the compaction planner.
///
/// # Errors
/// Fails for pile storage kinds that cannot be compacted, if the pile cannot be opened,
/// or if any compaction step fails.
pub fn compact(
    pile_path: &Path,
    pile_desc: &PileDescriptor,
    actually_run: bool,
    make_progress_bar: bool,
    thresholds: CompactionThresholds,
) -> anyhow::Result<()> {
    let progress = match make_progress_bar {
        true => ProgressBar::with_draw_target(1000_u64, ProgressDrawTarget::stdout_with_hz(10)),
        false => ProgressBar::hidden(),
    };
    let bar_style = ProgressStyle::default_bar()
        .template("[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}");
    progress.set_style(bar_style);
    progress.set_message("compacting");

    match pile_desc.storage {
        other @ PileStorage::RemoteOnly => {
            bail!("Cannot use compaction on this kind of pile: {other:?}!");
        }
        PileStorage::SqliteIndexedBloblog => {
            let bloblog_pile = SqliteBloblogPile::open(pile_path)
                .context("Failed to open SQLite-indexed Bloblog Pile")?;
            compact_bloblogs(bloblog_pile, progress, actually_run, thresholds)
        }
    }
}
/// Runs the three compaction phases on an opened bloblog pile: analysis, planning and,
/// only when `actually_run` is set, the compaction itself.
///
/// Summary figures for each phase are logged at `info` level. `pbar` is handed over to
/// the pile for the compaction phase.
fn compact_bloblogs(
    bloblog_pile: SqliteBloblogPile,
    pbar: ProgressBar,
    actually_run: bool,
    thresholds: CompactionThresholds,
) -> anyhow::Result<()> {
    info!("=== Analysing for compaction ===");
    let analysis = bloblog_pile.analyse_for_compaction()?;

    // Totals across every bloblog in the analysis.
    let (mut chunks_total, mut chunks_deleted) = (0u64, 0u64);
    let (mut bytes_total, mut bytes_deleted) = (0u64, 0u64);
    for stats in analysis.values() {
        chunks_total += stats.chunks_total;
        chunks_deleted += stats.chunks_deleted;
        bytes_total += stats.bytes_total;
        bytes_deleted += stats.bytes_deleted;
    }

    info!("{} bloblogs in this pile, with {chunks_total} chunks ({bytes_total} B) of which {chunks_deleted} ({bytes_deleted} B) are deleted.", analysis.len());

    info!("=== Planning compaction ===");
    let plan = bloblog_pile.plan_compaction(&thresholds, analysis)?;
    info!("Planned compaction: replace {} bloblogs (of which {} are small), freeing up {} B and rewriting {} B", plan.bloblogs_to_replace.len(), plan.small_bloblogs, plan.reclaimable_space, plan.bytes_to_write);

    // Dry run: stop once the plan has been reported.
    if !actually_run {
        return Ok(());
    }

    info!("=== Compacting ===");
    bloblog_pile.perform_compaction(Box::new(pbar), plan)?;
    Ok(())
}

View File

@ -2,7 +2,7 @@ use crate::chunking::RecursiveUnchunker;
use crate::commands::fully_load_pointer; use crate::commands::fully_load_pointer;
use crate::definitions::{ChunkId, RecursiveChunkRef, TreeNode}; use crate::definitions::{ChunkId, RecursiveChunkRef, TreeNode};
use crate::operations::checking::VacuumRawPile; use crate::operations::checking::VacuumRawPile;
use crate::operations::pushpull::PushWorkerToManagerMessage::{NewTask, TaskDone}; use crate::operations::legacy_pushpull::PushWorkerToManagerMessage::{NewTask, TaskDone};
use crate::pile::compression::{CompressionSettings, RawPileCompressor}; use crate::pile::compression::{CompressionSettings, RawPileCompressor};
use crate::pile::integrity::RawPileIntegrityChecker; use crate::pile::integrity::RawPileIntegrityChecker;
use crate::pile::local_sqlitebloblogs::SqliteBloblogPile; use crate::pile::local_sqlitebloblogs::SqliteBloblogPile;

View File

@ -29,7 +29,9 @@ use log::{error, warn};
use crate::chunking::{ChunkSubmissionTarget, RecursiveChunker, SENSIBLE_THRESHOLD}; use crate::chunking::{ChunkSubmissionTarget, RecursiveChunker, SENSIBLE_THRESHOLD};
use crate::commands; use crate::commands;
use crate::commands::{fully_integrate_pointer_node, retrieve_tree_node}; use crate::commands::{fully_integrate_pointer_node, retrieve_tree_node};
use crate::definitions::{PointerData, RecursiveChunkRef, RootTreeNode, TreeNode}; use crate::definitions::{
PartialPointerData, PointerData, RecursiveChunkRef, RootTreeNode, TreeNode,
};
use crate::pile::{existence_checker_stage, Pile, RawPile, StoragePipelineSettings}; use crate::pile::{existence_checker_stage, Pile, RawPile, StoragePipelineSettings};
use crate::progress::ProgressTracker; use crate::progress::ProgressTracker;
use crate::tree::{create_uidgid_lookup_tables, differentiate_node_in_place}; use crate::tree::{create_uidgid_lookup_tables, differentiate_node_in_place};
@ -240,7 +242,29 @@ pub fn store_fully<PT: ProgressTracker>(
parent: Option<String>, parent: Option<String>,
num_workers: u8, num_workers: u8,
progress_bar: &mut PT, progress_bar: &mut PT,
use_pipelined_storage: bool, ) -> anyhow::Result<()> {
pointer_ops_prepare_to_store(&pile, &mut root_node, &parent)?;
let pointer_data =
store_without_pointer_ops(&pile, &root_dir, root_node, num_workers, progress_bar)?
.complete(parent);
pointers_ops_after_store(&pile, &new_pointer_name, &pointer_data)?;
Ok(())
}
/// Finishes a store operation by recording the resulting pointer in the pile.
///
/// Writes `pointer_data` under the name `new_pointer_name` and then flushes the pile.
/// An error from either step is returned to the caller.
pub fn pointers_ops_after_store(
pile: &Pile<impl RawPile>,
new_pointer_name: &str,
pointer_data: &PointerData,
) -> anyhow::Result<()> {
pile.write_pointer(&new_pointer_name, &pointer_data)?;
// Flush straight after the pointer write (presumably so it is persisted before we return).
pile.flush()?;
Ok(())
}
pub fn pointer_ops_prepare_to_store(
pile: &Pile<impl RawPile>,
mut root_node: &mut TreeNode,
parent: &Option<String>,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
if let Some(parent) = parent.as_ref() { if let Some(parent) = parent.as_ref() {
let mut parent_pointer = pile.read_pointer(parent)?.ok_or_else(|| { let mut parent_pointer = pile.read_pointer(parent)?.ok_or_else(|| {
@ -254,8 +278,16 @@ pub fn store_fully<PT: ProgressTracker>(
fully_integrate_pointer_node(&pile, &mut parent_node.node, &mut parent_pointer)?; fully_integrate_pointer_node(&pile, &mut parent_node.node, &mut parent_pointer)?;
differentiate_node_in_place(&mut root_node, &parent_node.node)?; differentiate_node_in_place(&mut root_node, &parent_node.node)?;
} }
Ok(())
}
if use_pipelined_storage { pub fn store_without_pointer_ops<PT: ProgressTracker>(
pile: &Arc<Pile<Box<dyn RawPile>>>,
root_dir: &PathBuf,
mut root_node: TreeNode,
num_workers: u8,
progress_bar: &mut PT,
) -> anyhow::Result<PartialPointerData> {
// TODO make these configurable // TODO make these configurable
let sps = StoragePipelineSettings { let sps = StoragePipelineSettings {
num_compressors: get_number_of_workers("YAMA_PL_COMPRESSORS") as u32, num_compressors: get_number_of_workers("YAMA_PL_COMPRESSORS") as u32,
@ -265,7 +297,10 @@ pub fn store_fully<PT: ProgressTracker>(
let (control_tx, control_rx) = crossbeam_channel::unbounded(); let (control_tx, control_rx) = crossbeam_channel::unbounded();
let pile2 = pile.clone(); let pile2 = pile.clone();
let pipeline = pile.raw_pile.build_storage_pipeline(sps, control_tx)?; let pipeline = pile.raw_pile.build_storage_pipeline(sps, control_tx)?;
// TODO(newver) The existence checker stage should be able to be swapped between different implementations.
let pipeline = existence_checker_stage(pile2, pipeline); let pipeline = existence_checker_stage(pile2, pipeline);
store( store(
&root_dir, &root_dir,
&mut root_node, &mut root_node,
@ -278,15 +313,6 @@ pub fn store_fully<PT: ProgressTracker>(
while let Ok(_) = control_rx.recv() { while let Ok(_) = control_rx.recv() {
// TODO nothing for now. // TODO nothing for now.
} }
} else {
store(
&root_dir,
&mut root_node,
pile.as_ref(),
progress_bar,
num_workers,
)?;
}
let mut uid_lookup = BTreeMap::new(); let mut uid_lookup = BTreeMap::new();
let mut gid_lookup = BTreeMap::new(); let mut gid_lookup = BTreeMap::new();
@ -297,19 +323,20 @@ pub fn store_fully<PT: ProgressTracker>(
let chunk_ref = commands::store_tree_node( let chunk_ref = commands::store_tree_node(
&pile, &pile,
&RootTreeNode { &RootTreeNode {
name: root_dir.file_name().unwrap().to_str().unwrap().to_owned(), name: root_dir
.file_name()
.map(|s| s.to_str())
.flatten()
.unwrap_or("_root")
.to_owned(),
node: root_node, node: root_node,
}, },
)?; )?;
let pointer_data = PointerData { let pointer_data = PartialPointerData {
chunk_ref, chunk_ref,
parent_pointer: parent,
uid_lookup, uid_lookup,
gid_lookup, gid_lookup,
}; };
Ok(pointer_data)
pile.write_pointer(&new_pointer_name, &pointer_data)?;
pile.flush()?;
Ok(())
} }

View File

@ -26,6 +26,7 @@ use std::collections::HashSet;
use std::fmt::Debug; use std::fmt::Debug;
use std::sync::{Arc, Condvar, Mutex}; use std::sync::{Arc, Condvar, Mutex};
pub mod access_guard;
pub mod compression; pub mod compression;
pub mod encryption; pub mod encryption;
pub mod integrity; pub mod integrity;
@ -87,6 +88,8 @@ pub fn existence_checker_stage<RP: RawPile>(
pile: Arc<Pile<RP>>, pile: Arc<Pile<RP>>,
next_stage: Sender<(ChunkId, Vec<u8>)>, next_stage: Sender<(ChunkId, Vec<u8>)>,
) -> Sender<(ChunkId, Vec<u8>)> { ) -> Sender<(ChunkId, Vec<u8>)> {
// TODO(newver) Do better than this.
let shared_seen_set: Arc<Mutex<HashSet<ChunkId>>> = Default::default(); let shared_seen_set: Arc<Mutex<HashSet<ChunkId>>> = Default::default();
let (tx, rx) = crossbeam_channel::bounded::<(ChunkId, Vec<u8>)>(32); let (tx, rx) = crossbeam_channel::bounded::<(ChunkId, Vec<u8>)>(32);
@ -123,12 +126,22 @@ pub enum ControllerMessage {
}, },
} }
/// One stage of a pile's processing pipeline, as returned (in a `Vec`) by
/// `RawPile::describe_pipeline`.
///
/// The wrapper piles visible in this crate first ask the pile they wrap for its description and
/// then push their own stage onto the end of that list.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub enum PipelineDescription {
// NOTE(review): `Store` and `Remote` are not constructed in the code visible here; presumably
// they describe the terminal local store and a remote pile respectively — confirm.
Store,
Remote,
// Pushed by `RawPileIntegrityChecker::describe_pipeline`.
Integrity,
// Pushed by `RawPileCompressor::describe_pipeline`; the fingerprint is derived from a hash of
// the compression dictionary.
Compression { dictionary_fingerprint: u64 },
// Pushed by `RawPileEncryptor::describe_pipeline`.
Encryption,
}
pub trait RawPile: Send + Sync + Debug + 'static { pub trait RawPile: Send + Sync + Debug + 'static {
// TODO expose verification errors? // TODO expose verification errors?
fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool>; fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool>;
fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>>; fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>>;
fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()>; fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()>;
fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()>; fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()>;
fn delete_many(&self, kind: Keyspace, key: &[&[u8]]) -> anyhow::Result<()>;
fn list_keys( fn list_keys(
&self, &self,
kind: Keyspace, kind: Keyspace,
@ -153,6 +166,12 @@ pub trait RawPile: Send + Sync + Debug + 'static {
settings: StoragePipelineSettings, settings: StoragePipelineSettings,
controller_send: Sender<ControllerMessage>, controller_send: Sender<ControllerMessage>,
) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>>; ) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>>;
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>>;
/// Return a u64 order token that indicates the optimum order to read this chunk in
/// compared to other chunks.
fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64>;
} }
impl RawPile for Box<dyn RawPile> { impl RawPile for Box<dyn RawPile> {
@ -168,6 +187,9 @@ impl RawPile for Box<dyn RawPile> {
fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> { fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
self.as_ref().delete(kind, key) self.as_ref().delete(kind, key)
} }
/// Batch deletion: forwards to the boxed pile's `delete_many`.
fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
self.as_ref().delete_many(kind, keys)
}
fn list_keys( fn list_keys(
&self, &self,
kind: Keyspace, kind: Keyspace,
@ -192,6 +214,14 @@ impl RawPile for Box<dyn RawPile> {
self.as_ref() self.as_ref()
.build_storage_pipeline(settings, controller_send) .build_storage_pipeline(settings, controller_send)
} }
/// Forwards to the boxed pile's `describe_pipeline`; the box adds no stage of its own.
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
self.as_ref().describe_pipeline()
}
/// Forwards to the boxed pile's `chunk_id_transfer_ordering_hint` unchanged.
fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
self.as_ref().chunk_id_transfer_ordering_hint(chunk_id)
}
} }
impl<RP: RawPile> RawPile for Arc<RP> { impl<RP: RawPile> RawPile for Arc<RP> {
@ -207,6 +237,9 @@ impl<RP: RawPile> RawPile for Arc<RP> {
fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> { fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
self.as_ref().delete(kind, key) self.as_ref().delete(kind, key)
} }
/// Batch deletion: forwards to the `delete_many` of the pile behind the `Arc`.
fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
self.as_ref().delete_many(kind, keys)
}
fn list_keys( fn list_keys(
&self, &self,
kind: Keyspace, kind: Keyspace,
@ -231,6 +264,14 @@ impl<RP: RawPile> RawPile for Arc<RP> {
self.as_ref() self.as_ref()
.build_storage_pipeline(settings, controller_send) .build_storage_pipeline(settings, controller_send)
} }
/// Forwards to the `describe_pipeline` of the pile behind the `Arc`; no stage is added here.
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
self.as_ref().describe_pipeline()
}
/// Forwards to the `chunk_id_transfer_ordering_hint` of the pile behind the `Arc` unchanged.
fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
self.as_ref().chunk_id_transfer_ordering_hint(chunk_id)
}
} }
#[derive(Debug)] #[derive(Debug)]

View File

@ -0,0 +1,141 @@
use crate::chunking::calculate_chunkid;
use crate::definitions::ChunkId;
use crate::pile::{
ControllerMessage, Keyspace, PipelineDescription, RawPile, StoragePipelineSettings,
};
use anyhow::{anyhow, bail};
use crossbeam_channel::{Receiver, Sender};
use derivative::Derivative;
use std::sync::Arc;
use std::thread;
/// PileGuard is a wrapper around a pile that prevents data exfiltration and malicious corruption.
/// It acts much like a firewall in front of the wrapped pile: its `RawPile` implementation
/// refuses reads, deletions and key listing outright, and only answers existence checks for
/// chunks.
///
/// Preventing malicious corruption requires the chunks to be unprocessed. This way, their ID can be
/// checked by this module.
#[derive(Debug, Derivative)]
#[derivative(Clone(bound = ""))]
// we need to use derivative's Clone impl because Arc<R> causes R to have a bound on Clone
// even though that's not needed. https://github.com/rust-lang/rust/issues/26925
// NOTE(review): this struct stores `R` directly (not `Arc<R>`) and already requires `R: Clone`,
// so the remark above may have been carried over from another wrapper — confirm whether a plain
// `#[derive(Clone)]` would do here.
pub struct PileGuard<R: Clone + RawPile> {
/// The pile that guarded operations are forwarded to.
underlying: R,
/// Whether to verify chunk IDs to prevent malicious corruption
// NOTE(review): nothing in this module reads this flag after `new` stores it; the storage
// pipeline built by this guard verifies chunk IDs unconditionally.
verify_chunk_ids: bool,
}
/// Body of the access-guard storage pipeline stage.
///
/// Receives `(claimed chunk ID, chunk bytes)` pairs from `input`, recomputes each chunk's ID from
/// its bytes and forwards the pair to `subsequent_pipeline` only when the two IDs agree.
///
/// Returns `Ok(())` once receiving from `input` fails (i.e. the channel has been closed).
///
/// # Errors
/// Fails on the first chunk whose recomputed ID differs from the claimed ID, or when the
/// subsequent stage can no longer accept chunks.
fn pipeline(
    subsequent_pipeline: Sender<(ChunkId, Vec<u8>)>,
    input: Receiver<(ChunkId, Vec<u8>)>,
) -> anyhow::Result<()> {
    while let Ok((claimed_chunk_id, chunk)) = input.recv() {
        let actual_chunk_id = calculate_chunkid(&chunk);
        if actual_chunk_id != claimed_chunk_id {
            // Separate the labels from the values so the two IDs are readable in the message.
            bail!(
                "CHUNK ID MISMATCH — is this forgery? (malicious storage process?) claimed {:?}, actually {:?}",
                claimed_chunk_id,
                actual_chunk_id
            );
        }
        subsequent_pipeline
            .send((claimed_chunk_id, chunk))
            .map_err(|_| anyhow!("Subsequent step closed"))?;
    }
    Ok(())
}
impl<R: Clone + RawPile> PileGuard<R> {
    /// Creates a guard in front of `underlying`.
    ///
    /// `verify_chunk_ids` is stored on the guard exactly as given.
    pub fn new(underlying: R, verify_chunk_ids: bool) -> Self {
        Self {
            underlying,
            verify_chunk_ids,
        }
    }
}
impl<R: Clone + RawPile> RawPile for PileGuard<R> {
fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool> {
match kind {
Keyspace::Chunk => self.underlying.exists(kind, key),
Keyspace::ChunkHash => {
bail!("Access denied");
}
Keyspace::Pointer => {
bail!("Access denied");
}
}
}
fn read(&self, _kind: Keyspace, _key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
bail!("Access denied");
}
fn write(&self, kind: Keyspace, _key: &[u8], _value: &[u8]) -> anyhow::Result<()> {
match kind {
Keyspace::Chunk => {
todo!()
}
Keyspace::ChunkHash => {
bail!("Access denied");
}
Keyspace::Pointer => {
bail!("Access denied");
}
}
}
fn delete(&self, _kind: Keyspace, _key: &[u8]) -> anyhow::Result<()> {
bail!("Access denied");
}
fn delete_many(&self, _kind: Keyspace, _keys: &[&[u8]]) -> anyhow::Result<()> {
bail!("Access denied");
}
fn list_keys(
&self,
_kind: Keyspace,
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>> {
bail!("Access denied");
}
fn flush(&self) -> anyhow::Result<()> {
self.underlying.flush()
}
fn check_lowlevel(&self) -> anyhow::Result<bool> {
self.underlying.check_lowlevel()
}
fn build_storage_pipeline(
&self,
settings: StoragePipelineSettings,
controller_send: Sender<ControllerMessage>,
) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
let subsequent_pipeline = self
.underlying
.build_storage_pipeline(settings.clone(), controller_send.clone())?;
let (input_to_this_stage, receiver) = crossbeam_channel::bounded(8);
thread::Builder::new()
.name("yama Aguard".to_owned())
.spawn(move || {
if let Err(err) = pipeline(subsequent_pipeline, receiver) {
controller_send
.send(ControllerMessage::Failure {
worker_id: Arc::new(String::from("accessguard")),
error_message: format!("err {:?}", err),
})
.expect("This is BAD: failed to send failure message to controller.");
}
})
.unwrap();
Ok(input_to_this_stage)
}
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
// TODO(question) Should we be described in the pipeline?
self.underlying.describe_pipeline()
}
fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
self.underlying.chunk_id_transfer_ordering_hint(chunk_id)
}
}

View File

@ -15,6 +15,7 @@ You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>. along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/ */
use std::convert::TryInto;
use std::sync::Arc; use std::sync::Arc;
use std::thread; use std::thread;
use std::thread::JoinHandle; use std::thread::JoinHandle;
@ -24,10 +25,13 @@ use crossbeam_channel::{Receiver, Sender};
use derivative::Derivative; use derivative::Derivative;
use log::error; use log::error;
use metrics::{register_counter, Unit}; use metrics::{register_counter, Unit};
use zstd::block::{Compressor, Decompressor}; use zstd::bulk::{Compressor, Decompressor};
use crate::definitions::ChunkId; use crate::definitions::ChunkId;
use crate::pile::{ControllerMessage, DebugStatistics, Keyspace, RawPile, StoragePipelineSettings}; use crate::pile::{
ControllerMessage, DebugStatistics, Keyspace, PipelineDescription, RawPile,
StoragePipelineSettings,
};
pub const DECOMPRESS_CAPACITY: usize = 32 * 1024 * 1024; pub const DECOMPRESS_CAPACITY: usize = 32 * 1024 * 1024;
@ -150,9 +154,10 @@ impl<R: RawPile> RawPileCompressor<R> {
queue: Receiver<(Vec<u8>, Sender<Vec<u8>>)>, queue: Receiver<(Vec<u8>, Sender<Vec<u8>>)>,
settings: CompressionSettings, settings: CompressionSettings,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let mut compressor = Compressor::with_dict(settings.dictionary.as_ref().clone()); let mut compressor =
Compressor::with_dictionary(settings.level, settings.dictionary.as_ref())?;
while let Ok((job, response_sender)) = queue.recv() { while let Ok((job, response_sender)) = queue.recv() {
let result = compressor.compress(&job, settings.level)?; let result = compressor.compress(&job)?;
response_sender response_sender
.send(result) .send(result)
.or(Err(anyhow!("Couldn't send compression result")))?; .or(Err(anyhow!("Couldn't send compression result")))?;
@ -164,7 +169,7 @@ impl<R: RawPile> RawPileCompressor<R> {
queue: Receiver<(Vec<u8>, Sender<Vec<u8>>)>, queue: Receiver<(Vec<u8>, Sender<Vec<u8>>)>,
settings: CompressionSettings, settings: CompressionSettings,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let mut decompressor = Decompressor::with_dict(settings.dictionary.as_ref().clone()); let mut decompressor = Decompressor::with_dictionary(settings.dictionary.as_ref())?;
while let Ok((job, response_sender)) = queue.recv() { while let Ok((job, response_sender)) = queue.recv() {
let result = decompressor.decompress(&job, DECOMPRESS_CAPACITY)?; let result = decompressor.decompress(&job, DECOMPRESS_CAPACITY)?;
response_sender response_sender
@ -229,11 +234,11 @@ impl<R: RawPile> RawPileCompressor<R> {
"id" => worker_id "id" => worker_id
); );
let mut compressor = Compressor::with_dict(self.settings.dictionary.as_ref().clone()); let mut compressor =
let level = self.settings.level; Compressor::with_dictionary(self.settings.level, self.settings.dictionary.as_ref())?;
while let Ok((chunk_id, bytes)) = input.recv() { while let Ok((chunk_id, bytes)) = input.recv() {
let in_bytes = bytes.len(); let in_bytes = bytes.len();
let bytes = compressor.compress(&bytes, level)?; let bytes = compressor.compress(&bytes)?;
let out_bytes = bytes.len(); let out_bytes = bytes.len();
next_stage.send((chunk_id, bytes))?; next_stage.send((chunk_id, bytes))?;
// Per-worker metrics // Per-worker metrics
@ -273,6 +278,10 @@ impl<R: RawPile> RawPile for RawPileCompressor<R> {
self.underlying.delete(kind, key) self.underlying.delete(kind, key)
} }
/// Batch deletion: passed straight through to the underlying pile's `delete_many`.
fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
self.underlying.delete_many(kind, keys)
}
fn list_keys( fn list_keys(
&self, &self,
kind: Keyspace, kind: Keyspace,
@ -330,4 +339,21 @@ impl<R: RawPile> RawPile for RawPileCompressor<R> {
Ok(input_to_this_stage) Ok(input_to_this_stage)
} }
/// Describes the underlying pile's pipeline followed by this compression stage.
///
/// The stage carries a 64-bit fingerprint of the compression dictionary: the first eight bytes
/// (read big-endian) of the 32-byte digest that `blake::hash` writes for the dictionary.
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
    let mut stages = self.underlying.describe_pipeline()?;

    let mut digest = [0u8; 32];
    blake::hash(256, &self.settings.dictionary, &mut digest)?;

    // Keep only the leading eight bytes of the digest as the fingerprint.
    let mut leading = [0u8; 8];
    leading.copy_from_slice(&digest[..8]);

    stages.push(PipelineDescription::Compression {
        dictionary_fingerprint: u64::from_be_bytes(leading),
    });
    Ok(stages)
}
/// Passed straight through to the underlying pile's `chunk_id_transfer_ordering_hint`.
fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
self.underlying.chunk_id_transfer_ordering_hint(chunk_id)
}
} }

View File

@ -21,7 +21,9 @@ use sodiumoxide::crypto::secretbox;
use sodiumoxide::crypto::secretbox::{Key, Nonce, NONCEBYTES}; use sodiumoxide::crypto::secretbox::{Key, Nonce, NONCEBYTES};
use crate::definitions::ChunkId; use crate::definitions::ChunkId;
use crate::pile::{ControllerMessage, Keyspace, RawPile, StoragePipelineSettings}; use crate::pile::{
ControllerMessage, Keyspace, PipelineDescription, RawPile, StoragePipelineSettings,
};
use crossbeam_channel::Sender; use crossbeam_channel::Sender;
/// A RawPile that provides encryption of chunk contents. /// A RawPile that provides encryption of chunk contents.
@ -99,6 +101,10 @@ impl<R: RawPile> RawPile for RawPileEncryptor<R> {
self.underlying.delete(kind, key) self.underlying.delete(kind, key)
} }
/// Batch deletion: passed straight through to the underlying pile's `delete_many`.
fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
self.underlying.delete_many(kind, keys)
}
fn list_keys( fn list_keys(
&self, &self,
kind: Keyspace, kind: Keyspace,
@ -119,4 +125,14 @@ impl<R: RawPile> RawPile for RawPileEncryptor<R> {
) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> { ) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
todo!() todo!()
} }
/// Describes the underlying pile's pipeline followed by this encryption stage.
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
    let mut stages = self.underlying.describe_pipeline()?;
    stages.push(PipelineDescription::Encryption);
    Ok(stages)
}
/// Passed straight through to the underlying pile's `chunk_id_transfer_ordering_hint`.
fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
self.underlying.chunk_id_transfer_ordering_hint(chunk_id)
}
} }

View File

@ -20,7 +20,10 @@ use std::hash::Hasher;
use thiserror::Error; use thiserror::Error;
use crate::definitions::{ChunkId, XXH64_SEED}; use crate::definitions::{ChunkId, XXH64_SEED};
use crate::pile::{ControllerMessage, DebugStatistics, Keyspace, RawPile, StoragePipelineSettings}; use crate::pile::{
ControllerMessage, DebugStatistics, Keyspace, PipelineDescription, RawPile,
StoragePipelineSettings,
};
use crate::utils::bytes_to_hexstring; use crate::utils::bytes_to_hexstring;
use crossbeam_channel::Sender; use crossbeam_channel::Sender;
@ -95,6 +98,10 @@ impl<RP: RawPile> RawPile for RawPileIntegrityChecker<RP> {
self.underlying.delete(kind, key) self.underlying.delete(kind, key)
} }
/// Batch deletion: passed straight through to the underlying pile's `delete_many`.
fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
self.underlying.delete_many(kind, keys)
}
fn list_keys( fn list_keys(
&self, &self,
kind: Keyspace, kind: Keyspace,
@ -140,4 +147,14 @@ impl<RP: RawPile> RawPile for RawPileIntegrityChecker<RP> {
.unwrap(); .unwrap();
Ok(input) Ok(input)
} }
/// Describes the underlying pile's pipeline followed by this integrity-checking stage.
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
    let mut stages = self.underlying.describe_pipeline()?;
    stages.push(PipelineDescription::Integrity);
    Ok(stages)
}
/// Passed straight through to the underlying pile's `chunk_id_transfer_ordering_hint`.
fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
self.underlying.chunk_id_transfer_ordering_hint(chunk_id)
}
} }

View File

@ -15,28 +15,32 @@ You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>. along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/ */
use std::collections::hash_map::Entry; use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque};
use std::collections::{HashMap, VecDeque};
use std::convert::{TryFrom, TryInto}; use std::convert::{TryFrom, TryInto};
use std::fs::{read_dir, File, OpenOptions}; use std::fs::{read_dir, remove_file, File, OpenOptions};
use std::io::{Read, Seek, SeekFrom, Write}; use std::io::{Read, Seek, SeekFrom, Write};
use std::os::unix::fs::MetadataExt;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::sync::{Arc, Condvar, Mutex}; use std::sync::{Arc, Condvar, Mutex};
use std::time::Duration;
use std::{fs, thread}; use std::{fs, thread};
use anyhow::{bail, Context}; use anyhow::{bail, ensure, Context};
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use crossbeam_channel::{Receiver, Sender};
use log::{info, warn}; use log::{info, warn};
use nix::unistd::sync; use nix::unistd::sync;
use rusqlite::{params, Error, ErrorCode}; use rusqlite::ffi::ErrorCode::ConstraintViolation;
use rusqlite::{params, Error, ErrorCode, Transaction, TransactionBehavior, NO_PARAMS};
use rusqlite::{Connection, OptionalExtension}; use rusqlite::{Connection, OptionalExtension};
use crate::definitions::ChunkId; use crate::definitions::ChunkId;
use crate::pile::{ControllerMessage, DebugStatistics, Keyspace, RawPile, StoragePipelineSettings}; use crate::pile::{
use crate::utils::bytes_to_hexstring; ControllerMessage, DebugStatistics, Keyspace, PipelineDescription, RawPile,
use crossbeam_channel::{Receiver, Sender}; StoragePipelineSettings,
use rusqlite::ffi::ErrorCode::ConstraintViolation; };
use std::time::Duration; use crate::progress::ProgressTracker;
use crate::utils::{bytes_to_hexstring, LruMap};
/// Bloblogs will not be reused if they are already 2 GiB large. /// Bloblogs will not be reused if they are already 2 GiB large.
pub const MAX_BLOBLOG_REUSE_SIZE: u64 = 2 * 1024 * 1024 * 1024; pub const MAX_BLOBLOG_REUSE_SIZE: u64 = 2 * 1024 * 1024 * 1024;
@ -44,6 +48,14 @@ pub const MAX_BLOBLOG_REUSE_SIZE: u64 = 2 * 1024 * 1024 * 1024;
/// This many pointers will be batched up for writing. /// This many pointers will be batched up for writing.
pub const POINTER_WRITE_BATCHES: usize = 2048; pub const POINTER_WRITE_BATCHES: usize = 2048;
/// This many bloblogs will be kept open for reading, at maximum.
pub const BLOBLOG_MAX_READING_FILE_COUNT: usize = 128;
/// Size of a blob header within a bloblog.
/// 32 byte Chunk Id
/// 4 byte (u32) Blob size
pub const BLOB_HEADER_SIZE: u64 = 32 + 4;
/// A file storing a log of blobs. /// A file storing a log of blobs.
/// Format: /// Format:
/// Repeated: /// Repeated:
@ -133,8 +145,8 @@ pub type BloblogId = u32;
#[derive(Debug)] #[derive(Debug)]
pub struct Inner { pub struct Inner {
next_bloblog_id: BloblogId, next_bloblog_id: BloblogId,
writer_bloblogs: Vec<BloblogId>, writer_bloblogs: Vec<(BloblogId, Arc<Mutex<Bloblog>>)>,
open_bloblogs: HashMap<BloblogId, Arc<Mutex<Bloblog>>>, // TODO want an LRU cache with a weak hashmap...? reader_bloblogs: LruMap<BloblogId, Arc<Mutex<Bloblog>>>,
connection: Connection, connection: Connection,
writers_in_progress: u16, writers_in_progress: u16,
// We batch up pointer writes because sync() performance really hurts us if we do them one by // We batch up pointer writes because sync() performance really hurts us if we do them one by
@ -142,14 +154,13 @@ pub struct Inner {
queued_pointer_writes: HashMap<ChunkId, BloblogPointer>, queued_pointer_writes: HashMap<ChunkId, BloblogPointer>,
} }
impl Inner { fn raw_put_chunk_pointer_txn(
pub fn raw_put_chunk_pointer( txn: &Transaction,
&self,
chunk_id: &ChunkId, chunk_id: &ChunkId,
bloblog: BloblogId, bloblog: BloblogId,
offset_i64: i64, offset_i64: i64,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
match self.connection.execute( match txn.execute(
"INSERT INTO chunks (chunk_id, bloblog, offset) VALUES (?1, ?2, ?3)", "INSERT INTO chunks (chunk_id, bloblog, offset) VALUES (?1, ?2, ?3)",
params![&chunk_id[..], bloblog, offset_i64], params![&chunk_id[..], bloblog, offset_i64],
) { ) {
@ -162,27 +173,41 @@ impl Inner {
); );
Ok(()) Ok(())
} else { } else {
Err(Error::SqliteFailure(e, str))?; Err(Error::SqliteFailure(e, str).into())
unreachable!();
} }
} }
other => { Err(other) => Err(other.into()),
other?;
unreachable!();
}
} }
}
impl Inner {
/// Records a single chunk pointer (chunk ID -> bloblog and offset) in its own transaction.
///
/// Opens a transaction on the connection, performs the insert through
/// `raw_put_chunk_pointer_txn` and commits.
pub fn raw_put_chunk_pointer(
&mut self,
chunk_id: &ChunkId,
bloblog: BloblogId,
offset_i64: i64,
) -> anyhow::Result<()> {
let txn = self.connection.transaction()?;
// On error we return before `commit`, so the transaction is dropped uncommitted
// (rusqlite presumably rolls it back on drop — confirm).
raw_put_chunk_pointer_txn(&txn, chunk_id, bloblog, offset_i64)?;
txn.commit()?;
Ok(())
}
pub fn flush(&mut self) -> anyhow::Result<()> { pub fn flush(&mut self) -> anyhow::Result<()> {
// Create a non-allocated hashmap to satisfy borrow checker, then swap it in and out // Create a non-allocated hashmap to satisfy borrow checker, then swap it in and out
let mut queued_pointer_writes = HashMap::with_capacity(0); let mut queued_pointer_writes = HashMap::with_capacity(0);
std::mem::swap(&mut self.queued_pointer_writes, &mut queued_pointer_writes); std::mem::swap(&mut self.queued_pointer_writes, &mut queued_pointer_writes);
let txn = self.connection.transaction()?;
for (chunk_id, pointer) in queued_pointer_writes.drain() { for (chunk_id, pointer) in queued_pointer_writes.drain() {
let offset_i64 = let offset_i64 =
i64::try_from(pointer.offset).expect("ouch! can't turn u64 into i64..."); i64::try_from(pointer.offset).expect("ouch! can't turn u64 into i64...");
self.raw_put_chunk_pointer(&chunk_id, pointer.bloblog, offset_i64)?; raw_put_chunk_pointer_txn(&txn, &chunk_id, pointer.bloblog, offset_i64)?;
} }
std::mem::swap(&mut self.queued_pointer_writes, &mut queued_pointer_writes); std::mem::swap(&mut self.queued_pointer_writes, &mut queued_pointer_writes);
txn.commit()?;
Ok(()) Ok(())
} }
} }
@ -244,11 +269,18 @@ impl SqliteBloblogPile {
)?; )?;
} }
// Enable WAL mode for significantly better write performance.
connection.execute_batch(
"
PRAGMA journal_mode=WAL;
",
)?;
Ok(SqliteBloblogPile { Ok(SqliteBloblogPile {
inner: Arc::new(Mutex::new(Inner { inner: Arc::new(Mutex::new(Inner {
next_bloblog_id: 0, next_bloblog_id: 0,
writer_bloblogs: Vec::new(), writer_bloblogs: Vec::new(),
open_bloblogs: HashMap::new(), reader_bloblogs: LruMap::new(BLOBLOG_MAX_READING_FILE_COUNT),
connection, connection,
writers_in_progress: 0, writers_in_progress: 0,
queued_pointer_writes: Default::default(), queued_pointer_writes: Default::default(),
@ -261,23 +293,33 @@ impl SqliteBloblogPile {
fn open_bloblog(&self, bloblog_id: BloblogId) -> anyhow::Result<Arc<Mutex<Bloblog>>> { fn open_bloblog(&self, bloblog_id: BloblogId) -> anyhow::Result<Arc<Mutex<Bloblog>>> {
let mut inner = self.inner.lock().unwrap(); let mut inner = self.inner.lock().unwrap();
Ok(match inner.open_bloblogs.entry(bloblog_id) {
Entry::Occupied(entry) => entry.get().clone(), match inner.reader_bloblogs.get(&bloblog_id) {
Entry::Vacant(entry) => { Some(bloblog) => Ok(bloblog.clone()),
None => {
let bloblog = Arc::new(Mutex::new(Bloblog::open( let bloblog = Arc::new(Mutex::new(Bloblog::open(
&self.path.join(&bloblog_id.to_string()), &self.path.join(&bloblog_id.to_string()),
)?)); )?));
entry.insert(bloblog.clone()); inner.reader_bloblogs.insert(bloblog_id, bloblog.clone());
bloblog Ok(bloblog)
}
} }
})
} }
fn get_writing_bloblog(&self) -> anyhow::Result<(BloblogId, Arc<Mutex<Bloblog>>)> { fn get_writing_bloblog(&self) -> anyhow::Result<(BloblogId, Arc<Mutex<Bloblog>>)> {
let mut inner = self.inner.lock().unwrap(); let mut inner = self.inner.lock().unwrap();
let writing_bloblog_id: BloblogId = match inner.writer_bloblogs.pop() {
None => { inner.writers_in_progress += 1;
loop {
if let Some(writing_bloblog) = inner.writer_bloblogs.pop() {
// We already have an open bloblog to give back.
return Ok(writing_bloblog);
}
// No open bloblogs to reuse; create a new one.
// It's very important to create a fresh one here; we definitely don't want to use a file
// that someone else is using!
let writing_bloblog_id = loop {
let pre_inc = inner.next_bloblog_id; let pre_inc = inner.next_bloblog_id;
inner.next_bloblog_id += 1; inner.next_bloblog_id += 1;
@ -286,26 +328,18 @@ impl SqliteBloblogPile {
if !bloblog_path.exists() { if !bloblog_path.exists() {
break pre_inc; break pre_inc;
} }
}
}
Some(id) => id,
}; };
let result = Ok((
writing_bloblog_id,
match inner.open_bloblogs.entry(writing_bloblog_id) {
Entry::Occupied(entry) => entry.get().clone(),
Entry::Vacant(entry) => {
let bloblog = Arc::new(Mutex::new(Bloblog::open( let bloblog = Arc::new(Mutex::new(Bloblog::open(
&self.path.join(&writing_bloblog_id.to_string()), &self.path.join(&writing_bloblog_id.to_string()),
)?)); )?));
entry.insert(bloblog.clone());
bloblog // MAYBE FUTURE // Insert a weak reference so we can easily get a reader for this if desired.
} // inner.open_bloblogs.insert(writing_bloblog_id, Arc::downgrade(&bloblog));
}, // For now, I don't think we actually care about reading a bloblog that we've written
)); // (at least not usually?)
inner.writers_in_progress += 1;
result Ok((writing_bloblog_id, bloblog))
} }
/// Should be called once the bloblog has been finished writing to for the moment. /// Should be called once the bloblog has been finished writing to for the moment.
@ -318,7 +352,7 @@ impl SqliteBloblogPile {
let size = bloblog.lock().unwrap().filesize()?; let size = bloblog.lock().unwrap().filesize()?;
let mut inner = self.inner.lock().unwrap(); let mut inner = self.inner.lock().unwrap();
if size < MAX_BLOBLOG_REUSE_SIZE { if size < MAX_BLOBLOG_REUSE_SIZE {
inner.writer_bloblogs.push(id); inner.writer_bloblogs.push((id, bloblog));
} }
inner.writers_in_progress -= 1; inner.writers_in_progress -= 1;
if inner.writers_in_progress == 0 { if inner.writers_in_progress == 0 {
@ -344,8 +378,33 @@ impl SqliteBloblogPile {
.optional()?) .optional()?)
} }
/// Looks up the bloblog pointers for a batch of chunk IDs inside one transaction.
///
/// The returned vector has one entry per element of `chunk_ids`, in the same order; an entry is
/// `None` when the `chunks` table has no row for that ID.
fn get_chunk_pointers(
    &self,
    chunk_ids: &[&[u8]],
) -> anyhow::Result<Vec<Option<BloblogPointer>>> {
    let mut guard = self.inner.lock().unwrap();
    let txn = guard.connection.transaction()?;
    let mut pointers = Vec::with_capacity(chunk_ids.len());
    {
        // One prepared statement, reused for every ID in the batch. It lives in its own scope
        // so that it is dropped before the transaction is committed.
        let mut lookup = txn.prepare("SELECT bloblog, offset FROM chunks WHERE chunk_id = ?1")?;
        for chunk_id in chunk_ids.iter().copied() {
            let found = lookup
                .query_row(params![chunk_id], |row| {
                    let bloblog = row.get(0)?;
                    // Offsets are stored as i64 in SQLite and reinterpreted as u64 here.
                    let offset = row.get::<_, i64>(1)? as u64;
                    Ok(BloblogPointer { bloblog, offset })
                })
                .optional()?;
            pointers.push(found);
        }
    }
    txn.commit()?;
    Ok(pointers)
}
fn put_chunk_pointer(&self, chunk_id: &ChunkId, pointer: BloblogPointer) -> anyhow::Result<()> { fn put_chunk_pointer(&self, chunk_id: &ChunkId, pointer: BloblogPointer) -> anyhow::Result<()> {
let inner = self.inner.lock().unwrap(); let mut inner = self.inner.lock().unwrap();
let offset_i64 = i64::try_from(pointer.offset).expect("ouch! can't turn u64 into i64..."); let offset_i64 = i64::try_from(pointer.offset).expect("ouch! can't turn u64 into i64...");
inner.raw_put_chunk_pointer(chunk_id, pointer.bloblog, offset_i64) inner.raw_put_chunk_pointer(chunk_id, pointer.bloblog, offset_i64)
} }
@ -469,6 +528,341 @@ impl SqliteBloblogPile {
assert!(pointers_buffered.is_empty()); assert!(pointers_buffered.is_empty());
Ok(()) Ok(())
} }
/// Look at the bloblogs in this pile and see where space may be reclaimable if we were to
/// compact.
///
/// For every bloblog referenced by the `chunks` table this reports the number of chunks, how
/// many of them also appear in the `deleted` table, the summed size of those deleted entries
/// (plus one blob header per deleted chunk) and the bloblog file's size on disk.
///
/// # Errors
/// Fails on database errors or if a bloblog file's metadata cannot be read.
///
/// # Panics
/// Panics if the pile mutex is poisoned, or if a count/sum read from the database does not fit
/// in a `u64` (i.e. is negative).
///
/// Next step: plan_compaction
pub fn analyse_for_compaction(&self) -> anyhow::Result<BTreeMap<BloblogId, BloblogStats>> {
let mut inner = self.inner.lock().unwrap();
// Lock the database right away.
// NOTE(review): this transaction is never committed; it is dropped when the function returns.
// Only a SELECT runs inside it, so that is presumably intentional — confirm.
let txn = inner
.connection
.transaction_with_behavior(TransactionBehavior::Exclusive)?;
let mut stmt = txn.prepare(
"
SELECT bloblog, COUNT(c.offset), COUNT(d.offset), SUM(COALESCE(d.size, 0))
FROM chunks c LEFT JOIN deleted d USING (bloblog, offset)
GROUP BY bloblog
",
)?;
// Per-bloblog figures straight from the query, before the on-disk file size is added.
struct UnpopulatedBloblogStats {
pub bloblog_id: BloblogId,
pub chunks_total: u64,
pub chunks_deleted: u64,
pub bytes_deleted: u64,
}
// SQLite integers are read as i64 and converted to u64.
let unpopul_bloblog_stats = stmt.query_map(NO_PARAMS, |row| {
Ok(UnpopulatedBloblogStats {
bloblog_id: row.get(0)?,
chunks_total: row.get::<_, i64>(1)?.try_into().expect("i64 -> u64"),
chunks_deleted: row.get::<_, i64>(2)?.try_into().expect("i64 -> u64"),
bytes_deleted: row.get::<_, i64>(3)?.try_into().expect("i64 -> u64"),
})
})?;
let mut final_stats = BTreeMap::new();
for unpopul_stat in unpopul_bloblog_stats {
let UnpopulatedBloblogStats {
bloblog_id,
chunks_total,
chunks_deleted,
bytes_deleted,
} = unpopul_stat?;
// Bloblog files are named after their numeric ID inside the pile directory.
let bloblog_path = self.path.join(&bloblog_id.to_string());
let bytes_total = std::fs::metadata(&bloblog_path)
.with_context(|| format!("Failed to get metadata for bloblog: {:?}", bloblog_path))?
.size();
final_stats.insert(
bloblog_id,
BloblogStats {
chunks_total,
chunks_deleted,
bytes_total,
// Add a slight correction since we can count the blob headers of deleted blobs
// as deleted.
bytes_deleted: bytes_deleted + chunks_deleted * BLOB_HEADER_SIZE,
},
);
}
Ok(final_stats)
}
/// Look at the analysis of compaction and, using the specified thresholds, come up with a plan
/// to perform compaction.
///
/// May return an empty plan if compaction isn't worthwhile.
///
/// Previous step: analyse_for_compaction
/// Next step: perform_compaction
pub fn plan_compaction(
&self,
thresholds: &CompactionThresholds,
analysis: BTreeMap<BloblogId, BloblogStats>,
) -> anyhow::Result<CompactionPlan> {
let bloblogs_to_replace: BTreeMap<BloblogId, BloblogStats> = analysis
.into_iter()
.filter(|(_id, stats)| thresholds.should_replace_bloblog(stats))
.collect();
let reclaimable_space: u64 = bloblogs_to_replace
.values()
.map(|bs| bs.bytes_deleted)
.sum();
let bytes_to_write: u64 = bloblogs_to_replace
.values()
.map(|bs| bs.bytes_total - bs.bytes_deleted)
.sum();
let small_bloblogs: u32 = bloblogs_to_replace
.values()
.filter(|bs| bs.bytes_total - bs.bytes_deleted < thresholds.cond_if_less_allocated_than)
.count() as u32;
if reclaimable_space < thresholds.minimum_to_reclaim
&& small_bloblogs < thresholds.minimum_small_bloblogs_to_merge
{
// Nothing worth doing: return an empty plan.
return Ok(CompactionPlan {
bloblogs_to_replace: Default::default(),
bytes_to_write: 0,
reclaimable_space: 0,
small_bloblogs: 0,
});
}
Ok(CompactionPlan {
bloblogs_to_replace: bloblogs_to_replace.keys().copied().collect(),
bytes_to_write,
reclaimable_space,
small_bloblogs,
})
}
/// Given a compaction plan, perform the compaction.
/// There shouldn't be any decisions left to be made at this point: just action.
///
/// The work happens in four phases:
/// 1. Query the index for every non-deleted chunk in the bloblogs being replaced.
/// 2. Copy each of those blobs into a writing bloblog, remembering its new location.
/// 3. Point the index at the new locations (one transaction).
/// 4. Remove the remaining index rows for the old bloblogs and delete their files.
///
/// # Errors
/// Fails on any database or I/O error, if a replacement does not update exactly one index
/// row, or if an old bloblog still has chunks that are not recorded as deleted.
///
/// # Panics
/// Panics if a mutex is poisoned, if a stored chunk ID does not have the length of
/// `ChunkId`, or if a stored offset is negative.
///
/// TODO flock the bloblogs to be removed and make readers and writers also flock them too.
///
/// TODO find a way to deal with bloblogs that are entirely unreferenced from the index
/// (e.g. bloblogs that weren't written properly, e.g. if compaction fails.)
pub fn perform_compaction(
    &self,
    mut progress: Box<dyn ProgressTracker>,
    plan: CompactionPlan,
) -> anyhow::Result<()> {
    /// Identifies a live blob at its pre-compaction location.
    #[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
    struct ReplacedBlobRow {
        pub old_bloblog: BloblogId,
        pub old_offset: u64,
        pub chunk_id: ChunkId,
    }

    if plan.bloblogs_to_replace.is_empty() {
        info!("No compaction to be done.");
        return Ok(());
    }

    let mut to_preserve = BTreeSet::new();
    let mut replacements = BTreeMap::new();

    progress.set_max_size(plan.bytes_to_write);

    // First find all the blobs we need to replace.
    {
        let mut inner = self.inner.lock().unwrap();
        // Lock the database right away.
        // This transaction is only read from; it is dropped without being committed.
        let txn = inner
            .connection
            .transaction_with_behavior(TransactionBehavior::Exclusive)?;
        let mut stmt = txn.prepare(
            "
SELECT chunk_id, c.offset
FROM chunks c LEFT JOIN deleted d USING (bloblog, offset)
WHERE bloblog = ?1 AND d.offset IS NULL
",
        )?;
        for bloblog in plan.bloblogs_to_replace.iter().copied() {
            to_preserve.extend(
                stmt.query_map([bloblog], |row| {
                    let mut chunk_id = ChunkId::default();
                    // Propagate read errors instead of panicking on them.
                    chunk_id.copy_from_slice(row.get::<_, Vec<u8>>(0)?.as_slice());
                    Ok(ReplacedBlobRow {
                        old_bloblog: bloblog,
                        chunk_id,
                        old_offset: row
                            .get::<_, i64>(1)?
                            .try_into()
                            .expect("stored offsets are never negative"),
                    })
                })?
                .collect::<Result<Vec<ReplacedBlobRow>, _>>()?,
            );
        }
    }

    // Then make the replacements
    info!("Rewriting bloblogs...");
    let mut buf = Vec::new();
    let mut iterator = to_preserve.into_iter();
    loop {
        let (new_bloblog_id, bloglog_mutex) = self.get_writing_bloblog()?;
        let mut new_bloblog = bloglog_mutex.lock().expect("Failed to lock bloblog?");
        // Becomes true as soon as this writing bloblog receives at least one blob.
        // A pass that receives none means the iterator is exhausted.
        let mut is_more = false;

        // `by_ref` so that the same iterator carries on in the next writing bloblog.
        for preserve in iterator.by_ref() {
            is_more = true;

            // Get hold of the old bloblog
            let old_bloblog = self.open_bloblog(preserve.old_bloblog)?;
            let mut old_bloblog = old_bloblog.lock().unwrap();

            // Transfer the blob
            buf.clear();
            old_bloblog.read_blob(preserve.old_offset, &preserve.chunk_id, &mut buf)?;
            let new_offset = new_bloblog.write_blob(&preserve.chunk_id, &buf)?;

            // Make a note of the replacement
            replacements.insert(
                preserve,
                BloblogPointer {
                    bloblog: new_bloblog_id,
                    offset: new_offset,
                },
            );
            progress.inc_progress(buf.len() as u64);

            if new_bloblog.filesize()? > MAX_BLOBLOG_REUSE_SIZE {
                // get a new bloblog to write with.
                break;
            }
        }

        // Release the lock before handing the bloblog back.
        drop(new_bloblog);
        self.return_writing_bloblog(new_bloblog_id, bloglog_mutex)?;

        if !is_more {
            break;
        }
    }

    info!("Applying replacements...");
    {
        let mut inner = self.inner.lock().unwrap();
        // Lock the database right away.
        let txn = inner
            .connection
            .transaction_with_behavior(TransactionBehavior::Exclusive)?;
        let mut stmt = txn.prepare(
            "
UPDATE chunks
SET bloblog = ?1, offset = ?2
WHERE chunk_id = ?3
",
        )?;
        for (replacement_row, new_pos) in replacements {
            ensure!(
                stmt.execute(params![
                    new_pos.bloblog,
                    new_pos.offset as i64,
                    &replacement_row.chunk_id as &[u8]
                ])? == 1,
                "Wrong number of rows updated for replacement!"
            );
        }
        // The statement borrows the transaction, so it has to go before the commit.
        drop(stmt);
        txn.commit().context("committing replacements")?;
    }

    // TODO fsync new bloblogs

    info!("Deleting old bloblogs...");
    {
        let mut inner = self.inner.lock().unwrap();
        // Lock the database right away.
        let txn = inner
            .connection
            .transaction_with_behavior(TransactionBehavior::Exclusive)?;

        for bloblog_id in plan.bloblogs_to_replace.iter().copied() {
            let deleted_chunks = txn.execute(
                "
DELETE FROM chunks WHERE bloblog = ?1
",
                params![bloblog_id],
            )?;

            let deleted_deleted = txn.execute(
                "
DELETE FROM deleted WHERE bloblog = ?1
",
                params![bloblog_id],
            )?;

            // Live chunks were re-pointed above, so every chunk row still left in this
            // bloblog should have a matching deletion record.
            ensure!(deleted_chunks == deleted_deleted, "Undeleted chunks left in bloblog {bloblog_id}: CHUNKS={deleted_chunks} DELETED={deleted_deleted}");

            let bloblog_path = self.path.join(bloblog_id.to_string());
            remove_file(&bloblog_path).with_context(|| {
                format!("Failed to remove obsolete bloblog: {:?}", bloblog_path)
            })?;
        }

        txn.commit()?;
    }

    Ok(())
}
}
/// Occupancy figures for a single bloblog, as produced by `analyse_for_compaction`.
pub struct BloblogStats {
    /// Number of chunk rows the index holds for this bloblog.
    pub chunks_total: u64,
    /// Number of those chunks that also have a deletion record.
    pub chunks_deleted: u64,
    /// Size of the bloblog file, taken from the file's metadata.
    pub bytes_total: u64,
    /// Bytes occupied by deleted chunks, including a per-blob header correction.
    pub bytes_deleted: u64,
}
/// The outcome of `plan_compaction`: which bloblogs to rewrite and what that is expected
/// to cost and save. An empty `bloblogs_to_replace` means there is nothing to do.
pub struct CompactionPlan {
    /// Bloblogs selected for replacement.
    pub bloblogs_to_replace: BTreeSet<BloblogId>,
    /// Sum of the still-allocated bytes across the selected bloblogs.
    pub bytes_to_write: u64,
    /// Sum of the deleted bytes across the selected bloblogs.
    pub reclaimable_space: u64,
    /// How many selected bloblogs have less allocated data than
    /// `CompactionThresholds::cond_if_less_allocated_than`.
    pub small_bloblogs: u32,
}
/// Tunables that decide which bloblogs are candidates for compaction and whether
/// compaction is worthwhile at all.
pub struct CompactionThresholds {
    /// Minimum bytes to be reclaimable overall for compaction to be worthwhile.
    pub minimum_to_reclaim: u64,

    /// (alternative reason) Minimum number of files to be undersized in order for compaction
    /// to be worthwhile.
    /// This gives us a way to make compaction run if we have lots of tiny bloblogs.
    pub minimum_small_bloblogs_to_merge: u32,

    /// A bloblog will be replaced if the deallocated size is greater than this.
    pub cond_if_more_deallocated_than: u64,

    /// A bloblog will be replaced if the allocated size is less than this.
    pub cond_if_less_allocated_than: u64,
}

impl CompactionThresholds {
    /// Decide whether a single bloblog should be rewritten.
    ///
    /// Returns `true` if the bloblog holds less live data than
    /// `cond_if_less_allocated_than`, or has more deleted data than
    /// `cond_if_more_deallocated_than`.
    pub fn should_replace_bloblog(&self, bloblog_stats: &BloblogStats) -> bool {
        // `bytes_deleted` may be an over-estimate, so never let this subtraction underflow:
        // treat "more deleted than exists" as "nothing allocated".
        let allocated = bloblog_stats
            .bytes_total
            .saturating_sub(bloblog_stats.bytes_deleted);
        // Note that this will also trigger for fully-deallocated files if the threshold is
        // non-zero, because their allocated size is zero.
        let is_small = allocated < self.cond_if_less_allocated_than;
        let has_large_deallocations =
            bloblog_stats.bytes_deleted > self.cond_if_more_deallocated_than;
        is_small || has_large_deallocations
    }
}
pub struct CompactionOutcome {
pub bloblogs_deleted: u32,
pub bloblogs_created: u32,
pub bytes_deleted: u32,
pub bytes_created: u32,
} }
impl Drop for SqliteBloblogPile { impl Drop for SqliteBloblogPile {
@ -613,6 +1007,59 @@ impl RawPile for SqliteBloblogPile {
} }
} }
} }
/// Delete several keys from one keyspace.
///
/// For chunks, the deletions are recorded in the `deleted` table in a single transaction,
/// grouped by bloblog so that each bloblog is opened once. Keys without a chunk pointer
/// are skipped. Every other keyspace falls back to deleting one key at a time.
///
/// # Errors
/// Fails if the chunk pointers cannot be looked up, if a chunk key does not have the length
/// of a `ChunkId`, if a blob's length cannot be read, or on any database error.
fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
    match kind {
        Keyspace::Chunk => {
            let mut chunk_pointers_by_bloblog: BTreeMap<BloblogId, Vec<(u64, &[u8])>> =
                BTreeMap::new();

            let pointers = self
                .get_chunk_pointers(keys)
                .context("failed to get chunk pointers")?;
            for (pointer, &chunk_id) in pointers.into_iter().zip(keys) {
                // Chunks without a pointer have nothing to mark as deleted.
                if let Some(chunk_pointer) = pointer {
                    chunk_pointers_by_bloblog
                        .entry(chunk_pointer.bloblog)
                        .or_default()
                        .push((chunk_pointer.offset, chunk_id));
                }
            }

            let mut inner = self.inner.lock().unwrap();
            let txn = inner.connection.transaction()?;
            {
                let mut stmt = txn.prepare(
                    "INSERT OR IGNORE INTO deleted (bloblog, offset, size)
VALUES (?1, ?2, ?3)",
                )?;

                for (bloblog_id, entries) in chunk_pointers_by_bloblog {
                    let bloblog_mutex = self.open_bloblog(bloblog_id)?;
                    let mut bloblog = bloblog_mutex.lock().unwrap();
                    for (chunk_offset, raw_chunk_id) in entries {
                        let mut chunk_id: ChunkId = Default::default();
                        // `copy_from_slice` panics on a length mismatch; keys come from
                        // the caller, so report a malformed one as an error instead.
                        anyhow::ensure!(
                            raw_chunk_id.len() == chunk_id.len(),
                            "Chunk ID has the wrong length: expected {} bytes, got {}",
                            chunk_id.len(),
                            raw_chunk_id.len()
                        );
                        chunk_id.copy_from_slice(raw_chunk_id);
                        let size = bloblog.blob_len(chunk_offset, &chunk_id)?;
                        let offset_i64 = i64::try_from(chunk_offset)
                            .expect("ouch! can't turn u64 into i64...");
                        stmt.execute(params![bloblog_id, offset_i64, size])?;
                    }
                }
            }
            txn.commit().context("Failed to commit chunk deletions")?;
        }
        _ => {
            for &key in keys {
                self.delete(kind, key)?;
            }
        }
    }
    Ok(())
}
fn list_keys( fn list_keys(
&self, &self,
kind: Keyspace, kind: Keyspace,
@ -723,6 +1170,20 @@ impl RawPile for SqliteBloblogPile {
Ok(sender) Ok(sender)
} }
/// Describe this pile's pipeline: it consists of a single `Store` stage.
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
    let stages = vec![PipelineDescription::Store];
    Ok(stages)
}
/// Produce a sort key for a chunk derived from where it is stored.
///
/// The bloblog ID is placed in the bits above bit 40 and the low 40 bits of the offset
/// fill the rest.
///
/// # Errors
/// Fails if the pointer lookup fails or the chunk has no pointer.
fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
    let pointer = self
        .get_chunk_pointer(chunk_id)?
        .context("Can't get chunk ID transfer ordering hint for chunk without pointer.")?;

    // Scheme: 24-bit bloblog ID
    // followed by 40-bit offset
    let bloblog_part = (pointer.bloblog as u64) << 40;
    let offset_part = pointer.offset & 0xFF_FF_FF_FF_FF;
    Ok(bloblog_part | offset_part)
}
} }
struct KeyIterator { struct KeyIterator {
@ -768,9 +1229,10 @@ impl Iterator for KeyIterator {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::pile::local_sqlitebloblogs::Bloblog;
use temp_dir::TempDir; use temp_dir::TempDir;
use crate::pile::local_sqlitebloblogs::Bloblog;
#[test] #[test]
pub fn bloblog_read_write_test() { pub fn bloblog_read_write_test() {
let td = TempDir::new().unwrap(); let td = TempDir::new().unwrap();

View File

@ -22,7 +22,7 @@ use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use serde::de::DeserializeOwned; use serde::de::DeserializeOwned;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use crate::pile::Keyspace; use crate::pile::{Keyspace, PipelineDescription};
pub mod requester; pub mod requester;
pub mod responder; pub mod responder;
@ -60,6 +60,7 @@ pub enum RequestBody {
}, },
Flush, Flush,
LowLevelCheck, LowLevelCheck,
Describe,
Shutdown, Shutdown,
Progress { Progress {
current: u64, current: u64,
@ -73,7 +74,7 @@ pub struct Response {
body: ResponseBody, body: ResponseBody,
} }
#[derive(Serialize, Deserialize, Clone)] #[derive(Serialize, Deserialize, Clone, Debug)]
pub enum ResponseBody { pub enum ResponseBody {
Success, Success,
Failed(String), Failed(String),
@ -83,6 +84,7 @@ pub enum ResponseBody {
batch: Vec<Vec<u8>>, batch: Vec<Vec<u8>>,
next_token: u16, next_token: u16,
}, },
Description(Vec<PipelineDescription>),
} }
pub fn read_message<R: Read, D: DeserializeOwned>(read: &mut R) -> anyhow::Result<D> { pub fn read_message<R: Read, D: DeserializeOwned>(read: &mut R) -> anyhow::Result<D> {

View File

@ -4,12 +4,14 @@ use std::sync::{Arc, Mutex};
use std::thread; use std::thread;
use std::thread::JoinHandle; use std::thread::JoinHandle;
use anyhow::anyhow; use anyhow::{anyhow, bail};
use crossbeam_channel::{Receiver, Sender}; use crossbeam_channel::{Receiver, Sender};
use log::{error, info}; use log::{error, info};
use crate::definitions::ChunkId; use crate::definitions::ChunkId;
use crate::pile::{ControllerMessage, Keyspace, RawPile, StoragePipelineSettings}; use crate::pile::{
ControllerMessage, Keyspace, PipelineDescription, RawPile, StoragePipelineSettings,
};
use crate::remote::{read_message, write_message, Request, RequestBody, Response, ResponseBody}; use crate::remote::{read_message, write_message, Request, RequestBody, Response, ResponseBody};
use metrics::{ use metrics::{
gauge, histogram, increment_counter, register_counter, register_gauge, register_histogram, Unit, gauge, histogram, increment_counter, register_counter, register_gauge, register_histogram, Unit,
@ -269,8 +271,7 @@ impl RawPile for Requester {
ResponseBody::Success => Ok(true), ResponseBody::Success => Ok(true),
ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)), ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
ResponseBody::NotExists => Ok(false), ResponseBody::NotExists => Ok(false),
ResponseBody::Data(_) => Err(anyhow!("Received Data for exists.")), other => Err(anyhow!("Received {:?} for Exists", other)),
ResponseBody::BatchData { .. } => Err(anyhow!("Received BatchData for exists.")),
} }
} }
fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> { fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
@ -282,7 +283,7 @@ impl RawPile for Requester {
ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)), ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
ResponseBody::NotExists => Ok(None), ResponseBody::NotExists => Ok(None),
ResponseBody::Data(data) => Ok(Some(data)), ResponseBody::Data(data) => Ok(Some(data)),
ResponseBody::BatchData { .. } => Err(anyhow!("Received BatchData for read.")), other => Err(anyhow!("Received {:?} for Read", other)),
} }
} }
fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> { fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
@ -293,9 +294,7 @@ impl RawPile for Requester {
})? { })? {
ResponseBody::Success => Ok(()), ResponseBody::Success => Ok(()),
ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)), ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
ResponseBody::NotExists => Err(anyhow!("Received NotExists for write.")), other => Err(anyhow!("Received {:?} for Write", other)),
ResponseBody::Data(_) => Err(anyhow!("Received Data for write.")),
ResponseBody::BatchData { .. } => Err(anyhow!("Received BatchData for write.")),
} }
} }
fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> { fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
@ -305,11 +304,15 @@ impl RawPile for Requester {
})? { })? {
ResponseBody::Success => Ok(()), ResponseBody::Success => Ok(()),
ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)), ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
ResponseBody::NotExists => Err(anyhow!("Received NotExists for delete.")), other => Err(anyhow!("Received {:?} for Delete", other)),
ResponseBody::Data(_) => Err(anyhow!("Received Data for delete.")),
ResponseBody::BatchData { .. } => Err(anyhow!("Received BatchData for delete.")),
} }
} }
/// Delete several keys by issuing one `delete` per key, in order.
/// Stops at, and returns, the first failure.
fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
    keys.iter().try_for_each(|&key| self.delete(kind, key))
}
fn list_keys( fn list_keys(
&self, &self,
kind: Keyspace, kind: Keyspace,
@ -321,31 +324,26 @@ impl RawPile for Requester {
buffer: Vec::with_capacity(0), buffer: Vec::with_capacity(0),
})), })),
ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)), ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
ResponseBody::NotExists => Err(anyhow!("Received NotExists for list_keys.")),
ResponseBody::Data(_) => Err(anyhow!("Received Data for list_keys.")),
ResponseBody::BatchData { batch, next_token } => Ok(Box::new(ListKeyIterator { ResponseBody::BatchData { batch, next_token } => Ok(Box::new(ListKeyIterator {
command_sender: self.commands.clone(), command_sender: self.commands.clone(),
batch_token: Some(next_token), batch_token: Some(next_token),
buffer: batch, buffer: batch,
})), })),
other => Err(anyhow!("Received {:?} for List", other)),
} }
} }
fn flush(&self) -> anyhow::Result<()> { fn flush(&self) -> anyhow::Result<()> {
match self.request(RequestBody::Flush)? { match self.request(RequestBody::Flush)? {
ResponseBody::Success => Ok(()), ResponseBody::Success => Ok(()),
ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)), ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
ResponseBody::NotExists => Err(anyhow!("Received NotExists for Flush.")), other => Err(anyhow!("Received {:?} for Flush", other)),
ResponseBody::Data(_) => Err(anyhow!("Received Data for Flush.")),
ResponseBody::BatchData { .. } => Err(anyhow!("Received BatchData for Flush.")),
} }
} }
fn check_lowlevel(&self) -> anyhow::Result<bool> { fn check_lowlevel(&self) -> anyhow::Result<bool> {
match self.request(RequestBody::LowLevelCheck)? { match self.request(RequestBody::LowLevelCheck)? {
ResponseBody::Success => Ok(true), ResponseBody::Success => Ok(true),
ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)), ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
ResponseBody::NotExists => Err(anyhow!("Received NotExists for LowLevelCheck.")), other => Err(anyhow!("Received {:?} for LowLevelCheck", other)),
ResponseBody::Data(_) => Err(anyhow!("Received Data for LowLevelCheck.")),
ResponseBody::BatchData { .. } => Err(anyhow!("Received BatchData for LowLevelCheck.")),
} }
} }
@ -396,15 +394,7 @@ impl RawPile for Requester {
ResponseBody::Failed(string) => { ResponseBody::Failed(string) => {
panic!("Requester pipeline fail {}", string); panic!("Requester pipeline fail {}", string);
} }
ResponseBody::BatchData { .. } => { other => panic!("wtf {:?}", other),
panic!("wtf BatchData");
}
ResponseBody::NotExists => {
panic!("wtf NotExists");
}
ResponseBody::Data(_) => {
panic!("wtf Data");
}
} }
} }
recv(receiver) -> resp => { recv(receiver) -> resp => {
@ -434,15 +424,7 @@ impl RawPile for Requester {
ResponseBody::Failed(string) => { ResponseBody::Failed(string) => {
panic!("Requester pipeline fail {}", string); panic!("Requester pipeline fail {}", string);
} }
ResponseBody::BatchData { .. } => { other => panic!("wtf {:?}", other),
panic!("wtf BatchData");
}
ResponseBody::NotExists => {
panic!("wtf NotExists");
}
ResponseBody::Data(_) => {
panic!("wtf Data");
}
} }
} }
} }
@ -451,6 +433,21 @@ impl RawPile for Requester {
Ok(input) Ok(input)
} }
/// Ask the remote end to describe its pipeline, then append a `Remote` stage to it.
///
/// # Errors
/// Fails if the request fails, the remote reports a failure, or the remote answers with
/// anything other than a description.
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
    let mut stages = match self.request(RequestBody::Describe)? {
        ResponseBody::Description(stages) => stages,
        ResponseBody::Failed(err_msg) => {
            return Err(anyhow!("Remote failure: {}", err_msg));
        }
        other => {
            return Err(anyhow!("Received {:?} for Describe", other));
        }
    };
    stages.push(PipelineDescription::Remote);
    Ok(stages)
}
fn chunk_id_transfer_ordering_hint(&self, _chunk_id: &ChunkId) -> anyhow::Result<u64> {
bail!("You probably shouldn't be using chunk ID transfer ordering hints with a remote.");
}
} }
pub struct ListKeyIterator { pub struct ListKeyIterator {
@ -478,8 +475,6 @@ impl Iterator for ListKeyIterator {
None None
} }
ResponseBody::Failed(err_msg) => Some(Err(anyhow!("Remote failure: {}", err_msg))), ResponseBody::Failed(err_msg) => Some(Err(anyhow!("Remote failure: {}", err_msg))),
ResponseBody::NotExists => Some(Err(anyhow!("Received NotExists for NextBatch."))),
ResponseBody::Data(_) => Some(Err(anyhow!("Received Data for NextBatch."))),
ResponseBody::BatchData { batch, next_token } => { ResponseBody::BatchData { batch, next_token } => {
self.batch_token = Some(next_token); self.batch_token = Some(next_token);
self.buffer = batch; self.buffer = batch;
@ -491,6 +486,7 @@ impl Iterator for ListKeyIterator {
None None
} }
} }
other => Some(Err(anyhow!("Received {:?} for NextBatch", other))),
} }
} else { } else {
None None

View File

@ -349,6 +349,20 @@ impl Responder {
RequestBody::Progress { .. } => { RequestBody::Progress { .. } => {
unreachable!("handled by readea") unreachable!("handled by readea")
} }
RequestBody::Describe => match pile.describe_pipeline() {
Ok(description) => Response {
response_to: request.id,
body: ResponseBody::Description(description),
},
Err(err) => {
warn!("Error whilst doing a raw describe_pipeline: {:?}", err);
let err = format!("{:?}", err);
Response {
response_to: request.id,
body: ResponseBody::Failed(err),
}
}
},
}; };
responses responses

View File

@ -185,7 +185,7 @@ pub fn differentiate_node_in_place(new: &mut TreeNode, old: &TreeNode) -> anyhow
/// result is in-place. /// result is in-place.
/// ///
/// Preconditions: /// Preconditions:
/// - `old` must be an integrated pointer. /// - `old` must be an integrated pointer. (Otherwise this algorithm is not correct.)
/// - `old` is the parent of `new` /// - `old` is the parent of `new`
pub fn integrate_node_in_place(new: &mut TreeNode, old: &TreeNode) -> anyhow::Result<()> { pub fn integrate_node_in_place(new: &mut TreeNode, old: &TreeNode) -> anyhow::Result<()> {
if let TreeNode::Directory { children, .. } = new { if let TreeNode::Directory { children, .. } = new {

View File

@ -15,6 +15,7 @@ You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>. along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/ */
use std::collections::{BTreeMap, BTreeSet};
use std::fmt::Write; use std::fmt::Write;
pub fn bytes_to_hexstring(chunkid: &[u8]) -> String { pub fn bytes_to_hexstring(chunkid: &[u8]) -> String {
@ -42,3 +43,98 @@ pub fn get_number_of_workers(first_try_env_name: &str) -> u8 {
} }
} }
} }
/// A map with a bounded number of entries that discards the least recently used entry
/// when an insertion of a new key pushes it over capacity.
///
/// Recency is tracked with a monotonically increasing counter: every successful `get` and
/// every `insert` stamps the entry with the next counter value.
#[derive(Clone, Debug)]
pub struct LruMap<K, V> {
    /// Maximum number of entries kept after an insertion.
    capacity: usize,
    /// (stamp, key) pairs ordered by stamp; the first element is the eviction candidate.
    last_access: BTreeSet<(u64, K)>,
    /// Each key's value together with the stamp under which it sits in `last_access`.
    items: BTreeMap<K, (V, u64)>,
    /// Next stamp to hand out.
    counter: u64,
}

impl<K: Ord + Clone, V> LruMap<K, V> {
    /// Create an empty map that holds at most `capacity` entries.
    pub fn new(capacity: usize) -> LruMap<K, V> {
        LruMap {
            capacity,
            last_access: BTreeSet::new(),
            items: BTreeMap::new(),
            counter: 0,
        }
    }

    /// Gets an item from the LRU map, marking it as the most recently used.
    ///
    /// Returns `None` (and changes nothing) if the key is absent.
    pub fn get(&mut self, key: &K) -> Option<&V> {
        let (value, stamp) = self.items.get_mut(key)?;

        let was_tracked = self.last_access.remove(&(*stamp, key.clone()));
        assert!(was_tracked, "Corrupt LRU map: freshen not correct.");

        let fresh_stamp = self.counter;
        self.counter += 1;
        self.last_access.insert((fresh_stamp, key.clone()));
        *stamp = fresh_stamp;

        Some(&*value)
    }

    /// Insert or replace an entry, marking it as the most recently used.
    ///
    /// Returns the previous value if the key was already present. When a new key makes the
    /// map exceed its capacity, the least recently used entry is evicted.
    pub fn insert(&mut self, key: K, value: V) -> Option<V> {
        let stamp = self.counter;
        self.counter += 1;

        let displaced = self.items.insert(key.clone(), (value, stamp));
        if let Some((_, old_stamp)) = &displaced {
            // The key was already tracked under its old stamp; forget that one.
            let was_tracked = self.last_access.remove(&(*old_stamp, key.clone()));
            assert!(was_tracked, "Corrupt LRU map: insert not correct.");
        }
        self.last_access.insert((stamp, key));

        // Only a brand-new key grows the map, so only then can we be over capacity.
        let grew = displaced.is_none();
        if grew && self.items.len() > self.capacity {
            self.evict();
        }

        displaced.map(|(old_value, _)| old_value)
    }

    /// Remove and return the least recently used entry, or `None` if the map is empty.
    pub fn evict(&mut self) -> Option<(K, V)> {
        let oldest = self.last_access.iter().next().cloned()?;
        self.last_access.remove(&oldest);

        let (_, key) = oldest;
        let (value, _) = self
            .items
            .remove(&key)
            .expect("Corrupt LRU map: last access and items out of sync");
        Some((key, value))
    }
}
#[cfg(test)]
mod test {
    use crate::utils::LruMap;

    /// Filling the map, touching the oldest key and then adding one more entry must evict
    /// the key that was used least recently (2), not the one that was just touched (1).
    #[test]
    fn test_lru_map() {
        let mut cache = LruMap::new(3);
        for key in 1..=3 {
            cache.insert(key, 1);
        }
        assert_eq!(cache.get(&1), Some(&1));

        cache.insert(4, 1);
        assert_eq!(cache.get(&2), None);
    }
}