Update flake and fix it

Simplify flake lock
Batch up chunk deletions in an attempt to make vacuuming more performant
2024-05-08 20:41:28 +01:00 · 2023-04-01 16:57:04 +01:00 · 2022-11-28 21:03:07 +00:00 · 2022-11-21 21:23:38 +00:00 · 2022-11-20 22:11:05 +00:00 · 2022-11-20 20:58:45 +00:00
58 changed files with 4941 additions and 940 deletions
--- a/.envrc
+++ b/.envrc
@ -0,0 +1,2 @@
+use nix
+
--- a/.gitignore
+++ b/.gitignore
@ -15,3 +15,5 @@
 __pycache__
 /datman-helper-postgres/datman_helper_postgres.egg-info
 /datman-helper-mysql/datman_helper_mysql.egg-info
+/result
+
--- a/.woodpecker/build.yml
+++ b/.woodpecker/build.yml
@ -5,28 +5,22 @@ platform: linux/amd64

 pipeline:
  unitTests:
-    image: docker.bics.ga/rei_ci/rust-sccache
+    image: "rust:1.65.0"
+    pull: true
    commands:
-      - DEBIAN_FRONTEND=noninteractive apt-get -qq update && apt-get -yqq install pkg-config libssl-dev build-essential libsqlite3-dev > /dev/null
+      - DEBIAN_FRONTEND=noninteractive apt-get -qq update > /dev/null
+      - DEBIAN_FRONTEND=noninteractive apt-get -yqq install pkg-config libssl-dev build-essential libsqlite3-dev > /dev/null
      - cargo build --all
      - cargo test --all
-      - sccache --show-stats
-    environment:
-      RUSTC_WRAPPER: /usr/local/bin/sccache
-      SCCACHE_S3_USE_SSL: "true"
-      SCCACHE_ENDPOINT: "richie.m4.tanukitsu.net:443"
-    secrets:
-      - sccache_bucket
-      - aws_access_key_id
-      - aws_secret_access_key
    when:
      event: [push, pull_request]


  testSuite:
-    image: docker.bics.ga/rei_ci/rust-sccache
+    image: "rust:1.65.0"
    commands:
-      - DEBIAN_FRONTEND=noninteractive apt-get -qq update && apt-get -yqq -o=Dpkg::Use-Pty=0 install pkg-config libssl-dev build-essential libsqlite3-dev python3.9 python3.9-venv postgresql postgresql-client mariadb-server mariadb-client zstd lz4 > /dev/null
+      - DEBIAN_FRONTEND=noninteractive apt-get -qq update > /dev/null
+      - DEBIAN_FRONTEND=noninteractive apt-get -yqq -o=Dpkg::Use-Pty=0 install pkg-config libssl-dev build-essential libsqlite3-dev python3.9 python3.9-venv postgresql postgresql-client mariadb-server mariadb-client zstd lz4 > /dev/null
      - pg_ctlcluster 13 main start
      - "mysqld_safe &"
      - su postgres -c 'createuser root'
@ -36,22 +30,14 @@ pipeline:
      - cargo install -q --path yama
      - cargo install -q --path datman
      - python3.9 -m venv testsuite/.venv
-      - ./testsuite/.venv/bin/pip install -e testsuite -e datman-helper-postgres -e datman-helper-mysql
+      - ./testsuite/.venv/bin/pip install ./testsuite ./datman-helper-postgres ./datman-helper-mysql
      - cd testsuite && . .venv/bin/activate && TEST_POSTGRES=$(hostname),testsuitedb,root TEST_MYSQL=$(hostname),testsuitemydb,root green
-      - sccache --show-stats
-    environment:
-      RUSTC_WRAPPER: /usr/local/bin/sccache
-      SCCACHE_S3_USE_SSL: "true"
-      SCCACHE_ENDPOINT: "richie.m4.tanukitsu.net:443"
-    secrets:
-      - sccache_bucket
-      - aws_access_key_id
-      - aws_secret_access_key
    when:
      event: [push, pull_request]

  deployManual:
-    image: docker.bics.ga/rei_ci/mdbook
+    image: "docker.emunest.net/rei_ci/mdbook:latest-amd64"
+    pull: true
    when:
      branch:
        - develop
--- a/.woodpecker/release.yml
+++ b/.woodpecker/release.yml
@ -4,7 +4,8 @@ platform: linux/${ARCH}

 matrix:
  ARCH:
-    - arm64
+    # I don't have an arm64 runner at the moment.
+    #- arm64
    - amd64

 .a1: &when
@ -15,7 +16,10 @@ pipeline:
  buildRelease:
    when: *when

-    image: docker.bics.ga/rei_ci/rust-sccache
+    # Disabled for now because I'm trying to get infinite build times to stop :-(.
+    # Suspect a kernel bug but any workaround will do for now.
+    #image: "docker.bics.ga/rei_ci/rust-sccache:latest-${ARCH}"
+    image: "rust:1.61"
    commands:
      - apt-get -qq update && apt-get -yqq install pkg-config libssl-dev build-essential libolm-dev cmake
      - cargo build --release
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,6 +1,11 @@
 [workspace]
 members = [
  "yama",
-  "datman"
+  "datman",
 ]

+[profile.release]
+# Include FULL debug information in the release binaries
+debug = 2
+# When this feature stabilises, it will be possible to split the debug information into a file alongside the binary
+#split-debuginfo = "packed"
--- a/datman-helper-mysql/poetry.lock
+++ b/datman-helper-mysql/poetry.lock
@ -0,0 +1,8 @@
+package = []
+
+[metadata]
+lock-version = "1.1"
+python-versions = "^3.8"
+content-hash = "fafb334cb038533f851c23d0b63254223abf72ce4f02987e7064b0c95566699a"
+
+[metadata.files]
--- a/datman-helper-mysql/pyproject.toml
+++ b/datman-helper-mysql/pyproject.toml
@ -0,0 +1,19 @@
+[tool.poetry]
+name = "datman-helper-mysql"
+version = "0.1.0"
+description = "MySQL integration for Datman"
+authors = ["Olivier 'reivilibre' <olivier@librepush.net>"]
+license = "GPL-3.0-or-later"
+
+[tool.poetry.dependencies]
+python = "^3.8"
+
+[tool.poetry.dev-dependencies]
+
+[tool.poetry.scripts]
+datman-helper-mysql-backup="datman_helper_mysql.backup:cli"
+datman-helper-mysql-restore="datman_helper_mysql.restore:cli"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
--- a/datman-helper-mysql/setup.py
+++ b/datman-helper-mysql/setup.py
@ -1,119 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import io
-import os
-import sys
-from shutil import rmtree
-
-from setuptools import Command, find_packages, setup
-
-# Package meta-data.
-NAME = "datman_helper_mysql"
-DESCRIPTION = "MySQL integration for Datman"
-URL = "https://bics.ga/reivilibre/yama"
-EMAIL = "reivi@librepush.net"
-AUTHOR = "Olivier 'reivilibre'"
-REQUIRES_PYTHON = ">=3.7.0"
-VERSION = "0.1.0"
-
-# What packages are required for this module to be executed?
-REQUIRED = []
-
-
-# What packages are optional?
-EXTRAS = {}
-
-# The rest you shouldn't have to touch too much :)
-# ------------------------------------------------
-# Except, perhaps the License and Trove Classifiers!
-# If you do change the License, remember to change the Trove Classifier for that!
-
-here = os.path.abspath(os.path.dirname(__file__))
-
-# Import the README and use it as the long-description.
-# Note: this will only work if 'README.md' is present in your MANIFEST.in file!
-try:
-    with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f:
-        long_description = "\n" + f.read()
-except FileNotFoundError:
-    long_description = DESCRIPTION
-
-# Load the package's __version__.py module as a dictionary.
-about = {}
-if not VERSION:
-    project_slug = NAME.lower().replace("-", "_").replace(" ", "_")
-    with open(os.path.join(here, project_slug, "__version__.py")) as f:
-        exec(f.read(), about)
-else:
-    about["__version__"] = VERSION
-
-
-class UploadCommand(Command):
-    """Support setup.py upload."""
-
-    description = "Build and publish the package."
-    user_options = []
-
-    @staticmethod
-    def status(s):
-        """Prints things in bold."""
-        print("\033[1m{0}\033[0m".format(s))
-
-    def initialize_options(self):
-        pass
-
-    def finalize_options(self):
-        pass
-
-    def run(self):
-        try:
-            self.status("Removing previous builds…")
-            rmtree(os.path.join(here, "dist"))
-        except OSError:
-            pass
-
-        self.status("Building Source and Wheel (universal) distribution…")
-        os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable))
-
-        self.status("Uploading the package to PyPI via Twine…")
-        os.system("twine upload dist/*")
-
-        self.status("Pushing git tags…")
-        os.system("git tag v{0}".format(about["__version__"]))
-        os.system("git push --tags")
-
-        sys.exit()
-
-
-# Where the magic happens:
-setup(
-    name=NAME,
-    version=about["__version__"],
-    description=DESCRIPTION,
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    author=AUTHOR,
-    author_email=EMAIL,
-    python_requires=REQUIRES_PYTHON,
-    url=URL,
-    packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*"]),
-    # If your package is a single module, use this instead of 'packages':
-    # py_modules=['mypackage'],
-    entry_points={
-        "console_scripts": [
-            "datman-helper-mysql-backup=datman_helper_mysql.backup:cli",
-            "datman-helper-mysql-restore=datman_helper_mysql.restore:cli",
-        ],
-    },
-    install_requires=REQUIRED,
-    extras_require=EXTRAS,
-    include_package_data=True,
-    # TODO license='GPL3',
-    classifiers=[
-        # Trove classifiers
-        # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
-        "Programming Language :: Python",
-        "Programming Language :: Python :: 3",
-    ],
-)
--- a/datman-helper-postgres/datman_helper_postgres/backup.py
+++ b/datman-helper-postgres/datman_helper_postgres/backup.py
@ -39,10 +39,7 @@ def cli():
    # The process (if any) that is our LZ4 decompressor.
    lz4_process = None

-    dump_command = [
-        "pg_dump",
-        database_to_use
-    ]
+    dump_command = ["pg_dump", database_to_use]

    if host_to_use is not None:
        if use_lz4:
@ -63,21 +60,19 @@ def cli():
        # (rather than lz4 covering it).
        command = [
            "ssh",
-            f"{user_to_use}@{host_to_use}" if user_to_use is not None else f"{host_to_use}",
+            f"{user_to_use}@{host_to_use}"
+            if user_to_use is not None
+            else f"{host_to_use}",
            "bash",
            "-o",
            "pipefail",
            "-c",
-            shlex.quote(" ".join(dump_command))
+            shlex.quote(" ".join(dump_command)),
        ]
    elif user_to_use is not None:
        current_username = pwd.getpwuid(os.getuid()).pw_name
        if current_username != user_to_use:
-            command = [
-                "sudo",
-                "-u",
-                user_to_use
-            ] + dump_command
+            command = ["sudo", "-u", user_to_use] + dump_command
        else:
            command = dump_command
    else:
--- a/datman-helper-postgres/poetry.lock
+++ b/datman-helper-postgres/poetry.lock
@ -0,0 +1,8 @@
+package = []
+
+[metadata]
+lock-version = "1.1"
+python-versions = "^3.8"
+content-hash = "fafb334cb038533f851c23d0b63254223abf72ce4f02987e7064b0c95566699a"
+
+[metadata.files]
--- a/datman-helper-postgres/pyproject.toml
+++ b/datman-helper-postgres/pyproject.toml
@ -0,0 +1,19 @@
+[tool.poetry]
+name = "datman-helper-postgres"
+version = "0.1.0"
+description = "Postgres integration for Datman"
+authors = ["Olivier 'reivilibre' <olivier@librepush.net>"]
+license = "GPL-3.0-or-later"
+
+[tool.poetry.dependencies]
+python = "^3.8"
+
+[tool.poetry.dev-dependencies]
+
+[tool.poetry.scripts]
+datman-helper-postgres-backup="datman_helper_postgres.backup:cli"
+datman-helper-postgres-restore="datman_helper_postgres.restore:cli"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
--- a/datman-helper-postgres/setup.py
+++ b/datman-helper-postgres/setup.py
@ -1,119 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import io
-import os
-import sys
-from shutil import rmtree
-
-from setuptools import Command, find_packages, setup
-
-# Package meta-data.
-NAME = "datman_helper_postgres"
-DESCRIPTION = "Postgres integration for Datman"
-URL = "https://bics.ga/reivilibre/yama"
-EMAIL = "reivi@librepush.net"
-AUTHOR = "Olivier 'reivilibre'"
-REQUIRES_PYTHON = ">=3.7.0"
-VERSION = "0.1.0"
-
-# What packages are required for this module to be executed?
-REQUIRED = []
-
-
-# What packages are optional?
-EXTRAS = {}
-
-# The rest you shouldn't have to touch too much :)
-# ------------------------------------------------
-# Except, perhaps the License and Trove Classifiers!
-# If you do change the License, remember to change the Trove Classifier for that!
-
-here = os.path.abspath(os.path.dirname(__file__))
-
-# Import the README and use it as the long-description.
-# Note: this will only work if 'README.md' is present in your MANIFEST.in file!
-try:
-    with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f:
-        long_description = "\n" + f.read()
-except FileNotFoundError:
-    long_description = DESCRIPTION
-
-# Load the package's __version__.py module as a dictionary.
-about = {}
-if not VERSION:
-    project_slug = NAME.lower().replace("-", "_").replace(" ", "_")
-    with open(os.path.join(here, project_slug, "__version__.py")) as f:
-        exec(f.read(), about)
-else:
-    about["__version__"] = VERSION
-
-
-class UploadCommand(Command):
-    """Support setup.py upload."""
-
-    description = "Build and publish the package."
-    user_options = []
-
-    @staticmethod
-    def status(s):
-        """Prints things in bold."""
-        print("\033[1m{0}\033[0m".format(s))
-
-    def initialize_options(self):
-        pass
-
-    def finalize_options(self):
-        pass
-
-    def run(self):
-        try:
-            self.status("Removing previous builds…")
-            rmtree(os.path.join(here, "dist"))
-        except OSError:
-            pass
-
-        self.status("Building Source and Wheel (universal) distribution…")
-        os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable))
-
-        self.status("Uploading the package to PyPI via Twine…")
-        os.system("twine upload dist/*")
-
-        self.status("Pushing git tags…")
-        os.system("git tag v{0}".format(about["__version__"]))
-        os.system("git push --tags")
-
-        sys.exit()
-
-
-# Where the magic happens:
-setup(
-    name=NAME,
-    version=about["__version__"],
-    description=DESCRIPTION,
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    author=AUTHOR,
-    author_email=EMAIL,
-    python_requires=REQUIRES_PYTHON,
-    url=URL,
-    packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*"]),
-    # If your package is a single module, use this instead of 'packages':
-    # py_modules=['mypackage'],
-    entry_points={
-        "console_scripts": [
-            "datman-helper-postgres-backup=datman_helper_postgres.backup:cli",
-            "datman-helper-postgres-restore=datman_helper_postgres.restore:cli",
-        ],
-    },
-    install_requires=REQUIRED,
-    extras_require=EXTRAS,
-    include_package_data=True,
-    # TODO license='GPL3',
-    classifiers=[
-        # Trove classifiers
-        # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
-        "Programming Language :: Python",
-        "Programming Language :: Python :: 3",
-    ],
-)
--- a/datman/Cargo.toml
+++ b/datman/Cargo.toml
@ -1,8 +1,8 @@
 [package]
 name = "datman"
-version = "0.5.0-alpha.1"
+version = "0.6.0-alpha.5"
 authors = ["Olivier 'reivilibre' <olivier@librepush.net>"]
-edition = "2018"
+edition = "2021"
 repository = "https://bics.ga/reivilibre/yama"
 license = "GPL-3.0-or-later"

@ -11,8 +11,8 @@ description = "A chunked and deduplicated backup system using Yama"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
-clap = "= 3.0.0-beta.5"
-crossbeam-channel = "0.4"
+clap = { version = "3.1.18", features = ["derive"] }
+crossbeam-channel = "0.5.1"
 anyhow = "1.0"
 thiserror = "1.0"
 serde = { version = "1.0.104", features = ["derive"] }
@ -22,7 +22,7 @@ log = "0.4"
 env_logger = "0.7.1"
 indicatif = "0.14.0"
 arc-interner = "0.5.1"
-zstd = "0.6.0"  # 0.6.0+zstd.1.4.8
+zstd = "0.11.2"  # 0.11.2+zstd.1.5.2
 byteorder = "1"
 termion = "1.5.6"
 glob = "0.3.0"
@ -30,4 +30,9 @@ humansize = "1.1.1"
 chrono = "0.4.19"
 itertools = "0.10.1"
 hostname = "0.3.1"
-yama = { path = "../yama", version = "0.5.0-alpha.1" }
+yama = { path = "../yama", version = "0.6.0-alpha.5" }
+metrics = "0.17.1"
+bare-metrics-recorder = { version = "0.1.0" }
+comfy-table = "6.0.0-rc.1"
+libc = "0.2.126"
+io-streams = "0.11.0"
--- a/datman/README.md
+++ b/datman/README.md
@ -8,5 +8,6 @@ Features:
 * (optional) Compression using Zstd and a specifiable dictionary
 * (optional) Encryption
 * Ability to back up to remote machines over SSH
+* Labelling of files in a backup source; different destinations can choose to back up either all or a subset of the labels.

 See the documentation for more information.
--- a/datman/src/bin/datman.rs
+++ b/datman/src/bin/datman.rs
@ -15,23 +15,31 @@ You should have received a copy of the GNU General Public License
 along with Yama.  If not, see <https://www.gnu.org/licenses/>.
 */

+use std::fs::File;
+use std::io::{BufReader, BufWriter, Write};
 use std::path::{Path, PathBuf};
+use std::process::{Command, Stdio};

 use clap::Parser;
 use env_logger::Env;

-use anyhow::bail;
+use anyhow::{bail, Context};
+use bare_metrics_recorder::recording::BareMetricsRecorderCore;
 use chrono::{DateTime, Local, NaiveDate, NaiveDateTime, TimeZone, Utc};
 use datman::commands::backup::{backup_all_sources_to_destination, backup_source_to_destination};
 use datman::commands::ilabel::interactive_labelling_session;
-use datman::commands::init_descriptor;
+use datman::commands::prune::{prune_with_retention_policy, RetentionPolicy};
+use datman::commands::{init_descriptor, pushpull};
 use datman::descriptor::{load_descriptor, SourceDescriptor};
 use datman::get_hostname;
 use datman::remote::backup_source_requester::backup_remote_source_to_destination;
 use datman::remote::backup_source_responder;
 use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
+use itertools::Itertools;
 use log::info;
 use std::str::FromStr;
+use yama::commands::load_pile_descriptor;
+use yama::operations::legacy_pushpull::{open_pile_with_work_bypass, BypassLevel};

 pub const FAILURE_SYMBOL_OBNOXIOUS_FLASHING: &str = "\x1b[5m\x1b[31m⚠️ \x1b[25m\x1b[22m";
 pub const BOLD: &str = "\x1b[1m";
@ -109,8 +117,39 @@ pub enum DatmanCommand {
        skip_metadata: bool,
    },

+    Report {
+        /// Name of the pile to report on.
+        pile_name: String,
+
+        /// Don't summarise months.
+        #[clap(long)]
+        individual: bool,
+    },
+
    #[clap(name = "_backup_source_responder")]
    InternalBackupSourceResponder,
+
+    /// Pulls all pointers from a remote pile to a local pile.
+    /// Does not yet support label filtering, but will do in the future.
+    Pull {
+        /// Colon-separated 'remote:path:pile', e.g. 'myserver:/path/to/datman:main'
+        remote_and_remote_pile: String,
+
+        pile_name: String,
+    },
+
+    /// Applies a retention policy by removing unnecessary backups.
+    /// Does not reclaim space by itself: use
+    ///   `yama check --apply-gc --shallow`
+    /// & `yama compact`
+    /// to do that.
+    Prune { pile_name: String },
+
+    #[clap(name = "_pull_responder_offerer")]
+    InternalPullResponderOfferer {
+        datman_path: PathBuf,
+        pile_name: String,
+    },
 }

 pub struct HumanDateTime(pub DateTime<Local>);
@ -169,6 +208,15 @@ fn with_exitcode<R>(result: anyhow::Result<R>) {
 fn main() -> anyhow::Result<()> {
    env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();

+    let now = Utc::now();
+
+    let (shard, _stopper) = BareMetricsRecorderCore::new(File::create(format!(
+        "/tmp/datman_{}.baremetrics",
+        now.format("%F_%H%M%S")
+    ))?)
+    .start("datman".to_string())?;
+    shard.install_as_metrics_recorder()?;
+
    let opts: DatmanCommand = DatmanCommand::parse();

    match opts {
@ -179,10 +227,10 @@ fn main() -> anyhow::Result<()> {
            unimplemented!();
        }
        DatmanCommand::InteractiveLabelling { source_name } => {
-            interactive_labelling_session(Path::new("."), source_name).unwrap();
+            interactive_labelling_session(Path::new("."), source_name)?;
        }
        DatmanCommand::InteractiveBrowsing { source_name } => {
-            datman::commands::ibrowse::session(Path::new("."), source_name).unwrap();
+            datman::commands::ibrowse::session(Path::new("."), source_name)?;
        }
        DatmanCommand::BackupOne {
            source_name,
@ -296,6 +344,125 @@ fn main() -> anyhow::Result<()> {
            info!("Datman responder at {:?}", std::env::current_exe()?);
            backup_source_responder::handler_stdio()?;
        }
+
+        DatmanCommand::Report {
+            pile_name,
+            individual,
+        } => {
+            let descriptor = load_descriptor(Path::new(".")).unwrap();
+            let destination = &descriptor.piles[&pile_name];
+            let report =
+                datman::commands::report::generate_report(destination, &descriptor, !individual)?;
+
+            datman::commands::report::print_filesystem_space(&destination.path)?;
+            datman::commands::report::print_report(&report)?;
+        }
+        DatmanCommand::Pull {
+            remote_and_remote_pile,
+            pile_name,
+        } => {
+            let (hostname, remote_datman_path, remote_pile_name) = remote_and_remote_pile
+                .split(':')
+                .collect_tuple()
+                .context("You must pull from a remote pile specified as remote:path:pile.")?;
+
+            let descriptor = load_descriptor(Path::new(".")).unwrap();
+            let source = &descriptor.piles[&pile_name];
+
+            let pile_desc = load_pile_descriptor(&source.path)?;
+            let (pile, bypass_raw_pile) = open_pile_with_work_bypass(
+                &source.path,
+                &pile_desc,
+                BypassLevel::CompressionBypass,
+            )?;
+
+            let pbar = ProgressBar::with_draw_target(0, ProgressDrawTarget::stdout_with_hz(10));
+            pbar.set_style(
+                ProgressStyle::default_bar().template(
+                    "[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}",
+                ),
+            );
+            pbar.set_message("pulling");
+
+            let remote_host_descriptor = descriptor
+                .remote_hosts
+                .get(hostname)
+                .ok_or_else(|| anyhow::anyhow!("No remote host by that name: {:?}.", hostname))?;
+
+            let mut connection = Command::new("ssh")
+                .arg(&remote_host_descriptor.user_at_host)
+                .arg("--")
+                .arg(
+                    &remote_host_descriptor
+                        .path_to_datman
+                        .as_ref()
+                        .map(|x| x.as_str())
+                        .unwrap_or("datman"),
+                )
+                .arg("_pull_responder_offerer")
+                .arg(remote_datman_path)
+                .arg(remote_pile_name)
+                .stdin(Stdio::piped())
+                .stdout(Stdio::piped())
+                .stderr(Stdio::inherit())
+                .spawn()?;
+
+            let mut reader = BufReader::new(connection.stdout.take().unwrap());
+            let mut writer = BufWriter::new(connection.stdin.take().unwrap());
+
+            pushpull::accepting_side(
+                &pile,
+                &bypass_raw_pile,
+                &mut reader,
+                &mut writer,
+                Box::new(pbar),
+            )?;
+        }
+
+        DatmanCommand::Prune { pile_name } => {
+            let descriptor = load_descriptor(Path::new(".")).unwrap();
+            let retention_policy = descriptor
+                .retention
+                .context("No retention policy set in descriptor")?;
+            let dest_desc = &descriptor.piles[&pile_name];
+
+            let pile_desc = load_pile_descriptor(&dest_desc.path)?;
+
+            prune_with_retention_policy(
+                &dest_desc.path,
+                &pile_desc,
+                &RetentionPolicy::from_config(retention_policy),
+                true,
+            )?;
+        }
+
+        DatmanCommand::InternalPullResponderOfferer {
+            datman_path,
+            pile_name,
+        } => {
+            let descriptor = load_descriptor(&datman_path).unwrap();
+            let source = &descriptor.piles[&pile_name];
+
+            let pile_desc = load_pile_descriptor(&source.path)?;
+            let (pile, bypass_raw_pile) = open_pile_with_work_bypass(
+                &source.path,
+                &pile_desc,
+                BypassLevel::CompressionBypass,
+            )?;
+
+            let mut stdin = BufReader::new(io_streams::StreamReader::stdin()?);
+            let mut stdout = BufWriter::new(io_streams::StreamWriter::stdout()?);
+
+            pushpull::offering_side(
+                &pile,
+                &bypass_raw_pile,
+                &mut stdin,
+                &mut stdout,
+                Box::new(()),
+            )?;
+
+            stdout.flush()?;
+        }
    }
    Ok(())
 }
--- a/datman/src/commands.rs
+++ b/datman/src/commands.rs
@ -20,12 +20,15 @@ use std::fs::File;
 use std::io::Write;
 use std::path::Path;

-use crate::descriptor::{Descriptor, SourceDescriptor};
+use crate::descriptor::{Descriptor, RetentionPolicyConfig, SourceDescriptor};

 pub mod backup;
 pub mod extract;
 pub mod ibrowse;
 pub mod ilabel;
+pub mod prune;
+pub mod pushpull;
+pub mod report;

 pub fn init_descriptor(path: &Path) -> anyhow::Result<()> {
    std::fs::create_dir_all(path)?;
@ -49,6 +52,12 @@ pub fn init_descriptor(path: &Path) -> anyhow::Result<()> {
        sources: source,
        piles: Default::default(),
        remote_hosts: Default::default(),
+        retention: Some(RetentionPolicyConfig {
+            daily: 14,
+            weekly: 12,
+            monthly: 24,
+            yearly: 9001,
+        }),
    })?;

    datman_toml_file.write_all(&bytes)?;
--- a/datman/src/commands/backup.rs
+++ b/datman/src/commands/backup.rs
@ -17,17 +17,20 @@ along with Yama.  If not, see <https://www.gnu.org/licenses/>.

 use crate::descriptor::{Descriptor, DestPileDescriptor, SourceDescriptor, VirtualSourceKind};
 use crate::get_hostname;
-use crate::labelling::{label_node, load_labelling_rules, str_to_label, Label, State};
+use crate::labelling::{
+    label_node, load_labelling_rules, str_to_label, Label, LabellingRules, State,
+};
 use crate::tree::{scan, FileTree, FileTree1};
 use anyhow::{anyhow, bail};
 use arc_interner::ArcIntern;
-use chrono::{DateTime, Utc};
+use chrono::{DateTime, NaiveDateTime, TimeZone, Utc};
 use log::{info, warn};
 use std::collections::{HashMap, HashSet};
 use std::fmt::Debug;
 use std::io::Write;
 use std::path::Path;
 use std::process::{Child, Command, Stdio};
+use std::sync::Arc;
 use yama::chunking::SENSIBLE_THRESHOLD;
 use yama::commands::{load_pile_descriptor, open_pile, store_tree_node};
 use yama::definitions::{
@ -48,6 +51,13 @@ pub fn get_pointer_name_at(source_name: &str, datetime: DateTime<Utc>) -> String
    )
 }

+pub fn split_pointer_name(pointer_name: &str) -> Option<(String, DateTime<Utc>)> { // Splits a pointer name into (source name, UTC timestamp); None if it cannot be split or parsed.
+    let (source_name, date_time_str) = pointer_name.rsplit_once("+")?; // Split at the LAST '+', so earlier '+' characters stay in the source name; None if there is no '+'.
+    let date_time = NaiveDateTime::parse_from_str(date_time_str, POINTER_DATETIME_FORMAT).ok()?; // Parse errors are discarded and reported as None.
+    let date_time = Utc.from_utc_datetime(&date_time); // The parsed naive timestamp is interpreted as UTC.
+    Some((source_name.to_owned(), date_time))
+}
+
 pub fn open_stdout_backup_process(
    extra_args: &HashMap<String, toml::Value>,
    program_name: &str,
@ -68,8 +78,8 @@ pub fn open_stdout_backup_process(
 pub fn label_filter_and_convert(
    tree: FileTree1<()>,
    descriptor: &Descriptor,
-    desc_path: &Path,
    source_name: &str,
+    rules: &LabellingRules,
    dest: &DestPileDescriptor,
 ) -> anyhow::Result<Option<TreeNode>> {
    info!("Labelling.");
@ -79,8 +89,7 @@ pub fn label_filter_and_convert(
        .iter()
        .map(|l| Label(ArcIntern::new(l.clone())))
        .collect();
-    let rules = load_labelling_rules(desc_path, source_name)?;
-    label_node("".to_owned(), None, &mut tree, &labels, &rules)?;
+    label_node("".to_owned(), None, &mut tree, &labels, rules)?;

    let included_labels: HashSet<Label> = dest.included_labels.iter().map(str_to_label).collect();

@ -139,17 +148,23 @@ pub fn backup_source_to_destination<PT: ProgressTracker>(
        SourceDescriptor::DirectorySource {
            hostname: _,
            directory,
+            cross_filesystems,
        } => {
            info!("Looking to backup {} to {}", source_name, dest_name);
+            let rules = load_labelling_rules(desc_path, source_name)?;
+            let exclusions = rules.get_exclusions_set(directory);
+
            info!("Scanning.");
-            let tree = scan(directory)?.ok_or_else(|| anyhow!("Source does not exist."))?;
+            let tree = scan(directory, !*cross_filesystems, &exclusions)?
+                .ok_or_else(|| anyhow!("Source does not exist."))?;

            let absolute_source_path = desc_path.join(directory);
            let absolute_dest_path = desc_path.join(&dest.path);
            let pile_descriptor = load_pile_descriptor(&absolute_dest_path)?;
            let pile = open_pile(&absolute_dest_path, &pile_descriptor)?;
+
            let root = if let Some(root) =
-                label_filter_and_convert(tree, descriptor, desc_path, source_name, dest)?
+                label_filter_and_convert(tree, descriptor, source_name, &rules, dest)?
            {
                root
            } else {
@ -158,7 +173,10 @@ pub fn backup_source_to_destination<PT: ProgressTracker>(

            let pointer_name = get_pointer_name_at(&source_name, Utc::now());
            if pile.read_pointer(pointer_name.as_str())?.is_some() {
-                bail!("Pointer by name {:?} already exists; refusing to overwrite.", pointer_name);
+                bail!(
+                    "Pointer by name {:?} already exists; refusing to overwrite.",
+                    pointer_name
+                );
            }
            info!("Will write as pointer {:?}.", pointer_name);

@ -191,7 +209,7 @@ pub fn backup_source_to_destination<PT: ProgressTracker>(

            info!("Storing using yama.");
            yama::operations::storing::store_fully(
-                &pile,
+                Arc::new(pile),
                &absolute_source_path,
                &pointer_name,
                root,
@ -221,7 +239,10 @@ pub fn backup_source_to_destination<PT: ProgressTracker>(

            let pointer_name = get_pointer_name_at(&source_name, Utc::now());
            if pile.read_pointer(pointer_name.as_str())?.is_some() {
-                bail!("Pointer by name {:?} already exists; refusing to overwrite.", pointer_name);
+                bail!(
+                    "Pointer by name {:?} already exists; refusing to overwrite.",
+                    pointer_name
+                );
            }
            info!("Will write as pointer {:?}.", pointer_name);

--- a/datman/src/commands/ibrowse.rs
+++ b/datman/src/commands/ibrowse.rs
@ -15,6 +15,7 @@ You should have received a copy of the GNU General Public License
 along with Yama.  If not, see <https://www.gnu.org/licenses/>.
 */

+use std::collections::BTreeSet;
 use std::path::Path;

 use anyhow::{anyhow, bail};
@ -68,15 +69,19 @@ pub fn session(path: &Path, source_name: String) -> anyhow::Result<()> {
        .get(&source_name)
        .ok_or_else(|| anyhow!("Could not find source {:?}!", source_name))?;

-    let directory = match source_descriptor {
-        SourceDescriptor::DirectorySource { directory, .. } => directory,
+    let (directory, one_filesystem) = match source_descriptor {
+        SourceDescriptor::DirectorySource {
+            directory,
+            cross_filesystems,
+            ..
+        } => (directory, !*cross_filesystems),
        SourceDescriptor::VirtualSource { .. } => {
            bail!("Cannot browse virtual source.");
        }
    };

    println!("Scanning source; this might take a little while...");
-    let mut dir_scan: FileTree1<Option<State>> = scan(directory)?
+    let mut dir_scan: FileTree1<Option<State>> = scan(directory, one_filesystem, &BTreeSet::new())?
        .ok_or_else(|| anyhow!("Empty source."))?
        .replace_meta(&None);

--- a/datman/src/commands/ilabel.rs
+++ b/datman/src/commands/ilabel.rs
@ -15,6 +15,7 @@ You should have received a copy of the GNU General Public License
 along with Yama.  If not, see <https://www.gnu.org/licenses/>.
 */

+use std::collections::BTreeSet;
 use std::io;
 use std::io::{StdinLock, Stdout, Write};
 use std::path::Path;
@ -186,12 +187,13 @@ pub fn interactive_labelling_session(path: &Path, source_name: String) -> anyhow
    if let SourceDescriptor::DirectorySource {
        hostname,
        directory,
+        cross_filesystems,
    } = source
    {
        let my_hostname = get_hostname();
        let mut dir_scan = if &my_hostname == hostname {
            info!("Scanning source; this might take a little while...");
-            scan(directory)?
+            scan(directory, !*cross_filesystems, &BTreeSet::new())?
                .ok_or_else(|| anyhow!("Empty source."))?
                .replace_meta(&None)
        } else {
@ -206,8 +208,13 @@ pub fn interactive_labelling_session(path: &Path, source_name: String) -> anyhow

            // then request to scan
            info!("Requesting scan from remote source... (this may take some time)");
-            let scan =
-                backup_source_requester::scanning(&mut read, &mut write, directory.as_ref())?
+            let scan = backup_source_requester::scanning(
+                &mut read,
+                &mut write,
+                directory.as_ref(),
+                !*cross_filesystems,
+                &BTreeSet::new(),
+            )?
            .ok_or_else(|| anyhow!("Remote scan failed (does the directory exist?)"))?
            .replace_meta(&None);

--- a/datman/src/commands/prune.rs
+++ b/datman/src/commands/prune.rs
@ -0,0 +1,220 @@
+use crate::commands::backup::split_pointer_name;
+use crate::descriptor::RetentionPolicyConfig;
+use anyhow::{bail, Context};
+use log::info;
+use std::collections::{BTreeMap, BTreeSet};
+use std::io;
+use std::path::Path;
+use yama::commands::open_pile;
+use yama::operations::remove_pointer_safely;
+use yama::pile::PileDescriptor;
+
+/// One tier of a retention policy.
+///
+/// When applied, the band nominates one snapshot to keep for each multiple
+/// `1..=number_to_retain` of `interval_s`, counted back from the newest snapshot.
+pub struct RetentionBand {
+    /// Spacing between successive retention targets, in seconds.
+    pub interval_s: u64,
+    /// Number of retention targets (multiples of `interval_s`) in this band.
+    pub number_to_retain: u32,
+}
+
+/// A retention policy, expressed as a list of bands.
+pub struct RetentionPolicy {
+    /// The bands of this policy; a snapshot nominated by any band is kept.
+    pub retention_bands: Vec<RetentionBand>,
+}
+
+// Band intervals, in seconds.
+const DAY: u64 = 86400;
+const WEEK: u64 = 7 * DAY;
+// Fixed-length approximations (31 and 365 days); these are not calendar-aware.
+const MONTH: u64 = 31 * DAY;
+const YEAR: u64 = 365 * DAY;
+
+impl RetentionPolicy {
+    /// Builds a policy from its configuration.
+    ///
+    /// Each non-zero count in `descriptor` becomes one band, in the order daily, weekly,
+    /// monthly, yearly. A count of zero disables that band.
+    pub fn from_config(descriptor: RetentionPolicyConfig) -> RetentionPolicy {
+        let mut policy = RetentionPolicy {
+            retention_bands: vec![],
+        };
+
+        if descriptor.daily != 0 {
+            policy.retention_bands.push(RetentionBand {
+                interval_s: DAY,
+                number_to_retain: descriptor.daily,
+            });
+        }
+
+        if descriptor.weekly != 0 {
+            policy.retention_bands.push(RetentionBand {
+                interval_s: WEEK,
+                number_to_retain: descriptor.weekly,
+            });
+        }
+
+        if descriptor.monthly != 0 {
+            policy.retention_bands.push(RetentionBand {
+                interval_s: MONTH,
+                number_to_retain: descriptor.monthly,
+            });
+        }
+
+        if descriptor.yearly != 0 {
+            policy.retention_bands.push(RetentionBand {
+                interval_s: YEAR,
+                number_to_retain: descriptor.yearly,
+            });
+        }
+
+        policy
+    }
+
+    /// Returns the set of snapshots to remove.
+    ///
+    /// `snapshots_by_unix_time` maps the Unix timestamp of each snapshot to its name.
+    /// The most recent snapshot is always kept, and its timestamp is the reference time
+    /// that the bands count back from. For every band and every multiple
+    /// `1..=number_to_retain` of its interval, the newest snapshot that is at least that
+    /// old is kept as well. All remaining snapshots are returned as prunable.
+    ///
+    /// An empty input yields an empty set.
+    pub fn apply_returning_prunable(
+        &self,
+        snapshots_by_unix_time: BTreeMap<u64, String>,
+    ) -> BTreeSet<String> {
+        // Always mark the most recent snapshot as retained!
+        let now_time = match snapshots_by_unix_time.keys().next_back() {
+            Some(last_snapshot) => *last_snapshot,
+            None => return BTreeSet::new(),
+        };
+        let mut snapshots_included: BTreeSet<u64> = BTreeSet::new();
+        snapshots_included.insert(now_time);
+
+        for band in &self.retention_bands {
+            for multiple in 1..=band.number_to_retain {
+                // A target that would fall before the Unix epoch cannot match any snapshot
+                // (the keys are unsigned), and neither can any larger multiple of this band,
+                // so stop here rather than underflowing the subtraction.
+                let target_time = match u64::from(multiple)
+                    .checked_mul(band.interval_s)
+                    .and_then(|offset| now_time.checked_sub(offset))
+                {
+                    Some(target_time) => target_time,
+                    None => break,
+                };
+                // Keep the newest snapshot that is no newer than the target.
+                if let Some((k, _)) = snapshots_by_unix_time.range(..=target_time).next_back() {
+                    snapshots_included.insert(*k);
+                }
+            }
+        }
+
+        // Find all prunable (unincluded) snapshots.
+        snapshots_by_unix_time
+            .into_iter()
+            .filter(|(k, _v)| !snapshots_included.contains(k))
+            .map(|(_k, v)| v)
+            .collect()
+    }
+}
+
+/// Opens the pile at `pile_path` and removes every pointer that `policy` does not retain.
+///
+/// The pointers to keep and to remove are logged first. When `prompt_first` is set, the
+/// user must answer `y` (case-insensitively) on standard input, otherwise the operation is
+/// aborted with an error. The pile is flushed after each individual removal, including a
+/// failed one, before that failure is reported.
+pub fn prune_with_retention_policy(
+    pile_path: &Path,
+    pile_desc: &PileDescriptor,
+    policy: &RetentionPolicy,
+    prompt_first: bool,
+) -> anyhow::Result<()> {
+    let pile = open_pile(pile_path, pile_desc).context("Failed to open pile")?;
+
+    let pointers = pile
+        .list_pointers()
+        .context("Failed to list pointers in pile")?;
+    let all_pointers: BTreeSet<String> = pointers.iter().cloned().collect();
+
+    let pointers_to_remove = get_prunable_pointers(policy, pointers);
+
+    // Whatever is not prunable is kept.
+    let pointers_to_keep: BTreeSet<String> = all_pointers
+        .difference(&pointers_to_remove)
+        .cloned()
+        .collect();
+
+    info!("Gory details:\n---\nKeep: {pointers_to_keep:?}\n---\nRemove: {pointers_to_remove:?}");
+    info!(
+        "{} pointers to remove ({} to keep) based on retention policy.",
+        pointers_to_remove.len(),
+        pointers_to_keep.len()
+    );
+
+    if prompt_first {
+        println!("Would you like to proceed? [y/N]: ");
+        let mut answer = String::new();
+        io::stdin().read_line(&mut answer)?;
+        if !answer.trim().eq_ignore_ascii_case("y") {
+            bail!("Aborted by user.");
+        }
+    }
+
+    for to_remove in pointers_to_remove {
+        let removal_outcome =
+            remove_pointer_safely(&pile, &to_remove).context("removing prunable pointers");
+
+        // Flush even when the removal failed, then surface the removal error.
+        pile.flush()
+            .context("flushing pile after removing pointers")?;
+
+        removal_outcome?;
+    }
+
+    Ok(())
+}
+
+/// Works out which of `pointers` the retention `policy` allows to be removed.
+///
+/// Pointers are grouped by the base name returned by `split_pointer_name`, and the policy
+/// is applied to each group independently. Pointers whose name cannot be split, or whose
+/// timestamp lies before the Unix epoch, are left out of consideration and therefore never
+/// reported as prunable.
+fn get_prunable_pointers(policy: &RetentionPolicy, pointers: Vec<String>) -> BTreeSet<String> {
+    let mut split_pointers_by_name: BTreeMap<String, BTreeMap<u64, String>> = BTreeMap::new();
+
+    for pointer in pointers {
+        let (name, datetime) = match split_pointer_name(&pointer) {
+            Some(parts) => parts,
+            None => continue,
+        };
+
+        // A negative timestamp does not fit the unsigned key; skip the pointer instead of
+        // panicking, in the same way as a name that cannot be parsed.
+        let unix_time: u64 = match datetime.timestamp().try_into() {
+            Ok(unix_time) => unix_time,
+            Err(_) => continue,
+        };
+
+        split_pointers_by_name
+            .entry(name)
+            .or_default()
+            .insert(unix_time, pointer);
+    }
+
+    split_pointers_by_name
+        .into_values()
+        .flat_map(|ts_to_pointer| policy.apply_returning_prunable(ts_to_pointer))
+        .collect()
+}
+
+#[cfg(test)]
+mod test {
+    use crate::commands::prune::{get_prunable_pointers, RetentionPolicy};
+    use crate::descriptor::RetentionPolicyConfig;
+
+    /// Applies a weekly-only policy (3 weeks) to two independent sources.
+    ///
+    /// For each source the newest snapshot (09-28) is kept, together with the snapshots
+    /// found at exactly one, two and three weeks before it (09-21, 09-14, 09-07).
+    /// The second snapshot of 09-28 and those of 09-08 and 09-01 are expected to be prunable.
+    #[test]
+    fn test_prunable_pointers() {
+        let pointers = vec![
+            "alice+2022-09-28_05:00:00",
+            "alice+2022-09-28_02:00:00",
+            "alice+2022-09-21_05:00:00",
+            "alice+2022-09-14_05:00:00",
+            "alice+2022-09-08_05:00:00",
+            "alice+2022-09-07_05:00:00",
+            "alice+2022-09-01_05:00:00",
+            "bob+2022-09-28_06:00:00",
+            "bob+2022-09-28_03:00:00",
+            "bob+2022-09-21_06:00:00",
+            "bob+2022-09-14_06:00:00",
+            "bob+2022-09-08_06:00:00",
+            "bob+2022-09-07_06:00:00",
+            "bob+2022-09-01_06:00:00",
+        ]
+        .into_iter()
+        .map(|s| s.to_owned())
+        .collect();
+        let policy = RetentionPolicy::from_config(RetentionPolicyConfig {
+            daily: 0,
+            weekly: 3,
+            monthly: 0,
+            yearly: 0,
+        });
+
+        // The result is a BTreeSet, so the expected names are listed in sorted order.
+        assert_eq!(
+            get_prunable_pointers(&policy, pointers)
+                .into_iter()
+                .collect::<Vec<_>>(),
+            vec![
+                "alice+2022-09-01_05:00:00",
+                "alice+2022-09-08_05:00:00",
+                "alice+2022-09-28_02:00:00",
+                "bob+2022-09-01_06:00:00",
+                "bob+2022-09-08_06:00:00",
+                "bob+2022-09-28_03:00:00",
+            ]
+        );
+    }
+}
--- a/datman/src/commands/pushpull.rs
+++ b/datman/src/commands/pushpull.rs
@ -0,0 +1,306 @@
+// Push and Pull support for Datman
+
+use anyhow::{bail, ensure, Context};
+use log::info;
+use std::collections::{BTreeMap, BTreeSet};
+use std::io::{Read, Write};
+use std::sync::Arc;
+use std::time::Instant;
+use yama::chunking::RecursiveUnchunker;
+use yama::commands::retrieve_tree_node;
+use yama::definitions::{ChunkId, PointerData, RecursiveChunkRef, TreeNode};
+use yama::pile::{Keyspace, Pile, PipelineDescription, RawPile};
+use yama::progress::ProgressTracker;
+use yama::remote::{read_message, write_message};
+
+/// Sends the name and data of every pointer in `pile` to `writer` as a single message.
+///
+/// Returns the map that was sent. Fails if a listed pointer cannot be read back.
+/// The writer is not flushed here.
+pub fn offer_pointers<W: Write, RP: RawPile>(
+    pile: &Pile<RP>,
+    writer: &mut W,
+) -> anyhow::Result<BTreeMap<String, PointerData>> {
+    let pointers_to_offer = pile
+        .list_pointers()?
+        .into_iter()
+        .map(|pointer_name| -> anyhow::Result<(String, PointerData)> {
+            let pointer_data = pile
+                .read_pointer(&pointer_name)?
+                .context("Listed pointer not present")?;
+            Ok((pointer_name, pointer_data))
+        })
+        .collect::<anyhow::Result<BTreeMap<String, PointerData>>>()?;
+
+    write_message(writer, &pointers_to_offer)?;
+    Ok(pointers_to_offer)
+}
+
+/// Checks that two piles can exchange chunks at their bypass level.
+///
+/// On each side the bypass pipeline must be a prefix of the full pipeline, and the parts of
+/// the full pipelines that come after that prefix must be equal on both sides.
+/// Returns an error describing the first check that fails.
+pub fn ensure_compatible_bypasses(
+    my_full: &Vec<PipelineDescription>,
+    my_bypass: &Vec<PipelineDescription>,
+    their_full: &Vec<PipelineDescription>,
+    their_bypass: &Vec<PipelineDescription>,
+) -> anyhow::Result<()> {
+    let my_bypassed_parts = match my_full.strip_prefix(my_bypass.as_slice()) {
+        Some(rest) => rest,
+        None => bail!("Our full pipeline is not an extension of the bypass pipeline."),
+    };
+    let their_bypassed_parts = match their_full.strip_prefix(their_bypass.as_slice()) {
+        Some(rest) => rest,
+        None => bail!("Their full pipeline is not an extension of their bypass pipeline."),
+    };
+
+    if my_bypassed_parts != their_bypassed_parts {
+        bail!(
+            "Our bypassed parts and their bypassed parts are not the same.\nOurs: {:?}\nTheirs: {:?}",
+            my_bypassed_parts,
+            their_bypassed_parts
+        );
+    }
+
+    Ok(())
+}
+
+/// Exchanges pipeline descriptions with the peer and verifies that the bypass is usable.
+///
+/// Our full and bypass pipeline descriptions are written (and flushed) first, then the
+/// peer's two descriptions are read in the same order. The four descriptions are then
+/// checked with `ensure_compatible_bypasses`.
+pub fn negotiate_bypassed_pile<R: Read, W: Write>(
+    pile: &Pile<Arc<Box<dyn RawPile>>>,
+    bypass_pile: &Box<dyn RawPile>,
+    reader: &mut R,
+    writer: &mut W,
+) -> anyhow::Result<()> {
+    let ours_full = pile.raw_pile.describe_pipeline()?;
+    let ours_bypass = bypass_pile.describe_pipeline()?;
+
+    write_message(writer, &ours_full)?;
+    write_message(writer, &ours_bypass)?;
+    writer.flush()?;
+
+    let theirs_full: Vec<PipelineDescription> = read_message(reader)?;
+    let theirs_bypass: Vec<PipelineDescription> = read_message(reader)?;
+
+    ensure_compatible_bypasses(&ours_full, &ours_bypass, &theirs_full, &theirs_bypass)
+}
+
+/// Adds to `chunk_ids` the chunk IDs referenced by every normal file beneath `root`.
+///
+/// Other kinds of tree node contribute nothing.
+fn collect_chunk_ids(
+    pile: &Pile<Arc<Box<dyn RawPile>>>,
+    root: &TreeNode,
+    chunk_ids: &mut BTreeSet<ChunkId>,
+) -> anyhow::Result<()> {
+    let mut on_node = |tree_node, _| {
+        if let TreeNode::NormalFile { content, .. } = tree_node {
+            collect_chunk_ids_from_chunkref(pile, content, chunk_ids)?;
+        }
+        Ok(())
+    };
+    root.visit(&mut on_node, String::new())?;
+    Ok(())
+}
+
+/// Adds to `collection` the chunk IDs that `chunk_ref` refers to.
+///
+/// At depth 0 the reference names a single chunk, which is inserted directly. At greater
+/// depths the reference is read one level shallower, and the resulting stream is consumed
+/// as a sequence of chunk IDs, each of which is inserted.
+fn collect_chunk_ids_from_chunkref(
+    pile: &Pile<Arc<Box<dyn RawPile>>>,
+    chunk_ref: &RecursiveChunkRef,
+    collection: &mut BTreeSet<ChunkId>,
+) -> anyhow::Result<()> {
+    if chunk_ref.depth == 0 {
+        collection.insert(chunk_ref.chunk_id);
+        return Ok(());
+    }
+
+    let one_level_up = RecursiveChunkRef {
+        chunk_id: chunk_ref.chunk_id,
+        depth: chunk_ref.depth - 1,
+    };
+    let mut unchunker = RecursiveUnchunker::new(pile, one_level_up);
+    let mut id_buf: ChunkId = Default::default();
+    loop {
+        let filled = unchunker.read(&mut id_buf[..])?;
+        if filled == 0 {
+            // End of the stream of chunk IDs.
+            return Ok(());
+        }
+        if filled < id_buf.len() {
+            // Short read: fetch the remainder of this chunk ID.
+            unchunker.read_exact(&mut id_buf[filled..])?;
+        }
+        collection.insert(id_buf);
+    }
+}
+
+/// Runs the offering (sending) half of a push/pull transfer.
+///
+/// Protocol, as seen from this side:
+/// 1. exchange version banners and check the peer's banner;
+/// 2. check that both piles have compatible bypass pipelines;
+/// 3. offer all of our pointers and read back the names the peer wants;
+/// 4. send the set of chunk IDs those pointers need and read back the ones to skip;
+/// 5. stream each remaining chunk, read from `bypass_pile`, followed by a `None` terminator.
+///
+/// `progress` is advanced by one for every chunk sent.
+/// Fails if the peer requests a pointer that was not offered or a chunk cannot be read.
+pub fn offering_side<R: Read, W: Write>(
+    pile: &Pile<Arc<Box<dyn RawPile>>>,
+    bypass_pile: &Box<dyn RawPile>,
+    reader: &mut R,
+    writer: &mut W,
+    mut progress: Box<dyn ProgressTracker>,
+) -> anyhow::Result<()> {
+    let version = env!("CARGO_PKG_VERSION");
+    let expecting = format!("Datman Pull Accepter {}", version);
+    write_message(writer, &format!("Datman Pull Offerer {}", version))?;
+    writer.flush()?;
+
+    let found: String = read_message(reader)?;
+    ensure!(
+        found == expecting,
+        "Version mismatch. Expecting {:?} got {:?}",
+        expecting,
+        found
+    );
+
+    // First 'negotiate' (for now: assert) a pile bypass.
+    // This lets us avoid decompressing things before recompressing them at the other end,
+    // assuming both ends use the same dictionary.
+    negotiate_bypassed_pile(pile, bypass_pile, reader, writer)?;
+
+    let offered_pointers = offer_pointers(pile, writer)?;
+    // The peer cannot answer until it has seen the offer, so make sure the offer has
+    // actually left any buffer before we block on the reply.
+    writer.flush()?;
+    let wanted_pointers = read_message::<_, BTreeSet<String>>(reader)?;
+
+    let mut chunks_to_offer: BTreeSet<ChunkId> = BTreeSet::new();
+
+    for pointer_name in &wanted_pointers {
+        let pointer_data = offered_pointers
+            .get(pointer_name)
+            .with_context(|| format!("Requested pointer {:?} was not offered", pointer_name))?;
+
+        // The chunks that hold the tree itself...
+        collect_chunk_ids_from_chunkref(pile, &pointer_data.chunk_ref, &mut chunks_to_offer)?;
+
+        // ...and the chunks of every file in the tree.
+        let root_node = retrieve_tree_node(pile, pointer_data.chunk_ref.clone())?;
+        collect_chunk_ids(pile, &root_node.node, &mut chunks_to_offer)?;
+    }
+
+    write_message(writer, &chunks_to_offer)?;
+    writer.flush()?;
+
+    let chunks_to_skip: BTreeSet<ChunkId> = read_message(reader)?;
+    let chunks_to_send: Vec<ChunkId> = chunks_to_offer
+        .difference(&chunks_to_skip)
+        .cloned()
+        .collect();
+
+    // These sets are no longer needed; release them before the transfer starts.
+    drop(chunks_to_offer);
+    drop(chunks_to_skip);
+
+    // Order the chunks by the raw pile's transfer ordering hint.
+    let start_sort_by_hints = Instant::now();
+    let chunks_to_send_with_hints: BTreeSet<(u64, ChunkId)> = chunks_to_send
+        .into_iter()
+        .map(|chunk_id| {
+            pile.raw_pile
+                .chunk_id_transfer_ordering_hint(&chunk_id)
+                .map(|hint| (hint, chunk_id))
+        })
+        .collect::<anyhow::Result<_>>()?;
+    let time_to_sort_by_hints = start_sort_by_hints.elapsed();
+    info!(
+        "{} s to sort {} chunks by their hints",
+        time_to_sort_by_hints.as_secs_f32(),
+        chunks_to_send_with_hints.len()
+    );
+
+    progress.set_max_size(chunks_to_send_with_hints.len() as u64);
+    progress.set_current(0);
+    for (_hint, chunk_id) in chunks_to_send_with_hints {
+        let chunk_data = bypass_pile
+            .read(Keyspace::Chunk, &chunk_id)?
+            .context("Chunk vanished")?;
+
+        write_message(writer, &Some((chunk_id, chunk_data)))?;
+        progress.inc_progress(1);
+    }
+
+    // Terminator: the same `Option<(ChunkId, Vec<u8>)>` type as the chunk messages above.
+    write_message(writer, &None::<(ChunkId, Vec<u8>)>)?;
+
+    writer.flush()?;
+
+    Ok(())
+}
+
+/// Runs the accepting (receiving) half of a push/pull transfer.
+///
+/// Protocol, as seen from this side:
+/// 1. exchange version banners and check the peer's banner;
+/// 2. check that both piles have compatible bypass pipelines;
+/// 3. read the offered pointers and reply with the names we do not have yet;
+/// 4. read the offered chunk IDs and reply with the ones we already have;
+/// 5. receive chunks until the `None` terminator, writing each one to `bypass_pile`;
+/// 6. write the pointers we asked for and flush the pile.
+///
+/// `progress` is advanced by one for every chunk received.
+/// Fails if a wanted pointer's parent is neither present locally nor offered, if an
+/// unexpected chunk arrives, or if an expected chunk never arrives.
+pub fn accepting_side<R: Read, W: Write>(
+    pile: &Pile<Arc<Box<dyn RawPile>>>,
+    bypass_pile: &Box<dyn RawPile>,
+    reader: &mut R,
+    writer: &mut W,
+    mut progress: Box<dyn ProgressTracker>,
+) -> anyhow::Result<()> {
+    let version = env!("CARGO_PKG_VERSION");
+    let expecting = format!("Datman Pull Offerer {}", version);
+    write_message(writer, &format!("Datman Pull Accepter {}", version))?;
+    writer.flush()?;
+
+    let found: String = read_message(reader)?;
+    ensure!(
+        found == expecting,
+        "Version mismatch. Expecting {:?} got {:?}",
+        expecting,
+        found
+    );
+
+    // First 'negotiate' (for now: assert) a pile bypass.
+    // This lets us avoid decompressing things before recompressing them at the other end,
+    // assuming both ends use the same dictionary.
+    negotiate_bypassed_pile(pile, bypass_pile, reader, writer)?;
+
+    let offered_pointers: BTreeMap<String, PointerData> = read_message(reader)?;
+    let mut wanted_pointers: BTreeSet<String> = BTreeSet::new();
+
+    for (pointer_name, pointer_data) in &offered_pointers {
+        if pile.read_pointer(pointer_name)?.is_none() {
+            wanted_pointers.insert(pointer_name.clone());
+            if let Some(parent) = &pointer_data.parent_pointer {
+                if pile.read_pointer(parent)?.is_none() && !offered_pointers.contains_key(parent) {
+                    bail!(
+                        "Offered pointer {:?} requires parent {:?} which we don't have and isn't offered.",
+                        pointer_name,
+                        parent
+                    );
+                }
+            }
+        }
+    }
+
+    write_message(writer, &wanted_pointers)?;
+    writer.flush()?;
+
+    let offered_chunks: BTreeSet<ChunkId> = read_message(reader)?;
+    let mut chunks_to_skip: BTreeSet<ChunkId> = BTreeSet::new();
+    for chunk_id in &offered_chunks {
+        if pile.chunk_exists(chunk_id)? {
+            chunks_to_skip.insert(*chunk_id);
+        }
+    }
+
+    write_message(writer, &chunks_to_skip)?;
+    writer.flush()?;
+
+    let mut chunks_to_recv: BTreeSet<ChunkId> = offered_chunks
+        .difference(&chunks_to_skip)
+        .cloned()
+        .collect();
+    let num_chunks_to_recv = chunks_to_recv.len();
+
+    // These sets are no longer needed; release them before the transfer starts.
+    drop(offered_chunks);
+    drop(chunks_to_skip);
+
+    progress.set_max_size(num_chunks_to_recv as u64);
+    progress.set_current(0);
+
+    while let Some((chunk_id, chunk_data)) = read_message::<_, Option<(ChunkId, Vec<u8>)>>(reader)?
+    {
+        // Only accept chunks we asked for, and each of them only once.
+        ensure!(
+            chunks_to_recv.remove(&chunk_id),
+            "Received unexpected chunk"
+        );
+        bypass_pile.write(Keyspace::Chunk, &chunk_id, &chunk_data)?;
+        progress.inc_progress(1);
+    }
+
+    ensure!(chunks_to_recv.is_empty(), "Unreceived chunks.");
+
+    // Only write the pointers we requested: the chunks of the other offered pointers were
+    // never transferred, and a pointer we already hold must not be overwritten.
+    for (pointer_name, pointer_data) in &offered_pointers {
+        if wanted_pointers.contains(pointer_name) {
+            pile.write_pointer(pointer_name, pointer_data)?;
+        }
+    }
+    pile.flush()?;
+
+    Ok(())
+}
--- a/datman/src/commands/report.rs
+++ b/datman/src/commands/report.rs
@ -0,0 +1,456 @@
+use crate::commands::backup::split_pointer_name;
+use crate::descriptor::{Descriptor, DestPileDescriptor};
+use anyhow::Context;
+use chrono::{Date, DateTime, Utc};
+use comfy_table::presets::UTF8_FULL;
+use comfy_table::{Attribute, Cell, Color, ContentArrangement, Table};
+use humansize::FileSize;
+use itertools::Itertools;
+use log::info;
+use std::collections::{BTreeMap, BTreeSet};
+use std::ffi::CString;
+use std::io::Read;
+use std::mem;
+use std::mem::size_of;
+use std::os::unix::ffi::OsStrExt;
+use std::os::unix::fs::MetadataExt;
+use std::path::Path;
+use yama::chunking::RecursiveUnchunker;
+use yama::commands::{load_pile_descriptor, open_pile, retrieve_tree_node};
+use yama::definitions::{ChunkId, RecursiveChunkRef, TreeNode};
+use yama::pile::{DebugStatistics, Pile, RawPile};
+
+// This module generates reports for a Datman system.
+// Referenced Chunk IDs are counted and used to give an indication of size.
+// Chunk IDs are summarised into u32s to reduce memory usage. Since the report is approximate,
+// it doesn't matter if there are a few collisions (although they are still fairly unlikely to
+// affect much).
+
+/// The result of `generate_report`, ready to be printed.
+#[derive(Clone)]
+pub struct Report {
+    /// For every source name, the time of its last backup, or `None` if no pointer for
+    /// that source was found.
+    pub last_source_backups: BTreeMap<String, Option<DateTime<Utc>>>,
+
+    /// Whether the keys of `chunk_usage` are month groups rather than single pointer names.
+    pub chunk_usages_aggregated: bool,
+    /// Chunk counts per pointer (or per pointer group, when aggregated).
+    pub chunk_usage: BTreeMap<String, Sizes>,
+
+    /// Debug statistics of the raw pile, when it provides any; used to estimate the
+    /// average chunk size when printing sizes.
+    pub debug_stats: Option<DebugStatistics>,
+}
+
+/// Chunk counts attributed to one pointer (or to one pointer group, when aggregating).
+#[derive(Clone, Default)]
+pub struct Sizes {
+    /// Total number of chunks that we refer to.
+    pub total: u32,
+
+    /// Each referred chunk is counted once here, but divided by the number of sharers.
+    /// We are 'morally responsible' for this many chunks.
+    pub moral: u32,
+
+    /// Number of chunks that only we point to.
+    pub unique: u32,
+
+    /// Number of chunks for which we are the newest (lexicographically latest) pointer to
+    /// point to those chunks: `generate_report` walks the pointers in reverse name order and
+    /// credits each chunk to the first pointer it sees referring to that chunk.
+    pub rollup: u32,
+}
+
+/// A shortened chunk ID, used to reduce the memory needed for counting.
+type CondensedChunkId = u32;
+
+/// Condenses a chunk ID to its leading bytes, interpreted as a big-endian integer.
+fn condense_chunk_id(chunk_id: ChunkId) -> CondensedChunkId {
+    let mut prefix = [0u8; size_of::<CondensedChunkId>()];
+    prefix.copy_from_slice(&chunk_id[..size_of::<CondensedChunkId>()]);
+    CondensedChunkId::from_be_bytes(prefix)
+}
+
+/// Generates a report about the pile described by `dest_pile_descriptor`.
+///
+/// For every pointer in the pile, the (condensed) IDs of the chunks referenced by its files
+/// are collected. Pointers are then put into groups: one group per pointer, or one group
+/// per source and month when `aggregate_chunk_usage_by_month` is set. For each group the
+/// total, unique, 'moral' and rollup chunk counts are computed (see `Sizes`).
+///
+/// The report also records, for each source in `descriptor` and each source name found in
+/// a pointer name, when it was last backed up.
+///
+/// Fails if the pile cannot be opened or read, or, when aggregating by month, if a pointer
+/// name cannot be split into a source name and a date.
+pub fn generate_report(
+    dest_pile_descriptor: &DestPileDescriptor,
+    descriptor: &Descriptor,
+    aggregate_chunk_usage_by_month: bool,
+) -> anyhow::Result<Report> {
+    let pile_descriptor = load_pile_descriptor(&dest_pile_descriptor.path)?;
+    let pile = open_pile(&dest_pile_descriptor.path, &pile_descriptor)?;
+
+    let debug_stats = pile.raw_pile.debug_statistics()?;
+
+    // pointer name -> (parent pointer name, condensed chunk IDs of the pointer's own tree)
+    let mut pointers_to_parent_and_chunkids = BTreeMap::new();
+    // group name -> names of the pointers in that group
+    let mut pointergroups_to_pointers: BTreeMap<String, Vec<String>> = BTreeMap::new();
+
+    info!("Collecting chunk IDs... This will probably be slow.");
+    for pointer_name in pile.list_pointers()? {
+        let pointer = pile
+            .read_pointer(&pointer_name)?
+            .context("listed pointer doesn't exist")?;
+        let root_node = retrieve_tree_node(&pile, pointer.chunk_ref)?;
+        let pointer_chunk_ids = collect_chunk_ids(&pile, &root_node.node)?;
+
+        // When aggregating, the group is named '<source>+<year>-<month>'.
+        let pointergroup = if aggregate_chunk_usage_by_month {
+            let (base, date_time) =
+                split_pointer_name(&pointer_name).context("Can't split pointer name")?;
+            format!("{}+{}", base, date_time.format("%Y-%m"))
+        } else {
+            pointer_name.clone()
+        };
+
+        pointergroups_to_pointers
+            .entry(pointergroup)
+            .or_default()
+            .push(pointer_name.clone());
+
+        pointers_to_parent_and_chunkids
+            .insert(pointer_name, (pointer.parent_pointer, pointer_chunk_ids));
+    }
+
+    // Now we iterate in reverse order, making a list of count of Chunk IDs.
+    // At the same time, we can also calculate 'rollup' sizes.
+    // chunk -> number of groups that refer to it
+    let mut chunk_sharer_counts: BTreeMap<CondensedChunkId, u16> = BTreeMap::new();
+
+    let mut pointergroup_stats: BTreeMap<String, Sizes> = BTreeMap::new();
+
+    for (pointergroup_name, pointers_in_group) in pointergroups_to_pointers.iter().rev() {
+        // Chunks of the group, including those inherited from parent pointers,
+        // each counted once per group.
+        let mut deduped_chunks = BTreeSet::new();
+
+        for pointer_name in pointers_in_group {
+            deduped_chunks.extend(iter_over_all_chunkids_incl_parents(
+                &pointers_to_parent_and_chunkids,
+                &pointer_name,
+            ))
+        }
+
+        let mut rollup_count = 0;
+        for chunk in deduped_chunks {
+            let count = chunk_sharer_counts.entry(chunk).or_default();
+            *count += 1;
+            // A count of 1 means this group is the first one, in reverse name order,
+            // to refer to the chunk.
+            if *count == 1 {
+                rollup_count += 1;
+            }
+        }
+        let entry = pointergroup_stats
+            .entry(pointergroup_name.to_owned())
+            .or_default();
+        entry.rollup = rollup_count;
+    }
+
+    // Now go through again and update all the stats!
+    for (pointergroup_name, pointers_in_group) in &pointergroups_to_pointers {
+        let mut deduped_chunks = BTreeSet::new();
+
+        for pointer_name in pointers_in_group {
+            deduped_chunks.extend(iter_over_all_chunkids_incl_parents(
+                &pointers_to_parent_and_chunkids,
+                &pointer_name,
+            ))
+        }
+
+        let mut unique_count = 0;
+        // Index i holds the number of chunks shared by i + 1 groups; chunks with more
+        // than 256 sharers are counted in the last slot.
+        let mut shared_count_by_sharers = [0u32; 256];
+        let total_count = deduped_chunks.len();
+        for chunk in deduped_chunks {
+            let count = chunk_sharer_counts[&chunk];
+            if count == 1 {
+                unique_count += 1;
+            } else {
+                let num_sharers = (count as usize).min(256);
+                shared_count_by_sharers[num_sharers - 1] += 1;
+            }
+        }
+
+        // Each shared chunk contributes 1 / (number of sharers) to the moral size.
+        let mut sharers_sum: f64 = 0.0;
+        for (sharers_minus_one, count) in shared_count_by_sharers.into_iter().enumerate() {
+            sharers_sum += (count as f64) / (sharers_minus_one + 1) as f64;
+        }
+
+        let entry = pointergroup_stats
+            .entry(pointergroup_name.to_owned())
+            .or_default();
+        entry.moral = (sharers_sum.ceil() as u32) + unique_count;
+        entry.unique = unique_count;
+        entry.total = total_count as u32;
+    }
+
+    // Start with every configured source marked as never backed up.
+    let mut last_backed_up = BTreeMap::new();
+    for source_name in descriptor.sources.keys().cloned() {
+        last_backed_up.insert(source_name, None);
+    }
+
+    // Pointer names are visited in ascending order, so for each source the entry of the
+    // lexicographically last pointer name is the one that remains.
+    for pointer_name in pointers_to_parent_and_chunkids.keys() {
+        if let Some((source_name, date_time)) = split_pointer_name(&pointer_name) {
+            last_backed_up.insert(source_name, Some(date_time));
+        }
+    }
+
+    Ok(Report {
+        last_source_backups: last_backed_up,
+        chunk_usage: pointergroup_stats,
+        chunk_usages_aggregated: aggregate_chunk_usage_by_month,
+        debug_stats,
+    })
+}
+
+/// Iterates over the chunk IDs of `pointer_name`, followed by those of its parent, its
+/// parent's parent, and so on.
+///
+/// Duplicates are not filtered out. Panics if `pointer_name`, or any pointer in its chain
+/// of parents, is not a key of `pointers_to_parent_and_chunkids`.
+fn iter_over_all_chunkids_incl_parents<'a>(
+    pointers_to_parent_and_chunkids: &'a BTreeMap<
+        String,
+        (Option<String>, BTreeSet<CondensedChunkId>),
+    >,
+    pointer_name: &'a str,
+) -> Box<dyn Iterator<Item = CondensedChunkId> + 'a> {
+    let (parent, chunks) = &pointers_to_parent_and_chunkids[pointer_name];
+    let own_chunks = chunks.iter().copied();
+    if let Some(parent_name) = parent {
+        let inherited =
+            iter_over_all_chunkids_incl_parents(pointers_to_parent_and_chunkids, parent_name);
+        Box::new(own_chunks.chain(inherited))
+    } else {
+        Box::new(own_chunks)
+    }
+}
+
+/// Returns the condensed chunk IDs referenced by every normal file beneath `root`.
+///
+/// Other kinds of tree node contribute nothing.
+fn collect_chunk_ids<RP: RawPile>(
+    pile: &Pile<RP>,
+    root: &TreeNode,
+) -> anyhow::Result<BTreeSet<CondensedChunkId>> {
+    let mut chunk_ids = BTreeSet::new();
+    let mut on_node = |tree_node, _| {
+        if let TreeNode::NormalFile { content, .. } = tree_node {
+            collect_chunk_ids_from_chunkref(pile, content, &mut chunk_ids)?;
+        }
+        Ok(())
+    };
+    root.visit(&mut on_node, String::new())?;
+    Ok(chunk_ids)
+}
+
+/// Adds to `collection` the condensed IDs of the chunks that `chunk_ref` refers to.
+///
+/// At depth 0 the reference names a single chunk, which is inserted directly. At greater
+/// depths the reference is read one level shallower, and the resulting stream is consumed
+/// as a sequence of chunk IDs, each of which is condensed and inserted.
+fn collect_chunk_ids_from_chunkref<RP: RawPile>(
+    pile: &Pile<RP>,
+    chunk_ref: &RecursiveChunkRef,
+    collection: &mut BTreeSet<CondensedChunkId>,
+) -> anyhow::Result<()> {
+    if chunk_ref.depth == 0 {
+        collection.insert(condense_chunk_id(chunk_ref.chunk_id));
+        return Ok(());
+    }
+
+    let one_level_up = RecursiveChunkRef {
+        chunk_id: chunk_ref.chunk_id,
+        depth: chunk_ref.depth - 1,
+    };
+    let mut unchunker = RecursiveUnchunker::new(pile, one_level_up);
+    let mut id_buf: ChunkId = Default::default();
+    loop {
+        let filled = unchunker.read(&mut id_buf[..])?;
+        if filled == 0 {
+            // End of the stream of chunk IDs.
+            return Ok(());
+        }
+        if filled < id_buf.len() {
+            // Short read: fetch the remainder of this chunk ID.
+            unchunker.read_exact(&mut id_buf[filled..])?;
+        }
+        collection.insert(condense_chunk_id(id_buf));
+    }
+}
+
+/// Prints the backup-time table followed by the pile-size table.
+pub fn print_report(report: &Report) -> anyhow::Result<()> {
+    print_time_report(report)?;
+    print_size_report(report)
+}
+
+/// Prints a table of sources and the date each one was last backed up.
+///
+/// Rows are sorted by date, with sources that were never backed up first. The date is
+/// coloured green when less than 2 days old, yellow when less than 14 days old and red
+/// otherwise; it blinks when more than 28 days old.
+pub fn print_time_report(report: &Report) -> anyhow::Result<()> {
+    println!("\nBackup times");
+    let mut table = Table::new();
+    table
+        .load_preset(UTF8_FULL)
+        .set_content_arrangement(ContentArrangement::DynamicFullWidth)
+        .enforce_styling();
+
+    table.set_header(vec![
+        Cell::new("Source name").fg(Color::Cyan),
+        Cell::new("Last backed up").fg(Color::Cyan),
+    ]);
+
+    let today = Utc::today();
+
+    // `None` sorts before any date, so never-backed-up sources come first.
+    let sort_by_dates: Vec<(Option<Date<Utc>>, String)> = report
+        .last_source_backups
+        .iter()
+        .map(|(name, datetime)| (datetime.map(|dt| dt.date()), name.to_owned()))
+        .sorted()
+        .collect();
+
+    for (date, source_name) in sort_by_dates {
+        let name_cell = Cell::new(source_name).fg(Color::Magenta);
+
+        let status_cell = match date {
+            None => Cell::new("NEVER").fg(Color::Red).add_attributes(vec![
+                Attribute::SlowBlink,
+                Attribute::RapidBlink,
+                Attribute::Bold,
+            ]),
+            Some(date) => {
+                let number_of_days = today.signed_duration_since(date).num_days();
+                let num_days_human = if number_of_days > 0 {
+                    format!("{number_of_days} days ago")
+                } else {
+                    "today".to_owned()
+                };
+
+                let colour = match number_of_days {
+                    days if days < 2 => Color::Green,
+                    days if days < 14 => Color::Yellow,
+                    _ => Color::Red,
+                };
+
+                let formatted_date = date.format("%F");
+                let dated_cell =
+                    Cell::new(format!("{formatted_date} {num_days_human}")).fg(colour);
+
+                if number_of_days > 28 {
+                    dated_cell.add_attribute(Attribute::SlowBlink)
+                } else {
+                    dated_cell
+                }
+            }
+        };
+
+        table.add_row(vec![name_cell, status_cell]);
+    }
+
+    println!("{table}");
+
+    Ok(())
+}
+
+/// Prints a table with the rollup, unique, moral and total chunk counts of every pointer
+/// (or pointer group) in the report.
+///
+/// When the report carries debug statistics, the average chunk size derived from them is
+/// used to add an estimated byte size to each count.
+pub fn print_size_report(report: &Report) -> anyhow::Result<()> {
+    println!("\nPile size");
+    let mut table = Table::new();
+    table
+        .load_preset(UTF8_FULL)
+        .set_content_arrangement(ContentArrangement::DynamicFullWidth)
+        .enforce_styling();
+
+    let mut header = vec![Cell::new("Pointer name").fg(Color::Cyan)];
+    header.extend(
+        ["Rollup size", "Unique size", "Moral size", "Total size"]
+            .iter()
+            .map(|title| Cell::new(title).fg(Color::Magenta)),
+    );
+    table.set_header(header);
+
+    let average_chunk_size = match report.debug_stats.as_ref() {
+        Some(stats) => Some(stats.total_chunk_size as f64 / stats.number_of_chunks as f64),
+        None => None,
+    };
+
+    for (pointer_name, sizes) in &report.chunk_usage {
+        let mut row = vec![Cell::new(pointer_name).fg(Color::Blue)];
+        // Same column order as the header.
+        row.extend(
+            [sizes.rollup, sizes.unique, sizes.moral, sizes.total]
+                .iter()
+                .map(|&chunks| {
+                    Cell::new(format_size(chunks, average_chunk_size)).fg(Color::Yellow)
+                }),
+        );
+        table.add_row(row);
+    }
+
+    println!("{table}");
+
+    Ok(())
+}
+
+/// Formats a chunk count as `<n> c`, followed by ` ~<size>` when an average chunk size is
+/// available to estimate the number of bytes (binary units, one decimal place).
+fn format_size(chunks: u32, average_chunk_size: Option<f64>) -> String {
+    let est_size_suffix = match average_chunk_size {
+        Some(bytes_per_chunk) => {
+            let mut opts = humansize::file_size_opts::BINARY;
+            opts.decimal_places = 1;
+            let num_bytes = (chunks as f64 * bytes_per_chunk) as u64;
+            format!(" ~{}", num_bytes.file_size(opts).unwrap())
+        }
+        None => String::new(),
+    };
+    format!("{} c{}", chunks, est_size_suffix)
+}
+
+/// Sums the sizes reported by the metadata of every entry beneath `dir`, recursively.
+///
+/// The size reported for a subdirectory entry itself is included, in addition to the sizes
+/// of its contents.
+fn calculate_total_filesize_of_dir(dir: &Path) -> anyhow::Result<u64> {
+    let mut total = 0;
+    for entry in std::fs::read_dir(dir)? {
+        let entry = entry?;
+        let metadata = entry.metadata()?;
+        let own_size = metadata.size();
+        total += own_size;
+        if !metadata.is_dir() {
+            continue;
+        }
+        // Descend into the subdirectory and add everything below it.
+        total += calculate_total_filesize_of_dir(&entry.path())?;
+    }
+    Ok(total)
+}
+
+/// Prints a table describing the filesystem that holds the pile at `pile_path`:
+/// its theoretical size, usable size, used space, space used by the pile directory and
+/// available space.
+///
+/// The available space is coloured red below 8 GiB, yellow below 64 GiB and green otherwise.
+///
+/// Fails if the pile directory cannot be walked, if the path contains a NUL byte, or if
+/// the `statfs` call fails.
+pub fn print_filesystem_space(pile_path: &Path) -> anyhow::Result<()> {
+    let usage_for_pile = calculate_total_filesize_of_dir(pile_path)?;
+
+    let path_c = CString::new(pile_path.as_os_str().as_bytes())
+        .context("pile path contains a NUL byte")?;
+    let stats = unsafe {
+        let mut stats: libc::statfs = mem::zeroed();
+        match libc::statfs(path_c.as_ptr(), &mut stats) {
+            0 => Ok(stats),
+            // statfs signals failure with -1 and stores the actual error code in errno,
+            // so the error must be taken from errno and not from the return value.
+            _ => Err(std::io::Error::last_os_error()),
+        }
+    }?;
+
+    // On a BTRFS system with 2 disks in RAID1, note (about df -h):
+    // - 'Size' shows the average size of the two disks. I think of it as 'ideal size'.
+    // - 'Avail' seems to show the actual number of bytes usable.
+    // - 'Used' seems to show the actual number of bytes used.
+    // In short: probably avoid relying on 'size'.
+
+    let block_size = stats.f_bsize as i64;
+    let used_bytes = (stats.f_blocks - stats.f_bfree) as i64 * block_size;
+    let avail_bytes = stats.f_bavail as i64 * block_size;
+    let usable_bytes = used_bytes + avail_bytes;
+    let theoretical_size = stats.f_blocks as i64 * block_size;
+
+    let mut format = humansize::file_size_opts::BINARY;
+    format.decimal_places = 1;
+    format.decimal_zeroes = 1;
+
+    println!("\nFilesystem Information");
+
+    let mut table = Table::new();
+    table
+        .load_preset(UTF8_FULL)
+        .set_content_arrangement(ContentArrangement::DynamicFullWidth)
+        .enforce_styling();
+    table.set_header(vec![
+        Cell::new("Theoretical Size").fg(Color::Cyan),
+        Cell::new("Usable Size").fg(Color::Cyan),
+        Cell::new("Used").fg(Color::Cyan),
+        Cell::new("Used for Pile").fg(Color::Cyan),
+        Cell::new("Available").fg(Color::Cyan),
+    ]);
+
+    let available_space_colour = if avail_bytes < 8 * 1024 * 1024 * 1024 {
+        Color::Red
+    } else if avail_bytes < 64 * 1024 * 1024 * 1024 {
+        Color::Yellow
+    } else {
+        Color::Green
+    };
+
+    table.add_row(vec![
+        Cell::new(format!(
+            "{:>9}",
+            theoretical_size.file_size(&format).unwrap()
+        ))
+        .fg(Color::Blue),
+        Cell::new(format!("{:>9}", usable_bytes.file_size(&format).unwrap())).fg(Color::Blue),
+        Cell::new(format!("{:>9}", used_bytes.file_size(&format).unwrap())).fg(Color::Blue),
+        Cell::new(format!("{:>9}", usage_for_pile.file_size(&format).unwrap())).fg(Color::Blue),
+        Cell::new(format!("{:>9}", avail_bytes.file_size(&format).unwrap()))
+            .fg(available_space_colour),
+    ]);
+
+    print!("{table}");
+
+    Ok(())
+}
--- a/datman/src/descriptor.rs
+++ b/datman/src/descriptor.rs
@ -38,6 +38,10 @@ pub struct Descriptor {
    pub piles: HashMap<String, DestPileDescriptor>,

    pub remote_hosts: HashMap<String, RemoteHostDescriptor>,
+
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub retention: Option<RetentionPolicyConfig>,
 }

 #[derive(Clone, Serialize, Deserialize, Debug)]
@ -46,12 +50,22 @@ pub struct RemoteHostDescriptor {
    pub path_to_datman: Option<String>,
 }

+/// Retention policy settings, stored in the optional `retention` field of [`Descriptor`].
+///
+/// None of the fields carries a serde default, so all four must be given whenever the
+/// section is present.
+// NOTE(review): each count is presumably the number of backups to keep for periods of
+// that length — confirm against the code that consumes this config.
+#[derive(Clone, Serialize, Deserialize, Debug)]
+pub struct RetentionPolicyConfig {
+    /// Count applied at daily granularity.
+    pub daily: u32,
+    /// Count applied at weekly granularity.
+    pub weekly: u32,
+    /// Count applied at monthly granularity.
+    pub monthly: u32,
+    /// Count applied at yearly granularity.
+    pub yearly: u32,
+}
+
 #[derive(Clone, Serialize, Deserialize, Debug)]
 #[serde(untagged)]
 pub enum SourceDescriptor {
    DirectorySource {
        hostname: String,
        directory: PathBuf,
+        #[serde(default)]
+        cross_filesystems: bool,
    },
    VirtualSource {
        /// The name of the helper program that will be used to do this backup.
--- a/datman/src/labelling.rs
+++ b/datman/src/labelling.rs
@ -15,10 +15,10 @@ You should have received a copy of the GNU General Public License
 along with Yama.  If not, see <https://www.gnu.org/licenses/>.
 */

-use std::collections::HashMap;
+use std::collections::{BTreeSet, HashMap};
 use std::fs::File;
 use std::io::{BufRead, BufReader, Write};
-use std::path::Path;
+use std::path::{Path, PathBuf};

 use anyhow::anyhow;
 use anyhow::Context;
@ -222,6 +222,23 @@ impl LabellingRules {
        }
        None
    }
+
+    /// Builds the set of full paths for every position-based rule whose state is `Excluded`.
+    ///
+    /// Each rule path is appended textually to `base`, so rule paths are expected to be
+    /// either empty (the base itself) or to begin with `/`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if any rule path (excluded or not) is non-empty and does not start with `/`,
+    /// or if there is at least one rule and `base` is not valid UTF-8.
+    pub fn get_exclusions_set(&self, base: &Path) -> BTreeSet<PathBuf> {
+        self.position_based_rules
+            .iter()
+            .filter_map(|(rule_path, rule_state)| {
+                // Validate and build the path for every rule, not just excluded ones.
+                assert!(rule_path.is_empty() || rule_path.starts_with('/'));
+                let base_str = base.to_str().expect("base path must always be utf-8");
+                let candidate = PathBuf::from(format!("{base_str}{rule_path}"));
+                (rule_state == &Excluded).then(|| candidate)
+            })
+            .collect()
+    }
 }

 /// Uninteractively label the nodes.
--- a/datman/src/remote/backup_source_requester.rs
+++ b/datman/src/remote/backup_source_requester.rs
@ -1,18 +1,22 @@
 use crate::commands::backup::{get_pointer_name_at, label_filter_and_convert};
 use crate::descriptor::{Descriptor, DestPileDescriptor, SourceDescriptor};
+use crate::labelling::load_labelling_rules;
 use crate::tree::FileTree;
 use anyhow::{anyhow, bail};
 use chrono::Utc;
 use log::info;
+use std::collections::BTreeSet;
 use std::io::{Read, Write};
-use std::path::Path;
+use std::path::{Path, PathBuf};
 use std::process::{Child, Command, Stdio};
 use std::sync::Arc;
 use yama::commands::{load_pile_descriptor, open_pile};
-use yama::definitions::TreeNode;
-use yama::pile::{Pile, RawPile};
+use yama::definitions::{PartialPointerData, TreeNode};
+use yama::operations::storing::{pointer_ops_prepare_to_store, pointers_ops_after_store};
+use yama::pile::access_guard::PileGuard;
+use yama::pile::{Pile, RawPile, StoragePipelineSettings};
 use yama::progress::ProgressTracker;
-use yama::remote::responder::Responder;
+use yama::remote::responder::{Responder, ResponderWritingPipeline};
 use yama::remote::{read_message, write_message};
 use yama::utils::get_number_of_workers;

@ -45,10 +49,14 @@ pub fn scanning<R: Read, W: Write>(
    read: &mut R,
    write: &mut W,
    path: &Path,
+    one_filesystem: bool,
+    exclusions: &BTreeSet<PathBuf>,
 ) -> anyhow::Result<Option<FileTree<(), (), (), ()>>> {
    info!("Scanning.");
    write_message(write, &"scan")?;
    write_message(write, &path)?;
+    write_message(write, &one_filesystem)?;
+    write_message(write, exclusions)?;
    write.flush()?;
    let scan_result: Option<FileTree<(), (), (), ()>> = read_message(read)?;

@ -64,25 +72,43 @@ pub fn chunking<
    read: R,
    mut write: W,
    path: &Path,
-    pointer_name: String,
    tree_node: &TreeNode,
    raw_pile: Arc<RP>,
-    parent: Option<String>,
    progress_bar: PT,
-) -> anyhow::Result<(R, W)> {
+    use_writing_pipeline: bool,
+) -> anyhow::Result<(R, W, PartialPointerData)> {
    info!("Chunking.");
    write_message(&mut write, &"chunk")?;
    write_message(&mut write, &path)?;
-    write_message(&mut write, &pointer_name)?;
    write_message(&mut write, tree_node)?;
-    write_message(&mut write, &parent)?;
    write.flush()?;

+    let (writing_pipeline, control_rx) = if use_writing_pipeline {
+        let sps = StoragePipelineSettings {
+            num_compressors: get_number_of_workers("YAMA_PL_COMPRESSORS") as u32,
+            compressor_input_bound: 32,
+            writer_input_bound: 32,
+        };
+        let (control_tx, control_rx) = crossbeam_channel::unbounded();
+        let pipeline = raw_pile.build_storage_pipeline(sps, control_tx)?;
+        (
+            Some(ResponderWritingPipeline {
+                pipeline_submission: pipeline,
+            }),
+            Some(control_rx),
+        )
+    } else {
+        (None, None)
+    };
+
+    let guarded_pile = PileGuard::new(Arc::clone(&raw_pile), true);
+
    let (r_handle, w_handle, join_handles) = Responder::start(
        read,
        write,
        get_number_of_workers("YAMA_RESPONDERS") as u16,
-        raw_pile,
+        Arc::new(guarded_pile),
+        writing_pipeline,
        progress_bar,
    );

@ -91,12 +117,20 @@ pub fn chunking<
    for handle in join_handles {
        handle.join().expect("Join handle should not fail");
    }
-    let read = r_handle.join().unwrap();
+    let mut read = r_handle.join().unwrap();
    let write = w_handle.join().unwrap();

+    if let Some(control_rx) = control_rx {
+        while let Ok(_) = control_rx.recv() {
+            // TODO nop
+        }
+    }
+
    info!("Remote finished chunking.");

-    Ok((read, write))
+    let pointer_data: PartialPointerData = read_message(&mut read)?;
+
+    Ok((read, write, pointer_data))
 }

 pub fn quit<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::Result<()> {
@ -149,6 +183,7 @@ pub fn backup_remote_source_to_destination<PT: ProgressTracker + Send + 'static>
        SourceDescriptor::DirectorySource {
            hostname,
            directory,
+            cross_filesystems,
        } => {
            let remote_host_descriptor = descriptor
                .remote_hosts
@ -168,13 +203,22 @@ pub fn backup_remote_source_to_destination<PT: ProgressTracker + Send + 'static>
            info!("Connecting...");
            introduction(&mut read, &mut write)?;

+            let rules = load_labelling_rules(desc_path, source_name)?;
+            let exclusions = rules.get_exclusions_set(directory);
+
            // then request to scan
            info!("Requesting scan... (this may take some time)");
-            let scan_result = scanning(&mut read, &mut write, directory.as_ref())?
+            let scan_result = scanning(
+                &mut read,
+                &mut write,
+                directory.as_ref(),
+                !*cross_filesystems,
+                &exclusions,
+            )?
            .ok_or_else(|| anyhow!("Remote scan failed (does the directory exist?)"))?;

-            let root =
-                label_filter_and_convert(scan_result, descriptor, desc_path, source_name, dest)?
+            let mut root =
+                label_filter_and_convert(scan_result, descriptor, source_name, &rules, dest)?
                    .ok_or_else(|| anyhow!("Empty filter..."))?;

            let absolute_dest_path = desc_path.join(&dest.path);
@ -183,7 +227,10 @@ pub fn backup_remote_source_to_destination<PT: ProgressTracker + Send + 'static>

            let pointer_name = get_pointer_name_at(&source_name, Utc::now());
            if pile.read_pointer(pointer_name.as_str())?.is_some() {
-                bail!("Pointer by name {:?} already exists; refusing to overwrite.", pointer_name);
+                bail!(
+                    "Pointer by name {:?} already exists; refusing to overwrite.",
+                    pointer_name
+                );
            }
            info!("Will write as pointer {:?}.", pointer_name);

@ -219,18 +266,27 @@ pub fn backup_remote_source_to_destination<PT: ProgressTracker + Send + 'static>
            let raw_pile = Arc::new(pile.raw_pile);
            let pile = Pile::new(raw_pile.clone());

-            let (mut read, mut write) = chunking(
+            pointer_ops_prepare_to_store(&pile, &mut root, &parent)?;
+
+            info!(
+                "Have pointer_name = {:?}, parent = {:?}",
+                pointer_name, parent
+            );
+
+            let (mut read, mut write, pointer_data) = chunking(
                read,
                write,
                directory.as_ref(),
-                pointer_name.clone(),
                &root,
                raw_pile,
-                parent,
                progress_bar,
+                true,
            )?;

            quit(&mut read, &mut write)?;
+
+            pointers_ops_after_store(&pile, &pointer_name, &pointer_data.complete(parent))?;
+
            pile.flush()?;

            info!("Stored! Checking for existence...");
--- a/datman/src/remote/backup_source_responder.rs
+++ b/datman/src/remote/backup_source_responder.rs
@ -1,20 +1,25 @@
 // This file implements the responder side of the backup source protocol -- the protocol used
 // to connect to remote backup sources.

-use crate::tree::scan;
+use std::collections::BTreeSet;
+use std::io::{stdin, stdout, Read, Write};
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Instant;
+
 use anyhow::bail;
 use crossbeam_channel::Sender;
 use log::info;
-use std::io::{stdin, stdout, Read, Write};
-use std::path::PathBuf;
-use std::time::Instant;
-use yama::definitions::TreeNode;
+
+use yama::definitions::{PartialPointerData, TreeNode};
 use yama::pile::{Pile, RawPile};
 use yama::progress::ProgressTracker;
 use yama::remote::requester::Requester;
 use yama::remote::{read_message, write_message, RequestBody, ResponseBody};
 use yama::utils::get_number_of_workers;

+use crate::tree::scan;
+
 pub fn introduction<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::Result<()> {
    let version = env!("CARGO_PKG_VERSION");
    write_message(
@ -38,47 +43,14 @@ pub fn introduction<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::R

 pub fn scanning<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::Result<()> {
    let path: PathBuf = read_message(read)?;
-    let scan_result = scan(&path)?;
+    let one_filesystem: bool = read_message(read)?;
+    let exclusions: BTreeSet<PathBuf> = read_message(read)?;
+    let scan_result = scan(&path, one_filesystem, &exclusions)?;
    write_message(write, &scan_result)?;
    write.flush()?;
    Ok(())
 }

-pub fn chunking<R: Read + Send + 'static, W: Write + Send + 'static>(
-    mut read: R,
-    write: W,
-) -> anyhow::Result<()> {
-    let path: PathBuf = read_message(&mut read)?;
-    let pointer_name: String = read_message(&mut read)?;
-    let tree_node: TreeNode = read_message(&mut read)?;
-    let parent: Option<String> = read_message(&mut read)?;
-
-    let (yama_requester, requester_join_handles) = Requester::new(read, write);
-
-    let raw_pile: Box<dyn RawPile> = Box::new(yama_requester);
-
-    let pile = Pile::new(raw_pile);
-
-    // TODO TODO progress
-    let progress_bar = &mut ();
-
-    yama::operations::storing::store_fully(
-        &pile,
-        &path,
-        &pointer_name,
-        tree_node,
-        parent,
-        get_number_of_workers("YAMA_CHUNKERS"),
-        progress_bar,
-    )?;
-
-    for join_handle in requester_join_handles {
-        join_handle.join().expect("Expected to join handle");
-    }
-
-    Ok(())
-}
-
 pub struct ProgressSender {
    pub last_sent: Instant,
    pub current_progress: u64,
@ -89,7 +61,7 @@ pub struct ProgressSender {
 }

 impl ProgressSender {
-    pub fn send_now(&mut self, include_message: bool) {
+    pub fn send_now(&mut self, _include_message: bool) {
        self.sender
            .send((
                RequestBody::Progress {
@ -103,7 +75,6 @@ impl ProgressSender {
    }

    pub fn send_if_overdue(&mut self) {
-        //info!("send if overdue...");
        if Instant::now().duration_since(self.last_sent).as_millis() >= 1024 {
            self.send_now(false);
        }
@ -127,23 +98,17 @@ impl ProgressTracker for ProgressSender {
    }
 }

-pub fn chunking_stdio() -> anyhow::Result<()> {
-    let (path, pointer_name, tree_node, parent) = {
+// TODO use io-streams crate and get rid of the duplication!!
+pub fn chunking_stdio() -> anyhow::Result<PartialPointerData> {
+    let (path, tree_node) = {
        let stdin = stdin();
        let mut read = stdin.lock();
        let path: PathBuf = read_message(&mut read)?;
-        let pointer_name: String = read_message(&mut read)?;
        let tree_node: TreeNode = read_message(&mut read)?;
-        let parent: Option<String> = read_message(&mut read)?;
-        (path, pointer_name, tree_node, parent)
+        (path, tree_node)
    };

-    info!(
-        "Have pointer_name = {:?}, parent = {:?}",
-        pointer_name, parent
-    );
-
-    let requester_join_handles = {
+    let (pointer_data, requester_join_handles) = {
        let (yama_requester, requester_join_handles) = Requester::new_from_stdio();
        let command_sender = yama_requester.clone_command_sender();
        info!("progress sender in use");
@ -159,17 +124,15 @@ pub fn chunking_stdio() -> anyhow::Result<()> {

        let pile = Pile::new(raw_pile);

-        yama::operations::storing::store_fully(
-            &pile,
+        let pointer_data = yama::operations::storing::store_without_pointer_ops(
+            &Arc::new(pile),
            &path,
-            &pointer_name,
            tree_node,
-            parent,
            get_number_of_workers("YAMA_CHUNKERS"),
            &mut progress_bar,
        )?;

-        requester_join_handles
+        (pointer_data, requester_join_handles)
    };

    info!("Waiting to join.");
@ -180,20 +143,7 @@ pub fn chunking_stdio() -> anyhow::Result<()> {

    info!("Chunking completed.");

-    Ok(())
-}
-
-pub fn handler<R: Read + Send + 'static, W: Write + Send + 'static>(
-    mut read: R,
-    mut write: W,
-) -> anyhow::Result<()> {
-    introduction(&mut read, &mut write)?;
-
-    scanning(&mut read, &mut write)?;
-
-    chunking(read, write)?;
-
-    Ok(())
+    Ok(pointer_data)
 }

 pub fn handler_stdio() -> anyhow::Result<()> {
@ -214,7 +164,13 @@ pub fn handler_stdio() -> anyhow::Result<()> {
            }
            "chunk" => {
                info!("Chunking.");
-                chunking_stdio()?;
+                drop(read);
+                drop(write);
+                let pointer_data = chunking_stdio()?;
+                read = stdin.lock();
+                write = stdout.lock();
+                write_message(&mut write, &pointer_data)?;
+                write.flush()?;
            }
            "exit" => {
                write_message(&mut write, &"exit")?;
--- a/datman/src/tree.rs
+++ b/datman/src/tree.rs
@ -15,16 +15,16 @@ You should have received a copy of the GNU General Public License
 along with Yama.  If not, see <https://www.gnu.org/licenses/>.
 */

-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, BTreeSet};
 use std::fmt::Debug;
 use std::fs::{read_link, symlink_metadata, DirEntry, Metadata};
 use std::io::ErrorKind;
 use std::os::unix::fs::MetadataExt;
-use std::path::Path;
+use std::path::{Path, PathBuf};

 use anyhow::anyhow;
 use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
-use log::warn;
+use log::{debug, info, warn};
 use serde::{Deserialize, Serialize};

 pub use yama::definitions::FilesystemOwnership;
@ -216,12 +216,18 @@ pub fn mtime_msec(metadata: &Metadata) -> u64 {
 }

 /// Scan the filesystem to produce a Tree, using a default progress bar.
+///
+/// `one_filesystem` requests that the scan stops at filesystem boundaries; `exclusions`
+/// is handed to `scan_with_progress_bar` unchanged. The progress bar is finished at its
+/// current position whether or not the scan succeeded.
-pub fn scan(path: &Path) -> anyhow::Result<Option<FileTree<(), (), (), ()>>> {
+pub fn scan(
+    path: &Path,
+    one_filesystem: bool,
+    exclusions: &BTreeSet<PathBuf>,
+) -> anyhow::Result<Option<FileTree<(), (), (), ()>>> {
    let pbar = ProgressBar::with_draw_target(0, ProgressDrawTarget::stdout_with_hz(2));
    pbar.set_style(ProgressStyle::default_spinner().template("{spinner} {pos:7} {msg}"));
    pbar.set_message("dir scan");

-    let result = scan_with_progress_bar(path, &pbar);
+    // `Some(None)`: restriction enabled, device ID not known yet -- `scan_with_progress_bar`
+    // records the device of the first path it examines. `None`: no restriction.
+    let one_filesystem = if one_filesystem { Some(None) } else { None };
+
+    let result = scan_with_progress_bar(path, &pbar, one_filesystem, exclusions);
    pbar.finish_at_current_pos();
    result
 }
@ -230,7 +236,15 @@ pub fn scan(path: &Path) -> anyhow::Result<Option<FileTree<(), (), (), ()>>> {
 pub fn scan_with_progress_bar(
    path: &Path,
    progress_bar: &ProgressBar,
+    mut one_filesystem: Option<Option<u64>>,
+    exclusions: &BTreeSet<PathBuf>,
 ) -> anyhow::Result<Option<FileTree<(), (), (), ()>>> {
+    if exclusions.contains(path) {
+        // Don't enter excluded paths.
+        debug!("Not descending into excluded path: {:?}", path);
+        return Ok(None);
+    }
+
    let metadata_res = symlink_metadata(path);
    progress_bar.inc(1);
    if let Err(e) = &metadata_res {
@ -249,6 +263,14 @@ pub fn scan_with_progress_bar(
    let metadata = metadata_res?;
    let filetype = metadata.file_type();

+    if let Some(one_filesystem) = one_filesystem.as_mut() {
+        let this_fs = metadata.dev();
+        if *one_filesystem.get_or_insert(this_fs) != this_fs {
+            info!("Stopping at filesystem boundary: {:?}", path);
+            return Ok(None);
+        }
+    }
+
    /*let name = path
    .file_name()
    .ok_or(anyhow!("No filename, wat"))?
@ -294,15 +316,23 @@ pub fn scan_with_progress_bar(

        for entry in dir_read? {
            let entry: DirEntry = entry?;
-            let scanned = scan_with_progress_bar(&entry.path(), progress_bar)?;
+
+            if entry.file_name() == ".datmanskip" {
+                // Directories with .datmanskip in them are to be skipped entirely.
+                // TODO(perf): should this be checked upfront before some children may already
+                //     have been scanned?
+                debug!("Skipping {path:?} because it has a .datmanskip file.");
+                return Ok(None);
+            }
+
+            let scanned =
+                scan_with_progress_bar(&entry.path(), progress_bar, one_filesystem, exclusions)?;
            if let Some(scanned) = scanned {
-                children.insert(
-                    entry
-                        .file_name()
-                        .into_string()
-                        .expect("OsString not String"),
-                    scanned,
-                );
+                if let Ok(filename) = entry.file_name().into_string() {
+                    children.insert(filename, scanned);
+                } else {
+                    warn!("Non-UTF-8 filename; ignoring: {:?}", entry.file_name())
+                }
            }
        }

--- a/flake.lock
+++ b/flake.lock
@ -0,0 +1,177 @@
+{
+  "nodes": {
+    "flake-utils": {
+      "inputs": {
+        "systems": "systems"
+      },
+      "locked": {
+        "lastModified": 1710146030,
+        "narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "naersk": {
+      "inputs": {
+        "nixpkgs": [
+          "nixpkgs"
+        ]
+      },
+      "locked": {
+        "lastModified": 1662220400,
+        "narHash": "sha256-9o2OGQqu4xyLZP9K6kNe1pTHnyPz0Wr3raGYnr9AIgY=",
+        "owner": "nix-community",
+        "repo": "naersk",
+        "rev": "6944160c19cb591eb85bbf9b2f2768a935623ed3",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-community",
+        "repo": "naersk",
+        "type": "github"
+      }
+    },
+    "nix-github-actions": {
+      "inputs": {
+        "nixpkgs": [
+          "poetry2nix",
+          "nixpkgs"
+        ]
+      },
+      "locked": {
+        "lastModified": 1703863825,
+        "narHash": "sha256-rXwqjtwiGKJheXB43ybM8NwWB8rO2dSRrEqes0S7F5Y=",
+        "owner": "nix-community",
+        "repo": "nix-github-actions",
+        "rev": "5163432afc817cf8bd1f031418d1869e4c9d5547",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-community",
+        "repo": "nix-github-actions",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1714971268,
+        "narHash": "sha256-IKwMSwHj9+ec660l+I4tki/1NRoeGpyA2GdtdYpAgEw=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "27c13997bf450a01219899f5a83bd6ffbfc70d3c",
+        "type": "github"
+      },
+      "original": {
+        "id": "nixpkgs",
+        "ref": "nixos-23.11",
+        "type": "indirect"
+      }
+    },
+    "poetry2nix": {
+      "inputs": {
+        "flake-utils": "flake-utils",
+        "nix-github-actions": "nix-github-actions",
+        "nixpkgs": [
+          "nixpkgs"
+        ],
+        "systems": "systems_2",
+        "treefmt-nix": "treefmt-nix"
+      },
+      "locked": {
+        "lastModified": 1715017507,
+        "narHash": "sha256-RN2Vsba56PfX02DunWcZYkMLsipp928h+LVAWMYmbZg=",
+        "owner": "nix-community",
+        "repo": "poetry2nix",
+        "rev": "e6b36523407ae6a7a4dfe29770c30b3a3563b43a",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-community",
+        "repo": "poetry2nix",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "naersk": "naersk",
+        "nixpkgs": "nixpkgs",
+        "poetry2nix": "poetry2nix",
+        "utils": "utils"
+      }
+    },
+    "systems": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    },
+    "systems_2": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "id": "systems",
+        "type": "indirect"
+      }
+    },
+    "treefmt-nix": {
+      "inputs": {
+        "nixpkgs": [
+          "poetry2nix",
+          "nixpkgs"
+        ]
+      },
+      "locked": {
+        "lastModified": 1714058656,
+        "narHash": "sha256-Qv4RBm4LKuO4fNOfx9wl40W2rBbv5u5m+whxRYUMiaA=",
+        "owner": "numtide",
+        "repo": "treefmt-nix",
+        "rev": "c6aaf729f34a36c445618580a9f95a48f5e4e03f",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "treefmt-nix",
+        "type": "github"
+      }
+    },
+    "utils": {
+      "locked": {
+        "lastModified": 1659877975,
+        "narHash": "sha256-zllb8aq3YO3h8B/U0/J1WBgAL8EX5yWf5pMj3G0NAmc=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "c0e246b9b83f637f4681389ecabcb2681b4f3af0",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
--- a/flake.nix
+++ b/flake.nix
@ -0,0 +1,92 @@
+{
+  description = "Yama and Datman";
+
+  inputs = {
+    utils.url = "github:numtide/flake-utils";
+    naersk = {
+      url = "github:nix-community/naersk";
+      inputs.nixpkgs.follows = "nixpkgs";
+    };
+    nixpkgs.url = "nixpkgs/nixos-23.11";
+    poetry2nix = {
+      url = "github:nix-community/poetry2nix";
+      inputs.nixpkgs.follows = "nixpkgs";
+    };
+  };
+
+  outputs = { self, nixpkgs, utils, naersk, poetry2nix }:
+    utils.lib.eachDefaultSystem (system: let
+      pkgs = nixpkgs.legacyPackages."${system}";
+      inherit (poetry2nix.lib.mkPoetry2Nix { inherit pkgs; }) mkPoetryApplication;
+      naersk-lib = naersk.lib."${system}";
+
+      # The Rust workspace, built with naersk; its bin/ provides `yama` and `datman`.
+      rustComponents = naersk-lib.buildPackage {
+        pname = "yama";
+        root = ./.;
+
+        # Tools executed on the build machine. pkg-config has to be a native build
+        # input so that it is found (and its setup hook runs) under strictDeps and
+        # when cross-compiling.
+        nativeBuildInputs = with pkgs; [
+          pkg-config
+        ];
+
+        # Libraries linked into the resulting binaries.
+        buildInputs = with pkgs; [
+          openssl
+          sqlite
+        ];
+      };
+
+      mysqlHelper = mkPoetryApplication {
+        projectDir = ./datman-helper-mysql;
+      };
+
+      postgresHelper = mkPoetryApplication {
+        projectDir = ./datman-helper-postgres;
+      };
+
+      # We want to produce a package with all of these together, with wrappers that let them
+      # refer to each other by name (i.e. have each other on the path).
+      # Datman needs the helpers on the path.
+      # The helpers need lz4 on the path.
+      # Symlinks every binary into one bin/ directory, then wraps each of our own
+      # programs so that this directory is appended to its PATH.
+      allInOne = pkgs.stdenv.mkDerivation {
+        name = "datman-aio";
+
+        # Nothing to unpack or build: the derivation only assembles symlinks.
+        src = "${pkgs.emptyDirectory}";
+
+        # makeWrapper runs at build time, so it is a native build input.
+        nativeBuildInputs = [ pkgs.makeWrapper ];
+
+        # A custom installPhase replaces the default one, so the pre/post hooks
+        # have to be invoked explicitly.
+        installPhase = ''
+          # set -eu
+          runHook preInstall
+          mkdir $out $out/bin
+          ln -s ${rustComponents}/bin/{yama,datman} $out/bin
+          ln -s ${mysqlHelper}/bin/datman-helper-mysql-{backup,restore} $out/bin
+          ln -s ${postgresHelper}/bin/datman-helper-postgres-{backup,restore} $out/bin
+          ln -s ${pkgs.lz4}/bin/lz4 $out/bin/
+          runHook postInstall
+        '';
+
+        # lz4 is linked into bin/ but not wrapped; only our own programs need the
+        # extended PATH.
+        postInstall = ''
+          # set -eu
+          for fn in $out/bin/{datman,yama,datman-helper-{mysql,postgres}-{backup,restore}}; do
+            wrapProgram "$fn" --suffix PATH : $out/bin
+          done
+        '';
+      };
+    in rec {
+      # `nix build`
+      packages.yama = allInOne;
+      packages.default = packages.yama;
+      # Legacy alias: Nix < 2.7 looks up `defaultPackage.<system>` instead of
+      # `packages.<system>.default`.
+      defaultPackage = packages.yama;
+
+      # NixOS Modules (currently disabled)
+      # nixosModules = {
+      #   yama = import ./nixos_modules/yama.nix self;
+      # };
+
+      # `nix run`
+      apps.yama = utils.lib.mkApp {
+        drv = rustComponents;
+      };
+      apps.default = apps.yama;
+      # Legacy alias for Nix < 2.7 (superseded by `apps.<system>.default`).
+      defaultApp = apps.yama;
+
+      # `nix develop`
+      devShells.default = pkgs.mkShell {
+        nativeBuildInputs = with pkgs; [ rustc cargo ];
+      };
+      # Legacy alias for Nix < 2.7 (superseded by `devShells.<system>.default`).
+      devShell = devShells.default;
+    });
+}
--- a/scripts-dev/lint.sh
+++ b/scripts-dev/lint.sh
@ -4,7 +4,7 @@ if [ $# -ge 1 ]
 then
  files=$*
 else
-  files="testsuite/setup.py testsuite/datmantests testsuite/helpers testsuite/yamatests datman-helper-postgres/datman_helper_postgres datman-helper-postgres/setup.py datman-helper-mysql/datman_helper_mysql datman-helper-mysql/setup.py"
+  files="testsuite/setup.py testsuite/datmantests testsuite/helpers testsuite/yamatests datman-helper-postgres/datman_helper_postgres datman-helper-mysql/datman_helper_mysql"
 fi

 echo "Linting these locations: $files"
--- a/shell.nix
+++ b/shell.nix
@ -0,0 +1,50 @@
+{ pkgs ? import <nixpkgs> {} }:
+
+let
+  # We may need some packages from nixpkgs-unstable
+  #unstable = import <nixpkgs-unstable> {};
+
+  rust-toolchain = pkgs.symlinkJoin {
+    name = "rust-toolchain";
+    paths = [pkgs.rustc pkgs.cargo pkgs.rustfmt pkgs.rustPlatform.rustcSrc];
+  };
+in
+
+pkgs.mkShell {
+
+  buildInputs = [
+    rust-toolchain
+
+    pkgs.pkg-config
+
+    pkgs.alsa-lib
+    pkgs.sqlite
+    #pkgs.libclang # ??
+  ];
+
+  nativeBuildInputs = [
+    pkgs.openssl
+    pkgs.python3
+  ];
+
+  # Needed for bindgen when binding to avahi
+  LIBCLANG_PATH="${pkgs.llvmPackages_latest.libclang.lib}/lib";
+
+  # Cargo culted:
+  # Add to rustc search path
+  RUSTFLAGS = (builtins.map (a: ''-L ${a}/lib'') [
+  ]);
+  # Add to bindgen search path
+  BINDGEN_EXTRA_CLANG_ARGS =
+    # Includes with normal include path
+    (builtins.map (a: ''-I"${a}/include"'') [
+    ])
+    # Includes with special directory paths
+    ++ [
+      ''-I"${pkgs.llvmPackages_latest.libclang.lib}/lib/clang/${pkgs.llvmPackages_latest.libclang.version}/include"''
+      #''-I"${pkgs.glib.dev}/include/glib-2.0"''
+      #''-I${pkgs.glib.out}/lib/glib-2.0/include/''
+    ];
+
+
+}
--- a/testsuite/datmantests/test_backup_and_extract.py
+++ b/testsuite/datmantests/test_backup_and_extract.py
@ -251,7 +251,8 @@ kind = {{ stdout = "blahblah.txt" }}
        seed = 7555
        print(f"seed: {seed}")
        rng.seed(seed)
-        # min_files is 8 because we need enough files to use each label for this test to succeed.
+        # min_files is 8 because we need enough files to use each label for this
+        # test to succeed.
        initial_descriptor, _ = generate_random_dir(rng, src_path, 32, min_files=8)
        labellings = generate_labels(initial_descriptor, rng)
        save_labelling_rules(labelling_path.joinpath("srca.zst"), labellings)
@ -298,3 +299,81 @@ kind = {{ stdout = "blahblah.txt" }}
            )

        td.cleanup()
+
+    def test_backup_incremental_with_mid_delete(self):
+        td = TemporaryDirectory("test_backup_incremental_with_mid_delete")
+        tdpath = Path(td.name)
+
+        datman_path = tdpath.joinpath("datman")
+        src_path = datman_path.joinpath("srca")
+        yama_path = datman_path.joinpath("main")
+
+        set_up_simple_datman(datman_path)
+        set_up_simple_yama(yama_path)
+
+        rng = Random()
+        seed = rng.randint(0, 9001)
+        print(f"seed: {seed}")
+        rng.seed(seed)
+        initial_descriptor, _ = generate_random_dir(rng, src_path, 32)
+
+        print("storing")
+        subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)
+
+        # now mutate and store incremental
+        randomly_mutate_directory_in_descriptor(initial_descriptor, src_path, rng)
+        time.sleep(2)
+        subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)
+
+        # now mutate and store incremental again!
+        randomly_mutate_directory_in_descriptor(initial_descriptor, src_path, rng)
+        mutated_descriptor = scan_dir(src_path)
+        time.sleep(2)
+        subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)
+
+        pointer_names = [
+            line
+            for line in subprocess.check_output(("yama", "debug", "lsp"), cwd=yama_path)
+            .decode()
+            .split("\n")
+            if line
+        ]
+        self.assertEqual(len(pointer_names), 3)
+        self.assertLess(pointer_names[0], pointer_names[1])
+        self.assertLess(pointer_names[1], pointer_names[2])
+
+        print(f"removing mid pointer {pointer_names[1]}")
+        subprocess.check_call(
+            ("yama", "debug", "rmp", pointer_names[1]),
+            cwd=yama_path,
+        )
+
+        print("extracting last pointer to check still valid")
+        dest_path = tdpath.joinpath("desta")
+        subprocess.check_call(
+            (
+                "datman",
+                "extract",
+                "--skip-metadata",
+                "--accept-partial",
+                "main",
+                "../desta",
+            ),
+            cwd=datman_path,
+        )
+
+        # this will be wrapped in a directory that starts with the name srca+
+        extracted_dir_descriptor_wrapper = scan_dir(dest_path)
+
+        contents = extracted_dir_descriptor_wrapper.contents
+        self.assertEqual(len(contents), 1)
+        key, value = next(iter(contents.items()))
+        self.assertTrue(key.startswith("srca+"))
+
+        self.assertIsInstance(value, DirectoryDescriptor)
+        key, value = next(iter(value.contents.items()))
+        self.assertEqual(key, "srca")
+
+        self.assertEqual(value.ignore_metadata(), mutated_descriptor.ignore_metadata())
+
+        td.cleanup()
--- a/testsuite/helpers/yama_helpers.py
+++ b/testsuite/helpers/yama_helpers.py
@ -1,6 +1,7 @@
 import shutil
 import subprocess
 from pathlib import Path
+from typing import Set


 def set_up_simple_yama(path: Path):
@ -10,3 +11,13 @@ def set_up_simple_yama(path: Path):
        "example_zstd.dict"
    )
    shutil.copyfile(example_zstd_path, path.joinpath("important_zstd.dict"))
+
+
+def list_bloblog_ids(pile: Path) -> Set[int]:
+    """Collects the integer-named entries of the pile's `bloblog` directory.
+
+    Every entry directly inside `<pile>/bloblog` whose name parses as an `int`
+    contributes that integer; entries with any other name are ignored.
+    """
+    bloblog_dir = pile.joinpath("bloblog")
+    found_ids = set()
+    for entry in bloblog_dir.iterdir():
+        try:
+            bloblog_id = int(entry.name)
+        except ValueError:
+            # Not a bloblog ID; skip it.
+            continue
+        found_ids.add(bloblog_id)
+    return found_ids
--- a/testsuite/setup.py
+++ b/testsuite/setup.py
@ -22,7 +22,7 @@ REQUIRED = ["green", "attrs", "immutabledict"]


 # What packages are optional?
-EXTRAS = {"dev": ["black==21.7b0", "flake8==3.9.2", "isort==5.9.2"]}
+EXTRAS = {"dev": ["black==22.10.0", "flake8==3.9.2", "isort==5.9.2"]}

 # The rest you shouldn't have to touch too much :)
 # ------------------------------------------------
--- a/testsuite/yamatests/test_compact.py
+++ b/testsuite/yamatests/test_compact.py
@ -0,0 +1,175 @@
+import subprocess
+from pathlib import Path
+from random import Random
+from tempfile import TemporaryDirectory
+from unittest import TestCase
+
+from helpers import (
+    DirectoryDescriptor,
+    generate_random_dir,
+    randomly_mutate_directory_in_descriptor,
+    scan_dir,
+)
+from helpers.datman_helpers import set_up_simple_datman
+from helpers.yama_helpers import list_bloblog_ids, set_up_simple_yama
+
+
+class TestYamaCompact(TestCase):
+    """End-to-end tests for `yama compact`, driven via the datman and yama CLIs."""
+
+    def _gc_and_compact_for_reclaim(self, yama_path):
+        """Applies a shallow-check GC, then asks `yama compact` to reclaim space."""
+        subprocess.check_call(
+            ("yama", "check", "--shallow", "--apply-gc"), cwd=yama_path
+        )
+        # --mergeable 2000 should keep the merge criterion out of the way here,
+        # while --reclaim 1 / --max-dealloc 1 set both byte thresholds to 1.
+        subprocess.check_call(
+            (
+                "yama",
+                "compact",
+                "--mergeable",
+                "2000",
+                "--reclaim",
+                "1",
+                "--max-dealloc",
+                "1",
+            ),
+            cwd=yama_path,
+        )
+
+    def test_compaction_merge_two_small_bloblogs(self):
+        """Two backups should leave several bloblogs; merging must leave just one."""
+        td = TemporaryDirectory("test_compaction_merge_two_small_bloblogs")
+        # Remove the scratch directory even if a command or an assertion fails.
+        self.addCleanup(td.cleanup)
+        tdpath = Path(td.name)
+
+        datman_path = tdpath.joinpath("datman")
+        src_path = datman_path.joinpath("srca")
+        yama_path = datman_path.joinpath("main")
+
+        set_up_simple_datman(datman_path)
+        set_up_simple_yama(yama_path)
+
+        # Pick a seed and print it so that a failing run can be reproduced.
+        rng = Random()
+        seed = rng.randint(0, 9001)
+        print(f"seed: {seed}")
+        rng.seed(seed)
+        generate_random_dir(rng, src_path, 32)
+
+        # Back up twice: that way we should get at least two bloblogs!
+        subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)
+        subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)
+        old_bloblog_ids = list_bloblog_ids(yama_path)
+        self.assertGreater(
+            len(old_bloblog_ids), 1, "Should be many bloblogs at this point"
+        )
+
+        # Count anything under 2 GiB as small; merge once 2 small bloblogs exist.
+        subprocess.check_call(
+            (
+                "yama",
+                "compact",
+                "--mergeable",
+                "2",
+                "--small",
+                str(2 * 1024 * 1024 * 1024),
+            ),
+            cwd=yama_path,
+        )
+
+        new_bloblog_ids = list_bloblog_ids(yama_path)
+        self.assertEqual(
+            len(new_bloblog_ids), 1, "Should only be 1 bloblog at this point."
+        )
+        self.assertEqual(
+            list(new_bloblog_ids)[0],
+            max(old_bloblog_ids) + 1,
+            "New bloblog ID should be 1 greater than the max old one.",
+        )
+
+    def test_gc_then_compact(self):
+        """Compaction must be a no-op until a pointer is removed and GC'd."""
+        td = TemporaryDirectory("test_gc_then_compact")
+        # Remove the scratch directory even if a command or an assertion fails.
+        self.addCleanup(td.cleanup)
+        tdpath = Path(td.name)
+
+        datman_path = tdpath.joinpath("datman")
+        src_path = datman_path.joinpath("srca")
+        yama_path = datman_path.joinpath("main")
+
+        set_up_simple_datman(datman_path)
+        set_up_simple_yama(yama_path)
+
+        # Pick a seed and print it so that a failing run can be reproduced.
+        rng = Random()
+        seed = rng.randint(0, 9001)
+        print(f"seed: {seed}")
+        rng.seed(seed)
+        initial_descriptor, _ = generate_random_dir(rng, src_path, 32)
+
+        subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)
+        # Remember the first pointer listed so that it can be removed later on.
+        orig_pointer_name = (
+            subprocess.check_output(("yama", "debug", "lsp"), cwd=yama_path)
+            .decode()
+            .split("\n")[0]
+        )
+
+        randomly_mutate_directory_in_descriptor(initial_descriptor, src_path, rng)
+        mutated_descriptor = scan_dir(src_path)
+
+        subprocess.check_call(("datman", "backup-one", "srca", "main"), cwd=datman_path)
+
+        old_bloblog_ids = list_bloblog_ids(yama_path)
+
+        # Try a GC and check that it's a no-op
+        self._gc_and_compact_for_reclaim(yama_path)
+
+        unchanged_bloblog_ids = list_bloblog_ids(yama_path)
+        self.assertEqual(
+            old_bloblog_ids,
+            unchanged_bloblog_ids,
+            "No GC: no compaction should have happened.",
+        )
+
+        subprocess.check_call(
+            ("yama", "debug", "rmp", orig_pointer_name), cwd=yama_path
+        )
+
+        # Try a GC and check that it did something
+        self._gc_and_compact_for_reclaim(yama_path)
+
+        new_bloblog_ids = list_bloblog_ids(yama_path)
+        self.assertNotEqual(
+            old_bloblog_ids, new_bloblog_ids, "GC: compaction should have happened."
+        )
+
+        # Check that we can still extract the files!
+        dest_path = tdpath.joinpath("desta")
+        subprocess.check_call(
+            (
+                "datman",
+                "extract",
+                "--skip-metadata",
+                "--accept-partial",
+                "main",
+                "../desta",
+            ),
+            cwd=datman_path,
+        )
+
+        # The extraction is wrapped in a directory whose name starts with srca+
+        extracted_dir_descriptor_wrapper = scan_dir(dest_path)
+
+        contents = extracted_dir_descriptor_wrapper.contents
+        self.assertEqual(len(contents), 1)
+        key, value = next(iter(contents.items()))
+        self.assertTrue(key.startswith("srca+"))
+
+        self.assertIsInstance(value, DirectoryDescriptor)
+        key, value = next(iter(value.contents.items()))
+        self.assertEqual(key, "srca")
+
+        self.assertEqual(value.ignore_metadata(), mutated_descriptor.ignore_metadata())
--- a/yama/Cargo.toml
+++ b/yama/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "yama"
-version = "0.5.0-alpha.1"
+version = "0.6.0-alpha.5"
 authors = ["Olivier 'reivilibre' <olivier@librepush.net>"]
 edition = "2018"
 description = "Deduplicated, compressed and encrypted content pile manager"
@ -11,17 +11,16 @@ license = "GPL-3.0-or-later"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
-fastcdc = "1.0.2"
-zstd = "0.6.0"  # 0.6.0+zstd.1.4.8
-sshish = "0.1.0"
-clap = "= 3.0.0-beta.5"
-blake = "2.0.0"
+fastcdc = "1.0.6"
+zstd = "0.11.2"  # 0.11.2+zstd.1.5.2
+clap = { version = "3.1.18", features = ["derive"] }
+blake = "2.0.2"
 twox-hash = "1.5.0"
 serde = { version = "1.0.104", features = ["derive"] }
 serde_bare = "0.3.0"
 users = "0.9.1"
-crossbeam-channel = "0.4"
-crossbeam-utils = "0.8.1"
+crossbeam-channel = "0.5.1"
+crossbeam-utils = "0.8.5"
 toml = "0.5.5"
 glob = "0.3.0"
 nix = "0.17.0"
@ -38,6 +37,8 @@ rayon = "1.5.0"
 rusqlite = "0.24.2"
 chrono = "0.4.19"
 rustyline = "7.1.0"
+derivative = "2.2.0"
+metrics = "0.17.1"


 [dev-dependencies]
--- a/yama/src/bin/yama.rs
+++ b/yama/src/bin/yama.rs
@ -18,21 +18,24 @@ along with Yama.  If not, see <https://www.gnu.org/licenses/>.
 use std::path::{Path, PathBuf};

 use anyhow::{bail, Context};
-use clap::{crate_authors, crate_description, crate_version, Parser};
 use log::info;

+use clap::Parser;
 use env_logger::Env;
 use std::sync::Arc;
 use yama::commands::{fully_integrate_pointer_node, load_pile_descriptor, open_pile};
 use yama::debug::{debug_command, DebugCommand};
 use yama::operations::checking::VacuumMode;
-use yama::operations::pushpull::{determine_bypass_level, open_pile_with_work_bypass, push_to};
-use yama::operations::{checking, extracting};
+use yama::operations::legacy_pushpull::{
+    determine_bypass_level, open_pile_with_work_bypass, push_to,
+};
+use yama::operations::{checking, cleanup, extracting};
+use yama::pile::local_sqlitebloblogs::CompactionThresholds;
 use yama::pile::{Pile, PileDescriptor, RawPile};
 use yama::{commands, debug};

 #[derive(Parser)]
-#[clap(version = crate_version!(), author = crate_authors!(), about = crate_description!())]
+#[clap(version = env!("CARGO_PKG_VERSION"), author = env!("CARGO_PKG_AUTHORS"), about = env!("CARGO_PKG_DESCRIPTION"))]
 struct Opts {
    /// Chooses a different pile to be the working pile.
    /// If specified, must be the name of a remote in yama.toml.
@ -54,8 +57,9 @@ enum PileCommand {
        pointer_name: String,

        /// Limited expression(s) of files to retrieve.
+        /// LIMITATION OF CURRENT VERSION: ONLY ONE EXACT PATH ALLOWED, PLEASE.
        #[clap(short, long)]
-        subset: Vec<PathBuf>,
+        subset: Option<String>,

        destination: PathBuf,

@ -80,6 +84,29 @@ enum PileCommand {
        shallow: bool,
    },

+    Compact {
+        /// Don't actually perform any compaction; just plan it out.
+        #[clap(long)]
+        dry_run: bool,
+
+        /// Allocated size under which a bloblog is considered small.
+        #[clap(long = "small")]
+        small_thresh: Option<u64>,
+
+        /// Minimum amount of space to reclaim in order to run compaction for reclaim.
+        #[clap(long = "reclaim")]
+        min_reclaim: Option<u64>,
+
+        /// Maximum amount of space that can be deallocated in a bloblog before we consider it
+        /// worthwhile to replace.
+        #[clap(long = "max-dealloc")]
+        max_deallocated: Option<u64>,
+
+        /// Minimum number of mergeable small bloblogs in order to run compaction for merge.
+        #[clap(long)]
+        mergeable: Option<u32>,
+    },
+
    /// Enter a debug prompt for manually operating on the yama pile.
    Debug { supplied_command: Vec<String> },

@ -134,10 +161,25 @@ fn wrapped_main() -> anyhow::Result<i32> {

            fully_integrate_pointer_node(&pile, &mut root_tree_node.node, &mut pointer)?;

+            let mut node_to_extract = &mut root_tree_node.node;
+
+            if let Some(subset) = subset {
+                for path_to_descend in subset.split('/').filter(|s| !s.is_empty()) {
+                    match node_to_extract.child(path_to_descend) {
+                        Ok(new_node) => {
+                            node_to_extract = new_node;
+                        }
+                        Err(msg) => {
+                            bail!("Can't descend into {path_to_descend:?}: {msg}");
+                        }
+                    }
+                }
+            }
+
            // todo allow disabling apply metadata
            extracting::extract(
                destination,
-                &mut root_tree_node.node,
+                node_to_extract,
                &pile,
                true,
                workers.unwrap_or(2),
@ -173,6 +215,29 @@ fn wrapped_main() -> anyhow::Result<i32> {
                return Ok(1);
            }
        }
+        PileCommand::Compact {
+            dry_run,
+            small_thresh,
+            min_reclaim,
+            max_deallocated,
+            mergeable,
+        } => {
+            let this_dir = Path::new(".");
+            let descriptor =
+                load_pile_descriptor(this_dir).context("Failed to load pile descriptor")?;
+            cleanup::compact(
+                this_dir,
+                &descriptor,
+                !*dry_run,
+                true,
+                CompactionThresholds {
+                    minimum_to_reclaim: min_reclaim.unwrap_or(2 * 1024 * 1024 * 1024),
+                    minimum_small_bloblogs_to_merge: mergeable.unwrap_or(64),
+                    cond_if_more_deallocated_than: max_deallocated.unwrap_or(256 * 1024 * 1024),
+                    cond_if_less_allocated_than: small_thresh.unwrap_or(64 * 1024 * 1024),
+                },
+            )?;
+        }
        PileCommand::Init {} => {
            commands::init(".".as_ref())?;
        }
--- a/yama/src/chunking.rs
+++ b/yama/src/chunking.rs
@ -35,27 +35,44 @@ pub const FASTCDC_AVG: usize = 1024 * 1024;
 // 8 MiB
 pub const FASTCDC_MAX: usize = 8 * 1024 * 1024;

+/// A destination to which chunks, already labelled with their chunk ID, can be submitted.
+///
+/// The `Sync` supertrait bound means every implementor must be shareable between threads.
+pub trait ChunkSubmissionTarget: Sync {
+    /// Submits the bytes `chunk_data` under the identifier `chunk_id`.
+    fn submit(&self, chunk_id: ChunkId, chunk_data: &[u8]) -> anyhow::Result<()>;
+}
+
+/// A pile is a submission target: submitting simply forwards to `Pile::submit_chunk`.
+impl<RP: RawPile> ChunkSubmissionTarget for Pile<RP> {
+    /// Passes `chunk_id` and `chunk_data` straight through to `submit_chunk`.
+    fn submit(&self, chunk_id: ChunkId, chunk_data: &[u8]) -> anyhow::Result<()> {
+        self.submit_chunk(chunk_id, chunk_data)
+    }
+}
+
+/// A channel sender is a submission target: each chunk is copied and sent down the channel.
+impl ChunkSubmissionTarget for crossbeam_channel::Sender<(ChunkId, Vec<u8>)> {
+    /// Sends `(chunk_id, owned copy of chunk_data)` on the channel.
+    ///
+    /// Any send failure is reported as a generic "Failed to send to pipeline." error.
+    fn submit(&self, chunk_id: ChunkId, chunk_data: &[u8]) -> anyhow::Result<()> {
+        // The channel needs to own the bytes, so copy them out of the borrowed slice.
+        let owned_data = chunk_data.to_vec();
+        match self.send((chunk_id, owned_data)) {
+            Ok(()) => Ok(()),
+            Err(_) => Err(anyhow::anyhow!("Failed to send to pipeline.")),
+        }
+    }
+}
+
 /// A chunker that will generate nested chunks of chunk references if there is that much data
 /// to store.
 /// The root RecursiveChunker is fed data bytes.
 /// If it exceeds the nominated threshold, it grows a child RecursiveChunker (which may do the same).
 /// When done, `finish` should be called to flush the buffers and obtain a `RecursiveChunkRef`.
-pub struct RecursiveChunker<'pile, RP: RawPile> {
+pub struct RecursiveChunker<'cst, CST: ChunkSubmissionTarget> {
    /// The pile to submit chunks to.
-    pile: &'pile Pile<RP>,
+    target: &'cst CST,
    /// Buffer of data at this level.
    buffer: Vec<u8>,
    /// The next-layer recursive chunker. This is where this chunker will submit chunk IDs to for
    /// recursive chunking.
-    next_layer: Option<Box<RecursiveChunker<'pile, RP>>>,
+    next_layer: Option<Box<RecursiveChunker<'cst, CST>>>,
    /// The size at which this chunker will perform recursive chunking.
    threshold: usize,
 }

-impl<'pile, RP: RawPile> RecursiveChunker<'pile, RP> {
-    pub fn new(threshold: usize, pile: &'pile Pile<RP>) -> Self {
+impl<'cst, CST: ChunkSubmissionTarget> RecursiveChunker<'cst, CST> {
+    pub fn new(threshold: usize, target: &'cst CST) -> Self {
        RecursiveChunker {
-            pile,
+            target,
            buffer: vec![],
            next_layer: None,
            threshold,
@ -73,10 +90,10 @@ impl<'pile, RP: RawPile> RecursiveChunker<'pile, RP> {
            let is_final = chunk.offset + chunk.length == self.buffer.len();
            if !is_final || finalise {
                consumed_until = Some(chunk.offset + chunk.length);
-                let chunk_id = self
-                    .pile
-                    .submit_chunk(&self.buffer[chunk.offset..chunk.offset + chunk.length])?;
+                let chunk_data = &self.buffer[chunk.offset..chunk.offset + chunk.length];
+                let chunk_id = calculate_chunkid(chunk_data);
                new_chunks.extend_from_slice(&chunk_id);
+                self.target.submit(chunk_id, chunk_data)?;
            }
        }

@ -101,7 +118,8 @@ impl<'pile, RP: RawPile> RecursiveChunker<'pile, RP> {
            Ok(rcr)
        } else {
            // no chunking, so depth=0 (raw) and just emit our unchunked data
-            let chunk_id = self.pile.submit_chunk(&self.buffer)?;
+            let chunk_id = calculate_chunkid(&self.buffer);
+            self.target.submit(chunk_id, &self.buffer)?;
            Ok(RecursiveChunkRef { chunk_id, depth: 0 })
        }
    }
@ -114,7 +132,7 @@ impl<'pile, RP: RawPile> RecursiveChunker<'pile, RP> {
                // start chunking
                self.next_layer = Some(Box::new(RecursiveChunker::new(
                    self.threshold,
-                    self.pile.clone(),
+                    self.target.clone(),
                )));
            }

@ -127,7 +145,7 @@ impl<'pile, RP: RawPile> RecursiveChunker<'pile, RP> {
    }
 }

-impl<'pile, RP: RawPile> Write for RecursiveChunker<'pile, RP> {
+impl<'cst, CST: ChunkSubmissionTarget> Write for RecursiveChunker<'cst, CST> {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        match self.write_impl(buf) {
            Err(err) => Err(io::Error::new(io::ErrorKind::Other, err)),
@ -143,6 +161,8 @@ impl<'pile, RP: RawPile> Write for RecursiveChunker<'pile, RP> {

 #[inline]
 pub fn calculate_chunkid(chunk: &[u8]) -> ChunkId {
+    // TODO(newver) Allow pluggable chunkID calculations so that encrypted storage can work without
+    //              leaking contents.
    let mut chunk_id: ChunkId = Default::default();
    blake::hash(256, &chunk, &mut chunk_id).expect("BLAKE problem");
    chunk_id
--- a/yama/src/commands.rs
+++ b/yama/src/commands.rs
@ -22,7 +22,6 @@ use std::path::Path;
 use std::sync::Arc;

 use anyhow::{anyhow, bail, Context};
-use clap::crate_version;
 use log::warn;

 use crate::chunking::{RecursiveChunker, RecursiveUnchunker, SENSIBLE_THRESHOLD};
@ -48,7 +47,7 @@ pub fn init(dir: &Path) -> anyhow::Result<()> {
    let mut file = File::create(yama_toml)?;

    let desc = PileDescriptor {
-        yama_version: crate_version!().to_owned(),
+        yama_version: env!("CARGO_PKG_VERSION").to_owned(),
        storage: PileStorage::SqliteIndexedBloblog,
        compression: Some(12),
    };
--- a/yama/src/debug.rs
+++ b/yama/src/debug.rs
@ -15,13 +15,12 @@ You should have received a copy of the GNU General Public License
 along with Yama.  If not, see <https://www.gnu.org/licenses/>.
 */

-use crate::commands::{fully_integrate_pointer_node, retrieve_tree_node, store_tree_node};
+use crate::commands::retrieve_tree_node;
 use crate::definitions::{FilesystemOwnership, FilesystemPermissions, TreeNode};
+use crate::operations::remove_pointer_safely;
 use crate::pile::{Pile, PileDescriptor, RawPile};
-use crate::tree::integrate_node_in_place;
 use anyhow::anyhow;
 use clap::Parser;
-use log::info;
 use rustyline::error::ReadlineError;
 use rustyline::Editor;

@ -46,6 +45,9 @@ pub enum DebugCommand {
        /// Name of the pointer to read.
        name: String,
    },
+    /// Reads statistics from the Pile
+    #[clap(name = "stats")]
+    Statistics {},
 }

 pub fn debug_prompt<RP: RawPile>(pdesc: PileDescriptor, pile: Pile<RP>) -> anyhow::Result<()> {
@ -120,50 +122,7 @@ pub fn debug_command<RP: RawPile>(
            }
        }
        DebugCommand::DeletePointer { name } => {
-            // retrieve this pointer
-            let mut this_pointer = pile.read_pointer(name.as_str())?.ok_or_else(|| {
-                anyhow!("Pointer {:?} does not exist so can not be deleted.", name)
-            })?;
-            let mut this_node = retrieve_tree_node(&pile, this_pointer.chunk_ref.clone())?;
-
-            // fully integrate the pointer
-            fully_integrate_pointer_node(&pile, &mut this_node.node, &mut this_pointer)?;
-            assert!(this_pointer.parent_pointer.is_none());
-
-            // now integrate any pointers that rely on this one
-            // so that they no longer rely on this one.
-            for pointer in pile.list_pointers()?.iter() {
-                if pointer == name {
-                    continue;
-                }
-                if let Some(mut pointer_data) = pile.read_pointer(pointer.as_str())? {
-                    if let Some(parent_pointer) = pointer_data.parent_pointer.as_ref() {
-                        if parent_pointer == name {
-                            info!("Pointer is now an orphan: {:?}", pointer);
-
-                            // need to integrate this node, so retrieve it
-                            let mut node = retrieve_tree_node(&pile, pointer_data.chunk_ref)?;
-
-                            // integrate it in-place
-                            integrate_node_in_place(&mut node.node, &this_node.node)?;
-
-                            // mark it as orphaned (no parent)
-                            pointer_data.parent_pointer = None;
-
-                            // store the orphaned node
-                            let new_chunk_ref = store_tree_node(&pile, &node)?;
-                            // associate the orphaned node with the orphaned pointer
-                            pointer_data.chunk_ref = new_chunk_ref;
-                            // write the pointer back.
-                            pile.write_pointer(pointer.as_str(), &pointer_data)?;
-                        }
-                    }
-                }
-            }
-
-            // then delete the pointer
-            pile.delete_pointer(name)?;
-            info!("Deleted pointer: {:?}", name);
+            remove_pointer_safely(pile, name)?;
        }
        DebugCommand::PointerInfo { name } => {
            let this_pointer = pile
@ -176,6 +135,22 @@ pub fn debug_command<RP: RawPile>(
            //eprintln!("{:#?}", this_node.node);
            tree_node_printer(&this_node.name, &this_node.node, 0);
        }
+        DebugCommand::Statistics { .. } => {
+            if let Some(stats) = pile.raw_pile.debug_statistics()? {
+                println!("Statistics for this pile");
+                println!("  chunk count: {} chunks", stats.number_of_chunks);
+                println!(
+                    "  total chunk stored space: {} bytes (may exclude deleted chunks)",
+                    stats.total_chunk_size
+                );
+                let average_size =
+                    ((stats.total_chunk_size as f64) / (stats.number_of_chunks as f64)) as u64;
+                println!("    (average chunk size: {} bytes)", average_size);
+            } else {
+                eprintln!("{:?}", pile);
+                eprintln!("Statistics appear not to be supported on this kind of pile?");
+            }
+        }
    }
    Ok(())
 }
--- a/yama/src/definitions.rs
+++ b/yama/src/definitions.rs
@ -35,6 +35,24 @@ pub struct PointerData {
    pub gid_lookup: BTreeMap<u16, Option<String>>,
 }

+/// Pointer data without a parent pointer: `complete` turns it into a full `PointerData`
+/// once the parent pointer (if any) is supplied.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PartialPointerData {
+    // Carried over unchanged into `PointerData::chunk_ref` by `complete`.
+    pub chunk_ref: RecursiveChunkRef,
+    // Carried over unchanged into `PointerData::uid_lookup` by `complete`.
+    pub uid_lookup: BTreeMap<u16, Option<String>>,
+    // Carried over unchanged into `PointerData::gid_lookup` by `complete`.
+    pub gid_lookup: BTreeMap<u16, Option<String>>,
+}
+
+impl PartialPointerData {
+    /// Consumes this partial record and builds a `PointerData` from it, using
+    /// `parent_pointer` for the one field that the partial record lacks.
+    pub fn complete(self, parent_pointer: Option<String>) -> PointerData {
+        let PartialPointerData {
+            chunk_ref,
+            uid_lookup,
+            gid_lookup,
+        } = self;
+
+        PointerData {
+            chunk_ref,
+            parent_pointer,
+            uid_lookup,
+            gid_lookup,
+        }
+    }
+}
+
 #[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
 pub struct RecursiveChunkRef {
    /// The root Chunk ID.
@ -252,6 +270,19 @@ impl TreeNode {
            }
        }
    }
+
+    /// Looks up the direct child called `name` and returns a mutable reference to it.
+    ///
+    /// Only directories have children; for every other kind of node, or when the
+    /// directory has no entry called `name`, a static reason string is returned as `Err`.
+    pub fn child(&mut self, name: &str) -> Result<&mut TreeNode, &'static str> {
+        // Rule out every non-directory node first.
+        let children = match self {
+            TreeNode::Directory { children, .. } => children,
+            TreeNode::NormalFile { .. } => return Err("not a directory: normal file"),
+            TreeNode::SymbolicLink { .. } => return Err("not a directory: symlink"),
+            TreeNode::Deleted => return Err("not a directory: deleted"),
+        };
+
+        children.get_mut(name).ok_or("child not in directory")
+    }
 }

 #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
--- a/yama/src/operations.rs
+++ b/yama/src/operations.rs
@ -1,4 +1,80 @@
+use crate::commands::{fully_integrate_pointer_node, retrieve_tree_node, store_tree_node};
+use crate::pile::{Pile, RawPile};
+use crate::tree::{differentiate_node_in_place, integrate_node_in_place};
+use anyhow::{anyhow, Context};
+use log::info;
+
 pub mod checking;
+pub mod cleanup;
 pub mod extracting;
-pub mod pushpull;
+pub mod legacy_pushpull;
 pub mod storing;
+
+/// Deletes the pointer `name` from `pile` without breaking the pointers that use it as
+/// their parent.
+///
+/// Before the deletion, every other pointer whose `parent_pointer` is `name` is rewritten:
+/// its tree is integrated against the fully-integrated tree of `name`, then (if `name`
+/// itself had a parent) differentiated against that parent's fully-integrated tree, and its
+/// `parent_pointer` is set to whatever parent `name` had. The pile is flushed after each
+/// rewritten pointer.
+///
+/// Returns an error if `name` or its recorded parent pointer cannot be read, or if any of
+/// the retrieve, integrate, differentiate, store, flush or delete steps fails.
+pub fn remove_pointer_safely<P: RawPile>(pile: &Pile<P>, name: &str) -> anyhow::Result<()> {
+    // retrieve this pointer
+    let mut this_pointer = pile
+        .read_pointer(name)?
+        .ok_or_else(|| anyhow!("Pointer {:?} does not exist so can not be deleted.", name))?;
+    let mut this_node = retrieve_tree_node(&pile, this_pointer.chunk_ref.clone())
+        .context("retrieving 'this' node")?;
+
+    // Remember the parent now: it is what the dependent pointers will be re-parented onto.
+    let new_parent_name = this_pointer.parent_pointer.clone();
+    fully_integrate_pointer_node(pile, &mut this_node.node, &mut this_pointer)
+        .context("integrating new parent")?;
+
+    // Load and fully integrate the new parent (if any), so that dependants can be
+    // differentiated against it further down.
+    let new_parent = if let Some(ref new_parent_name) = new_parent_name {
+        let mut new_parent_pointer = pile
+            .read_pointer(new_parent_name.as_str())?
+            // Report the parent that was looked up, not the pointer being deleted.
+            .ok_or_else(|| anyhow!("Parent pointer {:?} does not exist.", new_parent_name))?;
+        let mut new_parent_node = retrieve_tree_node(&pile, new_parent_pointer.chunk_ref.clone())?;
+        fully_integrate_pointer_node(pile, &mut new_parent_node.node, &mut new_parent_pointer)?;
+        Some((new_parent_pointer, new_parent_node))
+    } else {
+        None
+    };
+
+    // now integrate any pointers that rely on this one
+    // so that they no longer rely on this one.
+    for pointer in pile.list_pointers()?.iter() {
+        if pointer == name {
+            continue;
+        }
+        if let Some(mut pointer_data) = pile.read_pointer(pointer.as_str())? {
+            if let Some(parent_pointer) = pointer_data.parent_pointer.as_ref() {
+                if parent_pointer == name {
+                    info!("Pointer would be orphaned: {:?}; integrating", pointer);
+
+                    // need to integrate this node, so retrieve it
+                    let mut node = retrieve_tree_node(&pile, pointer_data.chunk_ref)?;
+
+                    // integrate it in-place
+                    integrate_node_in_place(&mut node.node, &this_node.node)?;
+
+                    if let Some((_, ref new_parent_node)) = new_parent {
+                        // then differentiate with respect to the NEW parent
+                        differentiate_node_in_place(&mut node.node, &new_parent_node.node)?;
+                    }
+
+                    // pass through the parent
+                    pointer_data.parent_pointer = new_parent_name.clone();
+
+                    // store the updated version of the pointer
+                    let new_chunk_ref = store_tree_node(&pile, &node)?;
+                    // associate the new node with the new version of the pointer
+                    pointer_data.chunk_ref = new_chunk_ref;
+                    // write the pointer back.
+                    pile.write_pointer(pointer.as_str(), &pointer_data)?;
+
+                    // we must flush chunks before deleting the pointer
+                    pile.flush()
+                        .context("flushing after writing pointer back")?;
+                }
+            }
+        }
+    }
+
+    // then delete the pointer
+    pile.delete_pointer(name)?;
+    info!("Deleted pointer: {:?}", name);
+    Ok(())
+}
--- a/yama/src/operations/checking.rs
+++ b/yama/src/operations/checking.rs
@ -18,11 +18,16 @@ along with Yama.  If not, see <https://www.gnu.org/licenses/>.
 use crate::chunking::RecursiveUnchunker;
 use crate::commands::retrieve_tree_node;
 use crate::definitions::{ChunkId, TreeNode};
-use crate::pile::{Keyspace, Pile, RawPile};
+use crate::pile::{
+    ControllerMessage, Keyspace, Pile, PipelineDescription, RawPile, StoragePipelineSettings,
+};
 use anyhow::bail;
+use crossbeam_channel::Sender;
 use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
+use itertools::Itertools;
 use log::{error, info, warn};
 use std::collections::HashSet;
+use std::convert::TryInto;
 use std::io::{Read, Write};
 use std::sync::Mutex;

@ -48,6 +53,7 @@ impl Write for NullWriter {
 /// Mark-and-sweep style vacuuming system.
 /// We mark all the chunks that we run into (following the structure of all the pointers and
 /// recursive chunk references) and sweep the chunks that have not been read.
+#[derive(Debug)]
 pub struct VacuumRawPile<RP: RawPile> {
    underlying: RP,
    vacuum_tracking_enabled: bool,
@ -106,6 +112,10 @@ impl<RP: RawPile> RawPile for VacuumRawPile<RP> {
        self.underlying.delete(kind, key)
    }

+    /// Forwards a batched delete of `keys` in keyspace `kind` to the underlying pile.
+    fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
+        self.underlying.delete_many(kind, keys)
+    }
+
    fn list_keys(
        &self,
        kind: Keyspace,
@ -120,6 +130,23 @@ impl<RP: RawPile> RawPile for VacuumRawPile<RP> {
    fn check_lowlevel(&self) -> anyhow::Result<bool> {
        self.underlying.check_lowlevel()
    }
+
+    /// Forwards pipeline construction to the underlying pile, passing `settings` and
+    /// `controller_send` through unchanged and returning the sender it produces.
+    fn build_storage_pipeline(
+        &self,
+        settings: StoragePipelineSettings,
+        controller_send: Sender<ControllerMessage>,
+    ) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
+        self.underlying
+            .build_storage_pipeline(settings, controller_send)
+    }
+
+    /// Returns the underlying pile's pipeline description as-is; this wrapper adds no
+    /// entry of its own.
+    fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
+        self.underlying.describe_pipeline()
+    }
+
+    /// Returns the underlying pile's transfer-ordering hint for `chunk_id` unchanged.
+    fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
+        self.underlying.chunk_id_transfer_ordering_hint(chunk_id)
+    }
 }

 /// Runs a full check of a Yama pile. This reads ALL the chunks, which can take a long time.
@ -386,9 +413,21 @@ pub fn check_shallow<RP: RawPile>(

            // actually do the vacuum!
            info!("Going to vacuum them up.");
-            for vacuum_id in to_vacuum {
-                pile.raw_pile.delete(Keyspace::Chunk, &vacuum_id)?;
-                pbar.inc(1);
+            for vacuum_ids_chunk in to_vacuum
+                .into_iter()
+                .chunks(512)
+                .into_iter()
+                .map(|c| c.collect::<Vec<ChunkId>>())
+            {
+                pile.raw_pile.delete_many(
+                    Keyspace::Chunk,
+                    vacuum_ids_chunk
+                        .iter()
+                        .map(|ci| ci.as_slice())
+                        .collect::<Vec<&[u8]>>()
+                        .as_slice(),
+                )?;
+                pbar.inc(vacuum_ids_chunk.len().try_into().unwrap());
            }
            pile.flush()?;
            pbar.finish_and_clear();
--- a/yama/src/operations/cleanup.rs
+++ b/yama/src/operations/cleanup.rs
@ -0,0 +1,64 @@
+use crate::pile::local_sqlitebloblogs::{CompactionThresholds, SqliteBloblogPile};
+use crate::pile::{PileDescriptor, PileStorage};
+use anyhow::{bail, Context};
+use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
+use log::info;
+use std::path::Path;
+
+/// Runs (or, when `actually_run` is false, only plans) compaction of the pile at `pile_path`.
+///
+/// A progress bar is drawn to stdout when `make_progress_bar` is set; otherwise a hidden
+/// one is used. Only `PileStorage::SqliteIndexedBloblog` piles can be compacted; for
+/// `PileStorage::RemoteOnly` an error is returned.
+pub fn compact(
+    pile_path: &Path,
+    pile_desc: &PileDescriptor,
+    actually_run: bool,
+    make_progress_bar: bool,
+    thresholds: CompactionThresholds,
+) -> anyhow::Result<()> {
+    let bar_style = ProgressStyle::default_bar()
+        .template("[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}");
+
+    let pbar = if !make_progress_bar {
+        ProgressBar::hidden()
+    } else {
+        ProgressBar::with_draw_target(1000, ProgressDrawTarget::stdout_with_hz(10))
+    };
+    pbar.set_style(bar_style);
+    pbar.set_message("compacting");
+
+    match pile_desc.storage {
+        PileStorage::SqliteIndexedBloblog => {
+            let bloblog_pile = SqliteBloblogPile::open(pile_path)
+                .context("Failed to open SQLite-indexed Bloblog Pile")?;
+            compact_bloblogs(bloblog_pile, pbar, actually_run, thresholds)
+        }
+        other @ PileStorage::RemoteOnly => {
+            bail!("Cannot use compaction on this kind of pile: {other:?}!");
+        }
+    }
+}
+
+/// Analyses `bloblog_pile`, logs a summary and a compaction plan built from `thresholds`,
+/// and performs that plan (reporting on `pbar`) only when `actually_run` is set.
+fn compact_bloblogs(
+    bloblog_pile: SqliteBloblogPile,
+    pbar: ProgressBar,
+    actually_run: bool,
+    thresholds: CompactionThresholds,
+) -> anyhow::Result<()> {
+    info!("=== Analysing for compaction ===");
+    let analysis = bloblog_pile.analyse_for_compaction()?;
+
+    // Total up the per-bloblog statistics in a single pass.
+    let mut chunks_total: u64 = 0;
+    let mut chunks_deleted: u64 = 0;
+    let mut bytes_total: u64 = 0;
+    let mut bytes_deleted: u64 = 0;
+    for bs in analysis.values() {
+        chunks_total += bs.chunks_total;
+        chunks_deleted += bs.chunks_deleted;
+        bytes_total += bs.bytes_total;
+        bytes_deleted += bs.bytes_deleted;
+    }
+
+    info!("{} bloblogs in this pile, with {chunks_total} chunks ({bytes_total} B) of which {chunks_deleted} ({bytes_deleted} B) are deleted.", analysis.len());
+
+    info!("=== Planning compaction ===");
+    let plan = bloblog_pile.plan_compaction(&thresholds, analysis)?;
+    info!(
+        "Planned compaction: replace {} bloblogs (of which {} are small), freeing up {} B and rewriting {} B",
+        plan.bloblogs_to_replace.len(),
+        plan.small_bloblogs,
+        plan.reclaimable_space,
+        plan.bytes_to_write
+    );
+
+    // A dry run stops once the plan has been logged.
+    if !actually_run {
+        return Ok(());
+    }
+
+    info!("=== Compacting ===");
+    bloblog_pile.perform_compaction(Box::new(pbar), plan)?;
+    Ok(())
+}
--- a/yama/src/operations/legacy_pushpull.rs
+++ b/yama/src/operations/legacy_pushpull.rs
@ -2,7 +2,7 @@ use crate::chunking::RecursiveUnchunker;
 use crate::commands::fully_load_pointer;
 use crate::definitions::{ChunkId, RecursiveChunkRef, TreeNode};
 use crate::operations::checking::VacuumRawPile;
-use crate::operations::pushpull::PushWorkerToManagerMessage::{NewTask, TaskDone};
+use crate::operations::legacy_pushpull::PushWorkerToManagerMessage::{NewTask, TaskDone};
 use crate::pile::compression::{CompressionSettings, RawPileCompressor};
 use crate::pile::integrity::RawPileIntegrityChecker;
 use crate::pile::local_sqlitebloblogs::SqliteBloblogPile;
--- a/yama/src/operations/storing.rs
+++ b/yama/src/operations/storing.rs
@ -26,19 +26,23 @@ use crossbeam_channel::{Receiver, Sender};
 use crossbeam_utils::thread;
 use log::{error, warn};

-use crate::chunking::{RecursiveChunker, SENSIBLE_THRESHOLD};
+use crate::chunking::{ChunkSubmissionTarget, RecursiveChunker, SENSIBLE_THRESHOLD};
 use crate::commands;
 use crate::commands::{fully_integrate_pointer_node, retrieve_tree_node};
-use crate::definitions::{PointerData, RecursiveChunkRef, RootTreeNode, TreeNode};
-use crate::pile::{Pile, RawPile};
+use crate::definitions::{
+    PartialPointerData, PointerData, RecursiveChunkRef, RootTreeNode, TreeNode,
+};
+use crate::pile::{existence_checker_stage, Pile, RawPile, StoragePipelineSettings};
 use crate::progress::ProgressTracker;
 use crate::tree::{create_uidgid_lookup_tables, differentiate_node_in_place};
+use crate::utils::get_number_of_workers;
 use std::collections::BTreeMap;
+use std::sync::Arc;

-pub fn store<RP: RawPile, PT: ProgressTracker>(
+pub fn store<CST: ChunkSubmissionTarget, PT: ProgressTracker>(
    root_path: &Path,
    root: &mut TreeNode,
-    pile: &Pile<RP>,
+    target: &CST,
    progress_bar: &mut PT,
    num_workers: u8,
 ) -> anyhow::Result<()> {
@ -57,7 +61,7 @@ pub fn store<RP: RawPile, PT: ProgressTracker>(
            s.builder()
                .name(format!("yama chunker {}", worker_num))
                .spawn(move |_| {
-                    if let Err(e) = store_worker(root_path, pile, paths_recv, results_send) {
+                    if let Err(e) = store_worker(root_path, target, paths_recv, results_send) {
                        error!("[critical!] Storage worker {} FAILED: {:?}", worker_num, e);
                        critical_failures.fetch_add(1, Ordering::Relaxed);
                    }
@ -83,9 +87,9 @@ pub fn store<RP: RawPile, PT: ProgressTracker>(
    }
 }

-pub fn store_worker<RP: RawPile>(
+pub fn store_worker<CST: ChunkSubmissionTarget>(
    root: &Path,
-    pile: &Pile<RP>,
+    target: &CST,
    paths: Receiver<String>,
    results: Sender<(String, Option<RecursiveChunkRef>)>,
 ) -> anyhow::Result<()> {
@ -93,7 +97,7 @@ pub fn store_worker<RP: RawPile>(
        let full_path = root.join(&path);
        match File::open(&full_path) {
            Ok(mut file) => {
-                let mut chunker = RecursiveChunker::new(SENSIBLE_THRESHOLD, &pile);
+                let mut chunker = RecursiveChunker::new(SENSIBLE_THRESHOLD, target);
                // streaming copy from file to chunker, really cool :)
                io::copy(&mut file, &mut chunker)?;
                let chunk_ref = chunker.finish()?;
@ -231,13 +235,36 @@ pub fn manager<PT: ProgressTracker>(
 /// loaded and fully-integrated).
 /// This also creates a pointer (which is why this is called `store_fully`).
 pub fn store_fully<PT: ProgressTracker>(
-    pile: &Pile<Box<dyn RawPile>>,
+    pile: Arc<Pile<Box<dyn RawPile>>>,
    root_dir: &PathBuf,
    new_pointer_name: &String,
    mut root_node: TreeNode,
    parent: Option<String>,
    num_workers: u8,
    progress_bar: &mut PT,
+) -> anyhow::Result<()> {
+    // Step 1: pointer work that has to happen before storing (uses the optional parent pointer).
+    pointer_ops_prepare_to_store(&pile, &mut root_node, &parent)?;
+    // Step 2: store the tree itself, then attach `parent` to complete the partial pointer data.
+    let pointer_data =
+        store_without_pointer_ops(&pile, &root_dir, root_node, num_workers, progress_bar)?
+            .complete(parent);
+    // Step 3: write the pointer under its new name and flush the pile.
+    pointers_ops_after_store(&pile, &new_pointer_name, &pointer_data)?;
+    Ok(())
+}
+
+/// Final pointer step of a store: writes `pointer_data` to `pile` under `new_pointer_name`
+/// and then flushes the pile.
+///
+/// Any error from writing the pointer or from flushing is returned to the caller.
+pub fn pointers_ops_after_store(
+    pile: &Pile<impl RawPile>,
+    new_pointer_name: &str,
+    pointer_data: &PointerData,
+) -> anyhow::Result<()> {
+    pile.write_pointer(&new_pointer_name, &pointer_data)?;
+    // Flush straight away so the freshly written pointer is not left in a buffer.
+    pile.flush()?;
+    Ok(())
+}
+
+pub fn pointer_ops_prepare_to_store(
+    pile: &Pile<impl RawPile>,
+    mut root_node: &mut TreeNode,
+    parent: &Option<String>,
 ) -> anyhow::Result<()> {
    if let Some(parent) = parent.as_ref() {
        let mut parent_pointer = pile.read_pointer(parent)?.ok_or_else(|| {
@ -251,8 +278,41 @@ pub fn store_fully<PT: ProgressTracker>(
        fully_integrate_pointer_node(&pile, &mut parent_node.node, &mut parent_pointer)?;
        differentiate_node_in_place(&mut root_node, &parent_node.node)?;
    }
+    Ok(())
+}

-    store(&root_dir, &mut root_node, &pile, progress_bar, num_workers)?;
+pub fn store_without_pointer_ops<PT: ProgressTracker>(
+    pile: &Arc<Pile<Box<dyn RawPile>>>,
+    root_dir: &PathBuf,
+    mut root_node: TreeNode,
+    num_workers: u8,
+    progress_bar: &mut PT,
+) -> anyhow::Result<PartialPointerData> {
+    // TODO make these configurable
+    let sps = StoragePipelineSettings {
+        num_compressors: get_number_of_workers("YAMA_PL_COMPRESSORS") as u32,
+        compressor_input_bound: 64,
+        writer_input_bound: 64,
+    };
+    let (control_tx, control_rx) = crossbeam_channel::unbounded();
+    let pile2 = pile.clone();
+    let pipeline = pile.raw_pile.build_storage_pipeline(sps, control_tx)?;
+
+    // TODO(newver) The existence checker stage should be able to be swapped between different implementations.
+    let pipeline = existence_checker_stage(pile2, pipeline);
+
+    store(
+        &root_dir,
+        &mut root_node,
+        &pipeline,
+        progress_bar,
+        num_workers,
+    )?;
+    // must drop the pipeline to allow the threads to close
+    drop(pipeline);
+    while let Ok(_) = control_rx.recv() {
+        // TODO nothing for now.
+    }

    let mut uid_lookup = BTreeMap::new();
    let mut gid_lookup = BTreeMap::new();
@ -263,19 +323,20 @@ pub fn store_fully<PT: ProgressTracker>(
    let chunk_ref = commands::store_tree_node(
        &pile,
        &RootTreeNode {
-            name: root_dir.file_name().unwrap().to_str().unwrap().to_owned(),
+            name: root_dir
+                .file_name()
+                .map(|s| s.to_str())
+                .flatten()
+                .unwrap_or("_root")
+                .to_owned(),
            node: root_node,
        },
    )?;

-    let pointer_data = PointerData {
+    let pointer_data = PartialPointerData {
        chunk_ref,
-        parent_pointer: parent,
        uid_lookup,
        gid_lookup,
    };
-
-    pile.write_pointer(&new_pointer_name, &pointer_data)?;
-    pile.flush()?;
-    Ok(())
+    Ok(pointer_data)
 }
--- a/yama/src/pile.rs
+++ b/yama/src/pile.rs
@ -19,11 +19,14 @@ use std::path::PathBuf;

 use serde::{Deserialize, Serialize};

-use crate::chunking::calculate_chunkid;
 use crate::definitions::{ChunkId, PointerData};
+use crate::utils::get_number_of_workers;
+use crossbeam_channel::Sender;
 use std::collections::HashSet;
+use std::fmt::Debug;
 use std::sync::{Arc, Condvar, Mutex};

+pub mod access_guard;
 pub mod compression;
 pub mod encryption;
 pub mod integrity;
@ -65,12 +68,80 @@ pub enum Keyspace {
    Pointer,
 }

-pub trait RawPile: Send + Sync {
+/// Useful information for humans. Doesn't need to be spot on, but kind of interesting.
+#[derive(Debug, Clone)]
+pub struct DebugStatistics {
+    /// How many chunks the pile reports.
+    pub number_of_chunks: u64,
+    /// Size of the smallest chunk (presumably in bytes, and `None` when unknown — confirm).
+    pub minimum_chunk_size: Option<u32>,
+    /// Size of the largest chunk (presumably in bytes, and `None` when unknown — confirm).
+    pub maximum_chunk_size: Option<u32>,
+    /// Combined size of all chunks (presumably in bytes — confirm).
+    pub total_chunk_size: u64,
+}
+
+/// Tuning knobs handed to `RawPile::build_storage_pipeline` when a storage pipeline is built.
+#[derive(Debug, Clone)]
+pub struct StoragePipelineSettings {
+    /// Number of worker threads a compressing pile spawns for its pipeline stage.
+    pub num_compressors: u32,
+    /// Capacity of the bounded channel that feeds the compressor stage.
+    pub compressor_input_bound: u32,
+    /// Presumably the capacity of the channel that feeds the writer stage — confirm against
+    /// the storage backend.
+    pub writer_input_bound: u32,
+}
+
+/// Builds a pipeline stage that filters out chunks which were already seen by this stage or
+/// which already exist in `pile`; every other chunk is forwarded to `next_stage`.
+///
+/// Returns the sender through which chunks are fed into the stage. The worker threads stop
+/// once every clone of that sender has been dropped.
+pub fn existence_checker_stage<RP: RawPile>(
+    pile: Arc<Pile<RP>>,
+    next_stage: Sender<(ChunkId, Vec<u8>)>,
+) -> Sender<(ChunkId, Vec<u8>)> {
+    // TODO(newver) Do better than this.
+
+    // Chunk IDs that some worker has already looked at, shared between all workers.
+    let already_seen: Arc<Mutex<HashSet<ChunkId>>> = Arc::new(Mutex::new(HashSet::new()));
+    let (stage_input, stage_rx) = crossbeam_channel::bounded::<(ChunkId, Vec<u8>)>(32);
+
+    // TODO would like something better for the networked case
+    for _ in 0..get_number_of_workers("YAMA_EXISTENCE_CHECKERS") {
+        let already_seen = Arc::clone(&already_seen);
+        let next_stage = next_stage.clone();
+        let stage_rx = stage_rx.clone();
+        let pile = Arc::clone(&pile);
+
+        let worker = move || {
+            for (chunk_id, chunk) in stage_rx.iter() {
+                // TODO handle errors properly
+                // The lock guard is a temporary: it is released at the end of this statement,
+                // before the pile is queried.
+                let first_sighting = already_seen.lock().unwrap().insert(chunk_id);
+                if first_sighting && !pile.chunk_exists(&chunk_id).unwrap() {
+                    next_stage.send((chunk_id, chunk)).unwrap();
+                }
+            }
+        };
+
+        std::thread::Builder::new()
+            .name("yama exist?er".to_string())
+            .spawn(worker)
+            .unwrap();
+    }
+
+    stage_input
+}
+
+/// Messages that storage pipeline workers send back over the controller channel.
+pub enum ControllerMessage {
+    /// A pipeline worker stopped because of an error.
+    Failure {
+        /// Identifier of the worker that failed.
+        worker_id: Arc<String>,
+        /// Textual rendering of the error that stopped the worker.
+        error_message: String,
+    },
+}
+
+/// One entry in the list returned by `RawPile::describe_pipeline`; wrapping piles append
+/// their own entry after the entries of the pile they wrap.
+#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
+pub enum PipelineDescription {
+    /// Presumably reported by a storing backend — confirm against the backend implementations.
+    Store,
+    /// Presumably reported by a remote pile — confirm against the remote implementation.
+    Remote,
+    /// Presumably reported by the integrity-checking layer — confirm.
+    Integrity,
+    /// Reported by the compression layer. The fingerprint is a 64-bit value derived from a
+    /// hash of the compression dictionary.
+    Compression { dictionary_fingerprint: u64 },
+    /// Reported by the encryption layer.
+    Encryption,
+}
+
+pub trait RawPile: Send + Sync + Debug + 'static {
    // TODO expose verification errors?
    fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool>;
    fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>>;
    fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()>;
    fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()>;
+    fn delete_many(&self, kind: Keyspace, key: &[&[u8]]) -> anyhow::Result<()>;
    fn list_keys(
        &self,
        kind: Keyspace,
@ -84,6 +155,23 @@ pub trait RawPile: Send + Sync {
    fn flush(&self) -> anyhow::Result<()>;
    // TODO return a progress Receiver
    fn check_lowlevel(&self) -> anyhow::Result<bool>;
+
+    /// Return a few statistics, if possible.
+    fn debug_statistics(&self) -> anyhow::Result<Option<DebugStatistics>> {
+        Ok(None)
+    }
+
+    fn build_storage_pipeline(
+        &self,
+        settings: StoragePipelineSettings,
+        controller_send: Sender<ControllerMessage>,
+    ) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>>;
+
+    fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>>;
+
+    /// Return a u64 order token that indicates the optimum order to read this chunk in
+    /// compared to other chunks.
+    fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64>;
 }

 impl RawPile for Box<dyn RawPile> {
@ -99,6 +187,9 @@ impl RawPile for Box<dyn RawPile> {
    fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
        self.as_ref().delete(kind, key)
    }
+    fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
+        self.as_ref().delete_many(kind, keys)
+    }
    fn list_keys(
        &self,
        kind: Keyspace,
@ -111,6 +202,26 @@ impl RawPile for Box<dyn RawPile> {
    fn check_lowlevel(&self) -> anyhow::Result<bool> {
        self.as_ref().check_lowlevel()
    }
+    fn debug_statistics(&self) -> anyhow::Result<Option<DebugStatistics>> {
+        self.as_ref().debug_statistics()
+    }
+
+    fn build_storage_pipeline(
+        &self,
+        settings: StoragePipelineSettings,
+        controller_send: Sender<ControllerMessage>,
+    ) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
+        self.as_ref()
+            .build_storage_pipeline(settings, controller_send)
+    }
+
+    fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
+        self.as_ref().describe_pipeline()
+    }
+
+    fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
+        self.as_ref().chunk_id_transfer_ordering_hint(chunk_id)
+    }
 }

 impl<RP: RawPile> RawPile for Arc<RP> {
@ -126,6 +237,9 @@ impl<RP: RawPile> RawPile for Arc<RP> {
    fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
        self.as_ref().delete(kind, key)
    }
+    fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
+        self.as_ref().delete_many(kind, keys)
+    }
    fn list_keys(
        &self,
        kind: Keyspace,
@ -138,8 +252,29 @@ impl<RP: RawPile> RawPile for Arc<RP> {
    fn check_lowlevel(&self) -> anyhow::Result<bool> {
        self.as_ref().check_lowlevel()
    }
+    fn debug_statistics(&self) -> anyhow::Result<Option<DebugStatistics>> {
+        self.as_ref().debug_statistics()
+    }
+
+    fn build_storage_pipeline(
+        &self,
+        settings: StoragePipelineSettings,
+        controller_send: Sender<ControllerMessage>,
+    ) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
+        self.as_ref()
+            .build_storage_pipeline(settings, controller_send)
+    }
+
+    fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
+        self.as_ref().describe_pipeline()
+    }
+
+    fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
+        self.as_ref().chunk_id_transfer_ordering_hint(chunk_id)
+    }
 }

+#[derive(Debug)]
 pub struct Pile<R: RawPile> {
    pub raw_pile: R,
    pub racy_submission_mutex: Mutex<HashSet<ChunkId>>,
@ -229,9 +364,7 @@ impl<R: RawPile> Pile<R> {
        Ok(result)
    }

-    pub fn submit_chunk(&self, chunk_data: &[u8]) -> anyhow::Result<ChunkId> {
-        let chunk_id = calculate_chunkid(chunk_data);
-
+    pub fn submit_chunk(&self, chunk_id: ChunkId, chunk_data: &[u8]) -> anyhow::Result<()> {
        let mut racy_submissions = self.racy_submission_mutex.lock().unwrap();
        if racy_submissions.insert(chunk_id) {
            drop(racy_submissions);
@ -250,7 +383,7 @@ impl<R: RawPile> Pile<R> {
                }
            }
        }
-        Ok(chunk_id)
+        Ok(())
    }

    /// Flushes buffered writes. Should really run this before exiting, so I can sleep better at
--- a/yama/src/pile/access_guard.rs
+++ b/yama/src/pile/access_guard.rs
@ -0,0 +1,141 @@
+use crate::chunking::calculate_chunkid;
+use crate::definitions::ChunkId;
+use crate::pile::{
+    ControllerMessage, Keyspace, PipelineDescription, RawPile, StoragePipelineSettings,
+};
+use anyhow::{anyhow, bail};
+use crossbeam_channel::{Receiver, Sender};
+use derivative::Derivative;
+use std::sync::Arc;
+use std::thread;
+
+/// PileGuard is a wrapper around a pile that prevents data exfiltration and malicious corruption.
+/// It's basically a firewall for a Pile?
+/// Preventing malicious corruption requires the chunks to be unprocessed. This way, their ID can be
+/// checked by this module.
+#[derive(Debug, Derivative)]
+#[derivative(Clone(bound = ""))]
+// we need to use derivative's Clone impl because Arc<R> causes R to have a bound on Clone
+// even though that's not needed. https://github.com/rust-lang/rust/issues/26925
+// NOTE(review): this struct holds `R` directly (not `Arc<R>`) and already requires `R: Clone`,
+// so the rationale above may not apply here — confirm whether a plain derive would do.
+pub struct PileGuard<R: Clone + RawPile> {
+    /// The wrapped pile that permitted operations are forwarded to.
+    underlying: R,
+    /// Whether to verify chunk IDs to prevent malicious corruption
+    verify_chunk_ids: bool,
+}
+
+/// Body of the access guard's pipeline thread.
+///
+/// Recomputes the ID of every chunk received on `input` and passes the chunk on to
+/// `subsequent_pipeline` when it matches the claimed ID. Returns `Ok(())` once `input` is
+/// closed, and an error on the first ID mismatch or when the next stage has gone away.
+fn pipeline(
+    subsequent_pipeline: Sender<(ChunkId, Vec<u8>)>,
+    input: Receiver<(ChunkId, Vec<u8>)>,
+) -> anyhow::Result<()> {
+    for (claimed_id, payload) in input.iter() {
+        let recomputed_id = calculate_chunkid(&payload);
+        if recomputed_id != claimed_id {
+            bail!("CHUNK ID MISMATCH — is this forgery? (malicious storage process?) claimed{:?} actually{:?}", claimed_id, recomputed_id);
+        }
+
+        if subsequent_pipeline.send((claimed_id, payload)).is_err() {
+            return Err(anyhow!("Subsequent step closed"));
+        }
+    }
+
+    Ok(())
+}
+
+impl<R: Clone + RawPile> PileGuard<R> {
+    /// Wraps `underlying` in a guard. `verify_chunk_ids` is stored as the guard's
+    /// chunk ID verification flag.
+    pub fn new(underlying: R, verify_chunk_ids: bool) -> Self {
+        Self {
+            underlying,
+            verify_chunk_ids,
+        }
+    }
+}
+
+/// `RawPile` implementation that only lets a restricted set of operations through to the
+/// underlying pile; everything else fails with "Access denied".
+impl<R: Clone + RawPile> RawPile for PileGuard<R> {
+    /// Only chunk existence may be queried; the other keyspaces are refused.
+    fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool> {
+        match kind {
+            Keyspace::Chunk => self.underlying.exists(kind, key),
+            Keyspace::ChunkHash | Keyspace::Pointer => {
+                bail!("Access denied");
+            }
+        }
+    }
+
+    /// Reads are never allowed through the guard.
+    fn read(&self, _kind: Keyspace, _key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
+        bail!("Access denied");
+    }
+
+    /// Writes to the `ChunkHash` and `Pointer` keyspaces are refused.
+    fn write(&self, kind: Keyspace, _key: &[u8], _value: &[u8]) -> anyhow::Result<()> {
+        match kind {
+            Keyspace::Chunk => {
+                // Direct chunk writes are not implemented yet; this panics if reached.
+                todo!()
+            }
+            Keyspace::ChunkHash | Keyspace::Pointer => {
+                bail!("Access denied");
+            }
+        }
+    }
+
+    /// Deletions are never allowed through the guard.
+    fn delete(&self, _kind: Keyspace, _key: &[u8]) -> anyhow::Result<()> {
+        bail!("Access denied");
+    }
+
+    /// Batched deletions are never allowed through the guard.
+    fn delete_many(&self, _kind: Keyspace, _keys: &[&[u8]]) -> anyhow::Result<()> {
+        bail!("Access denied");
+    }
+
+    /// Listing keys is never allowed through the guard.
+    fn list_keys(
+        &self,
+        _kind: Keyspace,
+    ) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>> {
+        bail!("Access denied");
+    }
+
+    /// Forwarded to the underlying pile.
+    fn flush(&self) -> anyhow::Result<()> {
+        self.underlying.flush()
+    }
+
+    /// Forwarded to the underlying pile.
+    fn check_lowlevel(&self) -> anyhow::Result<bool> {
+        self.underlying.check_lowlevel()
+    }
+
+    /// Builds the underlying pile's storage pipeline and, when chunk ID verification is
+    /// enabled, puts a verification stage in front of it.
+    ///
+    /// With `verify_chunk_ids` disabled the underlying pipeline is returned as-is: no
+    /// verification thread is started and chunks are not checked against their claimed IDs.
+    ///
+    /// Returns an error if the underlying pipeline cannot be built or the verification
+    /// thread cannot be spawned.
+    fn build_storage_pipeline(
+        &self,
+        settings: StoragePipelineSettings,
+        controller_send: Sender<ControllerMessage>,
+    ) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
+        let subsequent_pipeline = self
+            .underlying
+            .build_storage_pipeline(settings, controller_send.clone())?;
+
+        if !self.verify_chunk_ids {
+            // Verification was switched off for this guard, so there is nothing to insert.
+            return Ok(subsequent_pipeline);
+        }
+
+        let (input_to_this_stage, receiver) = crossbeam_channel::bounded(8);
+
+        // Propagate a spawn failure to the caller instead of panicking.
+        thread::Builder::new()
+            .name("yama Aguard".to_owned())
+            .spawn(move || {
+                if let Err(err) = pipeline(subsequent_pipeline, receiver) {
+                    controller_send
+                        .send(ControllerMessage::Failure {
+                            worker_id: Arc::new(String::from("accessguard")),
+                            error_message: format!("err {:?}", err),
+                        })
+                        .expect("This is BAD: failed to send failure message to controller.");
+                }
+            })?;
+
+        Ok(input_to_this_stage)
+    }
+
+    /// Returns the underlying pile's description unchanged; the guard adds no entry of its own.
+    fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
+        // TODO(question) Should we be described in the pipeline?
+        self.underlying.describe_pipeline()
+    }
+
+    /// Forwarded to the underlying pile.
+    fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
+        self.underlying.chunk_id_transfer_ordering_hint(chunk_id)
+    }
+}
--- a/yama/src/pile/compression.rs
+++ b/yama/src/pile/compression.rs
@ -15,20 +15,27 @@ You should have received a copy of the GNU General Public License
 along with Yama.  If not, see <https://www.gnu.org/licenses/>.
 */

+use std::convert::TryInto;
 use std::sync::Arc;
 use std::thread;
 use std::thread::JoinHandle;

 use anyhow::anyhow;
 use crossbeam_channel::{Receiver, Sender};
+use derivative::Derivative;
 use log::error;
-use zstd::block::{Compressor, Decompressor};
+use metrics::{register_counter, Unit};
+use zstd::bulk::{Compressor, Decompressor};

-use crate::pile::{Keyspace, RawPile};
+use crate::definitions::ChunkId;
+use crate::pile::{
+    ControllerMessage, DebugStatistics, Keyspace, PipelineDescription, RawPile,
+    StoragePipelineSettings,
+};

 pub const DECOMPRESS_CAPACITY: usize = 32 * 1024 * 1024;

-#[derive(Clone)]
+#[derive(Clone, Debug)]
 pub struct CompressionSettings {
    /// Raw dictionary to pass to Zstd for compression and decompression
    pub dictionary: Arc<Vec<u8>>,
@ -40,10 +47,15 @@ pub struct CompressionSettings {
    pub num_decompressors: u32,
 }

+#[derive(Debug, Derivative)]
+#[derivative(Clone(bound = ""))]
+// we need to use derivative's Clone impl because Arc<R> causes R to have a bound on Clone
+// even though that's not needed. https://github.com/rust-lang/rust/issues/26925
 pub struct RawPileCompressor<R: RawPile> {
-    underlying: R,
-    compressor: Sender<(Vec<u8>, Sender<Vec<u8>>)>,
-    decompressor: Sender<(Vec<u8>, Sender<Vec<u8>>)>,
+    underlying: Arc<R>,
+    compressor: Option<Sender<(Vec<u8>, Sender<Vec<u8>>)>>,
+    decompressor: Option<Sender<(Vec<u8>, Sender<Vec<u8>>)>>,
+    settings: Arc<CompressionSettings>,
 }

 impl<R: RawPile> RawPileCompressor<R> {
@ -51,6 +63,51 @@ impl<R: RawPile> RawPileCompressor<R> {
        underlying: R,
        settings: CompressionSettings,
    ) -> anyhow::Result<(Self, Vec<JoinHandle<()>>)> {
+        register_counter!(
+            "compressor_in_bytes",
+            Unit::Bytes,
+            "Number of bytes that have been fed into the compressor"
+        );
+        register_counter!(
+            "compressor_out_bytes",
+            Unit::Bytes,
+            "Number of bytes that have come out of the compressor"
+        );
+        register_counter!(
+            "compressor_chunks",
+            Unit::Count,
+            "Number of chunks that have been compressed"
+        );
+        register_counter!(
+            "decompressor_in_bytes",
+            Unit::Bytes,
+            "Number of bytes that have been fed into the decompressor"
+        );
+        register_counter!(
+            "decompressor_out_bytes",
+            Unit::Bytes,
+            "Number of bytes that have come out of the decompressor"
+        );
+        register_counter!(
+            "decompressor_chunks",
+            Unit::Count,
+            "Number of chunks that have been decompressed"
+        );
+
+        if settings.num_compressors == 0 && settings.num_decompressors == 0 {
+            // optimisation for when we're only building a pipeline: we don't want to spawn any
+            // compressor or decompressor worker threads in that case.
+            return Ok((
+                RawPileCompressor {
+                    underlying: Arc::new(underlying),
+                    compressor: None,
+                    decompressor: None,
+
+                    settings: Arc::new(settings),
+                },
+                Vec::with_capacity(0),
+            ));
+        }
+
        let (com_s, com_r) = crossbeam_channel::bounded(4);
        let (dec_s, dec_r) = crossbeam_channel::bounded(4);

@ -84,9 +141,10 @@ impl<R: RawPile> RawPileCompressor<R> {

        Ok((
            RawPileCompressor {
-                underlying,
-                compressor: com_s,
-                decompressor: dec_s,
+                underlying: Arc::new(underlying),
+                compressor: Some(com_s),
+                decompressor: Some(dec_s),
+                settings: Arc::new(settings),
            },
            handles,
        ))
@ -96,9 +154,10 @@ impl<R: RawPile> RawPileCompressor<R> {
        queue: Receiver<(Vec<u8>, Sender<Vec<u8>>)>,
        settings: CompressionSettings,
    ) -> anyhow::Result<()> {
-        let mut compressor = Compressor::with_dict(settings.dictionary.as_ref().clone());
+        let mut compressor =
+            Compressor::with_dictionary(settings.level, settings.dictionary.as_ref())?;
        while let Ok((job, response_sender)) = queue.recv() {
-            let result = compressor.compress(&job, settings.level)?;
+            let result = compressor.compress(&job)?;
            response_sender
                .send(result)
                .or(Err(anyhow!("Couldn't send compression result")))?;
@ -110,7 +169,7 @@ impl<R: RawPile> RawPileCompressor<R> {
        queue: Receiver<(Vec<u8>, Sender<Vec<u8>>)>,
        settings: CompressionSettings,
    ) -> anyhow::Result<()> {
-        let mut decompressor = Decompressor::with_dict(settings.dictionary.as_ref().clone());
+        let mut decompressor = Decompressor::with_dictionary(settings.dictionary.as_ref())?;
        while let Ok((job, response_sender)) = queue.recv() {
            let result = decompressor.decompress(&job, DECOMPRESS_CAPACITY)?;
            response_sender
@ -123,6 +182,8 @@ impl<R: RawPile> RawPileCompressor<R> {
    fn decompress(&self, data: &[u8]) -> anyhow::Result<Vec<u8>> {
        let (ret_s, ret_r) = crossbeam_channel::bounded(0);
        self.decompressor
+            .as_ref()
+            .expect("No decompressors configured")
            .send((data.to_vec(), ret_s))
            .or(Err(anyhow!("couldn't send to decompressor")))?;

@ -132,11 +193,67 @@ impl<R: RawPile> RawPileCompressor<R> {
    fn compress(&self, compressed_data: &[u8]) -> anyhow::Result<Vec<u8>> {
        let (ret_s, ret_r) = crossbeam_channel::bounded(0);
        self.compressor
+            .as_ref()
+            .expect("No compressors configured")
            .send((compressed_data.to_vec(), ret_s))
            .or(Err(anyhow!("couldn't send to compressor")))?;

        Ok(ret_r.recv().or(Err(anyhow!("couldn't receive result")))?)
    }
+
+    /// Body of one compressor thread in the storage pipeline.
+    ///
+    /// Compresses every chunk received on `input` with this pile's level and dictionary and
+    /// sends the result to `next_stage`, updating the per-worker metrics (labelled with
+    /// `worker_id`) and the global compressor metrics as it goes.
+    ///
+    /// Returns `Ok(())` once `input` is closed. Returns an error if the compressor cannot be
+    /// created, a chunk fails to compress, or the next stage is no longer receiving.
+    fn storage_pipeline_worker(
+        &self,
+        next_stage: Sender<(ChunkId, Vec<u8>)>,
+        input: Receiver<(ChunkId, Vec<u8>)>,
+        worker_id: String,
+    ) -> anyhow::Result<()> {
+        // the worker ID has to live forever, so we leak it :/
+        let worker_id: &'static str = Box::leak(worker_id.into_boxed_str());
+        metrics::register_histogram!(
+            "compressor_idle_time",
+            metrics::Unit::Seconds,
+            "Time spent waiting between chunks",
+            "id" => worker_id
+        );
+        metrics::register_counter!(
+            "compressor_bytes_input",
+            metrics::Unit::Bytes,
+            "Number of bytes input into the compressor.",
+            "id" => worker_id
+        );
+        metrics::register_counter!(
+            "compressor_bytes_output",
+            metrics::Unit::Bytes,
+            "Number of bytes output from the compressor.",
+            "id" => worker_id
+        );
+        metrics::register_counter!(
+            "compressor_chunks_processed",
+            metrics::Unit::Count,
+            "Number of chunks processed by the compressor.",
+            "id" => worker_id
+        );
+
+        let mut compressor =
+            Compressor::with_dictionary(self.settings.level, self.settings.dictionary.as_ref())?;
+        while let Ok((chunk_id, bytes)) = input.recv() {
+            let in_bytes = bytes.len();
+            let bytes = compressor.compress(&bytes)?;
+            let out_bytes = bytes.len();
+            next_stage.send((chunk_id, bytes))?;
+            // Per-worker metrics
+            // TODO rename
+            metrics::counter!("compressor_bytes_input", in_bytes as u64, "id" => worker_id);
+            metrics::counter!("compressor_bytes_output", out_bytes as u64, "id" => worker_id);
+            // This counter is registered above; count the chunk so it does not stay at zero.
+            metrics::increment_counter!("compressor_chunks_processed", "id" => worker_id);
+
+            // Global metrics
+            metrics::counter!("compressor_in_bytes", in_bytes as u64);
+            metrics::counter!("compressor_out_bytes", out_bytes as u64);
+            metrics::increment_counter!("compressor_chunks");
+        }
+
+        Ok(())
+    }
 }

 impl<R: RawPile> RawPile for RawPileCompressor<R> {
@ -151,6 +268,7 @@ impl<R: RawPile> RawPile for RawPileCompressor<R> {
            Ok(None)
        }
    }
+
    fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
        let compressed = self.compress(value)?;
        self.underlying.write(kind, key, &compressed)
@ -160,6 +278,10 @@ impl<R: RawPile> RawPile for RawPileCompressor<R> {
        self.underlying.delete(kind, key)
    }

+    fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
+        self.underlying.delete_many(kind, keys)
+    }
+
    fn list_keys(
        &self,
        kind: Keyspace,
@ -172,4 +294,66 @@ impl<R: RawPile> RawPile for RawPileCompressor<R> {
    fn check_lowlevel(&self) -> anyhow::Result<bool> {
        self.underlying.check_lowlevel()
    }
+
+    fn debug_statistics(&self) -> anyhow::Result<Option<DebugStatistics>> {
+        self.underlying.debug_statistics()
+    }
+
+    /// Builds the underlying pile's storage pipeline and puts a compression stage in front
+    /// of it.
+    ///
+    /// `settings.num_compressors` worker threads are spawned. They share one bounded input
+    /// channel of capacity `settings.compressor_input_bound` and each forwards compressed
+    /// chunks to the underlying pipeline. A worker that fails reports a
+    /// `ControllerMessage::Failure` on `controller_send`.
+    ///
+    /// Returns the sender for the stage's input channel, or an error if the underlying
+    /// pipeline cannot be built or a worker thread cannot be spawned.
+    fn build_storage_pipeline(
+        &self,
+        settings: StoragePipelineSettings,
+        controller_send: Sender<ControllerMessage>,
+    ) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
+        // this one should have a few threads behind it! yarr!
+        let subsequent_pipeline = self
+            .underlying
+            .build_storage_pipeline(settings.clone(), controller_send.clone())?;
+
+        let (input_to_this_stage, receiver) =
+            crossbeam_channel::bounded(settings.compressor_input_bound as usize);
+
+        for compressor_number in 0..settings.num_compressors {
+            let subsequent_pipeline = subsequent_pipeline.clone();
+            let receiver = receiver.clone();
+            let controller_send = controller_send.clone();
+            let this = (*self).clone();
+            // Propagate a spawn failure to the caller instead of panicking. Workers that
+            // were already started exit on their own once the input sender is dropped.
+            thread::Builder::new()
+                .name(format!("yama Pcomp{}", compressor_number))
+                .spawn(move || {
+                    let worker_id = Arc::new(format!("compressor-{}", compressor_number));
+                    if let Err(err) = this.storage_pipeline_worker(
+                        subsequent_pipeline,
+                        receiver,
+                        worker_id.to_string(),
+                    ) {
+                        controller_send
+                            .send(ControllerMessage::Failure {
+                                worker_id,
+                                error_message: format!("err {:?}", err),
+                            })
+                            .expect("This is BAD: failed to send failure message to controller.");
+                    }
+                })?;
+        }
+
+        Ok(input_to_this_stage)
+    }
+
+    /// Describes the underlying pipeline and appends a `Compression` entry for this layer.
+    ///
+    /// The entry carries a fingerprint of the dictionary: the first 8 bytes (read as a
+    /// big-endian `u64`) of its 256-bit hash.
+    fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
+        let mut stages = self.underlying.describe_pipeline()?;
+
+        let mut digest = [0; 32];
+        blake::hash(256, &self.settings.dictionary, &mut digest)?;
+        let leading_bytes: [u8; 8] = digest[0..8].try_into().unwrap();
+
+        stages.push(PipelineDescription::Compression {
+            dictionary_fingerprint: u64::from_be_bytes(leading_bytes),
+        });
+        Ok(stages)
+    }
+
+    fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
+        self.underlying.chunk_id_transfer_ordering_hint(chunk_id)
+    }
 }
--- a/yama/src/pile/encryption.rs
+++ b/yama/src/pile/encryption.rs
@ -20,7 +20,11 @@ use log::warn;
 use sodiumoxide::crypto::secretbox;
 use sodiumoxide::crypto::secretbox::{Key, Nonce, NONCEBYTES};

-use crate::pile::{Keyspace, RawPile};
+use crate::definitions::ChunkId;
+use crate::pile::{
+    ControllerMessage, Keyspace, PipelineDescription, RawPile, StoragePipelineSettings,
+};
+use crossbeam_channel::Sender;

 /// A RawPile that provides encryption of chunk contents.
 /// Please note that keys are not currently encrypted, so this scheme is not CPA-secure.
@ -30,6 +34,7 @@ use crate::pile::{Keyspace, RawPile};
 /// to rely on that.
 /// This feature will be revisited soon...
 /// Notably, keys should be passed through a secure permutation first.
+#[derive(Debug)]
 pub struct RawPileEncryptor<R: RawPile> {
    underlying: R,
    secret_key: Key,
@ -96,6 +101,10 @@ impl<R: RawPile> RawPile for RawPileEncryptor<R> {
        self.underlying.delete(kind, key)
    }

+    fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
+        self.underlying.delete_many(kind, keys)
+    }
+
    fn list_keys(
        &self,
        kind: Keyspace,
@ -108,4 +117,22 @@ impl<R: RawPile> RawPile for RawPileEncryptor<R> {
    fn check_lowlevel(&self) -> anyhow::Result<bool> {
        self.underlying.check_lowlevel()
    }
+
+    /// Not implemented yet for the encryptor: calling this panics via `todo!()`.
+    ///
+    /// NOTE(review): `describe_pipeline` on this type advertises an `Encryption` stage, so a
+    /// caller that builds a storage pipeline through an encrypted pile will hit this panic.
+    fn build_storage_pipeline(
+        &self,
+        _settings: StoragePipelineSettings,
+        _controller_send: Sender<ControllerMessage>,
+    ) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
+        todo!()
+    }
+
+    /// Describes the storage pipeline as the underlying pile's stages followed by an
+    /// `Encryption` stage.
+    fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
+        let mut underlying = self.underlying.describe_pipeline()?;
+        underlying.push(PipelineDescription::Encryption);
+        Ok(underlying)
+    }
+
+    /// Returns the transfer ordering hint for `chunk_id` by delegating to the underlying pile.
+    fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
+        self.underlying.chunk_id_transfer_ordering_hint(chunk_id)
+    }
 }
--- a/yama/src/pile/integrity.rs
+++ b/yama/src/pile/integrity.rs
@ -19,12 +19,17 @@ use std::hash::Hasher;

 use thiserror::Error;

-use crate::definitions::XXH64_SEED;
-use crate::pile::{Keyspace, RawPile};
+use crate::definitions::{ChunkId, XXH64_SEED};
+use crate::pile::{
+    ControllerMessage, DebugStatistics, Keyspace, PipelineDescription, RawPile,
+    StoragePipelineSettings,
+};
 use crate::utils::bytes_to_hexstring;
+use crossbeam_channel::Sender;

 /// This RawPile enables checking the integrity of stored chunks.
 /// This is done by storing a hash along with the chunk contents, which can later be verified.
+#[derive(Debug)]
 pub struct RawPileIntegrityChecker<RP: RawPile> {
    underlying: RP,
 }
@ -93,6 +98,10 @@ impl<RP: RawPile> RawPile for RawPileIntegrityChecker<RP> {
        self.underlying.delete(kind, key)
    }

+    /// Deletes several keys at once by delegating to the underlying pile; the keys are passed
+    /// through unchanged.
+    fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
+        self.underlying.delete_many(kind, keys)
+    }
+
    fn list_keys(
        &self,
        kind: Keyspace,
@ -108,4 +117,44 @@ impl<RP: RawPile> RawPile for RawPileIntegrityChecker<RP> {
        // TODO integrity check ...?
        self.underlying.check_lowlevel()
    }
+
+    /// Returns the underlying pile's debug statistics; this layer adds none of its own.
+    fn debug_statistics(&self) -> anyhow::Result<Option<DebugStatistics>> {
+        self.underlying.debug_statistics()
+    }
+
+    /// Builds the integrity stage of the storage pipeline.
+    ///
+    /// Spawns a worker thread named "yama integrity" that receives `(ChunkId, Vec<u8>)` pairs on
+    /// the returned sender, appends the big-endian XXH64 hash (seeded with `XXH64_SEED`) of the
+    /// chunk contents to the chunk, and forwards the result to the underlying pile's pipeline.
+    /// The worker stops once the returned sender (and all of its clones) has been dropped.
+    ///
+    /// # Errors
+    /// Returns an error if the underlying pipeline cannot be built or if the worker thread
+    /// cannot be spawned.
+    fn build_storage_pipeline(
+        &self,
+        settings: StoragePipelineSettings,
+        controller_send: Sender<ControllerMessage>,
+    ) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
+        // TODO primitive implementation but good enough for now.
+        // May want metrics later?
+        let next_stage = self
+            .underlying
+            .build_storage_pipeline(settings, controller_send)?;
+        let (input, receiver) = crossbeam_channel::bounded::<(ChunkId, Vec<u8>)>(64);
+        // Failing to spawn a thread is an ordinary OS-level error; report it to the caller
+        // instead of panicking, since this function already returns a `Result`.
+        std::thread::Builder::new()
+            .name("yama integrity".to_string())
+            .spawn(move || {
+                while let Ok((chunk_id, mut chunk)) = receiver.recv() {
+                    let mut hasher = twox_hash::XxHash64::with_seed(XXH64_SEED);
+                    hasher.write(&chunk);
+                    let computed_hash = hasher.finish().to_be_bytes();
+                    chunk.extend_from_slice(&computed_hash);
+                    // Panics (in this worker thread only) if the next stage has hung up.
+                    next_stage.send((chunk_id, chunk)).unwrap();
+                }
+            })?;
+        Ok(input)
+    }
+
+    /// Describes the storage pipeline as the underlying pile's stages followed by an
+    /// `Integrity` stage.
+    fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
+        let mut underlying = self.underlying.describe_pipeline()?;
+        underlying.push(PipelineDescription::Integrity);
+        Ok(underlying)
+    }
+
+    /// Returns the transfer ordering hint for `chunk_id` by delegating to the underlying pile.
+    fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
+        self.underlying.chunk_id_transfer_ordering_hint(chunk_id)
+    }
 }
--- a/yama/src/pile/local_sqlitebloblogs.rs
+++ b/yama/src/pile/local_sqlitebloblogs.rs
@ -15,26 +15,32 @@ You should have received a copy of the GNU General Public License
 along with Yama.  If not, see <https://www.gnu.org/licenses/>.
 */

-use std::collections::hash_map::Entry;
-use std::collections::{HashMap, VecDeque};
+use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque};
 use std::convert::{TryFrom, TryInto};
-use std::fs;
-use std::fs::{File, OpenOptions};
+use std::fs::{read_dir, remove_file, File, OpenOptions};
 use std::io::{Read, Seek, SeekFrom, Write};
+use std::os::unix::fs::MetadataExt;
 use std::path::{Path, PathBuf};
 use std::sync::{Arc, Condvar, Mutex};
+use std::time::Duration;
+use std::{fs, thread};

-use anyhow::{bail, Context};
+use anyhow::{bail, ensure, Context};
 use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
-use log::warn;
+use crossbeam_channel::{Receiver, Sender};
+use log::{info, warn};
 use nix::unistd::sync;
-use rusqlite::{params, Error};
+use rusqlite::ffi::ErrorCode::ConstraintViolation;
+use rusqlite::{params, Error, ErrorCode, Transaction, TransactionBehavior, NO_PARAMS};
 use rusqlite::{Connection, OptionalExtension};

 use crate::definitions::ChunkId;
-use crate::pile::{Keyspace, RawPile};
-use crate::utils::bytes_to_hexstring;
-use rusqlite::ffi::ErrorCode::ConstraintViolation;
+use crate::pile::{
+    ControllerMessage, DebugStatistics, Keyspace, PipelineDescription, RawPile,
+    StoragePipelineSettings,
+};
+use crate::progress::ProgressTracker;
+use crate::utils::{bytes_to_hexstring, LruMap};

 /// Bloblogs will not be reused if they are already 2 GiB large.
 pub const MAX_BLOBLOG_REUSE_SIZE: u64 = 2 * 1024 * 1024 * 1024;
@ -42,10 +48,19 @@ pub const MAX_BLOBLOG_REUSE_SIZE: u64 = 2 * 1024 * 1024 * 1024;
 /// This many pointers will be batched up for writing.
 pub const POINTER_WRITE_BATCHES: usize = 2048;

+/// This many bloblogs will be kept open for reading, at maximum.
+pub const BLOBLOG_MAX_READING_FILE_COUNT: usize = 128;
+
+/// Size of a blob header within a bloblog.
+/// 32 byte Chunk Id
+/// 4 byte (u32) Blob size
+pub const BLOB_HEADER_SIZE: u64 = 32 + 4;
+
 /// A file storing a log of blobs.
 /// Format:
 ///     Repeated:
 ///         <32 byte ChunkId><u32: length><length × u8: data>
+#[derive(Debug)]
 pub struct Bloblog {
    pub file: File,
 }
@ -127,10 +142,11 @@ impl Bloblog {

 pub type BloblogId = u32;

+#[derive(Debug)]
 pub struct Inner {
    next_bloblog_id: BloblogId,
-    writer_bloblogs: Vec<BloblogId>,
-    open_bloblogs: HashMap<BloblogId, Arc<Mutex<Bloblog>>>, // TODO want an LRU cache with a weak hashmap...?
+    writer_bloblogs: Vec<(BloblogId, Arc<Mutex<Bloblog>>)>,
+    reader_bloblogs: LruMap<BloblogId, Arc<Mutex<Bloblog>>>,
    connection: Connection,
    writers_in_progress: u16,
    // We batch up pointer writes because sync() performance really hurts us if we do them one by
@ -138,14 +154,13 @@ pub struct Inner {
    queued_pointer_writes: HashMap<ChunkId, BloblogPointer>,
 }

-impl Inner {
-    pub fn raw_put_chunk_pointer(
-        &self,
+fn raw_put_chunk_pointer_txn(
+    txn: &Transaction,
    chunk_id: &ChunkId,
    bloblog: BloblogId,
    offset_i64: i64,
-    ) -> anyhow::Result<()> {
-        match self.connection.execute(
+) -> anyhow::Result<()> {
+    match txn.execute(
        "INSERT INTO chunks (chunk_id, bloblog, offset) VALUES (?1, ?2, ?3)",
        params![&chunk_id[..], bloblog, offset_i64],
    ) {
@ -158,27 +173,41 @@ impl Inner {
                );
                Ok(())
            } else {
-                    Err(Error::SqliteFailure(e, str))?;
-                    unreachable!();
+                Err(Error::SqliteFailure(e, str).into())
            }
        }
-            other => {
-                other?;
-                unreachable!();
-            }
+        Err(other) => Err(other.into()),
    }
+}
+
+impl Inner {
+    /// Inserts a single chunk pointer into the index, in its own transaction.
+    ///
+    /// The offset is passed as an `i64` because that is how it is bound in the SQL statement.
+    pub fn raw_put_chunk_pointer(
+        &mut self,
+        chunk_id: &ChunkId,
+        bloblog: BloblogId,
+        offset_i64: i64,
+    ) -> anyhow::Result<()> {
+        let txn = self.connection.transaction()?;
+        raw_put_chunk_pointer_txn(&txn, chunk_id, bloblog, offset_i64)?;
+        txn.commit()?;
+        Ok(())
     }

+    /// Writes every queued pointer write to the index inside one transaction and commits it.
+    ///
+    /// NOTE(review): if an insert fails, the `?` returns before the queue is swapped back, so
+    /// the entries that were queued at the time of the call are dropped.
     pub fn flush(&mut self) -> anyhow::Result<()> {
         // Create a non-allocated hashmap to satisfy borrow checker, then swap it in and out
         let mut queued_pointer_writes = HashMap::with_capacity(0);
         std::mem::swap(&mut self.queued_pointer_writes, &mut queued_pointer_writes);
+
+        let txn = self.connection.transaction()?;
+
         for (chunk_id, pointer) in queued_pointer_writes.drain() {
             let offset_i64 =
                 i64::try_from(pointer.offset).expect("ouch! can't turn u64 into i64...");
-            self.raw_put_chunk_pointer(&chunk_id, pointer.bloblog, offset_i64)?;
+            raw_put_chunk_pointer_txn(&txn, &chunk_id, pointer.bloblog, offset_i64)?;
         }
         std::mem::swap(&mut self.queued_pointer_writes, &mut queued_pointer_writes);
+
+        txn.commit()?;
         Ok(())
     }
 }
@ -191,14 +220,16 @@ impl Inner {
 /// Because random access is important for performance, an additional SQLite database is used
 /// as a map from chunk IDs to their positions in the blob logs, allowing readers to seek to the
 /// appropriate place and read a chunk randomly.
+#[derive(Clone, Debug)]
 pub struct SqliteBloblogPile {
    inner: Arc<Mutex<Inner>>,
    path: PathBuf,
-    writers_reach_zero: Condvar,
+    writers_reach_zero: Arc<Condvar>,
    should_batch_pointer_writes: bool,
 }

 /// A pointer to a blob in a 'blob log'.
+#[derive(Debug)]
 pub struct BloblogPointer {
    /// Which blob log the blob is stored in.
    bloblog: BloblogId,
@ -238,11 +269,18 @@ impl SqliteBloblogPile {
            )?;
        }

+        // Enable WAL mode for significantly better write performance.
+        connection.execute_batch(
+            "
+            PRAGMA journal_mode=WAL;
+        ",
+        )?;
+
        Ok(SqliteBloblogPile {
            inner: Arc::new(Mutex::new(Inner {
                next_bloblog_id: 0,
                writer_bloblogs: Vec::new(),
-                open_bloblogs: HashMap::new(),
+                reader_bloblogs: LruMap::new(BLOBLOG_MAX_READING_FILE_COUNT),
                connection,
                writers_in_progress: 0,
                queued_pointer_writes: Default::default(),
@ -255,23 +293,33 @@ impl SqliteBloblogPile {

+    /// Returns a shared handle to the bloblog with the given ID for reading.
+    ///
+    /// A handle already present in `reader_bloblogs` is reused; otherwise the file named after
+    /// the ID inside the pile directory is opened and the new handle is stored in
+    /// `reader_bloblogs` (created with capacity `BLOBLOG_MAX_READING_FILE_COUNT`).
     fn open_bloblog(&self, bloblog_id: BloblogId) -> anyhow::Result<Arc<Mutex<Bloblog>>> {
         let mut inner = self.inner.lock().unwrap();
-        Ok(match inner.open_bloblogs.entry(bloblog_id) {
-            Entry::Occupied(entry) => entry.get().clone(),
-            Entry::Vacant(entry) => {
+
+        match inner.reader_bloblogs.get(&bloblog_id) {
+            Some(bloblog) => Ok(bloblog.clone()),
+            None => {
                 let bloblog = Arc::new(Mutex::new(Bloblog::open(
                     &self.path.join(&bloblog_id.to_string()),
                 )?));
-                entry.insert(bloblog.clone());
-                bloblog
+                inner.reader_bloblogs.insert(bloblog_id, bloblog.clone());
+                Ok(bloblog)
+            }
         }
-        })
     }

    fn get_writing_bloblog(&self) -> anyhow::Result<(BloblogId, Arc<Mutex<Bloblog>>)> {
        let mut inner = self.inner.lock().unwrap();
-        let writing_bloblog_id: BloblogId = match inner.writer_bloblogs.pop() {
-            None => {
-                loop {
+
+        inner.writers_in_progress += 1;
+
+        if let Some(writing_bloblog) = inner.writer_bloblogs.pop() {
+            // We already have an open bloblog to give back.
+            return Ok(writing_bloblog);
+        }
+
+        // No open bloblogs to reuse; create a new one.
+        // It's very important to create a fresh one here; we definitely don't want to use a file
+        // that someone else is using!
+        let writing_bloblog_id = loop {
            let pre_inc = inner.next_bloblog_id;
            inner.next_bloblog_id += 1;

@ -280,26 +328,18 @@ impl SqliteBloblogPile {
            if !bloblog_path.exists() {
                break pre_inc;
            }
-                }
-            }
-            Some(id) => id,
        };

-        let result = Ok((
-            writing_bloblog_id,
-            match inner.open_bloblogs.entry(writing_bloblog_id) {
-                Entry::Occupied(entry) => entry.get().clone(),
-                Entry::Vacant(entry) => {
        let bloblog = Arc::new(Mutex::new(Bloblog::open(
            &self.path.join(&writing_bloblog_id.to_string()),
        )?));
-                    entry.insert(bloblog.clone());
-                    bloblog
-                }
-            },
-        ));
-        inner.writers_in_progress += 1;
-        result
+
+        // MAYBE FUTURE // Insert a weak reference so we can easily get a reader for this if desired.
+        // inner.open_bloblogs.insert(writing_bloblog_id, Arc::downgrade(&bloblog));
+        // For now, I don't think we actually care about reading a bloblog that we've written
+        // (at least not usually?)
+
+        Ok((writing_bloblog_id, bloblog))
    }

    /// Should be called once the bloblog has been finished writing to for the moment.
@ -312,7 +352,7 @@ impl SqliteBloblogPile {
        let size = bloblog.lock().unwrap().filesize()?;
        let mut inner = self.inner.lock().unwrap();
        if size < MAX_BLOBLOG_REUSE_SIZE {
-            inner.writer_bloblogs.push(id);
+            inner.writer_bloblogs.push((id, bloblog));
        }
        inner.writers_in_progress -= 1;
        if inner.writers_in_progress == 0 {
@ -338,8 +378,33 @@ impl SqliteBloblogPile {
            .optional()?)
    }

+    /// Looks up the bloblog pointer of each chunk ID in `chunk_ids`.
+    ///
+    /// The result has one entry per input ID, in the same order; an entry is `None` when the
+    /// index has no row for that chunk ID. All lookups share one prepared statement and one
+    /// transaction.
+    fn get_chunk_pointers(
+        &self,
+        chunk_ids: &[&[u8]],
+    ) -> anyhow::Result<Vec<Option<BloblogPointer>>> {
+        let mut inner = self.inner.lock().unwrap();
+        let txn = inner.connection.transaction()?;
+        let mut result = Vec::with_capacity(chunk_ids.len());
+        {
+            // Scoped so that the statement is dropped before the transaction is committed.
+            let mut stmt = txn.prepare("SELECT bloblog, offset FROM chunks WHERE chunk_id = ?1")?;
+            for &chunk_id in chunk_ids {
+                let bloglog_pointer: Option<BloblogPointer> = stmt
+                    .query_row(params![chunk_id], |row| {
+                        Ok(BloblogPointer {
+                            bloblog: row.get(0)?,
+                            // Offsets are stored as i64 in the index.
+                            offset: row.get::<_, i64>(1)? as u64,
+                        })
+                    })
+                    .optional()?;
+                result.push(bloglog_pointer);
+            }
+        }
+        txn.commit()?;
+        Ok(result)
+    }
+
+    /// Stores the pointer for `chunk_id` in the index immediately (no batching).
+    ///
+    /// Panics if the pointer's offset does not fit in an `i64`.
     fn put_chunk_pointer(&self, chunk_id: &ChunkId, pointer: BloblogPointer) -> anyhow::Result<()> {
-        let inner = self.inner.lock().unwrap();
+        let mut inner = self.inner.lock().unwrap();
         let offset_i64 = i64::try_from(pointer.offset).expect("ouch! can't turn u64 into i64...");
         inner.raw_put_chunk_pointer(chunk_id, pointer.bloblog, offset_i64)
     }
@ -363,6 +428,441 @@ impl SqliteBloblogPile {
        let mut inner = self.inner.lock().unwrap();
        inner.flush()
    }
+
+    /// Worker loop of the storage pipeline: writes every chunk received on `incoming` into a
+    /// bloblog and records its pointer in the SQLite index.
+    ///
+    /// Pointers are buffered and inserted in batches of `POINTERS_BUFFER_SIZE` inside a single
+    /// transaction; whatever is still buffered is flushed once `incoming` stops yielding chunks.
+    ///
+    /// The worker holds on to one writing bloblog while chunks keep arriving. It hands the
+    /// bloblog back when no chunk arrives within 5 seconds (or the channel is closed), or when
+    /// the bloblog has grown beyond `MAX_BLOBLOG_REUSE_SIZE`.
+    ///
+    /// NOTE(review): on an early return through `?`, the writing bloblog is not handed back via
+    /// `return_writing_bloblog` and the buffered pointers are not flushed.
+    fn storage_pipeline_worker(
+        &self,
+        incoming: Receiver<(ChunkId, Vec<u8>)>,
+    ) -> anyhow::Result<()> {
+        // can hold on to the same bloblog as long as we'd like!
+        const POINTERS_BUFFER_SIZE: usize = 256;
+        let mut pointers_buffered = Vec::with_capacity(POINTERS_BUFFER_SIZE);
+
+        /// Inserts all buffered pointers into the index in one transaction, emptying the buffer.
+        /// Constraint violations are logged and ignored; any other error aborts.
+        fn flush_pointers(
+            this: &SqliteBloblogPile,
+            pointers_buffered: &mut Vec<(ChunkId, BloblogPointer)>,
+        ) -> anyhow::Result<()> {
+            let mut inner = this.inner.lock().unwrap();
+            let txn = inner.connection.transaction()?;
+            {
+                let mut stmt = txn.prepare(
+                    "INSERT OR FAIL INTO chunks (chunk_id, bloblog, offset) VALUES (?1, ?2, ?3)",
+                )?;
+                for (chunk_id, pointer) in pointers_buffered.drain(..) {
+                    match stmt.execute(params![
+                        &chunk_id[..],
+                        pointer.bloblog,
+                        pointer.offset as i64
+                    ]) {
+                        Err(Error::SqliteFailure(e, str))
+                            if e.code == ErrorCode::ConstraintViolation =>
+                        {
+                            warn!(
+                                "(ignoring) SQLite constraint violation on insertion... {:?}",
+                                str
+                            );
+                        }
+                        other => {
+                            other?;
+                        }
+                    }
+                }
+            }
+            txn.commit()?;
+            Ok(())
+        }
+
+        /// Appends one chunk to `bloblog`, buffers its pointer, and flushes the buffer once it
+        /// holds `POINTERS_BUFFER_SIZE` pointers.
+        fn write_blob(
+            this: &SqliteBloblogPile,
+            bloblog_id: BloblogId,
+            bloblog: &mut Bloblog,
+            pointers_buffered: &mut Vec<(ChunkId, BloblogPointer)>,
+            (chunk_id, chunk): (ChunkId, Vec<u8>),
+        ) -> anyhow::Result<()> {
+            let offset = bloblog.write_blob(&chunk_id, &chunk)?;
+            let pointer = BloblogPointer {
+                bloblog: bloblog_id,
+                offset,
+            };
+            pointers_buffered.push((chunk_id, pointer));
+
+            if pointers_buffered.len() >= POINTERS_BUFFER_SIZE {
+                flush_pointers(this, pointers_buffered)?;
+            }
+
+            Ok(())
+        }
+
+        // Outer loop: block until a chunk arrives, then acquire a bloblog to write it to.
+        while let Ok(chunk) = incoming.recv() {
+            let (bloblog_id, bloglog_mutex) = self.get_writing_bloblog()?;
+            let mut bloblog = bloglog_mutex.lock().expect("Failed to lock bloblog?");
+            write_blob(
+                self,
+                bloblog_id,
+                &mut bloblog,
+                &mut pointers_buffered,
+                chunk,
+            )?;
+
+            // Inner loop: keep using the same bloblog for as long as chunks keep arriving.
+            while let Ok(chunk) = incoming.recv_timeout(Duration::from_secs(5)) {
+                write_blob(
+                    self,
+                    bloblog_id,
+                    &mut bloblog,
+                    &mut pointers_buffered,
+                    chunk,
+                )?;
+                if bloblog.filesize()? > MAX_BLOBLOG_REUSE_SIZE {
+                    // get a new bloblog to write with.
+                    break;
+                }
+            }
+
+            // Release the bloblog's own lock before handing it back to the pile.
+            drop(bloblog);
+            self.return_writing_bloblog(bloblog_id, bloglog_mutex)?;
+        }
+
+        info!("Flushing pointers (storage pipeline shutdown).");
+        flush_pointers(self, &mut pointers_buffered)?;
+
+        // we MUST have flushed ALL the pointers by now.
+        assert!(pointers_buffered.is_empty());
+        Ok(())
+    }
+
+    /// Look at the bloblogs in this pile and see where space may be reclaimable if we were to
+    /// compact.
+    ///
+    /// Returns one `BloblogStats` per bloblog that has at least one row in the `chunks` table,
+    /// keyed by bloblog ID. Chunk counts and deleted sizes come from the index; the total size
+    /// comes from the bloblog file's metadata.
+    ///
+    /// Fails if the index cannot be queried or if a referenced bloblog file cannot be stat'ed.
+    ///
+    /// Next step: plan_compaction
+    pub fn analyse_for_compaction(&self) -> anyhow::Result<BTreeMap<BloblogId, BloblogStats>> {
+        let mut inner = self.inner.lock().unwrap();
+        // Lock the database right away.
+        // (The transaction is only read from and is never committed.)
+        let txn = inner
+            .connection
+            .transaction_with_behavior(TransactionBehavior::Exclusive)?;
+        // Per bloblog: number of chunks, number of chunks with a matching `deleted` row, and the
+        // summed recorded size of those deleted chunks.
+        let mut stmt = txn.prepare(
+            "
+            SELECT bloblog, COUNT(c.offset), COUNT(d.offset), SUM(COALESCE(d.size, 0))
+            FROM chunks c LEFT JOIN deleted d USING (bloblog, offset)
+            GROUP BY bloblog
+        ",
+        )?;
+
+        // Row as read from the index, before the on-disk file size has been looked up.
+        struct UnpopulatedBloblogStats {
+            pub bloblog_id: BloblogId,
+            pub chunks_total: u64,
+            pub chunks_deleted: u64,
+            pub bytes_deleted: u64,
+        }
+
+        let unpopul_bloblog_stats = stmt.query_map(NO_PARAMS, |row| {
+            Ok(UnpopulatedBloblogStats {
+                bloblog_id: row.get(0)?,
+                chunks_total: row.get::<_, i64>(1)?.try_into().expect("i64 -> u64"),
+                chunks_deleted: row.get::<_, i64>(2)?.try_into().expect("i64 -> u64"),
+                bytes_deleted: row.get::<_, i64>(3)?.try_into().expect("i64 -> u64"),
+            })
+        })?;
+
+        let mut final_stats = BTreeMap::new();
+
+        for unpopul_stat in unpopul_bloblog_stats {
+            let UnpopulatedBloblogStats {
+                bloblog_id,
+                chunks_total,
+                chunks_deleted,
+                bytes_deleted,
+            } = unpopul_stat?;
+            let bloblog_path = self.path.join(&bloblog_id.to_string());
+            let bytes_total = std::fs::metadata(&bloblog_path)
+                .with_context(|| format!("Failed to get metadata for bloblog: {:?}", bloblog_path))?
+                .size();
+
+            final_stats.insert(
+                bloblog_id,
+                BloblogStats {
+                    chunks_total,
+                    chunks_deleted,
+                    bytes_total,
+                    // Add a slight correction since we can count the blob headers of deleted blobs
+                    // as deleted.
+                    bytes_deleted: bytes_deleted + chunks_deleted * BLOB_HEADER_SIZE,
+                },
+            );
+        }
+
+        Ok(final_stats)
+    }
+
+    /// Look at the analysis of compaction and, using the specified thresholds, come up with a plan
+    /// to perform compaction.
+    ///
+    /// A bloblog is selected when `thresholds.should_replace_bloblog` accepts its stats. The plan
+    /// is only returned non-empty if the selected bloblogs together reclaim at least
+    /// `minimum_to_reclaim` bytes, or at least `minimum_small_bloblogs_to_merge` of them are
+    /// small (allocated size below `cond_if_less_allocated_than`).
+    ///
+    /// May return an empty plan if compaction isn't worthwhile.
+    ///
+    /// Previous step: analyse_for_compaction
+    /// Next step: perform_compaction
+    pub fn plan_compaction(
+        &self,
+        thresholds: &CompactionThresholds,
+        analysis: BTreeMap<BloblogId, BloblogStats>,
+    ) -> anyhow::Result<CompactionPlan> {
+        let bloblogs_to_replace: BTreeMap<BloblogId, BloblogStats> = analysis
+            .into_iter()
+            .filter(|(_id, stats)| thresholds.should_replace_bloblog(stats))
+            .collect();
+        let reclaimable_space: u64 = bloblogs_to_replace
+            .values()
+            .map(|bs| bs.bytes_deleted)
+            .sum();
+        // `bytes_deleted` is an estimate (recorded sizes plus a per-blob header correction), so
+        // saturate instead of underflowing should it ever exceed the file size.
+        let bytes_to_write: u64 = bloblogs_to_replace
+            .values()
+            .map(|bs| bs.bytes_total.saturating_sub(bs.bytes_deleted))
+            .sum();
+        let small_bloblogs: u32 = bloblogs_to_replace
+            .values()
+            .filter(|bs| {
+                bs.bytes_total.saturating_sub(bs.bytes_deleted)
+                    < thresholds.cond_if_less_allocated_than
+            })
+            .count() as u32;
+
+        if reclaimable_space < thresholds.minimum_to_reclaim
+            && small_bloblogs < thresholds.minimum_small_bloblogs_to_merge
+        {
+            // Nothing worth doing: return an empty plan.
+            return Ok(CompactionPlan {
+                bloblogs_to_replace: Default::default(),
+                bytes_to_write: 0,
+                reclaimable_space: 0,
+                small_bloblogs: 0,
+            });
+        }
+
+        Ok(CompactionPlan {
+            bloblogs_to_replace: bloblogs_to_replace.keys().copied().collect(),
+            bytes_to_write,
+            reclaimable_space,
+            small_bloblogs,
+        })
+    }
+
+    /// Given a compaction plan, perform the compaction.
+    /// There shouldn't be any decisions left to be made at this point: just action.
+    ///
+    /// Steps:
+    /// 1. collect the live (not deleted) blobs of every bloblog in the plan;
+    /// 2. copy those blobs into writing bloblogs;
+    /// 3. repoint the index rows of the copied blobs, in one transaction;
+    /// 4. drop the remaining index rows of the old bloblogs, commit, then remove the old files.
+    ///
+    /// `progress` gets `plan.bytes_to_write` as its maximum and is advanced by the payload size
+    /// of every blob copied.
+    ///
+    /// TODO flock the bloblogs to be removed and make readers and writers also flock them too.
+    ///
+    /// TODO find a way to deal with bloblogs that are entirely unreferenced from the index
+    ///      (e.g. bloblogs that weren't written properly, e.g. if compaction fails.)
+    pub fn perform_compaction(
+        &self,
+        mut progress: Box<dyn ProgressTracker>,
+        plan: CompactionPlan,
+    ) -> anyhow::Result<()> {
+        #[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
+        struct ReplacedBlobRow {
+            pub old_bloblog: BloblogId,
+            pub old_offset: u64,
+            pub chunk_id: ChunkId,
+        }
+
+        if plan.bloblogs_to_replace.is_empty() {
+            info!("No compaction to be done.");
+            return Ok(());
+        }
+
+        let mut to_preserve = BTreeSet::new();
+        let mut replacements = BTreeMap::new();
+
+        progress.set_max_size(plan.bytes_to_write);
+
+        // First find all the blobs we need to replace.
+        {
+            let mut inner = self.inner.lock().unwrap();
+            // Lock the database right away.
+            let txn = inner
+                .connection
+                .transaction_with_behavior(TransactionBehavior::Exclusive)?;
+            let mut stmt = txn.prepare(
+                "
+                SELECT chunk_id, c.offset
+                FROM chunks c LEFT JOIN deleted d USING (bloblog, offset)
+                WHERE bloblog = ?1 AND d.offset IS NULL
+            ",
+            )?;
+            for bloblog in plan.bloblogs_to_replace.iter().copied() {
+                to_preserve.extend(
+                    stmt.query_map([bloblog], |row| {
+                        // Propagate column read failures as errors rather than panicking.
+                        let raw_chunk_id: Vec<u8> = row.get(0)?;
+                        let old_offset: i64 = row.get(1)?;
+                        let mut chunk_id = ChunkId::default();
+                        // Panics if the stored chunk ID does not have the length of a `ChunkId`.
+                        chunk_id.copy_from_slice(raw_chunk_id.as_slice());
+                        Ok(ReplacedBlobRow {
+                            old_bloblog: bloblog,
+                            chunk_id,
+                            old_offset: old_offset.try_into().expect("i64 -> u64"),
+                        })
+                    })?
+                    .collect::<Result<Vec<ReplacedBlobRow>, _>>()?,
+                );
+            }
+        }
+
+        // Then make the replacements
+        info!("Rewriting bloblogs...");
+        let mut buf = Vec::new();
+        let mut iterator = to_preserve.into_iter().peekable();
+        // Only acquire a writing bloblog while there is still something left to copy, so that
+        // no bloblog is acquired needlessly once (or if) the work list is empty.
+        while iterator.peek().is_some() {
+            let (new_bloblog_id, bloglog_mutex) = self.get_writing_bloblog()?;
+            let mut new_bloblog = bloglog_mutex.lock().expect("Failed to lock bloblog?");
+
+            while let Some(preserve) = iterator.next() {
+                // Get hold of the old bloblog
+                let old_bloblog = self.open_bloblog(preserve.old_bloblog)?;
+                let mut old_bloblog = old_bloblog.lock().unwrap();
+
+                // Transfer the blob
+                buf.clear();
+                old_bloblog.read_blob(preserve.old_offset, &preserve.chunk_id, &mut buf)?;
+                let new_offset = new_bloblog.write_blob(&preserve.chunk_id, &buf)?;
+
+                // Make a note of the replacement
+                replacements.insert(
+                    preserve,
+                    BloblogPointer {
+                        bloblog: new_bloblog_id,
+                        offset: new_offset,
+                    },
+                );
+
+                progress.inc_progress(buf.len() as u64);
+
+                if new_bloblog.filesize()? > MAX_BLOBLOG_REUSE_SIZE {
+                    // get a new bloblog to write with.
+                    break;
+                }
+            }
+
+            drop(new_bloblog);
+            self.return_writing_bloblog(new_bloblog_id, bloglog_mutex)?;
+        }
+
+        info!("Applying replacements...");
+        {
+            let mut inner = self.inner.lock().unwrap();
+            // Lock the database right away.
+            let txn = inner
+                .connection
+                .transaction_with_behavior(TransactionBehavior::Exclusive)?;
+            let mut stmt = txn.prepare(
+                "
+                UPDATE chunks
+                SET bloblog = ?1, offset = ?2
+                WHERE chunk_id = ?3
+            ",
+            )?;
+
+            for (replacement_row, new_pos) in replacements {
+                ensure!(
+                    stmt.execute(params![
+                        new_pos.bloblog,
+                        new_pos.offset as i64,
+                        &replacement_row.chunk_id as &[u8]
+                    ])? == 1,
+                    "Wrong number of rows updated for replacement!"
+                );
+            }
+
+            drop(stmt);
+            txn.commit().context("committing replacements")?;
+        }
+
+        // TODO fsync new bloblogs
+
+        info!("Deleting old bloblogs...");
+        {
+            let mut inner = self.inner.lock().unwrap();
+            // Lock the database right away.
+            let txn = inner
+                .connection
+                .transaction_with_behavior(TransactionBehavior::Exclusive)?;
+
+            for bloblog_id in plan.bloblogs_to_replace.iter().copied() {
+                let deleted_chunks = txn.execute(
+                    "
+                    DELETE FROM chunks WHERE bloblog = ?1
+                ",
+                    params![bloblog_id],
+                )?;
+
+                let deleted_deleted = txn.execute(
+                    "
+                    DELETE FROM deleted WHERE bloblog = ?1
+                ",
+                    params![bloblog_id],
+                )?;
+
+                // Every chunk still pointing at the old bloblog must be a deleted one; live
+                // chunks were repointed in the previous step.
+                ensure!(deleted_chunks == deleted_deleted, "Undeleted chunks left in bloblog {bloblog_id}: CHUNKS={deleted_chunks} DELETED={deleted_deleted}");
+            }
+
+            // Commit the index changes BEFORE touching the files: if removing a file fails
+            // afterwards we are left with an unreferenced bloblog file, rather than with index
+            // rows that point into a file that no longer exists.
+            txn.commit()
+                .context("committing removal of old bloblogs from the index")?;
+
+            for bloblog_id in plan.bloblogs_to_replace.iter().copied() {
+                let bloblog_path = self.path.join(bloblog_id.to_string());
+                remove_file(&bloblog_path).with_context(|| {
+                    format!("Failed to remove obsolete bloblog: {:?}", bloblog_path)
+                })?;
+            }
+        }
+
+        Ok(())
+    }
+}
+
+/// Per-bloblog statistics, as gathered by `analyse_for_compaction`.
+pub struct BloblogStats {
+    /// Number of rows in the chunk index that point into this bloblog.
+    pub chunks_total: u64,
+    /// Number of those chunks that also have a row in the `deleted` table.
+    pub chunks_deleted: u64,
+    /// Size of the bloblog file on disk, in bytes.
+    pub bytes_total: u64,
+    /// Bytes attributed to deleted blobs: their recorded sizes plus `BLOB_HEADER_SIZE` for each
+    /// deleted blob.
+    pub bytes_deleted: u64,
+}
+
+/// A compaction plan, produced by `plan_compaction` and consumed by `perform_compaction`.
+pub struct CompactionPlan {
+    /// Bloblogs to rewrite and then remove. Empty when compaction is not worthwhile.
+    pub bloblogs_to_replace: BTreeSet<BloblogId>,
+    /// Sum over the selected bloblogs of `bytes_total - bytes_deleted`.
+    pub bytes_to_write: u64,
+    /// Sum over the selected bloblogs of `bytes_deleted`.
+    pub reclaimable_space: u64,
+    /// Number of selected bloblogs whose allocated size is below
+    /// `CompactionThresholds::cond_if_less_allocated_than`.
+    pub small_bloblogs: u32,
+}
+
+/// Thresholds that steer `plan_compaction`: the two `minimum_*` fields decide whether compaction
+/// runs at all, the two `cond_*` fields decide which bloblogs are selected.
+pub struct CompactionThresholds {
+    /// Minimum bytes to be reclaimable overall for compaction to be worthwhile.
+    pub minimum_to_reclaim: u64,
+
+    /// (alternative reason) Minimum number of files to be undersized in order for compaction
+    /// to be worthwhile.
+    /// This gives us a way to make compaction run if we have lots of tiny bloblogs.
+    pub minimum_small_bloblogs_to_merge: u32,
+
+    /// A bloblog will be replaced if the deallocated size is greater than this.
+    pub cond_if_more_deallocated_than: u64,
+
+    /// A bloblog will be replaced if the allocated size is less than this.
+    pub cond_if_less_allocated_than: u64,
+}
+
+impl CompactionThresholds {
+    /// Decides whether a bloblog should be rewritten during compaction.
+    ///
+    /// Returns `true` if the bloblog's allocated size (`bytes_total - bytes_deleted`) is below
+    /// `cond_if_less_allocated_than`, or if its deleted size exceeds
+    /// `cond_if_more_deallocated_than`.
+    pub fn should_replace_bloblog(&self, bloblog_stats: &BloblogStats) -> bool {
+        // `bytes_deleted` is an estimate that includes a per-blob header correction; saturate so
+        // that an over-estimate cannot underflow the unsigned subtraction.
+        let allocated = bloblog_stats
+            .bytes_total
+            .saturating_sub(bloblog_stats.bytes_deleted);
+        // Note that this will also trigger for fully-deallocated files if
+        // `cond_if_less_allocated_than` is non-zero.
+        let is_small = allocated < self.cond_if_less_allocated_than;
+        let has_large_deallocations =
+            bloblog_stats.bytes_deleted > self.cond_if_more_deallocated_than;
+        is_small || has_large_deallocations
+    }
+}
+
+/// Summary counters for a compaction run.
+///
+/// NOTE(review): nothing in the visible code constructs this type. The byte counters are `u32`
+/// and so cannot represent 4 GiB or more, whereas the other compaction byte counts are `u64`;
+/// confirm the intended width.
+pub struct CompactionOutcome {
+    pub bloblogs_deleted: u32,
+    pub bloblogs_created: u32,
+    pub bytes_deleted: u32,
+    pub bytes_created: u32,
 }

 impl Drop for SqliteBloblogPile {
@ -507,6 +1007,59 @@ impl RawPile for SqliteBloblogPile {
            }
        }
    }
+
+    /// Deletes many keys from one keyspace.
+    ///
+    /// Chunks are handled as a batch: the chunk pointers are looked up in one call, grouped
+    /// by bloblog, and then recorded in the `deleted` table inside a single transaction.
+    /// Keys that have no chunk pointer are skipped silently.
+    /// Every other keyspace falls back to calling `delete` once per key.
+    fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
+        if !matches!(kind, Keyspace::Chunk) {
+            // No batched path for this keyspace: delete the keys one at a time.
+            for &key in keys {
+                self.delete(kind, key)?;
+            }
+            return Ok(());
+        }
+
+        let pointers = self
+            .get_chunk_pointers(keys)
+            .context("failed to get chunk pointers")?;
+
+        // Group (offset, raw chunk ID) pairs by bloblog so that each bloblog only needs
+        // to be opened and locked once.
+        let mut offsets_by_bloblog: BTreeMap<BloblogId, Vec<(u64, &[u8])>> = BTreeMap::new();
+        for (maybe_pointer, &raw_id) in pointers.into_iter().zip(keys) {
+            if let Some(pointer) = maybe_pointer {
+                offsets_by_bloblog
+                    .entry(pointer.bloblog)
+                    .or_default()
+                    .push((pointer.offset, raw_id));
+            }
+        }
+
+        // NOTE(review): `self.inner` stays locked while `open_bloblog` is called below;
+        // confirm that `open_bloblog` does not try to take the same lock.
+        let mut inner = self.inner.lock().unwrap();
+        let txn = inner.connection.transaction()?;
+        {
+            // The prepared statement borrows the transaction, so it is scoped to end
+            // before the commit.
+            let mut insert_deleted = txn.prepare(
+                "INSERT OR IGNORE INTO deleted (bloblog, offset, size)
+                            VALUES (?1, ?2, ?3)",
+            )?;
+            for (bloblog_id, entries) in offsets_by_bloblog {
+                let bloblog_mutex = self.open_bloblog(bloblog_id)?;
+                let mut bloblog = bloblog_mutex.lock().unwrap();
+                for (chunk_offset, raw_chunk_id) in entries {
+                    // `copy_from_slice` panics unless the key has exactly the length of
+                    // a `ChunkId`.
+                    let mut chunk_id = ChunkId::default();
+                    chunk_id.copy_from_slice(raw_chunk_id);
+                    let size = bloblog.blob_len(chunk_offset, &chunk_id)?;
+                    let offset_i64 =
+                        i64::try_from(chunk_offset).expect("ouch! can't turn u64 into i64...");
+                    insert_deleted.execute(params![bloblog_id, offset_i64, size])?;
+                }
+            }
+        }
+        txn.commit().context("Failed to commit chunk deletions")?;
+
+        Ok(())
+    }
+
    fn list_keys(
        &self,
        kind: Keyspace,
@ -535,9 +1088,7 @@ impl RawPile for SqliteBloblogPile {
        if inner.writers_in_progress > 0 {
            let _inner = self
                .writers_reach_zero
-                .wait_while(inner, |inner| {
-                    inner.writers_in_progress != 0
-                })
+                .wait_while(inner, |inner| inner.writers_in_progress != 0)
                .unwrap();
        }

@ -548,6 +1099,91 @@ impl RawPile for SqliteBloblogPile {
    fn check_lowlevel(&self) -> anyhow::Result<bool> {
        unimplemented!()
    }
+
+    /// Gathers coarse statistics about this pile for debugging.
+    ///
+    /// The chunk count comes from the `chunks` table. The total chunk size is estimated as
+    /// the on-disk size of all bloblog files, minus the per-chunk framing overhead of live
+    /// and deleted chunks, minus the space recorded in the `deleted` table.
+    /// Minimum and maximum chunk sizes are not tracked and are reported as `None`.
+    ///
+    /// Returns an error if a database query or a directory/metadata read fails.
+    fn debug_statistics(&self) -> anyhow::Result<Option<DebugStatistics>> {
+        let inner = self.inner.lock().unwrap();
+        let chunk_count: i64 =
+            inner
+                .connection
+                .query_row("SELECT COUNT(1) FROM chunks", params![], |row| row.get(0))?;
+
+        let (deleted_chunk_count, deleted_chunk_space): (i64, i64) = inner.connection.query_row(
+            "SELECT COUNT(1), COALESCE(SUM(size), 0) FROM deleted",
+            params![],
+            |row| Ok((row.get(0)?, row.get(1)?)),
+        )?;
+
+        let mut total_on_disk_size: u64 = 0;
+        for dir_entry in read_dir(&self.path)? {
+            let dir_entry = dir_entry?;
+            if !dir_entry.file_type()?.is_file() {
+                continue;
+            }
+            if let Some(name) = dir_entry.file_name().to_str() {
+                if !name.chars().all(|c| c.is_numeric()) {
+                    // bloblogs have numeric names.
+                    continue;
+                }
+                total_on_disk_size += dir_entry.metadata()?.len();
+            }
+        }
+
+        // 32 bytes for the chunk ID.
+        // 4 bytes for the chunk length.
+        let chunk_overhead_per_chunk: u64 = 32 + 4;
+
+        // These are debugging figures, so saturate instead of underflowing: if the files
+        // on disk and the index disagree, plain `-` on `u64` would panic in debug builds
+        // and wrap to a nonsensically large value in release builds.
+        let total_overhead =
+            chunk_overhead_per_chunk.saturating_mul((deleted_chunk_count + chunk_count) as u64);
+        let total_chunk_size = total_on_disk_size
+            .saturating_sub(total_overhead)
+            .saturating_sub(deleted_chunk_space as u64);
+
+        Ok(Some(DebugStatistics {
+            number_of_chunks: chunk_count.try_into().unwrap(),
+            minimum_chunk_size: None,
+            maximum_chunk_size: None,
+            total_chunk_size,
+        }))
+    }
+
+    /// Starts a background thread that stores incoming chunks in this pile.
+    ///
+    /// Returns the sending half of a bounded channel (capacity `settings.writer_input_bound`)
+    /// on which `(ChunkId, chunk bytes)` pairs are submitted. The thread runs
+    /// `storage_pipeline_worker` over the receiving half; if that returns an error, a
+    /// `ControllerMessage::Failure` is sent on `controller_send`.
+    ///
+    /// Returns an error if the worker thread cannot be spawned.
+    fn build_storage_pipeline(
+        &self,
+        settings: StoragePipelineSettings,
+        controller_send: Sender<ControllerMessage>,
+    ) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
+        let (sender, incoming) = crossbeam_channel::bounded(settings.writer_input_bound as usize);
+
+        let this = self.clone();
+
+        thread::Builder::new()
+            .name("SQLBloblogStPpln".to_string())
+            .spawn(move || {
+                if let Err(err) = this.storage_pipeline_worker(incoming) {
+                    controller_send
+                        .send(ControllerMessage::Failure {
+                            // Only needed on the failure path, so it is built here.
+                            worker_id: Arc::new("bloblogwriter".to_string()),
+                            error_message: format!("err {:?}", err),
+                        })
+                        .expect("This is BAD: failed to send failure message to controller.");
+                }
+            })
+            // This function already returns a Result, so report a failed spawn to the
+            // caller instead of panicking.
+            .context("Failed to spawn the SQLite bloblog storage pipeline thread")?;
+
+        Ok(sender)
+    }
+
+    /// Describes this pile's pipeline: a single `Store` stage.
+    fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
+        let stages = vec![PipelineDescription::Store];
+        Ok(stages)
+    }
+
+    /// Computes a sort key that orders chunks by bloblog and then by offset in the bloblog.
+    ///
+    /// Fails if the chunk has no chunk pointer in this pile.
+    fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
+        // Scheme: 24-bit bloblog ID
+        // followed by 40-bit offset
+        const OFFSET_BITS: u32 = 40;
+        const OFFSET_MASK: u64 = (1 << OFFSET_BITS) - 1;
+
+        let pointer = self
+            .get_chunk_pointer(chunk_id)?
+            .context("Can't get chunk ID transfer ordering hint for chunk without pointer.")?;
+
+        let bloblog_part = (pointer.bloblog as u64) << OFFSET_BITS;
+        // Offsets that don't fit in 40 bits are truncated by the mask.
+        let offset_part = pointer.offset & OFFSET_MASK;
+        Ok(bloblog_part | offset_part)
+    }
 }

 struct KeyIterator {
@ -593,9 +1229,10 @@ impl Iterator for KeyIterator {

 #[cfg(test)]
 mod tests {
-    use crate::pile::local_sqlitebloblogs::Bloblog;
    use temp_dir::TempDir;

+    use crate::pile::local_sqlitebloblogs::Bloblog;
+
    #[test]
    pub fn bloblog_read_write_test() {
        let td = TempDir::new().unwrap();
--- a/yama/src/remote.rs
+++ b/yama/src/remote.rs
@ -22,7 +22,7 @@ use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
 use serde::de::DeserializeOwned;
 use serde::{Deserialize, Serialize};

-use crate::pile::Keyspace;
+use crate::pile::{Keyspace, PipelineDescription};

 pub mod requester;
 pub mod responder;
@ -60,6 +60,7 @@ pub enum RequestBody {
    },
    Flush,
    LowLevelCheck,
+    Describe,
    Shutdown,
    Progress {
        current: u64,
@ -73,7 +74,7 @@ pub struct Response {
    body: ResponseBody,
 }

-#[derive(Serialize, Deserialize, Clone)]
+#[derive(Serialize, Deserialize, Clone, Debug)]
 pub enum ResponseBody {
    Success,
    Failed(String),
@ -83,6 +84,7 @@ pub enum ResponseBody {
        batch: Vec<Vec<u8>>,
        next_token: u16,
    },
+    Description(Vec<PipelineDescription>),
 }

 pub fn read_message<R: Read, D: DeserializeOwned>(read: &mut R) -> anyhow::Result<D> {
--- a/yama/src/remote/requester.rs
+++ b/yama/src/remote/requester.rs
@ -4,17 +4,25 @@ use std::sync::{Arc, Mutex};
 use std::thread;
 use std::thread::JoinHandle;

-use anyhow::anyhow;
+use anyhow::{anyhow, bail};
 use crossbeam_channel::{Receiver, Sender};
 use log::{error, info};

-use crate::pile::{Keyspace, RawPile};
+use crate::definitions::ChunkId;
+use crate::pile::{
+    ControllerMessage, Keyspace, PipelineDescription, RawPile, StoragePipelineSettings,
+};
 use crate::remote::{read_message, write_message, Request, RequestBody, Response, ResponseBody};
+use metrics::{
+    gauge, histogram, increment_counter, register_counter, register_gauge, register_histogram, Unit,
+};
 use std::sync::atomic::{AtomicBool, AtomicU16, Ordering};
+use std::time::Instant;

 /// A kind of RawPile which can make requests to a RawPile over a pipe (e.g. TCP socket or an
 /// SSH connection).
 /// The requests are handled by a `Responder` on the other end of the pipe.
+#[derive(Debug)]
 pub struct Requester {
    commands: Sender<(RequestBody, Option<Sender<ResponseBody>>)>,
 }
@ -24,7 +32,13 @@ impl Requester {
        read: R,
        write: W,
    ) -> (Self, Vec<JoinHandle<()>>) {
-        let in_flight: Arc<Mutex<HashMap<u16, Sender<ResponseBody>>>> =
+        register_histogram!(
+            "requester_cmd_response_time_ms",
+            Unit::Milliseconds,
+            "Time between request being issued and a response being received"
+        );
+
+        let in_flight: Arc<Mutex<HashMap<u16, (Sender<ResponseBody>, Instant)>>> =
            Arc::new(Mutex::new(HashMap::new()));
        let (command_sender, command_receiver) = crossbeam_channel::bounded(16);
        let mut handles = Vec::new();
@ -35,22 +49,34 @@ impl Requester {
            // Spawn a reader
            let in_flight = in_flight.clone();
            let shutdown_signal = shutdown_signal.clone();
-            handles.push(thread::spawn(move || {
+            handles.push(
+                thread::Builder::new()
+                    .name("ReqstrReader".to_string())
+                    .spawn(move || {
                        if let Err(e) = Self::reader(read, in_flight, shutdown_signal) {
                            error!("reader failed: {:?}", e);
                        }
-            }));
+                    })
+                    .unwrap(),
+            );
        }

        {
            // Spawn a writer
            let in_flight = in_flight.clone();
            let command_receiver = command_receiver.clone();
-            handles.push(thread::spawn(move || {
-                if let Err(e) = Self::writer(write, in_flight, command_receiver, shutdown_signal) {
+            handles.push(
+                thread::Builder::new()
+                    .name("ReqstrWriter".to_string())
+                    .spawn(move || {
+                        if let Err(e) =
+                            Self::writer(write, in_flight, command_receiver, shutdown_signal)
+                        {
                            error!("writer failed: {:?}", e);
                        }
-            }));
+                    })
+                    .unwrap(),
+            );
        }

        (
@ -62,7 +88,7 @@ impl Requester {
    }

    pub fn new_from_stdio() -> (Self, Vec<JoinHandle<()>>) {
-        let in_flight: Arc<Mutex<HashMap<u16, Sender<ResponseBody>>>> =
+        let in_flight: Arc<Mutex<HashMap<u16, (Sender<ResponseBody>, Instant)>>> =
            Arc::new(Mutex::new(HashMap::new()));
        let (command_sender, command_receiver) = crossbeam_channel::bounded(16);
        let mut handles = Vec::new();
@ -73,26 +99,38 @@ impl Requester {
            // Spawn a reader
            let in_flight = in_flight.clone();
            let shutdown_signal = shutdown_signal.clone();
-            handles.push(thread::spawn(move || {
+            handles.push(
+                thread::Builder::new()
+                    .name("ReqstrReaderSI".to_string())
+                    .spawn(move || {
                        let stdin = stdin();
                        let read = stdin.lock();
                        if let Err(e) = Self::reader(read, in_flight, shutdown_signal) {
                            error!("reader failed: {:?}", e);
                        }
-            }));
+                    })
+                    .unwrap(),
+            );
        }

        {
            // Spawn a writer
            let in_flight = in_flight.clone();
            let command_receiver = command_receiver.clone();
-            handles.push(thread::spawn(move || {
+            handles.push(
+                thread::Builder::new()
+                    .name("ReqstrWriterSO".to_string())
+                    .spawn(move || {
                        let stdout = stdout();
                        let write = stdout.lock();
-                if let Err(e) = Self::writer(write, in_flight, command_receiver, shutdown_signal) {
+                        if let Err(e) =
+                            Self::writer(write, in_flight, command_receiver, shutdown_signal)
+                        {
                            error!("writer failed: {:?}", e);
                        }
-            }));
+                    })
+                    .unwrap(),
+            );
        }

        (
@ -110,7 +148,7 @@ impl Requester {
    /// Thread that reads messages and sends them along.
    fn reader<R: Read>(
        mut read: R,
-        in_flight: Arc<Mutex<HashMap<u16, Sender<ResponseBody>>>>,
+        in_flight: Arc<Mutex<HashMap<u16, (Sender<ResponseBody>, Instant)>>>,
        shutdown_request_channel: Arc<(AtomicU16, AtomicBool)>,
    ) -> anyhow::Result<()> {
        loop {
@ -122,9 +160,18 @@ impl Requester {
                return Ok(());
            }

-            let map = in_flight.lock().or(Err(anyhow!("Mutex poisoned")))?;
-            map.get(&response.response_to)
-                .ok_or(anyhow!("Didn't find response channel..."))?
+            let mut map = in_flight.lock().or(Err(anyhow!("Mutex poisoned")))?;
+
+            // We free up the ID as we get the sender out of the map.
+            let (resp_sender, req_instant) = map
+                .remove(&response.response_to)
+                .ok_or(anyhow!("Didn't find response channel..."))?;
+
+            let req_resp_time_in_millis =
+                Instant::now().duration_since(req_instant).as_millis() as f64;
+            histogram!("requester_cmd_response_time_ms", req_resp_time_in_millis);
+
+            resp_sender
                .send(response.body)
                .or(Err(anyhow!("Failed to send response to channel")))?;
        }
@ -133,7 +180,7 @@ impl Requester {
    /// Thread that writes messages.
    fn writer<W: Write>(
        mut write: W,
-        in_flight: Arc<Mutex<HashMap<u16, Sender<ResponseBody>>>>,
+        in_flight: Arc<Mutex<HashMap<u16, (Sender<ResponseBody>, Instant)>>>,
        command_receiver: Receiver<(RequestBody, Option<Sender<ResponseBody>>)>,
        shutdown_request_channel: Arc<(AtomicU16, AtomicBool)>,
    ) -> anyhow::Result<()> {
@ -144,7 +191,8 @@ impl Requester {
                    .into_iter()
                    .find(|id| !map.contains_key(&id))
                    .expect("No ID found");
-                map.insert(request_id, response_channel);
+                let now = Instant::now();
+                map.insert(request_id, (response_channel, now));
                request_id
            } else {
                0
@ -223,8 +271,7 @@ impl RawPile for Requester {
            ResponseBody::Success => Ok(true),
            ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
            ResponseBody::NotExists => Ok(false),
-            ResponseBody::Data(_) => Err(anyhow!("Received Data for exists.")),
-            ResponseBody::BatchData { .. } => Err(anyhow!("Received BatchData for exists.")),
+            other => Err(anyhow!("Received {:?} for Exists", other)),
        }
    }
    fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
@ -236,7 +283,7 @@ impl RawPile for Requester {
            ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
            ResponseBody::NotExists => Ok(None),
            ResponseBody::Data(data) => Ok(Some(data)),
-            ResponseBody::BatchData { .. } => Err(anyhow!("Received BatchData for read.")),
+            other => Err(anyhow!("Received {:?} for Read", other)),
        }
    }
    fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
@ -247,9 +294,7 @@ impl RawPile for Requester {
        })? {
            ResponseBody::Success => Ok(()),
            ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
-            ResponseBody::NotExists => Err(anyhow!("Received NotExists for write.")),
-            ResponseBody::Data(_) => Err(anyhow!("Received Data for write.")),
-            ResponseBody::BatchData { .. } => Err(anyhow!("Received BatchData for write.")),
+            other => Err(anyhow!("Received {:?} for Write", other)),
        }
    }
    fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
@ -259,11 +304,15 @@ impl RawPile for Requester {
        })? {
            ResponseBody::Success => Ok(()),
            ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
-            ResponseBody::NotExists => Err(anyhow!("Received NotExists for delete.")),
-            ResponseBody::Data(_) => Err(anyhow!("Received Data for delete.")),
-            ResponseBody::BatchData { .. } => Err(anyhow!("Received BatchData for delete.")),
+            other => Err(anyhow!("Received {:?} for Delete", other)),
        }
    }
+    /// Deletes each key in turn with one `delete` request per key, stopping at the first
+    /// error.
+    fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
+        keys.iter().try_for_each(|&key| self.delete(kind, key))
+    }
    fn list_keys(
        &self,
        kind: Keyspace,
@ -275,33 +324,130 @@ impl RawPile for Requester {
                buffer: Vec::with_capacity(0),
            })),
            ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
-            ResponseBody::NotExists => Err(anyhow!("Received NotExists for list_keys.")),
-            ResponseBody::Data(_) => Err(anyhow!("Received Data for list_keys.")),
            ResponseBody::BatchData { batch, next_token } => Ok(Box::new(ListKeyIterator {
                command_sender: self.commands.clone(),
                batch_token: Some(next_token),
                buffer: batch,
            })),
+            other => Err(anyhow!("Received {:?} for List", other)),
        }
    }
    fn flush(&self) -> anyhow::Result<()> {
        match self.request(RequestBody::Flush)? {
            ResponseBody::Success => Ok(()),
            ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
-            ResponseBody::NotExists => Err(anyhow!("Received NotExists for Flush.")),
-            ResponseBody::Data(_) => Err(anyhow!("Received Data for Flush.")),
-            ResponseBody::BatchData { .. } => Err(anyhow!("Received BatchData for Flush.")),
+            other => Err(anyhow!("Received {:?} for Flush", other)),
        }
    }
    fn check_lowlevel(&self) -> anyhow::Result<bool> {
        match self.request(RequestBody::LowLevelCheck)? {
            ResponseBody::Success => Ok(true),
            ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
-            ResponseBody::NotExists => Err(anyhow!("Received NotExists for LowLevelCheck.")),
-            ResponseBody::Data(_) => Err(anyhow!("Received Data for LowLevelCheck.")),
-            ResponseBody::BatchData { .. } => Err(anyhow!("Received BatchData for LowLevelCheck.")),
+            other => Err(anyhow!("Received {:?} for LowLevelCheck", other)),
        }
    }
+
+    /// Starts a background thread that forwards chunks to the remote as `Write` requests.
+    ///
+    /// Returns the sending half of a bounded channel (capacity 128) for submitting
+    /// `(ChunkId, chunk bytes)` pairs. At most `MAX_IN_FLIGHT_WRITES` requests are
+    /// outstanding at once; while at that limit the thread stops taking new chunks, which
+    /// applies backpressure to the submitter. Once the input channel closes, the thread
+    /// waits for the remaining responses and then exits.
+    ///
+    /// The thread panics on any response other than `Success`. Neither argument is used.
+    fn build_storage_pipeline(
+        &self,
+        _settings: StoragePipelineSettings,
+        _controller_send: Sender<ControllerMessage>,
+    ) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
+        /// Upper bound on write requests awaiting a response.
+        const MAX_IN_FLIGHT_WRITES: u32 = 32;
+
+        /// Panics unless the response to a write request is `Success`.
+        fn expect_write_success(response: ResponseBody) {
+            match response {
+                ResponseBody::Success => {
+                    // nop
+                }
+                ResponseBody::Failed(string) => {
+                    panic!("Requester pipeline fail {}", string);
+                }
+                other => panic!("wtf {:?}", other),
+            }
+        }
+
+        // We want to be able to send off multiple write requests at once, but not too many,
+        // so the input channel is bounded to apply backpressure.
+        let (input, receiver) = crossbeam_channel::bounded::<(ChunkId, Vec<u8>)>(128);
+        let command_sender = self.commands.clone();
+
+        register_counter!(
+            "requester_pipeline_cmds_issued",
+            Unit::Count,
+            "Number of write commands issued by the Requester's storage pipeline"
+        );
+        register_gauge!(
+            "requester_pipeline_writes_inflight",
+            Unit::Count,
+            "Number of write commands in-flight"
+        );
+
+        std::thread::Builder::new()
+            .name("ReqStPpln".to_string())
+            .spawn(move || {
+                let (response_tx, response_rx) = crossbeam_channel::bounded::<ResponseBody>(32);
+                let mut in_flight_writes: u32 = 0;
+                let mut pipeline_still_going = true;
+
+                while pipeline_still_going || in_flight_writes > 0 {
+                    gauge!(
+                        "requester_pipeline_writes_inflight",
+                        in_flight_writes as f64
+                    );
+
+                    let accepting_chunks =
+                        pipeline_still_going && in_flight_writes < MAX_IN_FLIGHT_WRITES;
+
+                    // TODO this won't handle channel closure properly.
+                    if accepting_chunks {
+                        crossbeam_channel::select! {
+                            recv(response_rx) -> response => {
+                                in_flight_writes -= 1;
+                                expect_write_success(response.unwrap());
+                            }
+                            recv(receiver) -> incoming => {
+                                match incoming {
+                                    Ok((chunk_id, write)) => {
+                                        in_flight_writes += 1;
+                                        increment_counter!("requester_pipeline_cmds_issued");
+                                        let request = RequestBody::Write {
+                                            kind: Keyspace::Chunk,
+                                            key: chunk_id.to_vec(),
+                                            value: write
+                                        };
+                                        command_sender
+                                            .send((request, Some(response_tx.clone())))
+                                            .unwrap();
+                                    }
+                                    Err(_) => {
+                                        // the input has stopped
+                                        pipeline_still_going = false;
+                                    }
+                                }
+                            }
+                        }
+                    } else {
+                        // Either the pipeline is stopping or we are too busy to accept new
+                        // chunks, so only process responses.
+                        let response = response_rx.recv().unwrap();
+                        in_flight_writes -= 1;
+                        expect_write_success(response);
+                    }
+                }
+            })
+            .unwrap();
+
+        Ok(input)
+    }
+
+    /// Asks the remote for its pipeline description and appends a `Remote` stage to it.
+    ///
+    /// Fails if the request fails, the remote reports a failure, or the remote answers
+    /// with any response other than `Description`.
+    fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
+        let mut stages = match self.request(RequestBody::Describe)? {
+            ResponseBody::Description(description) => description,
+            ResponseBody::Failed(err_msg) => {
+                return Err(anyhow!("Remote failure: {}", err_msg));
+            }
+            other => return Err(anyhow!("Received {:?} for Describe", other)),
+        };
+        stages.push(PipelineDescription::Remote);
+        Ok(stages)
+    }
+
+    /// Always returns an error: a `Requester` provides no transfer ordering hints.
+    fn chunk_id_transfer_ordering_hint(&self, _chunk_id: &ChunkId) -> anyhow::Result<u64> {
+        bail!("You probably shouldn't be using chunk ID transfer ordering hints with a remote.");
+    }
 }

 pub struct ListKeyIterator {
@ -329,8 +475,6 @@ impl Iterator for ListKeyIterator {
                    None
                }
                ResponseBody::Failed(err_msg) => Some(Err(anyhow!("Remote failure: {}", err_msg))),
-                ResponseBody::NotExists => Some(Err(anyhow!("Received NotExists for NextBatch."))),
-                ResponseBody::Data(_) => Some(Err(anyhow!("Received Data for NextBatch."))),
                ResponseBody::BatchData { batch, next_token } => {
                    self.batch_token = Some(next_token);
                    self.buffer = batch;
@ -342,6 +486,7 @@ impl Iterator for ListKeyIterator {
                        None
                    }
                }
+                other => Some(Err(anyhow!("Received {:?} for NextBatch", other))),
            }
        } else {
            None
--- a/yama/src/remote/responder.rs
+++ b/yama/src/remote/responder.rs
@ -9,15 +9,22 @@ use crossbeam_channel::{Receiver, Sender};
 use itertools::Itertools;
 use log::{error, info, warn};

-use crate::pile::RawPile;
+use crate::definitions::ChunkId;
+use crate::pile::{Keyspace, RawPile};
 use crate::progress::ProgressTracker;
 use crate::remote::{read_message, write_message, Request, RequestBody, Response, ResponseBody};

+/// Handle through which a `Responder` hands chunk writes to a storage pipeline instead of
+/// writing them to the pile directly.
+#[derive(Clone)]
+pub struct ResponderWritingPipeline {
+    /// Channel on which `(ChunkId, chunk bytes)` pairs are submitted to the pipeline.
+    pub pipeline_submission: Sender<(ChunkId, Vec<u8>)>,
+}
+
 #[derive(Clone)]
 /// A wrapper for a RawPile which allows a `Requester` to access it over a pipe (e.g. TCP socket or
 /// an SSH connection).
 pub struct Responder {
    continuation_tokens: Arc<Mutex<HashMap<u16, Sender<u16>>>>,
+    writing_pipeline: Option<ResponderWritingPipeline>,
 }

 impl Responder {
@ -32,6 +39,7 @@ impl Responder {
        write: W,
        num_workers: u16,
        pile: Arc<RP>,
+        writing_pipeline: Option<ResponderWritingPipeline>,
        mut progress_bar: PT,
    ) -> (JoinHandle<R>, JoinHandle<W>, Vec<JoinHandle<()>>) {
        let mut handles = Vec::new();
@ -39,32 +47,40 @@ impl Responder {
        let (resp_send, resp_recv) = crossbeam_channel::bounded::<Response>(4);
        let responder = Responder {
            continuation_tokens: Arc::new(Mutex::new(Default::default())),
+            writing_pipeline,
        };

        let r_handle = {
            // spawn the reader
            let work_queue_send = work_queue_send.clone();
            let responder = responder.clone();
-            thread::spawn(move || {
+            thread::Builder::new()
+                .name("RespdrReader".to_string())
+                .spawn(move || {
                    let mut read = read;
-                if let Err(e) = responder.reader(&mut read, work_queue_send, &mut progress_bar) {
+                    if let Err(e) = responder.reader(&mut read, work_queue_send, &mut progress_bar)
+                    {
                        error!("reader failed: {:?}", e);
                    }
                    read
                })
+                .unwrap()
        };

        let w_handle = {
            // spawn the writer
            let resp_recv = resp_recv.clone();
            let responder = responder.clone();
-            thread::spawn(move || {
+            thread::Builder::new()
+                .name("RespdrWriter".to_string())
+                .spawn(move || {
                    let mut write = write;
                    if let Err(e) = responder.writer(&mut write, resp_recv) {
                        error!("writer failed: {:?}", e);
                    }
                    write
                })
+                .unwrap()
        };

        for worker_num in 0..num_workers {
@ -73,11 +89,17 @@ impl Responder {
            let work_queue_recv = work_queue_recv.clone();
            let resp_send = resp_send.clone();
            let pile = pile.clone();
-            handles.push(thread::spawn(move || {
-                if let Err(e) = responder.worker(pile.as_ref(), work_queue_recv, resp_send) {
+            handles.push(
+                thread::Builder::new()
+                    .name("RespdrWorker".to_string())
+                    .spawn(move || {
+                        if let Err(e) = responder.worker(pile.as_ref(), work_queue_recv, resp_send)
+                        {
                            error!("worker {} failed: {:?}", worker_num, e);
                        }
-            }));
+                    })
+                    .unwrap(),
+            );
        }

        (r_handle, w_handle, handles)
@ -181,7 +203,25 @@ impl Responder {
                        }
                    }
                },
-                RequestBody::Write { kind, key, value } => match pile.write(kind, &key, &value) {
+                RequestBody::Write { kind, key, value } => {
+                    if let Some(writing_pipeline) = self
+                        .writing_pipeline
+                        .as_ref()
+                        .filter(|_| kind == Keyspace::Chunk)
+                    {
+                        let mut chunk_id = ChunkId::default();
+                        chunk_id.copy_from_slice(&key[..]);
+                        writing_pipeline
+                            .pipeline_submission
+                            .send((chunk_id, value))?;
+                        // We lie and say it was successful once we submit.
+                        // We'll complain on our side if anything goes wrong, anyway.
+                        Response {
+                            response_to: request.id,
+                            body: ResponseBody::Success,
+                        }
+                    } else {
+                        match pile.write(kind, &key, &value) {
                            Ok(_) => Response {
                                response_to: request.id,
                                body: ResponseBody::Success,
@ -194,7 +234,9 @@ impl Responder {
                                    body: ResponseBody::Failed(err),
                                }
                            }
-                },
+                        }
+                    }
+                }
                RequestBody::Delete { kind, key } => match pile.delete(kind, &key) {
                    Ok(_) => Response {
                        response_to: request.id,
@ -220,7 +262,7 @@ impl Responder {
                                .continuation_tokens
                                .lock()
                                .or(Err(anyhow!("Mutex poisoned")))?;
-                            let batch_token = (0u16..u16::max_value())
+                            let batch_token = (0u16..u16::MAX)
                                .into_iter()
                                .find(|id| !map.contains_key(&id))
                                .expect("No ID found");
@ -307,6 +349,20 @@ impl Responder {
                RequestBody::Progress { .. } => {
                    unreachable!("handled by readea")
                }
+                RequestBody::Describe => match pile.describe_pipeline() {
+                    Ok(description) => Response {
+                        response_to: request.id,
+                        body: ResponseBody::Description(description),
+                    },
+                    Err(err) => {
+                        warn!("Error whilst doing a raw describe_pipeline: {:?}", err);
+                        let err = format!("{:?}", err);
+                        Response {
+                            response_to: request.id,
+                            body: ResponseBody::Failed(err),
+                        }
+                    }
+                },
            };

            responses
--- a/yama/src/tree.rs
+++ b/yama/src/tree.rs
@ -185,7 +185,7 @@ pub fn differentiate_node_in_place(new: &mut TreeNode, old: &TreeNode) -> anyhow
 /// result is in-place.
 ///
 /// Preconditions:
-/// - `old` must be an integrated pointer.
+/// - `old` must be an integrated pointer. (Otherwise this algorithm is not correct.)
 /// - `old` is the parent of `new`
 pub fn integrate_node_in_place(new: &mut TreeNode, old: &TreeNode) -> anyhow::Result<()> {
    if let TreeNode::Directory { children, .. } = new {
--- a/yama/src/utils.rs
+++ b/yama/src/utils.rs
@ -15,6 +15,7 @@ You should have received a copy of the GNU General Public License
 along with Yama.  If not, see <https://www.gnu.org/licenses/>.
 */

+use std::collections::{BTreeMap, BTreeSet};
 use std::fmt::Write;

 pub fn bytes_to_hexstring(chunkid: &[u8]) -> String {
@ -42,3 +43,98 @@ pub fn get_number_of_workers(first_try_env_name: &str) -> u8 {
        }
    }
 }
+
+/// A bounded map that discards its least recently used entry when it grows
+/// past `capacity`.
+///
+/// "Recently used" is measured with a logical counter, not wall-clock time:
+/// every successful `get` and every `insert` stamps the touched key with the
+/// current counter value and then increments the counter.
+#[derive(Clone, Debug)]
+pub struct LruMap<K, V> {
+    /// Maximum number of entries retained; `insert` evicts one entry whenever
+    /// a new key makes `items.len()` exceed this.
+    capacity: usize,
+    /// `(stamp, key)` pairs. Because the set is ordered and the stamp comes
+    /// first, the first element is always the least recently used key.
+    last_access: BTreeSet<(u64, K)>,
+    /// Maps each key to its value and the stamp of its most recent access.
+    /// Must stay in sync with `last_access` (one pair there per key here).
+    items: BTreeMap<K, (V, u64)>,
+    /// The next stamp to hand out.
+    counter: u64,
+}
+
+impl<K: Ord + Clone, V> LruMap<K, V> {
+    /// Creates an empty map that retains at most `capacity` entries.
+    ///
+    /// With a capacity of 0, every inserted entry is evicted again before
+    /// `insert` returns.
+    pub fn new(capacity: usize) -> LruMap<K, V> {
+        LruMap {
+            capacity,
+            last_access: BTreeSet::new(),
+            items: BTreeMap::new(),
+            counter: 0,
+        }
+    }
+
+    /// Looks up `key` and, if present, marks it as the most recently used entry.
+    ///
+    /// Takes `&mut self` because a hit updates the access bookkeeping.
+    /// Returns `None` (and changes nothing) if the key is not in the map.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `last_access` and `items` have fallen out of sync.
+    pub fn get(&mut self, key: &K) -> Option<&V> {
+        match self.items.get_mut(key) {
+            Some((value, last_used_instant)) => {
+                // Drop the stale (stamp, key) pair before recording the new one,
+                // so that each key appears in `last_access` exactly once.
+                assert!(
+                    self.last_access.remove(&(*last_used_instant, key.clone())),
+                    "Corrupt LRU map: freshen not correct."
+                );
+                let new_instant = self.counter;
+                self.counter += 1;
+                self.last_access.insert((new_instant, key.clone()));
+                *last_used_instant = new_instant;
+                Some(value)
+            }
+            None => None,
+        }
+    }
+
+    /// Inserts `value` under `key`, marking it as the most recently used entry.
+    ///
+    /// Returns the previous value if `key` was already present. If `key` is new
+    /// and the map now holds more than `capacity` entries, the least recently
+    /// used entry is evicted; the evicted entry is dropped, not returned.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `last_access` and `items` have fallen out of sync.
+    pub fn insert(&mut self, key: K, value: V) -> Option<V> {
+        let new_instant = self.counter;
+        self.counter += 1;
+
+        let retval = match self.items.insert(key.clone(), (value, new_instant)) {
+            Some((old_entry, old_instant)) => {
+                // Replacing an existing key: remove its old access record.
+                assert!(
+                    self.last_access.remove(&(old_instant, key.clone())),
+                    "Corrupt LRU map: insert not correct."
+                );
+                Some(old_entry)
+            }
+            None => None,
+        };
+        self.last_access.insert((new_instant, key));
+
+        if retval.is_none() {
+            // We didn't replace any item, so we have grown by 1.
+            // Check if we need to evict.
+            if self.items.len() > self.capacity {
+                self.evict();
+            }
+        }
+
+        retval
+    }
+
+    /// Removes and returns the least recently used entry as `(key, value)`,
+    /// or `None` if the map is empty.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `last_access` names a key that is missing from `items`.
+    pub fn evict(&mut self) -> Option<(K, V)> {
+        // The first element of the ordered set carries the smallest stamp,
+        // i.e. the entry that was touched longest ago. It is cloned so the
+        // borrow from `iter()` ends before the set is mutated.
+        if let Some(first_entry) = self.last_access.iter().next().cloned() {
+            self.last_access.remove(&first_entry);
+            let (_, key) = first_entry;
+            let (value, _) = self
+                .items
+                .remove(&key)
+                .expect("Corrupt LRU map: last access and items out of sync");
+
+            Some((key, value))
+        } else {
+            None
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::utils::LruMap;
+
+    /// Checks that a `get` refreshes an entry so that a later overflow evicts
+    /// the next-oldest key instead.
+    #[test]
+    fn test_lru_map() {
+        let mut lmap = LruMap::new(3);
+        lmap.insert(1, 1);
+        lmap.insert(2, 1);
+        lmap.insert(3, 1);
+        // Touching key 1 makes key 2 the least recently used entry.
+        assert_eq!(lmap.get(&1), Some(&1));
+        // A fourth key exceeds the capacity of 3, so key 2 is evicted.
+        lmap.insert(4, 1);
+        assert_eq!(lmap.get(&2), None);
+    }
+}
Author	SHA1	Message	Date
Olivier 'reivilibre'	565c99cf8c	Update flake and fix it	2024-05-08 20:41:28 +01:00
Olivier 'reivilibre'	b57dbad890	Simplify flake lock ci/woodpecker/push/build Pipeline failed Details ci/woodpecker/push/release Pipeline was successful Details	2023-04-01 16:57:04 +01:00
Olivier 'reivilibre'	9001177143	Batch up chunk deletions in an attempt to make vacuuming more performant ci/woodpecker/push/build Pipeline failed Details ci/woodpecker/push/release Pipeline was successful Details	2022-11-28 21:03:07 +00:00
Olivier 'reivilibre'	c9d64b2962	Make sure to flush + add some error contexts ci/woodpecker/push/build Pipeline failed Details ci/woodpecker/push/release Pipeline was successful Details	2022-11-21 21:23:38 +00:00
Olivier 'reivilibre'	50ff9bb36a	Fix including trailing empty line as pointer name ci/woodpecker/push/build Pipeline failed Details ci/woodpecker/push/release Pipeline was successful Details	2022-11-20 22:11:05 +00:00
Olivier 'reivilibre'	7e41408815	Add test for incremental backup with mid delete ci/woodpecker/push/build Pipeline failed Details ci/woodpecker/push/release Pipeline was successful Details Just for validation that delete does the right thing	2022-11-20 20:58:45 +00:00
Olivier 'reivilibre'	4072c5ae82	Fix parent not being integrated before being used to differentiate whilst removing a pointer ci/woodpecker/push/build Pipeline is pending Details ci/woodpecker/push/release Pipeline is pending Details	2022-11-20 20:42:26 +00:00
Olivier 'reivilibre'	d3fe111a06	Replace debug rmp with new implementation ci/woodpecker/push/build Pipeline failed Details ci/woodpecker/push/release Pipeline was successful Details	2022-11-20 19:44:21 +00:00
Olivier 'reivilibre'	6e1e173cb6	Implement datman prune	2022-11-20 19:43:20 +00:00
Olivier 'reivilibre'	fcc79ca95d	Hopefully fix descriptors to compare in test ci/woodpecker/push/build Pipeline failed Details ci/woodpecker/push/release Pipeline was successful Details	2022-11-20 10:02:27 +00:00
Olivier 'reivilibre'	c1de1341ef	Tweak wording	2022-11-20 10:02:13 +00:00
Olivier 'reivilibre'	e85c606c95	Make a no-op compaction really a no-op compaction ci/woodpecker/push/build Pipeline failed Details ci/woodpecker/push/release Pipeline was successful Details	2022-11-20 08:57:22 +00:00
Olivier 'reivilibre'	34c619ef41	Fix compact thresholds in tests to demonstrate what we need ci/woodpecker/push/build Pipeline failed Details ci/woodpecker/push/release Pipeline was successful Details	2022-11-19 17:43:27 +00:00
Olivier 'reivilibre'	b9dce3ddfc	rustfmt	2022-11-19 17:42:09 +00:00
Olivier 'reivilibre'	52202874f2	Update images to remove deprecated ones ci/woodpecker/push/build Pipeline failed Details ci/woodpecker/push/release Pipeline was successful Details	2022-11-19 16:47:25 +00:00
Olivier 'reivilibre'	69656131af	Fix linter ci/woodpecker/push/build Pipeline failed Details ci/woodpecker/push/release Pipeline was successful Details	2022-11-19 16:35:36 +00:00
Olivier 'reivilibre'	cc93997230	Linting	2022-11-19 16:35:33 +00:00
Olivier 'reivilibre'	41248fe396	Add tests for yama compact	2022-11-19 16:35:24 +00:00
Olivier 'reivilibre'	e7eb9ef288	Update nix shell to have python	2022-11-19 16:33:14 +00:00
Olivier 'reivilibre'	b5e9e55cad	Add yama compact command	2022-11-19 15:49:09 +00:00
Olivier 'reivilibre'	cf502b7f7e	rustfmt	2022-11-19 15:28:36 +00:00
Olivier 'reivilibre'	58c5c3f039	Add compaction logic	2022-11-19 15:27:41 +00:00
Olivier 'reivilibre'	30b261d172	Add Nix shell for Rust devel	2022-11-19 13:13:19 +00:00
Olivier 'reivilibre'	0811c11c48	Add ability to extract subset of files from yama ci/woodpecker/push/build Pipeline is pending Details ci/woodpecker/push/release Pipeline is pending Details	2022-10-04 20:21:23 +01:00
Olivier 'reivilibre'	aa2722607e	Skip directories with .datmanskip files ci/woodpecker/push/build Pipeline was successful Details ci/woodpecker/push/release Pipeline was successful Details ci/woodpecker/tag/build Pipeline was successful Details ci/woodpecker/tag/release Pipeline was successful Details	2022-07-25 10:35:47 +01:00
Olivier 'reivilibre'	8612804298	Do short exclusions for remote backups (also bump version as protocol version incompatible)	2022-07-25 10:35:47 +01:00
Olivier 'reivilibre'	080875bfce	Add debug log for not descending	2022-07-25 10:35:47 +01:00
Olivier 'reivilibre'	098895d913	Do short exclusions for local backups	2022-07-25 10:35:47 +01:00
Olivier 'reivilibre'	bd5e18bc9f	Extract load_labelling_rules	2022-07-25 10:35:47 +01:00
Olivier 'reivilibre'	e25e92b273	Introduce 'exclusions' parameter to scanner	2022-07-25 10:35:47 +01:00
Olivier 'reivilibre'	4aa1948350	Sort chunk IDs by hint to make pull more efficient ci/woodpecker/push/build Pipeline was successful Details ci/woodpecker/push/release Pipeline was successful Details	2022-07-23 22:43:35 +01:00
Olivier 'reivilibre'	ee9ca73224	Put reader bloblogs in an LRU map to prevent hitting open FD limit	2022-07-23 22:38:29 +01:00
Olivier 'reivilibre'	05c6d3e662	Ignore non-UTF-8 file names instead of panicking ci/woodpecker/push/build Pipeline is pending Details ci/woodpecker/push/release Pipeline is pending Details ci/woodpecker/tag/build Pipeline failed Details ci/woodpecker/tag/release Pipeline was successful Details	2022-07-23 21:55:04 +01:00
Olivier 'reivilibre'	0b84c793bf	Flush chunk pointers in one transaction ci/woodpecker/push/build Pipeline was successful Details ci/woodpecker/push/release Pipeline was successful Details ci/woodpecker/tag/build Pipeline was successful Details ci/woodpecker/tag/release Pipeline was successful Details	2022-06-15 21:13:58 +01:00
Olivier 'reivilibre'	eef22e7009	Use WAL mode in SQLite bloblogs	2022-06-15 21:13:56 +01:00
Olivier 'reivilibre'	332563f5a7	Bump version ci/woodpecker/push/build Pipeline was successful Details ci/woodpecker/push/release Pipeline was successful Details ci/woodpecker/tag/build Pipeline was successful Details ci/woodpecker/tag/release Pipeline was successful Details	2022-06-14 22:55:04 +01:00
Olivier 'reivilibre'	14be0ef0a3	Add a version check to the pull protocol ci/woodpecker/push/build Pipeline was successful Details ci/woodpecker/push/release Pipeline was successful Details	2022-06-14 22:54:32 +01:00
Olivier 'reivilibre'	375d68eb0e	Don't forget the terminator ci/woodpecker/push/build Pipeline is pending Details ci/woodpecker/push/release Pipeline was successful Details	2022-06-14 19:57:34 +01:00
Olivier 'reivilibre'	fc29c6fca1	Add some writer flushes that are probably necessary	2022-06-14 19:57:14 +01:00
Olivier 'reivilibre'	d384b1bcbd	Write down basic implementation of datman pull	2022-06-14 19:56:45 +01:00
Olivier 'reivilibre'	e357547777	Glue together an implementation for the pull responder	2022-06-14 19:54:21 +01:00
Olivier 'reivilibre'	c83e2be66d	Flesh out both sides	2022-06-14 08:54:17 +01:00
Olivier 'reivilibre'	a24778209e	Finish off the basic offering side implementation	2022-06-13 23:27:43 +01:00
Olivier 'reivilibre'	bb8fc355f0	Lay down the basic structure of push/pull offerer	2022-06-13 23:15:46 +01:00
Olivier 'reivilibre'	14fc925dbc	Make existing push/pull legacy	2022-06-13 23:15:34 +01:00
Olivier 'reivilibre'	9e51c2428e	Report the size used by the pile itself in the report	2022-06-13 23:15:34 +01:00
Olivier 'reivilibre'	01c98cb415	Aggregate reports by month and reorder sections	2022-06-04 12:33:14 +01:00
Olivier 'reivilibre'	3637b00f38	Add lz4 to path to ensure the backup helpers can work ci/woodpecker/push/build Pipeline was successful Details ci/woodpecker/push/release Pipeline was successful Details	2022-06-03 11:42:08 +01:00
Olivier 'reivilibre'	25b1e14d84	Make it less strict? ci/woodpecker/push/build Pipeline was successful Details ci/woodpecker/push/release Pipeline was successful Details	2022-06-02 22:26:24 +01:00
Olivier 'reivilibre'	ef70e0998e	Attempt to package up the helpers alongside yama and datman ci/woodpecker/push/build Pipeline was successful Details ci/woodpecker/push/release Pipeline was successful Details	2022-06-02 20:30:22 +01:00
Olivier 'reivilibre'	5fd9a72de8	Convert helpers to Poetry to make them easier to package for NixOS	2022-06-02 20:29:54 +01:00
Olivier 'reivilibre'	4244fb88a7	Update version ci/woodpecker/push/build Pipeline was successful Details ci/woodpecker/push/release Pipeline was successful Details ci/woodpecker/tag/build Pipeline was successful Details ci/woodpecker/tag/release Pipeline was successful Details	2022-06-01 09:51:56 +01:00
Olivier 'reivilibre'	d62e864bee	Don't crash when backing up the root directory because it has no name	2022-06-01 09:16:05 +01:00
Olivier 'reivilibre'	001d626ccd	Update version (protocol changed too) ci/woodpecker/push/build Pipeline was successful Details ci/woodpecker/push/release Pipeline was successful Details ci/woodpecker/tag/build Pipeline was successful Details ci/woodpecker/tag/release Pipeline was successful Details	2022-05-31 09:45:24 +01:00
Olivier 'reivilibre'	af553d1fed	Only scan one filesystem by default (can configure 'cross_filesystems' if needed) ci/woodpecker/push/build Pipeline was successful Details ci/woodpecker/push/release Pipeline was successful Details	2022-05-31 09:39:25 +01:00
Olivier 'reivilibre'	e8fc448ace	Remove needless bare_cnr crate ci/woodpecker/push/build Pipeline was successful Details ci/woodpecker/push/release Pipeline was successful Details ci/woodpecker/tag/build Pipeline was successful Details ci/woodpecker/tag/release Pipeline was successful Details	2022-05-30 23:11:16 +01:00
Olivier 'reivilibre'	4216243dcf	Remove src input to try and avoid getting told off about relative paths ci/woodpecker/push/build Pipeline was successful Details ci/woodpecker/push/release Pipeline was successful Details	2022-05-30 22:46:54 +01:00
Olivier 'reivilibre'	1cd0b9887a	Add a Nix flake ci/woodpecker/push/build Pipeline was successful Details ci/woodpecker/push/release Pipeline was successful Details	2022-05-29 17:43:03 +01:00
Olivier 'reivilibre'	ec8c5ff42d	Add times and disk space to the report ci/woodpecker/push/build Pipeline was successful Details ci/woodpecker/push/release Pipeline was successful Details	2022-05-29 17:24:46 +01:00
Olivier 'reivilibre'	948ca3f2b5	Add a command to show a report of the Datman system	2022-05-29 13:35:15 +01:00
Olivier 'reivilibre'	438af9164e	Guard the Requester so that the Responder can't do whatever it wants	2022-05-29 09:45:02 +01:00
Olivier 'reivilibre'	e1c6d31ee3	Fix: remember to flush	2022-05-29 09:36:54 +01:00
Olivier 'reivilibre'	ac97957394	Remove obsolete comment ci/woodpecker/push/build Pipeline was successful Details ci/woodpecker/push/release Pipeline was successful Details	2022-05-29 09:12:51 +01:00
Olivier 'reivilibre'	23e112b1be	Automatically pull updates for images in CI ci/woodpecker/push/build Pipeline was successful Details ci/woodpecker/push/release Pipeline was successful Details	2022-05-29 08:43:03 +01:00
Olivier 'reivilibre'	8692d83510	Push things around so that the chunking process doesn't need to know about pointers ci/woodpecker/push/build Pipeline failed Details ci/woodpecker/push/release Pipeline was successful Details	2022-05-29 00:10:53 +01:00
Olivier 'reivilibre'	f1b73b28ee	Split up store_fully so that we can avoid pointer ops in a restricted context	2022-05-28 23:53:09 +01:00
Olivier 'reivilibre'	db0d9dd493	Add a basic access guard	2022-05-28 23:31:25 +01:00
Olivier 'reivilibre'	081a1922c7	Separate out the pointer operations	2022-05-28 23:30:52 +01:00
Olivier 'reivilibre'	23b352f936	Remove non-pipelined storage	2022-05-28 23:30:52 +01:00
Olivier 'reivilibre'	d82176075a	Bump up Rust version ci/woodpecker/push/build Pipeline failed Details ci/woodpecker/push/release Pipeline was successful Details	2022-05-28 22:53:35 +01:00
Olivier 'reivilibre'	1803946b4a	Disable ARM64 runners for now	2022-05-28 22:47:59 +01:00
Olivier 'reivilibre'	760626d01e	Add operation to describe the pipeline ci/woodpecker/push/build Pipeline failed Details ci/woodpecker/push/release Pipeline was successful Details	2022-05-28 22:44:36 +01:00
Olivier 'reivilibre'	f4debbc9fe	Fix up clap changes	2022-05-28 22:27:33 +01:00
Olivier 'reivilibre'	f9c0d814c2	Update deps	2022-05-28 22:27:33 +01:00
Olivier 'reivilibre'	a06b393630	Make some remarks about where I'd like to go	2022-05-28 22:14:43 +01:00
Olivier 'reivilibre'	56dafc6b5f	Remove bare_cnr_ssh crate since it does depend a little bit	2022-05-28 21:43:20 +01:00
Olivier 'reivilibre'	0a9cb559bd	Remove transport module as it was obsolete	2022-05-28 21:22:25 +01:00
Olivier 'reivilibre'	00b06963d7	Add tests and useful functionality	2022-05-28 21:21:40 +01:00
Olivier 'reivilibre'	b659a5ddac	Introduce channel handles that can be passed in serde messages	2022-05-28 20:40:04 +01:00
Olivier 'reivilibre'	183f365032	Start a half decent Bare CnR crate	2022-05-28 19:47:08 +01:00
Olivier 'reivilibre'	d0ed984dca	Upgrade some dependencies	2022-05-28 13:21:56 +01:00
Olivier 'reivilibre'	675c8884f9	Start the v0.6.0-alpha.1 'next' branch ci/woodpecker/push/build Pipeline was successful Details ci/woodpecker/push/release Pipeline was successful Details	2022-05-28 13:08:56 +01:00
Olivier 'reivilibre'	60cf81c59c	Use plain old Rust image rather than sccache image continuous-integration/drone the build was successful Details	2022-01-14 21:17:51 +00:00
Olivier 'reivilibre'	2a32184a7e	Use MAX instead of deprecated max_value() continuous-integration/drone the build is running Details	2022-01-13 20:18:27 +00:00
Olivier 'reivilibre'	680c8669d6	Fix request IDs not getting freed up after a response was received	2022-01-13 20:18:11 +00:00
Olivier 'reivilibre'	8f3b211d83	Run cargo fix continuous-integration/drone the build was successful Details	2022-01-11 07:33:26 +00:00
Olivier 'reivilibre'	bedb9785dc	Add some more metrics to the Requester storage pipeline	2022-01-11 07:31:20 +00:00
Olivier 'reivilibre'	b24a0771ed	Fix Requester's storage pipeline not returning to issuing new commands after pausing because too many were in flight	2022-01-11 07:23:40 +00:00
Olivier 'reivilibre'	ff583d7ed0	Include debug information in release builds continuous-integration/drone the build was successful Details	2022-01-11 06:53:41 +00:00
Olivier 'reivilibre'	7e2b13416b	Give a name to all non-main threads	2022-01-10 21:51:17 +00:00
Olivier 'reivilibre'	9a74fa2cdc	Track request-response time in milliseconds in the metrics	2022-01-10 21:25:46 +00:00
Olivier 'reivilibre'	660595046b	Use published version of bare-metrics-recorder continuous-integration/drone the build was successful Details	2022-01-09 10:43:30 +00:00
Olivier 'reivilibre'	3c09e741d4	Add some metrics and emit metric logs from Datman continuous-integration/drone the build failed Details	2022-01-09 07:43:13 +00:00
Olivier 'reivilibre'	bf150d61d8	Prevent duplicate chunk insertions in new pipeline continuous-integration/drone the build was successful Details	2021-11-21 12:17:37 +00:00
Olivier 'reivilibre'	a37acf3e74	Actually commit when flushing bloblog pointers	2021-11-21 12:11:51 +00:00
Olivier 'reivilibre'	8eeafa7626	Fix hang after storing continuous-integration/drone the build failed Details	2021-11-21 11:57:14 +00:00
Olivier 'reivilibre'	155d31626e	Exhaust controller receiver on finish so that we don't exit too soon continuous-integration/drone the build failed Details	2021-11-20 15:05:22 +00:00
Olivier 'reivilibre'	b4cac57ec5	Try to quieten apt-get more	2021-11-20 14:59:09 +00:00
Olivier 'reivilibre'	b055c5559c	Enable pipelined storage continuous-integration/drone the build failed Details	2021-11-20 14:47:40 +00:00
Olivier 'reivilibre'	8802994d96	Add naïve existence checking stage continuous-integration/drone the build was successful Details	2021-11-20 13:29:50 +00:00
Olivier 'reivilibre'	b00a6da993	Make chunking() able to use writing pipelines	2021-11-20 12:57:00 +00:00
Olivier 'reivilibre'	e6d618a29e	Make Responder technically capable of using a writing pipeline	2021-11-20 12:37:39 +00:00
Olivier 'reivilibre'	aaf2ea1493	Allow using pipelined storage for store_fully	2021-11-20 12:02:49 +00:00
Olivier 'reivilibre'	752b07e0b1	Make store() generic over chunk submission target	2021-11-20 11:25:09 +00:00
Olivier 'reivilibre'	202c7c57fd	Make store_worker generic over chunk submission target	2021-11-20 11:23:17 +00:00
Olivier 'reivilibre'	c73ac35df1	Switch over from Pile to ChunkSubmissionTarget	2021-11-20 11:20:34 +00:00
Olivier 'reivilibre'	5442dc582b	Add a generic ChunkSubmissionTarget trait	2021-11-20 11:15:04 +00:00
Olivier 'reivilibre'	3c6c19e126	Calculate chunk ID before submission	2021-11-20 11:02:07 +00:00
Olivier 'reivilibre'	ff52dd74e7	Rename new to new_from_pile.	2021-11-20 10:58:49 +00:00
Olivier 'reivilibre'	4fa300e575	Antilint	2021-11-20 10:41:35 +00:00
Olivier 'reivilibre'	aebd32da4a	Process pipeline stopping properly.	2021-11-20 10:40:59 +00:00
Olivier 'reivilibre'	efa1e6d51f	Add metrics dependency	2021-11-20 10:35:29 +00:00
Olivier 'reivilibre'	005da2589d	Add pipelined requester stage	2021-11-20 10:33:44 +00:00
Olivier 'reivilibre'	0a02eea478	Add primitive pipelined integrity stage	2021-11-20 10:33:30 +00:00
Olivier 'reivilibre'	1c1042e640	Remove unused args	2021-11-20 10:33:18 +00:00
Olivier 'reivilibre'	7b21b43bc6	Pass back the input to the pipeline for the SQLite-indexed bloblog writer	2021-11-19 21:40:37 +00:00
Olivier 'reivilibre'	b93cbe89e0	Implement compressor's pipeline stage	2021-11-19 21:40:20 +00:00
Olivier 'reivilibre'	ea2f48f437	Flush pointers when writing using new high-throughput pipeline	2021-11-19 21:09:18 +00:00
Olivier 'reivilibre'	cc60ae88a4	Start an attempt to create a high-performance chunking pipeline	2021-11-18 09:21:55 +00:00
Olivier 'reivilibre'	ccb50f2dd9	Add some basic pile statistics, as a debug command	2021-11-16 19:55:09 +00:00
Olivier 'reivilibre'	c3c0fdd240	Code formatting continuous-integration/drone the build was successful Details	2021-11-16 07:34:40 +00:00
Olivier 'reivilibre'	066292b6fe	Release the stdio lock before chunking	2021-11-16 07:34:04 +00:00
Olivier 'reivilibre'	c0f4f271ab	Use architecture-specific images due to build issues upstream continuous-integration/drone the build was successful Details	2021-11-14 19:12:06 +00:00