Compare commits

..

51 Commits

Author SHA1 Message Date
0873997f1e Track statistics when integrating pointers
Signed-off-by: Olivier <olivier@librepush.net>
2024-09-29 09:53:24 +01:00
9d06016d06 Add TODOs for performance aspects
Signed-off-by: Olivier <olivier@librepush.net>
2024-09-29 00:10:20 +01:00
b410114523 Add log lines with timings
Signed-off-by: Olivier <olivier@librepush.net>
2024-09-29 00:05:01 +01:00
1e029a1826 Reduce logging in SFTP wormfile implementation
Signed-off-by: Olivier <olivier@librepush.net>
2024-07-21 11:50:08 +01:00
0869aa1afb hack: Allow reading Zstd compression level from env
CI: ci/woodpecker/push/build failed; ci/woodpecker/push/release successful
Signed-off-by: Olivier <olivier@librepush.net>
2024-06-27 22:36:29 +01:00
f17ad6fac3 Update flake and fix it 2024-05-08 20:40:01 +01:00
7728e0b0a1 Fix typo in 'Acquired' 2023-11-04 12:59:17 +00:00
c395a50803 Remove obsolete 'label' field from virtual sources 2023-11-04 12:59:11 +00:00
2551c0e641 Make it clear that 'enter keyring password' is for decryption 2023-11-04 10:28:57 +00:00
87b6530aed Allow specifying a connector inline (with custom key path) in Datman configs 2023-10-03 21:44:50 +01:00
1c2d7957ee Remove pretend password support for SFTP 2023-10-03 21:36:57 +01:00
ecda1e5359 Extract the connector loading part of open_pile 2023-10-03 21:31:46 +01:00
eb9d65b918 Add rust analyser to flake 2023-10-03 21:26:53 +01:00
feb05cfecf Add check routine that checks all chunk hashes 2023-08-15 20:13:17 +01:00
6f0e3de350 Don't use mmap for storing due to concurrency bug scare 2023-08-15 19:53:28 +01:00
d07351d465 Add size hints for Datman Backup on dir trees 2023-08-13 22:12:45 +01:00
9c3ea26ea6 nix flake: Don't strip debug symbols 2023-08-13 18:31:24 +01:00
e9c4400ea5 Add some more async_backtrace tracking and remove redundant block_in_places 2023-08-13 17:41:13 +01:00
6434190cf1 Convert store_file to blocking 2023-08-13 17:41:13 +01:00
27c5f09d0d Convert storage_pipeline_worker to blocking 2023-08-13 17:41:03 +01:00
96deadd270 Remove old yama and datman from tree 2023-08-13 17:41:03 +01:00
2c14654d29 Add a small amount of async_backtrace tracking 2023-08-13 16:30:50 +01:00
a9379dba14 Actually add a limit to prevent infinite buffering memory 2023-08-11 22:19:12 +01:00
e306acd196 Update SFTP client to get fix for infinite buffering memory leak 2023-08-11 21:44:14 +01:00
5137ac0640 Fix ignore rules 2023-08-11 20:31:08 +01:00
6b72672d29 Fix bug in path iteration leading to bug in gradual scans 2023-08-11 20:24:59 +01:00
e85c8c372d Don't special-case the root 2023-08-11 20:20:20 +01:00
31ffb1ce86 Add logging for new bug with gradual scans 2023-08-11 20:18:03 +01:00
22beea0c50 Fix bug when using gradual scans 2023-08-11 20:00:31 +01:00
2e08b2df47 Set Zstd level to 16 2023-08-10 21:33:03 +01:00
c812532541 Add --gradual flag to datman backup commands: allows softlimiting the size of a backup round 2023-08-10 20:02:32 +01:00
00fa9d0951 SFTP wormfile: pull SSH key from YAMA_SSH_KEY if set 2023-08-10 20:02:24 +01:00
1ac9bb6d8d Add yama keyring create command 2023-08-08 21:17:27 +01:00
53886aad46 Only produce warnings if files vanish during store
CI: ci/woodpecker/push/build failed; ci/woodpecker/push/release successful
2023-05-26 23:43:23 +01:00
32e514bd2e Introduce Option<> on file store entries 2023-05-26 23:30:14 +01:00
2b4608e06b Cleanups
CI: ci/woodpecker/push/build failed; ci/woodpecker/push/release successful
2023-05-26 22:58:41 +01:00
1bd46b934d Avoid use of PatriciaMap.insert_str to avoid causing bug 2023-05-26 22:58:37 +01:00
5ffbf4fc1c Add some debug lines for Bad PMap issue 2023-05-26 21:00:22 +01:00
470420665f Tweaks that tracked down SFTP infinite buffer problem
CI: ci/woodpecker/push/build failed; ci/woodpecker/push/release successful
2023-05-22 20:44:24 +01:00
a47924dc80 Fix flake and add 7-prefix so we can use it alongside the stable version
CI: ci/woodpecker/push/build failed; ci/woodpecker/push/release successful
2023-05-20 14:16:13 +01:00
3a2ece31b6 Fix query on empty local cache
CI: ci/woodpecker/push/build failed; ci/woodpecker/push/release successful
2023-05-20 13:22:06 +01:00
70663ad016 Fix progress bar in datman 2023-05-20 13:21:58 +01:00
dabf7c5cf0 overhaul: datman support 2023-05-20 13:11:30 +01:00
8e5649597b overhaul: streaming extract support
CI: ci/woodpecker/push/build failed; ci/woodpecker/push/release successful
2023-05-04 23:56:35 +01:00
00dec17da0 overhaul: streaming store support
CI: ci/woodpecker/push/build failed; ci/woodpecker/push/release successful
2023-05-03 23:50:55 +01:00
a8e1cc45ef CHECKPOINT overhaul 2
CI: ci/woodpecker/push/build failed; ci/woodpecker/push/release successful
2023-05-03 21:03:33 +01:00
5cd2700396 CHECKPOINT overhaul
CI: ci/woodpecker/push/build failed; ci/woodpecker/push/release successful
2023-05-03 22:35:41 +01:00
8d5c373abc Add a rather dodgy untested S3 implementation 2023-04-03 21:09:26 +01:00
a5d6bf3085 Add an untested SFTP implementation of Yama Wormfiles 2023-04-02 22:54:24 +01:00
1fe4d9d2f3 Add yama_wormfile crates
These will be a useful component of the new on-disk storage format
2023-04-02 22:54:24 +01:00
7cd71695bc Start of Yama & Datman v0.7.0-alpha.1 2023-04-02 20:46:47 +01:00
123 changed files with 14484 additions and 11186 deletions

1
.env Normal file

@@ -0,0 +1 @@
DATABASE_URL=sqlite:yama_localcache/testdb.sqlite

3
.envrc

@@ -1,2 +1,3 @@
use nix
#use nix
use flake .

7
.gitignore vendored

@@ -17,3 +17,10 @@ __pycache__
/datman-helper-mysql/datman_helper_mysql.egg-info
/result
.direnv
yama7demo
yamaSFTPdemo
yama_localcache/testdb.sqlite
sftp7demo

3640
Cargo.lock generated

File diff suppressed because it is too large.

Cargo.toml

@@ -2,6 +2,13 @@
members = [
"yama",
"datman",
"yama_wormfile",
"yama_wormfile_fs",
"yama_wormfile_sftp",
"yama_wormfile_s3",
"yama_midlevel_crypto",
"yama_pile",
"yama_localcache",
]
[profile.release]
@@ -9,3 +16,16 @@ members = [
debug = 2
# When this feature stabilises, it will be possible to split the debug information into a file alongside the binary
#split-debuginfo = "packed"
# A few packages benefit from optimisations in the dev profile, otherwise Yama operations are needlessly slowed down.
[profile.dev.package.fastcdc]
opt-level = 2
[profile.dev.package.blake3]
opt-level = 2
# not so obvious with sqlx:
#[profile.dev.package.sqlx]
#opt-level = 2

5
GLOSSARY.md Normal file

@@ -0,0 +1,5 @@
## Internals
* **Chunk**: a piece of a file that is obtained using a Content-Defined Chunking scheme
* **Chunk ID**: the Blake3 hash of the contents of a chunk
* **Metachunk**: a chunk that itself contains (part of) a list of chunks.
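A minimal sketch of how these terms fit together, assuming the `fastcdc` (v2020 API) and `blake3` crates that appear in this workspace's dev-profile overrides; the size thresholds are illustrative choices, not Yama's actual parameters:

```rust
use fastcdc::v2020::FastCDC;

/// Split a buffer into content-defined chunks and derive each chunk's ID
/// as the Blake3 hash of its contents.
fn chunk_ids(data: &[u8]) -> Vec<(blake3::Hash, usize)> {
    // 64 KiB min / 256 KiB average / 1 MiB max: illustrative thresholds only.
    FastCDC::new(data, 64 * 1024, 256 * 1024, 1024 * 1024)
        .map(|chunk| {
            let contents = &data[chunk.offset..chunk.offset + chunk.length];
            (blake3::hash(contents), chunk.length)
        })
        .collect()
}
```

A metachunk is then a serialised list of such chunk IDs, itself stored as a chunk; that recursion is what `RecursiveChunkRef` in the diffs below refers into.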

datman/Cargo.toml

@@ -1,6 +1,6 @@
[package]
name = "datman"
version = "0.6.0-alpha.5"
version = "0.7.0-alpha.1"
authors = ["Olivier 'reivilibre' <olivier@librepush.net>"]
edition = "2021"
repository = "https://bics.ga/reivilibre/yama"
@@ -11,28 +11,29 @@ description = "A chunked and deduplicated backup system using Yama"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
clap = { version = "3.1.18", features = ["derive"] }
crossbeam-channel = "0.5.1"
anyhow = "1.0"
thiserror = "1.0"
serde = { version = "1.0.104", features = ["derive"] }
serde_json = "1.0.64"
toml = "0.5.5"
log = "0.4"
env_logger = "0.7.1"
indicatif = "0.14.0"
arc-interner = "0.5.1"
zstd = "0.11.2" # 0.11.2+zstd.1.5.2
byteorder = "1"
termion = "1.5.6"
glob = "0.3.0"
humansize = "1.1.1"
chrono = "0.4.19"
itertools = "0.10.1"
hostname = "0.3.1"
yama = { path = "../yama", version = "0.6.0-alpha.5" }
metrics = "0.17.1"
bare-metrics-recorder = { version = "0.1.0" }
comfy-table = "6.0.0-rc.1"
libc = "0.2.126"
io-streams = "0.11.0"
eyre = "0.6.8"
clap = { version = "4.2.2", features = ["derive", "env"] }
tracing = "0.1.37"
tracing-subscriber = { version = "0.3.16", features = ["tracing-log", "env-filter"] }
tracing-indicatif = "0.3.0"
indicatif = "0.17.3"
serde = { version = "1.0.160", features = ["derive"] }
serde_json = "1.0.96"
toml = "0.7.3"
tokio = { version = "1.28.0", features = ["fs", "macros", "rt-multi-thread"] }
dashmap = "5.4.0"
chrono = "0.4.24"
users = "0.11.0"
bytesize = "1.2.0"
yama = { version = "0.7.0-alpha.1", path = "../yama" }
yama_pile = { path = "../yama_pile" }
#yama_localcache = { path = "../yama_localcache" }
yama_wormfile = { path = "../yama_wormfile" }
#yama_wormfile_fs = { path = "../yama_wormfile_fs" }
#yama_wormfile_s3 = { path = "../yama_wormfile_s3" }
#yama_wormfile_sftp = { path = "../yama_wormfile_sftp" }
yama_midlevel_crypto = { path = "../yama_midlevel_crypto" }
patricia_tree = "0.5.7"
async-backtrace = "0.2.6"

583
datman/src/backup.rs Normal file

@@ -0,0 +1,583 @@
use crate::descriptor_config::{SourceDescriptor, SourceDescriptorInner, VirtualSourceKind};
use crate::pointer_names::{get_pointer_name_at, POINTER_NAME_DATETIME_SPLITTER};
use bytesize::ByteSize;
use chrono::{DateTime, Utc};
use clap::Args;
use dashmap::DashSet;
use eyre::{bail, ensure, eyre, Context, ContextCompat};
use indicatif::ProgressStyle;
use patricia_tree::PatriciaMap;
use std::cmp::max;
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::io::Write;
use std::path::PathBuf;
use std::process::{Child, Command, Stdio};
use std::sync::Arc;
use std::time::{Instant, SystemTime, UNIX_EPOCH};
use tokio::runtime::Handle;
use tokio::task::JoinSet;
use tracing::{debug, info, info_span, Instrument, Span};
use tracing_indicatif::span_ext::IndicatifSpanExt;
use users::{get_current_gid, get_current_uid};
use yama::pile_with_cache::{PileWithCache, PointerIntegrationStatistics};
use yama::scan::{create_uidgid_lookup_tables, limit_scan_entry_map_to_size};
use yama::storing::{
assemble_and_write_indices, StoragePipeline, StoringBloblogWriters, StoringState,
};
use yama::{scan, PROGRESS_BAR_STYLE};
use yama_midlevel_crypto::chunk_id::ChunkId;
use yama_pile::definitions::{BlobLocator, BloblogId, IndexBloblogEntry, RecursiveChunkRef};
use yama_pile::pointers::Pointer;
use yama_pile::tree::unpopulated::ScanEntry;
use yama_pile::tree::{
assemble_tree_from_scan_entries, differentiate_node_in_place, FilesystemOwnership,
FilesystemPermissions, RootTreeNode, TreeNode,
};
use yama_wormfile::boxed::BoxedWormFileProvider;
#[derive(Args, Clone, Debug)]
pub struct BackupOptions {
/// Number of bytes to back up in one go. Intended for gradually getting a backup started.
/// Supports suffixes like MiB and MB.
/// Applies per-source. Does not apply to virtual sources.
#[clap(long)]
gradual: Option<ByteSize>,
}
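As a usage note (a sketch, not part of the diff): `gradual` is a `bytesize::ByteSize`, so the suffixes mentioned in the doc comment parse via its `FromStr` implementation:

```rust
use bytesize::ByteSize;

fn gradual_limit_example() {
    // "500MiB" (binary) and "500MB" (decimal) both parse; as_u64() yields
    // the byte count used below as the per-source soft limit.
    let limit: ByteSize = "500MiB".parse().unwrap();
    assert_eq!(limit.as_u64(), 500 * 1024 * 1024);
}
```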
pub async fn backup(
pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
sources_to_backup: BTreeMap<String, SourceDescriptor>,
options: &BackupOptions,
) -> eyre::Result<()> {
// Locate suitable parent pointers
let timestart = Instant::now();
let parents_to_use = find_suitable_parent_pointers(&pwc, &sources_to_backup)
.await
.context("failed to look for suitable parent pointers")?;
debug!(
"find_suitable_parent_pointers: {:?}",
Instant::now() - timestart
);
let now = Utc::now();
// (dirtrees) Scan
let timestart = Instant::now();
let dir_sources = scan_dir_sources(&sources_to_backup, parents_to_use, now, options)
.await
.context("failed to scan directory sources")?;
debug!("scan_dir_sources: {:?}", Instant::now() - timestart);
let new_unflushed_chunks: Arc<DashSet<ChunkId>> = Arc::new(Default::default());
// (dirtrees) Start a storage pipeline and submit jobs to it
let task_store_dirs = {
let new_unflushed_chunks = new_unflushed_chunks.clone();
let pwc = pwc.clone();
let bds_span = info_span!("storing");
tokio::spawn(async_backtrace::frame!(async move {
backup_dir_sources(dir_sources, pwc, new_unflushed_chunks)
.await
.context("whilst backing up dir sources")
}
.instrument(bds_span)))
};
// (virtual source streams) Store to bloblog writers
let task_store_virtuals = {
let bvs_span = info_span!("storing_virts");
let new_unflushed_chunks = new_unflushed_chunks.clone();
let pwc = pwc.clone();
tokio::spawn(async_backtrace::frame!(async move {
backup_virtual_sources(&sources_to_backup, now, pwc, new_unflushed_chunks)
.await
.context("whilst backing up virtual sources")
}
.instrument(bvs_span)))
};
let timestart = Instant::now();
let (dir_sources_and_chunkmaps, virt_sources) =
tokio::join!(task_store_dirs, task_store_virtuals);
debug!(
"join!(task_store_dirs, task_store_virtuals): {:?}",
Instant::now() - timestart
);
let dir_sources_and_chunkmaps: BackupDirSourcesReturn = dir_sources_and_chunkmaps??;
let mut virt_sources: Vec<VirtualSourceReturn> = virt_sources??;
let mut chunkmaps = dir_sources_and_chunkmaps.chunkmaps;
for source in &mut virt_sources {
chunkmaps.extend(
std::mem::take(&mut source.chunkmaps)
.into_iter()
.map(|(k, nb)| {
(
k,
IndexBloblogEntry {
chunks: nb,
forgotten_bytes: 0,
},
)
}),
);
}
// Chunkmaps, indices and write pointers
assemble_and_write_indices(&pwc, chunkmaps)
.await
.context("failed to assemble and write indices")?;
info!("All indices stored, writing pointer...");
for (dir_source_prep, chunk_file_map) in dir_sources_and_chunkmaps.dir_source_returns {
// Assemble and write a pointer
let mut tree = assemble_tree_from_scan_entries(
dir_source_prep.new_scan_entry_map,
dir_source_prep.reused_scan_entry_map,
chunk_file_map,
)
.context("failed to assemble tree")?;
let (uids, gids) =
create_uidgid_lookup_tables(&tree).context("failed to create uid/gid tables")?;
if let Some(ref parent_node) = dir_source_prep.parent {
differentiate_node_in_place(&mut tree, &parent_node.root.node)
.context("failed to differentiate?")?;
}
pwc.pile
.write_pointer(
&dir_source_prep.new_pointer_name,
false,
&Pointer {
parent: dir_source_prep.parent_name.clone(),
root: RootTreeNode {
name: dir_source_prep
.path
.file_name()
.map(|oss| oss.to_str())
.flatten()
.unwrap_or("")
.to_owned(),
node: tree,
},
uids,
gids,
},
)
.await
.context("failed to write pointer")?;
}
for virtual_source in virt_sources {
pwc.pile
.write_pointer(&virtual_source.pointer_name, false, &virtual_source.pointer)
.await
.context("failed to write pointer")?;
}
Arc::try_unwrap(pwc)
.map_err(|_| eyre!("pwc still in use; can't close down gracefully"))?
.close()
.await?;
Ok(())
}
/// Given access to a PWC and a map of sources to back up, returns a map of pointer names to use as parents.
/// For virtual sources, no parent is chosen.
/// For directory sources, the most recent pointer from the same source is chosen as a parent.
async fn find_suitable_parent_pointers(
pwc: &PileWithCache<BoxedWormFileProvider>,
sources_to_backup: &BTreeMap<String, SourceDescriptor>,
) -> eyre::Result<BTreeMap<String, (String, Pointer)>> {
let mut result = BTreeMap::new();
let pointers = pwc
.pile
.list_pointers()
.await
.context("failed to list pointers")?;
for (source_name, source) in sources_to_backup.iter() {
if source.is_directory_source() {
let starter = format!("{source_name}{POINTER_NAME_DATETIME_SPLITTER}");
if let Some(most_recent_pointer) = pointers
.iter()
.rev()
.filter(|pn| pn.starts_with(&starter))
.next()
{
debug!("for {source_name:?}, using parent {most_recent_pointer:?}");
let mut stats = PointerIntegrationStatistics::default();
let pointer = pwc
.read_pointer_fully_integrated(&most_recent_pointer, &mut stats)
.await
.context("failed to read parent pointer")?
.context("no parent pointer despite having just listed it")?;
debug!("when loading parent, stats = {stats:?}");
result.insert(
source_name.to_owned(),
(most_recent_pointer.clone(), pointer),
);
}
}
}
Ok(result)
}
struct DirSourcePrep {
/// New entries only.
new_scan_entry_map: PatriciaMap<ScanEntry>,
/// Files: Reused entries only. Directories: can be partially changed but there's no chunking to be done.
reused_scan_entry_map: PatriciaMap<ScanEntry>,
parent_name: Option<String>,
parent: Option<Pointer>,
path: PathBuf,
new_pointer_name: String,
chunk_file_map: PatriciaMap<Option<(RecursiveChunkRef, u64)>>,
}
async fn scan_dir_sources(
sources_to_backup: &BTreeMap<String, SourceDescriptor>,
mut parents: BTreeMap<String, (String, Pointer)>,
now: DateTime<Utc>,
options: &BackupOptions,
) -> eyre::Result<Vec<DirSourcePrep>> {
let mut joinset = JoinSet::new();
for (source_name, source) in sources_to_backup {
if let SourceDescriptorInner::DirectorySource {
path,
cross_filesystems,
ignore,
} = &source.inner
{
let path = path.to_owned();
let ignore = ignore.to_owned();
let (parent_name, parent) = parents.remove(source_name).unzip();
let new_pointer_name = get_pointer_name_at(&source_name, now);
let options = options.clone();
joinset.spawn_blocking(move || -> eyre::Result<DirSourcePrep> {
let timestart = Instant::now();
let scan_entry_map = scan::scan(&path, &ignore).context("Failed to scan")?;
debug!("scan: {:?}", Instant::now() - timestart);
info!(
"size estimate for {:?} (full scan): {}",
path,
summarise_scan_entry_map_size(&scan_entry_map)
);
// TODO This whole section is messy.
// Maybe we should consider combining prepopulate_unmodified and limit_scan_entry_map_to_size
// as the latter might benefit from being able to see what is in the parent pointer...
let (chunk_file_map, pruned_scan_entry_map, prepopulated_scan_entry_map) =
if let Some(ref parent_node) = parent {
let (cfm, pruned, prepopulated) =
scan::prepopulate_unmodified(&parent_node.root.node, &scan_entry_map);
// debug
let pruned_keys: BTreeSet<String> = pruned
.keys()
.map(|b| String::from_utf8(b).unwrap())
.collect();
let prepop_keys: BTreeSet<String> = prepopulated
.keys()
.map(|b| String::from_utf8(b).unwrap())
.collect();
let ix_keys: BTreeSet<&String> =
pruned_keys.intersection(&prepop_keys).collect();
if !ix_keys.is_empty() {
bail!("bug: intersecting prepop and prune keys: {ix_keys:?}");
}
info!(
"size estimate for {:?} (differential): {}",
path,
summarise_scan_entry_map_size(&pruned)
);
(cfm, pruned, prepopulated)
} else {
(
PatriciaMap::<Option<(RecursiveChunkRef, u64)>>::new(),
scan_entry_map,
PatriciaMap::new(),
)
};
let pruned_scan_entry_map = match options.gradual {
Some(gradual_size_limit) => {
let limited = limit_scan_entry_map_to_size(
pruned_scan_entry_map,
gradual_size_limit.as_u64(),
);
info!(
"size estimate for {:?} (gradual/limited): {}",
path,
summarise_scan_entry_map_size(&limited)
);
limited
}
None => pruned_scan_entry_map,
};
Ok(DirSourcePrep {
chunk_file_map,
new_scan_entry_map: pruned_scan_entry_map,
reused_scan_entry_map: prepopulated_scan_entry_map,
parent_name,
parent,
path,
new_pointer_name,
})
});
}
}
let mut result = Vec::new();
while let Some(dsp_res_res) = joinset.join_next().await {
result.push(dsp_res_res??);
}
Ok(result)
}
fn summarise_scan_entry_map_size(scan_entry_map: &PatriciaMap<ScanEntry>) -> String {
let mut num_bytes = 0u64;
for (_, entry) in scan_entry_map.iter() {
num_bytes += match entry {
ScanEntry::NormalFile { size, .. } => max(*size, 4096),
_ => 4096,
};
}
let num_files = scan_entry_map.len();
format!(
"{num_files} files ({})",
ByteSize(num_bytes).to_string_as(true)
)
}
struct BackupDirSourcesReturn {
pub chunkmaps: BTreeMap<BloblogId, IndexBloblogEntry>,
pub dir_source_returns: Vec<(DirSourcePrep, PatriciaMap<Option<(RecursiveChunkRef, u64)>>)>,
}
#[async_backtrace::framed]
async fn backup_dir_sources(
mut dir_sources: Vec<DirSourcePrep>,
pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
new_unflushed_chunks: Arc<DashSet<ChunkId>>,
) -> eyre::Result<BackupDirSourcesReturn> {
let mut chunk_file_maps = Vec::new();
let mut pruned_scan_entry_maps = Vec::new();
// First collect all that stuff together...
for dir_source in &mut dir_sources {
chunk_file_maps.push(std::mem::take(&mut dir_source.chunk_file_map));
}
for dir_source in &dir_sources {
pruned_scan_entry_maps.push(&dir_source.new_scan_entry_map);
}
let store_span = Span::current();
// store_span.pb_set_style(&ProgressStyle::default_bar());
store_span.pb_set_style(
&ProgressStyle::default_bar()
.template(PROGRESS_BAR_STYLE)
.unwrap(),
);
store_span.pb_set_message("storing files");
store_span.pb_set_length(
pruned_scan_entry_maps
.iter()
.map(|pruned_scan_entry_map| {
pruned_scan_entry_map
.values()
.filter(|v| matches!(v, ScanEntry::NormalFile { .. }))
.count() as u64
})
.sum(),
);
//
let (pipeline, pipeline_job_tx) =
StoragePipeline::launch_new(4, pwc.clone(), new_unflushed_chunks).await?;
let dir_sources2 = &dir_sources;
let mut submitted = 0;
let mut completed = 0;
let submitted_mut = &mut submitted;
let (submitter_task, receiver_task) = tokio::join!(
async move {
let pipeline_job_tx = pipeline_job_tx;
for (dir_source_idx, dir_source) in dir_sources2.iter().enumerate() {
for (name_bytes, scan_entry) in pruned_scan_entry_maps[dir_source_idx].iter() {
if let ScanEntry::NormalFile { .. } = scan_entry {
let name = std::str::from_utf8(name_bytes.as_slice())
.context("name is not str")?;
// TODO(bug): if source name is a file, this doesn't work (.join(""))
pipeline_job_tx
.send_async((
(dir_source_idx, name.to_owned()),
dir_source.path.join(name),
))
.await
.map_err(|_| eyre!("unable to send to pipeline."))?;
*submitted_mut += 1;
}
}
}
drop(pipeline_job_tx);
Ok::<_, eyre::Report>(())
},
async {
while let Ok(((dir_source_idx, job_id), file_store_opt)) = pipeline.next_result().await
{
chunk_file_maps[dir_source_idx].insert(&job_id, file_store_opt);
completed += 1;
Span::current().pb_inc(1);
}
// eprintln!("fin rec");
Ok::<_, eyre::Report>(())
}
);
submitter_task?;
receiver_task?;
ensure!(
completed == submitted,
"completed: {completed:?} != submitted {submitted:?}"
);
assert_eq!(dir_sources.len(), chunk_file_maps.len());
let chunkmaps = pipeline.finish_into_chunkmaps().await?;
Ok(BackupDirSourcesReturn {
chunkmaps,
dir_source_returns: dir_sources
.into_iter()
.zip(chunk_file_maps.into_iter())
.collect(),
})
}
async fn backup_virtual_sources(
sources: &BTreeMap<String, SourceDescriptor>,
now: DateTime<Utc>,
pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
new_unflushed_chunks: Arc<DashSet<ChunkId>>,
) -> eyre::Result<Vec<VirtualSourceReturn>> {
let mut joinset: JoinSet<eyre::Result<VirtualSourceReturn>> = JoinSet::new();
for (source_name, source) in sources {
if source.is_virtual_source() {
joinset.spawn(backup_virtual_source(
get_pointer_name_at(source_name, now),
source.clone(),
pwc.clone(),
new_unflushed_chunks.clone(),
));
}
}
let mut results = Vec::new();
while let Some(result_res_res) = joinset.join_next().await {
results.push(result_res_res??);
}
Ok(results)
}
struct VirtualSourceReturn {
pub pointer_name: String,
pub pointer: Pointer,
pub chunkmaps: Vec<(BloblogId, BTreeMap<ChunkId, BlobLocator>)>,
}
async fn backup_virtual_source(
pointer_name: String,
source: SourceDescriptor,
pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
new_unflushed_chunks: Arc<DashSet<ChunkId>>,
) -> eyre::Result<VirtualSourceReturn> {
let SourceDescriptorInner::VirtualSource(virtual_source) = &source.inner else {
bail!("bug: non-VS SDI passed to BVS");
};
let mut storing_state = StoringState::new(pwc.clone(), new_unflushed_chunks, Handle::current())
.await
.context("failed to create storing state")?;
let mut sbw = StoringBloblogWriters::default();
let ((chunkref, size), mut sbw, mut storing_state) = tokio::task::spawn_blocking({
let virtual_source = virtual_source.clone();
move || -> eyre::Result<((RecursiveChunkRef, u64), StoringBloblogWriters, StoringState)> {
let child = open_stdout_backup_process(&virtual_source.extra_args, &virtual_source.helper)?;
Ok((storing_state.store_full_stream(child.stdout.unwrap(), &mut sbw).context("Failed to store stream into Yama pile")?, sbw, storing_state))
}
}).await??;
sbw.finish_bloblogs(&mut storing_state)
.await
.context("Failed to finish bloblogs")?;
let chunkmaps = storing_state.new_bloblogs;
// Assemble and write a pointer
let uid = get_current_uid() as u16;
let gid = get_current_gid() as u16;
let tree = TreeNode::NormalFile {
mtime: SystemTime::now()
.duration_since(UNIX_EPOCH)
.map(|d| d.as_millis() as u64)
.unwrap_or(0),
ownership: FilesystemOwnership { uid, gid },
permissions: FilesystemPermissions { mode: 0o600 },
size,
content: chunkref,
};
let (uids, gids) =
create_uidgid_lookup_tables(&tree).context("failed to create uid/gid tables")?;
let VirtualSourceKind::Stdout { filename } = &virtual_source.kind;
Ok(VirtualSourceReturn {
pointer_name,
pointer: Pointer {
parent: None,
root: RootTreeNode {
name: filename.clone(),
node: tree,
},
uids,
gids,
},
chunkmaps,
})
}
pub fn open_stdout_backup_process(
extra_args: &HashMap<String, toml::Value>,
program_name: &str,
) -> eyre::Result<Child> {
let mut child = Command::new(format!("datman-helper-{}-backup", program_name))
.stdout(Stdio::piped())
.stderr(Stdio::inherit())
.stdin(Stdio::piped())
.spawn()?;
let mut child_stdin = child.stdin.as_mut().unwrap();
serde_json::to_writer(&mut child_stdin, extra_args)?;
child_stdin.flush()?;
// close stdin!
child.stdin = None;
Ok(child)
}
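For context, a minimal helper satisfying this protocol might look as follows. This is a hypothetical `datman-helper-example-backup`, assuming only the contract visible above: the extra args arrive as JSON on stdin (which datman then closes), and the backup payload is written to stdout.

```rust
use std::io::{self, Read, Write};

fn main() -> io::Result<()> {
    // datman writes the extra_args as JSON and then closes our stdin.
    let mut args_json = String::new();
    io::stdin().read_to_string(&mut args_json)?;
    // (A real helper would deserialise args_json and dump e.g. a database.)
    io::stdout().write_all(b"backup payload\n")?;
    Ok(())
}
```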


@@ -15,454 +15,281 @@ You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::fs::File;
use std::io::{BufReader, BufWriter, Write};
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use clap::Parser;
use env_logger::Env;
use anyhow::{bail, Context};
use bare_metrics_recorder::recording::BareMetricsRecorderCore;
use chrono::{DateTime, Local, NaiveDate, NaiveDateTime, TimeZone, Utc};
use datman::commands::backup::{backup_all_sources_to_destination, backup_source_to_destination};
use datman::commands::ilabel::interactive_labelling_session;
use datman::commands::prune::{prune_with_retention_policy, RetentionPolicy};
use datman::commands::{init_descriptor, pushpull};
use datman::descriptor::{load_descriptor, SourceDescriptor};
use datman::get_hostname;
use datman::remote::backup_source_requester::backup_remote_source_to_destination;
use datman::remote::backup_source_responder;
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use itertools::Itertools;
use log::info;
use clap::{Parser, Subcommand};
use datman::backup::{backup, BackupOptions};
use datman::descriptor_config::{
load_descriptor, Descriptor, PilePathOrConnector, SourceDescriptor,
};
use datman::extract::{
extract, load_pointers_for_extraction, merge_roots_for_batch_extract, select_to_extract,
};
use eyre::{bail, Context, ContextCompat};
use std::collections::{BTreeMap, BTreeSet};
use std::path::PathBuf;
use std::str::FromStr;
use yama::commands::load_pile_descriptor;
use yama::operations::legacy_pushpull::{open_pile_with_work_bypass, BypassLevel};
use std::sync::Arc;
use tracing::info;
use tracing_indicatif::IndicatifLayer;
use tracing_subscriber::filter::filter_fn;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;
use tracing_subscriber::Layer;
use yama::debugging::register_sigusr1_backtrace_helper;
use yama::get_hostname;
use yama::open::{open_lock_and_update_cache, open_lock_and_update_cache_with_connector};
use yama::pile_with_cache::PileWithCache;
use yama_wormfile::boxed::BoxedWormFileProvider;
pub const FAILURE_SYMBOL_OBNOXIOUS_FLASHING: &str = "\x1b[5m\x1b[31m⚠ \x1b[25m\x1b[22m";
pub const BOLD: &str = "\x1b[1m";
pub const BOLD_OFF: &str = "\x1b[22m";
pub const WHITE: &str = "\x1b[37m";
pub const RED: &str = "\x1b[31m";
pub const GREEN: &str = "\x1b[32m";
#[derive(Clone, Debug)]
pub struct PileAndPointer {
pub pile_path: Option<PathBuf>,
pub pointer: PointerName,
}
#[derive(Parser)]
#[derive(Clone, Debug)]
#[repr(transparent)]
pub struct PointerName(String);
impl FromStr for PointerName {
type Err = eyre::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
if !s
.chars()
.all(|c| c.is_alphanumeric() || ['_', '+', '-', ':'].contains(&c))
{
bail!("Bad pointer name: {s:?}");
}
Ok(PointerName(s.to_owned()))
}
}
impl FromStr for PileAndPointer {
type Err = eyre::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.split_once(":") {
None => Ok(PileAndPointer {
pile_path: None,
pointer: PointerName::from_str(s)?,
}),
Some((pile_path, pointer)) => Ok(PileAndPointer {
pile_path: Some(PathBuf::from(pile_path)),
pointer: PointerName::from_str(pointer)?,
}),
}
}
}
#[derive(Clone, Debug)]
pub struct PileAndPointerWithSubTree {
pub pile_path: Option<PathBuf>,
pub pointer: PointerName,
// TODO how to represent...
pub sub_tree: String,
}
impl FromStr for PileAndPointerWithSubTree {
type Err = eyre::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let (pile_path, pointer_and_subtree) = match s.split_once(":") {
None => (None, s),
Some((pile_path, pointer)) => (Some(PathBuf::from(pile_path)), pointer),
};
if let Some(slash) = pointer_and_subtree.find('/') {
Ok(PileAndPointerWithSubTree {
pile_path,
pointer: PointerName::from_str(&pointer_and_subtree[0..slash])?,
sub_tree: pointer_and_subtree[slash + 1..].to_owned(),
})
} else {
Ok(PileAndPointerWithSubTree {
pile_path,
pointer: PointerName::from_str(&pointer_and_subtree)?,
sub_tree: String::new(),
})
}
}
}
#[derive(Parser, Clone, Debug)]
pub struct DatmanArgs {
#[arg(long, env = "DATMAN_CONFIG", default_value = "datman.toml")]
config: PathBuf,
#[command(subcommand)]
command: DatmanCommand,
}
#[derive(Subcommand, Clone, Debug)]
pub enum DatmanCommand {
/// Initialise a datman descriptor in this directory.
Init {},
///
Status {},
#[clap(name = "ilabel")]
InteractiveLabelling {
/// Name of the source to label.
source_name: String,
},
#[clap(name = "ibrowse")]
InteractiveBrowsing {
/// Name of the source to browse.
source_name: String,
},
/// Back up a source locally or over the network.
BackupOne {
/// Name of the source to back up.
source_name: String,
/// Name of the destination to back up to.
destination_name: String,
pile_name: String,
#[clap(flatten)]
options: BackupOptions,
},
BackupAll {
/// Name of the remote to back up.
/// Special value 'self' means 'this host only'.
/// Special value 'all' means 'all hosts'.
remote_name: String,
/// Name of the destination to back up to.
destination_name: String,
pile_name: String,
#[clap(flatten)]
options: BackupOptions,
},
Extract {
/// Name of the 'source' to extract
/// Omit for 'all'.
#[clap(short)]
source_name: Option<String>,
/// If specified, will get the first backup after this date.
#[clap(long)]
after: Option<HumanDateTime>,
/// If specified, will get the last backup before this date. The default behaviour is to get the latest.
#[clap(long)]
before: Option<HumanDateTime>,
/// If not specified, time-restricted extractions that don't have a pointer for every source
/// will instead lead to an error.
#[clap(long)]
accept_partial: bool, // TODO unimplemented.
/// Name of the pile to extract from
ExtractOne {
pile_name: String,
/// Place to extract to.
source_name: String,
destination: PathBuf,
/// Skip applying metadata. Might be needed to extract without superuser privileges.
#[clap(long)]
skip_metadata: bool,
},
Report {
/// Name of the pile to report on.
pile_name: String,
/// Don't summarise months.
#[clap(long)]
individual: bool,
},
#[clap(name = "_backup_source_responder")]
InternalBackupSourceResponder,
/// Pulls all pointers from a remote pile to a local pile.
/// Does not yet support label filtering, but will do in the future.
Pull {
/// e.g. 'myserver:main'
remote_and_remote_pile: String,
pile_name: String,
},
/// Applies a retention policy by removing unnecessary backups.
/// Does not reclaim space by itself: use
/// `yama check --apply-gc --shallow`
/// & `yama compact`
/// to do that.
Prune { pile_name: String },
#[clap(name = "_pull_responder_offerer")]
InternalPullResponderOfferer {
datman_path: PathBuf,
ExtractAll {
pile_name: String,
destination: PathBuf,
},
}
pub struct HumanDateTime(pub DateTime<Local>);
const PROGRESS_SPANS: &'static [&'static str] = &[
"store_file",
"storing",
"unpack_files",
"expand_chunkrefs",
"extract_files",
"check_all_chunks",
];
impl FromStr for HumanDateTime {
type Err = anyhow::Error;
#[tokio::main]
pub async fn main() -> eyre::Result<()> {
let indicatif_layer = IndicatifLayer::new();
let stderr_writer = indicatif_layer.get_stderr_writer();
let indicatif_layer = indicatif_layer.with_filter(filter_fn(|span_metadata| {
(span_metadata.target().starts_with("yama") || span_metadata.target().starts_with("datman"))
&& PROGRESS_SPANS.contains(&span_metadata.name())
}));
fn from_str(s: &str) -> Result<Self, Self::Err> {
if let Ok(date_only) = NaiveDate::parse_from_str(s, "%Y-%m-%d") {
let local_date = chrono::offset::Local.from_local_date(&date_only).unwrap();
let local_datetime = local_date.and_hms(0, 0, 0);
Ok(HumanDateTime(local_datetime))
} else if let Ok(date_and_time) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") {
let local_datetime = chrono::offset::Local
.from_local_datetime(&date_and_time)
.unwrap();
Ok(HumanDateTime(local_datetime))
} else if let Ok(date_and_time) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") {
let local_datetime = chrono::offset::Local
.from_local_datetime(&date_and_time)
.unwrap();
Ok(HumanDateTime(local_datetime))
} else {
bail!("Couldn't parse using either format. Use one of: 2021-05-16 OR 2021-05-16T17:42:14 OR 2021-05-16 17:42:14");
}
}
}
tracing_subscriber::registry()
.with(
tracing_subscriber::EnvFilter::try_from_default_env().unwrap_or_else(|_| {
"sqlx=warn,yama=debug,datman=debug,yama_wormfile_sftp=debug,info".into()
}),
)
.with(tracing_subscriber::fmt::layer().with_writer(stderr_writer))
.with(indicatif_layer)
.init();
fn with_obvious_successfail_message<R>(result: anyhow::Result<R>) -> anyhow::Result<R> {
match &result {
Ok(_) => {
eprintln!("Operation {}successful{}.", GREEN, WHITE);
}
Err(error) => {
eprintln!("{:?}", error);
eprintln!(
"{}{}Operation {}{}FAILED{}!{}",
FAILURE_SYMBOL_OBNOXIOUS_FLASHING, WHITE, RED, BOLD, WHITE, BOLD_OFF
);
}
};
result
}
register_sigusr1_backtrace_helper();
fn with_exitcode<R>(result: anyhow::Result<R>) {
match &result {
Ok(_) => {
std::process::exit(0);
}
Err(_) => {
std::process::exit(5);
}
};
}
let args: DatmanArgs = dbg!(DatmanArgs::parse());
fn main() -> anyhow::Result<()> {
env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
let descriptor = load_descriptor(&args.config)
.await
.context("failed to load Datman descriptor")?;
dbg!(&descriptor);
let now = Utc::now();
let (shard, _stopper) = BareMetricsRecorderCore::new(File::create(format!(
"/tmp/datman_{}.baremetrics",
now.format("%F_%H%M%S")
))?)
.start("datman".to_string())?;
shard.install_as_metrics_recorder()?;
let opts: DatmanCommand = DatmanCommand::parse();
match opts {
DatmanCommand::Init {} => {
init_descriptor(Path::new(".")).unwrap();
}
DatmanCommand::Status { .. } => {
unimplemented!();
}
DatmanCommand::InteractiveLabelling { source_name } => {
interactive_labelling_session(Path::new("."), source_name)?;
}
DatmanCommand::InteractiveBrowsing { source_name } => {
datman::commands::ibrowse::session(Path::new("."), source_name)?;
}
match args.command {
DatmanCommand::BackupOne {
source_name,
destination_name,
pile_name,
options,
} => {
let lock_name = format!("{} datman backup {:?}", get_hostname(), source_name);
let pwc = open_destination(&descriptor, &pile_name, lock_name).await?;
let source = descriptor
.sources
.get(&source_name)
.context("no source by that name")?;
let my_hostname = get_hostname();
let descriptor = load_descriptor(Path::new(".")).unwrap();
let source = &descriptor.sources[&source_name];
let destination = &descriptor.piles[&destination_name];
if &source.host != &my_hostname {
bail!(
"Current hostname is {:?}, not {:?} as expected for this source.",
my_hostname,
source.host
);
}
let mut pbar = ProgressBar::with_draw_target(0, ProgressDrawTarget::stdout_with_hz(10));
pbar.set_style(
ProgressStyle::default_bar().template(
"[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}",
),
);
pbar.set_message("storing");
let mut sources_to_backup = BTreeMap::new();
sources_to_backup.insert(source_name.clone(), source.clone());
let is_remote = if let SourceDescriptor::DirectorySource { hostname, .. } = source {
hostname != &my_hostname
} else {
false
};
let result = if is_remote {
backup_remote_source_to_destination(
source,
destination,
&descriptor,
Path::new("."),
&source_name,
&destination_name,
yama::utils::get_number_of_workers("YAMA_CHUNKERS"),
pbar,
)
} else {
backup_source_to_destination(
source,
destination,
&descriptor,
Path::new("."),
&source_name,
&destination_name,
yama::utils::get_number_of_workers("YAMA_CHUNKERS"),
&mut pbar,
)
};
with_exitcode(with_obvious_successfail_message(result))
backup(pwc, sources_to_backup, &options).await?;
}
DatmanCommand::BackupAll {
remote_name,
destination_name,
} => {
let descriptor = load_descriptor(Path::new(".")).unwrap();
let destination = &descriptor.piles[&destination_name];
DatmanCommand::BackupAll { pile_name, options } => {
let lock_name = format!("{} datman backupall", get_hostname());
let pwc = open_destination(&descriptor, &pile_name, lock_name).await?;
let mut pbar = ProgressBar::with_draw_target(0, ProgressDrawTarget::stdout_with_hz(10));
pbar.set_style(
ProgressStyle::default_bar().template(
"[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}",
),
let my_hostname = get_hostname();
let sources_to_backup: BTreeMap<String, SourceDescriptor> = descriptor
.sources
.clone()
.into_iter()
.filter(|(_, source)| &source.host == &my_hostname)
.collect();
if sources_to_backup.len() == 0 {
bail!(
"No sources to back up! The current hostname is {:?}; is it correct?",
my_hostname
);
}
info!(
"Backing up the following {} sources: {:?}",
sources_to_backup.len(),
sources_to_backup.keys().collect::<Vec<_>>()
);
pbar.set_message("storing");
backup_all_sources_to_destination(
destination,
&descriptor,
Path::new("."),
&destination_name,
yama::utils::get_number_of_workers("YAMA_CHUNKERS"),
&mut pbar,
remote_name,
)
.unwrap();
backup(pwc, sources_to_backup, &options).await?;
}
DatmanCommand::Extract {
DatmanCommand::ExtractOne {
pile_name,
source_name,
after,
before,
accept_partial,
destination,
} => {
let lock_name = format!("{} datman extract {:?}", get_hostname(), source_name);
let pwc = open_destination(&descriptor, &pile_name, lock_name).await?;
let mut sources = BTreeSet::new();
sources.insert(source_name.clone());
let selected = select_to_extract(&pwc, sources, None, None, false).await?;
let mut for_extraction = load_pointers_for_extraction(pwc.clone(), selected).await?;
assert_eq!(for_extraction.len(), 1);
let root_node = for_extraction.remove(&source_name).unwrap();
extract(pwc, root_node.node, &destination).await?;
}
DatmanCommand::ExtractAll {
pile_name,
destination,
skip_metadata,
} => {
if !accept_partial {
bail!("Specify --accept-partial until running without it is supported.");
}
let lock_name = format!("{} datman extractall", get_hostname());
let pwc = open_destination(&descriptor, &pile_name, lock_name).await?;
if after.is_some() && before.is_some() {
bail!("Can't specify both before and after!");
}
let before = before.map(|dt| dt.0.with_timezone(&Utc));
let after = after.map(|dt| dt.0.with_timezone(&Utc));
datman::commands::extract::extract(
&destination,
Path::new("."),
source_name.as_ref().map(|x| x.as_ref()),
&pile_name,
before.into(),
after.into(),
!skip_metadata,
!skip_metadata,
!skip_metadata,
yama::utils::get_number_of_workers("YAMA_EXTRACTORS"),
)?;
}
DatmanCommand::InternalBackupSourceResponder => {
info!("Datman responder at {:?}", std::env::current_exe()?);
backup_source_responder::handler_stdio()?;
}
DatmanCommand::Report {
pile_name,
individual,
} => {
let descriptor = load_descriptor(Path::new(".")).unwrap();
let destination = &descriptor.piles[&pile_name];
let report =
datman::commands::report::generate_report(destination, &descriptor, !individual)?;
datman::commands::report::print_filesystem_space(&destination.path)?;
datman::commands::report::print_report(&report)?;
}
DatmanCommand::Pull {
remote_and_remote_pile,
pile_name,
} => {
let (hostname, remote_datman_path, remote_pile_name) = remote_and_remote_pile
.split(':')
.collect_tuple()
.context("You must pull from a remote pile specified as remote:path:pile.")?;
let descriptor = load_descriptor(Path::new(".")).unwrap();
let source = &descriptor.piles[&pile_name];
let pile_desc = load_pile_descriptor(&source.path)?;
let (pile, bypass_raw_pile) = open_pile_with_work_bypass(
&source.path,
&pile_desc,
BypassLevel::CompressionBypass,
)?;
let pbar = ProgressBar::with_draw_target(0, ProgressDrawTarget::stdout_with_hz(10));
pbar.set_style(
ProgressStyle::default_bar().template(
"[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}",
),
);
pbar.set_message("pulling");
let remote_host_descriptor = descriptor
.remote_hosts
.get(hostname)
.ok_or_else(|| anyhow::anyhow!("No remote host by that name: {:?}.", hostname))?;
let mut connection = Command::new("ssh")
.arg(&remote_host_descriptor.user_at_host)
.arg("--")
.arg(
&remote_host_descriptor
.path_to_datman
.as_ref()
.map(|x| x.as_str())
.unwrap_or("datman"),
)
.arg("_pull_responder_offerer")
.arg(remote_datman_path)
.arg(remote_pile_name)
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.stderr(Stdio::inherit())
.spawn()?;
let mut reader = BufReader::new(connection.stdout.take().unwrap());
let mut writer = BufWriter::new(connection.stdin.take().unwrap());
pushpull::accepting_side(
&pile,
&bypass_raw_pile,
&mut reader,
&mut writer,
Box::new(pbar),
)?;
}
DatmanCommand::Prune { pile_name } => {
let descriptor = load_descriptor(Path::new(".")).unwrap();
let retention_policy = descriptor
.retention
.context("No retention policy set in descriptor")?;
let dest_desc = &descriptor.piles[&pile_name];
let pile_desc = load_pile_descriptor(&dest_desc.path)?;
prune_with_retention_policy(
&dest_desc.path,
&pile_desc,
&RetentionPolicy::from_config(retention_policy),
true,
)?;
}
DatmanCommand::InternalPullResponderOfferer {
datman_path,
pile_name,
} => {
let descriptor = load_descriptor(&datman_path).unwrap();
let source = &descriptor.piles[&pile_name];
let pile_desc = load_pile_descriptor(&source.path)?;
let (pile, bypass_raw_pile) = open_pile_with_work_bypass(
&source.path,
&pile_desc,
BypassLevel::CompressionBypass,
)?;
let mut stdin = BufReader::new(io_streams::StreamReader::stdin()?);
let mut stdout = BufWriter::new(io_streams::StreamWriter::stdout()?);
pushpull::offering_side(
&pile,
&bypass_raw_pile,
&mut stdin,
&mut stdout,
Box::new(()),
)?;
stdout.flush()?;
let sources = descriptor.sources.keys().cloned().collect();
let selected = select_to_extract(&pwc, sources, None, None, false).await?;
let for_extraction = load_pointers_for_extraction(pwc.clone(), selected).await?;
let merged_node = merge_roots_for_batch_extract(for_extraction);
extract(pwc, merged_node, &destination).await?;
}
}
Ok(())
}
async fn open_destination(
descriptor: &Descriptor,
pile_name: &str,
lock_name: String,
) -> eyre::Result<Arc<PileWithCache<BoxedWormFileProvider>>> {
let path_or_connector = descriptor
.piles
.get(pile_name)
.context("no pile by that name")?;
match path_or_connector {
PilePathOrConnector::PilePath(path) => {
open_lock_and_update_cache(path.clone(), lock_name).await
}
PilePathOrConnector::PileConnector { scheme, yamakey } => {
open_lock_and_update_cache_with_connector(scheme, pile_name, yamakey, lock_name).await
}
}
}


@@ -1,66 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::collections::HashMap;
use std::fs::File;
use std::io::Write;
use std::path::Path;
use crate::descriptor::{Descriptor, RetentionPolicyConfig, SourceDescriptor};
pub mod backup;
pub mod extract;
pub mod ibrowse;
pub mod ilabel;
pub mod prune;
pub mod pushpull;
pub mod report;
pub fn init_descriptor(path: &Path) -> anyhow::Result<()> {
std::fs::create_dir_all(path)?;
std::fs::create_dir(path.join("labelling"))?;
let mut datman_toml_file = File::create(path.join("datman.toml"))?;
let source: HashMap<String, SourceDescriptor> = Default::default();
/*source.insert("demo1".to_owned(), SourceDescriptor::DirectorySource {
hostname: "demohost1".to_string(),
directory: PathBuf::from("/dev/null")
});
source.insert("demo2".to_owned(), SourceDescriptor::VirtualSource { blah: "".to_string(), label: "wat".to_string() });*/
let bytes = toml::to_vec(&Descriptor {
labels: vec![
"pocket".to_owned(),
"precious".to_owned(),
"bulky".to_owned(),
],
sources: source,
piles: Default::default(),
remote_hosts: Default::default(),
retention: Some(RetentionPolicyConfig {
daily: 14,
weekly: 12,
monthly: 24,
yearly: 9001,
}),
})?;
datman_toml_file.write_all(&bytes)?;
Ok(())
}

datman/src/commands/backup.rs

@@ -1,391 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use crate::descriptor::{Descriptor, DestPileDescriptor, SourceDescriptor, VirtualSourceKind};
use crate::get_hostname;
use crate::labelling::{
label_node, load_labelling_rules, str_to_label, Label, LabellingRules, State,
};
use crate::tree::{scan, FileTree, FileTree1};
use anyhow::{anyhow, bail};
use arc_interner::ArcIntern;
use chrono::{DateTime, NaiveDateTime, TimeZone, Utc};
use log::{info, warn};
use std::collections::{HashMap, HashSet};
use std::fmt::Debug;
use std::io::Write;
use std::path::Path;
use std::process::{Child, Command, Stdio};
use std::sync::Arc;
use yama::chunking::SENSIBLE_THRESHOLD;
use yama::commands::{load_pile_descriptor, open_pile, store_tree_node};
use yama::definitions::{
FilesystemOwnership, FilesystemPermissions, PointerData, RecursiveChunkRef, RootTreeNode,
TreeNode,
};
use yama::progress::ProgressTracker;
pub const POINTER_DATETIME_FORMAT: &'static str = "%F_%T";
pub const POINTER_FIELD_SEPARATOR: char = '+';
pub fn get_pointer_name_at(source_name: &str, datetime: DateTime<Utc>) -> String {
format!(
"{}{}{}",
source_name,
POINTER_FIELD_SEPARATOR,
datetime.format(POINTER_DATETIME_FORMAT).to_string()
)
}
pub fn split_pointer_name(pointer_name: &str) -> Option<(String, DateTime<Utc>)> {
let (source_name, date_time_str) = pointer_name.rsplit_once("+")?;
let date_time = NaiveDateTime::parse_from_str(date_time_str, POINTER_DATETIME_FORMAT).ok()?;
let date_time = Utc.from_utc_datetime(&date_time);
Some((source_name.to_owned(), date_time))
}
pub fn open_stdout_backup_process(
extra_args: &HashMap<String, toml::Value>,
program_name: &str,
) -> anyhow::Result<Child> {
let mut child = Command::new(format!("datman-helper-{}-backup", program_name))
.stdout(Stdio::piped())
.stderr(Stdio::inherit())
.stdin(Stdio::piped())
.spawn()?;
let mut child_stdin = child.stdin.as_mut().unwrap();
serde_json::to_writer(&mut child_stdin, extra_args)?;
child_stdin.flush()?;
// close stdin!
child.stdin = None;
Ok(child)
}
pub fn label_filter_and_convert(
tree: FileTree1<()>,
descriptor: &Descriptor,
source_name: &str,
rules: &LabellingRules,
dest: &DestPileDescriptor,
) -> anyhow::Result<Option<TreeNode>> {
info!("Labelling.");
let mut tree = tree.replace_meta(&None);
let labels = descriptor
.labels
.iter()
.map(|l| Label(ArcIntern::new(l.clone())))
.collect();
label_node("".to_owned(), None, &mut tree, &labels, rules)?;
let included_labels: HashSet<Label> = dest.included_labels.iter().map(str_to_label).collect();
info!("Filtering.");
let mut unlabelled_included = false;
// filter_inclusive includes directories that must exist for the sake of their children.
if !tree.filter_inclusive(&mut |node| {
match node.get_metadata().unwrap() {
None => {
// unlabelled -- include by default for safety
unlabelled_included = true;
true
}
Some(State::Excluded) => {
// don't include excluded things
false
}
Some(State::Labelled(label)) => {
// include things only if we want the label
included_labels.contains(&label)
}
Some(State::Split) => {
// no point retaining this directory if its children aren't going to be!
assert!(
node.is_dir(),
"Non-directories should not be labelled for Split!"
);
false
}
}
}) {
info!("Empty filter. Stopping.");
return Ok(None);
}
if unlabelled_included {
warn!("Unlabelled nodes. They have been included for safety, but you should consider running\n\t'datman ilabel {}'\nat some point to assign labels.", source_name);
}
let root = convert_filetree_to_yamatree(&tree);
Ok(Some(root))
}
pub fn backup_source_to_destination<PT: ProgressTracker>(
source: &SourceDescriptor,
dest: &DestPileDescriptor,
descriptor: &Descriptor,
desc_path: &Path,
source_name: &str,
dest_name: &str,
num_workers: u8,
progress_bar: &mut PT,
) -> anyhow::Result<()> {
match source {
SourceDescriptor::DirectorySource {
hostname: _,
directory,
cross_filesystems,
} => {
info!("Looking to backup {} to {}", source_name, dest_name);
let rules = load_labelling_rules(desc_path, source_name)?;
let exclusions = rules.get_exclusions_set(directory);
info!("Scanning.");
let tree = scan(directory, !*cross_filesystems, &exclusions)?
.ok_or_else(|| anyhow!("Source does not exist."))?;
let absolute_source_path = desc_path.join(directory);
let absolute_dest_path = desc_path.join(&dest.path);
let pile_descriptor = load_pile_descriptor(&absolute_dest_path)?;
let pile = open_pile(&absolute_dest_path, &pile_descriptor)?;
let root = if let Some(root) =
label_filter_and_convert(tree, descriptor, source_name, &rules, dest)?
{
root
} else {
return Ok(());
};
let pointer_name = get_pointer_name_at(&source_name, Utc::now());
if pile.read_pointer(pointer_name.as_str())?.is_some() {
bail!(
"Pointer by name {:?} already exists; refusing to overwrite.",
pointer_name
);
}
info!("Will write as pointer {:?}.", pointer_name);
info!("Searching for suitable parents.");
let mut parent: Option<String> = None;
let prefix = format!("{}+", source_name);
for pointer in pile.list_pointers()?.iter() {
if pointer.starts_with(&prefix) {
match parent.as_ref() {
None => {
parent = Some(pointer.to_owned());
}
Some(cur_parent) => {
if cur_parent < pointer {
parent = Some(pointer.to_owned());
}
}
}
}
}
match parent.as_ref() {
Some(parent) => {
info!("Using parent: {}", parent);
}
None => {
info!("No suitable parent found.");
}
}
info!("Storing using yama.");
yama::operations::storing::store_fully(
Arc::new(pile),
&absolute_source_path,
&pointer_name,
root,
parent,
num_workers,
progress_bar,
)?;
info!("Stored!");
}
SourceDescriptor::VirtualSource {
helper,
label,
kind: VirtualSourceKind::Stdout { filename },
extra_args,
} => {
if !dest.included_labels.contains(label) {
info!("Skipping because the source's label is not included in this destination!");
return Ok(());
}
info!("Starting up process and writing to yama store.");
let absolute_dest_path = desc_path.join(&dest.path);
let pile_descriptor = load_pile_descriptor(&absolute_dest_path)?;
let pile = open_pile(&absolute_dest_path, &pile_descriptor)?;
let pointer_name = get_pointer_name_at(&source_name, Utc::now());
if pile.read_pointer(pointer_name.as_str())?.is_some() {
bail!(
"Pointer by name {:?} already exists; refusing to overwrite.",
pointer_name
);
}
info!("Will write as pointer {:?}.", pointer_name);
let mut chunker = yama::chunking::RecursiveChunker::new(SENSIBLE_THRESHOLD, &pile);
let mut process = open_stdout_backup_process(extra_args, helper)?;
info!("Storing. No progress bar is available for this style of backup yet.");
// this bit does all the magic.
// TODO(feature): progress bar for
std::io::copy(process.stdout.as_mut().unwrap(), &mut chunker)?;
let exit_status = process.wait()?;
if !exit_status.success() {
bail!(
"The process was not successful (exit code {}). Exiting.",
exit_status.code().unwrap()
);
}
let data_chunk_ref = chunker.finish()?;
eprintln!("Stored data! Now writing a pointer...");
let root = TreeNode::NormalFile {
mtime: Utc::now().timestamp_millis() as u64,
ownership: FilesystemOwnership {
uid: u16::MAX,
gid: u16::MAX,
},
permissions: FilesystemPermissions { mode: 0o600 },
content: data_chunk_ref,
};
// very important: store the pointer!
let pointer_chunk_ref = store_tree_node(
&pile,
&RootTreeNode {
name: filename.to_owned(),
node: root,
},
)?;
let pointer_data = PointerData {
chunk_ref: pointer_chunk_ref,
parent_pointer: None,
uid_lookup: Default::default(),
gid_lookup: Default::default(),
};
pile.write_pointer(&pointer_name, &pointer_data)?;
pile.flush()?;
eprintln!("Pointer saved!");
}
}
Ok(())
}
pub fn convert_filetree_to_yamatree<A, B, C, D>(
filetree: &FileTree<A, B, C, D>,
) -> yama::definitions::TreeNode
where
A: Debug + Clone + Eq + PartialEq,
B: Debug + Clone + Eq + PartialEq,
C: Debug + Clone + Eq + PartialEq,
D: Debug + Clone + Eq + PartialEq,
{
match filetree {
FileTree::NormalFile {
mtime,
ownership,
permissions,
meta: _,
} => TreeNode::NormalFile {
mtime: *mtime,
ownership: *ownership,
permissions: *permissions,
content: RecursiveChunkRef {
chunk_id: Default::default(),
depth: 0,
},
},
FileTree::Directory {
ownership,
permissions,
children,
meta: _,
} => TreeNode::Directory {
ownership: *ownership,
permissions: *permissions,
children: children
.iter()
.map(|(k, v)| (k.clone(), convert_filetree_to_yamatree(v)))
.collect(),
},
FileTree::SymbolicLink {
ownership,
target,
meta: _,
} => TreeNode::SymbolicLink {
ownership: *ownership,
target: target.clone(),
},
FileTree::Other(_) => {
panic!("Shouldn't be any Others in the tree.");
}
}
}
pub fn backup_all_sources_to_destination<PT: ProgressTracker>(
dest: &DestPileDescriptor,
descriptor: &Descriptor,
desc_path: &Path,
dest_name: &str,
num_workers: u8,
progress_bar: &mut PT,
restricted_remote_name: String,
) -> anyhow::Result<()> {
let restricted_remote = match restricted_remote_name.as_str() {
"all" => None,
"self" | "this" | "here" => Some(get_hostname()),
other => Some(other.to_string()),
};
for (source_name, source_descriptor) in descriptor.sources.iter() {
if let (Some(source_host), Some(restricted_host)) = (
source_descriptor.get_remote_hostname(),
restricted_remote.as_ref(),
) {
if source_host != restricted_host {
// Skip this one, it wasn't requested right now.
continue;
}
}
backup_source_to_destination(
source_descriptor,
dest,
descriptor,
desc_path,
source_name.as_str(),
dest_name,
num_workers,
progress_bar,
)?;
}
Ok(())
}

datman/src/commands/extract.rs

@@ -1,182 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use crate::commands::backup::POINTER_DATETIME_FORMAT;
use crate::descriptor::load_descriptor;
use anyhow::bail;
use chrono::{DateTime, NaiveDateTime, Utc};
use itertools::Itertools;
use log::{info, warn};
use std::path::Path;
use yama::commands::{load_pile_descriptor, open_pile};
use yama::pile::{Pile, RawPile};
pub type PileT = Pile<Box<dyn RawPile>>;
pub fn extract(
destination: &Path,
descriptor_path: &Path,
source_name: Option<&str>,
pile_name: &str,
before: Option<DateTime<Utc>>,
after: Option<DateTime<Utc>>,
apply_permissions: bool,
apply_mtime: bool,
apply_ownership: bool,
num_workers: u8,
) -> anyhow::Result<()> {
if destination.exists() {
bail!("For now, the destination is not allowed to exist prior to extraction.");
}
let descriptor = load_descriptor(descriptor_path)?;
let dest_descriptor = &descriptor.piles[pile_name];
let dest_pile_path = descriptor_path.join(&dest_descriptor.path);
let pile_descriptor = load_pile_descriptor(&dest_pile_path)?;
let pile = open_pile(&dest_pile_path, &pile_descriptor)?;
std::fs::create_dir_all(&destination)?;
let mut pointers_to_extract = Vec::new();
match source_name {
Some(source_name) => match find_pointer_for_source(source_name, &pile, &before, &after)? {
None => {
bail!(
"No pointer found for {:?} and it's the only one requested.",
source_name
);
}
Some(pointer) => {
pointers_to_extract.push(pointer);
}
},
None => {
for source in descriptor.sources.keys() {
match find_pointer_for_source(source, &pile, &before, &after)? {
None => {
warn!("No pointer found for {:?}! Carrying on anyway...", source);
}
Some(pointer) => {
pointers_to_extract.push(pointer);
}
}
}
}
}
extract_pointers_into_already_created_directory(
destination,
pointers_to_extract,
&pile,
apply_permissions,
apply_mtime,
apply_ownership,
num_workers,
)?;
Ok(())
}
fn find_pointer_for_source(
source_name: &str,
pile: &PileT,
before: &Option<DateTime<Utc>>,
after: &Option<DateTime<Utc>>,
) -> anyhow::Result<Option<String>> {
let mut current_choice: Option<(String, DateTime<Utc>)> = None;
for pointer_name in pile.list_pointers()? {
if let Some((pointer_source_name, encoded_datetime)) =
pointer_name.split('+').collect_tuple()
{
if source_name != pointer_source_name {
// don't accept pointers for other sources!
continue;
}
match NaiveDateTime::parse_from_str(encoded_datetime, POINTER_DATETIME_FORMAT) {
Ok(decoded_datetime) => {
let datetime = DateTime::from_utc(decoded_datetime, Utc);
if let Some(before) = before {
if before < &datetime {
// datetime is after the 'before' time
continue;
}
} else if let Some(after) = after {
if &datetime < after {
// datetime is before the 'after' time
continue;
}
}
match current_choice.as_ref() {
None => current_choice = Some((pointer_name, datetime)),
Some((_current_name, current_datetime)) => {
let should_replace = if after.is_some() {
// if we want the first one after a time, we want the earliest option!
// so replace if new datetime is earlier than current
&datetime < current_datetime
} else {
// replace if new datetime is after current datetime
current_datetime < &datetime
};
if should_replace {
current_choice = Some((pointer_name, datetime));
}
}
}
}
Err(e) => {
warn!(
"Ignoring {:?} because it seems to have a bad datetime: {:?}",
pointer_name, e
);
}
}
}
}
Ok(current_choice.map(|(a, _)| a))
}
fn extract_pointers_into_already_created_directory(
target: &Path,
pointers: Vec<String>,
pile: &PileT,
apply_permissions: bool,
apply_mtime: bool,
apply_ownership: bool,
num_workers: u8,
) -> anyhow::Result<()> {
for pointer in pointers {
info!("Extracting {:?} now.", pointer);
let pointer_target_dir = &target.join(&pointer);
std::fs::create_dir(pointer_target_dir)?;
yama::operations::extracting::extract_from_pointer_name(
pointer_target_dir,
&pointer,
pile,
true,
num_workers,
apply_permissions,
apply_mtime,
apply_ownership,
)?;
}
Ok(())
}
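Editorial sketch (not part of this diff): find_pointer_for_source above relies on pointer names shaped like <source>+<datetime>. With a 'before' bound it keeps the newest pointer at or before the bound; with an 'after' bound, the oldest at or after it. A self-contained sketch of the 'before' case, assuming a chrono dependency as used in the file above; the exact "%Y-%m-%d_%H:%M:%S" format is an assumption inferred from the pointer names in the prune tests further down, and pick_latest_before is an invented name:

use chrono::NaiveDateTime;

// Assumed datetime encoding for pointer names (matches e.g. "alice+2022-09-28_05:00:00").
const FMT: &str = "%Y-%m-%d_%H:%M:%S";

// Newest pointer for `source` whose datetime is at or before `before`.
fn pick_latest_before(pointers: &[&str], source: &str, before: &str) -> Option<String> {
    let before = NaiveDateTime::parse_from_str(before, FMT).ok()?;
    pointers
        .iter()
        .filter_map(|p| {
            let (src, dt) = p.split_once('+')?;
            if src != source {
                return None; // don't accept pointers for other sources
            }
            let dt = NaiveDateTime::parse_from_str(dt, FMT).ok()?;
            (dt <= before).then(|| (dt, p.to_string()))
        })
        .max() // tuples compare by datetime first
        .map(|(_, name)| name)
}

fn main() {
    let ps = ["alice+2022-09-21_05:00:00", "alice+2022-09-28_05:00:00", "bob+2022-09-28_06:00:00"];
    let picked = pick_latest_before(&ps, "alice", "2022-09-27_00:00:00");
    assert_eq!(picked.as_deref(), Some("alice+2022-09-21_05:00:00"));
}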


@@ -1,253 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::collections::BTreeSet;
use std::path::Path;
use anyhow::{anyhow, bail};
use crate::descriptor::{load_descriptor, SourceDescriptor};
use crate::labelling::{
label_node, load_labelling_rules, save_labelling_rules, GlobRule, Label, State,
};
use crate::tree::{scan, FileTree, FileTree1};
use arc_interner::ArcIntern;
use humansize::FileSize;
use std::io::{stdin, stdout, Write};
pub fn calculate_sizes(node: &mut FileTree1<u64>, real_path: &Path) -> anyhow::Result<u64> {
match node {
FileTree::NormalFile { meta, .. } => {
let size = std::fs::metadata(real_path)?.len();
*meta = size;
Ok(size)
}
FileTree::Directory { children, meta, .. } => {
let mut size = 0;
for (name, child) in children.iter_mut() {
size += calculate_sizes(child, &real_path.join(name))?;
}
*meta = size;
Ok(size)
}
FileTree::SymbolicLink { meta, target, .. } => {
*meta = target.len() as u64;
Ok(target.len() as u64)
}
FileTree::Other(_) => Ok(0),
}
}
pub fn string_to_outcome(s: &str) -> State {
match s {
"s" => State::Split,
"x" => State::Excluded,
other => State::Labelled(Label(ArcIntern::new(other.to_owned()))),
}
}
pub fn session(path: &Path, source_name: String) -> anyhow::Result<()> {
let mut current_path = String::from("");
let descriptor = load_descriptor(path)?;
let source_descriptor = descriptor
.sources
.get(&source_name)
.ok_or_else(|| anyhow!("Could not find source {:?}!", source_name))?;
let (directory, one_filesystem) = match source_descriptor {
SourceDescriptor::DirectorySource {
directory,
cross_filesystems,
..
} => (directory, !*cross_filesystems),
SourceDescriptor::VirtualSource { .. } => {
bail!("Cannot browse virtual source.");
}
};
println!("Scanning source; this might take a little while...");
let mut dir_scan: FileTree1<Option<State>> = scan(directory, one_filesystem, &BTreeSet::new())?
.ok_or_else(|| anyhow!("Empty source."))?
.replace_meta(&None);
let mut size_dir_scan: FileTree1<u64> = dir_scan.replace_meta(&0);
calculate_sizes(&mut size_dir_scan, directory)?;
let mut rules = load_labelling_rules(path, &source_name)?;
let labels = descriptor
.labels
.iter()
.map(|l| Label(ArcIntern::new(l.clone())))
.collect();
label_node("".to_owned(), None, &mut dir_scan, &labels, &rules)?;
loop {
println!("---------------------------------------------------------");
println!("| {}", current_path);
println!("----");
if let Some(dir_node) = dir_scan.get_by_path(&current_path) {
if let FileTree::Directory { children, .. } = dir_node {
let size_node = size_dir_scan.get_by_path(&current_path).unwrap();
for (idx, (child_name, child)) in children.iter().enumerate() {
let size_child = size_node
.get_by_path(child_name)
.unwrap()
.get_metadata()
.unwrap();
if child.is_dir() {
println!("{}/", child_name);
} else if child.is_symlink() {
println!("{} (symlink)", child_name);
} else {
println!("{}", child_name);
}
print!("\t[{:3}] ", idx);
match child.get_metadata().unwrap() {
None => {
print!("unlabelled ");
}
Some(state) => match state {
State::Labelled(label) => {
print!("l:{} ", label.0.as_ref());
}
State::Split => {
print!("split ");
}
State::Excluded => {
print!("excluded ");
}
},
}
println!(
"({})",
size_child
.file_size(humansize::file_size_opts::BINARY)
.unwrap()
);
}
print!("\n> ");
stdout().flush()?;
let mut next_command = String::new();
if stdin().read_line(&mut next_command)? > 0 {
let split: Vec<&str> = next_command.trim_end_matches('\n').split(' ').collect();
match split[0] {
"x" => {
if let Ok(id) = split[1].parse::<usize>() {
let entry = children
.iter()
.enumerate()
.find(|(index, _item)| *index == id);
if let Some((_index, (name, _entry))) = entry {
let entry_path = format!("{}/{}", &current_path, name);
rules
.position_based_rules
.insert(entry_path, State::Excluded);
} else {
eprintln!("not found.");
}
} else {
eprintln!("bad int :(");
}
}
"s" => {
if let Ok(id) = split[1].parse::<usize>() {
let entry = children
.iter()
.enumerate()
.find(|(index, _item)| *index == id);
if let Some((_index, (name, _entry))) = entry {
let entry_path = format!("{}/{}", &current_path, name);
rules.position_based_rules.insert(entry_path, State::Split);
} else {
eprintln!("not found.");
}
} else {
eprintln!("bad int :(");
}
}
"p" => {
let outcome = split[1];
let pattern = split[2];
match glob::Pattern::new(&pattern) {
Ok(glob) => {
rules.glob_based_rules.push(GlobRule {
pattern: pattern.to_owned(),
glob,
outcome: string_to_outcome(&outcome),
});
}
Err(e) => {
eprintln!("{:?}", e);
}
}
}
"q" => {
break;
}
other => {
if other.chars().all(char::is_numeric) {
let id: usize = other.parse().unwrap();
let entry = children
.iter()
.enumerate()
.find(|(index, _item)| *index == id);
if let Some((_index, (name, entry))) = entry {
if entry.is_dir() {
current_path.extend("/".chars());
current_path.extend(name.chars());
} else {
eprintln!("not a dir.");
}
}
} else {
let label = split[1];
let id: usize = split[2].parse().unwrap(); // TODO
let entry = children
.iter()
.enumerate()
.find(|(index, _item)| *index == id);
if let Some((_index, (name, _entry))) = entry {
let entry_path = format!("{}/{}", &current_path, name);
rules.position_based_rules.insert(
entry_path,
State::Labelled(Label(ArcIntern::new(label.to_owned()))),
);
}
}
}
}
} else {
println!("ending.");
break;
}
} else {
break;
}
} else {
break;
}
}
save_labelling_rules(path, &source_name, &rules)?;
Ok(())
}
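Editorial note (not part of this diff): for reference, the command language accepted by the prompt in the loop above, as derived from the match arms. Note that the labelling form only checks that its first token is non-numeric, so any non-numeric token works in place of 'l':

    <idx>                   descend into the directory at index <idx>
    x <idx>                 exclude the entry at index <idx>
    s <idx>                 mark the entry at index <idx> as split
    p <label|s|x> <glob>    add a glob rule mapping matches to a label, split, or exclusion
    l <label> <idx>         label the entry at index <idx>
    q                       quit and save the rules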


@@ -1,267 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::collections::BTreeSet;
use std::io;
use std::io::{StdinLock, Stdout, Write};
use std::path::Path;
use arc_interner::ArcIntern;
use byteorder::ReadBytesExt;
use termion::input::TermRead;
use termion::raw::{IntoRawMode, RawTerminal};
use crate::descriptor::{load_descriptor, Descriptor, SourceDescriptor};
use crate::labelling::State::{Excluded, Labelled, Split};
use crate::labelling::{
load_labelling_rules, save_labelling_rules, GlobRule, Label, LabellingRules, State,
};
use crate::tree::{scan, FileTree, FileTree1};
use log::info;
use crate::get_hostname;
use crate::remote::backup_source_requester;
use crate::remote::backup_source_requester::connect_to_remote;
use anyhow::{anyhow, bail};
pub fn interactive_label_node(
path: String,
current_state: Option<State>,
node: &mut FileTree1<Option<State>>,
labels: &Vec<Label>,
rules: &mut LabellingRules,
stdin: &mut StdinLock,
stdout: &mut RawTerminal<Stdout>,
) -> anyhow::Result<()> {
let mut next_state = current_state;
if let Some(rule_state) = rules.apply(&path) {
next_state = Some(rule_state.clone());
} else if !next_state
.as_ref()
.map(|s| s.should_inherit())
.unwrap_or(false)
{
if node.is_dir() {
stdout.write_all(format!("\r{}/: _", path).as_bytes())?;
} else if node.is_symlink() {
stdout.write_all(format!("\r{} (symlink): _", path).as_bytes())?;
} else {
stdout.write_all(format!("\r{}: _", path).as_bytes())?;
}
stdout.flush()?;
let user_input_state = loop {
let next_char = stdin.read_u8()? as char;
if next_char >= '1' && next_char <= '9' {
let index = next_char as usize - '1' as usize;
if let Some(label) = labels.get(index) {
rules
.position_based_rules
.insert(path.clone(), Labelled(label.clone()));
print!("\x08{}\r\n", label.0);
break Some(Labelled(label.clone()));
}
} else if next_char == 'x' {
rules.position_based_rules.insert(path.clone(), Excluded);
print!("\x08{}\r\n", next_char);
break Some(Excluded);
} else if next_char == 's' {
if node.is_dir() {
rules.position_based_rules.insert(path.clone(), Split);
print!("\x08{}\r\n", next_char);
break Some(Split);
} else {
print!("\x08!");
stdout.flush()?;
}
} else if next_char == 'p' {
print!("\x08p\r\n\tPattern mode. Choose a label or other effect to apply to the pattern matches: _");
stdout.flush()?;
let rule_apply_state = loop {
let next_char = stdin.read_u8()? as char;
if next_char >= '1' && next_char <= '9' {
let index = next_char as usize - '1' as usize;
if let Some(label) = labels.get(index) {
print!("\x08{}\r\n", label.0);
break Labelled(label.clone());
}
} else if next_char == 'x' {
print!("\x08{}\r\n", next_char);
break Excluded;
} else if next_char == 's' {
print!("\x08{}\r\n", next_char);
break Split;
}
};
stdout.flush()?;
stdout.suspend_raw_mode()?;
print!("\tEnter a glob pattern to match on:\n\t");
stdout.flush()?;
let (pattern, glob) = loop {
let pattern = stdin
.read_line()?
.ok_or_else(|| anyhow!("EOT? when reading glob pattern"))?;
match glob::Pattern::new(&pattern) {
Ok(glob) => {
if !glob.matches(&path) {
println!("Doesn't match the path in question.");
continue;
}
break (pattern, glob);
}
Err(error) => {
println!("Error: {:?}. Try again.", error);
}
}
};
stdout.activate_raw_mode()?;
rules.glob_based_rules.push(GlobRule {
pattern,
glob,
outcome: rule_apply_state.clone(),
});
break Some(rule_apply_state);
} else if next_char == 'q' {
return Ok(());
}
};
next_state = user_input_state;
}
match node {
FileTree::NormalFile { meta, .. } => {
*meta = next_state;
}
FileTree::Directory { meta, children, .. } => {
*meta = next_state.clone();
for (child_name, child) in children.iter_mut() {
let child_path = format!("{}/{}", path, child_name);
interactive_label_node(
child_path,
next_state.clone(),
child,
labels,
rules,
stdin,
stdout,
)?;
}
}
FileTree::SymbolicLink { meta, .. } => {
*meta = next_state;
}
FileTree::Other(_) => {
panic!("Other() nodes shouldn't be present here.");
}
}
Ok(())
}
pub fn interactive_labelling_session(path: &Path, source_name: String) -> anyhow::Result<()> {
let descriptor: Descriptor = load_descriptor(path)?;
let source = descriptor
.sources
.get(&source_name)
.ok_or_else(|| anyhow!("No source found by that name!"))?;
if let SourceDescriptor::DirectorySource {
hostname,
directory,
cross_filesystems,
} = source
{
let my_hostname = get_hostname();
let mut dir_scan = if &my_hostname == hostname {
info!("Scanning source; this might take a little while...");
scan(directory, !*cross_filesystems, &BTreeSet::new())?
.ok_or_else(|| anyhow!("Empty source."))?
.replace_meta(&None)
} else {
info!("Requesting scan over network. This might take a while.");
let connection = connect_to_remote(&descriptor, hostname)?;
let mut read = connection.stdout.expect("Requested stdout");
let mut write = connection.stdin.expect("Requested stdin");
// first start off with an introduction
info!("Connecting to remote source...");
backup_source_requester::introduction(&mut read, &mut write)?;
// then request to scan
info!("Requesting scan from remote source... (this may take some time)");
let scan = backup_source_requester::scanning(
&mut read,
&mut write,
directory.as_ref(),
!*cross_filesystems,
&BTreeSet::new(),
)?
.ok_or_else(|| anyhow!("Remote scan failed (does the directory exist?)"))?
.replace_meta(&None);
backup_source_requester::quit(&mut read, &mut write)?;
scan
};
let mut rules = load_labelling_rules(path, &source_name)?;
let labels: Vec<Label> = descriptor
.labels
.iter()
.map(|label| Label(ArcIntern::new(label.clone())))
.collect();
println!("The following label mappings are available:");
for (idx, label) in labels.iter().enumerate() {
println!("\tFor {:?}, press {}!", label.0.as_ref(), idx + 1);
}
println!("\tTo split a directory, press 's'!");
println!("\tTo exclude an entry, press 'x'!");
println!("\tTo apply a pattern, press 'p'...");
// Set terminal to raw mode to allow reading stdin one key at a time
let mut stdout = io::stdout().into_raw_mode().unwrap();
let stdin_unlocked = io::stdin();
let mut stdin = stdin_unlocked.lock();
interactive_label_node(
"".to_owned(),
None,
&mut dir_scan,
&labels,
&mut rules,
&mut stdin,
&mut stdout,
)?;
drop(stdout);
drop(stdin);
println!("\nLabelling completed!");
// save rules
save_labelling_rules(path, &source_name, &rules)?;
} else {
bail!("Can't do interactive labelling on a non-directory source.");
}
Ok(())
}


@@ -1,220 +0,0 @@
use crate::commands::backup::split_pointer_name;
use crate::descriptor::RetentionPolicyConfig;
use anyhow::{bail, Context};
use log::info;
use std::collections::{BTreeMap, BTreeSet};
use std::io;
use std::path::Path;
use yama::commands::open_pile;
use yama::operations::remove_pointer_safely;
use yama::pile::PileDescriptor;
pub struct RetentionBand {
pub interval_s: u64,
pub number_to_retain: u32,
}
pub struct RetentionPolicy {
pub retention_bands: Vec<RetentionBand>,
}
const DAY: u64 = 86400;
const WEEK: u64 = 7 * DAY;
const MONTH: u64 = 31 * DAY;
const YEAR: u64 = 365 * DAY;
impl RetentionPolicy {
pub fn from_config(descriptor: RetentionPolicyConfig) -> RetentionPolicy {
let mut policy = RetentionPolicy {
retention_bands: vec![],
};
if descriptor.daily != 0 {
policy.retention_bands.push(RetentionBand {
interval_s: DAY,
number_to_retain: descriptor.daily,
});
}
if descriptor.weekly != 0 {
policy.retention_bands.push(RetentionBand {
interval_s: WEEK,
number_to_retain: descriptor.weekly,
});
}
if descriptor.monthly != 0 {
policy.retention_bands.push(RetentionBand {
interval_s: MONTH,
number_to_retain: descriptor.monthly,
});
}
if descriptor.yearly != 0 {
policy.retention_bands.push(RetentionBand {
interval_s: YEAR,
number_to_retain: descriptor.yearly,
});
}
policy
}
/// Returns the set of snapshots to remove.
pub fn apply_returning_prunable(
&self,
snapshots_by_unix_time: BTreeMap<u64, String>,
) -> BTreeSet<String> {
if snapshots_by_unix_time.is_empty() {
return BTreeSet::new();
}
let mut snapshots_included: BTreeSet<u64> = BTreeSet::new();
// Always mark the most recent snapshot as retained!
let last_snapshot = snapshots_by_unix_time.keys().rev().next().unwrap();
snapshots_included.insert(*last_snapshot);
let now_time = *last_snapshot;
for band in &self.retention_bands {
for multiple in 1..=band.number_to_retain {
let target_time = now_time - (multiple as u64) * band.interval_s;
if let Some((k, _)) = snapshots_by_unix_time.range(0..=target_time).rev().next() {
snapshots_included.insert(*k);
}
}
}
// Find all prunable (unincluded) snapshots.
snapshots_by_unix_time
.into_iter()
.filter(|(k, _v)| !snapshots_included.contains(k))
.map(|(_k, v)| v)
.collect()
}
}
pub fn prune_with_retention_policy(
pile_path: &Path,
pile_desc: &PileDescriptor,
policy: &RetentionPolicy,
prompt_first: bool,
) -> anyhow::Result<()> {
let pile = open_pile(&pile_path, &pile_desc).context("Failed to open pile")?;
let pointers = pile
.list_pointers()
.context("Failed to list pointers in pile")?;
let mut pointers_to_keep: BTreeSet<String> = pointers.iter().cloned().collect();
let pointers_to_remove = get_prunable_pointers(&policy, pointers);
for remove in &pointers_to_remove {
pointers_to_keep.remove(remove);
}
info!("Gory details:\n---\nKeep: {pointers_to_keep:?}\n---\nRemove: {pointers_to_remove:?}");
info!(
"{} pointers to remove ({} to keep) based on retention policy.",
pointers_to_remove.len(),
pointers_to_keep.len()
);
if prompt_first {
println!("Would you like to proceed? [y/N]: ");
let mut buffer = String::new();
let stdin = io::stdin(); // We get `Stdin` here.
stdin.read_line(&mut buffer)?;
if buffer.trim().to_ascii_lowercase() != "y" {
bail!("Aborted by user.");
}
}
for to_remove in pointers_to_remove {
let res = remove_pointer_safely(&pile, &to_remove).context("removing prunable pointers");
pile.flush()
.context("flushing pile after removing pointers")?;
res?;
}
Ok(())
}
fn get_prunable_pointers(policy: &RetentionPolicy, pointers: Vec<String>) -> BTreeSet<String> {
let mut split_pointers_by_name: BTreeMap<String, BTreeMap<u64, String>> = BTreeMap::new();
for pointer in pointers {
let (name, datetime) = if let Some(x) = split_pointer_name(&pointer) {
x
} else {
continue;
};
split_pointers_by_name
.entry(name)
.or_default()
.insert(datetime.timestamp().try_into().unwrap(), pointer);
}
let mut pointers_to_remove = BTreeSet::new();
for (_pointer_base_name, ts_to_pointer) in split_pointers_by_name {
let to_remove = policy.apply_returning_prunable(ts_to_pointer);
pointers_to_remove.extend(to_remove);
}
pointers_to_remove
}
#[cfg(test)]
mod test {
use crate::commands::prune::{get_prunable_pointers, RetentionPolicy};
use crate::descriptor::RetentionPolicyConfig;
#[test]
fn test_prunable_pointers() {
let pointers = vec![
"alice+2022-09-28_05:00:00",
"alice+2022-09-28_02:00:00",
"alice+2022-09-21_05:00:00",
"alice+2022-09-14_05:00:00",
"alice+2022-09-08_05:00:00",
"alice+2022-09-07_05:00:00",
"alice+2022-09-01_05:00:00",
"bob+2022-09-28_06:00:00",
"bob+2022-09-28_03:00:00",
"bob+2022-09-21_06:00:00",
"bob+2022-09-14_06:00:00",
"bob+2022-09-08_06:00:00",
"bob+2022-09-07_06:00:00",
"bob+2022-09-01_06:00:00",
]
.into_iter()
.map(|s| s.to_owned())
.collect();
let policy = RetentionPolicy::from_config(RetentionPolicyConfig {
daily: 0,
weekly: 3,
monthly: 0,
yearly: 0,
});
assert_eq!(
get_prunable_pointers(&policy, pointers)
.into_iter()
.collect::<Vec<_>>(),
vec![
"alice+2022-09-01_05:00:00",
"alice+2022-09-08_05:00:00",
"alice+2022-09-28_02:00:00",
"bob+2022-09-01_06:00:00",
"bob+2022-09-08_06:00:00",
"bob+2022-09-28_03:00:00",
]
);
}
}
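Editorial note (not part of this diff): a worked pass of apply_returning_prunable over the alice pointers in the test above, with a single weekly band retaining 3. The newest snapshot, 09-28 05:00, is always kept and becomes 'now'; each weekly target then keeps the newest snapshot at or before it:

    now - 1 week = 09-21 05:00  ->  keeps alice+2022-09-21_05:00:00 (exact hit)
    now - 2 weeks = 09-14 05:00 ->  keeps alice+2022-09-14_05:00:00
    now - 3 weeks = 09-07 05:00 ->  keeps alice+2022-09-07_05:00:00

Everything unmarked (09-01, 09-08, and the extra 09-28 snapshot taken at 02:00) is returned as prunable, which is exactly the list the test asserts.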


@@ -1,306 +0,0 @@
// Push and Pull support for Datman
use anyhow::{bail, ensure, Context};
use log::info;
use std::collections::{BTreeMap, BTreeSet};
use std::io::{Read, Write};
use std::sync::Arc;
use std::time::Instant;
use yama::chunking::RecursiveUnchunker;
use yama::commands::retrieve_tree_node;
use yama::definitions::{ChunkId, PointerData, RecursiveChunkRef, TreeNode};
use yama::pile::{Keyspace, Pile, PipelineDescription, RawPile};
use yama::progress::ProgressTracker;
use yama::remote::{read_message, write_message};
pub fn offer_pointers<W: Write, RP: RawPile>(
pile: &Pile<RP>,
writer: &mut W,
) -> anyhow::Result<BTreeMap<String, PointerData>> {
let mut pointers_to_offer: BTreeMap<String, PointerData> = BTreeMap::new();
for pointer_name in pile.list_pointers()? {
let pointer_data = pile
.read_pointer(&pointer_name)?
.context("Listed pointer not present")?;
pointers_to_offer.insert(pointer_name, pointer_data);
}
write_message(writer, &pointers_to_offer)?;
Ok(pointers_to_offer)
}
pub fn ensure_compatible_bypasses(
my_full: &Vec<PipelineDescription>,
my_bypass: &Vec<PipelineDescription>,
their_full: &Vec<PipelineDescription>,
their_bypass: &Vec<PipelineDescription>,
) -> anyhow::Result<()> {
ensure!(
my_full.starts_with(&my_bypass),
"Our full pipeline is not an extension of the bypass pipeline."
);
ensure!(
their_full.starts_with(&their_bypass),
"Their full pipeline is not an extension of their bypass pipeline."
);
let my_bypassed_parts = &my_full[my_bypass.len()..];
let their_bypassed_parts = &their_full[their_bypass.len()..];
ensure!(
my_bypassed_parts == their_bypassed_parts,
"Our bypassed parts and their bypassed parts are not the same.\nOurs: {:?}\nTheirs: {:?}",
my_bypassed_parts,
their_bypassed_parts
);
Ok(())
}
pub fn negotiate_bypassed_pile<R: Read, W: Write>(
pile: &Pile<Arc<Box<dyn RawPile>>>,
bypass_pile: &Box<dyn RawPile>,
reader: &mut R,
writer: &mut W,
) -> anyhow::Result<()> {
let my_full_pipeline = pile.raw_pile.describe_pipeline()?;
let my_bypass_pipeline = bypass_pile.describe_pipeline()?;
write_message(writer, &my_full_pipeline)?;
write_message(writer, &my_bypass_pipeline)?;
writer.flush()?;
let their_full_pipeline = read_message::<_, Vec<PipelineDescription>>(reader)?;
let their_bypass_pipeline = read_message::<_, Vec<PipelineDescription>>(reader)?;
ensure_compatible_bypasses(
&my_full_pipeline,
&my_bypass_pipeline,
&their_full_pipeline,
&their_bypass_pipeline,
)?;
Ok(())
}
fn collect_chunk_ids(
pile: &Pile<Arc<Box<dyn RawPile>>>,
root: &TreeNode,
chunk_ids: &mut BTreeSet<ChunkId>,
) -> anyhow::Result<()> {
root.visit(
&mut |tree_node, _| {
match tree_node {
TreeNode::NormalFile { content, .. } => {
collect_chunk_ids_from_chunkref(pile, content, chunk_ids)?;
}
_ => {}
}
Ok(())
},
"".to_owned(),
)?;
Ok(())
}
fn collect_chunk_ids_from_chunkref(
pile: &Pile<Arc<Box<dyn RawPile>>>,
chunk_ref: &RecursiveChunkRef,
collection: &mut BTreeSet<ChunkId>,
) -> anyhow::Result<()> {
if chunk_ref.depth == 0 {
collection.insert(chunk_ref.chunk_id);
} else {
let shallower_chunk_ref = RecursiveChunkRef {
chunk_id: chunk_ref.chunk_id,
depth: chunk_ref.depth - 1,
};
let mut unchunker = RecursiveUnchunker::new(pile, shallower_chunk_ref);
let mut next_chunk_id: ChunkId = Default::default();
loop {
let read = unchunker.read(&mut next_chunk_id[..])?;
if read == 0 {
break;
} else if read < next_chunk_id.len() {
unchunker.read_exact(&mut next_chunk_id[read..])?;
}
collection.insert(next_chunk_id);
}
}
Ok(())
}
pub fn offering_side<R: Read, W: Write>(
pile: &Pile<Arc<Box<dyn RawPile>>>,
bypass_pile: &Box<dyn RawPile>,
reader: &mut R,
writer: &mut W,
mut progress: Box<dyn ProgressTracker>,
) -> anyhow::Result<()> {
let version = env!("CARGO_PKG_VERSION");
let expecting = format!("Datman Pull Accepter {}", version);
write_message(writer, &format!("Datman Pull Offerer {}", version))?;
writer.flush()?;
let found: String = read_message(reader)?;
ensure!(
found == expecting,
"Version mismatch. Expecting {:?} got {:?}",
expecting,
found
);
// First 'negotiate' (for now: assert) a pile bypass.
// This lets us avoid decompressing things before recompressing them at the other end,
// assuming both ends use the same dictionary.
negotiate_bypassed_pile(pile, &bypass_pile, reader, writer)?;
let offered_pointers = offer_pointers(pile, writer)?;
let wanted_pointers = read_message::<_, BTreeSet<String>>(reader)?;
let mut chunks_to_offer: BTreeSet<ChunkId> = BTreeSet::new();
for pointer_name in &wanted_pointers {
let pointer_data = offered_pointers
.get(pointer_name)
.with_context(|| format!("Requested pointer {:?} was not offered", pointer_name))?;
collect_chunk_ids_from_chunkref(pile, &pointer_data.chunk_ref, &mut chunks_to_offer)?;
let root_node = retrieve_tree_node(pile, pointer_data.chunk_ref.clone())?;
collect_chunk_ids(pile, &root_node.node, &mut chunks_to_offer)?;
}
write_message(writer, &chunks_to_offer)?;
writer.flush()?;
let chunks_to_skip: BTreeSet<ChunkId> = read_message(reader)?;
let chunks_to_send: Vec<ChunkId> = chunks_to_offer
.difference(&chunks_to_skip)
.cloned()
.collect();
drop(chunks_to_offer);
drop(chunks_to_skip);
let start_sort_by_hints = Instant::now();
let chunks_to_send_with_hints: BTreeSet<(u64, ChunkId)> = chunks_to_send
.into_iter()
.map(|chunk_id| {
pile.raw_pile
.chunk_id_transfer_ordering_hint(&chunk_id)
.map(|hint| (hint, chunk_id))
})
.collect::<anyhow::Result<_>>()?;
let time_to_sort_by_hints = Instant::now() - start_sort_by_hints;
info!(
"{} s to sort {} chunks by their hints",
time_to_sort_by_hints.as_secs_f32(),
chunks_to_send_with_hints.len()
);
progress.set_max_size(chunks_to_send_with_hints.len() as u64);
progress.set_current(0);
for (_hint, chunk_id) in chunks_to_send_with_hints {
let chunk_data = bypass_pile
.read(Keyspace::Chunk, &chunk_id)?
.context("Chunk vanished")?;
write_message(writer, &Some((chunk_id, chunk_data)))?;
progress.inc_progress(1);
}
write_message(writer, &None::<Option<(ChunkId, Vec<u8>)>>)?;
writer.flush()?;
Ok(())
}
pub fn accepting_side<R: Read, W: Write>(
pile: &Pile<Arc<Box<dyn RawPile>>>,
bypass_pile: &Box<dyn RawPile>,
reader: &mut R,
writer: &mut W,
mut progress: Box<dyn ProgressTracker>,
) -> anyhow::Result<()> {
let version = env!("CARGO_PKG_VERSION");
let expecting = format!("Datman Pull Offerer {}", version);
write_message(writer, &format!("Datman Pull Accepter {}", version))?;
writer.flush()?;
let found: String = read_message(reader)?;
ensure!(
found == expecting,
"Version mismatch. Expecting {:?} got {:?}",
expecting,
found
);
// First 'negotiate' (for now: assert) a pile bypass.
// This lets us avoid decompressing things before recompressing them at the other end,
// assuming both ends use the same dictionary.
negotiate_bypassed_pile(pile, &bypass_pile, reader, writer)?;
let offered_pointers: BTreeMap<String, PointerData> = read_message(reader)?;
let mut wanted_pointers: BTreeSet<String> = BTreeSet::new();
for (pointer_name, pointer_data) in &offered_pointers {
if pile.read_pointer(pointer_name)?.is_none() {
wanted_pointers.insert(pointer_name.clone());
if let Some(parent) = &pointer_data.parent_pointer {
if pile.read_pointer(parent)?.is_none() && !offered_pointers.contains_key(parent) {
bail!("Offered pointer {:?} requires parent {:?} which we don't have and isn't offered.", pointer_name, parent);
}
}
}
}
write_message(writer, &wanted_pointers)?;
writer.flush()?;
let offered_chunks: BTreeSet<ChunkId> = read_message(reader)?;
let mut chunks_to_skip: BTreeSet<ChunkId> = BTreeSet::new();
for chunk_id in &offered_chunks {
if pile.chunk_exists(chunk_id)? {
chunks_to_skip.insert(*chunk_id);
}
}
write_message(writer, &chunks_to_skip)?;
writer.flush()?;
let num_chunks_to_recv = offered_chunks.len() - chunks_to_skip.len();
let mut chunks_to_recv: BTreeSet<ChunkId> = offered_chunks
.difference(&chunks_to_skip)
.cloned()
.collect();
drop(offered_chunks);
drop(chunks_to_skip);
progress.set_max_size(num_chunks_to_recv as u64);
progress.set_current(0);
while let Some((chunk_id, chunk_data)) = read_message::<_, Option<(ChunkId, Vec<u8>)>>(reader)?
{
ensure!(
chunks_to_recv.remove(&chunk_id),
"Received unexpected chunk"
);
bypass_pile.write(Keyspace::Chunk, &chunk_id, &chunk_data)?;
progress.inc_progress(1);
}
ensure!(chunks_to_recv.is_empty(), "Unreceived chunks.");
for (pointer_name, pointer_data) in &offered_pointers {
pile.write_pointer(pointer_name, pointer_data)?;
}
pile.flush()?;
Ok(())
}
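Editorial note (not part of this diff): offering_side and accepting_side are the two ends of a symmetric wire protocol. The message sequence, as implemented above (each message framed by write_message/read_message):

    1. Both sides exchange version banners and check the peer's exactly.
    2. Both sides exchange full and bypass pipeline descriptions
       (negotiate_bypassed_pile); incompatible bypasses abort the transfer.
    3. Offerer sends all of its pointer names with their PointerData.
    4. Accepter replies with the subset of pointers it wants (those it lacks),
       aborting if a needed parent pointer is neither held nor offered.
    5. Offerer sends the set of chunk IDs those pointers need.
    6. Accepter replies with the chunk IDs it already has.
    7. Offerer streams Some((chunk_id, bytes)) for the difference, sorted by
       the pile's transfer-ordering hint, then a final None.
    8. Accepter writes the chunks through its bypass pile, checks nothing is
       missing or unexpected, then writes the pointers and flushes.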


@@ -1,456 +0,0 @@
use crate::commands::backup::split_pointer_name;
use crate::descriptor::{Descriptor, DestPileDescriptor};
use anyhow::Context;
use chrono::{Date, DateTime, Utc};
use comfy_table::presets::UTF8_FULL;
use comfy_table::{Attribute, Cell, Color, ContentArrangement, Table};
use humansize::FileSize;
use itertools::Itertools;
use log::info;
use std::collections::{BTreeMap, BTreeSet};
use std::ffi::CString;
use std::io::Read;
use std::mem;
use std::mem::size_of;
use std::os::unix::ffi::OsStrExt;
use std::os::unix::fs::MetadataExt;
use std::path::Path;
use yama::chunking::RecursiveUnchunker;
use yama::commands::{load_pile_descriptor, open_pile, retrieve_tree_node};
use yama::definitions::{ChunkId, RecursiveChunkRef, TreeNode};
use yama::pile::{DebugStatistics, Pile, RawPile};
// This module generates reports for a Datman system.
// Referenced Chunk IDs are counted and used to give an indication of size.
// Chunk IDs are summarised into u32s to reduce memory usage. Since the report is approximate,
// it doesn't matter if there are a few collisions (although they are still fairly unlikely to
// affect much).
#[derive(Clone)]
pub struct Report {
pub last_source_backups: BTreeMap<String, Option<DateTime<Utc>>>,
pub chunk_usages_aggregated: bool,
pub chunk_usage: BTreeMap<String, Sizes>,
pub debug_stats: Option<DebugStatistics>,
}
#[derive(Clone, Default)]
pub struct Sizes {
/// Total number of chunks that we refer to.
pub total: u32,
/// Each referred chunk is counted once here, but divided by the number of sharers.
/// We are 'morally responsible' for this many chunks.
pub moral: u32,
/// Number of chunks that only we point to.
pub unique: u32,
/// Number of chunks for which we are the oldest (lexicographically earliest) pointer to point
/// to those chunks.
pub rollup: u32,
}
type CondensedChunkId = u32;
fn condense_chunk_id(chunk_id: ChunkId) -> CondensedChunkId {
CondensedChunkId::from_be_bytes(
chunk_id[0..size_of::<CondensedChunkId>()]
.try_into()
.unwrap(),
)
}
pub fn generate_report(
dest_pile_descriptor: &DestPileDescriptor,
descriptor: &Descriptor,
aggregate_chunk_usage_by_month: bool,
) -> anyhow::Result<Report> {
let pile_descriptor = load_pile_descriptor(&dest_pile_descriptor.path)?;
let pile = open_pile(&dest_pile_descriptor.path, &pile_descriptor)?;
let debug_stats = pile.raw_pile.debug_statistics()?;
let mut pointers_to_parent_and_chunkids = BTreeMap::new();
let mut pointergroups_to_pointers: BTreeMap<String, Vec<String>> = BTreeMap::new();
info!("Collecting chunk IDs... This will probably be slow.");
for pointer_name in pile.list_pointers()? {
let pointer = pile
.read_pointer(&pointer_name)?
.context("listed pointer doesn't exist")?;
let root_node = retrieve_tree_node(&pile, pointer.chunk_ref)?;
let pointer_chunk_ids = collect_chunk_ids(&pile, &root_node.node)?;
let pointergroup = if aggregate_chunk_usage_by_month {
let (base, date_time) =
split_pointer_name(&pointer_name).context("Can't split pointer name")?;
format!("{}+{}", base, date_time.format("%Y-%m"))
} else {
pointer_name.clone()
};
pointergroups_to_pointers
.entry(pointergroup)
.or_default()
.push(pointer_name.clone());
pointers_to_parent_and_chunkids
.insert(pointer_name, (pointer.parent_pointer, pointer_chunk_ids));
}
// Now we iterate in reverse order, making a list of count of Chunk IDs.
// At the same time, we can also calculate 'rollup' sizes.
let mut chunk_sharer_counts: BTreeMap<CondensedChunkId, u16> = BTreeMap::new();
let mut pointergroup_stats: BTreeMap<String, Sizes> = BTreeMap::new();
for (pointergroup_name, pointers_in_group) in pointergroups_to_pointers.iter().rev() {
let mut deduped_chunks = BTreeSet::new();
for pointer_name in pointers_in_group {
deduped_chunks.extend(iter_over_all_chunkids_incl_parents(
&pointers_to_parent_and_chunkids,
&pointer_name,
))
}
let mut rollup_count = 0;
for chunk in deduped_chunks {
let count = chunk_sharer_counts.entry(chunk).or_default();
*count += 1;
if *count == 1 {
rollup_count += 1;
}
}
let entry = pointergroup_stats
.entry(pointergroup_name.to_owned())
.or_default();
entry.rollup = rollup_count;
}
// Now go through again and update all the stats!
for (pointergroup_name, pointers_in_group) in &pointergroups_to_pointers {
let mut deduped_chunks = BTreeSet::new();
for pointer_name in pointers_in_group {
deduped_chunks.extend(iter_over_all_chunkids_incl_parents(
&pointers_to_parent_and_chunkids,
&pointer_name,
))
}
let mut unique_count = 0;
let mut shared_count_by_sharers = [0u32; 256];
let total_count = deduped_chunks.len();
for chunk in deduped_chunks {
let count = chunk_sharer_counts[&chunk];
if count == 1 {
unique_count += 1;
} else {
let num_sharers = (count as usize).min(256);
shared_count_by_sharers[num_sharers - 1] += 1;
}
}
let mut sharers_sum: f64 = 0.0;
for (sharers_minus_one, count) in shared_count_by_sharers.into_iter().enumerate() {
sharers_sum += (count as f64) / (sharers_minus_one + 1) as f64;
}
let entry = pointergroup_stats
.entry(pointergroup_name.to_owned())
.or_default();
entry.moral = (sharers_sum.ceil() as u32) + unique_count;
entry.unique = unique_count;
entry.total = total_count as u32;
}
let mut last_backed_up = BTreeMap::new();
for source_name in descriptor.sources.keys().cloned() {
last_backed_up.insert(source_name, None);
}
for pointer_name in pointers_to_parent_and_chunkids.keys() {
if let Some((source_name, date_time)) = split_pointer_name(&pointer_name) {
last_backed_up.insert(source_name, Some(date_time));
}
}
Ok(Report {
last_source_backups: last_backed_up,
chunk_usage: pointergroup_stats,
chunk_usages_aggregated: aggregate_chunk_usage_by_month,
debug_stats,
})
}
// Does not filter duplicates...
fn iter_over_all_chunkids_incl_parents<'a>(
pointers_to_parent_and_chunkids: &'a BTreeMap<
String,
(Option<String>, BTreeSet<CondensedChunkId>),
>,
pointer_name: &'a str,
) -> Box<dyn Iterator<Item = CondensedChunkId> + 'a> {
let (parent, chunks) = &pointers_to_parent_and_chunkids[pointer_name];
match parent {
None => Box::new(chunks.iter().copied()),
Some(parent) => Box::new(chunks.iter().copied().chain(
iter_over_all_chunkids_incl_parents(pointers_to_parent_and_chunkids, &parent),
)),
}
}
fn collect_chunk_ids<RP: RawPile>(
pile: &Pile<RP>,
root: &TreeNode,
) -> anyhow::Result<BTreeSet<CondensedChunkId>> {
let mut chunk_ids = BTreeSet::new();
root.visit(
&mut |tree_node, _| {
match tree_node {
TreeNode::NormalFile { content, .. } => {
collect_chunk_ids_from_chunkref(pile, content, &mut chunk_ids)?;
}
_ => {}
}
Ok(())
},
"".to_owned(),
)?;
Ok(chunk_ids)
}
fn collect_chunk_ids_from_chunkref<RP: RawPile>(
pile: &Pile<RP>,
chunk_ref: &RecursiveChunkRef,
collection: &mut BTreeSet<CondensedChunkId>,
) -> anyhow::Result<()> {
if chunk_ref.depth == 0 {
collection.insert(condense_chunk_id(chunk_ref.chunk_id));
} else {
let shallower_chunk_ref = RecursiveChunkRef {
chunk_id: chunk_ref.chunk_id,
depth: chunk_ref.depth - 1,
};
let mut unchunker = RecursiveUnchunker::new(pile, shallower_chunk_ref);
let mut next_chunk_id: ChunkId = Default::default();
loop {
let read = unchunker.read(&mut next_chunk_id[..])?;
if read == 0 {
break;
} else if read < next_chunk_id.len() {
unchunker.read_exact(&mut next_chunk_id[read..])?;
}
collection.insert(condense_chunk_id(next_chunk_id));
}
}
Ok(())
}
pub fn print_report(report: &Report) -> anyhow::Result<()> {
print_time_report(report)?;
print_size_report(report)?;
Ok(())
}
pub fn print_time_report(report: &Report) -> anyhow::Result<()> {
println!("\nBackup times");
let mut table = Table::new();
table
.load_preset(UTF8_FULL)
.set_content_arrangement(ContentArrangement::DynamicFullWidth)
.enforce_styling();
table.set_header(vec![
Cell::new("Source name").fg(Color::Cyan),
Cell::new("Last backed up").fg(Color::Cyan),
]);
let today = Utc::today();
let sort_by_dates: Vec<(Option<Date<Utc>>, String)> = report
.last_source_backups
.iter()
.map(|(name, datetime)| (datetime.map(|dt| dt.date()), name.to_owned()))
.sorted()
.collect();
for (date, source_name) in sort_by_dates {
match date {
None => {
table.add_row(vec![
Cell::new(source_name).fg(Color::Magenta),
Cell::new("NEVER").fg(Color::Red).add_attributes(vec![
Attribute::SlowBlink,
Attribute::RapidBlink,
Attribute::Bold,
]),
]);
}
Some(date) => {
let number_of_days = today.signed_duration_since(date).num_days();
let num_days_human = if number_of_days > 0 {
format!("{number_of_days} days ago")
} else {
format!("today")
};
let colour = if number_of_days < 2 {
Color::Green
} else if number_of_days < 14 {
Color::Yellow
} else {
Color::Red
};
let formatted_date = date.format("%F");
let mut val_cell =
Cell::new(format!("{formatted_date} {num_days_human}")).fg(colour);
if number_of_days > 28 {
val_cell = val_cell.add_attribute(Attribute::SlowBlink);
}
table.add_row(vec![Cell::new(source_name).fg(Color::Magenta), val_cell]);
}
}
}
println!("{table}");
Ok(())
}
pub fn print_size_report(report: &Report) -> anyhow::Result<()> {
println!("\nPile size");
let mut table = Table::new();
table
.load_preset(UTF8_FULL)
.set_content_arrangement(ContentArrangement::DynamicFullWidth)
.enforce_styling();
//.set_width(100);
table.set_header(vec![
Cell::new("Pointer name").fg(Color::Cyan),
Cell::new("Rollup size").fg(Color::Magenta),
Cell::new("Unique size").fg(Color::Magenta),
Cell::new("Moral size").fg(Color::Magenta),
Cell::new("Total size").fg(Color::Magenta),
]);
let average_chunk_size = report
.debug_stats
.as_ref()
.map(|stats| stats.total_chunk_size as f64 / stats.number_of_chunks as f64);
for (pointer_name, sizes) in &report.chunk_usage {
table.add_row(vec![
Cell::new(pointer_name).fg(Color::Blue),
Cell::new(format_size(sizes.rollup, average_chunk_size)).fg(Color::Yellow),
Cell::new(format_size(sizes.unique, average_chunk_size)).fg(Color::Yellow),
Cell::new(format_size(sizes.moral, average_chunk_size)).fg(Color::Yellow),
Cell::new(format_size(sizes.total, average_chunk_size)).fg(Color::Yellow),
]);
}
println!("{table}");
Ok(())
}
fn format_size(chunks: u32, average_chunk_size: Option<f64>) -> String {
let est_size_suffix = average_chunk_size
.map(|bytes_per_chunk| {
let num_bytes = (chunks as f64 * bytes_per_chunk) as u64;
let mut format = humansize::file_size_opts::BINARY;
format.decimal_places = 1;
format!(" ~{}", num_bytes.file_size(format).unwrap())
})
.unwrap_or_default();
format!("{} c{}", chunks, est_size_suffix)
}
fn calculate_total_filesize_of_dir(dir: &Path) -> anyhow::Result<u64> {
let mut total = 0;
for file in std::fs::read_dir(dir)? {
let file = file?;
let metadata = file.metadata()?;
total += metadata.size();
if metadata.is_dir() {
total += calculate_total_filesize_of_dir(&file.path())?;
}
}
Ok(total)
}
pub fn print_filesystem_space(pile_path: &Path) -> anyhow::Result<()> {
let usage_for_pile = calculate_total_filesize_of_dir(&pile_path)?;
let path_c = CString::new(pile_path.as_os_str().as_bytes()).unwrap();
let stats = unsafe {
let mut stats: libc::statfs = mem::zeroed();
match libc::statfs(path_c.as_ptr(), &mut stats) {
0 => Ok(stats),
other => Err(std::io::Error::from_raw_os_error(other)),
}
}?;
// On a BTRFS system with 2 disks in RAID1, note (about df -h):
// - 'Size' shows the average size of the two disks. I think of it as 'ideal size'.
// - 'Avail' seems to show the actual number of bytes usable.
// - 'Used' seems to show the actual number of bytes used.
// In short: probably avoid relying on 'size'.
let block_size = stats.f_bsize as i64;
let used_bytes = (stats.f_blocks - stats.f_bfree) as i64 * block_size;
let avail_bytes = stats.f_bavail as i64 * block_size;
let usable_bytes = used_bytes + avail_bytes;
let theoretical_size = stats.f_blocks as i64 * block_size;
let mut format = humansize::file_size_opts::BINARY;
format.decimal_places = 1;
format.decimal_zeroes = 1;
println!("\nFilesystem Information");
let mut table = Table::new();
table
.load_preset(UTF8_FULL)
.set_content_arrangement(ContentArrangement::DynamicFullWidth)
.enforce_styling();
//.set_width(100);
table.set_header(vec![
Cell::new("Theoretical Size").fg(Color::Cyan),
Cell::new("Usable Size").fg(Color::Cyan),
Cell::new("Used").fg(Color::Cyan),
Cell::new("Used for Pile").fg(Color::Cyan),
Cell::new("Available").fg(Color::Cyan),
]);
let available_space_colour = if avail_bytes < 8 * 1024 * 1024 * 1024 {
Color::Red
} else if avail_bytes < 64 * 1024 * 1024 * 1024 {
Color::Yellow
} else {
Color::Green
};
table.add_row(vec![
Cell::new(format!(
"{:>9}",
theoretical_size.file_size(&format).unwrap()
))
.fg(Color::Blue),
Cell::new(format!("{:>9}", usable_bytes.file_size(&format).unwrap())).fg(Color::Blue),
Cell::new(format!("{:>9}", used_bytes.file_size(&format).unwrap())).fg(Color::Blue),
Cell::new(format!("{:>9}", usage_for_pile.file_size(&format).unwrap())).fg(Color::Blue),
Cell::new(format!("{:>9}", avail_bytes.file_size(&format).unwrap()))
.fg(available_space_colour),
]);
print!("{table}");
Ok(())
}
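Editorial sketch (not part of this diff): on the module comment's claim that u32-condensed chunk IDs rarely collide. With 2^32 buckets, the birthday approximation puts the expected number of colliding pairs among n chunks at about n^2 / 2^33. A quick illustration, with expected_collisions as an invented helper:

// Expected number of colliding pairs when condensing n chunk IDs into u32s.
fn expected_collisions(num_chunks: u64) -> f64 {
    let buckets = 2f64.powi(32);
    (num_chunks as f64).powi(2) / (2.0 * buckets)
}

fn main() {
    // A pile with a million chunks expects only ~116 colliding pairs,
    // i.e. roughly 0.01% of chunks, so the size estimates barely move.
    println!("{:.0}", expected_collisions(1_000_000));
}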

datman/src/datetime.rs Normal file

@@ -0,0 +1,26 @@
use chrono::{DateTime, Local, NaiveDate, NaiveDateTime, TimeZone};
use eyre::bail;
use std::str::FromStr;
pub struct HumanDateTime(pub DateTime<Local>);
impl FromStr for HumanDateTime {
type Err = eyre::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
if let Ok(date_only) = NaiveDate::parse_from_str(s, "%Y-%m-%d") {
let local_datetime = Local
.from_local_datetime(&date_only.and_hms_opt(0, 0, 0).unwrap())
.unwrap();
Ok(HumanDateTime(local_datetime))
} else if let Ok(date_and_time) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") {
let local_datetime = Local.from_local_datetime(&date_and_time).unwrap();
Ok(HumanDateTime(local_datetime))
} else if let Ok(date_and_time) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") {
let local_datetime = Local.from_local_datetime(&date_and_time).unwrap();
Ok(HumanDateTime(local_datetime))
} else {
bail!("Couldn't parse using any format. Use one of: 2021-05-16 OR 2021-05-16T17:42:14 OR 2021-05-16 17:42:14");
}
}
}
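Editorial sketch (not part of this diff): a usage example for the parser above, assuming HumanDateTime is in scope and an eyre dependency as used in this file:

use std::str::FromStr;

fn main() -> eyre::Result<()> {
    // Date-only input is taken as local midnight; the other two forms differ
    // only in the separator between date and time.
    let day = HumanDateTime::from_str("2021-05-16")?;
    let t = HumanDateTime::from_str("2021-05-16T17:42:14")?;
    let spaced = HumanDateTime::from_str("2021-05-16 17:42:14")?;
    assert!(day.0 < t.0);
    assert_eq!(t.0, spaced.0);
    Ok(())
}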


@@ -1,116 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs::File;
use std::io::Read;
use std::path::{Path, PathBuf};
// TODO how do we handle?:
// - (important) yama push of one pile to another
// - backup policy stuff like 'minimum backup frequency' ... show when it's not been done
// - backup policy stuff like 'minimum on two different disks, not powered at the same time...'
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct Descriptor {
/// Dataset labels
pub labels: Vec<String>,
/// Sources
pub sources: HashMap<String, SourceDescriptor>,
/// Paths to destination Yama Piles. Remote Piles need a local virtual pile to specify the layers.
pub piles: HashMap<String, DestPileDescriptor>,
pub remote_hosts: HashMap<String, RemoteHostDescriptor>,
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub retention: Option<RetentionPolicyConfig>,
}
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct RemoteHostDescriptor {
pub user_at_host: String,
pub path_to_datman: Option<String>,
}
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct RetentionPolicyConfig {
pub daily: u32,
pub weekly: u32,
pub monthly: u32,
pub yearly: u32,
}
#[derive(Clone, Serialize, Deserialize, Debug)]
#[serde(untagged)]
pub enum SourceDescriptor {
DirectorySource {
hostname: String,
directory: PathBuf,
#[serde(default)]
cross_filesystems: bool,
},
VirtualSource {
/// The name of the helper program that will be used to do this backup.
helper: String,
/// The label that will be assigned to this source.
label: String,
/// The kind of virtual source (how it operates).
kind: VirtualSourceKind,
#[serde(flatten)]
extra_args: HashMap<String, toml::Value>,
},
}
impl SourceDescriptor {
/// Gets the hostname that this source descriptor is for, if possible.
pub fn get_remote_hostname(&self) -> Option<&str> {
match self {
SourceDescriptor::DirectorySource { hostname, .. } => Some(hostname.as_str()),
SourceDescriptor::VirtualSource { .. } => None,
}
}
}
#[derive(Clone, Serialize, Deserialize, Debug)]
#[serde(untagged)]
pub enum VirtualSourceKind {
Stdout {
#[serde(rename = "stdout")]
filename: String,
},
// TODO(feature) TempDir
}
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct DestPileDescriptor {
pub path: PathBuf,
pub included_labels: Vec<String>,
}
pub fn load_descriptor(path: &Path) -> anyhow::Result<Descriptor> {
let descriptor_file = path.join("datman.toml");
let mut buf = Vec::new();
File::open(descriptor_file)?.read_to_end(&mut buf)?;
Ok(toml::de::from_slice(&buf)?)
}
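Editorial sketch (not part of this diff): because SourceDescriptor is #[serde(untagged)], a source block is disambiguated purely by its fields: hostname plus directory makes a directory source, while helper, label, and kind make a virtual one. A minimal datman.toml this schema should accept, parsed the same way load_descriptor does; all names and paths are invented, and the types above are assumed in scope:

fn main() -> anyhow::Result<()> {
    let text = br#"
        labels = ["precious"]

        [sources.my-docs]
        hostname = "alice"
        directory = "/home/alice/docs"

        [sources.my-db]
        helper = "postgres"
        label = "precious"
        kind = { stdout = "dump.sql" }   # VirtualSourceKind::Stdout

        [piles.main]
        path = "../main-pile"
        included_labels = ["precious"]

        [remote_hosts.alice]
        user_at_host = "backup@alice.example"
    "#;
    let descriptor: Descriptor = toml::de::from_slice(text)?;
    assert!(descriptor.sources["my-db"].get_remote_hostname().is_none());
    Ok(())
}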


@@ -0,0 +1,152 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use eyre::{Context, ContextCompat};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use yama::pile_connector::PileConnectionScheme;
// TODO how do we handle?:
// - (important) yama push of one pile to another
// - backup policy stuff like 'minimum backup frequency' ... show when it's not been done
// - backup policy stuff like 'minimum on two different disks, not powered at the same time...'
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct Descriptor {
/// Sources
pub sources: HashMap<String, SourceDescriptor>,
/// Paths to destination Yama Piles. Remote Piles need a local virtual pile to specify the layers.
pub piles: HashMap<String, PilePathOrConnector>,
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub retention: Option<RetentionPolicyConfig>,
}
#[derive(Clone, Serialize, Deserialize, Debug)]
#[serde(untagged)]
pub enum PilePathOrConnector {
PilePath(PathBuf),
PileConnector {
#[serde(flatten)]
scheme: PileConnectionScheme,
yamakey: PathBuf,
},
}
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct RetentionPolicyConfig {
pub daily: u32,
pub weekly: u32,
pub monthly: u32,
pub yearly: u32,
}
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct SourceDescriptor {
/// The host to run this backup task on.
pub host: String,
#[serde(flatten)]
pub inner: SourceDescriptorInner,
}
impl SourceDescriptor {
pub fn is_directory_source(&self) -> bool {
matches!(&self.inner, &SourceDescriptorInner::DirectorySource { .. })
}
pub fn is_virtual_source(&self) -> bool {
matches!(&self.inner, &SourceDescriptorInner::VirtualSource { .. })
}
}
#[derive(Clone, Serialize, Deserialize, Debug)]
#[serde(untagged)]
pub enum SourceDescriptorInner {
DirectorySource {
path: PathBuf,
#[serde(default)]
cross_filesystems: bool,
/// TODO Paths to ignore
#[serde(default)]
ignore: Vec<String>,
},
VirtualSource(VirtualSource),
}
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct VirtualSource {
/// The name of the helper program that will be used to do this backup.
pub helper: String,
/// The kind of virtual source (how it operates).
pub kind: VirtualSourceKind,
#[serde(flatten)]
pub extra_args: HashMap<String, toml::Value>,
}
#[derive(Clone, Serialize, Deserialize, Debug)]
#[serde(untagged)]
pub enum VirtualSourceKind {
Stdout {
#[serde(rename = "stdout")]
filename: String,
},
// TODO(feature) TempDir
}
/// Loads a descriptor and resolves relative paths contained within.
pub async fn load_descriptor(path: &Path) -> eyre::Result<Descriptor> {
let text = tokio::fs::read_to_string(path).await?;
let mut descriptor: Descriptor = toml::de::from_str(&text)?;
let dir = path
.parent()
.context("there must be a parent path for the descriptor file")?;
// Absolutise pile paths
for (_, pile_path_or_connector) in descriptor.piles.iter_mut() {
match pile_path_or_connector {
PilePathOrConnector::PilePath(pile_path) => {
*pile_path = dir
.join(&*pile_path)
.canonicalize()
.context("Failed to canonicalise path in descriptor")?;
}
PilePathOrConnector::PileConnector {
scheme:
PileConnectionScheme::Local {
directory: pile_path,
},
..
} => {
*pile_path = dir
.join(&*pile_path)
.canonicalize()
.context("Failed to canonicalise path in descriptor")?;
}
PilePathOrConnector::PileConnector { .. } => { /* nop */ }
}
}
Ok(descriptor)
}
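Editorial sketch (not part of this diff): the reworked schema moves the host onto every source (host is a plain field, with the rest of the source flattened) and lets a pile be either a bare path or an inline connector. Connector piles take a flattened PileConnectionScheme plus a yamakey path, but that scheme's fields live in yama::pile_connector and aren't shown in this diff, so only the bare-path form is illustrated here; names and paths are invented and the types above assumed in scope:

fn demo() -> eyre::Result<()> {
    let text = r#"
        [sources.my-docs]
        host = "alice"
        path = "/home/alice/docs"
        ignore = ["*.tmp"]

        [piles]
        local = "../main-pile"
    "#;
    let descriptor: Descriptor = toml::de::from_str(text)?;
    assert!(descriptor.sources["my-docs"].is_directory_source());
    // load_descriptor() additionally canonicalises relative pile paths
    // against the descriptor file's directory.
    Ok(())
}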

datman/src/extract.rs Normal file

@@ -0,0 +1,183 @@
use crate::datetime::HumanDateTime;
use crate::pointer_names::split_pointer_name;
use chrono::{DateTime, Utc};
use eyre::{bail, eyre, Context, ContextCompat};
use std::collections::btree_map::Entry;
use std::collections::{BTreeMap, BTreeSet};
use std::path::Path;
use std::sync::Arc;
use tracing::{info_span, warn, Instrument};
use yama::extract;
use yama::extract::flatten_treenode;
use yama::pile_with_cache::{PileWithCache, PointerIntegrationStatistics};
use yama_pile::tree::{FilesystemOwnership, FilesystemPermissions, RootTreeNode, TreeNode};
use yama_wormfile::boxed::BoxedWormFileProvider;
/// Given a list of source names and conditions to find pointers within,
/// returns a mapping of source names to pointers.
pub async fn select_to_extract(
pwc: &PileWithCache<BoxedWormFileProvider>,
sources: BTreeSet<String>,
before: Option<HumanDateTime>,
after: Option<HumanDateTime>,
accept_partial: bool,
) -> eyre::Result<BTreeMap<String, String>> {
let before = before.map(|dt| dt.0.with_timezone(&Utc));
let after = after.map(|dt| dt.0.with_timezone(&Utc));
let pointers_list = pwc
.pile
.list_pointers()
.await
.context("failed to list pointers")?;
select_to_extract_impl(pointers_list, sources, before, after, accept_partial)
}
/// Given a list of source names and conditions to find pointers within,
/// returns a mapping of source names to pointers.
fn select_to_extract_impl(
pointers_list: Vec<String>,
sources: BTreeSet<String>,
before: Option<DateTime<Utc>>,
after: Option<DateTime<Utc>>,
accept_partial: bool,
) -> eyre::Result<BTreeMap<String, String>> {
if after.is_some() && before.is_some() {
bail!("Can't specify both before and after!");
}
let mut pointers_by_source: BTreeMap<String, String> = BTreeMap::new();
for pointer in pointers_list {
if let Some((source_name, pointer_datetime)) = split_pointer_name(&pointer) {
if !sources.contains(&source_name) {
// Not a source that we're interested in.
continue;
}
if let Some(before) = before {
if before < pointer_datetime {
// datetime is after the 'before' time
continue;
}
} else if let Some(after) = after {
if pointer_datetime < after {
// datetime is before the 'after' time
continue;
}
}
match pointers_by_source.entry(source_name) {
Entry::Vacant(ve) => {
ve.insert(pointer);
}
Entry::Occupied(mut oe) => {
let current_choice = oe.get_mut();
let (_, current_datetime) = split_pointer_name(&current_choice).unwrap();
let should_replace = if after.is_some() {
// if we want the first one after a time, we want the earliest option!
// so replace if new datetime is earlier than current
pointer_datetime < current_datetime
} else {
// replace if new datetime is after current datetime
current_datetime < pointer_datetime
};
if should_replace {
*current_choice = pointer;
}
}
}
};
}
if pointers_by_source.is_empty() {
bail!("No pointers selected for ANY of the sources: {sources:?}");
}
let missing: Vec<&String> = sources
.iter()
.filter(|src| !pointers_by_source.contains_key(*src))
.collect();
if !missing.is_empty() {
if accept_partial {
warn!("Some sources didn't have any pointers selected: {missing:?}. Continuing because --accept-partial passed.");
} else {
bail!("Some sources didn't have any pointers selected: {missing:?}. Pass --accept-partial if this is intended anyway.");
}
}
Ok(pointers_by_source)
}
pub async fn load_pointers_for_extraction(
pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
what_to_extract: BTreeMap<String, String>,
) -> eyre::Result<BTreeMap<String, RootTreeNode>> {
let mut result = BTreeMap::new();
for (source_name, pointer_name) in &what_to_extract {
let mut stats = PointerIntegrationStatistics::default();
let pointer = pwc
.read_pointer_fully_integrated(&pointer_name, &mut stats)
.await?
.context("pointer doesn't exist??")?;
// TODO(ownership): adapt uid/gids here
result.insert(source_name.clone(), pointer.root);
}
Ok(result)
}
pub fn merge_roots_for_batch_extract(extracts: BTreeMap<String, RootTreeNode>) -> TreeNode {
let mut children = BTreeMap::new();
for (name, entry) in extracts {
if matches!(entry.node, TreeNode::NormalFile { .. }) {
let mut children2 = BTreeMap::new();
children2.insert(entry.name, entry.node);
children.insert(
name,
TreeNode::Directory {
ownership: FilesystemOwnership {
// TODO(ownership): populate this correctly (current user?)
uid: 0,
gid: 0,
},
permissions: FilesystemPermissions { mode: 0o700 },
children: children2,
},
);
} else {
children.insert(name, entry.node);
}
}
TreeNode::Directory {
ownership: FilesystemOwnership {
// TODO(ownership): populate this correctly (current user?)
uid: 0,
gid: 0,
},
permissions: FilesystemPermissions { mode: 0o700 },
children,
}
}
pub async fn extract(
pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
node: TreeNode,
destination: &Path,
) -> eyre::Result<()> {
let flat = flatten_treenode(&node)?;
drop(node);
extract::unpack_nonfiles(destination, &flat.nonfiles, false, true).await?;
let extract_span = info_span!("extract_files");
extract::unpack_files(&pwc, destination, &flat.files, false, true)
.instrument(extract_span)
.await?;
Arc::try_unwrap(pwc)
.map_err(|_| eyre!("pwc still in use; can't close down gracefully"))?
.close()
.await?;
Ok(())
}
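Editorial sketch (not part of this diff): a unit-test-style example of the selection logic above, usable from inside this module since select_to_extract_impl is private. The pointer names are invented, assuming the same <source>+<datetime> convention that split_pointer_name expects:

#[test]
fn picks_latest_pointer_per_source() -> eyre::Result<()> {
    let pointers = vec![
        "alice+2023-01-01_00:00:00".to_owned(),
        "alice+2023-06-01_00:00:00".to_owned(),
        "bob+2023-03-01_00:00:00".to_owned(),
    ];
    let sources = ["alice".to_owned(), "bob".to_owned()].into_iter().collect();
    // No before/after bounds: the newest pointer per source wins.
    let chosen = select_to_extract_impl(pointers, sources, None, None, false)?;
    assert_eq!(chosen["alice"], "alice+2023-06-01_00:00:00");
    assert_eq!(chosen["bob"], "bob+2023-03-01_00:00:00");
    Ok(())
}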


@@ -1,288 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::collections::{BTreeSet, HashMap};
use std::fs::File;
use std::io::{BufRead, BufReader, Write};
use std::path::{Path, PathBuf};
use anyhow::anyhow;
use anyhow::Context;
use arc_interner::ArcIntern;
use byteorder::WriteBytesExt;
use glob::Pattern;
use log::warn;
use serde::{Deserialize, Serialize};
use crate::labelling::State::{Excluded, Labelled, Split};
use crate::tree::{FileTree, FileTree1};
pub fn load_labelling_rules(path: &Path, source_name: &str) -> anyhow::Result<LabellingRules> {
let rule_path = path.join("labelling").join(format!("{}.zst", source_name));
if rule_path.exists() {
let rule_file = File::open(&rule_path)?;
let rule_reader = zstd::stream::read::Decoder::new(rule_file)?;
let buf_reader = BufReader::new(rule_reader);
Ok(LabellingRules::load(buf_reader)?)
} else {
Ok(LabellingRules::default())
}
}
pub fn save_labelling_rules(
path: &Path,
source_name: &str,
rules: &LabellingRules,
) -> anyhow::Result<()> {
let rule_path = path.join("labelling").join(format!("{}.zst", source_name));
if rule_path.exists() {
let backup_rule_path = path.join("labelling").join(format!("{}.zst~", source_name));
std::fs::rename(&rule_path, &backup_rule_path)?;
}
let rule_file = File::create(rule_path)?;
let mut zstd_writer = zstd::stream::write::Encoder::new(rule_file, 18)?;
rules.save(&mut zstd_writer)?;
zstd_writer.finish()?; // MUST CALL finish here!
Ok(())
}
#[derive(Clone, Serialize, Deserialize, Debug, Eq, PartialOrd, PartialEq, Hash)]
pub struct Label(pub ArcIntern<String>);
#[derive(Clone, Serialize, Deserialize, Debug, Eq, PartialOrd, PartialEq)]
pub enum State {
Labelled(Label),
Split,
Excluded,
}
impl State {
pub fn should_inherit(&self) -> bool {
match self {
Labelled(_) => true,
Split => false,
Excluded => true,
}
}
}
#[derive(Clone, Debug)]
pub struct GlobRule {
pub pattern: String,
pub glob: Pattern,
pub outcome: State,
}
#[derive(Clone, Debug, Default)]
pub struct LabellingRules {
pub position_based_rules: HashMap<String, State>,
pub glob_based_rules: Vec<GlobRule>,
}
impl LabellingRules {
pub fn load<R: BufRead>(mut input: R) -> anyhow::Result<Self> {
let mut result = LabellingRules {
position_based_rules: Default::default(),
glob_based_rules: Default::default(),
};
let mut line = String::new();
loop {
line.clear();
let line_len = input.read_line(&mut line)?;
if line_len == 0 {
break;
}
if line == "---\n" {
// start reading glob patterns now.
break;
}
let pieces: Vec<&str> = line.trim_end_matches('\n').split('\t').collect();
if pieces.len() == 2 {
match pieces[1] {
"?" => {
result
.position_based_rules
.insert(pieces[0].to_owned(), Split);
}
"!" => {
result
.position_based_rules
.insert(pieces[0].to_owned(), Excluded);
}
label_str => {
result.position_based_rules.insert(
pieces[0].to_owned(),
Labelled(Label(ArcIntern::new(label_str.to_owned()))),
);
}
}
} else {
warn!("malformed rule line (expected 2 tab-separated fields): {:?}", line);
}
}
loop {
line.clear();
let line_len = input.read_line(&mut line)?;
if line_len == 0 {
break;
}
let pieces: Vec<&str> = line.trim().split('\t').collect();
if pieces.len() == 2 {
let outcome = match pieces[1] {
"?" => Split,
"!" => Excluded,
label_str => Labelled(Label(ArcIntern::new(label_str.to_owned()))),
};
let pattern = pieces[0].to_owned();
let glob = Pattern::new(&pattern)
.with_context(|| anyhow!("Whilst compiling glob: {:?}", pattern))?;
result.glob_based_rules.push(GlobRule {
pattern,
glob,
outcome,
});
} else {
warn!("malformed rule line (expected 2 tab-separated fields): {:?}", line);
}
}
Ok(result)
}
pub fn save<W: Write>(&self, mut output: W) -> anyhow::Result<()> {
for (path, rule) in self.position_based_rules.iter() {
output.write_all(path.as_bytes())?;
output.write_u8(b'\t')?;
match rule {
Labelled(label) => {
output.write_all(label.0.as_bytes())?;
}
Split => {
output.write_u8(b'?')?;
}
Excluded => {
output.write_u8(b'!')?;
}
}
output.write_u8(b'\n')?;
}
output.write_all("---\n".as_bytes())?;
for glob_rule in self.glob_based_rules.iter() {
output.write_all(glob_rule.pattern.as_bytes())?;
output.write_u8(b'\t')?;
match &glob_rule.outcome {
Labelled(label) => {
output.write_all(label.0.as_bytes())?;
}
Split => {
output.write_u8(b'?')?;
}
Excluded => {
output.write_u8(b'!')?;
}
}
output.write_u8(b'\n')?;
}
output.flush()?;
Ok(())
}
pub fn apply(&self, path: &str) -> Option<State> {
if let Some(rule_state) = self.position_based_rules.get(path) {
return Some(rule_state.clone());
}
for glob_rule in self.glob_based_rules.iter() {
if glob_rule.glob.matches(path) {
return Some(glob_rule.outcome.clone());
}
}
None
}
pub fn get_exclusions_set(&self, base: &Path) -> BTreeSet<PathBuf> {
let mut exclusions = BTreeSet::new();
for (ext_path, state) in &self.position_based_rules {
assert!(ext_path.is_empty() || ext_path.starts_with('/'));
let full_path = PathBuf::from(format!(
"{}{ext_path}",
base.to_str().expect("base path must always be utf-8")
));
if state == &Excluded {
exclusions.insert(full_path);
}
}
exclusions
}
}
/// Uninteractively label the nodes.
pub fn label_node(
path: String,
current_state: Option<State>,
node: &mut FileTree1<Option<State>>,
labels: &[Label],
rules: &LabellingRules,
) -> anyhow::Result<()> {
let mut next_state = current_state;
if let Some(rule_state) = rules.apply(&path) {
next_state = Some(rule_state.clone());
} else if !next_state
.as_ref()
.map(|s| s.should_inherit())
.unwrap_or(false)
{
next_state = None;
}
match node {
FileTree::NormalFile { meta, .. } => {
*meta = next_state;
}
FileTree::Directory { meta, children, .. } => {
*meta = next_state.clone();
for (child_name, child) in children.iter_mut() {
let child_path = format!("{}/{}", path, child_name);
label_node(child_path, next_state.clone(), child, labels, rules)?;
}
}
FileTree::SymbolicLink { meta, .. } => {
*meta = next_state;
}
FileTree::Other(_) => {
panic!("Other() nodes shouldn't be present here.");
}
}
Ok(())
}
pub fn str_to_label<I: AsRef<str>>(input: I) -> Label {
Label(ArcIntern::new(input.as_ref().to_owned()))
}
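#[cfg(test)]
mod tests {
    use super::*;

    // Illustrative sketch of the on-disk rules format handled above (the
    // example paths and label are hypothetical): position-based rules come
    // first, one "<path>\t<outcome>" per line, where the outcome is "?" for
    // Split, "!" for Excluded, or a label name; then a "---" separator; then
    // glob-based rules in the same "<pattern>\t<outcome>" shape.
    #[test]
    fn parses_position_and_glob_rules() -> anyhow::Result<()> {
        let text = b"/home/alice\tprecious\n/home/alice/Downloads\t!\n---\n*.tmp\t!\n";
        let rules = LabellingRules::load(&text[..])?;
        assert_eq!(rules.position_based_rules.len(), 2);
        assert_eq!(rules.glob_based_rules.len(), 1);
        assert_eq!(rules.apply("/home/alice/Downloads"), Some(State::Excluded));
        assert_eq!(rules.apply("/tmp/junk.tmp"), Some(State::Excluded));
        Ok(())
    }
}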

View File

@ -1,12 +1,6 @@
pub mod commands;
pub mod descriptor;
pub mod labelling;
pub mod remote;
pub mod tree;
pub mod backup;
pub mod descriptor_config;
pub mod extract;
pub fn get_hostname() -> String {
hostname::get()
.expect("No hostname")
.into_string()
.expect("Hostname string must be sensible.")
}
pub mod datetime;
pub mod pointer_names;

View File

@ -0,0 +1,20 @@
use chrono::{DateTime, NaiveDateTime, TimeZone, Utc};
pub const POINTER_DATETIME_FORMAT: &str = "%F_%T";
pub const POINTER_NAME_DATETIME_SPLITTER: &str = "+";
pub fn get_pointer_name_at(source_name: &str, datetime: DateTime<Utc>) -> String {
format!(
"{}{}{}",
source_name,
POINTER_NAME_DATETIME_SPLITTER,
datetime.format(POINTER_DATETIME_FORMAT)
)
}
pub fn split_pointer_name(pointer_name: &str) -> Option<(String, DateTime<Utc>)> {
let (source_name, date_time_str) = pointer_name.rsplit_once(POINTER_NAME_DATETIME_SPLITTER)?;
let date_time = NaiveDateTime::parse_from_str(date_time_str, POINTER_DATETIME_FORMAT).ok()?;
let date_time = Utc.from_utc_datetime(&date_time);
Some((source_name.to_owned(), date_time))
}
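#[cfg(test)]
mod tests {
    use super::*;

    // A minimal illustrative round-trip of the naming scheme above; the
    // source name and timestamp are hypothetical examples.
    #[test]
    fn pointer_name_round_trips() {
        let when = Utc.from_utc_datetime(
            &NaiveDateTime::parse_from_str("2023-08-10_20:00:00", POINTER_DATETIME_FORMAT)
                .unwrap(),
        );
        let name = get_pointer_name_at("mysource", when);
        assert_eq!(name, "mysource+2023-08-10_20:00:00");
        assert_eq!(
            split_pointer_name(&name),
            Some(("mysource".to_owned(), when))
        );
    }
}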

View File

@ -1,2 +0,0 @@
pub mod backup_source_requester;
pub mod backup_source_responder;

View File

@ -1,304 +0,0 @@
use crate::commands::backup::{get_pointer_name_at, label_filter_and_convert};
use crate::descriptor::{Descriptor, DestPileDescriptor, SourceDescriptor};
use crate::labelling::load_labelling_rules;
use crate::tree::FileTree;
use anyhow::{anyhow, bail};
use chrono::Utc;
use log::info;
use std::collections::BTreeSet;
use std::io::{Read, Write};
use std::path::{Path, PathBuf};
use std::process::{Child, Command, Stdio};
use std::sync::Arc;
use yama::commands::{load_pile_descriptor, open_pile};
use yama::definitions::{PartialPointerData, TreeNode};
use yama::operations::storing::{pointer_ops_prepare_to_store, pointers_ops_after_store};
use yama::pile::access_guard::PileGuard;
use yama::pile::{Pile, RawPile, StoragePipelineSettings};
use yama::progress::ProgressTracker;
use yama::remote::responder::{Responder, ResponderWritingPipeline};
use yama::remote::{read_message, write_message};
use yama::utils::get_number_of_workers;
// SECURITY WARNING: the system you connect to using this mechanism will receive full access to
// your Yama pile. Do NOT connect to untrusted or compromised systems using this mechanism (yet).
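// Protocol sketch, as implemented by the functions below: the requester first
// exchanges version banners (`introduction`), then issues "scan" (sending the
// path, one-filesystem flag and exclusion set, and receiving an optional
// FileTree), then "chunk" (sending the path and tree node, then serving the
// remote's pile requests until it returns PartialPointerData), and finally
// "exit".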
pub fn introduction<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::Result<()> {
info!("Introduction.");
let version = env!("CARGO_PKG_VERSION");
write_message(
write,
&format!("Datman v{} Backup Source Requester", version),
)?;
write.flush()?;
let foreign_side: String = read_message(read)?;
let expected_foreign_side = format!("Datman v{} Backup Source Responder", version);
if foreign_side != expected_foreign_side {
bail!(
"Datman version mismatch. Expected {:?}, got {:?}",
expected_foreign_side,
foreign_side
);
}
Ok(())
}
pub fn scanning<R: Read, W: Write>(
read: &mut R,
write: &mut W,
path: &Path,
one_filesystem: bool,
exclusions: &BTreeSet<PathBuf>,
) -> anyhow::Result<Option<FileTree<(), (), (), ()>>> {
info!("Scanning.");
write_message(write, &"scan")?;
write_message(write, &path)?;
write_message(write, &one_filesystem)?;
write_message(write, exclusions)?;
write.flush()?;
let scan_result: Option<FileTree<(), (), (), ()>> = read_message(read)?;
Ok(scan_result)
}
pub fn chunking<
R: Read + Send + 'static,
W: Write + Send + 'static,
RP: RawPile + 'static,
PT: ProgressTracker + Send + 'static,
>(
read: R,
mut write: W,
path: &Path,
tree_node: &TreeNode,
raw_pile: Arc<RP>,
progress_bar: PT,
use_writing_pipeline: bool,
) -> anyhow::Result<(R, W, PartialPointerData)> {
info!("Chunking.");
write_message(&mut write, &"chunk")?;
write_message(&mut write, &path)?;
write_message(&mut write, tree_node)?;
write.flush()?;
let (writing_pipeline, control_rx) = if use_writing_pipeline {
let sps = StoragePipelineSettings {
num_compressors: get_number_of_workers("YAMA_PL_COMPRESSORS") as u32,
compressor_input_bound: 32,
writer_input_bound: 32,
};
let (control_tx, control_rx) = crossbeam_channel::unbounded();
let pipeline = raw_pile.build_storage_pipeline(sps, control_tx)?;
(
Some(ResponderWritingPipeline {
pipeline_submission: pipeline,
}),
Some(control_rx),
)
} else {
(None, None)
};
let guarded_pile = PileGuard::new(Arc::clone(&raw_pile), true);
let (r_handle, w_handle, join_handles) = Responder::start(
read,
write,
get_number_of_workers("YAMA_RESPONDERS") as u16,
Arc::new(guarded_pile),
writing_pipeline,
progress_bar,
);
info!("Waiting for remote to finish chunking.");
for handle in join_handles {
handle.join().expect("Join handle should not fail");
}
let mut read = r_handle.join().unwrap();
let write = w_handle.join().unwrap();
if let Some(control_rx) = control_rx {
while control_rx.recv().is_ok() {
// TODO nop
}
}
info!("Remote finished chunking.");
let pointer_data: PartialPointerData = read_message(&mut read)?;
Ok((read, write, pointer_data))
}
pub fn quit<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::Result<()> {
write_message(write, &"exit")?;
write.flush()?;
let exit_ack: String = read_message(read)?;
if exit_ack.as_str() != "exit" {
bail!("Exit failed: expected \"exit\" acknowledgement, got {:?}", exit_ack);
}
Ok(())
}
pub fn connect_to_remote(descriptor: &Descriptor, hostname: &str) -> anyhow::Result<Child> {
let remote_host_descriptor = descriptor
.remote_hosts
.get(hostname)
.ok_or_else(|| anyhow::anyhow!("No remote host by that name: {:?}.", hostname))?;
let connection = Command::new("ssh")
.arg(&remote_host_descriptor.user_at_host)
.arg("--")
.arg(
&remote_host_descriptor
.path_to_datman
.as_ref()
.map(|x| x.as_str())
.unwrap_or("datman"),
)
.arg("_backup_source_responder")
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.stderr(Stdio::inherit())
.spawn()?;
Ok(connection)
}
pub fn backup_remote_source_to_destination<PT: ProgressTracker + Send + 'static>(
source: &SourceDescriptor,
dest: &DestPileDescriptor,
descriptor: &Descriptor,
desc_path: &Path,
source_name: &str,
dest_name: &str,
_num_workers: u8,
progress_bar: PT,
) -> anyhow::Result<()> {
match source {
SourceDescriptor::DirectorySource {
hostname,
directory,
cross_filesystems,
} => {
let remote_host_descriptor = descriptor
.remote_hosts
.get(hostname)
.ok_or_else(|| anyhow::anyhow!("No remote host by that name: {:?}.", hostname))?;
info!(
"Looking to backup {} (from {}) to {}",
source_name, remote_host_descriptor.user_at_host, dest_name
);
let connection = connect_to_remote(descriptor, hostname)?;
let mut read = connection.stdout.expect("Requested stdout");
let mut write = connection.stdin.expect("Requested stdin");
// first start off with an introduction
info!("Connecting...");
introduction(&mut read, &mut write)?;
let rules = load_labelling_rules(desc_path, source_name)?;
let exclusions = rules.get_exclusions_set(directory);
// then request to scan
info!("Requesting scan... (this may take some time)");
let scan_result = scanning(
&mut read,
&mut write,
directory.as_ref(),
!*cross_filesystems,
&exclusions,
)?
.ok_or_else(|| anyhow!("Remote scan failed (does the directory exist?)"))?;
let mut root =
label_filter_and_convert(scan_result, descriptor, source_name, &rules, dest)?
.ok_or_else(|| anyhow!("Empty filter..."))?;
let absolute_dest_path = desc_path.join(&dest.path);
let pile_descriptor = load_pile_descriptor(&absolute_dest_path)?;
let pile = open_pile(&absolute_dest_path, &pile_descriptor)?;
let pointer_name = get_pointer_name_at(&source_name, Utc::now());
if pile.read_pointer(pointer_name.as_str())?.is_some() {
bail!(
"Pointer by name {:?} already exists; refusing to overwrite.",
pointer_name
);
}
info!("Will write as pointer {:?}.", pointer_name);
info!("Searching for suitable parents.");
let mut parent: Option<String> = None;
let prefix = format!("{}+", source_name);
for pointer in pile.list_pointers()?.iter() {
if pointer.starts_with(&prefix) {
match parent.as_ref() {
None => {
parent = Some(pointer.to_owned());
}
Some(cur_parent) => {
if cur_parent < pointer {
parent = Some(pointer.to_owned());
}
}
}
}
}
match parent.as_ref() {
Some(parent) => {
info!("Using parent: {}", parent);
}
None => {
info!("No suitable parent found.");
}
}
info!("Storing remote using Yama (this may take some time)...");
let raw_pile = Arc::new(pile.raw_pile);
let pile = Pile::new(raw_pile.clone());
pointer_ops_prepare_to_store(&pile, &mut root, &parent)?;
info!(
"Have pointer_name = {:?}, parent = {:?}",
pointer_name, parent
);
let (mut read, mut write, pointer_data) = chunking(
read,
write,
directory.as_ref(),
&root,
raw_pile,
progress_bar,
true,
)?;
quit(&mut read, &mut write)?;
pointers_ops_after_store(&pile, &pointer_name, &pointer_data.complete(parent))?;
pile.flush()?;
info!("Stored! Checking for existence...");
if pile.list_pointers()?.contains(&pointer_name) {
info!("Exists!");
} else {
bail!("Pointer {:?} does not exist...", &pointer_name);
}
}
SourceDescriptor::VirtualSource { .. } => {
unimplemented!("Can't currently back up virtualsources on remotes...")
}
}
Ok(())
}

View File

@ -1,187 +0,0 @@
// This file implements the responder side of the backup source protocol -- the protocol used
// to connect to remote backup sources.
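// Commands are read off stdin in a loop (see `handler_stdio` below): "scan"
// performs a filesystem scan, "chunk" hands the stdio streams over to a
// Requester-driven chunking session, and "exit" acknowledges and terminates.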
use std::collections::BTreeSet;
use std::io::{stdin, stdout, Read, Write};
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Instant;
use anyhow::bail;
use crossbeam_channel::Sender;
use log::info;
use yama::definitions::{PartialPointerData, TreeNode};
use yama::pile::{Pile, RawPile};
use yama::progress::ProgressTracker;
use yama::remote::requester::Requester;
use yama::remote::{read_message, write_message, RequestBody, ResponseBody};
use yama::utils::get_number_of_workers;
use crate::tree::scan;
pub fn introduction<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::Result<()> {
let version = env!("CARGO_PKG_VERSION");
write_message(
write,
&format!("Datman v{} Backup Source Responder", version),
)?;
write.flush()?;
let foreign_side: String = read_message(read)?;
let expected_foreign_side = format!("Datman v{} Backup Source Requester", version);
if foreign_side != expected_foreign_side {
bail!(
"Datman version mismatch. Expected {:?}, got {:?}",
expected_foreign_side,
foreign_side
);
}
Ok(())
}
pub fn scanning<R: Read, W: Write>(read: &mut R, write: &mut W) -> anyhow::Result<()> {
let path: PathBuf = read_message(read)?;
let one_filesystem: bool = read_message(read)?;
let exclusions: BTreeSet<PathBuf> = read_message(read)?;
let scan_result = scan(&path, one_filesystem, &exclusions)?;
write_message(write, &scan_result)?;
write.flush()?;
Ok(())
}
pub struct ProgressSender {
pub last_sent: Instant,
pub current_progress: u64,
pub current_max: u64,
// TODO actually propagate this
pub current_message: String,
pub sender: Sender<(RequestBody, Option<Sender<ResponseBody>>)>,
}
impl ProgressSender {
pub fn send_now(&mut self, _include_message: bool) {
self.sender
.send((
RequestBody::Progress {
current: self.current_progress,
max: self.current_max,
},
None,
))
.expect("Progress sender failed");
self.last_sent = Instant::now();
}
pub fn send_if_overdue(&mut self) {
if Instant::now().duration_since(self.last_sent).as_millis() >= 1024 {
self.send_now(false);
}
}
}
impl ProgressTracker for ProgressSender {
fn inc_progress(&mut self, delta_progress: u64) {
self.current_progress += delta_progress;
self.send_if_overdue();
}
fn set_current(&mut self, current_progress: u64) {
self.current_progress = current_progress;
self.send_if_overdue();
}
fn set_max_size(&mut self, max_size: u64) {
self.current_max = max_size;
self.send_if_overdue();
}
}
// TODO use io-streams crate and get rid of the duplication!!
pub fn chunking_stdio() -> anyhow::Result<PartialPointerData> {
let (path, tree_node) = {
let stdin = stdin();
let mut read = stdin.lock();
let path: PathBuf = read_message(&mut read)?;
let tree_node: TreeNode = read_message(&mut read)?;
(path, tree_node)
};
let (pointer_data, requester_join_handles) = {
let (yama_requester, requester_join_handles) = Requester::new_from_stdio();
let command_sender = yama_requester.clone_command_sender();
info!("progress sender in use");
let mut progress_bar = ProgressSender {
last_sent: Instant::now(),
current_progress: 0,
current_max: 0,
current_message: "".to_string(),
sender: command_sender,
};
let raw_pile: Box<dyn RawPile> = Box::new(yama_requester);
let pile = Pile::new(raw_pile);
let pointer_data = yama::operations::storing::store_without_pointer_ops(
&Arc::new(pile),
&path,
tree_node,
get_number_of_workers("YAMA_CHUNKERS"),
&mut progress_bar,
)?;
(pointer_data, requester_join_handles)
};
info!("Waiting to join.");
for join_handle in requester_join_handles {
join_handle.join().expect("Expected to join handle");
}
info!("Chunking completed.");
Ok(pointer_data)
}
pub fn handler_stdio() -> anyhow::Result<()> {
let stdin = stdin();
let stdout = stdout();
let mut read = stdin.lock();
let mut write = stdout.lock();
info!("Introduction.");
introduction(&mut read, &mut write)?;
loop {
let command: String = read_message(&mut read)?;
match command.as_str() {
"scan" => {
info!("Scanning.");
scanning(&mut read, &mut write)?;
}
"chunk" => {
info!("Chunking.");
drop(read);
drop(write);
let pointer_data = chunking_stdio()?;
read = stdin.lock();
write = stdout.lock();
write_message(&mut write, &pointer_data)?;
write.flush()?;
}
"exit" => {
write_message(&mut write, &"exit")?;
write.flush()?;
break;
}
othercommand => {
bail!("Don't understand {:?}", othercommand);
}
}
}
Ok(())
}

View File

@ -1,359 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::collections::{BTreeMap, BTreeSet};
use std::fmt::Debug;
use std::fs::{read_link, symlink_metadata, DirEntry, Metadata};
use std::io::ErrorKind;
use std::os::unix::fs::MetadataExt;
use std::path::{Path, PathBuf};
use anyhow::anyhow;
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use log::{debug, info, warn};
use serde::{Deserialize, Serialize};
pub use yama::definitions::FilesystemOwnership;
pub use yama::definitions::FilesystemPermissions;
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub enum FileTree<NMeta, DMeta, SMeta, Other>
where
NMeta: Debug + Clone + Eq + PartialEq,
DMeta: Debug + Clone + Eq + PartialEq,
SMeta: Debug + Clone + Eq + PartialEq,
Other: Debug + Clone + Eq + PartialEq,
{
NormalFile {
/// modification time in ms
mtime: u64,
ownership: FilesystemOwnership,
permissions: FilesystemPermissions,
meta: NMeta,
},
Directory {
ownership: FilesystemOwnership,
permissions: FilesystemPermissions,
children: BTreeMap<String, FileTree<NMeta, DMeta, SMeta, Other>>,
meta: DMeta,
},
SymbolicLink {
ownership: FilesystemOwnership,
target: String,
meta: SMeta,
},
Other(Other),
}
pub type FileTree1<A> = FileTree<A, A, A, ()>;
impl<NMeta, DMeta, SMeta, Other> FileTree<NMeta, DMeta, SMeta, Other>
where
NMeta: Debug + Clone + Eq + PartialEq,
DMeta: Debug + Clone + Eq + PartialEq,
SMeta: Debug + Clone + Eq + PartialEq,
Other: Debug + Clone + Eq + PartialEq,
{
pub fn is_dir(&self) -> bool {
match self {
FileTree::NormalFile { .. } => false,
FileTree::Directory { .. } => true,
FileTree::SymbolicLink { .. } => false,
FileTree::Other(_) => false,
}
}
pub fn is_symlink(&self) -> bool {
match self {
FileTree::NormalFile { .. } => false,
FileTree::Directory { .. } => false,
FileTree::SymbolicLink { .. } => true,
FileTree::Other(_) => false,
}
}
pub fn get_by_path(&self, path: &str) -> Option<&FileTree<NMeta, DMeta, SMeta, Other>> {
let mut node = self;
for piece in path.split('/') {
if piece.is_empty() {
continue;
}
match node {
FileTree::Directory { children, .. } => match children.get(piece) {
None => {
return None;
}
Some(new_node) => {
node = new_node;
}
},
_ => {
return None;
}
}
}
Some(node)
}
pub fn replace_meta<Replacement: Clone + Debug + Eq + PartialEq>(
&self,
replacement: &Replacement,
) -> FileTree<Replacement, Replacement, Replacement, Other> {
match self {
FileTree::NormalFile {
mtime,
ownership,
permissions,
..
} => FileTree::NormalFile {
mtime: *mtime,
ownership: *ownership,
permissions: *permissions,
meta: replacement.clone(),
},
FileTree::Directory {
ownership,
permissions,
children,
..
} => {
let children = children
.iter()
.map(|(str, ft)| (str.clone(), ft.replace_meta(replacement)))
.collect();
FileTree::Directory {
ownership: ownership.clone(),
permissions: permissions.clone(),
children,
meta: replacement.clone(),
}
}
FileTree::SymbolicLink {
ownership, target, ..
} => FileTree::SymbolicLink {
ownership: ownership.clone(),
target: target.clone(),
meta: replacement.clone(),
},
FileTree::Other(other) => FileTree::Other(other.clone()),
}
}
/// Filters the tree in-place by removing nodes that do not satisfy the predicate.
/// 'Inclusive' in the sense that if a directory does not satisfy the predicate but one of its
/// descendants does, then the directory will be included anyway.
/// (So nodes that satisfy the predicate will never be excluded because of a parent not doing so.)
///
/// Returns true if this node should be included, and false if it should not be.
pub fn filter_inclusive<F>(&mut self, predicate: &mut F) -> bool
where
F: FnMut(&Self) -> bool,
{
match self {
FileTree::Directory { children, .. } => {
let mut to_remove = Vec::new();
for (name, child) in children.iter_mut() {
if !child.filter_inclusive(predicate) {
to_remove.push(name.clone());
}
}
for name in to_remove {
children.remove(&name);
}
!children.is_empty() || predicate(&self)
}
_ => predicate(&self),
}
}
}
impl<X: Debug + Clone + Eq, YAny: Debug + Clone + Eq> FileTree<X, X, X, YAny> {
pub fn get_metadata(&self) -> Option<&X> {
match self {
FileTree::NormalFile { meta, .. } => Some(meta),
FileTree::Directory { meta, .. } => Some(meta),
FileTree::SymbolicLink { meta, .. } => Some(meta),
FileTree::Other(_) => None,
}
}
pub fn set_metadata(&mut self, new_meta: X) {
match self {
FileTree::NormalFile { meta, .. } => {
*meta = new_meta;
}
FileTree::Directory { meta, .. } => {
*meta = new_meta;
}
FileTree::SymbolicLink { meta, .. } => {
*meta = new_meta;
}
FileTree::Other(_) => {
// nop
}
}
}
}
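#[cfg(test)]
mod filter_tests {
    use super::*;

    // Illustrative sketch of the 'inclusive' semantics documented on
    // `filter_inclusive`: a directory that fails the predicate is still kept
    // when one of its descendants passes. The example data is hypothetical.
    #[test]
    fn directory_kept_when_descendant_matches() {
        let mut children = BTreeMap::new();
        children.insert(
            "keep.txt".to_owned(),
            FileTree::NormalFile {
                mtime: 0,
                ownership: FilesystemOwnership { uid: 0, gid: 0 },
                permissions: FilesystemPermissions { mode: 0o644 },
                meta: true,
            },
        );
        let mut root: FileTree1<bool> = FileTree::Directory {
            ownership: FilesystemOwnership { uid: 0, gid: 0 },
            permissions: FilesystemPermissions { mode: 0o755 },
            children,
            meta: false,
        };
        // Keep only nodes labelled `true`; the root (labelled `false`) survives
        // because it contains a matching child.
        assert!(root.filter_inclusive(&mut |node| *node.get_metadata().unwrap_or(&false)));
    }
}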
/// Given a file's metadata, returns the mtime in milliseconds.
pub fn mtime_msec(metadata: &Metadata) -> u64 {
(metadata.mtime() * 1000 + metadata.mtime_nsec() / 1_000_000) as u64
}
/// Scan the filesystem to produce a Tree, using a default progress bar.
pub fn scan(
path: &Path,
one_filesystem: bool,
exclusions: &BTreeSet<PathBuf>,
) -> anyhow::Result<Option<FileTree<(), (), (), ()>>> {
let pbar = ProgressBar::with_draw_target(0, ProgressDrawTarget::stdout_with_hz(2));
pbar.set_style(ProgressStyle::default_spinner().template("{spinner} {pos:7} {msg}"));
pbar.set_message("dir scan");
let one_filesystem = if one_filesystem { Some(None) } else { None };
let result = scan_with_progress_bar(path, &pbar, one_filesystem, exclusions);
pbar.finish_at_current_pos();
result
}
/// Scan the filesystem to produce a Tree, using the specified progress bar.
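/// `one_filesystem`: `None` disables the boundary check; `Some(None)` enables it, with the
/// inner value latching onto the first device ID seen, so that the scan stops at filesystem
/// boundaries below that point.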
pub fn scan_with_progress_bar(
path: &Path,
progress_bar: &ProgressBar,
mut one_filesystem: Option<Option<u64>>,
exclusions: &BTreeSet<PathBuf>,
) -> anyhow::Result<Option<FileTree<(), (), (), ()>>> {
if exclusions.contains(path) {
// Don't enter excluded paths.
debug!("Not descending into excluded path: {:?}", path);
return Ok(None);
}
let metadata_res = symlink_metadata(path);
progress_bar.inc(1);
if let Err(e) = &metadata_res {
match e.kind() {
ErrorKind::NotFound => {
warn!("vanished: {:?}", path);
return Ok(None);
}
ErrorKind::PermissionDenied => {
warn!("permission denied: {:?}", path);
return Ok(None);
}
_ => { /* nop */ }
}
}
let metadata = metadata_res?;
let filetype = metadata.file_type();
if let Some(one_filesystem) = one_filesystem.as_mut() {
let this_fs = metadata.dev();
if *one_filesystem.get_or_insert(this_fs) != this_fs {
info!("Stopping at filesystem boundary: {:?}", path);
return Ok(None);
}
}
/*let name = path
.file_name()
.ok_or(anyhow!("No filename, wat"))?
.to_str()
.ok_or(anyhow!("Filename can't be to_str()d"))?
.to_owned();*/
let ownership = FilesystemOwnership {
uid: metadata.uid() as u16,
gid: metadata.gid() as u16,
};
let permissions = FilesystemPermissions {
mode: metadata.mode(),
};
if filetype.is_file() {
// Leave an unpopulated file node. It's not my responsibility to chunk it right now.
Ok(Some(FileTree::NormalFile {
mtime: mtime_msec(&metadata),
ownership,
permissions,
meta: (),
}))
} else if filetype.is_dir() {
let mut children = BTreeMap::new();
progress_bar.set_message(&format!("{:?}", path));
let dir_read = path.read_dir();
if let Err(e) = &dir_read {
match e.kind() {
ErrorKind::NotFound => {
warn!("vanished/: {:?}", path);
return Ok(None);
}
ErrorKind::PermissionDenied => {
warn!("permission denied/: {:?}", path);
return Ok(None);
}
_ => { /* nop */ }
}
}
for entry in dir_read? {
let entry: DirEntry = entry?;
if entry.file_name() == ".datmanskip" {
// Directories with .datmanskip in them are to be skipped entirely.
// TODO(perf): should this be checked upfront before some children may already
// have been scanned?
debug!("Skipping {path:?} because it has a .datmanskip file.");
return Ok(None);
}
let scanned =
scan_with_progress_bar(&entry.path(), progress_bar, one_filesystem, exclusions)?;
if let Some(scanned) = scanned {
if let Ok(filename) = entry.file_name().into_string() {
children.insert(filename, scanned);
} else {
warn!("Non-UTF-8 filename; ignoring: {:?}", entry.file_name())
}
}
}
Ok(Some(FileTree::Directory {
ownership,
permissions,
children,
meta: (),
}))
} else if filetype.is_symlink() {
let target = read_link(path)?
.to_str()
.ok_or(anyhow!("target path cannot be to_str()d"))?
.to_owned();
Ok(Some(FileTree::SymbolicLink {
ownership,
target,
meta: (),
}))
} else {
Ok(None)
}
}

12
datman_cli_readme.txt Normal file
View File

@ -0,0 +1,12 @@
`datman backup-one <sourceName> <destName>`
`datman backup-all <destName>`
Backs up immediately (either just one source or all sources) to a destination.
With `--config <>`, use the specified Datman config file; otherwise try the current directory.
..
`datman extract <sourceName>` with the same filtering options as now.
(unimportant: Yama tools should be decent for this right now.)

11
docs.old/SUMMARY.md Normal file
View File

@ -0,0 +1,11 @@
# Summary
- [Yama](./yama/index.md)
- [Getting Started](./yama/getting_started.md)
- [Internals](./yama/internals.md)
- [Raw Piles](./yama/internals/raw-piles.md)
- [Pointers and Nodes](./yama/internals/pointers-and-nodes.md)
- [Datman](./datman/index.md)
- [Getting Started](./datman/getting_started.md)
- [Remote Backups](./datman/remote_backups.md)

View File

@ -1,11 +0,0 @@
# Summary
- [Yama](./yama/index.md)
- [Getting Started](./yama/getting_started.md)
- [Internals](./yama/internals.md)
- [Raw Piles](./yama/internals/raw-piles.md)
- [Pointers and Nodes](./yama/internals/pointers-and-nodes.md)
- [Datman](./datman/index.md)
- [Getting Started](./datman/getting_started.md)
- [Remote Backups](./datman/remote_backups.md)

5
docs/yama/zstd.md Normal file
View File

@ -0,0 +1,5 @@
# Using a Zstd dictionary with Yama
## Creating a Zstd dictionary
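The pile opener in older Yama versions looks for a dictionary file named
`important_zstd.dict` in the pile directory. One way to produce such a
dictionary is to train it on sample data. Below is a minimal sketch using the
`zstd` crate's sample-based trainer; the `samples/` directory and the
dictionary size are illustrative assumptions, not a prescribed procedure.

```rust
use std::fs;

fn main() -> std::io::Result<()> {
    // Collect training samples: each file under `samples/` is one sample.
    let samples: Vec<Vec<u8>> = fs::read_dir("samples")?
        .filter_map(|entry| entry.ok())
        .map(|entry| fs::read(entry.path()))
        .collect::<Result<_, _>>()?;
    // Train a dictionary of up to ~110 kiB and write it into the pile.
    let dict = zstd::dict::from_samples(&samples, 112_640)?;
    fs::write("important_zstd.dict", &dict)?;
    Ok(())
}
```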

39
flake.lock generated
View File

@ -1,5 +1,26 @@
{
"nodes": {
"fenix": {
"inputs": {
"nixpkgs": [
"nixpkgs"
],
"rust-analyzer-src": "rust-analyzer-src"
},
"locked": {
"lastModified": 1682230876,
"narHash": "sha256-vCnd1pZRQKCdNvivQBD7WzaOlU1GcN91OCAz1rnoe5M=",
"owner": "nix-community",
"repo": "fenix",
"rev": "378f052d9f1cd90060ec4329f81782fee80490a4",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "fenix",
"type": "github"
}
},
"flake-utils": {
"inputs": {
"systems": "systems"
@ -100,12 +121,30 @@
},
"root": {
"inputs": {
"fenix": "fenix",
"naersk": "naersk",
"nixpkgs": "nixpkgs",
"poetry2nix": "poetry2nix",
"utils": "utils"
}
},
"rust-analyzer-src": {
"flake": false,
"locked": {
"lastModified": 1682163822,
"narHash": "sha256-u7vaRlI6rYiutytoTk8lyOtNKO/rz5Q63Z6S6QzYCtU=",
"owner": "rust-lang",
"repo": "rust-analyzer",
"rev": "2feabc4dc462644287372922928110eea4c60ca7",
"type": "github"
},
"original": {
"owner": "rust-lang",
"ref": "nightly",
"repo": "rust-analyzer",
"type": "github"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,

View File

@ -7,6 +7,11 @@
url = "github:nix-community/naersk";
inputs.nixpkgs.follows = "nixpkgs";
};
# Current Rust in nixpkgs is too old unfortunately — let's use the Fenix overlay's packages...
fenix = {
url = "github:nix-community/fenix";
inputs.nixpkgs.follows = "nixpkgs";
};
nixpkgs.url = "nixpkgs/nixos-23.11";
poetry2nix = {
url = "github:nix-community/poetry2nix";
@ -14,16 +19,60 @@
};
};
outputs = { self, nixpkgs, utils, naersk, poetry2nix }:
outputs = { self, nixpkgs, utils, naersk, fenix, poetry2nix }:
utils.lib.eachDefaultSystem (system: let
pkgs = nixpkgs.legacyPackages."${system}";
inherit (poetry2nix.lib.mkPoetry2Nix { inherit pkgs; }) mkPoetryApplication;
naersk-lib = naersk.lib."${system}";
#fenixRustToolchain = fenix.packages."${system}".minimal.toolchain
# fenixRustToolchain =
# fenix."${system}".complete.withComponents [
# "cargo"
# "clippy"
# "rust-src"
# "rustc"
# "rustfmt"
# ];
# fenixRustToolchain = fenix.packages."${system}".stable.toolchain;
fenixRustToolchain =
fenix.packages."${system}".stable.withComponents [
"cargo"
"clippy"
"rust-src"
"rustc"
"rustfmt"
"rust-analyzer"
];
# rust-toolchain = pkgs.symlinkJoin {
# name = "rust-toolchain";
# paths = [fenixRustToolchain.rustc fenixRustToolchain.cargo fenixRustToolchain.clippy fenixRustToolchain.rustfmt fenixRustToolchain.rustPlatform.rustcSrc];
# };
#naersk-lib = naersk.lib."${system}";
naersk-lib = pkgs.callPackage naersk {
cargo = fenixRustToolchain;
rustc = fenixRustToolchain;
};
rustComponents = naersk-lib.buildPackage {
pname = "yama";
root = ./.;
overrideMain = attrs: {
# Set up the dev database, needed for compile-time query checking.
preConfigure = ''
export PATH="${pkgs.sqlx-cli}/bin:$PATH"
pushd yama_localcache
bash dev_db.sh
popd
'';
# Temporary, whilst we still need to occasionally rely on a debugger:
# don't strip debug symbols, at the cost of a much larger binary!
dontStrip = true;
};
buildInputs = with pkgs; [
openssl
pkg-config
@ -51,7 +100,7 @@
installPhase = ''
# set -eu
mkdir $out $out/bin
ln -s ${rustComponents}/bin/{yama,datman} $out/bin
ln -s ${rustComponents}/bin/{yama,datman,yamascan} $out/bin
ln -s ${mysqlHelper}/bin/datman-helper-mysql-{backup,restore} $out/bin
ln -s ${postgresHelper}/bin/datman-helper-postgres-{backup,restore} $out/bin
ln -s ${pkgs.lz4}/bin/lz4 $out/bin/
@ -62,8 +111,10 @@
postInstall = ''
# set -eu
for fn in $out/bin/{datman,yama,datman-helper-{mysql,postgres}-{backup,restore}}; do
for fnbase in {datman,yama,yamascan,datman-helper-{mysql,postgres}-{backup,restore}}; do
fn="$out/bin/$fnbase"
wrapProgram $fn --suffix PATH : $out/bin
mv "$out/bin/$fnbase" "$out/bin/7$fnbase"
done
'';
};
@ -86,7 +137,45 @@
# `nix develop`
devShell = pkgs.mkShell {
nativeBuildInputs = with pkgs; [ rustc cargo ];
buildInputs = [
fenixRustToolchain
#rust-toolchain
pkgs.pkg-config
pkgs.alsa-lib
pkgs.sqlite
pkgs.sqlx-cli
#pkgs.libclang # ??
];
nativeBuildInputs = [
pkgs.openssl
pkgs.python3
];
# Needed for bindgen when binding to avahi
LIBCLANG_PATH="${pkgs.llvmPackages_latest.libclang.lib}/lib";
# Don't know if this var does anything by itself, but you need to feed this value into IntelliJ IDEA, and it's probably easier to pull it out of an env var than to look it up each time.
RUST_SRC_PATH = "${fenixRustToolchain}/lib/rustlib/src/rust/library";
# Cargo culted:
# Add to rustc search path
RUSTFLAGS = (builtins.map (a: ''-L ${a}/lib'') [
]);
# Add to bindgen search path
BINDGEN_EXTRA_CLANG_ARGS =
# Includes with normal include path
(builtins.map (a: ''-I"${a}/include"'') [
])
# Includes with special directory paths
++ [
''-I"${pkgs.llvmPackages_latest.libclang.lib}/lib/clang/${pkgs.llvmPackages_latest.libclang.version}/include"''
#''-I"${pkgs.glib.dev}/include/glib-2.0"''
#''-I${pkgs.glib.out}/lib/glib-2.0/include/''
];
#nativeBuildInputs = with pkgs; [ rustc cargo ];
};
});
}

View File

@ -6,7 +6,7 @@ let
rust-toolchain = pkgs.symlinkJoin {
name = "rust-toolchain";
paths = [pkgs.rustc pkgs.cargo pkgs.rustfmt pkgs.rustPlatform.rustcSrc];
paths = [pkgs.rustc pkgs.cargo pkgs.clippy pkgs.rustfmt pkgs.rustPlatform.rustcSrc];
};
in
@ -19,6 +19,7 @@ pkgs.mkShell {
pkgs.alsa-lib
pkgs.sqlite
pkgs.sqlx-cli
#pkgs.libclang # ??
];

View File

@ -1,6 +1,6 @@
[package]
name = "yama"
version = "0.6.0-alpha.5"
version = "0.7.0-alpha.1"
authors = ["Olivier 'reivilibre' <olivier@librepush.net>"]
edition = "2018"
description = "Deduplicated, compressed and encrypted content pile manager"
@ -11,35 +11,48 @@ license = "GPL-3.0-or-later"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
fastcdc = "1.0.6"
zstd = "0.11.2" # 0.11.2+zstd.1.5.2
clap = { version = "3.1.18", features = ["derive"] }
blake = "2.0.2"
twox-hash = "1.5.0"
serde = { version = "1.0.104", features = ["derive"] }
serde_bare = "0.3.0"
users = "0.9.1"
crossbeam-channel = "0.5.1"
crossbeam-utils = "0.8.5"
toml = "0.5.5"
glob = "0.3.0"
nix = "0.17.0"
log = "0.4"
env_logger = "0.7.1"
indicatif = "0.14.0"
num_cpus = "1"
anyhow = "1.0"
thiserror = "1.0"
sodiumoxide = "0.2.6"
byteorder = "1"
itertools = "0.9.0"
rayon = "1.5.0"
rusqlite = "0.24.2"
chrono = "0.4.19"
rustyline = "7.1.0"
derivative = "2.2.0"
metrics = "0.17.1"
eyre = "0.6.8"
tracing = "0.1.37"
ignore = "0.4.20"
patricia_tree = "0.5.7"
users = "0.11.0"
serde = { version = "1.0.160", features = ["derive"] }
yama_pile = { path = "../yama_pile" }
yama_localcache = { path = "../yama_localcache" }
yama_wormfile = { path = "../yama_wormfile" }
yama_wormfile_fs = { path = "../yama_wormfile_fs" }
yama_wormfile_s3 = { path = "../yama_wormfile_s3" }
yama_wormfile_sftp = { path = "../yama_wormfile_sftp" }
yama_midlevel_crypto = { path = "../yama_midlevel_crypto" }
clap = { version = "4.2.2", features = ["derive"] }
tokio = { version = "1.28.1", features = ["full"] }
appdirs = "0.2.0"
twox-hash = "1.6.3"
hostname = "0.3.1"
tracing-subscriber = { version = "0.3.16", features = ["tracing-log", "env-filter"] }
tracing-indicatif = "0.3.0"
indicatif = "0.17.3"
dashmap = "5.4.0"
fastcdc = "3.0.3"
zstd = "0.12.3"
memmap2 = "0.5.10"
flume = "0.10.14"
async-recursion = "1.0.4"
toml = "0.7.3"
io-streams = "0.14.3"
dust_style_filetree_display = "0.8.5"
async-backtrace = "0.2.6"
signal-hook = "0.3.17"
[dev-dependencies]
temp-dir = "0.1.11"
maplit = "1.0.2"

File diff suppressed because it is too large

237
yama/src/bin/yamascan.rs Normal file
View File

@ -0,0 +1,237 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::collections::BTreeMap;
use std::io::SeekFrom;
use std::path::{Path, PathBuf};
use clap::Parser;
use dust_style_filetree_display::display::{draw_it, InitialDisplayData};
use dust_style_filetree_display::filter::AggregateData;
use dust_style_filetree_display::node::Node;
use dust_style_filetree_display::{get_height_of_terminal, get_width_of_terminal, init_color};
use eyre::{bail, Context, ContextCompat};
use patricia_tree::PatriciaMap;
use tokio::fs::OpenOptions;
use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
use yama::scan;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;
use yama_pile::tree::unpopulated::ScanEntry;
#[derive(Parser, Clone, Debug)]
pub enum YamaScanCommand {
/// Add an entry to an ignore file
#[command(alias = "i")]
Ignore {
/// What to ignore
path: String,
/// Don't anchor the match to this directory.
#[arg(short = 'a')]
unanchored: bool,
},
/// Show dust-style usage graph of the current directory, excluding excluded files.
#[command(alias = "du")]
Usage {
/// Specify an ignore rule. Can use multiple times.
#[arg(short = 'I', long = "ignore")]
ignore: Vec<String>,
},
}
#[tokio::main]
async fn main() -> eyre::Result<()> {
tracing_subscriber::registry()
.with(
tracing_subscriber::EnvFilter::try_from_default_env()
.unwrap_or_else(|_| "sqlx=warn,yama=debug,info".into()),
)
.with(tracing_subscriber::fmt::layer())
.init();
match YamaScanCommand::parse() {
YamaScanCommand::Usage { ignore } => {
let idd = InitialDisplayData {
short_paths: true,
is_reversed: false,
colors_on: !init_color(false),
by_filecount: false,
is_screen_reader: false,
iso: false,
};
let scan = scan::scan(Path::new("."), &ignore).context("Couldn't scan")?;
let top_nodes = assemble_display_tree_from_scan_entries(scan)?.children;
let root_display_node = dust_style_filetree_display::filter::get_biggest(
top_nodes,
AggregateData {
min_size: None,
only_dir: false,
only_file: false,
number_of_lines: get_height_of_terminal(),
depth: usize::MAX,
using_a_filter: false,
},
)
.expect("no root?");
draw_it(
idd,
false,
get_width_of_terminal(),
&root_display_node,
false,
)
}
YamaScanCommand::Ignore { path, unanchored } => {
let mut oo = OpenOptions::new()
.read(true)
.write(true)
.create(true)
.truncate(false)
.open(".yamaignore")
.await
.context("failed to open .yamaignore for r/w")?;
let pos = oo.seek(SeekFrom::End(0)).await?;
if pos > 0 {
oo.seek(SeekFrom::End(-1)).await?;
let last_byte = oo.read_u8().await?;
if last_byte != b'\n' {
oo.write_u8(b'\n').await?;
}
}
if unanchored {
oo.write_all(format!("{}\n", path).as_bytes()).await?;
} else {
oo.write_all(format!("/{}\n", path).as_bytes()).await?;
}
oo.flush().await?;
drop(oo);
}
_other => todo!(),
}
Ok(())
}
pub fn assemble_display_tree_from_scan_entries(scan: PatriciaMap<ScanEntry>) -> eyre::Result<Node> {
let mut dirs: BTreeMap<String, BTreeMap<String, Node>> = BTreeMap::new();
// special-case the root ("")
dirs.insert(String::new(), BTreeMap::new());
for (key, entry) in scan.into_iter() {
let key_string = String::from_utf8(key).context("bad UTF-8 in PMap")?;
let (parent_dir_name, child_name) =
key_string.rsplit_once('/').unwrap_or(("", &key_string));
match entry {
ScanEntry::NormalFile { size, .. } => {
// note: for the root, this inserts the root file entry as a child called "" within a fake root 'directory'.
// That's fine. We'll patch this up later.
dirs.get_mut(parent_dir_name)
.context("bad PMap: parent not seen first")?
.insert(
child_name.to_owned(),
Node {
name: PathBuf::from(&key_string),
size,
children: vec![],
inode_device: None,
depth: 0,
},
);
}
ScanEntry::Directory {
ownership: _,
permissions: _,
} => {
dirs.insert(key_string.clone(), BTreeMap::new());
// note: for the root, this inserts the root directory entry as a child called "" within the root.
// That's fine. We'll patch this up later.
dirs.get_mut(parent_dir_name)
.context("bad PMap: parent not seen first")?
.insert(
child_name.to_owned(),
Node {
name: PathBuf::from(&key_string),
size: 4096,
children: vec![],
inode_device: None,
depth: 0,
},
);
}
ScanEntry::SymbolicLink {
ownership: _,
target: _,
} => {
// note: for the root, this inserts the root symlink entry as a child called "" within a fake root 'directory'.
// That's fine. We'll patch this up later.
dirs.get_mut(parent_dir_name)
.context("bad PMap: parent not seen first")?
.insert(
child_name.to_owned(),
Node {
name: PathBuf::from(&key_string),
size: 4096,
children: vec![],
inode_device: None,
depth: 0,
},
);
}
}
}
// Now roll up the directories. In Rustc v1.66 it'd be nice to use pop_last()...
while let Some(last_key) = dirs.keys().last().cloned() {
let mut last_children = dirs.remove(&last_key).unwrap();
if last_key.is_empty() {
assert!(
dirs.is_empty(),
"when pulling out root pseudo-dir, dirs must be empty for roll-up."
);
let mut real_root = last_children.remove("").unwrap();
real_root.children = last_children.into_values().collect();
real_root.size += real_root.children.iter().map(|c| c.size).sum::<u64>();
return Ok(real_root);
}
// We want to roll up the directory last/key -> {child -> ...}
// so last -> {key -> {child -> ...}}
let (parent_dir, child_name) = last_key.rsplit_once('/').unwrap_or(("", &last_key));
let parent = dirs
.get_mut(parent_dir)
.context("bad PMap? no parent in rollup")?;
let child_in_parent = parent
.get_mut(child_name)
.context("dir child not populated")?;
child_in_parent.children = last_children.into_values().collect();
child_in_parent.size += child_in_parent.children.iter().map(|c| c.size).sum::<u64>();
}
bail!("no root found; bad PMap or bad roll-up???");
}

223
yama/src/check.rs Normal file
View File

@ -0,0 +1,223 @@
use crate::extract::expand_chunkrefs;
use crate::pile_with_cache::PileWithCache;
use crate::retriever::decompressor::DECOMPRESS_CAPACITY;
use crate::PROGRESS_BAR_STYLE;
use eyre::{bail, eyre, Context, ContextCompat};
use flume::{Receiver, Sender};
use indicatif::ProgressStyle;
use std::collections::{BTreeMap, BTreeSet};
use std::sync::Arc;
use tokio::task::JoinSet;
use tracing::{error, info, info_span, Instrument, Span};
use tracing_indicatif::span_ext::IndicatifSpanExt;
use yama_midlevel_crypto::chunk_id::ChunkId;
use yama_pile::definitions::BloblogId;
use yama_pile::tree::TreeNode;
use yama_wormfile::boxed::BoxedWormFileProvider;
use zstd::bulk::Decompressor;
/// Check that all pointers point to chunks that exist **in our local cache**.
pub async fn check_pointers_point_to_indexed_chunks(
pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
) -> eyre::Result<()> {
let pointer_names = pwc.pile.list_pointers().await?;
let mut rcrs_to_check = BTreeSet::new();
for pointer_name in &pointer_names {
let pointer = pwc
.pile
.read_pointer(pointer_name)
.await?
.context("pointer vanished")?;
if let Some(parent_name) = pointer.parent {
if !pointer_names.contains(&parent_name) {
bail!("{parent_name:?}, the parent of {pointer_name:?}, does not exist");
}
}
pointer
.root
.node
.visit(
&mut |node, _| {
if let TreeNode::NormalFile { content, .. } = node {
rcrs_to_check.insert(*content);
}
Ok(())
},
String::new(),
)
.unwrap();
}
let chunk_ids: BTreeSet<ChunkId> =
expand_chunkrefs(pwc, rcrs_to_check.into_iter().map(|x| ((), x)))
.await?
.into_iter()
.flat_map(|(_, x)| x)
.collect();
info!("{} chunks to check for existence", chunk_ids.len());
let mut cache = pwc.localcache.read().await?;
let resolved_chunks = cache.locate_chunks(&chunk_ids).await?;
if chunk_ids.len() != resolved_chunks.len() {
bail!("Not all chunk IDs could be resolved. TODO: this check error is currently not granular enough.");
}
info!("All {} chunks accounted for!", resolved_chunks.len());
Ok(())
}
/// Checks all the chunks in the bloblog and then returns the number of chunks that were checked.
pub async fn check_all_chunk_hashes_in_bloblog(
pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
bloblog_id: BloblogId,
) -> eyre::Result<u64> {
let mut decompressor = match &pwc.pile.pile_config.zstd_dict {
Some(dict) => Decompressor::with_dictionary(dict)?,
None => Decompressor::new()?,
};
let chunk_id_key = &pwc.pile.pile_config.chunk_id_key;
let mut bloblog = pwc
.pile
.read_bloblog(bloblog_id)
.await
.with_context(|| format!("could not open bloblog for checking: {bloblog_id:?}"))?;
let offsets_and_chunks_to_read: BTreeMap<u64, ChunkId> = bloblog
.footer()
.chunks
.iter()
.map(|(chunk_id, locator)| (locator.offset, *chunk_id))
.collect();
let mut buf = Vec::with_capacity(DECOMPRESS_CAPACITY);
let mut checked = 0;
for (_, chunk_id) in offsets_and_chunks_to_read {
let blob = bloblog
.read_chunk(chunk_id)
.await?
.context("missing chunk")?;
(decompressor, buf) = tokio::task::spawn_blocking(move || {
decompressor
.decompress_to_buffer(&blob, &mut buf)
.with_context(|| format!("failed to decompress {chunk_id:?} in {bloblog_id:?}"))?;
Ok::<_, eyre::Error>((decompressor, buf))
})
.await??;
if !chunk_id.verify(&buf, chunk_id_key) {
bail!("verification failure: chunk {chunk_id:?} in bloblog {bloblog_id:?} is corrupt!");
}
checked += 1;
}
Ok(checked)
}
pub async fn check_all_chunks_in_bloblogs(
pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
bloblogs: &BTreeSet<BloblogId>,
) -> eyre::Result<()> {
let prog_span = info_span!("check_all_chunks");
async {
let prog_span = Span::current();
prog_span.pb_set_style(
&ProgressStyle::default_bar()
.template(PROGRESS_BAR_STYLE)
.unwrap(),
);
prog_span.pb_set_length(bloblogs.len() as u64);
prog_span.pb_set_message("checking all bloblogs");
let mut workers = JoinSet::new();
let (bloblog_id_tx, bloblog_id_rx) = flume::bounded(16);
let (progress_tx, progress_rx) = flume::bounded(16);
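// Fan the bloblog IDs out to a small fixed pool of checker workers over a bounded
// channel; each worker reports one result per bloblog (a chunk count, or the check
// error) back over progress_tx.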
for _ in 0..4 {
let pwc = pwc.clone();
let bloblog_id_rx = bloblog_id_rx.clone();
let progress_tx = progress_tx.clone();
workers.spawn(async {
if let Err(err) =
check_all_chunks_in_bloblogs_worker(pwc, bloblog_id_rx, progress_tx).await
{
error!("Error in chunk checker worker: {err:?}")
}
});
}
drop(progress_tx);
drop(bloblog_id_rx);
let mut success = true;
let mut num_bloblogs_outstanding = bloblogs.len();
let mut total_chunks_checked = 0u64;
tokio::join!(
async move {
for bloblog_id in bloblogs {
bloblog_id_tx
.send_async(*bloblog_id)
.await
.expect("can no longer submit new bloblogs to be checked; fault?");
}
drop(bloblog_id_tx);
},
async {
while let Ok(res) = progress_rx.recv_async().await {
match res {
Ok(chunks_checked) => {
total_chunks_checked += chunks_checked;
}
Err(err) => {
error!("check failure: {err:?}");
success = false;
}
}
prog_span.pb_inc(1);
num_bloblogs_outstanding = num_bloblogs_outstanding
.checked_sub(1)
.expect("more bloblogs progress reports than expected?");
}
}
);
if num_bloblogs_outstanding > 0 {
bail!("{num_bloblogs_outstanding} bloblogs outstanding somehow");
}
info!("{total_chunks_checked} chunks checked!");
if !success {
bail!("There were chunk check failures.");
}
Ok(())
}
.instrument(prog_span)
.await
}
pub async fn check_all_chunks_in_bloblogs_worker(
pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
bloblogs_rx: Receiver<BloblogId>,
progress_tx: Sender<eyre::Result<u64>>,
) -> eyre::Result<()> {
while let Ok(bloblog_id) = bloblogs_rx.recv_async().await {
let check = check_all_chunk_hashes_in_bloblog(&pwc, bloblog_id).await;
progress_tx
.send_async(check)
.await
.map_err(|_| eyre!("check progress tx shut down"))?;
}
Ok(())
}

View File

@ -1,241 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::cmp::min;
use std::io;
use std::io::{Cursor, Read, Write};
use anyhow::anyhow;
use fastcdc::FastCDC;
use crate::definitions::{ChunkId, RecursiveChunkRef};
use crate::pile::{Pile, RawPile};
use crate::utils::bytes_to_hexstring;
pub const SENSIBLE_THRESHOLD: usize = 1024 * 1024;
// 256 kiB
pub const FASTCDC_MIN: usize = 256 * 1024;
// 1 MiB
pub const FASTCDC_AVG: usize = 1024 * 1024;
// 8 MiB
pub const FASTCDC_MAX: usize = 8 * 1024 * 1024;
pub trait ChunkSubmissionTarget: Sync {
fn submit(&self, chunk_id: ChunkId, chunk_data: &[u8]) -> anyhow::Result<()>;
}
impl<RP: RawPile> ChunkSubmissionTarget for Pile<RP> {
fn submit(&self, chunk_id: ChunkId, chunk_data: &[u8]) -> anyhow::Result<()> {
self.submit_chunk(chunk_id, chunk_data)
}
}
impl ChunkSubmissionTarget for crossbeam_channel::Sender<(ChunkId, Vec<u8>)> {
fn submit(&self, chunk_id: ChunkId, chunk_data: &[u8]) -> anyhow::Result<()> {
self.send((chunk_id, chunk_data.to_vec()))
.map_err(|_| anyhow::anyhow!("Failed to send to pipeline."))
}
}
/// A chunker that will generate nested chunks of chunk references if there is that much data
/// to store.
/// The root RecursiveChunker is fed data bytes.
/// If it exceeds the nominated threshold, it grows a child RecursiveChunker (which may do the same).
/// When done, `finish` should be called to flush the buffers and obtain a `RecursiveChunkRef`.
pub struct RecursiveChunker<'cst, CST: ChunkSubmissionTarget> {
/// The pile to submit chunks to.
target: &'cst CST,
/// Buffer of data at this level.
buffer: Vec<u8>,
/// The next-layer recursive chunker, to which this chunker submits chunk IDs
/// for recursive chunking.
next_layer: Option<Box<RecursiveChunker<'cst, CST>>>,
/// The size at which this chunker will perform recursive chunking.
threshold: usize,
}
impl<'cst, CST: ChunkSubmissionTarget> RecursiveChunker<'cst, CST> {
pub fn new(threshold: usize, target: &'cst CST) -> Self {
RecursiveChunker {
target,
buffer: vec![],
next_layer: None,
threshold,
}
}
/// finalise: true iff this is the last chunk (we will not reject a chunk which may have been
/// truncated)
fn do_chunking(&mut self, finalise: bool) -> anyhow::Result<Vec<u8>> {
let fastcdc = FastCDC::new(&self.buffer, FASTCDC_MIN, FASTCDC_AVG, FASTCDC_MAX);
let mut new_chunks: Vec<u8> = Vec::new();
let mut consumed_until: Option<usize> = None;
for chunk in fastcdc {
let is_final = chunk.offset + chunk.length == self.buffer.len();
if !is_final || finalise {
consumed_until = Some(chunk.offset + chunk.length);
let chunk_data = &self.buffer[chunk.offset..chunk.offset + chunk.length];
let chunk_id = calculate_chunkid(chunk_data);
new_chunks.extend_from_slice(&chunk_id);
self.target.submit(chunk_id, chunk_data)?;
}
}
if let Some(consumed_until) = consumed_until {
if consumed_until > 0 {
self.buffer.drain(0..consumed_until);
}
}
Ok(new_chunks)
}
pub fn finish(mut self) -> anyhow::Result<RecursiveChunkRef> {
if self.next_layer.is_some() {
// we are chunking so make this the last chunk
let new_chunks = self.do_chunking(true)?;
let mut subchunker = self.next_layer.unwrap();
subchunker.write(&new_chunks)?;
let mut rcr = subchunker.finish()?;
// as there is a level of chunking, increase the depth
rcr.depth += 1;
Ok(rcr)
} else {
// no chunking, so depth=0 (raw) and just emit our unchunked data
let chunk_id = calculate_chunkid(&self.buffer);
self.target.submit(chunk_id, &self.buffer)?;
Ok(RecursiveChunkRef { chunk_id, depth: 0 })
}
}
fn write_impl(&mut self, buf: &[u8]) -> anyhow::Result<usize> {
self.buffer.extend_from_slice(buf);
if self.buffer.len() > self.threshold {
if self.next_layer.is_none() {
// start chunking
self.next_layer = Some(Box::new(RecursiveChunker::new(
self.threshold,
self.target.clone(),
)));
}
let new_chunks = self.do_chunking(false)?;
self.next_layer.as_mut().unwrap().write(&new_chunks)?;
}
Ok(buf.len())
}
}
impl<'cst, CST: ChunkSubmissionTarget> Write for RecursiveChunker<'cst, CST> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
match self.write_impl(buf) {
Err(err) => Err(io::Error::new(io::ErrorKind::Other, err)),
Ok(written) => Ok(written),
}
}
fn flush(&mut self) -> io::Result<()> {
// nop is probably the most correct action here...
Ok(())
}
}
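#[cfg(test)]
mod chunker_tests {
    use super::*;

    // Illustrative sketch (hypothetical test data): data below the recursion
    // threshold is submitted as a single raw chunk, so `finish` yields a
    // RecursiveChunkRef of depth 0 whose chunk ID matches the submitted chunk.
    #[test]
    fn small_buffer_stays_at_depth_zero() -> anyhow::Result<()> {
        let (tx, rx) = crossbeam_channel::unbounded();
        let mut chunker = RecursiveChunker::new(SENSIBLE_THRESHOLD, &tx);
        chunker.write_all(&[42u8; 1024])?;
        let rcr = chunker.finish()?;
        assert_eq!(rcr.depth, 0);
        let (chunk_id, data) = rx.recv()?;
        assert_eq!(chunk_id, rcr.chunk_id);
        assert_eq!(data.len(), 1024);
        Ok(())
    }
}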
#[inline]
pub fn calculate_chunkid(chunk: &[u8]) -> ChunkId {
// TODO(newver) Allow pluggable chunkID calculations so that encrypted storage can work without
// leaking contents.
let mut chunk_id: ChunkId = Default::default();
blake::hash(256, &chunk, &mut chunk_id).expect("BLAKE problem");
chunk_id
}
pub struct RecursiveUnchunker<'pile, RP: RawPile> {
sub_reader: Box<dyn Read + 'pile>,
buffer: Vec<u8>,
pile: &'pile Pile<RP>,
}
impl<'pile, RP: RawPile> RecursiveUnchunker<'pile, RP> {
/// Create a new recursive unchunker.
/// This will automatically create sub-unchunkers as needed.
pub fn new(pile: &'pile Pile<RP>, reference: RecursiveChunkRef) -> Self {
if reference.depth == 0 {
// this unchunker only needs to unchunk the one chunk
RecursiveUnchunker {
sub_reader: Box::new(Cursor::new(reference.chunk_id.to_vec())),
buffer: vec![],
pile: &pile,
}
} else {
let next_ref = RecursiveChunkRef {
depth: reference.depth - 1,
..reference
};
let sub_unchunker = RecursiveUnchunker::new(pile, next_ref);
RecursiveUnchunker {
sub_reader: Box::new(sub_unchunker),
buffer: vec![],
pile: &pile,
}
}
}
}
impl<'pile, RP: RawPile> Read for RecursiveUnchunker<'pile, RP> {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
while self.buffer.is_empty() {
// Internal buffer is empty, so we need to load another chunk.
// (we use 'while' in case we load an empty chunk...)
let mut next_chunk_id: ChunkId = Default::default();
let read_bytes = self.sub_reader.read(&mut next_chunk_id)?;
if read_bytes == 0 {
// end of chunks, because return of zero here means EOF
return Ok(0);
}
if read_bytes < next_chunk_id.len() {
// any error, including EOF at this point, is an error
self.sub_reader
.read_exact(&mut next_chunk_id[read_bytes..])?;
}
let chunk = self
.pile
.read_chunk(&next_chunk_id)
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
if let Some(chunk) = chunk {
self.buffer.extend_from_slice(&chunk);
} else {
return Err(io::Error::new(
io::ErrorKind::NotFound,
anyhow!("Chunk {} not found", bytes_to_hexstring(&next_chunk_id)),
));
}
}
let to_read = min(self.buffer.len(), buf.len());
buf[0..to_read].copy_from_slice(&self.buffer[0..to_read]);
self.buffer.drain(0..to_read);
Ok(to_read)
}
}

View File

@ -1,183 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::fs::File;
use std::io;
use std::io::{Read, Write};
use std::path::Path;
use std::sync::Arc;
use anyhow::{anyhow, bail, Context};
use log::warn;
use crate::chunking::{RecursiveChunker, RecursiveUnchunker, SENSIBLE_THRESHOLD};
use crate::definitions::{PointerData, RecursiveChunkRef, RootTreeNode, TreeNode};
use crate::pile::compression::{CompressionSettings, RawPileCompressor};
use crate::pile::integrity::RawPileIntegrityChecker;
use crate::pile::local_sqlitebloblogs::SqliteBloblogPile;
use crate::pile::{Pile, PileDescriptor, PileStorage, RawPile};
use crate::tree::{integrate_node_in_place, merge_uid_or_gid_tables};
use crate::utils::get_number_of_workers;
pub fn init(dir: &Path) -> anyhow::Result<()> {
let yama_toml = dir.join("yama.toml");
if yama_toml.exists() {
bail!("yama.toml already exists. Cannot create yama pile here.");
}
/*
let pile_db = sled::open(dir.join("pile.sled"))?;
pile_db.flush()?;
*/
let mut file = File::create(yama_toml)?;
let desc = PileDescriptor {
yama_version: env!("CARGO_PKG_VERSION").to_owned(),
storage: PileStorage::SqliteIndexedBloblog,
compression: Some(12),
};
file.write_all(&toml::to_vec(&desc)?)?;
Ok(())
}
pub fn load_pile_descriptor(dir: &Path) -> anyhow::Result<PileDescriptor> {
let yama_toml = dir.join("yama.toml");
if !yama_toml.exists() {
bail!("yama.toml does not exist here. Is this an existing pile?");
}
let mut file = File::open(yama_toml)?;
let mut buf = Vec::new();
file.read_to_end(&mut buf)?;
Ok(toml::from_slice(&buf)?)
}
pub fn open_pile(dir: &Path, desc: &PileDescriptor) -> anyhow::Result<Pile<Box<dyn RawPile>>> {
let num_compressors = get_number_of_workers("YAMA_COMPRESSORS");
let num_decompressors = get_number_of_workers("YAMA_DECOMPRESSORS");
match desc.storage {
PileStorage::RemoteOnly => {
bail!("This is a remote-only pile. No local storage allowed.");
}
PileStorage::SqliteIndexedBloblog => {
let blob_raw_pile = RawPileIntegrityChecker::new(SqliteBloblogPile::open(dir)?);
let raw_pile: Box<dyn RawPile> = match desc.compression {
None => Box::new(blob_raw_pile),
Some(comp_level) => {
let mut dictionary = Vec::new();
let dict_path = dir.join("important_zstd.dict");
File::open(dict_path)
.context("You need important_zstd.dict in your pile folder.")?
.read_to_end(&mut dictionary)?;
let (compressed_pile, _handles) = RawPileCompressor::new(
blob_raw_pile,
CompressionSettings {
dictionary: Arc::new(dictionary),
level: comp_level as i32,
num_compressors: num_compressors as u32,
num_decompressors: num_decompressors as u32,
},
)?;
Box::new(compressed_pile)
}
};
Ok(Pile::new(raw_pile))
}
}
}
pub fn store_tree_node<RP: RawPile>(
pile: &Pile<RP>,
root_tree_node: &RootTreeNode,
) -> anyhow::Result<RecursiveChunkRef> {
let serialised = serde_bare::to_vec(root_tree_node)?;
let mut chunker = RecursiveChunker::new(SENSIBLE_THRESHOLD, pile);
io::copy(&mut (&serialised[..]), &mut chunker)?;
let chunk_ref = chunker.finish()?;
Ok(chunk_ref)
}
pub fn retrieve_tree_node<RP: RawPile>(
pile: &Pile<RP>,
chunk_ref: RecursiveChunkRef,
) -> anyhow::Result<RootTreeNode> {
let mut serialised = Vec::new();
let mut unchunker = RecursiveUnchunker::new(pile, chunk_ref);
io::copy(&mut unchunker, &mut serialised)?;
Ok(serde_bare::from_slice(&serialised)?)
/*
let unchunker = RecursiveUnchunker::new(pile, chunk_ref);
Ok(serde_bare::from_reader(unchunker)?)
*/
}
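// Round-trip sketch (illustrative): store a tree node, then read it back.
// The `TreeNode::Deleted` placeholder node is just for demonstration.
pub fn store_retrieve_example<RP: RawPile>(pile: &Pile<RP>) -> anyhow::Result<()> {
    let root = RootTreeNode {
        name: "example".to_owned(),
        node: TreeNode::Deleted,
    };
    let chunk_ref = store_tree_node(pile, &root)?;
    pile.flush()?; // make sure the chunks hit storage before reading back
    let recovered = retrieve_tree_node(pile, chunk_ref)?;
    assert_eq!(recovered.name, "example");
    Ok(())
}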
/// Given a pointer, fully integrates it in-place. The pointer will no longer have a parent when
/// this operation is finished.
pub fn fully_integrate_pointer_node<RP: RawPile>(
pile: &Pile<RP>,
tree_node: &mut TreeNode,
pointer: &mut PointerData,
) -> anyhow::Result<()> {
if let Some(parent_name) = &pointer.parent_pointer {
let mut parent = pile
.read_pointer(parent_name.as_str())?
.ok_or_else(|| anyhow!("Parent pointer {:?} not found.", parent_name))?;
let mut parent_node = retrieve_tree_node(pile, parent.chunk_ref.clone())?.node;
fully_integrate_pointer_node(pile, &mut parent_node, &mut parent)?;
integrate_node_in_place(tree_node, &mut parent_node)?;
// merge in the UID and GID tables when integrating.
if !merge_uid_or_gid_tables(&mut pointer.uid_lookup, &parent.uid_lookup) {
warn!(
"Overlap when merging parent:{:?}'s UID table into child.",
parent_name
);
}
if !merge_uid_or_gid_tables(&mut pointer.gid_lookup, &parent.gid_lookup) {
warn!(
"Overlap when merging parent:{:?}'s GID table into child.",
parent_name
);
}
pointer.parent_pointer = None;
}
Ok(())
}
/// Loads a pointer and fully integrates it.
/// The result will be a fully-integrated pointer (it won't have a parent).
pub fn fully_load_pointer<RP: RawPile>(
pile: &Pile<RP>,
pointer_name: &str,
) -> anyhow::Result<(PointerData, RootTreeNode)> {
let mut pointer_data = pile
.read_pointer(pointer_name)?
.ok_or_else(|| anyhow!("Pointer {:?} not found.", pointer_name))?;
let mut root_node = retrieve_tree_node(pile, pointer_data.chunk_ref.clone())?;
fully_integrate_pointer_node(pile, &mut root_node.node, &mut pointer_data)?;
Ok((pointer_data, root_node))
}
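// Invariant sketch (illustrative; the pointer name is hypothetical): after
// `fully_load_pointer`, the chain of parents has been collapsed into the result.
pub fn fully_load_example<RP: RawPile>(pile: &Pile<RP>) -> anyhow::Result<()> {
    let (pointer_data, _root_node) = fully_load_pointer(pile, "host-2023-08-13")?;
    assert!(pointer_data.parent_pointer.is_none());
    Ok(())
}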

View File

@ -1,213 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use crate::commands::retrieve_tree_node;
use crate::definitions::{FilesystemOwnership, FilesystemPermissions, TreeNode};
use crate::operations::remove_pointer_safely;
use crate::pile::{Pile, PileDescriptor, RawPile};
use anyhow::anyhow;
use clap::Parser;
use rustyline::error::ReadlineError;
use rustyline::Editor;
#[derive(Parser)]
pub enum DebugCommand {
/// List the pointers that are stored in this yama pile.
#[clap(name = "lsp")]
ListPointers {
/// List details about each pointer.
#[clap(short)]
verbose: bool,
},
/// Delete a pointer from the yama pile.
#[clap(name = "rmp")]
DeletePointer {
/// Name of the pointer to delete.
name: String,
},
/// Reads the information on a pointer.
#[clap(name = "infop")]
PointerInfo {
/// Name of the pointer to read.
name: String,
},
/// Reads statistics from the Pile
#[clap(name = "stats")]
Statistics {},
}
pub fn debug_prompt<RP: RawPile>(pdesc: PileDescriptor, pile: Pile<RP>) -> anyhow::Result<()> {
let mut rl = Editor::<()>::new();
if rl.load_history(".yama-history").is_err() {
// no previous history...
}
loop {
let readline = rl.readline("debug 山 ");
match readline {
Ok(line) => {
rl.add_history_entry(line.as_str());
let mut args = vec![""];
args.extend(line.split_ascii_whitespace());
match DebugCommand::try_parse_from(args) {
Ok(command) => {
if let Err(e) = debug_command(&pdesc, &pile, command) {
eprintln!("Failed {:?}", e);
}
}
Err(e) => {
eprintln!("{}", e);
}
}
}
Err(ReadlineError::Interrupted) => {
eprintln!("^C");
break;
}
Err(ReadlineError::Eof) => {
eprintln!("^D");
break;
}
Err(err) => {
eprintln!("Error: {:?}", err);
break;
}
}
}
pile.flush()?;
rl.save_history(".yama-history").unwrap();
Ok(())
}
pub fn debug_command<RP: RawPile>(
_pdesc: &PileDescriptor,
pile: &Pile<RP>,
command: DebugCommand,
) -> anyhow::Result<()> {
match &command {
DebugCommand::ListPointers { verbose } => {
for pointer in pile.list_pointers()?.iter() {
if *verbose {
let pointer_data = pile.read_pointer(pointer.as_str())?;
match pointer_data {
None => {
println!("{} → ??? MISSING DATA", pointer);
}
Some(data) => {
println!(
"{} → {:?} [parent={:?}]",
pointer, data.chunk_ref, data.parent_pointer
);
}
}
} else {
println!("{}", pointer);
}
}
}
DebugCommand::DeletePointer { name } => {
remove_pointer_safely(pile, name)?;
}
DebugCommand::PointerInfo { name } => {
let this_pointer = pile
.read_pointer(name.as_str())?
.ok_or_else(|| anyhow!("Pointer {:?} does not exist.", name))?;
let this_node = retrieve_tree_node(&pile, this_pointer.chunk_ref.clone())?;
eprintln!(" --- Pointer data --- ");
eprintln!("{:#?}", this_pointer);
eprintln!(" --- Tree node --- ");
//eprintln!("{:#?}", this_node.node);
tree_node_printer(&this_node.name, &this_node.node, 0);
}
DebugCommand::Statistics { .. } => {
if let Some(stats) = pile.raw_pile.debug_statistics()? {
println!("Statistics for this pile");
println!(" chunk count: {} chunks", stats.number_of_chunks);
println!(
" total chunk stored space: {} bytes (may exclude deleted chunks)",
stats.total_chunk_size
);
let average_size =
((stats.total_chunk_size as f64) / (stats.number_of_chunks as f64)) as u64;
println!(" (average chunk size: {} bytes)", average_size);
} else {
eprintln!("{:?}", pile);
eprintln!("Statistics appear not to be supported on this kind of pile?");
}
}
}
Ok(())
}
pub fn compact_ownership(ownership: &FilesystemOwnership) -> String {
format!("uid={}, gid={}", ownership.uid, ownership.gid)
}
pub fn compact_permissions(perms: &FilesystemPermissions) -> String {
format!("{:4o}", perms.mode)
}
pub fn tree_node_printer(name: &str, node: &TreeNode, depth: usize) {
match node {
TreeNode::NormalFile {
mtime,
ownership,
permissions,
content,
} => {
eprintln!(
"{}{} = {:?} ({}, {}, mtime={})",
" ".repeat(depth),
name,
content,
compact_ownership(ownership),
compact_permissions(permissions),
mtime
);
}
TreeNode::Directory {
ownership,
permissions,
children,
} => {
eprintln!(
"{}{}/ ({}, {})",
" ".repeat(depth),
name,
compact_ownership(ownership),
compact_permissions(permissions)
);
for (name, child) in children.iter() {
tree_node_printer(name, child, depth + 1);
}
}
TreeNode::SymbolicLink { ownership, target } => {
eprintln!(
"{}{} → {} ({})",
" ".repeat(depth),
name,
target,
compact_ownership(ownership)
);
}
TreeNode::Deleted => {
eprintln!("{}{} DELETED", " ".repeat(depth), name);
}
}
}

yama/src/debugging.rs Normal file (20 lines)
View File

@ -0,0 +1,20 @@
use tokio::signal::unix::SignalKind;
use tracing::warn;
/// Registers a signal handler on SIGUSR1 that dumps a backtrace of the tokio task tree.
///
/// May be useful for debugging deadlocks etc.
pub fn register_sigusr1_backtrace_helper() {
tokio::spawn(async {
while let Some(()) = tokio::signal::unix::signal(SignalKind::user_defined1())
.unwrap()
.recv()
.await
{
warn!(
"SIGUSR1 received; debug task backtrace:\n{}",
async_backtrace::taskdump_tree(false)
);
}
});
}
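// Wiring sketch (illustrative): call the helper once at startup from within a
// Tokio runtime, then send the signal from a shell to log the task tree:
//
//     #[tokio::main]
//     async fn main() {
//         yama::debugging::register_sigusr1_backtrace_helper();
//         // ... run the real workload ...
//         // From another terminal:  kill -USR1 <pid>
//     }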

View File

@ -1,357 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::collections::BTreeMap;
use std::fmt::{Debug, Formatter};
use anyhow::bail;
use serde::{Deserialize, Serialize};
use crate::utils::bytes_to_hexstring;
pub type ChunkId = [u8; 32];
pub type XXHash = u64;
pub const XXH64_SEED: u64 = 424242;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PointerData {
pub chunk_ref: RecursiveChunkRef,
pub parent_pointer: Option<String>,
pub uid_lookup: BTreeMap<u16, Option<String>>,
pub gid_lookup: BTreeMap<u16, Option<String>>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PartialPointerData {
pub chunk_ref: RecursiveChunkRef,
pub uid_lookup: BTreeMap<u16, Option<String>>,
pub gid_lookup: BTreeMap<u16, Option<String>>,
}
impl PartialPointerData {
pub fn complete(self, parent_pointer: Option<String>) -> PointerData {
PointerData {
chunk_ref: self.chunk_ref,
parent_pointer,
uid_lookup: self.uid_lookup,
gid_lookup: self.gid_lookup,
}
}
}
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct RecursiveChunkRef {
/// The root Chunk ID.
pub chunk_id: ChunkId,
/// The depth of the data bytes.
/// 0 means that the chunk addressed by `chunk_id` contains data bytes.
/// 1 means that the chunk addressed by `chunk_id` contains references to chunks that contain
/// data bytes.
/// (and so on)
pub depth: u32,
}
impl Debug for RecursiveChunkRef {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(f, "{}<{}>", bytes_to_hexstring(&self.chunk_id), self.depth)
}
}
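// Worked example of the depth semantics: a reference printed as `ab12…<2>`
// resolves in two hops. The root chunk holds packed 32-byte chunk IDs, each of
// those chunks holds further chunk IDs, and only the chunks reached at depth 0
// hold the actual data bytes.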
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RootTreeNode {
pub name: String,
pub node: TreeNode,
}
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub enum TreeNode {
NormalFile {
/// modification time in ms
mtime: u64,
ownership: FilesystemOwnership,
permissions: FilesystemPermissions,
// TODO size: u64 or not
// can perhaps cache chunk-wise (but not sure.)
content: RecursiveChunkRef,
},
Directory {
ownership: FilesystemOwnership,
permissions: FilesystemPermissions,
children: BTreeMap<String, TreeNode>,
},
SymbolicLink {
ownership: FilesystemOwnership,
target: String,
},
// TODO is there any other kind of file we need to store?
Deleted,
}
impl TreeNode {
pub fn metadata_invalidates(&self, other: &TreeNode) -> anyhow::Result<bool> {
Ok(match self {
TreeNode::NormalFile {
mtime,
ownership,
permissions,
..
} => {
if let TreeNode::NormalFile {
mtime: other_mtime,
ownership: other_ownership,
permissions: other_permissions,
..
} = other
{
mtime != other_mtime
|| ownership != other_ownership
|| permissions != other_permissions
} else {
true
}
}
TreeNode::Directory {
ownership,
permissions,
children,
} => {
if let TreeNode::Directory {
ownership: other_ownership,
permissions: other_permissions,
children: other_children,
} = other
{
if ownership != other_ownership || permissions != other_permissions {
return Ok(true);
}
if children.len() != other_children.len() {
return Ok(true);
}
for ((left_name, left_node), (right_name, right_node)) in
children.iter().zip(other_children.iter())
{
if left_name != right_name || left_node.metadata_invalidates(right_node)? {
return Ok(true);
}
}
false
} else {
true
}
}
TreeNode::SymbolicLink { ownership, target } => {
if let TreeNode::SymbolicLink {
ownership: other_ownership,
target: other_target,
} = other
{
ownership != other_ownership || target != other_target
} else {
true
}
}
TreeNode::Deleted => {
// should be unreachable in practice
bail!("metadata_invalidates() called on a TreeNode::Deleted");
}
})
}
//
///// Guarantees consistent visit order.
// pub fn visit_mut<F>(
// &mut self,
// visitor: &mut F,
// path_prefix: &str,
// skip_components: u32,
// ) -> anyhow::Result<()>
// where
// F: FnMut(&mut Self, &str) -> anyhow::Result<()>,
// {
// let mut my_path_buf = String::new();
// my_path_buf.push_str(path_prefix);
//
// if skip_components == 0 {
// if !my_path_buf.is_empty() {
// my_path_buf.push('/');
// }
// my_path_buf.push_str(&self.name);
// }
//
// visitor(self, &my_path_buf)?;
//
// if let TreeNode::Directory { children, .. } = &mut self.content {
// for child in children.iter_mut() {
// let new_skip = if skip_components > 0 {
// skip_components - 1
// } else {
// 0
// };
// child.visit_mut(visitor, &my_path_buf, new_skip)?;
// }
// }
// Ok(())
// }
pub fn count_normal_files(&self) -> u32 {
match self {
TreeNode::NormalFile { .. } => 1,
TreeNode::Directory { children, .. } => children
.values()
.map(|child| child.count_normal_files())
.sum(),
_ => 0,
}
}
pub fn visit<F: FnMut(&TreeNode, &str) -> anyhow::Result<()>>(
&self,
func: &mut F,
prefix: String,
) -> anyhow::Result<()> {
func(self, &prefix)?;
if let TreeNode::Directory { children, .. } = &self {
for (name, child) in children.iter() {
if prefix.is_empty() {
// don't want a slash prefix
child.visit(func, name.clone())?;
} else {
child.visit(func, format!("{}/{}", prefix, name))?;
}
}
}
Ok(())
}
pub fn visit_mut<F: FnMut(&mut TreeNode, &str) -> anyhow::Result<()>>(
&mut self,
func: &mut F,
prefix: String,
) -> anyhow::Result<()> {
func(self, &prefix)?;
if let TreeNode::Directory { children, .. } = self {
for (name, child) in children.iter_mut() {
if prefix.is_empty() {
// don't want a slash prefix
child.visit_mut(func, name.clone())?;
} else {
child.visit_mut(func, format!("{}/{}", prefix, name))?;
}
}
}
Ok(())
}
pub fn exists(&self, pieces: &[&str]) -> bool {
match pieces.first() {
None => true,
Some(subpath) => {
if let TreeNode::Directory { children, .. } = self {
match children.get(*subpath) {
None => false,
Some(child) => child.exists(&pieces[1..]),
}
} else {
false
}
}
}
}
/// Recurses into a child by name, or returns Err with a reason.
pub fn child(&mut self, name: &str) -> Result<&mut TreeNode, &'static str> {
match self {
TreeNode::NormalFile { .. } => Err("not a directory: normal file"),
TreeNode::Directory { children, .. } => match children.get_mut(name) {
None => Err("child not in directory"),
Some(node) => Ok(node),
},
TreeNode::SymbolicLink { .. } => Err("not a directory: symlink"),
TreeNode::Deleted => Err("not a directory: deleted"),
}
}
}
#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct FilesystemOwnership {
pub uid: u16,
pub gid: u16,
}
#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct FilesystemPermissions {
pub mode: u32,
}
#[cfg(test)]
pub mod tests {
use crate::definitions::{
FilesystemOwnership, FilesystemPermissions, RecursiveChunkRef, TreeNode,
};
use std::collections::BTreeMap;
pub fn example_file() -> TreeNode {
TreeNode::NormalFile {
mtime: 424242,
ownership: FilesystemOwnership {
uid: 1042,
gid: 1043,
},
permissions: FilesystemPermissions { mode: 0o760 },
content: RecursiveChunkRef {
chunk_id: Default::default(),
depth: 0,
},
}
}
pub fn example_dir(
file1: Option<(&str, TreeNode)>,
file2: Option<(&str, TreeNode)>,
) -> TreeNode {
let mut map = BTreeMap::new();
if let Some((name, file)) = file1 {
map.insert(name.to_owned(), file);
}
if let Some((name, file)) = file2 {
map.insert(name.to_owned(), file);
}
TreeNode::Directory {
ownership: FilesystemOwnership {
uid: 1042,
gid: 1043,
},
permissions: FilesystemPermissions { mode: 0o770 },
children: map,
}
}
#[test]
pub fn test_exists() {
let file = example_file();
assert!(file.exists(&[]));
assert!(!file.exists(&["anything"]));
let subdir = example_dir(Some(("fetchmailrc", example_file())), None);
let dir = example_dir(Some(("boot.img", example_file())), Some(("etc", subdir)));
assert!(dir.exists(&[]));
assert!(dir.exists(&["boot.img"]));
assert!(dir.exists(&["etc", "fetchmailrc"]));
assert!(!dir.exists(&["bin"]));
assert!(!dir.exists(&["etc", "resolv.conf"]));
assert!(!dir.exists(&["boot.img", "hehe"]));
}
}

yama/src/extract.rs Normal file (483 lines)
View File

@ -0,0 +1,483 @@
use crate::pile_with_cache::PileWithCache;
use crate::retriever::decompressor::PipelineDecompressor;
use crate::retriever::{create_fixed_retriever, FileId, JobChunkReq, JobId, RetrieverResp};
use crate::PROGRESS_BAR_STYLE;
use eyre::{bail, ensure, eyre, Context, ContextCompat};
use flume::Receiver;
use indicatif::ProgressStyle;
use patricia_tree::PatriciaMap;
use std::cmp::Reverse;
use std::collections::{BTreeMap, BTreeSet};
use std::fs::Permissions;
use std::io::Write;
use std::os::unix::fs::PermissionsExt;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tokio::fs::OpenOptions;
use tokio::io::AsyncWriteExt;
use tokio::task::JoinSet;
use tracing::{info_span, Instrument, Span};
use tracing_indicatif::span_ext::IndicatifSpanExt;
use yama_midlevel_crypto::chunk_id::ChunkId;
use yama_pile::definitions::{BloblogId, RecursiveChunkRef};
use yama_pile::tree::unpopulated::ScanEntry;
use yama_pile::tree::{FilesystemPermissions, TreeNode};
use yama_wormfile::boxed::BoxedWormFileProvider;
#[derive(Clone, Debug, Default)]
pub struct FlattenedTree {
pub files: PatriciaMap<(ScanEntry, RecursiveChunkRef)>,
pub nonfiles: PatriciaMap<ScanEntry>,
}
pub fn flatten_treenode(root_node: &TreeNode) -> eyre::Result<FlattenedTree> {
let mut flat = FlattenedTree::default();
root_node.visit(
&mut |node, path| {
match node {
TreeNode::NormalFile {
mtime,
ownership,
permissions,
size,
content,
} => {
flat.files.insert(
path,
(
ScanEntry::NormalFile {
mtime: *mtime,
ownership: *ownership,
permissions: *permissions,
size: *size,
},
*content,
),
);
}
TreeNode::Directory {
ownership,
permissions,
children: _,
} => {
flat.nonfiles.insert(
path,
ScanEntry::Directory {
ownership: *ownership,
permissions: *permissions,
},
);
}
TreeNode::SymbolicLink { ownership, target } => {
flat.nonfiles.insert(
path,
ScanEntry::SymbolicLink {
ownership: *ownership,
target: target.clone(),
},
);
}
TreeNode::Deleted => {
bail!("unexpected TreeNode::Deleted in flatten_treenode");
}
}
Ok(())
},
String::new(),
)?;
Ok(flat)
}
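// Consumption sketch (illustrative): the flattened form splits the tree into
// chunk-backed files and metadata-only entries.
pub fn flatten_summary_example(root_node: &TreeNode) -> eyre::Result<()> {
    let flat = flatten_treenode(root_node)?;
    println!(
        "{} files to retrieve, {} directories/symlinks to create",
        flat.files.len(),
        flat.nonfiles.len()
    );
    Ok(())
}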
/// Create directories and symbolic links.
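/// Relies on `PatriciaMap` iterating keys in byte order, so each parent directory is visited (and created) before anything inside it.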
pub async fn unpack_nonfiles(
root: &Path,
nonfiles: &PatriciaMap<ScanEntry>,
restore_ownership: bool,
restore_permissions: bool,
) -> eyre::Result<()> {
if restore_ownership {
bail!("restoring ownership is not yet supported...");
}
for (rel_path, scan_entry) in nonfiles.iter() {
let path = root
.join(String::from_utf8(rel_path).context("nonfiles map contains non-string keys?")?);
match scan_entry {
ScanEntry::NormalFile { .. } => {
bail!("found NormalFile in unpack_nonfiles()");
}
ScanEntry::Directory {
ownership: _,
permissions,
} => {
tokio::fs::create_dir(&path).await?;
if restore_permissions {
tokio::fs::set_permissions(&path, Permissions::from_mode(permissions.mode))
.await?;
}
}
ScanEntry::SymbolicLink {
ownership: _,
target,
} => {
tokio::fs::symlink(target, &path).await?;
}
}
}
Ok(())
}
// TODO(perf): move out file writes into separate tasks...
pub async fn unpack_files(
pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
root: &Path,
files: &PatriciaMap<(ScanEntry, RecursiveChunkRef)>,
restore_ownership: bool,
restore_permissions: bool,
) -> eyre::Result<()> {
if restore_ownership {
bail!("restoring ownership is not yet supported...");
}
let expanded_chunkrefs = expand_chunkrefs(
pwc,
files
.iter()
.map(|(path_bytes, (scan_entry, rcr))| ((path_bytes, scan_entry), *rcr)),
)
.await?;
let total_chunks = expanded_chunkrefs
.iter()
.map(|(_, cs)| cs.len() as u64)
.sum::<u64>();
let unpack_span = info_span!("unpack_files");
async move {
let unpack_span = Span::current();
unpack_span.pb_set_style(&ProgressStyle::default_bar().template(
PROGRESS_BAR_STYLE,
).unwrap());
unpack_span.pb_set_message("unpack");
unpack_span.pb_set_length(total_chunks);
let mut join_set = JoinSet::new();
let (file_part_retriever, mut jobs) =
lookup_chunkrefs_and_create_retriever(pwc, expanded_chunkrefs).await?;
let mut open_files = BTreeMap::new();
loop {
tokio::select! {
Ok(next_part) = file_part_retriever.recv_async() => {
match next_part {
RetrieverResp::Blob { job, subjob, blob } => {
if subjob == 0 {
// eprintln!("subjob 0 for job {job:?}");
let (path_bytes, scan_entry) = jobs
.remove(&job)
.with_context(|| format!("bad job {job:?} to extract"))?;
let (permissions, _ownership) = if let ScanEntry::NormalFile {
permissions,
ownership,
..
} = scan_entry
{
(permissions, ownership)
} else {
bail!("not a Normal File in unpack_files()");
};
let path = root.join(String::from_utf8(path_bytes).context("bad utf-8 in path map")?);
let (tx, rx) = flume::bounded(16);
join_set.spawn(file_unpacker_writer(path, *permissions, restore_permissions, rx));
open_files.insert(job, tx);
}
open_files
.get_mut(&job)
.context("bad job to write file")?
.send_async(Some(blob))
.await
.map_err(|_| eyre!("file tx shutdown"))?;
unpack_span.pb_inc(1);
}
RetrieverResp::JobComplete(job) => {
open_files
.remove(&job)
.context("bad job to finish file")?
.send_async(None)
.await
.map_err(|_| eyre!("file tx shutdown"))?;
}
}
},
Some(join_result) = join_set.join_next() => {
join_result
.context("failed file unpacker writer (a)")?
.context("failed file unpacker writer (b)")?;
},
else => {
break;
}
}
}
// we should have already drained the join set, but check...
assert!(join_set.join_next().await.is_none());
if !open_files.is_empty() || !jobs.is_empty() {
bail!("There were errors extracting.");
}
Ok(())
}.instrument(unpack_span).await
}
pub async fn unpack_sync_stream(
pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
chunkref: RecursiveChunkRef,
mut stream: impl Write,
) -> eyre::Result<()> {
let expanded_chunkrefs = expand_chunkrefs(pwc, vec![((), chunkref)].into_iter()).await?;
let total_chunks = expanded_chunkrefs
.iter()
.map(|(_, cs)| cs.len() as u64)
.sum::<u64>();
let unpack_span = info_span!("unpack_files");
async move {
let unpack_span = Span::current();
unpack_span.pb_set_style(
&ProgressStyle::default_bar()
.template(PROGRESS_BAR_STYLE)
.unwrap(),
);
unpack_span.pb_set_message("unpack");
unpack_span.pb_set_length(total_chunks);
let (file_part_retriever, _) =
lookup_chunkrefs_and_create_retriever(pwc, expanded_chunkrefs).await?;
let mut done = false;
while let Ok(next_part) = file_part_retriever.recv_async().await {
match next_part {
RetrieverResp::Blob { blob, .. } => {
tokio::task::block_in_place(|| stream.write_all(&blob))
.context("Failed to write to output stream on Blob")?;
unpack_span.pb_inc(1);
}
RetrieverResp::JobComplete(_) => {
tokio::task::block_in_place(|| stream.flush())
.context("Failed to flush output stream on JobComplete")?;
done = true;
}
}
}
if !done {
bail!("There were errors extracting.");
}
Ok(())
}
.instrument(unpack_span)
.await
}
async fn file_unpacker_writer(
path: PathBuf,
permissions: FilesystemPermissions,
restore_permissions: bool,
rx: Receiver<Option<Vec<u8>>>,
) -> eyre::Result<()> {
let mut oo = OpenOptions::new();
oo.write(true).create_new(true);
if restore_permissions {
oo.mode(permissions.mode);
};
let mut file = oo
.open(&path)
.await
.with_context(|| format!("can't create {path:?}"))?;
loop {
match rx.recv_async().await {
Ok(Some(next_block)) => {
file.write_all(&next_block).await?;
}
Ok(None) => {
file.flush().await.context("failed to flush")?;
return Ok(());
}
Err(_) => {
bail!("rx for file unpacking into {path:?} disconnected unexpectedly");
}
}
}
}
pub(crate) async fn expand_chunkrefs<T>(
pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
chunkrefs: impl Iterator<Item = (T, RecursiveChunkRef)>,
) -> eyre::Result<Vec<(T, Vec<ChunkId>)>> {
let mut by_depth = BTreeMap::<Reverse<u32>, Vec<(T, Vec<ChunkId>)>>::new();
for (t, rec) in chunkrefs {
by_depth
.entry(Reverse(rec.depth))
.or_default()
.push((t, vec![rec.chunk_id]));
}
while let Some(Reverse(next_depth)) = by_depth.keys().next().cloned() {
let ts_and_chunks = by_depth.remove(&Reverse(next_depth)).unwrap();
if next_depth == 0 {
return Ok(ts_and_chunks);
}
let ec_span = info_span!("expand_chunkrefs");
ec_span.pb_set_style(
&ProgressStyle::default_bar()
.template(PROGRESS_BAR_STYLE)
.unwrap(),
);
ec_span.pb_set_length(
ts_and_chunks
.iter()
.map(|(_, cs)| cs.len() as u64)
.sum::<u64>(),
);
ec_span.pb_set_message(&format!("resolve (d={next_depth})"));
let expanded_ts_and_chunks = expand_chunkrefs_one_layer(pwc, ts_and_chunks)
.instrument(ec_span)
.await?;
by_depth
.entry(Reverse(next_depth - 1))
.or_default()
.extend(expanded_ts_and_chunks);
}
Ok(Vec::new())
}
async fn lookup_chunkrefs_and_create_retriever<T>(
pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
input: Vec<(T, Vec<ChunkId>)>,
) -> eyre::Result<(Receiver<RetrieverResp>, BTreeMap<JobId, T>)> {
let mut next_job_id = JobId(0);
let chunks_to_lookup: BTreeSet<ChunkId> = input
.iter()
.flat_map(|(_t, chunkids)| chunkids)
.copied()
.collect();
let looked_up_chunks = pwc
.localcache
.read()
.await?
.locate_chunks(&chunks_to_lookup)
.await?;
ensure!(
chunks_to_lookup.len() == looked_up_chunks.len(),
"chunks are missing"
);
let bloblog_ids: BTreeSet<BloblogId> = looked_up_chunks.values().map(|(bi, _)| *bi).collect();
let num_bloblogs = bloblog_ids.len();
let bloblog_to_file_ids: BTreeMap<BloblogId, FileId> = bloblog_ids
.into_iter()
.zip((0..num_bloblogs as u32).map(FileId))
.collect();
let files: BTreeMap<FileId, BloblogId> =
bloblog_to_file_ids.iter().map(|(&k, &v)| (v, k)).collect();
let mut out_by_job = BTreeMap::<JobId, T>::new();
let mut jobs = BTreeMap::<JobId, Vec<JobChunkReq>>::new();
for (t, chunks) in input {
let job_id = next_job_id;
next_job_id.0 += 1;
out_by_job.insert(job_id, t);
jobs.insert(
job_id,
chunks
.into_iter()
.map(|c| {
let (bloblog_id, blob_locator) = &looked_up_chunks[&c];
JobChunkReq {
file: bloblog_to_file_ids[bloblog_id],
offset: blob_locator.offset,
length: blob_locator.length,
}
})
.collect(),
);
}
let retriever = create_fixed_retriever(pwc.clone(), jobs, files, 8)?;
let retriever =
PipelineDecompressor::start(pwc.pile.pile_config.zstd_dict.clone(), 2, retriever)?;
Ok((retriever, out_by_job))
}
pub(crate) async fn expand_chunkrefs_one_layer<T>(
pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
input: Vec<(T, Vec<ChunkId>)>,
) -> eyre::Result<Vec<(T, Vec<ChunkId>)>> {
let (retriever, jobs_to_ts) = lookup_chunkrefs_and_create_retriever(pwc, input).await?;
let mut out_by_job: BTreeMap<JobId, (T, Vec<u8>)> = jobs_to_ts
.into_iter()
.map(|(ji, t)| (ji, (t, Vec::new())))
.collect();
let mut num_jobs_left = out_by_job.len();
while let Ok(result) = retriever.recv_async().await {
match result {
RetrieverResp::Blob {
job,
subjob: _,
blob,
} => {
out_by_job
.get_mut(&job)
.context("bad job gm")?
.1
.extend_from_slice(&blob);
Span::current().pb_inc(1);
}
RetrieverResp::JobComplete(_) => {
num_jobs_left -= 1;
}
}
}
ensure!(num_jobs_left == 0, "jobs left over, recovery not complete");
out_by_job
.into_values()
.map(|(t, bytes)| {
let chunk_ids = bytes
.chunks(32)
.map(|b| {
if b.len() != 32 {
bail!("wrong number of bytes for chunk refs");
}
let mut b32 = [0u8; 32];
b32.copy_from_slice(b);
Ok(ChunkId::from(b32))
})
.collect::<eyre::Result<_>>()?;
Ok((t, chunk_ids))
})
.collect()
}

yama/src/init.rs Normal file (90 lines)
View File

@ -0,0 +1,90 @@
use eyre::{bail, Context, ContextCompat};
use std::path::Path;
use tokio::io::AsyncWriteExt;
use yama_midlevel_crypto::byte_layer::{ByteLayer, CborSerde};
use yama_midlevel_crypto::key_derivation::KeyDerivationParameters;
use yama_midlevel_crypto::sym_box::SymBox;
use yama_pile::definitions::{PackedKeyring, PackedPileConfig, UnlockedOrLockedKeyring};
use yama_pile::keyring::Keyring;
use yama_pile::{DIR_BLOBLOGS, DIR_INDICES, DIR_LOCKS, FILE_MASTER_KEYRING, FILE_YAMA_CONFIG};
use yama_wormfile::paths::WormPath;
use yama_wormfile::{WormFileProvider, WormFileWriter};
/// Perform checks before we init a pile in the given directory.
pub async fn pre_init_check(path: &Path) -> eyre::Result<()> {
if path.exists() && !path.is_dir() {
bail!("{path:?} is not a directory; cannot create pile or connector here.");
}
for important_path in [
"yama.toml",
DIR_BLOBLOGS,
DIR_LOCKS,
FILE_YAMA_CONFIG,
DIR_INDICES,
] {
let important_path = path.join(important_path);
if important_path.exists() {
bail!("{important_path:?} already exists: can't create pile or connector here.");
}
}
Ok(())
}
/// Perform checks before we init a pile in the given WormFileProvider.
pub async fn pre_init_check_wfp(wfp: &impl WormFileProvider) -> eyre::Result<()> {
for important_path in ["yama.toml", FILE_YAMA_CONFIG] {
let important_path = WormPath::new(important_path).unwrap();
if wfp.is_regular_file(&important_path).await? {
bail!("{important_path:?} already exists: can't create pile.");
}
}
Ok(())
}
/// Initialise a pile.
/// Should be run after `pre_init_check_wfp`.
pub async fn init_pile(
wfp: &impl WormFileProvider,
pile_config: PackedPileConfig,
master_keyring_copy: Option<PackedKeyring>,
) -> eyre::Result<()> {
let mut writer = wfp.write().await?;
writer.write_all(&pile_config.into_byte_vec()).await?;
writer.flush().await?;
writer
.finalise(WormPath::new(FILE_YAMA_CONFIG).unwrap(), false)
.await?;
if let Some(master_keyring_copy) = master_keyring_copy {
let mut writer = wfp.write().await?;
writer
.write_all(&master_keyring_copy.into_byte_vec())
.await?;
writer.flush().await?;
writer
.finalise(WormPath::new(FILE_MASTER_KEYRING).unwrap(), false)
.await?;
}
Ok(())
}
// todo move this
pub fn pack_keyring(unpacked: Keyring, password: Option<&str>) -> eyre::Result<PackedKeyring> {
let packed = if let Some(password) = password {
let deriver = KeyDerivationParameters::new_recommended();
let key = deriver
.derive(password)
.context("Failed to derive key from password")?;
let symkey = key.into_symkey();
let lockbox = SymBox::new(CborSerde::serialise(&unpacked).unwrap(), &symkey)
.context("Failed to encrypt keyring")?;
UnlockedOrLockedKeyring::Locked { deriver, lockbox }
} else {
UnlockedOrLockedKeyring::Unlocked(unpacked)
};
Ok(PackedKeyring::serialise(&packed).unwrap())
}
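// Usage sketch (illustrative): pack a keyring under a password and write it
// where `pre_open_keyring` will look for it. The password is a placeholder.
pub async fn pack_and_write_example(keyring: Keyring) -> eyre::Result<()> {
    let packed = pack_keyring(keyring, Some("correct horse battery staple"))?;
    tokio::fs::write("access.yamakeyring", packed.into_byte_vec()).await?;
    Ok(())
}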

View File

@ -1,10 +1,25 @@
pub mod chunking;
pub mod commands;
pub mod debug;
pub mod definitions;
pub mod operations;
pub mod pile;
pub mod progress;
pub mod remote;
pub mod tree;
pub mod utils;
pub mod init;
pub mod open;
pub mod check;
pub mod extract;
pub mod scan;
pub mod storing;
pub mod vacuum;
pub mod pile_connector;
pub mod pile_with_cache;
pub mod retriever;
pub const PROGRESS_BAR_STYLE: &'static str =
"[{elapsed_precise}]/[{eta}] {wide_bar:.cyan/blue} {pos:>7}/{len:7} {msg}";
pub fn get_hostname() -> String {
hostname::get()
.expect("No hostname")
.into_string()
.expect("Hostname string must be sensible.")
}
pub mod debugging;

yama/src/open.rs Normal file (228 lines)
View File

@ -0,0 +1,228 @@
use crate::pile_connector::PileConnectionScheme;
use crate::pile_with_cache::PileWithCache;
use eyre::{bail, Context, ContextCompat};
use std::borrow::Cow;
use std::collections::BTreeSet;
use std::hash::{Hash, Hasher};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tokio::io::{AsyncBufReadExt, BufReader};
use tracing::debug;
use twox_hash::XxHash64;
use yama_midlevel_crypto::byte_layer::ByteLayer;
use yama_pile::definitions::{IndexId, PackedKeyring, UnlockedOrLockedKeyring};
use yama_pile::keyring::Keyring;
use yama_pile::locks::LockKind;
use yama_pile::{Pile, FILE_YAMA_CONFIG, FILE_YAMA_CONNECTOR};
use yama_wormfile::boxed::BoxedWormFileProvider;
pub const KEYRING_LOOKUP_SEQ: [&'static str; 2] = ["access.yamakeyring", "master.yamakeyring"];
pub async fn pre_open_keyring(connector_in_dir: &Path) -> eyre::Result<UnlockedOrLockedKeyring> {
for lookup in KEYRING_LOOKUP_SEQ {
let keyring_path = connector_in_dir.join(lookup);
if keyring_path.exists() {
return pre_open_keyring_at_path(&keyring_path).await;
}
}
bail!(
"No keyring found in {:?}. Expected to see one at one of: {:?}",
connector_in_dir,
KEYRING_LOOKUP_SEQ
);
}
pub async fn pre_open_keyring_at_path(
keyring_path: &Path,
) -> eyre::Result<UnlockedOrLockedKeyring> {
let packed_keyring_bytes = tokio::fs::read(&keyring_path)
.await
.with_context(|| format!("failed to read keyring file at {:?}", keyring_path))?;
let packed_keyring = PackedKeyring::from_byte_vec(packed_keyring_bytes)
.deserialise()
.with_context(|| format!("failed to deserialise keyring file at {:?}", keyring_path))?;
Ok(packed_keyring)
}
pub async fn open_keyring_interactive(input: UnlockedOrLockedKeyring) -> eyre::Result<Keyring> {
match input {
UnlockedOrLockedKeyring::Locked { deriver, lockbox } => {
println!("enter keyring password to decrypt:");
let stdin = tokio::io::stdin();
let mut stdin_br = BufReader::new(stdin);
let mut line = String::new();
stdin_br.read_line(&mut line).await?;
let derived = deriver
.derive(line.trim())
.context("failed to derive key from password")?;
let keyring = lockbox
.unlock(&derived.into_symkey())
.context("failed to decrypt keyring")?
.deserialise()
.context("failed to deserialise keyring")?;
Ok(keyring)
}
UnlockedOrLockedKeyring::Unlocked(keyring) => Ok(keyring),
}
}
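/// Open the pile (or pile connector) found in the given directory, taking the requested lock.
/// A directory containing the pile config is treated as a local pile; otherwise the connector
/// file is read to reach the underlying storage.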
pub async fn open_pile(
connector_in_dir: &Path,
keyring: Keyring,
lock_kind: LockKind,
lock_holder: String,
) -> eyre::Result<PileWithCache<BoxedWormFileProvider>> {
let connection_scheme = if connector_in_dir.join(FILE_YAMA_CONFIG).exists() {
PileConnectionScheme::Local {
directory: connector_in_dir
.canonicalize()
.context("can't canonicalise local pile path")?
.to_owned(),
}
} else if connector_in_dir.join(FILE_YAMA_CONNECTOR).exists() {
let connector_toml = tokio::fs::read_to_string(&connector_in_dir.join(FILE_YAMA_CONNECTOR))
.await
.context("failed to read connector")?;
let connector: PileConnectionScheme =
toml::from_str(&connector_toml).context("failed to deserialise connector")?;
connector
} else {
bail!("Neither yama.cfg nor yama.toml exists; doesn't look like a Yama pile or pile connector.");
};
// Calculate a prefix for the cache name
let canon_connector_in_dir = connector_in_dir
.canonicalize()
.unwrap_or(connector_in_dir.to_owned());
let cache_base_name = canon_connector_in_dir
.file_name()
.map(|f| f.to_string_lossy())
.unwrap_or(Cow::Borrowed("_"));
open_pile_using_connector(
&connection_scheme,
cache_base_name.as_ref(),
keyring,
lock_kind,
lock_holder,
)
.await
}
pub async fn open_pile_using_connector(
connection_scheme: &PileConnectionScheme,
cache_base_name: &str,
keyring: Keyring,
lock_kind: LockKind,
lock_holder: String,
) -> eyre::Result<PileWithCache<BoxedWormFileProvider>> {
let wormfileprovider = Arc::new(connection_scheme.connect_to_wormfileprovider().await?);
let pile = Pile::open_manual(wormfileprovider, lock_kind, lock_holder, keyring).await?;
let cache_dir = appdirs::user_cache_dir(Some("yama"), None).expect("can't obtain cache dir!");
let mut hasher = XxHash64::default();
connection_scheme.hash(&mut hasher);
let u64_hash = hasher.finish();
let cache_key = format!("{}-{:016x}.sqlite3", cache_base_name, u64_hash);
tokio::fs::create_dir_all(&cache_dir).await?;
let cache_file = cache_dir.join(&cache_key);
let localcache = yama_localcache::Store::new(&cache_file)
.await
.context("failed to open local cache")?;
Ok(PileWithCache { pile, localcache })
}
pub async fn update_cache(pwc: &PileWithCache<BoxedWormFileProvider>) -> eyre::Result<()> {
debug!("updating cache");
let available_indices = pwc
.pile
.list_indices()
.await
.context("can't list available indices")?;
let present_indices = pwc
.localcache
.read()
.await?
.list_indices()
.await
.context("can't list cached indices")?;
let missing_indices: BTreeSet<IndexId> = available_indices
.difference(&present_indices)
.cloned()
.collect();
let deleted_indices: BTreeSet<IndexId> = present_indices
.difference(&available_indices)
.cloned()
.collect();
let mut downloaded_indices = Vec::new();
debug!(
"{} new indices to cache, {} deleted indices to back out",
missing_indices.len(),
deleted_indices.len()
);
for missing_index in missing_indices {
debug!("downloading index {missing_index:?}");
downloaded_indices.push((missing_index, pwc.pile.read_index(missing_index).await?));
}
let mut txn = pwc.localcache.write().await?;
for deleted_index in deleted_indices {
debug!("backing out index {deleted_index:?}");
txn.delete_index(deleted_index).await?;
}
for (index_id, index) in downloaded_indices {
debug!("applying index {index_id:?}");
txn.apply_index(index_id, Arc::new(index)).await?;
}
debug!("finished updating cache");
Ok(())
}
pub async fn open_lock_and_update_cache(
pile_connector_path: PathBuf,
lock_name: String,
) -> eyre::Result<Arc<PileWithCache<BoxedWormFileProvider>>> {
let keyring = pre_open_keyring(&pile_connector_path).await?;
let keyring = open_keyring_interactive(keyring).await?;
let pwc = open_pile(&pile_connector_path, keyring, LockKind::Shared, lock_name).await?;
update_cache(&pwc).await?;
Ok(Arc::new(pwc))
}
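// Usage sketch (illustrative; the path and lock-holder name are hypothetical):
// the result is a shared-locked pile plus an up-to-date local index cache.
pub async fn open_example() -> eyre::Result<Arc<PileWithCache<BoxedWormFileProvider>>> {
    open_lock_and_update_cache(
        PathBuf::from("/backups/mypile"),
        "example-lock-holder".to_owned(),
    )
    .await
}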
pub async fn open_lock_and_update_cache_with_connector(
pile_connection_scheme: &PileConnectionScheme,
cache_base_name: &str,
keyring_path: &Path,
lock_name: String,
) -> eyre::Result<Arc<PileWithCache<BoxedWormFileProvider>>> {
let keyring = pre_open_keyring_at_path(keyring_path).await?;
let keyring = open_keyring_interactive(keyring).await?;
let pwc = open_pile_using_connector(
pile_connection_scheme,
cache_base_name,
keyring,
LockKind::Shared,
lock_name,
)
.await?;
update_cache(&pwc).await?;
Ok(Arc::new(pwc))
}

View File

@ -1,80 +0,0 @@
use crate::commands::{fully_integrate_pointer_node, retrieve_tree_node, store_tree_node};
use crate::pile::{Pile, RawPile};
use crate::tree::{differentiate_node_in_place, integrate_node_in_place};
use anyhow::{anyhow, Context};
use log::info;
pub mod checking;
pub mod cleanup;
pub mod extracting;
pub mod legacy_pushpull;
pub mod storing;
pub fn remove_pointer_safely<P: RawPile>(pile: &Pile<P>, name: &str) -> anyhow::Result<()> {
// retrieve this pointer
let mut this_pointer = pile
.read_pointer(name)?
.ok_or_else(|| anyhow!("Pointer {:?} does not exist so can not be deleted.", name))?;
let mut this_node = retrieve_tree_node(&pile, this_pointer.chunk_ref.clone())
.context("retrieving 'this' node")?;
let new_parent_name = this_pointer.parent_pointer.clone();
fully_integrate_pointer_node(pile, &mut this_node.node, &mut this_pointer)
.context("integrating new parent")?;
let new_parent = if let Some(ref new_parent_name) = new_parent_name {
let mut new_parent_pointer = pile
.read_pointer(new_parent_name.as_str())?
.ok_or_else(|| anyhow!("Parent pointer {:?} does not exist.", name))?;
let mut new_parent_node = retrieve_tree_node(&pile, new_parent_pointer.chunk_ref.clone())?;
fully_integrate_pointer_node(pile, &mut new_parent_node.node, &mut new_parent_pointer)?;
Some((new_parent_pointer, new_parent_node))
} else {
None
};
// now integrate any pointers that rely on this one
// so that they no longer rely on this one.
for pointer in pile.list_pointers()?.iter() {
if pointer == name {
continue;
}
if let Some(mut pointer_data) = pile.read_pointer(pointer.as_str())? {
if let Some(parent_pointer) = pointer_data.parent_pointer.as_ref() {
if parent_pointer == name {
info!("Pointer would be orphaned: {:?}; integrating", pointer);
// need to integrate this node, so retrieve it
let mut node = retrieve_tree_node(&pile, pointer_data.chunk_ref)?;
// integrate it in-place
integrate_node_in_place(&mut node.node, &this_node.node)?;
if let Some((_, ref new_parent_node)) = new_parent {
// then differentiate with respect to the NEW parent
differentiate_node_in_place(&mut node.node, &new_parent_node.node)?;
}
// pass through the parent
pointer_data.parent_pointer = new_parent_name.clone();
// store the updated version of the pointer
let new_chunk_ref = store_tree_node(&pile, &node)?;
// associate the new node with the new version of the pointer
pointer_data.chunk_ref = new_chunk_ref;
// write the pointer back.
pile.write_pointer(pointer.as_str(), &pointer_data)?;
// we must flush chunks before deleting the pointer
pile.flush()
.context("flushing after writing pointer back")?;
}
}
}
}
// then delete the pointer
pile.delete_pointer(name)?;
info!("Deleted pointer: {:?}", name);
Ok(())
}

View File

@ -1,438 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use crate::chunking::RecursiveUnchunker;
use crate::commands::retrieve_tree_node;
use crate::definitions::{ChunkId, TreeNode};
use crate::pile::{
ControllerMessage, Keyspace, Pile, PipelineDescription, RawPile, StoragePipelineSettings,
};
use anyhow::bail;
use crossbeam_channel::Sender;
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use itertools::Itertools;
use log::{error, info, warn};
use std::collections::HashSet;
use std::convert::TryInto;
use std::io::{Read, Write};
use std::sync::Mutex;
#[derive(PartialEq, Eq, Copy, Clone, Debug)]
pub enum VacuumMode {
NoVacuum,
DryRunVacuum,
Vacuum,
}
pub struct NullWriter {}
impl Write for NullWriter {
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
Ok(buf.len())
}
fn flush(&mut self) -> std::io::Result<()> {
Ok(())
}
}
/// Mark-and-sweep style vacuuming system.
/// We mark all the chunks that we run into (following the structure of all the pointers and
/// recursive chunk references) and sweep the chunks that have not been read.
#[derive(Debug)]
pub struct VacuumRawPile<RP: RawPile> {
underlying: RP,
vacuum_tracking_enabled: bool,
pub retrieved_chunks: Mutex<HashSet<ChunkId>>,
}
impl<RP: RawPile> VacuumRawPile<RP> {
pub fn new(underlying: RP, vacuum_tracking_enabled: bool) -> Self {
VacuumRawPile {
underlying,
vacuum_tracking_enabled,
retrieved_chunks: Default::default(),
}
}
pub fn calculate_vacuum_for_sweeping(&self) -> anyhow::Result<HashSet<ChunkId>> {
if !self.vacuum_tracking_enabled {
bail!("Vacuum tracking not enabled, you can't calculate the vacuum set!");
}
let mut to_sweep = HashSet::new();
let retrieved_chunks = self.retrieved_chunks.lock().unwrap();
let mut chunk_id: ChunkId = Default::default();
for key in self.list_keys(Keyspace::Chunk)? {
chunk_id.clone_from_slice(&key?);
if !retrieved_chunks.contains(&chunk_id) {
to_sweep.insert(chunk_id.clone());
}
}
Ok(to_sweep)
}
}
impl<RP: RawPile> RawPile for VacuumRawPile<RP> {
fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool> {
self.underlying.exists(kind, key)
}
fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
if self.vacuum_tracking_enabled && kind == Keyspace::Chunk {
let mut chunk_id: ChunkId = Default::default();
chunk_id.clone_from_slice(key);
self.retrieved_chunks.lock().unwrap().insert(chunk_id);
}
self.underlying.read(kind, key)
}
fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
self.underlying.write(kind, key, value)
}
fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
self.underlying.delete(kind, key)
}
fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
self.underlying.delete_many(kind, keys)
}
fn list_keys(
&self,
kind: Keyspace,
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>> {
self.underlying.list_keys(kind)
}
fn flush(&self) -> anyhow::Result<()> {
self.underlying.flush()
}
fn check_lowlevel(&self) -> anyhow::Result<bool> {
self.underlying.check_lowlevel()
}
fn build_storage_pipeline(
&self,
settings: StoragePipelineSettings,
controller_send: Sender<ControllerMessage>,
) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
self.underlying
.build_storage_pipeline(settings, controller_send)
}
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
self.underlying.describe_pipeline()
}
fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
self.underlying.chunk_id_transfer_ordering_hint(chunk_id)
}
}
/// Runs a full check of a Yama pile. This reads ALL the chunks, which can take a long time.
/// This is also capable of finding and vacuuming unused chunks.
/// This checks:
/// - the integrity of each chunk (assuming an integrity-aware raw pile is used)
/// - the structure of pointers and multi-level chunk references
pub fn check_deep<RP: RawPile>(
pile: Pile<RP>,
vacuum: VacuumMode,
make_progress_bar: bool,
) -> anyhow::Result<u32> {
let pile = Pile::new(VacuumRawPile::new(
pile.raw_pile,
vacuum != VacuumMode::NoVacuum,
));
let mut errors = 0;
let mut to_check = Vec::new();
let pointer_list = pile.list_pointers()?;
for pointer in pointer_list.iter() {
info!("Checking pointer {:?}", pointer);
match pile.read_pointer(&pointer)? {
Some(pointer_data) => {
if let Some(parent) = pointer_data.parent_pointer {
if !pointer_list.contains(&parent) {
errors += 1;
error!(
"Pointer {:?} has a parent {:?} which does not exist.",
pointer, parent
);
}
}
let tree_node = retrieve_tree_node(&pile, pointer_data.chunk_ref.clone())?;
tree_node.node.visit(
&mut |node, _| {
if let TreeNode::NormalFile { content, .. } = node {
to_check.push(content.clone());
}
Ok(())
},
"".to_owned(),
)?;
}
None => {
errors += 1;
error!("Pointer {:?} does not seem to exist.", pointer);
}
}
}
let pbar = if make_progress_bar {
ProgressBar::with_draw_target(1000 as u64, ProgressDrawTarget::stdout_with_hz(10))
} else {
ProgressBar::hidden()
};
pbar.set_style(
ProgressStyle::default_bar()
.template("[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}"),
);
pbar.set_message("checking");
let mut done = 0;
while let Some(next_to_check) = to_check.pop() {
done += 1;
pbar.set_length(done + to_check.len() as u64);
pbar.set_position(done);
let mut unchunker = RecursiveUnchunker::new(&pile, next_to_check.clone());
match std::io::copy(&mut unchunker, &mut NullWriter {}) {
Ok(_) => {}
Err(err) => {
errors += 1;
warn!(
"Error occurred when reading {:?}: {:?}.",
next_to_check, err
);
}
}
}
pbar.finish_and_clear();
if errors > 0 {
error!("There were {:?}", errors);
} else {
info!("No errors.");
}
if errors == 0 && vacuum != VacuumMode::NoVacuum {
info!("Calculating sweep set for vacuuming.");
let to_vacuum = pile.raw_pile.calculate_vacuum_for_sweeping()?;
info!("{} chunks are ready to be vacuumed.", to_vacuum.len());
if vacuum == VacuumMode::Vacuum {
let pbar = if make_progress_bar {
ProgressBar::with_draw_target(
to_vacuum.len() as u64,
ProgressDrawTarget::stdout_with_hz(10),
)
} else {
ProgressBar::hidden()
};
pbar.set_style(
ProgressStyle::default_bar().template(
"[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}",
),
);
pbar.set_message("vacuuming");
// actually do the vacuum!
info!("Going to vacuum them up.");
for vacuum_id in to_vacuum {
pile.raw_pile.delete(Keyspace::Chunk, &vacuum_id)?;
pbar.inc(1);
}
pile.flush()?;
pbar.finish_and_clear();
}
}
Ok(errors)
}
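// Usage sketch (illustrative): a deep check with a progress bar, no vacuuming.
// Note that `check_deep` consumes the pile.
pub fn deep_check_example<RP: RawPile>(pile: Pile<RP>) -> anyhow::Result<()> {
    let errors = check_deep(pile, VacuumMode::NoVacuum, true)?;
    if errors > 0 {
        bail!("deep check found {} error(s)", errors);
    }
    Ok(())
}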
/// A shallower check than the deep one. This avoids reading the last layer of chunks.
/// (they are simply assumed to be OK).
/// This leads to much faster performance and is mostly intended for GC.
/// We can check existence for those leaf chunks if desired. This still avoids the
/// overhead of decryption, decompression and reading from disk/network.
pub fn check_shallow<RP: RawPile>(
pile: Pile<RP>,
vacuum: VacuumMode,
make_progress_bar: bool,
check_existence: bool,
) -> anyhow::Result<u32> {
let pile = Pile::new(VacuumRawPile::new(
pile.raw_pile,
vacuum != VacuumMode::NoVacuum,
));
let mut additional_seen: HashSet<ChunkId> = HashSet::new();
let mut errors = 0;
let mut to_check = Vec::new();
let pointer_list = pile.list_pointers()?;
for pointer in pointer_list.iter() {
info!("Checking pointer {:?}", pointer);
match pile.read_pointer(&pointer)? {
Some(pointer_data) => {
if let Some(parent) = pointer_data.parent_pointer {
if !pointer_list.contains(&parent) {
errors += 1;
error!(
"Pointer {:?} has a parent {:?} which does not exist.",
pointer, parent
);
}
}
let tree_node = retrieve_tree_node(&pile, pointer_data.chunk_ref.clone())?;
tree_node.node.visit(
&mut |node, _| {
if let TreeNode::NormalFile { content, .. } = node {
to_check.push(content.clone());
}
Ok(())
},
"".to_owned(),
)?;
}
None => {
errors += 1;
error!("Pointer {:?} does not seem to exist.", pointer);
}
}
}
let pbar = if make_progress_bar {
ProgressBar::with_draw_target(1000 as u64, ProgressDrawTarget::stdout_with_hz(10))
} else {
ProgressBar::hidden()
};
pbar.set_style(
ProgressStyle::default_bar()
.template("[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}"),
);
pbar.set_message("checking");
let mut done = 0;
while let Some(next_to_check) = to_check.pop() {
done += 1;
pbar.set_length(done + to_check.len() as u64);
pbar.set_position(done);
if next_to_check.depth > 0 {
let mut reduced_height = next_to_check.clone();
reduced_height.depth -= 1;
let mut chunk_id_buf: ChunkId = Default::default();
let mut unchunker = RecursiveUnchunker::new(&pile, reduced_height);
loop {
let read_bytes = unchunker.read(&mut chunk_id_buf)?;
if read_bytes == 0 {
// end of chunks, because return of zero here means EOF
break;
}
if read_bytes < chunk_id_buf.len() {
// a short read: the rest of this ChunkId must follow, so EOF here is an error
unchunker.read_exact(&mut chunk_id_buf[read_bytes..])?;
}
if check_existence && !pile.chunk_exists(&chunk_id_buf)? {
errors += 1;
warn!("Chunk missing: {:?}", &chunk_id_buf);
}
additional_seen.insert(chunk_id_buf.clone());
}
} else {
// already shallowest, just add the reference to the seen list.
additional_seen.insert(next_to_check.chunk_id);
}
}
pbar.finish_and_clear();
if errors > 0 {
error!("There were {:?}", errors);
} else {
info!("No errors.");
}
if errors == 0 && vacuum != VacuumMode::NoVacuum {
info!("Calculating sweep set for vacuuming.");
let mut to_vacuum = pile.raw_pile.calculate_vacuum_for_sweeping()?;
// don't forget to include the leaves that we didn't actually visit!
for element in additional_seen {
to_vacuum.remove(&element);
}
info!("{} chunks are ready to be vacuumed.", to_vacuum.len());
if vacuum == VacuumMode::Vacuum {
let pbar = if make_progress_bar {
ProgressBar::with_draw_target(
to_vacuum.len() as u64,
ProgressDrawTarget::stdout_with_hz(10),
)
} else {
ProgressBar::hidden()
};
pbar.set_style(
ProgressStyle::default_bar().template(
"[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}",
),
);
pbar.set_message("vacuuming");
// actually do the vacuum!
info!("Going to vacuum them up.");
for vacuum_ids_chunk in to_vacuum
.into_iter()
.chunks(512)
.into_iter()
.map(|c| c.collect::<Vec<ChunkId>>())
{
pile.raw_pile.delete_many(
Keyspace::Chunk,
vacuum_ids_chunk
.iter()
.map(|ci| ci.as_slice())
.collect::<Vec<&[u8]>>()
.as_slice(),
)?;
pbar.inc(vacuum_ids_chunk.len().try_into().unwrap());
}
pile.flush()?;
pbar.finish_and_clear();
}
}
Ok(errors)
}

View File

@ -1,64 +0,0 @@
use crate::pile::local_sqlitebloblogs::{CompactionThresholds, SqliteBloblogPile};
use crate::pile::{PileDescriptor, PileStorage};
use anyhow::{bail, Context};
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use log::info;
use std::path::Path;
pub fn compact(
pile_path: &Path,
pile_desc: &PileDescriptor,
actually_run: bool,
make_progress_bar: bool,
thresholds: CompactionThresholds,
) -> anyhow::Result<()> {
let pbar = if make_progress_bar {
ProgressBar::with_draw_target(1000 as u64, ProgressDrawTarget::stdout_with_hz(10))
} else {
ProgressBar::hidden()
};
pbar.set_style(
ProgressStyle::default_bar()
.template("[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}"),
);
pbar.set_message("compacting");
match pile_desc.storage {
PileStorage::SqliteIndexedBloblog => {
let bloblog_pile = SqliteBloblogPile::open(&pile_path)
.context("Failed to open SQLite-indexed Bloblog Pile")?;
compact_bloblogs(bloblog_pile, pbar, actually_run, thresholds)?;
Ok(())
}
other @ PileStorage::RemoteOnly => {
bail!("Cannot use compaction on this kind of pile: {other:?}!");
}
}
}
fn compact_bloblogs(
bloblog_pile: SqliteBloblogPile,
pbar: ProgressBar,
actually_run: bool,
thresholds: CompactionThresholds,
) -> anyhow::Result<()> {
info!("=== Analysing for compaction ===");
let analysis = bloblog_pile.analyse_for_compaction()?;
let chunks_total: u64 = analysis.values().map(|bs| bs.chunks_total).sum();
let chunks_deleted: u64 = analysis.values().map(|bs| bs.chunks_deleted).sum();
let bytes_total: u64 = analysis.values().map(|bs| bs.bytes_total).sum();
let bytes_deleted: u64 = analysis.values().map(|bs| bs.bytes_deleted).sum();
info!("{} bloblogs in this pile, with {chunks_total} chunks ({bytes_total} B) of which {chunks_deleted} ({bytes_deleted} B) are deleted.", analysis.len());
info!("=== Planning compaction ===");
let plan = bloblog_pile.plan_compaction(&thresholds, analysis)?;
info!("Planned compaction: replace {} bloblogs (of which {} are small), freeing up {} B and rewriting {} B", plan.bloblogs_to_replace.len(), plan.small_bloblogs, plan.reclaimable_space, plan.bytes_to_write);
if actually_run {
info!("=== Compacting ===");
bloblog_pile.perform_compaction(Box::new(pbar), plan)?;
}
Ok(())
}
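// Usage sketch (illustrative): a dry run reports the plan, then a real pass
// rewrites bloblogs. Assumes `CompactionThresholds` values built elsewhere.
pub fn compact_example(
    pile_path: &Path,
    pile_desc: &PileDescriptor,
    dry_thresholds: CompactionThresholds,
    real_thresholds: CompactionThresholds,
) -> anyhow::Result<()> {
    compact(pile_path, pile_desc, false, true, dry_thresholds)?; // analyse + plan only
    compact(pile_path, pile_desc, true, true, real_thresholds)?; // actually rewrite
    Ok(())
}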

View File

@ -1,370 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::convert::TryInto;
use std::fs::OpenOptions;
use std::os::unix::fs::PermissionsExt;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicU32, Ordering};
use std::{fs, io};
use anyhow::{anyhow, Context};
use crossbeam_channel::{Receiver, Sender};
use crossbeam_utils::thread;
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use log::error;
use nix::sys::time::{TimeVal, TimeValLike};
use nix::unistd::{Gid, Uid};
use crate::chunking::RecursiveUnchunker;
use crate::commands::fully_load_pointer;
use crate::definitions::{FilesystemOwnership, RecursiveChunkRef, TreeNode};
use crate::pile::{Pile, RawPile};
use std::collections::{BTreeMap, HashMap};
/// Given a fully-integrated root node, extracts the files from the pile.
pub fn extract<RP: RawPile>(
target_path: &Path,
root: &mut TreeNode,
pile: &Pile<RP>,
make_progress_bar: bool,
num_workers: u8,
apply_permissions: bool,
apply_mtime: bool,
apply_ownership: bool,
) -> anyhow::Result<()> {
let pbar = if make_progress_bar {
ProgressBar::with_draw_target(
root.count_normal_files() as u64,
ProgressDrawTarget::stdout_with_hz(10),
)
} else {
ProgressBar::hidden()
};
pbar.set_style(
ProgressStyle::default_bar()
.template("[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}"),
);
pbar.set_message("extracting");
let (paths_send, paths_recv) = crossbeam_channel::unbounded();
let (results_send, results_recv) = crossbeam_channel::bounded(16);
let failures = AtomicU32::new(0);
thread::scope(|s| {
for worker in 0..num_workers {
let paths_recv = paths_recv.clone();
let results_send = results_send.clone();
let failures = &failures; // needed because of move
s.builder()
.name(format!("yama unchunker {}", worker))
.spawn(move |_| {
if let Err(e) = extract_worker(pile, paths_recv, results_send) {
error!("Extraction worker {} failed: {:?}!", worker, e);
failures.fetch_add(1, Ordering::Relaxed);
}
})
.expect("Failed to start thread");
}
// Needed to allow the manager to join once the workers finish and drop their senders.
drop(results_send);
drop(paths_recv);
s.spawn(|_| {
if let Err(e) = manager(root, target_path, paths_send, results_recv, &pbar) {
error!("Extraction manager failed: {:?}!", e);
failures.fetch_add(1, Ordering::Relaxed);
}
});
})
.expect("join issue");
pbar.set_message("applying metadata");
apply_metadata(
root,
target_path,
apply_permissions,
apply_mtime,
apply_ownership,
)?;
Ok(())
}
/// Given the name of a pointer, extracts it.
pub fn extract_from_pointer_name<RP: RawPile>(
target_path: &Path,
pointer_name: &str,
pile: &Pile<RP>,
make_progress_bar: bool,
num_workers: u8,
apply_permissions: bool,
apply_mtime: bool,
apply_ownership: bool,
) -> anyhow::Result<()> {
let (pointer_data, mut root_node) = fully_load_pointer(pile, pointer_name.as_ref())?;
let uid_translation_table = build_uid_translation_table(&pointer_data.uid_lookup);
let gid_translation_table = build_gid_translation_table(&pointer_data.gid_lookup);
// convert the UIDs and GIDs to match this system, which may be different from the usual.
apply_uid_and_gid_translation_tables(
&mut root_node.node,
&uid_translation_table,
&gid_translation_table,
);
extract(
&target_path.join(&root_node.name),
&mut root_node.node,
pile,
make_progress_bar,
num_workers,
apply_permissions,
apply_mtime,
apply_ownership,
)
}
pub fn build_uid_translation_table(
uid_lookup: &BTreeMap<u16, Option<String>>,
) -> HashMap<u16, u16> {
let mut result: HashMap<u16, u16> = Default::default();
for (old_uid, name) in uid_lookup.iter() {
if let Some(name) = name {
if let Some(user) = users::get_user_by_name(name) {
let new_uid = user.uid() as u16;
if new_uid != *old_uid {
result.insert(*old_uid, new_uid);
}
}
}
}
result
}
pub fn build_gid_translation_table(
gid_lookup: &BTreeMap<u16, Option<String>>,
) -> HashMap<u16, u16> {
let mut result: HashMap<u16, u16> = Default::default();
for (old_gid, name) in gid_lookup.iter() {
if let Some(name) = name {
if let Some(group) = users::get_group_by_name(name) {
let new_gid = group.gid() as u16;
if new_gid != *old_gid {
result.insert(*old_gid, new_gid);
}
}
}
}
result
}
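// A worked example of the tables above (hypothetical names and IDs): if the archive
// recorded uid 1000 as "olivier" and this machine's passwd database maps "olivier"
// to uid 1500, build_uid_translation_table yields {1000 -> 1500}. IDs that already
// match are omitted, so restoring on the original machine produces an empty table
// and the translation below becomes a no-op.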
pub fn apply_uid_and_gid_translation_tables(
node: &mut TreeNode,
uid_translation: &HashMap<u16, u16>,
gid_translation: &HashMap<u16, u16>,
) {
if uid_translation.is_empty() && gid_translation.is_empty() {
// nothing to do here :).
return;
}
let apply_to = |ownership: &mut FilesystemOwnership| {
ownership.uid = *uid_translation
.get(&ownership.uid)
.unwrap_or(&ownership.uid);
ownership.gid = *gid_translation
.get(&ownership.gid)
.unwrap_or(&ownership.gid);
};
node.visit_mut(
&mut |node, _| {
match node {
TreeNode::NormalFile { ownership, .. } => {
apply_to(ownership);
}
TreeNode::Directory { ownership, .. } => {
apply_to(ownership);
}
TreeNode::SymbolicLink { ownership, .. } => {
apply_to(ownership);
}
TreeNode::Deleted => {}
}
Ok(())
},
"".to_owned(),
)
.expect("Can't fail since we don't fail.");
}
/// A worker thread for extracting
pub fn extract_worker<RP: RawPile>(
pile: &Pile<RP>,
paths: Receiver<(PathBuf, RecursiveChunkRef)>,
results: Sender<()>,
) -> anyhow::Result<()> {
while let Ok((path, chunk_ref)) = paths.recv() {
let mut extractor = RecursiveUnchunker::new(pile, chunk_ref);
let mut file = OpenOptions::new()
.write(true)
.create_new(true)
.open(&path)
.with_context(|| format!("Failed to open {:?}", path))?;
io::copy(&mut extractor, &mut file)?;
results
.send(())
.or_else(|_| Err(anyhow!("Failed to send result")))?;
}
Ok(())
}
/// A single thread that manages the workers
pub fn manager(
root: &mut TreeNode,
target_path: &Path,
paths_sender: Sender<(PathBuf, RecursiveChunkRef)>,
results_receiver: Receiver<()>,
progress_bar: &ProgressBar,
) -> anyhow::Result<()> {
root.visit(
&mut |tree_node, name| {
let final_path = if name.is_empty() {
target_path.to_path_buf()
} else {
target_path.join(name)
};
match tree_node {
TreeNode::NormalFile { content, .. } => {
paths_sender
.send((final_path, content.clone()))
.expect("Unable to send to should-be unbounded channel");
}
TreeNode::Directory { .. } => {
fs::create_dir(&final_path)?;
}
TreeNode::SymbolicLink { target, .. } => {
// TODO may want to perform rewrites ...?
std::os::unix::fs::symlink(target, &final_path)?;
}
TreeNode::Deleted => {
panic!("should not be extracting 'Deleted!' --- BUG.");
}
};
Ok(())
},
"".to_string(),
)?;
// Needed to allow the workers to finish; otherwise we never join.
drop(paths_sender);
while let Ok(()) = results_receiver.recv() {
progress_bar.inc(1);
}
Ok(())
}
/// Applies metadata (permissions, mtime, ownership) to files from a tree node.
pub fn apply_metadata(
root: &TreeNode,
target: &Path,
apply_permissions: bool,
apply_mtime: bool,
apply_owner: bool,
) -> anyhow::Result<()> {
match root {
TreeNode::NormalFile {
mtime,
ownership,
permissions,
..
} => {
if apply_permissions {
let mut perms = fs::metadata(&target)?.permissions();
perms.set_mode(permissions.mode);
fs::set_permissions(&target, perms)?;
}
if apply_owner {
nix::unistd::chown(
target,
Some(Uid::from_raw(ownership.uid as u32)),
Some(Gid::from_raw(ownership.gid as u32)),
)?;
}
if apply_mtime {
if let Ok(mtime) = (*mtime).try_into() {
let tv = TimeVal::milliseconds(mtime);
nix::sys::stat::lutimes(target, &tv, &tv)?;
}
}
}
TreeNode::Directory {
ownership,
permissions,
children,
} => {
if apply_permissions {
let mut perms = fs::metadata(&target)?.permissions();
perms.set_mode(permissions.mode);
fs::set_permissions(&target, perms)?;
}
if apply_owner {
nix::unistd::chown(
target,
Some(Uid::from_raw(ownership.uid as u32)),
Some(Gid::from_raw(ownership.gid as u32)),
)?;
}
for (name, child) in children.iter() {
let child_path = target.join(name);
apply_metadata(
child,
&child_path,
apply_permissions,
apply_mtime,
apply_owner,
)?;
}
}
TreeNode::SymbolicLink { ownership, .. } => {
if apply_owner {
nix::unistd::chown(
target,
Some(Uid::from_raw(ownership.uid as u32)),
Some(Gid::from_raw(ownership.gid as u32)),
)?;
}
}
TreeNode::Deleted => {
panic!("Deleted is not meant to be reachable here.");
}
}
Ok(())
}
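A minimal sketch of driving the extraction above from a pointer name (the target path, pointer name, and worker count are illustrative):

fn restore_pointer(pile: &Pile<impl RawPile>) -> anyhow::Result<()> {
    extract_from_pointer_name(
        Path::new("/tmp/restore"),
        "host-2023-08-13",
        pile,
        true,  // show a progress bar
        4,     // unchunker worker threads
        true,  // apply permissions
        true,  // apply mtime
        false, // only apply ownership when restoring as root
    )
}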


@ -1,333 +0,0 @@
use crate::chunking::RecursiveUnchunker;
use crate::commands::fully_load_pointer;
use crate::definitions::{ChunkId, RecursiveChunkRef, TreeNode};
use crate::operations::checking::VacuumRawPile;
use crate::operations::legacy_pushpull::PushWorkerToManagerMessage::{NewTask, TaskDone};
use crate::pile::compression::{CompressionSettings, RawPileCompressor};
use crate::pile::integrity::RawPileIntegrityChecker;
use crate::pile::local_sqlitebloblogs::SqliteBloblogPile;
use crate::pile::{Keyspace, Pile, PileDescriptor, PileStorage, RawPile};
use crate::utils::get_number_of_workers;
use anyhow::{bail, Context};
use crossbeam_channel::{Receiver, Sender};
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use log::error;
use std::collections::HashSet;
use std::fs::File;
use std::io::Read;
use std::path::Path;
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::Arc;
/// Pushes chunks (and pointers) from one pile to another.
/// This is a thorough implementation that could be slow but at least should give good confidence.
/// (Presumably we could do better by looking at the pointers that already exist on the destination
/// and only integrating as much as we need to.)
pub fn push_to(
from_pile: Arc<Pile<Arc<Box<dyn RawPile>>>>,
from_rp_bypass: Arc<Box<dyn RawPile>>,
to_pile: Arc<Pile<Arc<Box<dyn RawPile>>>>,
to_rp_bypass: Arc<Box<dyn RawPile>>,
pointers: Vec<String>,
make_progress_bar: bool,
num_workers: u32,
) -> anyhow::Result<()> {
let pbar = if make_progress_bar {
ProgressBar::with_draw_target(
1, // TODO
ProgressDrawTarget::stdout_with_hz(10),
)
} else {
ProgressBar::hidden()
};
pbar.set_style(
ProgressStyle::default_bar()
.template("[{elapsed_precise}]/[{eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}"),
);
pbar.set_message("push/pull");
let (jobs_tx, jobs_rx) = crossbeam_channel::unbounded();
// This channel must be unbounded: all initial tasks are seeded into it below,
// before the manager starts draining it, so a bounded channel could deadlock.
let (stat_tx, stat_rx) = crossbeam_channel::unbounded();
let mut to_process = Vec::new();
for pointer in pointers {
let (pointer_data, root_node) = fully_load_pointer(&from_pile, &pointer)?;
// schedule storing the pointer chunks
to_process.push(pointer_data.chunk_ref.clone());
if to_pile.read_pointer(&pointer)?.is_some() {
unimplemented!("pointer in target exists.");
}
// copy across the pointer data
to_pile.write_pointer(&pointer, &pointer_data)?;
root_node
.node
.visit(
&mut |node, _path| {
match node {
TreeNode::NormalFile { content, .. } => {
to_process.push(content.clone());
}
_ => {} // nop
}
Ok(())
},
String::new(),
)
.expect("No fail");
}
// start the work
let critical_failures = Arc::new(AtomicU32::new(0));
for worker_num in 0..num_workers {
let jobs_rx = jobs_rx.clone();
let stat_tx = stat_tx.clone();
let critical_failures = critical_failures.clone();
let from_pile = from_pile.clone();
let from_rp_bypass = from_rp_bypass.clone();
let to_pile = to_pile.clone();
let to_rp_bypass = to_rp_bypass.clone();
std::thread::Builder::new()
.name(format!("yama pusher {}", worker_num))
.spawn(move || {
if let Err(e) = pusher_worker(
from_pile,
from_rp_bypass,
to_pile,
to_rp_bypass,
jobs_rx,
stat_tx,
) {
error!("[critical!] Push worker {} FAILED: {:?}", worker_num, e);
critical_failures.fetch_add(1, Ordering::Relaxed);
}
})
.expect("Failed to start thread");
}
for task in to_process {
stat_tx
.send(NewTask(task))
.expect("unbounded so should be able to send");
}
// must drop here for ending to happen
drop(jobs_rx);
drop(stat_tx);
pbar.set_length(0);
if let Err(e) = pusher_manager(&pbar, stat_rx, jobs_tx) {
error!("[critical!] Push manager FAILED: {:?}", e);
critical_failures.fetch_add(1, Ordering::Relaxed);
}
let critical_failures = critical_failures.load(Ordering::SeqCst);
if critical_failures > 0 {
bail!("There were {} critical failures.", critical_failures);
}
Ok(())
}
enum PushWorkerToManagerMessage {
NewTask(RecursiveChunkRef),
TaskDone,
}
fn pusher_manager(
pbar: &ProgressBar,
update_receiver: Receiver<PushWorkerToManagerMessage>,
job_queue: Sender<RecursiveChunkRef>,
) -> anyhow::Result<()> {
let mut outstanding = 0;
let mut already_done = HashSet::new();
while let Ok(status) = update_receiver.recv() {
match status {
PushWorkerToManagerMessage::NewTask(task) => {
if already_done.insert(task.clone()) {
job_queue.send(task)?;
pbar.inc_length(1);
outstanding += 1;
}
}
PushWorkerToManagerMessage::TaskDone => {
pbar.inc(1);
outstanding -= 1;
if outstanding == 0 {
break;
}
}
}
}
Ok(())
}
fn pusher_worker(
from_pile: Arc<Pile<Arc<Box<dyn RawPile>>>>,
from_rp_bypass: Arc<Box<dyn RawPile>>,
to_pile: Arc<Pile<Arc<Box<dyn RawPile>>>>,
to_rp_bypass: Arc<Box<dyn RawPile>>,
jobs_rx: Receiver<RecursiveChunkRef>,
stat_tx: Sender<PushWorkerToManagerMessage>,
) -> anyhow::Result<()> {
while let Ok(job) = jobs_rx.recv() {
if !to_pile.chunk_exists(&job.chunk_id)? {
if let Some(bypass_chunk_data) = from_rp_bypass.read(Keyspace::Chunk, &job.chunk_id)? {
to_rp_bypass.write(Keyspace::Chunk, &job.chunk_id, &bypass_chunk_data)?;
} else {
bail!("Chunk cannot be copied because doesn't exist (in bypass pile).");
}
}
if job.depth > 0 {
// we want to (partially) unchunk this and submit all subchunks.
let vacuum_rp = VacuumRawPile::new(from_pile.raw_pile.clone(), true);
let vacuum_pile = Pile::new(vacuum_rp);
// First read the bottom-level chunk IDs
let mut reduced_height = job.clone();
reduced_height.depth -= 1;
let mut chunk_id_buf: ChunkId = Default::default();
let mut unchunker = RecursiveUnchunker::new(&vacuum_pile, reduced_height);
loop {
let read_bytes = unchunker.read(&mut chunk_id_buf)?;
if read_bytes == 0 {
// end of chunks, because return of zero here means EOF
break;
}
if read_bytes < chunk_id_buf.len() {
// any error, including EOF at this point, is an error
unchunker.read_exact(&mut chunk_id_buf[read_bytes..])?;
}
stat_tx
.send(NewTask(RecursiveChunkRef {
chunk_id: chunk_id_buf.clone(),
depth: 0,
}))
.expect("Should be able to send");
}
// Then track the chunks that we read whilst doing the above
for needed_chunk_id in vacuum_pile
.raw_pile
.retrieved_chunks
.lock()
.expect("Should be able to lock")
.iter()
{
if needed_chunk_id != &job.chunk_id {
// only track them if they're not the same as the one on this job.
stat_tx
.send(NewTask(RecursiveChunkRef {
chunk_id: needed_chunk_id.clone(),
depth: 0,
}))
.expect("Should be able to send");
}
}
}
stat_tx.send(TaskDone)?;
}
Ok(())
}
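// Depth semantics relied on above: a RecursiveChunkRef at depth 0 points directly at
// data, while at depth N the referenced chunk contains ChunkIds one level down.
// Reading the same chunk with depth - 1 therefore yields the bottom-level chunk IDs
// as "data", and the VacuumRawPile records every intermediate chunk touched on the
// way; together these cover all chunks this job transitively needs.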
#[derive(Copy, Clone, Debug)]
pub enum BypassLevel {
NoBypass,
CompressionBypass,
}
pub fn determine_bypass_level(
desc1: &PileDescriptor,
dir1: &Path,
desc2: &PileDescriptor,
dir2: &Path,
) -> anyhow::Result<BypassLevel> {
if desc1.compression.is_some() && desc2.compression.is_some() {
let mut dictionary1 = Vec::new();
let dict_path1 = dir1.join("important_zstd.dict");
File::open(dict_path1)
.context("You need important_zstd.dict in your pile folder.")?
.read_to_end(&mut dictionary1)?;
let mut dictionary2 = Vec::new();
let dict_path2 = dir2.join("important_zstd.dict");
File::open(dict_path2)
.context("You need important_zstd.dict in your pile folder.")?
.read_to_end(&mut dictionary2)?;
if dictionary1 == dictionary2 {
// we can only bypass if both dictionaries are the same
Ok(BypassLevel::CompressionBypass)
} else {
Ok(BypassLevel::NoBypass)
}
} else {
Ok(BypassLevel::NoBypass)
}
}
/// Opens a pile with potential for returning a 'complete' pile as well as a lower-level 'bypass'
/// pile, which, for example, skips performing compression operations.
///
/// Return tuple: (actual pile, bypass raw pile)
pub fn open_pile_with_work_bypass(
dir: &Path,
desc: &PileDescriptor,
bypass_level: BypassLevel,
) -> anyhow::Result<(Pile<Arc<Box<dyn RawPile>>>, Arc<Box<dyn RawPile>>)> {
let num_compressors = get_number_of_workers("YAMA_COMPRESSORS");
let num_decompressors = get_number_of_workers("YAMA_DECOMPRESSORS");
match desc.storage {
PileStorage::RemoteOnly => {
bail!("This is a remote-only pile. No local storage allowed.");
}
PileStorage::SqliteIndexedBloblog => {
let blob_raw_pile = RawPileIntegrityChecker::new(SqliteBloblogPile::open(dir)?);
match bypass_level {
BypassLevel::NoBypass => {
unimplemented!()
}
BypassLevel::CompressionBypass => {
let common_raw_pile: Arc<Box<dyn RawPile>> = Arc::new(Box::new(blob_raw_pile));
let raw_pile: Arc<Box<dyn RawPile>> = match desc.compression {
None => common_raw_pile.clone(),
Some(comp_level) => {
let mut dictionary = Vec::new();
let dict_path = dir.join("important_zstd.dict");
File::open(dict_path)
.context("You need important_zstd.dict in your pile folder.")?
.read_to_end(&mut dictionary)?;
let (compressed_pile, _handles) = RawPileCompressor::new(
common_raw_pile.clone(),
CompressionSettings {
dictionary: Arc::new(dictionary),
level: comp_level as i32,
num_compressors: num_compressors as u32,
num_decompressors: num_decompressors as u32,
},
)?;
Arc::new(Box::new(compressed_pile))
}
};
Ok((Pile::new(raw_pile), common_raw_pile))
}
}
}
}
}
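A sketch wiring the helpers above together to push one pointer between two piles (the directories, descriptors, and pointer name are illustrative):

let bypass = determine_bypass_level(&src_desc, &src_dir, &dst_desc, &dst_dir)?;
let (src_pile, src_bypass) = open_pile_with_work_bypass(&src_dir, &src_desc, bypass)?;
let (dst_pile, dst_bypass) = open_pile_with_work_bypass(&dst_dir, &dst_desc, bypass)?;
push_to(
    Arc::new(src_pile),
    src_bypass,
    Arc::new(dst_pile),
    dst_bypass,
    vec!["host-2023-08-13".to_owned()],
    true, // progress bar
    4,    // pusher worker threads
)?;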


@ -1,342 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::fs::File;
use std::io;
use std::io::ErrorKind;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicU32, Ordering};
use anyhow::{anyhow, bail, Context};
use crossbeam_channel::{Receiver, Sender};
use crossbeam_utils::thread;
use log::{error, warn};
use crate::chunking::{ChunkSubmissionTarget, RecursiveChunker, SENSIBLE_THRESHOLD};
use crate::commands;
use crate::commands::{fully_integrate_pointer_node, retrieve_tree_node};
use crate::definitions::{
PartialPointerData, PointerData, RecursiveChunkRef, RootTreeNode, TreeNode,
};
use crate::pile::{existence_checker_stage, Pile, RawPile, StoragePipelineSettings};
use crate::progress::ProgressTracker;
use crate::tree::{create_uidgid_lookup_tables, differentiate_node_in_place};
use crate::utils::get_number_of_workers;
use std::collections::BTreeMap;
use std::sync::Arc;
pub fn store<CST: ChunkSubmissionTarget, PT: ProgressTracker>(
root_path: &Path,
root: &mut TreeNode,
target: &CST,
progress_bar: &mut PT,
num_workers: u8,
) -> anyhow::Result<()> {
let (paths_send, paths_recv) = crossbeam_channel::unbounded();
let (results_send, results_recv) = crossbeam_channel::bounded(16);
progress_bar.set_max_size(root.count_normal_files() as u64);
let critical_failures = AtomicU32::new(0);
thread::scope(|s| {
for worker_num in 0..num_workers {
let paths_recv = paths_recv.clone();
let results_send = results_send.clone();
let critical_failures = &critical_failures; // needed because of move
s.builder()
.name(format!("yama chunker {}", worker_num))
.spawn(move |_| {
if let Err(e) = store_worker(root_path, target, paths_recv, results_send) {
error!("[critical!] Storage worker {} FAILED: {:?}", worker_num, e);
critical_failures.fetch_add(1, Ordering::Relaxed);
}
})
.expect("Failed to start thread");
}
drop(results_send);
drop(paths_recv);
if let Err(e) = manager(root, paths_send, results_recv, progress_bar) {
error!("[critical!] Storage manager FAILED: {:?}", e);
critical_failures.fetch_add(1, Ordering::Relaxed);
}
})
.expect("thread scope failed");
let critical_failures = critical_failures.load(Ordering::SeqCst);
if critical_failures > 0 {
bail!("There were {} critical failures.", critical_failures);
} else {
Ok(())
}
}
pub fn store_worker<CST: ChunkSubmissionTarget>(
root: &Path,
target: &CST,
paths: Receiver<String>,
results: Sender<(String, Option<RecursiveChunkRef>)>,
) -> anyhow::Result<()> {
while let Ok(path) = paths.recv() {
let full_path = root.join(&path);
match File::open(&full_path) {
Ok(mut file) => {
let mut chunker = RecursiveChunker::new(SENSIBLE_THRESHOLD, target);
// streaming copy from file to chunker, really cool :)
io::copy(&mut file, &mut chunker)?;
let chunk_ref = chunker.finish()?;
results
.send((path, Some(chunk_ref)))
.or(Err(anyhow!("Failed to send result.")))?;
}
Err(err) => match err.kind() {
ErrorKind::NotFound => {
warn!("File vanished: {:?}. Will ignore.", full_path);
// send None so the manager knows to remove this from the tree.
results
.send((path, None))
.or(Err(anyhow!("Failed to send result.")))?;
}
ErrorKind::PermissionDenied => {
// TODO think about if we want a 'skip failed permissions' mode ...
error!(
"Permission denied to read {:?}; do you need to change user?",
full_path
);
Err(err)?;
}
_ => {
Err(err)?;
}
},
};
}
Ok(())
}
fn delete_node(root: &mut TreeNode, child_path: &str) -> anyhow::Result<()> {
let path_pieces: Vec<&str> = child_path.split('/').collect();
let mut this = root;
for &piece in &path_pieces[0..path_pieces.len() - 1] {
if let TreeNode::Directory { children, .. } = this {
match children.get_mut(piece) {
None => bail!(
"Tried to delete {} but {} does not exist.",
child_path,
piece
),
Some(next) => this = next,
}
} else {
bail!(
"Tried to delete {} from tree node but '{}' not a directory.",
child_path,
piece
);
}
}
if let TreeNode::Directory { children, .. } = this {
children.remove(*path_pieces.last().unwrap());
} else {
bail!(
"Tried to delete {} from tree node but parent not a directory.",
child_path
);
}
Ok(())
}
fn update_node(
root: &mut TreeNode,
child_path: &str,
new_ref: RecursiveChunkRef,
) -> anyhow::Result<()> {
let mut this = root;
for piece in child_path.split('/') {
if let TreeNode::Directory { children, .. } = this {
this = children
.get_mut(piece)
.ok_or_else(|| anyhow!("Tried to update {} but {} not found", child_path, piece))?;
} else {
bail!(
"Tried to update {} but {} not a directory.",
child_path,
piece
);
}
}
if let TreeNode::NormalFile { content, .. } = this {
*content = new_ref;
} else {
bail!("Tried to update {} but it's not a NormalFile.", child_path);
}
Ok(())
}
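// Example (hypothetical path): update_node(&mut root, "photos/cat.jpg", new_ref)
// walks the Directory children through "photos" and swaps the content ref of the
// NormalFile "cat.jpg"; delete_node with the same path removes that child instead.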
pub fn manager<PT: ProgressTracker>(
root: &mut TreeNode,
paths_sender: Sender<String>,
results_receiver: Receiver<(String, Option<RecursiveChunkRef>)>,
progress_bar: &mut PT,
) -> anyhow::Result<()> {
root.visit(
&mut |tree_node, name| {
if let TreeNode::NormalFile { .. } = tree_node {
paths_sender
.send(name.to_string())
.or_else(|_| Err(anyhow!("Unable to send to should-be unbounded channel")))?;
}
Ok(())
},
"".to_string(),
)?;
drop(paths_sender);
while let Ok((path, opt_chunk_ref)) = results_receiver.recv() {
progress_bar.inc_progress(1);
match opt_chunk_ref {
None => {
delete_node(root, &path)?;
}
Some(new_chunk_ref) => {
update_node(root, &path, new_chunk_ref)?;
}
}
}
Ok(())
}
/// Stores files into the pile, potentially differentiating using a parent pointer (which will be
/// loaded and fully-integrated).
/// This also creates a pointer (which is why this is called `store_fully`).
pub fn store_fully<PT: ProgressTracker>(
pile: Arc<Pile<Box<dyn RawPile>>>,
root_dir: &PathBuf,
new_pointer_name: &String,
mut root_node: TreeNode,
parent: Option<String>,
num_workers: u8,
progress_bar: &mut PT,
) -> anyhow::Result<()> {
pointer_ops_prepare_to_store(&pile, &mut root_node, &parent)?;
let pointer_data =
store_without_pointer_ops(&pile, &root_dir, root_node, num_workers, progress_bar)?
.complete(parent);
pointers_ops_after_store(&pile, &new_pointer_name, &pointer_data)?;
Ok(())
}
pub fn pointers_ops_after_store(
pile: &Pile<impl RawPile>,
new_pointer_name: &str,
pointer_data: &PointerData,
) -> anyhow::Result<()> {
pile.write_pointer(&new_pointer_name, &pointer_data)?;
pile.flush()?;
Ok(())
}
pub fn pointer_ops_prepare_to_store(
pile: &Pile<impl RawPile>,
mut root_node: &mut TreeNode,
parent: &Option<String>,
) -> anyhow::Result<()> {
if let Some(parent) = parent.as_ref() {
let mut parent_pointer = pile.read_pointer(parent)?.ok_or_else(|| {
anyhow!(
"Selected parent pointer {:?} didn't exist when tried to retrieve it.",
parent
)
})?;
let mut parent_node = retrieve_tree_node(&pile, parent_pointer.chunk_ref.clone())?;
fully_integrate_pointer_node(&pile, &mut parent_node.node, &mut parent_pointer)?;
differentiate_node_in_place(&mut root_node, &parent_node.node)?;
}
Ok(())
}
pub fn store_without_pointer_ops<PT: ProgressTracker>(
pile: &Arc<Pile<Box<dyn RawPile>>>,
root_dir: &PathBuf,
mut root_node: TreeNode,
num_workers: u8,
progress_bar: &mut PT,
) -> anyhow::Result<PartialPointerData> {
// TODO make these configurable
let sps = StoragePipelineSettings {
num_compressors: get_number_of_workers("YAMA_PL_COMPRESSORS") as u32,
compressor_input_bound: 64,
writer_input_bound: 64,
};
let (control_tx, control_rx) = crossbeam_channel::unbounded();
let pile2 = pile.clone();
let pipeline = pile.raw_pile.build_storage_pipeline(sps, control_tx)?;
// TODO(newver) The existence checker stage should be able to be swapped between different implementations.
let pipeline = existence_checker_stage(pile2, pipeline);
store(
&root_dir,
&mut root_node,
&pipeline,
progress_bar,
num_workers,
)?;
// must drop the pipeline to allow the threads to close
drop(pipeline);
while let Ok(_) = control_rx.recv() {
// TODO nothing for now.
}
let mut uid_lookup = BTreeMap::new();
let mut gid_lookup = BTreeMap::new();
create_uidgid_lookup_tables(&root_node, &mut uid_lookup, &mut gid_lookup)
.context("Failed to build UID and GID lookup tables :(.")?;
let chunk_ref = commands::store_tree_node(
&pile,
&RootTreeNode {
name: root_dir
.file_name()
.and_then(|s| s.to_str())
.unwrap_or("_root")
.to_owned(),
node: root_node,
},
)?;
let pointer_data = PartialPointerData {
chunk_ref,
uid_lookup,
gid_lookup,
};
Ok(pointer_data)
}
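A minimal sketch of calling store_fully above (the paths, pointer names, and worker count are illustrative; producing the root TreeNode from a directory scan is out of scope here):

store_fully(
    pile.clone(),                         // Arc<Pile<Box<dyn RawPile>>>
    &PathBuf::from("/data/photos"),
    &"photos-2023-08-13".to_string(),
    root_node,                            // TreeNode from a directory scan
    Some("photos-2023-08-12".to_owned()), // differentiate against the previous pointer
    4,                                    // chunker worker threads
    &mut progress_bar,
)?;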


@ -1,394 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::path::PathBuf;
use serde::{Deserialize, Serialize};
use crate::definitions::{ChunkId, PointerData};
use crate::utils::get_number_of_workers;
use crossbeam_channel::Sender;
use std::collections::HashSet;
use std::fmt::Debug;
use std::sync::{Arc, Condvar, Mutex};
pub mod access_guard;
pub mod compression;
pub mod encryption;
pub mod integrity;
pub mod local_sqlitebloblogs;
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct PileDescriptor {
/// The last version of yama that was used with this pile.
pub yama_version: String,
/// The storage backend to use.
pub storage: PileStorage,
/// If specified, the compression level of the pile.
pub compression: Option<u16>,
}
#[derive(Serialize, Deserialize, Debug, Copy, Clone)]
pub enum PileStorage {
/// No local storage. Pile is only usable for remotes.
RemoteOnly,
/// Local storage backed by bloblogs that are indexed by a SQLite database.
SqliteIndexedBloblog,
// Local temporary storage in which chunks are only kept for long enough to send them to
// remotes. Unimplemented at present.
// TODO THIS IS NOT THE CORRECT NAME ANYWAY
// BarePushSled,
}
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct RemoteDescriptor {
pub encrypted: bool,
pub host: Option<String>,
pub user: Option<String>,
pub path: PathBuf,
}
#[derive(PartialOrd, PartialEq, Copy, Clone, Serialize, Deserialize, Eq)]
pub enum Keyspace {
Chunk,
ChunkHash,
Pointer,
}
/// Useful information for humans. Doesn't need to be spot on, but kind of interesting.
#[derive(Debug, Clone)]
pub struct DebugStatistics {
pub number_of_chunks: u64,
pub minimum_chunk_size: Option<u32>,
pub maximum_chunk_size: Option<u32>,
pub total_chunk_size: u64,
}
#[derive(Debug, Clone)]
pub struct StoragePipelineSettings {
pub num_compressors: u32,
pub compressor_input_bound: u32,
pub writer_input_bound: u32,
}
pub fn existence_checker_stage<RP: RawPile>(
pile: Arc<Pile<RP>>,
next_stage: Sender<(ChunkId, Vec<u8>)>,
) -> Sender<(ChunkId, Vec<u8>)> {
// TODO(newver) Do better than this.
let shared_seen_set: Arc<Mutex<HashSet<ChunkId>>> = Default::default();
let (tx, rx) = crossbeam_channel::bounded::<(ChunkId, Vec<u8>)>(32);
// TODO would like something better for the networked case
for _ in 0..get_number_of_workers("YAMA_EXISTENCE_CHECKERS") {
let shared_seen_set = shared_seen_set.clone();
let next_stage = next_stage.clone();
let rx = rx.clone();
let pile = pile.clone();
std::thread::Builder::new()
.name("yama exist?er".to_string())
.spawn(move || {
while let Ok((chunk_id, chunk)) = rx.recv() {
// TODO handle errors properly
let is_new = { shared_seen_set.lock().unwrap().insert(chunk_id) };
if !is_new {
continue;
}
if !pile.chunk_exists(&chunk_id).unwrap() {
next_stage.send((chunk_id, chunk)).unwrap();
}
}
})
.unwrap();
}
tx
}
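// A sketch of composing this stage in front of a raw pile's pipeline (this mirrors
// its use in store_without_pointer_ops): only chunks that are both unseen in this
// run and absent from the pile reach the downstream compressor/writer stages.
//
//     let pipeline = pile.raw_pile.build_storage_pipeline(sps, control_tx)?;
//     let pipeline = existence_checker_stage(pile.clone(), pipeline);
//     pipeline.send((chunk_id, chunk))?;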
pub enum ControllerMessage {
Failure {
worker_id: Arc<String>,
error_message: String,
},
}
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub enum PipelineDescription {
Store,
Remote,
Integrity,
Compression { dictionary_fingerprint: u64 },
Encryption,
}
pub trait RawPile: Send + Sync + Debug + 'static {
// TODO expose verification errors?
fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool>;
fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>>;
fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()>;
fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()>;
fn delete_many(&self, kind: Keyspace, key: &[&[u8]]) -> anyhow::Result<()>;
fn list_keys(
&self,
kind: Keyspace,
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>>;
/*
fn list_keyvalue_pairs(
&self,
kind: Keyspace,
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<(Vec<u8>, Vec<u8>)>>>>;
*/
fn flush(&self) -> anyhow::Result<()>;
// TODO return a progress Receiver
fn check_lowlevel(&self) -> anyhow::Result<bool>;
/// Return a few statistics, if possible.
fn debug_statistics(&self) -> anyhow::Result<Option<DebugStatistics>> {
Ok(None)
}
fn build_storage_pipeline(
&self,
settings: StoragePipelineSettings,
controller_send: Sender<ControllerMessage>,
) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>>;
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>>;
/// Return a u64 order token that indicates the optimum order to read this chunk in
/// compared to other chunks.
fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64>;
}
impl RawPile for Box<dyn RawPile> {
fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool> {
self.as_ref().exists(kind, key)
}
fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
self.as_ref().read(kind, key)
}
fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
self.as_ref().write(kind, key, value)
}
fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
self.as_ref().delete(kind, key)
}
fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
self.as_ref().delete_many(kind, keys)
}
fn list_keys(
&self,
kind: Keyspace,
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>> {
self.as_ref().list_keys(kind)
}
fn flush(&self) -> anyhow::Result<()> {
self.as_ref().flush()
}
fn check_lowlevel(&self) -> anyhow::Result<bool> {
self.as_ref().check_lowlevel()
}
fn debug_statistics(&self) -> anyhow::Result<Option<DebugStatistics>> {
self.as_ref().debug_statistics()
}
fn build_storage_pipeline(
&self,
settings: StoragePipelineSettings,
controller_send: Sender<ControllerMessage>,
) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
self.as_ref()
.build_storage_pipeline(settings, controller_send)
}
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
self.as_ref().describe_pipeline()
}
fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
self.as_ref().chunk_id_transfer_ordering_hint(chunk_id)
}
}
impl<RP: RawPile> RawPile for Arc<RP> {
fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool> {
self.as_ref().exists(kind, key)
}
fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
self.as_ref().read(kind, key)
}
fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
self.as_ref().write(kind, key, value)
}
fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
self.as_ref().delete(kind, key)
}
fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
self.as_ref().delete_many(kind, keys)
}
fn list_keys(
&self,
kind: Keyspace,
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>> {
self.as_ref().list_keys(kind)
}
fn flush(&self) -> anyhow::Result<()> {
self.as_ref().flush()
}
fn check_lowlevel(&self) -> anyhow::Result<bool> {
self.as_ref().check_lowlevel()
}
fn debug_statistics(&self) -> anyhow::Result<Option<DebugStatistics>> {
self.as_ref().debug_statistics()
}
fn build_storage_pipeline(
&self,
settings: StoragePipelineSettings,
controller_send: Sender<ControllerMessage>,
) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
self.as_ref()
.build_storage_pipeline(settings, controller_send)
}
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
self.as_ref().describe_pipeline()
}
fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
self.as_ref().chunk_id_transfer_ordering_hint(chunk_id)
}
}
#[derive(Debug)]
pub struct Pile<R: RawPile> {
pub raw_pile: R,
pub racy_submission_mutex: Mutex<HashSet<ChunkId>>,
pub racy_submission_condvar: Condvar,
}
impl<R: RawPile> Pile<R> {
pub fn new(raw_pile: R) -> Self {
Pile {
raw_pile,
racy_submission_mutex: Mutex::new(Default::default()),
racy_submission_condvar: Default::default(),
}
}
// TODO(clarity, features): have a special kind of error for verification failures
// may be wanted for best-effort restores
pub fn read_chunk(&self, key: &ChunkId) -> anyhow::Result<Option<Vec<u8>>> {
self.raw_pile.read(Keyspace::Chunk, key)
/*
let result = self.raw_pile.read(Keyspace::Chunk, &key)?;
if let Some(chunk) = result {
if verify {
let hash = self
.raw_pile
.read(Keyspace::ChunkHash, &key)?
.ok_or_else(|| {
anyhow!(
"Hash not found for chunk {}; can't verify",
bytes_to_hexstring(&key)
)
})?;
let mut hasher = twox_hash::XxHash64::with_seed(XXH64_SEED);
hasher.write(&chunk);
let computed_hash = hasher.finish().to_be_bytes();
if &computed_hash[..] != &hash {
bail!(
"Hash mismatch for chunk {}: expected {} computed {}",
bytes_to_hexstring(&key),
bytes_to_hexstring(&hash),
bytes_to_hexstring(&computed_hash),
);
}
}
Ok(Some(chunk))
} else {
Ok(None)
}
*/
}
pub fn write_chunk(&self, key: &ChunkId, value: &[u8]) -> anyhow::Result<()> {
self.raw_pile.write(Keyspace::Chunk, key, value)
}
pub fn chunk_exists(&self, key: &ChunkId) -> anyhow::Result<bool> {
self.raw_pile.exists(Keyspace::Chunk, key)
}
pub fn read_pointer(&self, key: &str) -> anyhow::Result<Option<PointerData>> {
Ok(
if let Some(pointer_data_raw) = self.raw_pile.read(Keyspace::Pointer, key.as_bytes())? {
Some(serde_bare::from_slice(&pointer_data_raw)?)
} else {
None
},
)
}
pub fn write_pointer(&self, key: &str, pointer: &PointerData) -> anyhow::Result<()> {
self.raw_pile.write(
Keyspace::Pointer,
key.as_bytes(),
&serde_bare::to_vec(pointer)?,
)
}
pub fn delete_pointer(&self, key: &str) -> anyhow::Result<()> {
self.raw_pile.delete(Keyspace::Pointer, key.as_bytes())
}
pub fn list_pointers(&self) -> anyhow::Result<Vec<String>> {
let mut result = Vec::new();
for key in self.raw_pile.list_keys(Keyspace::Pointer)? {
result.push(String::from_utf8(key?)?);
}
Ok(result)
}
pub fn submit_chunk(&self, chunk_id: ChunkId, chunk_data: &[u8]) -> anyhow::Result<()> {
let mut racy_submissions = self.racy_submission_mutex.lock().unwrap();
if racy_submissions.insert(chunk_id) {
drop(racy_submissions);
if !self.chunk_exists(&chunk_id)? {
self.write_chunk(&chunk_id, chunk_data)?;
}
racy_submissions = self.racy_submission_mutex.lock().unwrap();
racy_submissions.remove(&chunk_id);
// wake up anyone who might be waiting for this chunk
self.racy_submission_condvar.notify_all();
} else {
loop {
racy_submissions = self.racy_submission_condvar.wait(racy_submissions).unwrap();
if !racy_submissions.contains(&chunk_id) {
break;
}
}
}
Ok(())
}
/// Flushes buffered writes. Should really run this before exiting, so I can sleep better at
/// night (rather than relying on the destructor).
pub fn flush(&self) -> anyhow::Result<()> {
self.raw_pile.flush()
}
}
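A sketch of how the layers defined above compose in practice (this mirrors open_pile_with_work_bypass; the settings value is illustrative):

let raw = RawPileIntegrityChecker::new(SqliteBloblogPile::open(dir)?);
let (compressed, _handles) = RawPileCompressor::new(raw, compression_settings)?;
let pile = Pile::new(compressed);
pile.submit_chunk(chunk_id, &chunk_data)?; // safe against racing duplicate submissions
pile.flush()?;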


@ -1,141 +0,0 @@
use crate::chunking::calculate_chunkid;
use crate::definitions::ChunkId;
use crate::pile::{
ControllerMessage, Keyspace, PipelineDescription, RawPile, StoragePipelineSettings,
};
use anyhow::{anyhow, bail};
use crossbeam_channel::{Receiver, Sender};
use derivative::Derivative;
use std::sync::Arc;
use std::thread;
/// PileGuard is a wrapper around a pile that prevents data exfiltration and malicious corruption.
/// It's basically a firewall for a Pile.
/// Preventing malicious corruption requires the chunks to be unprocessed. This way, their ID can be
/// checked by this module.
#[derive(Debug, Derivative)]
#[derivative(Clone(bound = ""))]
// we need to use derivative's Clone impl because Arc<R> causes R to have a bound on Clone
// even though that's not needed. https://github.com/rust-lang/rust/issues/26925
pub struct PileGuard<R: Clone + RawPile> {
underlying: R,
/// Whether to verify chunk IDs to prevent malicious corruption
verify_chunk_ids: bool,
}
fn pipeline(
subsequent_pipeline: Sender<(ChunkId, Vec<u8>)>,
input: Receiver<(ChunkId, Vec<u8>)>,
) -> anyhow::Result<()> {
while let Ok((claimed_chunk_id, chunk)) = input.recv() {
let actual_chunk_id = calculate_chunkid(&chunk);
if actual_chunk_id != claimed_chunk_id {
bail!("CHUNK ID MISMATCH — is this forgery? (malicious storage process?) claimed{:?} actually{:?}", claimed_chunk_id, actual_chunk_id);
}
subsequent_pipeline
.send((claimed_chunk_id, chunk))
.map_err(|_| anyhow!("Subsequent step closed"))?;
}
Ok(())
}
impl<R: Clone + RawPile> PileGuard<R> {
pub fn new(underlying: R, verify_chunk_ids: bool) -> Self {
PileGuard {
underlying,
verify_chunk_ids,
}
}
}
impl<R: Clone + RawPile> RawPile for PileGuard<R> {
fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool> {
match kind {
Keyspace::Chunk => self.underlying.exists(kind, key),
Keyspace::ChunkHash => {
bail!("Access denied");
}
Keyspace::Pointer => {
bail!("Access denied");
}
}
}
fn read(&self, _kind: Keyspace, _key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
bail!("Access denied");
}
fn write(&self, kind: Keyspace, _key: &[u8], _value: &[u8]) -> anyhow::Result<()> {
match kind {
Keyspace::Chunk => {
todo!()
}
Keyspace::ChunkHash => {
bail!("Access denied");
}
Keyspace::Pointer => {
bail!("Access denied");
}
}
}
fn delete(&self, _kind: Keyspace, _key: &[u8]) -> anyhow::Result<()> {
bail!("Access denied");
}
fn delete_many(&self, _kind: Keyspace, _keys: &[&[u8]]) -> anyhow::Result<()> {
bail!("Access denied");
}
fn list_keys(
&self,
_kind: Keyspace,
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>> {
bail!("Access denied");
}
fn flush(&self) -> anyhow::Result<()> {
self.underlying.flush()
}
fn check_lowlevel(&self) -> anyhow::Result<bool> {
self.underlying.check_lowlevel()
}
fn build_storage_pipeline(
&self,
settings: StoragePipelineSettings,
controller_send: Sender<ControllerMessage>,
) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
let subsequent_pipeline = self
.underlying
.build_storage_pipeline(settings.clone(), controller_send.clone())?;
let (input_to_this_stage, receiver) = crossbeam_channel::bounded(8);
thread::Builder::new()
.name("yama Aguard".to_owned())
.spawn(move || {
if let Err(err) = pipeline(subsequent_pipeline, receiver) {
controller_send
.send(ControllerMessage::Failure {
worker_id: Arc::new(String::from("accessguard")),
error_message: format!("err {:?}", err),
})
.expect("This is BAD: failed to send failure message to controller.");
}
})
.unwrap();
Ok(input_to_this_stage)
}
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
// TODO(question) Should we be described in the pipeline?
self.underlying.describe_pipeline()
}
fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
self.underlying.chunk_id_transfer_ordering_hint(chunk_id)
}
}
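A minimal sketch of guarding a pile that receives chunks from an untrusted peer (the wrapped pile, settings, and channel values are illustrative):

let guard = PileGuard::new(raw_pile, true);
let tx = guard.build_storage_pipeline(settings, controller_send)?;
// Every (ChunkId, chunk) sent into `tx` is re-hashed with calculate_chunkid; a
// mismatch aborts the stage rather than storing a forged chunk.
tx.send((chunk_id, chunk))?;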


@ -1,359 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::convert::TryInto;
use std::sync::Arc;
use std::thread;
use std::thread::JoinHandle;
use anyhow::anyhow;
use crossbeam_channel::{Receiver, Sender};
use derivative::Derivative;
use log::error;
use metrics::{register_counter, Unit};
use zstd::bulk::{Compressor, Decompressor};
use crate::definitions::ChunkId;
use crate::pile::{
ControllerMessage, DebugStatistics, Keyspace, PipelineDescription, RawPile,
StoragePipelineSettings,
};
pub const DECOMPRESS_CAPACITY: usize = 32 * 1024 * 1024;
#[derive(Clone, Debug)]
pub struct CompressionSettings {
/// Raw dictionary to pass to Zstd for compression and decompression
pub dictionary: Arc<Vec<u8>>,
/// The compression level, passed to Zstd.
pub level: i32,
/// The number of compressor threads to use.
pub num_compressors: u32,
/// The number of decompressor threads to use.
pub num_decompressors: u32,
}
#[derive(Debug, Derivative)]
#[derivative(Clone(bound = ""))]
// we need to use derivative's Clone impl because Arc<R> causes R to have a bound on Clone
// even though that's not needed. https://github.com/rust-lang/rust/issues/26925
pub struct RawPileCompressor<R: RawPile> {
underlying: Arc<R>,
compressor: Option<Sender<(Vec<u8>, Sender<Vec<u8>>)>>,
decompressor: Option<Sender<(Vec<u8>, Sender<Vec<u8>>)>>,
settings: Arc<CompressionSettings>,
}
impl<R: RawPile> RawPileCompressor<R> {
pub fn new(
underlying: R,
settings: CompressionSettings,
) -> anyhow::Result<(Self, Vec<JoinHandle<()>>)> {
register_counter!(
"compressor_in_bytes",
Unit::Bytes,
"Number of bytes that have been fed into the compressor"
);
register_counter!(
"compressor_out_bytes",
Unit::Bytes,
"Number of bytes that have come out of the compressor"
);
register_counter!(
"compressor_chunks",
Unit::Count,
"Number of chunks that have been compressed"
);
register_counter!(
"decompressor_in_bytes",
Unit::Bytes,
"Number of bytes that have been fed into the decompressor"
);
register_counter!(
"decompressor_out_bytes",
Unit::Bytes,
"Number of bytes that have come out of the decompressor"
);
register_counter!(
"decompressor_chunks",
Unit::Count,
"Number of chunks that have been decompressed"
);
if settings.num_compressors == 0 && settings.num_decompressors == 0 {
// optimisation for when we're only building a pipeline: we don't want to spawn
// compressor/decompressor worker threads that would never receive any work.
return Ok((
RawPileCompressor {
underlying: Arc::new(underlying),
compressor: None,
decompressor: None,
settings: Arc::new(settings),
},
Vec::with_capacity(0),
));
}
let (com_s, com_r) = crossbeam_channel::bounded(4);
let (dec_s, dec_r) = crossbeam_channel::bounded(4);
let mut handles = Vec::new();
for worker in 0..settings.num_compressors {
let settings = settings.clone();
let com_r = com_r.clone();
let builder = thread::Builder::new().name(format!("yama compressor {}", worker));
handles.push(builder.spawn(move || {
if let Err(e) = Self::compressor_worker(com_r, settings) {
error!("compressor worker failed: {:?}", e);
}
})?);
}
for worker in 0..settings.num_decompressors {
let settings = settings.clone();
let dec_r = dec_r.clone();
let builder = thread::Builder::new().name(format!("yama decompressor {}", worker));
handles.push(builder.spawn(move || {
if let Err(e) = Self::decompressor_worker(dec_r, settings) {
error!("decompressor worker failed: {:?}", e);
}
})?);
}
Ok((
RawPileCompressor {
underlying: Arc::new(underlying),
compressor: Some(com_s),
decompressor: Some(dec_s),
settings: Arc::new(settings),
},
handles,
))
}
fn compressor_worker(
queue: Receiver<(Vec<u8>, Sender<Vec<u8>>)>,
settings: CompressionSettings,
) -> anyhow::Result<()> {
let mut compressor =
Compressor::with_dictionary(settings.level, settings.dictionary.as_ref())?;
while let Ok((job, response_sender)) = queue.recv() {
let result = compressor.compress(&job)?;
response_sender
.send(result)
.or(Err(anyhow!("Couldn't send compression result")))?;
}
Ok(())
}
fn decompressor_worker(
queue: Receiver<(Vec<u8>, Sender<Vec<u8>>)>,
settings: CompressionSettings,
) -> anyhow::Result<()> {
let mut decompressor = Decompressor::with_dictionary(settings.dictionary.as_ref())?;
while let Ok((job, response_sender)) = queue.recv() {
let result = decompressor.decompress(&job, DECOMPRESS_CAPACITY)?;
response_sender
.send(result)
.or(Err(anyhow!("Couldn't send decompression result")))?;
}
Ok(())
}
fn decompress(&self, data: &[u8]) -> anyhow::Result<Vec<u8>> {
let (ret_s, ret_r) = crossbeam_channel::bounded(0);
self.decompressor
.as_ref()
.expect("No decompressors configured")
.send((data.to_vec(), ret_s))
.or(Err(anyhow!("couldn't send to decompressor")))?;
Ok(ret_r.recv().or(Err(anyhow!("couldn't receive result")))?)
}
fn compress(&self, compressed_data: &[u8]) -> anyhow::Result<Vec<u8>> {
let (ret_s, ret_r) = crossbeam_channel::bounded(0);
self.compressor
.as_ref()
.expect("No compressors configured")
.send((compressed_data.to_vec(), ret_s))
.or(Err(anyhow!("couldn't send to compressor")))?;
Ok(ret_r.recv().or(Err(anyhow!("couldn't receive result")))?)
}
fn storage_pipeline_worker(
&self,
next_stage: Sender<(ChunkId, Vec<u8>)>,
input: Receiver<(ChunkId, Vec<u8>)>,
worker_id: String,
) -> anyhow::Result<()> {
// the worker ID has to live forever, so we leak it :/
let worker_id: &'static str = Box::leak(worker_id.into_boxed_str());
metrics::register_histogram!(
"compressor_idle_time",
metrics::Unit::Seconds,
"Time spent waiting between chunks",
"id" => worker_id
);
metrics::register_counter!(
"compressor_bytes_input",
metrics::Unit::Bytes,
"Number of bytes input into the compressor.",
"id" => worker_id
);
metrics::register_counter!(
"compressor_bytes_output",
metrics::Unit::Bytes,
"Number of bytes output from the compressor.",
"id" => worker_id
);
metrics::register_counter!(
"compressor_chunks_processed",
metrics::Unit::Count,
"Number of bytes input into the compressor.",
"id" => worker_id
);
let mut compressor =
Compressor::with_dictionary(self.settings.level, self.settings.dictionary.as_ref())?;
while let Ok((chunk_id, bytes)) = input.recv() {
let in_bytes = bytes.len();
let bytes = compressor.compress(&bytes)?;
let out_bytes = bytes.len();
next_stage.send((chunk_id, bytes))?;
// Per-worker metrics
// TODO rename
metrics::counter!("compressor_bytes_input", in_bytes as u64, "id" => worker_id);
metrics::counter!("compressor_bytes_output", out_bytes as u64, "id" => worker_id);
// Global metrics
metrics::counter!("compressor_in_bytes", in_bytes as u64);
metrics::counter!("compressor_out_bytes", out_bytes as u64);
metrics::increment_counter!("compressor_chunks");
}
Ok(())
}
}
impl<R: RawPile> RawPile for RawPileCompressor<R> {
fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool> {
self.underlying.exists(kind, key)
}
fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
if let Some(data) = self.underlying.read(kind, key)? {
Ok(Some(self.decompress(&data)?))
} else {
Ok(None)
}
}
fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
let compressed = self.compress(value)?;
self.underlying.write(kind, key, &compressed)
}
fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
self.underlying.delete(kind, key)
}
fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
self.underlying.delete_many(kind, keys)
}
fn list_keys(
&self,
kind: Keyspace,
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>> {
self.underlying.list_keys(kind)
}
fn flush(&self) -> anyhow::Result<()> {
self.underlying.flush()
}
fn check_lowlevel(&self) -> anyhow::Result<bool> {
self.underlying.check_lowlevel()
}
fn debug_statistics(&self) -> anyhow::Result<Option<DebugStatistics>> {
self.underlying.debug_statistics()
}
fn build_storage_pipeline(
&self,
settings: StoragePipelineSettings,
controller_send: Sender<ControllerMessage>,
) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
// this one should have a few threads behind it! yarr!
let subsequent_pipeline = self
.underlying
.build_storage_pipeline(settings.clone(), controller_send.clone())?;
let (input_to_this_stage, receiver) =
crossbeam_channel::bounded(settings.compressor_input_bound as usize);
for compressor_number in 0..settings.num_compressors {
let subsequent_pipeline = subsequent_pipeline.clone();
let receiver = receiver.clone();
let controller_send = controller_send.clone();
let this = (*self).clone();
thread::Builder::new()
.name(format!("yama Pcomp{}", compressor_number))
.spawn(move || {
let worker_id = Arc::new(format!("compressor-{}", compressor_number));
if let Err(err) = this.storage_pipeline_worker(
subsequent_pipeline,
receiver,
worker_id.to_string(),
) {
controller_send
.send(ControllerMessage::Failure {
worker_id,
error_message: format!("err {:?}", err),
})
.expect("This is BAD: failed to send failure message to controller.");
}
})
.unwrap();
}
Ok(input_to_this_stage)
}
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
let mut underlying = self.underlying.describe_pipeline()?;
let mut dict_fingerprint_u256 = [0; 32];
blake::hash(256, &self.settings.dictionary, &mut dict_fingerprint_u256)?;
let dictionary_fingerprint: u64 =
u64::from_be_bytes(dict_fingerprint_u256[0..8].try_into().unwrap());
underlying.push(PipelineDescription::Compression {
dictionary_fingerprint,
});
Ok(underlying)
}
fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
self.underlying.chunk_id_transfer_ordering_hint(chunk_id)
}
}
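A sketch of constructing the compressor layer above (the dictionary filename follows the convention used elsewhere in this diff; the level and worker counts are illustrative):

let mut dictionary = Vec::new();
File::open(dir.join("important_zstd.dict"))?.read_to_end(&mut dictionary)?;
let (compressed_pile, _handles) = RawPileCompressor::new(
    underlying,
    CompressionSettings {
        dictionary: Arc::new(dictionary),
        level: 12,          // illustrative Zstd level
        num_compressors: 2, // passing 0/0 skips spawning workers (pipeline-only mode)
        num_decompressors: 2,
    },
)?;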


@ -1,138 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use anyhow::anyhow;
use log::warn;
use sodiumoxide::crypto::secretbox;
use sodiumoxide::crypto::secretbox::{Key, Nonce, NONCEBYTES};
use crate::definitions::ChunkId;
use crate::pile::{
ControllerMessage, Keyspace, PipelineDescription, RawPile, StoragePipelineSettings,
};
use crossbeam_channel::Sender;
/// A RawPile that provides encryption of chunk contents.
/// Please note that keys are not currently encrypted, so this scheme is not CPA-secure.
/// It seems easily possible to test the pile for inclusion of a known file (by first chunking it and
/// looking for matching chunk IDs).
/// Use of compression with a custom Zstd dictionary may make that harder, but in general it seems
/// dubious to rely on that.
/// This feature will be revisited soon...
/// Notably, keys should be passed through a secure permutation first.
#[derive(Debug)]
pub struct RawPileEncryptor<R: RawPile> {
underlying: R,
secret_key: Key,
}
impl<R: RawPile> RawPileEncryptor<R> {
pub fn new(underlying: R, key: Key) -> Self {
warn!(
"WARNING! Encrypted RawPiles are not CPA secure. Do not rely on them for security yet!"
);
RawPileEncryptor {
underlying,
secret_key: key,
}
}
fn decrypt(&self, kind: Keyspace, key: &[u8], data: &[u8]) -> anyhow::Result<Vec<u8>> {
Ok(if kind == Keyspace::Chunk {
let mut nonce = [0u8; NONCEBYTES];
nonce[0..key.len()].copy_from_slice(key);
secretbox::open(data, &Nonce(nonce), &self.secret_key)
.or(Err(anyhow!("Failed to decrypt")))?
} else {
let mut nonce = [0u8; NONCEBYTES];
nonce.copy_from_slice(&data[0..NONCEBYTES]);
secretbox::open(&data[NONCEBYTES..], &Nonce(nonce), &self.secret_key)
.or(Err(anyhow!("Failed to decrypt")))?
})
}
fn encrypt(&self, kind: Keyspace, key: &[u8], data: &[u8]) -> Vec<u8> {
if kind == Keyspace::Chunk {
let mut nonce = [0u8; NONCEBYTES];
nonce[0..key.len()].copy_from_slice(key);
secretbox::seal(data, &Nonce(nonce), &self.secret_key)
} else {
let nonce = secretbox::gen_nonce();
let mut out = Vec::new();
out.extend_from_slice(&nonce.0);
out.extend_from_slice(&secretbox::seal(data, &nonce, &self.secret_key));
out
}
}
}
impl<R: RawPile> RawPile for RawPileEncryptor<R> {
fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool> {
self.underlying.exists(kind, key)
}
fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
if let Some(data) = self.underlying.read(kind, key)? {
Ok(Some(self.decrypt(kind, key, &data)?))
} else {
Ok(None)
}
}
fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
let encrypted = self.encrypt(kind, key, value);
self.underlying.write(kind, key, &encrypted)
}
fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
self.underlying.delete(kind, key)
}
fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
self.underlying.delete_many(kind, keys)
}
fn list_keys(
&self,
kind: Keyspace,
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>> {
self.underlying.list_keys(kind)
}
fn flush(&self) -> anyhow::Result<()> {
self.underlying.flush()
}
fn check_lowlevel(&self) -> anyhow::Result<bool> {
self.underlying.check_lowlevel()
}
fn build_storage_pipeline(
&self,
_settings: StoragePipelineSettings,
_controller_send: Sender<ControllerMessage>,
) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
todo!()
}
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
let mut underlying = self.underlying.describe_pipeline()?;
underlying.push(PipelineDescription::Encryption);
Ok(underlying)
}
fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
self.underlying.chunk_id_transfer_ordering_hint(chunk_id)
}
}
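A round-trip sketch of the scheme above (the key generation and chunk values are illustrative):

let enc = RawPileEncryptor::new(underlying, secretbox::gen_key());
enc.write(Keyspace::Chunk, &chunk_id, &chunk_data)?; // chunk ID bytes double as the nonce
let roundtrip = enc.read(Keyspace::Chunk, &chunk_id)?; // re-derives the nonce from the key
assert_eq!(roundtrip.as_deref(), Some(&chunk_data[..]));

For non-chunk keyspaces a fresh random nonce is generated and prepended to the ciphertext, which is why decrypt splits off the first NONCEBYTES bytes.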


@ -1,160 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::hash::Hasher;
use thiserror::Error;
use crate::definitions::{ChunkId, XXH64_SEED};
use crate::pile::{
ControllerMessage, DebugStatistics, Keyspace, PipelineDescription, RawPile,
StoragePipelineSettings,
};
use crate::utils::bytes_to_hexstring;
use crossbeam_channel::Sender;
/// This RawPile enables checking the integrity of stored chunks.
/// This is done by storing a hash along with the chunk contents, which can later be verified.
#[derive(Debug)]
pub struct RawPileIntegrityChecker<RP: RawPile> {
underlying: RP,
}
impl<RP: RawPile> RawPileIntegrityChecker<RP> {
pub fn new(underlying: RP) -> Self {
RawPileIntegrityChecker { underlying }
}
}
#[derive(Error, Debug)]
#[error("Integrity error for chunk {chunk_id}; expected XXHash {expected_hash} but computed {computed_hash}!")]
pub struct IntegrityError {
pub chunk_id: String,
pub expected_hash: String,
pub computed_hash: String,
}
impl<RP: RawPile> RawPile for RawPileIntegrityChecker<RP> {
fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool> {
self.underlying.exists(kind, key)
}
fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
match self.underlying.read(kind, key)? {
None => Ok(None),
Some(mut data_then_hash) => {
let len = data_then_hash.len();
let data_only = &data_then_hash[..len - 8];
let xxhash = &data_then_hash[len - 8..];
let mut hasher = twox_hash::XxHash64::with_seed(XXH64_SEED);
hasher.write(&data_only);
let computed_hash = hasher.finish().to_be_bytes();
if computed_hash != xxhash {
Err(IntegrityError {
chunk_id: bytes_to_hexstring(key),
expected_hash: bytes_to_hexstring(&xxhash),
computed_hash: bytes_to_hexstring(&computed_hash),
})?;
}
// remove hash from end
data_then_hash.drain(len - 8..);
Ok(Some(data_then_hash))
}
}
}
fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
// start with the data
let mut buf = Vec::new();
buf.extend_from_slice(&value[..]);
// then append the hash
let mut hasher = twox_hash::XxHash64::with_seed(XXH64_SEED);
hasher.write(&value);
let computed_hash = hasher.finish().to_be_bytes();
buf.extend_from_slice(&computed_hash);
self.underlying.write(kind, key, &buf)
}
fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
self.underlying.delete(kind, key)
}
fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
self.underlying.delete_many(kind, keys)
}
fn list_keys(
&self,
kind: Keyspace,
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>> {
self.underlying.list_keys(kind)
}
fn flush(&self) -> anyhow::Result<()> {
self.underlying.flush()
}
fn check_lowlevel(&self) -> anyhow::Result<bool> {
// TODO integrity check ...?
self.underlying.check_lowlevel()
}
fn debug_statistics(&self) -> anyhow::Result<Option<DebugStatistics>> {
self.underlying.debug_statistics()
}
fn build_storage_pipeline(
&self,
settings: StoragePipelineSettings,
controller_send: Sender<ControllerMessage>,
) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
// TODO primitive implementation but good enough for now.
// May want metrics later?
let next_stage = self
.underlying
.build_storage_pipeline(settings, controller_send)?;
let (input, receiver) = crossbeam_channel::bounded::<(ChunkId, Vec<u8>)>(64);
std::thread::Builder::new()
.name("yama integrity".to_string())
.spawn(move || {
while let Ok((chunk_id, mut chunk)) = receiver.recv() {
let mut hasher = twox_hash::XxHash64::with_seed(XXH64_SEED);
hasher.write(&chunk);
let computed_hash = hasher.finish().to_be_bytes();
chunk.extend_from_slice(&computed_hash);
next_stage.send((chunk_id, chunk)).unwrap();
}
})
.unwrap();
Ok(input)
}
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
let mut underlying = self.underlying.describe_pipeline()?;
underlying.push(PipelineDescription::Integrity);
Ok(underlying)
}
fn chunk_id_transfer_ordering_hint(&self, chunk_id: &ChunkId) -> anyhow::Result<u64> {
self.underlying.chunk_id_transfer_ordering_hint(chunk_id)
}
}
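The stored layout above is simply the chunk bytes followed by the big-endian XXH64 of those bytes. A minimal standalone sketch of that framing (these helper names are illustrative, not part of the crate):

// Sketch: the trailer framing used by RawPileIntegrityChecker, in isolation.
fn frame_chunk(chunk: &[u8]) -> Vec<u8> {
    let mut hasher = twox_hash::XxHash64::with_seed(XXH64_SEED);
    hasher.write(chunk);
    let mut framed = chunk.to_vec();
    framed.extend_from_slice(&hasher.finish().to_be_bytes());
    framed
}

fn unframe_chunk(mut framed: Vec<u8>) -> anyhow::Result<Vec<u8>> {
    anyhow::ensure!(framed.len() >= 8, "too short to hold an XXH64 trailer");
    let split = framed.len() - 8;
    let mut hasher = twox_hash::XxHash64::with_seed(XXH64_SEED);
    hasher.write(&framed[..split]);
    anyhow::ensure!(
        hasher.finish().to_be_bytes() == framed[split..],
        "hash mismatch"
    );
    framed.truncate(split);
    Ok(framed)
}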

File diff suppressed because it is too large


@ -0,0 +1,68 @@
use eyre::{bail, Context};
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use yama_wormfile::boxed::BoxedWormFileProvider;
use yama_wormfile_fs::LocalWormFilesystem;
use yama_wormfile_sftp::SftpWormFilesystem;
#[derive(Clone, Serialize, Deserialize, Debug, Hash)]
#[serde(tag = "scheme")]
pub enum PileConnectionScheme {
#[serde(rename = "local")]
Local { directory: PathBuf },
#[serde(rename = "sftp")]
Sftp {
user_at_host: String,
directory: String,
},
#[serde(rename = "s3")]
S3 {},
}
impl PileConnectionScheme {
pub async fn connect_to_wormfileprovider(&self) -> eyre::Result<BoxedWormFileProvider> {
match self {
PileConnectionScheme::Local { directory } => {
if directory.exists() {
if !directory.is_dir() {
bail!("Can't connect to local pile {directory:?}: not a directory.");
}
} else {
tokio::fs::create_dir(directory)
.await
.context("Can't connect to local pile; can't create directory.")?;
}
Ok(BoxedWormFileProvider::new(LocalWormFilesystem::new(
directory,
)?))
}
PileConnectionScheme::Sftp {
user_at_host,
directory,
} => Ok(BoxedWormFileProvider::new(
SftpWormFilesystem::new(user_at_host, directory)
.await
.context("Failed SFTP connection")?,
)),
PileConnectionScheme::S3 { .. } => {
//S3WormFilesystem::new()
todo!()
}
}
}
}
#[derive(Clone, Serialize, Deserialize)]
pub struct PileConnectionDetails {
#[serde(flatten)]
pub scheme: PileConnectionScheme,
pub keyring: PathBuf,
}
impl PileConnectionDetails {
pub async fn connect(self) -> eyre::Result<()> {
// TODO
Ok(())
}
}
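Since the enum above is internally tagged on `scheme`, a connection stanza carries a `scheme` key next to the variant's own fields. A hedged sketch of deserializing it, assuming a TOML-shaped config and the `toml` crate (the real config layout may differ):

// Sketch: deserializing a PileConnectionScheme from TOML stanzas.
fn deserialize_examples() -> eyre::Result<()> {
    let local: PileConnectionScheme = toml::from_str(
        r#"
        scheme = "local"
        directory = "/srv/backups/pile"
        "#,
    )?;
    let sftp: PileConnectionScheme = toml::from_str(
        r#"
        scheme = "sftp"
        user_at_host = "backup@host.example"
        directory = "piles/main"
        "#,
    )?;
    // connect_to_wormfileprovider() would then yield a BoxedWormFileProvider.
    let _ = (local, sftp);
    Ok(())
}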


@ -0,0 +1,72 @@
use yama_localcache::Store;
use yama_pile::Pile;
use yama_pile::{pointers::Pointer, tree::IntegrationStats};
use yama_wormfile::WormFileProvider;
use crate::scan::integrate_uid_or_gid_map;
use async_recursion::async_recursion;
use eyre::{Context, ContextCompat};
use yama_pile::tree::integrate_node_in_place;
pub struct PileWithCache<WFP: WormFileProvider> {
pub pile: Pile<WFP>,
pub localcache: Store,
}
#[derive(Clone, Debug, Default)]
pub struct PointerIntegrationStatistics {
pub integration: IntegrationStats,
/// Number of pointers that were integrated to get here.
pub depth: u64,
}
impl<WFP: WormFileProvider + 'static> PileWithCache<WFP> {
pub async fn fully_integrate_pointer_in_place(
&self,
pointer: &mut Pointer,
stats: &mut PointerIntegrationStatistics,
) -> eyre::Result<()> {
if let Some(parent_pointer_name) = pointer.parent.as_ref() {
let parent_pointer = self
.read_pointer_fully_integrated(parent_pointer_name, stats)
.await
.with_context(|| {
format!("failed to read pointer {parent_pointer_name} whilst integrating")
})?
.with_context(|| {
format!("whilst integrating, expected pointer {parent_pointer_name} to exist")
})?;
stats.integration +=
integrate_node_in_place(&mut pointer.root.node, &parent_pointer.root.node);
integrate_uid_or_gid_map(&mut pointer.uids, &parent_pointer.uids);
integrate_uid_or_gid_map(&mut pointer.gids, &parent_pointer.gids);
pointer.parent = None;
stats.depth += 1;
}
Ok(())
}
#[async_recursion]
pub async fn read_pointer_fully_integrated(
&self,
name: &str,
stats: &mut PointerIntegrationStatistics,
) -> eyre::Result<Option<Pointer>> {
match self.pile.read_pointer(name).await? {
Some(mut pointer) => {
self.fully_integrate_pointer_in_place(&mut pointer, stats)
.await?;
Ok(Some(pointer))
}
None => Ok(None),
}
}
/// Gracefully close this pile + local cache.
pub async fn close(self) -> eyre::Result<()> {
self.pile.close().await?;
Ok(())
}
}
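In use, a caller asks for a pointer by name and gets it back with the whole parent chain folded in; `stats.depth` then reports how many parents were integrated. A sketch, assuming `pwc` is an already-opened pile and the pointer name is hypothetical:

// Sketch: reading a pointer with full parent integration.
async fn example(
    pwc: &PileWithCache<yama_wormfile::boxed::BoxedWormFileProvider>,
) -> eyre::Result<()> {
    let mut stats = PointerIntegrationStatistics::default();
    if let Some(pointer) = pwc
        .read_pointer_fully_integrated("host+2024-09-29", &mut stats)
        .await?
    {
        // Everything from the parent chain is now merged into this pointer.
        assert!(pointer.parent.is_none());
        println!("integrated {} parent pointer(s)", stats.depth);
    }
    Ok(())
}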


@ -1,42 +0,0 @@
use indicatif::ProgressBar;
pub trait ProgressTracker {
fn inc_progress(&mut self, delta_progress: u64);
fn set_current(&mut self, current_progress: u64);
fn set_max_size(&mut self, max_size: u64);
}
impl ProgressTracker for ProgressBar {
#[inline]
fn set_max_size(&mut self, max_size: u64) {
self.set_length(max_size);
}
#[inline]
fn inc_progress(&mut self, delta_progress: u64) {
self.inc(delta_progress);
}
#[inline]
fn set_current(&mut self, current_progress: u64) {
self.set_position(current_progress);
}
}
/// No-operation progress tracker.
impl ProgressTracker for () {
#[inline]
fn set_max_size(&mut self, _max_size: u64) {
// nop
}
#[inline]
fn inc_progress(&mut self, _delta_progress: u64) {
// nop
}
#[inline]
fn set_current(&mut self, _current_progress: u64) {
// nop
}
}


@ -1,102 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::convert::TryInto;
use std::io::{Read, Write};
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use serde::de::DeserializeOwned;
use serde::{Deserialize, Serialize};
use crate::pile::{Keyspace, PipelineDescription};
pub mod requester;
pub mod responder;
#[derive(Serialize, Deserialize, Clone)]
pub struct Request {
id: u16,
body: RequestBody,
}
#[derive(Serialize, Deserialize, Clone, Eq, PartialEq)]
pub enum RequestBody {
Read {
kind: Keyspace,
key: Vec<u8>,
},
CheckExists {
kind: Keyspace,
key: Vec<u8>,
},
Write {
kind: Keyspace,
key: Vec<u8>,
value: Vec<u8>,
},
Delete {
kind: Keyspace,
key: Vec<u8>,
},
List {
kind: Keyspace,
},
NextBatch {
token: u16,
},
Flush,
LowLevelCheck,
Describe,
Shutdown,
Progress {
current: u64,
max: u64,
},
}
#[derive(Serialize, Deserialize, Clone)]
pub struct Response {
response_to: u16,
body: ResponseBody,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub enum ResponseBody {
Success,
Failed(String),
NotExists,
Data(Vec<u8>),
BatchData {
batch: Vec<Vec<u8>>,
next_token: u16,
},
Description(Vec<PipelineDescription>),
}
pub fn read_message<R: Read, D: DeserializeOwned>(read: &mut R) -> anyhow::Result<D> {
let len = read.read_u32::<BigEndian>()?;
let mut data_vec = vec![0u8; len as usize];
read.read_exact(&mut data_vec)?;
Ok(serde_bare::from_slice(&data_vec)?)
}
pub fn write_message<W: Write, S: Serialize>(write: &mut W, message: &S) -> anyhow::Result<()> {
let data_vec = serde_bare::to_vec(&message)?;
write.write_u32::<BigEndian>(data_vec.len().try_into()?)?;
write.write_all(&data_vec)?;
Ok(())
}
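The wire format is therefore a 4-byte big-endian length prefix followed by a BARE-encoded message. A round trip through an in-memory buffer illustrates the pairing (a sketch, not a test from the tree):

// Sketch: round-tripping a Request through the length-prefixed BARE framing.
#[cfg(test)]
mod framing_sketch {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn round_trip() -> anyhow::Result<()> {
        let req = Request { id: 7, body: RequestBody::Flush };
        let mut buf = Vec::new();
        write_message(&mut buf, &req)?;
        // First four bytes are the big-endian payload length.
        let len = u32::from_be_bytes(buf[0..4].try_into()?);
        assert_eq!(len as usize, buf.len() - 4);
        let decoded: Request = read_message(&mut Cursor::new(&buf))?;
        assert_eq!(decoded.id, 7);
        assert!(decoded.body == RequestBody::Flush);
        Ok(())
    }
}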


@ -1,495 +0,0 @@
use std::collections::HashMap;
use std::io::{stdin, stdout, Read, Write};
use std::sync::{Arc, Mutex};
use std::thread;
use std::thread::JoinHandle;
use anyhow::{anyhow, bail};
use crossbeam_channel::{Receiver, Sender};
use log::{error, info};
use crate::definitions::ChunkId;
use crate::pile::{
ControllerMessage, Keyspace, PipelineDescription, RawPile, StoragePipelineSettings,
};
use crate::remote::{read_message, write_message, Request, RequestBody, Response, ResponseBody};
use metrics::{
gauge, histogram, increment_counter, register_counter, register_gauge, register_histogram, Unit,
};
use std::sync::atomic::{AtomicBool, AtomicU16, Ordering};
use std::time::Instant;
/// A kind of RawPile which can make requests to a RawPile over a pipe (e.g. TCP socket or an
/// SSH connection).
/// The requests are handled by a `Responder` on the other end of the pipe.
#[derive(Debug)]
pub struct Requester {
commands: Sender<(RequestBody, Option<Sender<ResponseBody>>)>,
}
impl Requester {
pub fn new<R: Read + Send + 'static, W: Write + Send + 'static>(
read: R,
write: W,
) -> (Self, Vec<JoinHandle<()>>) {
register_histogram!(
"requester_cmd_response_time_ms",
Unit::Milliseconds,
"Time between request being issued and a response being received"
);
let in_flight: Arc<Mutex<HashMap<u16, (Sender<ResponseBody>, Instant)>>> =
Arc::new(Mutex::new(HashMap::new()));
let (command_sender, command_receiver) = crossbeam_channel::bounded(16);
let mut handles = Vec::new();
let shutdown_signal: Arc<(AtomicU16, AtomicBool)> = Default::default();
{
// Spawn a reader
let in_flight = in_flight.clone();
let shutdown_signal = shutdown_signal.clone();
handles.push(
thread::Builder::new()
.name("ReqstrReader".to_string())
.spawn(move || {
if let Err(e) = Self::reader(read, in_flight, shutdown_signal) {
error!("reader failed: {:?}", e);
}
})
.unwrap(),
);
}
{
// Spawn a writer
let in_flight = in_flight.clone();
let command_receiver = command_receiver.clone();
handles.push(
thread::Builder::new()
.name("ReqstrWriter".to_string())
.spawn(move || {
if let Err(e) =
Self::writer(write, in_flight, command_receiver, shutdown_signal)
{
error!("writer failed: {:?}", e);
}
})
.unwrap(),
);
}
(
Requester {
commands: command_sender,
},
handles,
)
}
pub fn new_from_stdio() -> (Self, Vec<JoinHandle<()>>) {
let in_flight: Arc<Mutex<HashMap<u16, (Sender<ResponseBody>, Instant)>>> =
Arc::new(Mutex::new(HashMap::new()));
let (command_sender, command_receiver) = crossbeam_channel::bounded(16);
let mut handles = Vec::new();
let shutdown_signal: Arc<(AtomicU16, AtomicBool)> = Default::default();
{
// Spawn a reader
let in_flight = in_flight.clone();
let shutdown_signal = shutdown_signal.clone();
handles.push(
thread::Builder::new()
.name("ReqstrReaderSI".to_string())
.spawn(move || {
let stdin = stdin();
let read = stdin.lock();
if let Err(e) = Self::reader(read, in_flight, shutdown_signal) {
error!("reader failed: {:?}", e);
}
})
.unwrap(),
);
}
{
// Spawn a writer
let in_flight = in_flight.clone();
let command_receiver = command_receiver.clone();
handles.push(
thread::Builder::new()
.name("ReqstrWriterSO".to_string())
.spawn(move || {
let stdout = stdout();
let write = stdout.lock();
if let Err(e) =
Self::writer(write, in_flight, command_receiver, shutdown_signal)
{
error!("writer failed: {:?}", e);
}
})
.unwrap(),
);
}
(
Requester {
commands: command_sender,
},
handles,
)
}
pub fn clone_command_sender(&self) -> Sender<(RequestBody, Option<Sender<ResponseBody>>)> {
self.commands.clone()
}
/// Thread that reads messages and sends them along.
fn reader<R: Read>(
mut read: R,
in_flight: Arc<Mutex<HashMap<u16, (Sender<ResponseBody>, Instant)>>>,
shutdown_request_channel: Arc<(AtomicU16, AtomicBool)>,
) -> anyhow::Result<()> {
loop {
let response: Response = read_message(&mut read)?;
if shutdown_request_channel.1.load(Ordering::Relaxed)
&& response.response_to == shutdown_request_channel.0.load(Ordering::Relaxed)
{
return Ok(());
}
let mut map = in_flight.lock().or(Err(anyhow!("Mutex poisoned")))?;
// We free up the ID as we get the sender out of the map.
let (resp_sender, req_instant) = map
.remove(&response.response_to)
.ok_or(anyhow!("Didn't find response channel..."))?;
let req_resp_time_in_millis =
Instant::now().duration_since(req_instant).as_millis() as f64;
histogram!("requester_cmd_response_time_ms", req_resp_time_in_millis);
resp_sender
.send(response.body)
.or(Err(anyhow!("Failed to send response to channel")))?;
}
}
/// Thread that writes messages.
fn writer<W: Write>(
mut write: W,
in_flight: Arc<Mutex<HashMap<u16, (Sender<ResponseBody>, Instant)>>>,
command_receiver: Receiver<(RequestBody, Option<Sender<ResponseBody>>)>,
shutdown_request_channel: Arc<(AtomicU16, AtomicBool)>,
) -> anyhow::Result<()> {
while let Ok((req_body, response_channel)) = command_receiver.recv() {
let request_id = if let Some(response_channel) = response_channel {
let mut map = in_flight.lock().or(Err(anyhow!("Mutex poisoned")))?;
let request_id = (1u16..u16::MAX)
.into_iter()
.find(|id| !map.contains_key(&id))
.expect("No ID found");
let now = Instant::now();
map.insert(request_id, (response_channel, now));
request_id
} else {
0
};
let shutting_down = &req_body == &RequestBody::Shutdown;
if shutting_down {
shutdown_request_channel
.0
.store(request_id, Ordering::SeqCst);
shutdown_request_channel.1.store(true, Ordering::SeqCst);
}
write_message(
&mut write,
&Request {
id: request_id,
body: req_body,
},
)?;
write.flush()?;
if shutting_down {
return Ok(());
}
}
info!("Exited send loop without shutdown message, issuing Shutdown.");
// shutdown ourselves
let request_id = {
let map = in_flight.lock().or(Err(anyhow!("Mutex poisoned")))?;
let request_id = (0u16..u16::MAX)
.into_iter()
.find(|id| !map.contains_key(&id))
.expect("No ID found");
request_id
};
shutdown_request_channel
.0
.store(request_id, Ordering::SeqCst);
shutdown_request_channel.1.store(true, Ordering::SeqCst);
write_message(
&mut write,
&Request {
id: request_id,
body: RequestBody::Shutdown,
},
)?;
write.flush()?;
Ok(())
}
/// Helper to make a request and wait for the result.
fn request(&self, req: RequestBody) -> anyhow::Result<ResponseBody> {
let (sender, receiver) = crossbeam_channel::bounded(0);
self.commands
.send((req, Some(sender)))
.or(Err(anyhow!("Failed to queue request")))?;
Ok(receiver
.recv()
.or(Err(anyhow!("Failed to receive response")))?)
}
}
impl RawPile for Requester {
fn exists(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<bool> {
match self.request(RequestBody::CheckExists {
kind,
key: key.to_vec(),
})? {
ResponseBody::Success => Ok(true),
ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
ResponseBody::NotExists => Ok(false),
other => Err(anyhow!("Received {:?} for Exists", other)),
}
}
fn read(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<Option<Vec<u8>>> {
match self.request(RequestBody::Read {
kind,
key: key.to_vec(),
})? {
ResponseBody::Success => Err(anyhow!("Received Success for read.")),
ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
ResponseBody::NotExists => Ok(None),
ResponseBody::Data(data) => Ok(Some(data)),
other => Err(anyhow!("Received {:?} for Read", other)),
}
}
fn write(&self, kind: Keyspace, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
match self.request(RequestBody::Write {
kind,
key: key.to_vec(),
value: value.to_vec(),
})? {
ResponseBody::Success => Ok(()),
ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
other => Err(anyhow!("Received {:?} for Write", other)),
}
}
fn delete(&self, kind: Keyspace, key: &[u8]) -> anyhow::Result<()> {
match self.request(RequestBody::Delete {
kind,
key: key.to_vec(),
})? {
ResponseBody::Success => Ok(()),
ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
other => Err(anyhow!("Received {:?} for Delete", other)),
}
}
fn delete_many(&self, kind: Keyspace, keys: &[&[u8]]) -> anyhow::Result<()> {
for &key in keys {
self.delete(kind, key)?;
}
Ok(())
}
fn list_keys(
&self,
kind: Keyspace,
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<Vec<u8>>>>> {
match self.request(RequestBody::List { kind })? {
ResponseBody::Success => Ok(Box::new(ListKeyIterator {
command_sender: self.commands.clone(),
batch_token: None,
buffer: Vec::with_capacity(0),
})),
ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
ResponseBody::BatchData { batch, next_token } => Ok(Box::new(ListKeyIterator {
command_sender: self.commands.clone(),
batch_token: Some(next_token),
buffer: batch,
})),
other => Err(anyhow!("Received {:?} for List", other)),
}
}
fn flush(&self) -> anyhow::Result<()> {
match self.request(RequestBody::Flush)? {
ResponseBody::Success => Ok(()),
ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
other => Err(anyhow!("Received {:?} for Flush", other)),
}
}
fn check_lowlevel(&self) -> anyhow::Result<bool> {
match self.request(RequestBody::LowLevelCheck)? {
ResponseBody::Success => Ok(true),
ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
other => Err(anyhow!("Received {:?} for LowLevelCheck", other)),
}
}
fn build_storage_pipeline(
&self,
_settings: StoragePipelineSettings,
_controller_send: Sender<ControllerMessage>,
) -> anyhow::Result<Sender<(ChunkId, Vec<u8>)>> {
// this one is a little bit more complex.
// We want to be able to send off multiple write requests at once, but not too many, so we
// need to be able to apply backpressure.
let (input, receiver) = crossbeam_channel::bounded::<(ChunkId, Vec<u8>)>(128);
let command_sender = self.commands.clone();
register_counter!(
"requester_pipeline_cmds_issued",
Unit::Count,
"Number of write commands issued by the Requester's storage pipeline"
);
register_gauge!(
"requester_pipeline_writes_inflight",
Unit::Count,
"Number of write commands in-flight"
);
std::thread::Builder::new()
.name("ReqStPpln".to_string())
.spawn(move || {
let (response_tx, response_rx) = crossbeam_channel::bounded::<ResponseBody>(32);
let mut in_flight_writes = 0;
const MAX_IN_FLIGHT_WRITES: u32 = 32;
let mut pipeline_still_going = true;
while pipeline_still_going || in_flight_writes > 0 {
gauge!(
"requester_pipeline_writes_inflight",
in_flight_writes as f64
);
// TODO this won't handle channel closure properly.
if in_flight_writes < MAX_IN_FLIGHT_WRITES && pipeline_still_going {
crossbeam_channel::select! {
recv(response_rx) -> resp => {
in_flight_writes -= 1;
match resp.unwrap() {
ResponseBody::Success => {
// nop
}
ResponseBody::Failed(string) => {
panic!("Requester pipeline fail {}", string);
}
other => panic!("wtf {:?}", other),
}
}
recv(receiver) -> resp => {
if let Ok((chunk_id, write)) = resp {
in_flight_writes += 1;
increment_counter!("requester_pipeline_cmds_issued");
command_sender.send((RequestBody::Write {
kind: Keyspace::Chunk,
key: chunk_id.to_vec(),
value: write
}, Some(response_tx.clone()))).unwrap();
} else {
// the input has stopped
pipeline_still_going = false;
}
}
}
} else {
// Either the pipeline is stopping or we are too busy to accept new chunks,
// so only process responses.
let resp = response_rx.recv().unwrap();
in_flight_writes -= 1;
match resp {
ResponseBody::Success => {
// nop
}
ResponseBody::Failed(string) => {
panic!("Requester pipeline fail {}", string);
}
other => panic!("wtf {:?}", other),
}
}
}
})
.unwrap();
Ok(input)
}
fn describe_pipeline(&self) -> anyhow::Result<Vec<PipelineDescription>> {
match self.request(RequestBody::Describe)? {
ResponseBody::Description(mut description) => {
description.push(PipelineDescription::Remote);
Ok(description)
}
ResponseBody::Failed(err_msg) => Err(anyhow!("Remote failure: {}", err_msg)),
other => Err(anyhow!("Received {:?} for Describe", other)),
}
}
fn chunk_id_transfer_ordering_hint(&self, _chunk_id: &ChunkId) -> anyhow::Result<u64> {
bail!("You probably shouldn't be using chunk ID transfer ordering hints with a remote.");
}
}
pub struct ListKeyIterator {
pub(crate) command_sender: Sender<(RequestBody, Option<Sender<ResponseBody>>)>,
pub(crate) batch_token: Option<u16>,
/// Keys buffered from the current batch, kept in reverse order so that
/// `pop()` cheaply yields them in the original order.
pub(crate) buffer: Vec<Vec<u8>>,
}
impl Iterator for ListKeyIterator {
type Item = anyhow::Result<Vec<u8>>;
fn next(&mut self) -> Option<Self::Item> {
if let Some(next) = self.buffer.pop() {
Some(Ok(next))
} else if let Some(batch_token) = self.batch_token {
let (send, recv) = crossbeam_channel::bounded(0);
self.command_sender
.send((RequestBody::NextBatch { token: batch_token }, Some(send)))
.expect("Unable to send");
let resp = recv.recv().expect("Unable to recv");
match resp {
ResponseBody::Success => {
self.batch_token = None;
None
}
ResponseBody::Failed(err_msg) => Some(Err(anyhow!("Remote failure: {}", err_msg))),
ResponseBody::BatchData { batch, next_token } => {
self.batch_token = Some(next_token);
self.buffer = batch;
self.buffer.reverse();
if let Some(next) = self.buffer.pop() {
Some(Ok(next))
} else {
None
}
}
other => Some(Err(anyhow!("Received {:?} for NextBatch", other))),
}
} else {
None
}
}
}
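To tie the above together: a `Requester` is constructed over any `Read`/`Write` pair, and because it implements `RawPile`, ordinary pile calls then travel the pipe. A sketch, where the pipe endpoints are assumed to come from e.g. a spawned `ssh` child:

// Sketch: using a Requester over an arbitrary pipe.
fn example(
    pipe_read: impl Read + Send + 'static,
    pipe_write: impl Write + Send + 'static,
) -> anyhow::Result<()> {
    let (requester, _handles) = Requester::new(pipe_read, pipe_write);
    // Requester implements RawPile, so ordinary pile operations travel the pipe.
    let present = requester.exists(Keyspace::Chunk, b"example-chunk-id")?;
    println!("present: {present}");
    Ok(())
}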


@ -1,374 +0,0 @@
use std::collections::HashMap;
use std::io::{Read, Write};
use std::sync::{Arc, Mutex};
use std::thread;
use std::thread::JoinHandle;
use anyhow::anyhow;
use crossbeam_channel::{Receiver, Sender};
use itertools::Itertools;
use log::{error, info, warn};
use crate::definitions::ChunkId;
use crate::pile::{Keyspace, RawPile};
use crate::progress::ProgressTracker;
use crate::remote::{read_message, write_message, Request, RequestBody, Response, ResponseBody};
#[derive(Clone)]
pub struct ResponderWritingPipeline {
pub pipeline_submission: Sender<(ChunkId, Vec<u8>)>,
}
#[derive(Clone)]
/// A wrapper for a RawPile which allows a `Requester` to access it over a pipe (e.g. TCP socket or
/// an SSH connection).
pub struct Responder {
continuation_tokens: Arc<Mutex<HashMap<u16, Sender<u16>>>>,
writing_pipeline: Option<ResponderWritingPipeline>,
}
impl Responder {
/// Start a 'responder' (command processor).
pub fn start<
RP: RawPile + 'static,
R: Read + Send + 'static,
W: Write + Send + 'static,
PT: ProgressTracker + Send + 'static,
>(
read: R,
write: W,
num_workers: u16,
pile: Arc<RP>,
writing_pipeline: Option<ResponderWritingPipeline>,
mut progress_bar: PT,
) -> (JoinHandle<R>, JoinHandle<W>, Vec<JoinHandle<()>>) {
let mut handles = Vec::new();
let (work_queue_send, work_queue_recv) = crossbeam_channel::bounded::<Request>(16);
let (resp_send, resp_recv) = crossbeam_channel::bounded::<Response>(4);
let responder = Responder {
continuation_tokens: Arc::new(Mutex::new(Default::default())),
writing_pipeline,
};
let r_handle = {
// spawn the reader
let work_queue_send = work_queue_send.clone();
let responder = responder.clone();
thread::Builder::new()
.name("RespdrReader".to_string())
.spawn(move || {
let mut read = read;
if let Err(e) = responder.reader(&mut read, work_queue_send, &mut progress_bar)
{
error!("reader failed: {:?}", e);
}
read
})
.unwrap()
};
let w_handle = {
// spawn the writer
let resp_recv = resp_recv.clone();
let responder = responder.clone();
thread::Builder::new()
.name("RespdrWriter".to_string())
.spawn(move || {
let mut write = write;
if let Err(e) = responder.writer(&mut write, resp_recv) {
error!("writer failed: {:?}", e);
}
write
})
.unwrap()
};
for worker_num in 0..num_workers {
// spawn a worker
let responder = responder.clone();
let work_queue_recv = work_queue_recv.clone();
let resp_send = resp_send.clone();
let pile = pile.clone();
handles.push(
thread::Builder::new()
.name("RespdrWorker".to_string())
.spawn(move || {
if let Err(e) = responder.worker(pile.as_ref(), work_queue_recv, resp_send)
{
error!("worker {} failed: {:?}", worker_num, e);
}
})
.unwrap(),
);
}
(r_handle, w_handle, handles)
}
/// Thread that reads messages and sends them along.
fn reader<R: Read + Send + 'static, PT: ProgressTracker>(
&self,
read: &mut R,
worker_queue_send: Sender<Request>,
progress_tracker: &mut PT,
) -> anyhow::Result<()> {
loop {
let request: Request = read_message(read)?;
match request.body {
RequestBody::Shutdown => {
worker_queue_send.send(request)?;
info!("Shutting down responder");
return Ok(());
}
RequestBody::NextBatch { token } => {
let tokens = self
.continuation_tokens
.lock()
.or(Err(anyhow!("Mutex poisoned")))?;
tokens
.get(&token)
.ok_or(anyhow!("Could not find that token."))?
.send(request.id)
.or(Err(anyhow!(
"Failed to send continuation token to continuer"
)))?;
}
RequestBody::Progress { current, max } => {
progress_tracker.set_max_size(max);
progress_tracker.set_current(current);
}
_ => {
worker_queue_send.send(request)?;
}
}
}
}
/// Thread that writes messages.
fn writer<W: Write + Send>(
&self,
mut write: W,
responses: Receiver<Response>,
) -> anyhow::Result<()> {
while let Ok(response) = responses.recv() {
write_message(&mut write, &response)?;
write.flush()?;
}
Ok(())
}
/// Thread which performs the actual work using the pile.
fn worker<RP: RawPile>(
&self,
pile: &RP,
requests: Receiver<Request>,
responses: Sender<Response>,
) -> anyhow::Result<()> {
while let Ok(request) = requests.recv() {
let response = match request.body {
RequestBody::Read { kind, key } => match pile.read(kind, &key) {
Ok(Some(data)) => Response {
response_to: request.id,
body: ResponseBody::Data(data),
},
Ok(None) => Response {
response_to: request.id,
body: ResponseBody::NotExists,
},
Err(err) => {
warn!("Error whilst doing a raw read: {:?}", err);
let err = format!("{:?}", err);
Response {
response_to: request.id,
body: ResponseBody::Failed(err),
}
}
},
RequestBody::CheckExists { kind, key } => match pile.exists(kind, &key) {
Ok(true) => Response {
response_to: request.id,
body: ResponseBody::Success,
},
Ok(false) => Response {
response_to: request.id,
body: ResponseBody::NotExists,
},
Err(err) => {
warn!("Error whilst doing a raw exists: {:?}", err);
let err = format!("{:?}", err);
Response {
response_to: request.id,
body: ResponseBody::Failed(err),
}
}
},
RequestBody::Write { kind, key, value } => {
if let Some(writing_pipeline) = self
.writing_pipeline
.as_ref()
.filter(|_| kind == Keyspace::Chunk)
{
let mut chunk_id = ChunkId::default();
chunk_id.copy_from_slice(&key[..]);
writing_pipeline
.pipeline_submission
.send((chunk_id, value))?;
// We lie and say it was successful once we submit.
// We'll complain on our side if anything goes wrong, anyway.
Response {
response_to: request.id,
body: ResponseBody::Success,
}
} else {
match pile.write(kind, &key, &value) {
Ok(_) => Response {
response_to: request.id,
body: ResponseBody::Success,
},
Err(err) => {
warn!("Error whilst doing a raw write: {:?}", err);
let err = format!("{:?}", err);
Response {
response_to: request.id,
body: ResponseBody::Failed(err),
}
}
}
}
}
RequestBody::Delete { kind, key } => match pile.delete(kind, &key) {
Ok(_) => Response {
response_to: request.id,
body: ResponseBody::Success,
},
Err(err) => {
warn!("Error whilst doing a raw delete: {:?}", err);
let err = format!("{:?}", err);
Response {
response_to: request.id,
body: ResponseBody::Failed(err),
}
}
},
RequestBody::List { kind } => match pile.list_keys(kind) {
Ok(iterator) => {
let mut resp_id = request.id;
let (cont_sender, cont_receiver) = crossbeam_channel::bounded(1);
let batch_token = {
let mut map = self
.continuation_tokens
.lock()
.or(Err(anyhow!("Mutex poisoned")))?;
let batch_token = (0u16..u16::MAX)
.into_iter()
.find(|id| !map.contains_key(&id))
.expect("No ID found");
map.insert(batch_token, cont_sender);
batch_token
};
for chunk in &iterator.chunks(32) {
let mut results = Vec::with_capacity(32);
for result in chunk {
results.push(result?);
}
responses
.send(Response {
response_to: resp_id,
body: ResponseBody::BatchData {
batch: results,
next_token: batch_token,
},
})
.or(Err(anyhow!("Failed to queue response")))?;
resp_id = cont_receiver
.recv()
.or(Err(anyhow!("Failed to receive continuation response ID")))?;
}
let mut map = self
.continuation_tokens
.lock()
.or(Err(anyhow!("Mutex poisoned")))?;
map.remove(&batch_token);
Response {
response_to: resp_id,
body: ResponseBody::Success,
}
}
Err(err) => {
warn!("Error whilst doing a raw list_keys: {:?}", err);
let err = format!("{:?}", err);
Response {
response_to: request.id,
body: ResponseBody::Failed(err),
}
}
},
RequestBody::NextBatch { .. } => {
unreachable!("because this is handled by the reader")
}
RequestBody::Flush => match pile.flush() {
Ok(_) => Response {
response_to: request.id,
body: ResponseBody::Success,
},
Err(err) => {
warn!("Error whilst doing a raw flush: {:?}", err);
let err = format!("{:?}", err);
Response {
response_to: request.id,
body: ResponseBody::Failed(err),
}
}
},
RequestBody::LowLevelCheck => match pile.check_lowlevel() {
Ok(_) => Response {
response_to: request.id,
body: ResponseBody::Success,
},
Err(err) => {
warn!("Error whilst doing a raw check_lowlevel: {:?}", err);
let err = format!("{:?}", err);
Response {
response_to: request.id,
body: ResponseBody::Failed(err),
}
}
},
RequestBody::Shutdown => Response {
response_to: request.id,
body: ResponseBody::Success,
},
RequestBody::Progress { .. } => {
unreachable!("handled by readea")
}
RequestBody::Describe => match pile.describe_pipeline() {
Ok(description) => Response {
response_to: request.id,
body: ResponseBody::Description(description),
},
Err(err) => {
warn!("Error whilst doing a raw describe_pipeline: {:?}", err);
let err = format!("{:?}", err);
Response {
response_to: request.id,
body: ResponseBody::Failed(err),
}
}
},
};
responses
.send(response)
.or(Err(anyhow!("Failed to queue response")))?;
}
Ok(())
}
}
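The matching remote end wraps an opened pile and serves stdio. A sketch, assuming `pile` is already opened, with no writing pipeline and the no-op `()` progress tracker:

// Sketch: serving an opened RawPile over stdio, as the remote end of a Requester.
fn serve<RP: RawPile + 'static>(pile: Arc<RP>) -> anyhow::Result<()> {
    let (reader, writer, workers) = Responder::start(
        std::io::stdin(),
        std::io::stdout(),
        4,    // worker threads
        pile,
        None, // no ResponderWritingPipeline
        (),   // the no-op ProgressTracker
    );
    for worker in workers {
        worker.join().map_err(|_| anyhow!("worker panicked"))?;
    }
    let _ = reader.join();
    let _ = writer.join();
    Ok(())
}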

yama/src/retriever.rs (new file, 413 lines)

@ -0,0 +1,413 @@
// TODO The retriever should possibly live somewhere else
use crate::pile_with_cache::PileWithCache;
use eyre::{bail, ensure, eyre, ContextCompat};
use flume::{Receiver, Sender};
use std::collections::{BTreeMap, BTreeSet};
use std::pin::Pin;
use std::sync::Arc;
use tracing::error;
use yama_pile::bloblogs::BloblogReader;
use yama_pile::definitions::BloblogId;
use yama_wormfile::boxed::BoxedWormFileProvider;
use yama_wormfile::WormFileReader;
pub mod decompressor;
#[derive(Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq)]
#[repr(transparent)]
pub struct JobId(pub u32);
#[derive(Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq)]
#[repr(transparent)]
pub struct FileId(pub u32);
#[derive(Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq)]
pub struct JobChunkReq {
pub file: FileId,
pub offset: u64,
pub length: u64,
}
#[derive(Clone, Debug)]
pub enum RetrieverResp {
Blob {
job: JobId,
subjob: u32,
blob: Vec<u8>,
},
JobComplete(JobId),
}
#[derive(Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq)]
struct FileRegionMarker {
pub file: FileId,
pub offset: u64,
pub length: u64,
pub job: JobId,
pub subjob: u32,
}
#[derive(Debug)]
struct OpenFileState {
pub req_tx: Sender<OpenFileReq>,
pub offset: u64,
}
#[derive(Debug)]
struct OpenFileReq {
pub offset: u64,
pub length: u64,
pub job: JobId,
pub subjob: u32,
}
#[derive(Debug)]
struct ActiveJobState {
pub subjobs: Vec<JobChunkReq>,
pub next_subjob: u32,
pub inflight: u32,
}
struct RetrieverInternals {
pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
jobs_queue: BTreeMap<JobId, Vec<JobChunkReq>>,
file_regions: BTreeSet<FileRegionMarker>,
files: BTreeMap<FileId, BloblogId>,
open_files: BTreeMap<FileId, OpenFileState>,
results_tx: Sender<RetrieverResp>,
active_jobs: BTreeMap<JobId, ActiveJobState>,
ack_rx: Receiver<JobId>,
self_ack_tx: Sender<JobId>,
rec_active_jobs: u16,
}
pub fn create_fixed_retriever(
pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
jobs: BTreeMap<JobId, Vec<JobChunkReq>>,
files: BTreeMap<FileId, BloblogId>,
rec_active_jobs: u16,
) -> eyre::Result<Receiver<RetrieverResp>> {
let (results_tx, results_rx) = flume::bounded(4);
let (self_ack_tx, ack_rx) = flume::bounded(4);
let mut rint = RetrieverInternals {
pwc,
jobs_queue: Default::default(),
file_regions: Default::default(),
files,
open_files: Default::default(),
results_tx,
active_jobs: Default::default(),
ack_rx,
self_ack_tx,
rec_active_jobs,
};
for (job_id, job) in jobs {
rint.set_up_job(job_id, job);
}
tokio::spawn(async_backtrace::frame!(async move {
if let Err(e) = rint.retrieval_task().await {
error!("retriever failed: {e:?}");
}
}));
Ok(results_rx)
}
impl RetrieverInternals {
fn set_up_job(&mut self, job_id: JobId, job: Vec<JobChunkReq>) {
for (subjob, chunk) in job.iter().enumerate() {
self.file_regions.insert(FileRegionMarker {
file: chunk.file,
offset: chunk.offset,
length: chunk.length,
job: job_id,
subjob: subjob as u32,
});
}
self.jobs_queue.insert(job_id, job);
// eprintln!("new job {job_id:?}");
}
async fn file_request(
open_file: &mut OpenFileState,
job: JobId,
subjob: u32,
offset: u64,
length: u64,
) -> eyre::Result<()> {
// debug!("sched {job:?}->{subjob:?}");
open_file
.req_tx
.send_async(OpenFileReq {
offset,
length,
job,
subjob,
})
.await
.map_err(|_| eyre!("open file shut down :/"))?;
open_file.offset = offset + length;
Ok(())
}
async fn open_file(&mut self, file_id: FileId) -> eyre::Result<()> {
assert!(!self.open_files.contains_key(&file_id));
let &bloblog_id = self.files.get(&file_id).context("no file by that ID")?;
let bloblog_reader = self.pwc.pile.read_bloblog(bloblog_id).await?;
let completion_tx = self.results_tx.clone();
let ack_tx = self.self_ack_tx.clone();
let (subjob_tx, subjob_rx) = flume::unbounded();
tokio::spawn(async_backtrace::frame!(async move {
if let Err(e) =
Self::reader_task(bloblog_reader, subjob_rx, ack_tx, completion_tx).await
{
error!("error in reader for {bloblog_id:?}: {e:?}");
}
}));
self.open_files.insert(
file_id,
OpenFileState {
req_tx: subjob_tx,
offset: 0,
},
);
Ok(())
}
async fn reader_task(
mut bloblog_reader: BloblogReader<Pin<Box<dyn WormFileReader>>>,
subjob_rx: Receiver<OpenFileReq>,
ack_tx: Sender<JobId>,
completion_tx: Sender<RetrieverResp>,
) -> eyre::Result<()> {
while let Ok(next_job) = subjob_rx.recv_async().await {
let mut blob = Vec::with_capacity(next_job.length as usize);
bloblog_reader
.read_to_buf(&mut blob, next_job.offset, next_job.length)
.await?;
completion_tx
.send_async(RetrieverResp::Blob {
job: next_job.job,
subjob: next_job.subjob,
blob,
})
.await
.expect("completions shut");
// debug!("read,acking! {:?}", next_job);
ack_tx.send_async(next_job.job).await?;
}
Ok(())
}
async fn retrieval_task(&mut self) -> eyre::Result<()> {
// let mut icount = 0u64;
loop {
// icount += 1;
// debug!("[{icount}] active jobs {:#?}", self.active_jobs);
// debug!("[{icount}] open files {:#?}", self.open_files);
// 0. Try to progress open jobs if they are staring right at the bytes they need...
let mut to_remove = Vec::new();
for (active_job_id, active_job) in &mut self.active_jobs {
if active_job.inflight > 0 {
// skip if it's busy, we don't want to send blobs out of order...
continue;
}
if active_job.next_subjob as usize >= active_job.subjobs.len() {
// this job is to be finished!
to_remove.push(*active_job_id);
continue;
}
// Which file we are 'staring at' and requesting a run of chunks from
let mut stare_file = None;
'single_job_staring: loop {
let desired_blob = &active_job.subjobs[active_job.next_subjob as usize];
if stare_file.is_some() && stare_file != Some(desired_blob.file) {
// We have changed which file we are looking at, we can't request any further
// because they might get retrieved out of order.
break 'single_job_staring;
}
if let Some(open_file) = self.open_files.get_mut(&desired_blob.file) {
stare_file = Some(desired_blob.file);
if open_file.offset == desired_blob.offset {
Self::file_request(
open_file,
*active_job_id,
active_job.next_subjob,
desired_blob.offset,
desired_blob.length,
)
.await?;
ensure!(
self.file_regions.remove(&FileRegionMarker {
file: desired_blob.file,
offset: desired_blob.offset,
length: desired_blob.length,
job: *active_job_id,
subjob: active_job.next_subjob,
}),
"no FRM to remove (0)"
);
active_job.next_subjob += 1;
active_job.inflight += 1;
if active_job.next_subjob as usize >= active_job.subjobs.len() {
// this job is to be finished!
break 'single_job_staring;
}
} else {
break 'single_job_staring;
}
} else {
break 'single_job_staring;
}
}
}
for remove in to_remove {
self.active_jobs.remove(&remove);
// eprintln!("job complete {remove:?}");
self.results_tx
.send_async(RetrieverResp::JobComplete(remove))
.await
.map_err(|_| eyre!("results_tx shutdown"))?;
}
// 1. Try to make the most of open files by opening new jobs in convenient locations.
// Basically: if we have slots for new active jobs, then look to see if we have any
// jobs that begin at the offset in question...
if self.active_jobs.len() < self.rec_active_jobs as usize {
let mut allowed = self.rec_active_jobs as usize - self.active_jobs.len();
let mut progress = false;
for (open_file_id, open_file_state) in &self.open_files {
for region in self.file_regions.range(
FileRegionMarker {
file: *open_file_id,
offset: open_file_state.offset,
length: 0,
job: JobId(0),
subjob: 0,
}..FileRegionMarker {
file: *open_file_id,
offset: open_file_state.offset + 1,
length: 0,
job: JobId(0),
subjob: 0,
},
) {
if region.subjob != 0 {
// only accept this region if it's the start of a job
continue;
}
if let Some(subjobs) = self.jobs_queue.remove(&region.job) {
self.active_jobs.insert(
region.job,
ActiveJobState {
subjobs,
next_subjob: 0,
inflight: 0,
},
);
allowed -= 1;
progress = true;
break;
}
}
if allowed == 0 {
break;
}
}
if progress {
continue;
}
}
// 2. Try to progress active jobs, even if we have to open new files or seek.
let mut files_to_open = BTreeSet::new();
for (active_job_id, active_job) in &mut self.active_jobs {
if active_job.inflight > 0 {
// skip if it's busy, we don't want to send blobs out of order...
continue;
}
let desired_blob = &active_job.subjobs[active_job.next_subjob as usize];
if let Some(open_file) = self.open_files.get_mut(&desired_blob.file) {
Self::file_request(
open_file,
*active_job_id,
active_job.next_subjob,
desired_blob.offset,
desired_blob.length,
)
.await?;
ensure!(
self.file_regions.remove(&FileRegionMarker {
file: desired_blob.file,
offset: desired_blob.offset,
length: desired_blob.length,
job: *active_job_id,
subjob: active_job.next_subjob,
}),
"no FRM to remove (0)"
);
active_job.next_subjob += 1;
active_job.inflight += 1;
} else {
// can't open immediately here due to mut borrow.
files_to_open.insert(desired_blob.file);
}
}
if !files_to_open.is_empty() {
for file in files_to_open {
self.open_file(file).await?;
}
continue;
}
// 3. Start new jobs
if self.active_jobs.len() < self.rec_active_jobs as usize {
// spawn a new job...
if let Some(activate_job_id) = self.jobs_queue.keys().next().cloned() {
let new_job = self.jobs_queue.remove(&activate_job_id).unwrap();
self.active_jobs.insert(
activate_job_id,
ActiveJobState {
subjobs: new_job,
next_subjob: 0,
inflight: 0,
},
);
continue;
}
}
// 4. Block for acks, unless there are no jobs in which case we should just finish!
if self.active_jobs.is_empty() {
break;
}
if let Ok(ack) = self.ack_rx.recv_async().await {
if let Some(job) = self.active_jobs.get_mut(&ack) {
ensure!(job.inflight > 0, "recv'd ack for job that has 0 inflight");
job.inflight -= 1;
} else {
bail!("recv'd ack for bad job {ack:?}");
}
}
}
Ok(())
}
}
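Callers hand the retriever a table of jobs (each an ordered list of file regions) plus a file-to-bloblog map, then drain the result channel: blobs for a given job arrive in subjob order, followed by `JobComplete`. A consumption sketch, where `pwc` and `bloblog_id` are assumed to exist already:

// Sketch: driving create_fixed_retriever and draining its results.
async fn example(
    pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
    bloblog_id: BloblogId,
) -> eyre::Result<()> {
    let mut files = BTreeMap::new();
    files.insert(FileId(0), bloblog_id);
    let mut jobs = BTreeMap::new();
    jobs.insert(
        JobId(0),
        vec![JobChunkReq { file: FileId(0), offset: 0, length: 4096 }],
    );
    let results = create_fixed_retriever(pwc, jobs, files, 8)?;
    while let Ok(resp) = results.recv_async().await {
        match resp {
            RetrieverResp::Blob { job, subjob, blob } => {
                println!("job {job:?} subjob {subjob}: {} bytes", blob.len());
            }
            RetrieverResp::JobComplete(job) => println!("done: {job:?}"),
        }
    }
    Ok(())
}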


@ -0,0 +1,213 @@
use crate::retriever::{JobId, RetrieverResp};
use eyre::{bail, ensure, eyre, Context, ContextCompat};
use flume::{Receiver, Sender};
use std::collections::BTreeMap;
use std::sync::Arc;
use tracing::error;
use zstd::bulk::Decompressor;
pub const DECOMPRESS_CAPACITY: usize = 32 * 1024 * 1024;
pub struct PipelineDecompressor {
rx: Receiver<RetrieverResp>,
tx: Sender<RetrieverResp>,
job_pool_tx: Sender<(JobId, u32, Vec<u8>)>,
complete_rx: Receiver<(JobId, u32, Vec<u8>)>,
processing: BTreeMap<JobId, JobState>,
}
struct JobState {
pub next_submit_subjob: u32,
pub next_enqueue_subjob: u32,
pub queued: BTreeMap<u32, Vec<u8>>,
pub complete: bool,
}
impl PipelineDecompressor {
pub fn start(
decom_dict: Option<Arc<Vec<u8>>>,
num_decom: u8,
rx: Receiver<RetrieverResp>,
) -> eyre::Result<Receiver<RetrieverResp>> {
let (out_tx, out_rx) = flume::bounded(4);
let (job_pool_tx, job_pool_rx) = flume::bounded(0);
let (complete_tx, complete_rx) = flume::unbounded();
for num in 0..num_decom {
let decom_dict = decom_dict.clone();
let job_pool_rx = job_pool_rx.clone();
let complete_tx = complete_tx.clone();
std::thread::Builder::new()
.name(format!("decomp {num}"))
.spawn(move || {
if let Err(err) =
Self::decompressor_worker(decom_dict, job_pool_rx, complete_tx)
{
error!("error in decompressor worker: {err:?}");
}
})?;
}
let mut pd = PipelineDecompressor {
rx,
tx: out_tx,
job_pool_tx,
complete_rx,
processing: Default::default(),
};
tokio::spawn(async_backtrace::frame!(async move {
if let Err(e) = pd.decompressor_manager().await {
eprintln!("pipeline decompressor error: {e:?}");
}
}));
Ok(out_rx)
}
fn decompressor_worker(
decom_dict: Option<Arc<Vec<u8>>>,
job_pool_rx: Receiver<(JobId, u32, Vec<u8>)>,
complete_tx: Sender<(JobId, u32, Vec<u8>)>,
) -> eyre::Result<()> {
let mut decompressor = match decom_dict {
Some(dict) => Decompressor::with_dictionary(&dict)?,
None => Decompressor::new()?,
};
while let Ok((job_id, subjob, compressed_bytes)) = job_pool_rx.recv() {
let decompressed_bytes = decompressor
.decompress(&compressed_bytes, DECOMPRESS_CAPACITY)
.context("failed to decompress")?;
complete_tx
.send((job_id, subjob, decompressed_bytes))
.map_err(|_| eyre!("complete_tx shutdown"))?;
}
Ok(())
}
async fn decompressor_manager(&mut self) -> eyre::Result<()> {
let mut incoming_open = true;
loop {
// Always process completed jobs as top priority
while let Ok(completion) = self.complete_rx.try_recv() {
self.handle_completion(completion).await?;
}
// Then it doesn't matter so much what we process after that
tokio::select! {
Ok(completion) = self.complete_rx.recv_async(), if !self.processing.is_empty() => {
self.handle_completion(completion).await?;
},
incoming_res = self.rx.recv_async(), if incoming_open => {
if let Ok(incoming) = incoming_res {
self.handle_incoming(incoming).await?;
} else {
incoming_open = false;
}
}
else => {
if !self.processing.is_empty() {
bail!("decompressor still procesing but shutting down?");
}
// eprintln!("D shutdown");
break Ok(());
}
};
}
}
async fn handle_completion(
&mut self,
(job_id, subjob, decompressed): (JobId, u32, Vec<u8>),
) -> eyre::Result<()> {
let state = self
.processing
.get_mut(&job_id)
.context("bad job when recv complete decomp")?;
ensure!(
state.queued.insert(subjob, decompressed).is_none(),
"overwrote decompressed block??"
);
while let Some(send_off) = state.queued.remove(&state.next_submit_subjob) {
// eprintln!("D send off {job_id:?} {subjob}");
self.tx
.send(RetrieverResp::Blob {
job: job_id,
subjob: state.next_submit_subjob,
blob: send_off,
})
.map_err(|_| eyre!("tx shutdown"))?;
state.next_submit_subjob += 1;
}
if state.queued.is_empty()
&& state.complete
&& state.next_submit_subjob == state.next_enqueue_subjob
{
// This job is done now
// eprintln!("D jc {job_id:?}");
self.tx
.send(RetrieverResp::JobComplete(job_id))
.map_err(|_| eyre!("tx shutdown"))?;
self.processing.remove(&job_id);
}
Ok(())
}
async fn handle_incoming(&mut self, incoming: RetrieverResp) -> eyre::Result<()> {
match incoming {
RetrieverResp::Blob { job, subjob, blob } => {
if subjob == 0 {
ensure!(
self.processing
.insert(
job,
JobState {
next_submit_subjob: 0,
next_enqueue_subjob: 0,
queued: Default::default(),
complete: false,
}
)
.is_none(),
"job was overwritten"
);
}
// debug!("blob {job:?} {subjob:?}");
let state = self.processing.get_mut(&job).with_context(|| {
format!("bad job/not starting at 0 for job {job:?} (subjob={subjob:?})")
})?;
ensure!(
state.next_enqueue_subjob == subjob,
"out of order Blob commands"
);
state.next_enqueue_subjob += 1;
self.job_pool_tx
.send_async((job, subjob, blob))
.await
.map_err(|_| eyre!("job_pool_tx shutdown"))?;
}
RetrieverResp::JobComplete(job) => {
let state = self
.processing
.get_mut(&job)
.context("bad job to complete")?;
state.complete = true;
// debug!("complete {job:?}");
let can_remove = state.next_submit_subjob == state.next_enqueue_subjob;
if can_remove {
self.tx
.send(RetrieverResp::JobComplete(job))
.map_err(|_| eyre!("tx shutdown"))?;
self.processing.remove(&job);
}
}
}
Ok(())
}
}
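Because the decompressor re-emits `Blob`/`JobComplete` in the promised order, it can be dropped transparently between the retriever and its consumer. A sketch of the chaining:

// Sketch: chaining the retriever into the decompressor stage.
// `raw_rx` is assumed to be the Receiver<RetrieverResp> from create_fixed_retriever.
fn chain(
    decom_dict: Option<Arc<Vec<u8>>>,
    raw_rx: Receiver<RetrieverResp>,
) -> eyre::Result<Receiver<RetrieverResp>> {
    // Four decompression threads; downstream consumers are unchanged because
    // this stage preserves the per-job subjob ordering.
    PipelineDecompressor::start(decom_dict, 4, raw_rx)
}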

yama/src/scan.rs (new file, 466 lines)

@ -0,0 +1,466 @@
use eyre::{bail, eyre, Context, ContextCompat};
use ignore::overrides::OverrideBuilder;
use ignore::WalkBuilder;
use patricia_tree::PatriciaMap;
use std::cmp::max;
use std::collections::{BTreeMap, BTreeSet};
use std::fs::{read_link, Metadata};
use std::io::ErrorKind;
use std::os::unix::fs::MetadataExt;
use std::path::{Component, Path};
use tracing::warn;
use yama_pile::definitions::RecursiveChunkRef;
use yama_pile::tree::unpopulated::ScanEntry;
use yama_pile::tree::{mtime_msec, FilesystemOwnership, FilesystemPermissions, TreeNode};
/// Given a node, recursively constructs a UID and GID lookup table based on THIS system's
/// users and groups.
///
/// Returns UIDs then GIDs.
pub fn create_uidgid_lookup_tables(
node: &TreeNode,
) -> eyre::Result<(BTreeMap<u16, String>, BTreeMap<u16, String>)> {
let mut uids = BTreeMap::<u16, String>::new();
let mut gids = BTreeMap::<u16, String>::new();
let mut used_uids = BTreeSet::new();
let mut used_gids = BTreeSet::new();
find_used_uidsgids(&node, &mut used_uids, &mut used_gids);
for uid in used_uids {
if let Some(user) = users::get_user_by_uid(uid.into()) {
uids.insert(
uid,
user.name()
.to_str()
.ok_or(eyre!("uid leads to non-String name"))?
.to_owned(),
);
}
}
for gid in used_gids {
if let Some(group) = users::get_group_by_gid(gid.into()) {
gids.insert(
gid,
group
.name()
.to_str()
.ok_or(eyre!("gid leads to non-String name"))?
.to_owned(),
);
}
}
Ok((uids, gids))
}
fn find_used_uidsgids(node: &TreeNode, uids: &mut BTreeSet<u16>, gids: &mut BTreeSet<u16>) {
match &node {
TreeNode::NormalFile { ownership, .. }
| TreeNode::Directory { ownership, .. }
| TreeNode::SymbolicLink { ownership, .. } => {
uids.insert(ownership.uid);
gids.insert(ownership.gid);
}
TreeNode::Deleted => { /* nop */ }
};
if let TreeNode::Directory { children, .. } = &node {
for (_name, child) in children {
find_used_uidsgids(child, uids, gids);
}
}
}
/// Calculates the relative path.
///
/// Returns empty string if the paths are the same, otherwise it's a /-separated string.
/// The returned string is not allowed to contain any . or .. components.
pub fn relative_path(base: &Path, leaf: &Path) -> Option<String> {
assert_eq!(std::path::MAIN_SEPARATOR, '/');
let relative = leaf.strip_prefix(base).ok()?;
if relative
.components()
.any(|c| c == Component::CurDir || c == Component::ParentDir || c == Component::RootDir)
{
return None;
}
relative.to_str().map(|s| s.to_owned())
}
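A few illustrative cases of the contract above (a sketch, not a test from the tree):

// Sketch: expected relative_path behaviour.
fn relative_path_examples() {
    use std::path::Path;
    // Leaf under base: /-separated relative string.
    assert_eq!(
        relative_path(Path::new("/data"), Path::new("/data/a/b")),
        Some("a/b".to_owned())
    );
    // Same path: empty string.
    assert_eq!(
        relative_path(Path::new("/data"), Path::new("/data")),
        Some(String::new())
    );
    // Not under the base: None.
    assert_eq!(relative_path(Path::new("/data"), Path::new("/etc")), None);
}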
/// Scans a directory tree.
///
/// Aborts if any errors (permissions, bad .yamaignore files, etc.) are encountered.
/// In the future, we may want to consider allowing some of these errors to be
/// tolerated (e.g. skipping unreadable entries with a warning) instead of aborting.
pub fn scan(root: &Path, ignores: &Vec<String>) -> eyre::Result<PatriciaMap<ScanEntry>> {
let mut entries: PatriciaMap<ScanEntry> = PatriciaMap::new();
if !root.is_dir() {
let metadata = std::fs::symlink_metadata(root).context("reading metadata of root")?;
entries.insert(
"",
scan_one_no_recurse(root, metadata)
.context("failed to generate scan entry for root")?
.context("root probably doesn't exist, or is ignored?")?,
);
return Ok(entries);
}
let mut walker = WalkBuilder::new(root);
walker
.standard_filters(false)
.add_custom_ignore_filename(".yamaignore")
.parents(false)
.follow_links(false)
.same_file_system(true);
let mut overrides = OverrideBuilder::new(root);
for ign in ignores {
overrides
.add(&("!".to_owned() + ign))
.with_context(|| format!("failed to add ignore rule: {ign:?}"))?;
}
walker.overrides(
overrides
.build()
.context("failed to create overrides with ignore entries")?,
);
// TODO(performance): We could use `WalkParallel` if we restructure this code.
let walker = walker.build();
for entry in walker {
let entry = entry?;
if !entry.path().starts_with(root) {
bail!(
"Scanned entry {:?} does not start with search path {:?}",
entry.path(),
root
);
}
let rel_path = if let Some(rel_path) = relative_path(root, entry.path()) {
rel_path
} else {
continue;
};
if !rel_path.is_empty() {
let parent_relpath = rel_path
.rsplit_once('/')
.map(|(parent, _child)| parent)
.unwrap_or("");
assert!(
entries.contains_key(parent_relpath),
"have not scanned parent for {}",
rel_path
);
}
if let Some(single_scan) = scan_one_no_recurse(
entry.path(),
entry
.metadata()
.with_context(|| format!("Failed to read metadata for {:?}", rel_path))?,
)
.with_context(|| format!("Failed to scan {:?}", rel_path))?
{
entries.insert(&rel_path, single_scan);
}
}
Ok(entries)
}
fn scan_one_no_recurse(path: &Path, metadata: Metadata) -> eyre::Result<Option<ScanEntry>> {
let filetype = metadata.file_type();
let ownership = FilesystemOwnership {
uid: metadata.uid() as u16,
gid: metadata.gid() as u16,
};
let permissions = FilesystemPermissions {
mode: metadata.mode(),
};
if filetype.is_file() {
// Leave an unpopulated file node. It's not my responsibility to chunk it right now.
Ok(Some(ScanEntry::NormalFile {
mtime: mtime_msec(&metadata),
ownership,
permissions,
size: metadata.size(),
}))
} else if filetype.is_dir() {
// TODO(performance): this call is very likely wasteful
let dir_read = path.read_dir();
if let Err(e) = &dir_read {
match e.kind() {
ErrorKind::NotFound => {
warn!("vanished/: {:?}", path);
return Ok(None);
}
ErrorKind::PermissionDenied => {
warn!("permission denied/: {:?}", path);
return Ok(None);
}
_ => { /* nop */ }
}
}
Ok(Some(ScanEntry::Directory {
ownership,
permissions,
}))
} else if filetype.is_symlink() {
let target = read_link(path)?
.to_str()
.ok_or(eyre!("target path cannot be to_str()d"))?
.to_owned();
Ok(Some(ScanEntry::SymbolicLink { ownership, target }))
} else {
Ok(None)
}
}
/// Given the parent pointer's root TreeNode and a scan entry map of the current pointer,
/// return a chunkings map prepopulated with the reusable entries.
/// Also returns a pruned and prepopulated copy of the scan entry map.
pub fn prepopulate_unmodified(
parent_tree: &TreeNode,
scan_entry_map: &PatriciaMap<ScanEntry>,
) -> (
PatriciaMap<Option<(RecursiveChunkRef, u64)>>,
PatriciaMap<ScanEntry>,
PatriciaMap<ScanEntry>,
) {
let mut reusable_chunkings = PatriciaMap::new();
let mut prepopulated_scan_entry_map = PatriciaMap::new();
let mut pruned_scan_entry_map = scan_entry_map.clone();
parent_tree
.visit(
&mut |tree_node, path| {
// TODO We should consider prepopulating symlinks and empty dirs too, if they're
// included in the parent.
if let TreeNode::NormalFile {
mtime: prev_mtime,
ownership: prev_ownership,
permissions: prev_permissions,
size: prev_size,
content: prev_content,
} = tree_node
{
if let Some(ScanEntry::NormalFile {
mtime,
ownership,
permissions,
size,
}) = scan_entry_map.get(path)
{
if mtime == prev_mtime
&& size == prev_size
&& ownership == prev_ownership
&& prev_permissions == permissions
{
// Nothing seems to have changed about this file, let's just reuse the `content` from last time.
reusable_chunkings.insert(path, Some((*prev_content, *size)));
prepopulated_scan_entry_map.insert(
path,
pruned_scan_entry_map.remove(path).expect("checked removal"),
);
// Pull out parent directories so our subset always contains the parents for their children.
for path_fragment in iterate_dirs_upwards(path.as_bytes()) {
if let Some(directory) = pruned_scan_entry_map.remove(path_fragment)
{
prepopulated_scan_entry_map.insert(path_fragment, directory);
}
}
}
}
}
Ok(())
},
String::new(),
)
.expect("no reason to fail");
(
reusable_chunkings,
pruned_scan_entry_map,
prepopulated_scan_entry_map,
)
}
pub fn integrate_uid_or_gid_map(new: &mut BTreeMap<u16, String>, old: &BTreeMap<u16, String>) {
for (old_uid, old_user) in old {
new.entry(*old_uid).or_insert_with(|| old_user.clone());
}
}
/// Given a scan entry map, creates another one whose size is limited to not containing more than
/// the given number of file bytes to be stored.
/// There is one exception: if an individual file exceeds the max size, the returned scan entry map
/// may contain just that one file.
///
/// Useful for gradually starting backups without having to do the first in one shot.
pub fn limit_scan_entry_map_to_size(
scan_entry_map: PatriciaMap<ScanEntry>,
soft_max_size: u64,
) -> PatriciaMap<ScanEntry> {
let mut accum_size: u64 = 0;
let mut have_file = false;
let mut result = PatriciaMap::new();
// let mut removeds = PatriciaSet::new();
let mut unincluded_directories = PatriciaMap::new();
for (path_bytes, entry) in scan_entry_map.into_iter() {
if accum_size >= soft_max_size {
// we're already full!
break;
}
let size_of_entry = match entry {
ScanEntry::NormalFile { size, .. } => {
// even zero-byte files are not for free, so don't let them be.
max(size, 4096)
}
ScanEntry::Directory { .. } => {
// slightly made up number, but typical size of an inode?
4096
}
ScanEntry::SymbolicLink { .. } => {
// slightly made up number, but typical size of an inode?
4096
}
};
let size_limit_would_be_respected = accum_size + size_of_entry <= soft_max_size;
let this_is_the_only_file = !have_file && matches!(&entry, ScanEntry::NormalFile { .. });
if size_limit_would_be_respected || this_is_the_only_file {
have_file |= matches!(&entry, ScanEntry::NormalFile { .. });
result.insert(&path_bytes, entry);
accum_size += size_of_entry;
// Pull out parent directories so our subset always contains the parents for their children.
for path_fragment in iterate_dirs_upwards(&path_bytes) {
if let Some(directory) = unincluded_directories.remove(path_fragment) {
result.insert(path_fragment, directory);
accum_size += 4096;
}
}
} else if matches!(&entry, &ScanEntry::Directory { .. }) {
// put the directory to one side in case we need it...
unincluded_directories.insert(path_bytes, entry);
} else {
// removeds.insert(path_bytes);
}
}
// for (key, _) in unincluded_directories {
// removeds.insert(key);
// }
// (result, removeds)
result
}
/// Returns a list of all the parent paths of the given path (in bytes),
/// including the root, in order from leaf to root.
pub fn iterate_dirs_upwards(path_bytes: &[u8]) -> Vec<&[u8]> {
let mut result = Vec::new();
let mut path_fragment = &path_bytes[..];
while let Some((index, _)) = path_fragment
.iter()
.enumerate()
.rev()
.find(|(_idx, char_byte)| **char_byte == b'/')
{
path_fragment = &path_bytes[0..index];
result.push(path_fragment);
}
result.push(&path_bytes[0..0]);
result
}
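For example, `a/b/c` yields `a/b`, then `a`, then the empty root (a sketch):

// Sketch: expected iterate_dirs_upwards behaviour.
fn iterate_dirs_upwards_example() {
    let parents = iterate_dirs_upwards(b"a/b/c");
    assert_eq!(parents, vec![&b"a/b"[..], &b"a"[..], &b""[..]]);
}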
#[cfg(test)]
mod tests {
use crate::scan::limit_scan_entry_map_to_size;
use maplit::btreeset;
use patricia_tree::PatriciaMap;
use std::collections::BTreeSet;
use yama_pile::tree::unpopulated::ScanEntry;
use yama_pile::tree::{FilesystemOwnership, FilesystemPermissions};
#[test]
fn test_limit_scan_entry_map_to_size() {
let mut orig = PatriciaMap::new();
orig.insert(
"somedir".as_bytes(),
ScanEntry::Directory {
ownership: FilesystemOwnership { uid: 0, gid: 0 },
permissions: FilesystemPermissions { mode: 0 },
},
);
orig.insert(
"somedir/a_small_file".as_bytes(),
ScanEntry::NormalFile {
mtime: 0,
ownership: FilesystemOwnership { uid: 0, gid: 0 },
permissions: FilesystemPermissions { mode: 0 },
size: 4,
},
);
orig.insert(
"somedir/somefile".as_bytes(),
ScanEntry::NormalFile {
mtime: 0,
ownership: FilesystemOwnership { uid: 0, gid: 0 },
permissions: FilesystemPermissions { mode: 0 },
size: 8192,
},
);
// 16k = 4k (dir) + 8k (somefile) + 4k (small file; minimum)
assert_eq!(
limit_scan_entry_map_to_size(orig.clone(), 16384)
.keys()
.collect::<BTreeSet<_>>(),
btreeset! {
b"somedir".to_vec(),
b"somedir/a_small_file".to_vec(),
b"somedir/somefile".to_vec(),
}
);
// now we don't have room for the big file.
assert_eq!(
limit_scan_entry_map_to_size(orig.clone(), 16383)
.keys()
.collect::<BTreeSet<_>>(),
btreeset! {
b"somedir".to_vec(),
b"somedir/a_small_file".to_vec(),
}
);
// because we must always include at least one file so we make forward progress, it doesn't
// matter that this violates the size limit.
assert_eq!(
limit_scan_entry_map_to_size(orig.clone(), 1)
.keys()
.collect::<BTreeSet<_>>(),
btreeset! {
b"somedir".to_vec(),
b"somedir/a_small_file".to_vec(),
}
);
}
}

yama/src/storing.rs (new file, 543 lines)

@ -0,0 +1,543 @@
use crate::pile_with_cache::PileWithCache;
use dashmap::DashSet;
use eyre::{bail, eyre, Context};
use fastcdc::v2020::{FastCDC, StreamCDC};
use flume::{Receiver, RecvError, SendError, Sender};
use std::cmp::Reverse;
use std::collections::{BTreeMap, BTreeSet};
use std::fmt::Debug;
use std::fs::File;
use std::io;
use std::io::Read;
use std::path::{Path, PathBuf};
use std::pin::Pin;
use std::sync::Arc;
use std::thread::JoinHandle;
use tokio::runtime::Handle;
use tracing::{debug, error, info, info_span, warn};
use yama_localcache::StoreConnection;
use yama_midlevel_crypto::chunk_id::{ChunkId, ChunkIdKey};
use yama_pile::bloblogs::BloblogWriter;
use yama_pile::definitions::{BlobLocator, BloblogId, Index, IndexBloblogEntry, RecursiveChunkRef};
use yama_wormfile::boxed::BoxedWormFileProvider;
use yama_wormfile::WormFileWriter;
use zstd::bulk::Compressor;
pub const DESIRED_INDEX_SIZE_ENTRIES: usize = 32768;
// 256 kiB
pub const FASTCDC_MIN: u32 = 256 * 1024;
// 1 MiB
pub const FASTCDC_AVG: u32 = 1024 * 1024;
// 8 MiB
pub const FASTCDC_MAX: u32 = 8 * 1024 * 1024;
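These bounds drive content-defined chunking: boundaries are derived from the data itself, so a small edit early in a file reshuffles only nearby chunks rather than shifting every later one. A hedged sketch of streaming a file through these parameters, using the `fastcdc::v2020` API as this file imports it (the `ChunkData` field names here are my assumption):

// Sketch: streaming a file through FastCDC with the bounds above.
// Error handling abbreviated; real storing goes through StoringState.
fn chunk_file_sketch(path: &Path) -> eyre::Result<()> {
    let file = File::open(path)?;
    for result in StreamCDC::new(file, FASTCDC_MIN, FASTCDC_AVG, FASTCDC_MAX) {
        let chunk = result?;
        // Each chunk would next be hashed into a ChunkId and stored if new.
        println!("chunk at offset {} ({} bytes)", chunk.offset, chunk.length);
    }
    Ok(())
}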
pub struct StoringState {
/// A connection to the local cache for checking whether
pub cache_conn: StoreConnection<false>,
/// Set of unflushed chunks, not present in any index, which we can assume have been created in this session.
pub new_unflushed_chunks: Arc<DashSet<ChunkId>>,
/// New bloblogs that we have created but not yet written out indices for.
pub new_bloblogs: Vec<(BloblogId, BTreeMap<ChunkId, BlobLocator>)>,
pub pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
pub chunk_id_key: ChunkIdKey,
pub compressor: zstd::bulk::Compressor<'static>,
pub tokio_handle: Handle,
}
impl StoringState {
pub async fn new(
pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
new_unflushed_chunks: Arc<DashSet<ChunkId>>,
tokio_handle: Handle,
) -> eyre::Result<Self> {
let compressor = match pwc.pile.pile_config.zstd_dict.as_ref() {
None => {
Compressor::new(get_zstd_level()).context("can't create dictless compressor")?
}
Some(dict_bytes) => Compressor::with_dictionary(get_zstd_level(), dict_bytes)
.context("can't create dictful compressor")?,
};
let chunk_id_key = pwc.pile.pile_config.chunk_id_key;
Ok(StoringState {
cache_conn: pwc.localcache.read().await?,
new_unflushed_chunks,
new_bloblogs: vec![],
pwc,
chunk_id_key,
compressor,
tokio_handle,
})
}
}
struct StoringIntermediate {
/// New bloblogs that we have created but not yet written out indices for.
pub new_bloblogs: Vec<(BloblogId, BTreeMap<ChunkId, BlobLocator>)>,
}
impl From<StoringState> for StoringIntermediate {
fn from(ss: StoringState) -> Self {
StoringIntermediate {
new_bloblogs: ss.new_bloblogs,
}
}
}
#[derive(Default)]
pub struct StoringBloblogWriters {
/// Bloblog writer for actual file contents (we try to keep file contents sequential in the
/// common case)
pub file_contents: Option<BloblogWriter<Pin<Box<dyn WormFileWriter>>>>,
/// Bloblog writer for chunks of chunks
pub metachunks: Option<BloblogWriter<Pin<Box<dyn WormFileWriter>>>>,
}
impl StoringBloblogWriters {
pub async fn finish_bloblogs(&mut self, ss: &mut StoringState) -> eyre::Result<()> {
if let Some(writer_to_finish) = self.file_contents.take() {
let (_bloblog_path, bloblog_id, chunkmap) = writer_to_finish.finish().await?;
ss.new_bloblogs.push((bloblog_id, chunkmap));
}
if let Some(writer_to_finish) = self.metachunks.take() {
let (_bloblog_path, bloblog_id, chunkmap) = writer_to_finish.finish().await?;
ss.new_bloblogs.push((bloblog_id, chunkmap));
}
Ok(())
}
}
impl StoringState {
/// Acquire a bloblog writer handle, reusing the existing one in the slot if suitable.
#[async_backtrace::framed]
async fn obtain_bloblog_writer<'a>(
&mut self,
slot: &'a mut Option<BloblogWriter<Pin<Box<dyn WormFileWriter>>>>,
) -> eyre::Result<&'a mut BloblogWriter<Pin<Box<dyn WormFileWriter>>>> {
// if let Some(ref mut writer) = slot {
// if !writer.should_finish() {
// return Ok(writer);
// }
// }
// awkward avoidance of strange borrow issues that I don't fully grok
if slot.as_ref().map(|w| w.should_finish()) == Some(false) {
return Ok(slot.as_mut().unwrap());
}
if let Some(writer_to_finish) = slot.take() {
let (_bloblog_path, bloblog_id, chunkmap) = writer_to_finish.finish().await?;
self.new_bloblogs.push((bloblog_id, chunkmap));
}
*slot = Some(self.pwc.pile.create_bloblog().await?);
Ok(slot.as_mut().unwrap())
}
/// For internal use only.
fn process_chunk(
&mut self,
chunk_bytes: &[u8],
result: &mut Vec<ChunkId>,
slot: &mut Option<BloblogWriter<Pin<Box<dyn WormFileWriter>>>>,
) -> eyre::Result<()> {
let chunk_id = ChunkId::compute(chunk_bytes, &self.chunk_id_key);
result.push(chunk_id);
let tokio_handle = self.tokio_handle.clone();
let is_new = tokio_handle.block_on(async_backtrace::frame!(async {
Ok::<bool, eyre::Report>(
self.cache_conn.is_chunk_new(chunk_id).await?
&& self.new_unflushed_chunks.insert(chunk_id),
)
}))?;
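// A chunk is treated as new only if it is absent from every cached index AND
// was not already produced earlier in this session (first DashSet insert wins).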
if is_new {
let compressed_bytes = self.compressor.compress(chunk_bytes)?;
tokio_handle.block_on(async_backtrace::frame!(async {
let writer = self.obtain_bloblog_writer(slot).await?;
writer.write_chunk(chunk_id, &compressed_bytes).await?;
Ok::<(), eyre::Report>(())
}))?;
}
Ok(())
}
fn store_full_slice_returning_chunks(
&mut self,
store_slice: &[u8],
slot: &mut Option<BloblogWriter<Pin<Box<dyn WormFileWriter>>>>,
) -> eyre::Result<Vec<ChunkId>> {
let mut result = Vec::new();
for chunk in FastCDC::new(store_slice, FASTCDC_MIN, FASTCDC_AVG, FASTCDC_MAX) {
let chunk_bytes = &store_slice[chunk.offset..chunk.offset + chunk.length];
self.process_chunk(chunk_bytes, &mut result, slot)?
}
if result.is_empty() {
self.process_chunk(&[], &mut result, slot)?;
}
Ok(result)
}
fn store_full_stream_returning_chunks(
&mut self,
store_stream: impl Read,
slot: &mut Option<BloblogWriter<Pin<Box<dyn WormFileWriter>>>>,
) -> eyre::Result<(Vec<ChunkId>, u64)> {
let mut stream_length = 0u64;
let mut result = Vec::new();
for chunk in StreamCDC::new(store_stream, FASTCDC_MIN, FASTCDC_AVG, FASTCDC_MAX) {
let chunk = chunk.context("failed to read in for StreamCDC")?;
let chunk_bytes = chunk.data.as_slice();
stream_length += chunk_bytes.len() as u64;
self.process_chunk(chunk_bytes, &mut result, slot)?;
}
if result.is_empty() {
self.process_chunk(&[], &mut result, slot)?;
}
Ok((result, stream_length))
}
pub fn store_full_slice(
&mut self,
store_slice: &[u8],
sbw: &mut StoringBloblogWriters,
) -> eyre::Result<RecursiveChunkRef> {
// First calculate all the chunk IDs needed to be written here.
let mut chunk_ids =
self.store_full_slice_returning_chunks(store_slice, &mut sbw.file_contents)?;
let mut depth = 0;
// Until only a single chunk remains, chunk the list of chunk IDs itself...
while chunk_ids.len() != 1 {
let mut metachunks_list_bytes: Vec<u8> = Vec::with_capacity(chunk_ids.len() * 32);
for chunk_id in chunk_ids {
metachunks_list_bytes.extend_from_slice(&chunk_id.to_bytes());
}
// TODO It might be nice to store these in opposite order, so a read is a true sequential
// scan.
// i.e. (depth=3) (depth=2) (depth=1) (depth=0) ...
chunk_ids = self
.store_full_slice_returning_chunks(&metachunks_list_bytes, &mut sbw.metachunks)?;
depth += 1;
}
Ok(RecursiveChunkRef {
chunk_id: chunk_ids[0],
depth,
})
}
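// Illustrative only (not part of the original API): a rough estimate of the
// final `depth`, assuming ~FASTCDC_AVG bytes per content chunk and 32-byte
// chunk IDs. A 1 GiB input gives ~1024 chunks, whose concatenated IDs form a
// ~32 KiB metachunk list, which collapses to a single chunk: depth 1.
#[allow(dead_code)]
fn approx_depth(mut size: u64) -> u32 {
let mut depth = 0;
while size > FASTCDC_AVG as u64 {
// each layer replaces the data with a list of 32-byte chunk IDs
size = (size / FASTCDC_AVG as u64) * 32;
depth += 1;
}
depth
}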
/// Stores a full stream (`Read`) and returns the recursive chunk ref plus the length of the
/// stream.
#[async_backtrace::framed]
pub fn store_full_stream(
&mut self,
store_stream: impl Read,
sbw: &mut StoringBloblogWriters,
) -> eyre::Result<(RecursiveChunkRef, u64)> {
// First calculate all the chunk IDs needed to be written here.
let (mut chunk_ids, stream_length) =
self.store_full_stream_returning_chunks(store_stream, &mut sbw.file_contents)?;
let mut depth = 0;
// Until only a single chunk remains, chunk the list of chunk IDs itself...
while chunk_ids.len() != 1 {
let mut metachunks_list_bytes: Vec<u8> = Vec::with_capacity(chunk_ids.len() * 32);
for chunk_id in chunk_ids {
metachunks_list_bytes.extend_from_slice(&chunk_id.to_bytes());
}
// TODO It might be nice to store these in opposite order, so a read is a true sequential
// scan.
// i.e. (depth=3) (depth=2) (depth=1) (depth=0) ...
chunk_ids = self
.store_full_slice_returning_chunks(&metachunks_list_bytes, &mut sbw.metachunks)?;
depth += 1;
}
Ok((
RecursiveChunkRef {
chunk_id: chunk_ids[0],
depth,
},
stream_length,
))
}
}
/// Stores a file, returning Ok(Some(...)) if fine, Ok(None) if the file doesn't exist (vanished)
/// or Err(...) for any other error.
///
/// WARNING! This memory-maps the file and should NOT be used on files that are being written to
/// by other applications. If the underlying data changes during storage, it can cause issues with
/// Zstd (and presumably can also cause the chunk hashes to be invalid).
///
/// Further, I have had issues with this seeming to 'use' a lot of memory. Whilst it should only
/// be virtual memory, for some reason it seems to cause swap to be used and it makes diagnosis
/// of REAL memory issues much harder.
/// For that reason it is hard to recommend this approach for now.
#[allow(dead_code)]
fn store_file_mmap_blocking(
file_path: &Path,
storing_state: &mut StoringState,
sbw: &mut StoringBloblogWriters,
) -> eyre::Result<Option<(RecursiveChunkRef, u64)>> {
let file = match File::open(file_path) {
Ok(file) => file,
Err(err) if err.kind() == io::ErrorKind::NotFound => {
warn!("file vanished: {file_path:?}");
return Ok(None);
}
Err(other) => {
bail!("error storing {file_path:?}: {other:?}");
}
};
let mapped = unsafe { memmap2::Mmap::map(&file) }?;
let size_of_file = mapped.as_ref().len();
let chunkref = storing_state.store_full_slice(mapped.as_ref(), sbw)?;
Ok(Some((chunkref, size_of_file as u64)))
}
fn store_file_non_mmap_blocking(
file_path: &Path,
storing_state: &mut StoringState,
sbw: &mut StoringBloblogWriters,
) -> eyre::Result<Option<(RecursiveChunkRef, u64)>> {
let file = match File::open(file_path) {
Ok(file) => file,
Err(err) if err.kind() == io::ErrorKind::NotFound => {
warn!("file vanished: {file_path:?}");
return Ok(None);
}
Err(other) => {
bail!("error storing {file_path:?}: {other:?}");
}
};
storing_state.store_full_stream(file, sbw).map(Some)
}
pub struct StoragePipeline<JobName> {
result_rx: Receiver<(JobName, Option<(RecursiveChunkRef, u64)>)>,
join_set: Vec<JoinHandle<eyre::Result<StoringIntermediate>>>,
}
fn storage_pipeline_worker_blocking<JobName: Debug>(
job_rx: Receiver<(JobName, PathBuf)>,
result_tx: Sender<(JobName, Option<(RecursiveChunkRef, u64)>)>,
mut storing_state: StoringState,
tokio_handle: Handle,
) -> eyre::Result<StoringIntermediate> {
let mut bloblog_writers = StoringBloblogWriters::default();
debug!("SPW startup");
while let Ok((job_id, file_path)) = job_rx.recv() {
// TODO(span): is this correctly a child of the parent span?
let span = info_span!("store_file", file=?file_path);
let _span_entered = span.enter();
let file_store_opt =
store_file_non_mmap_blocking(&file_path, &mut storing_state, &mut bloblog_writers)
.with_context(|| format!("failed to store {file_path:?}"))?;
if let Err(SendError(to_be_sent)) = result_tx.send((job_id, file_store_opt)) {
bail!("Can't return result for {to_be_sent:?} — result_tx shut down.");
}
}
debug!("finishing bloblogs");
tokio_handle.block_on(bloblog_writers.finish_bloblogs(&mut storing_state))?;
debug!("finished bloblogs!");
Ok(StoringIntermediate::from(storing_state))
}
fn get_zstd_level() -> i32 {
// TODO Do something more proper
if let Ok(var) = std::env::var("YAMA_HACK_ZSTD_LEVEL") {
if let Ok(level) = var.parse() {
info!("YAMA_HACK_ZSTD_LEVEL: using {level}");
return level;
} else {
error!("YAMA_HACK_ZSTD_LEVEL was not set to a valid i32: {var:?}")
}
}
16
}
impl<JobName: Debug + Send + 'static> StoragePipeline<JobName> {
#[async_backtrace::framed]
pub async fn launch_new(
workers: u32,
pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
new_unflushed_chunks: Arc<DashSet<ChunkId>>,
) -> eyre::Result<(StoragePipeline<JobName>, Sender<(JobName, PathBuf)>)> {
let (job_tx, job_rx) = flume::bounded(16);
let (result_tx, result_rx) = flume::bounded(4);
let mut join_set = Vec::new();
for spw_num in 0..workers {
let job_rx = job_rx.clone();
let result_tx = result_tx.clone();
let tokio_handle = Handle::current();
let storing_state = StoringState::new(
pwc.clone(),
new_unflushed_chunks.clone(),
tokio_handle.clone(),
)
.await
.context("failed to create storing state")?;
// make a logging span for the Storage Pipeline Workers
let spw_span = info_span!("spw", n = spw_num);
let thread = std::thread::Builder::new()
.name(format!("spw-{spw_num}"))
.spawn(move || {
let _spw_span_entered = spw_span.enter();
let result = storage_pipeline_worker_blocking(
job_rx,
result_tx,
storing_state,
tokio_handle,
);
if let Err(ref err) = result {
error!("Error in SPW {err:?}");
}
result
})
.expect("failed to spawn SPW thread!");
join_set.push(thread);
}
Ok((
StoragePipeline {
result_rx,
join_set,
},
job_tx,
))
}
#[inline]
pub async fn next_result(
&self,
) -> Result<(JobName, Option<(RecursiveChunkRef, u64)>), RecvError> {
self.result_rx.recv_async().await
}
/// Must be sure that all results have been collected first.
#[async_backtrace::framed]
pub async fn finish_into_chunkmaps(
mut self,
) -> eyre::Result<BTreeMap<BloblogId, IndexBloblogEntry>> {
if let Ok(msg) = self.result_rx.recv_async().await {
bail!("Haven't processed all results yet! {msg:?}");
}
let mut chunkmap = BTreeMap::new();
while let Some(thread) = self.join_set.pop() {
// TODO(blocking on async thread): do this differently.
let join_resres = thread.join().map_err(|panic_err| eyre!("{panic_err:?}"));
chunkmap.extend(join_resres??.new_bloblogs.into_iter().map(|(k, nb)| {
(
k,
IndexBloblogEntry {
chunks: nb,
forgotten_bytes: 0,
},
)
}));
}
Ok(chunkmap)
}
}
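// A minimal end-to-end driver sketch (not in the original source); `pwc` and
// `new_unflushed_chunks` are assumed to be set up elsewhere, and the job name
// type is just a String here.
#[allow(dead_code)]
async fn storage_pipeline_example(
pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
new_unflushed_chunks: Arc<DashSet<ChunkId>>,
) -> eyre::Result<()> {
let (pipeline, job_tx) =
StoragePipeline::launch_new(4, pwc.clone(), new_unflushed_chunks).await?;
job_tx
.send_async(("file-1".to_string(), PathBuf::from("/tmp/file1")))
.await
.map_err(|_| eyre!("storage workers hung up"))?;
drop(job_tx); // close the channel so the workers drain and exit
while let Ok((job, stored)) = pipeline.next_result().await {
debug!("{job}: {stored:?}");
}
let chunkmaps = pipeline.finish_into_chunkmaps().await?;
assemble_and_write_indices(&pwc, chunkmaps).await?;
Ok(())
}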
fn assemble_indices(chunkmap: BTreeMap<BloblogId, IndexBloblogEntry>) -> Vec<Index> {
let mut sorted_map = BTreeMap::new();
for (idx, chunkmap) in chunkmap.into_iter().enumerate() {
let size_of_chunkmap = chunkmap.1.chunks.len() + 1;
sorted_map.insert(Reverse((size_of_chunkmap, idx)), chunkmap);
}
let mut indices = Vec::new();
while let Some((Reverse((size, _)), (bloblog_id, bloblog_chunks))) = sorted_map.pop_first() {
let mut new_index_contents = BTreeMap::new();
new_index_contents.insert(bloblog_id, bloblog_chunks);
let mut new_index_size_so_far = size;
while new_index_size_so_far < DESIRED_INDEX_SIZE_ENTRIES && !sorted_map.is_empty() {
if let Some((k, _)) = sorted_map
.range(
Reverse((
DESIRED_INDEX_SIZE_ENTRIES - new_index_size_so_far,
usize::MAX,
))..,
)
.next()
{
let k = k.clone();
let (Reverse((add_size, _)), (bloblog_id, bloblog_chunks)) =
sorted_map.remove_entry(&k).unwrap();
new_index_size_so_far += add_size;
new_index_contents.insert(bloblog_id, bloblog_chunks);
} else {
break;
}
}
indices.push(Index {
supersedes: BTreeSet::new(),
bloblogs: new_index_contents,
});
}
indices
}
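// A self-contained miniature (illustrative, not used by the code above) of the
// same packing strategy: seed each index with the largest remaining bloblog,
// then greedily add the largest bloblog that still fits under the target.
// E.g. entry counts {30_000, 20_000, 12_000, 2_000} with target 32_768 pack
// into [30_000 + 2_000] and [20_000 + 12_000].
#[allow(dead_code)]
fn pack_miniature(mut sizes: BTreeSet<(usize, usize)>, target: usize) -> Vec<Vec<usize>> {
// items are (entry_count, id); a BTreeSet iterates in ascending order
let mut bins = Vec::new();
while let Some((size, id)) = sizes.pop_last() {
let mut bin = vec![id];
let mut used = size;
while used < target {
// the largest remaining item that still fits in the budget
let Some(&(s, i)) = sizes.range(..=(target - used, usize::MAX)).next_back() else {
break;
};
sizes.remove(&(s, i));
bin.push(i);
used += s;
}
bins.push(bin);
}
bins
}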
async fn write_indices(
pwc: &PileWithCache<BoxedWormFileProvider>,
indices: Vec<Index>,
) -> eyre::Result<()> {
for index in indices {
let index_id = pwc.pile.create_index(&index).await?;
if !pwc
.localcache
.write()
.await?
.apply_index(index_id, Arc::new(index))
.await?
{
error!("freshly-created index wasn't new. This is suspicious.");
};
}
Ok(())
}
#[async_backtrace::framed]
pub async fn assemble_and_write_indices(
pwc: &PileWithCache<BoxedWormFileProvider>,
chunkmap: BTreeMap<BloblogId, IndexBloblogEntry>,
) -> eyre::Result<()> {
let indices = assemble_indices(chunkmap);
write_indices(pwc, indices).await
}


@ -1,140 +0,0 @@
/*
This file is part of Yama.
Yama is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Yama is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Yama. If not, see <https://www.gnu.org/licenses/>.
*/
use std::collections::{BTreeMap, BTreeSet};
use std::fmt::Write;
pub fn bytes_to_hexstring(chunkid: &[u8]) -> String {
let mut s = String::with_capacity(chunkid.len() * 2);
for &byte in chunkid.iter() {
write!(&mut s, "{:02x}", byte).expect("Unable to write");
}
s
}
pub fn get_number_of_workers(first_try_env_name: &str) -> u8 {
let from_env_var = std::env::var(first_try_env_name)
.ok()
.or_else(|| std::env::var("YAMA_WORKERS").ok());
if let Some(from_env_var) = from_env_var {
from_env_var
.parse()
.expect("Number of workers should be an integer from 1 to 255.")
} else {
let number = num_cpus::get();
if number > u8::MAX.into() {
u8::MAX
} else {
number as u8
}
}
}
#[derive(Clone, Debug)]
pub struct LruMap<K, V> {
capacity: usize,
last_access: BTreeSet<(u64, K)>,
items: BTreeMap<K, (V, u64)>,
counter: u64,
}
impl<K: Ord + Clone, V> LruMap<K, V> {
pub fn new(capacity: usize) -> LruMap<K, V> {
LruMap {
capacity,
last_access: BTreeSet::new(),
items: BTreeMap::new(),
counter: 0,
}
}
/// Gets an item from the LRU map.
pub fn get(&mut self, key: &K) -> Option<&V> {
match self.items.get_mut(key) {
Some((value, last_used_instant)) => {
assert!(
self.last_access.remove(&(*last_used_instant, key.clone())),
"Corrupt LRU map: freshen not correct."
);
let new_instant = self.counter;
self.counter += 1;
self.last_access.insert((new_instant, key.clone()));
*last_used_instant = new_instant;
Some(value)
}
None => None,
}
}
pub fn insert(&mut self, key: K, value: V) -> Option<V> {
let new_instant = self.counter;
self.counter += 1;
let retval = match self.items.insert(key.clone(), (value, new_instant)) {
Some((old_entry, old_instant)) => {
assert!(
self.last_access.remove(&(old_instant, key.clone())),
"Corrupt LRU map: insert not correct."
);
Some(old_entry)
}
None => None,
};
self.last_access.insert((new_instant, key));
if retval.is_none() {
// We didn't replace any item, so we have grown by 1.
// Check if we need to evict.
if self.items.len() > self.capacity {
self.evict();
}
}
retval
}
pub fn evict(&mut self) -> Option<(K, V)> {
if let Some(first_entry) = self.last_access.iter().next().cloned() {
self.last_access.remove(&first_entry);
let (_, key) = first_entry;
let (value, _) = self
.items
.remove(&key)
.expect("Corrupt LRU map: last access and items out of sync");
Some((key, value))
} else {
None
}
}
}
#[cfg(test)]
mod test {
use crate::utils::LruMap;
#[test]
fn test_lru_map() {
let mut lmap = LruMap::new(3);
lmap.insert(1, 1);
lmap.insert(2, 1);
lmap.insert(3, 1);
assert_eq!(lmap.get(&1), Some(&1));
lmap.insert(4, 1);
assert_eq!(lmap.get(&2), None);
}
}

yama/src/vacuum.rs Normal file

@ -0,0 +1,4 @@
pub mod delete_unrefd_bloblogs;
pub mod forget_chunks;
pub mod merge_indices;
pub mod repack_bloblogs_and_indices;


@ -0,0 +1 @@


@ -0,0 +1,171 @@
use crate::extract::expand_chunkrefs_one_layer;
use crate::pile_with_cache::PileWithCache;
use eyre::{bail, ensure, Context, ContextCompat};
use std::collections::{BTreeMap, BTreeSet};
use std::sync::Arc;
use tracing::info;
use yama_midlevel_crypto::chunk_id::ChunkId;
use yama_pile::definitions::IndexId;
use yama_pile::tree::TreeNode;
use yama_wormfile::boxed::BoxedWormFileProvider;
pub async fn find_forgettable_chunks(
pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
indices: BTreeSet<IndexId>,
) -> eyre::Result<BTreeSet<ChunkId>> {
let mut unseen_chunk_ids = BTreeSet::new();
// Find all chunks in the given indices
{
let mut cache_conn = pwc.localcache.read().await?;
for index_id in &indices {
unseen_chunk_ids.extend(cache_conn.list_chunks_in_index(*index_id).await?);
}
};
let chunks_to_scan = prepare_chunkrefs_to_scan(pwc).await?;
scan_chunks(pwc, &mut unseen_chunk_ids, chunks_to_scan)
.await
.context("failed to do a sweep")?;
Ok(unseen_chunk_ids)
}
async fn prepare_chunkrefs_to_scan(
pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
) -> eyre::Result<BTreeMap<u32, BTreeSet<ChunkId>>> {
let pointer_names = pwc
.pile
.list_pointers()
.await
.context("failed to list pointers")?;
let mut chunks_to_scan_by_depth: BTreeMap<u32, BTreeSet<ChunkId>> = BTreeMap::new();
for pointer_name in &pointer_names {
let pointer = pwc
.pile
.read_pointer(pointer_name)
.await?
.context("pointer vanished")?;
if let Some(parent_name) = pointer.parent {
if !pointer_names.contains(&parent_name) {
bail!("{parent_name:?}, the parent of {pointer_name:?}, does not exist");
}
}
pointer
.root
.node
.visit(
&mut |node, _| {
if let TreeNode::NormalFile { content, .. } = node {
chunks_to_scan_by_depth
.entry(content.depth)
.or_default()
.insert(content.chunk_id);
}
Ok(())
},
String::new(),
)
.unwrap();
}
Ok(chunks_to_scan_by_depth)
}
/// Scans the recursive chunkrefs that are passed in, ticking off chunks from the `unseen` set as
/// we go.
async fn scan_chunks(
pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
unseen: &mut BTreeSet<ChunkId>,
chunks_to_scan_by_depth: BTreeMap<u32, BTreeSet<ChunkId>>,
) -> eyre::Result<()> {
let mut to_scan: Vec<(u32, Vec<ChunkId>)> = chunks_to_scan_by_depth
.into_iter()
.flat_map(|(depth, chunkset)| {
chunkset
.into_iter()
.map(move |chunk_id| (depth, vec![chunk_id]))
})
.collect();
while !to_scan.is_empty() {
// Mark as seen.
for (_, chunk_ids) in &to_scan {
for chunk_id in chunk_ids {
unseen.remove(chunk_id);
}
}
// Don't descend further into zero-depth elements.
to_scan = to_scan
.into_iter()
.filter(|(depth, _)| *depth > 0)
.collect();
// Decrement depth counters.
to_scan = expand_chunkrefs_one_layer(pwc, to_scan)
.await?
.into_iter()
.map(|(old_depth, chunkids)| (old_depth - 1, chunkids))
.collect();
}
Ok(())
}
pub async fn forget_chunks(
pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
indices: BTreeSet<IndexId>,
forgettable: BTreeSet<ChunkId>,
) -> eyre::Result<()> {
let mut indices_to_rewrite = Vec::new();
// First do a cache-only check to see which indices need rewriting.
{
let mut cache_conn = pwc.localcache.read().await?;
for index_id in &indices {
let chunks_in_this_index = cache_conn.list_chunks_in_index(*index_id).await?;
if !chunks_in_this_index.is_disjoint(&forgettable) {
indices_to_rewrite.push(index_id);
}
}
}
info!(
"{} indices to rewrite in order to forget chunks",
indices_to_rewrite.len()
);
// Go through each index and clean out whatever needs forgetting (then re-create the index and
// remove the old one).
for index_id in indices_to_rewrite {
let mut index = pwc.pile.read_index(*index_id).await?;
let mut changed = false;
for bloblog_entry in index.bloblogs.values_mut() {
let removable: Vec<ChunkId> = bloblog_entry
.chunks
.keys()
.filter(|ci| forgettable.contains(ci))
.cloned()
.collect();
changed |= !removable.is_empty();
for chunk_id in removable {
bloblog_entry.forgotten_bytes +=
bloblog_entry.chunks.remove(&chunk_id).unwrap().length;
}
}
ensure!(changed, "no change to index {index_id:?}");
index.supersedes.clear();
index.supersedes.insert(*index_id);
// TODO APPLY THE NEW INDEX DIRECTLY (how do we do that again?)
let new_index_id = pwc.pile.create_index(&index).await?;
ensure!(new_index_id != *index_id, "index ID bounce");
pwc.pile.delete_index_dangerous_exclusive(*index_id).await?;
}
Ok(())
}
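// Sketch (not in the original source) of how the two halves above combine;
// assumes the local cache is up to date and an exclusive lock is held.
#[allow(dead_code)]
async fn sweep_example(pwc: &Arc<PileWithCache<BoxedWormFileProvider>>) -> eyre::Result<()> {
let indices = pwc.localcache.read().await?.list_indices().await?;
let forgettable = find_forgettable_chunks(pwc, indices.clone()).await?;
info!("{} chunks can be forgotten", forgettable.len());
forget_chunks(pwc, indices, forgettable).await?;
Ok(())
}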


@ -0,0 +1,127 @@
use crate::pile_with_cache::PileWithCache;
use eyre::{bail, Context};
use std::collections::btree_map::Entry;
use std::collections::BTreeSet;
use std::sync::Arc;
use tracing::{debug, warn};
use yama_pile::definitions::{Index, IndexId};
use yama_wormfile::boxed::BoxedWormFileProvider;
pub const MERGE_THRESHOLD_SIZE: u32 = 2 * 1024 * 1024;
pub const MERGE_TARGET_SIZE: u32 = 16 * 1024 * 1024;
/// Selects indices for merge.
///
/// Criteria:
/// - size is less than the `threshold_size`
/// - (FUTURE; TODO) two indices that cover the same bloblog should be merged
pub async fn select_indices_for_merge(
pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
target_size: u32,
threshold_size: u32,
) -> eyre::Result<Vec<BTreeSet<IndexId>>> {
let mut result = Vec::new();
let mut mergeable_indices: BTreeSet<(u64, IndexId)> = pwc
.pile
.list_indices_with_meta()
.await?
.into_iter()
.filter(|(_, meta)| meta.file_size < threshold_size as u64)
.map(|(index_id, meta)| (meta.file_size, index_id))
.collect();
while mergeable_indices.len() >= 2 {
let mut merge_set = BTreeSet::new();
let mut merge_size = 0u64;
let (first_size, first_index) = mergeable_indices.pop_first().unwrap();
merge_size += first_size;
merge_set.insert(first_index);
while let Some((size, index)) = mergeable_indices.first() {
if merge_size + *size < target_size as u64 {
merge_size += *size;
merge_set.insert(*index);
mergeable_indices.pop_first();
} else {
break;
}
}
if merge_set.len() > 1 {
result.push(merge_set);
}
}
Ok(result)
}
/// Merges some indices, deleting them in the process.
/// Requires exclusive lock.
/// (Note: in the future we could only supersede the indices, which only needs a shared lock.
/// However you need an exclusive lock to eventually delete superseded indices...).
pub async fn merge_indices(
pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
merge_sets: Vec<BTreeSet<IndexId>>,
) -> eyre::Result<()> {
for merge_set in merge_sets {
let mut final_index = Index {
supersedes: merge_set.clone(),
bloblogs: Default::default(),
};
for index_id in &merge_set {
let index_being_subsumed = pwc.pile.read_index(*index_id).await?;
// TODO: do we need to worry about the 'supersedes' property on the index here?
// I think not, or at least not if the superseded indices don't exist,
// but worth thinking about in the future if we don't immediately delete
// superseded indices...
for (bloblog_id, bloblog_entry) in index_being_subsumed.bloblogs {
match final_index.bloblogs.entry(bloblog_id) {
Entry::Vacant(ve) => {
ve.insert(bloblog_entry);
}
Entry::Occupied(mut oe) => {
let new_entry = oe.get_mut();
let (existing_chunks, new_chunks): (Vec<_>, Vec<_>) = bloblog_entry
.chunks
.into_iter()
.partition(|(chunk_id, _)| new_entry.chunks.contains_key(chunk_id));
for (chunk_id, locator) in new_chunks {
// Subtract from the forgotten byte count, since this may be us re-remembering previously-forgotten bytes to be safe...
new_entry.forgotten_bytes =
new_entry.forgotten_bytes.saturating_sub(locator.length);
let is_new = new_entry.chunks.insert(chunk_id, locator).is_none();
assert!(is_new);
}
for (chunk_id, locator) in existing_chunks {
if &new_entry.chunks[&chunk_id] != &locator {
bail!("Attempted to merge indices that disagree about {bloblog_id:?}/{chunk_id:?}");
}
}
}
}
}
}
let merged_index_id = pwc
.pile
.create_index(&final_index)
.await
.context("failed to create merged index")?;
if merge_set.contains(&merged_index_id) {
// I don't see how this could be possible, but let's avoid deleting the new index if it somehow is a merge of itself...
warn!("strange: created index ID is one of its own merges...");
continue;
}
debug!("merged indices {merge_set:?} into {merged_index_id:?}; deleting mergees");
for index_to_delete in merge_set {
pwc.pile
.delete_index_dangerous_exclusive(index_to_delete)
.await?;
}
}
Ok(())
}
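// Sketch (not in the original source): select merge sets with the default
// sizes, then merge them. Requires an exclusive lock, as noted above.
#[allow(dead_code)]
async fn merge_example(pwc: &Arc<PileWithCache<BoxedWormFileProvider>>) -> eyre::Result<()> {
let merge_sets = select_indices_for_merge(pwc, MERGE_TARGET_SIZE, MERGE_THRESHOLD_SIZE).await?;
merge_indices(pwc, merge_sets).await?;
Ok(())
}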


@ -0,0 +1,191 @@
use crate::pile_with_cache::PileWithCache;
use crate::storing::assemble_and_write_indices;
use eyre::ContextCompat;
use std::collections::btree_map::Entry;
use std::collections::{BTreeMap, BTreeSet};
use std::sync::Arc;
use yama_localcache::BloblogStats;
use yama_midlevel_crypto::chunk_id::ChunkId;
use yama_pile::definitions::{BloblogId, IndexBloblogEntry};
use yama_wormfile::boxed::BoxedWormFileProvider;
/// Repack bloblogs that have this much forgotten space in them.
pub const REPACK_BLOBLOGS_TO_RECLAIM_SPACE_BYTES: u64 = 32 * 1024 * 1024;
/// Defines what a 'small bloblog' is (one that is below a certain size, excluding forgotten bytes).
pub const SMALL_BLOBLOG_THRESHOLD: u64 = 64 * 1024 * 1024;
/// Clump together small bloblogs when together they would hit or exceed this size.
pub const REPACK_BLOBLOGS_TO_CLUMP_TOGETHER_SMALL_BLOBLOGS_BYTES: u64 = 2 * 1024 * 1024 * 1024;
/// The target size to reach when repacking, in terms of blob bytes.
pub const REPACK_TARGET_SIZE: u64 = 4 * 1024 * 1024;
/// The limit size to use when repacking, in terms of blob bytes.
pub const REPACK_TARGET_LIMIT: u64 = 5 * 1024 * 1024;
/// Gets bloblogs' stats. Only considers bloblogs referenced by exactly one index, so we don't
/// have to deal with unifying indices.
pub async fn get_bloblogs_stats(
pwc: &Arc<PileWithCache<BoxedWormFileProvider>>,
) -> eyre::Result<BTreeMap<BloblogId, BloblogStats>> {
let mut cache_conn = pwc.localcache.read().await?;
let indices = cache_conn.list_indices().await?;
let mut bloblogs: BTreeMap<BloblogId, Option<BloblogStats>> = BTreeMap::new();
for index in indices {
for (bloblog, stats) in cache_conn.index_bloblog_stats(index).await? {
match bloblogs.entry(bloblog) {
Entry::Vacant(ve) => {
ve.insert(Some(stats));
}
Entry::Occupied(mut oe) => {
// only allow one stats per bloblog, then replace with None.
oe.insert(None);
}
}
}
}
Ok(bloblogs
.into_iter()
.flat_map(|(k, v)| v.map(|v| (k, v)))
.collect())
}
/// Choose some bloblogs to repack. Assumes an updated local cache.
///
/// Only bloblogs referenced by exactly one index will be considered for repacking.
pub async fn select_bloblogs_for_repack(
stats: BTreeMap<BloblogId, BloblogStats>,
) -> eyre::Result<Vec<BTreeMap<BloblogId, BloblogStats>>> {
let repack_for_space: BTreeSet<BloblogId> = stats
.iter()
.filter(|(_, v)| v.forgotten_bytes >= REPACK_BLOBLOGS_TO_RECLAIM_SPACE_BYTES)
.map(|(&k, _)| k)
.collect();
let maybe_repack_for_clumping: BTreeSet<BloblogId> = stats
.iter()
.filter(|(_, v)| v.blob_size <= SMALL_BLOBLOG_THRESHOLD)
.map(|(&k, _)| k)
.collect();
let should_repack_for_clumping = maybe_repack_for_clumping.len() > 1
&& maybe_repack_for_clumping
.iter()
.map(|bi| stats[bi].blob_size)
.sum::<u64>()
> REPACK_BLOBLOGS_TO_CLUMP_TOGETHER_SMALL_BLOBLOGS_BYTES;
// Repack everything that needs space reclaimed, plus the clumping candidates when clumping is worthwhile.
let mut to_repack = repack_for_space.clone();
if should_repack_for_clumping {
to_repack.extend(maybe_repack_for_clumping);
}
let mut to_repack: BTreeSet<(u64, BloblogId)> = to_repack
.into_iter()
.map(|bi| (stats[&bi].blob_size, bi))
.collect();
let mut repack_sets = Vec::new();
while !to_repack.is_empty() {
let mut new_repack_group = BTreeMap::new();
let mut new_repack_group_size = 0u64;
let (first_sz, first_to_repack) = to_repack.pop_last().unwrap();
new_repack_group_size += first_sz;
new_repack_group.insert(first_to_repack, stats[&first_to_repack].clone());
while new_repack_group_size < REPACK_TARGET_SIZE {
let Some((first_size, _)) = to_repack.first() else { break; };
if new_repack_group_size + *first_size > REPACK_TARGET_LIMIT {
break;
}
let (extra_size, extra_bloblog_id) = to_repack.pop_first().unwrap();
new_repack_group_size += extra_size;
new_repack_group.insert(extra_bloblog_id, stats[&extra_bloblog_id].clone());
}
// now check the repack group is good
if new_repack_group
.keys()
.any(|bi| repack_for_space.contains(bi))
|| new_repack_group_size > REPACK_BLOBLOGS_TO_CLUMP_TOGETHER_SMALL_BLOBLOGS_BYTES
{
repack_sets.push(new_repack_group);
}
}
Ok(repack_sets)
}
pub async fn perform_repack(
pwc: Arc<PileWithCache<BoxedWormFileProvider>>,
repack_sets: Vec<BTreeMap<BloblogId, BloblogStats>>,
) -> eyre::Result<()> {
// 1. Write new bloblogs
let mut indices_buffer = BTreeMap::new();
let mut index_parts: BTreeMap<BloblogId, IndexBloblogEntry> = BTreeMap::new();
for repack_set in &repack_sets {
let mut new_bloblog = pwc.pile.create_bloblog().await?;
for (old_bloblog_id, old_bloblog_stats) in repack_set {
let index_id = old_bloblog_stats.in_index;
if !indices_buffer.contains_key(&index_id) {
indices_buffer.insert(index_id, pwc.pile.read_index(index_id).await?);
}
let index_bloblog_entry = indices_buffer
.get_mut(&index_id)
.unwrap()
.bloblogs
.remove(&old_bloblog_id)
.context("bug: no IBE despite rewrite from context of this index")?;
let mut old_bloblog = pwc.pile.read_bloblog(*old_bloblog_id).await?;
let locators: BTreeMap<u64, ChunkId> = index_bloblog_entry
.chunks
.into_iter()
.map(|(blob, locator)| (locator.offset, blob))
.collect();
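// Read back in offset order so the copy is a sequential scan of the old bloblog.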
for chunk_id in locators.into_values() {
let chunk = old_bloblog
.read_chunk(chunk_id)
.await?
.context("bug or corrupt bloblog: promised chunk missing")?;
new_bloblog.write_chunk(chunk_id, &chunk).await?;
}
}
let (_wormpath, new_bloblog_id, new_bloblog_index_info) = new_bloblog.finish().await?;
index_parts.insert(
new_bloblog_id,
IndexBloblogEntry {
chunks: new_bloblog_index_info,
forgotten_bytes: 0,
},
);
}
// 2. Write new indices, but make sure to also write out index entries for unaffected bloblogs
// that appear in the indices we want to replace shortly.
for (_, index) in indices_buffer.iter_mut() {
index_parts.extend(std::mem::take(&mut index.bloblogs));
}
assemble_and_write_indices(&pwc, index_parts).await?;
// 3. Delete old indices
for index_id in indices_buffer.into_keys() {
pwc.pile.delete_index_dangerous_exclusive(index_id).await?;
}
// 4. Delete old bloblogs
for repack_group in repack_sets {
for bloblog_id in repack_group.into_keys() {
pwc.pile
.delete_bloblog_dangerous_exclusive(bloblog_id)
.await?;
}
}
Ok(())
}
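// Sketch (not in the original source) of the full repack flow; assumes an
// exclusive lock and a freshly synchronised local cache.
#[allow(dead_code)]
async fn repack_example(pwc: Arc<PileWithCache<BoxedWormFileProvider>>) -> eyre::Result<()> {
let stats = get_bloblogs_stats(&pwc).await?;
let repack_sets = select_bloblogs_for_repack(stats).await?;
perform_repack(pwc, repack_sets).await?;
Ok(())
}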

yama_cli_readme.txt Normal file

@ -0,0 +1,91 @@
`yama init` → init a yama pile right here, right now
creates:
* config
* directory structure
* master keyring (prompts for password)
`--zstd-dict <dict> | --no-zstd-dict`: choose a Zstd dictionary (or lack thereof)
OR
`yama init --sftp` → interactively create SFTP pile
`yama init --s3` → interactively create S3 pile
creates:
* config (remote)
* directory structure (remote)
* master keyring (local + optionally remote too, prompts for password)
* connection information file (local)
`yama keyring`
`inspect <file>.yamakeyring` → print contents of keyring, ask for password if needed
`new|create <new> [--from <src>] [--with <keylist>] [--no-password]`
create a new keyring based on another one. If `--from` is not specified, it defaults to the master keyring in this directory (`master.yamakeyring`).
if `--no-password` is specified, then the new keyring will be unprotected
if `--with` is specified, then it's either a list of keynames to include (e.g. `r_bloblog_contents`, etc)
or a list of opinionated roles (ALL, config, backup, restore, ...?)
e.g. you might give your server a keyring with:
`yama keyring new myserver.yamakeyring --from master.yamakeyring --with backup --no-password` to allow it to create backups but not read from them
`yama store <source file/dir> [<dest pile/pileconnector dir>:]<pointer name>`
Stores a file/directory into Yama, with the given pointer.
If `--stdin` is passed, then the contents to store are actually read from stdin instead and the provided filename is a fake filename for informational purposes only.
Would be suitable for e.g. `pg_dump | yama store --stdin mydbname.sql`
If `--force` is passed, this can overwrite a pointer name.
I expect we will also have `--exclude` and `--exclude-list` options.
I expect we will also have a `--dry-run` option.
`yama extract [<dest pile/pileconnector dir>:]<pointer name>[/path/to/subtree] (--stdout | <target file/dir>)`
Extracts a file/directory from Yama, from the given pointer.
If `--stdout` is passed, writes to stdout, in which case the input must be just one file.
I expect we will also have `--exclude` and `--exclude-list` options.
I expect we will also have a `--dry-run` option.
`yama mount [<dest pile/pileconnector dir>:]<pointer name>[/path/to/subtree] <target file/dir>`
Mount a pointer as a read-only FUSE filesystem.
`yama check`
Checks consistency of the pile. One of the levels must be specified:
`--pointers`|`-1`: checks that all pointers are valid
`--shallow`|`-2`: checks that all pointers' tree nodes point to chunks that exist.
`--intensive`|`-9`: checks that all chunks have the correct hash, that all indices correctly represent the bloblogs, that all pointers point to valid files in the end, ... as much as possible
`yama lsp [[<dest pile/pileconnector dir>:]<glob>]`
(glob defaults to `.:*`)
Lists pointers in the pile.
If `--deleted` is specified, includes deleted pointers that have yet to be vacuumed.
`yama rmp [<dest pile/pileconnector dir>:]<pointer>`
Deletes pointers, or marks them as deleted.
If `--glob` specified, then `<pointer>` is a glob.
If `--now` is specified, an exclusive lock is required to actually delete the pointer.
If `--now` is *not* specified, then the pointer is merely marked as deleted and this only requires a shared lock.
`yama vacuum`
Vacuums the pile, reclaiming disk space. Holds an exclusive lock over the pile.
Does things like:
- (--pointers) clean up deleted pointers that need to be actually deleted
- (--sweep) scans all pointers to discover all the chunks that are present in bloblogs but not used, then removes them from the indices (possibly slow, but necessary to actually make bloblog repacking possible)
- (--indices) writes new indices to replace existing indices, if the existing indices are not space-efficient
- (--bloblogs) repacks bloblogs that aren't space-efficient, removing unindexed blobs in the process
`--all` for everything.


@ -0,0 +1,16 @@
[package]
name = "yama_localcache"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
sqlx = { version = "0.6.3", features = ["sqlite", "runtime-tokio-rustls"] }
tracing = "0.1.37"
eyre = "0.6.8"
tokio = "1.27.0"
yama_pile = { path = "../yama_pile" }
yama_midlevel_crypto = { path = "../yama_midlevel_crypto" }
itertools = "0.10.5"
async-backtrace = "0.2.6"

yama_localcache/dev_db.sh Executable file

@ -0,0 +1,7 @@
#!/bin/bash
set -eu
dbpath="$(dirname "$0")/testdb.sqlite"
#echo $dbpath
sqlx db create --database-url sqlite:"$dbpath"
sqlx migrate run --database-url sqlite:"$dbpath"


@ -0,0 +1,39 @@
-- Create a local cache of indices.
CREATE TABLE indices (
index_short_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
index_sha256 TEXT NOT NULL
);
CREATE UNIQUE INDEX indices_index_sha256 ON indices(index_sha256);
CREATE TABLE bloblogs (
bloblog_short_id INTEGER PRIMARY KEY NOT NULL,
bloblog_sha256 TEXT NOT NULL
);
CREATE UNIQUE INDEX bloblogs_bloblog_sha256 ON bloblogs(bloblog_sha256);
-- Track the relationship between indices and bloblogs
CREATE TABLE indices_bloblogs (
index_short_id INTEGER NOT NULL REFERENCES indices(index_short_id),
bloblog_short_id INTEGER NOT NULL REFERENCES bloblogs(bloblog_short_id),
forgotten_bytes INTEGER NOT NULL,
PRIMARY KEY (index_short_id, bloblog_short_id)
);
CREATE TABLE blobs (
chunk_id TEXT NOT NULL,
bloblog_short_id INTEGER NOT NULL,
index_short_id INTEGER NOT NULL,
offset INTEGER NOT NULL,
size INTEGER NOT NULL,
PRIMARY KEY (chunk_id, bloblog_short_id, index_short_id),
FOREIGN KEY (index_short_id, bloblog_short_id) REFERENCES indices_bloblogs(index_short_id, bloblog_short_id)
);
CREATE INDEX blobs_bloblog_short_id ON blobs(bloblog_short_id);
CREATE INDEX blobs_index_short_id ON blobs(index_short_id);
CREATE TABLE indices_supersede (
superseded_sha256 TEXT NOT NULL,
successor_sha256 TEXT NOT NULL REFERENCES indices(index_sha256),
PRIMARY KEY (superseded_sha256, successor_sha256)
);

yama_localcache/src/lib.rs Normal file

@ -0,0 +1,423 @@
use eyre::Context;
use itertools::Itertools;
use sqlx::pool::PoolConnection;
use sqlx::sqlite::{
SqliteConnectOptions, SqliteJournalMode, SqlitePoolOptions, SqliteRow, SqliteSynchronous,
};
use sqlx::{query, Connection, Row, Sqlite, SqlitePool};
use std::collections::{BTreeMap, BTreeSet};
use std::path::Path;
use std::str::FromStr;
use std::sync::Arc;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use yama_midlevel_crypto::chunk_id::ChunkId;
use yama_pile::definitions::{BlobLocator, BloblogId, Index, IndexId};
#[derive(Clone)]
pub struct Store {
pool: Arc<SqlitePool>,
writer_semaphore: Arc<Semaphore>,
}
pub struct StoreConnection<const RW: bool> {
/// The underlying 'connection'.
conn: PoolConnection<Sqlite>,
/// Permit to write. Only here so that it is dropped at the correct time.
#[allow(dead_code)]
writer_permit: Option<OwnedSemaphorePermit>,
}
const MAX_SQLITE_CONNECTIONS: u32 = 16;
impl Store {
pub async fn new(path: &Path) -> eyre::Result<Store> {
let pool = SqlitePoolOptions::new()
.max_connections(MAX_SQLITE_CONNECTIONS)
.connect_with(
SqliteConnectOptions::new()
.create_if_missing(true)
.journal_mode(SqliteJournalMode::Wal)
.synchronous(SqliteSynchronous::Normal)
.foreign_keys(true)
.filename(path),
)
.await?;
let store = Store {
pool: Arc::new(pool),
writer_semaphore: Arc::new(Semaphore::new(1)),
};
let mut conn = store.pool.acquire().await?;
// This will run the necessary migrations.
sqlx::migrate!("./migrations").run(&mut conn).await?;
Ok(store)
}
async fn txn<const RW: bool>(&self) -> eyre::Result<StoreConnection<RW>> {
let writer_permit = if RW {
Some(self.writer_semaphore.clone().acquire_owned().await?)
} else {
None
};
let conn = self.pool.acquire().await?;
Ok(StoreConnection {
conn,
writer_permit,
})
}
pub async fn read(&self) -> eyre::Result<StoreConnection<false>> {
self.txn().await
}
pub async fn write(&self) -> eyre::Result<StoreConnection<true>> {
self.txn().await
}
}
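// Minimal usage sketch (not in the original source; the identifiers passed in
// are illustrative). Any number of readers may run concurrently, while writers
// are serialised by the semaphore inside `Store`.
#[allow(dead_code)]
async fn store_usage_example(
store: &Store,
chunk_id: ChunkId,
index_id: IndexId,
index: Arc<Index>,
) -> eyre::Result<()> {
let mut ro = store.read().await?;
let _is_new = ro.is_chunk_new(chunk_id).await?;
let mut rw = store.write().await?;
rw.apply_index(index_id, index).await?;
Ok(())
}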
impl StoreConnection<true> {
/// Store an index into the local index cache.
/// If the index supersedes any others, then those supersedings are stored and the blob entries
/// for the superseded indices are removed.
///
/// Returns true iff the index was new.
pub async fn apply_index(
&mut self,
index_id: IndexId,
index: Arc<Index>,
) -> eyre::Result<bool> {
let index_id_txt = index_id.to_string();
self.conn.transaction(|txn| Box::pin(async move {
let needs_index = query!("
SELECT 1 AS one FROM indices WHERE index_sha256 = ?
", index_id_txt).fetch_optional(&mut *txn).await?.is_none();
if !needs_index {
return Ok(false);
}
let index_short_id = query!("
INSERT INTO indices (index_sha256)
VALUES (?)
RETURNING index_short_id
", index_id_txt).fetch_one(&mut *txn).await?.index_short_id;
for supersede in &index.supersedes {
let supersede_txt = supersede.to_string();
query!("
INSERT INTO indices_supersede (superseded_sha256, successor_sha256)
VALUES (?, ?)
", supersede_txt, index_id_txt).execute(&mut *txn).await?;
if let Some(row) = query!("
SELECT index_short_id FROM indices WHERE index_sha256 = ?
", supersede_txt).fetch_optional(&mut *txn).await? {
// Clear out any chunk entries for the superseded indices.
// This ensures we don't rely on them in the future and also clears up space.
query!("
DELETE FROM blobs WHERE index_short_id = ?
", row.index_short_id).execute(&mut *txn).await?;
}
}
// Check that the index hasn't already been superseded, before adding blobs
let is_superseded = query!("
SELECT 1 as _yes FROM indices_supersede WHERE superseded_sha256 = ?",
index_id_txt
).fetch_optional(&mut *txn).await?.is_some();
if !is_superseded {
for (bloblog_sha256, index_bloblog_entry) in &index.bloblogs {
let bloblog_sha256_txt = bloblog_sha256.to_string();
let bloblog_short_id_opt = query!("
SELECT bloblog_short_id FROM bloblogs WHERE bloblog_sha256 = ?
", bloblog_sha256_txt).fetch_optional(&mut *txn).await?;
let bloblog_short_id = match bloblog_short_id_opt {
None => {
query!("
INSERT INTO bloblogs (bloblog_sha256)
VALUES (?)
RETURNING bloblog_short_id
", bloblog_sha256_txt).fetch_one(&mut *txn).await?.bloblog_short_id
},
Some(row) => row.bloblog_short_id,
};
let forgotten_bytes = index_bloblog_entry.forgotten_bytes as i64;
query!("
INSERT INTO indices_bloblogs (index_short_id, bloblog_short_id, forgotten_bytes)
VALUES (?, ?, ?)
", index_short_id, bloblog_short_id, forgotten_bytes)
.execute(&mut *txn)
.await?;
for (chunk_id, chunk_locator) in index_bloblog_entry.chunks.iter() {
let chunk_id_txt = chunk_id.to_string();
let coffset = chunk_locator.offset as i64;
let clen = chunk_locator.length as i64;
query!("
INSERT INTO blobs (index_short_id, bloblog_short_id, chunk_id, offset, size)
VALUES (?, ?, ?, ?, ?)
", index_short_id, bloblog_short_id, chunk_id_txt, coffset, clen).execute(&mut *txn).await?;
}
}
}
Ok(true)
})).await
}
/// Delete an index from the cache, if the cache was deleted from the pile.
pub async fn delete_index(&mut self, index_id: IndexId) -> eyre::Result<()> {
self.conn
.transaction(|txn| {
Box::pin(async move {
let index_id_txt = index_id.to_string();
query!(
"
DELETE FROM indices_supersede WHERE successor_sha256 = ?
",
index_id_txt
)
.execute(&mut *txn)
.await?;
let index_short_id = query!(
"
SELECT index_short_id FROM indices WHERE index_sha256 = ?
",
index_id_txt
)
.fetch_one(&mut *txn)
.await?
.index_short_id;
query!(
"
DELETE FROM blobs WHERE index_short_id = ?
",
index_short_id
)
.execute(&mut *txn)
.await?;
query!(
"
DELETE FROM indices_bloblogs WHERE index_short_id = ?
",
index_short_id
)
.execute(&mut *txn)
.await?;
query!(
"
DELETE FROM indices WHERE index_short_id = ?
",
index_short_id
)
.execute(&mut *txn)
.await?;
Ok::<_, eyre::Report>(())
})
})
.await?;
Ok(())
}
}
impl<const RW: bool> StoreConnection<RW> {
pub async fn locate_chunk(
&mut self,
chunk_id: ChunkId,
) -> eyre::Result<Option<(BloblogId, BlobLocator)>> {
let chunk_id_text = chunk_id.to_string();
let row_opt = query!(
"
SELECT bl.bloblog_sha256, b.offset, b.size
FROM blobs b
JOIN bloblogs bl USING (bloblog_short_id)
WHERE b.chunk_id = ?
LIMIT 1
",
chunk_id_text
)
.fetch_optional(&mut *self.conn)
.await?;
match row_opt {
None => Ok(None),
Some(row) => {
let bloblog_id =
BloblogId::try_from(row.bloblog_sha256.as_str()).with_context(|| {
format!("failed to decode bloblog ID: {:?}", row.bloblog_sha256)
})?;
Ok(Some((
bloblog_id,
BlobLocator {
offset: row.offset.try_into().context("offset too big")?,
length: row.size.try_into().context("size too big")?,
},
)))
}
}
}
/// Returns all chunk locations.
/// If a chunk does not exist, it is just not returned in the output map.
pub async fn locate_chunks(
&mut self,
chunk_ids: &BTreeSet<ChunkId>,
) -> eyre::Result<BTreeMap<ChunkId, (BloblogId, BlobLocator)>> {
let mut out = BTreeMap::new();
for batch in &chunk_ids.iter().chunks(64) {
let chunk_id_texts: Vec<String> = batch.map(|ci| ci.to_string()).collect();
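// Build a "(?,?,...,?)" placeholder list matching the size of this batch.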
let query_param_str = format!("({})", &",?".repeat(chunk_id_texts.len())[1..]);
let sql = format!(
"
SELECT b.chunk_id, bl.bloblog_sha256, b.offset, b.size
FROM blobs b
JOIN bloblogs bl USING (bloblog_short_id)
WHERE b.chunk_id IN {query_param_str}
"
);
let mut q = query(&sql);
for chunk_id in &chunk_id_texts {
q = q.bind(chunk_id);
}
let rows = q
.map(|row: SqliteRow| {
Ok::<_, eyre::Report>((
ChunkId::from_str(row.get::<&str, _>(0))?,
BloblogId::try_from(row.get::<&str, _>(1))?,
row.get::<i64, _>(2),
row.get::<i64, _>(3),
))
})
.fetch_all(&mut *self.conn)
.await?;
for row in rows {
let (chunk_id, bloblog_id, offset, size) = row?;
out.insert(
chunk_id,
(
bloblog_id,
BlobLocator {
offset: offset as u64,
length: size as u64,
},
),
);
}
}
Ok(out)
}
pub async fn list_indices(&mut self) -> eyre::Result<BTreeSet<IndexId>> {
let row_results = query!(
"
SELECT index_sha256 FROM indices
"
)
.map(|row| {
IndexId::try_from(row.index_sha256.as_ref())
.context("failed to decode IndexId in local cache")
})
.fetch_all(&mut *self.conn)
.await?;
row_results.into_iter().collect()
}
pub async fn list_bloblogs(&mut self) -> eyre::Result<BTreeSet<BloblogId>> {
let row_results = query!(
"
SELECT bloblog_sha256 FROM bloblogs
"
)
.map(|row| {
BloblogId::try_from(row.bloblog_sha256.as_ref())
.context("failed to decode BloblogId in local cache")
})
.fetch_all(&mut *self.conn)
.await?;
row_results.into_iter().collect()
}
pub async fn is_chunk_new(&mut self, chunk_id: ChunkId) -> eyre::Result<bool> {
let chunk_id_text = chunk_id.to_string();
let is_new = query!(
"
SELECT 1 AS _yes FROM blobs WHERE chunk_id = ?
",
chunk_id_text
)
.fetch_optional(&mut *self.conn)
.await?
.is_none();
Ok(is_new)
}
pub async fn list_chunks_in_index(
&mut self,
index_id: IndexId,
) -> eyre::Result<BTreeSet<ChunkId>> {
let index_id_text = index_id.to_string();
let row_results = query!(
"
SELECT chunk_id AS \"chunk_id!\" FROM indices i
JOIN blobs b USING (index_short_id)
WHERE index_sha256 = ?
",
index_id_text
)
.map(|row| {
ChunkId::from_str(&row.chunk_id).context("failed to decode ChunkId in local cache")
})
.fetch_all(&mut *self.conn)
.await?;
row_results.into_iter().collect()
}
pub async fn index_bloblog_stats(
&mut self,
index_id: IndexId,
) -> eyre::Result<BTreeMap<BloblogId, BloblogStats>> {
let index_id_text = index_id.to_string();
let row_results = query!("
SELECT bloblog_sha256 AS bloblog_id, ib.forgotten_bytes AS forgotten_bytes, COUNT(size) AS \"num_chunks!: i64\", SUM(size) AS \"num_bytes!: i64\" FROM indices i
LEFT JOIN indices_bloblogs ib USING (index_short_id)
LEFT JOIN bloblogs b USING (bloblog_short_id)
LEFT JOIN blobs USING (index_short_id, bloblog_short_id)
WHERE index_sha256 = ?
GROUP BY bloblog_sha256
", index_id_text)
.map(|row| {
Ok((BloblogId::try_from(row.bloblog_id.as_ref())?, BloblogStats {
in_index: index_id,
blob_size: row.num_bytes as u64,
forgotten_bytes: row.forgotten_bytes as u64,
num_chunks: row.num_chunks as u32,
}))
})
.fetch_all(&mut *self.conn)
.await?;
row_results.into_iter().collect()
}
}
#[derive(Clone, Debug)]
pub struct BloblogStats {
pub in_index: IndexId,
pub blob_size: u64,
pub forgotten_bytes: u64,
pub num_chunks: u32,
}


@ -0,0 +1,35 @@
[package]
name = "yama_midlevel_crypto"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
serde = { version = "1.0.159", features = ["derive"] }
ciborium = "0.2.0"
blake3 = "1.3.3"
# Unauthenticated symmetric seekable stream constructions
chacha20 = "0.9.1"
x25519-dalek = { version = "2.0.0-rc.2", features = ["serde", "static_secrets"] }
poly1305 = "0.8.0"
ed25519-dalek = { version = "2.0.0-rc.2", features = ["serde", "rand_core"] }
# Hybrid quantum-resistant asymmetric 'key encapsulation' mechanisms
pqc_kyber = { version = "0.5.0", features = ["kyber1024"] }
#alkali = "0.3.0"
rand = "0.8.5"
eyre = "0.6.8"
# 0.12.3+zstd.1.5.2
zstd = "0.12.3"
hex = "0.4.3"
argon2 = { version = "0.4.1", default-features = false, features = ["alloc", "std"] }


@ -0,0 +1,92 @@
use ed25519_dalek::SIGNATURE_LENGTH;
use serde::{Deserialize, Serialize};
use std::marker::PhantomData;
use crate::asym_keyx::{AsymKeyExchange, DecryptingKey, EncryptingKey, KEY_EXCHANGE_LENGTH};
use crate::asym_signed::{SignedBytes, SigningKey, VerifyingKey};
use crate::byte_layer::ByteLayer;
use crate::sym_box::{SymBox, SymKey};
/// A locked box storing something using asymmetric cryptography.
///
/// For key encapsulation: x25519 and kyber (quantum-resistant)
/// For signing: ed25519 (not quantum-resistant)
///
#[derive(Clone, Serialize, Deserialize)]
#[serde(transparent)]
pub struct AsymBox<T> {
payload: SignedBytes,
#[serde(skip, default)]
_phantom: PhantomData<T>,
}
/// A public key, needed to lock an AsymBox or verify the signature when unlocking an AsymBox.
#[derive(Clone, Serialize, Deserialize)]
pub struct AsymPublicKey {
verify: VerifyingKey,
encrypt: EncryptingKey,
}
/// A private key, needed to open an AsymBox or to sign an AsymBox that is being locked.
#[derive(Clone, Serialize, Deserialize)]
pub struct AsymPrivateKey {
sign: SigningKey,
decrypt: DecryptingKey,
}
impl<T> ByteLayer for AsymBox<T> {
fn from_byte_vec(bytes: Vec<u8>) -> Self {
Self {
payload: SignedBytes::from_bytes_vec_assumed(bytes),
_phantom: Default::default(),
}
}
fn into_byte_vec(self) -> Vec<u8> {
self.payload.into_bytes_vec()
}
}
impl<T: ByteLayer> AsymBox<T> {
// TODO error
pub fn unlock(
self,
receiver_decrypt_key: &DecryptingKey,
sender_verify_key: &VerifyingKey,
) -> Option<T> {
let verified_payload = self.payload.into_verified(sender_verify_key)?;
let cutoff = verified_payload.len() - KEY_EXCHANGE_LENGTH;
let key_exchanger_bytes = &verified_payload[cutoff..];
let key_exchanger = AsymKeyExchange::load_from_bytes(key_exchanger_bytes)
.expect("can't load AsymKeyExchange");
let exchanged = key_exchanger.unlock(receiver_decrypt_key)?;
let symkey = SymKey::from(exchanged);
let symbox: SymBox<T> = SymBox::new_from_raw(&verified_payload[0..cutoff]);
symbox.unlock(&symkey)
}
}
impl<T: ByteLayer> AsymBox<T> {
// TODO error
pub fn new(
contents: T,
sender_signing_key: &SigningKey,
receiver_encrypt_key: &EncryptingKey,
) -> Option<Self> {
let (key_exchanger, exchanged) = AsymKeyExchange::lock(receiver_encrypt_key);
let symkey = SymKey::from(exchanged);
let mut signed_bytes = SymBox::new(contents, &symkey)?.into_vec();
signed_bytes.reserve(KEY_EXCHANGE_LENGTH + SIGNATURE_LENGTH);
signed_bytes.extend_from_slice(key_exchanger.as_bytes());
let signed = SignedBytes::new(signed_bytes, sender_signing_key);
Some(Self {
payload: signed,
_phantom: Default::default(),
})
}
}
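// A round-trip sketch (not in the original source), using the keypair helpers
// from the sibling modules.
#[cfg(test)]
mod asym_box_sketch {
use super::AsymBox;
use crate::asym_keyx::generate_asym_keypair;
use crate::asym_signed::asym_signing_keypair;
#[test]
fn round_trip() {
let (encrypt_key, decrypt_key) = generate_asym_keypair();
let (signing_key, verifying_key) = asym_signing_keypair();
// Vec<u8> implements ByteLayer, so it can be boxed directly.
let boxed: AsymBox<Vec<u8>> =
AsymBox::new(b"hello".to_vec(), &signing_key, &encrypt_key).expect("lock");
let opened = boxed.unlock(&decrypt_key, &verifying_key).expect("unlock");
assert_eq!(opened, b"hello".to_vec());
}
}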


@ -0,0 +1,149 @@
use crate::keys_kyber::{kyber_keypair, KyberPrivateKey, KyberPublicKey};
use crate::keys_x25519::{x25519_keypair, X25519PrivateKey, X25519PublicKey};
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::io::Read;
// x25519 ephemeral pubkey (32) + kyber (1568)
pub const KEY_EXCHANGE_LENGTH: usize = 32 + 1568;
#[derive(Clone, Serialize, Deserialize)]
#[serde(transparent)]
pub struct AsymKeyExchange<'bytes> {
inner: Cow<'bytes, [u8]>,
}
#[derive(Clone, Serialize, Deserialize, PartialEq)]
pub struct EncryptingKey {
x25519: X25519PublicKey,
kyber: KyberPublicKey,
}
#[derive(Clone, Serialize, Deserialize, PartialEq)]
pub struct DecryptingKey {
x25519: X25519PrivateKey,
kyber: KyberPrivateKey,
x25519_pub: X25519PublicKey,
kyber_pub: KyberPublicKey,
}
pub fn generate_asym_keypair() -> (EncryptingKey, DecryptingKey) {
let (x25519_pub, x25519_priv) = x25519_keypair();
let (kyber_pub, kyber_priv) = kyber_keypair();
(
EncryptingKey {
x25519: x25519_pub.clone(),
kyber: kyber_pub.clone(),
},
DecryptingKey {
x25519: x25519_priv,
kyber: kyber_priv,
x25519_pub,
kyber_pub,
},
)
}
pub struct Exchanged(pub(crate) [u8; 64]);
impl<'bytes> AsymKeyExchange<'bytes> {
pub fn load_from_bytes(bytes: &'bytes [u8]) -> Option<Self> {
if bytes.len() != KEY_EXCHANGE_LENGTH {
return None;
}
Some(Self {
inner: Cow::Borrowed(bytes),
})
}
pub fn as_bytes(&self) -> &[u8] {
self.inner.as_ref()
}
pub fn lock(ek: &EncryptingKey) -> (AsymKeyExchange, Exchanged) {
let mut public_bytes = Vec::with_capacity(KEY_EXCHANGE_LENGTH);
let mut rand = rand::thread_rng();
// X25519
let ephemeral_privkey = x25519_dalek::EphemeralSecret::random_from_rng(&mut rand);
let ephemeral_pubkey = x25519_dalek::PublicKey::from(&ephemeral_privkey);
public_bytes.extend_from_slice(ephemeral_pubkey.as_bytes());
let shared_secret_x25519 = ephemeral_privkey.diffie_hellman(&ek.x25519.inner);
// Kyber
let kyber = ek.kyber.encapsulate(&mut rand);
public_bytes.extend_from_slice(&kyber.public_bytes);
assert_eq!(public_bytes.len(), KEY_EXCHANGE_LENGTH);
let exchanged = Self::perform_exchange(
&public_bytes,
&ek.x25519,
&ek.kyber,
shared_secret_x25519.as_bytes(),
&kyber.shared_secret,
);
(
AsymKeyExchange {
inner: Cow::Owned(public_bytes),
},
exchanged,
)
}
fn perform_exchange(
public_bytes: &[u8],
rx_x25519_pub: &X25519PublicKey,
rx_kyber_pub: &KyberPublicKey,
x25519_ss: &[u8; 32],
kyber_ss: &[u8; 32],
) -> Exchanged {
assert_eq!(public_bytes.len(), KEY_EXCHANGE_LENGTH);
let mut hasher = blake3::Hasher::new_derive_key("yama AsymKeyExchange");
// Includes the pubkeys of the writer
hasher.update(public_bytes);
// Include the pubkeys of the receiver
hasher.update(rx_x25519_pub.as_bytes());
hasher.update(rx_kyber_pub.as_bytes());
// Include what was exchanged
hasher.update(x25519_ss);
hasher.update(kyber_ss);
let mut exchanged_bytes = [0u8; 64];
let mut out = hasher.finalize_xof();
out.read_exact(&mut exchanged_bytes)
.expect("failed to read 64b from blake3");
Exchanged(exchanged_bytes)
}
pub fn unlock(&self, dk: &DecryptingKey) -> Option<Exchanged> {
if self.inner.len() != KEY_EXCHANGE_LENGTH {
return None;
}
// X25519
let ephemeral_x25519_pubkey_bytes: &[u8; 32] = &self.inner[0..32].try_into().unwrap();
let ephemeral_x25519_pubkey = x25519_dalek::PublicKey::from(*ephemeral_x25519_pubkey_bytes);
let shared_secret_x25519 = dk.x25519.inner.diffie_hellman(&ephemeral_x25519_pubkey);
// Kyber
let kyber_ciphertext_bytes: &[u8; 1568] = &self.inner[32..].try_into().unwrap();
let kyber = dk.kyber.decapsulate(&kyber_ciphertext_bytes);
Some(Self::perform_exchange(
&self.inner,
&dk.x25519_pub,
&dk.kyber_pub,
shared_secret_x25519.as_bytes(),
&kyber,
))
}
}


@ -0,0 +1,138 @@
use serde::de::Error as DeError;
use serde::{Deserialize, Deserializer, Serialize, Serializer};

use ed25519_dalek::{
    Signature, Signer, SigningKey as Ed25519PrivateKey, Verifier, VerifyingKey as Ed25519PublicKey,
};
pub use ed25519_dalek::SIGNATURE_LENGTH;
use rand::thread_rng;

/// A byte string with an Ed25519 signature appended to it.
#[derive(Clone, Serialize, Deserialize)]
#[serde(transparent)]
pub struct SignedBytes {
    inner: Vec<u8>,
}

#[derive(Clone)]
pub struct SigningKey {
    ed25519: Ed25519PrivateKey,
}

impl PartialEq for SigningKey {
    fn eq(&self, other: &Self) -> bool {
        self.ed25519.to_keypair_bytes() == other.ed25519.to_keypair_bytes()
    }
}

#[derive(Clone)]
pub struct VerifyingKey {
    ed25519: Ed25519PublicKey,
}

impl PartialEq for VerifyingKey {
    fn eq(&self, other: &Self) -> bool {
        self.ed25519.as_bytes() == other.ed25519.as_bytes()
    }
}

/// Generates a fresh Ed25519 signing/verifying keypair.
pub fn asym_signing_keypair() -> (SigningKey, VerifyingKey) {
    let mut rng = thread_rng();
    let keypair = ed25519_dalek::SigningKey::generate(&mut rng);
    (
        SigningKey {
            ed25519: keypair.clone(),
        },
        VerifyingKey {
            ed25519: keypair.verifying_key(),
        },
    )
}

impl Serialize for SigningKey {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let ed25519 = self.ed25519.to_bytes();
        <[u8]>::serialize(&ed25519, serializer)
    }
}

impl<'d> Deserialize<'d> for SigningKey {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'d>,
    {
        let bytes = Vec::<u8>::deserialize(deserializer)?;
        let mut ed25519 = [0u8; 32];
        if bytes.len() != ed25519.len() {
            return Err(D::Error::custom("wrong length of ed25519 key"));
        }
        ed25519.copy_from_slice(&bytes);
        Ok(SigningKey {
            ed25519: Ed25519PrivateKey::from_bytes(&ed25519),
        })
    }
}

impl Serialize for VerifyingKey {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        (self.ed25519.as_bytes() as &[u8]).serialize(serializer)
    }
}

impl<'d> Deserialize<'d> for VerifyingKey {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'d>,
    {
        let bytes = Vec::<u8>::deserialize(deserializer)?;
        let mut ed25519 = [0u8; 32];
        if bytes.len() != ed25519.len() {
            return Err(D::Error::custom("wrong length of ed25519 key"));
        }
        ed25519.copy_from_slice(&bytes);
        Ok(VerifyingKey {
            ed25519: Ed25519PublicKey::from_bytes(&ed25519).map_err(D::Error::custom)?,
        })
    }
}

impl SignedBytes {
    /// Signs `bytes` and appends the 64-byte signature to them.
    pub fn new(mut bytes: Vec<u8>, sign_with: &SigningKey) -> SignedBytes {
        let signature = sign_with.ed25519.sign(&bytes);
        let sig = signature.to_bytes();
        assert_eq!(sig.len(), SIGNATURE_LENGTH);
        bytes.extend(sig);
        SignedBytes { inner: bytes }
    }

    /// Verifies the trailing signature and, if valid, returns the payload with
    /// the signature stripped off. Returns `None` on failure.
    pub fn into_verified(mut self, verify_with: &VerifyingKey) -> Option<Vec<u8>> {
        if self.inner.len() < SIGNATURE_LENGTH {
            return None;
        }
        let (payload, sig) = self.inner.split_at(self.inner.len() - SIGNATURE_LENGTH);
        let sig = Signature::from_bytes(sig.try_into().expect("wrong split"));
        verify_with.ed25519.verify(payload, &sig).ok()?;
        self.inner.drain(self.inner.len() - SIGNATURE_LENGTH..);
        Some(self.inner)
    }

    pub fn into_bytes_vec(self) -> Vec<u8> {
        self.inner
    }

    pub fn from_bytes_vec_assumed(inner: Vec<u8>) -> Self {
        Self { inner }
    }
}
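
A sign-then-verify roundtrip sketch for these types (illustrative, not part of the diff):

    // Sketch: sign a payload, then verify and strip the signature.
    let (signing_key, verifying_key) = asym_signing_keypair();
    let signed = SignedBytes::new(b"hello yama".to_vec(), &signing_key);
    let payload = signed
        .into_verified(&verifying_key)
        .expect("signature should verify");
    assert_eq!(payload, b"hello yama");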

View File

@ -0,0 +1,55 @@
use serde::de::DeserializeOwned;
use serde::Serialize;
use std::marker::PhantomData;

/// Trait to help layering byte transformers together.
pub trait ByteLayer {
    fn from_byte_vec(bytes: Vec<u8>) -> Self;
    fn into_byte_vec(self) -> Vec<u8>;
}

/// A CBOR-encoded value of type `T`, held as raw bytes.
#[derive(Clone)]
pub struct CborSerde<T> {
    bytes: Vec<u8>,
    marker: PhantomData<T>,
}

impl<T: Serialize + DeserializeOwned> ByteLayer for CborSerde<T> {
    fn from_byte_vec(bytes: Vec<u8>) -> Self {
        CborSerde {
            bytes,
            marker: PhantomData,
        }
    }

    fn into_byte_vec(self) -> Vec<u8> {
        self.bytes
    }
}

impl<T: Serialize> CborSerde<T> {
    pub fn serialise(from: &T) -> eyre::Result<Self> {
        let mut bytes = Vec::new();
        ciborium::ser::into_writer(from, &mut bytes)?;
        Ok(CborSerde {
            bytes,
            marker: PhantomData,
        })
    }
}

impl<T: DeserializeOwned> CborSerde<T> {
    pub fn deserialise(&self) -> eyre::Result<T> {
        Ok(ciborium::de::from_reader(&self.bytes[..])?)
    }
}

impl ByteLayer for Vec<u8> {
    fn from_byte_vec(bytes: Vec<u8>) -> Self {
        bytes
    }

    fn into_byte_vec(self) -> Vec<u8> {
        self
    }
}
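
A roundtrip sketch for `CborSerde` (illustrative; `Demo` is a made-up type, and the `ByteLayer` trait is in scope within this module):

    // Sketch: CBOR-encode a value, drop to raw bytes, and decode it again.
    #[derive(serde::Serialize, serde::Deserialize, Debug, PartialEq)]
    struct Demo {
        n: u32,
    }

    fn demo_roundtrip() -> eyre::Result<()> {
        let bytes = CborSerde::serialise(&Demo { n: 7 })?.into_byte_vec();
        let back: Demo = CborSerde::<Demo>::from_byte_vec(bytes).deserialise()?;
        assert_eq!(back, Demo { n: 7 });
        Ok(())
    }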

View File

@ -0,0 +1,91 @@
use eyre::bail;
use rand::{thread_rng, Rng};
use serde::{Deserialize, Serialize};
use std::fmt::{Debug, Display, Formatter};
use std::str::FromStr;

/// The ID of a chunk. It's a 256-bit keyed BLAKE3 hash.
#[derive(Copy, Clone, Serialize, Deserialize, Ord, PartialOrd, Eq, PartialEq, Hash)]
#[repr(transparent)]
#[serde(transparent)]
pub struct ChunkId {
    blake3: [u8; 32],
}

impl Debug for ChunkId {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        for &byte in self.blake3.iter() {
            write!(f, "{:02x}", byte)?;
        }
        Ok(())
    }
}

impl Display for ChunkId {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str(&hex::encode(self.blake3))
    }
}

impl From<[u8; 32]> for ChunkId {
    fn from(bytes: [u8; 32]) -> Self {
        ChunkId { blake3: bytes }
    }
}

impl FromStr for ChunkId {
    type Err = eyre::Report;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s.len() != 64 {
            bail!("chunk ID of wrong length");
        }
        let decoded = hex::decode(s)?;
        let mut new = ChunkId {
            blake3: Default::default(),
        };
        new.blake3.copy_from_slice(&decoded);
        Ok(new)
    }
}

impl ChunkId {
    pub fn to_bytes(self) -> [u8; 32] {
        self.blake3
    }
}

/// Key needed to create and verify chunk IDs. It's a 256-bit key for the BLAKE3 keyed hash function.
#[derive(Copy, Clone, Serialize, Deserialize)]
pub struct ChunkIdKey {
    key: [u8; 32],
}

impl ChunkIdKey {
    pub fn new_rand() -> ChunkIdKey {
        let mut rng = thread_rng();
        ChunkIdKey { key: rng.gen() }
    }
}

impl Debug for ChunkIdKey {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        // For privacy, don't leak the contents of the key.
        write!(f, "ChunkIdKey(...)")
    }
}

impl ChunkId {
    /// Computes a chunk ID given the input and key.
    pub fn compute(input: &[u8], key: &ChunkIdKey) -> ChunkId {
        ChunkId {
            blake3: blake3::keyed_hash(&key.key, input).into(),
        }
    }

    /// Returns true iff this chunk ID is correct for the given input and key.
    pub fn verify(&self, input: &[u8], key: &ChunkIdKey) -> bool {
        let comparison = Self::compute(input, key);
        self == &comparison
    }
}
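
A sketch of how these pieces compose (illustrative only):

    // Sketch: compute a keyed chunk ID, verify it, and roundtrip through hex.
    let key = ChunkIdKey::new_rand();
    let id = ChunkId::compute(b"some chunk payload", &key);
    assert!(id.verify(b"some chunk payload", &key));
    let reparsed: ChunkId = id.to_string().parse().expect("64 hex chars");
    assert_eq!(reparsed, id);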

View File

@ -0,0 +1,72 @@
use crate::sym_box::SymKey;
use argon2::Algorithm::Argon2id;
use argon2::{Argon2, Version};
use eyre::{bail, Context};
use rand::{thread_rng, Rng};
use serde::{Deserialize, Serialize};

/// 1 GiB. Intended to prevent maliciously large memory costs; not sure if that's a real risk.
pub const MAX_MEMORY_COST_KIBIBYTES: u32 = 1048576;

/// 512 MiB
pub const DEFAULT_MEMORY_COST_KIBIBYTES: u32 = 524288;

pub const DEFAULT_LANES: u32 = 1;
pub const DEFAULT_ITERATIONS: u32 = 256;

/// Argon2id parameters (including the salt) needed to re-derive a key from a
/// password. These are stored alongside the data they protect.
#[derive(Clone, Serialize, Deserialize)]
pub struct KeyDerivationParameters {
    salt: [u8; 32],
    iterations: u32,
    memory_kibibytes: u32,
    lanes: u32,
}

impl KeyDerivationParameters {
    /// Creates parameters with a fresh random salt and the default costs.
    pub fn new_recommended() -> KeyDerivationParameters {
        let mut rng = thread_rng();
        KeyDerivationParameters {
            salt: rng.gen(),
            iterations: DEFAULT_ITERATIONS,
            memory_kibibytes: DEFAULT_MEMORY_COST_KIBIBYTES,
            lanes: DEFAULT_LANES,
        }
    }

    /// Derives a 64-byte key from `password` using Argon2id.
    pub fn derive(&self, password: &str) -> eyre::Result<DerivedKey> {
        if self.memory_kibibytes > MAX_MEMORY_COST_KIBIBYTES {
            bail!(
                "Too much memory needed for key derivation! {} > {}",
                self.memory_kibibytes,
                MAX_MEMORY_COST_KIBIBYTES
            );
        }
        let mut params = argon2::ParamsBuilder::new();
        params
            .m_cost(self.memory_kibibytes)
            .unwrap()
            .p_cost(self.lanes)
            .unwrap()
            .t_cost(self.iterations)
            .unwrap()
            .output_len(64)
            .unwrap();
        let params = params.params().unwrap();
        let argon = Argon2::new(Argon2id, Version::V0x13, params.clone());

        let mut derived_key = DerivedKey([0; 64]);
        argon
            .hash_password_into(password.as_bytes(), &self.salt, &mut derived_key.0)
            .context("failed to hash password")?;
        Ok(derived_key)
    }
}

pub struct DerivedKey(pub(crate) [u8; 64]);

impl DerivedKey {
    pub fn into_symkey(self) -> SymKey {
        SymKey::from(self)
    }
}
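
A sketch of the intended call pattern (illustrative; the password literal is a stand-in, and in real use the parameters must be persisted so the key can be re-derived):

    // Sketch: derive a symmetric key from a password.
    fn derive_demo() -> eyre::Result<SymKey> {
        let params = KeyDerivationParameters::new_recommended();
        // `params` would be serialised and stored alongside the encrypted data.
        Ok(params.derive("correct horse battery staple")?.into_symkey())
    }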

View File

@ -0,0 +1,109 @@
use pqc_kyber::CryptoRng;
use rand::{thread_rng, RngCore};
use serde::de::Error;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::borrow::Cow;

#[derive(Clone, Eq, PartialEq)]
pub struct KyberPublicKey {
    inner: pqc_kyber::PublicKey,
}

#[derive(Clone, Eq, PartialEq)]
pub struct KyberPrivateKey {
    inner: pqc_kyber::SecretKey,
}

/// Generates a fresh Kyber keypair.
pub fn kyber_keypair() -> (KyberPublicKey, KyberPrivateKey) {
    let mut rng = thread_rng();
    let kyber_pair = pqc_kyber::keypair(&mut rng);
    (
        KyberPublicKey {
            inner: kyber_pair.public,
        },
        KyberPrivateKey {
            inner: kyber_pair.secret,
        },
    )
}

impl Serialize for KyberPublicKey {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        <[u8]>::serialize(&self.inner, serializer)
    }
}

impl<'d> Deserialize<'d> for KyberPublicKey {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'d>,
    {
        let bytes = Cow::<'d, [u8]>::deserialize(deserializer)?;
        let mut kyber: pqc_kyber::PublicKey = [0; pqc_kyber::KYBER_PUBLICKEYBYTES];
        if kyber.len() != bytes.len() {
            return Err(D::Error::custom("wrong length of kyber key"));
        }
        kyber.copy_from_slice(&bytes);
        Ok(KyberPublicKey { inner: kyber })
    }
}

impl KyberPublicKey {
    pub fn as_bytes(&self) -> &[u8; pqc_kyber::KYBER_PUBLICKEYBYTES] {
        &self.inner
    }
}

impl Serialize for KyberPrivateKey {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        <[u8]>::serialize(&self.inner, serializer)
    }
}

impl<'d> Deserialize<'d> for KyberPrivateKey {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'d>,
    {
        let bytes = Cow::<'d, [u8]>::deserialize(deserializer)?;
        let mut kyber: pqc_kyber::SecretKey = [0; pqc_kyber::KYBER_SECRETKEYBYTES];
        if kyber.len() != bytes.len() {
            return Err(D::Error::custom("wrong length of kyber key"));
        }
        kyber.copy_from_slice(&bytes);
        Ok(KyberPrivateKey { inner: kyber })
    }
}

/// Result of encapsulation: the shared secret we keep, plus the ciphertext
/// that is sent to (or stored for) the holder of the private key.
pub struct KyberEncapsulation {
    pub shared_secret: pqc_kyber::SharedSecret,
    pub public_bytes: [u8; pqc_kyber::KYBER_CIPHERTEXTBYTES],
}

impl KyberPublicKey {
    pub fn encapsulate<T: RngCore + CryptoRng>(&self, rng: &mut T) -> KyberEncapsulation {
        let (public_bytes, shared_secret) =
            pqc_kyber::encapsulate(&self.inner, rng).expect("bad kyber encapsulation");
        KyberEncapsulation {
            shared_secret,
            public_bytes,
        }
    }
}

impl KyberPrivateKey {
    pub fn decapsulate(
        &self,
        ciphertext: &[u8; pqc_kyber::KYBER_CIPHERTEXTBYTES],
    ) -> pqc_kyber::SharedSecret {
        pqc_kyber::decapsulate(ciphertext, &self.inner).expect("bad kyber decapsulation")
    }
}
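
A KEM roundtrip sketch (illustrative only): both sides arrive at the same shared secret.

    // Sketch: encapsulate against the public key, decapsulate with the private key.
    let (kyber_pub, kyber_priv) = kyber_keypair();
    let mut rng = rand::thread_rng();
    let encapsulation = kyber_pub.encapsulate(&mut rng);
    let shared_secret = kyber_priv.decapsulate(&encapsulation.public_bytes);
    assert_eq!(shared_secret, encapsulation.shared_secret);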

View File

@ -0,0 +1,82 @@
use rand::thread_rng;
use serde::de::Error as _DeError;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::borrow::Cow;

#[derive(Clone)]
#[repr(transparent)]
pub struct X25519PrivateKey {
    pub(crate) inner: x25519_dalek::StaticSecret,
}

impl PartialEq for X25519PrivateKey {
    fn eq(&self, other: &Self) -> bool {
        self.inner.as_bytes() == other.inner.as_bytes()
    }
}

#[derive(Clone, PartialEq)]
#[repr(transparent)]
pub struct X25519PublicKey {
    pub(crate) inner: x25519_dalek::PublicKey,
}

/// Generates a fresh X25519 keypair.
pub fn x25519_keypair() -> (X25519PublicKey, X25519PrivateKey) {
    let mut rng = thread_rng();
    let x25519_priv = x25519_dalek::StaticSecret::random_from_rng(&mut rng);
    let x25519_pub = x25519_dalek::PublicKey::from(&x25519_priv);
    (
        X25519PublicKey { inner: x25519_pub },
        X25519PrivateKey { inner: x25519_priv },
    )
}

impl Serialize for X25519PrivateKey {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        <[u8]>::serialize(self.inner.as_bytes(), serializer)
    }
}

impl<'d> Deserialize<'d> for X25519PrivateKey {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'d>,
    {
        let bytes = Cow::<'d, [u8]>::deserialize(deserializer)?;
        let counted_bytes: [u8; 32] = bytes.as_ref().try_into().map_err(D::Error::custom)?;
        Ok(X25519PrivateKey {
            inner: x25519_dalek::StaticSecret::from(counted_bytes),
        })
    }
}

impl X25519PublicKey {
    pub fn as_bytes(&self) -> &[u8; 32] {
        self.inner.as_bytes()
    }
}

impl Serialize for X25519PublicKey {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        <[u8]>::serialize(self.inner.as_bytes(), serializer)
    }
}

impl<'d> Deserialize<'d> for X25519PublicKey {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'d>,
    {
        let bytes = Cow::<'d, [u8]>::deserialize(deserializer)?;
        let counted_bytes: [u8; 32] = bytes.as_ref().try_into().map_err(D::Error::custom)?;
        Ok(X25519PublicKey {
            inner: x25519_dalek::PublicKey::from(counted_bytes),
        })
    }
}
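
A serde roundtrip sketch for the public key (illustrative; ciborium is the encoder used elsewhere in this diff and is assumed available here):

    // Sketch: serialise an X25519 public key to CBOR and read it back.
    fn x25519_serde_demo() -> eyre::Result<()> {
        let (public_key, _private_key) = x25519_keypair();
        let mut buf = Vec::new();
        ciborium::ser::into_writer(&public_key, &mut buf)?;
        let decoded: X25519PublicKey = ciborium::de::from_reader(&buf[..])?;
        assert_eq!(decoded.as_bytes(), public_key.as_bytes());
        Ok(())
    }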

View File

@ -0,0 +1,17 @@
pub mod asym_box;
pub mod asym_keyx;
pub mod asym_signed;
pub mod keys_kyber;
pub mod keys_x25519;
pub mod sym_box;
pub mod sym_stream;
pub mod chunk_id;
pub mod zstd_box;
pub mod byte_layer;
pub mod key_derivation;

View File

@ -0,0 +1,142 @@
use crate::asym_keyx::Exchanged;
use crate::byte_layer::ByteLayer;
use crate::key_derivation::DerivedKey;
use chacha20::cipher::{KeyIvInit, StreamCipher};
use chacha20::XChaCha20;
use poly1305::universal_hash::KeyInit;
use poly1305::Poly1305;
use rand::Rng;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::marker::PhantomData;

pub const SYMBOX_NONCE_LENGTH: usize = 24;
pub const SYMBOX_MAC_LENGTH: usize = 16;
pub const SYMBOX_FOOTER_LENGTH: usize = SYMBOX_MAC_LENGTH + SYMBOX_NONCE_LENGTH;

/// An XChaCha20-encrypted, Poly1305-authenticated box holding a `T`.
#[derive(Clone, Serialize, Deserialize)]
#[serde(transparent)]
pub struct SymBox<'bytes, T> {
    // payload || nonce || mac
    bytes: Cow<'bytes, [u8]>,
    #[serde(skip)]
    phantom: PhantomData<T>,
}

/// Symmetric key material: an XChaCha20 key plus a Poly1305 key.
#[derive(Clone)]
pub struct SymKey {
    xchacha20: [u8; 32],
    poly1305: poly1305::Key,
}

impl SymKey {
    fn from_64_bytes(input: &[u8]) -> Self {
        let (xchacha20_bytes, poly1305_bytes) = input.split_at(32);
        SymKey {
            xchacha20: xchacha20_bytes.try_into().unwrap(),
            poly1305: *poly1305::Key::from_slice(poly1305_bytes),
        }
    }
}

impl From<Exchanged> for SymKey {
    fn from(exchanged: Exchanged) -> Self {
        SymKey::from_64_bytes(&exchanged.0)
    }
}

impl From<DerivedKey> for SymKey {
    fn from(derived: DerivedKey) -> Self {
        SymKey::from_64_bytes(&derived.0)
    }
}

impl<'bytes, T> SymBox<'bytes, T> {
    pub fn as_bytes(&self) -> &[u8] {
        self.bytes.as_ref()
    }

    pub fn new_from_raw(bytes: &'bytes [u8]) -> Self {
        Self {
            bytes: Cow::Borrowed(bytes),
            phantom: PhantomData,
        }
    }
}

impl<'bytes, T> ByteLayer for SymBox<'bytes, T> {
    fn from_byte_vec(bytes: Vec<u8>) -> Self {
        Self {
            bytes: Cow::Owned(bytes),
            phantom: PhantomData,
        }
    }

    fn into_byte_vec(self) -> Vec<u8> {
        self.bytes.into_owned()
    }
}

impl<T> SymBox<'static, T> {
    pub fn into_vec(self) -> Vec<u8> {
        match self.bytes {
            Cow::Borrowed(b) => b.to_vec(),
            Cow::Owned(o) => o,
        }
    }
}

impl<'bytes, T: ByteLayer> SymBox<'bytes, T> {
    // TODO error
    /// Verifies the MAC and decrypts, returning `None` if the box is too short
    /// or the MAC does not match.
    pub fn unlock(&self, symkey: &SymKey) -> Option<T> {
        let blen = self.bytes.len();
        if blen < SYMBOX_FOOTER_LENGTH {
            return None;
        }
        let (ciphertext_then_nonce, mac) = self.bytes.split_at(blen - SYMBOX_MAC_LENGTH);
        let (ciphertext, nonce) = ciphertext_then_nonce.split_at(blen - SYMBOX_FOOTER_LENGTH);

        let poly1305_mac = Poly1305::new(&symkey.poly1305).compute_unpadded(ciphertext_then_nonce);
        if poly1305_mac.as_slice() != mac {
            // TODO Should this pedantically be a constant-time equality check?
            // I don't think it matters in any exploitable way for Yama though...
            return None;
        }

        let mut out_buf = Vec::<u8>::with_capacity(ciphertext.len());
        out_buf.extend_from_slice(ciphertext);
        let mut xchacha20: XChaCha20 = XChaCha20::new(&symkey.xchacha20.into(), nonce.into());
        xchacha20.apply_keystream(&mut out_buf);

        let decrypted: Vec<u8> = out_buf;
        Some(T::from_byte_vec(decrypted))
    }

    // TODO error
    /// Encrypts `contents` with a fresh random nonce and appends
    /// `nonce || mac` (encrypt-then-MAC over the ciphertext and nonce).
    pub fn new(contents: T, symkey: &SymKey) -> Option<Self> {
        let unencrypted = contents.into_byte_vec();
        let mut rng = rand::thread_rng();
        let nonce = rng.gen::<[u8; SYMBOX_NONCE_LENGTH]>();

        let mut out_buf = Vec::<u8>::with_capacity(unencrypted.len() + SYMBOX_FOOTER_LENGTH);
        out_buf.extend_from_slice(&unencrypted);
        let mut xchacha20: XChaCha20 = XChaCha20::new(&symkey.xchacha20.into(), &nonce.into());
        xchacha20.apply_keystream(&mut out_buf);
        out_buf.extend_from_slice(&nonce);

        let poly1305_mac = Poly1305::new(&symkey.poly1305).compute_unpadded(&out_buf);
        out_buf.extend_from_slice(poly1305_mac.as_slice());

        Some(SymBox {
            bytes: Cow::Owned(out_buf),
            phantom: PhantomData,
        })
    }
}
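
Finally, a sketch tying the layers together: a password-derived `SymKey` encrypting a CBOR-serialised value (illustrative names; assumes the modules above):

    // Sketch: encrypt-then-MAC roundtrip over a CBOR-encoded string.
    use crate::byte_layer::CborSerde;
    use crate::key_derivation::KeyDerivationParameters;

    fn symbox_demo() -> eyre::Result<()> {
        let symkey = KeyDerivationParameters::new_recommended()
            .derive("hunter2")?
            .into_symkey();
        let boxed: SymBox<'static, CborSerde<String>> =
            SymBox::new(CborSerde::serialise(&"hello".to_string())?, &symkey)
                .expect("encryption is infallible here");
        let opened: String = boxed
            .unlock(&symkey)
            .expect("MAC should verify with the same key")
            .deserialise()?;
        assert_eq!(opened, "hello");
        Ok(())
    }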

Some files were not shown because too many files have changed in this diff.