Add some comments to help maintainability
Some checks failed
continuous-integration/drone the build failed

Closes #42
This commit is contained in:
Olivier 'reivilibre' 2021-09-27 18:36:10 +01:00
parent 5f0e6bf18c
commit 1ce0478b2c
14 changed files with 78 additions and 2 deletions

View File

@ -38,7 +38,8 @@ pub const FASTCDC_MAX: usize = 8 * 1024 * 1024;
/// A chunker that will generate nested chunks of chunk references if there is that much data
/// to store.
/// The root RecursiveChunker is fed data bytes.
/// If it exceeds the nominated threshold, it grows a child RecursiveChunker
/// If it exceeds the nominated threshold, it grows a child RecursiveChunker (which may do the same).
/// When done, `finish` should be called to flush the buffers and obtain a `RecursiveChunkRef`.
pub struct RecursiveChunker<'pile, RP: RawPile> {
/// The pile to submit chunks to.
pile: &'pile Pile<RP>,

View File

@ -132,6 +132,8 @@ pub fn retrieve_tree_node<RP: RawPile>(
*/
}
/// Given a pointer, fully integrates it in-place. The pointer will no longer have a parent when
/// this operation is finished.
pub fn fully_integrate_pointer_node<RP: RawPile>(
pile: &Pile<RP>,
tree_node: &mut TreeNode,
@ -165,6 +167,8 @@ pub fn fully_integrate_pointer_node<RP: RawPile>(
Ok(())
}
/// Loads a pointer and fully integrates it.
/// The result will be a fully-integrated pointer (it won't have a parent).
pub fn fully_load_pointer<RP: RawPile>(
pile: &Pile<RP>,
pointer_name: &str,

View File

@ -37,7 +37,13 @@ pub struct PointerData {
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct RecursiveChunkRef {
    /// The root Chunk ID.
    pub chunk_id: ChunkId,
    /// The depth of the data bytes.
    /// 0 means that the chunk addressed by `chunk_id` contains data bytes.
    /// 1 means that the chunk addressed by `chunk_id` contains references to chunks that contain
    /// data bytes.
    /// (and so on)
    pub depth: u32,
}

View File

@ -46,6 +46,8 @@ impl Write for NullWriter {
}
/// Mark-and-sweep style vacuuming system.
/// We mark all the chunks that we run into (following the structure of all the pointers and
/// recursive chunk references) and sweep the chunks that have not been read.
pub struct VacuumRawPile<RP: RawPile> {
underlying: RP,
vacuum_tracking_enabled: bool,
@ -120,6 +122,11 @@ impl<RP: RawPile> RawPile for VacuumRawPile<RP> {
}
}
/// Runs a full check of a Yama pile. This reads ALL the chunks, which can take a long time.
/// This is also capable of finding and vacuuming unused chunks.
/// This checks:
/// - the integrity of each chunk (assuming an integrity-aware raw pile is used)
/// - the structure of pointers and multi-level chunk references
pub fn check_deep<RP: RawPile>(
pile: Pile<RP>,
vacuum: VacuumMode,

View File

@ -36,6 +36,7 @@ use crate::definitions::{FilesystemOwnership, RecursiveChunkRef, TreeNode};
use crate::pile::{Pile, RawPile};
use std::collections::{BTreeMap, HashMap};
/// Given a fully-integrated root node, extracts the files from the pile.
pub fn extract<RP: RawPile>(
target_path: &Path,
root: &mut TreeNode,
@ -107,6 +108,7 @@ pub fn extract<RP: RawPile>(
Ok(())
}
/// Given the name of a pointer, extracts it.
pub fn extract_from_pointer_name<RP: RawPile>(
target_path: &Path,
pointer_name: &str,
@ -218,6 +220,7 @@ pub fn apply_uid_and_gid_translation_tables(
.expect("Can't fail since we don't fail.");
}
/// A worker thread for extracting
pub fn extract_worker<RP: RawPile>(
pile: &Pile<RP>,
paths: Receiver<(PathBuf, RecursiveChunkRef)>,
@ -240,6 +243,7 @@ pub fn extract_worker<RP: RawPile>(
Ok(())
}
/// A single thread that manages the workers
pub fn manager(
root: &mut TreeNode,
target_path: &Path,
@ -286,6 +290,7 @@ pub fn manager(
Ok(())
}
/// Applies metadata (permissions, mtime, ownership) to files from a tree node.
pub fn apply_metadata(
root: &TreeNode,
target: &Path,

View File

@ -21,6 +21,8 @@ use std::sync::Arc;
/// Pushes chunks (and pointers) from one pile to another.
/// This is a thorough implementation that could be slow but at least should give good confidence.
/// (Presumably we could do better by looking at the pointers that already exist on the destination
/// and only integrating as much as we need to.)
pub fn push_to(
from_pile: Arc<Pile<Arc<Box<dyn RawPile>>>>,
from_rp_bypass: Arc<Box<dyn RawPile>>,

View File

@ -239,6 +239,9 @@ pub fn manager(
Ok(())
}
/// Stores files into the pile, potentially differentiating using a parent pointer (which will be
/// loaded and fully-integrated).
/// This also creates a pointer (which is why this is called `store_fully`).
pub fn store_fully(
pile: &Pile<Box<dyn RawPile>>,
root_dir: &PathBuf,

View File

@ -30,9 +30,13 @@ pub const DECOMPRESS_CAPACITY: usize = 32 * 1024 * 1024;
#[derive(Clone)]
pub struct CompressionSettings {
    /// Raw dictionary to pass to Zstd for compression and decompression.
    pub dictionary: Arc<Vec<u8>>,
    /// The compression level, passed to Zstd.
    pub level: i32,
    /// The number of compressor threads to use.
    pub num_compressors: u32,
    /// The number of decompressor threads to use.
    pub num_decompressors: u32,
}

View File

@ -21,6 +21,14 @@ use sodiumoxide::crypto::secretbox::{Key, Nonce, NONCEBYTES};
use crate::pile::{Keyspace, RawPile};
/// A RawPile that provides encryption of chunk contents.
/// Please note that keys are not currently encrypted, so this scheme is not CPA-secure.
/// It seems easily possible to test the pile for inclusion of a known file (by first chunking it and
/// looking for matching chunk IDs).
/// Use of compression with a custom Zstd dictionary may make that harder, but in general it seems
/// dubious to rely on that.
/// This feature will be revisited soon...
/// Notably, keys should be passed through a secure permutation first.
pub struct RawPileEncryptor<R: RawPile> {
underlying: R,
secret_key: Key,
@ -28,6 +36,9 @@ pub struct RawPileEncryptor<R: RawPile> {
impl<R: RawPile> RawPileEncryptor<R> {
pub fn new(underlying: R, key: Key) -> Self {
warn!(
"WARNING! Encrypted RawPiles are not CPA secure. Do not rely on them for security yet!"
);
RawPileEncryptor {
underlying,
secret_key: key,

View File

@ -23,6 +23,8 @@ use crate::definitions::XXH64_SEED;
use crate::pile::{Keyspace, RawPile};
use crate::utils::bytes_to_hexstring;
/// This RawPile enables checking the integrity of stored chunks.
/// This is done by storing a hash along with the chunk contents, which can later be verified.
pub struct RawPileIntegrityChecker<RP: RawPile> {
    /// The underlying pile in which the chunks (together with their hashes) are stored.
    underlying: RP,
}

View File

@ -132,14 +132,25 @@ pub struct Inner {
writers_in_progress: u16,
}
/// A Pile built on the idea of SQLite-indexed 'blob logs'.
/// 'Blob logs' are append-only binary files which contain simple concatenations of chunks (with a
/// small header). This format is very dense but does not inherently provide random access.
/// Granularity of deletes is also impacted by this structure, so vacuuming steps may involve needing
/// to re-write bloblogs to remove deleted chunks.
/// Because random access is important for performance, an additional SQLite database is used
/// as a map from chunk IDs to their positions in the blob logs, allowing readers to seek to the
/// appropriate place and read a chunk randomly.
pub struct SqliteBloblogPile {
    /// Shared mutable state of the pile, guarded by a mutex; see `Inner`.
    inner: Arc<Mutex<Inner>>,
    /// Filesystem path of this pile.
    /// (NOTE(review): presumably the directory containing the bloblogs and SQLite index — confirm.)
    path: PathBuf,
    /// Condition variable, presumably signalled when `Inner::writers_in_progress` drops to
    /// zero — TODO confirm against the writer-tracking code.
    writers_reach_zero: Condvar,
}
/// A pointer to a blob in a 'blob log'.
pub struct BloblogPointer {
    /// Which blob log the blob is stored in.
    bloblog: BloblogId,
    /// The seek offset at which the blob is located in the log.
    /// (Presumably measured in bytes from the start of the bloblog file — confirm.)
    offset: u64,
}

View File

@ -12,6 +12,9 @@ use crate::pile::{Keyspace, RawPile};
use crate::remote::{read_message, write_message, Request, RequestBody, Response, ResponseBody};
use std::sync::atomic::{AtomicBool, AtomicU16, Ordering};
/// A kind of RawPile which can make requests to a RawPile over a pipe (e.g. TCP socket or an
/// SSH connection).
/// The requests are handled by a `Responder` on the other end of the pipe.
pub struct Requester {
    /// Channel on which request bodies are submitted, each paired with a sender down which the
    /// corresponding response body should be delivered back to the caller.
    commands: Sender<(RequestBody, Sender<ResponseBody>)>,
}

View File

@ -13,6 +13,8 @@ use crate::pile::RawPile;
use crate::remote::{read_message, write_message, Request, RequestBody, Response, ResponseBody};
#[derive(Clone)]
/// A wrapper for a RawPile which allows a `Requester` to access it over a pipe (e.g. TCP socket or
/// an SSH connection).
pub struct Responder {
    /// Map from continuation token IDs to the channels that should receive follow-up messages
    /// for that continuation.
    /// (NOTE(review): inferred from the name and the `HashMap<u16, Sender<u16>>` type —
    /// confirm against the request-handling code.)
    continuation_tokens: Arc<Mutex<HashMap<u16, Sender<u16>>>>,
}

View File

@ -144,6 +144,12 @@ pub fn scan_with_progress_bar(
}
}
/// Differentiates a node in place.
/// This makes `old` the parent of `new` (though it is up to the caller to properly update the
/// `PointerData` to reflect this!).
/// Loosely speaking, `new` is modified to contain the differences that, when applied to `old`, will
/// result in the original value of `new`.
/// See `integrate_node_in_place` for the inverse of this operation.
pub fn differentiate_node_in_place(new: &mut TreeNode, old: &TreeNode) -> anyhow::Result<()> {
if let TreeNode::Directory { children, .. } = new {
if let TreeNode::Directory {
@ -171,7 +177,16 @@ pub fn differentiate_node_in_place(new: &mut TreeNode, old: &TreeNode) -> anyhow
Ok(())
}
/// `old` must be an integrated pointer.
/// Integrates a node in place.
/// This makes `new` no longer have a parent (remember, the caller is responsible for updating
/// `PointerData` appropriately if needed to reflect this).
///
/// Loosely speaking, `new` is treated as a set of differences that are applied to `old`, though the
/// result is in-place.
///
/// Preconditions:
/// - `old` must be an integrated pointer.
/// - `old` is the parent of `new`
pub fn integrate_node_in_place(new: &mut TreeNode, old: &TreeNode) -> anyhow::Result<()> {
if let TreeNode::Directory { children, .. } = new {
if let TreeNode::Directory {