Add a command to show a report of the Datman system

This commit is contained in:
Olivier 'reivilibre' 2022-05-29 13:35:15 +01:00
parent 438af9164e
commit 948ca3f2b5
6 changed files with 378 additions and 2 deletions

95
Cargo.lock generated
View File

@ -233,7 +233,7 @@ version = "3.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25320346e922cffe59c0bbc5410c8d8784509efb321488971081313cb1e1a33c"
dependencies = [
"heck",
"heck 0.4.0",
"proc-macro-error",
"proc-macro2",
"quote",
@ -249,6 +249,18 @@ dependencies = [
"os_str_bytes",
]
[[package]]
name = "comfy-table"
version = "6.0.0-rc.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a1a275e66c69adb0600a13650aed718c99337d9a185d353efa13ff1e05576c4"
dependencies = [
"crossterm",
"strum",
"strum_macros",
"unicode-width",
]
[[package]]
name = "console"
version = "0.15.0"
@ -340,6 +352,31 @@ dependencies = [
"lazy_static",
]
[[package]]
name = "crossterm"
version = "0.23.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2102ea4f781910f8a5b98dd061f4c2023f479ce7bb1236330099ceb5a93cf17"
dependencies = [
"bitflags",
"crossterm_winapi",
"libc",
"mio",
"parking_lot",
"signal-hook",
"signal-hook-mio",
"winapi",
]
[[package]]
name = "crossterm_winapi"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c"
dependencies = [
"winapi",
]
[[package]]
name = "crunchy"
version = "0.2.2"
@ -378,6 +415,7 @@ dependencies = [
"byteorder",
"chrono",
"clap",
"comfy-table",
"crossbeam-channel",
"env_logger",
"glob",
@ -578,6 +616,15 @@ dependencies = [
"num-traits",
]
[[package]]
name = "heck"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c"
dependencies = [
"unicode-segmentation",
]
[[package]]
name = "heck"
version = "0.4.0"
@ -1096,6 +1143,12 @@ dependencies = [
"smallvec",
]
[[package]]
name = "rustversion"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2cc38e8fa666e2de3c4aba7edeb5ffc5246c1c2ed0e3d17e560aeeba736b23f"
[[package]]
name = "rustyline"
version = "7.1.0"
@ -1187,6 +1240,27 @@ dependencies = [
"serde",
]
[[package]]
name = "signal-hook"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a253b5e89e2698464fc26b545c9edceb338e18a89effeeecfea192c3025be29d"
dependencies = [
"libc",
"signal-hook-registry",
]
[[package]]
name = "signal-hook-mio"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af"
dependencies = [
"libc",
"mio",
"signal-hook",
]
[[package]]
name = "signal-hook-registry"
version = "1.4.0"
@ -1242,6 +1316,25 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
[[package]]
name = "strum"
version = "0.23.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb"
[[package]]
name = "strum_macros"
version = "0.23.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38"
dependencies = [
"heck 0.3.3",
"proc-macro2",
"quote",
"rustversion",
"syn",
]
[[package]]
name = "syn"
version = "1.0.95"

View File

@ -33,3 +33,4 @@ hostname = "0.3.1"
yama = { path = "../yama", version = "0.6.0-alpha.1" }
metrics = "0.17.1"
bare-metrics-recorder = { version = "0.1.0" }
comfy-table = "6.0.0-rc.1"

View File

@ -111,6 +111,11 @@ pub enum DatmanCommand {
skip_metadata: bool,
},
Report {
/// Name of the pile to report on.
pile_name: String,
},
#[clap(name = "_backup_source_responder")]
InternalBackupSourceResponder,
}
@ -307,6 +312,15 @@ fn main() -> anyhow::Result<()> {
info!("Datman responder at {:?}", std::env::current_exe()?);
backup_source_responder::handler_stdio()?;
}
DatmanCommand::Report { pile_name } => {
let descriptor = load_descriptor(Path::new(".")).unwrap();
let destination = &descriptor.piles[&pile_name];
let report = datman::commands::report::generate_report(destination, &descriptor)?;
// TODO Display report
// TODO E-mail report (Can just pipe through aha and then apprise though!)
datman::commands::report::print_report(&report)?;
}
}
Ok(())
}

View File

@ -26,6 +26,7 @@ pub mod backup;
pub mod extract;
pub mod ibrowse;
pub mod ilabel;
pub mod report;
pub fn init_descriptor(path: &Path) -> anyhow::Result<()> {
std::fs::create_dir_all(path)?;

View File

@ -21,7 +21,7 @@ use crate::labelling::{label_node, load_labelling_rules, str_to_label, Label, St
use crate::tree::{scan, FileTree, FileTree1};
use anyhow::{anyhow, bail};
use arc_interner::ArcIntern;
use chrono::{DateTime, Utc};
use chrono::{DateTime, NaiveDateTime, TimeZone, Utc};
use log::{info, warn};
use std::collections::{HashMap, HashSet};
use std::fmt::Debug;
@ -49,6 +49,13 @@ pub fn get_pointer_name_at(source_name: &str, datetime: DateTime<Utc>) -> String
)
}
/// Split a pointer name of the form `<source>+<datetime>` back into its
/// source name and the UTC timestamp embedded in it.
///
/// This is the inverse of `get_pointer_name_at`. The split happens at the
/// *last* `+`, so source names that themselves contain `+` are handled.
/// Returns `None` if there is no `+` separator or if the datetime portion
/// does not parse with `POINTER_DATETIME_FORMAT`.
pub fn split_pointer_name(pointer_name: &str) -> Option<(String, DateTime<Utc>)> {
    // Use a char pattern rather than a one-character &str (clippy:
    // single_char_pattern; slightly cheaper and clearer).
    let (source_name, date_time_str) = pointer_name.rsplit_once('+')?;
    let date_time = NaiveDateTime::parse_from_str(date_time_str, POINTER_DATETIME_FORMAT).ok()?;
    let date_time = Utc.from_utc_datetime(&date_time);
    Some((source_name.to_owned(), date_time))
}
pub fn open_stdout_backup_process(
extra_args: &HashMap<String, toml::Value>,
program_name: &str,

View File

@ -0,0 +1,260 @@
use crate::commands::backup::split_pointer_name;
use crate::descriptor::{Descriptor, DestPileDescriptor};
use anyhow::Context;
use chrono::{DateTime, Utc};
use comfy_table::presets::UTF8_FULL;
use comfy_table::{Cell, Color, ContentArrangement, Table};
use humansize::FileSize;
use log::info;
use std::collections::{BTreeMap, BTreeSet};
use std::io::Read;
use std::mem::size_of;
use yama::chunking::RecursiveUnchunker;
use yama::commands::{load_pile_descriptor, open_pile, retrieve_tree_node};
use yama::definitions::{ChunkId, RecursiveChunkRef, TreeNode};
use yama::pile::{DebugStatistics, Pile, RawPile};
// This module generates reports for a Datman system.
// Referenced Chunk IDs are counted and used to give an indication of size.
// Chunk IDs are summarised into u32s to reduce memory usage. Since the report is approximate,
// it doesn't matter if there are a few collisions (although they are still fairly unlikely to
// affect much).
/// Summary of the state of one pile, as produced by `generate_report`.
#[derive(Clone)]
pub struct Report {
    /// For each configured source name, the timestamp of the most recent
    /// backup pointer found in the pile, or `None` if no pointer for that
    /// source was found.
    pub last_source_backups: BTreeMap<String, Option<DateTime<Utc>>>,
    /// Per-pointer chunk usage statistics, keyed by pointer name.
    pub chunk_usage: BTreeMap<String, Sizes>,
    /// Low-level statistics from the pile backend, if the backend provides
    /// them (used to estimate byte sizes from chunk counts).
    pub debug_stats: Option<DebugStatistics>,
}
/// Approximate chunk-count statistics for a single pointer.
///
/// All fields count *chunks*, not bytes; multiply by the pile's average
/// chunk size to estimate a byte figure.
// Debug/PartialEq/Eq added: public data types should be printable and
// comparable in diagnostics and tests; all fields are plain u32 so the
// derives are free and backward-compatible.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct Sizes {
    /// Total number of chunks that we refer to.
    pub total: u32,
    /// Each referred chunk is counted once here, but divided by the number of sharers.
    /// We are 'morally responsible' for this many chunks.
    pub moral: u32,
    /// Number of chunks that only we point to.
    pub unique: u32,
    /// Number of chunks for which we are the oldest (lexicographically earliest) pointer to point
    /// to those chunks.
    pub rollup: u32,
}
/// A chunk ID shortened to the leading 4 bytes to save memory; see the
/// module notes above about why occasional collisions are acceptable.
type CondensedChunkId = u32;

/// Condense a full chunk ID into a `CondensedChunkId` by interpreting its
/// leading bytes as a big-endian integer.
fn condense_chunk_id(chunk_id: ChunkId) -> CondensedChunkId {
    // Keep exactly enough leading bytes to fill the condensed integer.
    let prefix = &chunk_id[..size_of::<CondensedChunkId>()];
    CondensedChunkId::from_be_bytes(prefix.try_into().unwrap())
}
/// Build a `Report` for the pile described by `dest_pile_descriptor`,
/// using `descriptor` to learn the set of configured sources.
///
/// Reads every pointer in the pile and walks every tree to collect chunk
/// IDs, so this can be slow on large piles.
pub fn generate_report(
    dest_pile_descriptor: &DestPileDescriptor,
    descriptor: &Descriptor,
) -> anyhow::Result<Report> {
    let pile_descriptor = load_pile_descriptor(&dest_pile_descriptor.path)?;
    let pile = open_pile(&dest_pile_descriptor.path, &pile_descriptor)?;
    // Optional backend statistics; not every pile implementation provides
    // them, hence the Option.
    let debug_stats = pile.raw_pile.debug_statistics()?;

    // pointer name -> (parent pointer name, condensed chunk IDs referenced
    // directly by that pointer's own tree).
    let mut pointers_to_parent_and_chunkids = BTreeMap::new();
    info!("Collecting chunk IDs... This will probably be slow.");
    for pointer_name in pile.list_pointers()? {
        let pointer = pile
            .read_pointer(&pointer_name)?
            .context("listed pointer doesn't exist")?;
        let root_node = retrieve_tree_node(&pile, pointer.chunk_ref)?;
        let pointer_chunk_ids = collect_chunk_ids(&pile, &root_node.node)?;
        pointers_to_parent_and_chunkids
            .insert(pointer_name, (pointer.parent_pointer, pointer_chunk_ids));
    }

    // Now we iterate in reverse order, making a list of count of Chunk IDs.
    // At the same time, we can also calculate 'rollup' sizes.
    // NOTE(review): iteration is reverse-lexicographic over pointer names,
    // so a chunk's rollup credit goes to the *greatest* pointer name that
    // (transitively, via parents) references it — confirm this matches the
    // 'oldest pointer' wording on `Sizes::rollup`.
    let mut chunk_sharer_counts: BTreeMap<CondensedChunkId, u16> = BTreeMap::new();
    let mut pointer_stats: BTreeMap<String, Sizes> = BTreeMap::new();
    for pointer_name in pointers_to_parent_and_chunkids.keys().rev() {
        // All chunks this pointer depends on, including chunks inherited
        // from its chain of parent pointers, deduplicated.
        let deduped_chunks: BTreeSet<CondensedChunkId> =
            iter_over_all_chunkids_incl_parents(&pointers_to_parent_and_chunkids, &pointer_name)
                .collect();
        let mut rollup_count = 0;
        for chunk in deduped_chunks {
            let count = chunk_sharer_counts.entry(chunk).or_default();
            *count += 1;
            if *count == 1 {
                // First pointer (in this iteration order) to claim the
                // chunk: it counts towards this pointer's rollup size.
                rollup_count += 1;
            }
        }
        let entry = pointer_stats.entry(pointer_name.to_owned()).or_default();
        entry.rollup = rollup_count;
    }

    // Now go through again and update all the stats!
    for pointer_name in pointers_to_parent_and_chunkids.keys().rev() {
        let deduped_chunks: BTreeSet<CondensedChunkId> =
            iter_over_all_chunkids_incl_parents(&pointers_to_parent_and_chunkids, &pointer_name)
                .collect();
        let mut unique_count = 0;
        // Histogram of shared chunks, indexed by (number of sharers - 1);
        // sharer counts above 256 are clamped into the last bucket.
        // Bucket 0 stays empty because count == 1 goes to unique_count.
        let mut shared_count_by_sharers = [0u32; 256];
        let total_count = deduped_chunks.len();
        for chunk in deduped_chunks {
            let count = chunk_sharer_counts[&chunk];
            if count == 1 {
                unique_count += 1;
            } else {
                let num_sharers = (count as usize).min(256);
                shared_count_by_sharers[num_sharers - 1] += 1;
            }
        }
        // 'Moral' size: each shared chunk contributes 1/num_sharers, so
        // blame for shared data is split evenly between its sharers.
        let mut sharers_sum: f64 = 0.0;
        for (sharers_minus_one, count) in shared_count_by_sharers.into_iter().enumerate() {
            sharers_sum += (count as f64) / (sharers_minus_one + 1) as f64;
        }
        let entry = pointer_stats.entry(pointer_name.to_owned()).or_default();
        entry.moral = (sharers_sum.ceil() as u32) + unique_count;
        entry.unique = unique_count;
        entry.total = total_count as u32;
    }

    // Latest backup time per configured source. Sources with no pointers
    // remain present with None so the report shows never-backed-up sources.
    let mut last_backed_up = BTreeMap::new();
    for source_name in descriptor.sources.keys().cloned() {
        last_backed_up.insert(source_name, None);
    }
    for pointer_name in pointers_to_parent_and_chunkids.keys() {
        if let Some((source_name, date_time)) = split_pointer_name(&pointer_name) {
            // Keys iterate in ascending order, so the final insert for each
            // source carries the latest timestamp.
            last_backed_up.insert(source_name, Some(date_time));
        }
    }

    Ok(Report {
        last_source_backups: last_backed_up,
        chunk_usage: pointer_stats,
        debug_stats,
    })
}
// Does not filter duplicates...
fn iter_over_all_chunkids_incl_parents<'a>(
pointers_to_parent_and_chunkids: &'a BTreeMap<
String,
(Option<String>, BTreeSet<CondensedChunkId>),
>,
pointer_name: &'a str,
) -> Box<dyn Iterator<Item = CondensedChunkId> + 'a> {
let (parent, chunks) = &pointers_to_parent_and_chunkids[pointer_name];
match parent {
None => Box::new(chunks.iter().copied()),
Some(parent) => Box::new(chunks.iter().copied().chain(
iter_over_all_chunkids_incl_parents(pointers_to_parent_and_chunkids, &parent),
)),
}
}
/// Walk the tree rooted at `root` and collect the condensed chunk IDs of
/// every normal file's contents.
///
/// Only `NormalFile` nodes carry chunk references; other node kinds are
/// skipped by the catch-all arm.
fn collect_chunk_ids<RP: RawPile>(
    pile: &Pile<RP>,
    root: &TreeNode,
) -> anyhow::Result<BTreeSet<CondensedChunkId>> {
    let mut chunk_ids = BTreeSet::new();
    root.visit(
        &mut |tree_node, _| {
            match tree_node {
                TreeNode::NormalFile { content, .. } => {
                    // Expand this file's recursive chunk ref into the leaf
                    // chunk IDs it covers.
                    collect_chunk_ids_from_chunkref(pile, content, &mut chunk_ids)?;
                }
                _ => {}
            }
            Ok(())
        },
        "".to_owned(),
    )?;
    Ok(chunk_ids)
}
/// Collect the condensed leaf chunk IDs reachable from `chunk_ref` into
/// `collection`.
///
/// At depth 0 the reference *is* a leaf chunk and is inserted directly.
/// At greater depths, the chunk one level shallower is streamed through a
/// `RecursiveUnchunker`; its decoded output is treated as a concatenation
/// of raw chunk IDs, read out one ID at a time.
fn collect_chunk_ids_from_chunkref<RP: RawPile>(
    pile: &Pile<RP>,
    chunk_ref: &RecursiveChunkRef,
    collection: &mut BTreeSet<CondensedChunkId>,
) -> anyhow::Result<()> {
    if chunk_ref.depth == 0 {
        collection.insert(condense_chunk_id(chunk_ref.chunk_id));
    } else {
        // Reduce the depth by one so the unchunker yields the list of
        // chunk IDs rather than the fully-expanded content below them.
        let shallower_chunk_ref = RecursiveChunkRef {
            chunk_id: chunk_ref.chunk_id,
            depth: chunk_ref.depth - 1,
        };
        let mut unchunker = RecursiveUnchunker::new(pile, shallower_chunk_ref);
        let mut next_chunk_id: ChunkId = Default::default();
        loop {
            let read = unchunker.read(&mut next_chunk_id[..])?;
            if read == 0 {
                // Clean EOF: all chunk IDs have been consumed.
                break;
            } else if read < next_chunk_id.len() {
                // Short read: fill in the rest of this ID before using it.
                // EOF in the middle of an ID surfaces as an error here.
                unchunker.read_exact(&mut next_chunk_id[read..])?;
            }
            collection.insert(condense_chunk_id(next_chunk_id));
        }
    }
    Ok(())
}
/// Render the report as a UTF-8 table on standard output.
///
/// One row is printed per pointer, showing the rollup, unique, moral and
/// total chunk counts. When debug statistics are available, each count is
/// also shown as an approximate byte size derived from the pile's average
/// chunk size.
pub fn print_report(report: &Report) -> anyhow::Result<()> {
    // Estimated bytes per chunk, if the pile backend exposed statistics.
    let average_chunk_size = report
        .debug_stats
        .as_ref()
        .map(|stats| stats.total_chunk_size as f64 / stats.number_of_chunks as f64);

    let mut table = Table::new();
    table
        .load_preset(UTF8_FULL)
        .set_content_arrangement(ContentArrangement::DynamicFullWidth);

    let mut header = vec![Cell::new("Pointer name").fg(Color::Cyan)];
    for title in ["Rollup size", "Unique size", "Moral size", "Total size"] {
        header.push(Cell::new(title).fg(Color::Magenta));
    }
    table.set_header(header);

    for (pointer_name, sizes) in &report.chunk_usage {
        let mut row = vec![Cell::new(pointer_name).fg(Color::Blue)];
        for count in [sizes.rollup, sizes.unique, sizes.moral, sizes.total] {
            row.push(Cell::new(format_size(count, average_chunk_size)).fg(Color::Yellow));
        }
        table.add_row(row);
    }

    println!("{table}");
    Ok(())
}
/// Format a chunk count for display, e.g. `"42 c ~1.31 MiB"`.
///
/// The byte estimate (count × average chunk size, binary units) is only
/// appended when an average chunk size is known.
fn format_size(chunks: u32, average_chunk_size: Option<f64>) -> String {
    match average_chunk_size {
        Some(bytes_per_chunk) => {
            let num_bytes = (chunks as f64 * bytes_per_chunk) as u64;
            let human = num_bytes
                .file_size(humansize::file_size_opts::BINARY)
                .unwrap();
            format!("{} c ~{}", chunks, human)
        }
        None => format!("{} c", chunks),
    }
}