Add compaction logic

2022-11-19 15:27:41 +00:00 · 2022-11-19 15:27:41 +00:00 · 58c5c3f039
parent 30b261d172
commit 58c5c3f039
1 changed files with 341 additions and 4 deletions
--- a/yama/src/pile/local_sqlitebloblogs.rs
+++ b/yama/src/pile/local_sqlitebloblogs.rs
@ -15,19 +15,20 @@ You should have received a copy of the GNU General Public License
 along with Yama.  If not, see <https://www.gnu.org/licenses/>.
 */

-use std::collections::{HashMap, VecDeque};
+use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque};
 use std::convert::{TryFrom, TryInto};
-use std::fs::{read_dir, File, OpenOptions};
+use std::fs::{read_dir, remove_file, File, OpenOptions};
 use std::io::{Read, Seek, SeekFrom, Write};
+use std::os::unix::fs::MetadataExt;
 use std::path::{Path, PathBuf};
 use std::sync::{Arc, Condvar, Mutex};
 use std::{fs, thread};

-use anyhow::{bail, Context};
+use anyhow::{bail, ensure, Context};
 use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
 use log::{info, warn};
 use nix::unistd::sync;
-use rusqlite::{params, Error, ErrorCode, Transaction};
+use rusqlite::{params, Error, ErrorCode, Transaction, TransactionBehavior, NO_PARAMS};
 use rusqlite::{Connection, OptionalExtension};

 use crate::definitions::ChunkId;
@ -35,6 +36,7 @@ use crate::pile::{
    ControllerMessage, DebugStatistics, Keyspace, PipelineDescription, RawPile,
    StoragePipelineSettings,
 };
+use crate::progress::ProgressTracker;
 use crate::utils::{bytes_to_hexstring, LruMap};
 use crossbeam_channel::{Receiver, Sender};
 use rusqlite::ffi::ErrorCode::ConstraintViolation;
@ -49,6 +51,11 @@ pub const POINTER_WRITE_BATCHES: usize = 2048;
 /// This many bloblogs will be kept open for reading, at maximum.
 pub const BLOBLOG_MAX_READING_FILE_COUNT: usize = 128;

+/// Size of a blob header within a bloblog.
+/// 32 byte Chunk Id
+/// 4 byte (u32) Blob size
+pub const BLOB_HEADER_SIZE: u64 = 32 + 4;
+
 /// A file storing a log of blobs.
 /// Format:
 ///     Repeated:
@ -496,6 +503,336 @@ impl SqliteBloblogPile {
        assert!(pointers_buffered.is_empty());
        Ok(())
    }
+
+    /// Look at the bloblogs in this pile and see where space may be reclaimable if we were to
+    /// compact.
+    ///
+    /// Next step: plan_compaction
+    pub fn analyse_for_compaction(&self) -> anyhow::Result<BTreeMap<BloblogId, BloblogStats>> {
+        let mut inner = self.inner.lock().unwrap();
+        // Lock the database right away.
+        let txn = inner
+            .connection
+            .transaction_with_behavior(TransactionBehavior::Exclusive)?;
+        let mut stmt = txn.prepare(
+            "
+            SELECT bloblog, COUNT(c.offset), COUNT(d.offset), SUM(COALESCE(d.size, 0))
+            FROM chunks c LEFT JOIN deleted d USING (bloblog, offset)
+            GROUP BY bloblog
+        ",
+        )?;
+
+        struct UnpopulatedBloblogStats {
+            pub bloblog_id: BloblogId,
+            pub chunks_total: u64,
+            pub chunks_deleted: u64,
+            pub bytes_deleted: u64,
+        }
+
+        let unpopul_bloblog_stats = stmt.query_map(NO_PARAMS, |row| {
+            Ok(UnpopulatedBloblogStats {
+                bloblog_id: row.get(0)?,
+                chunks_total: row.get::<_, i64>(1)?.try_into().expect("i64 -> u64"),
+                chunks_deleted: row.get::<_, i64>(2)?.try_into().expect("i64 -> u64"),
+                bytes_deleted: row.get::<_, i64>(3)?.try_into().expect("i64 -> u64"),
+            })
+        })?;
+
+        let mut final_stats = BTreeMap::new();
+
+        for unpopul_stat in unpopul_bloblog_stats {
+            let UnpopulatedBloblogStats {
+                bloblog_id,
+                chunks_total,
+                chunks_deleted,
+                bytes_deleted,
+            } = unpopul_stat?;
+            let bloblog_path = self.path.join(&bloblog_id.to_string());
+            let bytes_total = std::fs::metadata(&bloblog_path)
+                .with_context(|| format!("Failed to get metadata for bloblog: {:?}", bloblog_path))?
+                .size();
+
+            final_stats.insert(
+                bloblog_id,
+                BloblogStats {
+                    chunks_total,
+                    chunks_deleted,
+                    bytes_total,
+                    // Add a slight correction since we can count the blob headers of deleted blobs
+                    // as deleted.
+                    bytes_deleted: bytes_deleted + chunks_deleted * BLOB_HEADER_SIZE,
+                },
+            );
+        }
+
+        Ok(final_stats)
+    }
+
+    /// Look at the analysis of compaction and, using the specified thresholds, come up with a plan
+    /// to perform compaction.
+    ///
+    /// May return an empty plan if compaction isn't worthwhile.
+    ///
+    /// Previous step: analyse_for_compaction
+    /// Next step: perform_compaction
+    pub fn plan_compaction(
+        &self,
+        thresholds: &CompactionThresholds,
+        analysis: BTreeMap<BloblogId, BloblogStats>,
+    ) -> anyhow::Result<CompactionPlan> {
+        let bloblogs_to_replace: BTreeMap<BloblogId, BloblogStats> = analysis
+            .into_iter()
+            .filter(|(_id, stats)| thresholds.should_replace_bloblog(stats))
+            .collect();
+        let reclaimable_space: u64 = bloblogs_to_replace
+            .values()
+            .map(|bs| bs.bytes_deleted)
+            .sum();
+        let bytes_to_write: u64 = bloblogs_to_replace
+            .values()
+            .map(|bs| bs.bytes_total - bs.bytes_deleted)
+            .sum();
+        let small_bloblogs: u32 = bloblogs_to_replace
+            .values()
+            .filter(|bs| bs.bytes_total - bs.bytes_deleted < thresholds.cond_if_less_allocated_than)
+            .count() as u32;
+
+        if reclaimable_space < thresholds.minimum_to_reclaim
+            && small_bloblogs < thresholds.minimum_small_bloblogs_to_merge
+        {
+            // Nothing worth doing: return an empty plan.
+            return Ok(CompactionPlan {
+                bloblogs_to_replace: Default::default(),
+                bytes_to_write: 0,
+                reclaimable_space: 0,
+                small_bloblogs: 0,
+            });
+        }
+
+        Ok(CompactionPlan {
+            bloblogs_to_replace: bloblogs_to_replace.keys().copied().collect(),
+            bytes_to_write,
+            reclaimable_space,
+            small_bloblogs,
+        })
+    }
+
+    /// Given a compaction plan, perform the compaction.
+    /// There shouldn't be any decisions left to be made at this point: just action.
+    ///
+    /// TODO flock the bloblogs to be removed and make readers and writers flock them too.
+    ///
+    /// TODO find a way to deal with bloblogs that are entirely unreferenced from the index
+    ///      (e.g. bloblogs that weren't written properly, such as when compaction fails).
+    pub fn perform_compaction(
+        &self,
+        mut progress: Box<dyn ProgressTracker>,
+        plan: CompactionPlan,
+    ) -> anyhow::Result<()> {
+        #[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
+        struct ReplacedBlobRow {
+            pub old_bloblog: BloblogId,
+            pub old_offset: u64,
+            pub chunk_id: ChunkId,
+        }
+
+        let mut to_preserve = BTreeSet::new();
+        let mut replacements = BTreeMap::new();
+
+        progress.set_max_size(plan.bytes_to_write);
+
+        // First find all the blobs we need to replace.
+        {
+            let mut inner = self.inner.lock().unwrap();
+            // Lock the database right away.
+            let txn = inner
+                .connection
+                .transaction_with_behavior(TransactionBehavior::Exclusive)?;
+            let mut stmt = txn.prepare(
+                "
+                SELECT chunk_id, c.offset
+                FROM chunks c LEFT JOIN deleted d USING (bloblog, offset)
+                WHERE bloblog = ?1 AND d.offset IS NULL
+            ",
+            )?;
+            for bloblog in plan.bloblogs_to_replace.iter().copied() {
+                to_preserve.extend(
+                    stmt.query_map([bloblog], |row| {
+                        let mut chunk_id = ChunkId::default();
+                        chunk_id.copy_from_slice(row.get::<_, Vec<u8>>(0).unwrap().as_slice());
+                        Ok(ReplacedBlobRow {
+                            old_bloblog: bloblog,
+                            chunk_id,
+                            old_offset: row.get::<_, i64>(1).unwrap().try_into().unwrap(),
+                        })
+                    })?
+                    .collect::<Result<Vec<ReplacedBlobRow>, _>>()?,
+                );
+            }
+        }
+
+        // Then make the replacements
+        info!("Rewriting bloblogs...");
+        let mut buf = Vec::new();
+        let mut iterator = to_preserve.into_iter();
+        loop {
+            let (new_bloblog_id, bloglog_mutex) = self.get_writing_bloblog()?;
+            let mut new_bloblog = bloglog_mutex.lock().expect("Failed to lock bloblog?");
+            let mut is_more = false;
+
+            while let Some(preserve) = iterator.next() {
+                is_more = true;
+
+                // Get hold of the old bloblog
+                let old_bloblog = self.open_bloblog(preserve.old_bloblog)?;
+                let mut old_bloblog = old_bloblog.lock().unwrap();
+
+                // Transfer the blob
+                buf.clear();
+                old_bloblog.read_blob(preserve.old_offset, &preserve.chunk_id, &mut buf)?;
+                let new_offset = new_bloblog.write_blob(&preserve.chunk_id, &buf)?;
+
+                // Make a note of the replacement
+                replacements.insert(
+                    preserve,
+                    BloblogPointer {
+                        bloblog: new_bloblog_id,
+                        offset: new_offset,
+                    },
+                );
+
+                progress.inc_progress(buf.len() as u64);
+
+                if new_bloblog.filesize()? > MAX_BLOBLOG_REUSE_SIZE {
+                    // get a new bloblog to write with.
+                    break;
+                }
+            }
+
+            drop(new_bloblog);
+            self.return_writing_bloblog(new_bloblog_id, bloglog_mutex)?;
+
+            if !is_more {
+                break;
+            }
+        }
+
+        info!("Applying replacements...");
+        {
+            let mut inner = self.inner.lock().unwrap();
+            // Lock the database right away.
+            let txn = inner
+                .connection
+                .transaction_with_behavior(TransactionBehavior::Exclusive)?;
+            let mut stmt = txn.prepare(
+                "
+                UPDATE chunks
+                SET bloblog = ?1, offset = ?2
+                WHERE chunk_id = ?3
+            ",
+            )?;
+
+            for (replacement_row, new_pos) in replacements {
+                ensure!(
+                    stmt.execute(params![
+                        new_pos.bloblog,
+                        new_pos.offset as i64,
+                        &replacement_row.chunk_id as &[u8]
+                    ])? == 1,
+                    "Wrong number of rows updated for replacement!"
+                );
+            }
+
+            drop(stmt);
+            txn.commit().context("committing replacements")?;
+        }
+
+        // TODO fsync new bloblogs
+
+        info!("Deleting old bloblogs...");
+        {
+            let mut inner = self.inner.lock().unwrap();
+            // Lock the database right away.
+            let txn = inner
+                .connection
+                .transaction_with_behavior(TransactionBehavior::Exclusive)?;
+
+            for bloblog_id in plan.bloblogs_to_replace.iter().copied() {
+                let deleted_chunks = txn.execute(
+                    "
+                    DELETE FROM chunks WHERE bloblog = ?1
+                ",
+                    params![bloblog_id],
+                )?;
+
+                let deleted_deleted = txn.execute(
+                    "
+                    DELETE FROM deleted WHERE bloblog = ?1
+                ",
+                    params![bloblog_id],
+                )?;
+
+                ensure!(deleted_chunks == deleted_deleted, "Undeleted chunks left in bloblog {bloblog_id}: CHUNKS={deleted_chunks} DELETED={deleted_deleted}");
+
+                let bloblog_path = self.path.join(bloblog_id.to_string());
+                remove_file(&bloblog_path).with_context(|| {
+                    format!("Failed to remove obsolete bloblog: {:?}", bloblog_path)
+                })?;
+            }
+
+            txn.commit()?;
+        }
+
+        Ok(())
+    }
+}
+
+pub struct BloblogStats {
+    pub chunks_total: u64,
+    pub chunks_deleted: u64,
+    pub bytes_total: u64,
+    pub bytes_deleted: u64,
+}
+
+pub struct CompactionPlan {
+    pub bloblogs_to_replace: BTreeSet<BloblogId>,
+    pub bytes_to_write: u64,
+    pub reclaimable_space: u64,
+    pub small_bloblogs: u32,
+}
+
+pub struct CompactionThresholds {
+    /// Minimum bytes to be reclaimable overall for compaction to be worthwhile.
+    pub minimum_to_reclaim: u64,
+
+    /// (alternative reason) Minimum number of files to be undersized in order for compaction
+    /// to be worthwhile.
+    /// This gives us a way to make compaction run if we have lots of tiny bloblogs.
+    pub minimum_small_bloblogs_to_merge: u32,
+
+    /// A bloblog will be replaced if the deallocated size is greater than this.
+    pub cond_if_more_deallocated_than: u64,
+
+    /// A bloblog will be replaced if the allocated size is less than this.
+    pub cond_if_less_allocated_than: u64,
+}
+
+impl CompactionThresholds {
+    pub fn should_replace_bloblog(&self, bloblog_stats: &BloblogStats) -> bool {
+        let allocated = bloblog_stats.bytes_total - bloblog_stats.bytes_deleted;
+        // Note that this will also trigger for fully-deallocated files if their remaining
+        // allocated size is below the threshold.
+        let is_small = allocated < self.cond_if_less_allocated_than;
+        let has_large_deallocations =
+            bloblog_stats.bytes_deleted > self.cond_if_more_deallocated_than;
+        is_small || has_large_deallocations
+    }
+}
+
+pub struct CompactionOutcome {
+    pub bloblogs_deleted: u32,
+    pub bloblogs_created: u32,
+    pub bytes_deleted: u32,
+    pub bytes_created: u32,
 }

 impl Drop for SqliteBloblogPile {