From 2e76e89d576e143f0e9307d7ecc876dc0cac8592 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Sat, 31 May 2025 17:10:14 +0200 Subject: [PATCH 01/26] refactor: separate backend operations --- crates/core/src/archiver.rs | 9 +- crates/core/src/archiver/file_archiver.rs | 11 +- crates/core/src/archiver/tree_archiver.rs | 8 +- crates/core/src/blob.rs | 1 + crates/core/src/blob/packer.rs | 603 ++----------------- crates/core/src/blob/repopacker.rs | 552 +++++++++++++++++ crates/core/src/chunker.rs | 6 +- crates/core/src/commands/copy.rs | 13 +- crates/core/src/commands/merge.rs | 11 +- crates/core/src/commands/prune.rs | 22 +- crates/core/src/commands/repair/index.rs | 11 +- crates/core/src/commands/repair/snapshots.rs | 13 +- crates/core/src/index.rs | 1 + crates/core/src/index/indexer.rs | 78 ++- crates/core/src/index/repoindexer.rs | 103 ++++ 15 files changed, 805 insertions(+), 637 deletions(-) create mode 100644 crates/core/src/blob/repopacker.rs create mode 100644 crates/core/src/index/repoindexer.rs diff --git a/crates/core/src/archiver.rs b/crates/core/src/archiver.rs index 33e0b1e3a..0bbde23df 100644 --- a/crates/core/src/archiver.rs +++ b/crates/core/src/archiver.rs @@ -49,7 +49,7 @@ pub struct Archiver<'a, BE: DecryptFullBackend, I: ReadGlobalIndex> { parent: Parent, /// The `SharedIndexer` is used to index the data. - indexer: SharedIndexer, + indexer: SharedIndexer, /// The backend to write to. be: BE, @@ -83,7 +83,7 @@ impl<'a, BE: DecryptFullBackend, I: ReadGlobalIndex> Archiver<'a, BE, I> { parent: Parent, mut snap: SnapshotFile, ) -> RusticResult { - let indexer = Indexer::new(be.clone()).into_shared(); + let indexer = Indexer::new().into_shared(); let mut summary = snap.summary.take().unwrap_or_default(); summary.backup_start = Local::now(); @@ -217,7 +217,10 @@ impl<'a, BE: DecryptFullBackend, I: ReadGlobalIndex> Archiver<'a, BE, I> { stats.apply(&mut summary, BlobType::Data); self.snap.tree = id; - self.indexer.write().unwrap().finalize()?; + let res = self.indexer.write().unwrap().finalize(); + if let Some(file) = res { + _ = self.be.save_file(&file)?; + } summary.finalize(self.snap.time).map_err(|err| { RusticError::with_source( diff --git a/crates/core/src/archiver/file_archiver.rs b/crates/core/src/archiver/file_archiver.rs index dccd98a19..e2ce8025d 100644 --- a/crates/core/src/archiver/file_archiver.rs +++ b/crates/core/src/archiver/file_archiver.rs @@ -13,10 +13,7 @@ use crate::{ decrypt::DecryptWriteBackend, node::{Node, NodeType}, }, - blob::{ - BlobId, BlobType, DataId, - packer::{Packer, PackerStats}, - }, + blob::{BlobId, BlobType, DataId, packer::PackerStats, repopacker::RepositoryPacker}, chunker::ChunkIter, crypto::hasher::hash, error::{ErrorKind, RusticError, RusticResult}, @@ -35,7 +32,7 @@ use crate::{ #[derive(Clone)] pub(crate) struct FileArchiver<'a, BE: DecryptWriteBackend, I: ReadGlobalIndex> { index: &'a I, - data_packer: Packer, + data_packer: RepositoryPacker, rabin: Rabin64, } @@ -61,12 +58,12 @@ impl<'a, BE: DecryptWriteBackend, I: ReadGlobalIndex> FileArchiver<'a, BE, I> { pub(crate) fn new( be: BE, index: &'a I, - indexer: SharedIndexer, + indexer: SharedIndexer, config: &ConfigFile, ) -> RusticResult { let poly = config.poly()?; - let data_packer = Packer::new( + let data_packer = RepositoryPacker::new( be, BlobType::Data, indexer, diff --git a/crates/core/src/archiver/tree_archiver.rs b/crates/core/src/archiver/tree_archiver.rs index a5edbc72c..ca1a95761 100644 --- a/crates/core/src/archiver/tree_archiver.rs +++ 
b/crates/core/src/archiver/tree_archiver.rs @@ -8,7 +8,7 @@ use crate::{ backend::{decrypt::DecryptWriteBackend, node::Node}, blob::{ BlobType, - packer::Packer, + repopacker::RepositoryPacker, tree::{Tree, TreeId}, }, error::{ErrorKind, RusticError, RusticResult}, @@ -34,7 +34,7 @@ pub(crate) struct TreeArchiver<'a, BE: DecryptWriteBackend, I: ReadGlobalIndex> /// The index to read from. index: &'a I, /// The packer to write to. - tree_packer: Packer, + tree_packer: RepositoryPacker, /// The summary of the snapshot. summary: SnapshotSummary, } @@ -62,11 +62,11 @@ impl<'a, BE: DecryptWriteBackend, I: ReadGlobalIndex> TreeArchiver<'a, BE, I> { pub(crate) fn new( be: BE, index: &'a I, - indexer: SharedIndexer, + indexer: SharedIndexer, config: &ConfigFile, summary: SnapshotSummary, ) -> RusticResult { - let tree_packer = Packer::new( + let tree_packer = RepositoryPacker::new( be, BlobType::Tree, indexer, diff --git a/crates/core/src/blob.rs b/crates/core/src/blob.rs index 5b0a4c5e6..70b19d9d2 100644 --- a/crates/core/src/blob.rs +++ b/crates/core/src/blob.rs @@ -1,4 +1,5 @@ pub(crate) mod packer; +pub(crate) mod repopacker; pub(crate) mod tree; use derive_more::Constructor; diff --git a/crates/core/src/blob/packer.rs b/crates/core/src/blob/packer.rs index 167303afa..9811bd8ff 100644 --- a/crates/core/src/blob/packer.rs +++ b/crates/core/src/blob/packer.rs @@ -1,29 +1,20 @@ use std::{ num::NonZeroU32, - sync::{Arc, RwLock}, time::{Duration, SystemTime}, }; use bytes::{Bytes, BytesMut}; -use chrono::Local; -use crossbeam_channel::{Receiver, Sender, bounded}; use integer_sqrt::IntegerSquareRoot; use log::warn; -use pariter::{IteratorExt, scope}; use crate::{ - backend::{ - FileType, - decrypt::{DecryptFullBackend, DecryptWriteBackend}, - }, blob::{BlobId, BlobType}, - crypto::{CryptoKey, hasher::hash}, + crypto::CryptoKey, error::{ErrorKind, RusticError, RusticResult}, - index::indexer::SharedIndexer, repofile::{ configfile::ConfigFile, - indexfile::{IndexBlob, IndexPack}, - packfile::{PackHeaderLength, PackHeaderRef, PackId}, + indexfile::IndexPack, + packfile::{PackHeaderLength, PackHeaderRef}, snapshotfile::SnapshotSummary, }, }; @@ -38,22 +29,9 @@ pub enum PackerErrorKind { from: &'static str, source: std::num::TryFromIntError, }, - /// Sending crossbeam message failed: `size_limit`: `{size_limit:?}`, `id`: `{id:?}`, `data`: `{data:?}` : `{source}` - SendingCrossbeamMessage { - size_limit: Option, - id: BlobId, - data: Bytes, - source: crossbeam_channel::SendError<(Bytes, BlobId, Option)>, - }, - /// Sending crossbeam data message failed: `data`: `{data:?}`, `index_pack`: `{index_pack:?}` : `{source}` - SendingCrossbeamDataMessage { - data: Bytes, - index_pack: IndexPack, - source: crossbeam_channel::SendError<(Bytes, IndexPack)>, - }, } -pub(crate) type PackerResult = Result>; +pub(crate) type PackerResult = Result; pub(super) mod constants { use std::time::Duration; @@ -176,220 +154,6 @@ impl PackSizer { } } -/// The `Packer` is responsible for packing blobs into pack files. -/// -/// # Type Parameters -/// -/// * `BE` - The backend type. -#[allow(missing_debug_implementations)] -#[allow(clippy::struct_field_names)] -#[derive(Clone)] -pub struct Packer { - /// The raw packer wrapped in an `Arc` and `RwLock`. - // This is a hack: raw_packer and indexer are only used in the add_raw() method. - // TODO: Refactor as actor, like the other add() methods - raw_packer: Arc>>, - /// The shared indexer containing the backend. 
- indexer: SharedIndexer, - /// The sender to send blobs to the raw packer. - sender: Sender<(Bytes, BlobId, Option)>, - /// The receiver to receive the status from the raw packer. - finish: Receiver>, -} - -impl Packer { - /// Creates a new `Packer`. - /// - /// # Type Parameters - /// - /// * `BE` - The backend type. - /// - /// # Arguments - /// - /// * `be` - The backend to write to. - /// * `blob_type` - The blob type. - /// * `indexer` - The indexer to write to. - /// * `config` - The config file. - /// * `total_size` - The total size of the pack file. - /// - /// # Errors - /// - /// * If sending the message to the raw packer fails. - /// * If converting the data length to u64 fails - #[allow(clippy::unnecessary_wraps)] - pub fn new( - be: BE, - blob_type: BlobType, - indexer: SharedIndexer, - config: &ConfigFile, - total_size: u64, - ) -> RusticResult { - let raw_packer = Arc::new(RwLock::new(RawPacker::new( - be.clone(), - blob_type, - indexer.clone(), - config, - total_size, - ))); - - let (tx, rx) = bounded(0); - let (finish_tx, finish_rx) = bounded::>(0); - let packer = Self { - raw_packer: raw_packer.clone(), - indexer: indexer.clone(), - sender: tx, - finish: finish_rx, - }; - - let _join_handle = std::thread::spawn(move || { - scope(|scope| { - let status = rx - .into_iter() - .readahead_scoped(scope) - // early check if id is already contained - .filter(|(_, id, _)| !indexer.read().unwrap().has(id)) - .filter(|(_, id, _)| !raw_packer.read().unwrap().has(id)) - .readahead_scoped(scope) - .parallel_map_scoped( - scope, - |(data, id, size_limit): (Bytes, BlobId, Option)| { - let (data, data_len, uncompressed_length) = be.process_data(&data)?; - Ok(( - data, - id, - u64::from(data_len), - uncompressed_length, - size_limit, - )) - }, - ) - .readahead_scoped(scope) - // check again if id is already contained - // TODO: We may still save duplicate blobs - the indexer is only updated when the packfile write has completed - .filter(|res| { - res.as_ref().map_or_else( - |_| true, - |(_, id, _, _, _)| !indexer.read().unwrap().has(id), - ) - }) - .try_for_each(|item: RusticResult<_>| -> RusticResult<()> { - let (data, id, data_len, ul, size_limit) = item?; - raw_packer - .write() - .unwrap() - .add_raw(&data, &id, data_len, ul, size_limit) - }) - .and_then(|()| raw_packer.write().unwrap().finalize()); - _ = finish_tx.send(status); - }) - .unwrap(); - }); - - Ok(packer) - } - - /// Adds the blob to the packfile - /// - /// # Arguments - /// - /// * `data` - The blob data - /// * `id` - The blob id - /// - /// # Errors - /// - /// * If sending the message to the raw packer fails. - pub fn add(&self, data: Bytes, id: BlobId) -> RusticResult<()> { - // compute size limit based on total size and size bounds - self.add_with_sizelimit(data, id, None).map_err(|err| { - RusticError::with_source( - ErrorKind::Internal, - "Failed to add blob `{id}` to packfile.", - err, - ) - .attach_context("id", id.to_string()) - .ask_report() - }) - } - - /// Adds the blob to the packfile, allows specifying a size limit for the pack file - /// - /// # Arguments - /// - /// * `data` - The blob data - /// * `id` - The blob id - /// * `size_limit` - The size limit for the pack file - /// - /// # Errors - /// - /// * If sending the message to the raw packer fails. 
-    fn add_with_sizelimit(
-        &self,
-        data: Bytes,
-        id: BlobId,
-        size_limit: Option<u32>,
-    ) -> PackerResult<()> {
-        self.sender
-            .send((data.clone(), id, size_limit))
-            .map_err(|err| PackerErrorKind::SendingCrossbeamMessage {
-                size_limit,
-                id,
-                data,
-                source: err,
-            })?;
-        Ok(())
-    }
-
-    /// Adds the already encrypted (and maybe compressed) blob to the packfile
-    ///
-    /// # Arguments
-    ///
-    /// * `data` - The blob data
-    /// * `id` - The blob id
-    /// * `data_len` - The length of the blob data
-    /// * `uncompressed_length` - The length of the blob data before compression
-    /// * `size_limit` - The size limit for the pack file
-    ///
-    /// # Errors
-    ///
-    /// * If the blob is already present in the index
-    /// * If sending the message to the raw packer fails.
-    fn add_raw(
-        &self,
-        data: &[u8],
-        id: &BlobId,
-        data_len: u64,
-        uncompressed_length: Option<NonZeroU32>,
-        size_limit: Option<u32>,
-    ) -> RusticResult<()> {
-        // only add if this blob is not present
-        if self.indexer.read().unwrap().has(id) {
-            Ok(())
-        } else {
-            self.raw_packer.write().unwrap().add_raw(
-                data,
-                id,
-                data_len,
-                uncompressed_length,
-                size_limit,
-            )
-        }
-    }
-
-    /// Finalizes the packer and does cleanup
-    ///
-    /// # Panics
-    ///
-    /// * If the channel could not be dropped
-    pub fn finalize(self) -> RusticResult<PackerStats> {
-        // cancel channel
-        drop(self.sender);
-        // wait for items in channel to be processed
-        self.finish
-            .recv()
-            .expect("Should be able to receive from channel to finalize packer.")
-    }
-}
-
 // TODO: add documentation!
 #[derive(Default, Debug, Clone, Copy)]
 pub struct PackerStats {
@@ -436,9 +200,9 @@ impl PackerStats {
 ///
 /// * `BE` - The backend type.
 #[allow(missing_debug_implementations, clippy::module_name_repetitions)]
-pub(crate) struct RawPacker<BE: DecryptWriteBackend> {
-    /// The backend to write to.
-    be: BE,
+pub(crate) struct Packer<C: CryptoKey> {
+    /// The crypto key to encrypt the written pack data with.
+    key: C,
     /// The blob type to pack.
     blob_type: BlobType,
     /// The file to write to
@@ -451,15 +215,13 @@ pub(crate) struct RawPacker<BE: DecryptWriteBackend> {
     created: SystemTime,
     /// The index of the pack
     index: IndexPack,
-    /// The actor to write the pack file
-    file_writer: Option<Actor<BE>>,
     /// The pack sizer
-    pack_sizer: PackSizer,
+    pub pack_sizer: PackSizer,
    /// The packer stats
-    stats: PackerStats,
+    pub stats: PackerStats,
 }
 
-impl<BE: DecryptWriteBackend> RawPacker<BE> {
+impl<C: CryptoKey> Packer<C> {
     /// Creates a new `RawPacker`.
     ///
     /// # Type Parameters
     ///
     /// * `BE` - The backend type.
     ///
     /// # Arguments
     ///
     /// * `be` - The backend to write to.
     /// * `blob_type` - The blob type.
     /// * `indexer` - The indexer to write to.
     /// * `config` - The config file.
     /// * `total_size` - The total size of the pack file.
-    fn new(
-        be: BE,
-        blob_type: BlobType,
-        indexer: SharedIndexer<BE>,
-        config: &ConfigFile,
-        total_size: u64,
-    ) -> Self {
-        let file_writer = Some(Actor::new(
-            FileWriterHandle {
-                be: be.clone(),
-                indexer,
-                cacheable: blob_type.is_cacheable(),
-            },
-            1,
-            1,
-        ));
-
+    pub fn new(key: C, blob_type: BlobType, config: &ConfigFile, total_size: u64) -> Self {
         let pack_sizer = PackSizer::from_config(config, blob_type, total_size);
 
         Self {
-            be,
+            key,
             blob_type,
             file: BytesMut::new(),
             size: 0,
             count: 0,
             created: SystemTime::now(),
             index: IndexPack::default(),
-            file_writer,
             pack_sizer,
             stats: PackerStats::default(),
         }
     }
 
     /// # Errors
     ///
     /// * If the packfile could not be saved
-    fn finalize(&mut self) -> RusticResult<PackerStats> {
-        self.save().map_err(|err| {
-            err.overwrite_kind(ErrorKind::Internal)
-                .prepend_guidance_line("Failed to save packfile. 
Data may be lost.") - .ask_report() - })?; + pub fn finalize(&mut self) -> RusticResult<(Option<(Bytes, IndexPack)>, PackerStats)> { + let stats = std::mem::take(&mut self.stats); - self.file_writer.take().unwrap().finalize()?; - - Ok(std::mem::take(&mut self.stats)) + Ok((self.save()?, stats)) } /// Writes the given data to the packfile. @@ -559,13 +298,12 @@ impl RawPacker { /// # Errors /// /// * If converting the data length to u64 fails - fn add_raw( + pub fn add( &mut self, data: &[u8], id: &BlobId, data_len: u64, uncompressed_length: Option, - size_limit: Option, ) -> RusticResult<()> { if self.has(id) { return Ok(()); @@ -585,8 +323,6 @@ impl RawPacker { self.stats.data_packed += data_len_packed; - let size_limit = size_limit.unwrap_or_else(|| self.pack_sizer.pack_size()); - let offset = self.size; let len = self.write_data(data).map_err(|err| { @@ -596,7 +332,6 @@ impl RawPacker { err, ) .attach_context("id", id.to_string()) - .attach_context("size_limit", size_limit.to_string()) .attach_context("data_length_packed", data_len_packed.to_string()) })?; @@ -605,23 +340,25 @@ impl RawPacker { self.count += 1; + Ok(()) + } + + pub fn needs_save(&self, size_limit: Option) -> bool { + if self.size == 0 { + return false; + } + + let size_limit = size_limit.unwrap_or_else(|| self.pack_sizer.pack_size()); + // check if PackFile needs to be saved let elapsed = self.created.elapsed().unwrap_or_else(|err| { warn!("couldn't get elapsed time from system time: {err:?}"); Duration::ZERO }); - if self.count >= constants::MAX_COUNT + self.count >= constants::MAX_COUNT || self.size >= size_limit || elapsed >= constants::MAX_AGE - { - self.pack_sizer.add_size(self.index.pack_size()); - self.save()?; - self.size = 0; - self.count = 0; - self.created = SystemTime::now(); - } - Ok(()) } /// Writes header and length of header to packfile @@ -644,7 +381,7 @@ impl RawPacker { })?; // encrypt and write to pack file - let data = self.be.key().encrypt_data(&data)?; + let data = self.key.encrypt_data(&data)?; let headerlen: u32 = data.len().try_into().map_err(|err| { RusticError::with_source( @@ -689,7 +426,6 @@ impl RawPacker { Ok(()) } - /// Saves the packfile /// /// # Errors @@ -700,279 +436,46 @@ impl RawPacker { /// /// * If converting the header length to u32 fails /// * If the header could not be written - fn save(&mut self) -> RusticResult<()> { - if self.size == 0 { - return Ok(()); + pub fn save_if_needed( + &mut self, + size_limit: Option, + ) -> RusticResult> { + if !self.needs_save(size_limit) { + return Ok(None); } - self.write_header()?; - - // write file to backend - let index = std::mem::take(&mut self.index); - let file = std::mem::replace(&mut self.file, BytesMut::new()); - self.file_writer - .as_ref() - .unwrap() - .send((file.into(), index)) - .map_err(|err| { - RusticError::with_source( - ErrorKind::Internal, - "Failed to send packfile to file writer.", - err, - ) - })?; - - Ok(()) - } - - fn has(&self, id: &BlobId) -> bool { - self.index.blobs.iter().any(|b| &b.id == id) - } -} - -// TODO: add documentation -/// # Type Parameters -/// -/// * `BE` - The backend type. -#[derive(Clone)] -pub(crate) struct FileWriterHandle { - /// The backend to write to. - be: BE, - /// The shared indexer containing the backend. - indexer: SharedIndexer, - /// Whether the file is cacheable. 
- cacheable: bool, -} - -impl FileWriterHandle { - // TODO: add documentation - fn process(&self, load: (Bytes, PackId, IndexPack)) -> RusticResult { - let (file, id, mut index) = load; - index.id = id; - self.be - .write_bytes(FileType::Pack, &id, self.cacheable, file)?; - index.time = Some(Local::now()); - Ok(index) - } - - fn index(&self, index: IndexPack) -> RusticResult<()> { - self.indexer.write().unwrap().add(index)?; - Ok(()) - } -} - -// TODO: add documentation -pub(crate) struct Actor { - /// The sender to send blobs to the raw packer. - sender: Sender<(Bytes, IndexPack)>, - /// The receiver to receive the status from the raw packer. - finish: Receiver>, -} - -impl Actor { - /// Creates a new `Actor`. - /// - /// # Type Parameters - /// - /// * `BE` - The backend type. - /// - /// # Arguments - /// - /// * `fwh` - The file writer handle. - /// * `queue_len` - The length of the queue. - /// * `par` - The number of parallel threads. - fn new( - fwh: FileWriterHandle, - queue_len: usize, - _par: usize, - ) -> Self { - let (tx, rx) = bounded(queue_len); - let (finish_tx, finish_rx) = bounded::>(0); - - let _join_handle = std::thread::spawn(move || { - scope(|scope| { - let status = rx - .into_iter() - .readahead_scoped(scope) - .map(|(file, index): (Bytes, IndexPack)| { - let id = hash(&file); - (file, PackId::from(id), index) - }) - .readahead_scoped(scope) - .map(|load| fwh.process(load)) - .readahead_scoped(scope) - .try_for_each(|index| fwh.index(index?)); - _ = finish_tx.send(status); - }) - .unwrap(); - }); - - Self { - sender: tx, - finish: finish_rx, - } + self.save() } - /// Sends the given data to the actor. - /// - /// # Arguments - /// - /// * `load` - The data to send. + /// Saves the packfile /// /// # Errors /// - /// If sending the message to the actor fails. - fn send(&self, load: (Bytes, IndexPack)) -> PackerResult<()> { - self.sender.send(load.clone()).map_err(|err| { - PackerErrorKind::SendingCrossbeamDataMessage { - data: load.0, - index_pack: load.1, - source: err, - } - })?; - Ok(()) - } - - /// Finalizes the actor and does cleanup - /// - /// # Panics - /// - /// * If the receiver is not present - fn finalize(self) -> RusticResult<()> { - // cancel channel - drop(self.sender); - // wait for items in channel to be processed - self.finish.recv().unwrap() - } -} - -/// The `Repacker` is responsible for repacking blobs into pack files. -/// -/// # Type Parameters -/// -/// * `BE` - The backend to read from. -#[allow(missing_debug_implementations)] -pub struct Repacker -where - BE: DecryptFullBackend, -{ - /// The backend to read from. - be: BE, - /// The packer to write to. - packer: Packer, - /// The size limit of the pack file. - size_limit: u32, -} - -impl Repacker { - /// Creates a new `Repacker`. - /// - /// # Type Parameters - /// - /// * `BE` - The backend to read from. - /// - /// # Arguments - /// - /// * `be` - The backend to read from. - /// * `blob_type` - The blob type. - /// * `indexer` - The indexer to write to. - /// * `config` - The config file. - /// * `total_size` - The total size of the pack file. 
+ /// If the header could not be written /// /// # Errors /// - /// * If the Packer could not be created - pub fn new( - be: BE, - blob_type: BlobType, - indexer: SharedIndexer, - config: &ConfigFile, - total_size: u64, - ) -> RusticResult { - let packer = Packer::new(be.clone(), blob_type, indexer, config, total_size)?; - let size_limit = PackSizer::from_config(config, blob_type, total_size).pack_size(); - Ok(Self { - be, - packer, - size_limit, - }) - } + /// * If converting the header length to u32 fails + /// * If the header could not be written + pub fn save(&mut self) -> RusticResult> { + self.created = SystemTime::now(); + self.count = 0; - /// Adds the blob to the packfile without any check - /// - /// # Arguments - /// - /// * `pack_id` - The pack id - /// * `blob` - The blob to add - /// - /// # Errors - /// - /// * If the blob could not be added - /// * If reading the blob from the backend fails - pub fn add_fast(&self, pack_id: &PackId, blob: &IndexBlob) -> RusticResult<()> { - let data = self.be.read_partial( - FileType::Pack, - pack_id, - blob.tpe.is_cacheable(), - blob.offset, - blob.length, - )?; - - self.packer - .add_raw( - &data, - &blob.id, - 0, - blob.uncompressed_length, - Some(self.size_limit), - ) - .map_err(|err| { - err.overwrite_kind(ErrorKind::Internal) - .prepend_guidance_line( - "Failed to fast-add (unchecked) blob `{blob_id}` to packfile.", - ) - .attach_context("blob_id", blob.id.to_string()) - })?; + if self.size == 0 { + return Ok(None); + } - Ok(()) - } + self.write_header()?; + // prepare everything for write to the backend + let file = std::mem::take(&mut self.file).into(); + let index = std::mem::take(&mut self.index); - /// Adds the blob to the packfile - /// - /// # Arguments - /// - /// * `pack_id` - The pack id - /// * `blob` - The blob to add - /// - /// # Errors - /// - /// * If the blob could not be added - /// * If reading the blob from the backend fails - pub fn add(&self, pack_id: &PackId, blob: &IndexBlob) -> RusticResult<()> { - let data = self.be.read_encrypted_partial( - FileType::Pack, - pack_id, - blob.tpe.is_cacheable(), - blob.offset, - blob.length, - blob.uncompressed_length, - )?; - - self.packer - .add_with_sizelimit(data, blob.id, Some(self.size_limit)) - .map_err(|err| { - RusticError::with_source( - ErrorKind::Internal, - "Failed to add blob to packfile.", - err, - ) - })?; + self.size = 0; - Ok(()) + Ok(Some((file, index))) } - /// Finalizes the repacker and returns the stats - pub fn finalize(self) -> RusticResult { - self.packer.finalize() + pub fn has(&self, id: &BlobId) -> bool { + self.index.blobs.iter().any(|b| &b.id == id) } } diff --git a/crates/core/src/blob/repopacker.rs b/crates/core/src/blob/repopacker.rs new file mode 100644 index 000000000..9916fea79 --- /dev/null +++ b/crates/core/src/blob/repopacker.rs @@ -0,0 +1,552 @@ +use std::{ + num::NonZeroU32, + sync::{Arc, RwLock}, +}; + +use bytes::Bytes; +use chrono::Local; +use crossbeam_channel::{Receiver, Sender, bounded}; +use pariter::{IteratorExt, scope}; + +use crate::{ + backend::{ + FileType, + decrypt::{DecryptFullBackend, DecryptWriteBackend}, + }, + blob::{BlobId, BlobType}, + crypto::hasher::hash, + error::{ErrorKind, RusticError, RusticResult}, + index::indexer::SharedIndexer, + repofile::{ + configfile::ConfigFile, + indexfile::{IndexBlob, IndexPack}, + packfile::PackId, + }, +}; + +use super::packer::{PackSizer, Packer, PackerStats}; + +/// [`PackerErrorKind`] describes the errors that can be returned for a Packer +#[derive(thiserror::Error, Debug, 
displaydoc::Display)] +#[non_exhaustive] +pub enum PackerErrorKind { + /// Conversion from `{from}` to `{to}` failed: `{source}` + Conversion { + to: &'static str, + from: &'static str, + source: std::num::TryFromIntError, + }, + /// Sending crossbeam message failed: `size_limit`: `{size_limit:?}`, `id`: `{id:?}`, `data`: `{data:?}` : `{source}` + SendingCrossbeamMessage { + size_limit: Option, + id: BlobId, + data: Bytes, + source: crossbeam_channel::SendError<(Bytes, BlobId, Option)>, + }, + /// Sending crossbeam data message failed: `data`: `{data:?}`, `index_pack`: `{index_pack:?}` : `{source}` + SendingCrossbeamDataMessage { + data: Bytes, + index_pack: IndexPack, + source: crossbeam_channel::SendError<(Bytes, IndexPack)>, + }, +} + +pub(crate) type PackerResult = Result>; + +/// The `Packer` is responsible for packing blobs into pack files. +/// +/// # Type Parameters +/// +/// * `BE` - The backend type. +#[allow(missing_debug_implementations)] +#[allow(clippy::struct_field_names)] +#[derive(Clone)] +pub struct RepositoryPacker { + /// The raw packer wrapped in an `Arc` and `RwLock`. + // This is a hack: raw_packer and indexer are only used in the add_raw() method. + // TODO: Refactor as actor, like the other add() methods + packer: Arc>>, + /// The shared indexer containing the backend. + indexer: SharedIndexer, + /// The actor to write the pack file + file_writer: Actor, + /// The sender to send blobs to the raw packer. + sender: Sender<(Bytes, BlobId, Option)>, + /// The receiver to receive the status from the raw packer. + finish: Receiver>, +} + +impl RepositoryPacker { + /// Creates a new `Packer`. + /// + /// # Type Parameters + /// + /// * `BE` - The backend type. + /// + /// # Arguments + /// + /// * `be` - The backend to write to. + /// * `blob_type` - The blob type. + /// * `indexer` - The indexer to write to. + /// * `config` - The config file. + /// * `total_size` - The total size of the pack file. + /// + /// # Errors + /// + /// * If sending the message to the raw packer fails. 
+ /// * If converting the data length to u64 fails + #[allow(clippy::unnecessary_wraps)] + pub fn new( + be: BE, + blob_type: BlobType, + indexer: SharedIndexer, + config: &ConfigFile, + total_size: u64, + ) -> RusticResult { + let packer = Arc::new(RwLock::new(Packer::new( + *be.key(), + blob_type, + config, + total_size, + ))); + + let file_writer = Actor::new( + FileWriterHandle { + be: be.clone(), + indexer: indexer.clone(), + cacheable: blob_type.is_cacheable(), + }, + 1, + 1, + ); + + let (tx, rx) = bounded(0); + let (finish_tx, finish_rx) = bounded::>(0); + let repository_packer = Self { + packer: packer.clone(), + indexer: indexer.clone(), + file_writer: file_writer.clone(), + sender: tx, + finish: finish_rx, + }; + + let repo_packer = repository_packer.clone(); + let _join_handle = std::thread::spawn(move || { + scope(|scope| { + let status = rx + .into_iter() + .readahead_scoped(scope) + // early check if id is already contained + .filter(|(_, id, _)| !indexer.read().unwrap().has(id)) + .filter(|(_, id, _)| !packer.read().unwrap().has(id)) + .readahead_scoped(scope) + .parallel_map_scoped( + scope, + |(data, id, size_limit): (Bytes, BlobId, Option)| { + let (data, data_len, uncompressed_length) = be.process_data(&data)?; + Ok(( + data, + id, + u64::from(data_len), + uncompressed_length, + size_limit, + )) + }, + ) + .readahead_scoped(scope) + // check again if id is already contained + // TODO: We may still save duplicate blobs - the indexer is only updated when the packfile write has completed + .filter(|res| { + res.as_ref().map_or_else( + |_| true, + |(_, id, _, _, _)| !indexer.read().unwrap().has(id), + ) + }) + .try_for_each(|item: RusticResult<_>| -> RusticResult<()> { + let (data, id, data_len, ul, size_limit) = item?; + repo_packer.add_raw(&data, &id, data_len, ul, size_limit)?; + + Ok(()) + }) + .and_then(|()| { + let (res, stats) = packer.write().unwrap().finalize()?; + if let Some((file, index)) = res { + file_writer.send((file, index)).map_err(|err| { + RusticError::with_source( + ErrorKind::Internal, + "Failed to send packfile to file writer.", + err, + ) + })?; + } + Ok(stats) + }); + _ = finish_tx.send(status); + }) + .unwrap(); + }); + + Ok(repository_packer) + } + + /// Adds the blob to the packfile + /// + /// # Arguments + /// + /// * `data` - The blob data + /// * `id` - The blob id + /// + /// # Errors + /// + /// * If sending the message to the raw packer fails. + pub fn add(&self, data: Bytes, id: BlobId) -> RusticResult<()> { + // compute size limit based on total size and size bounds + self.add_with_sizelimit(data, id, None).map_err(|err| { + RusticError::with_source( + ErrorKind::Internal, + "Failed to add blob `{id}` to packfile.", + err, + ) + .attach_context("id", id.to_string()) + .ask_report() + }) + } + + /// Adds the blob to the packfile, allows specifying a size limit for the pack file + /// + /// # Arguments + /// + /// * `data` - The blob data + /// * `id` - The blob id + /// * `size_limit` - The size limit for the pack file + /// + /// # Errors + /// + /// * If sending the message to the raw packer fails. 
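The constructor above wires a rendezvous channel into a worker thread that deduplicates, compresses in parallel, packs, and forwards finished files to the writer actor. The overall shape, reduced to std-only types (a sketch: `sync_channel(0)` stands in for crossbeam's `bounded(0)`, and a `HashSet` stands in for the indexer's `has` check; the readahead/parallel-map stages are omitted):

```rust
use std::collections::HashSet;
use std::sync::mpsc::{sync_channel, Receiver, SyncSender};
use std::thread;

struct Pipeline {
    sender: SyncSender<(u64, Vec<u8>)>, // (blob id, blob data)
    finish: Receiver<usize>,            // stats reported by the worker
}

fn spawn_pipeline() -> Pipeline {
    let (tx, rx) = sync_channel(0);
    let (finish_tx, finish_rx) = sync_channel(0);
    let _worker = thread::spawn(move || {
        let mut seen = HashSet::new();
        let mut packed = 0;
        for (id, data) in rx {
            // early dedup check, mirroring the `!indexer.has(id)` filter
            if seen.insert(id) {
                packed += data.len(); // stand-in for compress + pack + send
            }
        }
        // a closed channel means finalize: report the result back
        let _ = finish_tx.send(packed);
    });
    Pipeline { sender: tx, finish: finish_rx }
}

fn main() {
    let p = spawn_pipeline();
    for id in [1u64, 2, 1] {
        p.sender.send((id, vec![0; 8])).unwrap();
    }
    drop(p.sender); // finalize(): close the channel...
    let stats = p.finish.recv().unwrap(); // ...and wait for the worker
    assert_eq!(stats, 16); // the duplicate id 1 was skipped
}
```

Dropping the sender is the shutdown signal, which is exactly how `finalize()` below drains the pipeline before collecting the stats.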
+ fn add_with_sizelimit( + &self, + data: Bytes, + id: BlobId, + size_limit: Option, + ) -> PackerResult<()> { + self.sender + .send((data.clone(), id, size_limit)) + .map_err(|err| PackerErrorKind::SendingCrossbeamMessage { + size_limit, + id, + data, + source: err, + })?; + Ok(()) + } + + /// Adds the already encrypted (and maybe compressed) blob to the packfile + /// + /// # Arguments + /// + /// * `data` - The blob data + /// * `id` - The blob id + /// * `data_len` - The length of the blob data + /// * `uncompressed_length` - The length of the blob data before compression + /// * `size_limit` - The size limit for the pack file + /// + /// # Errors + /// + /// * If the blob is already present in the index + /// * If sending the message to the raw packer fails. + fn add_raw( + &self, + data: &[u8], + id: &BlobId, + data_len: u64, + uncompressed_length: Option, + size_limit: Option, + ) -> RusticResult<()> { + // only add if this blob is not present + if !self.indexer.read().unwrap().has(id) { + let mut raw_packer = self.packer.write().unwrap(); + raw_packer.add(data, id, data_len, uncompressed_length)?; + + if let Some((file, index)) = raw_packer.save_if_needed(size_limit)? { + self.file_writer.send((file, index)).map_err(|err| { + RusticError::with_source( + ErrorKind::Internal, + "Failed to send packfile to file writer.", + err, + ) + })?; + } + } + Ok(()) + } + + /// Finalizes the packer and does cleanup + /// + /// # Panics + /// + /// * If the channel could not be dropped + pub fn finalize(self) -> RusticResult { + // cancel channel + drop(self.sender); + // wait for items in channel to be processed + self.finish + .recv() + .expect("Should be able to receive from channel to finalize packer.") + } +} + +// TODO: add documentation +/// # Type Parameters +/// +/// * `BE` - The backend type. +#[derive(Clone)] +pub(crate) struct FileWriterHandle { + /// The backend to write to. + be: BE, + /// The shared indexer containing the backend. + indexer: SharedIndexer, + /// Whether the file is cacheable. + cacheable: bool, +} + +impl FileWriterHandle { + // TODO: add documentation + fn process(&self, load: (Bytes, PackId, IndexPack)) -> RusticResult { + let (file, id, mut index) = load; + index.id = id; + self.be + .write_bytes(FileType::Pack, &id, self.cacheable, file)?; + index.time = Some(Local::now()); + Ok(index) + } + + fn index(&self, index: IndexPack) -> RusticResult<()> { + let res = { + let mut indexer = self.indexer.write().unwrap(); + + indexer.add(index); + indexer.save_if_needed() + }; + if let Some(file) = res { + let _ = self.be.save_file(&file)?; + } + Ok(()) + } +} + +// TODO: add documentation +#[derive(Clone)] +pub(crate) struct Actor { + /// The sender to send blobs to the raw packer. + sender: Sender<(Bytes, IndexPack)>, + /// The receiver to receive the status from the raw packer. + finish: Receiver>, +} + +impl Actor { + /// Creates a new `Actor`. + /// + /// # Type Parameters + /// + /// * `BE` - The backend type. + /// + /// # Arguments + /// + /// * `fwh` - The file writer handle. + /// * `queue_len` - The length of the queue. + /// * `par` - The number of parallel threads. 
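The writer actor below hashes each finished file before writing it, because pack files are content-addressed: the `PackId` can only be derived once the bytes are final, after which `process()` patches it into the `IndexPack`. A toy illustration of that naming step (rustic actually derives ids from a cryptographic hash; `DefaultHasher` is used here only to keep the sketch dependency-free):

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

// Stand-in for `hash(&file)` followed by `PackId::from(id)`.
fn pack_id(file: &[u8]) -> u64 {
    let mut hasher = DefaultHasher::new();
    file.hash(&mut hasher);
    hasher.finish()
}

fn main() {
    let file = b"finished pack bytes";
    // The id is only known once the content is final, so it is computed
    // in the actor's map stage, right before the file is written.
    println!("pack-{:016x}", pack_id(file));
}
```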
+ fn new( + fwh: FileWriterHandle, + queue_len: usize, + _par: usize, + ) -> Self { + let (tx, rx) = bounded(queue_len); + let (finish_tx, finish_rx) = bounded::>(0); + + let _join_handle = std::thread::spawn(move || { + scope(|scope| { + let status = rx + .into_iter() + .readahead_scoped(scope) + .map(|(file, index): (Bytes, IndexPack)| { + let id = hash(&file); + (file, PackId::from(id), index) + }) + .readahead_scoped(scope) + .map(|load| fwh.process(load)) + .readahead_scoped(scope) + .try_for_each(|index| fwh.index(index?)); + _ = finish_tx.send(status); + }) + .unwrap(); + }); + + Self { + sender: tx, + finish: finish_rx, + } + } + + /// Sends the given data to the actor. + /// + /// # Arguments + /// + /// * `load` - The data to send. + /// + /// # Errors + /// + /// If sending the message to the actor fails. + fn send(&self, load: (Bytes, IndexPack)) -> PackerResult<()> { + self.sender.send(load.clone()).map_err(|err| { + PackerErrorKind::SendingCrossbeamDataMessage { + data: load.0, + index_pack: load.1, + source: err, + } + })?; + Ok(()) + } + + /// Finalizes the actor and does cleanup + /// + /// # Panics + /// + /// * If the receiver is not present + fn finalize(self) -> RusticResult<()> { + // cancel channel + drop(self.sender); + // wait for items in channel to be processed + self.finish.recv().unwrap() + } +} + +/// The `Repacker` is responsible for repacking blobs into pack files. +/// +/// # Type Parameters +/// +/// * `BE` - The backend to read from. +#[allow(missing_debug_implementations)] +pub struct Repacker +where + BE: DecryptFullBackend, +{ + /// The backend to read from. + be: BE, + /// The packer to write to. + packer: RepositoryPacker, + /// The size limit of the pack file. + size_limit: u32, +} + +impl Repacker { + /// Creates a new `Repacker`. + /// + /// # Type Parameters + /// + /// * `BE` - The backend to read from. + /// + /// # Arguments + /// + /// * `be` - The backend to read from. + /// * `blob_type` - The blob type. + /// * `indexer` - The indexer to write to. + /// * `config` - The config file. + /// * `total_size` - The total size of the pack file. 
+ /// + /// # Errors + /// + /// * If the Packer could not be created + pub fn new( + be: BE, + blob_type: BlobType, + indexer: SharedIndexer, + config: &ConfigFile, + total_size: u64, + ) -> RusticResult { + let packer = RepositoryPacker::new(be.clone(), blob_type, indexer, config, total_size)?; + let size_limit = PackSizer::from_config(config, blob_type, total_size).pack_size(); + Ok(Self { + be, + packer, + size_limit, + }) + } + + /// Adds the blob to the packfile without any check + /// + /// # Arguments + /// + /// * `pack_id` - The pack id + /// * `blob` - The blob to add + /// + /// # Errors + /// + /// * If the blob could not be added + /// * If reading the blob from the backend fails + pub fn add_fast(&self, pack_id: &PackId, blob: &IndexBlob) -> RusticResult<()> { + let data = self.be.read_partial( + FileType::Pack, + pack_id, + blob.tpe.is_cacheable(), + blob.offset, + blob.length, + )?; + + self.packer + .add_raw( + &data, + &blob.id, + 0, + blob.uncompressed_length, + Some(self.size_limit), + ) + .map_err(|err| { + err.overwrite_kind(ErrorKind::Internal) + .prepend_guidance_line( + "Failed to fast-add (unchecked) blob `{blob_id}` to packfile.", + ) + .attach_context("blob_id", blob.id.to_string()) + })?; + + Ok(()) + } + + /// Adds the blob to the packfile + /// + /// # Arguments + /// + /// * `pack_id` - The pack id + /// * `blob` - The blob to add + /// + /// # Errors + /// + /// * If the blob could not be added + /// * If reading the blob from the backend fails + pub fn add(&self, pack_id: &PackId, blob: &IndexBlob) -> RusticResult<()> { + let data = self.be.read_encrypted_partial( + FileType::Pack, + pack_id, + blob.tpe.is_cacheable(), + blob.offset, + blob.length, + blob.uncompressed_length, + )?; + + self.packer + .add_with_sizelimit(data, blob.id, Some(self.size_limit)) + .map_err(|err| { + RusticError::with_source( + ErrorKind::Internal, + "Failed to add blob to packfile.", + err, + ) + })?; + + Ok(()) + } + + /// Finalizes the repacker and returns the stats + pub fn finalize(self) -> RusticResult { + self.packer.finalize() + } +} diff --git a/crates/core/src/chunker.rs b/crates/core/src/chunker.rs index 9516a306e..41b2d2f1f 100644 --- a/crates/core/src/chunker.rs +++ b/crates/core/src/chunker.rs @@ -29,7 +29,7 @@ const fn default_predicate(x: u64) -> bool { } /// `ChunkIter` is an iterator that chunks data. -pub(crate) struct ChunkIter { +pub(crate) struct ChunkIter { /// The buffer used for reading. buf: Vec, @@ -58,7 +58,7 @@ pub(crate) struct ChunkIter { finished: bool, } -impl ChunkIter { +impl ChunkIter { /// Creates a new `ChunkIter`. 
/// /// # Arguments @@ -81,7 +81,7 @@ impl ChunkIter { } } -impl Iterator for ChunkIter { +impl Iterator for ChunkIter { type Item = RusticResult>; fn next(&mut self) -> Option { diff --git a/crates/core/src/commands/copy.rs b/crates/core/src/commands/copy.rs index fd7a17bcd..3b1e1ca79 100644 --- a/crates/core/src/commands/copy.rs +++ b/crates/core/src/commands/copy.rs @@ -5,7 +5,7 @@ use rayon::prelude::{IntoParallelRefIterator, ParallelBridge, ParallelIterator}; use crate::{ backend::{decrypt::DecryptWriteBackend, node::NodeType}, - blob::{BlobId, BlobType, packer::Packer, tree::TreeStreamerOnce}, + blob::{BlobId, BlobType, repopacker::RepositoryPacker, tree::TreeStreamerOnce}, error::RusticResult, index::{ReadIndex, indexer::Indexer}, progress::{Progress, ProgressBars}, @@ -58,16 +58,16 @@ pub(crate) fn copy<'a, Q, R: IndexedFull, P: ProgressBars, S: IndexedIds>( let be = repo.dbe(); let index = repo.index(); let index_dest = repo_dest.index(); - let indexer = Indexer::new(be_dest.clone()).into_shared(); + let indexer = Indexer::new().into_shared(); - let data_packer = Packer::new( + let data_packer = RepositoryPacker::new( be_dest.clone(), BlobType::Data, indexer.clone(), repo_dest.config(), index_dest.total_size(BlobType::Data), )?; - let tree_packer = Packer::new( + let tree_packer = RepositoryPacker::new( be_dest.clone(), BlobType::Tree, indexer.clone(), @@ -128,7 +128,10 @@ pub(crate) fn copy<'a, Q, R: IndexedFull, P: ProgressBars, S: IndexedIds>( _ = data_packer.finalize()?; _ = tree_packer.finalize()?; - indexer.write().unwrap().finalize()?; + let res = indexer.write().unwrap().finalize(); + if let Some(file) = res { + let _ = be_dest.save_file(&file)?; + } let p = pb.progress_counter("saving snapshots..."); be_dest.save_list(snaps.iter(), p)?; diff --git a/crates/core/src/commands/merge.rs b/crates/core/src/commands/merge.rs index 5533f9687..0a251a257 100644 --- a/crates/core/src/commands/merge.rs +++ b/crates/core/src/commands/merge.rs @@ -8,7 +8,7 @@ use crate::{ backend::{decrypt::DecryptWriteBackend, node::Node}, blob::{ BlobId, BlobType, - packer::Packer, + repopacker::RepositoryPacker, tree::{self, Tree, TreeId}, }, error::{ErrorKind, RusticError, RusticResult}, @@ -103,8 +103,8 @@ pub(crate) fn merge_trees( ) -> RusticResult { let be = repo.dbe(); let index = repo.index(); - let indexer = Indexer::new(repo.dbe().clone()).into_shared(); - let packer = Packer::new( + let indexer = Indexer::new().into_shared(); + let packer = RepositoryPacker::new( repo.dbe().clone(), BlobType::Tree, indexer.clone(), @@ -136,7 +136,10 @@ pub(crate) fn merge_trees( let p = repo.pb.progress_spinner("merging snapshots..."); let tree_merged = tree::merge_trees(be, index, trees, cmp, &save, summary)?; let stats = packer.finalize()?; - indexer.write().unwrap().finalize()?; + let res = indexer.write().unwrap().finalize(); + if let Some(file) = res { + let _ = be.save_file(&file)?; + } p.finish(); stats.apply(summary, BlobType::Tree); diff --git a/crates/core/src/commands/prune.rs b/crates/core/src/commands/prune.rs index 3b3efc9fe..9c7c4b1c9 100644 --- a/crates/core/src/commands/prune.rs +++ b/crates/core/src/commands/prune.rs @@ -26,8 +26,7 @@ use crate::{ node::NodeType, }, blob::{ - BlobId, BlobType, BlobTypeMap, Initialize, - packer::{PackSizer, Repacker}, + BlobId, BlobType, BlobTypeMap, Initialize, packer::PackSizer, repopacker::Repacker, tree::TreeStreamerOnce, }, error::{ErrorKind, RusticError, RusticResult}, @@ -35,6 +34,7 @@ use crate::{ GlobalIndex, ReadGlobalIndex, ReadIndex, 
binarysorted::{IndexCollector, IndexType}, indexer::Indexer, + repoindexer::RepositoryIndexer, }, progress::{Progress, ProgressBars}, repofile::{ @@ -1247,7 +1247,7 @@ pub(crate) fn prune_repository( let be = repo.dbe(); let pb = &repo.pb; - let indexer = Indexer::new_unindexed(be.clone()).into_shared(); + let indexer = Indexer::new_unindexed().into_shared(); // Calculate an approximation of sizes after pruning. // The size actually is: @@ -1281,6 +1281,8 @@ pub(crate) fn prune_repository( size_after_prune[BlobType::Data], )?; + let indexer = RepositoryIndexer::new(be.clone(), indexer); + // mark unreferenced packs for deletion if !prune_plan.existing_packs.is_empty() { if opts.instant_delete { @@ -1297,7 +1299,7 @@ pub(crate) fn prune_repository( time: Some(Local::now()), blobs: Vec::new(), }; - indexer.write().unwrap().add_remove(pack)?; + indexer.add_remove(pack)?; p.inc(1); } p.finish(); @@ -1365,7 +1367,7 @@ pub(crate) fn prune_repository( PackToDo::Keep => { // keep pack: add to new index let pack = pack.into_index_pack(); - indexer.write().unwrap().add(pack)?; + indexer.add(pack)?; } PackToDo::Repack => { // TODO: repack in parallel @@ -1391,7 +1393,7 @@ pub(crate) fn prune_repository( } else { // mark pack for removal let pack = pack.into_index_pack_with_time(prune_plan.time); - indexer.write().unwrap().add_remove(pack)?; + indexer.add_remove(pack)?; } } PackToDo::MarkDelete => { @@ -1400,7 +1402,7 @@ pub(crate) fn prune_repository( } else { // mark pack for removal let pack = pack.into_index_pack_with_time(prune_plan.time); - indexer.write().unwrap().add_remove(pack)?; + indexer.add_remove(pack)?; } } PackToDo::KeepMarked | PackToDo::KeepMarkedAndCorrect => { @@ -1411,13 +1413,13 @@ pub(crate) fn prune_repository( // Note the timestamp shouldn't be None here, however if it is not not set, use the current time to heal the entry! let time = pack.time.unwrap_or(prune_plan.time); let pack = pack.into_index_pack_with_time(time); - indexer.write().unwrap().add_remove(pack)?; + indexer.add_remove(pack)?; } } PackToDo::Recover => { // recover pack: add to new index in section packs let pack = pack.into_index_pack_with_time(prune_plan.time); - indexer.write().unwrap().add(pack)?; + indexer.add(pack)?; } PackToDo::Delete => delete_pack(pack), } @@ -1425,7 +1427,7 @@ pub(crate) fn prune_repository( })?; _ = tree_repacker.finalize()?; _ = data_repacker.finalize()?; - indexer.write().unwrap().finalize()?; + indexer.finalize()?; p.finish(); // remove old index files first as they may reference pack files which are removed soon. 
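Everywhere a shared indexer or packer is touched in this patch, the pattern is the same: mutate the in-memory state inside a short write-lock scope, let `save_if_needed` return any finished file, and only then perform the backend write with the lock already released. A condensed sketch of that locking discipline (stand-in types, not the rustic-core API):

```rust
use std::sync::{Arc, RwLock};

#[derive(Default)]
struct MiniIndexer {
    pending: Vec<String>,
    count: usize,
}

impl MiniIndexer {
    fn add(&mut self, pack: String) {
        self.pending.push(pack);
        self.count += 1;
    }

    fn save_if_needed(&mut self) -> Option<Vec<String>> {
        if self.count < 2 {
            return None; // illustrative threshold
        }
        self.count = 0;
        Some(std::mem::take(&mut self.pending))
    }
}

fn add_and_maybe_save(indexer: &Arc<RwLock<MiniIndexer>>, pack: String) {
    // The braces end the write-lock scope *before* any I/O starts,
    // so a slow backend write can never block other lock users.
    let pending = {
        let mut guard = indexer.write().unwrap();
        guard.add(pack);
        guard.save_if_needed()
    };
    if let Some(packs) = pending {
        println!("writing index file with {} packs", packs.len()); // backend I/O
    }
}

fn main() {
    let indexer = Arc::new(RwLock::new(MiniIndexer::default()));
    for i in 0..3 {
        add_and_maybe_save(&indexer, format!("pack-{i}"));
    }
}
```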
diff --git a/crates/core/src/commands/repair/index.rs b/crates/core/src/commands/repair/index.rs index 9659bab77..c3f3a35ae 100644 --- a/crates/core/src/commands/repair/index.rs +++ b/crates/core/src/commands/repair/index.rs @@ -73,7 +73,7 @@ pub(crate) fn repair_index( let pack_read_header = checker.into_pack_to_read(); repo.warm_up_wait(pack_read_header.iter().map(|(id, _, _)| *id))?; - let indexer = Indexer::new(be.clone()).into_shared(); + let mut indexer = Indexer::new(); let p = repo.pb.progress_counter("reading pack headers"); p.set_length(pack_read_header.len().try_into().map_err(|err| { @@ -101,13 +101,18 @@ pub(crate) fn repair_index( }; if !dry_run { // write pack file to index - without the delete mark - indexer.write().unwrap().add_with(pack, false)?; + indexer.add_with(pack, false); + if let Some(file) = indexer.save_if_needed() { + _ = be.save_file(&file)?; + } } } } p.inc(1); } - indexer.write().unwrap().finalize()?; + if let Some(file) = indexer.finalize() { + _ = be.save_file(&file)?; + } p.finish(); Ok(()) diff --git a/crates/core/src/commands/repair/snapshots.rs b/crates/core/src/commands/repair/snapshots.rs index ed4370088..fffef0bf9 100644 --- a/crates/core/src/commands/repair/snapshots.rs +++ b/crates/core/src/commands/repair/snapshots.rs @@ -11,7 +11,7 @@ use crate::{ }, blob::{ BlobId, BlobType, - packer::Packer, + repopacker::RepositoryPacker, tree::{Tree, TreeId}, }, error::{ErrorKind, RusticError, RusticResult}, @@ -106,8 +106,8 @@ pub(crate) fn repair_snapshots( let mut state = RepairState::default(); - let indexer = Indexer::new(be.clone()).into_shared(); - let mut packer = Packer::new( + let indexer = Indexer::new().into_shared(); + let mut packer = RepositoryPacker::new( be.clone(), BlobType::Tree, indexer.clone(), @@ -154,7 +154,10 @@ pub(crate) fn repair_snapshots( if !dry_run { _ = packer.finalize()?; - indexer.write().unwrap().finalize()?; + let mut indexer = indexer.write().unwrap(); + if let Some(file) = indexer.finalize() { + _ = be.save_file(&file)?; + } } if opts.delete { @@ -195,7 +198,7 @@ pub(crate) fn repair_tree( be: &impl DecryptFullBackend, opts: &RepairSnapshotsOptions, index: &impl ReadGlobalIndex, - packer: &mut Packer, + packer: &mut RepositoryPacker, id: Option, state: &mut RepairState, dry_run: bool, diff --git a/crates/core/src/index.rs b/crates/core/src/index.rs index 81f667876..14daba2cd 100644 --- a/crates/core/src/index.rs +++ b/crates/core/src/index.rs @@ -17,6 +17,7 @@ use crate::{ pub(crate) mod binarysorted; pub(crate) mod indexer; +pub(crate) mod repoindexer; /// An entry in the index #[derive(Debug, Clone, Copy, PartialEq, Eq, Constructor)] diff --git a/crates/core/src/index/indexer.rs b/crates/core/src/index/indexer.rs index 467595c16..2156140cc 100644 --- a/crates/core/src/index/indexer.rs +++ b/crates/core/src/index/indexer.rs @@ -7,9 +7,7 @@ use std::{ use log::warn; use crate::{ - backend::decrypt::DecryptWriteBackend, blob::BlobId, - error::RusticResult, repofile::indexfile::{IndexFile, IndexPack}, }; @@ -22,16 +20,11 @@ pub(super) mod constants { pub(super) const MAX_AGE: Duration = Duration::from_secs(300); } -pub(crate) type SharedIndexer = Arc>>; +pub(crate) type SharedIndexer = Arc>; /// The `Indexer` is responsible for indexing blobs. #[derive(Debug)] -pub struct Indexer -where - BE: DecryptWriteBackend, -{ - /// The backend to write to. - be: BE, +pub struct Indexer { /// The index file. file: IndexFile, /// The number of blobs indexed. 
@@ -42,7 +35,7 @@ where indexed: Option>, } -impl Indexer { +impl Indexer { /// Creates a new `Indexer`. /// /// # Type Parameters @@ -52,9 +45,8 @@ impl Indexer { /// # Arguments /// /// * `be` - The backend to write to. - pub fn new(be: BE) -> Self { + pub fn new() -> Self { Self { - be, file: IndexFile::default(), count: 0, created: SystemTime::now(), @@ -71,9 +63,8 @@ impl Indexer { /// # Arguments /// /// * `be` - The backend to write to. - pub fn new_unindexed(be: BE) -> Self { + pub fn new_unindexed() -> Self { Self { - be, file: IndexFile::default(), count: 0, created: SystemTime::now(), @@ -81,19 +72,12 @@ impl Indexer { } } - /// Resets the indexer. - pub fn reset(&mut self) { - self.file = IndexFile::default(); - self.count = 0; - self.created = SystemTime::now(); - } - /// Returns a `SharedIndexer` to use in multiple threads. /// /// # Type Parameters /// /// * `BE` - The backend type. - pub fn into_shared(self) -> SharedIndexer { + pub fn into_shared(self) -> SharedIndexer { Arc::new(RwLock::new(self)) } @@ -102,7 +86,23 @@ impl Indexer { /// # Errors /// /// * If the index file could not be serialized. - pub fn finalize(&self) -> RusticResult<()> { + pub fn finalize(&mut self) -> Option { + self.save() + } + + pub fn needs_save(&self) -> bool { + // check if IndexFile needs to be saved + let elapsed = self.created.elapsed().unwrap_or_else(|err| { + warn!("couldn't get elapsed time from system time: {err:?}"); + Duration::ZERO + }); + self.count >= constants::MAX_COUNT || elapsed >= constants::MAX_AGE + } + + pub fn save_if_needed(&mut self) -> Option { + if !self.needs_save() { + return None; + } self.save() } @@ -111,11 +111,14 @@ impl Indexer { /// # Errors /// /// * If the index file could not be serialized. - pub fn save(&self) -> RusticResult<()> { - if (self.file.packs.len() + self.file.packs_to_delete.len()) > 0 { - _ = self.be.save_file(&self.file)?; + pub fn save(&mut self) -> Option { + if self.file.packs.is_empty() && self.file.packs_to_delete.is_empty() { + return None; } - Ok(()) + let file = std::mem::take(&mut self.file); + self.count = 0; + self.created = SystemTime::now(); + Some(file) } /// Adds a pack to the `Indexer`. @@ -127,8 +130,8 @@ impl Indexer { /// # Errors /// /// * If the index file could not be serialized. - pub fn add(&mut self, pack: IndexPack) -> RusticResult<()> { - self.add_with(pack, false) + pub fn add(&mut self, pack: IndexPack) { + self.add_with(pack, false); } /// Adds a pack to the `Indexer` and removes it from the backend. @@ -140,8 +143,8 @@ impl Indexer { /// # Errors /// /// * If the index file could not be serialized. - pub fn add_remove(&mut self, pack: IndexPack) -> RusticResult<()> { - self.add_with(pack, true) + pub fn add_remove(&mut self, pack: IndexPack) { + self.add_with(pack, true); } /// Adds a pack to the `Indexer`. @@ -154,7 +157,7 @@ impl Indexer { /// # Errors /// /// * If the index file could not be serialized. 
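`save()` above relies on `std::mem::take`, which swaps in `Default::default()` and returns the old value, handing the finished `IndexFile` to the caller while resetting the indexer in a single move, with no clone and no `Option` dance. In isolation (a stand-in struct, not the real `IndexFile`):

```rust
#[derive(Default, Debug, PartialEq)]
struct IndexFile {
    packs: Vec<u32>,
}

fn main() {
    let mut file = IndexFile { packs: vec![1, 2, 3] };
    let finished = std::mem::take(&mut file);
    assert_eq!(finished.packs, vec![1, 2, 3]); // caller now owns the data
    assert_eq!(file, IndexFile::default());    // indexer state is reset
}
```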
- pub fn add_with(&mut self, pack: IndexPack, delete: bool) -> RusticResult<()> { + pub fn add_with(&mut self, pack: IndexPack, delete: bool) { self.count += pack.blobs.len(); if let Some(indexed) = &mut self.indexed { @@ -164,17 +167,6 @@ impl Indexer { } self.file.add(pack, delete); - - // check if IndexFile needs to be saved - let elapsed = self.created.elapsed().unwrap_or_else(|err| { - warn!("couldn't get elapsed time from system time: {err:?}"); - Duration::ZERO - }); - if self.count >= constants::MAX_COUNT || elapsed >= constants::MAX_AGE { - self.save()?; - self.reset(); - } - Ok(()) } /// Returns whether the given id is indexed. diff --git a/crates/core/src/index/repoindexer.rs b/crates/core/src/index/repoindexer.rs new file mode 100644 index 000000000..2f072a60b --- /dev/null +++ b/crates/core/src/index/repoindexer.rs @@ -0,0 +1,103 @@ +use crate::{ + backend::decrypt::DecryptWriteBackend, blob::BlobId, error::RusticResult, + repofile::indexfile::IndexPack, +}; + +use super::indexer::SharedIndexer; + +/// The `Indexer` is responsible for indexing blobs. +#[derive(Debug, Clone)] +pub struct RepositoryIndexer +where + BE: DecryptWriteBackend, +{ + /// The backend to write to. + be: BE, + /// The index file. + raw_indexer: SharedIndexer, +} + +impl RepositoryIndexer { + /// Creates a new `Indexer`. + /// + /// # Type Parameters + /// + /// * `BE` - The backend type. + /// + /// # Arguments + /// + /// * `be` - The backend to write to. + pub fn new(be: BE, raw_indexer: SharedIndexer) -> Self { + Self { be, raw_indexer } + } + + /// Adds a pack to the `Indexer`. + /// + /// # Arguments + /// + /// * `pack` - The pack to add. + /// + /// # Errors + /// + /// * If the index file could not be serialized. + pub fn add(&self, pack: IndexPack) -> RusticResult<()> { + self.add_with(pack, false) + } + + /// Adds a pack to the `Indexer` and removes it from the backend. + /// + /// # Arguments + /// + /// * `pack` - The pack to add. + /// + /// # Errors + /// + /// * If the index file could not be serialized. + pub fn add_remove(&self, pack: IndexPack) -> RusticResult<()> { + self.add_with(pack, true) + } + + /// Adds a pack to the `Indexer`. + /// + /// # Arguments + /// + /// * `pack` - The pack to add. + /// * `delete` - Whether to delete the pack from the backend. + /// + /// # Errors + /// + /// * If the index file could not be serialized. + pub fn add_with(&self, pack: IndexPack, delete: bool) -> RusticResult<()> { + let res = { + let mut raw_indexer = self.raw_indexer.write().unwrap(); + raw_indexer.add_with(pack, delete); + raw_indexer.save_if_needed() + }; + + if let Some(file) = res { + let _ = self.be.save_file(&file)?; + } + Ok(()) + } + + pub fn finalize(&self) -> RusticResult<()> { + let res = { + let mut raw_indexer = self.raw_indexer.write().unwrap(); + raw_indexer.finalize() + }; + + if let Some(file) = res { + let _ = self.be.save_file(&file)?; + } + Ok(()) + } + + /// Returns whether the given id is indexed. + /// + /// # Arguments + /// + /// * `id` - The id to check. 
+ pub fn has(&self, id: &BlobId) -> bool { + self.raw_indexer.read().unwrap().has(id) + } +} From 9461d1579c718060af59f007c43a17e8a60624b0 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Tue, 3 Jun 2025 18:20:40 +0200 Subject: [PATCH 02/26] fix deadlock & duplicate pack saving --- crates/core/src/blob/repopacker.rs | 28 +++++++++++++++++++++------- crates/core/src/index/indexer.rs | 8 ++++---- crates/core/src/index/repoindexer.rs | 9 +++------ 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/crates/core/src/blob/repopacker.rs b/crates/core/src/blob/repopacker.rs index 9916fea79..5575b8d77 100644 --- a/crates/core/src/blob/repopacker.rs +++ b/crates/core/src/blob/repopacker.rs @@ -130,14 +130,13 @@ impl RepositoryPacker { finish: finish_rx, }; - let repo_packer = repository_packer.clone(); let _join_handle = std::thread::spawn(move || { scope(|scope| { let status = rx .into_iter() .readahead_scoped(scope) // early check if id is already contained - .filter(|(_, id, _)| !indexer.read().unwrap().has(id)) + .filter(|(_, id, _)| !indexer.write().unwrap().has(id)) .filter(|(_, id, _)| !packer.read().unwrap().has(id)) .readahead_scoped(scope) .parallel_map_scoped( @@ -157,14 +156,29 @@ impl RepositoryPacker { // check again if id is already contained // TODO: We may still save duplicate blobs - the indexer is only updated when the packfile write has completed .filter(|res| { - res.as_ref().map_or_else( + let res = res.as_ref().map_or_else( |_| true, - |(_, id, _, _, _)| !indexer.read().unwrap().has(id), - ) + |(_, id, _, _, _)| !indexer.write().unwrap().has(id), + ); + res }) .try_for_each(|item: RusticResult<_>| -> RusticResult<()> { let (data, id, data_len, ul, size_limit) = item?; - repo_packer.add_raw(&data, &id, data_len, ul, size_limit)?; + let res = { + let mut raw_packer = packer.write().unwrap(); + raw_packer.add(&data, &id, data_len, ul)?; + + raw_packer.save_if_needed(size_limit)? + }; + if let Some((file, index)) = res { + file_writer.send((file, index)).map_err(|err| { + RusticError::with_source( + ErrorKind::Internal, + "Failed to send packfile to file writer.", + err, + ) + })?; + } Ok(()) }) @@ -263,7 +277,7 @@ impl RepositoryPacker { size_limit: Option, ) -> RusticResult<()> { // only add if this blob is not present - if !self.indexer.read().unwrap().has(id) { + if !self.indexer.write().unwrap().has(id) { let mut raw_packer = self.packer.write().unwrap(); raw_packer.add(data, id, data_len, uncompressed_length)?; diff --git a/crates/core/src/index/indexer.rs b/crates/core/src/index/indexer.rs index 2156140cc..b05398f76 100644 --- a/crates/core/src/index/indexer.rs +++ b/crates/core/src/index/indexer.rs @@ -169,14 +169,14 @@ impl Indexer { self.file.add(pack, delete); } - /// Returns whether the given id is indexed. + /// Returns whether the given id is indexed. If not, mark it as indexed /// /// # Arguments /// /// * `id` - The id to check. 
- pub fn has(&self, id: &BlobId) -> bool { + pub fn has(&mut self, id: &BlobId) -> bool { self.indexed - .as_ref() - .is_some_and(|indexed| indexed.contains(id)) + .as_mut() + .is_some_and(|indexed| !indexed.insert(*id)) } } diff --git a/crates/core/src/index/repoindexer.rs b/crates/core/src/index/repoindexer.rs index 2f072a60b..d038213f6 100644 --- a/crates/core/src/index/repoindexer.rs +++ b/crates/core/src/index/repoindexer.rs @@ -81,10 +81,7 @@ impl RepositoryIndexer { } pub fn finalize(&self) -> RusticResult<()> { - let res = { - let mut raw_indexer = self.raw_indexer.write().unwrap(); - raw_indexer.finalize() - }; + let res = self.raw_indexer.write().unwrap().finalize(); if let Some(file) = res { let _ = self.be.save_file(&file)?; @@ -92,12 +89,12 @@ impl RepositoryIndexer { Ok(()) } - /// Returns whether the given id is indexed. + /// Returns whether the given id is indexed. If not, mark it as indexed /// /// # Arguments /// /// * `id` - The id to check. pub fn has(&self, id: &BlobId) -> bool { - self.raw_indexer.read().unwrap().has(id) + self.raw_indexer.write().unwrap().has(id) } } From 085e6bb4d822d2ed4f21725ea043e81e8748f3aa Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Tue, 3 Jun 2025 18:54:44 +0200 Subject: [PATCH 03/26] fix clippy --- crates/core/src/blob/repopacker.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crates/core/src/blob/repopacker.rs b/crates/core/src/blob/repopacker.rs index 5575b8d77..2414d168a 100644 --- a/crates/core/src/blob/repopacker.rs +++ b/crates/core/src/blob/repopacker.rs @@ -156,11 +156,10 @@ impl RepositoryPacker { // check again if id is already contained // TODO: We may still save duplicate blobs - the indexer is only updated when the packfile write has completed .filter(|res| { - let res = res.as_ref().map_or_else( + res.as_ref().map_or_else( |_| true, |(_, id, _, _, _)| !indexer.write().unwrap().has(id), - ); - res + ) }) .try_for_each(|item: RusticResult<_>| -> RusticResult<()> { let (data, id, data_len, ul, size_limit) = item?; From 2935fdea52d007aa4f2e556204db0de8ea387f71 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Tue, 3 Jun 2025 18:59:20 +0200 Subject: [PATCH 04/26] remove repoindexer --- crates/core/src/commands/prune.rs | 78 +++++++++++++-------- crates/core/src/index.rs | 1 - crates/core/src/index/repoindexer.rs | 100 --------------------------- 3 files changed, 49 insertions(+), 130 deletions(-) delete mode 100644 crates/core/src/index/repoindexer.rs diff --git a/crates/core/src/commands/prune.rs b/crates/core/src/commands/prune.rs index 9c7c4b1c9..dcc6e19ed 100644 --- a/crates/core/src/commands/prune.rs +++ b/crates/core/src/commands/prune.rs @@ -34,7 +34,6 @@ use crate::{ GlobalIndex, ReadGlobalIndex, ReadIndex, binarysorted::{IndexCollector, IndexType}, indexer::Indexer, - repoindexer::RepositoryIndexer, }, progress::{Progress, ProgressBars}, repofile::{ @@ -1247,7 +1246,7 @@ pub(crate) fn prune_repository( let be = repo.dbe(); let pb = &repo.pb; - let indexer = Indexer::new_unindexed().into_shared(); + let mut indexer = Indexer::new_unindexed(); // Calculate an approximation of sizes after pruning. 
// The size actually is: @@ -1265,24 +1264,6 @@ pub(crate) fn prune_repository( * u64::from(HeaderEntry::ENTRY_LEN_COMPRESSED) }); - let tree_repacker = Repacker::new( - be.clone(), - BlobType::Tree, - indexer.clone(), - repo.config(), - size_after_prune[BlobType::Tree], - )?; - - let data_repacker = Repacker::new( - be.clone(), - BlobType::Data, - indexer.clone(), - repo.config(), - size_after_prune[BlobType::Data], - )?; - - let indexer = RepositoryIndexer::new(be.clone(), indexer); - // mark unreferenced packs for deletion if !prune_plan.existing_packs.is_empty() { if opts.instant_delete { @@ -1299,7 +1280,10 @@ time: Some(Local::now()), blobs: Vec::new(), }; - indexer.add_remove(pack)?; + indexer.add_remove(pack); + if let Some(file) = indexer.save_if_needed() { + _ = be.save_file(&file)?; + } p.inc(1); } p.finish(); @@ -1323,9 +1307,26 @@ p.set_length(prune_plan.stats.size_sum().repack - prune_plan.stats.size_sum().repackrm); let mut indexes_remove = Vec::new(); + let indexer = indexer.into_shared(); let tree_packs_remove = Arc::new(Mutex::new(Vec::new())); let data_packs_remove = Arc::new(Mutex::new(Vec::new())); + let tree_repacker = Repacker::new( + be.clone(), + BlobType::Tree, + indexer.clone(), + repo.config(), + size_after_prune[BlobType::Tree], + )?; + + let data_repacker = Repacker::new( + be.clone(), + BlobType::Data, + indexer.clone(), + repo.config(), + size_after_prune[BlobType::Data], + )?; + let delete_pack = |pack: PrunePack| { // delete pack match pack.blob_type { @@ -1355,7 +1356,7 @@ packs .into_par_iter() .try_for_each(|pack| -> RusticResult<_> { - match pack.to_do { + let to_index = match pack.to_do { PackToDo::Undecided => { return Err(RusticError::new( ErrorKind::Internal, @@ -1367,7 +1368,7 @@ PackToDo::Keep => { // keep pack: add to new index let pack = pack.into_index_pack(); - indexer.add(pack)?; + Some((pack, true)) } PackToDo::Repack => { // TODO: repack in parallel @@ -1390,44 +1391,63 @@ } if opts.instant_delete { delete_pack(pack); + None } else { // mark pack for removal let pack = pack.into_index_pack_with_time(prune_plan.time); - indexer.add_remove(pack)?; + Some((pack, true)) } } PackToDo::MarkDelete => { if opts.instant_delete { delete_pack(pack); + None } else { // mark pack for removal let pack = pack.into_index_pack_with_time(prune_plan.time); - indexer.add_remove(pack)?; + Some((pack, true)) } } PackToDo::KeepMarked | PackToDo::KeepMarkedAndCorrect => { if opts.instant_delete { delete_pack(pack); + None } else { // keep pack: add to new index; keep the timestamp. // Note the timestamp shouldn't be None here, however if it is not set, use the current time to heal the entry!
let time = pack.time.unwrap_or(prune_plan.time); let pack = pack.into_index_pack_with_time(time); - indexer.add_remove(pack)?; + Some((pack, true)) } } PackToDo::Recover => { // recover pack: add to new index in section packs let pack = pack.into_index_pack_with_time(prune_plan.time); - indexer.add(pack)?; + Some((pack, false)) + } + PackToDo::Delete => { + delete_pack(pack); + None + } + }; + if let Some((pack, delete)) = to_index { + let res = { + let mut indexer = indexer.write().unwrap(); + indexer.add_with(pack, delete); + indexer.save_if_needed() + }; + if let Some(file) = res { + _ = be.save_file(&file)?; } - PackToDo::Delete => delete_pack(pack), } Ok(()) })?; _ = tree_repacker.finalize()?; _ = data_repacker.finalize()?; - indexer.finalize()?; + let res = indexer.write().unwrap().finalize(); + if let Some(file) = res { + _ = be.save_file(&file)?; + } p.finish(); // remove old index files first as they may reference pack files which are removed soon. diff --git a/crates/core/src/index.rs b/crates/core/src/index.rs index 14daba2cd..81f667876 100644 --- a/crates/core/src/index.rs +++ b/crates/core/src/index.rs @@ -17,7 +17,6 @@ use crate::{ pub(crate) mod binarysorted; pub(crate) mod indexer; -pub(crate) mod repoindexer; /// An entry in the index #[derive(Debug, Clone, Copy, PartialEq, Eq, Constructor)] diff --git a/crates/core/src/index/repoindexer.rs b/crates/core/src/index/repoindexer.rs deleted file mode 100644 index d038213f6..000000000 --- a/crates/core/src/index/repoindexer.rs +++ /dev/null @@ -1,100 +0,0 @@ -use crate::{ - backend::decrypt::DecryptWriteBackend, blob::BlobId, error::RusticResult, - repofile::indexfile::IndexPack, -}; - -use super::indexer::SharedIndexer; - -/// The `Indexer` is responsible for indexing blobs. -#[derive(Debug, Clone)] -pub struct RepositoryIndexer -where - BE: DecryptWriteBackend, -{ - /// The backend to write to. - be: BE, - /// The index file. - raw_indexer: SharedIndexer, -} - -impl RepositoryIndexer { - /// Creates a new `Indexer`. - /// - /// # Type Parameters - /// - /// * `BE` - The backend type. - /// - /// # Arguments - /// - /// * `be` - The backend to write to. - pub fn new(be: BE, raw_indexer: SharedIndexer) -> Self { - Self { be, raw_indexer } - } - - /// Adds a pack to the `Indexer`. - /// - /// # Arguments - /// - /// * `pack` - The pack to add. - /// - /// # Errors - /// - /// * If the index file could not be serialized. - pub fn add(&self, pack: IndexPack) -> RusticResult<()> { - self.add_with(pack, false) - } - - /// Adds a pack to the `Indexer` and removes it from the backend. - /// - /// # Arguments - /// - /// * `pack` - The pack to add. - /// - /// # Errors - /// - /// * If the index file could not be serialized. - pub fn add_remove(&self, pack: IndexPack) -> RusticResult<()> { - self.add_with(pack, true) - } - - /// Adds a pack to the `Indexer`. - /// - /// # Arguments - /// - /// * `pack` - The pack to add. - /// * `delete` - Whether to delete the pack from the backend. - /// - /// # Errors - /// - /// * If the index file could not be serialized. 
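The removed `RepositoryIndexer` code above follows a deliberate lock-scoping pattern that the refactor keeps: the `RwLock` is held only while mutating the in-memory index and extracting a finished index file, and the backend upload happens after the guard is dropped, so slow I/O never blocks other writers. A simplified sketch of the pattern, using `Vec<u8>` and `std::fs::write` as stand-ins for the index file and the backend:

    use std::sync::RwLock;

    fn finalize_and_save(indexer: &RwLock<Vec<u8>>) -> std::io::Result<()> {
        // The write guard lives only for this statement and is dropped
        // before any I/O starts.
        let file = std::mem::take(&mut *indexer.write().unwrap());
        if !file.is_empty() {
            // Slow backend write, performed with the lock already released.
            std::fs::write("index.tmp", &file)?;
        }
        Ok(())
    }

    fn main() -> std::io::Result<()> {
        let indexer = RwLock::new(b"index data".to_vec());
        finalize_and_save(&indexer)
    }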
- pub fn add_with(&self, pack: IndexPack, delete: bool) -> RusticResult<()> { - let res = { - let mut raw_indexer = self.raw_indexer.write().unwrap(); - raw_indexer.add_with(pack, delete); - raw_indexer.save_if_needed() - }; - - if let Some(file) = res { - let _ = self.be.save_file(&file)?; - } - Ok(()) - } - - pub fn finalize(&self) -> RusticResult<()> { - let res = self.raw_indexer.write().unwrap().finalize(); - - if let Some(file) = res { - let _ = self.be.save_file(&file)?; - } - Ok(()) - } - - /// Returns whether the given id is indexed. If not, mark it as indexed - /// - /// # Arguments - /// - /// * `id` - The id to check. - pub fn has(&self, id: &BlobId) -> bool { - self.raw_indexer.write().unwrap().has(id) - } -} From 4be2bf780cea037e09956ea072c929d0ab077114 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Wed, 4 Jun 2025 09:00:16 +0200 Subject: [PATCH 05/26] fix: wait for filewriter to finish --- crates/core/src/blob/repopacker.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/crates/core/src/blob/repopacker.rs b/crates/core/src/blob/repopacker.rs index 2414d168a..51b00a801 100644 --- a/crates/core/src/blob/repopacker.rs +++ b/crates/core/src/blob/repopacker.rs @@ -302,9 +302,12 @@ impl RepositoryPacker { // cancel channel drop(self.sender); // wait for items in channel to be processed - self.finish + let res = self + .finish .recv() - .expect("Should be able to receive from channel to finalize packer.") + .expect("Should be able to receive from channel to finalize packer."); + self.file_writer.finalize()?; + res } } From e16be9da15098b9105e18eae45facbd62dfe5bdf Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Wed, 4 Jun 2025 09:00:53 +0200 Subject: [PATCH 06/26] fix: prune: set correct delete flag --- crates/core/src/commands/prune.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/crates/core/src/commands/prune.rs b/crates/core/src/commands/prune.rs index dcc6e19ed..e48d6cfae 100644 --- a/crates/core/src/commands/prune.rs +++ b/crates/core/src/commands/prune.rs @@ -1327,12 +1327,13 @@ pub(crate) fn prune_repository( size_after_prune[BlobType::Data], )?; - let delete_pack = |pack: PrunePack| { + let delete_pack = |pack: PrunePack| -> Option<_> { // delete pack match pack.blob_type { BlobType::Data => data_packs_remove.lock().unwrap().push(pack.id), BlobType::Tree => tree_packs_remove.lock().unwrap().push(pack.id), } + None }; let used_ids = Arc::new(Mutex::new(prune_plan.used_ids)); @@ -1368,7 +1369,7 @@ pub(crate) fn prune_repository( PackToDo::Keep => { // keep pack: add to new index let pack = pack.into_index_pack(); - Some((pack, true)) + Some((pack, false)) } PackToDo::Repack => { // TODO: repack in parallel @@ -1390,8 +1391,7 @@ pub(crate) fn prune_repository( p.inc(u64::from(blob.length)); } if opts.instant_delete { - delete_pack(pack); - None + delete_pack(pack) } else { // mark pack for removal let pack = pack.into_index_pack_with_time(prune_plan.time); @@ -1400,8 +1400,7 @@ pub(crate) fn prune_repository( } PackToDo::MarkDelete => { if opts.instant_delete { - delete_pack(pack); - None + delete_pack(pack) } else { // mark pack for removal let pack = pack.into_index_pack_with_time(prune_plan.time); @@ -1410,8 +1409,7 @@ pub(crate) fn prune_repository( } PackToDo::KeepMarked | PackToDo::KeepMarkedAndCorrect => { if opts.instant_delete { - delete_pack(pack); - None + delete_pack(pack) } else { // keep pack: add to new index; keep the timestamp. 
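A word on patch 05 above: the shutdown order in `finalize` matters. Dropping the sender closes the channel, the packer thread then drains any remaining items and reports its status, and only afterwards is the file-writer actor finalized, so no packfile write can be lost. The handshake reduces to the following shape, sketched with crossbeam channels and simplified payloads:

    use crossbeam_channel::bounded;

    fn main() {
        let (tx, rx) = bounded::<u32>(0);
        let (done_tx, done_rx) = bounded::<Result<u32, String>>(0);

        let worker = std::thread::spawn(move || {
            // Drains until every sender has been dropped.
            let sum = rx.into_iter().sum::<u32>();
            done_tx.send(Ok(sum)).unwrap();
        });

        tx.send(1).unwrap();
        tx.send(2).unwrap();
        drop(tx); // close the channel so the worker's iterator ends
        let status = done_rx.recv().unwrap(); // wait for in-flight items first
        assert_eq!(status, Ok(3));
        worker.join().unwrap();
    }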
// Note the timestamp shouldn't be None here, however if it is not set, use the current time to heal the entry! @@ -1425,10 +1423,7 @@ let pack = pack.into_index_pack_with_time(prune_plan.time); Some((pack, false)) } - PackToDo::Delete => { - delete_pack(pack); - None - } + PackToDo::Delete => delete_pack(pack), }; if let Some((pack, delete)) = to_index { let res = { From 9c4a13eb8dabf73449d6e264687955e992c80fd3 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Wed, 4 Jun 2025 09:01:19 +0200 Subject: [PATCH 07/26] remove unnecessary checks for duplicates --- crates/core/src/blob/packer.rs | 7 ------- crates/core/src/blob/repopacker.rs | 12 +----------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/crates/core/src/blob/packer.rs b/crates/core/src/blob/packer.rs index 9811bd8ff..bad04ea6e 100644 --- a/crates/core/src/blob/packer.rs +++ b/crates/core/src/blob/packer.rs @@ -305,9 +305,6 @@ impl Packer { data_len: u64, uncompressed_length: Option, ) -> RusticResult<()> { - if self.has(id) { - return Ok(()); - } self.stats.blobs += 1; self.stats.data += data_len; @@ -474,8 +471,4 @@ impl Packer { Ok(Some((file, index))) } - - pub fn has(&self, id: &BlobId) -> bool { - self.index.blobs.iter().any(|b| &b.id == id) - } } diff --git a/crates/core/src/blob/repopacker.rs b/crates/core/src/blob/repopacker.rs index 51b00a801..28830ad56 100644 --- a/crates/core/src/blob/repopacker.rs +++ b/crates/core/src/blob/repopacker.rs @@ -135,10 +135,8 @@ impl RepositoryPacker { let status = rx .into_iter() .readahead_scoped(scope) - // early check if id is already contained + // early check if id is already contained and reserve, if not .filter(|(_, id, _)| !indexer.write().unwrap().has(id)) - .filter(|(_, id, _)| !packer.read().unwrap().has(id)) - .readahead_scoped(scope) .parallel_map_scoped( scope, |(data, id, size_limit): (Bytes, BlobId, Option)| { @@ -153,14 +151,6 @@ let (data, data_len, uncompressed_length) = be.process_data(&data)?; Ok(( data, id, u64::from(data_len), uncompressed_length, size_limit, )) }, ) .readahead_scoped(scope) - // check again if id is already contained - // TODO: We may still save duplicate blobs - the indexer is only updated when the packfile write has completed - .filter(|res| { - res.as_ref().map_or_else( - |_| true, - |(_, id, _, _, _)| !indexer.write().unwrap().has(id), - ) - }) .try_for_each(|item: RusticResult<_>| -> RusticResult<()> { let (data, id, data_len, ul, size_limit) = item?; From ab2214aa25bee493401f283ba0f8408894527088 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Wed, 4 Jun 2025 11:48:55 +0200 Subject: [PATCH 08/26] clarify indexer methods --- crates/core/src/blob/repopacker.rs | 4 ++-- crates/core/src/index/indexer.rs | 17 ++++++++++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/crates/core/src/blob/repopacker.rs b/crates/core/src/blob/repopacker.rs index 28830ad56..1d7b3d037 100644 --- a/crates/core/src/blob/repopacker.rs +++ b/crates/core/src/blob/repopacker.rs @@ -136,7 +136,7 @@ impl RepositoryPacker { .into_iter() .readahead_scoped(scope) // early check if id is already contained and reserve, if not - .filter(|(_, id, _)| !indexer.write().unwrap().has(id)) + .filter(|(_, id, _)| indexer.write().unwrap().reserve(id)) .parallel_map_scoped( scope, |(data, id, size_limit): (Bytes, BlobId, Option)| { @@ -266,7 +266,7 @@ impl RepositoryPacker { size_limit: Option, ) -> RusticResult<()> { // only add if this blob is not present - if !self.indexer.write().unwrap().has(id) { + if self.indexer.write().unwrap().reserve(id) { let mut raw_packer = self.packer.write().unwrap();
raw_packer.add(data, id, data_len, uncompressed_length)?; diff --git a/crates/core/src/index/indexer.rs b/crates/core/src/index/indexer.rs index b05398f76..f3d75bab0 100644 --- a/crates/core/src/index/indexer.rs +++ b/crates/core/src/index/indexer.rs @@ -169,14 +169,25 @@ impl Indexer { self.file.add(pack, delete); } - /// Returns whether the given id is indexed. If not, mark it as indexed - /// + /// Returns whether the given id is indexed. + /// /// # Arguments /// /// * `id` - The id to check. pub fn has(&mut self, id: &BlobId) -> bool { self.indexed .as_mut() - .is_some_and(|indexed| !indexed.insert(*id)) + .is_some_and(|indexed| indexed.contains(id)) + } + + /// Reserve an id for saving in the index. Returns whether the id was newly reserved, i.e. was not already present. + /// + /// # Arguments + /// + /// * `id` - The id to check. + pub fn reserve(&mut self, id: &BlobId) -> bool { + self.indexed + .as_mut() + .is_some_and(|indexed| indexed.insert(*id)) + } } From 080b4c7c5b97479e021b125a941d14726f15018b Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Wed, 4 Jun 2025 13:01:46 +0200 Subject: [PATCH 09/26] refactor pack sizer --- crates/core/src/archiver/file_archiver.rs | 9 +- crates/core/src/archiver/tree_archiver.rs | 5 +- crates/core/src/blob.rs | 1 + crates/core/src/blob/pack_sizer.rs | 135 +++++++++++++++++++ crates/core/src/blob/packer.rs | 133 ++---------------- crates/core/src/blob/repopacker.rs | 124 ++++++----------- crates/core/src/commands/copy.rs | 4 +- crates/core/src/commands/merge.rs | 2 +- crates/core/src/commands/prune.rs | 10 +- crates/core/src/commands/repair/snapshots.rs | 5 +- 10 files changed, 209 insertions(+), 219 deletions(-) create mode 100644 crates/core/src/blob/pack_sizer.rs diff --git a/crates/core/src/archiver/file_archiver.rs b/crates/core/src/archiver/file_archiver.rs index e2ce8025d..7de244192 100644 --- a/crates/core/src/archiver/file_archiver.rs +++ b/crates/core/src/archiver/file_archiver.rs @@ -13,7 +13,10 @@ use crate::{ decrypt::DecryptWriteBackend, node::{Node, NodeType}, }, - blob::{BlobId, BlobType, DataId, packer::PackerStats, repopacker::RepositoryPacker}, + blob::{ + BlobId, BlobType, DataId, pack_sizer::DefaultPackSizer, packer::PackerStats, + repopacker::RepositoryPacker, + }, chunker::ChunkIter, crypto::hasher::hash, error::{ErrorKind, RusticError, RusticResult}, @@ -32,7 +35,7 @@ use crate::{ #[derive(Clone)] pub(crate) struct FileArchiver<'a, BE: DecryptWriteBackend, I: ReadGlobalIndex> { index: &'a I, - data_packer: RepositoryPacker, + data_packer: RepositoryPacker, rabin: Rabin64, } @@ -63,7 +66,7 @@ impl<'a, BE: DecryptWriteBackend, I: ReadGlobalIndex> FileArchiver<'a, BE, I> { ) -> RusticResult { let poly = config.poly()?; - let data_packer = RepositoryPacker::new( + let data_packer = RepositoryPacker::new_with_default_sizer( be, BlobType::Data, indexer, diff --git a/crates/core/src/archiver/tree_archiver.rs b/crates/core/src/archiver/tree_archiver.rs index ca1a95761..7fcb82eea 100644 --- a/crates/core/src/archiver/tree_archiver.rs +++ b/crates/core/src/archiver/tree_archiver.rs @@ -8,6 +8,7 @@ use crate::{ backend::{decrypt::DecryptWriteBackend, node::Node}, blob::{ BlobType, + pack_sizer::DefaultPackSizer, repopacker::RepositoryPacker, tree::{Tree, TreeId}, }, error::{ErrorKind, RusticError, RusticResult}, @@ -34,7 +35,7 @@ pub(crate) struct TreeArchiver<'a, BE: DecryptWriteBackend, I: ReadGlobalIndex> /// The index to read from. index: &'a I, /// The packer to write to. - tree_packer: RepositoryPacker, + tree_packer: RepositoryPacker, /// The summary of the snapshot.
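Patch 08 above splits the former check-and-mark `has` into two methods with distinct contracts: `has` is again a pure membership query, while `reserve` both queries and marks, returning true only for the first caller, who thereby wins the right to pack the blob. In toy form, over a plain `HashSet` with `u32` ids standing in for `BlobId`:

    use std::collections::HashSet;

    fn main() {
        let mut indexed: HashSet<u32> = HashSet::new();
        // reserve: true only on first insertion, so exactly one caller packs the blob.
        assert!(indexed.insert(42));
        assert!(!indexed.insert(42));
        // has: a read-only membership test with no side effect.
        assert!(indexed.contains(&42));
    }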
summary: SnapshotSummary, } @@ -66,7 +67,7 @@ impl<'a, BE: DecryptWriteBackend, I: ReadGlobalIndex> TreeArchiver<'a, BE, I> { config: &ConfigFile, summary: SnapshotSummary, ) -> RusticResult { - let tree_packer = RepositoryPacker::new( + let tree_packer = RepositoryPacker::new_with_default_sizer( be, BlobType::Tree, indexer, diff --git a/crates/core/src/blob.rs b/crates/core/src/blob.rs index 70b19d9d2..e9fbe68f5 100644 --- a/crates/core/src/blob.rs +++ b/crates/core/src/blob.rs @@ -1,3 +1,4 @@ +pub(crate) mod pack_sizer; pub(crate) mod packer; pub(crate) mod repopacker; pub(crate) mod tree; diff --git a/crates/core/src/blob/pack_sizer.rs b/crates/core/src/blob/pack_sizer.rs new file mode 100644 index 000000000..8382b12e7 --- /dev/null +++ b/crates/core/src/blob/pack_sizer.rs @@ -0,0 +1,135 @@ +use super::BlobType; +use crate::repofile::ConfigFile; + +use integer_sqrt::IntegerSquareRoot; + +pub trait PackSizer { + /// Computes the size of the pack file. + #[must_use] + fn pack_size(&self) -> u32; + + /// Evaluates whether the given size is not too small or too large + /// + /// # Arguments + /// + /// * `size` - The size to check + #[must_use] + fn size_ok(&self, size: u32) -> bool { + !self.is_too_small(size) && !self.is_too_large(size) + } + + /// Evaluates whether the given size is too small + /// + /// # Arguments + /// + /// * `size` - The size to check + #[must_use] + fn is_too_small(&self, _size: u32) -> bool { + false + } + + /// Evaluates whether the given size is too large + /// + /// # Arguments + /// + /// * `size` - The size to check + #[must_use] + fn is_too_large(&self, _size: u32) -> bool { + false + } + + /// Adds the given size to the current size. + /// + /// # Arguments + /// + /// * `added` - The size to add + /// + /// # Panics + /// + /// * If the size is too large + fn add_size(&mut self, _added: u32) {} +} + +/// The pack sizer is responsible for computing the size of the pack file. +#[derive(Debug, Clone, Copy)] +pub struct DefaultPackSizer { + /// The default size of a pack file. + default_size: u32, + /// The grow factor of a pack file. + grow_factor: u32, + /// The size limit of a pack file. + size_limit: u32, + /// The current size of a pack file. + current_size: u64, + /// The minimum pack size tolerance in percent before a repack is triggered. + min_packsize_tolerate_percent: u32, + /// The maximum pack size tolerance in percent before a repack is triggered. + max_packsize_tolerate_percent: u32, +} + +impl DefaultPackSizer { + /// Creates a new `PackSizer` from a config file. + /// + /// # Arguments + /// + /// * `config` - The config file. + /// * `blob_type` - The blob type. + /// * `current_size` - The current size of the pack file. + /// + /// # Returns + /// + /// A new `PackSizer`. 
+ #[must_use] + pub fn from_config(config: &ConfigFile, blob_type: BlobType, current_size: u64) -> Self { + let (default_size, grow_factor, size_limit) = config.packsize(blob_type); + let (min_packsize_tolerate_percent, max_packsize_tolerate_percent) = + config.packsize_ok_percents(); + Self { + default_size, + grow_factor, + size_limit, + current_size, + min_packsize_tolerate_percent, + max_packsize_tolerate_percent, + } + } +} + +impl PackSizer for DefaultPackSizer { + #[allow(clippy::cast_possible_truncation)] + fn pack_size(&self) -> u32 { + (self.current_size.integer_sqrt() as u32 * self.grow_factor + self.default_size) + .min(self.size_limit) + } + + fn is_too_small(&self, size: u32) -> bool { + let target_size = self.pack_size(); + // Note: we cast to u64 so that no overflow can occur in the multiplications + u64::from(size) * 100 + < u64::from(target_size) * u64::from(self.min_packsize_tolerate_percent) + } + + fn is_too_large(&self, size: u32) -> bool { + let target_size = self.pack_size(); + // Note: we cast to u64 so that no overflow can occur in the multiplications + u64::from(size) * 100 + > u64::from(target_size) * u64::from(self.max_packsize_tolerate_percent) + } + + fn add_size(&mut self, added: u32) { + self.current_size += u64::from(added); + } +} + +/// The pack sizer is responsible for computing the size of the pack file. +#[derive(Debug, Clone, Copy)] +pub struct FixedPackSizer(pub u32); + +impl PackSizer for FixedPackSizer { + fn pack_size(&self) -> u32 { + self.0 + } + fn is_too_large(&self, size: u32) -> bool { + size > self.0 + } +} diff --git a/crates/core/src/blob/packer.rs b/crates/core/src/blob/packer.rs index bad04ea6e..7411b6c64 100644 --- a/crates/core/src/blob/packer.rs +++ b/crates/core/src/blob/packer.rs @@ -4,7 +4,6 @@ use std::{ }; use bytes::{Bytes, BytesMut}; -use integer_sqrt::IntegerSquareRoot; use log::warn; use crate::{ @@ -12,13 +11,14 @@ use crate::{ crypto::CryptoKey, error::{ErrorKind, RusticError, RusticResult}, repofile::{ - configfile::ConfigFile, indexfile::IndexPack, packfile::{PackHeaderLength, PackHeaderRef}, snapshotfile::SnapshotSummary, }, }; +use super::pack_sizer::PackSizer; + /// [`PackerErrorKind`] describes the errors that can be returned for a Packer #[derive(thiserror::Error, Debug, displaydoc::Display)] #[non_exhaustive] @@ -48,112 +48,6 @@ pub(super) mod constants { pub(super) const MAX_AGE: Duration = Duration::from_secs(300); } -/// The pack sizer is responsible for computing the size of the pack file. -#[derive(Debug, Clone, Copy)] -pub struct PackSizer { - /// The default size of a pack file. - default_size: u32, - /// The grow factor of a pack file. - grow_factor: u32, - /// The size limit of a pack file. - size_limit: u32, - /// The current size of a pack file. - current_size: u64, - /// The minimum pack size tolerance in percent before a repack is triggered. - min_packsize_tolerate_percent: u32, - /// The maximum pack size tolerance in percent before a repack is triggered. - max_packsize_tolerate_percent: u32, -} - -impl PackSizer { - /// Creates a new `PackSizer` from a config file. - /// - /// # Arguments - /// - /// * `config` - The config file. - /// * `blob_type` - The blob type. - /// * `current_size` - The current size of the pack file. - /// - /// # Returns - /// - /// A new `PackSizer`. 
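To make the `DefaultPackSizer` growth curve above concrete, here is a worked example of pack_size = min(isqrt(current_size) * grow_factor + default_size, size_limit). The numbers are hypothetical and not rustic's actual config defaults:

    use integer_sqrt::IntegerSquareRoot;

    fn main() {
        let default_size: u32 = 4_000_000; // hypothetical 4 MB base
        let grow_factor: u32 = 32; // hypothetical growth factor
        let size_limit: u32 = 200_000_000; // hypothetical hard cap
        let current_size: u64 = 1_000_000_000; // 1 GB already packed

        // isqrt(1_000_000_000) = 31_622, so the target pack size is
        // 31_622 * 32 + 4_000_000 = 5_011_904 bytes, far below the cap:
        // packs grow slowly, with the square root of the repository size.
        let pack_size =
            (current_size.integer_sqrt() as u32 * grow_factor + default_size).min(size_limit);
        assert_eq!(pack_size, 5_011_904);
    }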
- #[must_use] - pub fn from_config(config: &ConfigFile, blob_type: BlobType, current_size: u64) -> Self { - let (default_size, grow_factor, size_limit) = config.packsize(blob_type); - let (min_packsize_tolerate_percent, max_packsize_tolerate_percent) = - config.packsize_ok_percents(); - Self { - default_size, - grow_factor, - size_limit, - current_size, - min_packsize_tolerate_percent, - max_packsize_tolerate_percent, - } - } - - /// Computes the size of the pack file. - #[must_use] - // The cast actually shouldn't pose any problems. - // `current_size` is `u64`, the maximum value is `2^64-1`. - // `isqrt(2^64-1) = 2^32-1` which fits into a `u32`. (@aawsome) - #[allow(clippy::cast_possible_truncation)] - pub fn pack_size(&self) -> u32 { - (self.current_size.integer_sqrt() as u32 * self.grow_factor + self.default_size) - .min(self.size_limit) - .min(constants::MAX_SIZE) - } - - /// Evaluates whether the given size is not too small or too large - /// - /// # Arguments - /// - /// * `size` - The size to check - #[must_use] - pub fn size_ok(&self, size: u32) -> bool { - !self.is_too_small(size) && !self.is_too_large(size) - } - - /// Evaluates whether the given size is too small - /// - /// # Arguments - /// - /// * `size` - The size to check - #[must_use] - pub fn is_too_small(&self, size: u32) -> bool { - let target_size = self.pack_size(); - // Note: we cast to u64 so that no overflow can occur in the multiplications - u64::from(size) * 100 - < u64::from(target_size) * u64::from(self.min_packsize_tolerate_percent) - } - - /// Evaluates whether the given size is too large - /// - /// # Arguments - /// - /// * `size` - The size to check - #[must_use] - pub fn is_too_large(&self, size: u32) -> bool { - let target_size = self.pack_size(); - // Note: we cast to u64 so that no overflow can occur in the multiplications - u64::from(size) * 100 - > u64::from(target_size) * u64::from(self.max_packsize_tolerate_percent) - } - - /// Adds the given size to the current size. - /// - /// # Arguments - /// - /// * `added` - The size to add - /// - /// # Panics - /// - /// * If the size is too large - fn add_size(&mut self, added: u32) { - self.current_size += u64::from(added); - } -} - // TODO: add documentation! #[derive(Default, Debug, Clone, Copy)] pub struct PackerStats { @@ -200,8 +94,8 @@ impl PackerStats { /// /// * `BE` - The backend type. #[allow(missing_debug_implementations, clippy::module_name_repetitions)] -pub(crate) struct Packer { - /// the +pub(crate) struct Packer { + /// the key to encrypt data key: C, /// The blob type to pack. blob_type: BlobType, @@ -216,12 +110,12 @@ pub(crate) struct Packer { /// The index of the pack index: IndexPack, /// The pack sizer - pub pack_sizer: PackSizer, + pub pack_sizer: S, /// The packer stats pub stats: PackerStats, } -impl Packer { +impl Packer { /// Creates a new `RawPacker`. /// /// # Type Parameters @@ -235,9 +129,7 @@ impl Packer { /// * `indexer` - The indexer to write to. /// * `config` - The config file. /// * `total_size` - The total size of the pack file. 
- pub fn new(key: C, blob_type: BlobType, config: &ConfigFile, total_size: u64) -> Self { - let pack_sizer = PackSizer::from_config(config, blob_type, total_size); - + pub fn new(key: C, pack_sizer: S, blob_type: BlobType) -> Self { Self { key, blob_type, @@ -340,12 +232,12 @@ impl Packer { Ok(()) } - pub fn needs_save(&self, size_limit: Option) -> bool { + pub fn needs_save(&self) -> bool { if self.size == 0 { return false; } - let size_limit = size_limit.unwrap_or_else(|| self.pack_sizer.pack_size()); + let size_limit = self.pack_sizer.pack_size().min(constants::MAX_SIZE); // check if PackFile needs to be saved let elapsed = self.created.elapsed().unwrap_or_else(|err| { @@ -433,11 +325,8 @@ impl Packer { /// /// * If converting the header length to u32 fails /// * If the header could not be written - pub fn save_if_needed( - &mut self, - size_limit: Option, - ) -> RusticResult> { - if !self.needs_save(size_limit) { + pub fn save_if_needed(&mut self) -> RusticResult> { + if !self.needs_save() { return Ok(None); } diff --git a/crates/core/src/blob/repopacker.rs b/crates/core/src/blob/repopacker.rs index 1d7b3d037..1ddd15eba 100644 --- a/crates/core/src/blob/repopacker.rs +++ b/crates/core/src/blob/repopacker.rs @@ -24,7 +24,10 @@ use crate::{ }, }; -use super::packer::{PackSizer, Packer, PackerStats}; +use super::{ + pack_sizer::{DefaultPackSizer, FixedPackSizer, PackSizer}, + packer::{Packer, PackerStats}, +}; /// [`PackerErrorKind`] describes the errors that can be returned for a Packer #[derive(thiserror::Error, Debug, displaydoc::Display)] @@ -36,12 +39,11 @@ pub enum PackerErrorKind { from: &'static str, source: std::num::TryFromIntError, }, - /// Sending crossbeam message failed: `size_limit`: `{size_limit:?}`, `id`: `{id:?}`, `data`: `{data:?}` : `{source}` + /// Sending crossbeam message failed: `id`: `{id:?}`, `data`: `{data:?}` : `{source}` SendingCrossbeamMessage { - size_limit: Option, id: BlobId, data: Bytes, - source: crossbeam_channel::SendError<(Bytes, BlobId, Option)>, + source: crossbeam_channel::SendError<(Bytes, BlobId)>, }, /// Sending crossbeam data message failed: `data`: `{data:?}`, `index_pack`: `{index_pack:?}` : `{source}` SendingCrossbeamDataMessage { @@ -61,22 +63,36 @@ pub(crate) type PackerResult = Result>; #[allow(missing_debug_implementations)] #[allow(clippy::struct_field_names)] #[derive(Clone)] -pub struct RepositoryPacker { +pub struct RepositoryPacker { /// The raw packer wrapped in an `Arc` and `RwLock`. // This is a hack: raw_packer and indexer are only used in the add_raw() method. // TODO: Refactor as actor, like the other add() methods - packer: Arc>>, + packer: Arc>>, /// The shared indexer containing the backend. indexer: SharedIndexer, /// The actor to write the pack file file_writer: Actor, /// The sender to send blobs to the raw packer. - sender: Sender<(Bytes, BlobId, Option)>, + sender: Sender<(Bytes, BlobId)>, /// The receiver to receive the status from the raw packer. finish: Receiver>, } -impl RepositoryPacker { +impl RepositoryPacker { + #[allow(clippy::unnecessary_wraps)] + pub fn new_with_default_sizer( + be: BE, + blob_type: BlobType, + indexer: SharedIndexer, + config: &ConfigFile, + total_size: u64, + ) -> RusticResult { + let pack_sizer = DefaultPackSizer::from_config(config, blob_type, total_size); + Self::new(be, blob_type, indexer, pack_sizer) + } +} + -impl RepositoryPacker { +impl RepositoryPacker { /// Creates a new `Packer`.
/// /// # Type Parameters @@ -100,15 +116,9 @@ impl RepositoryPacker { be: BE, blob_type: BlobType, indexer: SharedIndexer, - config: &ConfigFile, - total_size: u64, + pack_sizer: S, ) -> RusticResult { - let packer = Arc::new(RwLock::new(Packer::new( - *be.key(), - blob_type, - config, - total_size, - ))); + let packer = Arc::new(RwLock::new(Packer::new(*be.key(), pack_sizer, blob_type))); let file_writer = Actor::new( FileWriterHandle { @@ -136,28 +146,18 @@ impl RepositoryPacker { .into_iter() .readahead_scoped(scope) // early check if id is already contained and reserve, if not - .filter(|(_, id, _)| indexer.write().unwrap().reserve(id)) - .parallel_map_scoped( - scope, - |(data, id, size_limit): (Bytes, BlobId, Option)| { - let (data, data_len, uncompressed_length) = be.process_data(&data)?; - Ok(( - data, - id, - u64::from(data_len), - uncompressed_length, - size_limit, - )) - }, - ) + .filter(|(_, id)| indexer.write().unwrap().reserve(id)) + .parallel_map_scoped(scope, |(data, id): (Bytes, BlobId)| { + let (data, data_len, uncompressed_length) = be.process_data(&data)?; + Ok((data, id, u64::from(data_len), uncompressed_length)) + }) .readahead_scoped(scope) .try_for_each(|item: RusticResult<_>| -> RusticResult<()> { - let (data, id, data_len, ul, size_limit) = item?; + let (data, id, data_len, ul) = item?; let res = { let mut raw_packer = packer.write().unwrap(); raw_packer.add(&data, &id, data_len, ul)?; - - raw_packer.save_if_needed(size_limit)? + raw_packer.save_if_needed()? }; if let Some((file, index)) = res { file_writer.send((file, index)).map_err(|err| { @@ -204,7 +204,7 @@ impl RepositoryPacker { /// * If sending the message to the raw packer fails. pub fn add(&self, data: Bytes, id: BlobId) -> RusticResult<()> { // compute size limit based on total size and size bounds - self.add_with_sizelimit(data, id, None).map_err(|err| { + self.sender.send((data, id)).map_err(|err| { RusticError::with_source( ErrorKind::Internal, "Failed to add blob `{id}` to packfile.", @@ -215,34 +215,6 @@ impl RepositoryPacker { }) } - /// Adds the blob to the packfile, allows specifying a size limit for the pack file - /// - /// # Arguments - /// - /// * `data` - The blob data - /// * `id` - The blob id - /// * `size_limit` - The size limit for the pack file - /// - /// # Errors - /// - /// * If sending the message to the raw packer fails. - fn add_with_sizelimit( - &self, - data: Bytes, - id: BlobId, - size_limit: Option, - ) -> PackerResult<()> { - self.sender - .send((data.clone(), id, size_limit)) - .map_err(|err| PackerErrorKind::SendingCrossbeamMessage { - size_limit, - id, - data, - source: err, - })?; - Ok(()) - } - /// Adds the already encrypted (and maybe compressed) blob to the packfile /// /// # Arguments @@ -263,14 +235,13 @@ impl RepositoryPacker { id: &BlobId, data_len: u64, uncompressed_length: Option, - size_limit: Option, ) -> RusticResult<()> { // only add if this blob is not present if self.indexer.write().unwrap().reserve(id) { let mut raw_packer = self.packer.write().unwrap(); raw_packer.add(data, id, data_len, uncompressed_length)?; - if let Some((file, index)) = raw_packer.save_if_needed(size_limit)? { + if let Some((file, index)) = raw_packer.save_if_needed()? { self.file_writer.send((file, index)).map_err(|err| { RusticError::with_source( ErrorKind::Internal, @@ -439,7 +410,7 @@ where /// The backend to read from. be: BE, /// The packer to write to. - packer: RepositoryPacker, + packer: RepositoryPacker, /// The size limit of the pack file. 
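The worker spawned in `new` above is a three-stage pipeline: deduplicate against the indexer, encrypt and compress in parallel, then append to the single pack writer in order. Stripped of `pariter`'s scoped readahead and parallel map, the shape reduces to roughly the following, with a `u64` id and raw bytes as simplified payloads:

    use crossbeam_channel::bounded;
    use std::collections::HashSet;

    fn main() {
        let (tx, rx) = bounded::<(u64, Vec<u8>)>(0);

        let worker = std::thread::spawn(move || {
            let mut reserved = HashSet::new(); // dedup stage
            let mut pack = Vec::new(); // sequential pack-writer stage
            for (id, data) in rx {
                if !reserved.insert(id) {
                    continue; // id already reserved, skip the duplicate blob
                }
                // the parallel process_data (encrypt + compress) stage would run here
                pack.extend_from_slice(&data);
            }
            pack.len()
        });

        tx.send((1, vec![0u8; 10])).unwrap();
        tx.send((1, vec![0u8; 10])).unwrap(); // duplicate id, filtered out
        drop(tx);
        assert_eq!(worker.join().unwrap(), 10);
    }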
size_limit: u32, } @@ -469,8 +440,9 @@ impl Repacker { config: &ConfigFile, total_size: u64, ) -> RusticResult { - let packer = RepositoryPacker::new(be.clone(), blob_type, indexer, config, total_size)?; - let size_limit = PackSizer::from_config(config, blob_type, total_size).pack_size(); + let size_limit = DefaultPackSizer::from_config(config, blob_type, total_size).pack_size(); + let repo_sizer = FixedPackSizer(size_limit); + let packer = RepositoryPacker::new(be.clone(), blob_type, indexer, repo_sizer)?; Ok(Self { be, packer, @@ -499,13 +471,7 @@ impl Repacker { )?; self.packer - .add_raw( - &data, - &blob.id, - 0, - blob.uncompressed_length, - Some(self.size_limit), - ) + .add_raw(&data, &blob.id, 0, blob.uncompressed_length) .map_err(|err| { err.overwrite_kind(ErrorKind::Internal) .prepend_guidance_line( @@ -538,15 +504,7 @@ impl Repacker { blob.uncompressed_length, )?; - self.packer - .add_with_sizelimit(data, blob.id, Some(self.size_limit)) - .map_err(|err| { - RusticError::with_source( - ErrorKind::Internal, - "Failed to add blob to packfile.", - err, - ) - })?; + self.packer.add(data, blob.id)?; Ok(()) } diff --git a/crates/core/src/commands/copy.rs b/crates/core/src/commands/copy.rs index 3b1e1ca79..2b140ee83 100644 --- a/crates/core/src/commands/copy.rs +++ b/crates/core/src/commands/copy.rs @@ -60,14 +60,14 @@ pub(crate) fn copy<'a, Q, R: IndexedFull, P: ProgressBars, S: IndexedIds>( let index_dest = repo_dest.index(); let indexer = Indexer::new().into_shared(); - let data_packer = RepositoryPacker::new( + let data_packer = RepositoryPacker::new_with_default_sizer( be_dest.clone(), BlobType::Data, indexer.clone(), repo_dest.config(), index_dest.total_size(BlobType::Data), )?; - let tree_packer = RepositoryPacker::new( + let tree_packer = RepositoryPacker::new_with_default_sizer( be_dest.clone(), BlobType::Tree, indexer.clone(), diff --git a/crates/core/src/commands/merge.rs b/crates/core/src/commands/merge.rs index 0a251a257..8ed30abe8 100644 --- a/crates/core/src/commands/merge.rs +++ b/crates/core/src/commands/merge.rs @@ -104,7 +104,7 @@ pub(crate) fn merge_trees( let be = repo.dbe(); let index = repo.index(); let indexer = Indexer::new().into_shared(); - let packer = RepositoryPacker::new( + let packer = RepositoryPacker::new_with_default_sizer( repo.dbe().clone(), BlobType::Tree, indexer.clone(), diff --git a/crates/core/src/commands/prune.rs b/crates/core/src/commands/prune.rs index e48d6cfae..2f327c2dc 100644 --- a/crates/core/src/commands/prune.rs +++ b/crates/core/src/commands/prune.rs @@ -26,7 +26,9 @@ use crate::{ node::NodeType, }, blob::{ - BlobId, BlobType, BlobTypeMap, Initialize, packer::PackSizer, repopacker::Repacker, + BlobId, BlobType, BlobTypeMap, Initialize, + pack_sizer::{DefaultPackSizer, PackSizer}, + repopacker::Repacker, tree::TreeStreamerOnce, }, error::{ErrorKind, RusticError, RusticResult}, @@ -747,7 +749,7 @@ impl PrunePlan { .repack_cacheable_only .unwrap_or_else(|| repo.config().is_hot == Some(true)); let pack_sizer = - total_size.map(|tpe, size| PackSizer::from_config(repo.config(), tpe, size)); + total_size.map(|tpe, size| DefaultPackSizer::from_config(repo.config(), tpe, size)); pruner.decide_packs( Duration::from_std(*opts.keep_pack).map_err(|err| { @@ -846,7 +848,7 @@ impl PrunePlan { repack_cacheable_only: bool, repack_uncompressed: bool, repack_all: bool, - pack_sizer: &BlobTypeMap, + pack_sizer: &BlobTypeMap, ) -> RusticResult<()> { // first process all marked packs then the unmarked ones: // - first processed packs are more likely to 
have all blobs seen as unused @@ -1007,7 +1009,7 @@ impl PrunePlan { max_unused: &LimitOption, repack_uncompressed: bool, no_resize: bool, - pack_sizer: &BlobTypeMap, + pack_sizer: &BlobTypeMap, ) { let max_unused = match (repack_uncompressed, max_unused) { (true, _) => 0, diff --git a/crates/core/src/commands/repair/snapshots.rs b/crates/core/src/commands/repair/snapshots.rs index fffef0bf9..eea364948 100644 --- a/crates/core/src/commands/repair/snapshots.rs +++ b/crates/core/src/commands/repair/snapshots.rs @@ -11,6 +11,7 @@ use crate::{ }, blob::{ BlobId, BlobType, + pack_sizer::DefaultPackSizer, repopacker::RepositoryPacker, tree::{Tree, TreeId}, }, @@ -107,7 +108,7 @@ pub(crate) fn repair_snapshots( let mut state = RepairState::default(); let indexer = Indexer::new().into_shared(); - let mut packer = RepositoryPacker::new( + let mut packer = RepositoryPacker::new_with_default_sizer( be.clone(), BlobType::Tree, indexer.clone(), @@ -198,7 +199,7 @@ pub(crate) fn repair_tree( be: &impl DecryptFullBackend, opts: &RepairSnapshotsOptions, index: &impl ReadGlobalIndex, - packer: &mut RepositoryPacker, + packer: &mut RepositoryPacker, id: Option, state: &mut RepairState, dry_run: bool, From d449b87ee98ec6632de3d370d6d9dbefc946f96a Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Wed, 4 Jun 2025 13:52:05 +0200 Subject: [PATCH 10/26] simplify SharedIndexer --- crates/core/src/archiver.rs | 9 ++--- crates/core/src/backend/decrypt.rs | 5 +++ crates/core/src/blob/repopacker.rs | 16 +++------ crates/core/src/commands/copy.rs | 5 +-- crates/core/src/commands/merge.rs | 5 +-- crates/core/src/commands/prune.rs | 14 ++------ crates/core/src/commands/repair/snapshots.rs | 5 +-- crates/core/src/index/indexer.rs | 38 ++++++++++++++++++-- 8 files changed, 54 insertions(+), 43 deletions(-) diff --git a/crates/core/src/archiver.rs b/crates/core/src/archiver.rs index 0bbde23df..566d435ce 100644 --- a/crates/core/src/archiver.rs +++ b/crates/core/src/archiver.rs @@ -217,10 +217,11 @@ impl<'a, BE: DecryptFullBackend, I: ReadGlobalIndex> Archiver<'a, BE, I> { stats.apply(&mut summary, BlobType::Data); self.snap.tree = id; - let res = self.indexer.write().unwrap().finalize(); - if let Some(file) = res { - _ = self.be.save_file(&file)?; - } + self.indexer + .finalize_and_check_save(|file| -> RusticResult<_> { + _ = self.be.save_file(file)?; + Ok(()) + })?; summary.finalize(self.snap.time).map_err(|err| { RusticError::with_source( diff --git a/crates/core/src/backend/decrypt.rs b/crates/core/src/backend/decrypt.rs index 0c0cb5ad5..db6e4d2c6 100644 --- a/crates/core/src/backend/decrypt.rs +++ b/crates/core/src/backend/decrypt.rs @@ -265,6 +265,11 @@ pub trait DecryptWriteBackend: WriteBackend + Clone + 'static { self.hash_write_full(F::TYPE, &data) } + fn save_file_no_id(&self, file: &F) -> RusticResult<()> { + let _ = self.save_file(file)?; + Ok(()) + } + /// Saves the given file uncompressed. 
/// /// # Arguments diff --git a/crates/core/src/blob/repopacker.rs b/crates/core/src/blob/repopacker.rs index 1ddd15eba..6dda49e87 100644 --- a/crates/core/src/blob/repopacker.rs +++ b/crates/core/src/blob/repopacker.rs @@ -146,7 +146,7 @@ impl RepositoryPa .into_iter() .readahead_scoped(scope) // early check if id is already contained and reserve, if not - .filter(|(_, id)| indexer.write().unwrap().reserve(id)) + .filter(|(_, id)| indexer.reserve(id)) .parallel_map_scoped(scope, |(data, id): (Bytes, BlobId)| { let (data, data_len, uncompressed_length) = be.process_data(&data)?; Ok((data, id, u64::from(data_len), uncompressed_length)) @@ -237,7 +237,7 @@ impl RepositoryPa uncompressed_length: Option, ) -> RusticResult<()> { // only add if this blob is not present - if self.indexer.write().unwrap().reserve(id) { + if self.indexer.reserve(id) { let mut raw_packer = self.packer.write().unwrap(); raw_packer.add(data, id, data_len, uncompressed_length)?; @@ -298,16 +298,8 @@ impl FileWriterHandle { } fn index(&self, index: IndexPack) -> RusticResult<()> { - let res = { - let mut indexer = self.indexer.write().unwrap(); - - indexer.add(index); - indexer.save_if_needed() - }; - if let Some(file) = res { - let _ = self.be.save_file(&file)?; - } - Ok(()) + self.indexer + .add_and_check_save(index, false, |file| self.be.save_file_no_id(file)) } } diff --git a/crates/core/src/commands/copy.rs b/crates/core/src/commands/copy.rs index 2b140ee83..55dbcf04d 100644 --- a/crates/core/src/commands/copy.rs +++ b/crates/core/src/commands/copy.rs @@ -128,10 +128,7 @@ pub(crate) fn copy<'a, Q, R: IndexedFull, P: ProgressBars, S: IndexedIds>( _ = data_packer.finalize()?; _ = tree_packer.finalize()?; - let res = indexer.write().unwrap().finalize(); - if let Some(file) = res { - let _ = be_dest.save_file(&file)?; - } + indexer.finalize_and_check_save(|file| be_dest.save_file_no_id(file))?; let p = pb.progress_counter("saving snapshots..."); be_dest.save_list(snaps.iter(), p)?; diff --git a/crates/core/src/commands/merge.rs b/crates/core/src/commands/merge.rs index 8ed30abe8..3b28bd626 100644 --- a/crates/core/src/commands/merge.rs +++ b/crates/core/src/commands/merge.rs @@ -136,10 +136,7 @@ pub(crate) fn merge_trees( let p = repo.pb.progress_spinner("merging snapshots..."); let tree_merged = tree::merge_trees(be, index, trees, cmp, &save, summary)?; let stats = packer.finalize()?; - let res = indexer.write().unwrap().finalize(); - if let Some(file) = res { - let _ = be.save_file(&file)?; - } + indexer.finalize_and_check_save(|file| be.save_file_no_id(file))?; p.finish(); stats.apply(summary, BlobType::Tree); diff --git a/crates/core/src/commands/prune.rs b/crates/core/src/commands/prune.rs index 2f327c2dc..3300a9512 100644 --- a/crates/core/src/commands/prune.rs +++ b/crates/core/src/commands/prune.rs @@ -1428,23 +1428,13 @@ pub(crate) fn prune_repository( PackToDo::Delete => delete_pack(pack), }; if let Some((pack, delete)) = to_index { - let res = { - let mut indexer = indexer.write().unwrap(); - indexer.add_with(pack, delete); - indexer.save_if_needed() - }; - if let Some(file) = res { - _ = be.save_file(&file)?; - } + indexer.add_and_check_save(pack, delete, |file| be.save_file_no_id(file))?; } Ok(()) })?; _ = tree_repacker.finalize()?; _ = data_repacker.finalize()?; - let res = indexer.write().unwrap().finalize(); - if let Some(file) = res { - _ = be.save_file(&file)?; - } + indexer.finalize_and_check_save(|file| be.save_file_no_id(file))?; p.finish(); // remove old index files first as they may reference 
pack files which are removed soon. diff --git a/crates/core/src/commands/repair/snapshots.rs b/crates/core/src/commands/repair/snapshots.rs index eea364948..b1ef91373 100644 --- a/crates/core/src/commands/repair/snapshots.rs +++ b/crates/core/src/commands/repair/snapshots.rs @@ -155,10 +155,7 @@ pub(crate) fn repair_snapshots( if !dry_run { _ = packer.finalize()?; - let mut indexer = indexer.write().unwrap(); - if let Some(file) = indexer.finalize() { - _ = be.save_file(&file)?; - } + indexer.finalize_and_check_save(|file| be.save_file_no_id(file))?; } if opts.delete { diff --git a/crates/core/src/index/indexer.rs b/crates/core/src/index/indexer.rs index f3d75bab0..708c296f7 100644 --- a/crates/core/src/index/indexer.rs +++ b/crates/core/src/index/indexer.rs @@ -20,8 +20,6 @@ pub(super) mod constants { pub(super) const MAX_AGE: Duration = Duration::from_secs(300); } -pub(crate) type SharedIndexer = Arc>; - /// The `Indexer` is responsible for indexing blobs. #[derive(Debug)] pub struct Indexer { @@ -78,7 +76,7 @@ impl Indexer { /// /// * `BE` - The backend type. pub fn into_shared(self) -> SharedIndexer { - Arc::new(RwLock::new(self)) + SharedIndexer(Arc::new(RwLock::new(self))) } /// Finalizes the `Indexer`. @@ -191,3 +189,37 @@ impl Indexer { .is_some_and(|indexed| indexed.insert(*id)) } } + +#[derive(Debug, Clone)] +pub(crate) struct SharedIndexer(Arc>); + +impl SharedIndexer { + pub fn add_and_check_save( + &self, + pack: IndexPack, + delete: bool, + writer: impl Fn(&IndexFile) -> Result<(), E>, + ) -> Result<(), E> { + let mut indexer = self.0.write().unwrap(); + indexer.add_with(pack, delete); + let res = indexer.save_if_needed(); + drop(indexer); + + res.as_ref().map_or(Ok(()), writer) + } + + pub fn finalize_and_check_save( + &self, + writer: impl Fn(&IndexFile) -> Result<(), E>, + ) -> Result<(), E> { + let mut indexer = self.0.write().unwrap(); + let res = indexer.finalize(); + drop(indexer); + + res.as_ref().map_or(Ok(()), writer) + } + + pub fn reserve(&self, id: &BlobId) -> bool { + self.0.write().unwrap().reserve(id) + } +} From 063017db88b332dc2b8678d3e372a75b16aa80a8 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Wed, 4 Jun 2025 14:22:21 +0200 Subject: [PATCH 11/26] clean up Packer errors --- crates/core/src/archiver.rs | 5 +-- crates/core/src/blob/repopacker.rs | 56 +++++------------------------- 2 files changed, 10 insertions(+), 51 deletions(-) diff --git a/crates/core/src/archiver.rs b/crates/core/src/archiver.rs index 566d435ce..cac677350 100644 --- a/crates/core/src/archiver.rs +++ b/crates/core/src/archiver.rs @@ -218,10 +218,7 @@ impl<'a, BE: DecryptFullBackend, I: ReadGlobalIndex> Archiver<'a, BE, I> { self.snap.tree = id; self.indexer - .finalize_and_check_save(|file| -> RusticResult<_> { - _ = self.be.save_file(file)?; - Ok(()) - })?; + .finalize_and_check_save(|file| self.be.save_file_no_id(file))?; summary.finalize(self.snap.time).map_err(|err| { RusticError::with_source( diff --git a/crates/core/src/blob/repopacker.rs b/crates/core/src/blob/repopacker.rs index 6dda49e87..e66d00769 100644 --- a/crates/core/src/blob/repopacker.rs +++ b/crates/core/src/blob/repopacker.rs @@ -29,32 +29,6 @@ use super::{ packer::{Packer, PackerStats}, }; -/// [`PackerErrorKind`] describes the errors that can be returned for a Packer -#[derive(thiserror::Error, Debug, displaydoc::Display)] -#[non_exhaustive] -pub enum PackerErrorKind { - /// Conversion from `{from}` to `{to}` failed: `{source}` - Conversion { - to: &'static str, - from: &'static str, - source: 
std::num::TryFromIntError, - }, - /// Sending crossbeam message failed: `id`: `{id:?}`, `data`: `{data:?}` : `{source}` - SendingCrossbeamMessage { - id: BlobId, - data: Bytes, - source: crossbeam_channel::SendError<(Bytes, BlobId)>, - }, - /// Sending crossbeam data message failed: `data`: `{data:?}`, `index_pack`: `{index_pack:?}` : `{source}` - SendingCrossbeamDataMessage { - data: Bytes, - index_pack: IndexPack, - source: crossbeam_channel::SendError<(Bytes, IndexPack)>, - }, -} - -pub(crate) type PackerResult = Result>; - /// The `Packer` is responsible for packing blobs into pack files. /// /// # Type Parameters @@ -160,13 +134,7 @@ impl RepositoryPa raw_packer.save_if_needed()? }; if let Some((file, index)) = res { - file_writer.send((file, index)).map_err(|err| { - RusticError::with_source( - ErrorKind::Internal, - "Failed to send packfile to file writer.", - err, - ) - })?; + file_writer.send((file, index))?; } Ok(()) @@ -174,13 +142,7 @@ impl RepositoryPa .and_then(|()| { let (res, stats) = packer.write().unwrap().finalize()?; if let Some((file, index)) = res { - file_writer.send((file, index)).map_err(|err| { - RusticError::with_source( - ErrorKind::Internal, - "Failed to send packfile to file writer.", - err, - ) - })?; + file_writer.send((file, index))?; } Ok(stats) }); @@ -365,13 +327,13 @@ impl Actor { /// # Errors /// /// If sending the message to the actor fails. - fn send(&self, load: (Bytes, IndexPack)) -> PackerResult<()> { - self.sender.send(load.clone()).map_err(|err| { - PackerErrorKind::SendingCrossbeamDataMessage { - data: load.0, - index_pack: load.1, - source: err, - } + fn send(&self, load: (Bytes, IndexPack)) -> RusticResult<()> { + self.sender.send(load).map_err(|err| { + RusticError::with_source( + ErrorKind::Internal, + "Failed to send packfile to file writer.", + err, + ) })?; Ok(()) } From 228defda2195122bb6b7c45cbf1379235ccc52c0 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Wed, 4 Jun 2025 15:59:18 +0200 Subject: [PATCH 12/26] simplify RepositoryPacker --- crates/core/src/archiver.rs | 4 +- crates/core/src/archiver/file_archiver.rs | 13 ++- crates/core/src/archiver/tree_archiver.rs | 9 +- crates/core/src/blob/repopacker.rs | 93 ++++++++++---------- crates/core/src/commands/repair/snapshots.rs | 5 +- 5 files changed, 59 insertions(+), 65 deletions(-) diff --git a/crates/core/src/archiver.rs b/crates/core/src/archiver.rs index cac677350..5e5751328 100644 --- a/crates/core/src/archiver.rs +++ b/crates/core/src/archiver.rs @@ -40,10 +40,10 @@ pub struct TreeStackEmptyError; #[allow(clippy::struct_field_names)] pub struct Archiver<'a, BE: DecryptFullBackend, I: ReadGlobalIndex> { /// The `FileArchiver` is responsible for archiving files. - file_archiver: FileArchiver<'a, BE, I>, + file_archiver: FileArchiver<'a, I>, /// The `TreeArchiver` is responsible for archiving trees. - tree_archiver: TreeArchiver<'a, BE, I>, + tree_archiver: TreeArchiver<'a, I>, /// The parent snapshot to use. 
parent: Parent, diff --git a/crates/core/src/archiver/file_archiver.rs b/crates/core/src/archiver/file_archiver.rs index 7de244192..d81af25f7 100644 --- a/crates/core/src/archiver/file_archiver.rs +++ b/crates/core/src/archiver/file_archiver.rs @@ -13,10 +13,7 @@ use crate::{ decrypt::DecryptWriteBackend, node::{Node, NodeType}, }, - blob::{ - BlobId, BlobType, DataId, pack_sizer::DefaultPackSizer, packer::PackerStats, - repopacker::RepositoryPacker, - }, + blob::{BlobId, BlobType, DataId, packer::PackerStats, repopacker::RepositoryPacker}, chunker::ChunkIter, crypto::hasher::hash, error::{ErrorKind, RusticError, RusticResult}, @@ -33,13 +30,13 @@ use crate::{ /// * `BE` - The backend type. /// * `I` - The index to read from. #[derive(Clone)] -pub(crate) struct FileArchiver<'a, BE: DecryptWriteBackend, I: ReadGlobalIndex> { +pub(crate) struct FileArchiver<'a, I: ReadGlobalIndex> { index: &'a I, - data_packer: RepositoryPacker, + data_packer: RepositoryPacker, rabin: Rabin64, } -impl<'a, BE: DecryptWriteBackend, I: ReadGlobalIndex> FileArchiver<'a, BE, I> { +impl<'a, I: ReadGlobalIndex> FileArchiver<'a, I> { /// Creates a new `FileArchiver`. /// /// # Type Parameters @@ -58,7 +55,7 @@ impl<'a, BE: DecryptWriteBackend, I: ReadGlobalIndex> FileArchiver<'a, BE, I> { /// /// * If sending the message to the raw packer fails. /// * If converting the data length to u64 fails - pub(crate) fn new( + pub(crate) fn new( be: BE, index: &'a I, indexer: SharedIndexer, diff --git a/crates/core/src/archiver/tree_archiver.rs b/crates/core/src/archiver/tree_archiver.rs index 7fcb82eea..cec4b9f97 100644 --- a/crates/core/src/archiver/tree_archiver.rs +++ b/crates/core/src/archiver/tree_archiver.rs @@ -8,7 +8,6 @@ use crate::{ backend::{decrypt::DecryptWriteBackend, node::Node}, blob::{ BlobType, - pack_sizer::DefaultPackSizer, repopacker::RepositoryPacker, tree::{Tree, TreeId}, }, @@ -27,7 +26,7 @@ pub(crate) type TreeItem = TreeType<(ParentResult<()>, u64), ParentResult { +pub(crate) struct TreeArchiver<'a, I: ReadGlobalIndex> { /// The current tree. tree: Tree, /// The stack of trees. @@ -35,12 +34,12 @@ pub(crate) struct TreeArchiver<'a, BE: DecryptWriteBackend, I: ReadGlobalIndex> /// The index to read from. index: &'a I, /// The packer to write to. - tree_packer: RepositoryPacker, + tree_packer: RepositoryPacker, /// The summary of the snapshot. summary: SnapshotSummary, } -impl<'a, BE: DecryptWriteBackend, I: ReadGlobalIndex> TreeArchiver<'a, BE, I> { +impl<'a, I: ReadGlobalIndex> TreeArchiver<'a, I> { /// Creates a new `TreeArchiver`. /// /// # Type Parameters @@ -60,7 +59,7 @@ impl<'a, BE: DecryptWriteBackend, I: ReadGlobalIndex> TreeArchiver<'a, BE, I> { /// /// * If sending the message to the raw packer fails. /// * If converting the data length to u64 fails - pub(crate) fn new( + pub(crate) fn new( be: BE, index: &'a I, indexer: SharedIndexer, diff --git a/crates/core/src/blob/repopacker.rs b/crates/core/src/blob/repopacker.rs index e66d00769..ae64aed78 100644 --- a/crates/core/src/blob/repopacker.rs +++ b/crates/core/src/blob/repopacker.rs @@ -1,7 +1,4 @@ -use std::{ - num::NonZeroU32, - sync::{Arc, RwLock}, -}; +use std::num::NonZeroU32; use bytes::Bytes; use chrono::Local; @@ -29,6 +26,8 @@ use super::{ packer::{Packer, PackerStats}, }; +type RawSender = Sender, BlobId, u64, Option)>>; + /// The `Packer` is responsible for packing blobs into pack files. 
/// /// # Type Parameters @@ -37,24 +36,20 @@ use super::{ #[allow(missing_debug_implementations)] #[allow(clippy::struct_field_names)] #[derive(Clone)] -pub struct RepositoryPacker { - /// The raw packer wrapped in an `Arc` and `RwLock`. - // This is a hack: raw_packer and indexer are only used in the add_raw() method. - // TODO: Refactor as actor, like the other add() methods - packer: Arc>>, +pub struct RepositoryPacker { /// The shared indexer containing the backend. indexer: SharedIndexer, - /// The actor to write the pack file - file_writer: Actor, - /// The sender to send blobs to the raw packer. + /// The sender to send blobs to the packer. sender: Sender<(Bytes, BlobId)>, + /// The sender to send raw blobs to the packer. + raw_sender: RawSender, /// The receiver to receive the status from the raw packer. finish: Receiver>, } -impl RepositoryPacker { +impl RepositoryPacker { #[allow(clippy::unnecessary_wraps)] - pub fn new_with_default_sizer( + pub fn new_with_default_sizer( be: BE, blob_type: BlobType, indexer: SharedIndexer, @@ -66,7 +61,7 @@ impl RepositoryPacker { } } -impl RepositoryPacker { +impl RepositoryPacker { /// Creates a new `Packer`. /// /// # Type Parameters @@ -86,13 +81,13 @@ impl RepositoryPa /// * If sending the message to the raw packer fails. /// * If converting the data length to u64 fails #[allow(clippy::unnecessary_wraps)] - pub fn new( + pub fn new( be: BE, blob_type: BlobType, indexer: SharedIndexer, pack_sizer: S, ) -> RusticResult { - let packer = Arc::new(RwLock::new(Packer::new(*be.key(), pack_sizer, blob_type))); + let mut packer = Packer::new(*be.key(), pack_sizer, blob_type); let file_writer = Actor::new( FileWriterHandle { @@ -105,19 +100,18 @@ impl RepositoryPa ); let (tx, rx) = bounded(0); + let (tx_raw, rx_raw) = bounded(0); let (finish_tx, finish_rx) = bounded::>(0); let repository_packer = Self { - packer: packer.clone(), indexer: indexer.clone(), - file_writer: file_writer.clone(), sender: tx, + raw_sender: tx_raw.clone(), finish: finish_rx, }; let _join_handle = std::thread::spawn(move || { scope(|scope| { - let status = rx - .into_iter() + rx.into_iter() .readahead_scoped(scope) // early check if id is already contained and reserve, if not .filter(|(_, id)| indexer.reserve(id)) @@ -126,24 +120,31 @@ impl RepositoryPa Ok((data, id, u64::from(data_len), uncompressed_length)) }) .readahead_scoped(scope) + .for_each(|item: RusticResult<_>| tx_raw.send(item).unwrap()); + }) + .unwrap(); + }); + + let _join_handle_raw = std::thread::spawn(move || { + scope(|scope| { + let status = rx_raw + .into_iter() + .readahead_scoped(scope) .try_for_each(|item: RusticResult<_>| -> RusticResult<()> { let (data, id, data_len, ul) = item?; - let res = { - let mut raw_packer = packer.write().unwrap(); - raw_packer.add(&data, &id, data_len, ul)?; - raw_packer.save_if_needed()? - }; - if let Some((file, index)) = res { + packer.add(&data, &id, data_len, ul)?; + if let Some((file, index)) = packer.save_if_needed()? { file_writer.send((file, index))?; } Ok(()) }) .and_then(|()| { - let (res, stats) = packer.write().unwrap().finalize()?; + let (res, stats) = packer.finalize()?; if let Some((file, index)) = res { file_writer.send((file, index))?; } + file_writer.finalize()?; Ok(stats) }); _ = finish_tx.send(status); @@ -193,25 +194,25 @@ impl RepositoryPa /// * If sending the message to the raw packer fails. 
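Patch 12 above removes the `Arc<RwLock<Packer>>` hack: the packer is moved into a dedicated thread that is the sole owner of its state, fed through two channels, one for fresh blobs that pass through the compression stage and one (`raw_sender`) that `add_raw` uses to inject already-processed blobs directly. A minimal sketch of that fan-in, with byte vectors as simplified payloads:

    use crossbeam_channel::bounded;

    fn main() {
        let (tx, rx) = bounded::<Vec<u8>>(0); // fresh blobs
        let (tx_raw, rx_raw) = bounded::<Vec<u8>>(0); // already-processed blobs
        let forward = tx_raw.clone();

        // Stage 1: "process" fresh blobs, then forward into the raw channel.
        let stage1 = std::thread::spawn(move || {
            for blob in rx {
                forward.send(blob).unwrap(); // encryption/compression would happen here
            }
        });

        // Stage 2: sole owner of the packer state; no lock needed.
        let packer = std::thread::spawn(move || rx_raw.into_iter().count());

        tx.send(vec![1]).unwrap(); // add(): goes through stage 1
        tx_raw.send(vec![2]).unwrap(); // add_raw(): bypasses stage 1
        drop(tx);
        drop(tx_raw);
        stage1.join().unwrap();
        assert_eq!(packer.join().unwrap(), 2);
    }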
fn add_raw( &self, - data: &[u8], - id: &BlobId, + data: Vec, + id: BlobId, data_len: u64, uncompressed_length: Option, ) -> RusticResult<()> { // only add if this blob is not present - if self.indexer.reserve(id) { - let mut raw_packer = self.packer.write().unwrap(); - raw_packer.add(data, id, data_len, uncompressed_length)?; - - if let Some((file, index)) = raw_packer.save_if_needed()? { - self.file_writer.send((file, index)).map_err(|err| { + if self.indexer.reserve(&id) { + // compute size limit based on total size and size bounds + self.raw_sender + .send(Ok((data, id, data_len, uncompressed_length))) + .map_err(|err| { RusticError::with_source( ErrorKind::Internal, - "Failed to send packfile to file writer.", + "Failed to add blob `{id}` to packfile.", err, ) + .attach_context("id", id.to_string()) + .ask_report() })?; - } } Ok(()) } @@ -222,15 +223,13 @@ impl RepositoryPa /// /// * If the channel could not be dropped pub fn finalize(self) -> RusticResult { - // cancel channel + // cancel channels drop(self.sender); - // wait for items in channel to be processed - let res = self - .finish + drop(self.raw_sender); + // wait for items in channels to be processed + self.finish .recv() - .expect("Should be able to receive from channel to finalize packer."); - self.file_writer.finalize()?; - res + .expect("Should be able to receive from channel to finalize packer.") } } @@ -364,7 +363,7 @@ where /// The backend to read from. be: BE, /// The packer to write to. - packer: RepositoryPacker, + packer: RepositoryPacker, /// The size limit of the pack file. size_limit: u32, } @@ -425,7 +424,7 @@ impl Repacker { )?; self.packer - .add_raw(&data, &blob.id, 0, blob.uncompressed_length) + .add_raw(data.to_vec(), blob.id, 0, blob.uncompressed_length) .map_err(|err| { err.overwrite_kind(ErrorKind::Internal) .prepend_guidance_line( diff --git a/crates/core/src/commands/repair/snapshots.rs b/crates/core/src/commands/repair/snapshots.rs index b1ef91373..0bd0059e3 100644 --- a/crates/core/src/commands/repair/snapshots.rs +++ b/crates/core/src/commands/repair/snapshots.rs @@ -11,7 +11,6 @@ use crate::{ }, blob::{ BlobId, BlobType, - pack_sizer::DefaultPackSizer, repopacker::RepositoryPacker, tree::{Tree, TreeId}, }, @@ -192,11 +191,11 @@ pub(crate) fn repair_snapshots( /// # Returns /// /// A tuple containing the change status and the id of the repaired tree -pub(crate) fn repair_tree( +pub(crate) fn repair_tree( be: &impl DecryptFullBackend, opts: &RepairSnapshotsOptions, index: &impl ReadGlobalIndex, - packer: &mut RepositoryPacker, + packer: &mut RepositoryPacker, id: Option, state: &mut RepairState, dry_run: bool, From c38d762adad9098fe5fc6c5e8b802a7390945a7f Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Wed, 4 Jun 2025 16:16:31 +0200 Subject: [PATCH 13/26] use self in finalize --- crates/core/src/blob/packer.rs | 6 ++---- crates/core/src/blob/repopacker.rs | 10 +++++----- crates/core/src/index/indexer.rs | 12 +++++++----- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/crates/core/src/blob/packer.rs b/crates/core/src/blob/packer.rs index 7411b6c64..2fa3a70f0 100644 --- a/crates/core/src/blob/packer.rs +++ b/crates/core/src/blob/packer.rs @@ -148,10 +148,8 @@ impl Packer { /// # Errors /// /// * If the packfile could not be saved - pub fn finalize(&mut self) -> RusticResult<(Option<(Bytes, IndexPack)>, PackerStats)> { - let stats = std::mem::take(&mut self.stats); - - Ok((self.save()?, stats)) + pub fn finalize(mut self) -> RusticResult<(Option<(Bytes, IndexPack)>, PackerStats)> 
{ + Ok((self.save()?, self.stats)) } /// Writes the given data to the packfile. diff --git a/crates/core/src/blob/repopacker.rs b/crates/core/src/blob/repopacker.rs index ae64aed78..71ea75294 100644 --- a/crates/core/src/blob/repopacker.rs +++ b/crates/core/src/blob/repopacker.rs @@ -294,9 +294,8 @@ impl Actor { let (finish_tx, finish_rx) = bounded::>(0); let _join_handle = std::thread::spawn(move || { - scope(|scope| { - let status = rx - .into_iter() + let status = scope(|scope| -> RusticResult<_> { + rx.into_iter() .readahead_scoped(scope) .map(|(file, index): (Bytes, IndexPack)| { let id = hash(&file); @@ -305,10 +304,11 @@ impl Actor { .readahead_scoped(scope) .map(|load| fwh.process(load)) .readahead_scoped(scope) - .try_for_each(|index| fwh.index(index?)); - _ = finish_tx.send(status); + .try_for_each(|index| fwh.index(index?)) }) .unwrap(); + drop(fwh); + _ = finish_tx.send(status); }); Self { diff --git a/crates/core/src/index/indexer.rs b/crates/core/src/index/indexer.rs index 708c296f7..4e9c7319a 100644 --- a/crates/core/src/index/indexer.rs +++ b/crates/core/src/index/indexer.rs @@ -84,7 +84,7 @@ impl Indexer { /// # Errors /// /// * If the index file could not be serialized. - pub fn finalize(&mut self) -> Option { + pub fn finalize(mut self) -> Option { self.save() } @@ -209,12 +209,14 @@ impl SharedIndexer { } pub fn finalize_and_check_save( - &self, + self, writer: impl Fn(&IndexFile) -> Result<(), E>, ) -> Result<(), E> { - let mut indexer = self.0.write().unwrap(); - let res = indexer.finalize(); - drop(indexer); + let res = Arc::try_unwrap(self.0) + .expect("indexer still in use") + .into_inner() + .unwrap() + .finalize(); res.as_ref().map_or(Ok(()), writer) } From 8dd914d13520655289bc6d81a5ec571205d4f94c Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Thu, 5 Jun 2025 10:42:00 +0200 Subject: [PATCH 14/26] corrections --- crates/core/src/blob/packer.rs | 1 + crates/core/src/blob/repopacker.rs | 7 ++++++- crates/core/src/index/indexer.rs | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/crates/core/src/blob/packer.rs b/crates/core/src/blob/packer.rs index 2fa3a70f0..74fa2d097 100644 --- a/crates/core/src/blob/packer.rs +++ b/crates/core/src/blob/packer.rs @@ -353,6 +353,7 @@ impl Packer { // prepare everything for write to the backend let file = std::mem::take(&mut self.file).into(); let index = std::mem::take(&mut self.index); + self.pack_sizer.add_size(self.size); self.size = 0; diff --git a/crates/core/src/blob/repopacker.rs b/crates/core/src/blob/repopacker.rs index 71ea75294..75865ac5e 100644 --- a/crates/core/src/blob/repopacker.rs +++ b/crates/core/src/blob/repopacker.rs @@ -424,7 +424,12 @@ impl Repacker { )?; self.packer - .add_raw(data.to_vec(), blob.id, 0, blob.uncompressed_length) + .add_raw( + data.to_vec(), + blob.id, + blob.length.into(), + blob.uncompressed_length, + ) .map_err(|err| { err.overwrite_kind(ErrorKind::Internal) .prepend_guidance_line( diff --git a/crates/core/src/index/indexer.rs b/crates/core/src/index/indexer.rs index 4e9c7319a..087e56545 100644 --- a/crates/core/src/index/indexer.rs +++ b/crates/core/src/index/indexer.rs @@ -186,7 +186,7 @@ impl Indexer { pub fn reserve(&mut self, id: &BlobId) -> bool { self.indexed .as_mut() - .is_some_and(|indexed| indexed.insert(*id)) + .is_none_or(|indexed| indexed.insert(*id)) } } From 0b11bde017c2769518d7b76dabc81463ee609a42 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Thu, 5 Jun 2025 10:42:10 +0200 Subject: [PATCH 15/26] adjust docu --- 
crates/core/src/archiver/file_archiver.rs | 6 ------ crates/core/src/archiver/tree_archiver.rs | 8 -------- crates/core/src/backend/decrypt.rs | 5 +++++ crates/core/src/blob/pack_sizer.rs | 13 +++++-------- crates/core/src/blob/packer.rs | 17 ++--------------- 5 files changed, 12 insertions(+), 37 deletions(-) diff --git a/crates/core/src/archiver/file_archiver.rs b/crates/core/src/archiver/file_archiver.rs index d81af25f7..959a444a3 100644 --- a/crates/core/src/archiver/file_archiver.rs +++ b/crates/core/src/archiver/file_archiver.rs @@ -27,7 +27,6 @@ use crate::{ /// /// # Type Parameters /// -/// * `BE` - The backend type. /// * `I` - The index to read from. #[derive(Clone)] pub(crate) struct FileArchiver<'a, I: ReadGlobalIndex> { @@ -50,11 +49,6 @@ impl<'a, I: ReadGlobalIndex> FileArchiver<'a, I> { /// * `index` - The index to read from. /// * `indexer` - The indexer to write to. /// * `config` - The config file. - /// - /// # Errors - /// - /// * If sending the message to the raw packer fails. - /// * If converting the data length to u64 fails pub(crate) fn new( be: BE, index: &'a I, diff --git a/crates/core/src/archiver/tree_archiver.rs b/crates/core/src/archiver/tree_archiver.rs index cec4b9f97..3c55ffa20 100644 --- a/crates/core/src/archiver/tree_archiver.rs +++ b/crates/core/src/archiver/tree_archiver.rs @@ -22,10 +22,7 @@ pub(crate) type TreeItem = TreeType<(ParentResult<()>, u64), ParentResult { /// The current tree. tree: Tree, @@ -54,11 +51,6 @@ impl<'a, I: ReadGlobalIndex> TreeArchiver<'a, I> { /// * `indexer` - The indexer to write to. /// * `config` - The config file. /// * `summary` - The summary of the snapshot. - /// - /// # Errors - /// - /// * If sending the message to the raw packer fails. - /// * If converting the data length to u64 fails pub(crate) fn new( be: BE, index: &'a I, diff --git a/crates/core/src/backend/decrypt.rs b/crates/core/src/backend/decrypt.rs index db6e4d2c6..ea64e20a6 100644 --- a/crates/core/src/backend/decrypt.rs +++ b/crates/core/src/backend/decrypt.rs @@ -265,6 +265,11 @@ pub trait DecryptWriteBackend: WriteBackend + Clone + 'static { self.hash_write_full(F::TYPE, &data) } + /// Saves the given file without returning the id. + /// + /// # Arguments + /// + /// * `file` - The file to save. fn save_file_no_id(&self, file: &F) -> RusticResult<()> { let _ = self.save_file(file)?; Ok(()) diff --git a/crates/core/src/blob/pack_sizer.rs b/crates/core/src/blob/pack_sizer.rs index 8382b12e7..da83c28b7 100644 --- a/crates/core/src/blob/pack_sizer.rs +++ b/crates/core/src/blob/pack_sizer.rs @@ -3,6 +3,7 @@ use crate::repofile::ConfigFile; use integer_sqrt::IntegerSquareRoot; +/// The pack sizer is responsible for computing the size of the pack file. pub trait PackSizer { /// Computes the size of the pack file. #[must_use] @@ -43,14 +44,10 @@ pub trait PackSizer { /// # Arguments /// /// * `added` - The size to add - /// - /// # Panics - /// - /// * If the size is too large fn add_size(&mut self, _added: u32) {} } -/// The pack sizer is responsible for computing the size of the pack file. +/// The default pack sizer computes pack sizes depending on a default size, a grow factor and a size limit. #[derive(Debug, Clone, Copy)] pub struct DefaultPackSizer { /// The default size of a pack file. @@ -68,7 +65,7 @@ pub struct DefaultPackSizer { } impl DefaultPackSizer { - /// Creates a new `PackSizer` from a config file.
/// /// # Arguments /// @@ -78,7 +75,7 @@ impl DefaultPackSizer { /// /// # Returns /// - /// A new `PackSizer`. + /// A new `DefaultPackSizer`. #[must_use] pub fn from_config(config: &ConfigFile, blob_type: BlobType, current_size: u64) -> Self { let (default_size, grow_factor, size_limit) = config.packsize(blob_type); @@ -121,7 +118,7 @@ impl PackSizer for DefaultPackSizer { } } -/// The pack sizer is responsible for computing the size of the pack file. +/// A pack sizer which uses a fixed pack size #[derive(Debug, Clone, Copy)] pub struct FixedPackSizer(pub u32); diff --git a/crates/core/src/blob/packer.rs b/crates/core/src/blob/packer.rs index 74fa2d097..aeea6abc1 100644 --- a/crates/core/src/blob/packer.rs +++ b/crates/core/src/blob/packer.rs @@ -230,6 +230,7 @@ impl Packer { Ok(()) } + /// Determines if the current pack should be saved. pub fn needs_save(&self) -> bool { if self.size == 0 { return false; @@ -313,16 +314,7 @@ impl Packer { Ok(()) } - /// Saves the packfile - /// - /// # Errors - /// - /// If the header could not be written - /// - /// # Errors - /// - /// * If converting the header length to u32 fails - /// * If the header could not be written + /// Saves the packfile if conditions for saving are fulfilled pub fn save_if_needed(&mut self) -> RusticResult> { if !self.needs_save() { return Ok(None); @@ -336,11 +328,6 @@ impl Packer { /// # Errors /// /// If the header could not be written - /// - /// # Errors - /// - /// * If converting the header length to u32 fails - /// * If the header could not be written pub fn save(&mut self) -> RusticResult> { self.created = SystemTime::now(); self.count = 0; From 3abadd4c70fdc3797a1a55dd23f7f3d3060ef9e2 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Sat, 14 Jun 2025 21:47:06 +0200 Subject: [PATCH 16/26] import from root --- crates/core/src/blob/pack_sizer.rs | 3 +-- crates/core/src/blob/repopacker.rs | 11 +++++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/crates/core/src/blob/pack_sizer.rs b/crates/core/src/blob/pack_sizer.rs index da83c28b7..25f20ded8 100644 --- a/crates/core/src/blob/pack_sizer.rs +++ b/crates/core/src/blob/pack_sizer.rs @@ -1,5 +1,4 @@ -use super::BlobType; -use crate::repofile::ConfigFile; +use crate::{blob::BlobType, repofile::ConfigFile}; use integer_sqrt::IntegerSquareRoot; diff --git a/crates/core/src/blob/repopacker.rs b/crates/core/src/blob/repopacker.rs index 75865ac5e..1bc6683bc 100644 --- a/crates/core/src/blob/repopacker.rs +++ b/crates/core/src/blob/repopacker.rs @@ -10,7 +10,11 @@ use crate::{ FileType, decrypt::{DecryptFullBackend, DecryptWriteBackend}, }, - blob::{BlobId, BlobType}, + blob::{ + BlobId, BlobType, + pack_sizer::{DefaultPackSizer, FixedPackSizer, PackSizer}, + packer::{Packer, PackerStats}, + }, crypto::hasher::hash, error::{ErrorKind, RusticError, RusticResult}, index::indexer::SharedIndexer, @@ -21,11 +25,6 @@ use crate::{ }, }; -use super::{ - pack_sizer::{DefaultPackSizer, FixedPackSizer, PackSizer}, - packer::{Packer, PackerStats}, -}; - type RawSender = Sender, BlobId, u64, Option)>>; /// The `Packer` is responsible for packing blobs into pack files. 
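An illustrative aside before the next patch documents these types: the pack file writer follows a small actor pattern in which work items flow over one channel to a spawned worker thread, and a separate finish channel hands back the final result once all senders are dropped. That is the same shutdown-by-drop shape used in RepositoryPacker::finalize above. The sketch below is a minimal stand-in using std::sync::mpsc instead of the crate's crossbeam_channel; all names in it are illustrative, not crate API.

    use std::sync::mpsc;
    use std::thread;

    fn main() {
        // work channel (the role of `sender`) and result channel (the role of `finish`)
        let (tx, rx) = mpsc::channel::<u32>();
        let (finish_tx, finish_rx) = mpsc::channel::<u32>();

        let _worker = thread::spawn(move || {
            // drain work items until every sender is dropped, then report the result
            let sum: u32 = rx.into_iter().sum();
            let _ = finish_tx.send(sum);
        });

        tx.send(1).unwrap();
        tx.send(2).unwrap();
        drop(tx); // closing the work channel is what lets the actor finish
        assert_eq!(finish_rx.recv().unwrap(), 3);
    }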
From 618a1c8f88ef52e6a9f18e3d9d673ca38acabbef Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Sat, 14 Jun 2025 22:29:32 +0200 Subject: [PATCH 17/26] Add more docu --- crates/core/src/blob/repopacker.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/crates/core/src/blob/repopacker.rs b/crates/core/src/blob/repopacker.rs index 1bc6683bc..59bf309cd 100644 --- a/crates/core/src/blob/repopacker.rs +++ b/crates/core/src/blob/repopacker.rs @@ -88,7 +88,7 @@ impl RepositoryPacker { ) -> RusticResult { let mut packer = Packer::new(*be.key(), pack_sizer, blob_type); - let file_writer = Actor::new( + let file_writer = FileWriter::new( FileWriterHandle { be: be.clone(), indexer: indexer.clone(), @@ -232,7 +232,8 @@ impl RepositoryPacker { } } -// TODO: add documentation +/// The handle for the [`FileWriter`] actor, responsible for writing pack files and indexing them using an indexer. +/// /// # Type Parameters /// /// * `BE` - The backend type. @@ -247,7 +248,6 @@ pub(crate) struct FileWriterHandle { } impl FileWriterHandle { - // TODO: add documentation fn process(&self, load: (Bytes, PackId, IndexPack)) -> RusticResult { let (file, id, mut index) = load; index.id = id; @@ -263,16 +263,18 @@ impl FileWriterHandle { } } -// TODO: add documentation +/// An actor which is responsible for writing pack files and indexing them using an indexer. +/// +/// This actor is responsible for the communication; the actual work is done by [`FileWriterHandle`]. #[derive(Clone)] -pub(crate) struct Actor { +pub(crate) struct FileWriter { /// The sender to send blobs to the raw packer. sender: Sender<(Bytes, IndexPack)>, /// The receiver to receive the status from the raw packer. finish: Receiver>, } -impl Actor { +impl FileWriter { /// Creates a new `Actor`. /// /// # Type Parameters From 839a49c0f45457602b5c9d4eee645897a77187ab Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Thu, 12 Jun 2025 08:08:59 +0200 Subject: [PATCH 18/26] WIP: Use padding for packfiles --- crates/core/src/archiver/file_archiver.rs | 2 +- crates/core/src/backend.rs | 4 +- crates/core/src/blob/packer.rs | 27 ++++- crates/core/src/blob/repopacker.rs | 20 +++- crates/core/src/commands/config.rs | 6 + crates/core/src/commands/prune.rs | 138 ++++++++++------------ crates/core/src/repofile/configfile.rs | 11 ++ 7 files changed, 125 insertions(+), 83 deletions(-) diff --git a/crates/core/src/archiver/file_archiver.rs b/crates/core/src/archiver/file_archiver.rs index 2fd66847f..8e3bbf851 100644 --- a/crates/core/src/archiver/file_archiver.rs +++ b/crates/core/src/archiver/file_archiver.rs @@ -135,7 +135,7 @@ impl<'a, I: ReadGlobalIndex> FileArchiver<'a, I> { // TODO: add documentation! fn backup_reader( &self, - r: impl Read + Send + 'static, + r: impl Read, node: Node, p: &impl Progress, ) -> RusticResult<(Node, u64)> { diff --git a/crates/core/src/backend.rs b/crates/core/src/backend.rs index 2179671b2..a08e17ed7 100644 --- a/crates/core/src/backend.rs +++ b/crates/core/src/backend.rs @@ -441,7 +441,7 @@ impl ReadSourceEntry { /// This trait is implemented by all backends that can read data and open from a source. pub trait ReadSourceOpen { /// The Reader used for this source - type Reader: Read + Send + 'static; + type Reader: Read; /// Opens the source.
/// @@ -456,7 +456,7 @@ pub trait ReadSourceOpen { } /// blanket implementation for readers -impl ReadSourceOpen for T { +impl ReadSourceOpen for T { type Reader = T; fn open(self) -> RusticResult { Ok(self) diff --git a/crates/core/src/blob/packer.rs b/crates/core/src/blob/packer.rs index aeea6abc1..1a0c20875 100644 --- a/crates/core/src/blob/packer.rs +++ b/crates/core/src/blob/packer.rs @@ -8,7 +8,7 @@ use log::warn; use crate::{ blob::{BlobId, BlobType}, - crypto::CryptoKey, + crypto::{CryptoKey, hasher::hash}, error::{ErrorKind, RusticError, RusticResult}, repofile::{ indexfile::IndexPack, @@ -113,6 +113,8 @@ pub(crate) struct Packer { pub pack_sizer: S, /// The packer stats pub stats: PackerStats, + /// Add a padding blob to disguise the pack size + add_padding: bool, } impl Packer { @@ -129,7 +131,7 @@ impl Packer { /// * `indexer` - The indexer to write to. /// * `config` - The config file. /// * `total_size` - The total size of the pack file. - pub fn new(key: C, pack_sizer: S, blob_type: BlobType) -> Self { + pub fn new(key: C, pack_sizer: S, blob_type: BlobType, add_padding: bool) -> Self { Self { key, blob_type, @@ -140,6 +142,7 @@ impl Packer { index: IndexPack::default(), pack_sizer, stats: PackerStats::default(), + add_padding, } } @@ -336,6 +339,9 @@ impl Packer { return Ok(None); } + if self.add_padding { + self.add_padding_blob()?; + } self.write_header()?; // prepare everything for write to the backend let file = std::mem::take(&mut self.file).into(); @@ -346,4 +352,21 @@ impl Packer { Ok(Some((file, index))) } + + // Add a padding blob + fn add_padding_blob(&mut self) -> RusticResult<()> { + // TODO: calculate reasonable padding size! + pub(super) const KB: usize = 1024; + pub(super) const MB: usize = 1024 * KB; + pub(super) const MIN_SIZE: usize = 512 * KB; + pub(super) const MAX_SIZE: usize = 8 * MB; + + // !!TODO!! -> change to reasonable padding! + let padding_size = 400; + let data = vec![0; padding_size]; + let id = BlobId(hash(&data)); + let data = self.key.encrypt_data(&data)?; + let padding_size = u64::try_from(padding_size).expect("padding_size should fit into u64"); + self.add(&data, &id, padding_size, None) + } } diff --git a/crates/core/src/blob/repopacker.rs b/crates/core/src/blob/repopacker.rs index 59bf309cd..49d798ac4 100644 --- a/crates/core/src/blob/repopacker.rs +++ b/crates/core/src/blob/repopacker.rs @@ -47,6 +47,14 @@ pub struct RepositoryPacker { } impl RepositoryPacker { + fn padding_from_config(blob_type: BlobType, config: &ConfigFile) -> bool { + if blob_type == BlobType::Data { + config.use_pack_padding() + } else { + false + } + } + #[allow(clippy::unnecessary_wraps)] pub fn new_with_default_sizer( be: BE, @@ -56,11 +64,10 @@ impl RepositoryPacker { total_size: u64, ) -> RusticResult { let pack_sizer = DefaultPackSizer::from_config(config, BlobType::Data, total_size); - Self::new(be, blob_type, indexer, pack_sizer) + let add_padding = Self::padding_from_config(blob_type, config); + Self::new(be, blob_type, indexer, pack_sizer, add_padding) } -} -impl RepositoryPacker { /// Creates a new `Packer`.
/// /// # Type Parameters @@ -85,8 +92,9 @@ impl RepositoryPacker { blob_type: BlobType, indexer: SharedIndexer, pack_sizer: S, + add_padding: bool, ) -> RusticResult { - let mut packer = Packer::new(*be.key(), pack_sizer, blob_type); + let mut packer = Packer::new(*be.key(), pack_sizer, blob_type, add_padding); let file_writer = FileWriter::new( FileWriterHandle { @@ -396,7 +404,9 @@ impl Repacker { ) -> RusticResult { let size_limit = DefaultPackSizer::from_config(config, blob_type, total_size).pack_size(); let repo_sizer = FixedPackSizer(size_limit); - let packer = RepositoryPacker::new(be.clone(), blob_type, indexer, repo_sizer)?; + let add_padding = RepositoryPacker::padding_from_config(blob_type, config); + let packer = + RepositoryPacker::new(be.clone(), blob_type, indexer, repo_sizer, add_padding)?; Ok(Self { be, packer, diff --git a/crates/core/src/commands/config.rs b/crates/core/src/commands/config.rs index 3673a57bc..8e3afc08b 100644 --- a/crates/core/src/commands/config.rs +++ b/crates/core/src/commands/config.rs @@ -109,6 +109,10 @@ pub struct ConfigOptions { #[cfg_attr(feature = "clap", clap(long, value_name = "VERSION"))] pub set_version: Option, + /// Set padding for data packs + #[cfg_attr(feature = "clap", clap(long))] + pub set_use_pack_padding: Option, + /// Set append-only mode. /// Note that only append-only commands work once this is set. `forget`, `prune` or `config` won't work any longer. #[cfg_attr(feature = "clap", clap(long))] @@ -209,6 +213,8 @@ impl ConfigOptions { config.version = version; } + config.use_pack_padding = self.set_use_pack_padding; + if let Some(compression) = self.set_compression { if config.version == 1 && compression != 0 { return Err(RusticError::new( diff --git a/crates/core/src/commands/prune.rs b/crates/core/src/commands/prune.rs index 28453c232..49c63f3b5 100644 --- a/crates/core/src/commands/prune.rs +++ b/crates/core/src/commands/prune.rs @@ -39,7 +39,7 @@ use crate::{ }, progress::{Progress, ProgressBars}, repofile::{ - HeaderEntry, IndexBlob, IndexFile, IndexPack, SnapshotFile, SnapshotId, indexfile::IndexId, + HeaderEntry, IndexFile, IndexPack, SnapshotFile, SnapshotId, indexfile::IndexId, packfile::PackId, }, repository::{Open, Repository}, @@ -237,16 +237,18 @@ pub enum PackStatus { TooSmall, HasUnusedBlobs, HasUsedBlobs, + HasPadding, Marked, } -#[derive(Debug, Clone, Copy, Serialize)] +#[derive(Debug, Default, Clone, Copy, Serialize)] pub struct DebugDetailedStats { pub packs: u64, pub unused_blobs: u64, pub unused_size: u64, pub used_blobs: u64, pub used_size: u64, + pub padding_size: u64, } #[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq, PartialOrd, Ord)] @@ -269,18 +271,13 @@ impl DebugStats { blob_type, status, }) - .or_insert(DebugDetailedStats { - packs: 0, - unused_blobs: 0, - unused_size: 0, - used_blobs: 0, - used_size: 0, - }); + .or_default(); details.packs += 1; details.unused_blobs += u64::from(pi.unused_blobs); details.unused_size += u64::from(pi.unused_size); details.used_blobs += u64::from(pi.used_blobs); details.used_size += u64::from(pi.used_size); + details.padding_size += u64::from(pi.padding_size); } } @@ -405,7 +402,7 @@ struct PruneIndex { impl PruneIndex { // TODO: add documentation! 
fn len(&self) -> usize { - self.packs.iter().map(|p| p.blobs.len()).sum() + self.packs.iter().map(|p| p.index.blobs.len()).sum() } } @@ -439,8 +436,6 @@ impl Default for PackToDo { /// A pack which is to be pruned #[derive(Debug)] struct PrunePack { - /// The id of the pack - id: PackId, /// The type of the pack blob_type: BlobType, /// The size of the pack @@ -449,10 +444,8 @@ struct PrunePack { delete_mark: bool, /// The task to be executed on the pack to_do: PackToDo, - /// The time the pack was created - time: Option>, - /// The blobs in the pack - blobs: Vec, + /// The pack as saved in the index + index: IndexPack, } impl PrunePack { @@ -463,14 +456,14 @@ impl PrunePack { /// * `p` - The `IndexPack` to create the `PrunePack` from /// * `delete_mark` - Whether the pack is marked for deletion fn from_index_pack(p: IndexPack, delete_mark: bool) -> Self { + let blob_type = p.blob_type(); + let size = p.pack_size(); Self { - id: p.id, - blob_type: p.blob_type(), - size: p.pack_size(), + index: p, + blob_type, + size, delete_mark, to_do: PackToDo::Undecided, - time: p.time, - blobs: p.blobs, } } @@ -494,12 +487,7 @@ impl PrunePack { /// Convert the `PrunePack` into an `IndexPack` fn into_index_pack(self) -> IndexPack { - IndexPack { - id: self.id, - time: self.time, - size: None, - blobs: self.blobs, - } + self.index } /// Convert the `PrunePack` into an `IndexPack` with the given time @@ -508,12 +496,9 @@ impl PrunePack { /// /// # Arguments /// /// * `time` - The time to set fn into_index_pack_with_time(self, time: DateTime) -> IndexPack { - IndexPack { - id: self.id, - time: Some(time), - size: None, - blobs: self.blobs, - } + let mut index_pack = self.into_index_pack(); + index_pack.time = Some(time); + index_pack } /// Set the task to be executed on the pack @@ -568,7 +553,8 @@ impl PrunePack { /// Returns whether the pack is compressed fn is_compressed(&self) -> bool { - self.blobs + self.index + .blobs .iter() .all(|blob| blob.uncompressed_length.is_some()) } @@ -577,6 +563,8 @@ impl PrunePack { /// Reasons why a pack should be repacked #[derive(PartialEq, Eq, Debug)] enum RepackReason { + /// The repack-all option was selected + RepackAll, /// The pack is partly used PartlyUsed, /// The pack is to be compressed @@ -658,7 +646,7 @@ impl PrunePlan { let mut modified = false; index.packs.retain(|p| { !p.delete_mark || { - let duplicate = processed_packs.contains(&p.id); + let duplicate = processed_packs.contains(&p.index.id); modified |= duplicate; !duplicate } @@ -794,7 +782,7 @@ impl PrunePlan { .index_files .iter() .flat_map(|index| &index.packs) - .flat_map(|pack| &pack.blobs) + .flat_map(|pack| &pack.index.blobs) { if let Some(count) = self.used_ids.get_mut(&blob.id) { // note that duplicates are only counted up to 255.
If there are more @@ -871,11 +859,15 @@ impl PrunePlan { let mut status = EnumSet::empty(); // Various checks to determine if packs need to be kept - let too_young = pack.time > Some(self.time - keep_pack); + let too_young = pack.index.time > Some(self.time - keep_pack); if too_young && !pack.delete_mark { _ = status.insert(PackStatus::TooYoung); } let keep_uncacheable = repack_cacheable_only && !pack.blob_type.is_cacheable(); + let has_padding = pi.padding_size != 0; + if has_padding { + _ = status.insert(PackStatus::HasPadding); + } let to_compress = repack_uncompressed && !pack.is_compressed(); if to_compress { @@ -906,22 +898,16 @@ impl PrunePlan { _ = status.insert(PackStatus::HasUsedBlobs); if too_young || keep_uncacheable { pack.set_todo(PackToDo::Keep, &pi, status, &mut self.stats); - } else if to_compress || repack_all { - self.repack_candidates.push(( - pi, - status, - RepackReason::ToCompress, - index_num, - pack_num, - )); - } else if size_mismatch { - self.repack_candidates.push(( - pi, - status, - RepackReason::SizeMismatch, - index_num, - pack_num, - )); + } else if to_compress || repack_all | size_mismatch { + let reason = if to_compress { + RepackReason::ToCompress + } else if repack_all { + RepackReason::RepackAll + } else { + RepackReason::SizeMismatch + }; + self.repack_candidates + .push((pi, status, reason, index_num, pack_num)); } else { pack.set_todo(PackToDo::Keep, &pi, status, &mut self.stats); } @@ -933,8 +919,8 @@ impl PrunePlan { status .insert_all(PackStatus::HasUsedBlobs | PackStatus::HasUnusedBlobs); - if too_young || keep_uncacheable { - // keep packs which are too young and non-cacheable packs if requested + if too_young || keep_uncacheable || has_padding { + // keep packs which have padding, are too young and non-cacheable packs if requested pack.set_todo(PackToDo::Keep, &pi, status, &mut self.stats); } else { // other partly used pack => candidate for repacking @@ -949,7 +935,7 @@ impl PrunePlan { } (true, 0, _) => { _ = status.insert(PackStatus::Marked); - match pack.time { + match pack.index.time { // unneeded and marked pack => check if we can remove it. Some(local_date_time) if self.time - local_date_time >= keep_delete => @@ -960,7 +946,7 @@ impl PrunePlan { None => { warn!( "pack to delete {}: no time set, this should not happen! Keeping this pack.", - pack.id + pack.index.id ); _ = status.insert(PackStatus::TimeNotSet); pack.set_todo( @@ -1081,7 +1067,7 @@ impl PrunePlan { /// * If a pack does not exist fn check_existing_packs(&mut self) -> RusticResult<()> { for pack in self.index_files.iter().flat_map(|index| &index.packs) { - let existing_size = self.existing_packs.remove(&pack.id); + let existing_size = self.existing_packs.remove(&pack.index.id); // TODO: Unused Packs which don't exist (i.e. only existing in index) let check_size = || { @@ -1091,12 +1077,12 @@ impl PrunePlan { ErrorKind::Internal, "Pack size `{size_in_pack_real}` of id `{pack_id}` does not match the expected size `{size_in_index_expected}` in the index file. 
", ) - .attach_context("pack_id", pack.id.to_string()) + .attach_context("pack_id", pack.index.id.to_string()) .attach_context("size_in_index_expected", pack.size.to_string()) .attach_context("size_in_pack_real", size.to_string()) .ask_report()), None => Err(RusticError::new(ErrorKind::Internal, "Pack `{pack_id}` does not exist.") - .attach_context("pack_id", pack.id.to_string()) + .attach_context("pack_id", pack.index.id.to_string()) .ask_report()), } }; @@ -1107,11 +1093,11 @@ impl PrunePlan { ErrorKind::Internal, "Pack `{pack_id}` got no decision what to do with it!", ) - .attach_context("pack_id", pack.id.to_string()) + .attach_context("pack_id", pack.index.id.to_string()) .ask_report()); } PackToDo::Keep | PackToDo::Recover => { - for blob in &pack.blobs { + for blob in &pack.index.blobs { _ = self.used_ids.remove(&blob.id); } check_size()?; @@ -1176,7 +1162,7 @@ impl PrunePlan { .iter() .flat_map(|index| &index.packs) .filter(|pack| pack.to_do == PackToDo::Repack) - .map(|pack| pack.id) + .map(|pack| pack.index.id) .collect() } @@ -1332,8 +1318,8 @@ pub(crate) fn prune_repository( let delete_pack = |pack: PrunePack| -> Option<_> { // delete pack match pack.blob_type { - BlobType::Data => data_packs_remove.lock().unwrap().push(pack.id), - BlobType::Tree => tree_packs_remove.lock().unwrap().push(pack.id), + BlobType::Data => data_packs_remove.lock().unwrap().push(pack.index.id), + BlobType::Tree => tree_packs_remove.lock().unwrap().push(pack.index.id), } None }; @@ -1365,7 +1351,7 @@ pub(crate) fn prune_repository( ErrorKind::Internal, "Pack `{pack_id}` got no decision what to do with it!", ) - .attach_context("pack_id", pack.id.to_string()) + .attach_context("pack_id", pack.index.id.to_string()) .ask_report()); } PackToDo::Keep => { @@ -1375,7 +1361,7 @@ pub(crate) fn prune_repository( } PackToDo::Repack => { // TODO: repack in parallel - for blob in &pack.blobs { + for blob in &pack.index.blobs { if used_ids.lock().unwrap().remove(&blob.id).is_none() { // don't save duplicate blobs continue; @@ -1386,9 +1372,9 @@ pub(crate) fn prune_repository( BlobType::Tree => &tree_repacker, }; if opts.fast_repack { - repacker.add_fast(&pack.id, blob)?; + repacker.add_fast(&pack.index.id, blob)?; } else { - repacker.add(&pack.id, blob)?; + repacker.add(&pack.index.id, blob)?; } p.inc(u64::from(blob.length)); } @@ -1415,7 +1401,7 @@ pub(crate) fn prune_repository( } else { // keep pack: add to new index; keep the timestamp. // Note the timestamp shouldn't be None here, however if it is not not set, use the current time to heal the entry! - let time = pack.time.unwrap_or(prune_plan.time); + let time = pack.index.time.unwrap_or(prune_plan.time); let pack = pack.into_index_pack_with_time(time); Some((pack, true)) } @@ -1473,6 +1459,8 @@ struct PackInfo { used_size: u32, /// The size of the unused blobs in the pack unused_size: u32, + // The size of an unused blob (also contained in unused_size) which may be a padding blob ; zero if there is no padding blob detected + padding_size: u32, } impl PartialOrd for PackInfo { @@ -1508,6 +1496,7 @@ impl PackInfo { unused_blobs: 0, used_size: 0, unused_size: 0, + padding_size: 0, }; // We search all blobs in the pack for needed ones. We do this by already marking @@ -1516,7 +1505,7 @@ impl PackInfo { // Note that by this processing, we are also able to handle duplicate blobs within a pack // correctly. // If we found a needed blob, we stop and process the information that the pack is actually needed. 
- let first_needed = pack.blobs.iter().position(|blob| { + let first_needed = pack.index.blobs.iter().position(|blob| { match used_ids.get_mut(&blob.id) { None | Some(0) => { pi.unused_size += blob.length; @@ -1542,7 +1531,7 @@ impl PackInfo { if let Some(first_needed) = first_needed { // The pack is actually needed. // We reprocess the blobs up to the first needed one and mark all blobs which are generally needed as used. - for blob in &pack.blobs[..first_needed] { + for blob in &pack.index.blobs[..first_needed] { match used_ids.get_mut(&blob.id) { None | Some(0) => {} // already correctly marked Some(count) => { @@ -1556,9 +1545,12 @@ impl PackInfo { } } // Then we process the remaining blobs and mark all blobs which are generally needed as used in this pack - for blob in &pack.blobs[first_needed + 1..] { + for (num, blob) in pack.index.blobs[first_needed + 1..].iter().enumerate() { match used_ids.get_mut(&blob.id) { None | Some(0) => { + if pi.unused_size == 0 && num == pack.index.blobs.len() - 1 { + pi.padding_size = blob.length; + } pi.unused_size += blob.length; pi.unused_blobs += 1; } diff --git a/crates/core/src/repofile/configfile.rs b/crates/core/src/repofile/configfile.rs index 52e4d11b4..33e0883ce 100644 --- a/crates/core/src/repofile/configfile.rs +++ b/crates/core/src/repofile/configfile.rs @@ -50,6 +50,11 @@ pub struct ConfigFile { /// The chunker polynomial used to chunk data pub chunker_polynomial: String, + /// Marker if this repository uses padding for pack files to mitigate chunking attacks. + /// + /// If not set, padding is used + pub use_pack_padding: Option, + /// Marker if this is a hot repository. If not set, this is no hot repository /// /// # Note @@ -166,6 +171,12 @@ impl ConfigFile { } } + /// Determine whether to use pack padding + #[must_use] + pub fn use_pack_padding(&self) -> bool { + self.use_pack_padding.unwrap_or(true) + } + /// Get whether an extra verification (decompressing/decrypting data before writing to the repository) should be performed. #[must_use] pub fn extra_verify(&self) -> bool { From 8f6ae7e15b02c6ba7599559494afa34d5ceb439b Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Mon, 30 Jun 2025 11:33:15 +0200 Subject: [PATCH 19/26] pad by filling up to next 64kiB; correct statistics --- crates/core/src/blob/packer.rs | 50 +++++++++---------- crates/core/src/repofile/packfile.rs | 8 +-- ...gration__backup-tar-summary-first-nix.snap | 2 +- ...gration__dryrun-tar-summary-first-nix.snap | 2 +- ...ntegration__stdin-command-summary-nix.snap | 2 +- 5 files changed, 32 insertions(+), 32 deletions(-) diff --git a/crates/core/src/blob/packer.rs b/crates/core/src/blob/packer.rs index 1a0c20875..ae840846f 100644 --- a/crates/core/src/blob/packer.rs +++ b/crates/core/src/blob/packer.rs @@ -11,8 +11,9 @@ use crate::{ crypto::{CryptoKey, hasher::hash}, error::{ErrorKind, RusticError, RusticResult}, repofile::{ + HeaderEntry, indexfile::IndexPack, - packfile::{PackHeaderLength, PackHeaderRef}, + packfile::{self, PackHeaderLength, PackHeaderRef}, snapshotfile::SnapshotSummary, }, }; @@ -165,7 +166,7 @@ impl Packer { /// /// The number of bytes written.
fn write_data(&mut self, data: &[u8]) -> PackerResult { - let len = data + let len: u32 = data .len() .try_into() .map_err(|err| PackerErrorKind::Conversion { @@ -173,6 +174,8 @@ impl Packer { from: "usize", source: err, })?; + let data_len_packed: u64 = len.into(); + self.stats.data_packed += data_len_packed; self.file.extend_from_slice(data); self.size += len; Ok(len) @@ -199,20 +202,8 @@ impl Packer { uncompressed_length: Option, ) -> RusticResult<()> { self.stats.blobs += 1; - self.stats.data += data_len; - let data_len_packed: u64 = data.len().try_into().map_err(|err| { - RusticError::with_source( - ErrorKind::Internal, - "Failed to convert data length `{length}` to u64.", - err, - ) - .attach_context("length", data.len().to_string()) - })?; - - self.stats.data_packed += data_len_packed; - let offset = self.size; let len = self.write_data(data).map_err(|err| { @@ -222,7 +213,6 @@ impl Packer { err, ) .attach_context("id", id.to_string()) - .attach_context("data_length_packed", data_len_packed.to_string()) })?; self.index @@ -356,17 +346,27 @@ impl Packer { // Add a padding blob fn add_padding_blob(&mut self) -> RusticResult<()> { // TODO: calculate reasonable padding size! - pub(super) const KB: usize = 1024; - pub(super) const MB: usize = 1024 * KB; - pub(super) const MIN_SIZE: usize = 512 * KB; - pub(super) const MAX_SIZE: usize = 8 * MB; - - // !!TODO!! -> change to reasonable padding! - let padding_size = 400; - let data = vec![0; padding_size]; + pub(super) const KB: u32 = 1024; + pub(super) const MAX_PADDING: u32 = 64 * KB; + + let size = PackHeaderRef::from_index_pack(&self.index).pack_size() + + HeaderEntry::ENTRY_LEN + + packfile::constants::COMP_OVERHEAD; + + let padding_size = MAX_PADDING - size % MAX_PADDING; + let data = vec![ + 0; + padding_size + .try_into() + .expect("u32 should convert to usize") + ]; let id = BlobId(hash(&data)); let data = self.key.encrypt_data(&data)?; - let padding_size = u64::try_from(padding_size).expect("padding_size should fit into u64"); - self.add(&data, &id, padding_size, None) + let padding_size = padding_size.into(); + self.add(&data, &id, padding_size, None)?; + // correct stats - padding should not contribute to blobs and data_added + self.stats.blobs -= 1; + self.stats.data -= padding_size; + Ok(()) } } diff --git a/crates/core/src/repofile/packfile.rs b/crates/core/src/repofile/packfile.rs index 916bf6943..98397e8bc 100644 --- a/crates/core/src/repofile/packfile.rs +++ b/crates/core/src/repofile/packfile.rs @@ -26,13 +26,13 @@ pub(crate) type PackFileResult = Result; impl_repoid!(PackId, FileType::Pack); -pub(super) mod constants { +pub(crate) mod constants { // 32 equals the size of the crypto overhead // TODO: use from crypto mod /// The overhead of compression and encryption - pub(super) const COMP_OVERHEAD: u32 = 32; + pub(crate) const COMP_OVERHEAD: u32 = 32; /// The length of the length field within the pack header - pub(super) const LENGTH_LEN: u32 = 4; + pub(crate) const LENGTH_LEN: u32 = 4; } /// The length field within the pack header (which is the total length of the pack header) @@ -131,7 +131,7 @@ pub enum HeaderEntry { impl HeaderEntry { /// The length of an uncompressed header entry - const ENTRY_LEN: u32 = 37; + pub(crate) const ENTRY_LEN: u32 = 37; /// The length of a compressed header entry pub(crate) const ENTRY_LEN_COMPRESSED: u32 = 41; diff --git a/crates/core/tests/snapshots/integration__backup-tar-summary-first-nix.snap b/crates/core/tests/snapshots/integration__backup-tar-summary-first-nix.snap index 
4280c5511..be194157e 100644 --- a/crates/core/tests/snapshots/integration__backup-tar-summary-first-nix.snap +++ b/crates/core/tests/snapshots/integration__backup-tar-summary-first-nix.snap @@ -30,7 +30,7 @@ SnapshotFile( data_added: "[data_added]", data_added_packed: "[data_added_packed]", data_added_files: 1125653, - data_added_files_packed: 78740, + data_added_files_packed: 131072, data_added_trees: "[data_added_trees]", data_added_trees_packed: "[data_added_trees_packed]", command: "[command]", diff --git a/crates/core/tests/snapshots/integration__dryrun-tar-summary-first-nix.snap b/crates/core/tests/snapshots/integration__dryrun-tar-summary-first-nix.snap index 8abb966ad..a851c5fc4 100644 --- a/crates/core/tests/snapshots/integration__dryrun-tar-summary-first-nix.snap +++ b/crates/core/tests/snapshots/integration__dryrun-tar-summary-first-nix.snap @@ -30,7 +30,7 @@ SnapshotFile( data_added: "[data_added]", data_added_packed: "[data_added_packed]", data_added_files: 1125653, - data_added_files_packed: 78740, + data_added_files_packed: 131072, data_added_trees: "[data_added_trees]", data_added_trees_packed: "[data_added_trees_packed]", command: "[command]", diff --git a/crates/core/tests/snapshots/integration__stdin-command-summary-nix.snap b/crates/core/tests/snapshots/integration__stdin-command-summary-nix.snap index e837cecb6..124997ca7 100644 --- a/crates/core/tests/snapshots/integration__stdin-command-summary-nix.snap +++ b/crates/core/tests/snapshots/integration__stdin-command-summary-nix.snap @@ -30,7 +30,7 @@ SnapshotFile( data_added: "[data_added]", data_added_packed: "[data_added_packed]", data_added_files: 5, - data_added_files_packed: 46, + data_added_files_packed: 65536, data_added_trees: "[data_added_trees]", data_added_trees_packed: "[data_added_trees_packed]", command: "[command]", From b76b12f6977b864fc32010e5adfd65138e4e1db6 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Wed, 2 Jul 2025 11:37:34 +0200 Subject: [PATCH 20/26] correct test --- .../tests/snapshots/integration__backup-tar-groups-nix.snap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core/tests/snapshots/integration__backup-tar-groups-nix.snap b/crates/core/tests/snapshots/integration__backup-tar-groups-nix.snap index d3ca3f2fb..16a391136 100644 --- a/crates/core/tests/snapshots/integration__backup-tar-groups-nix.snap +++ b/crates/core/tests/snapshots/integration__backup-tar-groups-nix.snap @@ -35,7 +35,7 @@ expression: snap data_added: "[data_added]", data_added_packed: "[data_added_packed]", data_added_files: 1125653, - data_added_files_packed: 78740, + data_added_files_packed: 131072, data_added_trees: "[data_added_trees]", data_added_trees_packed: "[data_added_trees_packed]", command: "[command]", From d2bdcd4e12c00dd6419a348f09bab683a5ef6148 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Wed, 2 Jul 2025 11:37:40 +0200 Subject: [PATCH 21/26] only check for padding blobs in prune if padding option is selected --- crates/core/src/commands/prune.rs | 127 +++++++++++++++--------------- 1 file changed, 65 insertions(+), 62 deletions(-) diff --git a/crates/core/src/commands/prune.rs b/crates/core/src/commands/prune.rs index 49c63f3b5..4bb1df8d5 100644 --- a/crates/core/src/commands/prune.rs +++ b/crates/core/src/commands/prune.rs @@ -183,7 +183,7 @@ impl PruneOptions { } /// Enum to specify a size limit -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, Default)] #[non_exhaustive] pub enum LimitOption { /// Size in bytes @@ -191,6 +191,7 @@ pub enum LimitOption 
{ /// Size in percentage of repository size Percentage(u64), /// No limit + #[default] Unlimited, } @@ -574,8 +575,27 @@ enum RepackReason { } /// A plan what should be repacked or removed by a `prune` run -#[derive(Debug)] +#[derive(Debug, Default)] +#[allow(clippy::struct_excessive_bools)] pub struct PrunePlan { + /// The how long to keep packs before considering repacking them + keep_pack: Duration, + /// The how long to keep pack marked for deletion before relly removing them + keep_delete: Duration, + /// indicates whether to only repack cacheable packs (i.e. tree packs) + repack_cacheable_only: bool, + /// indicates whether to repack uncompressed packs + repack_uncompressed: bool, + /// indicates whether to use padding for data packs + use_padding: bool, + /// indicates whether to repack all packs + repack_all: bool, + /// limit packs to repack + max_repack: LimitOption, + /// limit maximum unused size after repacking + max_unused: LimitOption, + /// indicates whether not to resize packs with non-fitting sizes + no_resize: bool, /// The time the plan was created time: DateTime, /// The ids of the blobs which are used @@ -659,9 +679,8 @@ impl PrunePlan { time: Local::now(), used_ids, existing_packs, - repack_candidates: Vec::new(), index_files, - stats: PruneStats::default(), + ..Default::default() } } @@ -733,42 +752,40 @@ impl PrunePlan { let mut pruner = Self::new(used_ids, existing_packs, index_files); pruner.count_used_blobs(); pruner.check()?; - let repack_cacheable_only = opts + + pruner.keep_pack = Duration::from_std(*opts.keep_pack).map_err(|err| { + RusticError::with_source( + ErrorKind::Internal, + "Failed to convert keep_pack duration `{keep_pack}` to std::time::Duration.", + err, + ) + .attach_context("keep_pack", opts.keep_pack.to_string()) + })?; + + pruner.keep_delete = Duration::from_std(*opts.keep_delete).map_err(|err| { + RusticError::with_source( + ErrorKind::Internal, + "Failed to convert keep_delete duration `{keep_delete}` to std::time::Duration.", + err, + ) + .attach_context("keep_delete", opts.keep_delete.to_string()) + })?; + + pruner.repack_cacheable_only = opts .repack_cacheable_only .unwrap_or_else(|| repo.config().is_hot == Some(true)); + pruner.repack_uncompressed = opts.repack_uncompressed; + pruner.repack_all = opts.repack_all; + pruner.use_padding = repo.config().use_pack_padding(); + pruner.max_repack = opts.max_repack; + pruner.max_unused = opts.max_unused; + pruner.no_resize = opts.no_resize; + let pack_sizer = total_size.map(|tpe, size| DefaultPackSizer::from_config(repo.config(), tpe, size)); - pruner.decide_packs( - Duration::from_std(*opts.keep_pack).map_err(|err| { - RusticError::with_source( - ErrorKind::Internal, - "Failed to convert keep_pack duration `{keep_pack}` to std::time::Duration.", - err, - ) - .attach_context("keep_pack", opts.keep_pack.to_string()) - })?, - Duration::from_std(*opts.keep_delete).map_err(|err| { - RusticError::with_source( - ErrorKind::Internal, - "Failed to convert keep_delete duration `{keep_delete}` to std::time::Duration.", - err, - ) - .attach_context("keep_delete", opts.keep_delete.to_string()) - })?, - repack_cacheable_only, - opts.repack_uncompressed, - opts.repack_all, - &pack_sizer, - )?; - - pruner.decide_repack( - &opts.max_repack, - &opts.max_unused, - opts.repack_uncompressed || opts.repack_all, - opts.no_resize, - &pack_sizer, - ); + pruner.decide_packs(&pack_sizer)?; + pruner.decide_repack(&pack_sizer); pruner.check_existing_packs()?; pruner.filter_index_files(opts.instant_delete); @@ -829,15 
+846,7 @@ impl PrunePlan { // TODO: add errors! #[allow(clippy::too_many_lines)] #[allow(clippy::unnecessary_wraps)] - fn decide_packs( - &mut self, - keep_pack: Duration, - keep_delete: Duration, - repack_cacheable_only: bool, - repack_uncompressed: bool, - repack_all: bool, - pack_sizer: &BlobTypeMap, - ) -> RusticResult<()> { + fn decide_packs(&mut self, pack_sizer: &BlobTypeMap) -> RusticResult<()> { // first process all marked packs then the unmarked ones: // - first processed packs are more likely to have all blobs seen as unused // - if marked packs have used blob but these blobs are all present in @@ -859,17 +868,18 @@ impl PrunePlan { let mut status = EnumSet::empty(); // Various checks to determine if packs need to be kept - let too_young = pack.index.time > Some(self.time - keep_pack); + let too_young = pack.index.time > Some(self.time - self.keep_pack); if too_young && !pack.delete_mark { _ = status.insert(PackStatus::TooYoung); } - let keep_uncacheable = repack_cacheable_only && !pack.blob_type.is_cacheable(); - let has_padding = pi.padding_size != 0; + let keep_uncacheable = + self.repack_cacheable_only && !pack.blob_type.is_cacheable(); + let has_padding = self.use_padding && pi.padding_size != 0; if has_padding { _ = status.insert(PackStatus::HasPadding); } - let to_compress = repack_uncompressed && !pack.is_compressed(); + let to_compress = self.repack_uncompressed && !pack.is_compressed(); if to_compress { _ = status.insert(PackStatus::NotCompressed); } @@ -898,10 +908,10 @@ impl PrunePlan { _ = status.insert(PackStatus::HasUsedBlobs); if too_young || keep_uncacheable { pack.set_todo(PackToDo::Keep, &pi, status, &mut self.stats); - } else if to_compress || repack_all | size_mismatch { + } else if to_compress || self.repack_all | size_mismatch { let reason = if to_compress { RepackReason::ToCompress - } else if repack_all { + } else if self.repack_all { RepackReason::RepackAll } else { RepackReason::SizeMismatch @@ -938,7 +948,7 @@ impl PrunePlan { match pack.index.time { // unneeded and marked pack => check if we can remove it. Some(local_date_time) - if self.time - local_date_time >= keep_delete => + if self.time - local_date_time >= self.keep_delete => { _ = status.insert(PackStatus::TooYoung); pack.set_todo(PackToDo::Delete, &pi, status, &mut self.stats); @@ -989,15 +999,8 @@ impl PrunePlan { /// # Errors /// // TODO: add errors! 
- fn decide_repack( - &mut self, - max_repack: &LimitOption, - max_unused: &LimitOption, - repack_uncompressed: bool, - no_resize: bool, - pack_sizer: &BlobTypeMap, - ) { - let max_unused = match (repack_uncompressed, max_unused) { + fn decide_repack(&mut self, pack_sizer: &BlobTypeMap) { + let max_unused = match (self.repack_uncompressed | self.repack_all, self.max_unused) { (true, _) => 0, (false, LimitOption::Unlimited) => u64::MAX, (false, LimitOption::Size(size)) => size.as_u64(), @@ -1007,7 +1010,7 @@ impl PrunePlan { (false, LimitOption::Percentage(p)) => (p * self.stats.size_sum().used) / (100 - p), }; - let max_repack = match max_repack { + let max_repack = match self.max_repack { LimitOption::Unlimited => u64::MAX, LimitOption::Size(size) => size.as_u64(), LimitOption::Percentage(p) => (p * self.stats.size_sum().total()) / 100, @@ -1029,7 +1032,7 @@ impl PrunePlan { || (self.stats.size_sum().unused_after_prune() < max_unused && repack_reason == RepackReason::PartlyUsed && blob_type == BlobType::Data) - || (repack_reason == RepackReason::SizeMismatch && no_resize) + || (repack_reason == RepackReason::SizeMismatch && self.no_resize) { pack.set_todo(PackToDo::Keep, &pi, status, &mut self.stats); } else if repack_reason == RepackReason::SizeMismatch { From 6a0736a77b813a10a10ad23ef01f87a7cb9ddc6a Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Wed, 2 Jul 2025 11:45:42 +0200 Subject: [PATCH 22/26] add test without padding --- crates/core/tests/integration/backup.rs | 10 +++++++--- .../integration__dryrun-tar-summary-first-nix.snap | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/crates/core/tests/integration/backup.rs b/crates/core/tests/integration/backup.rs index 8c19be0d1..9c6695745 100644 --- a/crates/core/tests/integration/backup.rs +++ b/crates/core/tests/integration/backup.rs @@ -9,8 +9,8 @@ use pretty_assertions::assert_eq; use rstest::rstest; use rustic_core::{ - BackupOptions, CommandInput, ParentOptions, PathList, SnapshotGroupCriterion, SnapshotOptions, - StringList, + BackupOptions, CommandInput, ConfigOptions, ParentOptions, PathList, SnapshotGroupCriterion, + SnapshotOptions, StringList, repofile::{PackId, SnapshotFile}, }; @@ -153,7 +153,11 @@ fn test_backup_dry_run_with_tar_gz_passes( insta_node_redaction: Settings, ) -> Result<()> { // Fixtures - let (source, repo) = (tar_gz_testdata?, set_up_repo?.to_indexed_ids()?); + let (source, mut repo) = (tar_gz_testdata?, set_up_repo?.to_indexed_ids()?); + + // here, we additionally test without pack padding + let opts = ConfigOptions::default().set_use_pack_padding(false); + assert!(repo.apply_config(&opts)?); let paths = &source.path_list(); diff --git a/crates/core/tests/snapshots/integration__dryrun-tar-summary-first-nix.snap b/crates/core/tests/snapshots/integration__dryrun-tar-summary-first-nix.snap index a851c5fc4..3b7b04ba2 100644 --- a/crates/core/tests/snapshots/integration__dryrun-tar-summary-first-nix.snap +++ b/crates/core/tests/snapshots/integration__dryrun-tar-summary-first-nix.snap @@ -30,7 +30,7 @@ SnapshotFile( data_added: "[data_added]", data_added_packed: "[data_added_packed]", data_added_files: 1125653, - data_added_files_packed: 131072, + data_added_files_packed: 81646, data_added_trees: "[data_added_trees]", data_added_trees_packed: "[data_added_trees_packed]", command: "[command]", From ec01585358d1fc767fe44c29f735665313e3939b Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Wed, 2 Jul 2025 14:13:02 +0200 Subject: [PATCH 23/26] use padding only for 1 test --- 
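A note on the snapshot values shuffled around in this patch and the previous ones (illustrative arithmetic only, under the assumption that the whole file data lands in a single padded data pack): with padding enabled, the packed size is the unpadded size rounded up to the next 64 KiB boundary, which is where 81646 -> 131072 and 46 -> 65536 came from; disabling padding in these tests restores the exact unpadded values. The helper name below is made up for the illustration:

    // round n up to the next multiple of 64 KiB, mirroring the padding granularity
    fn next_64k_multiple(n: u64) -> u64 {
        const K64: u64 = 64 * 1024;
        n.div_ceil(K64) * K64
    }

    fn main() {
        assert_eq!(next_64k_multiple(81646), 131072); // backup-tar summaries
        assert_eq!(next_64k_multiple(46), 65536); // stdin-command summary
    }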
crates/core/tests/integration/backup.rs | 22 +++++++++++-------- .../integration__backup-tar-groups-nix.snap | 2 +- ...gration__backup-tar-summary-first-nix.snap | 2 +- ...gration__dryrun-tar-summary-first-nix.snap | 2 +- ...ntegration__stdin-command-summary-nix.snap | 2 +- 5 files changed, 17 insertions(+), 13 deletions(-) diff --git a/crates/core/tests/integration/backup.rs b/crates/core/tests/integration/backup.rs index 9c6695745..c266db447 100644 --- a/crates/core/tests/integration/backup.rs +++ b/crates/core/tests/integration/backup.rs @@ -30,10 +30,13 @@ fn test_backup_with_tar_gz_passes( // SimpleLogger::init(log::LevelFilter::Debug, Config::default())?; // Fixtures - let (source, repo) = (tar_gz_testdata?, set_up_repo?.to_indexed_ids()?); - + let (source, mut repo) = (tar_gz_testdata?, set_up_repo?.to_indexed_ids()?); let paths = &source.path_list(); + // we don't use padding in this test as we want to compare packsize with insta! + let opts = ConfigOptions::default().set_use_pack_padding(false); + assert!(repo.apply_config(&opts)?); + // we use as_path to not depend on the actual tempdir let opts = BackupOptions::default().as_path(PathBuf::from_str("test")?); @@ -153,14 +156,11 @@ fn test_backup_dry_run_with_tar_gz_passes( insta_node_redaction: Settings, ) -> Result<()> { // Fixtures - let (source, mut repo) = (tar_gz_testdata?, set_up_repo?.to_indexed_ids()?); - - // here, we additionally test without pack padding - let opts = ConfigOptions::default().set_use_pack_padding(false); - assert!(repo.apply_config(&opts)?); - + let (source, repo) = (tar_gz_testdata?, set_up_repo?.to_indexed_ids()?); let paths = &source.path_list(); + // Note: padding is enabled (automatically) for this test! + // we use as_path to not depend on the actual tempdir let opts = BackupOptions::default() .as_path(PathBuf::from_str("test")?) @@ -226,9 +226,13 @@ fn test_backup_stdin_command( insta_snapshotfile_redaction: Settings, ) -> Result<()> { // Fixtures - let repo = set_up_repo?.to_indexed_ids()?; + let mut repo = set_up_repo?.to_indexed_ids()?; let paths = PathList::from_string("-")?; + // we don't use padding in this test as we want to compare packsize with insta! 
+ let opts = ConfigOptions::default().set_use_pack_padding(false); + assert!(repo.apply_config(&opts)?); + let cmd: CommandInput = "echo test".parse()?; let opts = BackupOptions::default() .stdin_filename("test") diff --git a/crates/core/tests/snapshots/integration__backup-tar-groups-nix.snap b/crates/core/tests/snapshots/integration__backup-tar-groups-nix.snap index 16a391136..b17a2b427 100644 --- a/crates/core/tests/snapshots/integration__backup-tar-groups-nix.snap +++ b/crates/core/tests/snapshots/integration__backup-tar-groups-nix.snap @@ -35,7 +35,7 @@ expression: snap data_added: "[data_added]", data_added_packed: "[data_added_packed]", data_added_files: 1125653, - data_added_files_packed: 131072, + data_added_files_packed: 81646, data_added_trees: "[data_added_trees]", data_added_trees_packed: "[data_added_trees_packed]", command: "[command]", diff --git a/crates/core/tests/snapshots/integration__backup-tar-summary-first-nix.snap b/crates/core/tests/snapshots/integration__backup-tar-summary-first-nix.snap index be194157e..bb70a3cc3 100644 --- a/crates/core/tests/snapshots/integration__backup-tar-summary-first-nix.snap +++ b/crates/core/tests/snapshots/integration__backup-tar-summary-first-nix.snap @@ -30,7 +30,7 @@ SnapshotFile( data_added: "[data_added]", data_added_packed: "[data_added_packed]", data_added_files: 1125653, - data_added_files_packed: 131072, + data_added_files_packed: 81646, data_added_trees: "[data_added_trees]", data_added_trees_packed: "[data_added_trees_packed]", command: "[command]", diff --git a/crates/core/tests/snapshots/integration__dryrun-tar-summary-first-nix.snap b/crates/core/tests/snapshots/integration__dryrun-tar-summary-first-nix.snap index 3b7b04ba2..a851c5fc4 100644 --- a/crates/core/tests/snapshots/integration__dryrun-tar-summary-first-nix.snap +++ b/crates/core/tests/snapshots/integration__dryrun-tar-summary-first-nix.snap @@ -30,7 +30,7 @@ SnapshotFile( data_added: "[data_added]", data_added_packed: "[data_added_packed]", data_added_files: 1125653, - data_added_files_packed: 81646, + data_added_files_packed: 131072, data_added_trees: "[data_added_trees]", data_added_trees_packed: "[data_added_trees_packed]", command: "[command]", diff --git a/crates/core/tests/snapshots/integration__stdin-command-summary-nix.snap b/crates/core/tests/snapshots/integration__stdin-command-summary-nix.snap index 124997ca7..31a7eb180 100644 --- a/crates/core/tests/snapshots/integration__stdin-command-summary-nix.snap +++ b/crates/core/tests/snapshots/integration__stdin-command-summary-nix.snap @@ -30,7 +30,7 @@ SnapshotFile( data_added: "[data_added]", data_added_packed: "[data_added_packed]", data_added_files: 5, - data_added_files_packed: 65536, + data_added_files_packed: 123, data_added_trees: "[data_added_trees]", data_added_trees_packed: "[data_added_trees_packed]", command: "[command]", From e08c82329c1a326d9d7c12a59b099dc85b7aebc5 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Wed, 2 Jul 2025 15:53:05 +0200 Subject: [PATCH 24/26] prevent 0-size padding; update comments --- crates/core/src/blob/packer.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/crates/core/src/blob/packer.rs b/crates/core/src/blob/packer.rs index ae840846f..ee1ed630b 100644 --- a/crates/core/src/blob/packer.rs +++ b/crates/core/src/blob/packer.rs @@ -345,15 +345,18 @@ impl Packer { // Add a padding blob fn add_padding_blob(&mut self) -> RusticResult<()> { - // TODO: calculate reasonable padding size! 
pub(super) const KB: u32 = 1024; pub(super) const MAX_PADDING: u32 = 64 * KB; + // compute current size including the HeaderEntry and crypt overhead of the padding blob to be added let size = PackHeaderRef::from_index_pack(&self.index).pack_size() + HeaderEntry::ENTRY_LEN + packfile::constants::COMP_OVERHEAD; - let padding_size = MAX_PADDING - size % MAX_PADDING; + // compute padding size. Note that we don't add zero-sized blobs here, i.e. padding_size is in 1..=MAX_PADDING. + let padding_size = MAX_PADDING - 1 - (size - 1) % MAX_PADDING; + + // write padding blob let data = vec![ 0; padding_size @@ -364,6 +367,7 @@ impl Packer { let data = self.key.encrypt_data(&data)?; let padding_size = padding_size.into(); self.add(&data, &id, padding_size, None)?; + // correct stats - padding should not contribute to blobs and data_added self.stats.blobs -= 1; self.stats.data -= padding_size; From 7a63042837f7f50faab3e5107f1ee026c599cfcc Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Wed, 2 Jul 2025 17:38:14 +0200 Subject: [PATCH 25/26] adapt windows tests --- .../tests/snapshots/integration__backup-tar-groups-windows.snap | 2 +- .../integration__backup-tar-summary-first-windows.snap | 2 +- .../integration__dryrun-tar-summary-first-windows.snap | 2 +- .../snapshots/integration__stdin-command-summary-windows.snap | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/core/tests/snapshots/integration__backup-tar-groups-windows.snap b/crates/core/tests/snapshots/integration__backup-tar-groups-windows.snap index 5e2bb4018..083d08819 100644 --- a/crates/core/tests/snapshots/integration__backup-tar-groups-windows.snap +++ b/crates/core/tests/snapshots/integration__backup-tar-groups-windows.snap @@ -35,7 +35,7 @@ expression: snap data_added: "[data_added]", data_added_packed: "[data_added_packed]", data_added_files: 1125653, - data_added_files_packed: 78740, + data_added_files_packed: 81646, data_added_trees: "[data_added_trees]", data_added_trees_packed: "[data_added_trees_packed]", command: "[command]", diff --git a/crates/core/tests/snapshots/integration__backup-tar-summary-first-windows.snap b/crates/core/tests/snapshots/integration__backup-tar-summary-first-windows.snap index 4280c5511..bb70a3cc3 100644 --- a/crates/core/tests/snapshots/integration__backup-tar-summary-first-windows.snap +++ b/crates/core/tests/snapshots/integration__backup-tar-summary-first-windows.snap @@ -30,7 +30,7 @@ SnapshotFile( data_added: "[data_added]", data_added_packed: "[data_added_packed]", data_added_files: 1125653, - data_added_files_packed: 78740, + data_added_files_packed: 81646, data_added_trees: "[data_added_trees]", data_added_trees_packed: "[data_added_trees_packed]", command: "[command]", diff --git a/crates/core/tests/snapshots/integration__dryrun-tar-summary-first-windows.snap b/crates/core/tests/snapshots/integration__dryrun-tar-summary-first-windows.snap index 8abb966ad..a851c5fc4 100644 --- a/crates/core/tests/snapshots/integration__dryrun-tar-summary-first-windows.snap +++ b/crates/core/tests/snapshots/integration__dryrun-tar-summary-first-windows.snap @@ -30,7 +30,7 @@ SnapshotFile( data_added: "[data_added]", data_added_packed: "[data_added_packed]", data_added_files: 1125653, - data_added_files_packed: 78740, + data_added_files_packed: 131072, data_added_trees: "[data_added_trees]", data_added_trees_packed: "[data_added_trees_packed]", command: "[command]", diff --git a/crates/core/tests/snapshots/integration__stdin-command-summary-windows.snap
b/crates/core/tests/snapshots/integration__stdin-command-summary-windows.snap index e837cecb6..31a7eb180 100644 --- a/crates/core/tests/snapshots/integration__stdin-command-summary-windows.snap +++ b/crates/core/tests/snapshots/integration__stdin-command-summary-windows.snap @@ -30,7 +30,7 @@ SnapshotFile( data_added: "[data_added]", data_added_packed: "[data_added_packed]", data_added_files: 5, - data_added_files_packed: 46, + data_added_files_packed: 123, data_added_trees: "[data_added_trees]", data_added_trees_packed: "[data_added_trees_packed]", command: "[command]", From 7738b7068bfea025c0fc6efba19452b7b5e76983 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Wed, 2 Jul 2025 20:02:58 +0200 Subject: [PATCH 26/26] correct padding size computation --- crates/core/src/blob/packer.rs | 35 ++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/crates/core/src/blob/packer.rs b/crates/core/src/blob/packer.rs index ee1ed630b..b0e0b3a77 100644 --- a/crates/core/src/blob/packer.rs +++ b/crates/core/src/blob/packer.rs @@ -47,6 +47,8 @@ pub(super) mod constants { pub(super) const MAX_COUNT: u32 = 10_000; /// The maximum age of a pack pub(super) const MAX_AGE: Duration = Duration::from_secs(300); + /// The maximum size used for padding + pub(super) const MAX_PADDING: u32 = 64 * KB; } // TODO: add documentation! @@ -353,8 +355,7 @@ impl Packer { + HeaderEntry::ENTRY_LEN + packfile::constants::COMP_OVERHEAD; - // compute padding size. Note that we don't add zero-sized blobs here, i.e. padding_size is in 1..=MAX_PADDING. - let padding_size = MAX_PADDING - 1 - (size - 1) % MAX_PADDING; + let padding_size = padding_size(size); // write padding blob let data = vec![ @@ -374,3 +375,33 @@ impl Packer { Ok(()) } } + +fn padding_size(size: u32) -> u32 { + // compute padding size. Note that we don't add zero-sized blobs here, i.e. padding_size is in 1..=MAX_PADDING. + let padding = constants::MAX_PADDING - size % constants::MAX_PADDING; + if padding == 0 { + constants::MAX_PADDING + } else { + padding + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_padding_size() { + assert_eq!(padding_size(1), constants::MAX_PADDING - 1); + assert_eq!(padding_size(constants::MAX_PADDING - 1), 1); + assert_eq!(padding_size(constants::MAX_PADDING), constants::MAX_PADDING); + assert_eq!( + padding_size(constants::MAX_PADDING + 1), + constants::MAX_PADDING - 1 + ); + assert_eq!( + padding_size(3 * constants::MAX_PADDING + 5), + constants::MAX_PADDING - 5 + ); + } +}
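As a closing illustration (a sketch, not part of the series): the invariant behind `padding_size` can also be stated as a property check. For every size, the padding stays within 1..=MAX_PADDING and the padded total lands exactly on a MAX_PADDING boundary. The snippet below inlines MAX_PADDING, since packer::constants is private, but otherwise uses the same formula as the final patch:

    fn padding_size(size: u32) -> u32 {
        const MAX_PADDING: u32 = 64 * 1024;
        // same formula as the patch: never returns 0; aligned sizes get a full MAX_PADDING blob
        let padding = MAX_PADDING - size % MAX_PADDING;
        if padding == 0 { MAX_PADDING } else { padding }
    }

    fn main() {
        const MAX_PADDING: u32 = 64 * 1024;
        for size in 1..=3 * MAX_PADDING {
            let padding = padding_size(size);
            assert!((1..=MAX_PADDING).contains(&padding));
            assert_eq!((size + padding) % MAX_PADDING, 0);
        }
    }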