From c45bd233f18f6c1fe0c10f2b974e4657e1aad23b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 23 Apr 2026 12:00:22 -0700 Subject: [PATCH 1/4] Add an error when minimizers/zipcodes seem older than the distance index --- src/index_registry.cpp | 16 ++++++++++++++++ src/index_registry.hpp | 7 +++++++ src/subcommand/giraffe_main.cpp | 31 ++++++++++++++++++------------- 3 files changed, 41 insertions(+), 13 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 9a8f2923d6..da6b39a766 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -5222,6 +5222,22 @@ vector IndexRegistry::require(const IndexName& identifier) const { return index->get_filenames(); } +bool IndexRegistry::predates(const IndexName& earlier, const IndexName& later) const { + // Get all the files + std::vector earlier_files = require(earlier); + std::vector later_files = require(later); + + // Get all their modification times + std::filesystem::file_time_type (*predicate)(const std::filesystem::path&) = std::filesystem::last_write_time; + std::vector earlier_times; + std::transform(earlier_files.begin(), earlier_files.end(), std::back_inserter(earlier_times), predicate); + std::vector later_times; + std::transform(later_files.begin(), later_files.end(), std::back_inserter(later_times), predicate); + + // Return if the earlier files are touched no later than the later files. + return std::max_element(earlier_times.begin(), earlier_times.end()) <= std::min_element(later_times.begin(), later_times.end());; +} + void IndexRegistry::set_target_memory_usage(int64_t bytes) { target_memory_usage = bytes; } diff --git a/src/index_registry.hpp b/src/index_registry.hpp index 3b7f58dd4c..7f20a5fd37 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -331,6 +331,13 @@ class IndexRegistry { /// Return true if the given index is available and can be require()'d, and /// false otherwise. bool available(const IndexName& identifier) const; + + /// For two available indexes, returns true if the modification times + /// on the eariler index are no later than those on the later index. + /// + /// Useful for enforcing that downstream indexes haven't had their upstream + /// indexes overwritten. + bool predates(const IndexName& earlier, const IndexName& later) const; /// Get the possible filename(s) associated with the given index with the given prefix. /// TODO: Get this to account for sample-scoped indexes. diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index a9cab2c577..0936040d6e 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -1756,42 +1756,47 @@ int main_giraffe(int argc, char** argv) { if (show_progress) { logger.info() << "Loading Minimizer Index" << endl; } + IndexName minimizer_indexname; unique_ptr minimizer_index; MinimizerIndexParameters::PayloadType payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES; if (map_long_reads) { if (use_path_minimizer) { - minimizer_index = vg::io::VPKG::load_one(registry.require("Long Read PathMinimizers").at(0)); + minimizer_indexname = "Long Read PathMinimizers"; payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS; } else { // Use the long read minimizers - minimizer_index = vg::io::VPKG::load_one(registry.require("Long Read Minimizers").at(0)); + minimizer_indexname = "Long Read Minimizers"; } } else { - minimizer_index = vg::io::VPKG::load_one(registry.require("Short Read Minimizers").at(0)); + minimizer_indexname = "Short Read Minimizers"; } + if (!registry.predates("Giraffe Distance Index", minimizer_indexname)) { + logger.error() << registry.require("Giraffe Distance Index").at(0) << " is newer than " << registry.require(minimizer_indexname).at(0) << " but should depend on it" << std::endl; + } + minimizer_index = vg::io::VPKG::load_one(registry.require(minimizer_indexname).at(0)); require_payload(*minimizer_index, payload_type); // Grab the zipcodes if (show_progress) { logger.info() << "Loading Zipcodes" << endl; } + IndexName oversized_zipcodes_indexname; ZipCodeCollection oversized_zipcodes; if (map_long_reads) { if (use_path_minimizer) { - ifstream zip_in (registry.require("Long Read PathZipcodes").at(0)); - oversized_zipcodes.deserialize(zip_in); - zip_in.close(); + oversized_zipcodes_indexname = "Long Read PathZipcodes"; } else { - ifstream zip_in (registry.require("Long Read Zipcodes").at(0)); - oversized_zipcodes.deserialize(zip_in); - zip_in.close(); + oversized_zipcodes_indexname = "Long Read Zipcodes"; } - } else { - ifstream zip_in (registry.require("Short Read Zipcodes").at(0)); - oversized_zipcodes.deserialize(zip_in); - zip_in.close(); + oversized_zipcodes_indexname = "Short Read Zipcodes"; + } + if (!registry.predates("Giraffe Distance Index", oversized_zipcodes_indexname)) { + logger.error() << registry.require("Giraffe Distance Index").at(0) << " is newer than " << registry.require(oversized_zipcodes_indexname).at(0) << " but should depend on it" << std::endl; } + ifstream zip_in (registry.require(oversized_zipcodes_indexname).at(0)); + oversized_zipcodes.deserialize(zip_in); + zip_in.close(); // Grab the GBZ From f1cb7e4f01db55b9623bed07272cc7a38e53e897 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 23 Apr 2026 12:22:38 -0700 Subject: [PATCH 2/4] Actually compare times and not iterators --- src/index_registry.cpp | 14 +++++++++++++- src/subcommand/giraffe_main.cpp | 4 ++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index da6b39a766..bd773a90b5 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -5227,6 +5227,14 @@ bool IndexRegistry::predates(const IndexName& earlier, const IndexName& later) c std::vector earlier_files = require(earlier); std::vector later_files = require(later); + // Make sure they're nonempty + if (earlier_files.empty()) { + throw std::runtime_error(earlier + " index has no files"); + } + if (later_files.empty()) { + throw std::runtime_error(later + " index has no files"); + } + // Get all their modification times std::filesystem::file_time_type (*predicate)(const std::filesystem::path&) = std::filesystem::last_write_time; std::vector earlier_times; @@ -5234,8 +5242,12 @@ bool IndexRegistry::predates(const IndexName& earlier, const IndexName& later) c std::vector later_times; std::transform(later_files.begin(), later_files.end(), std::back_inserter(later_times), predicate); + // Find where the times that shouldn't intersect are, and get them. + std::filesystem::file_time_type earlier_time = *std::max_element(earlier_times.begin(), earlier_times.end()); + std::filesystem::file_time_type later_time = *std::max_element(later_times.begin(), later_times.end()); + // Return if the earlier files are touched no later than the later files. - return std::max_element(earlier_times.begin(), earlier_times.end()) <= std::min_element(later_times.begin(), later_times.end());; + return earlier_time <= later_time; } void IndexRegistry::set_target_memory_usage(int64_t bytes) { diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 0936040d6e..5d1bbf77c2 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -1771,7 +1771,7 @@ int main_giraffe(int argc, char** argv) { minimizer_indexname = "Short Read Minimizers"; } if (!registry.predates("Giraffe Distance Index", minimizer_indexname)) { - logger.error() << registry.require("Giraffe Distance Index").at(0) << " is newer than " << registry.require(minimizer_indexname).at(0) << " but should depend on it" << std::endl; + logger.error() << registry.require("Giraffe Distance Index").at(0) << " is newer than " << registry.require(minimizer_indexname).at(0) << " which depends on it" << std::endl; } minimizer_index = vg::io::VPKG::load_one(registry.require(minimizer_indexname).at(0)); require_payload(*minimizer_index, payload_type); @@ -1792,7 +1792,7 @@ int main_giraffe(int argc, char** argv) { oversized_zipcodes_indexname = "Short Read Zipcodes"; } if (!registry.predates("Giraffe Distance Index", oversized_zipcodes_indexname)) { - logger.error() << registry.require("Giraffe Distance Index").at(0) << " is newer than " << registry.require(oversized_zipcodes_indexname).at(0) << " but should depend on it" << std::endl; + logger.error() << registry.require("Giraffe Distance Index").at(0) << " is newer than " << registry.require(oversized_zipcodes_indexname).at(0) << " which depends on it" << std::endl; } ifstream zip_in (registry.require(oversized_zipcodes_indexname).at(0)); oversized_zipcodes.deserialize(zip_in); From 0dd4050b78d7dd4c486ba5dfefb24710a3c3cb26 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 24 Apr 2026 08:24:44 -0700 Subject: [PATCH 3/4] Stop protoc from trying to run multiple times at once and destroying our generated headers --- deps/libvgio | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/libvgio b/deps/libvgio index fff151be9d..3026f7d28e 160000 --- a/deps/libvgio +++ b/deps/libvgio @@ -1 +1 @@ -Subproject commit fff151be9d8255672d91f32a5b41285584905743 +Subproject commit 3026f7d28ef1576982968aff4eed7adf5a10f262 From 8a7927870688f67baea01497100387fedd2923f6 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 24 Apr 2026 12:42:27 -0400 Subject: [PATCH 4/4] Make sure the distance index is on disk with any pointer fixups before saving the min/zipcodes files --- src/index_registry.cpp | 4 ++++ src/io/register_loader_saver_distance_index.cpp | 2 +- src/subcommand/giraffe_main.cpp | 8 ++++++++ src/subcommand/minimizer_main.cpp | 7 +++++-- 4 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index bd773a90b5..cb09dbcbfa 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -551,6 +551,10 @@ construct_minimizers_impl( *gbz, distance_index.get(), &oversized_zipcodes, params ); + // Close the distance index so it can't appear to be modified after the + // files that depend on it. + distance_index.reset(); + string output_name = plan->output_filepath(minimizer_output); save_minimizer(minimizers, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); output_name_minimizer.push_back(output_name); diff --git a/src/io/register_loader_saver_distance_index.cpp b/src/io/register_loader_saver_distance_index.cpp index 54245956a9..926dc0f3f6 100644 --- a/src/io/register_loader_saver_distance_index.cpp +++ b/src/io/register_loader_saver_distance_index.cpp @@ -1,6 +1,6 @@ /** * \file register_loader_saver_distance_index.cpp - * Defines IO for an XG index from stream files. + * Defines IO for a SnarlDistanceIndex index from stream files. */ #include diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 5d1bbf77c2..f1cf0250a5 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -1810,6 +1810,14 @@ int main_giraffe(int argc, char** argv) { if (show_progress) { logger.info() << "Loading Distance Index" << endl; } + // TODO: Now that we enforce that the minimizer and zipcodes files are + // newer than the distance index, we really shouldn't modify it ourselves + // by fixing any indirect pointers that may still be in it. So we should be + // able to open the file read-only and map the file read-only here, which + // in turn would solve problems with writable mappings being slow on shared + // filesystems even when not being written. But the VPKG system doesn't + // really support doing that, so we'd have to get the file descriptor + // manually and deserialize() on it and close() it later. auto distance_index = vg::io::VPKG::load_one(registry.require("Giraffe Distance Index").at(0)); if (show_progress) { diff --git a/src/subcommand/minimizer_main.cpp b/src/subcommand/minimizer_main.cpp index 8635df2a97..379e240463 100644 --- a/src/subcommand/minimizer_main.cpp +++ b/src/subcommand/minimizer_main.cpp @@ -102,13 +102,16 @@ int main_minimizer(int argc, char** argv) { config.params ); - // Serialize the index and the oversized zipcodes. + // Close the distance index so it can't seem to be modified after the files + // that depend on it. + distance_index.reset(); + + // Serialize the minimizer index and the oversized zipcodes. save_minimizer(index, config.output_name); if (!config.zipcode_name.empty()) { std::ofstream zip_out(config.zipcode_name); oversized_zipcodes.serialize(zip_out); zip_out.close(); - } if (config.progress) {