diff --git a/deps/libvgio b/deps/libvgio index fff151be9d..3026f7d28e 160000 --- a/deps/libvgio +++ b/deps/libvgio @@ -1 +1 @@ -Subproject commit fff151be9d8255672d91f32a5b41285584905743 +Subproject commit 3026f7d28ef1576982968aff4eed7adf5a10f262 diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 9a8f2923d6..cb09dbcbfa 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -551,6 +551,10 @@ construct_minimizers_impl( *gbz, distance_index.get(), &oversized_zipcodes, params ); + // Close the distance index so it can't appear to be modified after the + // files that depend on it. + distance_index.reset(); + string output_name = plan->output_filepath(minimizer_output); save_minimizer(minimizers, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); output_name_minimizer.push_back(output_name); @@ -5222,6 +5226,34 @@ vector IndexRegistry::require(const IndexName& identifier) const { return index->get_filenames(); } +bool IndexRegistry::predates(const IndexName& earlier, const IndexName& later) const { + // Get all the files + std::vector earlier_files = require(earlier); + std::vector later_files = require(later); + + // Make sure they're nonempty + if (earlier_files.empty()) { + throw std::runtime_error(earlier + " index has no files"); + } + if (later_files.empty()) { + throw std::runtime_error(later + " index has no files"); + } + + // Get all their modification times + std::filesystem::file_time_type (*predicate)(const std::filesystem::path&) = std::filesystem::last_write_time; + std::vector earlier_times; + std::transform(earlier_files.begin(), earlier_files.end(), std::back_inserter(earlier_times), predicate); + std::vector later_times; + std::transform(later_files.begin(), later_files.end(), std::back_inserter(later_times), predicate); + + // Find where the times that shouldn't intersect are, and get them. + std::filesystem::file_time_type earlier_time = *std::max_element(earlier_times.begin(), earlier_times.end()); + std::filesystem::file_time_type later_time = *std::max_element(later_times.begin(), later_times.end()); + + // Return if the earlier files are touched no later than the later files. + return earlier_time <= later_time; +} + void IndexRegistry::set_target_memory_usage(int64_t bytes) { target_memory_usage = bytes; } diff --git a/src/index_registry.hpp b/src/index_registry.hpp index 3b7f58dd4c..7f20a5fd37 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -331,6 +331,13 @@ class IndexRegistry { /// Return true if the given index is available and can be require()'d, and /// false otherwise. bool available(const IndexName& identifier) const; + + /// For two available indexes, returns true if the modification times + /// on the eariler index are no later than those on the later index. + /// + /// Useful for enforcing that downstream indexes haven't had their upstream + /// indexes overwritten. + bool predates(const IndexName& earlier, const IndexName& later) const; /// Get the possible filename(s) associated with the given index with the given prefix. /// TODO: Get this to account for sample-scoped indexes. diff --git a/src/io/register_loader_saver_distance_index.cpp b/src/io/register_loader_saver_distance_index.cpp index 54245956a9..926dc0f3f6 100644 --- a/src/io/register_loader_saver_distance_index.cpp +++ b/src/io/register_loader_saver_distance_index.cpp @@ -1,6 +1,6 @@ /** * \file register_loader_saver_distance_index.cpp - * Defines IO for an XG index from stream files. + * Defines IO for a SnarlDistanceIndex index from stream files. */ #include diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index a9cab2c577..f1cf0250a5 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -1756,42 +1756,47 @@ int main_giraffe(int argc, char** argv) { if (show_progress) { logger.info() << "Loading Minimizer Index" << endl; } + IndexName minimizer_indexname; unique_ptr minimizer_index; MinimizerIndexParameters::PayloadType payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES; if (map_long_reads) { if (use_path_minimizer) { - minimizer_index = vg::io::VPKG::load_one(registry.require("Long Read PathMinimizers").at(0)); + minimizer_indexname = "Long Read PathMinimizers"; payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS; } else { // Use the long read minimizers - minimizer_index = vg::io::VPKG::load_one(registry.require("Long Read Minimizers").at(0)); + minimizer_indexname = "Long Read Minimizers"; } } else { - minimizer_index = vg::io::VPKG::load_one(registry.require("Short Read Minimizers").at(0)); + minimizer_indexname = "Short Read Minimizers"; } + if (!registry.predates("Giraffe Distance Index", minimizer_indexname)) { + logger.error() << registry.require("Giraffe Distance Index").at(0) << " is newer than " << registry.require(minimizer_indexname).at(0) << " which depends on it" << std::endl; + } + minimizer_index = vg::io::VPKG::load_one(registry.require(minimizer_indexname).at(0)); require_payload(*minimizer_index, payload_type); // Grab the zipcodes if (show_progress) { logger.info() << "Loading Zipcodes" << endl; } + IndexName oversized_zipcodes_indexname; ZipCodeCollection oversized_zipcodes; if (map_long_reads) { if (use_path_minimizer) { - ifstream zip_in (registry.require("Long Read PathZipcodes").at(0)); - oversized_zipcodes.deserialize(zip_in); - zip_in.close(); + oversized_zipcodes_indexname = "Long Read PathZipcodes"; } else { - ifstream zip_in (registry.require("Long Read Zipcodes").at(0)); - oversized_zipcodes.deserialize(zip_in); - zip_in.close(); + oversized_zipcodes_indexname = "Long Read Zipcodes"; } - } else { - ifstream zip_in (registry.require("Short Read Zipcodes").at(0)); - oversized_zipcodes.deserialize(zip_in); - zip_in.close(); + oversized_zipcodes_indexname = "Short Read Zipcodes"; + } + if (!registry.predates("Giraffe Distance Index", oversized_zipcodes_indexname)) { + logger.error() << registry.require("Giraffe Distance Index").at(0) << " is newer than " << registry.require(oversized_zipcodes_indexname).at(0) << " which depends on it" << std::endl; } + ifstream zip_in (registry.require(oversized_zipcodes_indexname).at(0)); + oversized_zipcodes.deserialize(zip_in); + zip_in.close(); // Grab the GBZ @@ -1805,6 +1810,14 @@ int main_giraffe(int argc, char** argv) { if (show_progress) { logger.info() << "Loading Distance Index" << endl; } + // TODO: Now that we enforce that the minimizer and zipcodes files are + // newer than the distance index, we really shouldn't modify it ourselves + // by fixing any indirect pointers that may still be in it. So we should be + // able to open the file read-only and map the file read-only here, which + // in turn would solve problems with writable mappings being slow on shared + // filesystems even when not being written. But the VPKG system doesn't + // really support doing that, so we'd have to get the file descriptor + // manually and deserialize() on it and close() it later. auto distance_index = vg::io::VPKG::load_one(registry.require("Giraffe Distance Index").at(0)); if (show_progress) { diff --git a/src/subcommand/minimizer_main.cpp b/src/subcommand/minimizer_main.cpp index 8635df2a97..379e240463 100644 --- a/src/subcommand/minimizer_main.cpp +++ b/src/subcommand/minimizer_main.cpp @@ -102,13 +102,16 @@ int main_minimizer(int argc, char** argv) { config.params ); - // Serialize the index and the oversized zipcodes. + // Close the distance index so it can't seem to be modified after the files + // that depend on it. + distance_index.reset(); + + // Serialize the minimizer index and the oversized zipcodes. save_minimizer(index, config.output_name); if (!config.zipcode_name.empty()) { std::ofstream zip_out(config.zipcode_name); oversized_zipcodes.serialize(zip_out); zip_out.close(); - } if (config.progress) {