From d24ab45ee98a267987ce443ce27303a1ac3d83e7 Mon Sep 17 00:00:00 2001 From: smehringer Date: Mon, 21 Jul 2025 12:30:43 +0200 Subject: [PATCH 01/18] [FEATURE] Add the LSH algorithm. --- include/chopper/lsh.hpp | 199 ++++++++++++++++++++++++++++++++++++++++ test/api/CMakeLists.txt | 1 + test/api/lsh_test.cpp | 160 ++++++++++++++++++++++++++++++++ 3 files changed, 360 insertions(+) create mode 100644 include/chopper/lsh.hpp create mode 100644 test/api/lsh_test.cpp diff --git a/include/chopper/lsh.hpp b/include/chopper/lsh.hpp new file mode 100644 index 00000000..af67bfa9 --- /dev/null +++ b/include/chopper/lsh.hpp @@ -0,0 +1,199 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides chopper::Cluster, chopper::MultiCluster and chopper::LSH_find_representative_cluster.
+ * \author Svenja Mehringer + */ + +#pragma once + +#include + +namespace chopper +{ + +/*\brief foo + */ +struct Cluster +{ +protected: + size_t representative_id{}; // representative id of the cluster; identifier; + + std::vector user_bins{}; // the user bins contained in thus cluster + + std::optional moved_id{std::nullopt}; // where this Clusters user bins where moved to + +public: + Cluster() = default; + Cluster(const Cluster&) = default; + Cluster(Cluster&&) = default; + Cluster& operator=(const Cluster&) = default; + Cluster& operator=(Cluster&&) = default; + ~Cluster() = default; + + Cluster(size_t const id) + { + representative_id = id; + user_bins = {id}; + } + + Cluster(size_t const id, size_t const user_bins_id) + { + representative_id = id; + user_bins = {user_bins_id}; + } + + size_t id() const + { + return representative_id; + } + + std::vector const & contained_user_bins() const + { + return user_bins; + } + + bool has_been_moved() const + { + return moved_id.has_value(); + } + + bool empty() const + { + return user_bins.empty(); + } + + size_t size() const + { + return user_bins.size(); + } + + bool is_valid(size_t id) const + { + bool const ids_equal = representative_id == id; + bool const properly_moved = has_been_moved() && empty() && moved_id.has_value(); + bool const not_moved = !has_been_moved() && !empty(); + + return ids_equal && (properly_moved || not_moved); + } + + size_t moved_to_cluster_id() const + { + assert(moved_id.has_value()); + assert(is_valid(representative_id)); + return moved_id.value(); + } + + void move_to(Cluster & target_cluster) + { + target_cluster.user_bins.insert(target_cluster.user_bins.end(), this->user_bins.begin(), this->user_bins.end()); + this->user_bins.clear(); + moved_id = target_cluster.id(); + } + + void sort_by_cardinality(std::vector const & cardinalities) + { + std::sort(user_bins.begin(), user_bins.end(), [&cardinalities](auto const & v1, auto const & v2){ return cardinalities[v2] < cardinalities[v1]; 
}); + } +}; + +struct MultiCluster : Cluster +{ +protected: + std::vector> user_bins{}; // the user bins contained in this cluster + +public: + MultiCluster() = default; + MultiCluster(const MultiCluster&) = default; + MultiCluster(MultiCluster&&) = default; + MultiCluster& operator=(const MultiCluster&) = default; + MultiCluster& operator=(MultiCluster&&) = default; + ~MultiCluster() = default; + + MultiCluster(Cluster const & clust) + { + representative_id = clust.id(); + if (clust.has_been_moved()) + { + moved_id = clust.moved_to_cluster_id(); + } + else + { + user_bins.push_back(clust.contained_user_bins()); + } + } + + std::vector> const & contained_user_bins() const + { + return user_bins; + } + + // needs to be defined again because of redefinition of `user_bins` + bool empty() const + { + return user_bins.empty(); + } + + // needs to be defined again because of redefinition of `user_bins` + size_t size() const + { + return user_bins.size(); + } + + bool is_valid(size_t id) const + { + bool const ids_equal = representative_id == id; + bool const properly_moved = has_been_moved() && empty() && moved_id.has_value(); + bool const not_moved = !has_been_moved() && !empty(); + + return ids_equal && (properly_moved || not_moved); + } + + void move_to(MultiCluster & target_cluster) + { + target_cluster.user_bins.insert(target_cluster.user_bins.end(), this->user_bins.begin(), this->user_bins.end()); + this->user_bins.clear(); + moved_id = target_cluster.id(); + } + + // sort user bins within a Cluster by cardinality and the clusters themselves by size + void sort_by_cardinality(std::vector const & cardinalities) + { + auto cmp = [&cardinalities](auto const & v1, auto const & v2){ return cardinalities[v2] < cardinalities[v1]; }; + for (auto & user_bin_cluster : user_bins) + std::sort(user_bin_cluster.begin(), user_bin_cluster.end(), cmp); + + auto cmp_clusters = [](auto const & c1, auto const & c2){ return c2.size() < c1.size(); }; + std::sort(user_bins.begin(), 
user_bins.end(), cmp_clusters); + } +}; + + +// A valid cluster is one that hasn't been moved but actually contains user bins +// A valid cluster at position i is identified by the following equality: cluster[i].size() >= 1 && cluster[i][0] == i +// A moved cluster is one that has been joined and thereby moved to another cluster +// A moved cluster i is identified by the following: cluster[i].size() == 1 && cluster[i][0] != i +// returns position of the representative cluster +template +size_t LSH_find_representative_cluster(std::vector const & clusters, size_t current_id) +{ + std::reference_wrapper representative = clusters[current_id]; + + assert(representative.get().is_valid(current_id)); + + while (representative.get().has_been_moved()) + { + current_id = representative.get().moved_to_cluster_id(); + representative = clusters[current_id]; // replace by next cluster + assert(representative.get().is_valid(current_id)); + } + + return current_id; +} + +} // namespace chopper diff --git a/test/api/CMakeLists.txt b/test/api/CMakeLists.txt index 050affcf..3c46f8a4 100644 --- a/test/api/CMakeLists.txt +++ b/test/api/CMakeLists.txt @@ -9,5 +9,6 @@ include (add_subdirectories) add_api_test (config_test.cpp) add_api_test (input_functor_test.cpp) +add_api_test (lsh_test.cpp) add_subdirectories () diff --git a/test/api/lsh_test.cpp b/test/api/lsh_test.cpp new file mode 100644 index 00000000..71be4de1 --- /dev/null +++ b/test/api/lsh_test.cpp @@ -0,0 +1,160 @@ +#include // for Test, TestInfo, EXPECT_EQ, Message, TEST, TestPartResult + +#include // for size_t +#include // for operator<<, char_traits, basic_ostream, basic_stringstream, strings... 
+#include // for allocator, string +#include // for operator<< +#include // for vector + +#include + +TEST(Cluster_test, ctor_from_id) +{ + size_t user_bin_idx{5}; + chopper::Cluster const cluster{user_bin_idx}; + + EXPECT_EQ(cluster.id(), user_bin_idx); + EXPECT_FALSE(cluster.empty()); + EXPECT_EQ(cluster.size(), 1u); + ASSERT_EQ(cluster.contained_user_bins().size(), 1u); + EXPECT_EQ(cluster.contained_user_bins()[0], user_bin_idx); + EXPECT_TRUE(cluster.is_valid(user_bin_idx)); +} + +TEST(Cluster_test, move_to) +{ + size_t user_bin_idx1{5}; + size_t user_bin_idx2{7}; + chopper::Cluster cluster1{user_bin_idx1}; + chopper::Cluster cluster2{user_bin_idx2}; + + EXPECT_TRUE(cluster1.is_valid(user_bin_idx1)); + EXPECT_TRUE(cluster2.is_valid(user_bin_idx2)); + + cluster2.move_to(cluster1); + + // cluster1 now contains user bins 5 and 7 + EXPECT_EQ(cluster1.size(), 2u); + ASSERT_EQ(cluster1.contained_user_bins().size(), 2u); + EXPECT_EQ(cluster1.contained_user_bins()[0], user_bin_idx1); + EXPECT_EQ(cluster1.contained_user_bins()[1], user_bin_idx2); + + // cluster 2 is empty + EXPECT_TRUE(cluster2.has_been_moved()); + EXPECT_TRUE(cluster2.empty()); + EXPECT_EQ(cluster2.size(), 0u); + EXPECT_EQ(cluster2.contained_user_bins().size(), 0u); + EXPECT_EQ(cluster2.moved_to_cluster_id(), cluster1.id()); + + // both should still be valid + EXPECT_TRUE(cluster1.is_valid(user_bin_idx1)); + EXPECT_TRUE(cluster2.is_valid(user_bin_idx2)); +} + +TEST(Multicluster_test, ctor_from_cluster) +{ + chopper::Cluster const cluster1{5}; + chopper::MultiCluster const multi_cluster1{cluster1}; + + EXPECT_EQ(multi_cluster1.id(), cluster1.id()); + EXPECT_FALSE(multi_cluster1.empty()); + EXPECT_EQ(multi_cluster1.size(), 1u); + ASSERT_EQ(multi_cluster1.contained_user_bins().size(), 1u); + ASSERT_EQ(multi_cluster1.contained_user_bins()[0].size(), 1u); + EXPECT_EQ(multi_cluster1.contained_user_bins()[0][0], 5u); + EXPECT_TRUE(multi_cluster1.is_valid(cluster1.id())); +} + +TEST(Multicluster_test, 
ctor_from_moved_cluster) +{ + chopper::Cluster cluster1{5}; + chopper::Cluster cluster2{7}; + cluster2.move_to(cluster1); + + chopper::MultiCluster const multi_cluster2{cluster2}; + + EXPECT_EQ(multi_cluster2.id(), cluster2.id()); + EXPECT_TRUE(multi_cluster2.empty()); + EXPECT_TRUE(multi_cluster2.has_been_moved()); + EXPECT_EQ(multi_cluster2.moved_to_cluster_id(), cluster2.moved_to_cluster_id()); + EXPECT_EQ(multi_cluster2.size(), 0u); + ASSERT_EQ(multi_cluster2.contained_user_bins().size(), 0u); + EXPECT_TRUE(multi_cluster2.is_valid(cluster2.id())); +} + +TEST(Multicluster_test, move_to) +{ + chopper::Cluster cluster1{5}; + chopper::Cluster cluster2{7}; + cluster2.move_to(cluster1); + ASSERT_EQ(cluster1.size(), 2u); + chopper::Cluster const cluster3{13}; + + chopper::MultiCluster multi_cluster1{cluster1}; + EXPECT_TRUE(multi_cluster1.is_valid(cluster1.id())); + EXPECT_EQ(multi_cluster1.size(), 1u); + EXPECT_EQ(multi_cluster1.contained_user_bins().size(), 1u); + EXPECT_EQ(multi_cluster1.contained_user_bins()[0].size(), 2u); + + chopper::MultiCluster multi_cluster3{cluster3}; + EXPECT_TRUE(multi_cluster3.is_valid(cluster3.id())); + EXPECT_EQ(multi_cluster3.size(), 1u); + + multi_cluster1.move_to(multi_cluster3); + + EXPECT_TRUE(multi_cluster1.is_valid(cluster1.id())); + EXPECT_TRUE(multi_cluster3.is_valid(cluster3.id())); + + // multi_cluster1 has been moved and is empty now + EXPECT_TRUE(multi_cluster1.empty()); + EXPECT_EQ(multi_cluster1.size(), 0u); + EXPECT_TRUE(multi_cluster1.has_been_moved()); + EXPECT_EQ(multi_cluster1.moved_to_cluster_id(), multi_cluster3.id()); + + // multi_cluster3 contains 2 clusters now, {13} and {5, 7} + EXPECT_FALSE(multi_cluster3.empty()); + EXPECT_EQ(multi_cluster3.size(), 2u); // two clusters + ASSERT_EQ(multi_cluster3.contained_user_bins().size(), 2u); + ASSERT_EQ(multi_cluster3.contained_user_bins()[0].size(), 1u); + ASSERT_EQ(multi_cluster3.contained_user_bins()[1].size(), 2u); + 
ASSERT_EQ(multi_cluster3.contained_user_bins()[0][0], cluster3.id()); + ASSERT_EQ(multi_cluster3.contained_user_bins()[1][0], cluster1.id()); + ASSERT_EQ(multi_cluster3.contained_user_bins()[1][1], cluster2.id()); +} + +TEST(LSH_find_representative_cluster_test, cluster_one_move) +{ + std::vector clusters{chopper::Cluster{0}, chopper::Cluster{1}}; + clusters[1].move_to(clusters[0]); + + EXPECT_EQ(chopper::LSH_find_representative_cluster(clusters, clusters[1].id()), clusters[0].id()); +} + +TEST(LSH_find_representative_cluster_test, multi_cluster_one_move) +{ + std::vector mclusters{{chopper::Cluster{0}}, {chopper::Cluster{1}}}; + mclusters[1].move_to(mclusters[0]); + + EXPECT_EQ(chopper::LSH_find_representative_cluster(mclusters, mclusters[1].id()), mclusters[0].id()); + +} + +TEST(LSH_find_representative_cluster_test, cluster_two_moves) +{ + std::vector clusters{chopper::Cluster{0}, chopper::Cluster{1}, chopper::Cluster{2}}; + clusters[2].move_to(clusters[1]); + clusters[1].move_to(clusters[0]); + + EXPECT_EQ(chopper::LSH_find_representative_cluster(clusters, clusters[1].id()), clusters[0].id()); + EXPECT_EQ(chopper::LSH_find_representative_cluster(clusters, clusters[2].id()), clusters[0].id()); +} + +TEST(LSH_find_representative_cluster_test, multi_cluster_two_moves) +{ + std::vector mclusters{{chopper::Cluster{0}}, {chopper::Cluster{1}}, {chopper::Cluster{2}}}; + mclusters[2].move_to(mclusters[1]); + mclusters[1].move_to(mclusters[0]); + + EXPECT_EQ(chopper::LSH_find_representative_cluster(mclusters, mclusters[1].id()), mclusters[0].id()); + EXPECT_EQ(chopper::LSH_find_representative_cluster(mclusters, mclusters[2].id()), mclusters[0].id()); +} From ac45b94ea00a0a902e9d89d94dd9002e7cc9d7bb Mon Sep 17 00:00:00 2001 From: smehringer Date: Mon, 21 Jul 2025 12:33:31 +0200 Subject: [PATCH 02/18] add /build_debug/ to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 7dfea74f..90a24ee0 100644 --- a/.gitignore +++ 
b/.gitignore @@ -32,6 +32,7 @@ # build dir /build/ +/build_debug/ # editor configuration /.vscode/ From 73c62b06535a31a0425e2369e14c29e8d921f2f8 Mon Sep 17 00:00:00 2001 From: smehringer Date: Thu, 24 Jul 2025 21:39:42 +0200 Subject: [PATCH 03/18] [MISC] Add determine_split_bins_test. --- .../chopper/layout/determine_split_bins.hpp | 29 +++ src/layout/determine_split_bins.cpp | 152 ++++++++++++++ test/api/layout/CMakeLists.txt | 1 + test/api/layout/determine_split_bins_test.cpp | 192 ++++++++++++++++++ 4 files changed, 374 insertions(+) create mode 100644 include/chopper/layout/determine_split_bins.hpp create mode 100644 src/layout/determine_split_bins.cpp create mode 100644 test/api/layout/determine_split_bins_test.cpp diff --git a/include/chopper/layout/determine_split_bins.hpp b/include/chopper/layout/determine_split_bins.hpp new file mode 100644 index 00000000..dee17b24 --- /dev/null +++ b/include/chopper/layout/determine_split_bins.hpp @@ -0,0 +1,29 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides chopper::determine_split_bins. 
 + * \author Svenja Mehringer + */ + +#pragma once + +#include + +#include + +namespace chopper::layout +{ + +std::pair determine_split_bins(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + size_t const num_technical_bins, + size_t const num_user_bins, + std::vector> & partitions); + +} \ No newline at end of file diff --git a/src/layout/determine_split_bins.cpp b/src/layout/determine_split_bins.cpp new file mode 100644 index 00000000..71d946b8 --- /dev/null +++ b/src/layout/determine_split_bins.cpp @@ -0,0 +1,152 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include // for allocator, string +#include // for vector + +#include +#include +#include // for data_store + +#include +#include + +namespace chopper::layout +{ + +std::pair determine_split_bins(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + size_t const num_technical_bins, + size_t const num_user_bins, + std::vector> & partitions) +{ + if (num_user_bins == 0) + return {0,0}; + + assert(num_technical_bins > 0u); + assert(num_user_bins > 0u); + assert(num_user_bins <= num_technical_bins); + + auto const fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .hash_count = config.hibf_config.number_of_hash_functions, + .t_max = num_technical_bins}); + + std::vector> matrix(num_technical_bins); // rows + for (auto & v : matrix) +
v.resize(num_user_bins, std::numeric_limits::max()); // columns + + std::vector> trace(num_technical_bins); // rows + for (auto & v : trace) + v.resize(num_user_bins, std::numeric_limits::max()); // columns + + size_t const extra_bins = num_technical_bins - num_user_bins + 1; + + // initialize first column (first row is initialized with inf) + double const ub_cardinality = static_cast(cardinalities[positions[0]]); + for (size_t i = 0; i < extra_bins; ++i) + { + size_t const corrected_ub_cardinality = static_cast(ub_cardinality * fpr_correction[i + 1]); + matrix[i][0] = seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i + 1u); + } + + // we must iterate column wise + for (size_t j = 1; j < num_user_bins; ++j) + { + double const ub_cardinality = static_cast(cardinalities[positions[j]]); + + for (size_t i = j; i < j + extra_bins; ++i) + { + size_t minimum{std::numeric_limits::max()}; + + for (size_t i_prime = j - 1; i_prime < i; ++i_prime) + { + size_t const corrected_ub_cardinality = + static_cast(ub_cardinality * fpr_correction[(i - i_prime)]); + size_t score = + std::max(seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i - i_prime), matrix[i_prime][j - 1]); + + // std::cout << "j:" << j << " i:" << i << " i':" << i_prime << " score:" << score << std::endl; + + minimum = (score < minimum) ? 
(trace[i][j] = i_prime, score) : minimum; + } + + matrix[i][j] = minimum; + } + } + + // seqan::hibf::layout::print_matrix(matrix, num_technical_bins, num_user_bins, std::numeric_limits::max()); + //seqan::hibf::layout::print_matrix(trace, num_technical_bins, num_user_bins, std::numeric_limits::max()); + + // backtracking + // first, in the last column, find the row with minimum score (it can happen that the last rows are equally good) + // the less rows, the better + size_t trace_i{num_technical_bins - 1}; + size_t best_score{std::numeric_limits::max()}; + for (size_t best_i{0}; best_i < num_technical_bins; ++best_i) + { + if (matrix[best_i][num_user_bins - 1] < best_score) + { + best_score = matrix[best_i][num_user_bins - 1]; + trace_i = best_i; + } + } + size_t const number_of_split_tbs{trace_i + 1}; // trace_i is the position. So +1 for the number + + // now that we found the best trace_i start usual backtracking + size_t trace_j = num_user_bins - 1; + + // size_t max_id{}; + size_t max_size{}; + + size_t bin_id{}; + + while (trace_j > 0) + { + size_t next_i = trace[trace_i][trace_j]; + size_t const number_of_bins = (trace_i - next_i); + size_t const cardinality = cardinalities[positions[trace_j]]; + size_t const corrected_cardinality = static_cast(cardinality * fpr_correction[number_of_bins]); + size_t const cardinality_per_bin = seqan::hibf::divide_and_ceil(corrected_cardinality, number_of_bins); + + if (cardinality_per_bin > max_size) + { + // max_id = bin_id; + max_size = cardinality_per_bin; + } + + for (size_t splits{0}; splits < number_of_bins; ++splits) + { + partitions[partitions.size() - 1 - bin_id].push_back(positions[trace_j]); + ++bin_id; + } + + trace_i = trace[trace_i][trace_j]; + --trace_j; + } + ++trace_i; // because we want the length not the index. 
Now trace_i == number_of_bins + size_t const cardinality = cardinalities[positions[0]]; + size_t const corrected_cardinality = static_cast(cardinality * fpr_correction[trace_i]); + // NOLINTNEXTLINE(clang-analyzer-core.DivideZero) + size_t const cardinality_per_bin = seqan::hibf::divide_and_ceil(corrected_cardinality, trace_i); + + if (cardinality_per_bin > max_size) + { + // max_id = bin_id; + max_size = cardinality_per_bin; + } + + for (size_t splits{0}; splits < trace_i; ++splits) + { + partitions[partitions.size() - 1 - bin_id].push_back(positions[0]); + ++bin_id; + } + + return {number_of_split_tbs, max_size}; +} + +} // namespace chopper::layout diff --git a/test/api/layout/CMakeLists.txt b/test/api/layout/CMakeLists.txt index 9cdb4076..98c6cfe3 100644 --- a/test/api/layout/CMakeLists.txt +++ b/test/api/layout/CMakeLists.txt @@ -26,3 +26,4 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" endif () add_api_test (user_bin_io_test.cpp) add_api_test (input_test.cpp) +add_api_test (determine_split_bins_test.cpp) diff --git a/test/api/layout/determine_split_bins_test.cpp b/test/api/layout/determine_split_bins_test.cpp new file mode 100644 index 00000000..2af4c488 --- /dev/null +++ b/test/api/layout/determine_split_bins_test.cpp @@ -0,0 +1,192 @@ +#include // for Test, TestInfo, EXPECT_EQ, Message, TEST, TestPartResult + +#include // for operator<<, char_traits, basic_ostream, basic_stringstream, strings... 
+#include // for allocator, string +#include // for vector + +#include +#include // for data_store + +#include +#include + +std::pair determine_split_bins(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + size_t const num_technical_bins, + size_t const num_user_bins, + std::vector> & partitions) +{ + if (num_user_bins == 0) + return {0,0}; + + assert(num_technical_bins > 0u); + assert(num_user_bins > 0u); + assert(num_user_bins <= num_technical_bins); + + auto const fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .hash_count = config.hibf_config.number_of_hash_functions, + .t_max = num_technical_bins}); + + std::vector> matrix(num_technical_bins); // rows + for (auto & v : matrix) + v.resize(num_user_bins, std::numeric_limits::max()); // columns + + std::vector> trace(num_technical_bins); // rows + for (auto & v : trace) + v.resize(num_user_bins, std::numeric_limits::max()); // columns + + size_t const extra_bins = num_technical_bins - num_user_bins + 1; + + // initialize first column (first row is initialized with inf) + double const ub_cardinality = static_cast(cardinalities[positions[0]]); + for (size_t i = 0; i < extra_bins; ++i) + { + size_t const corrected_ub_cardinality = static_cast(ub_cardinality * fpr_correction[i + 1]); + matrix[i][0] = seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i + 1u); + } + + // we must iterate column wise + for (size_t j = 1; j < num_user_bins; ++j) + { + double const ub_cardinality = static_cast(cardinalities[positions[j]]); + + for (size_t i = j; i < j + extra_bins; ++i) + { + size_t minimum{std::numeric_limits::max()}; + + for (size_t i_prime = j - 1; i_prime < i; ++i_prime) + { + size_t const corrected_ub_cardinality = + static_cast(ub_cardinality * fpr_correction[(i - i_prime)]); + size_t score = + std::max(seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i - i_prime), matrix[i_prime][j - 1]); 
+ + // std::cout << "j:" << j << " i:" << i << " i':" << i_prime << " score:" << score << std::endl; + + minimum = (score < minimum) ? (trace[i][j] = i_prime, score) : minimum; + } + + matrix[i][j] = minimum; + } + } + + // seqan::hibf::layout::print_matrix(matrix, num_technical_bins, num_user_bins, std::numeric_limits::max()); + //seqan::hibf::layout::print_matrix(trace, num_technical_bins, num_user_bins, std::numeric_limits::max()); + + // backtracking + // first, in the last column, find the row with minimum score (it can happen that the last rows are equally good) + // the less rows, the better + size_t trace_i{num_technical_bins - 1}; + size_t best_score{std::numeric_limits::max()}; + for (size_t best_i{0}; best_i < num_technical_bins; ++best_i) + { + if (matrix[best_i][num_user_bins - 1] < best_score) + { + best_score = matrix[best_i][num_user_bins - 1]; + trace_i = best_i; + } + } + size_t const number_of_split_tbs{trace_i + 1}; // trace_i is the position. So +1 for the number + + // now that we found the best trace_i start usual backtracking + size_t trace_j = num_user_bins - 1; + + // size_t max_id{}; + size_t max_size{}; + + size_t bin_id{}; + + while (trace_j > 0) + { + size_t next_i = trace[trace_i][trace_j]; + size_t const number_of_bins = (trace_i - next_i); + size_t const cardinality = cardinalities[positions[trace_j]]; + size_t const corrected_cardinality = static_cast(cardinality * fpr_correction[number_of_bins]); + size_t const cardinality_per_bin = seqan::hibf::divide_and_ceil(corrected_cardinality, number_of_bins); + + if (cardinality_per_bin > max_size) + { + // max_id = bin_id; + max_size = cardinality_per_bin; + } + + for (size_t splits{0}; splits < number_of_bins; ++splits) + { + partitions[partitions.size() - 1 - bin_id].push_back(positions[trace_j]); + ++bin_id; + } + + trace_i = trace[trace_i][trace_j]; + --trace_j; + } + ++trace_i; // because we want the length not the index. 
Now trace_i == number_of_bins + size_t const cardinality = cardinalities[positions[0]]; + size_t const corrected_cardinality = static_cast(cardinality * fpr_correction[trace_i]); + // NOLINTNEXTLINE(clang-analyzer-core.DivideZero) + size_t const cardinality_per_bin = seqan::hibf::divide_and_ceil(corrected_cardinality, trace_i); + + if (cardinality_per_bin > max_size) + { + // max_id = bin_id; + max_size = cardinality_per_bin; + } + + for (size_t splits{0}; splits < trace_i; ++splits) + { + partitions[partitions.size() - 1 - bin_id].push_back(positions[0]); + ++bin_id; + } + + return {number_of_split_tbs, max_size}; +} + +TEST(simple_split, first) +{ + chopper::configuration config{}; + + std::vector cardinalities{}; + for (size_t i{0}; i < 20; ++i) + cardinalities.push_back(2400); + cardinalities.push_back(200); + for (size_t i{0}; i < 84; ++i) + cardinalities.push_back(50); + + std::vector positions(cardinalities.size()); + std::iota(positions.begin(), positions.end(), 0u); + + size_t num_technical_bins{63}; + size_t num_user_bins{21}; + + std::vector> partitions(64); + + auto const [num_splits, max_size] = determine_split_bins(config, positions, cardinalities, num_technical_bins, num_user_bins, partitions); + + EXPECT_EQ(num_splits, 61); + EXPECT_EQ(max_size, 1452); + + size_t ub_idx{20}; + size_t tb_idx{partitions.size() - 1}; + + // TBs: 0, 1, 2, 3, ... 
+ // UBs: 20,19,19,19,18,18,18,17,17,17,16,16,16,15,15,15,14,14,14,13,13,13,12,12,12,11,11,11,10,10,10,9,9,9,8,8,8,7,7,7,6,6,6,5,5,5,4,4,4,3,3,3,2,2,2,1,1,1,0,0,0, + + EXPECT_EQ(partitions[tb_idx][0], ub_idx); // single bin with card = 200 + --tb_idx; + --ub_idx; + + for (size_t i{partitions.size() - 1}; i > partitions.size() - num_splits - 1; --i) + std::cerr << partitions[i][0] << ","; + + + for (;tb_idx > partitions.size() - num_splits - 1; --tb_idx) + { + for (size_t three = 0; three < 3; ++three) + { + EXPECT_EQ(partitions[tb_idx][0], ub_idx); + --tb_idx; + } + ++tb_idx; // one too much in loop before + --ub_idx; + } +} \ No newline at end of file From 6fd5b75e84bdb5bd57993fdf284186a39cf96e60 Mon Sep 17 00:00:00 2001 From: smehringer Date: Thu, 24 Jul 2025 21:40:08 +0200 Subject: [PATCH 04/18] [TEST] Add simple fast layout test --- test/api/layout/CMakeLists.txt | 1 + test/api/layout/fast_layout_test.cpp | 310 +++++++++++++++++++++++++++ 2 files changed, 311 insertions(+) create mode 100644 test/api/layout/fast_layout_test.cpp diff --git a/test/api/layout/CMakeLists.txt b/test/api/layout/CMakeLists.txt index 98c6cfe3..45f5b1ce 100644 --- a/test/api/layout/CMakeLists.txt +++ b/test/api/layout/CMakeLists.txt @@ -9,6 +9,7 @@ cmake_minimum_required (VERSION 3.18) add_api_test (ibf_query_cost_test.cpp) add_api_test (execute_layout_test.cpp) +add_api_test (fast_layout_test.cpp) add_api_test (execute_with_estimation_test.cpp) target_use_datasources (execute_with_estimation_test FILES seq1.fa) diff --git a/test/api/layout/fast_layout_test.cpp b/test/api/layout/fast_layout_test.cpp new file mode 100644 index 00000000..2347f66b --- /dev/null +++ b/test/api/layout/fast_layout_test.cpp @@ -0,0 +1,310 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, 
modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "../api_test.hpp" +#include "print_debug_file.hpp" + +TEST(execute_test, many_ubs) +{ + seqan3::test::tmp_directory tmp_dir{}; + std::filesystem::path const layout_file{tmp_dir.path() / "layout.tsv"}; + + std::vector> many_filenames; + + for (size_t i{0}; i < 96u; ++i) + many_filenames.push_back({seqan3::detail::to_string("seq", i)}); + + // Creates sizes of the following series + // [100,101,...,120,222,223,...,241,343,344,...362,464,465,...,483,585,586,...,600] + // See also https://godbolt.org/z/9517eaaaG + auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it) + { + size_t const desired_kmer_count = 1001 * ((num + 20) / 20) + num; + for (auto hash : std::views::iota(num * 600, num * 600 + desired_kmer_count)) + it = hash; + }; + + chopper::configuration config{}; + config.output_filename = layout_file; + config.disable_sketch_output = true; + config.hibf_config.tmax = 64; + config.hibf_config.input_fn = simulated_input; + config.hibf_config.number_of_user_bins = many_filenames.size(); + config.hibf_config.disable_estimate_union = true; // also disables rearrangement + + std::vector sketches; + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); + + chopper::layout::execute(config, many_filenames, sketches, minHash_sketches); + + std::string const expected_file{ + "@CHOPPER_USER_BINS\n" + "@0 seq0\n" + "@1 seq1\n" + "@2 seq2\n" + "@3 seq3\n" + "@4 seq4\n" + "@5 seq5\n" + "@6 seq6\n" + "@7 seq7\n" + "@8 seq8\n" + "@9 seq9\n" + "@10 seq10\n" + "@11 seq11\n" + "@12 seq12\n" + "@13 
seq13\n" + "@14 seq14\n" + "@15 seq15\n" + "@16 seq16\n" + "@17 seq17\n" + "@18 seq18\n" + "@19 seq19\n" + "@20 seq20\n" + "@21 seq21\n" + "@22 seq22\n" + "@23 seq23\n" + "@24 seq24\n" + "@25 seq25\n" + "@26 seq26\n" + "@27 seq27\n" + "@28 seq28\n" + "@29 seq29\n" + "@30 seq30\n" + "@31 seq31\n" + "@32 seq32\n" + "@33 seq33\n" + "@34 seq34\n" + "@35 seq35\n" + "@36 seq36\n" + "@37 seq37\n" + "@38 seq38\n" + "@39 seq39\n" + "@40 seq40\n" + "@41 seq41\n" + "@42 seq42\n" + "@43 seq43\n" + "@44 seq44\n" + "@45 seq45\n" + "@46 seq46\n" + "@47 seq47\n" + "@48 seq48\n" + "@49 seq49\n" + "@50 seq50\n" + "@51 seq51\n" + "@52 seq52\n" + "@53 seq53\n" + "@54 seq54\n" + "@55 seq55\n" + "@56 seq56\n" + "@57 seq57\n" + "@58 seq58\n" + "@59 seq59\n" + "@60 seq60\n" + "@61 seq61\n" + "@62 seq62\n" + "@63 seq63\n" + "@64 seq64\n" + "@65 seq65\n" + "@66 seq66\n" + "@67 seq67\n" + "@68 seq68\n" + "@69 seq69\n" + "@70 seq70\n" + "@71 seq71\n" + "@72 seq72\n" + "@73 seq73\n" + "@74 seq74\n" + "@75 seq75\n" + "@76 seq76\n" + "@77 seq77\n" + "@78 seq78\n" + "@79 seq79\n" + "@80 seq80\n" + "@81 seq81\n" + "@82 seq82\n" + "@83 seq83\n" + "@84 seq84\n" + "@85 seq85\n" + "@86 seq86\n" + "@87 seq87\n" + "@88 seq88\n" + "@89 seq89\n" + "@90 seq90\n" + "@91 seq91\n" + "@92 seq92\n" + "@93 seq93\n" + "@94 seq94\n" + "@95 seq95\n" + "@CHOPPER_USER_BINS_END\n" + "@CHOPPER_CONFIG\n" + "@{\n" + "@ \"chopper_config\": {\n" + "@ \"version\": 2,\n" + "@ \"data_file\": {\n" + "@ \"value0\": \"\"\n" + "@ },\n" + "@ \"debug\": false,\n" + "@ \"sketch_directory\": {\n" + "@ \"value0\": \"\"\n" + "@ },\n" + "@ \"k\": 19,\n" + "@ \"window_size\": 19,\n" + "@ \"disable_sketch_output\": true,\n" + "@ \"precomputed_files\": false,\n" + "@ \"output_filename\": {\n" + "@ \"value0\": \"" + layout_file.string() + "\"\n" + "@ },\n" + "@ \"determine_best_tmax\": false,\n" + "@ \"force_all_binnings\": false\n" + "@ }\n" + "@}\n" + "@CHOPPER_CONFIG_END\n" + "@HIBF_CONFIG\n" + "@{\n" + "@ \"hibf_config\": {\n" + "@ 
\"version\": 2,\n" + "@ \"number_of_user_bins\": 96,\n" + "@ \"number_of_hash_functions\": 2,\n" + "@ \"maximum_fpr\": 0.05,\n" + "@ \"relaxed_fpr\": 0.3,\n" + "@ \"threads\": 1,\n" + "@ \"sketch_bits\": 12,\n" + "@ \"tmax\": 64,\n" + "@ \"empty_bin_fraction\": 0.0,\n" + "@ \"alpha\": 1.2,\n" + "@ \"max_rearrangement_ratio\": 0.5,\n" + "@ \"disable_estimate_union\": true,\n" + "@ \"disable_rearrangement\": true\n" + "@ }\n" + "@}\n" + "@HIBF_CONFIG_END\n" + "#TOP_LEVEL_IBF fullest_technical_bin_idx:23\n" + "#LOWER_LEVEL_IBF_1 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:28\n" + "#LOWER_LEVEL_IBF_3 fullest_technical_bin_idx:6\n" + "#LOWER_LEVEL_IBF_4 fullest_technical_bin_idx:15\n" + "#LOWER_LEVEL_IBF_5 fullest_technical_bin_idx:0\n" + "#LOWER_LEVEL_IBF_6 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_8 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_9 fullest_technical_bin_idx:35\n" + "#LOWER_LEVEL_IBF_12 fullest_technical_bin_idx:7\n" + "#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS\n" + "0 5;0 1;2\n" + "1 9;0 1;2\n" + "2 3;0 1;2\n" + "3 12;0 1;2\n" + "4 12;2 1;2\n" + "5 6;0 1;2\n" + "6 9;2 1;2\n" + "7 9;4 1;2\n" + "8 3;2 1;2\n" + "9 3;4 1;2\n" + "10 4;0 1;2\n" + "11 6;2 1;2\n" + "12 2;0 1;2\n" + "13 2;2 1;2\n" + "14 2;4 1;2\n" + "15 4;2 1;2\n" + "16 12;4 1;3\n" + "17 6;4 1;2\n" + "18 5;2 1;3\n" + "19 9;6 1;2\n" + "20 8;0 1;12\n" + "21 8;12 1;12\n" + "22 6;6 1;9\n" + "23 6;15 1;9\n" + "24 9;8 1;9\n" + "25 9;17 1;9\n" + "26 4;4 1;11\n" + "27 4;15 1;11\n" + "28 1;0 1;12\n" + "29 1;12 1;12\n" + "30 2;6 1;11\n" + "31 2;17 1;11\n" + "32 12;7 1;13\n" + "33 6;24 1;9\n" + "34 5;5 1;14\n" + "35 9;26 1;9\n" + "36 3;6 1;13\n" + "37 22 1\n" + "38 21 1\n" + "39 20 1\n" + "40 15 1\n" + "41 18 1\n" + "42 19 1\n" + "43 13 1\n" + "44 17 1\n" + "45 14 1\n" + "46 10 1\n" + "47 16 1\n" + "48 11 1\n" + "49 8;24 1;40\n" + "50 12;20 1;44\n" + "51 6;33 1;31\n" + "52 5;19 1;45\n" + "53 9;35 1;29\n" + "54 3;19 1;45\n" + "55 4;26 
1;38\n" + "56 7 1\n" + "57 1;24 1;40\n" + "58 0 1\n" + "59 2;28 1;36\n" + "60 63 1\n" + "61 62 1\n" + "62 60 1\n" + "63 61 1\n" + "64 59 1\n" + "65 58 1\n" + "66 57 1\n" + "67 55 1\n" + "68 56 1\n" + "69 54 1\n" + "70 52 1\n" + "71 53 1\n" + "72 51 1\n" + "73 49 1\n" + "74 48 1\n" + "75 50 1\n" + "76 45 1\n" + "77 47 1\n" + "78 46 1\n" + "79 44 1\n" + "80 39 1\n" + "81 40 1\n" + "82 41 1\n" + "83 42 1\n" + "84 38 1\n" + "85 37 1\n" + "86 43 1\n" + "87 36 1\n" + "88 35 1\n" + "89 34 1\n" + "90 29 2\n" + "91 31 2\n" + "92 27 2\n" + "93 23 2\n" + "94 33 1\n" + "95 25 2\n"}; + std::string const actual_file{string_from_file(layout_file)}; + + EXPECT_EQ(actual_file, expected_file) << actual_file << std::endl; +} From 85da3cdb8fffd257f076b635b43934c11fcdc25d Mon Sep 17 00:00:00 2001 From: smehringer Date: Thu, 24 Jul 2025 21:41:27 +0200 Subject: [PATCH 05/18] [FEATURE] fast layout algorthm (still ugly). --- include/chopper/configuration.hpp | 16 + include/chopper/layout/execute.hpp | 4 +- src/chopper_layout.cpp | 19 +- src/layout/CMakeLists.txt | 2 +- src/layout/execute.cpp | 1880 +++++++++++++++++++++++++++- src/layout/hibf_statistics.cpp | 2 +- 6 files changed, 1887 insertions(+), 36 deletions(-) diff --git a/include/chopper/configuration.hpp b/include/chopper/configuration.hpp index bae58feb..825bc407 100644 --- a/include/chopper/configuration.hpp +++ b/include/chopper/configuration.hpp @@ -20,8 +20,20 @@ namespace chopper { +enum partitioning_scheme +{ + blocked, // 0 + sorted, // 1 + folded, // 2 + weighted_fold, // 3 + similarity, // 4 + lsh, // 5 + lsh_sim // 6 +}; + struct configuration { + int partitioning_approach{6}; /*!\name General Configuration * \{ */ @@ -77,6 +89,10 @@ struct configuration mutable seqan::hibf::concurrent_timer union_estimation_timer{}; mutable seqan::hibf::concurrent_timer rearrangement_timer{}; mutable seqan::hibf::concurrent_timer dp_algorithm_timer{}; + mutable seqan::hibf::concurrent_timer lsh_algorithm_timer{}; + mutable 
seqan::hibf::concurrent_timer search_partition_algorithm_timer{}; + mutable seqan::hibf::concurrent_timer intital_partition_timer{}; + mutable seqan::hibf::concurrent_timer small_layouts_timer{}; void read_from(std::istream & stream); diff --git a/include/chopper/layout/execute.hpp b/include/chopper/layout/execute.hpp index 52bfc3b7..00c4fa2e 100644 --- a/include/chopper/layout/execute.hpp +++ b/include/chopper/layout/execute.hpp @@ -13,12 +13,14 @@ #include #include +#include namespace chopper::layout { int execute(chopper::configuration & config, std::vector> const & filenames, - std::vector const & sketches); + std::vector const & sketches, + std::vector const & minHash_sketches); } // namespace chopper::layout diff --git a/src/chopper_layout.cpp b/src/chopper_layout.cpp index 5fda4ddb..9df893a3 100644 --- a/src/chopper_layout.cpp +++ b/src/chopper_layout.cpp @@ -93,6 +93,7 @@ int chopper_layout(chopper::configuration & config, sharg::parser & parser) std::vector> filenames{}; std::vector sketches{}; + std::vector minHash_sketches{}; if (input_is_a_sketch_file) { @@ -106,6 +107,7 @@ int chopper_layout(chopper::configuration & config, sharg::parser & parser) filenames = std::move(sin.filenames); // No need to call check_filenames because the files are not read. 
sketches = std::move(sin.hll_sketches); + minHash_sketches = std::move(sin.minHash_sketches); validate_configuration(parser, config, sin.chopper_config); } else @@ -128,17 +130,18 @@ int chopper_layout(chopper::configuration & config, sharg::parser & parser) if (!input_is_a_sketch_file) { config.compute_sketches_timer.start(); - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); config.compute_sketches_timer.stop(); } - exit_code |= chopper::layout::execute(config, filenames, sketches); + exit_code |= chopper::layout::execute(config, filenames, sketches, minHash_sketches); if (!config.disable_sketch_output) { chopper::sketch::sketch_file sout{.chopper_config = config, .filenames = std::move(filenames), - .hll_sketches = std::move(sketches)}; + .hll_sketches = std::move(sketches), + .minHash_sketches = std::move(minHash_sketches)}; std::ofstream os{config.sketch_directory, std::ios::binary}; cereal::BinaryOutputArchive oarchive{os}; oarchive(sout); @@ -151,11 +154,19 @@ int chopper_layout(chopper::configuration & config, sharg::parser & parser) output_stream << "sketching_in_seconds\t" << "layouting_in_seconds\t" << "union_estimation_in_seconds\t" - << "rearrangement_in_seconds\n"; + << "rearrangement_in_seconds\t" + << "lsh_in_seconds\t" + << "intital_partition_timer_in_seconds\t" + << "small_layouts_timer_in_seconds\t" + << "search_best_p_in_seconds\n"; output_stream << config.compute_sketches_timer.in_seconds() << '\t'; output_stream << config.dp_algorithm_timer.in_seconds() << '\t'; output_stream << config.union_estimation_timer.in_seconds() << '\t'; output_stream << config.rearrangement_timer.in_seconds() << '\t'; + output_stream << config.lsh_algorithm_timer.in_seconds() << '\t'; + output_stream << config.intital_partition_timer.in_seconds() << '\t'; + output_stream << config.small_layouts_timer.in_seconds() << '\t'; + output_stream << 
config.search_partition_algorithm_timer.in_seconds() << '\n'; } return exit_code; diff --git a/src/layout/CMakeLists.txt b/src/layout/CMakeLists.txt index 915c83d7..4c2542c8 100644 --- a/src/layout/CMakeLists.txt +++ b/src/layout/CMakeLists.txt @@ -5,7 +5,7 @@ if (TARGET chopper::layout) endif () add_library (chopper_layout STATIC determine_best_number_of_technical_bins.cpp execute.cpp hibf_statistics.cpp - ibf_query_cost.cpp input.cpp output.cpp + ibf_query_cost.cpp input.cpp output.cpp determine_split_bins.cpp ) target_link_libraries (chopper_layout PUBLIC chopper::shared) add_library (chopper::layout ALIAS chopper_layout) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 6630447e..fd02cd0d 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -5,6 +5,7 @@ // shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md // --------------------------------------------------------------------------------------------------- +#include #include #include #include @@ -12,67 +13,1888 @@ #include #include #include +#include +#include #include #include #include #include +#include + +#include #include +#include #include #include #include #include #include +#include +#include +#include +#include // for compute_relaxed_fpr_correction #include #include +#include #include #include // for estimate_kmer_counts #include +#include namespace chopper::layout { -int execute(chopper::configuration & config, - std::vector> const & filenames, - std::vector const & sketches) +uint64_t lsh_hash_the_sketch(std::vector const & sketch, size_t const number_of_hashes_to_consider) { - config.hibf_config.validate_and_set_defaults(); + // lets just compute the sum + uint64_t sum{0}; - seqan::hibf::layout::layout hibf_layout; - std::vector kmer_counts; - seqan::hibf::sketch::estimate_kmer_counts(sketches, kmer_counts); + for (size_t i = 0; i < number_of_hashes_to_consider; ++i) + sum += sketch[i]; + + return sum; +} + +auto 
LSH_fill_hashtable(std::vector const & clusters, + std::vector const & minHash_sketches, + size_t const current_sketch_index, + size_t const current_number_of_sketch_hashes) +{ + robin_hood::unordered_flat_map> table; - if (config.determine_best_tmax) + size_t processed_user_bins{0}; // only for sanity check + for (size_t pos = 0; pos < clusters.size(); ++pos) { - hibf_layout = determine_best_number_of_technical_bins(config, kmer_counts, sketches); + auto const & current = clusters[pos]; + assert(current.is_valid(pos)); + + if (current.has_been_moved()) // cluster has been moved somewhere else, don't process + continue; + + for (size_t const user_bin_idx : current.contained_user_bins()) + { + ++processed_user_bins; + uint64_t const key = lsh_hash_the_sketch(minHash_sketches[user_bin_idx].table[current_sketch_index], current_number_of_sketch_hashes); + table[key].push_back(current.id()); // insert representative for all user bins + } } - else + assert(processed_user_bins == clusters.size()); // all user bins should've been processed by one of the clusters + + return table; +} + +auto LSH_fill_hashtable(std::vector const & clusters, + std::vector const & minHash_sketches, + size_t const current_sketch_index, + size_t const current_number_of_sketch_hashes) +{ + robin_hood::unordered_flat_map> table; + + size_t processed_user_bins{0}; // only for sanity check + for (size_t pos = 0; pos < clusters.size(); ++pos) + { + auto const & current = clusters[pos]; + assert(current.is_valid(pos)); + + if (current.has_been_moved()) // cluster has been moved somewhere else, don't process + continue; + + for (auto const & similarity_cluster : current.contained_user_bins()) + { + for (size_t const user_bin_idx : similarity_cluster) + { + ++processed_user_bins; + uint64_t const key = lsh_hash_the_sketch(minHash_sketches[user_bin_idx].table[current_sketch_index], current_number_of_sketch_hashes); + table[key].push_back(current.id()); // insert representative for all user bins + } + } + 
} + assert(processed_user_bins == clusters.size()); // all user bins should've been processed by one of the clusters + + return table; +} + +// minHash_sketches data structure: +// Vector L1 : number of user bins +// Vector L2 : number_of_max_minHash_sketches (LSH ADD+OR parameter b) +// Vector L3 : minHash_sketche_size (LSH ADD+OR parameter r) +std::vector initital_LSH_partitioning(std::vector const & minHash_sketches, + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + size_t const average_technical_bin_size, + chopper::configuration const & config) +{ + assert(!minHash_sketches.empty()); + assert(!minHash_sketches[0].table.empty()); + assert(!minHash_sketches[0].table[0].empty()); + + size_t const number_of_user_bins{positions.size()}; + size_t const number_of_max_minHash_sketches{seqan::hibf::sketch::minhashes::num_sketches}; // LSH ADD+OR parameter b + size_t const minHash_sketche_size{seqan::hibf::sketch::minhashes::sketch_size}; // LSH ADD+OR parameter r + seqan::hibf::sketch::hyperloglog const empty_sketch{config.hibf_config.sketch_bits}; + + // initialise clusters with a signle user bin per cluster. + // clusters are either + // 1) of size 1; containing an id != position where the id points to the cluster it has been moved to + // e.g. cluster[Y] = {Z} (Y has been moved into Z, so Z could look likes this cluster[Z] = {Z, Y}) + // 2) of size >= 1; with the first entry beging id == position (a valid cluster) + // e.g. cluster[X] = {X} // valid singleton + // e.g. cluster[X] = {X, a, b, c, ...} // valid cluster with more joined entries + // The clusters could me moved recursively, s.t. 
+ // cluster[A] = {B} + // cluster[B] = {C} + // cluster[C] = {C, A, B} // is valid cluster since cluster[C][0] == C; contains A and B + std::vector clusters; + clusters.reserve(number_of_user_bins); + + std::vector current_cluster_cardinality(number_of_user_bins); + std::vector current_cluster_sketches(number_of_user_bins, empty_sketch); + size_t current_max_cluster_size{0}; + size_t current_number_of_sketch_hashes{minHash_sketche_size}; // start with high r but decrease it iteratively + size_t current_sketch_index{0}; + size_t current_number_of_clusters{number_of_user_bins}; // initially, each UB is a separate cluster + + for (size_t pos = 0; pos < number_of_user_bins; ++pos) + { + clusters.emplace_back(pos, positions[pos]); // id, user_bin_idx + current_cluster_cardinality[pos] = cardinalities[positions[pos]]; + current_cluster_sketches[pos] = sketches[positions[pos]]; + current_max_cluster_size = std::max(current_max_cluster_size, cardinalities[positions[pos]]); + } + + // refine clusters +//std::cout << "Start clustering with threshold average_technical_bin_size: " << average_technical_bin_size << std::endl; + while (current_max_cluster_size < average_technical_bin_size && /*number_of_clusters / static_cast(number_of_user_bins) > 0.5 &&*/ + current_sketch_index < number_of_max_minHash_sketches) // I want to cluster 10%? + { +//std::cout << "Current number of clusters: " << current_number_of_clusters; + + // fill LSH collision hashtable + robin_hood::unordered_flat_map> table = + LSH_fill_hashtable(clusters, minHash_sketches, current_sketch_index, current_number_of_sketch_hashes); + + // read out LSH collision hashtable + // for each present key, if the list contains more than one cluster, we merge everything contained in the list + // into the first cluster, since those clusters collide and should be joined in the same bucket + for (auto & [key, list] : table) + { + assert(!list.empty()); + + // uniquify list. 
Since I am inserting representative_idx's into the table, the same number can + // be inserted into multiple splots, and multiple times in the same slot. + std::sort(list.begin(), list.end()); + auto const end = std::unique(list.begin(), list.end()); + auto const begin = list.begin(); + + if (end - begin <= 1) // nothing to do here + continue; + + // Now combine all clusters into the first. + + // 1) find the representative cluster to merge everything else into + // It can happen, that the representative has already been joined with another cluster + // e.g. + // [key1] = {0,11} // then clusters[11] is merged into clusters[0] + // [key2] = {11,13} // now I want to merge clusters[13] into clusters[11] but the latter has been moved + size_t const representative_cluster_id = LSH_find_representative_cluster(clusters, *begin); + auto & representative_cluster = clusters[representative_cluster_id]; + assert(representative_cluster.is_valid(representative_cluster_id)); + assert(representative_cluster.id() == clusters[representative_cluster.id()].id()); + + for (auto current = begin + 1; current < end; ++current) + { + // For every other entry in the list, it can happen that I already joined that list with another + // e.g. 
+ // [key1] = {0,11} // then clusters[11] is merged into clusters[0] + // [key2] = {0, 2, 11} // now I want to do it again + size_t const next_cluster_id = LSH_find_representative_cluster(clusters, *current); + auto & next_cluster = clusters[next_cluster_id]; + + if (next_cluster.id() == representative_cluster.id()) // already joined + continue; + + next_cluster.move_to(representative_cluster); // otherwise join next_cluster into representative_cluster + assert(next_cluster.size() == 0); + assert(next_cluster.has_been_moved()); + assert(representative_cluster.size() > 1); // there should be at least two user bins now + assert(representative_cluster.is_valid(representative_cluster_id)); // and it should still be valid + + current_cluster_sketches[representative_cluster.id()].merge(current_cluster_sketches[next_cluster.id()]); + current_cluster_cardinality[representative_cluster.id()] = current_cluster_sketches[representative_cluster.id()].estimate(); + + --current_number_of_clusters; + } + + current_max_cluster_size = *std::ranges::max_element(current_cluster_cardinality); + } + + ++current_sketch_index; + +//std::cout << " and after this clustering step there are: " << current_number_of_clusters << "with max cluster size" << current_max_cluster_size << std::endl; + } + + return clusters; +} + +std::vector very_similar_LSH_partitioning(std::vector const & minHash_sketches, + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + size_t const average_technical_bin_size, + chopper::configuration const & config) +{ + assert(!minHash_sketches.empty()); + assert(!minHash_sketches[0].table.empty()); + assert(!minHash_sketches[0].table[0].empty()); + + size_t const number_of_user_bins{positions.size()}; + assert(number_of_user_bins <= minHash_sketches.size()); + size_t const number_of_max_minHash_sketches{3}; // LSH ADD+OR parameter b + // size_t const minHash_sketche_size{minHash_sketches[0].table[0].size()}; // LSH ADD+OR 
parameter r + size_t const minHash_sketche_size{5}; // LSH ADD+OR parameter r + seqan::hibf::sketch::hyperloglog const empty_sketch{config.hibf_config.sketch_bits}; +// std::cout << "sketch size available: " << minHash_sketches[0].table[0].size() << std::endl; +// std::cout << "sketch size used here: " << minHash_sketche_size << std::endl; + // initialise clusters with a signle user bin per cluster. + // clusters are either + // 1) of size 1; containing an id != position where the id points to the cluster it has been moved to + // e.g. cluster[Y] = {Z} (Y has been moved into Z, so Z could look likes this cluster[Z] = {Z, Y}) + // 2) of size >= 1; with the first entry beging id == position (a valid cluster) + // e.g. cluster[X] = {X} // valid singleton + // e.g. cluster[X] = {X, a, b, c, ...} // valid cluster with more joined entries + // The clusters could me moved recursively, s.t. + // cluster[A] = {B} + // cluster[B] = {C} + // cluster[C] = {C, A, B} // is valid cluster since cluster[C][0] == C; contains A and B + std::vector clusters; + clusters.reserve(number_of_user_bins); + + std::vector current_cluster_cardinality(number_of_user_bins); + std::vector current_cluster_sketches(number_of_user_bins, empty_sketch); + size_t current_max_cluster_size{0}; + size_t current_number_of_sketch_hashes{minHash_sketche_size}; // start with high r but decrease it iteratively + size_t current_sketch_index{0}; + size_t current_number_of_clusters{number_of_user_bins}; // initially, each UB is a separate cluster + + for (size_t pos = 0; pos < number_of_user_bins; ++pos) + { + clusters.emplace_back(pos, positions[pos]); + current_cluster_cardinality[pos] = cardinalities[positions[pos]]; + current_cluster_sketches[pos] = sketches[positions[pos]]; + current_max_cluster_size = std::max(current_max_cluster_size, cardinalities[positions[pos]]); + } + + // refine clusters +//std::cout << "Start clustering with threshold average_technical_bin_size: " << average_technical_bin_size << 
std::endl; + while (current_max_cluster_size < average_technical_bin_size && /*number_of_clusters / static_cast(number_of_user_bins) > 0.5 &&*/ + current_sketch_index < number_of_max_minHash_sketches) // I want to cluster 10%? + { +//std::cout << "Current number of clusters: " << current_number_of_clusters; + + // fill LSH collision hashtable + robin_hood::unordered_flat_map> table = + LSH_fill_hashtable(clusters, minHash_sketches, current_sketch_index, current_number_of_sketch_hashes); + + // read out LSH collision hashtable + // for each present key, if the list contains more than one cluster, we merge everything contained in the list + // into the first cluster, since those clusters collide and should be joined in the same bucket + for (auto & [key, list] : table) + { + assert(!list.empty()); + + // uniquify list. Since I am inserting representative_idx's into the table, the same number can + // be inserted into multiple splots, and multiple times in the same slot. + std::sort(list.begin(), list.end()); + auto const end = std::unique(list.begin(), list.end()); + auto const begin = list.begin(); + + if (end - begin <= 1) // nothing to do here + continue; + + // Now combine all clusters into the first. + + // 1) find the representative cluster to merge everything else into + // It can happen, that the representative has already been joined with another cluster + // e.g. 
+ // [key1] = {0,11} // then clusters[11] is merged into clusters[0] + // [key2] = {11,13} // now I want to merge clusters[13] into clusters[11] but the latter has been moved + size_t const representative_cluster_id = LSH_find_representative_cluster(clusters, *begin); + auto & representative_cluster = clusters[representative_cluster_id]; + assert(representative_cluster.is_valid(representative_cluster_id)); + assert(representative_cluster.id() == clusters[representative_cluster.id()].id()); + + for (auto current = begin + 1; current < end; ++current) + { + // For every other entry in the list, it can happen that I already joined that list with another + // e.g. + // [key1] = {0,11} // then clusters[11] is merged into clusters[0] + // [key2] = {0, 2, 11} // now I want to do it again + size_t const next_cluster_id = LSH_find_representative_cluster(clusters, *current); + auto & next_cluster = clusters[next_cluster_id]; + + if (next_cluster.id() == representative_cluster.id()) // already joined + continue; + + next_cluster.move_to(representative_cluster); // otherwise join next_cluster into representative_cluster + assert(next_cluster.size() == 0); + assert(next_cluster.has_been_moved()); + assert(representative_cluster.size() > 1); // there should be at least two user bins now + assert(representative_cluster.is_valid(representative_cluster_id)); // and it should still be valid + + current_cluster_sketches[representative_cluster.id()].merge(current_cluster_sketches[next_cluster.id()]); + current_cluster_cardinality[representative_cluster.id()] = current_cluster_sketches[representative_cluster.id()].estimate(); + + --current_number_of_clusters; + } + + current_max_cluster_size = *std::ranges::max_element(current_cluster_cardinality); + } + + ++current_sketch_index; + +//std::cout << " and after this clustering step there are: " << current_number_of_clusters << "with max cluster size" << current_max_cluster_size << std::endl; + } + + return clusters; +} + +std::vector 
most_distant_LSH_partitioning(std::vector const & initial_clusters, + std::vector const & minHash_sketches, + chopper::configuration const & config) +{ + assert(!minHash_sketches.empty()); + assert(!minHash_sketches[0].table.empty()); + assert(!minHash_sketches[0].table[0].empty()); + + size_t const number_of_user_bins{initial_clusters.size()}; + size_t const number_of_max_minHash_sketches{minHash_sketches[0].table.size()}; // LSH ADD+OR parameter b + // size_t const minHash_sketche_size{minHash_sketches[0][0].size()}; // LSH ADD+OR parameter r + + size_t current_number_of_sketch_hashes{5}; + size_t current_sketch_index{0}; + + size_t current_number_of_clusters = [&initial_clusters] () + { + size_t number{0}; + for (auto const & cluster : initial_clusters) + { + if (cluster.size()) + ++number; + } + return number; + }(); + + std::vector clusters; + clusters.reserve(number_of_user_bins); + + for (size_t pos = 0; pos < number_of_user_bins; ++pos) + { + clusters.emplace_back(initial_clusters[pos]); + assert(clusters[pos].is_valid(pos)); + } + + // refine clusters. I want to cluster as much as possible to initialise partitions with very distant clusters + while (current_number_of_clusters > (config.hibf_config.tmax * 2) && + current_sketch_index < number_of_max_minHash_sketches) + { +//std::cout << "[Dist] Current number of clusters: " << current_number_of_clusters << " sketch size:" << current_number_of_sketch_hashes << " "; + + // fill LSH collision hashtable + robin_hood::unordered_flat_map> table = + LSH_fill_hashtable(clusters, minHash_sketches, current_sketch_index, current_number_of_sketch_hashes); + + // read out LSH collision hashtable + // for each present key, if the list contains more than one cluster, we merge everything contained in the list + // into the first cluster, since those clusters collide and should be joined in the same bucket + for (auto & [key, list] : table) + { + assert(!list.empty()); + + // uniquify list. 
Since I am inserting representative_idx's into the table, the same number can + // be inserted into multiple splots, and multiple times in the same slot. + std::sort(list.begin(), list.end()); + auto const end = std::unique(list.begin(), list.end()); + auto const begin = list.begin(); + + if (end - begin <= 1) // nothing to do here + continue; + + // Now combine all clusters into the first. + + // 1) find the representative cluster to merge everything else into + // It can happen, that the representative has already been joined with another cluster + // e.g. + // [key1] = {0,11} // then clusters[11] is merged into clusters[0] + // [key2] = {11,13} // now I want to merge clusters[13] into clusters[11] but the latter has been moved + size_t const representative_cluster_id = LSH_find_representative_cluster(clusters, *begin); + auto & representative_cluster = clusters[representative_cluster_id]; + assert(representative_cluster.is_valid(representative_cluster_id)); + assert(representative_cluster.id() == clusters[representative_cluster.id()].id()); + + for (auto current = begin + 1; current < end; ++current) + { + // For every other entry in the list, it can happen that I already joined that list with another + // e.g. 
+ // [key1] = {0,11} // then clusters[11] is merged into clusters[0] + // [key2] = {0, 2, 11} // now I want to do it again + size_t const next_cluster_id = LSH_find_representative_cluster(clusters, *current); + auto & next_cluster = clusters[next_cluster_id]; + + if (next_cluster.id() == representative_cluster.id()) // already joined + continue; + + next_cluster.move_to(representative_cluster); // otherwise join next_cluster into representative_cluster + assert(next_cluster.size() == 0); + assert(next_cluster.has_been_moved()); + assert(representative_cluster.size() > 1); // there should be at least two user bins now + assert(representative_cluster.is_valid(representative_cluster_id)); // and it should still be valid + + --current_number_of_clusters; + } + } + + ++current_sketch_index; + + if (current_sketch_index % 4 == 0 && current_number_of_sketch_hashes > 1) + --current_number_of_sketch_hashes; + +//std::cout << " and after this clustering step there are: " << current_number_of_clusters << std::endl; + } + + return clusters; +} + +void post_process_clusters(std::vector & clusters, + std::vector const & cardinalities, + chopper::configuration const & config) +{ + // clusters are done. 
Start post processing + // since post processing involves re-ordering the clusters, the moved_to_cluster_id value of a cluster will not + // refer to the position of the cluster in the `clusters` vecto anymore but the cluster with the resprive id() + // would neet to be found + + for (size_t pos = 0; pos < clusters.size(); ++pos) + { + assert(clusters[pos].is_valid(pos)); + clusters[pos].sort_by_cardinality(cardinalities); + } + + // push largest p clusters to the front + auto cluster_size_cmp = [&cardinalities](auto const & v1, auto const & v2) + { + if (v2.size() == v1.size() && !v2.empty()) + return cardinalities[v2.contained_user_bins()[0]] < cardinalities[v1.contained_user_bins()[0]]; + return v2.size() < v1.size(); + }; + std::partial_sort(clusters.begin(), clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), clusters.end(), cluster_size_cmp); + + // after filling up the partitions with the biggest clusters, sort the clusters by cardinality of the biggest ub + // s.t. that euqally sizes ub are assigned after each other and the small stuff is added at last. + // the largest ub is already at the start because of former sorting. 
+ auto compare_cardinality_and_move_empty_clusters_to_the_end = [&cardinalities](auto const & v1, auto const & v2) + { + if (v1.empty()) + return false; // v1 can never be larger than v2 then + + if (v2.empty()) // and v1 is not, since the first if would catch + return true; + + return cardinalities[v2.contained_user_bins()[0]] < cardinalities[v1.contained_user_bins()[0]]; + }; + std::sort(clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), clusters.end(), compare_cardinality_and_move_empty_clusters_to_the_end); + + assert(clusters[0].size() >= clusters[1].size()); // sanity check + // assert(cardinalities[clusters[std::min(clusters.size(), config.hibf_config.tmax)].contained_user_bins()[0]] >= cardinalities[clusters[std::min(clusters.size(), config.hibf_config.tmax) + 1].contained_user_bins()[0]]); // sanity check + +// debug + for (size_t cidx = 1; cidx < clusters.size(); ++cidx) + { + if (clusters[cidx - 1].empty() && !clusters[cidx].empty()) // once empty - always empty; all empty clusters should be at the end + throw std::runtime_error{"sorting did not work"}; + } +// debug +} + +void post_process_clusters(std::vector & clusters, + std::vector const & cardinalities, + chopper::configuration const & config) +{ + // clusters are done. 
Start post processing + // since post processing involves re-ordering the clusters, the moved_to_cluster_id value of a cluster will not + // refer to the position of the cluster in the `clusters` vecto anymore but the cluster with the resprive id() + // would neet to be found + + for (size_t pos = 0; pos < clusters.size(); ++pos) + { + assert(clusters[pos].is_valid(pos)); + clusters[pos].sort_by_cardinality(cardinalities); + } + + // push largest p clusters to the front + auto cluster_size_cmp = [&cardinalities](auto const & mc1, auto const & mc2) + { + if (mc1.empty()) + return false; // mc1 can never be larger than mc2 then + if (mc2.empty()) // and mc1 is not, since the first if would catch + return true; + + auto const & largest_c1 = mc1.contained_user_bins()[0]; // first cluster is the largest because of sorting above + auto const & largest_c2 = mc2.contained_user_bins()[0]; // first cluster is the largest because of sorting above + + assert(!largest_c1.empty()); // should never be empty + assert(!largest_c2.empty()); // should never be empty + + if (largest_c2.size() == largest_c1.size()) + return cardinalities[largest_c2[0]] < cardinalities[largest_c1[0]]; + return largest_c2.size() < largest_c1.size(); + }; + std::partial_sort(clusters.begin(), clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), clusters.end(), cluster_size_cmp); + + auto move_empty_clusters_to_the_end = [](auto const & v1, auto const & v2) + { + return v2.size() < v1.size(); + }; + std::sort(clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), clusters.end(), move_empty_clusters_to_the_end); + +// debug sanity check + for (size_t cidx = 1; cidx < clusters.size(); ++cidx) + { + if (clusters[cidx - 1].empty() && !clusters[cidx].empty()) // once empty - always empty; all empty clusters should be at the end + throw std::runtime_error{"sorting did not work"}; + } +// debug +} + +std::vector LSH_partitioning(std::vector const & minHash_sketches, + std::vector 
const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + size_t const average_technical_bin_size, + chopper::configuration const & config) +{ + std::vector clusters = initital_LSH_partitioning(minHash_sketches, positions, cardinalities, sketches, average_technical_bin_size, config); + post_process_clusters(clusters, cardinalities, config); + return clusters; +} + +std::vector sim_dist_LSH_partitioning(std::vector const & minHash_sketches, + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + size_t const average_technical_bin_size, + chopper::configuration const & config) +{ + std::vector clusters = very_similar_LSH_partitioning(minHash_sketches, positions, cardinalities, sketches, average_technical_bin_size, config); + std::vector multi_clusters = most_distant_LSH_partitioning(clusters, minHash_sketches, config); + post_process_clusters(multi_clusters, cardinalities, config); + return multi_clusters; +} + +bool find_best_partition(chopper::configuration const & config, + size_t const number_of_partitions, + size_t & corrected_estimate_per_part, + std::vector const & cluster, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector> & positions, + std::vector & partition_sketches, + std::vector & max_partition_cardinality, + std::vector & min_partition_cardinality) +{ + seqan::hibf::sketch::hyperloglog const current_sketch = [&sketches, &cluster, &config]() + { + seqan::hibf::sketch::hyperloglog result{config.hibf_config.sketch_bits}; + + for (size_t const user_bin_idx : cluster) + result.merge(sketches[user_bin_idx]); + + return result; + }(); + + size_t const max_card = [&cardinalities, &cluster] () + { + size_t max{0}; + + for (size_t const user_bin_idx : cluster) + max = std::max(max, cardinalities[user_bin_idx]); + + return max; + }(); + + // TODO: afterwads check if I should again merge by how little the effective text ratio grows + + // search best 
partition fit by similarity + // similarity here is defined as: + // "whose (<-partition) effective text size is subsumed most by the current user bin" + // or in other words: + // "which partition has the largest intersection with user bin b compared to its own (partition) size." + // double best_subsume_ratio{0.0}; + size_t smallest_change{std::numeric_limits::max()}; + size_t best_p{0}; + bool best_p_found{false}; + + auto penalty_lower_level = [&] (size_t const additional_number_of_user_bins, size_t const p) + { + size_t min = std::min(max_card, min_partition_cardinality[p]); + size_t max = std::max(max_card, max_partition_cardinality[p]); + + if (positions[p].size() > config.hibf_config.tmax) // already a third level + { + // if there must already be another lower level because the current merged bin contains more than tmax + // user bins, then the current user bin is very likely stored multiple times. Therefore, the penalty is set + // to the cardinality of the current user bin times the number of levels, e.g. the number of times this user + // bin needs to be stored additionally + size_t const num_ubs_in_merged_bin{positions[p].size() + additional_number_of_user_bins}; + double const levels = std::log(num_ubs_in_merged_bin) / std::log(config.hibf_config.tmax); + return static_cast(max_card * levels); + } + else if (positions[p].size() + additional_number_of_user_bins > config.hibf_config.tmax) // now a third level + { + // if the current merged bin contains exactly tmax UBS, adding another one must + // result in another lower level. Most likely, the smallest user bin will end up on the lower level + // therefore the penalty is set to 'min * tmax' + // of course, there could also be a third level with a lower number of user bins, but this is hard to + // estimate. 
+ size_t const penalty = min * config.hibf_config.tmax; + return penalty; + } + else // positions[p].size() + additional_number_of_user_bins <= tmax + { + // if the new user bin is smaller than all other already contained user bins + // the waste of space is high if stored in a single technical bin + if (max_card < min) + return (max - max_card); + // if the new user bin is bigger than all other already contained user bins, the IBF size increases + else if (max_card > max) + return (max_card - max) * config.hibf_config.tmax; + // else, end if-else-block and zero is returned + } + + return (size_t)0u; + }; + + for (size_t p = 0; p < number_of_partitions; ++p) + { + seqan::hibf::sketch::hyperloglog union_sketch = current_sketch; + union_sketch.merge(partition_sketches[p]); + size_t const union_estimate = union_sketch.estimate(); + size_t const current_partition_size = partition_sketches[p].estimate(); + + assert(union_estimate >= current_partition_size); + size_t const penalty_current_bin = union_estimate - current_partition_size; + size_t const penalty_current_ibf = config.hibf_config.tmax * ((union_estimate <= corrected_estimate_per_part) ? 
0u : union_estimate - corrected_estimate_per_part); + size_t const change = penalty_current_bin + penalty_current_ibf + penalty_lower_level(cluster.size(), p); + + // size_t const intersection = current_sketch.estimate() - change; + // double const subsume_ratio = static_cast(intersection) / current_partition_size; +//std::cout << "p:" << p << " p-#UBs" << positions[p].size() << " penalty:" << penalty(cluster.size(), p) << " change:" << change << " union-current_p:" << (union_estimate - current_partition_size) << " union:" << union_estimate << " current_p:" << current_partition_size << " t:" << corrected_estimate_per_part << std::endl; + if (change == 0 || /* If there is no penalty at all, this is a best fit even if the partition is "full"*/ + (smallest_change > change /*&& subsume_ratio > best_subsume_ratio &&*/ + /*current_partition_size < corrected_estimate_per_part*/)) + { +//std::cout << "smaller!" << std::endl; + // best_subsume_ratio = subsume_ratio; + smallest_change = change; + best_p = p; + best_p_found = true; + } + } + + if (!best_p_found) + throw "currently there are no safety measures if a partition is not found because it is very unlikely"; + +//std::cout << "best_p:" << best_p << std::endl<< std::endl; + + // now that we know which partition fits best (`best_p`), add those indices to it + for (size_t const user_bin_idx : cluster) + { + positions[best_p].push_back(user_bin_idx); + max_partition_cardinality[best_p] = std::max(max_partition_cardinality[best_p], cardinalities[user_bin_idx]); + min_partition_cardinality[best_p] = std::min(min_partition_cardinality[best_p], cardinalities[user_bin_idx]); + } + partition_sketches[best_p].merge(current_sketch); + corrected_estimate_per_part = std::max(corrected_estimate_per_part, static_cast(partition_sketches[best_p].estimate())); + + return true; +} + +bool find_best_sorted_partition(chopper::configuration const & config, + size_t const number_of_partitions, + size_t & corrected_estimate_per_part, + 
std::vector const & cluster, + std::vector const & cardinalities, + std::vector const & /*sketches*/, + std::vector> & partitions, + std::vector & partition_sizes, + std::vector & max_partition_cardinality, + std::vector & min_partition_cardinality) +{ + size_t max_card = [&cardinalities, &cluster] () + { + size_t max{0}; + + for (size_t const user_bin_idx : cluster) + max = std::max(max, cardinalities[user_bin_idx]); + + return max; + }(); + + size_t sum_card = [&cardinalities, &cluster] () + { + size_t sum{0}; + + for (size_t const user_bin_idx : cluster) + sum += cardinalities[user_bin_idx]; + + return sum; + }(); + + size_t smallest_change{std::numeric_limits::max()}; + size_t best_p{0}; + bool best_p_found{false}; + + auto penalty_lower_level = [&] (size_t const additional_number_of_user_bins, size_t const p) + { + size_t min = std::min(max_card, min_partition_cardinality[p]); + size_t max = std::max(max_card, max_partition_cardinality[p]); + + if (partitions[p].size() > config.hibf_config.tmax) // already a third level + { + // if there must already be another lower level because the current merged bin contains more than tmax + // user bins, then the current user bin is very likely stored multiple times. Therefore, the penalty is set + // to the cardinality of the current user bin times the number of levels, e.g. the number of times this user + // bin needs to be stored additionally + size_t const num_ubs_in_merged_bin{partitions[p].size() + additional_number_of_user_bins}; + double const levels = std::log(num_ubs_in_merged_bin) / std::log(config.hibf_config.tmax); + return static_cast(max_card * levels); + } + else if (partitions[p].size() + additional_number_of_user_bins > config.hibf_config.tmax) + { + // if the current merged bin contains exactly tmax UBS, adding otherone must + // result in another lower level. 
Most likely, the smallest user bin will end up on the lower level + // therefore the penalty is set to 'min * tmax' + // of course, there could also be a third level with a lower number of user bins, but this is hard to + // estimate. + size_t const penalty = min * config.hibf_config.tmax; + return penalty; + } + else // partitions[p].size() + additional_number_of_user_bins <= tmax + { + // if the new user bin is smaller than all other already contained user bins + // the waste of space is high if stored in a single technical bin + if (max_card < min) + return (max - max_card); + // if the new user bin is bigger than all other already contained user bins, the IBF size increases + else if (max_card > max) + return (max_card - max) * config.hibf_config.tmax; + // else, end if-else-block and zero is returned + } + + return (size_t)0u; + }; + + for (size_t p = 0; p < number_of_partitions; ++p) + { + size_t const sum = partition_sizes[p] + sum_card; + size_t const penalty_current_ibf = config.hibf_config.tmax * ((sum <= corrected_estimate_per_part) ? 
0u : sum - corrected_estimate_per_part); + size_t const change = penalty_current_ibf + penalty_lower_level(cluster.size(), p); + + if (change == 0 || /* If there is no penalty at all, this is a best fit even if the partition is "full"*/ + (smallest_change > change)) + { + smallest_change = change; + best_p = p; + best_p_found = true; + } + } + + if (!best_p_found) + return false; + +//std::cout << "best_p:" << best_p << std::endl<< std::endl; + + // now that we know which partition fits best (`best_p`), add those indices to it + for (size_t const user_bin_idx : cluster) + { + partitions[best_p].push_back(user_bin_idx); + partition_sizes[best_p] += cardinalities[user_bin_idx]; + max_partition_cardinality[best_p] = std::max(max_partition_cardinality[best_p], cardinalities[user_bin_idx]); + min_partition_cardinality[best_p] = std::min(min_partition_cardinality[best_p], cardinalities[user_bin_idx]); + } + corrected_estimate_per_part = std::max(corrected_estimate_per_part, static_cast(partition_sizes[best_p])); + + return true; +} + +size_t split_bins(chopper::configuration const & config, + std::vector const & sorted_positions, + std::vector const & cardinalities, + std::vector> & partitions, + size_t const sum_of_cardinalities, + size_t const end_idx) +{ + // assign split bins to the end. 
makes it easier to process merged bins afterwards + size_t pos{partitions.size() - 1}; + for (size_t idx = 0; idx < end_idx; ++idx) { - config.dp_algorithm_timer.start(); - hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config, - kmer_counts, - sketches, - seqan::hibf::iota_vector(sketches.size()), - config.union_estimation_timer, - config.rearrangement_timer); - config.dp_algorithm_timer.stop(); + size_t const ub_idx{sorted_positions[idx]}; + size_t const number_of_split_tbs = std::max((size_t)1, config.hibf_config.tmax * cardinalities[ub_idx] / sum_of_cardinalities); +std::cout << "number_of_split_tbs: " << number_of_split_tbs + << " cardinalities[ub_idx]:" << cardinalities[ub_idx] + << " sum_of_cardinalities:" << sum_of_cardinalities + << std::endl; + // fill partitions from behind to ensure an easier layouting + for (size_t i = 0; i < number_of_split_tbs; ++i) + { + assert(pos > 0); + assert(partitions[pos].empty()); + partitions[pos].push_back(ub_idx); + --pos; + } + } + std::cout << "pos: " << pos << std::endl; + return pos + 1; +} - if (config.output_verbose_statistics) +std::pair find_bins_to_be_split(std::vector const & sorted_positions, + std::vector const & cardinalities, + size_t threshold, + size_t const max_bins) +{ + size_t idx{0}; + size_t sum{0}; + size_t number_of_split_bins = max_bins + 1; // initialise to something bigger to trigger while loop below + + auto find_idx_and_sum = [&]() + { + while (idx < sorted_positions.size() && cardinalities[sorted_positions[idx]] > threshold) { - size_t dummy{}; - chopper::layout::hibf_statistics global_stats{config, sketches, kmer_counts}; - global_stats.hibf_layout = hibf_layout; - global_stats.print_header_to(std::cout); - global_stats.print_summary_to(dummy, std::cout); + sum += cardinalities[sorted_positions[idx]]; + ++idx; } + }; + + while (number_of_split_bins > max_bins) + { + idx = 0; + sum = 0; + find_idx_and_sum(); + number_of_split_bins = sum / threshold; + // max(1.01, ...) 
because a tie would end in an endless loop. + threshold = static_cast(threshold) * std::max(1.01, static_cast(sum) / (threshold * max_bins)); } - // brief Write the output to the layout file. - std::ofstream fout{config.output_filename}; - chopper::layout::write_user_bins_to(filenames, fout); - config.write_to(fout); - hibf_layout.write_to(fout); + // Maybe there are more user bins (position of idx) than available split bins. + // Thus, clamp the number of user bins to split to maximally the number_of_split_bins + idx = std::min(idx, number_of_split_bins); + + return {idx, number_of_split_bins}; +} + +size_t lsh_sim_approach(chopper::configuration const & config, + std::vector const & sorted_positions2, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + std::vector> & partitions, + size_t const number_of_remaining_tbs, + size_t const technical_bin_size_threshold, + size_t const sum_of_cardinalities) +{ + uint8_t const sketch_bits{config.hibf_config.sketch_bits}; + //std::cout << "LSH partitioning into " << config.hibf_config.tmax << std::endl; + std::vector partition_sketches(number_of_remaining_tbs, + seqan::hibf::sketch::hyperloglog(sketch_bits)); + + std::vector max_partition_cardinality(number_of_remaining_tbs, 0u); + std::vector min_partition_cardinality(number_of_remaining_tbs, std::numeric_limits::max()); + + // initial partitioning using locality sensitive hashing (LSH) + config.lsh_algorithm_timer.start(); + std::vector clusters = very_similar_LSH_partitioning(minHash_sketches, sorted_positions2, cardinalities, sketches, technical_bin_size_threshold, config); + post_process_clusters(clusters, cardinalities, config); + config.lsh_algorithm_timer.stop(); + +std::ofstream ofs{"/tmp/final.clusters"}; +for (size_t i = 0; i < clusters.size(); ++i) +{ +seqan::hibf::sketch::hyperloglog sketch(config.hibf_config.sketch_bits); +for (size_t j = 0; j < clusters[i].size(); ++j) +{ + 
sketch.merge(sketches[clusters[i].contained_user_bins()[j]]); +} +ofs << i << ":" << sketch.estimate() << ":" << clusters[i].size() << std::endl; +} + + std::vector> remaining_clusters{}; + + // initialise partitions with the first p largest clusters (post_processing sorts by size) + size_t cidx{0}; // current cluster index + for (size_t p = 0; p < number_of_remaining_tbs; ++p) + { + assert(!clusters[cidx].empty()); + auto const & cluster = clusters[cidx].contained_user_bins(); + bool split_cluster = false; + + if (cluster.size() > config.hibf_config.tmax) + { + size_t card{0}; + for (size_t uidx = 0; uidx < cluster.size(); ++uidx) + card += cardinalities[cluster[uidx]]; + + if (card > 0.05 * sum_of_cardinalities / config.hibf_config.tmax) + split_cluster = true; + } + + size_t end = (split_cluster) ? cluster.size() : std::min(cluster.size(), config.hibf_config.tmax); + for (size_t uidx = 0; uidx < end; ++uidx) + { + size_t const user_bin_idx = cluster[uidx]; + // if a single cluster already exceeds the cardinality_per_part, + // then the remaining user bins of the cluster must spill over into the next partition + if ((uidx != 0 && (uidx % config.hibf_config.tmax == 0)) || partition_sketches[p].estimate() > technical_bin_size_threshold) + { + ++p; + + if (p >= number_of_remaining_tbs) + { + split_cluster = true; + end = uidx; + break; + } + } + + partition_sketches[p].merge(sketches[user_bin_idx]); + partitions[p].push_back(user_bin_idx); + max_partition_cardinality[p] = std::max(max_partition_cardinality[p], cardinalities[user_bin_idx]); + min_partition_cardinality[p] = std::min(min_partition_cardinality[p], cardinalities[user_bin_idx]); + } + + if (split_cluster) + { + std::vector remainder(cluster.begin() + end, cluster.end()); + remaining_clusters.insert(remaining_clusters.end(), remainder); + } + + ++cidx; + } + + for (size_t i = cidx; i < clusters.size(); ++i) + { + if (clusters[i].empty()) + break; + + remaining_clusters.insert(remaining_clusters.end(), 
clusters[i].contained_user_bins()); + } + + // assign the rest by similarity + size_t merged_threshold{technical_bin_size_threshold}; + for (size_t ridx = 0; ridx < remaining_clusters.size(); ++ridx) + { + auto const & cluster = remaining_clusters[ridx]; + + config.search_partition_algorithm_timer.start(); + find_best_partition(config, number_of_remaining_tbs, merged_threshold, cluster, cardinalities, sketches, partitions, partition_sketches, max_partition_cardinality, min_partition_cardinality); + config.search_partition_algorithm_timer.start(); + } + + // compute actual max size + size_t max_size{0}; + for (auto const & sketch : partition_sketches) + max_size = std::max(max_size, (size_t)sketch.estimate()); + + return max_size; +} + +void partition_user_bins(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + std::vector> & partitions) +{ + // all approaches need sorted positions + std::vector const sorted_positions = [&positions, &cardinalities]() + { + std::vector ps(positions.begin(), positions.end()); + seqan::hibf::sketch::toolbox::sort_by_cardinalities(cardinalities, ps); + return ps; + }(); + + auto const [sum_of_cardinalities, max_cardinality, joint_estimate] = [&]() + { + size_t sum{0}; + size_t max{0}; + seqan::hibf::sketch::hyperloglog sketch{config.hibf_config.sketch_bits}; + + for (size_t const pos : positions) + { + sum += cardinalities[pos]; + max = std::max(max, cardinalities[pos]); + sketch.merge(sketches[pos]); + } + + return std::tuple{sum, max, sketch.estimate()}; + }(); + + // If the effective text size is very low, it can happen that the joint_estimate divided by the number of partitions + // is lower than the largest single user bin. But of course, we can never reach a smaller max technical bin size + // then that of the largest user user bin. Thus we can correct the estimate_per_part beforehand. 
+ // This way we make sure there is at least 1 LSH clustering step. + size_t const estimate_per_part = std::max(seqan::hibf::divide_and_ceil(joint_estimate, config.hibf_config.tmax), + max_cardinality + 1); + + double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .relaxed_fpr = config.hibf_config.relaxed_fpr, + .hash_count = config.hibf_config.number_of_hash_functions}); +//std::cout << "sum_of_cardinalities:" << sum_of_cardinalities << " joint_estimate:" << joint_estimate << std::endl; + + size_t split_threshold{}; + size_t idx{0}; // start in sorted positions + size_t number_of_split_tbs{0}; + size_t number_of_merged_tbs{config.hibf_config.tmax}; + size_t max_split_size{0}; + size_t max_merged_size{0}; + + // split_threshold = seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.hibf_config.tmax); + split_threshold = seqan::hibf::divide_and_ceil(joint_estimate, config.hibf_config.tmax); + + auto parition_split_bins = [&]() + { + size_t number_of_potential_split_bins{0}; + std::tie(idx, number_of_potential_split_bins) = find_bins_to_be_split(sorted_positions, cardinalities, split_threshold, config.hibf_config.tmax - 1); + std::tie(number_of_split_tbs, max_split_size) = chopper::layout::determine_split_bins(config, sorted_positions, cardinalities, number_of_potential_split_bins, idx, partitions); + number_of_merged_tbs = config.hibf_config.tmax - number_of_split_tbs; + }; + + auto partitions_merged_bins = [&]() + { + + if (config.partitioning_approach == partitioning_scheme::blocked) + { + size_t const estimated_tb_size = [&]() + { + size_t sum{0}; + + for (size_t pos{idx}; pos < sorted_positions.size(); ++pos) + sum += cardinalities[sorted_positions[pos]]; + + return seqan::hibf::divide_and_ceil(sum, number_of_merged_tbs); + }(); + + size_t current_part{0}; + size_t current_sum{0}; + + for (;idx < sorted_positions.size(); ++idx) + { + size_t const 
current_user_bin_id{sorted_positions[idx]}; + partitions[current_part].push_back(current_user_bin_id); + current_sum += cardinalities[current_user_bin_id]; + + if (current_sum >= estimated_tb_size && current_part < number_of_merged_tbs) + { + max_merged_size = std::max(current_sum, max_merged_size); + ++current_part; + current_sum = 0; + assert(current_part < number_of_merged_tbs); + } + } + } + else if (config.partitioning_approach == partitioning_scheme::sorted) + { + size_t current_tb_size_threshold = split_threshold; + + size_t current_part{0}; + seqan::hibf::sketch::hyperloglog current_sketch{config.hibf_config.sketch_bits}; + std::vector partition_sketches(config.hibf_config.tmax, current_sketch); + + for (;idx < sorted_positions.size(); ++idx) + { + size_t const current_user_bin_id{sorted_positions[idx]}; + partitions[current_part].push_back(current_user_bin_id); + current_sketch.merge(sketches[current_user_bin_id]); + + if (current_sketch.estimate() >= current_tb_size_threshold) + { + max_merged_size = std::max((size_t)current_sketch.estimate(), max_merged_size); + partition_sketches[current_part] = current_sketch; + current_sketch = seqan::hibf::sketch::hyperloglog{config.hibf_config.sketch_bits}; + ++current_part; + + if (current_part >= number_of_merged_tbs) + { + current_tb_size_threshold *= 1 + static_cast(idx) / sorted_positions.size(); + current_part = 0; // fill up from the start agagin as the threshold increased + } + } + } + } + else if (config.partitioning_approach == partitioning_scheme::folded) + { + } + else if (config.partitioning_approach == partitioning_scheme::weighted_fold) + { + size_t const cardinality_per_part = + seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.hibf_config.tmax); + size_t const u_bins_per_part = seqan::hibf::divide_and_ceil(cardinalities.size(), config.hibf_config.tmax); + + size_t current_big_pos{0}; // the next largest user bin to assign to a partition + size_t current_small_pos{cardinalities.size() - 1}; 
// the next small user bin + + for (size_t current_part = 0; current_part + 1 < config.hibf_config.tmax; ++current_part) + { + size_t current_cardinality{0}; + std::vector small_bins; + size_t new_small_bin_addition{0}; + + auto compute_score = [&]() + { + double const weight = static_cast(current_cardinality) / cardinality_per_part; + double const amount = + static_cast(partitions[current_part].size() + small_bins.size() + new_small_bin_addition) + / u_bins_per_part; + return std::abs(1.0 - weight) + std::abs(1.0 - amount); + }; + + // first add all large bins that fit + while (current_cardinality < cardinality_per_part) + { + partitions[current_part].push_back(sorted_positions[current_big_pos]); + current_cardinality += cardinalities[sorted_positions[current_big_pos]]; + ++current_big_pos; + } + + double local_optimum = compute_score(); + + // then remove big bins and add small bins until a local optima is reached + while (true) + { + size_t const cache_last_small_pos{current_small_pos}; + // remove a big user bin and fill the partition with small user bins + current_cardinality -= cardinalities[sorted_positions[current_big_pos]]; + + // can we further improve the ratio by adding more small bins? + double improved_score{}; + do + { + improved_score = compute_score(); + current_cardinality += cardinalities[sorted_positions[current_small_pos]]; + --current_small_pos; + ++new_small_bin_addition; + } + while (compute_score() < improved_score); // smaller is better + // remove overstep + ++current_small_pos; + current_cardinality -= cardinalities[sorted_positions[current_small_pos]]; + --new_small_bin_addition; + + if (local_optimum < compute_score()) // score would increase. 
Stop + { + current_small_pos = cache_last_small_pos; + break; + } + else // update + { + partitions[current_part].pop_back(); + --current_big_pos; + for (size_t pos = cache_last_small_pos; pos > current_small_pos; --pos) + small_bins.push_back(sorted_positions[pos]); + } + } + partitions[current_part].insert(partitions[current_part].end(), small_bins.begin(), small_bins.end()); + } + + // remaining user bins go to last partition + while (current_big_pos <= current_small_pos) + { + partitions[config.hibf_config.tmax - 1].push_back(sorted_positions[current_big_pos]); + ++current_big_pos; + } + } + else if (config.partitioning_approach == partitioning_scheme::similarity) + { + size_t const number_of_remaining_ubs = positions.size() - idx; + uint8_t const sketch_bits{config.hibf_config.sketch_bits}; + std::vector partition_sketches(number_of_merged_tbs, + seqan::hibf::sketch::hyperloglog(sketch_bits)); + + size_t const corrected_max_split_size = max_split_size / relaxed_fpr_correction; + size_t merged_threshold = std::max(corrected_max_split_size, split_threshold); + + size_t const u_bins_per_part = number_of_remaining_ubs / number_of_merged_tbs; // round down + size_t const block_size = + std::min(u_bins_per_part, + chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(u_bins_per_part))))); + size_t const number_of_blocks = seqan::hibf::divide_and_ceil(number_of_remaining_ubs, block_size); + + std::vector max_partition_cardinality(number_of_merged_tbs, 0u); + std::vector min_partition_cardinality(number_of_merged_tbs, std::numeric_limits::max()); + + // don't move from largest to smallest but pick the next block to process randomly. 
+ // this probably leads to more evenly distributed partitions (evenly in terms of number of user bins) + std::vector indices(number_of_blocks); + std::iota(indices.begin(), indices.end(), 0); + std::random_device shuffle_random_device; + std::mt19937 shuffle_engine(shuffle_random_device()); + std::shuffle(indices.begin(), indices.end(), shuffle_engine); + + // initialise partitions with the first random number_of_merged_tbs blocks + assert(number_of_blocks >= number_of_merged_tbs); + for (size_t p = 0; p < number_of_merged_tbs; ++p) + { + size_t const i = indices[p]; + size_t const end = (i == (indices.size() - 1) && (number_of_remaining_ubs % block_size != 0)) ? (number_of_remaining_ubs % block_size) : block_size; + for (size_t x = 0; x < end; ++x) + { + assert(idx + block_size * i + x < sorted_positions.size()); + size_t const user_bin_idx{sorted_positions[idx + block_size * i + x]}; + partition_sketches[p].merge(sketches[user_bin_idx]); + partitions[p].push_back(user_bin_idx); + max_partition_cardinality[p] = std::max(max_partition_cardinality[p], cardinalities[user_bin_idx]); + min_partition_cardinality[p] = std::min(min_partition_cardinality[p], cardinalities[user_bin_idx]); + } + } + + auto has_been_processed_in_init = [&](size_t const i) + { + bool result{false}; + for (size_t p = 0; p < number_of_merged_tbs; ++p) + result |= ((idx + indices[p] * block_size) == i); + return result; + }; + + // assign the rest by similarity, user in by user bin (single bins, no blocks) + for (size_t i = idx; i < sorted_positions.size(); ++i) + { + if (has_been_processed_in_init(i)) + { + i += (block_size - 1); // -1 because there will be an increment after continue + continue; + } + + find_best_partition(config, + number_of_merged_tbs, + merged_threshold, + {sorted_positions[i]}, + cardinalities, + sketches, + partitions, + partition_sketches, + max_partition_cardinality, + min_partition_cardinality); + } + + // compute actual max size + for (auto const & sketch : 
partition_sketches) + max_merged_size = std::max(max_merged_size, (size_t)sketch.estimate()); + } + else if (config.partitioning_approach == partitioning_scheme::lsh) + { + std::vector const sorted_positions2(sorted_positions.begin() + idx, sorted_positions.end()); + size_t const corrected_max_split_size = max_split_size / relaxed_fpr_correction; + size_t const merged_threshold = std::max(corrected_max_split_size, split_threshold); + + std::vector partition_sizes(number_of_merged_tbs, 0u); + + std::vector max_partition_cardinality(number_of_merged_tbs, 0u); + std::vector min_partition_cardinality(number_of_merged_tbs, std::numeric_limits::max()); + + // initial partitioning using locality sensitive hashing (LSH) + config.lsh_algorithm_timer.start(); + std::vector clusters = very_similar_LSH_partitioning(minHash_sketches, sorted_positions2, cardinalities, sketches, merged_threshold, config); + post_process_clusters(clusters, cardinalities, config); + config.lsh_algorithm_timer.stop(); + +std::ofstream ofs{"/tmp/final.clusters"}; +for (size_t i = 0; i < clusters.size(); ++i) +{ + seqan::hibf::sketch::hyperloglog sketch(config.hibf_config.sketch_bits); + for (size_t j = 0; j < clusters[i].size(); ++j) + { + sketch.merge(sketches[clusters[i].contained_user_bins()[j]]); + } + ofs << i << ":" << sketch.estimate() << ":" << clusters[i].size() << std::endl; +} + + std::vector> remaining_clusters{}; + + size_t cidx{0}; // current cluster index + // initialise partitions with the first p largest clusters (post_processing sorts by size) + for (size_t p = 0; p < number_of_merged_tbs; ++p) + { + assert(!clusters[cidx].empty()); + auto const & cluster = clusters[cidx].contained_user_bins(); + bool split_cluster = false; + + if (cluster.size() > config.hibf_config.tmax) + { + size_t card{0}; + for (size_t uidx = 0; uidx < cluster.size(); ++uidx) + card += cardinalities[cluster[uidx]]; + + if (card > 0.05 * sum_of_cardinalities / config.hibf_config.tmax) + split_cluster = true; + 
} + + size_t end = (split_cluster) ? cluster.size() : std::min(cluster.size(), config.hibf_config.tmax); + for (size_t uidx = 0; uidx < end; ++uidx) + { + size_t const user_bin_idx = cluster[uidx]; + // if a single cluster already exceeds the cardinality_per_part, + // then the remaining user bins of the cluster must spill over into the next partition + if ((uidx != 0 && (uidx % config.hibf_config.tmax == 0)) || partition_sizes[p] > merged_threshold) + { + ++p; + + if (p >= number_of_merged_tbs) + { + split_cluster = true; + end = uidx; + break; + } + } + + partition_sizes[p] += cardinalities[user_bin_idx]; + partitions[p].push_back(user_bin_idx); + max_partition_cardinality[p] = std::max(max_partition_cardinality[p], cardinalities[user_bin_idx]); + min_partition_cardinality[p] = std::min(min_partition_cardinality[p], cardinalities[user_bin_idx]); + } + + if (split_cluster) + { + std::vector remainder(cluster.begin() + end, cluster.end()); + remaining_clusters.insert(remaining_clusters.end(), remainder); + } + + ++cidx; + } + + for (size_t i = cidx; i < clusters.size(); ++i) + { + if (clusters[i].empty()) + break; + + remaining_clusters.insert(remaining_clusters.end(), clusters[i].contained_user_bins()); + } + + // assign the rest by similarity + max_merged_size = merged_threshold; + for (size_t ridx = 0; ridx < remaining_clusters.size(); ++ridx) + { + auto const & cluster = remaining_clusters[ridx]; + + config.search_partition_algorithm_timer.start(); + find_best_sorted_partition(config, number_of_merged_tbs, max_merged_size, cluster, cardinalities, sketches, partitions, partition_sizes, max_partition_cardinality, min_partition_cardinality); + config.search_partition_algorithm_timer.start(); + + } + } + else if (config.partitioning_approach == partitioning_scheme::lsh_sim) + { + // determine number of split bins + std::vector const sorted_positions2(sorted_positions.begin() + idx, sorted_positions.end()); + + // distribute the rest to merged bins + size_t const 
corrected_max_split_size = max_split_size / relaxed_fpr_correction; + size_t const merged_threshold = std::max(corrected_max_split_size, split_threshold); + + max_merged_size = lsh_sim_approach(config, sorted_positions2, cardinalities, sketches, minHash_sketches, partitions, number_of_merged_tbs, merged_threshold, sum_of_cardinalities); + } + + }; + + parition_split_bins(); + partitions_merged_bins(); + + int64_t const difference = static_cast(max_merged_size * relaxed_fpr_correction) - static_cast(max_split_size); + +std::cout << "number_of_split_tbs:" << number_of_split_tbs << " difference:" << difference << std::endl; +std::cout << "Reconfiguring threshold. from:" << split_threshold; + + if (number_of_split_tbs == 0) + split_threshold = (split_threshold + max_merged_size) / 2; // increase threshold + else if (difference > 0) // need more merged bins -> increase threshold + split_threshold = static_cast(split_threshold) * ((static_cast(max_merged_size) * relaxed_fpr_correction) / static_cast(max_split_size)); + else // need more split bins -> decrease threshold + split_threshold = static_cast(split_threshold) * ((static_cast(max_merged_size) * relaxed_fpr_correction) / static_cast(max_split_size)); + +std::cout << " to:" << split_threshold << std::endl; + + // reset result + partitions.clear(); + partitions.resize(config.hibf_config.tmax); + idx = 0; + number_of_split_tbs = 0; + number_of_merged_tbs = config.hibf_config.tmax; + max_split_size = 0; + max_merged_size = 0; + + parition_split_bins(); + partitions_merged_bins(); + + // sanity check: + size_t sum{0}; + size_t last_index{positions.size()}; // non existing user bin idx + for (auto const & p : partitions) + { + if (p.size() == 1) + { + // for split bins, avoid additional counts + if (p[0] != last_index) + ++sum; + + last_index = p[0]; + } + else + { + sum += p.size(); + } + } + + if (sum != positions.size()) + { + std::string str{"Not all user bins have been assigned to the "}; + str += 
std::to_string(partitions.size()); + str += " partitions! ("; + str += std::to_string(sum); + str += "/"; + str += std::to_string(positions.size()); + str += ")\n"; + for (auto const & p : partitions) + { + str += "["; + for (auto const h : p) + { + str += std::to_string(h); + str += ","; + } + str.back() = ']'; + str += '\n'; + } + + throw std::logic_error{str}; + } + + // assert([&](){ bool x{false}; for (auto const & p : partitions) { x &= !p.empty(); }; return x; }); +} + +seqan::hibf::layout::layout general_layout(chopper::configuration const & config, + std::vector positions, + std::vector const & cardinalities, + std::vector const & sketches) +{ + seqan::hibf::layout::layout hibf_layout; + + seqan::hibf::concurrent_timer union_estimation_timer{}; + seqan::hibf::concurrent_timer rearrangement_timer{}; + seqan::hibf::concurrent_timer dp_algorithm_timer{}; + + dp_algorithm_timer.start(); + hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config, + cardinalities, + sketches, + std::move(positions), + union_estimation_timer, + rearrangement_timer); + dp_algorithm_timer.stop(); + + return hibf_layout; +} + +bool do_I_need_a_fast_layout(chopper::configuration const & config, std::vector const & positions, std::vector const & cardinalities) +{ + // the fast layout heuristic would greedily merge even if merging only 2 bins at a time + // merging only little number of bins is highly disadvantegous for lower levels because few bins + // will be heavily split and this will raise the fpr correction for split bins + // Thus, if the average number of user bins per technical bin is less then 64, we should not fast layout + if (positions.size() < (64 * config.hibf_config.tmax)) + return false; + + if (positions.size() > 500'000) // layout takes more than half a day (should this be a user option?) 
+ return true; + + size_t largest_size{0}; + size_t sum_of_cardinalities{0}; + + for (size_t const i : positions) + { + sum_of_cardinalities += cardinalities[i]; + largest_size = std::max(largest_size, cardinalities[i]); + } + + size_t const cardinality_per_tb = sum_of_cardinalities / config.hibf_config.tmax; + + bool const largest_user_bin_might_be_split = largest_size > cardinality_per_tb; + + // if no splitting is needed, its worth it to use a fast-merge-only algorithm + if (!largest_user_bin_might_be_split) + return true; + + return false; +} + +void add_level_to_layout(chopper::configuration const & config, + seqan::hibf::layout::layout & hibf_layout, + std::vector> const & partitions, + std::vector const & sketches, + std::vector const & previous) +{ + size_t max_bin_id{0}; + size_t max_size{0}; + + auto const split_fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .hash_count = config.hibf_config.number_of_hash_functions, + .t_max = partitions.size()}); + + double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .relaxed_fpr = config.hibf_config.relaxed_fpr, + .hash_count = config.hibf_config.number_of_hash_functions}); + + // we assume here that the user bins have been sorted by user bin id such that pos = idx + for (size_t partition_idx{0}; partition_idx < partitions.size(); ++partition_idx) + { + auto const & partition = partitions[partition_idx]; + + if (partition.size() > 1) // merged bin + { + seqan::hibf::sketch::hyperloglog current_sketch{sketches[0]}; // ensure same bit size + current_sketch.reset(); + + for (size_t const user_bin_id : partition) + { + assert(hibf_layout.user_bins[user_bin_id].idx == user_bin_id); + auto & current_user_bin = hibf_layout.user_bins[user_bin_id]; + + // update + assert(previous == current_user_bin.previous_TB_indices); + current_user_bin.previous_TB_indices.push_back(partition_idx); + 
current_sketch.merge(sketches[user_bin_id]); + } + + // update max_bin_id, max_size + size_t const current_size = current_sketch.estimate() * relaxed_fpr_correction; + if (current_size > max_size) + { + max_bin_id = partition_idx; + max_size = current_size; + } + } + else if (partition.size() == 0) // should not happen.. dge case? + { + continue; + } + else // single or split bin (partition.size() == 1) + { + auto & current_user_bin = hibf_layout.user_bins[partitions[partition_idx][0]]; + assert(current_user_bin.idx == partitions[partition_idx][0]); + current_user_bin.storage_TB_id = partition_idx; + current_user_bin.number_of_technical_bins = 1; // initialise to 1 + + while (partition_idx + 1 < partitions.size() && + partitions[partition_idx].size() == 1 && + partitions[partition_idx + 1].size() == 1 && + partitions[partition_idx][0] == partitions[partition_idx + 1][0]) + { + ++current_user_bin.number_of_technical_bins; + ++partition_idx; + } + + // update max_bin_id, max_size + size_t const current_size = sketches[current_user_bin.idx].estimate() * split_fpr_correction[current_user_bin.number_of_technical_bins]; + if (current_size > max_size) + { + max_bin_id = current_user_bin.storage_TB_id; + max_size = current_size; + } + } + } + + hibf_layout.max_bins.emplace_back(previous, max_bin_id); // add lower level meta information +} + +void update_layout_from_child_layout(seqan::hibf::layout::layout & child_layout, + seqan::hibf::layout::layout & hibf_layout, + std::vector const & new_previous) +{ + hibf_layout.max_bins.emplace_back(new_previous, child_layout.top_level_max_bin_id); + + for (auto & max_bin : child_layout.max_bins) + { + max_bin.previous_TB_indices.insert(max_bin.previous_TB_indices.begin(), new_previous.begin(), new_previous.end()); + hibf_layout.max_bins.push_back(max_bin); + } + + for (auto const & user_bin : child_layout.user_bins) + { + auto & actual_user_bin = hibf_layout.user_bins[user_bin.idx]; + + 
actual_user_bin.previous_TB_indices.insert(actual_user_bin.previous_TB_indices.end(), + user_bin.previous_TB_indices.begin(), + user_bin.previous_TB_indices.end()); + actual_user_bin.number_of_technical_bins = user_bin.number_of_technical_bins; + actual_user_bin.storage_TB_id = user_bin.storage_TB_id; + } +} + +void fast_layout_recursion(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + seqan::hibf::layout::layout & hibf_layout, + std::vector const & previous) +{ + std::vector> tmax_partitions(config.hibf_config.tmax); + + // here we assume that we want to start with a fast layout + partition_user_bins(config, positions, cardinalities, sketches, minHash_sketches, tmax_partitions); + + #pragma omp critical + { + add_level_to_layout(config, hibf_layout, tmax_partitions, sketches, previous); + } + + for (size_t partition_idx = 0; partition_idx < tmax_partitions.size(); ++partition_idx) + { + auto const & partition = tmax_partitions[partition_idx]; + auto const new_previous = [&] () {auto cpy{previous}; cpy.push_back(partition_idx); return cpy; }(); + + if (partition.empty() || partition.size() == 1) // nothing to merge + continue; + + if (do_I_need_a_fast_layout(config, partition, cardinalities)) + { + fast_layout_recursion(config, partition, cardinalities, sketches, minHash_sketches, hibf_layout, new_previous); // recurse fast_layout + } + else + { + auto child_layout = general_layout(config, partition, cardinalities, sketches); + + #pragma omp critical + { + update_layout_from_child_layout(child_layout, hibf_layout, new_previous); + } + } + } +} + +void fast_layout(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + seqan::hibf::layout::layout & hibf_layout) +{ + auto config_copy = config; + + std::vector> 
tmax_partitions(config.hibf_config.tmax); + + // here we assume that we want to start with a fast layout + config.intital_partition_timer.start(); + partition_user_bins(config_copy, positions, cardinalities, sketches, minHash_sketches, tmax_partitions); + config.intital_partition_timer.stop(); + + auto const split_fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .hash_count = config.hibf_config.number_of_hash_functions, + .t_max = config.hibf_config.tmax}); + + double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .relaxed_fpr = config.hibf_config.relaxed_fpr, + .hash_count = config.hibf_config.number_of_hash_functions}); + + size_t max_bin_id{0}; + size_t max_size{0}; + hibf_layout.user_bins.resize(config_copy.hibf_config.number_of_user_bins); + + // initialise user bins in layout + for (size_t partition_idx = 0; partition_idx < tmax_partitions.size(); ++partition_idx) + { + if (tmax_partitions[partition_idx].size() > 1) // merged bin + { + seqan::hibf::sketch::hyperloglog current_sketch{sketches[0]}; // ensure same bit size + current_sketch.reset(); + + for (size_t const user_bin_id : tmax_partitions[partition_idx]) + { + hibf_layout.user_bins[user_bin_id] = {.previous_TB_indices = {partition_idx}, + .storage_TB_id = 0 /*not determiend yet*/, + .number_of_technical_bins = 1 /*not determiend yet*/, + .idx = user_bin_id}; + current_sketch.merge(sketches[user_bin_id]); + } + + // update max_bin_id, max_size + size_t const current_size = current_sketch.estimate() * relaxed_fpr_correction; + if (current_size > max_size) + { + max_bin_id = partition_idx; + max_size = current_size; + } + } + else if (tmax_partitions[partition_idx].size() == 0) // should not happen. Edge case? 
+ { + continue; + } + else // single or split bin (tmax_partitions[partition_idx].size() == 1) + { + assert(tmax_partitions[partition_idx].size() == 1); + size_t const user_bin_id = tmax_partitions[partition_idx][0]; + hibf_layout.user_bins[user_bin_id] = {.previous_TB_indices = {}, + .storage_TB_id = partition_idx, + .number_of_technical_bins = 1 /*determiend below*/, + .idx = user_bin_id}; + + while (partition_idx + 1 < tmax_partitions.size() && + tmax_partitions[partition_idx].size() == 1 && + tmax_partitions[partition_idx + 1].size() == 1 && + tmax_partitions[partition_idx][0] == tmax_partitions[partition_idx + 1][0]) + { + ++hibf_layout.user_bins[user_bin_id].number_of_technical_bins; + ++partition_idx; + } + + // update max_bin_id, max_size + size_t const current_size = sketches[user_bin_id].estimate() * split_fpr_correction[hibf_layout.user_bins[user_bin_id].number_of_technical_bins]; + if (current_size > max_size) + { + max_bin_id = hibf_layout.user_bins[user_bin_id].storage_TB_id; + max_size = current_size; + } + } + } + + hibf_layout.top_level_max_bin_id = max_bin_id; + + config.small_layouts_timer.start(); + #pragma omp parallel + #pragma omp single + { + #pragma omp taskloop + for (size_t partition_idx = 0; partition_idx < tmax_partitions.size(); ++partition_idx) + { + auto const & partition = tmax_partitions[partition_idx]; + + if (partition.empty() || partition.size() == 1) // nothing to merge + continue; + + if (do_I_need_a_fast_layout(config_copy, partition, cardinalities)) + { + fast_layout_recursion(config_copy, partition, cardinalities, sketches, minHash_sketches, hibf_layout, {partition_idx}); // recurse fast_layout + } + else + { + auto small_layout = general_layout(config, partition, cardinalities, sketches); + + #pragma omp critical + { + update_layout_from_child_layout(small_layout, hibf_layout, std::vector{partition_idx}); + } + } + } + } + config.small_layouts_timer.stop(); +} + +int execute(chopper::configuration & config, + std::vector> 
const & filenames, + std::vector const & sketches, + std::vector const & minHash_sketches) +{ + config.hibf_config.validate_and_set_defaults(); + + std::vector cardinalities; + seqan::hibf::sketch::estimate_kmer_counts(sketches, cardinalities); + + if (true) // 0 == unset == single HIBF, 1 == single HIBF + { + seqan::hibf::layout::layout hibf_layout; + + + if (config.determine_best_tmax) + { + hibf_layout = determine_best_number_of_technical_bins(config, cardinalities, sketches); + } + else + { + config.dp_algorithm_timer.start(); + fast_layout(config, seqan::hibf::iota_vector(sketches.size()), cardinalities, sketches, minHash_sketches, hibf_layout); + config.dp_algorithm_timer.stop(); + + // hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config, + // cardinalities, + // sketches, + // seqan::hibf::iota_vector(sketches.size()), + // config.union_estimation_timer, + // config.rearrangement_timer); + + // sort records ascending by the number of bin indices (corresponds to the IBF levels) + // GCOVR_EXCL_START + std::ranges::sort(hibf_layout.max_bins, + [](auto const & r, auto const & l) + { + if (r.previous_TB_indices.size() == l.previous_TB_indices.size()) + return std::ranges::lexicographical_compare(r.previous_TB_indices, l.previous_TB_indices); + else + return r.previous_TB_indices.size() < l.previous_TB_indices.size(); + }); + // GCOVR_EXCL_STOP + + if (config.output_verbose_statistics) + { + size_t dummy{}; + chopper::layout::hibf_statistics global_stats{config, sketches, cardinalities}; + global_stats.hibf_layout = hibf_layout; + global_stats.print_header_to(std::cout); + global_stats.print_summary_to(dummy, std::cout); + } + } + + // brief Write the output to the layout file. 
+ std::ofstream fout{config.output_filename}; + chopper::layout::write_user_bins_to(filenames, fout); + config.write_to(fout); + hibf_layout.write_to(fout); + } + else + { + std::vector> positions(config.hibf_config.tmax); // asign positions for each partition + + std::vector ps; + ps.resize(cardinalities.size()); + std::iota(ps.begin(), ps.end(), 0); + partition_user_bins(config, ps, cardinalities, sketches, minHash_sketches, positions); + + std::vector hibf_layouts(config.hibf_config.tmax); // multiple layouts + +#pragma omp parallel for schedule(dynamic) num_threads(config.hibf_config.threads) + for (size_t i = 0; i < config.hibf_config.tmax; ++i) + { + // reset tmax to fit number of user bins in layout + auto local_hibf_config = config.hibf_config; // every thread needs to set individual tmax + local_hibf_config.tmax = + chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(positions[i].size())))); + + config.dp_algorithm_timer.start(); + hibf_layouts[i] = seqan::hibf::layout::compute_layout(local_hibf_config, + cardinalities, + sketches, + std::move(positions[i]), + config.union_estimation_timer, + config.rearrangement_timer); + config.dp_algorithm_timer.stop(); + } + + // brief Write the output to the layout file. 
+ std::ofstream fout{config.output_filename}; + chopper::layout::write_user_bins_to(filenames, fout); + config.write_to(fout); + + for (size_t i = 0; i < config.hibf_config.tmax; ++i) + hibf_layouts[i].write_to(fout); + } return 0; } diff --git a/src/layout/hibf_statistics.cpp b/src/layout/hibf_statistics.cpp index df86775c..e171ed93 100644 --- a/src/layout/hibf_statistics.cpp +++ b/src/layout/hibf_statistics.cpp @@ -398,7 +398,7 @@ void hibf_statistics::collect_bins() # pragma GCC diagnostic ignored "-Warray-bounds=" # pragma GCC diagnostic ignored "-Wstringop-overflow=" #endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV - ibf.bins.emplace_back(hibf_statistics::bin_kind::merged, 1, std::vector{user_bin_info.idx}); + // ibf.bins.emplace_back(hibf_statistics::bin_kind::merged, 1, std::vector{user_bin_info.idx}); #if CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV # pragma GCC diagnostic pop #endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV From 4aefac2aee107d2d3c67bd58d39604a71bb7bf05 Mon Sep 17 00:00:00 2001 From: smehringer Date: Thu, 24 Jul 2025 21:44:50 +0200 Subject: [PATCH 06/18] delete worse approaches --- src/layout/execute.cpp | 324 ----------------------------------------- 1 file changed, 324 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index fd02cd0d..3c323d5f 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -1071,328 +1071,6 @@ void partition_user_bins(chopper::configuration const & config, }; auto partitions_merged_bins = [&]() - { - - if (config.partitioning_approach == partitioning_scheme::blocked) - { - size_t const estimated_tb_size = [&]() - { - size_t sum{0}; - - for (size_t pos{idx}; pos < sorted_positions.size(); ++pos) - sum += cardinalities[sorted_positions[pos]]; - - return seqan::hibf::divide_and_ceil(sum, number_of_merged_tbs); - }(); - - size_t current_part{0}; - size_t current_sum{0}; - - for (;idx < sorted_positions.size(); ++idx) - { - size_t const current_user_bin_id{sorted_positions[idx]}; - 
partitions[current_part].push_back(current_user_bin_id); - current_sum += cardinalities[current_user_bin_id]; - - if (current_sum >= estimated_tb_size && current_part < number_of_merged_tbs) - { - max_merged_size = std::max(current_sum, max_merged_size); - ++current_part; - current_sum = 0; - assert(current_part < number_of_merged_tbs); - } - } - } - else if (config.partitioning_approach == partitioning_scheme::sorted) - { - size_t current_tb_size_threshold = split_threshold; - - size_t current_part{0}; - seqan::hibf::sketch::hyperloglog current_sketch{config.hibf_config.sketch_bits}; - std::vector partition_sketches(config.hibf_config.tmax, current_sketch); - - for (;idx < sorted_positions.size(); ++idx) - { - size_t const current_user_bin_id{sorted_positions[idx]}; - partitions[current_part].push_back(current_user_bin_id); - current_sketch.merge(sketches[current_user_bin_id]); - - if (current_sketch.estimate() >= current_tb_size_threshold) - { - max_merged_size = std::max((size_t)current_sketch.estimate(), max_merged_size); - partition_sketches[current_part] = current_sketch; - current_sketch = seqan::hibf::sketch::hyperloglog{config.hibf_config.sketch_bits}; - ++current_part; - - if (current_part >= number_of_merged_tbs) - { - current_tb_size_threshold *= 1 + static_cast(idx) / sorted_positions.size(); - current_part = 0; // fill up from the start agagin as the threshold increased - } - } - } - } - else if (config.partitioning_approach == partitioning_scheme::folded) - { - } - else if (config.partitioning_approach == partitioning_scheme::weighted_fold) - { - size_t const cardinality_per_part = - seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.hibf_config.tmax); - size_t const u_bins_per_part = seqan::hibf::divide_and_ceil(cardinalities.size(), config.hibf_config.tmax); - - size_t current_big_pos{0}; // the next largest user bin to assign to a partition - size_t current_small_pos{cardinalities.size() - 1}; // the next small user bin - - for (size_t 
current_part = 0; current_part + 1 < config.hibf_config.tmax; ++current_part) - { - size_t current_cardinality{0}; - std::vector small_bins; - size_t new_small_bin_addition{0}; - - auto compute_score = [&]() - { - double const weight = static_cast(current_cardinality) / cardinality_per_part; - double const amount = - static_cast(partitions[current_part].size() + small_bins.size() + new_small_bin_addition) - / u_bins_per_part; - return std::abs(1.0 - weight) + std::abs(1.0 - amount); - }; - - // first add all large bins that fit - while (current_cardinality < cardinality_per_part) - { - partitions[current_part].push_back(sorted_positions[current_big_pos]); - current_cardinality += cardinalities[sorted_positions[current_big_pos]]; - ++current_big_pos; - } - - double local_optimum = compute_score(); - - // then remove big bins and add small bins until a local optima is reached - while (true) - { - size_t const cache_last_small_pos{current_small_pos}; - // remove a big user bin and fill the partition with small user bins - current_cardinality -= cardinalities[sorted_positions[current_big_pos]]; - - // can we further improve the ratio by adding more small bins? - double improved_score{}; - do - { - improved_score = compute_score(); - current_cardinality += cardinalities[sorted_positions[current_small_pos]]; - --current_small_pos; - ++new_small_bin_addition; - } - while (compute_score() < improved_score); // smaller is better - // remove overstep - ++current_small_pos; - current_cardinality -= cardinalities[sorted_positions[current_small_pos]]; - --new_small_bin_addition; - - if (local_optimum < compute_score()) // score would increase. 
Stop - { - current_small_pos = cache_last_small_pos; - break; - } - else // update - { - partitions[current_part].pop_back(); - --current_big_pos; - for (size_t pos = cache_last_small_pos; pos > current_small_pos; --pos) - small_bins.push_back(sorted_positions[pos]); - } - } - partitions[current_part].insert(partitions[current_part].end(), small_bins.begin(), small_bins.end()); - } - - // remaining user bins go to last partition - while (current_big_pos <= current_small_pos) - { - partitions[config.hibf_config.tmax - 1].push_back(sorted_positions[current_big_pos]); - ++current_big_pos; - } - } - else if (config.partitioning_approach == partitioning_scheme::similarity) - { - size_t const number_of_remaining_ubs = positions.size() - idx; - uint8_t const sketch_bits{config.hibf_config.sketch_bits}; - std::vector partition_sketches(number_of_merged_tbs, - seqan::hibf::sketch::hyperloglog(sketch_bits)); - - size_t const corrected_max_split_size = max_split_size / relaxed_fpr_correction; - size_t merged_threshold = std::max(corrected_max_split_size, split_threshold); - - size_t const u_bins_per_part = number_of_remaining_ubs / number_of_merged_tbs; // round down - size_t const block_size = - std::min(u_bins_per_part, - chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(u_bins_per_part))))); - size_t const number_of_blocks = seqan::hibf::divide_and_ceil(number_of_remaining_ubs, block_size); - - std::vector max_partition_cardinality(number_of_merged_tbs, 0u); - std::vector min_partition_cardinality(number_of_merged_tbs, std::numeric_limits::max()); - - // don't move from largest to smallest but pick the next block to process randomly. 
- // this probably leads to more evenly distributed partitions (evenly in terms of number of user bins) - std::vector indices(number_of_blocks); - std::iota(indices.begin(), indices.end(), 0); - std::random_device shuffle_random_device; - std::mt19937 shuffle_engine(shuffle_random_device()); - std::shuffle(indices.begin(), indices.end(), shuffle_engine); - - // initialise partitions with the first random number_of_merged_tbs blocks - assert(number_of_blocks >= number_of_merged_tbs); - for (size_t p = 0; p < number_of_merged_tbs; ++p) - { - size_t const i = indices[p]; - size_t const end = (i == (indices.size() - 1) && (number_of_remaining_ubs % block_size != 0)) ? (number_of_remaining_ubs % block_size) : block_size; - for (size_t x = 0; x < end; ++x) - { - assert(idx + block_size * i + x < sorted_positions.size()); - size_t const user_bin_idx{sorted_positions[idx + block_size * i + x]}; - partition_sketches[p].merge(sketches[user_bin_idx]); - partitions[p].push_back(user_bin_idx); - max_partition_cardinality[p] = std::max(max_partition_cardinality[p], cardinalities[user_bin_idx]); - min_partition_cardinality[p] = std::min(min_partition_cardinality[p], cardinalities[user_bin_idx]); - } - } - - auto has_been_processed_in_init = [&](size_t const i) - { - bool result{false}; - for (size_t p = 0; p < number_of_merged_tbs; ++p) - result |= ((idx + indices[p] * block_size) == i); - return result; - }; - - // assign the rest by similarity, user in by user bin (single bins, no blocks) - for (size_t i = idx; i < sorted_positions.size(); ++i) - { - if (has_been_processed_in_init(i)) - { - i += (block_size - 1); // -1 because there will be an increment after continue - continue; - } - - find_best_partition(config, - number_of_merged_tbs, - merged_threshold, - {sorted_positions[i]}, - cardinalities, - sketches, - partitions, - partition_sketches, - max_partition_cardinality, - min_partition_cardinality); - } - - // compute actual max size - for (auto const & sketch : 
partition_sketches) - max_merged_size = std::max(max_merged_size, (size_t)sketch.estimate()); - } - else if (config.partitioning_approach == partitioning_scheme::lsh) - { - std::vector const sorted_positions2(sorted_positions.begin() + idx, sorted_positions.end()); - size_t const corrected_max_split_size = max_split_size / relaxed_fpr_correction; - size_t const merged_threshold = std::max(corrected_max_split_size, split_threshold); - - std::vector partition_sizes(number_of_merged_tbs, 0u); - - std::vector max_partition_cardinality(number_of_merged_tbs, 0u); - std::vector min_partition_cardinality(number_of_merged_tbs, std::numeric_limits::max()); - - // initial partitioning using locality sensitive hashing (LSH) - config.lsh_algorithm_timer.start(); - std::vector clusters = very_similar_LSH_partitioning(minHash_sketches, sorted_positions2, cardinalities, sketches, merged_threshold, config); - post_process_clusters(clusters, cardinalities, config); - config.lsh_algorithm_timer.stop(); - -std::ofstream ofs{"/tmp/final.clusters"}; -for (size_t i = 0; i < clusters.size(); ++i) -{ - seqan::hibf::sketch::hyperloglog sketch(config.hibf_config.sketch_bits); - for (size_t j = 0; j < clusters[i].size(); ++j) - { - sketch.merge(sketches[clusters[i].contained_user_bins()[j]]); - } - ofs << i << ":" << sketch.estimate() << ":" << clusters[i].size() << std::endl; -} - - std::vector> remaining_clusters{}; - - size_t cidx{0}; // current cluster index - // initialise partitions with the first p largest clusters (post_processing sorts by size) - for (size_t p = 0; p < number_of_merged_tbs; ++p) - { - assert(!clusters[cidx].empty()); - auto const & cluster = clusters[cidx].contained_user_bins(); - bool split_cluster = false; - - if (cluster.size() > config.hibf_config.tmax) - { - size_t card{0}; - for (size_t uidx = 0; uidx < cluster.size(); ++uidx) - card += cardinalities[cluster[uidx]]; - - if (card > 0.05 * sum_of_cardinalities / config.hibf_config.tmax) - split_cluster = true; - 
} - - size_t end = (split_cluster) ? cluster.size() : std::min(cluster.size(), config.hibf_config.tmax); - for (size_t uidx = 0; uidx < end; ++uidx) - { - size_t const user_bin_idx = cluster[uidx]; - // if a single cluster already exceeds the cardinality_per_part, - // then the remaining user bins of the cluster must spill over into the next partition - if ((uidx != 0 && (uidx % config.hibf_config.tmax == 0)) || partition_sizes[p] > merged_threshold) - { - ++p; - - if (p >= number_of_merged_tbs) - { - split_cluster = true; - end = uidx; - break; - } - } - - partition_sizes[p] += cardinalities[user_bin_idx]; - partitions[p].push_back(user_bin_idx); - max_partition_cardinality[p] = std::max(max_partition_cardinality[p], cardinalities[user_bin_idx]); - min_partition_cardinality[p] = std::min(min_partition_cardinality[p], cardinalities[user_bin_idx]); - } - - if (split_cluster) - { - std::vector remainder(cluster.begin() + end, cluster.end()); - remaining_clusters.insert(remaining_clusters.end(), remainder); - } - - ++cidx; - } - - for (size_t i = cidx; i < clusters.size(); ++i) - { - if (clusters[i].empty()) - break; - - remaining_clusters.insert(remaining_clusters.end(), clusters[i].contained_user_bins()); - } - - // assign the rest by similarity - max_merged_size = merged_threshold; - for (size_t ridx = 0; ridx < remaining_clusters.size(); ++ridx) - { - auto const & cluster = remaining_clusters[ridx]; - - config.search_partition_algorithm_timer.start(); - find_best_sorted_partition(config, number_of_merged_tbs, max_merged_size, cluster, cardinalities, sketches, partitions, partition_sizes, max_partition_cardinality, min_partition_cardinality); - config.search_partition_algorithm_timer.start(); - - } - } - else if (config.partitioning_approach == partitioning_scheme::lsh_sim) { // determine number of split bins std::vector const sorted_positions2(sorted_positions.begin() + idx, sorted_positions.end()); @@ -1402,8 +1080,6 @@ for (size_t i = 0; i < clusters.size(); 
++i) size_t const merged_threshold = std::max(corrected_max_split_size, split_threshold); max_merged_size = lsh_sim_approach(config, sorted_positions2, cardinalities, sketches, minHash_sketches, partitions, number_of_merged_tbs, merged_threshold, sum_of_cardinalities); - } - }; parition_split_bins(); From f323b4f1bdba6792eea535fea0995fde6542a583 Mon Sep 17 00:00:00 2001 From: smehringer Date: Thu, 24 Jul 2025 21:50:24 +0200 Subject: [PATCH 07/18] Remove trial functions linked to now unused approaches. --- src/layout/execute.cpp | 407 ----------------------------------------- 1 file changed, 407 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 3c323d5f..a07d8807 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -120,124 +120,6 @@ auto LSH_fill_hashtable(std::vector const & clusters, // Vector L1 : number of user bins // Vector L2 : number_of_max_minHash_sketches (LSH ADD+OR parameter b) // Vector L3 : minHash_sketche_size (LSH ADD+OR parameter r) -std::vector initital_LSH_partitioning(std::vector const & minHash_sketches, - std::vector const & positions, - std::vector const & cardinalities, - std::vector const & sketches, - size_t const average_technical_bin_size, - chopper::configuration const & config) -{ - assert(!minHash_sketches.empty()); - assert(!minHash_sketches[0].table.empty()); - assert(!minHash_sketches[0].table[0].empty()); - - size_t const number_of_user_bins{positions.size()}; - size_t const number_of_max_minHash_sketches{seqan::hibf::sketch::minhashes::num_sketches}; // LSH ADD+OR parameter b - size_t const minHash_sketche_size{seqan::hibf::sketch::minhashes::sketch_size}; // LSH ADD+OR parameter r - seqan::hibf::sketch::hyperloglog const empty_sketch{config.hibf_config.sketch_bits}; - - // initialise clusters with a signle user bin per cluster. - // clusters are either - // 1) of size 1; containing an id != position where the id points to the cluster it has been moved to - // e.g. 
cluster[Y] = {Z} (Y has been moved into Z, so Z could look likes this cluster[Z] = {Z, Y}) - // 2) of size >= 1; with the first entry beging id == position (a valid cluster) - // e.g. cluster[X] = {X} // valid singleton - // e.g. cluster[X] = {X, a, b, c, ...} // valid cluster with more joined entries - // The clusters could me moved recursively, s.t. - // cluster[A] = {B} - // cluster[B] = {C} - // cluster[C] = {C, A, B} // is valid cluster since cluster[C][0] == C; contains A and B - std::vector clusters; - clusters.reserve(number_of_user_bins); - - std::vector current_cluster_cardinality(number_of_user_bins); - std::vector current_cluster_sketches(number_of_user_bins, empty_sketch); - size_t current_max_cluster_size{0}; - size_t current_number_of_sketch_hashes{minHash_sketche_size}; // start with high r but decrease it iteratively - size_t current_sketch_index{0}; - size_t current_number_of_clusters{number_of_user_bins}; // initially, each UB is a separate cluster - - for (size_t pos = 0; pos < number_of_user_bins; ++pos) - { - clusters.emplace_back(pos, positions[pos]); // id, user_bin_idx - current_cluster_cardinality[pos] = cardinalities[positions[pos]]; - current_cluster_sketches[pos] = sketches[positions[pos]]; - current_max_cluster_size = std::max(current_max_cluster_size, cardinalities[positions[pos]]); - } - - // refine clusters -//std::cout << "Start clustering with threshold average_technical_bin_size: " << average_technical_bin_size << std::endl; - while (current_max_cluster_size < average_technical_bin_size && /*number_of_clusters / static_cast(number_of_user_bins) > 0.5 &&*/ - current_sketch_index < number_of_max_minHash_sketches) // I want to cluster 10%? 
- { -//std::cout << "Current number of clusters: " << current_number_of_clusters; - - // fill LSH collision hashtable - robin_hood::unordered_flat_map> table = - LSH_fill_hashtable(clusters, minHash_sketches, current_sketch_index, current_number_of_sketch_hashes); - - // read out LSH collision hashtable - // for each present key, if the list contains more than one cluster, we merge everything contained in the list - // into the first cluster, since those clusters collide and should be joined in the same bucket - for (auto & [key, list] : table) - { - assert(!list.empty()); - - // uniquify list. Since I am inserting representative_idx's into the table, the same number can - // be inserted into multiple splots, and multiple times in the same slot. - std::sort(list.begin(), list.end()); - auto const end = std::unique(list.begin(), list.end()); - auto const begin = list.begin(); - - if (end - begin <= 1) // nothing to do here - continue; - - // Now combine all clusters into the first. - - // 1) find the representative cluster to merge everything else into - // It can happen, that the representative has already been joined with another cluster - // e.g. - // [key1] = {0,11} // then clusters[11] is merged into clusters[0] - // [key2] = {11,13} // now I want to merge clusters[13] into clusters[11] but the latter has been moved - size_t const representative_cluster_id = LSH_find_representative_cluster(clusters, *begin); - auto & representative_cluster = clusters[representative_cluster_id]; - assert(representative_cluster.is_valid(representative_cluster_id)); - assert(representative_cluster.id() == clusters[representative_cluster.id()].id()); - - for (auto current = begin + 1; current < end; ++current) - { - // For every other entry in the list, it can happen that I already joined that list with another - // e.g. 
- // [key1] = {0,11} // then clusters[11] is merged into clusters[0] - // [key2] = {0, 2, 11} // now I want to do it again - size_t const next_cluster_id = LSH_find_representative_cluster(clusters, *current); - auto & next_cluster = clusters[next_cluster_id]; - - if (next_cluster.id() == representative_cluster.id()) // already joined - continue; - - next_cluster.move_to(representative_cluster); // otherwise join next_cluster into representative_cluster - assert(next_cluster.size() == 0); - assert(next_cluster.has_been_moved()); - assert(representative_cluster.size() > 1); // there should be at least two user bins now - assert(representative_cluster.is_valid(representative_cluster_id)); // and it should still be valid - - current_cluster_sketches[representative_cluster.id()].merge(current_cluster_sketches[next_cluster.id()]); - current_cluster_cardinality[representative_cluster.id()] = current_cluster_sketches[representative_cluster.id()].estimate(); - - --current_number_of_clusters; - } - - current_max_cluster_size = *std::ranges::max_element(current_cluster_cardinality); - } - - ++current_sketch_index; - -//std::cout << " and after this clustering step there are: " << current_number_of_clusters << "with max cluster size" << current_max_cluster_size << std::endl; - } - - return clusters; -} std::vector very_similar_LSH_partitioning(std::vector const & minHash_sketches, std::vector const & positions, @@ -361,112 +243,6 @@ std::vector very_similar_LSH_partitioning(std::vector most_distant_LSH_partitioning(std::vector const & initial_clusters, - std::vector const & minHash_sketches, - chopper::configuration const & config) -{ - assert(!minHash_sketches.empty()); - assert(!minHash_sketches[0].table.empty()); - assert(!minHash_sketches[0].table[0].empty()); - - size_t const number_of_user_bins{initial_clusters.size()}; - size_t const number_of_max_minHash_sketches{minHash_sketches[0].table.size()}; // LSH ADD+OR parameter b - // size_t const 
minHash_sketche_size{minHash_sketches[0][0].size()}; // LSH ADD+OR parameter r - - size_t current_number_of_sketch_hashes{5}; - size_t current_sketch_index{0}; - - size_t current_number_of_clusters = [&initial_clusters] () - { - size_t number{0}; - for (auto const & cluster : initial_clusters) - { - if (cluster.size()) - ++number; - } - return number; - }(); - - std::vector clusters; - clusters.reserve(number_of_user_bins); - - for (size_t pos = 0; pos < number_of_user_bins; ++pos) - { - clusters.emplace_back(initial_clusters[pos]); - assert(clusters[pos].is_valid(pos)); - } - - // refine clusters. I want to cluster as much as possible to initialise partitions with very distant clusters - while (current_number_of_clusters > (config.hibf_config.tmax * 2) && - current_sketch_index < number_of_max_minHash_sketches) - { -//std::cout << "[Dist] Current number of clusters: " << current_number_of_clusters << " sketch size:" << current_number_of_sketch_hashes << " "; - - // fill LSH collision hashtable - robin_hood::unordered_flat_map> table = - LSH_fill_hashtable(clusters, minHash_sketches, current_sketch_index, current_number_of_sketch_hashes); - - // read out LSH collision hashtable - // for each present key, if the list contains more than one cluster, we merge everything contained in the list - // into the first cluster, since those clusters collide and should be joined in the same bucket - for (auto & [key, list] : table) - { - assert(!list.empty()); - - // uniquify list. Since I am inserting representative_idx's into the table, the same number can - // be inserted into multiple splots, and multiple times in the same slot. - std::sort(list.begin(), list.end()); - auto const end = std::unique(list.begin(), list.end()); - auto const begin = list.begin(); - - if (end - begin <= 1) // nothing to do here - continue; - - // Now combine all clusters into the first. 
- - // 1) find the representative cluster to merge everything else into - // It can happen, that the representative has already been joined with another cluster - // e.g. - // [key1] = {0,11} // then clusters[11] is merged into clusters[0] - // [key2] = {11,13} // now I want to merge clusters[13] into clusters[11] but the latter has been moved - size_t const representative_cluster_id = LSH_find_representative_cluster(clusters, *begin); - auto & representative_cluster = clusters[representative_cluster_id]; - assert(representative_cluster.is_valid(representative_cluster_id)); - assert(representative_cluster.id() == clusters[representative_cluster.id()].id()); - - for (auto current = begin + 1; current < end; ++current) - { - // For every other entry in the list, it can happen that I already joined that list with another - // e.g. - // [key1] = {0,11} // then clusters[11] is merged into clusters[0] - // [key2] = {0, 2, 11} // now I want to do it again - size_t const next_cluster_id = LSH_find_representative_cluster(clusters, *current); - auto & next_cluster = clusters[next_cluster_id]; - - if (next_cluster.id() == representative_cluster.id()) // already joined - continue; - - next_cluster.move_to(representative_cluster); // otherwise join next_cluster into representative_cluster - assert(next_cluster.size() == 0); - assert(next_cluster.has_been_moved()); - assert(representative_cluster.size() > 1); // there should be at least two user bins now - assert(representative_cluster.is_valid(representative_cluster_id)); // and it should still be valid - - --current_number_of_clusters; - } - } - - ++current_sketch_index; - - if (current_sketch_index % 4 == 0 && current_number_of_sketch_hashes > 1) - --current_number_of_sketch_hashes; - -//std::cout << " and after this clustering step there are: " << current_number_of_clusters << std::endl; - } - - return clusters; -} - void post_process_clusters(std::vector & clusters, std::vector const & cardinalities, chopper::configuration 
const & config) @@ -518,81 +294,6 @@ void post_process_clusters(std::vector & clusters, // debug } -void post_process_clusters(std::vector & clusters, - std::vector const & cardinalities, - chopper::configuration const & config) -{ - // clusters are done. Start post processing - // since post processing involves re-ordering the clusters, the moved_to_cluster_id value of a cluster will not - // refer to the position of the cluster in the `clusters` vecto anymore but the cluster with the resprive id() - // would neet to be found - - for (size_t pos = 0; pos < clusters.size(); ++pos) - { - assert(clusters[pos].is_valid(pos)); - clusters[pos].sort_by_cardinality(cardinalities); - } - - // push largest p clusters to the front - auto cluster_size_cmp = [&cardinalities](auto const & mc1, auto const & mc2) - { - if (mc1.empty()) - return false; // mc1 can never be larger than mc2 then - if (mc2.empty()) // and mc1 is not, since the first if would catch - return true; - - auto const & largest_c1 = mc1.contained_user_bins()[0]; // first cluster is the largest because of sorting above - auto const & largest_c2 = mc2.contained_user_bins()[0]; // first cluster is the largest because of sorting above - - assert(!largest_c1.empty()); // should never be empty - assert(!largest_c2.empty()); // should never be empty - - if (largest_c2.size() == largest_c1.size()) - return cardinalities[largest_c2[0]] < cardinalities[largest_c1[0]]; - return largest_c2.size() < largest_c1.size(); - }; - std::partial_sort(clusters.begin(), clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), clusters.end(), cluster_size_cmp); - - auto move_empty_clusters_to_the_end = [](auto const & v1, auto const & v2) - { - return v2.size() < v1.size(); - }; - std::sort(clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), clusters.end(), move_empty_clusters_to_the_end); - -// debug sanity check - for (size_t cidx = 1; cidx < clusters.size(); ++cidx) - { - if (clusters[cidx - 
1].empty() && !clusters[cidx].empty()) // once empty - always empty; all empty clusters should be at the end - throw std::runtime_error{"sorting did not work"}; - } -// debug -} - -std::vector LSH_partitioning(std::vector const & minHash_sketches, - std::vector const & positions, - std::vector const & cardinalities, - std::vector const & sketches, - size_t const average_technical_bin_size, - chopper::configuration const & config) -{ - std::vector clusters = initital_LSH_partitioning(minHash_sketches, positions, cardinalities, sketches, average_technical_bin_size, config); - post_process_clusters(clusters, cardinalities, config); - return clusters; -} - -std::vector sim_dist_LSH_partitioning(std::vector const & minHash_sketches, - std::vector const & positions, - std::vector const & cardinalities, - std::vector const & sketches, - size_t const average_technical_bin_size, - chopper::configuration const & config) -{ - std::vector clusters = very_similar_LSH_partitioning(minHash_sketches, positions, cardinalities, sketches, average_technical_bin_size, config); - std::vector multi_clusters = most_distant_LSH_partitioning(clusters, minHash_sketches, config); - post_process_clusters(multi_clusters, cardinalities, config); - return multi_clusters; -} - bool find_best_partition(chopper::configuration const & config, size_t const number_of_partitions, size_t & corrected_estimate_per_part, @@ -721,114 +422,6 @@ bool find_best_partition(chopper::configuration const & config, return true; } -bool find_best_sorted_partition(chopper::configuration const & config, - size_t const number_of_partitions, - size_t & corrected_estimate_per_part, - std::vector const & cluster, - std::vector const & cardinalities, - std::vector const & /*sketches*/, - std::vector> & partitions, - std::vector & partition_sizes, - std::vector & max_partition_cardinality, - std::vector & min_partition_cardinality) -{ - size_t max_card = [&cardinalities, &cluster] () - { - size_t max{0}; - - for (size_t const 
user_bin_idx : cluster) - max = std::max(max, cardinalities[user_bin_idx]); - - return max; - }(); - - size_t sum_card = [&cardinalities, &cluster] () - { - size_t sum{0}; - - for (size_t const user_bin_idx : cluster) - sum += cardinalities[user_bin_idx]; - - return sum; - }(); - - size_t smallest_change{std::numeric_limits::max()}; - size_t best_p{0}; - bool best_p_found{false}; - - auto penalty_lower_level = [&] (size_t const additional_number_of_user_bins, size_t const p) - { - size_t min = std::min(max_card, min_partition_cardinality[p]); - size_t max = std::max(max_card, max_partition_cardinality[p]); - - if (partitions[p].size() > config.hibf_config.tmax) // already a third level - { - // if there must already be another lower level because the current merged bin contains more than tmax - // user bins, then the current user bin is very likely stored multiple times. Therefore, the penalty is set - // to the cardinality of the current user bin times the number of levels, e.g. the number of times this user - // bin needs to be stored additionally - size_t const num_ubs_in_merged_bin{partitions[p].size() + additional_number_of_user_bins}; - double const levels = std::log(num_ubs_in_merged_bin) / std::log(config.hibf_config.tmax); - return static_cast(max_card * levels); - } - else if (partitions[p].size() + additional_number_of_user_bins > config.hibf_config.tmax) - { - // if the current merged bin contains exactly tmax UBS, adding otherone must - // result in another lower level. Most likely, the smallest user bin will end up on the lower level - // therefore the penalty is set to 'min * tmax' - // of course, there could also be a third level with a lower number of user bins, but this is hard to - // estimate. 
- size_t const penalty = min * config.hibf_config.tmax; - return penalty; - } - else // partitions[p].size() + additional_number_of_user_bins <= tmax - { - // if the new user bin is smaller than all other already contained user bins - // the waste of space is high if stored in a single technical bin - if (max_card < min) - return (max - max_card); - // if the new user bin is bigger than all other already contained user bins, the IBF size increases - else if (max_card > max) - return (max_card - max) * config.hibf_config.tmax; - // else, end if-else-block and zero is returned - } - - return (size_t)0u; - }; - - for (size_t p = 0; p < number_of_partitions; ++p) - { - size_t const sum = partition_sizes[p] + sum_card; - size_t const penalty_current_ibf = config.hibf_config.tmax * ((sum <= corrected_estimate_per_part) ? 0u : sum - corrected_estimate_per_part); - size_t const change = penalty_current_ibf + penalty_lower_level(cluster.size(), p); - - if (change == 0 || /* If there is no penalty at all, this is a best fit even if the partition is "full"*/ - (smallest_change > change)) - { - smallest_change = change; - best_p = p; - best_p_found = true; - } - } - - if (!best_p_found) - return false; - -//std::cout << "best_p:" << best_p << std::endl<< std::endl; - - // now that we know which partition fits best (`best_p`), add those indices to it - for (size_t const user_bin_idx : cluster) - { - partitions[best_p].push_back(user_bin_idx); - partition_sizes[best_p] += cardinalities[user_bin_idx]; - max_partition_cardinality[best_p] = std::max(max_partition_cardinality[best_p], cardinalities[user_bin_idx]); - min_partition_cardinality[best_p] = std::min(min_partition_cardinality[best_p], cardinalities[user_bin_idx]); - } - corrected_estimate_per_part = std::max(corrected_estimate_per_part, static_cast(partition_sizes[best_p])); - - return true; -} - size_t split_bins(chopper::configuration const & config, std::vector const & sorted_positions, std::vector const & 
cardinalities, From c36dbdf91028c8c1fecd276645cb2fe196ec6a2a Mon Sep 17 00:00:00 2001 From: "seqan-actions[bot]" Date: Fri, 25 Jul 2025 20:07:45 +0200 Subject: [PATCH 08/18] [MISC] automatic linting --- .../chopper/layout/determine_split_bins.hpp | 10 +- include/chopper/lsh.hpp | 34 +- src/layout/CMakeLists.txt | 10 +- src/layout/determine_split_bins.cpp | 21 +- src/layout/execute.cpp | 374 +++++++------ src/layout/hibf_statistics.cpp | 4 +- test/api/layout/determine_split_bins_test.cpp | 23 +- test/api/layout/fast_layout_test.cpp | 497 +++++++++--------- test/api/lsh_test.cpp | 1 - 9 files changed, 536 insertions(+), 438 deletions(-) diff --git a/include/chopper/layout/determine_split_bins.hpp b/include/chopper/layout/determine_split_bins.hpp index dee17b24..91400e42 100644 --- a/include/chopper/layout/determine_split_bins.hpp +++ b/include/chopper/layout/determine_split_bins.hpp @@ -20,10 +20,10 @@ namespace chopper::layout { std::pair determine_split_bins(chopper::configuration const & config, - std::vector const & positions, - std::vector const & cardinalities, - size_t const num_technical_bins, - size_t const num_user_bins, - std::vector> & partitions); + std::vector const & positions, + std::vector const & cardinalities, + size_t const num_technical_bins, + size_t const num_user_bins, + std::vector> & partitions); } \ No newline at end of file diff --git a/include/chopper/lsh.hpp b/include/chopper/lsh.hpp index af67bfa9..5ef11a75 100644 --- a/include/chopper/lsh.hpp +++ b/include/chopper/lsh.hpp @@ -30,10 +30,10 @@ struct Cluster public: Cluster() = default; - Cluster(const Cluster&) = default; - Cluster(Cluster&&) = default; - Cluster& operator=(const Cluster&) = default; - Cluster& operator=(Cluster&&) = default; + Cluster(Cluster const &) = default; + Cluster(Cluster &&) = default; + Cluster & operator=(Cluster const &) = default; + Cluster & operator=(Cluster &&) = default; ~Cluster() = default; Cluster(size_t const id) @@ -98,7 +98,12 @@ struct Cluster 
void sort_by_cardinality(std::vector const & cardinalities) { - std::sort(user_bins.begin(), user_bins.end(), [&cardinalities](auto const & v1, auto const & v2){ return cardinalities[v2] < cardinalities[v1]; }); + std::sort(user_bins.begin(), + user_bins.end(), + [&cardinalities](auto const & v1, auto const & v2) + { + return cardinalities[v2] < cardinalities[v1]; + }); } }; @@ -109,10 +114,10 @@ struct MultiCluster : Cluster public: MultiCluster() = default; - MultiCluster(const MultiCluster&) = default; - MultiCluster(MultiCluster&&) = default; - MultiCluster& operator=(const MultiCluster&) = default; - MultiCluster& operator=(MultiCluster&&) = default; + MultiCluster(MultiCluster const &) = default; + MultiCluster(MultiCluster &&) = default; + MultiCluster & operator=(MultiCluster const &) = default; + MultiCluster & operator=(MultiCluster &&) = default; ~MultiCluster() = default; MultiCluster(Cluster const & clust) @@ -164,16 +169,21 @@ struct MultiCluster : Cluster // sort user bins within a Cluster by cardinality and the clusters themselves by size void sort_by_cardinality(std::vector const & cardinalities) { - auto cmp = [&cardinalities](auto const & v1, auto const & v2){ return cardinalities[v2] < cardinalities[v1]; }; + auto cmp = [&cardinalities](auto const & v1, auto const & v2) + { + return cardinalities[v2] < cardinalities[v1]; + }; for (auto & user_bin_cluster : user_bins) std::sort(user_bin_cluster.begin(), user_bin_cluster.end(), cmp); - auto cmp_clusters = [](auto const & c1, auto const & c2){ return c2.size() < c1.size(); }; + auto cmp_clusters = [](auto const & c1, auto const & c2) + { + return c2.size() < c1.size(); + }; std::sort(user_bins.begin(), user_bins.end(), cmp_clusters); } }; - // A valid cluster is one that hasn't been moved but actually contains user bins // A valid cluster at position i is identified by the following equality: cluster[i].size() >= 1 && cluster[i][0] == i // A moved cluster is one that has been joined and thereby 
moved to another cluster diff --git a/src/layout/CMakeLists.txt b/src/layout/CMakeLists.txt index 4c2542c8..f2b16735 100644 --- a/src/layout/CMakeLists.txt +++ b/src/layout/CMakeLists.txt @@ -4,8 +4,14 @@ if (TARGET chopper::layout) return () endif () -add_library (chopper_layout STATIC determine_best_number_of_technical_bins.cpp execute.cpp hibf_statistics.cpp - ibf_query_cost.cpp input.cpp output.cpp determine_split_bins.cpp +add_library (chopper_layout STATIC + determine_best_number_of_technical_bins.cpp + determine_split_bins.cpp + execute.cpp + hibf_statistics.cpp + ibf_query_cost.cpp + input.cpp + output.cpp ) target_link_libraries (chopper_layout PUBLIC chopper::shared) add_library (chopper::layout ALIAS chopper_layout) diff --git a/src/layout/determine_split_bins.cpp b/src/layout/determine_split_bins.cpp index 71d946b8..08e5fdee 100644 --- a/src/layout/determine_split_bins.cpp +++ b/src/layout/determine_split_bins.cpp @@ -5,14 +5,14 @@ // shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md // --------------------------------------------------------------------------------------------------- -#include // for allocator, string +#include // for allocator, string #include // for vector -#include #include -#include // for data_store +#include #include +#include // for data_store #include namespace chopper::fast_layout @@ -26,15 +26,16 @@ std::pair determine_split_bins(chopper::configuration const & co std::vector> & partitions) { if (num_user_bins == 0) - return {0,0}; + return {0, 0}; assert(num_technical_bins > 0u); assert(num_user_bins > 0u); assert(num_user_bins <= num_technical_bins); - auto const fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // - .hash_count = config.hibf_config.number_of_hash_functions, - .t_max = num_technical_bins}); + auto const fpr_correction = + seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + 
.hash_count = config.hibf_config.number_of_hash_functions, + .t_max = num_technical_bins}); std::vector> matrix(num_technical_bins); // rows for (auto & v : matrix) @@ -67,8 +68,8 @@ std::pair determine_split_bins(chopper::configuration const & co { size_t const corrected_ub_cardinality = static_cast(ub_cardinality * fpr_correction[(i - i_prime)]); - size_t score = - std::max(seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i - i_prime), matrix[i_prime][j - 1]); + size_t score = std::max(seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i - i_prime), + matrix[i_prime][j - 1]); // std::cout << "j:" << j << " i:" << i << " i':" << i_prime << " score:" << score << std::endl; @@ -149,4 +150,4 @@ std::pair determine_split_bins(chopper::configuration const & co return {number_of_split_tbs, max_size}; } -} // namespace chopper::layout +} // namespace chopper::fast_layout diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index a07d8807..9e2f4e0b 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -14,28 +14,27 @@ #include #include #include +#include #include #include #include #include #include -#include - -#include #include -#include #include +#include #include #include #include +#include #include #include -#include +#include #include -#include // for compute_relaxed_fpr_correction #include +#include // for compute_relaxed_fpr_correction #include #include #include @@ -76,7 +75,8 @@ auto LSH_fill_hashtable(std::vector const & clusters, for (size_t const user_bin_idx : current.contained_user_bins()) { ++processed_user_bins; - uint64_t const key = lsh_hash_the_sketch(minHash_sketches[user_bin_idx].table[current_sketch_index], current_number_of_sketch_hashes); + uint64_t const key = lsh_hash_the_sketch(minHash_sketches[user_bin_idx].table[current_sketch_index], + current_number_of_sketch_hashes); table[key].push_back(current.id()); // insert representative for all user bins } } @@ -106,7 +106,8 @@ auto LSH_fill_hashtable(std::vector const 
& clusters, for (size_t const user_bin_idx : similarity_cluster) { ++processed_user_bins; - uint64_t const key = lsh_hash_the_sketch(minHash_sketches[user_bin_idx].table[current_sketch_index], current_number_of_sketch_hashes); + uint64_t const key = lsh_hash_the_sketch(minHash_sketches[user_bin_idx].table[current_sketch_index], + current_number_of_sketch_hashes); table[key].push_back(current.id()); // insert representative for all user bins } } @@ -122,11 +123,11 @@ auto LSH_fill_hashtable(std::vector const & clusters, // Vector L3 : minHash_sketche_size (LSH ADD+OR parameter r) std::vector very_similar_LSH_partitioning(std::vector const & minHash_sketches, - std::vector const & positions, - std::vector const & cardinalities, - std::vector const & sketches, - size_t const average_technical_bin_size, - chopper::configuration const & config) + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + size_t const average_technical_bin_size, + chopper::configuration const & config) { assert(!minHash_sketches.empty()); assert(!minHash_sketches[0].table.empty()); @@ -134,12 +135,12 @@ std::vector very_similar_LSH_partitioning(std::vector very_similar_LSH_partitioning(std::vector(number_of_user_bins) > 0.5 &&*/ + //std::cout << "Start clustering with threshold average_technical_bin_size: " << average_technical_bin_size << std::endl; + while (current_max_cluster_size < average_technical_bin_size + && /*number_of_clusters / static_cast(number_of_user_bins) > 0.5 &&*/ current_sketch_index < number_of_max_minHash_sketches) // I want to cluster 10%? 
{ -//std::cout << "Current number of clusters: " << current_number_of_clusters; + //std::cout << "Current number of clusters: " << current_number_of_clusters; // fill LSH collision hashtable robin_hood::unordered_flat_map> table = @@ -226,8 +228,10 @@ std::vector very_similar_LSH_partitioning(std::vector 1); // there should be at least two user bins now assert(representative_cluster.is_valid(representative_cluster_id)); // and it should still be valid - current_cluster_sketches[representative_cluster.id()].merge(current_cluster_sketches[next_cluster.id()]); - current_cluster_cardinality[representative_cluster.id()] = current_cluster_sketches[representative_cluster.id()].estimate(); + current_cluster_sketches[representative_cluster.id()].merge( + current_cluster_sketches[next_cluster.id()]); + current_cluster_cardinality[representative_cluster.id()] = + current_cluster_sketches[representative_cluster.id()].estimate(); --current_number_of_clusters; } @@ -237,7 +241,7 @@ std::vector very_similar_LSH_partitioning(std::vector & clusters, auto cluster_size_cmp = [&cardinalities](auto const & v1, auto const & v2) { if (v2.size() == v1.size() && !v2.empty()) - return cardinalities[v2.contained_user_bins()[0]] < cardinalities[v1.contained_user_bins()[0]]; + return cardinalities[v2.contained_user_bins()[0]] < cardinalities[v1.contained_user_bins()[0]]; return v2.size() < v1.size(); }; - std::partial_sort(clusters.begin(), clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), clusters.end(), cluster_size_cmp); + std::partial_sort(clusters.begin(), + clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), + clusters.end(), + cluster_size_cmp); // after filling up the partitions with the biggest clusters, sort the clusters by cardinality of the biggest ub // s.t. that euqally sizes ub are assigned after each other and the small stuff is added at last. 
@@ -280,18 +287,21 @@ void post_process_clusters(std::vector & clusters, return cardinalities[v2.contained_user_bins()[0]] < cardinalities[v1.contained_user_bins()[0]]; }; - std::sort(clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), clusters.end(), compare_cardinality_and_move_empty_clusters_to_the_end); + std::sort(clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), + clusters.end(), + compare_cardinality_and_move_empty_clusters_to_the_end); assert(clusters[0].size() >= clusters[1].size()); // sanity check // assert(cardinalities[clusters[std::min(clusters.size(), config.hibf_config.tmax)].contained_user_bins()[0]] >= cardinalities[clusters[std::min(clusters.size(), config.hibf_config.tmax) + 1].contained_user_bins()[0]]); // sanity check -// debug + // debug for (size_t cidx = 1; cidx < clusters.size(); ++cidx) { - if (clusters[cidx - 1].empty() && !clusters[cidx].empty()) // once empty - always empty; all empty clusters should be at the end + if (clusters[cidx - 1].empty() + && !clusters[cidx].empty()) // once empty - always empty; all empty clusters should be at the end throw std::runtime_error{"sorting did not work"}; } -// debug + // debug } bool find_best_partition(chopper::configuration const & config, @@ -315,7 +325,7 @@ bool find_best_partition(chopper::configuration const & config, return result; }(); - size_t const max_card = [&cardinalities, &cluster] () + size_t const max_card = [&cardinalities, &cluster]() { size_t max{0}; @@ -337,7 +347,7 @@ bool find_best_partition(chopper::configuration const & config, size_t best_p{0}; bool best_p_found{false}; - auto penalty_lower_level = [&] (size_t const additional_number_of_user_bins, size_t const p) + auto penalty_lower_level = [&](size_t const additional_number_of_user_bins, size_t const p) { size_t min = std::min(max_card, min_partition_cardinality[p]); size_t max = std::max(max_card, max_partition_cardinality[p]); @@ -350,7 +360,7 @@ bool 
find_best_partition(chopper::configuration const & config, // bin needs to be stored additionally size_t const num_ubs_in_merged_bin{positions[p].size() + additional_number_of_user_bins}; double const levels = std::log(num_ubs_in_merged_bin) / std::log(config.hibf_config.tmax); - return static_cast(max_card * levels); + return static_cast(max_card * levels); } else if (positions[p].size() + additional_number_of_user_bins > config.hibf_config.tmax) // now a third level { @@ -386,17 +396,19 @@ bool find_best_partition(chopper::configuration const & config, assert(union_estimate >= current_partition_size); size_t const penalty_current_bin = union_estimate - current_partition_size; - size_t const penalty_current_ibf = config.hibf_config.tmax * ((union_estimate <= corrected_estimate_per_part) ? 0u : union_estimate - corrected_estimate_per_part); + size_t const penalty_current_ibf = + config.hibf_config.tmax + * ((union_estimate <= corrected_estimate_per_part) ? 0u : union_estimate - corrected_estimate_per_part); size_t const change = penalty_current_bin + penalty_current_ibf + penalty_lower_level(cluster.size(), p); // size_t const intersection = current_sketch.estimate() - change; // double const subsume_ratio = static_cast(intersection) / current_partition_size; -//std::cout << "p:" << p << " p-#UBs" << positions[p].size() << " penalty:" << penalty(cluster.size(), p) << " change:" << change << " union-current_p:" << (union_estimate - current_partition_size) << " union:" << union_estimate << " current_p:" << current_partition_size << " t:" << corrected_estimate_per_part << std::endl; + //std::cout << "p:" << p << " p-#UBs" << positions[p].size() << " penalty:" << penalty(cluster.size(), p) << " change:" << change << " union-current_p:" << (union_estimate - current_partition_size) << " union:" << union_estimate << " current_p:" << current_partition_size << " t:" << corrected_estimate_per_part << std::endl; if (change == 0 || /* If there is no penalty at all, this is a 
best fit even if the partition is "full"*/ (smallest_change > change /*&& subsume_ratio > best_subsume_ratio &&*/ - /*current_partition_size < corrected_estimate_per_part*/)) + /*current_partition_size < corrected_estimate_per_part*/)) { -//std::cout << "smaller!" << std::endl; + //std::cout << "smaller!" << std::endl; // best_subsume_ratio = subsume_ratio; smallest_change = change; best_p = p; @@ -407,7 +419,7 @@ bool find_best_partition(chopper::configuration const & config, if (!best_p_found) throw "currently there are no safety measures if a partition is not found because it is very unlikely"; -//std::cout << "best_p:" << best_p << std::endl<< std::endl; + //std::cout << "best_p:" << best_p << std::endl<< std::endl; // now that we know which partition fits best (`best_p`), add those indices to it for (size_t const user_bin_idx : cluster) @@ -417,28 +429,29 @@ bool find_best_partition(chopper::configuration const & config, min_partition_cardinality[best_p] = std::min(min_partition_cardinality[best_p], cardinalities[user_bin_idx]); } partition_sketches[best_p].merge(current_sketch); - corrected_estimate_per_part = std::max(corrected_estimate_per_part, static_cast(partition_sketches[best_p].estimate())); + corrected_estimate_per_part = + std::max(corrected_estimate_per_part, static_cast(partition_sketches[best_p].estimate())); return true; } size_t split_bins(chopper::configuration const & config, - std::vector const & sorted_positions, - std::vector const & cardinalities, - std::vector> & partitions, - size_t const sum_of_cardinalities, - size_t const end_idx) + std::vector const & sorted_positions, + std::vector const & cardinalities, + std::vector> & partitions, + size_t const sum_of_cardinalities, + size_t const end_idx) { // assign split bins to the end. 
makes it easier to process merged bins afterwards size_t pos{partitions.size() - 1}; for (size_t idx = 0; idx < end_idx; ++idx) { size_t const ub_idx{sorted_positions[idx]}; - size_t const number_of_split_tbs = std::max((size_t)1, config.hibf_config.tmax * cardinalities[ub_idx] / sum_of_cardinalities); -std::cout << "number_of_split_tbs: " << number_of_split_tbs - << " cardinalities[ub_idx]:" << cardinalities[ub_idx] - << " sum_of_cardinalities:" << sum_of_cardinalities - << std::endl; + size_t const number_of_split_tbs = + std::max((size_t)1, config.hibf_config.tmax * cardinalities[ub_idx] / sum_of_cardinalities); + std::cout << "number_of_split_tbs: " << number_of_split_tbs + << " cardinalities[ub_idx]:" << cardinalities[ub_idx] + << " sum_of_cardinalities:" << sum_of_cardinalities << std::endl; // fill partitions from behind to ensure an easier layouting for (size_t i = 0; i < number_of_split_tbs; ++i) { @@ -453,9 +466,9 @@ std::cout << "number_of_split_tbs: " << number_of_split_tbs } std::pair find_bins_to_be_split(std::vector const & sorted_positions, - std::vector const & cardinalities, - size_t threshold, - size_t const max_bins) + std::vector const & cardinalities, + size_t threshold, + size_t const max_bins) { size_t idx{0}; size_t sum{0}; @@ -488,14 +501,14 @@ std::pair find_bins_to_be_split(std::vector const & sort } size_t lsh_sim_approach(chopper::configuration const & config, - std::vector const & sorted_positions2, - std::vector const & cardinalities, - std::vector const & sketches, - std::vector const & minHash_sketches, - std::vector> & partitions, - size_t const number_of_remaining_tbs, - size_t const technical_bin_size_threshold, - size_t const sum_of_cardinalities) + std::vector const & sorted_positions2, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + std::vector> & partitions, + size_t const number_of_remaining_tbs, + size_t const technical_bin_size_threshold, + size_t const 
sum_of_cardinalities) { uint8_t const sketch_bits{config.hibf_config.sketch_bits}; //std::cout << "LSH partitioning into " << config.hibf_config.tmax << std::endl; @@ -507,20 +520,25 @@ size_t lsh_sim_approach(chopper::configuration const & config, // initial partitioning using locality sensitive hashing (LSH) config.lsh_algorithm_timer.start(); - std::vector clusters = very_similar_LSH_partitioning(minHash_sketches, sorted_positions2, cardinalities, sketches, technical_bin_size_threshold, config); + std::vector clusters = very_similar_LSH_partitioning(minHash_sketches, + sorted_positions2, + cardinalities, + sketches, + technical_bin_size_threshold, + config); post_process_clusters(clusters, cardinalities, config); config.lsh_algorithm_timer.stop(); -std::ofstream ofs{"/tmp/final.clusters"}; -for (size_t i = 0; i < clusters.size(); ++i) -{ -seqan::hibf::sketch::hyperloglog sketch(config.hibf_config.sketch_bits); -for (size_t j = 0; j < clusters[i].size(); ++j) -{ - sketch.merge(sketches[clusters[i].contained_user_bins()[j]]); -} -ofs << i << ":" << sketch.estimate() << ":" << clusters[i].size() << std::endl; -} + std::ofstream ofs{"/tmp/final.clusters"}; + for (size_t i = 0; i < clusters.size(); ++i) + { + seqan::hibf::sketch::hyperloglog sketch(config.hibf_config.sketch_bits); + for (size_t j = 0; j < clusters[i].size(); ++j) + { + sketch.merge(sketches[clusters[i].contained_user_bins()[j]]); + } + ofs << i << ":" << sketch.estimate() << ":" << clusters[i].size() << std::endl; + } std::vector> remaining_clusters{}; @@ -548,7 +566,8 @@ ofs << i << ":" << sketch.estimate() << ":" << clusters[i].size() << std::endl; size_t const user_bin_idx = cluster[uidx]; // if a single cluster already exceeds the cardinality_per_part, // then the remaining user bins of the cluster must spill over into the next partition - if ((uidx != 0 && (uidx % config.hibf_config.tmax == 0)) || partition_sketches[p].estimate() > technical_bin_size_threshold) + if ((uidx != 0 && (uidx % 
config.hibf_config.tmax == 0)) + || partition_sketches[p].estimate() > technical_bin_size_threshold) { ++p; @@ -590,7 +609,16 @@ ofs << i << ":" << sketch.estimate() << ":" << clusters[i].size() << std::endl; auto const & cluster = remaining_clusters[ridx]; config.search_partition_algorithm_timer.start(); - find_best_partition(config, number_of_remaining_tbs, merged_threshold, cluster, cardinalities, sketches, partitions, partition_sketches, max_partition_cardinality, min_partition_cardinality); + find_best_partition(config, + number_of_remaining_tbs, + merged_threshold, + cluster, + cardinalities, + sketches, + partitions, + partition_sketches, + max_partition_cardinality, + min_partition_cardinality); config.search_partition_algorithm_timer.start(); } @@ -630,20 +658,21 @@ void partition_user_bins(chopper::configuration const & config, sketch.merge(sketches[pos]); } - return std::tuple{sum, max, sketch.estimate()}; + return std::tuple{sum, max, sketch.estimate()}; }(); // If the effective text size is very low, it can happen that the joint_estimate divided by the number of partitions // is lower than the largest single user bin. But of course, we can never reach a smaller max technical bin size // then that of the largest user user bin. Thus we can correct the estimate_per_part beforehand. // This way we make sure there is at least 1 LSH clustering step. 
- size_t const estimate_per_part = std::max(seqan::hibf::divide_and_ceil(joint_estimate, config.hibf_config.tmax), - max_cardinality + 1); + size_t const estimate_per_part = + std::max(seqan::hibf::divide_and_ceil(joint_estimate, config.hibf_config.tmax), max_cardinality + 1); - double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // - .relaxed_fpr = config.hibf_config.relaxed_fpr, - .hash_count = config.hibf_config.number_of_hash_functions}); -//std::cout << "sum_of_cardinalities:" << sum_of_cardinalities << " joint_estimate:" << joint_estimate << std::endl; + double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction( + {.fpr = config.hibf_config.maximum_fpr, // + .relaxed_fpr = config.hibf_config.relaxed_fpr, + .hash_count = config.hibf_config.number_of_hash_functions}); + //std::cout << "sum_of_cardinalities:" << sum_of_cardinalities << " joint_estimate:" << joint_estimate << std::endl; size_t split_threshold{}; size_t idx{0}; // start in sorted positions @@ -658,8 +687,15 @@ void partition_user_bins(chopper::configuration const & config, auto parition_split_bins = [&]() { size_t number_of_potential_split_bins{0}; - std::tie(idx, number_of_potential_split_bins) = find_bins_to_be_split(sorted_positions, cardinalities, split_threshold, config.hibf_config.tmax - 1); - std::tie(number_of_split_tbs, max_split_size) = chopper::layout::determine_split_bins(config, sorted_positions, cardinalities, number_of_potential_split_bins, idx, partitions); + std::tie(idx, number_of_potential_split_bins) = + find_bins_to_be_split(sorted_positions, cardinalities, split_threshold, config.hibf_config.tmax - 1); + std::tie(number_of_split_tbs, max_split_size) = + chopper::layout::determine_split_bins(config, + sorted_positions, + cardinalities, + number_of_potential_split_bins, + idx, + partitions); number_of_merged_tbs = config.hibf_config.tmax - number_of_split_tbs; }; @@ 
-672,25 +708,38 @@ void partition_user_bins(chopper::configuration const & config, size_t const corrected_max_split_size = max_split_size / relaxed_fpr_correction; size_t const merged_threshold = std::max(corrected_max_split_size, split_threshold); - max_merged_size = lsh_sim_approach(config, sorted_positions2, cardinalities, sketches, minHash_sketches, partitions, number_of_merged_tbs, merged_threshold, sum_of_cardinalities); + max_merged_size = lsh_sim_approach(config, + sorted_positions2, + cardinalities, + sketches, + minHash_sketches, + partitions, + number_of_merged_tbs, + merged_threshold, + sum_of_cardinalities); }; parition_split_bins(); partitions_merged_bins(); - int64_t const difference = static_cast(max_merged_size * relaxed_fpr_correction) - static_cast(max_split_size); + int64_t const difference = + static_cast(max_merged_size * relaxed_fpr_correction) - static_cast(max_split_size); -std::cout << "number_of_split_tbs:" << number_of_split_tbs << " difference:" << difference << std::endl; -std::cout << "Reconfiguring threshold. from:" << split_threshold; + std::cout << "number_of_split_tbs:" << number_of_split_tbs << " difference:" << difference << std::endl; + std::cout << "Reconfiguring threshold. 
from:" << split_threshold; if (number_of_split_tbs == 0) split_threshold = (split_threshold + max_merged_size) / 2; // increase threshold - else if (difference > 0) // need more merged bins -> increase threshold - split_threshold = static_cast(split_threshold) * ((static_cast(max_merged_size) * relaxed_fpr_correction) / static_cast(max_split_size)); + else if (difference > 0) // need more merged bins -> increase threshold + split_threshold = + static_cast(split_threshold) + * ((static_cast(max_merged_size) * relaxed_fpr_correction) / static_cast(max_split_size)); else // need more split bins -> decrease threshold - split_threshold = static_cast(split_threshold) * ((static_cast(max_merged_size) * relaxed_fpr_correction) / static_cast(max_split_size)); + split_threshold = + static_cast(split_threshold) + * ((static_cast(max_merged_size) * relaxed_fpr_correction) / static_cast(max_split_size)); -std::cout << " to:" << split_threshold << std::endl; + std::cout << " to:" << split_threshold << std::endl; // reset result partitions.clear(); @@ -751,9 +800,9 @@ std::cout << " to:" << split_threshold << std::endl; } seqan::hibf::layout::layout general_layout(chopper::configuration const & config, - std::vector positions, - std::vector const & cardinalities, - std::vector const & sketches) + std::vector positions, + std::vector const & cardinalities, + std::vector const & sketches) { seqan::hibf::layout::layout hibf_layout; @@ -773,7 +822,9 @@ seqan::hibf::layout::layout general_layout(chopper::configuration const & config return hibf_layout; } -bool do_I_need_a_fast_layout(chopper::configuration const & config, std::vector const & positions, std::vector const & cardinalities) +bool do_I_need_a_fast_layout(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities) { // the fast layout heuristic would greedily merge even if merging only 2 bins at a time // merging only little number of bins is highly disadvantegous for lower 
levels because few bins @@ -814,13 +865,15 @@ void add_level_to_layout(chopper::configuration const & config, size_t max_bin_id{0}; size_t max_size{0}; - auto const split_fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // - .hash_count = config.hibf_config.number_of_hash_functions, - .t_max = partitions.size()}); + auto const split_fpr_correction = + seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .hash_count = config.hibf_config.number_of_hash_functions, + .t_max = partitions.size()}); - double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // - .relaxed_fpr = config.hibf_config.relaxed_fpr, - .hash_count = config.hibf_config.number_of_hash_functions}); + double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction( + {.fpr = config.hibf_config.maximum_fpr, // + .relaxed_fpr = config.hibf_config.relaxed_fpr, + .hash_count = config.hibf_config.number_of_hash_functions}); // we assume here that the user bins have been sorted by user bin id such that pos = idx for (size_t partition_idx{0}; partition_idx < partitions.size(); ++partition_idx) @@ -862,17 +915,17 @@ void add_level_to_layout(chopper::configuration const & config, current_user_bin.storage_TB_id = partition_idx; current_user_bin.number_of_technical_bins = 1; // initialise to 1 - while (partition_idx + 1 < partitions.size() && - partitions[partition_idx].size() == 1 && - partitions[partition_idx + 1].size() == 1 && - partitions[partition_idx][0] == partitions[partition_idx + 1][0]) + while (partition_idx + 1 < partitions.size() && partitions[partition_idx].size() == 1 + && partitions[partition_idx + 1].size() == 1 + && partitions[partition_idx][0] == partitions[partition_idx + 1][0]) { ++current_user_bin.number_of_technical_bins; ++partition_idx; } // update max_bin_id, max_size - size_t const current_size = 
sketches[current_user_bin.idx].estimate() * split_fpr_correction[current_user_bin.number_of_technical_bins]; + size_t const current_size = sketches[current_user_bin.idx].estimate() + * split_fpr_correction[current_user_bin.number_of_technical_bins]; if (current_size > max_size) { max_bin_id = current_user_bin.storage_TB_id; @@ -892,7 +945,9 @@ void update_layout_from_child_layout(seqan::hibf::layout::layout & child_layout, for (auto & max_bin : child_layout.max_bins) { - max_bin.previous_TB_indices.insert(max_bin.previous_TB_indices.begin(), new_previous.begin(), new_previous.end()); + max_bin.previous_TB_indices.insert(max_bin.previous_TB_indices.begin(), + new_previous.begin(), + new_previous.end()); hibf_layout.max_bins.push_back(max_bin); } @@ -901,27 +956,27 @@ void update_layout_from_child_layout(seqan::hibf::layout::layout & child_layout, auto & actual_user_bin = hibf_layout.user_bins[user_bin.idx]; actual_user_bin.previous_TB_indices.insert(actual_user_bin.previous_TB_indices.end(), - user_bin.previous_TB_indices.begin(), - user_bin.previous_TB_indices.end()); + user_bin.previous_TB_indices.begin(), + user_bin.previous_TB_indices.end()); actual_user_bin.number_of_technical_bins = user_bin.number_of_technical_bins; actual_user_bin.storage_TB_id = user_bin.storage_TB_id; } } void fast_layout_recursion(chopper::configuration const & config, - std::vector const & positions, - std::vector const & cardinalities, - std::vector const & sketches, - std::vector const & minHash_sketches, - seqan::hibf::layout::layout & hibf_layout, - std::vector const & previous) + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + seqan::hibf::layout::layout & hibf_layout, + std::vector const & previous) { std::vector> tmax_partitions(config.hibf_config.tmax); // here we assume that we want to start with a fast layout partition_user_bins(config, positions, cardinalities, sketches, 
minHash_sketches, tmax_partitions); - #pragma omp critical +#pragma omp critical { add_level_to_layout(config, hibf_layout, tmax_partitions, sketches, previous); } @@ -929,20 +984,31 @@ void fast_layout_recursion(chopper::configuration const & config, for (size_t partition_idx = 0; partition_idx < tmax_partitions.size(); ++partition_idx) { auto const & partition = tmax_partitions[partition_idx]; - auto const new_previous = [&] () {auto cpy{previous}; cpy.push_back(partition_idx); return cpy; }(); + auto const new_previous = [&]() + { + auto cpy{previous}; + cpy.push_back(partition_idx); + return cpy; + }(); if (partition.empty() || partition.size() == 1) // nothing to merge continue; if (do_I_need_a_fast_layout(config, partition, cardinalities)) { - fast_layout_recursion(config, partition, cardinalities, sketches, minHash_sketches, hibf_layout, new_previous); // recurse fast_layout + fast_layout_recursion(config, + partition, + cardinalities, + sketches, + minHash_sketches, + hibf_layout, + new_previous); // recurse fast_layout } else { auto child_layout = general_layout(config, partition, cardinalities, sketches); - #pragma omp critical +#pragma omp critical { update_layout_from_child_layout(child_layout, hibf_layout, new_previous); } @@ -951,11 +1017,11 @@ void fast_layout_recursion(chopper::configuration const & config, } void fast_layout(chopper::configuration const & config, - std::vector const & positions, - std::vector const & cardinalities, - std::vector const & sketches, - std::vector const & minHash_sketches, - seqan::hibf::layout::layout & hibf_layout) + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + seqan::hibf::layout::layout & hibf_layout) { auto config_copy = config; @@ -966,13 +1032,15 @@ void fast_layout(chopper::configuration const & config, partition_user_bins(config_copy, positions, cardinalities, sketches, minHash_sketches, tmax_partitions); 
config.intital_partition_timer.stop(); - auto const split_fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // - .hash_count = config.hibf_config.number_of_hash_functions, - .t_max = config.hibf_config.tmax}); + auto const split_fpr_correction = + seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .hash_count = config.hibf_config.number_of_hash_functions, + .t_max = config.hibf_config.tmax}); - double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // - .relaxed_fpr = config.hibf_config.relaxed_fpr, - .hash_count = config.hibf_config.number_of_hash_functions}); + double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction( + {.fpr = config.hibf_config.maximum_fpr, // + .relaxed_fpr = config.hibf_config.relaxed_fpr, + .hash_count = config.hibf_config.number_of_hash_functions}); size_t max_bin_id{0}; size_t max_size{0}; @@ -1016,17 +1084,18 @@ void fast_layout(chopper::configuration const & config, .number_of_technical_bins = 1 /*determiend below*/, .idx = user_bin_id}; - while (partition_idx + 1 < tmax_partitions.size() && - tmax_partitions[partition_idx].size() == 1 && - tmax_partitions[partition_idx + 1].size() == 1 && - tmax_partitions[partition_idx][0] == tmax_partitions[partition_idx + 1][0]) + while (partition_idx + 1 < tmax_partitions.size() && tmax_partitions[partition_idx].size() == 1 + && tmax_partitions[partition_idx + 1].size() == 1 + && tmax_partitions[partition_idx][0] == tmax_partitions[partition_idx + 1][0]) { ++hibf_layout.user_bins[user_bin_id].number_of_technical_bins; ++partition_idx; } // update max_bin_id, max_size - size_t const current_size = sketches[user_bin_id].estimate() * split_fpr_correction[hibf_layout.user_bins[user_bin_id].number_of_technical_bins]; + size_t const current_size = + sketches[user_bin_id].estimate() + * 
split_fpr_correction[hibf_layout.user_bins[user_bin_id].number_of_technical_bins]; if (current_size > max_size) { max_bin_id = hibf_layout.user_bins[user_bin_id].storage_TB_id; @@ -1038,10 +1107,10 @@ void fast_layout(chopper::configuration const & config, hibf_layout.top_level_max_bin_id = max_bin_id; config.small_layouts_timer.start(); - #pragma omp parallel - #pragma omp single +#pragma omp parallel +#pragma omp single { - #pragma omp taskloop +#pragma omp taskloop for (size_t partition_idx = 0; partition_idx < tmax_partitions.size(); ++partition_idx) { auto const & partition = tmax_partitions[partition_idx]; @@ -1051,13 +1120,19 @@ void fast_layout(chopper::configuration const & config, if (do_I_need_a_fast_layout(config_copy, partition, cardinalities)) { - fast_layout_recursion(config_copy, partition, cardinalities, sketches, minHash_sketches, hibf_layout, {partition_idx}); // recurse fast_layout + fast_layout_recursion(config_copy, + partition, + cardinalities, + sketches, + minHash_sketches, + hibf_layout, + {partition_idx}); // recurse fast_layout } else { auto small_layout = general_layout(config, partition, cardinalities, sketches); - #pragma omp critical +#pragma omp critical { update_layout_from_child_layout(small_layout, hibf_layout, std::vector{partition_idx}); } @@ -1081,7 +1156,6 @@ int execute(chopper::configuration & config, { seqan::hibf::layout::layout hibf_layout; - if (config.determine_best_tmax) { hibf_layout = determine_best_number_of_technical_bins(config, cardinalities, sketches); @@ -1089,7 +1163,12 @@ int execute(chopper::configuration & config, else { config.dp_algorithm_timer.start(); - fast_layout(config, seqan::hibf::iota_vector(sketches.size()), cardinalities, sketches, minHash_sketches, hibf_layout); + fast_layout(config, + seqan::hibf::iota_vector(sketches.size()), + cardinalities, + sketches, + minHash_sketches, + hibf_layout); config.dp_algorithm_timer.stop(); // hibf_layout = 
seqan::hibf::layout::compute_layout(config.hibf_config, @@ -1102,13 +1181,14 @@ int execute(chopper::configuration & config, // sort records ascending by the number of bin indices (corresponds to the IBF levels) // GCOVR_EXCL_START std::ranges::sort(hibf_layout.max_bins, - [](auto const & r, auto const & l) - { - if (r.previous_TB_indices.size() == l.previous_TB_indices.size()) - return std::ranges::lexicographical_compare(r.previous_TB_indices, l.previous_TB_indices); - else - return r.previous_TB_indices.size() < l.previous_TB_indices.size(); - }); + [](auto const & r, auto const & l) + { + if (r.previous_TB_indices.size() == l.previous_TB_indices.size()) + return std::ranges::lexicographical_compare(r.previous_TB_indices, + l.previous_TB_indices); + else + return r.previous_TB_indices.size() < l.previous_TB_indices.size(); + }); // GCOVR_EXCL_STOP if (config.output_verbose_statistics) diff --git a/src/layout/hibf_statistics.cpp b/src/layout/hibf_statistics.cpp index e171ed93..fdf2bb7d 100644 --- a/src/layout/hibf_statistics.cpp +++ b/src/layout/hibf_statistics.cpp @@ -397,8 +397,8 @@ void hibf_statistics::collect_bins() # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Warray-bounds=" # pragma GCC diagnostic ignored "-Wstringop-overflow=" -#endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV - // ibf.bins.emplace_back(hibf_statistics::bin_kind::merged, 1, std::vector{user_bin_info.idx}); +#endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV \ + // ibf.bins.emplace_back(hibf_statistics::bin_kind::merged, 1, std::vector{user_bin_info.idx}); #if CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV # pragma GCC diagnostic pop #endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV diff --git a/test/api/layout/determine_split_bins_test.cpp b/test/api/layout/determine_split_bins_test.cpp index 2af4c488..4b0c49d2 100644 --- a/test/api/layout/determine_split_bins_test.cpp +++ b/test/api/layout/determine_split_bins_test.cpp @@ -2,12 +2,12 @@ #include // for operator<<, char_traits, basic_ostream, 
basic_stringstream, strings... #include // for allocator, string -#include // for vector +#include // for vector #include -#include // for data_store #include +#include // for data_store #include std::pair determine_split_bins(chopper::configuration const & config, @@ -18,15 +18,16 @@ std::pair determine_split_bins(chopper::configuration const & co std::vector> & partitions) { if (num_user_bins == 0) - return {0,0}; + return {0, 0}; assert(num_technical_bins > 0u); assert(num_user_bins > 0u); assert(num_user_bins <= num_technical_bins); - auto const fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // - .hash_count = config.hibf_config.number_of_hash_functions, - .t_max = num_technical_bins}); + auto const fpr_correction = + seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .hash_count = config.hibf_config.number_of_hash_functions, + .t_max = num_technical_bins}); std::vector> matrix(num_technical_bins); // rows for (auto & v : matrix) @@ -59,8 +60,8 @@ std::pair determine_split_bins(chopper::configuration const & co { size_t const corrected_ub_cardinality = static_cast(ub_cardinality * fpr_correction[(i - i_prime)]); - size_t score = - std::max(seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i - i_prime), matrix[i_prime][j - 1]); + size_t score = std::max(seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i - i_prime), + matrix[i_prime][j - 1]); // std::cout << "j:" << j << " i:" << i << " i':" << i_prime << " score:" << score << std::endl; @@ -160,7 +161,8 @@ TEST(simple_split, first) std::vector> partitions(64); - auto const [num_splits, max_size] = determine_split_bins(config, positions, cardinalities, num_technical_bins, num_user_bins, partitions); + auto const [num_splits, max_size] = + determine_split_bins(config, positions, cardinalities, num_technical_bins, num_user_bins, partitions); EXPECT_EQ(num_splits, 61); EXPECT_EQ(max_size, 1452); @@ -178,8 +180,7 @@ 
TEST(simple_split, first) for (size_t i{partitions.size() - 1}; i > partitions.size() - num_splits - 1; --i) std::cerr << partitions[i][0] << ","; - - for (;tb_idx > partitions.size() - num_splits - 1; --tb_idx) + for (; tb_idx > partitions.size() - num_splits - 1; --tb_idx) { for (size_t three = 0; three < 3; ++three) { diff --git a/test/api/layout/fast_layout_test.cpp b/test/api/layout/fast_layout_test.cpp index 2347f66b..ca2a4a5c 100644 --- a/test/api/layout/fast_layout_test.cpp +++ b/test/api/layout/fast_layout_test.cpp @@ -56,254 +56,255 @@ TEST(execute_test, many_ubs) chopper::layout::execute(config, many_filenames, sketches, minHash_sketches); - std::string const expected_file{ - "@CHOPPER_USER_BINS\n" - "@0 seq0\n" - "@1 seq1\n" - "@2 seq2\n" - "@3 seq3\n" - "@4 seq4\n" - "@5 seq5\n" - "@6 seq6\n" - "@7 seq7\n" - "@8 seq8\n" - "@9 seq9\n" - "@10 seq10\n" - "@11 seq11\n" - "@12 seq12\n" - "@13 seq13\n" - "@14 seq14\n" - "@15 seq15\n" - "@16 seq16\n" - "@17 seq17\n" - "@18 seq18\n" - "@19 seq19\n" - "@20 seq20\n" - "@21 seq21\n" - "@22 seq22\n" - "@23 seq23\n" - "@24 seq24\n" - "@25 seq25\n" - "@26 seq26\n" - "@27 seq27\n" - "@28 seq28\n" - "@29 seq29\n" - "@30 seq30\n" - "@31 seq31\n" - "@32 seq32\n" - "@33 seq33\n" - "@34 seq34\n" - "@35 seq35\n" - "@36 seq36\n" - "@37 seq37\n" - "@38 seq38\n" - "@39 seq39\n" - "@40 seq40\n" - "@41 seq41\n" - "@42 seq42\n" - "@43 seq43\n" - "@44 seq44\n" - "@45 seq45\n" - "@46 seq46\n" - "@47 seq47\n" - "@48 seq48\n" - "@49 seq49\n" - "@50 seq50\n" - "@51 seq51\n" - "@52 seq52\n" - "@53 seq53\n" - "@54 seq54\n" - "@55 seq55\n" - "@56 seq56\n" - "@57 seq57\n" - "@58 seq58\n" - "@59 seq59\n" - "@60 seq60\n" - "@61 seq61\n" - "@62 seq62\n" - "@63 seq63\n" - "@64 seq64\n" - "@65 seq65\n" - "@66 seq66\n" - "@67 seq67\n" - "@68 seq68\n" - "@69 seq69\n" - "@70 seq70\n" - "@71 seq71\n" - "@72 seq72\n" - "@73 seq73\n" - "@74 seq74\n" - "@75 seq75\n" - "@76 seq76\n" - "@77 seq77\n" - "@78 seq78\n" - "@79 seq79\n" - "@80 seq80\n" - 
"@81 seq81\n" - "@82 seq82\n" - "@83 seq83\n" - "@84 seq84\n" - "@85 seq85\n" - "@86 seq86\n" - "@87 seq87\n" - "@88 seq88\n" - "@89 seq89\n" - "@90 seq90\n" - "@91 seq91\n" - "@92 seq92\n" - "@93 seq93\n" - "@94 seq94\n" - "@95 seq95\n" - "@CHOPPER_USER_BINS_END\n" - "@CHOPPER_CONFIG\n" - "@{\n" - "@ \"chopper_config\": {\n" - "@ \"version\": 2,\n" - "@ \"data_file\": {\n" - "@ \"value0\": \"\"\n" - "@ },\n" - "@ \"debug\": false,\n" - "@ \"sketch_directory\": {\n" - "@ \"value0\": \"\"\n" - "@ },\n" - "@ \"k\": 19,\n" - "@ \"window_size\": 19,\n" - "@ \"disable_sketch_output\": true,\n" - "@ \"precomputed_files\": false,\n" - "@ \"output_filename\": {\n" - "@ \"value0\": \"" + layout_file.string() + "\"\n" - "@ },\n" - "@ \"determine_best_tmax\": false,\n" - "@ \"force_all_binnings\": false\n" - "@ }\n" - "@}\n" - "@CHOPPER_CONFIG_END\n" - "@HIBF_CONFIG\n" - "@{\n" - "@ \"hibf_config\": {\n" - "@ \"version\": 2,\n" - "@ \"number_of_user_bins\": 96,\n" - "@ \"number_of_hash_functions\": 2,\n" - "@ \"maximum_fpr\": 0.05,\n" - "@ \"relaxed_fpr\": 0.3,\n" - "@ \"threads\": 1,\n" - "@ \"sketch_bits\": 12,\n" - "@ \"tmax\": 64,\n" - "@ \"empty_bin_fraction\": 0.0,\n" - "@ \"alpha\": 1.2,\n" - "@ \"max_rearrangement_ratio\": 0.5,\n" - "@ \"disable_estimate_union\": true,\n" - "@ \"disable_rearrangement\": true\n" - "@ }\n" - "@}\n" - "@HIBF_CONFIG_END\n" - "#TOP_LEVEL_IBF fullest_technical_bin_idx:23\n" - "#LOWER_LEVEL_IBF_1 fullest_technical_bin_idx:24\n" - "#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:28\n" - "#LOWER_LEVEL_IBF_3 fullest_technical_bin_idx:6\n" - "#LOWER_LEVEL_IBF_4 fullest_technical_bin_idx:15\n" - "#LOWER_LEVEL_IBF_5 fullest_technical_bin_idx:0\n" - "#LOWER_LEVEL_IBF_6 fullest_technical_bin_idx:24\n" - "#LOWER_LEVEL_IBF_8 fullest_technical_bin_idx:24\n" - "#LOWER_LEVEL_IBF_9 fullest_technical_bin_idx:35\n" - "#LOWER_LEVEL_IBF_12 fullest_technical_bin_idx:7\n" - "#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS\n" - "0 5;0 1;2\n" - "1 9;0 
1;2\n" - "2 3;0 1;2\n" - "3 12;0 1;2\n" - "4 12;2 1;2\n" - "5 6;0 1;2\n" - "6 9;2 1;2\n" - "7 9;4 1;2\n" - "8 3;2 1;2\n" - "9 3;4 1;2\n" - "10 4;0 1;2\n" - "11 6;2 1;2\n" - "12 2;0 1;2\n" - "13 2;2 1;2\n" - "14 2;4 1;2\n" - "15 4;2 1;2\n" - "16 12;4 1;3\n" - "17 6;4 1;2\n" - "18 5;2 1;3\n" - "19 9;6 1;2\n" - "20 8;0 1;12\n" - "21 8;12 1;12\n" - "22 6;6 1;9\n" - "23 6;15 1;9\n" - "24 9;8 1;9\n" - "25 9;17 1;9\n" - "26 4;4 1;11\n" - "27 4;15 1;11\n" - "28 1;0 1;12\n" - "29 1;12 1;12\n" - "30 2;6 1;11\n" - "31 2;17 1;11\n" - "32 12;7 1;13\n" - "33 6;24 1;9\n" - "34 5;5 1;14\n" - "35 9;26 1;9\n" - "36 3;6 1;13\n" - "37 22 1\n" - "38 21 1\n" - "39 20 1\n" - "40 15 1\n" - "41 18 1\n" - "42 19 1\n" - "43 13 1\n" - "44 17 1\n" - "45 14 1\n" - "46 10 1\n" - "47 16 1\n" - "48 11 1\n" - "49 8;24 1;40\n" - "50 12;20 1;44\n" - "51 6;33 1;31\n" - "52 5;19 1;45\n" - "53 9;35 1;29\n" - "54 3;19 1;45\n" - "55 4;26 1;38\n" - "56 7 1\n" - "57 1;24 1;40\n" - "58 0 1\n" - "59 2;28 1;36\n" - "60 63 1\n" - "61 62 1\n" - "62 60 1\n" - "63 61 1\n" - "64 59 1\n" - "65 58 1\n" - "66 57 1\n" - "67 55 1\n" - "68 56 1\n" - "69 54 1\n" - "70 52 1\n" - "71 53 1\n" - "72 51 1\n" - "73 49 1\n" - "74 48 1\n" - "75 50 1\n" - "76 45 1\n" - "77 47 1\n" - "78 46 1\n" - "79 44 1\n" - "80 39 1\n" - "81 40 1\n" - "82 41 1\n" - "83 42 1\n" - "84 38 1\n" - "85 37 1\n" - "86 43 1\n" - "87 36 1\n" - "88 35 1\n" - "89 34 1\n" - "90 29 2\n" - "91 31 2\n" - "92 27 2\n" - "93 23 2\n" - "94 33 1\n" - "95 25 2\n"}; + std::string const expected_file{"@CHOPPER_USER_BINS\n" + "@0 seq0\n" + "@1 seq1\n" + "@2 seq2\n" + "@3 seq3\n" + "@4 seq4\n" + "@5 seq5\n" + "@6 seq6\n" + "@7 seq7\n" + "@8 seq8\n" + "@9 seq9\n" + "@10 seq10\n" + "@11 seq11\n" + "@12 seq12\n" + "@13 seq13\n" + "@14 seq14\n" + "@15 seq15\n" + "@16 seq16\n" + "@17 seq17\n" + "@18 seq18\n" + "@19 seq19\n" + "@20 seq20\n" + "@21 seq21\n" + "@22 seq22\n" + "@23 seq23\n" + "@24 seq24\n" + "@25 seq25\n" + "@26 seq26\n" + "@27 seq27\n" + "@28 seq28\n" + "@29 
seq29\n" + "@30 seq30\n" + "@31 seq31\n" + "@32 seq32\n" + "@33 seq33\n" + "@34 seq34\n" + "@35 seq35\n" + "@36 seq36\n" + "@37 seq37\n" + "@38 seq38\n" + "@39 seq39\n" + "@40 seq40\n" + "@41 seq41\n" + "@42 seq42\n" + "@43 seq43\n" + "@44 seq44\n" + "@45 seq45\n" + "@46 seq46\n" + "@47 seq47\n" + "@48 seq48\n" + "@49 seq49\n" + "@50 seq50\n" + "@51 seq51\n" + "@52 seq52\n" + "@53 seq53\n" + "@54 seq54\n" + "@55 seq55\n" + "@56 seq56\n" + "@57 seq57\n" + "@58 seq58\n" + "@59 seq59\n" + "@60 seq60\n" + "@61 seq61\n" + "@62 seq62\n" + "@63 seq63\n" + "@64 seq64\n" + "@65 seq65\n" + "@66 seq66\n" + "@67 seq67\n" + "@68 seq68\n" + "@69 seq69\n" + "@70 seq70\n" + "@71 seq71\n" + "@72 seq72\n" + "@73 seq73\n" + "@74 seq74\n" + "@75 seq75\n" + "@76 seq76\n" + "@77 seq77\n" + "@78 seq78\n" + "@79 seq79\n" + "@80 seq80\n" + "@81 seq81\n" + "@82 seq82\n" + "@83 seq83\n" + "@84 seq84\n" + "@85 seq85\n" + "@86 seq86\n" + "@87 seq87\n" + "@88 seq88\n" + "@89 seq89\n" + "@90 seq90\n" + "@91 seq91\n" + "@92 seq92\n" + "@93 seq93\n" + "@94 seq94\n" + "@95 seq95\n" + "@CHOPPER_USER_BINS_END\n" + "@CHOPPER_CONFIG\n" + "@{\n" + "@ \"chopper_config\": {\n" + "@ \"version\": 2,\n" + "@ \"data_file\": {\n" + "@ \"value0\": \"\"\n" + "@ },\n" + "@ \"debug\": false,\n" + "@ \"sketch_directory\": {\n" + "@ \"value0\": \"\"\n" + "@ },\n" + "@ \"k\": 19,\n" + "@ \"window_size\": 19,\n" + "@ \"disable_sketch_output\": true,\n" + "@ \"precomputed_files\": false,\n" + "@ \"output_filename\": {\n" + "@ \"value0\": \"" + + layout_file.string() + + "\"\n" + "@ },\n" + "@ \"determine_best_tmax\": false,\n" + "@ \"force_all_binnings\": false\n" + "@ }\n" + "@}\n" + "@CHOPPER_CONFIG_END\n" + "@HIBF_CONFIG\n" + "@{\n" + "@ \"hibf_config\": {\n" + "@ \"version\": 2,\n" + "@ \"number_of_user_bins\": 96,\n" + "@ \"number_of_hash_functions\": 2,\n" + "@ \"maximum_fpr\": 0.05,\n" + "@ \"relaxed_fpr\": 0.3,\n" + "@ \"threads\": 1,\n" + "@ \"sketch_bits\": 12,\n" + "@ \"tmax\": 64,\n" + "@ 
\"empty_bin_fraction\": 0.0,\n" + "@ \"alpha\": 1.2,\n" + "@ \"max_rearrangement_ratio\": 0.5,\n" + "@ \"disable_estimate_union\": true,\n" + "@ \"disable_rearrangement\": true\n" + "@ }\n" + "@}\n" + "@HIBF_CONFIG_END\n" + "#TOP_LEVEL_IBF fullest_technical_bin_idx:23\n" + "#LOWER_LEVEL_IBF_1 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:28\n" + "#LOWER_LEVEL_IBF_3 fullest_technical_bin_idx:6\n" + "#LOWER_LEVEL_IBF_4 fullest_technical_bin_idx:15\n" + "#LOWER_LEVEL_IBF_5 fullest_technical_bin_idx:0\n" + "#LOWER_LEVEL_IBF_6 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_8 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_9 fullest_technical_bin_idx:35\n" + "#LOWER_LEVEL_IBF_12 fullest_technical_bin_idx:7\n" + "#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS\n" + "0 5;0 1;2\n" + "1 9;0 1;2\n" + "2 3;0 1;2\n" + "3 12;0 1;2\n" + "4 12;2 1;2\n" + "5 6;0 1;2\n" + "6 9;2 1;2\n" + "7 9;4 1;2\n" + "8 3;2 1;2\n" + "9 3;4 1;2\n" + "10 4;0 1;2\n" + "11 6;2 1;2\n" + "12 2;0 1;2\n" + "13 2;2 1;2\n" + "14 2;4 1;2\n" + "15 4;2 1;2\n" + "16 12;4 1;3\n" + "17 6;4 1;2\n" + "18 5;2 1;3\n" + "19 9;6 1;2\n" + "20 8;0 1;12\n" + "21 8;12 1;12\n" + "22 6;6 1;9\n" + "23 6;15 1;9\n" + "24 9;8 1;9\n" + "25 9;17 1;9\n" + "26 4;4 1;11\n" + "27 4;15 1;11\n" + "28 1;0 1;12\n" + "29 1;12 1;12\n" + "30 2;6 1;11\n" + "31 2;17 1;11\n" + "32 12;7 1;13\n" + "33 6;24 1;9\n" + "34 5;5 1;14\n" + "35 9;26 1;9\n" + "36 3;6 1;13\n" + "37 22 1\n" + "38 21 1\n" + "39 20 1\n" + "40 15 1\n" + "41 18 1\n" + "42 19 1\n" + "43 13 1\n" + "44 17 1\n" + "45 14 1\n" + "46 10 1\n" + "47 16 1\n" + "48 11 1\n" + "49 8;24 1;40\n" + "50 12;20 1;44\n" + "51 6;33 1;31\n" + "52 5;19 1;45\n" + "53 9;35 1;29\n" + "54 3;19 1;45\n" + "55 4;26 1;38\n" + "56 7 1\n" + "57 1;24 1;40\n" + "58 0 1\n" + "59 2;28 1;36\n" + "60 63 1\n" + "61 62 1\n" + "62 60 1\n" + "63 61 1\n" + "64 59 1\n" + "65 58 1\n" + "66 57 1\n" + "67 55 1\n" + "68 56 1\n" + "69 54 1\n" + "70 52 1\n" + "71 53 1\n" 
+ "72 51 1\n" + "73 49 1\n" + "74 48 1\n" + "75 50 1\n" + "76 45 1\n" + "77 47 1\n" + "78 46 1\n" + "79 44 1\n" + "80 39 1\n" + "81 40 1\n" + "82 41 1\n" + "83 42 1\n" + "84 38 1\n" + "85 37 1\n" + "86 43 1\n" + "87 36 1\n" + "88 35 1\n" + "89 34 1\n" + "90 29 2\n" + "91 31 2\n" + "92 27 2\n" + "93 23 2\n" + "94 33 1\n" + "95 25 2\n"}; std::string const actual_file{string_from_file(layout_file)}; EXPECT_EQ(actual_file, expected_file) << actual_file << std::endl; diff --git a/test/api/lsh_test.cpp b/test/api/lsh_test.cpp index 71be4de1..1ebfd552 100644 --- a/test/api/lsh_test.cpp +++ b/test/api/lsh_test.cpp @@ -136,7 +136,6 @@ TEST(LSH_find_representative_cluster_test, multi_cluster_one_move) mclusters[1].move_to(mclusters[0]); EXPECT_EQ(chopper::LSH_find_representative_cluster(mclusters, mclusters[1].id()), mclusters[0].id()); - } TEST(LSH_find_representative_cluster_test, cluster_two_moves) From 7c647ab890d6110bdc34f546d0621597f762212c Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 25 Jul 2025 13:50:51 +0200 Subject: [PATCH 09/18] unused variables --- src/layout/execute.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 9e2f4e0b..e8053800 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -160,7 +160,7 @@ std::vector very_similar_LSH_partitioning(std::vector Date: Fri, 25 Jul 2025 13:59:56 +0200 Subject: [PATCH 10/18] fix linkage --- src/layout/determine_split_bins.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layout/determine_split_bins.cpp b/src/layout/determine_split_bins.cpp index 08e5fdee..013975b4 100644 --- a/src/layout/determine_split_bins.cpp +++ b/src/layout/determine_split_bins.cpp @@ -15,7 +15,7 @@ #include // for data_store #include -namespace chopper::fast_layout +namespace chopper::layout { std::pair determine_split_bins(chopper::configuration const & config, From 53e301f9e93e479a82ebd43082259dbb7ad8785f Mon Sep 17 
00:00:00 2001 From: Enrico Seiler Date: Fri, 25 Jul 2025 14:04:26 +0200 Subject: [PATCH 11/18] [DELETE ME] temporarily make tests compile --- test/api/layout/execute_layout_test.cpp | 15 ++++++++------ .../layout/execute_with_estimation_test.cpp | 20 +++++++++++-------- test/api/layout/hibf_statistics_test.cpp | 10 ++++++---- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/test/api/layout/execute_layout_test.cpp b/test/api/layout/execute_layout_test.cpp index 5449ec12..d77a22f7 100644 --- a/test/api/layout/execute_layout_test.cpp +++ b/test/api/layout/execute_layout_test.cpp @@ -46,9 +46,10 @@ TEST(execute_test, few_ubs) filenames{{"seq0a", "seq0b"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}}; std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, filenames, sketches); + chopper::layout::execute(config, filenames, sketches, minHash_sketches); std::string const expected_file{"@CHOPPER_USER_BINS\n" "@0 seq0a seq0b\n" @@ -142,9 +143,10 @@ TEST(execute_test, set_default_tmax) filenames{{"seq0"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}}; std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, filenames, sketches); + chopper::layout::execute(config, filenames, sketches, minHash_sketches); EXPECT_EQ(config.hibf_config.tmax, 64u); } @@ -178,9 +180,10 @@ TEST(execute_test, many_ubs) config.hibf_config.disable_estimate_union = true; // also disables rearrangement std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + 
seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, many_filenames, sketches); + chopper::layout::execute(config, many_filenames, sketches, minHash_sketches); std::string const expected_file{"@CHOPPER_USER_BINS\n" "@0 seq0\n" diff --git a/test/api/layout/execute_with_estimation_test.cpp b/test/api/layout/execute_with_estimation_test.cpp index c33d0243..9951befe 100644 --- a/test/api/layout/execute_with_estimation_test.cpp +++ b/test/api/layout/execute_with_estimation_test.cpp @@ -53,9 +53,10 @@ TEST(execute_estimation_test, few_ubs) filenames{{"seq0"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}}; std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, filenames, sketches); + chopper::layout::execute(config, filenames, sketches, minHash_sketches); ASSERT_TRUE(std::filesystem::exists(stats_file)); @@ -114,9 +115,10 @@ TEST(execute_estimation_test, many_ubs) config.hibf_config.disable_estimate_union = true; // also disables rearrangement std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, many_filenames, sketches); + chopper::layout::execute(config, many_filenames, sketches, minHash_sketches); ASSERT_TRUE(std::filesystem::exists(stats_file)); @@ -421,9 +423,10 @@ TEST(execute_estimation_test, many_ubs_force_all) config.hibf_config.disable_estimate_union = true; // also disables rearrangement std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, 
minHash_sketches); - chopper::layout::execute(config, many_filenames, sketches); + chopper::layout::execute(config, many_filenames, sketches, minHash_sketches); ASSERT_TRUE(std::filesystem::exists(stats_file)); @@ -523,9 +526,10 @@ TEST(execute_estimation_test, with_rearrangement) config.hibf_config.number_of_user_bins = filenames.size(); std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, filenames, sketches); + chopper::layout::execute(config, filenames, sketches, minHash_sketches); ASSERT_TRUE(std::filesystem::exists(stats_file)); diff --git a/test/api/layout/hibf_statistics_test.cpp b/test/api/layout/hibf_statistics_test.cpp index 718b43dc..054fbcb2 100644 --- a/test/api/layout/hibf_statistics_test.cpp +++ b/test/api/layout/hibf_statistics_test.cpp @@ -133,11 +133,12 @@ TEST(execute_test, chopper_layout_statistics) .disable_estimate_union = true /* also disable rearrangement */}}; std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); testing::internal::CaptureStdout(); testing::internal::CaptureStderr(); - chopper::layout::execute(config, many_filenames, sketches); + chopper::layout::execute(config, many_filenames, sketches, minHash_sketches); std::string layout_result_stdout = testing::internal::GetCapturedStdout(); std::string layout_result_stderr = testing::internal::GetCapturedStderr(); @@ -190,9 +191,10 @@ TEST(execute_test, chopper_layout_statistics_determine_best_bins) .disable_estimate_union = true /* also disable rearrangement */}}; std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + 
seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, filenames, sketches); + chopper::layout::execute(config, filenames, sketches, minHash_sketches); std::string expected_cout = R"expected_cout(## ### Parameters ### From 8003d00b19664628c721f1463694fcb52bcc231a Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 25 Jul 2025 15:40:07 +0200 Subject: [PATCH 12/18] dedup --- include/chopper/configuration.hpp | 5 +- include/chopper/lsh.hpp | 61 ++++++++---------- src/layout/execute.cpp | 103 +++++++++++------------------- 3 files changed, 68 insertions(+), 101 deletions(-) diff --git a/include/chopper/configuration.hpp b/include/chopper/configuration.hpp index 825bc407..4d8e80ff 100644 --- a/include/chopper/configuration.hpp +++ b/include/chopper/configuration.hpp @@ -20,7 +20,7 @@ namespace chopper { -enum partitioning_scheme +enum class partitioning_scheme : uint8_t { blocked, // 0 sorted, // 1 @@ -33,7 +33,8 @@ enum partitioning_scheme struct configuration { - int partitioning_approach{6}; + partitioning_scheme partitioning_approach{partitioning_scheme::lsh_sim}; + /*!\name General Configuration * \{ */ diff --git a/include/chopper/lsh.hpp b/include/chopper/lsh.hpp index 5ef11a75..caa83975 100644 --- a/include/chopper/lsh.hpp +++ b/include/chopper/lsh.hpp @@ -36,17 +36,11 @@ struct Cluster Cluster & operator=(Cluster &&) = default; ~Cluster() = default; - Cluster(size_t const id) - { - representative_id = id; - user_bins = {id}; - } + Cluster(size_t const id, size_t const user_bins_id) : representative_id{id}, user_bins({user_bins_id}) + {} - Cluster(size_t const id, size_t const user_bins_id) - { - representative_id = id; - user_bins = {user_bins_id}; - } + Cluster(size_t const id) : Cluster{id, id} + {} size_t id() const { @@ -76,7 +70,7 @@ struct Cluster bool is_valid(size_t id) const { bool const ids_equal = representative_id == id; - bool const properly_moved = has_been_moved() && empty() 
&& moved_id.has_value(); + bool const properly_moved = has_been_moved() && empty(); bool const not_moved = !has_been_moved() && !empty(); return ids_equal && (properly_moved || not_moved); @@ -98,12 +92,11 @@ struct Cluster void sort_by_cardinality(std::vector const & cardinalities) { - std::sort(user_bins.begin(), - user_bins.end(), - [&cardinalities](auto const & v1, auto const & v2) - { - return cardinalities[v2] < cardinalities[v1]; - }); + std::ranges::sort(user_bins, + [&cardinalities](auto const & v1, auto const & v2) + { + return cardinalities[v1] > cardinalities[v2]; + }); } }; @@ -123,14 +116,11 @@ struct MultiCluster : Cluster MultiCluster(Cluster const & clust) { representative_id = clust.id(); + if (clust.has_been_moved()) - { moved_id = clust.moved_to_cluster_id(); - } else - { user_bins.push_back(clust.contained_user_bins()); - } } std::vector> const & contained_user_bins() const @@ -153,7 +143,7 @@ struct MultiCluster : Cluster bool is_valid(size_t id) const { bool const ids_equal = representative_id == id; - bool const properly_moved = has_been_moved() && empty() && moved_id.has_value(); + bool const properly_moved = has_been_moved() && empty(); bool const not_moved = !has_been_moved() && !empty(); return ids_equal && (properly_moved || not_moved); @@ -169,18 +159,21 @@ struct MultiCluster : Cluster // sort user bins within a Cluster by cardinality and the clusters themselves by size void sort_by_cardinality(std::vector const & cardinalities) { - auto cmp = [&cardinalities](auto const & v1, auto const & v2) - { - return cardinalities[v2] < cardinalities[v1]; - }; - for (auto & user_bin_cluster : user_bins) - std::sort(user_bin_cluster.begin(), user_bin_cluster.end(), cmp); - - auto cmp_clusters = [](auto const & c1, auto const & c2) - { - return c2.size() < c1.size(); - }; - std::sort(user_bins.begin(), user_bins.end(), cmp_clusters); + std::ranges::for_each(user_bins, + [&cardinalities](auto & user_bin_cluster) + { + 
std::ranges::sort(user_bin_cluster, + [&cardinalities](auto const & v1, auto const & v2) + { + return cardinalities[v1] > cardinalities[v2]; + }); + }); + + std::ranges::sort(user_bins, + [](auto const & c1, auto const & c2) + { + return c1.size() > c2.size(); + }); } }; diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index e8053800..5058e1e3 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -47,23 +47,30 @@ namespace chopper::layout uint64_t lsh_hash_the_sketch(std::vector const & sketch, size_t const number_of_hashes_to_consider) { - // lets just compute the sum - uint64_t sum{0}; - - for (size_t i = 0; i < number_of_hashes_to_consider; ++i) - sum += sketch[i]; - - return sum; + return std::reduce(sketch.begin(), sketch.begin() + number_of_hashes_to_consider); } -auto LSH_fill_hashtable(std::vector const & clusters, +template +auto LSH_fill_hashtable(std::vector const & clusters, std::vector const & minHash_sketches, size_t const current_sketch_index, size_t const current_number_of_sketch_hashes) { + static constexpr bool is_multi_cluster = std::same_as; + static_assert(std::same_as || std::same_as); + robin_hood::unordered_flat_map> table; - size_t processed_user_bins{0}; // only for sanity check + [[maybe_unused]] size_t processed_user_bins{0}; // only for sanity check + + auto get_user_bin_idx = [](cluster_type const & cluster) + { + if constexpr (is_multi_cluster) + return std::views::join(cluster.contained_user_bins()); + else + return cluster.contained_user_bins(); + }; + for (size_t pos = 0; pos < clusters.size(); ++pos) { auto const & current = clusters[pos]; @@ -72,7 +79,7 @@ auto LSH_fill_hashtable(std::vector const & clusters, if (current.has_been_moved()) // cluster has been moved somewhere else, don't process continue; - for (size_t const user_bin_idx : current.contained_user_bins()) + for (size_t const user_bin_idx : get_user_bin_idx(current)) { ++processed_user_bins; uint64_t const key = 
lsh_hash_the_sketch(minHash_sketches[user_bin_idx].table[current_sketch_index], @@ -85,42 +92,10 @@ auto LSH_fill_hashtable(std::vector const & clusters, return table; } -auto LSH_fill_hashtable(std::vector const & clusters, - std::vector const & minHash_sketches, - size_t const current_sketch_index, - size_t const current_number_of_sketch_hashes) -{ - robin_hood::unordered_flat_map> table; - - size_t processed_user_bins{0}; // only for sanity check - for (size_t pos = 0; pos < clusters.size(); ++pos) - { - auto const & current = clusters[pos]; - assert(current.is_valid(pos)); - - if (current.has_been_moved()) // cluster has been moved somewhere else, don't process - continue; - - for (auto const & similarity_cluster : current.contained_user_bins()) - { - for (size_t const user_bin_idx : similarity_cluster) - { - ++processed_user_bins; - uint64_t const key = lsh_hash_the_sketch(minHash_sketches[user_bin_idx].table[current_sketch_index], - current_number_of_sketch_hashes); - table[key].push_back(current.id()); // insert representative for all user bins - } - } - } - assert(processed_user_bins == clusters.size()); // all user bins should've been processed by one of the clusters - - return table; -} - // minHash_sketches data structure: // Vector L1 : number of user bins // Vector L2 : number_of_max_minHash_sketches (LSH ADD+OR parameter b) -// Vector L3 : minHash_sketche_size (LSH ADD+OR parameter r) +// Vector L3 : minHash_sketch_size (LSH ADD+OR parameter r) std::vector very_similar_LSH_partitioning(std::vector const & minHash_sketches, std::vector const & positions, @@ -136,11 +111,11 @@ std::vector very_similar_LSH_partitioning(std::vector very_similar_LSH_partitioning(std::vector current_cluster_cardinality(number_of_user_bins); std::vector current_cluster_sketches(number_of_user_bins, empty_sketch); size_t current_max_cluster_size{0}; - size_t current_number_of_sketch_hashes{minHash_sketche_size}; // start with high r but decrease it iteratively + size_t 
current_number_of_sketch_hashes{minHash_sketch_size}; // start with high r but decrease it iteratively size_t current_sketch_index{0}; [[maybe_unused]] size_t current_number_of_clusters{number_of_user_bins}; // initially, each UB is a separate cluster @@ -191,8 +166,8 @@ std::vector very_similar_LSH_partitioning(std::vector very_similar_LSH_partitioning(std::vector & clusters, // push largest p clusters to the front auto cluster_size_cmp = [&cardinalities](auto const & v1, auto const & v2) { - if (v2.size() == v1.size() && !v2.empty()) - return cardinalities[v2.contained_user_bins()[0]] < cardinalities[v1.contained_user_bins()[0]]; - return v2.size() < v1.size(); + if (v1.size() == v2.size() && !v2.empty()) // Note: If v2 is empty, so is v1. + return cardinalities[v1.contained_user_bins().front()] > cardinalities[v2.contained_user_bins().front()]; + + return v1.size() > v2.size(); }; - std::partial_sort(clusters.begin(), - clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), - clusters.end(), - cluster_size_cmp); + std::ranges::partial_sort(clusters, + std::ranges::next(clusters.begin(), config.hibf_config.tmax, clusters.end()), + cluster_size_cmp); // after filling up the partitions with the biggest clusters, sort the clusters by cardinality of the biggest ub // s.t. that euqally sizes ub are assigned after each other and the small stuff is added at last. 
@@ -285,11 +258,11 @@ void post_process_clusters(std::vector & clusters, if (v2.empty()) // and v1 is not, since the first if would catch return true; - return cardinalities[v2.contained_user_bins()[0]] < cardinalities[v1.contained_user_bins()[0]]; + return cardinalities[v1.contained_user_bins().front()] > cardinalities[v2.contained_user_bins().front()]; }; - std::sort(clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), - clusters.end(), - compare_cardinality_and_move_empty_clusters_to_the_end); + std::ranges::sort(std::ranges::next(clusters.begin(), config.hibf_config.tmax, clusters.end()), + clusters.end(), + compare_cardinality_and_move_empty_clusters_to_the_end); assert(clusters[0].size() >= clusters[1].size()); // sanity check // assert(cardinalities[clusters[std::min(clusters.size(), config.hibf_config.tmax)].contained_user_bins()[0]] >= cardinalities[clusters[std::min(clusters.size(), config.hibf_config.tmax) + 1].contained_user_bins()[0]]); // sanity check @@ -297,8 +270,8 @@ void post_process_clusters(std::vector & clusters, // debug for (size_t cidx = 1; cidx < clusters.size(); ++cidx) { - if (clusters[cidx - 1].empty() - && !clusters[cidx].empty()) // once empty - always empty; all empty clusters should be at the end + // once empty - always empty; all empty clusters should be at the end + if (clusters[cidx - 1].empty() && !clusters[cidx].empty()) throw std::runtime_error{"sorting did not work"}; } // debug From 47e50e2d6ee75bab180369da2d198ce353c02a7a Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 25 Jul 2025 16:03:50 +0200 Subject: [PATCH 13/18] uniquify in LSH_fill_hashtable --- src/layout/execute.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 5058e1e3..dc3c624b 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -89,6 +89,15 @@ auto LSH_fill_hashtable(std::vector const & clusters, } 
assert(processed_user_bins == clusters.size()); // all user bins should've been processed by one of the clusters + // uniquify list. Since I am inserting representative_idx's into the table, the same number can + // be inserted into multiple splots, and multiple times in the same slot. + for (auto & [key, list] : table) + { + std::ranges::sort(list); + auto const ret = std::ranges::unique(list); + list.erase(ret.begin(), ret.end()); + } + return table; } @@ -164,13 +173,7 @@ std::vector very_similar_LSH_partitioning(std::vector very_similar_LSH_partitioning(std::vector Date: Fri, 25 Jul 2025 16:04:07 +0200 Subject: [PATCH 14/18] libc++ sort --- test/api/layout/fast_layout_test.cpp | 113 ++++++++++++++++++++++++++- 1 file changed, 112 insertions(+), 1 deletion(-) diff --git a/test/api/layout/fast_layout_test.cpp b/test/api/layout/fast_layout_test.cpp index ca2a4a5c..9f175672 100644 --- a/test/api/layout/fast_layout_test.cpp +++ b/test/api/layout/fast_layout_test.cpp @@ -198,6 +198,115 @@ TEST(execute_test, many_ubs) "@ }\n" "@}\n" "@HIBF_CONFIG_END\n" +#ifdef _LIBCPP_VERSION + "#TOP_LEVEL_IBF fullest_technical_bin_idx:23\n" + "#LOWER_LEVEL_IBF_1 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:28\n" + "#LOWER_LEVEL_IBF_3 fullest_technical_bin_idx:6\n" + "#LOWER_LEVEL_IBF_4 fullest_technical_bin_idx:15\n" + "#LOWER_LEVEL_IBF_5 fullest_technical_bin_idx:0\n" + "#LOWER_LEVEL_IBF_7 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_8 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_10 fullest_technical_bin_idx:35\n" + "#LOWER_LEVEL_IBF_13 fullest_technical_bin_idx:7\n" + "#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS\n" + "0 5;0 1;2\n" + "1 10;0 1;2\n" + "2 3;0 1;2\n" + "3 13;0 1;2\n" + "4 13;2 1;2\n" + "5 7;0 1;2\n" + "6 10;2 1;2\n" + "7 10;4 1;2\n" + "8 3;2 1;2\n" + "9 3;4 1;2\n" + "10 4;0 1;2\n" + "11 7;2 1;2\n" + "12 2;0 1;2\n" + "13 2;2 1;2\n" + "14 2;4 1;2\n" + "15 4;2 1;2\n" + "16 13;4 1;3\n" + "17 7;4 1;2\n" + 
"18 5;2 1;3\n" + "19 10;6 1;2\n" + "20 8;0 1;12\n" + "21 8;12 1;12\n" + "22 7;6 1;9\n" + "23 7;15 1;9\n" + "24 10;8 1;9\n" + "25 10;17 1;9\n" + "26 4;4 1;11\n" + "27 4;15 1;11\n" + "28 1;0 1;12\n" + "29 1;12 1;12\n" + "30 2;6 1;11\n" + "31 2;17 1;11\n" + "32 13;7 1;13\n" + "33 7;24 1;9\n" + "34 5;5 1;14\n" + "35 10;26 1;9\n" + "36 3;6 1;13\n" + "37 22 1\n" + "38 21 1\n" + "39 20 1\n" + "40 15 1\n" + "41 18 1\n" + "42 19 1\n" + "43 12 1\n" + "44 17 1\n" + "45 16 1\n" + "46 9 1\n" + "47 14 1\n" + "48 11 1\n" + "49 8;24 1;40\n" + "50 13;20 1;44\n" + "51 7;33 1;31\n" + "52 5;19 1;45\n" + "53 10;35 1;29\n" + "54 3;19 1;45\n" + "55 4;26 1;38\n" + "56 6 1\n" + "57 1;24 1;40\n" + "58 0 1\n" + "59 2;28 1;36\n" + "60 63 1\n" + "61 62 1\n" + "62 60 1\n" + "63 61 1\n" + "64 59 1\n" + "65 58 1\n" + "66 57 1\n" + "67 55 1\n" + "68 56 1\n" + "69 54 1\n" + "70 53 1\n" + "71 52 1\n" + "72 51 1\n" + "73 49 1\n" + "74 48 1\n" + "75 50 1\n" + "76 45 1\n" + "77 47 1\n" + "78 46 1\n" + "79 44 1\n" + "80 43 1\n" + "81 42 1\n" + "82 41 1\n" + "83 40 1\n" + "84 39 1\n" + "85 37 1\n" + "86 38 1\n" + "87 36 1\n" + "88 35 1\n" + "89 34 1\n" + "90 29 2\n" + "91 31 2\n" + "92 27 2\n" + "93 23 2\n" + "94 33 1\n" + "95 25 2\n" +#else "#TOP_LEVEL_IBF fullest_technical_bin_idx:23\n" "#LOWER_LEVEL_IBF_1 fullest_technical_bin_idx:24\n" "#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:28\n" @@ -304,7 +413,9 @@ TEST(execute_test, many_ubs) "92 27 2\n" "93 23 2\n" "94 33 1\n" - "95 25 2\n"}; + "95 25 2\n" +#endif + }; std::string const actual_file{string_from_file(layout_file)}; EXPECT_EQ(actual_file, expected_file) << actual_file << std::endl; From 9b2c2dac29772c233258b9d59ee60bdd9585c653 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 25 Jul 2025 16:27:03 +0200 Subject: [PATCH 15/18] very_similar_LSH_partitioning --- src/layout/execute.cpp | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 
dc3c624b..cc1a6ebf 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -144,7 +144,6 @@ std::vector very_similar_LSH_partitioning(std::vector very_similar_LSH_partitioning(std::vector Cluster & + { + size_t const cluster_id = LSH_find_representative_cluster(clusters, current); + return clusters[cluster_id]; + }; + // refine clusters //std::cout << "Start clustering with threshold average_technical_bin_size: " << average_technical_bin_size << std::endl; while (current_max_cluster_size < average_technical_bin_size && /*number_of_clusters / static_cast(number_of_user_bins) > 0.5 &&*/ current_sketch_index < number_of_max_minHash_sketches) // I want to cluster 10%? { - //std::cout << "Current number of clusters: " << current_number_of_clusters; - // fill LSH collision hashtable robin_hood::unordered_flat_map> table = LSH_fill_hashtable(clusters, minHash_sketches, current_sketch_index, current_number_of_sketch_hashes); @@ -183,39 +186,33 @@ std::vector very_similar_LSH_partitioning(std::vector 1); // there should be at least two user bins now - assert(representative_cluster.is_valid(representative_cluster_id)); // and it should still be valid - - current_cluster_sketches[representative_cluster.id()].merge( - current_cluster_sketches[next_cluster.id()]); - current_cluster_cardinality[representative_cluster.id()] = - current_cluster_sketches[representative_cluster.id()].estimate(); - --current_number_of_clusters; + representative_cluster_sketch.merge(current_cluster_sketches[next_cluster.id()]); } - current_max_cluster_size = std::ranges::max(current_cluster_cardinality); + current_cluster_cardinality[representative_cluster.id()] = representative_cluster_sketch.estimate(); } + current_max_cluster_size = std::ranges::max(current_cluster_cardinality); ++current_sketch_index; } From 208739e1161e03d69d5bea9eed758c9aec632d96 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 25 Jul 2025 16:45:50 +0200 Subject: [PATCH 16/18] small stuff --- 
src/layout/execute.cpp | 56 +++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index cc1a6ebf..e4931196 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -228,7 +228,6 @@ void post_process_clusters(std::vector & clusters, // since post processing involves re-ordering the clusters, the moved_to_cluster_id value of a cluster will not // refer to the position of the cluster in the `clusters` vecto anymore but the cluster with the resprive id() // would neet to be found - for (size_t pos = 0; pos < clusters.size(); ++pos) { assert(clusters[pos].is_valid(pos)); @@ -236,35 +235,36 @@ void post_process_clusters(std::vector & clusters, } // push largest p clusters to the front - auto cluster_size_cmp = [&cardinalities](auto const & v1, auto const & v2) - { - if (v1.size() == v2.size() && !v2.empty()) // Note: If v2 is empty, so is v1. - return cardinalities[v1.contained_user_bins().front()] > cardinalities[v2.contained_user_bins().front()]; - - return v1.size() > v2.size(); - }; std::ranges::partial_sort(clusters, std::ranges::next(clusters.begin(), config.hibf_config.tmax, clusters.end()), - cluster_size_cmp); + [&cardinalities](auto const & v1, auto const & v2) + { + // Note: If v2 is empty, so is v1. + if (v1.size() == v2.size() && !v2.empty()) + return cardinalities[v1.contained_user_bins().front()] + > cardinalities[v2.contained_user_bins().front()]; + + return v1.size() > v2.size(); + }); // after filling up the partitions with the biggest clusters, sort the clusters by cardinality of the biggest ub // s.t. that euqally sizes ub are assigned after each other and the small stuff is added at last. // the largest ub is already at the start because of former sorting. 
- auto compare_cardinality_and_move_empty_clusters_to_the_end = [&cardinalities](auto const & v1, auto const & v2) - { - if (v1.empty()) - return false; // v1 can never be larger than v2 then - - if (v2.empty()) // and v1 is not, since the first if would catch - return true; - - return cardinalities[v1.contained_user_bins().front()] > cardinalities[v2.contained_user_bins().front()]; - }; std::ranges::sort(std::ranges::next(clusters.begin(), config.hibf_config.tmax, clusters.end()), clusters.end(), - compare_cardinality_and_move_empty_clusters_to_the_end); + [&cardinalities](auto const & v1, auto const & v2) + { + if (v1.empty()) + return false; // v1 can never be larger than v2 then + + if (v2.empty()) // and v1 is not, since the first if would catch + return true; + + return cardinalities[v1.contained_user_bins().front()] + > cardinalities[v2.contained_user_bins().front()]; + }); - assert(clusters[0].size() >= clusters[1].size()); // sanity check + assert(clusters.size() < 2 || clusters[0].size() >= clusters[1].size()); // sanity check // assert(cardinalities[clusters[std::min(clusters.size(), config.hibf_config.tmax)].contained_user_bins()[0]] >= cardinalities[clusters[std::min(clusters.size(), config.hibf_config.tmax) + 1].contained_user_bins()[0]]); // sanity check // debug @@ -320,10 +320,10 @@ bool find_best_partition(chopper::configuration const & config, size_t best_p{0}; bool best_p_found{false}; - auto penalty_lower_level = [&](size_t const additional_number_of_user_bins, size_t const p) + auto penalty_lower_level = [&](size_t const additional_number_of_user_bins, size_t const p) -> size_t { - size_t min = std::min(max_card, min_partition_cardinality[p]); - size_t max = std::max(max_card, max_partition_cardinality[p]); + size_t const min = std::min(max_card, min_partition_cardinality[p]); + size_t const max = std::max(max_card, max_partition_cardinality[p]); if (positions[p].size() > config.hibf_config.tmax) // already a third level { @@ -357,7 +357,7 @@ 
bool find_best_partition(chopper::configuration const & config, // else, end if-else-block and zero is returned } - return (size_t)0u; + return 0u; }; for (size_t p = 0; p < number_of_partitions; ++p) @@ -402,8 +402,7 @@ bool find_best_partition(chopper::configuration const & config, min_partition_cardinality[best_p] = std::min(min_partition_cardinality[best_p], cardinalities[user_bin_idx]); } partition_sketches[best_p].merge(current_sketch); - corrected_estimate_per_part = - std::max(corrected_estimate_per_part, static_cast(partition_sketches[best_p].estimate())); + corrected_estimate_per_part = std::max(corrected_estimate_per_part, partition_sketches[best_p].estimate()); return true; } @@ -420,8 +419,9 @@ bool find_best_partition(chopper::configuration const & config, for (size_t idx = 0; idx < end_idx; ++idx) { size_t const ub_idx{sorted_positions[idx]}; + // Question: divide and ceil? size_t const number_of_split_tbs = - std::max((size_t)1, config.hibf_config.tmax * cardinalities[ub_idx] / sum_of_cardinalities); + std::max(size_t{1}, config.hibf_config.tmax * cardinalities[ub_idx] / sum_of_cardinalities); std::cout << "number_of_split_tbs: " << number_of_split_tbs << " cardinalities[ub_idx]:" << cardinalities[ub_idx] << " sum_of_cardinalities:" << sum_of_cardinalities << std::endl; From df35411a5fc49284d10d0d39ad80f91dbe4f7848 Mon Sep 17 00:00:00 2001 From: "seqan-actions[bot]" Date: Fri, 25 Jul 2025 20:16:26 +0200 Subject: [MISC] automatic linting --- src/layout/determine_split_bins.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layout/determine_split_bins.cpp b/src/layout/determine_split_bins.cpp index 013975b4..fb4ba653 100644 --- a/src/layout/determine_split_bins.cpp +++ b/src/layout/determine_split_bins.cpp @@ -150,4 +150,4 @@ std::pair determine_split_bins(chopper::configuration const & co return {number_of_split_tbs, max_size}; } -} // namespace chopper::fast_layout +} // namespace chopper::layout From 
0d03e6e80bd4f4ce82eaa8b059590de2bf7d4f7c Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 25 Jul 2025 20:20:08 +0200 Subject: [PATCH 18/18] fix multi-line-comment --- src/layout/hibf_statistics.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/layout/hibf_statistics.cpp b/src/layout/hibf_statistics.cpp index fdf2bb7d..d764b394 100644 --- a/src/layout/hibf_statistics.cpp +++ b/src/layout/hibf_statistics.cpp @@ -397,8 +397,10 @@ void hibf_statistics::collect_bins() # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Warray-bounds=" # pragma GCC diagnostic ignored "-Wstringop-overflow=" -#endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV \ - // ibf.bins.emplace_back(hibf_statistics::bin_kind::merged, 1, std::vector{user_bin_info.idx}); +#endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV + + // ibf.bins.emplace_back(hibf_statistics::bin_kind::merged, 1, std::vector{user_bin_info.idx}); + #if CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV # pragma GCC diagnostic pop #endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV