From d24ab45ee98a267987ce443ce27303a1ac3d83e7 Mon Sep 17 00:00:00 2001 From: smehringer Date: Mon, 21 Jul 2025 12:30:43 +0200 Subject: [PATCH 01/18] [FEATURE] Add the LSH algorithm. --- include/chopper/lsh.hpp | 199 ++++++++++++++++++++++++++++++++++++++++ test/api/CMakeLists.txt | 1 + test/api/lsh_test.cpp | 160 ++++++++++++++++++++++++++++++++ 3 files changed, 360 insertions(+) create mode 100644 include/chopper/lsh.hpp create mode 100644 test/api/lsh_test.cpp diff --git a/include/chopper/lsh.hpp b/include/chopper/lsh.hpp new file mode 100644 index 00000000..af67bfa9 --- /dev/null +++ b/include/chopper/lsh.hpp @@ -0,0 +1,199 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides chopper::Cluster, chopper::MultiCluster and chopper::LSH_find_representative_cluster.
+ * \author Svenja Mehringer + */ + +#pragma once + +#include + +namespace chopper +{ + +/*\brief foo + */ +struct Cluster +{ +protected: + size_t representative_id{}; // representative id of the cluster; identifier; + + std::vector user_bins{}; // the user bins contained in thus cluster + + std::optional moved_id{std::nullopt}; // where this Clusters user bins where moved to + +public: + Cluster() = default; + Cluster(const Cluster&) = default; + Cluster(Cluster&&) = default; + Cluster& operator=(const Cluster&) = default; + Cluster& operator=(Cluster&&) = default; + ~Cluster() = default; + + Cluster(size_t const id) + { + representative_id = id; + user_bins = {id}; + } + + Cluster(size_t const id, size_t const user_bins_id) + { + representative_id = id; + user_bins = {user_bins_id}; + } + + size_t id() const + { + return representative_id; + } + + std::vector const & contained_user_bins() const + { + return user_bins; + } + + bool has_been_moved() const + { + return moved_id.has_value(); + } + + bool empty() const + { + return user_bins.empty(); + } + + size_t size() const + { + return user_bins.size(); + } + + bool is_valid(size_t id) const + { + bool const ids_equal = representative_id == id; + bool const properly_moved = has_been_moved() && empty() && moved_id.has_value(); + bool const not_moved = !has_been_moved() && !empty(); + + return ids_equal && (properly_moved || not_moved); + } + + size_t moved_to_cluster_id() const + { + assert(moved_id.has_value()); + assert(is_valid(representative_id)); + return moved_id.value(); + } + + void move_to(Cluster & target_cluster) + { + target_cluster.user_bins.insert(target_cluster.user_bins.end(), this->user_bins.begin(), this->user_bins.end()); + this->user_bins.clear(); + moved_id = target_cluster.id(); + } + + void sort_by_cardinality(std::vector const & cardinalities) + { + std::sort(user_bins.begin(), user_bins.end(), [&cardinalities](auto const & v1, auto const & v2){ return cardinalities[v2] < cardinalities[v1]; 
}); + } +}; + +struct MultiCluster : Cluster +{ +protected: + std::vector> user_bins{}; // the user bins contained in this cluster + +public: + MultiCluster() = default; + MultiCluster(const MultiCluster&) = default; + MultiCluster(MultiCluster&&) = default; + MultiCluster& operator=(const MultiCluster&) = default; + MultiCluster& operator=(MultiCluster&&) = default; + ~MultiCluster() = default; + + MultiCluster(Cluster const & clust) + { + representative_id = clust.id(); + if (clust.has_been_moved()) + { + moved_id = clust.moved_to_cluster_id(); + } + else + { + user_bins.push_back(clust.contained_user_bins()); + } + } + + std::vector> const & contained_user_bins() const + { + return user_bins; + } + + // needs to be defined again because of redefinition of `user_bins` + bool empty() const + { + return user_bins.empty(); + } + + // needs to be defined again because of redefinition of `user_bins` + size_t size() const + { + return user_bins.size(); + } + + bool is_valid(size_t id) const + { + bool const ids_equal = representative_id == id; + bool const properly_moved = has_been_moved() && empty() && moved_id.has_value(); + bool const not_moved = !has_been_moved() && !empty(); + + return ids_equal && (properly_moved || not_moved); + } + + void move_to(MultiCluster & target_cluster) + { + target_cluster.user_bins.insert(target_cluster.user_bins.end(), this->user_bins.begin(), this->user_bins.end()); + this->user_bins.clear(); + moved_id = target_cluster.id(); + } + + // sort user bins within a Cluster by cardinality and the clusters themselves by size + void sort_by_cardinality(std::vector const & cardinalities) + { + auto cmp = [&cardinalities](auto const & v1, auto const & v2){ return cardinalities[v2] < cardinalities[v1]; }; + for (auto & user_bin_cluster : user_bins) + std::sort(user_bin_cluster.begin(), user_bin_cluster.end(), cmp); + + auto cmp_clusters = [](auto const & c1, auto const & c2){ return c2.size() < c1.size(); }; + std::sort(user_bins.begin(), 
user_bins.end(), cmp_clusters); + } +}; + + +// A valid cluster is one that hasn't been moved but actually contains user bins +// A valid cluster at position i is identified by the following equality: cluster[i].size() >= 1 && cluster[i][0] == i +// A moved cluster is one that has been joined and thereby moved to another cluster +// A moved cluster i is identified by the following: cluster[i].size() == 1 && cluster[i][0] != i +// returns position of the representative cluster +template +size_t LSH_find_representative_cluster(std::vector const & clusters, size_t current_id) +{ + std::reference_wrapper representative = clusters[current_id]; + + assert(representative.get().is_valid(current_id)); + + while (representative.get().has_been_moved()) + { + current_id = representative.get().moved_to_cluster_id(); + representative = clusters[current_id]; // replace by next cluster + assert(representative.get().is_valid(current_id)); + } + + return current_id; +} + +} // namespace chopper diff --git a/test/api/CMakeLists.txt b/test/api/CMakeLists.txt index 050affcf..3c46f8a4 100644 --- a/test/api/CMakeLists.txt +++ b/test/api/CMakeLists.txt @@ -9,5 +9,6 @@ include (add_subdirectories) add_api_test (config_test.cpp) add_api_test (input_functor_test.cpp) +add_api_test (lsh_test.cpp) add_subdirectories () diff --git a/test/api/lsh_test.cpp b/test/api/lsh_test.cpp new file mode 100644 index 00000000..71be4de1 --- /dev/null +++ b/test/api/lsh_test.cpp @@ -0,0 +1,160 @@ +#include // for Test, TestInfo, EXPECT_EQ, Message, TEST, TestPartResult + +#include // for size_t +#include // for operator<<, char_traits, basic_ostream, basic_stringstream, strings... 
+#include // for allocator, string +#include // for operator<< +#include // for vector + +#include + +TEST(Cluster_test, ctor_from_id) +{ + size_t user_bin_idx{5}; + chopper::Cluster const cluster{user_bin_idx}; + + EXPECT_EQ(cluster.id(), user_bin_idx); + EXPECT_FALSE(cluster.empty()); + EXPECT_EQ(cluster.size(), 1u); + ASSERT_EQ(cluster.contained_user_bins().size(), 1u); + EXPECT_EQ(cluster.contained_user_bins()[0], user_bin_idx); + EXPECT_TRUE(cluster.is_valid(user_bin_idx)); +} + +TEST(Cluster_test, move_to) +{ + size_t user_bin_idx1{5}; + size_t user_bin_idx2{7}; + chopper::Cluster cluster1{user_bin_idx1}; + chopper::Cluster cluster2{user_bin_idx2}; + + EXPECT_TRUE(cluster1.is_valid(user_bin_idx1)); + EXPECT_TRUE(cluster2.is_valid(user_bin_idx2)); + + cluster2.move_to(cluster1); + + // cluster1 now contains user bins 5 and 7 + EXPECT_EQ(cluster1.size(), 2u); + ASSERT_EQ(cluster1.contained_user_bins().size(), 2u); + EXPECT_EQ(cluster1.contained_user_bins()[0], user_bin_idx1); + EXPECT_EQ(cluster1.contained_user_bins()[1], user_bin_idx2); + + // cluster 2 is empty + EXPECT_TRUE(cluster2.has_been_moved()); + EXPECT_TRUE(cluster2.empty()); + EXPECT_EQ(cluster2.size(), 0u); + EXPECT_EQ(cluster2.contained_user_bins().size(), 0u); + EXPECT_EQ(cluster2.moved_to_cluster_id(), cluster1.id()); + + // both should still be valid + EXPECT_TRUE(cluster1.is_valid(user_bin_idx1)); + EXPECT_TRUE(cluster2.is_valid(user_bin_idx2)); +} + +TEST(Multicluster_test, ctor_from_cluster) +{ + chopper::Cluster const cluster1{5}; + chopper::MultiCluster const multi_cluster1{cluster1}; + + EXPECT_EQ(multi_cluster1.id(), cluster1.id()); + EXPECT_FALSE(multi_cluster1.empty()); + EXPECT_EQ(multi_cluster1.size(), 1u); + ASSERT_EQ(multi_cluster1.contained_user_bins().size(), 1u); + ASSERT_EQ(multi_cluster1.contained_user_bins()[0].size(), 1u); + EXPECT_EQ(multi_cluster1.contained_user_bins()[0][0], 5u); + EXPECT_TRUE(multi_cluster1.is_valid(cluster1.id())); +} + +TEST(Multicluster_test, 
ctor_from_moved_cluster) +{ + chopper::Cluster cluster1{5}; + chopper::Cluster cluster2{7}; + cluster2.move_to(cluster1); + + chopper::MultiCluster const multi_cluster2{cluster2}; + + EXPECT_EQ(multi_cluster2.id(), cluster2.id()); + EXPECT_TRUE(multi_cluster2.empty()); + EXPECT_TRUE(multi_cluster2.has_been_moved()); + EXPECT_EQ(multi_cluster2.moved_to_cluster_id(), cluster2.moved_to_cluster_id()); + EXPECT_EQ(multi_cluster2.size(), 0u); + ASSERT_EQ(multi_cluster2.contained_user_bins().size(), 0u); + EXPECT_TRUE(multi_cluster2.is_valid(cluster2.id())); +} + +TEST(Multicluster_test, move_to) +{ + chopper::Cluster cluster1{5}; + chopper::Cluster cluster2{7}; + cluster2.move_to(cluster1); + ASSERT_EQ(cluster1.size(), 2u); + chopper::Cluster const cluster3{13}; + + chopper::MultiCluster multi_cluster1{cluster1}; + EXPECT_TRUE(multi_cluster1.is_valid(cluster1.id())); + EXPECT_EQ(multi_cluster1.size(), 1u); + EXPECT_EQ(multi_cluster1.contained_user_bins().size(), 1u); + EXPECT_EQ(multi_cluster1.contained_user_bins()[0].size(), 2u); + + chopper::MultiCluster multi_cluster3{cluster3}; + EXPECT_TRUE(multi_cluster3.is_valid(cluster3.id())); + EXPECT_EQ(multi_cluster3.size(), 1u); + + multi_cluster1.move_to(multi_cluster3); + + EXPECT_TRUE(multi_cluster1.is_valid(cluster1.id())); + EXPECT_TRUE(multi_cluster3.is_valid(cluster3.id())); + + // multi_cluster1 has been moved and is empty now + EXPECT_TRUE(multi_cluster1.empty()); + EXPECT_EQ(multi_cluster1.size(), 0u); + EXPECT_TRUE(multi_cluster1.has_been_moved()); + EXPECT_EQ(multi_cluster1.moved_to_cluster_id(), multi_cluster3.id()); + + // multi_cluster3 contains 2 clusters now, {13} and {5, 7} + EXPECT_FALSE(multi_cluster3.empty()); + EXPECT_EQ(multi_cluster3.size(), 2u); // two clusters + ASSERT_EQ(multi_cluster3.contained_user_bins().size(), 2u); + ASSERT_EQ(multi_cluster3.contained_user_bins()[0].size(), 1u); + ASSERT_EQ(multi_cluster3.contained_user_bins()[1].size(), 2u); + 
ASSERT_EQ(multi_cluster3.contained_user_bins()[0][0], cluster3.id()); + ASSERT_EQ(multi_cluster3.contained_user_bins()[1][0], cluster1.id()); + ASSERT_EQ(multi_cluster3.contained_user_bins()[1][1], cluster2.id()); +} + +TEST(LSH_find_representative_cluster_test, cluster_one_move) +{ + std::vector clusters{chopper::Cluster{0}, chopper::Cluster{1}}; + clusters[1].move_to(clusters[0]); + + EXPECT_EQ(chopper::LSH_find_representative_cluster(clusters, clusters[1].id()), clusters[0].id()); +} + +TEST(LSH_find_representative_cluster_test, multi_cluster_one_move) +{ + std::vector mclusters{{chopper::Cluster{0}}, {chopper::Cluster{1}}}; + mclusters[1].move_to(mclusters[0]); + + EXPECT_EQ(chopper::LSH_find_representative_cluster(mclusters, mclusters[1].id()), mclusters[0].id()); + +} + +TEST(LSH_find_representative_cluster_test, cluster_two_moves) +{ + std::vector clusters{chopper::Cluster{0}, chopper::Cluster{1}, chopper::Cluster{2}}; + clusters[2].move_to(clusters[1]); + clusters[1].move_to(clusters[0]); + + EXPECT_EQ(chopper::LSH_find_representative_cluster(clusters, clusters[1].id()), clusters[0].id()); + EXPECT_EQ(chopper::LSH_find_representative_cluster(clusters, clusters[2].id()), clusters[0].id()); +} + +TEST(LSH_find_representative_cluster_test, multi_cluster_two_moves) +{ + std::vector mclusters{{chopper::Cluster{0}}, {chopper::Cluster{1}}, {chopper::Cluster{2}}}; + mclusters[2].move_to(mclusters[1]); + mclusters[1].move_to(mclusters[0]); + + EXPECT_EQ(chopper::LSH_find_representative_cluster(mclusters, mclusters[1].id()), mclusters[0].id()); + EXPECT_EQ(chopper::LSH_find_representative_cluster(mclusters, mclusters[2].id()), mclusters[0].id()); +} From ac45b94ea00a0a902e9d89d94dd9002e7cc9d7bb Mon Sep 17 00:00:00 2001 From: smehringer Date: Mon, 21 Jul 2025 12:33:31 +0200 Subject: [PATCH 02/18] add /build_debug/ to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 7dfea74f..90a24ee0 100644 --- a/.gitignore +++ 
b/.gitignore @@ -32,6 +32,7 @@ # build dir /build/ +/build_debug/ # editor configuration /.vscode/ From 73c62b06535a31a0425e2369e14c29e8d921f2f8 Mon Sep 17 00:00:00 2001 From: smehringer Date: Thu, 24 Jul 2025 21:39:42 +0200 Subject: [PATCH 03/18] [MISC] Add determine_split_bins_test. --- .../chopper/layout/determine_split_bins.hpp | 29 +++ src/layout/determine_split_bins.cpp | 152 ++++++++++++++ test/api/layout/CMakeLists.txt | 1 + test/api/layout/determine_split_bins_test.cpp | 192 ++++++++++++++++++ 4 files changed, 374 insertions(+) create mode 100644 include/chopper/layout/determine_split_bins.hpp create mode 100644 src/layout/determine_split_bins.cpp create mode 100644 test/api/layout/determine_split_bins_test.cpp diff --git a/include/chopper/layout/determine_split_bins.hpp b/include/chopper/layout/determine_split_bins.hpp new file mode 100644 index 00000000..dee17b24 --- /dev/null +++ b/include/chopper/layout/determine_split_bins.hpp @@ -0,0 +1,29 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides chopper::determine_split_bins. 
 + * \author Svenja Mehringer + */ + +#pragma once + +#include + +#include + +namespace chopper::layout +{ + +std::pair determine_split_bins(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + size_t const num_technical_bins, + size_t const num_user_bins, + std::vector> & partitions); + +} \ No newline at end of file diff --git a/src/layout/determine_split_bins.cpp b/src/layout/determine_split_bins.cpp new file mode 100644 index 00000000..71d946b8 --- /dev/null +++ b/src/layout/determine_split_bins.cpp @@ -0,0 +1,152 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include // for allocator, string +#include // for vector + +#include +#include +#include // for data_store + +#include +#include + +namespace chopper::layout +{ + +std::pair determine_split_bins(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + size_t const num_technical_bins, + size_t const num_user_bins, + std::vector> & partitions) +{ + if (num_user_bins == 0) + return {0,0}; + + assert(num_technical_bins > 0u); + assert(num_user_bins > 0u); + assert(num_user_bins <= num_technical_bins); + + auto const fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .hash_count = config.hibf_config.number_of_hash_functions, + .t_max = num_technical_bins}); + + std::vector> matrix(num_technical_bins); // rows + for (auto & v : matrix) +
v.resize(num_user_bins, std::numeric_limits::max()); // columns + + std::vector> trace(num_technical_bins); // rows + for (auto & v : trace) + v.resize(num_user_bins, std::numeric_limits::max()); // columns + + size_t const extra_bins = num_technical_bins - num_user_bins + 1; + + // initialize first column (first row is initialized with inf) + double const ub_cardinality = static_cast(cardinalities[positions[0]]); + for (size_t i = 0; i < extra_bins; ++i) + { + size_t const corrected_ub_cardinality = static_cast(ub_cardinality * fpr_correction[i + 1]); + matrix[i][0] = seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i + 1u); + } + + // we must iterate column wise + for (size_t j = 1; j < num_user_bins; ++j) + { + double const ub_cardinality = static_cast(cardinalities[positions[j]]); + + for (size_t i = j; i < j + extra_bins; ++i) + { + size_t minimum{std::numeric_limits::max()}; + + for (size_t i_prime = j - 1; i_prime < i; ++i_prime) + { + size_t const corrected_ub_cardinality = + static_cast(ub_cardinality * fpr_correction[(i - i_prime)]); + size_t score = + std::max(seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i - i_prime), matrix[i_prime][j - 1]); + + // std::cout << "j:" << j << " i:" << i << " i':" << i_prime << " score:" << score << std::endl; + + minimum = (score < minimum) ? 
(trace[i][j] = i_prime, score) : minimum; + } + + matrix[i][j] = minimum; + } + } + + // seqan::hibf::layout::print_matrix(matrix, num_technical_bins, num_user_bins, std::numeric_limits::max()); + //seqan::hibf::layout::print_matrix(trace, num_technical_bins, num_user_bins, std::numeric_limits::max()); + + // backtracking + // first, in the last column, find the row with minimum score (it can happen that the last rows are equally good) + // the less rows, the better + size_t trace_i{num_technical_bins - 1}; + size_t best_score{std::numeric_limits::max()}; + for (size_t best_i{0}; best_i < num_technical_bins; ++best_i) + { + if (matrix[best_i][num_user_bins - 1] < best_score) + { + best_score = matrix[best_i][num_user_bins - 1]; + trace_i = best_i; + } + } + size_t const number_of_split_tbs{trace_i + 1}; // trace_i is the position. So +1 for the number + + // now that we found the best trace_i start usual backtracking + size_t trace_j = num_user_bins - 1; + + // size_t max_id{}; + size_t max_size{}; + + size_t bin_id{}; + + while (trace_j > 0) + { + size_t next_i = trace[trace_i][trace_j]; + size_t const number_of_bins = (trace_i - next_i); + size_t const cardinality = cardinalities[positions[trace_j]]; + size_t const corrected_cardinality = static_cast(cardinality * fpr_correction[number_of_bins]); + size_t const cardinality_per_bin = seqan::hibf::divide_and_ceil(corrected_cardinality, number_of_bins); + + if (cardinality_per_bin > max_size) + { + // max_id = bin_id; + max_size = cardinality_per_bin; + } + + for (size_t splits{0}; splits < number_of_bins; ++splits) + { + partitions[partitions.size() - 1 - bin_id].push_back(positions[trace_j]); + ++bin_id; + } + + trace_i = trace[trace_i][trace_j]; + --trace_j; + } + ++trace_i; // because we want the length not the index. 
Now trace_i == number_of_bins + size_t const cardinality = cardinalities[positions[0]]; + size_t const corrected_cardinality = static_cast(cardinality * fpr_correction[trace_i]); + // NOLINTNEXTLINE(clang-analyzer-core.DivideZero) + size_t const cardinality_per_bin = seqan::hibf::divide_and_ceil(corrected_cardinality, trace_i); + + if (cardinality_per_bin > max_size) + { + // max_id = bin_id; + max_size = cardinality_per_bin; + } + + for (size_t splits{0}; splits < trace_i; ++splits) + { + partitions[partitions.size() - 1 - bin_id].push_back(positions[0]); + ++bin_id; + } + + return {number_of_split_tbs, max_size}; +} + +} // namespace chopper::layout diff --git a/test/api/layout/CMakeLists.txt b/test/api/layout/CMakeLists.txt index 9cdb4076..98c6cfe3 100644 --- a/test/api/layout/CMakeLists.txt +++ b/test/api/layout/CMakeLists.txt @@ -26,3 +26,4 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" endif () add_api_test (user_bin_io_test.cpp) add_api_test (input_test.cpp) +add_api_test (determine_split_bins_test.cpp) diff --git a/test/api/layout/determine_split_bins_test.cpp b/test/api/layout/determine_split_bins_test.cpp new file mode 100644 index 00000000..2af4c488 --- /dev/null +++ b/test/api/layout/determine_split_bins_test.cpp @@ -0,0 +1,192 @@ +#include // for Test, TestInfo, EXPECT_EQ, Message, TEST, TestPartResult + +#include // for operator<<, char_traits, basic_ostream, basic_stringstream, strings... 
+#include // for allocator, string +#include // for vector + +#include +#include // for data_store + +#include +#include + +std::pair determine_split_bins(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + size_t const num_technical_bins, + size_t const num_user_bins, + std::vector> & partitions) +{ + if (num_user_bins == 0) + return {0,0}; + + assert(num_technical_bins > 0u); + assert(num_user_bins > 0u); + assert(num_user_bins <= num_technical_bins); + + auto const fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .hash_count = config.hibf_config.number_of_hash_functions, + .t_max = num_technical_bins}); + + std::vector> matrix(num_technical_bins); // rows + for (auto & v : matrix) + v.resize(num_user_bins, std::numeric_limits::max()); // columns + + std::vector> trace(num_technical_bins); // rows + for (auto & v : trace) + v.resize(num_user_bins, std::numeric_limits::max()); // columns + + size_t const extra_bins = num_technical_bins - num_user_bins + 1; + + // initialize first column (first row is initialized with inf) + double const ub_cardinality = static_cast(cardinalities[positions[0]]); + for (size_t i = 0; i < extra_bins; ++i) + { + size_t const corrected_ub_cardinality = static_cast(ub_cardinality * fpr_correction[i + 1]); + matrix[i][0] = seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i + 1u); + } + + // we must iterate column wise + for (size_t j = 1; j < num_user_bins; ++j) + { + double const ub_cardinality = static_cast(cardinalities[positions[j]]); + + for (size_t i = j; i < j + extra_bins; ++i) + { + size_t minimum{std::numeric_limits::max()}; + + for (size_t i_prime = j - 1; i_prime < i; ++i_prime) + { + size_t const corrected_ub_cardinality = + static_cast(ub_cardinality * fpr_correction[(i - i_prime)]); + size_t score = + std::max(seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i - i_prime), matrix[i_prime][j - 1]); 
+ + // std::cout << "j:" << j << " i:" << i << " i':" << i_prime << " score:" << score << std::endl; + + minimum = (score < minimum) ? (trace[i][j] = i_prime, score) : minimum; + } + + matrix[i][j] = minimum; + } + } + + // seqan::hibf::layout::print_matrix(matrix, num_technical_bins, num_user_bins, std::numeric_limits::max()); + //seqan::hibf::layout::print_matrix(trace, num_technical_bins, num_user_bins, std::numeric_limits::max()); + + // backtracking + // first, in the last column, find the row with minimum score (it can happen that the last rows are equally good) + // the less rows, the better + size_t trace_i{num_technical_bins - 1}; + size_t best_score{std::numeric_limits::max()}; + for (size_t best_i{0}; best_i < num_technical_bins; ++best_i) + { + if (matrix[best_i][num_user_bins - 1] < best_score) + { + best_score = matrix[best_i][num_user_bins - 1]; + trace_i = best_i; + } + } + size_t const number_of_split_tbs{trace_i + 1}; // trace_i is the position. So +1 for the number + + // now that we found the best trace_i start usual backtracking + size_t trace_j = num_user_bins - 1; + + // size_t max_id{}; + size_t max_size{}; + + size_t bin_id{}; + + while (trace_j > 0) + { + size_t next_i = trace[trace_i][trace_j]; + size_t const number_of_bins = (trace_i - next_i); + size_t const cardinality = cardinalities[positions[trace_j]]; + size_t const corrected_cardinality = static_cast(cardinality * fpr_correction[number_of_bins]); + size_t const cardinality_per_bin = seqan::hibf::divide_and_ceil(corrected_cardinality, number_of_bins); + + if (cardinality_per_bin > max_size) + { + // max_id = bin_id; + max_size = cardinality_per_bin; + } + + for (size_t splits{0}; splits < number_of_bins; ++splits) + { + partitions[partitions.size() - 1 - bin_id].push_back(positions[trace_j]); + ++bin_id; + } + + trace_i = trace[trace_i][trace_j]; + --trace_j; + } + ++trace_i; // because we want the length not the index. 
Now trace_i == number_of_bins + size_t const cardinality = cardinalities[positions[0]]; + size_t const corrected_cardinality = static_cast(cardinality * fpr_correction[trace_i]); + // NOLINTNEXTLINE(clang-analyzer-core.DivideZero) + size_t const cardinality_per_bin = seqan::hibf::divide_and_ceil(corrected_cardinality, trace_i); + + if (cardinality_per_bin > max_size) + { + // max_id = bin_id; + max_size = cardinality_per_bin; + } + + for (size_t splits{0}; splits < trace_i; ++splits) + { + partitions[partitions.size() - 1 - bin_id].push_back(positions[0]); + ++bin_id; + } + + return {number_of_split_tbs, max_size}; +} + +TEST(simple_split, first) +{ + chopper::configuration config{}; + + std::vector cardinalities{}; + for (size_t i{0}; i < 20; ++i) + cardinalities.push_back(2400); + cardinalities.push_back(200); + for (size_t i{0}; i < 84; ++i) + cardinalities.push_back(50); + + std::vector positions(cardinalities.size()); + std::iota(positions.begin(), positions.end(), 0u); + + size_t num_technical_bins{63}; + size_t num_user_bins{21}; + + std::vector> partitions(64); + + auto const [num_splits, max_size] = determine_split_bins(config, positions, cardinalities, num_technical_bins, num_user_bins, partitions); + + EXPECT_EQ(num_splits, 61); + EXPECT_EQ(max_size, 1452); + + size_t ub_idx{20}; + size_t tb_idx{partitions.size() - 1}; + + // TBs: 0, 1, 2, 3, ... 
+ // UBs: 20,19,19,19,18,18,18,17,17,17,16,16,16,15,15,15,14,14,14,13,13,13,12,12,12,11,11,11,10,10,10,9,9,9,8,8,8,7,7,7,6,6,6,5,5,5,4,4,4,3,3,3,2,2,2,1,1,1,0,0,0, + + EXPECT_EQ(partitions[tb_idx][0], ub_idx); // single bin with card = 200 + --tb_idx; + --ub_idx; + + for (size_t i{partitions.size() - 1}; i > partitions.size() - num_splits - 1; --i) + std::cerr << partitions[i][0] << ","; + + + for (;tb_idx > partitions.size() - num_splits - 1; --tb_idx) + { + for (size_t three = 0; three < 3; ++three) + { + EXPECT_EQ(partitions[tb_idx][0], ub_idx); + --tb_idx; + } + ++tb_idx; // one too much in loop before + --ub_idx; + } +} \ No newline at end of file From 6fd5b75e84bdb5bd57993fdf284186a39cf96e60 Mon Sep 17 00:00:00 2001 From: smehringer Date: Thu, 24 Jul 2025 21:40:08 +0200 Subject: [PATCH 04/18] [TEST] Add simple fast layout test --- test/api/layout/CMakeLists.txt | 1 + test/api/layout/fast_layout_test.cpp | 310 +++++++++++++++++++++++++++ 2 files changed, 311 insertions(+) create mode 100644 test/api/layout/fast_layout_test.cpp diff --git a/test/api/layout/CMakeLists.txt b/test/api/layout/CMakeLists.txt index 98c6cfe3..45f5b1ce 100644 --- a/test/api/layout/CMakeLists.txt +++ b/test/api/layout/CMakeLists.txt @@ -9,6 +9,7 @@ cmake_minimum_required (VERSION 3.18) add_api_test (ibf_query_cost_test.cpp) add_api_test (execute_layout_test.cpp) +add_api_test (fast_layout_test.cpp) add_api_test (execute_with_estimation_test.cpp) target_use_datasources (execute_with_estimation_test FILES seq1.fa) diff --git a/test/api/layout/fast_layout_test.cpp b/test/api/layout/fast_layout_test.cpp new file mode 100644 index 00000000..2347f66b --- /dev/null +++ b/test/api/layout/fast_layout_test.cpp @@ -0,0 +1,310 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, 
modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "../api_test.hpp" +#include "print_debug_file.hpp" + +TEST(execute_test, many_ubs) +{ + seqan3::test::tmp_directory tmp_dir{}; + std::filesystem::path const layout_file{tmp_dir.path() / "layout.tsv"}; + + std::vector> many_filenames; + + for (size_t i{0}; i < 96u; ++i) + many_filenames.push_back({seqan3::detail::to_string("seq", i)}); + + // Creates sizes of the following series + // [100,101,...,120,222,223,...,241,343,344,...362,464,465,...,483,585,586,...,600] + // See also https://godbolt.org/z/9517eaaaG + auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it) + { + size_t const desired_kmer_count = 1001 * ((num + 20) / 20) + num; + for (auto hash : std::views::iota(num * 600, num * 600 + desired_kmer_count)) + it = hash; + }; + + chopper::configuration config{}; + config.output_filename = layout_file; + config.disable_sketch_output = true; + config.hibf_config.tmax = 64; + config.hibf_config.input_fn = simulated_input; + config.hibf_config.number_of_user_bins = many_filenames.size(); + config.hibf_config.disable_estimate_union = true; // also disables rearrangement + + std::vector sketches; + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); + + chopper::layout::execute(config, many_filenames, sketches, minHash_sketches); + + std::string const expected_file{ + "@CHOPPER_USER_BINS\n" + "@0 seq0\n" + "@1 seq1\n" + "@2 seq2\n" + "@3 seq3\n" + "@4 seq4\n" + "@5 seq5\n" + "@6 seq6\n" + "@7 seq7\n" + "@8 seq8\n" + "@9 seq9\n" + "@10 seq10\n" + "@11 seq11\n" + "@12 seq12\n" + "@13 
seq13\n" + "@14 seq14\n" + "@15 seq15\n" + "@16 seq16\n" + "@17 seq17\n" + "@18 seq18\n" + "@19 seq19\n" + "@20 seq20\n" + "@21 seq21\n" + "@22 seq22\n" + "@23 seq23\n" + "@24 seq24\n" + "@25 seq25\n" + "@26 seq26\n" + "@27 seq27\n" + "@28 seq28\n" + "@29 seq29\n" + "@30 seq30\n" + "@31 seq31\n" + "@32 seq32\n" + "@33 seq33\n" + "@34 seq34\n" + "@35 seq35\n" + "@36 seq36\n" + "@37 seq37\n" + "@38 seq38\n" + "@39 seq39\n" + "@40 seq40\n" + "@41 seq41\n" + "@42 seq42\n" + "@43 seq43\n" + "@44 seq44\n" + "@45 seq45\n" + "@46 seq46\n" + "@47 seq47\n" + "@48 seq48\n" + "@49 seq49\n" + "@50 seq50\n" + "@51 seq51\n" + "@52 seq52\n" + "@53 seq53\n" + "@54 seq54\n" + "@55 seq55\n" + "@56 seq56\n" + "@57 seq57\n" + "@58 seq58\n" + "@59 seq59\n" + "@60 seq60\n" + "@61 seq61\n" + "@62 seq62\n" + "@63 seq63\n" + "@64 seq64\n" + "@65 seq65\n" + "@66 seq66\n" + "@67 seq67\n" + "@68 seq68\n" + "@69 seq69\n" + "@70 seq70\n" + "@71 seq71\n" + "@72 seq72\n" + "@73 seq73\n" + "@74 seq74\n" + "@75 seq75\n" + "@76 seq76\n" + "@77 seq77\n" + "@78 seq78\n" + "@79 seq79\n" + "@80 seq80\n" + "@81 seq81\n" + "@82 seq82\n" + "@83 seq83\n" + "@84 seq84\n" + "@85 seq85\n" + "@86 seq86\n" + "@87 seq87\n" + "@88 seq88\n" + "@89 seq89\n" + "@90 seq90\n" + "@91 seq91\n" + "@92 seq92\n" + "@93 seq93\n" + "@94 seq94\n" + "@95 seq95\n" + "@CHOPPER_USER_BINS_END\n" + "@CHOPPER_CONFIG\n" + "@{\n" + "@ \"chopper_config\": {\n" + "@ \"version\": 2,\n" + "@ \"data_file\": {\n" + "@ \"value0\": \"\"\n" + "@ },\n" + "@ \"debug\": false,\n" + "@ \"sketch_directory\": {\n" + "@ \"value0\": \"\"\n" + "@ },\n" + "@ \"k\": 19,\n" + "@ \"window_size\": 19,\n" + "@ \"disable_sketch_output\": true,\n" + "@ \"precomputed_files\": false,\n" + "@ \"output_filename\": {\n" + "@ \"value0\": \"" + layout_file.string() + "\"\n" + "@ },\n" + "@ \"determine_best_tmax\": false,\n" + "@ \"force_all_binnings\": false\n" + "@ }\n" + "@}\n" + "@CHOPPER_CONFIG_END\n" + "@HIBF_CONFIG\n" + "@{\n" + "@ \"hibf_config\": {\n" + "@ 
\"version\": 2,\n" + "@ \"number_of_user_bins\": 96,\n" + "@ \"number_of_hash_functions\": 2,\n" + "@ \"maximum_fpr\": 0.05,\n" + "@ \"relaxed_fpr\": 0.3,\n" + "@ \"threads\": 1,\n" + "@ \"sketch_bits\": 12,\n" + "@ \"tmax\": 64,\n" + "@ \"empty_bin_fraction\": 0.0,\n" + "@ \"alpha\": 1.2,\n" + "@ \"max_rearrangement_ratio\": 0.5,\n" + "@ \"disable_estimate_union\": true,\n" + "@ \"disable_rearrangement\": true\n" + "@ }\n" + "@}\n" + "@HIBF_CONFIG_END\n" + "#TOP_LEVEL_IBF fullest_technical_bin_idx:23\n" + "#LOWER_LEVEL_IBF_1 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:28\n" + "#LOWER_LEVEL_IBF_3 fullest_technical_bin_idx:6\n" + "#LOWER_LEVEL_IBF_4 fullest_technical_bin_idx:15\n" + "#LOWER_LEVEL_IBF_5 fullest_technical_bin_idx:0\n" + "#LOWER_LEVEL_IBF_6 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_8 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_9 fullest_technical_bin_idx:35\n" + "#LOWER_LEVEL_IBF_12 fullest_technical_bin_idx:7\n" + "#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS\n" + "0 5;0 1;2\n" + "1 9;0 1;2\n" + "2 3;0 1;2\n" + "3 12;0 1;2\n" + "4 12;2 1;2\n" + "5 6;0 1;2\n" + "6 9;2 1;2\n" + "7 9;4 1;2\n" + "8 3;2 1;2\n" + "9 3;4 1;2\n" + "10 4;0 1;2\n" + "11 6;2 1;2\n" + "12 2;0 1;2\n" + "13 2;2 1;2\n" + "14 2;4 1;2\n" + "15 4;2 1;2\n" + "16 12;4 1;3\n" + "17 6;4 1;2\n" + "18 5;2 1;3\n" + "19 9;6 1;2\n" + "20 8;0 1;12\n" + "21 8;12 1;12\n" + "22 6;6 1;9\n" + "23 6;15 1;9\n" + "24 9;8 1;9\n" + "25 9;17 1;9\n" + "26 4;4 1;11\n" + "27 4;15 1;11\n" + "28 1;0 1;12\n" + "29 1;12 1;12\n" + "30 2;6 1;11\n" + "31 2;17 1;11\n" + "32 12;7 1;13\n" + "33 6;24 1;9\n" + "34 5;5 1;14\n" + "35 9;26 1;9\n" + "36 3;6 1;13\n" + "37 22 1\n" + "38 21 1\n" + "39 20 1\n" + "40 15 1\n" + "41 18 1\n" + "42 19 1\n" + "43 13 1\n" + "44 17 1\n" + "45 14 1\n" + "46 10 1\n" + "47 16 1\n" + "48 11 1\n" + "49 8;24 1;40\n" + "50 12;20 1;44\n" + "51 6;33 1;31\n" + "52 5;19 1;45\n" + "53 9;35 1;29\n" + "54 3;19 1;45\n" + "55 4;26 
1;38\n" + "56 7 1\n" + "57 1;24 1;40\n" + "58 0 1\n" + "59 2;28 1;36\n" + "60 63 1\n" + "61 62 1\n" + "62 60 1\n" + "63 61 1\n" + "64 59 1\n" + "65 58 1\n" + "66 57 1\n" + "67 55 1\n" + "68 56 1\n" + "69 54 1\n" + "70 52 1\n" + "71 53 1\n" + "72 51 1\n" + "73 49 1\n" + "74 48 1\n" + "75 50 1\n" + "76 45 1\n" + "77 47 1\n" + "78 46 1\n" + "79 44 1\n" + "80 39 1\n" + "81 40 1\n" + "82 41 1\n" + "83 42 1\n" + "84 38 1\n" + "85 37 1\n" + "86 43 1\n" + "87 36 1\n" + "88 35 1\n" + "89 34 1\n" + "90 29 2\n" + "91 31 2\n" + "92 27 2\n" + "93 23 2\n" + "94 33 1\n" + "95 25 2\n"}; + std::string const actual_file{string_from_file(layout_file)}; + + EXPECT_EQ(actual_file, expected_file) << actual_file << std::endl; +} From 85da3cdb8fffd257f076b635b43934c11fcdc25d Mon Sep 17 00:00:00 2001 From: smehringer Date: Thu, 24 Jul 2025 21:41:27 +0200 Subject: [PATCH 05/18] [FEATURE] fast layout algorthm (still ugly). --- include/chopper/configuration.hpp | 16 + include/chopper/layout/execute.hpp | 4 +- src/chopper_layout.cpp | 19 +- src/layout/CMakeLists.txt | 2 +- src/layout/execute.cpp | 1880 +++++++++++++++++++++++++++- src/layout/hibf_statistics.cpp | 2 +- 6 files changed, 1887 insertions(+), 36 deletions(-) diff --git a/include/chopper/configuration.hpp b/include/chopper/configuration.hpp index bae58feb..825bc407 100644 --- a/include/chopper/configuration.hpp +++ b/include/chopper/configuration.hpp @@ -20,8 +20,20 @@ namespace chopper { +enum partitioning_scheme +{ + blocked, // 0 + sorted, // 1 + folded, // 2 + weighted_fold, // 3 + similarity, // 4 + lsh, // 5 + lsh_sim // 6 +}; + struct configuration { + int partitioning_approach{6}; /*!\name General Configuration * \{ */ @@ -77,6 +89,10 @@ struct configuration mutable seqan::hibf::concurrent_timer union_estimation_timer{}; mutable seqan::hibf::concurrent_timer rearrangement_timer{}; mutable seqan::hibf::concurrent_timer dp_algorithm_timer{}; + mutable seqan::hibf::concurrent_timer lsh_algorithm_timer{}; + mutable 
seqan::hibf::concurrent_timer search_partition_algorithm_timer{}; + mutable seqan::hibf::concurrent_timer intital_partition_timer{}; + mutable seqan::hibf::concurrent_timer small_layouts_timer{}; void read_from(std::istream & stream); diff --git a/include/chopper/layout/execute.hpp b/include/chopper/layout/execute.hpp index 52bfc3b7..00c4fa2e 100644 --- a/include/chopper/layout/execute.hpp +++ b/include/chopper/layout/execute.hpp @@ -13,12 +13,14 @@ #include #include +#include namespace chopper::layout { int execute(chopper::configuration & config, std::vector> const & filenames, - std::vector const & sketches); + std::vector const & sketches, + std::vector const & minHash_sketches); } // namespace chopper::layout diff --git a/src/chopper_layout.cpp b/src/chopper_layout.cpp index 5fda4ddb..9df893a3 100644 --- a/src/chopper_layout.cpp +++ b/src/chopper_layout.cpp @@ -93,6 +93,7 @@ int chopper_layout(chopper::configuration & config, sharg::parser & parser) std::vector> filenames{}; std::vector sketches{}; + std::vector minHash_sketches{}; if (input_is_a_sketch_file) { @@ -106,6 +107,7 @@ int chopper_layout(chopper::configuration & config, sharg::parser & parser) filenames = std::move(sin.filenames); // No need to call check_filenames because the files are not read. 
sketches = std::move(sin.hll_sketches); + minHash_sketches = std::move(sin.minHash_sketches); validate_configuration(parser, config, sin.chopper_config); } else @@ -128,17 +130,18 @@ int chopper_layout(chopper::configuration & config, sharg::parser & parser) if (!input_is_a_sketch_file) { config.compute_sketches_timer.start(); - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); config.compute_sketches_timer.stop(); } - exit_code |= chopper::layout::execute(config, filenames, sketches); + exit_code |= chopper::layout::execute(config, filenames, sketches, minHash_sketches); if (!config.disable_sketch_output) { chopper::sketch::sketch_file sout{.chopper_config = config, .filenames = std::move(filenames), - .hll_sketches = std::move(sketches)}; + .hll_sketches = std::move(sketches), + .minHash_sketches = std::move(minHash_sketches)}; std::ofstream os{config.sketch_directory, std::ios::binary}; cereal::BinaryOutputArchive oarchive{os}; oarchive(sout); @@ -151,11 +154,19 @@ int chopper_layout(chopper::configuration & config, sharg::parser & parser) output_stream << "sketching_in_seconds\t" << "layouting_in_seconds\t" << "union_estimation_in_seconds\t" - << "rearrangement_in_seconds\n"; + << "rearrangement_in_seconds\t" + << "lsh_in_seconds\t" + << "intital_partition_timer_in_seconds\t" + << "small_layouts_timer_in_seconds\t" + << "search_best_p_in_seconds\n"; output_stream << config.compute_sketches_timer.in_seconds() << '\t'; output_stream << config.dp_algorithm_timer.in_seconds() << '\t'; output_stream << config.union_estimation_timer.in_seconds() << '\t'; output_stream << config.rearrangement_timer.in_seconds() << '\t'; + output_stream << config.lsh_algorithm_timer.in_seconds() << '\t'; + output_stream << config.intital_partition_timer.in_seconds() << '\t'; + output_stream << config.small_layouts_timer.in_seconds() << '\t'; + output_stream << 
config.search_partition_algorithm_timer.in_seconds() << '\n'; } return exit_code; diff --git a/src/layout/CMakeLists.txt b/src/layout/CMakeLists.txt index 915c83d7..4c2542c8 100644 --- a/src/layout/CMakeLists.txt +++ b/src/layout/CMakeLists.txt @@ -5,7 +5,7 @@ if (TARGET chopper::layout) endif () add_library (chopper_layout STATIC determine_best_number_of_technical_bins.cpp execute.cpp hibf_statistics.cpp - ibf_query_cost.cpp input.cpp output.cpp + ibf_query_cost.cpp input.cpp output.cpp determine_split_bins.cpp ) target_link_libraries (chopper_layout PUBLIC chopper::shared) add_library (chopper::layout ALIAS chopper_layout) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 6630447e..fd02cd0d 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -5,6 +5,7 @@ // shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md // --------------------------------------------------------------------------------------------------- +#include #include #include #include @@ -12,67 +13,1888 @@ #include #include #include +#include +#include #include #include #include #include +#include + +#include #include +#include #include #include #include #include #include +#include +#include +#include +#include // for compute_relaxed_fpr_correction #include #include +#include #include #include // for estimate_kmer_counts #include +#include namespace chopper::layout { -int execute(chopper::configuration & config, - std::vector> const & filenames, - std::vector const & sketches) +uint64_t lsh_hash_the_sketch(std::vector const & sketch, size_t const number_of_hashes_to_consider) { - config.hibf_config.validate_and_set_defaults(); + // lets just compute the sum + uint64_t sum{0}; - seqan::hibf::layout::layout hibf_layout; - std::vector kmer_counts; - seqan::hibf::sketch::estimate_kmer_counts(sketches, kmer_counts); + for (size_t i = 0; i < number_of_hashes_to_consider; ++i) + sum += sketch[i]; + + return sum; +} + +auto 
LSH_fill_hashtable(std::vector const & clusters, + std::vector const & minHash_sketches, + size_t const current_sketch_index, + size_t const current_number_of_sketch_hashes) +{ + robin_hood::unordered_flat_map> table; - if (config.determine_best_tmax) + size_t processed_user_bins{0}; // only for sanity check + for (size_t pos = 0; pos < clusters.size(); ++pos) { - hibf_layout = determine_best_number_of_technical_bins(config, kmer_counts, sketches); + auto const & current = clusters[pos]; + assert(current.is_valid(pos)); + + if (current.has_been_moved()) // cluster has been moved somewhere else, don't process + continue; + + for (size_t const user_bin_idx : current.contained_user_bins()) + { + ++processed_user_bins; + uint64_t const key = lsh_hash_the_sketch(minHash_sketches[user_bin_idx].table[current_sketch_index], current_number_of_sketch_hashes); + table[key].push_back(current.id()); // insert representative for all user bins + } } - else + assert(processed_user_bins == clusters.size()); // all user bins should've been processed by one of the clusters + + return table; +} + +auto LSH_fill_hashtable(std::vector const & clusters, + std::vector const & minHash_sketches, + size_t const current_sketch_index, + size_t const current_number_of_sketch_hashes) +{ + robin_hood::unordered_flat_map> table; + + size_t processed_user_bins{0}; // only for sanity check + for (size_t pos = 0; pos < clusters.size(); ++pos) + { + auto const & current = clusters[pos]; + assert(current.is_valid(pos)); + + if (current.has_been_moved()) // cluster has been moved somewhere else, don't process + continue; + + for (auto const & similarity_cluster : current.contained_user_bins()) + { + for (size_t const user_bin_idx : similarity_cluster) + { + ++processed_user_bins; + uint64_t const key = lsh_hash_the_sketch(minHash_sketches[user_bin_idx].table[current_sketch_index], current_number_of_sketch_hashes); + table[key].push_back(current.id()); // insert representative for all user bins + } + } + 
} + assert(processed_user_bins == clusters.size()); // all user bins should've been processed by one of the clusters + + return table; +} + +// minHash_sketches data structure: +// Vector L1 : number of user bins +// Vector L2 : number_of_max_minHash_sketches (LSH ADD+OR parameter b) +// Vector L3 : minHash_sketche_size (LSH ADD+OR parameter r) +std::vector initital_LSH_partitioning(std::vector const & minHash_sketches, + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + size_t const average_technical_bin_size, + chopper::configuration const & config) +{ + assert(!minHash_sketches.empty()); + assert(!minHash_sketches[0].table.empty()); + assert(!minHash_sketches[0].table[0].empty()); + + size_t const number_of_user_bins{positions.size()}; + size_t const number_of_max_minHash_sketches{seqan::hibf::sketch::minhashes::num_sketches}; // LSH ADD+OR parameter b + size_t const minHash_sketche_size{seqan::hibf::sketch::minhashes::sketch_size}; // LSH ADD+OR parameter r + seqan::hibf::sketch::hyperloglog const empty_sketch{config.hibf_config.sketch_bits}; + + // initialise clusters with a signle user bin per cluster. + // clusters are either + // 1) of size 1; containing an id != position where the id points to the cluster it has been moved to + // e.g. cluster[Y] = {Z} (Y has been moved into Z, so Z could look likes this cluster[Z] = {Z, Y}) + // 2) of size >= 1; with the first entry beging id == position (a valid cluster) + // e.g. cluster[X] = {X} // valid singleton + // e.g. cluster[X] = {X, a, b, c, ...} // valid cluster with more joined entries + // The clusters could me moved recursively, s.t. 
+ // cluster[A] = {B} + // cluster[B] = {C} + // cluster[C] = {C, A, B} // is valid cluster since cluster[C][0] == C; contains A and B + std::vector clusters; + clusters.reserve(number_of_user_bins); + + std::vector current_cluster_cardinality(number_of_user_bins); + std::vector current_cluster_sketches(number_of_user_bins, empty_sketch); + size_t current_max_cluster_size{0}; + size_t current_number_of_sketch_hashes{minHash_sketche_size}; // start with high r but decrease it iteratively + size_t current_sketch_index{0}; + size_t current_number_of_clusters{number_of_user_bins}; // initially, each UB is a separate cluster + + for (size_t pos = 0; pos < number_of_user_bins; ++pos) + { + clusters.emplace_back(pos, positions[pos]); // id, user_bin_idx + current_cluster_cardinality[pos] = cardinalities[positions[pos]]; + current_cluster_sketches[pos] = sketches[positions[pos]]; + current_max_cluster_size = std::max(current_max_cluster_size, cardinalities[positions[pos]]); + } + + // refine clusters +//std::cout << "Start clustering with threshold average_technical_bin_size: " << average_technical_bin_size << std::endl; + while (current_max_cluster_size < average_technical_bin_size && /*number_of_clusters / static_cast(number_of_user_bins) > 0.5 &&*/ + current_sketch_index < number_of_max_minHash_sketches) // I want to cluster 10%? + { +//std::cout << "Current number of clusters: " << current_number_of_clusters; + + // fill LSH collision hashtable + robin_hood::unordered_flat_map> table = + LSH_fill_hashtable(clusters, minHash_sketches, current_sketch_index, current_number_of_sketch_hashes); + + // read out LSH collision hashtable + // for each present key, if the list contains more than one cluster, we merge everything contained in the list + // into the first cluster, since those clusters collide and should be joined in the same bucket + for (auto & [key, list] : table) + { + assert(!list.empty()); + + // uniquify list. 
Since I am inserting representative_idx's into the table, the same number can + // be inserted into multiple splots, and multiple times in the same slot. + std::sort(list.begin(), list.end()); + auto const end = std::unique(list.begin(), list.end()); + auto const begin = list.begin(); + + if (end - begin <= 1) // nothing to do here + continue; + + // Now combine all clusters into the first. + + // 1) find the representative cluster to merge everything else into + // It can happen, that the representative has already been joined with another cluster + // e.g. + // [key1] = {0,11} // then clusters[11] is merged into clusters[0] + // [key2] = {11,13} // now I want to merge clusters[13] into clusters[11] but the latter has been moved + size_t const representative_cluster_id = LSH_find_representative_cluster(clusters, *begin); + auto & representative_cluster = clusters[representative_cluster_id]; + assert(representative_cluster.is_valid(representative_cluster_id)); + assert(representative_cluster.id() == clusters[representative_cluster.id()].id()); + + for (auto current = begin + 1; current < end; ++current) + { + // For every other entry in the list, it can happen that I already joined that list with another + // e.g. 
+ // [key1] = {0,11} // then clusters[11] is merged into clusters[0] + // [key2] = {0, 2, 11} // now I want to do it again + size_t const next_cluster_id = LSH_find_representative_cluster(clusters, *current); + auto & next_cluster = clusters[next_cluster_id]; + + if (next_cluster.id() == representative_cluster.id()) // already joined + continue; + + next_cluster.move_to(representative_cluster); // otherwise join next_cluster into representative_cluster + assert(next_cluster.size() == 0); + assert(next_cluster.has_been_moved()); + assert(representative_cluster.size() > 1); // there should be at least two user bins now + assert(representative_cluster.is_valid(representative_cluster_id)); // and it should still be valid + + current_cluster_sketches[representative_cluster.id()].merge(current_cluster_sketches[next_cluster.id()]); + current_cluster_cardinality[representative_cluster.id()] = current_cluster_sketches[representative_cluster.id()].estimate(); + + --current_number_of_clusters; + } + + current_max_cluster_size = *std::ranges::max_element(current_cluster_cardinality); + } + + ++current_sketch_index; + +//std::cout << " and after this clustering step there are: " << current_number_of_clusters << "with max cluster size" << current_max_cluster_size << std::endl; + } + + return clusters; +} + +std::vector very_similar_LSH_partitioning(std::vector const & minHash_sketches, + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + size_t const average_technical_bin_size, + chopper::configuration const & config) +{ + assert(!minHash_sketches.empty()); + assert(!minHash_sketches[0].table.empty()); + assert(!minHash_sketches[0].table[0].empty()); + + size_t const number_of_user_bins{positions.size()}; + assert(number_of_user_bins <= minHash_sketches.size()); + size_t const number_of_max_minHash_sketches{3}; // LSH ADD+OR parameter b + // size_t const minHash_sketche_size{minHash_sketches[0].table[0].size()}; // LSH ADD+OR 
parameter r + size_t const minHash_sketche_size{5}; // LSH ADD+OR parameter r + seqan::hibf::sketch::hyperloglog const empty_sketch{config.hibf_config.sketch_bits}; +// std::cout << "sketch size available: " << minHash_sketches[0].table[0].size() << std::endl; +// std::cout << "sketch size used here: " << minHash_sketche_size << std::endl; + // initialise clusters with a signle user bin per cluster. + // clusters are either + // 1) of size 1; containing an id != position where the id points to the cluster it has been moved to + // e.g. cluster[Y] = {Z} (Y has been moved into Z, so Z could look likes this cluster[Z] = {Z, Y}) + // 2) of size >= 1; with the first entry beging id == position (a valid cluster) + // e.g. cluster[X] = {X} // valid singleton + // e.g. cluster[X] = {X, a, b, c, ...} // valid cluster with more joined entries + // The clusters could me moved recursively, s.t. + // cluster[A] = {B} + // cluster[B] = {C} + // cluster[C] = {C, A, B} // is valid cluster since cluster[C][0] == C; contains A and B + std::vector clusters; + clusters.reserve(number_of_user_bins); + + std::vector current_cluster_cardinality(number_of_user_bins); + std::vector current_cluster_sketches(number_of_user_bins, empty_sketch); + size_t current_max_cluster_size{0}; + size_t current_number_of_sketch_hashes{minHash_sketche_size}; // start with high r but decrease it iteratively + size_t current_sketch_index{0}; + size_t current_number_of_clusters{number_of_user_bins}; // initially, each UB is a separate cluster + + for (size_t pos = 0; pos < number_of_user_bins; ++pos) + { + clusters.emplace_back(pos, positions[pos]); + current_cluster_cardinality[pos] = cardinalities[positions[pos]]; + current_cluster_sketches[pos] = sketches[positions[pos]]; + current_max_cluster_size = std::max(current_max_cluster_size, cardinalities[positions[pos]]); + } + + // refine clusters +//std::cout << "Start clustering with threshold average_technical_bin_size: " << average_technical_bin_size << 
std::endl; + while (current_max_cluster_size < average_technical_bin_size && /*number_of_clusters / static_cast(number_of_user_bins) > 0.5 &&*/ + current_sketch_index < number_of_max_minHash_sketches) // I want to cluster 10%? + { +//std::cout << "Current number of clusters: " << current_number_of_clusters; + + // fill LSH collision hashtable + robin_hood::unordered_flat_map> table = + LSH_fill_hashtable(clusters, minHash_sketches, current_sketch_index, current_number_of_sketch_hashes); + + // read out LSH collision hashtable + // for each present key, if the list contains more than one cluster, we merge everything contained in the list + // into the first cluster, since those clusters collide and should be joined in the same bucket + for (auto & [key, list] : table) + { + assert(!list.empty()); + + // uniquify list. Since I am inserting representative_idx's into the table, the same number can + // be inserted into multiple splots, and multiple times in the same slot. + std::sort(list.begin(), list.end()); + auto const end = std::unique(list.begin(), list.end()); + auto const begin = list.begin(); + + if (end - begin <= 1) // nothing to do here + continue; + + // Now combine all clusters into the first. + + // 1) find the representative cluster to merge everything else into + // It can happen, that the representative has already been joined with another cluster + // e.g. 
+ // [key1] = {0,11} // then clusters[11] is merged into clusters[0] + // [key2] = {11,13} // now I want to merge clusters[13] into clusters[11] but the latter has been moved + size_t const representative_cluster_id = LSH_find_representative_cluster(clusters, *begin); + auto & representative_cluster = clusters[representative_cluster_id]; + assert(representative_cluster.is_valid(representative_cluster_id)); + assert(representative_cluster.id() == clusters[representative_cluster.id()].id()); + + for (auto current = begin + 1; current < end; ++current) + { + // For every other entry in the list, it can happen that I already joined that list with another + // e.g. + // [key1] = {0,11} // then clusters[11] is merged into clusters[0] + // [key2] = {0, 2, 11} // now I want to do it again + size_t const next_cluster_id = LSH_find_representative_cluster(clusters, *current); + auto & next_cluster = clusters[next_cluster_id]; + + if (next_cluster.id() == representative_cluster.id()) // already joined + continue; + + next_cluster.move_to(representative_cluster); // otherwise join next_cluster into representative_cluster + assert(next_cluster.size() == 0); + assert(next_cluster.has_been_moved()); + assert(representative_cluster.size() > 1); // there should be at least two user bins now + assert(representative_cluster.is_valid(representative_cluster_id)); // and it should still be valid + + current_cluster_sketches[representative_cluster.id()].merge(current_cluster_sketches[next_cluster.id()]); + current_cluster_cardinality[representative_cluster.id()] = current_cluster_sketches[representative_cluster.id()].estimate(); + + --current_number_of_clusters; + } + + current_max_cluster_size = *std::ranges::max_element(current_cluster_cardinality); + } + + ++current_sketch_index; + +//std::cout << " and after this clustering step there are: " << current_number_of_clusters << "with max cluster size" << current_max_cluster_size << std::endl; + } + + return clusters; +} + +std::vector 
most_distant_LSH_partitioning(std::vector const & initial_clusters, + std::vector const & minHash_sketches, + chopper::configuration const & config) +{ + assert(!minHash_sketches.empty()); + assert(!minHash_sketches[0].table.empty()); + assert(!minHash_sketches[0].table[0].empty()); + + size_t const number_of_user_bins{initial_clusters.size()}; + size_t const number_of_max_minHash_sketches{minHash_sketches[0].table.size()}; // LSH ADD+OR parameter b + // size_t const minHash_sketche_size{minHash_sketches[0][0].size()}; // LSH ADD+OR parameter r + + size_t current_number_of_sketch_hashes{5}; + size_t current_sketch_index{0}; + + size_t current_number_of_clusters = [&initial_clusters] () + { + size_t number{0}; + for (auto const & cluster : initial_clusters) + { + if (cluster.size()) + ++number; + } + return number; + }(); + + std::vector clusters; + clusters.reserve(number_of_user_bins); + + for (size_t pos = 0; pos < number_of_user_bins; ++pos) + { + clusters.emplace_back(initial_clusters[pos]); + assert(clusters[pos].is_valid(pos)); + } + + // refine clusters. I want to cluster as much as possible to initialise partitions with very distant clusters + while (current_number_of_clusters > (config.hibf_config.tmax * 2) && + current_sketch_index < number_of_max_minHash_sketches) + { +//std::cout << "[Dist] Current number of clusters: " << current_number_of_clusters << " sketch size:" << current_number_of_sketch_hashes << " "; + + // fill LSH collision hashtable + robin_hood::unordered_flat_map> table = + LSH_fill_hashtable(clusters, minHash_sketches, current_sketch_index, current_number_of_sketch_hashes); + + // read out LSH collision hashtable + // for each present key, if the list contains more than one cluster, we merge everything contained in the list + // into the first cluster, since those clusters collide and should be joined in the same bucket + for (auto & [key, list] : table) + { + assert(!list.empty()); + + // uniquify list. 
Since I am inserting representative_idx's into the table, the same number can + // be inserted into multiple splots, and multiple times in the same slot. + std::sort(list.begin(), list.end()); + auto const end = std::unique(list.begin(), list.end()); + auto const begin = list.begin(); + + if (end - begin <= 1) // nothing to do here + continue; + + // Now combine all clusters into the first. + + // 1) find the representative cluster to merge everything else into + // It can happen, that the representative has already been joined with another cluster + // e.g. + // [key1] = {0,11} // then clusters[11] is merged into clusters[0] + // [key2] = {11,13} // now I want to merge clusters[13] into clusters[11] but the latter has been moved + size_t const representative_cluster_id = LSH_find_representative_cluster(clusters, *begin); + auto & representative_cluster = clusters[representative_cluster_id]; + assert(representative_cluster.is_valid(representative_cluster_id)); + assert(representative_cluster.id() == clusters[representative_cluster.id()].id()); + + for (auto current = begin + 1; current < end; ++current) + { + // For every other entry in the list, it can happen that I already joined that list with another + // e.g. 
+ // [key1] = {0,11} // then clusters[11] is merged into clusters[0] + // [key2] = {0, 2, 11} // now I want to do it again + size_t const next_cluster_id = LSH_find_representative_cluster(clusters, *current); + auto & next_cluster = clusters[next_cluster_id]; + + if (next_cluster.id() == representative_cluster.id()) // already joined + continue; + + next_cluster.move_to(representative_cluster); // otherwise join next_cluster into representative_cluster + assert(next_cluster.size() == 0); + assert(next_cluster.has_been_moved()); + assert(representative_cluster.size() > 1); // there should be at least two user bins now + assert(representative_cluster.is_valid(representative_cluster_id)); // and it should still be valid + + --current_number_of_clusters; + } + } + + ++current_sketch_index; + + if (current_sketch_index % 4 == 0 && current_number_of_sketch_hashes > 1) + --current_number_of_sketch_hashes; + +//std::cout << " and after this clustering step there are: " << current_number_of_clusters << std::endl; + } + + return clusters; +} + +void post_process_clusters(std::vector & clusters, + std::vector const & cardinalities, + chopper::configuration const & config) +{ + // clusters are done. 
Start post processing + // since post processing involves re-ordering the clusters, the moved_to_cluster_id value of a cluster will not + // refer to the position of the cluster in the `clusters` vecto anymore but the cluster with the resprive id() + // would neet to be found + + for (size_t pos = 0; pos < clusters.size(); ++pos) + { + assert(clusters[pos].is_valid(pos)); + clusters[pos].sort_by_cardinality(cardinalities); + } + + // push largest p clusters to the front + auto cluster_size_cmp = [&cardinalities](auto const & v1, auto const & v2) + { + if (v2.size() == v1.size() && !v2.empty()) + return cardinalities[v2.contained_user_bins()[0]] < cardinalities[v1.contained_user_bins()[0]]; + return v2.size() < v1.size(); + }; + std::partial_sort(clusters.begin(), clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), clusters.end(), cluster_size_cmp); + + // after filling up the partitions with the biggest clusters, sort the clusters by cardinality of the biggest ub + // s.t. that euqally sizes ub are assigned after each other and the small stuff is added at last. + // the largest ub is already at the start because of former sorting. 
+ auto compare_cardinality_and_move_empty_clusters_to_the_end = [&cardinalities](auto const & v1, auto const & v2) + { + if (v1.empty()) + return false; // v1 can never be larger than v2 then + + if (v2.empty()) // and v1 is not, since the first if would catch + return true; + + return cardinalities[v2.contained_user_bins()[0]] < cardinalities[v1.contained_user_bins()[0]]; + }; + std::sort(clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), clusters.end(), compare_cardinality_and_move_empty_clusters_to_the_end); + + assert(clusters[0].size() >= clusters[1].size()); // sanity check + // assert(cardinalities[clusters[std::min(clusters.size(), config.hibf_config.tmax)].contained_user_bins()[0]] >= cardinalities[clusters[std::min(clusters.size(), config.hibf_config.tmax) + 1].contained_user_bins()[0]]); // sanity check + +// debug + for (size_t cidx = 1; cidx < clusters.size(); ++cidx) + { + if (clusters[cidx - 1].empty() && !clusters[cidx].empty()) // once empty - always empty; all empty clusters should be at the end + throw std::runtime_error{"sorting did not work"}; + } +// debug +} + +void post_process_clusters(std::vector & clusters, + std::vector const & cardinalities, + chopper::configuration const & config) +{ + // clusters are done. 
Start post processing + // since post processing involves re-ordering the clusters, the moved_to_cluster_id value of a cluster will not + // refer to the position of the cluster in the `clusters` vecto anymore but the cluster with the resprive id() + // would neet to be found + + for (size_t pos = 0; pos < clusters.size(); ++pos) + { + assert(clusters[pos].is_valid(pos)); + clusters[pos].sort_by_cardinality(cardinalities); + } + + // push largest p clusters to the front + auto cluster_size_cmp = [&cardinalities](auto const & mc1, auto const & mc2) + { + if (mc1.empty()) + return false; // mc1 can never be larger than mc2 then + if (mc2.empty()) // and mc1 is not, since the first if would catch + return true; + + auto const & largest_c1 = mc1.contained_user_bins()[0]; // first cluster is the largest because of sorting above + auto const & largest_c2 = mc2.contained_user_bins()[0]; // first cluster is the largest because of sorting above + + assert(!largest_c1.empty()); // should never be empty + assert(!largest_c2.empty()); // should never be empty + + if (largest_c2.size() == largest_c1.size()) + return cardinalities[largest_c2[0]] < cardinalities[largest_c1[0]]; + return largest_c2.size() < largest_c1.size(); + }; + std::partial_sort(clusters.begin(), clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), clusters.end(), cluster_size_cmp); + + auto move_empty_clusters_to_the_end = [](auto const & v1, auto const & v2) + { + return v2.size() < v1.size(); + }; + std::sort(clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), clusters.end(), move_empty_clusters_to_the_end); + +// debug sanity check + for (size_t cidx = 1; cidx < clusters.size(); ++cidx) + { + if (clusters[cidx - 1].empty() && !clusters[cidx].empty()) // once empty - always empty; all empty clusters should be at the end + throw std::runtime_error{"sorting did not work"}; + } +// debug +} + +std::vector LSH_partitioning(std::vector const & minHash_sketches, + std::vector 
const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + size_t const average_technical_bin_size, + chopper::configuration const & config) +{ + std::vector clusters = initital_LSH_partitioning(minHash_sketches, positions, cardinalities, sketches, average_technical_bin_size, config); + post_process_clusters(clusters, cardinalities, config); + return clusters; +} + +std::vector sim_dist_LSH_partitioning(std::vector const & minHash_sketches, + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + size_t const average_technical_bin_size, + chopper::configuration const & config) +{ + std::vector clusters = very_similar_LSH_partitioning(minHash_sketches, positions, cardinalities, sketches, average_technical_bin_size, config); + std::vector multi_clusters = most_distant_LSH_partitioning(clusters, minHash_sketches, config); + post_process_clusters(multi_clusters, cardinalities, config); + return multi_clusters; +} + +bool find_best_partition(chopper::configuration const & config, + size_t const number_of_partitions, + size_t & corrected_estimate_per_part, + std::vector const & cluster, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector> & positions, + std::vector & partition_sketches, + std::vector & max_partition_cardinality, + std::vector & min_partition_cardinality) +{ + seqan::hibf::sketch::hyperloglog const current_sketch = [&sketches, &cluster, &config]() + { + seqan::hibf::sketch::hyperloglog result{config.hibf_config.sketch_bits}; + + for (size_t const user_bin_idx : cluster) + result.merge(sketches[user_bin_idx]); + + return result; + }(); + + size_t const max_card = [&cardinalities, &cluster] () + { + size_t max{0}; + + for (size_t const user_bin_idx : cluster) + max = std::max(max, cardinalities[user_bin_idx]); + + return max; + }(); + + // TODO: afterwads check if I should again merge by how little the effective text ratio grows + + // search best 
partition fit by similarity + // similarity here is defined as: + // "whose (<-partition) effective text size is subsumed most by the current user bin" + // or in other words: + // "which partition has the largest intersection with user bin b compared to its own (partition) size." + // double best_subsume_ratio{0.0}; + size_t smallest_change{std::numeric_limits::max()}; + size_t best_p{0}; + bool best_p_found{false}; + + auto penalty_lower_level = [&] (size_t const additional_number_of_user_bins, size_t const p) + { + size_t min = std::min(max_card, min_partition_cardinality[p]); + size_t max = std::max(max_card, max_partition_cardinality[p]); + + if (positions[p].size() > config.hibf_config.tmax) // already a third level + { + // if there must already be another lower level because the current merged bin contains more than tmax + // user bins, then the current user bin is very likely stored multiple times. Therefore, the penalty is set + // to the cardinality of the current user bin times the number of levels, e.g. the number of times this user + // bin needs to be stored additionally + size_t const num_ubs_in_merged_bin{positions[p].size() + additional_number_of_user_bins}; + double const levels = std::log(num_ubs_in_merged_bin) / std::log(config.hibf_config.tmax); + return static_cast(max_card * levels); + } + else if (positions[p].size() + additional_number_of_user_bins > config.hibf_config.tmax) // now a third level + { + // if the current merged bin contains exactly tmax UBS, adding another one must + // result in another lower level. Most likely, the smallest user bin will end up on the lower level + // therefore the penalty is set to 'min * tmax' + // of course, there could also be a third level with a lower number of user bins, but this is hard to + // estimate. 
+ size_t const penalty = min * config.hibf_config.tmax; + return penalty; + } + else // positions[p].size() + additional_number_of_user_bins <= tmax + { + // if the new user bin is smaller than all other already contained user bins + // the waste of space is high if stored in a single technical bin + if (max_card < min) + return (max - max_card); + // if the new user bin is bigger than all other already contained user bins, the IBF size increases + else if (max_card > max) + return (max_card - max) * config.hibf_config.tmax; + // else, end if-else-block and zero is returned + } + + return (size_t)0u; + }; + + for (size_t p = 0; p < number_of_partitions; ++p) + { + seqan::hibf::sketch::hyperloglog union_sketch = current_sketch; + union_sketch.merge(partition_sketches[p]); + size_t const union_estimate = union_sketch.estimate(); + size_t const current_partition_size = partition_sketches[p].estimate(); + + assert(union_estimate >= current_partition_size); + size_t const penalty_current_bin = union_estimate - current_partition_size; + size_t const penalty_current_ibf = config.hibf_config.tmax * ((union_estimate <= corrected_estimate_per_part) ? 
0u : union_estimate - corrected_estimate_per_part); + size_t const change = penalty_current_bin + penalty_current_ibf + penalty_lower_level(cluster.size(), p); + + // size_t const intersection = current_sketch.estimate() - change; + // double const subsume_ratio = static_cast(intersection) / current_partition_size; +//std::cout << "p:" << p << " p-#UBs" << positions[p].size() << " penalty:" << penalty(cluster.size(), p) << " change:" << change << " union-current_p:" << (union_estimate - current_partition_size) << " union:" << union_estimate << " current_p:" << current_partition_size << " t:" << corrected_estimate_per_part << std::endl; + if (change == 0 || /* If there is no penalty at all, this is a best fit even if the partition is "full"*/ + (smallest_change > change /*&& subsume_ratio > best_subsume_ratio &&*/ + /*current_partition_size < corrected_estimate_per_part*/)) + { +//std::cout << "smaller!" << std::endl; + // best_subsume_ratio = subsume_ratio; + smallest_change = change; + best_p = p; + best_p_found = true; + } + } + + if (!best_p_found) + throw "currently there are no safety measures if a partition is not found because it is very unlikely"; + +//std::cout << "best_p:" << best_p << std::endl<< std::endl; + + // now that we know which partition fits best (`best_p`), add those indices to it + for (size_t const user_bin_idx : cluster) + { + positions[best_p].push_back(user_bin_idx); + max_partition_cardinality[best_p] = std::max(max_partition_cardinality[best_p], cardinalities[user_bin_idx]); + min_partition_cardinality[best_p] = std::min(min_partition_cardinality[best_p], cardinalities[user_bin_idx]); + } + partition_sketches[best_p].merge(current_sketch); + corrected_estimate_per_part = std::max(corrected_estimate_per_part, static_cast(partition_sketches[best_p].estimate())); + + return true; +} + +bool find_best_sorted_partition(chopper::configuration const & config, + size_t const number_of_partitions, + size_t & corrected_estimate_per_part, + 
std::vector const & cluster, + std::vector const & cardinalities, + std::vector const & /*sketches*/, + std::vector> & partitions, + std::vector & partition_sizes, + std::vector & max_partition_cardinality, + std::vector & min_partition_cardinality) +{ + size_t max_card = [&cardinalities, &cluster] () + { + size_t max{0}; + + for (size_t const user_bin_idx : cluster) + max = std::max(max, cardinalities[user_bin_idx]); + + return max; + }(); + + size_t sum_card = [&cardinalities, &cluster] () + { + size_t sum{0}; + + for (size_t const user_bin_idx : cluster) + sum += cardinalities[user_bin_idx]; + + return sum; + }(); + + size_t smallest_change{std::numeric_limits::max()}; + size_t best_p{0}; + bool best_p_found{false}; + + auto penalty_lower_level = [&] (size_t const additional_number_of_user_bins, size_t const p) + { + size_t min = std::min(max_card, min_partition_cardinality[p]); + size_t max = std::max(max_card, max_partition_cardinality[p]); + + if (partitions[p].size() > config.hibf_config.tmax) // already a third level + { + // if there must already be another lower level because the current merged bin contains more than tmax + // user bins, then the current user bin is very likely stored multiple times. Therefore, the penalty is set + // to the cardinality of the current user bin times the number of levels, e.g. the number of times this user + // bin needs to be stored additionally + size_t const num_ubs_in_merged_bin{partitions[p].size() + additional_number_of_user_bins}; + double const levels = std::log(num_ubs_in_merged_bin) / std::log(config.hibf_config.tmax); + return static_cast(max_card * levels); + } + else if (partitions[p].size() + additional_number_of_user_bins > config.hibf_config.tmax) + { + // if the current merged bin contains exactly tmax UBS, adding otherone must + // result in another lower level. 
Most likely, the smallest user bin will end up on the lower level + // therefore the penalty is set to 'min * tmax' + // of course, there could also be a third level with a lower number of user bins, but this is hard to + // estimate. + size_t const penalty = min * config.hibf_config.tmax; + return penalty; + } + else // partitions[p].size() + additional_number_of_user_bins <= tmax + { + // if the new user bin is smaller than all other already contained user bins + // the waste of space is high if stored in a single technical bin + if (max_card < min) + return (max - max_card); + // if the new user bin is bigger than all other already contained user bins, the IBF size increases + else if (max_card > max) + return (max_card - max) * config.hibf_config.tmax; + // else, end if-else-block and zero is returned + } + + return (size_t)0u; + }; + + for (size_t p = 0; p < number_of_partitions; ++p) + { + size_t const sum = partition_sizes[p] + sum_card; + size_t const penalty_current_ibf = config.hibf_config.tmax * ((sum <= corrected_estimate_per_part) ? 
0u : sum - corrected_estimate_per_part); + size_t const change = penalty_current_ibf + penalty_lower_level(cluster.size(), p); + + if (change == 0 || /* If there is no penalty at all, this is a best fit even if the partition is "full"*/ + (smallest_change > change)) + { + smallest_change = change; + best_p = p; + best_p_found = true; + } + } + + if (!best_p_found) + return false; + +//std::cout << "best_p:" << best_p << std::endl<< std::endl; + + // now that we know which partition fits best (`best_p`), add those indices to it + for (size_t const user_bin_idx : cluster) + { + partitions[best_p].push_back(user_bin_idx); + partition_sizes[best_p] += cardinalities[user_bin_idx]; + max_partition_cardinality[best_p] = std::max(max_partition_cardinality[best_p], cardinalities[user_bin_idx]); + min_partition_cardinality[best_p] = std::min(min_partition_cardinality[best_p], cardinalities[user_bin_idx]); + } + corrected_estimate_per_part = std::max(corrected_estimate_per_part, static_cast(partition_sizes[best_p])); + + return true; +} + +size_t split_bins(chopper::configuration const & config, + std::vector const & sorted_positions, + std::vector const & cardinalities, + std::vector> & partitions, + size_t const sum_of_cardinalities, + size_t const end_idx) +{ + // assign split bins to the end. 
makes it easier to process merged bins afterwards + size_t pos{partitions.size() - 1}; + for (size_t idx = 0; idx < end_idx; ++idx) { - config.dp_algorithm_timer.start(); - hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config, - kmer_counts, - sketches, - seqan::hibf::iota_vector(sketches.size()), - config.union_estimation_timer, - config.rearrangement_timer); - config.dp_algorithm_timer.stop(); + size_t const ub_idx{sorted_positions[idx]}; + size_t const number_of_split_tbs = std::max((size_t)1, config.hibf_config.tmax * cardinalities[ub_idx] / sum_of_cardinalities); +std::cout << "number_of_split_tbs: " << number_of_split_tbs + << " cardinalities[ub_idx]:" << cardinalities[ub_idx] + << " sum_of_cardinalities:" << sum_of_cardinalities + << std::endl; + // fill partitions from behind to ensure an easier layouting + for (size_t i = 0; i < number_of_split_tbs; ++i) + { + assert(pos > 0); + assert(partitions[pos].empty()); + partitions[pos].push_back(ub_idx); + --pos; + } + } + std::cout << "pos: " << pos << std::endl; + return pos + 1; +} - if (config.output_verbose_statistics) +std::pair find_bins_to_be_split(std::vector const & sorted_positions, + std::vector const & cardinalities, + size_t threshold, + size_t const max_bins) +{ + size_t idx{0}; + size_t sum{0}; + size_t number_of_split_bins = max_bins + 1; // initialise to something bigger to trigger while loop below + + auto find_idx_and_sum = [&]() + { + while (idx < sorted_positions.size() && cardinalities[sorted_positions[idx]] > threshold) { - size_t dummy{}; - chopper::layout::hibf_statistics global_stats{config, sketches, kmer_counts}; - global_stats.hibf_layout = hibf_layout; - global_stats.print_header_to(std::cout); - global_stats.print_summary_to(dummy, std::cout); + sum += cardinalities[sorted_positions[idx]]; + ++idx; } + }; + + while (number_of_split_bins > max_bins) + { + idx = 0; + sum = 0; + find_idx_and_sum(); + number_of_split_bins = sum / threshold; + // max(1.01, ...) 
because a tie would end in an endless loop. + threshold = static_cast(threshold) * std::max(1.01, static_cast(sum) / (threshold * max_bins)); } - // brief Write the output to the layout file. - std::ofstream fout{config.output_filename}; - chopper::layout::write_user_bins_to(filenames, fout); - config.write_to(fout); - hibf_layout.write_to(fout); + // Maybe there are more user bins (position of idx) than available split bins. + // Thus, clamp the number of user bins to split to maximally the number_of_split_bins + idx = std::min(idx, number_of_split_bins); + + return {idx, number_of_split_bins}; +} + +size_t lsh_sim_approach(chopper::configuration const & config, + std::vector const & sorted_positions2, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + std::vector> & partitions, + size_t const number_of_remaining_tbs, + size_t const technical_bin_size_threshold, + size_t const sum_of_cardinalities) +{ + uint8_t const sketch_bits{config.hibf_config.sketch_bits}; + //std::cout << "LSH partitioning into " << config.hibf_config.tmax << std::endl; + std::vector partition_sketches(number_of_remaining_tbs, + seqan::hibf::sketch::hyperloglog(sketch_bits)); + + std::vector max_partition_cardinality(number_of_remaining_tbs, 0u); + std::vector min_partition_cardinality(number_of_remaining_tbs, std::numeric_limits::max()); + + // initial partitioning using locality sensitive hashing (LSH) + config.lsh_algorithm_timer.start(); + std::vector clusters = very_similar_LSH_partitioning(minHash_sketches, sorted_positions2, cardinalities, sketches, technical_bin_size_threshold, config); + post_process_clusters(clusters, cardinalities, config); + config.lsh_algorithm_timer.stop(); + +std::ofstream ofs{"/tmp/final.clusters"}; +for (size_t i = 0; i < clusters.size(); ++i) +{ +seqan::hibf::sketch::hyperloglog sketch(config.hibf_config.sketch_bits); +for (size_t j = 0; j < clusters[i].size(); ++j) +{ + 
sketch.merge(sketches[clusters[i].contained_user_bins()[j]]); +} +ofs << i << ":" << sketch.estimate() << ":" << clusters[i].size() << std::endl; +} + + std::vector> remaining_clusters{}; + + // initialise partitions with the first p largest clusters (post_processing sorts by size) + size_t cidx{0}; // current cluster index + for (size_t p = 0; p < number_of_remaining_tbs; ++p) + { + assert(!clusters[cidx].empty()); + auto const & cluster = clusters[cidx].contained_user_bins(); + bool split_cluster = false; + + if (cluster.size() > config.hibf_config.tmax) + { + size_t card{0}; + for (size_t uidx = 0; uidx < cluster.size(); ++uidx) + card += cardinalities[cluster[uidx]]; + + if (card > 0.05 * sum_of_cardinalities / config.hibf_config.tmax) + split_cluster = true; + } + + size_t end = (split_cluster) ? cluster.size() : std::min(cluster.size(), config.hibf_config.tmax); + for (size_t uidx = 0; uidx < end; ++uidx) + { + size_t const user_bin_idx = cluster[uidx]; + // if a single cluster already exceeds the cardinality_per_part, + // then the remaining user bins of the cluster must spill over into the next partition + if ((uidx != 0 && (uidx % config.hibf_config.tmax == 0)) || partition_sketches[p].estimate() > technical_bin_size_threshold) + { + ++p; + + if (p >= number_of_remaining_tbs) + { + split_cluster = true; + end = uidx; + break; + } + } + + partition_sketches[p].merge(sketches[user_bin_idx]); + partitions[p].push_back(user_bin_idx); + max_partition_cardinality[p] = std::max(max_partition_cardinality[p], cardinalities[user_bin_idx]); + min_partition_cardinality[p] = std::min(min_partition_cardinality[p], cardinalities[user_bin_idx]); + } + + if (split_cluster) + { + std::vector remainder(cluster.begin() + end, cluster.end()); + remaining_clusters.insert(remaining_clusters.end(), remainder); + } + + ++cidx; + } + + for (size_t i = cidx; i < clusters.size(); ++i) + { + if (clusters[i].empty()) + break; + + remaining_clusters.insert(remaining_clusters.end(), 
clusters[i].contained_user_bins()); + } + + // assign the rest by similarity + size_t merged_threshold{technical_bin_size_threshold}; + for (size_t ridx = 0; ridx < remaining_clusters.size(); ++ridx) + { + auto const & cluster = remaining_clusters[ridx]; + + config.search_partition_algorithm_timer.start(); + find_best_partition(config, number_of_remaining_tbs, merged_threshold, cluster, cardinalities, sketches, partitions, partition_sketches, max_partition_cardinality, min_partition_cardinality); + config.search_partition_algorithm_timer.start(); + } + + // compute actual max size + size_t max_size{0}; + for (auto const & sketch : partition_sketches) + max_size = std::max(max_size, (size_t)sketch.estimate()); + + return max_size; +} + +void partition_user_bins(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + std::vector> & partitions) +{ + // all approaches need sorted positions + std::vector const sorted_positions = [&positions, &cardinalities]() + { + std::vector ps(positions.begin(), positions.end()); + seqan::hibf::sketch::toolbox::sort_by_cardinalities(cardinalities, ps); + return ps; + }(); + + auto const [sum_of_cardinalities, max_cardinality, joint_estimate] = [&]() + { + size_t sum{0}; + size_t max{0}; + seqan::hibf::sketch::hyperloglog sketch{config.hibf_config.sketch_bits}; + + for (size_t const pos : positions) + { + sum += cardinalities[pos]; + max = std::max(max, cardinalities[pos]); + sketch.merge(sketches[pos]); + } + + return std::tuple{sum, max, sketch.estimate()}; + }(); + + // If the effective text size is very low, it can happen that the joint_estimate divided by the number of partitions + // is lower than the largest single user bin. But of course, we can never reach a smaller max technical bin size + // then that of the largest user user bin. Thus we can correct the estimate_per_part beforehand. 
+ // This way we make sure there is at least 1 LSH clustering step. + size_t const estimate_per_part = std::max(seqan::hibf::divide_and_ceil(joint_estimate, config.hibf_config.tmax), + max_cardinality + 1); + + double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .relaxed_fpr = config.hibf_config.relaxed_fpr, + .hash_count = config.hibf_config.number_of_hash_functions}); +//std::cout << "sum_of_cardinalities:" << sum_of_cardinalities << " joint_estimate:" << joint_estimate << std::endl; + + size_t split_threshold{}; + size_t idx{0}; // start in sorted positions + size_t number_of_split_tbs{0}; + size_t number_of_merged_tbs{config.hibf_config.tmax}; + size_t max_split_size{0}; + size_t max_merged_size{0}; + + // split_threshold = seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.hibf_config.tmax); + split_threshold = seqan::hibf::divide_and_ceil(joint_estimate, config.hibf_config.tmax); + + auto parition_split_bins = [&]() + { + size_t number_of_potential_split_bins{0}; + std::tie(idx, number_of_potential_split_bins) = find_bins_to_be_split(sorted_positions, cardinalities, split_threshold, config.hibf_config.tmax - 1); + std::tie(number_of_split_tbs, max_split_size) = chopper::layout::determine_split_bins(config, sorted_positions, cardinalities, number_of_potential_split_bins, idx, partitions); + number_of_merged_tbs = config.hibf_config.tmax - number_of_split_tbs; + }; + + auto partitions_merged_bins = [&]() + { + + if (config.partitioning_approach == partitioning_scheme::blocked) + { + size_t const estimated_tb_size = [&]() + { + size_t sum{0}; + + for (size_t pos{idx}; pos < sorted_positions.size(); ++pos) + sum += cardinalities[sorted_positions[pos]]; + + return seqan::hibf::divide_and_ceil(sum, number_of_merged_tbs); + }(); + + size_t current_part{0}; + size_t current_sum{0}; + + for (;idx < sorted_positions.size(); ++idx) + { + size_t const 
current_user_bin_id{sorted_positions[idx]}; + partitions[current_part].push_back(current_user_bin_id); + current_sum += cardinalities[current_user_bin_id]; + + if (current_sum >= estimated_tb_size && current_part < number_of_merged_tbs) + { + max_merged_size = std::max(current_sum, max_merged_size); + ++current_part; + current_sum = 0; + assert(current_part < number_of_merged_tbs); + } + } + } + else if (config.partitioning_approach == partitioning_scheme::sorted) + { + size_t current_tb_size_threshold = split_threshold; + + size_t current_part{0}; + seqan::hibf::sketch::hyperloglog current_sketch{config.hibf_config.sketch_bits}; + std::vector partition_sketches(config.hibf_config.tmax, current_sketch); + + for (;idx < sorted_positions.size(); ++idx) + { + size_t const current_user_bin_id{sorted_positions[idx]}; + partitions[current_part].push_back(current_user_bin_id); + current_sketch.merge(sketches[current_user_bin_id]); + + if (current_sketch.estimate() >= current_tb_size_threshold) + { + max_merged_size = std::max((size_t)current_sketch.estimate(), max_merged_size); + partition_sketches[current_part] = current_sketch; + current_sketch = seqan::hibf::sketch::hyperloglog{config.hibf_config.sketch_bits}; + ++current_part; + + if (current_part >= number_of_merged_tbs) + { + current_tb_size_threshold *= 1 + static_cast(idx) / sorted_positions.size(); + current_part = 0; // fill up from the start agagin as the threshold increased + } + } + } + } + else if (config.partitioning_approach == partitioning_scheme::folded) + { + } + else if (config.partitioning_approach == partitioning_scheme::weighted_fold) + { + size_t const cardinality_per_part = + seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.hibf_config.tmax); + size_t const u_bins_per_part = seqan::hibf::divide_and_ceil(cardinalities.size(), config.hibf_config.tmax); + + size_t current_big_pos{0}; // the next largest user bin to assign to a partition + size_t current_small_pos{cardinalities.size() - 1}; 
// the next small user bin + + for (size_t current_part = 0; current_part + 1 < config.hibf_config.tmax; ++current_part) + { + size_t current_cardinality{0}; + std::vector small_bins; + size_t new_small_bin_addition{0}; + + auto compute_score = [&]() + { + double const weight = static_cast(current_cardinality) / cardinality_per_part; + double const amount = + static_cast(partitions[current_part].size() + small_bins.size() + new_small_bin_addition) + / u_bins_per_part; + return std::abs(1.0 - weight) + std::abs(1.0 - amount); + }; + + // first add all large bins that fit + while (current_cardinality < cardinality_per_part) + { + partitions[current_part].push_back(sorted_positions[current_big_pos]); + current_cardinality += cardinalities[sorted_positions[current_big_pos]]; + ++current_big_pos; + } + + double local_optimum = compute_score(); + + // then remove big bins and add small bins until a local optima is reached + while (true) + { + size_t const cache_last_small_pos{current_small_pos}; + // remove a big user bin and fill the partition with small user bins + current_cardinality -= cardinalities[sorted_positions[current_big_pos]]; + + // can we further improve the ratio by adding more small bins? + double improved_score{}; + do + { + improved_score = compute_score(); + current_cardinality += cardinalities[sorted_positions[current_small_pos]]; + --current_small_pos; + ++new_small_bin_addition; + } + while (compute_score() < improved_score); // smaller is better + // remove overstep + ++current_small_pos; + current_cardinality -= cardinalities[sorted_positions[current_small_pos]]; + --new_small_bin_addition; + + if (local_optimum < compute_score()) // score would increase. 
Stop + { + current_small_pos = cache_last_small_pos; + break; + } + else // update + { + partitions[current_part].pop_back(); + --current_big_pos; + for (size_t pos = cache_last_small_pos; pos > current_small_pos; --pos) + small_bins.push_back(sorted_positions[pos]); + } + } + partitions[current_part].insert(partitions[current_part].end(), small_bins.begin(), small_bins.end()); + } + + // remaining user bins go to last partition + while (current_big_pos <= current_small_pos) + { + partitions[config.hibf_config.tmax - 1].push_back(sorted_positions[current_big_pos]); + ++current_big_pos; + } + } + else if (config.partitioning_approach == partitioning_scheme::similarity) + { + size_t const number_of_remaining_ubs = positions.size() - idx; + uint8_t const sketch_bits{config.hibf_config.sketch_bits}; + std::vector partition_sketches(number_of_merged_tbs, + seqan::hibf::sketch::hyperloglog(sketch_bits)); + + size_t const corrected_max_split_size = max_split_size / relaxed_fpr_correction; + size_t merged_threshold = std::max(corrected_max_split_size, split_threshold); + + size_t const u_bins_per_part = number_of_remaining_ubs / number_of_merged_tbs; // round down + size_t const block_size = + std::min(u_bins_per_part, + chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(u_bins_per_part))))); + size_t const number_of_blocks = seqan::hibf::divide_and_ceil(number_of_remaining_ubs, block_size); + + std::vector max_partition_cardinality(number_of_merged_tbs, 0u); + std::vector min_partition_cardinality(number_of_merged_tbs, std::numeric_limits::max()); + + // don't move from largest to smallest but pick the next block to process randomly. 
+ // this probably leads to more evenly distributed partitions (evenly in terms of number of user bins) + std::vector indices(number_of_blocks); + std::iota(indices.begin(), indices.end(), 0); + std::random_device shuffle_random_device; + std::mt19937 shuffle_engine(shuffle_random_device()); + std::shuffle(indices.begin(), indices.end(), shuffle_engine); + + // initialise partitions with the first random number_of_merged_tbs blocks + assert(number_of_blocks >= number_of_merged_tbs); + for (size_t p = 0; p < number_of_merged_tbs; ++p) + { + size_t const i = indices[p]; + size_t const end = (i == (indices.size() - 1) && (number_of_remaining_ubs % block_size != 0)) ? (number_of_remaining_ubs % block_size) : block_size; + for (size_t x = 0; x < end; ++x) + { + assert(idx + block_size * i + x < sorted_positions.size()); + size_t const user_bin_idx{sorted_positions[idx + block_size * i + x]}; + partition_sketches[p].merge(sketches[user_bin_idx]); + partitions[p].push_back(user_bin_idx); + max_partition_cardinality[p] = std::max(max_partition_cardinality[p], cardinalities[user_bin_idx]); + min_partition_cardinality[p] = std::min(min_partition_cardinality[p], cardinalities[user_bin_idx]); + } + } + + auto has_been_processed_in_init = [&](size_t const i) + { + bool result{false}; + for (size_t p = 0; p < number_of_merged_tbs; ++p) + result |= ((idx + indices[p] * block_size) == i); + return result; + }; + + // assign the rest by similarity, user in by user bin (single bins, no blocks) + for (size_t i = idx; i < sorted_positions.size(); ++i) + { + if (has_been_processed_in_init(i)) + { + i += (block_size - 1); // -1 because there will be an increment after continue + continue; + } + + find_best_partition(config, + number_of_merged_tbs, + merged_threshold, + {sorted_positions[i]}, + cardinalities, + sketches, + partitions, + partition_sketches, + max_partition_cardinality, + min_partition_cardinality); + } + + // compute actual max size + for (auto const & sketch : 
partition_sketches) + max_merged_size = std::max(max_merged_size, (size_t)sketch.estimate()); + } + else if (config.partitioning_approach == partitioning_scheme::lsh) + { + std::vector const sorted_positions2(sorted_positions.begin() + idx, sorted_positions.end()); + size_t const corrected_max_split_size = max_split_size / relaxed_fpr_correction; + size_t const merged_threshold = std::max(corrected_max_split_size, split_threshold); + + std::vector partition_sizes(number_of_merged_tbs, 0u); + + std::vector max_partition_cardinality(number_of_merged_tbs, 0u); + std::vector min_partition_cardinality(number_of_merged_tbs, std::numeric_limits::max()); + + // initial partitioning using locality sensitive hashing (LSH) + config.lsh_algorithm_timer.start(); + std::vector clusters = very_similar_LSH_partitioning(minHash_sketches, sorted_positions2, cardinalities, sketches, merged_threshold, config); + post_process_clusters(clusters, cardinalities, config); + config.lsh_algorithm_timer.stop(); + +std::ofstream ofs{"/tmp/final.clusters"}; +for (size_t i = 0; i < clusters.size(); ++i) +{ + seqan::hibf::sketch::hyperloglog sketch(config.hibf_config.sketch_bits); + for (size_t j = 0; j < clusters[i].size(); ++j) + { + sketch.merge(sketches[clusters[i].contained_user_bins()[j]]); + } + ofs << i << ":" << sketch.estimate() << ":" << clusters[i].size() << std::endl; +} + + std::vector> remaining_clusters{}; + + size_t cidx{0}; // current cluster index + // initialise partitions with the first p largest clusters (post_processing sorts by size) + for (size_t p = 0; p < number_of_merged_tbs; ++p) + { + assert(!clusters[cidx].empty()); + auto const & cluster = clusters[cidx].contained_user_bins(); + bool split_cluster = false; + + if (cluster.size() > config.hibf_config.tmax) + { + size_t card{0}; + for (size_t uidx = 0; uidx < cluster.size(); ++uidx) + card += cardinalities[cluster[uidx]]; + + if (card > 0.05 * sum_of_cardinalities / config.hibf_config.tmax) + split_cluster = true; + 
} + + size_t end = (split_cluster) ? cluster.size() : std::min(cluster.size(), config.hibf_config.tmax); + for (size_t uidx = 0; uidx < end; ++uidx) + { + size_t const user_bin_idx = cluster[uidx]; + // if a single cluster already exceeds the cardinality_per_part, + // then the remaining user bins of the cluster must spill over into the next partition + if ((uidx != 0 && (uidx % config.hibf_config.tmax == 0)) || partition_sizes[p] > merged_threshold) + { + ++p; + + if (p >= number_of_merged_tbs) + { + split_cluster = true; + end = uidx; + break; + } + } + + partition_sizes[p] += cardinalities[user_bin_idx]; + partitions[p].push_back(user_bin_idx); + max_partition_cardinality[p] = std::max(max_partition_cardinality[p], cardinalities[user_bin_idx]); + min_partition_cardinality[p] = std::min(min_partition_cardinality[p], cardinalities[user_bin_idx]); + } + + if (split_cluster) + { + std::vector remainder(cluster.begin() + end, cluster.end()); + remaining_clusters.insert(remaining_clusters.end(), remainder); + } + + ++cidx; + } + + for (size_t i = cidx; i < clusters.size(); ++i) + { + if (clusters[i].empty()) + break; + + remaining_clusters.insert(remaining_clusters.end(), clusters[i].contained_user_bins()); + } + + // assign the rest by similarity + max_merged_size = merged_threshold; + for (size_t ridx = 0; ridx < remaining_clusters.size(); ++ridx) + { + auto const & cluster = remaining_clusters[ridx]; + + config.search_partition_algorithm_timer.start(); + find_best_sorted_partition(config, number_of_merged_tbs, max_merged_size, cluster, cardinalities, sketches, partitions, partition_sizes, max_partition_cardinality, min_partition_cardinality); + config.search_partition_algorithm_timer.start(); + + } + } + else if (config.partitioning_approach == partitioning_scheme::lsh_sim) + { + // determine number of split bins + std::vector const sorted_positions2(sorted_positions.begin() + idx, sorted_positions.end()); + + // distribute the rest to merged bins + size_t const 
corrected_max_split_size = max_split_size / relaxed_fpr_correction; + size_t const merged_threshold = std::max(corrected_max_split_size, split_threshold); + + max_merged_size = lsh_sim_approach(config, sorted_positions2, cardinalities, sketches, minHash_sketches, partitions, number_of_merged_tbs, merged_threshold, sum_of_cardinalities); + } + + }; + + parition_split_bins(); + partitions_merged_bins(); + + int64_t const difference = static_cast(max_merged_size * relaxed_fpr_correction) - static_cast(max_split_size); + +std::cout << "number_of_split_tbs:" << number_of_split_tbs << " difference:" << difference << std::endl; +std::cout << "Reconfiguring threshold. from:" << split_threshold; + + if (number_of_split_tbs == 0) + split_threshold = (split_threshold + max_merged_size) / 2; // increase threshold + else if (difference > 0) // need more merged bins -> increase threshold + split_threshold = static_cast(split_threshold) * ((static_cast(max_merged_size) * relaxed_fpr_correction) / static_cast(max_split_size)); + else // need more split bins -> decrease threshold + split_threshold = static_cast(split_threshold) * ((static_cast(max_merged_size) * relaxed_fpr_correction) / static_cast(max_split_size)); + +std::cout << " to:" << split_threshold << std::endl; + + // reset result + partitions.clear(); + partitions.resize(config.hibf_config.tmax); + idx = 0; + number_of_split_tbs = 0; + number_of_merged_tbs = config.hibf_config.tmax; + max_split_size = 0; + max_merged_size = 0; + + parition_split_bins(); + partitions_merged_bins(); + + // sanity check: + size_t sum{0}; + size_t last_index{positions.size()}; // non existing user bin idx + for (auto const & p : partitions) + { + if (p.size() == 1) + { + // for split bins, avoid additional counts + if (p[0] != last_index) + ++sum; + + last_index = p[0]; + } + else + { + sum += p.size(); + } + } + + if (sum != positions.size()) + { + std::string str{"Not all user bins have been assigned to the "}; + str += 
std::to_string(partitions.size()); + str += " partitions! ("; + str += std::to_string(sum); + str += "/"; + str += std::to_string(positions.size()); + str += ")\n"; + for (auto const & p : partitions) + { + str += "["; + for (auto const h : p) + { + str += std::to_string(h); + str += ","; + } + str.back() = ']'; + str += '\n'; + } + + throw std::logic_error{str}; + } + + // assert([&](){ bool x{false}; for (auto const & p : partitions) { x &= !p.empty(); }; return x; }); +} + +seqan::hibf::layout::layout general_layout(chopper::configuration const & config, + std::vector positions, + std::vector const & cardinalities, + std::vector const & sketches) +{ + seqan::hibf::layout::layout hibf_layout; + + seqan::hibf::concurrent_timer union_estimation_timer{}; + seqan::hibf::concurrent_timer rearrangement_timer{}; + seqan::hibf::concurrent_timer dp_algorithm_timer{}; + + dp_algorithm_timer.start(); + hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config, + cardinalities, + sketches, + std::move(positions), + union_estimation_timer, + rearrangement_timer); + dp_algorithm_timer.stop(); + + return hibf_layout; +} + +bool do_I_need_a_fast_layout(chopper::configuration const & config, std::vector const & positions, std::vector const & cardinalities) +{ + // the fast layout heuristic would greedily merge even if merging only 2 bins at a time + // merging only little number of bins is highly disadvantegous for lower levels because few bins + // will be heavily split and this will raise the fpr correction for split bins + // Thus, if the average number of user bins per technical bin is less then 64, we should not fast layout + if (positions.size() < (64 * config.hibf_config.tmax)) + return false; + + if (positions.size() > 500'000) // layout takes more than half a day (should this be a user option?) 
+ return true; + + size_t largest_size{0}; + size_t sum_of_cardinalities{0}; + + for (size_t const i : positions) + { + sum_of_cardinalities += cardinalities[i]; + largest_size = std::max(largest_size, cardinalities[i]); + } + + size_t const cardinality_per_tb = sum_of_cardinalities / config.hibf_config.tmax; + + bool const largest_user_bin_might_be_split = largest_size > cardinality_per_tb; + + // if no splitting is needed, its worth it to use a fast-merge-only algorithm + if (!largest_user_bin_might_be_split) + return true; + + return false; +} + +void add_level_to_layout(chopper::configuration const & config, + seqan::hibf::layout::layout & hibf_layout, + std::vector> const & partitions, + std::vector const & sketches, + std::vector const & previous) +{ + size_t max_bin_id{0}; + size_t max_size{0}; + + auto const split_fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .hash_count = config.hibf_config.number_of_hash_functions, + .t_max = partitions.size()}); + + double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .relaxed_fpr = config.hibf_config.relaxed_fpr, + .hash_count = config.hibf_config.number_of_hash_functions}); + + // we assume here that the user bins have been sorted by user bin id such that pos = idx + for (size_t partition_idx{0}; partition_idx < partitions.size(); ++partition_idx) + { + auto const & partition = partitions[partition_idx]; + + if (partition.size() > 1) // merged bin + { + seqan::hibf::sketch::hyperloglog current_sketch{sketches[0]}; // ensure same bit size + current_sketch.reset(); + + for (size_t const user_bin_id : partition) + { + assert(hibf_layout.user_bins[user_bin_id].idx == user_bin_id); + auto & current_user_bin = hibf_layout.user_bins[user_bin_id]; + + // update + assert(previous == current_user_bin.previous_TB_indices); + current_user_bin.previous_TB_indices.push_back(partition_idx); + 
current_sketch.merge(sketches[user_bin_id]); + } + + // update max_bin_id, max_size + size_t const current_size = current_sketch.estimate() * relaxed_fpr_correction; + if (current_size > max_size) + { + max_bin_id = partition_idx; + max_size = current_size; + } + } + else if (partition.size() == 0) // should not happen.. dge case? + { + continue; + } + else // single or split bin (partition.size() == 1) + { + auto & current_user_bin = hibf_layout.user_bins[partitions[partition_idx][0]]; + assert(current_user_bin.idx == partitions[partition_idx][0]); + current_user_bin.storage_TB_id = partition_idx; + current_user_bin.number_of_technical_bins = 1; // initialise to 1 + + while (partition_idx + 1 < partitions.size() && + partitions[partition_idx].size() == 1 && + partitions[partition_idx + 1].size() == 1 && + partitions[partition_idx][0] == partitions[partition_idx + 1][0]) + { + ++current_user_bin.number_of_technical_bins; + ++partition_idx; + } + + // update max_bin_id, max_size + size_t const current_size = sketches[current_user_bin.idx].estimate() * split_fpr_correction[current_user_bin.number_of_technical_bins]; + if (current_size > max_size) + { + max_bin_id = current_user_bin.storage_TB_id; + max_size = current_size; + } + } + } + + hibf_layout.max_bins.emplace_back(previous, max_bin_id); // add lower level meta information +} + +void update_layout_from_child_layout(seqan::hibf::layout::layout & child_layout, + seqan::hibf::layout::layout & hibf_layout, + std::vector const & new_previous) +{ + hibf_layout.max_bins.emplace_back(new_previous, child_layout.top_level_max_bin_id); + + for (auto & max_bin : child_layout.max_bins) + { + max_bin.previous_TB_indices.insert(max_bin.previous_TB_indices.begin(), new_previous.begin(), new_previous.end()); + hibf_layout.max_bins.push_back(max_bin); + } + + for (auto const & user_bin : child_layout.user_bins) + { + auto & actual_user_bin = hibf_layout.user_bins[user_bin.idx]; + + 
actual_user_bin.previous_TB_indices.insert(actual_user_bin.previous_TB_indices.end(), + user_bin.previous_TB_indices.begin(), + user_bin.previous_TB_indices.end()); + actual_user_bin.number_of_technical_bins = user_bin.number_of_technical_bins; + actual_user_bin.storage_TB_id = user_bin.storage_TB_id; + } +} + +void fast_layout_recursion(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + seqan::hibf::layout::layout & hibf_layout, + std::vector const & previous) +{ + std::vector> tmax_partitions(config.hibf_config.tmax); + + // here we assume that we want to start with a fast layout + partition_user_bins(config, positions, cardinalities, sketches, minHash_sketches, tmax_partitions); + + #pragma omp critical + { + add_level_to_layout(config, hibf_layout, tmax_partitions, sketches, previous); + } + + for (size_t partition_idx = 0; partition_idx < tmax_partitions.size(); ++partition_idx) + { + auto const & partition = tmax_partitions[partition_idx]; + auto const new_previous = [&] () {auto cpy{previous}; cpy.push_back(partition_idx); return cpy; }(); + + if (partition.empty() || partition.size() == 1) // nothing to merge + continue; + + if (do_I_need_a_fast_layout(config, partition, cardinalities)) + { + fast_layout_recursion(config, partition, cardinalities, sketches, minHash_sketches, hibf_layout, new_previous); // recurse fast_layout + } + else + { + auto child_layout = general_layout(config, partition, cardinalities, sketches); + + #pragma omp critical + { + update_layout_from_child_layout(child_layout, hibf_layout, new_previous); + } + } + } +} + +void fast_layout(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + seqan::hibf::layout::layout & hibf_layout) +{ + auto config_copy = config; + + std::vector> 
tmax_partitions(config.hibf_config.tmax); + + // here we assume that we want to start with a fast layout + config.intital_partition_timer.start(); + partition_user_bins(config_copy, positions, cardinalities, sketches, minHash_sketches, tmax_partitions); + config.intital_partition_timer.stop(); + + auto const split_fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .hash_count = config.hibf_config.number_of_hash_functions, + .t_max = config.hibf_config.tmax}); + + double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .relaxed_fpr = config.hibf_config.relaxed_fpr, + .hash_count = config.hibf_config.number_of_hash_functions}); + + size_t max_bin_id{0}; + size_t max_size{0}; + hibf_layout.user_bins.resize(config_copy.hibf_config.number_of_user_bins); + + // initialise user bins in layout + for (size_t partition_idx = 0; partition_idx < tmax_partitions.size(); ++partition_idx) + { + if (tmax_partitions[partition_idx].size() > 1) // merged bin + { + seqan::hibf::sketch::hyperloglog current_sketch{sketches[0]}; // ensure same bit size + current_sketch.reset(); + + for (size_t const user_bin_id : tmax_partitions[partition_idx]) + { + hibf_layout.user_bins[user_bin_id] = {.previous_TB_indices = {partition_idx}, + .storage_TB_id = 0 /*not determiend yet*/, + .number_of_technical_bins = 1 /*not determiend yet*/, + .idx = user_bin_id}; + current_sketch.merge(sketches[user_bin_id]); + } + + // update max_bin_id, max_size + size_t const current_size = current_sketch.estimate() * relaxed_fpr_correction; + if (current_size > max_size) + { + max_bin_id = partition_idx; + max_size = current_size; + } + } + else if (tmax_partitions[partition_idx].size() == 0) // should not happen. Edge case? 
+ { + continue; + } + else // single or split bin (tmax_partitions[partition_idx].size() == 1) + { + assert(tmax_partitions[partition_idx].size() == 1); + size_t const user_bin_id = tmax_partitions[partition_idx][0]; + hibf_layout.user_bins[user_bin_id] = {.previous_TB_indices = {}, + .storage_TB_id = partition_idx, + .number_of_technical_bins = 1 /*determiend below*/, + .idx = user_bin_id}; + + while (partition_idx + 1 < tmax_partitions.size() && + tmax_partitions[partition_idx].size() == 1 && + tmax_partitions[partition_idx + 1].size() == 1 && + tmax_partitions[partition_idx][0] == tmax_partitions[partition_idx + 1][0]) + { + ++hibf_layout.user_bins[user_bin_id].number_of_technical_bins; + ++partition_idx; + } + + // update max_bin_id, max_size + size_t const current_size = sketches[user_bin_id].estimate() * split_fpr_correction[hibf_layout.user_bins[user_bin_id].number_of_technical_bins]; + if (current_size > max_size) + { + max_bin_id = hibf_layout.user_bins[user_bin_id].storage_TB_id; + max_size = current_size; + } + } + } + + hibf_layout.top_level_max_bin_id = max_bin_id; + + config.small_layouts_timer.start(); + #pragma omp parallel + #pragma omp single + { + #pragma omp taskloop + for (size_t partition_idx = 0; partition_idx < tmax_partitions.size(); ++partition_idx) + { + auto const & partition = tmax_partitions[partition_idx]; + + if (partition.empty() || partition.size() == 1) // nothing to merge + continue; + + if (do_I_need_a_fast_layout(config_copy, partition, cardinalities)) + { + fast_layout_recursion(config_copy, partition, cardinalities, sketches, minHash_sketches, hibf_layout, {partition_idx}); // recurse fast_layout + } + else + { + auto small_layout = general_layout(config, partition, cardinalities, sketches); + + #pragma omp critical + { + update_layout_from_child_layout(small_layout, hibf_layout, std::vector{partition_idx}); + } + } + } + } + config.small_layouts_timer.stop(); +} + +int execute(chopper::configuration & config, + std::vector> 
const & filenames, + std::vector const & sketches, + std::vector const & minHash_sketches) +{ + config.hibf_config.validate_and_set_defaults(); + + std::vector cardinalities; + seqan::hibf::sketch::estimate_kmer_counts(sketches, cardinalities); + + if (true) // 0 == unset == single HIBF, 1 == single HIBF + { + seqan::hibf::layout::layout hibf_layout; + + + if (config.determine_best_tmax) + { + hibf_layout = determine_best_number_of_technical_bins(config, cardinalities, sketches); + } + else + { + config.dp_algorithm_timer.start(); + fast_layout(config, seqan::hibf::iota_vector(sketches.size()), cardinalities, sketches, minHash_sketches, hibf_layout); + config.dp_algorithm_timer.stop(); + + // hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config, + // cardinalities, + // sketches, + // seqan::hibf::iota_vector(sketches.size()), + // config.union_estimation_timer, + // config.rearrangement_timer); + + // sort records ascending by the number of bin indices (corresponds to the IBF levels) + // GCOVR_EXCL_START + std::ranges::sort(hibf_layout.max_bins, + [](auto const & r, auto const & l) + { + if (r.previous_TB_indices.size() == l.previous_TB_indices.size()) + return std::ranges::lexicographical_compare(r.previous_TB_indices, l.previous_TB_indices); + else + return r.previous_TB_indices.size() < l.previous_TB_indices.size(); + }); + // GCOVR_EXCL_STOP + + if (config.output_verbose_statistics) + { + size_t dummy{}; + chopper::layout::hibf_statistics global_stats{config, sketches, cardinalities}; + global_stats.hibf_layout = hibf_layout; + global_stats.print_header_to(std::cout); + global_stats.print_summary_to(dummy, std::cout); + } + } + + // brief Write the output to the layout file. 
+ std::ofstream fout{config.output_filename}; + chopper::layout::write_user_bins_to(filenames, fout); + config.write_to(fout); + hibf_layout.write_to(fout); + } + else + { + std::vector> positions(config.hibf_config.tmax); // asign positions for each partition + + std::vector ps; + ps.resize(cardinalities.size()); + std::iota(ps.begin(), ps.end(), 0); + partition_user_bins(config, ps, cardinalities, sketches, minHash_sketches, positions); + + std::vector hibf_layouts(config.hibf_config.tmax); // multiple layouts + +#pragma omp parallel for schedule(dynamic) num_threads(config.hibf_config.threads) + for (size_t i = 0; i < config.hibf_config.tmax; ++i) + { + // reset tmax to fit number of user bins in layout + auto local_hibf_config = config.hibf_config; // every thread needs to set individual tmax + local_hibf_config.tmax = + chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(positions[i].size())))); + + config.dp_algorithm_timer.start(); + hibf_layouts[i] = seqan::hibf::layout::compute_layout(local_hibf_config, + cardinalities, + sketches, + std::move(positions[i]), + config.union_estimation_timer, + config.rearrangement_timer); + config.dp_algorithm_timer.stop(); + } + + // brief Write the output to the layout file. 
+ std::ofstream fout{config.output_filename}; + chopper::layout::write_user_bins_to(filenames, fout); + config.write_to(fout); + + for (size_t i = 0; i < config.hibf_config.tmax; ++i) + hibf_layouts[i].write_to(fout); + } return 0; } diff --git a/src/layout/hibf_statistics.cpp b/src/layout/hibf_statistics.cpp index df86775c..e171ed93 100644 --- a/src/layout/hibf_statistics.cpp +++ b/src/layout/hibf_statistics.cpp @@ -398,7 +398,7 @@ void hibf_statistics::collect_bins() # pragma GCC diagnostic ignored "-Warray-bounds=" # pragma GCC diagnostic ignored "-Wstringop-overflow=" #endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV - ibf.bins.emplace_back(hibf_statistics::bin_kind::merged, 1, std::vector{user_bin_info.idx}); + // ibf.bins.emplace_back(hibf_statistics::bin_kind::merged, 1, std::vector{user_bin_info.idx}); #if CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV # pragma GCC diagnostic pop #endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV From 4aefac2aee107d2d3c67bd58d39604a71bb7bf05 Mon Sep 17 00:00:00 2001 From: smehringer Date: Thu, 24 Jul 2025 21:44:50 +0200 Subject: [PATCH 06/18] delete worse approaches --- src/layout/execute.cpp | 324 ----------------------------------------- 1 file changed, 324 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index fd02cd0d..3c323d5f 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -1071,328 +1071,6 @@ void partition_user_bins(chopper::configuration const & config, }; auto partitions_merged_bins = [&]() - { - - if (config.partitioning_approach == partitioning_scheme::blocked) - { - size_t const estimated_tb_size = [&]() - { - size_t sum{0}; - - for (size_t pos{idx}; pos < sorted_positions.size(); ++pos) - sum += cardinalities[sorted_positions[pos]]; - - return seqan::hibf::divide_and_ceil(sum, number_of_merged_tbs); - }(); - - size_t current_part{0}; - size_t current_sum{0}; - - for (;idx < sorted_positions.size(); ++idx) - { - size_t const current_user_bin_id{sorted_positions[idx]}; - 
partitions[current_part].push_back(current_user_bin_id); - current_sum += cardinalities[current_user_bin_id]; - - if (current_sum >= estimated_tb_size && current_part < number_of_merged_tbs) - { - max_merged_size = std::max(current_sum, max_merged_size); - ++current_part; - current_sum = 0; - assert(current_part < number_of_merged_tbs); - } - } - } - else if (config.partitioning_approach == partitioning_scheme::sorted) - { - size_t current_tb_size_threshold = split_threshold; - - size_t current_part{0}; - seqan::hibf::sketch::hyperloglog current_sketch{config.hibf_config.sketch_bits}; - std::vector partition_sketches(config.hibf_config.tmax, current_sketch); - - for (;idx < sorted_positions.size(); ++idx) - { - size_t const current_user_bin_id{sorted_positions[idx]}; - partitions[current_part].push_back(current_user_bin_id); - current_sketch.merge(sketches[current_user_bin_id]); - - if (current_sketch.estimate() >= current_tb_size_threshold) - { - max_merged_size = std::max((size_t)current_sketch.estimate(), max_merged_size); - partition_sketches[current_part] = current_sketch; - current_sketch = seqan::hibf::sketch::hyperloglog{config.hibf_config.sketch_bits}; - ++current_part; - - if (current_part >= number_of_merged_tbs) - { - current_tb_size_threshold *= 1 + static_cast(idx) / sorted_positions.size(); - current_part = 0; // fill up from the start agagin as the threshold increased - } - } - } - } - else if (config.partitioning_approach == partitioning_scheme::folded) - { - } - else if (config.partitioning_approach == partitioning_scheme::weighted_fold) - { - size_t const cardinality_per_part = - seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.hibf_config.tmax); - size_t const u_bins_per_part = seqan::hibf::divide_and_ceil(cardinalities.size(), config.hibf_config.tmax); - - size_t current_big_pos{0}; // the next largest user bin to assign to a partition - size_t current_small_pos{cardinalities.size() - 1}; // the next small user bin - - for (size_t 
current_part = 0; current_part + 1 < config.hibf_config.tmax; ++current_part) - { - size_t current_cardinality{0}; - std::vector small_bins; - size_t new_small_bin_addition{0}; - - auto compute_score = [&]() - { - double const weight = static_cast(current_cardinality) / cardinality_per_part; - double const amount = - static_cast(partitions[current_part].size() + small_bins.size() + new_small_bin_addition) - / u_bins_per_part; - return std::abs(1.0 - weight) + std::abs(1.0 - amount); - }; - - // first add all large bins that fit - while (current_cardinality < cardinality_per_part) - { - partitions[current_part].push_back(sorted_positions[current_big_pos]); - current_cardinality += cardinalities[sorted_positions[current_big_pos]]; - ++current_big_pos; - } - - double local_optimum = compute_score(); - - // then remove big bins and add small bins until a local optima is reached - while (true) - { - size_t const cache_last_small_pos{current_small_pos}; - // remove a big user bin and fill the partition with small user bins - current_cardinality -= cardinalities[sorted_positions[current_big_pos]]; - - // can we further improve the ratio by adding more small bins? - double improved_score{}; - do - { - improved_score = compute_score(); - current_cardinality += cardinalities[sorted_positions[current_small_pos]]; - --current_small_pos; - ++new_small_bin_addition; - } - while (compute_score() < improved_score); // smaller is better - // remove overstep - ++current_small_pos; - current_cardinality -= cardinalities[sorted_positions[current_small_pos]]; - --new_small_bin_addition; - - if (local_optimum < compute_score()) // score would increase. 
Stop - { - current_small_pos = cache_last_small_pos; - break; - } - else // update - { - partitions[current_part].pop_back(); - --current_big_pos; - for (size_t pos = cache_last_small_pos; pos > current_small_pos; --pos) - small_bins.push_back(sorted_positions[pos]); - } - } - partitions[current_part].insert(partitions[current_part].end(), small_bins.begin(), small_bins.end()); - } - - // remaining user bins go to last partition - while (current_big_pos <= current_small_pos) - { - partitions[config.hibf_config.tmax - 1].push_back(sorted_positions[current_big_pos]); - ++current_big_pos; - } - } - else if (config.partitioning_approach == partitioning_scheme::similarity) - { - size_t const number_of_remaining_ubs = positions.size() - idx; - uint8_t const sketch_bits{config.hibf_config.sketch_bits}; - std::vector partition_sketches(number_of_merged_tbs, - seqan::hibf::sketch::hyperloglog(sketch_bits)); - - size_t const corrected_max_split_size = max_split_size / relaxed_fpr_correction; - size_t merged_threshold = std::max(corrected_max_split_size, split_threshold); - - size_t const u_bins_per_part = number_of_remaining_ubs / number_of_merged_tbs; // round down - size_t const block_size = - std::min(u_bins_per_part, - chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(u_bins_per_part))))); - size_t const number_of_blocks = seqan::hibf::divide_and_ceil(number_of_remaining_ubs, block_size); - - std::vector max_partition_cardinality(number_of_merged_tbs, 0u); - std::vector min_partition_cardinality(number_of_merged_tbs, std::numeric_limits::max()); - - // don't move from largest to smallest but pick the next block to process randomly. 
- // this probably leads to more evenly distributed partitions (evenly in terms of number of user bins) - std::vector indices(number_of_blocks); - std::iota(indices.begin(), indices.end(), 0); - std::random_device shuffle_random_device; - std::mt19937 shuffle_engine(shuffle_random_device()); - std::shuffle(indices.begin(), indices.end(), shuffle_engine); - - // initialise partitions with the first random number_of_merged_tbs blocks - assert(number_of_blocks >= number_of_merged_tbs); - for (size_t p = 0; p < number_of_merged_tbs; ++p) - { - size_t const i = indices[p]; - size_t const end = (i == (indices.size() - 1) && (number_of_remaining_ubs % block_size != 0)) ? (number_of_remaining_ubs % block_size) : block_size; - for (size_t x = 0; x < end; ++x) - { - assert(idx + block_size * i + x < sorted_positions.size()); - size_t const user_bin_idx{sorted_positions[idx + block_size * i + x]}; - partition_sketches[p].merge(sketches[user_bin_idx]); - partitions[p].push_back(user_bin_idx); - max_partition_cardinality[p] = std::max(max_partition_cardinality[p], cardinalities[user_bin_idx]); - min_partition_cardinality[p] = std::min(min_partition_cardinality[p], cardinalities[user_bin_idx]); - } - } - - auto has_been_processed_in_init = [&](size_t const i) - { - bool result{false}; - for (size_t p = 0; p < number_of_merged_tbs; ++p) - result |= ((idx + indices[p] * block_size) == i); - return result; - }; - - // assign the rest by similarity, user in by user bin (single bins, no blocks) - for (size_t i = idx; i < sorted_positions.size(); ++i) - { - if (has_been_processed_in_init(i)) - { - i += (block_size - 1); // -1 because there will be an increment after continue - continue; - } - - find_best_partition(config, - number_of_merged_tbs, - merged_threshold, - {sorted_positions[i]}, - cardinalities, - sketches, - partitions, - partition_sketches, - max_partition_cardinality, - min_partition_cardinality); - } - - // compute actual max size - for (auto const & sketch : 
partition_sketches) - max_merged_size = std::max(max_merged_size, (size_t)sketch.estimate()); - } - else if (config.partitioning_approach == partitioning_scheme::lsh) - { - std::vector const sorted_positions2(sorted_positions.begin() + idx, sorted_positions.end()); - size_t const corrected_max_split_size = max_split_size / relaxed_fpr_correction; - size_t const merged_threshold = std::max(corrected_max_split_size, split_threshold); - - std::vector partition_sizes(number_of_merged_tbs, 0u); - - std::vector max_partition_cardinality(number_of_merged_tbs, 0u); - std::vector min_partition_cardinality(number_of_merged_tbs, std::numeric_limits::max()); - - // initial partitioning using locality sensitive hashing (LSH) - config.lsh_algorithm_timer.start(); - std::vector clusters = very_similar_LSH_partitioning(minHash_sketches, sorted_positions2, cardinalities, sketches, merged_threshold, config); - post_process_clusters(clusters, cardinalities, config); - config.lsh_algorithm_timer.stop(); - -std::ofstream ofs{"/tmp/final.clusters"}; -for (size_t i = 0; i < clusters.size(); ++i) -{ - seqan::hibf::sketch::hyperloglog sketch(config.hibf_config.sketch_bits); - for (size_t j = 0; j < clusters[i].size(); ++j) - { - sketch.merge(sketches[clusters[i].contained_user_bins()[j]]); - } - ofs << i << ":" << sketch.estimate() << ":" << clusters[i].size() << std::endl; -} - - std::vector> remaining_clusters{}; - - size_t cidx{0}; // current cluster index - // initialise partitions with the first p largest clusters (post_processing sorts by size) - for (size_t p = 0; p < number_of_merged_tbs; ++p) - { - assert(!clusters[cidx].empty()); - auto const & cluster = clusters[cidx].contained_user_bins(); - bool split_cluster = false; - - if (cluster.size() > config.hibf_config.tmax) - { - size_t card{0}; - for (size_t uidx = 0; uidx < cluster.size(); ++uidx) - card += cardinalities[cluster[uidx]]; - - if (card > 0.05 * sum_of_cardinalities / config.hibf_config.tmax) - split_cluster = true; - 
} - - size_t end = (split_cluster) ? cluster.size() : std::min(cluster.size(), config.hibf_config.tmax); - for (size_t uidx = 0; uidx < end; ++uidx) - { - size_t const user_bin_idx = cluster[uidx]; - // if a single cluster already exceeds the cardinality_per_part, - // then the remaining user bins of the cluster must spill over into the next partition - if ((uidx != 0 && (uidx % config.hibf_config.tmax == 0)) || partition_sizes[p] > merged_threshold) - { - ++p; - - if (p >= number_of_merged_tbs) - { - split_cluster = true; - end = uidx; - break; - } - } - - partition_sizes[p] += cardinalities[user_bin_idx]; - partitions[p].push_back(user_bin_idx); - max_partition_cardinality[p] = std::max(max_partition_cardinality[p], cardinalities[user_bin_idx]); - min_partition_cardinality[p] = std::min(min_partition_cardinality[p], cardinalities[user_bin_idx]); - } - - if (split_cluster) - { - std::vector remainder(cluster.begin() + end, cluster.end()); - remaining_clusters.insert(remaining_clusters.end(), remainder); - } - - ++cidx; - } - - for (size_t i = cidx; i < clusters.size(); ++i) - { - if (clusters[i].empty()) - break; - - remaining_clusters.insert(remaining_clusters.end(), clusters[i].contained_user_bins()); - } - - // assign the rest by similarity - max_merged_size = merged_threshold; - for (size_t ridx = 0; ridx < remaining_clusters.size(); ++ridx) - { - auto const & cluster = remaining_clusters[ridx]; - - config.search_partition_algorithm_timer.start(); - find_best_sorted_partition(config, number_of_merged_tbs, max_merged_size, cluster, cardinalities, sketches, partitions, partition_sizes, max_partition_cardinality, min_partition_cardinality); - config.search_partition_algorithm_timer.start(); - - } - } - else if (config.partitioning_approach == partitioning_scheme::lsh_sim) { // determine number of split bins std::vector const sorted_positions2(sorted_positions.begin() + idx, sorted_positions.end()); @@ -1402,8 +1080,6 @@ for (size_t i = 0; i < clusters.size(); 
++i) size_t const merged_threshold = std::max(corrected_max_split_size, split_threshold); max_merged_size = lsh_sim_approach(config, sorted_positions2, cardinalities, sketches, minHash_sketches, partitions, number_of_merged_tbs, merged_threshold, sum_of_cardinalities); - } - }; parition_split_bins(); From f323b4f1bdba6792eea535fea0995fde6542a583 Mon Sep 17 00:00:00 2001 From: smehringer Date: Thu, 24 Jul 2025 21:50:24 +0200 Subject: [PATCH 07/18] Remove trial functions linked to now unused approaches. --- src/layout/execute.cpp | 407 ----------------------------------------- 1 file changed, 407 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 3c323d5f..a07d8807 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -120,124 +120,6 @@ auto LSH_fill_hashtable(std::vector const & clusters, // Vector L1 : number of user bins // Vector L2 : number_of_max_minHash_sketches (LSH ADD+OR parameter b) // Vector L3 : minHash_sketche_size (LSH ADD+OR parameter r) -std::vector initital_LSH_partitioning(std::vector const & minHash_sketches, - std::vector const & positions, - std::vector const & cardinalities, - std::vector const & sketches, - size_t const average_technical_bin_size, - chopper::configuration const & config) -{ - assert(!minHash_sketches.empty()); - assert(!minHash_sketches[0].table.empty()); - assert(!minHash_sketches[0].table[0].empty()); - - size_t const number_of_user_bins{positions.size()}; - size_t const number_of_max_minHash_sketches{seqan::hibf::sketch::minhashes::num_sketches}; // LSH ADD+OR parameter b - size_t const minHash_sketche_size{seqan::hibf::sketch::minhashes::sketch_size}; // LSH ADD+OR parameter r - seqan::hibf::sketch::hyperloglog const empty_sketch{config.hibf_config.sketch_bits}; - - // initialise clusters with a signle user bin per cluster. - // clusters are either - // 1) of size 1; containing an id != position where the id points to the cluster it has been moved to - // e.g. 
cluster[Y] = {Z} (Y has been moved into Z, so Z could look likes this cluster[Z] = {Z, Y}) - // 2) of size >= 1; with the first entry beging id == position (a valid cluster) - // e.g. cluster[X] = {X} // valid singleton - // e.g. cluster[X] = {X, a, b, c, ...} // valid cluster with more joined entries - // The clusters could me moved recursively, s.t. - // cluster[A] = {B} - // cluster[B] = {C} - // cluster[C] = {C, A, B} // is valid cluster since cluster[C][0] == C; contains A and B - std::vector clusters; - clusters.reserve(number_of_user_bins); - - std::vector current_cluster_cardinality(number_of_user_bins); - std::vector current_cluster_sketches(number_of_user_bins, empty_sketch); - size_t current_max_cluster_size{0}; - size_t current_number_of_sketch_hashes{minHash_sketche_size}; // start with high r but decrease it iteratively - size_t current_sketch_index{0}; - size_t current_number_of_clusters{number_of_user_bins}; // initially, each UB is a separate cluster - - for (size_t pos = 0; pos < number_of_user_bins; ++pos) - { - clusters.emplace_back(pos, positions[pos]); // id, user_bin_idx - current_cluster_cardinality[pos] = cardinalities[positions[pos]]; - current_cluster_sketches[pos] = sketches[positions[pos]]; - current_max_cluster_size = std::max(current_max_cluster_size, cardinalities[positions[pos]]); - } - - // refine clusters -//std::cout << "Start clustering with threshold average_technical_bin_size: " << average_technical_bin_size << std::endl; - while (current_max_cluster_size < average_technical_bin_size && /*number_of_clusters / static_cast(number_of_user_bins) > 0.5 &&*/ - current_sketch_index < number_of_max_minHash_sketches) // I want to cluster 10%? 
- { -//std::cout << "Current number of clusters: " << current_number_of_clusters; - - // fill LSH collision hashtable - robin_hood::unordered_flat_map> table = - LSH_fill_hashtable(clusters, minHash_sketches, current_sketch_index, current_number_of_sketch_hashes); - - // read out LSH collision hashtable - // for each present key, if the list contains more than one cluster, we merge everything contained in the list - // into the first cluster, since those clusters collide and should be joined in the same bucket - for (auto & [key, list] : table) - { - assert(!list.empty()); - - // uniquify list. Since I am inserting representative_idx's into the table, the same number can - // be inserted into multiple splots, and multiple times in the same slot. - std::sort(list.begin(), list.end()); - auto const end = std::unique(list.begin(), list.end()); - auto const begin = list.begin(); - - if (end - begin <= 1) // nothing to do here - continue; - - // Now combine all clusters into the first. - - // 1) find the representative cluster to merge everything else into - // It can happen, that the representative has already been joined with another cluster - // e.g. - // [key1] = {0,11} // then clusters[11] is merged into clusters[0] - // [key2] = {11,13} // now I want to merge clusters[13] into clusters[11] but the latter has been moved - size_t const representative_cluster_id = LSH_find_representative_cluster(clusters, *begin); - auto & representative_cluster = clusters[representative_cluster_id]; - assert(representative_cluster.is_valid(representative_cluster_id)); - assert(representative_cluster.id() == clusters[representative_cluster.id()].id()); - - for (auto current = begin + 1; current < end; ++current) - { - // For every other entry in the list, it can happen that I already joined that list with another - // e.g. 
- // [key1] = {0,11} // then clusters[11] is merged into clusters[0] - // [key2] = {0, 2, 11} // now I want to do it again - size_t const next_cluster_id = LSH_find_representative_cluster(clusters, *current); - auto & next_cluster = clusters[next_cluster_id]; - - if (next_cluster.id() == representative_cluster.id()) // already joined - continue; - - next_cluster.move_to(representative_cluster); // otherwise join next_cluster into representative_cluster - assert(next_cluster.size() == 0); - assert(next_cluster.has_been_moved()); - assert(representative_cluster.size() > 1); // there should be at least two user bins now - assert(representative_cluster.is_valid(representative_cluster_id)); // and it should still be valid - - current_cluster_sketches[representative_cluster.id()].merge(current_cluster_sketches[next_cluster.id()]); - current_cluster_cardinality[representative_cluster.id()] = current_cluster_sketches[representative_cluster.id()].estimate(); - - --current_number_of_clusters; - } - - current_max_cluster_size = *std::ranges::max_element(current_cluster_cardinality); - } - - ++current_sketch_index; - -//std::cout << " and after this clustering step there are: " << current_number_of_clusters << "with max cluster size" << current_max_cluster_size << std::endl; - } - - return clusters; -} std::vector very_similar_LSH_partitioning(std::vector const & minHash_sketches, std::vector const & positions, @@ -361,112 +243,6 @@ std::vector very_similar_LSH_partitioning(std::vector most_distant_LSH_partitioning(std::vector const & initial_clusters, - std::vector const & minHash_sketches, - chopper::configuration const & config) -{ - assert(!minHash_sketches.empty()); - assert(!minHash_sketches[0].table.empty()); - assert(!minHash_sketches[0].table[0].empty()); - - size_t const number_of_user_bins{initial_clusters.size()}; - size_t const number_of_max_minHash_sketches{minHash_sketches[0].table.size()}; // LSH ADD+OR parameter b - // size_t const 
minHash_sketche_size{minHash_sketches[0][0].size()}; // LSH ADD+OR parameter r - - size_t current_number_of_sketch_hashes{5}; - size_t current_sketch_index{0}; - - size_t current_number_of_clusters = [&initial_clusters] () - { - size_t number{0}; - for (auto const & cluster : initial_clusters) - { - if (cluster.size()) - ++number; - } - return number; - }(); - - std::vector clusters; - clusters.reserve(number_of_user_bins); - - for (size_t pos = 0; pos < number_of_user_bins; ++pos) - { - clusters.emplace_back(initial_clusters[pos]); - assert(clusters[pos].is_valid(pos)); - } - - // refine clusters. I want to cluster as much as possible to initialise partitions with very distant clusters - while (current_number_of_clusters > (config.hibf_config.tmax * 2) && - current_sketch_index < number_of_max_minHash_sketches) - { -//std::cout << "[Dist] Current number of clusters: " << current_number_of_clusters << " sketch size:" << current_number_of_sketch_hashes << " "; - - // fill LSH collision hashtable - robin_hood::unordered_flat_map> table = - LSH_fill_hashtable(clusters, minHash_sketches, current_sketch_index, current_number_of_sketch_hashes); - - // read out LSH collision hashtable - // for each present key, if the list contains more than one cluster, we merge everything contained in the list - // into the first cluster, since those clusters collide and should be joined in the same bucket - for (auto & [key, list] : table) - { - assert(!list.empty()); - - // uniquify list. Since I am inserting representative_idx's into the table, the same number can - // be inserted into multiple splots, and multiple times in the same slot. - std::sort(list.begin(), list.end()); - auto const end = std::unique(list.begin(), list.end()); - auto const begin = list.begin(); - - if (end - begin <= 1) // nothing to do here - continue; - - // Now combine all clusters into the first. 
- - // 1) find the representative cluster to merge everything else into - // It can happen, that the representative has already been joined with another cluster - // e.g. - // [key1] = {0,11} // then clusters[11] is merged into clusters[0] - // [key2] = {11,13} // now I want to merge clusters[13] into clusters[11] but the latter has been moved - size_t const representative_cluster_id = LSH_find_representative_cluster(clusters, *begin); - auto & representative_cluster = clusters[representative_cluster_id]; - assert(representative_cluster.is_valid(representative_cluster_id)); - assert(representative_cluster.id() == clusters[representative_cluster.id()].id()); - - for (auto current = begin + 1; current < end; ++current) - { - // For every other entry in the list, it can happen that I already joined that list with another - // e.g. - // [key1] = {0,11} // then clusters[11] is merged into clusters[0] - // [key2] = {0, 2, 11} // now I want to do it again - size_t const next_cluster_id = LSH_find_representative_cluster(clusters, *current); - auto & next_cluster = clusters[next_cluster_id]; - - if (next_cluster.id() == representative_cluster.id()) // already joined - continue; - - next_cluster.move_to(representative_cluster); // otherwise join next_cluster into representative_cluster - assert(next_cluster.size() == 0); - assert(next_cluster.has_been_moved()); - assert(representative_cluster.size() > 1); // there should be at least two user bins now - assert(representative_cluster.is_valid(representative_cluster_id)); // and it should still be valid - - --current_number_of_clusters; - } - } - - ++current_sketch_index; - - if (current_sketch_index % 4 == 0 && current_number_of_sketch_hashes > 1) - --current_number_of_sketch_hashes; - -//std::cout << " and after this clustering step there are: " << current_number_of_clusters << std::endl; - } - - return clusters; -} - void post_process_clusters(std::vector & clusters, std::vector const & cardinalities, chopper::configuration 
const & config) @@ -518,81 +294,6 @@ void post_process_clusters(std::vector & clusters, // debug } -void post_process_clusters(std::vector & clusters, - std::vector const & cardinalities, - chopper::configuration const & config) -{ - // clusters are done. Start post processing - // since post processing involves re-ordering the clusters, the moved_to_cluster_id value of a cluster will not - // refer to the position of the cluster in the `clusters` vecto anymore but the cluster with the resprive id() - // would neet to be found - - for (size_t pos = 0; pos < clusters.size(); ++pos) - { - assert(clusters[pos].is_valid(pos)); - clusters[pos].sort_by_cardinality(cardinalities); - } - - // push largest p clusters to the front - auto cluster_size_cmp = [&cardinalities](auto const & mc1, auto const & mc2) - { - if (mc1.empty()) - return false; // mc1 can never be larger than mc2 then - if (mc2.empty()) // and mc1 is not, since the first if would catch - return true; - - auto const & largest_c1 = mc1.contained_user_bins()[0]; // first cluster is the largest because of sorting above - auto const & largest_c2 = mc2.contained_user_bins()[0]; // first cluster is the largest because of sorting above - - assert(!largest_c1.empty()); // should never be empty - assert(!largest_c2.empty()); // should never be empty - - if (largest_c2.size() == largest_c1.size()) - return cardinalities[largest_c2[0]] < cardinalities[largest_c1[0]]; - return largest_c2.size() < largest_c1.size(); - }; - std::partial_sort(clusters.begin(), clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), clusters.end(), cluster_size_cmp); - - auto move_empty_clusters_to_the_end = [](auto const & v1, auto const & v2) - { - return v2.size() < v1.size(); - }; - std::sort(clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), clusters.end(), move_empty_clusters_to_the_end); - -// debug sanity check - for (size_t cidx = 1; cidx < clusters.size(); ++cidx) - { - if (clusters[cidx - 
1].empty() && !clusters[cidx].empty()) // once empty - always empty; all empty clusters should be at the end - throw std::runtime_error{"sorting did not work"}; - } -// debug -} - -std::vector LSH_partitioning(std::vector const & minHash_sketches, - std::vector const & positions, - std::vector const & cardinalities, - std::vector const & sketches, - size_t const average_technical_bin_size, - chopper::configuration const & config) -{ - std::vector clusters = initital_LSH_partitioning(minHash_sketches, positions, cardinalities, sketches, average_technical_bin_size, config); - post_process_clusters(clusters, cardinalities, config); - return clusters; -} - -std::vector sim_dist_LSH_partitioning(std::vector const & minHash_sketches, - std::vector const & positions, - std::vector const & cardinalities, - std::vector const & sketches, - size_t const average_technical_bin_size, - chopper::configuration const & config) -{ - std::vector clusters = very_similar_LSH_partitioning(minHash_sketches, positions, cardinalities, sketches, average_technical_bin_size, config); - std::vector multi_clusters = most_distant_LSH_partitioning(clusters, minHash_sketches, config); - post_process_clusters(multi_clusters, cardinalities, config); - return multi_clusters; -} - bool find_best_partition(chopper::configuration const & config, size_t const number_of_partitions, size_t & corrected_estimate_per_part, @@ -721,114 +422,6 @@ bool find_best_partition(chopper::configuration const & config, return true; } -bool find_best_sorted_partition(chopper::configuration const & config, - size_t const number_of_partitions, - size_t & corrected_estimate_per_part, - std::vector const & cluster, - std::vector const & cardinalities, - std::vector const & /*sketches*/, - std::vector> & partitions, - std::vector & partition_sizes, - std::vector & max_partition_cardinality, - std::vector & min_partition_cardinality) -{ - size_t max_card = [&cardinalities, &cluster] () - { - size_t max{0}; - - for (size_t const 
user_bin_idx : cluster) - max = std::max(max, cardinalities[user_bin_idx]); - - return max; - }(); - - size_t sum_card = [&cardinalities, &cluster] () - { - size_t sum{0}; - - for (size_t const user_bin_idx : cluster) - sum += cardinalities[user_bin_idx]; - - return sum; - }(); - - size_t smallest_change{std::numeric_limits::max()}; - size_t best_p{0}; - bool best_p_found{false}; - - auto penalty_lower_level = [&] (size_t const additional_number_of_user_bins, size_t const p) - { - size_t min = std::min(max_card, min_partition_cardinality[p]); - size_t max = std::max(max_card, max_partition_cardinality[p]); - - if (partitions[p].size() > config.hibf_config.tmax) // already a third level - { - // if there must already be another lower level because the current merged bin contains more than tmax - // user bins, then the current user bin is very likely stored multiple times. Therefore, the penalty is set - // to the cardinality of the current user bin times the number of levels, e.g. the number of times this user - // bin needs to be stored additionally - size_t const num_ubs_in_merged_bin{partitions[p].size() + additional_number_of_user_bins}; - double const levels = std::log(num_ubs_in_merged_bin) / std::log(config.hibf_config.tmax); - return static_cast(max_card * levels); - } - else if (partitions[p].size() + additional_number_of_user_bins > config.hibf_config.tmax) - { - // if the current merged bin contains exactly tmax UBS, adding otherone must - // result in another lower level. Most likely, the smallest user bin will end up on the lower level - // therefore the penalty is set to 'min * tmax' - // of course, there could also be a third level with a lower number of user bins, but this is hard to - // estimate. 
- size_t const penalty = min * config.hibf_config.tmax; - return penalty; - } - else // partitions[p].size() + additional_number_of_user_bins <= tmax - { - // if the new user bin is smaller than all other already contained user bins - // the waste of space is high if stored in a single technical bin - if (max_card < min) - return (max - max_card); - // if the new user bin is bigger than all other already contained user bins, the IBF size increases - else if (max_card > max) - return (max_card - max) * config.hibf_config.tmax; - // else, end if-else-block and zero is returned - } - - return (size_t)0u; - }; - - for (size_t p = 0; p < number_of_partitions; ++p) - { - size_t const sum = partition_sizes[p] + sum_card; - size_t const penalty_current_ibf = config.hibf_config.tmax * ((sum <= corrected_estimate_per_part) ? 0u : sum - corrected_estimate_per_part); - size_t const change = penalty_current_ibf + penalty_lower_level(cluster.size(), p); - - if (change == 0 || /* If there is no penalty at all, this is a best fit even if the partition is "full"*/ - (smallest_change > change)) - { - smallest_change = change; - best_p = p; - best_p_found = true; - } - } - - if (!best_p_found) - return false; - -//std::cout << "best_p:" << best_p << std::endl<< std::endl; - - // now that we know which partition fits best (`best_p`), add those indices to it - for (size_t const user_bin_idx : cluster) - { - partitions[best_p].push_back(user_bin_idx); - partition_sizes[best_p] += cardinalities[user_bin_idx]; - max_partition_cardinality[best_p] = std::max(max_partition_cardinality[best_p], cardinalities[user_bin_idx]); - min_partition_cardinality[best_p] = std::min(min_partition_cardinality[best_p], cardinalities[user_bin_idx]); - } - corrected_estimate_per_part = std::max(corrected_estimate_per_part, static_cast(partition_sizes[best_p])); - - return true; -} - size_t split_bins(chopper::configuration const & config, std::vector const & sorted_positions, std::vector const & 
cardinalities, From c36dbdf91028c8c1fecd276645cb2fe196ec6a2a Mon Sep 17 00:00:00 2001 From: "seqan-actions[bot]" Date: Fri, 25 Jul 2025 20:07:45 +0200 Subject: [PATCH 08/18] [MISC] automatic linting --- .../chopper/layout/determine_split_bins.hpp | 10 +- include/chopper/lsh.hpp | 34 +- src/layout/CMakeLists.txt | 10 +- src/layout/determine_split_bins.cpp | 21 +- src/layout/execute.cpp | 374 +++++++------ src/layout/hibf_statistics.cpp | 4 +- test/api/layout/determine_split_bins_test.cpp | 23 +- test/api/layout/fast_layout_test.cpp | 497 +++++++++--------- test/api/lsh_test.cpp | 1 - 9 files changed, 536 insertions(+), 438 deletions(-) diff --git a/include/chopper/layout/determine_split_bins.hpp b/include/chopper/layout/determine_split_bins.hpp index dee17b24..91400e42 100644 --- a/include/chopper/layout/determine_split_bins.hpp +++ b/include/chopper/layout/determine_split_bins.hpp @@ -20,10 +20,10 @@ namespace chopper::layout { std::pair determine_split_bins(chopper::configuration const & config, - std::vector const & positions, - std::vector const & cardinalities, - size_t const num_technical_bins, - size_t const num_user_bins, - std::vector> & partitions); + std::vector const & positions, + std::vector const & cardinalities, + size_t const num_technical_bins, + size_t const num_user_bins, + std::vector> & partitions); } \ No newline at end of file diff --git a/include/chopper/lsh.hpp b/include/chopper/lsh.hpp index af67bfa9..5ef11a75 100644 --- a/include/chopper/lsh.hpp +++ b/include/chopper/lsh.hpp @@ -30,10 +30,10 @@ struct Cluster public: Cluster() = default; - Cluster(const Cluster&) = default; - Cluster(Cluster&&) = default; - Cluster& operator=(const Cluster&) = default; - Cluster& operator=(Cluster&&) = default; + Cluster(Cluster const &) = default; + Cluster(Cluster &&) = default; + Cluster & operator=(Cluster const &) = default; + Cluster & operator=(Cluster &&) = default; ~Cluster() = default; Cluster(size_t const id) @@ -98,7 +98,12 @@ struct Cluster 
void sort_by_cardinality(std::vector const & cardinalities) { - std::sort(user_bins.begin(), user_bins.end(), [&cardinalities](auto const & v1, auto const & v2){ return cardinalities[v2] < cardinalities[v1]; }); + std::sort(user_bins.begin(), + user_bins.end(), + [&cardinalities](auto const & v1, auto const & v2) + { + return cardinalities[v2] < cardinalities[v1]; + }); } }; @@ -109,10 +114,10 @@ struct MultiCluster : Cluster public: MultiCluster() = default; - MultiCluster(const MultiCluster&) = default; - MultiCluster(MultiCluster&&) = default; - MultiCluster& operator=(const MultiCluster&) = default; - MultiCluster& operator=(MultiCluster&&) = default; + MultiCluster(MultiCluster const &) = default; + MultiCluster(MultiCluster &&) = default; + MultiCluster & operator=(MultiCluster const &) = default; + MultiCluster & operator=(MultiCluster &&) = default; ~MultiCluster() = default; MultiCluster(Cluster const & clust) @@ -164,16 +169,21 @@ struct MultiCluster : Cluster // sort user bins within a Cluster by cardinality and the clusters themselves by size void sort_by_cardinality(std::vector const & cardinalities) { - auto cmp = [&cardinalities](auto const & v1, auto const & v2){ return cardinalities[v2] < cardinalities[v1]; }; + auto cmp = [&cardinalities](auto const & v1, auto const & v2) + { + return cardinalities[v2] < cardinalities[v1]; + }; for (auto & user_bin_cluster : user_bins) std::sort(user_bin_cluster.begin(), user_bin_cluster.end(), cmp); - auto cmp_clusters = [](auto const & c1, auto const & c2){ return c2.size() < c1.size(); }; + auto cmp_clusters = [](auto const & c1, auto const & c2) + { + return c2.size() < c1.size(); + }; std::sort(user_bins.begin(), user_bins.end(), cmp_clusters); } }; - // A valid cluster is one that hasn't been moved but actually contains user bins // A valid cluster at position i is identified by the following equality: cluster[i].size() >= 1 && cluster[i][0] == i // A moved cluster is one that has been joined and thereby 
moved to another cluster diff --git a/src/layout/CMakeLists.txt b/src/layout/CMakeLists.txt index 4c2542c8..f2b16735 100644 --- a/src/layout/CMakeLists.txt +++ b/src/layout/CMakeLists.txt @@ -4,8 +4,14 @@ if (TARGET chopper::layout) return () endif () -add_library (chopper_layout STATIC determine_best_number_of_technical_bins.cpp execute.cpp hibf_statistics.cpp - ibf_query_cost.cpp input.cpp output.cpp determine_split_bins.cpp +add_library (chopper_layout STATIC + determine_best_number_of_technical_bins.cpp + determine_split_bins.cpp + execute.cpp + hibf_statistics.cpp + ibf_query_cost.cpp + input.cpp + output.cpp ) target_link_libraries (chopper_layout PUBLIC chopper::shared) add_library (chopper::layout ALIAS chopper_layout) diff --git a/src/layout/determine_split_bins.cpp b/src/layout/determine_split_bins.cpp index 71d946b8..08e5fdee 100644 --- a/src/layout/determine_split_bins.cpp +++ b/src/layout/determine_split_bins.cpp @@ -5,14 +5,14 @@ // shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md // --------------------------------------------------------------------------------------------------- -#include // for allocator, string +#include // for allocator, string #include // for vector -#include #include -#include // for data_store +#include #include +#include // for data_store #include namespace chopper::fast_layout @@ -26,15 +26,16 @@ std::pair determine_split_bins(chopper::configuration const & co std::vector> & partitions) { if (num_user_bins == 0) - return {0,0}; + return {0, 0}; assert(num_technical_bins > 0u); assert(num_user_bins > 0u); assert(num_user_bins <= num_technical_bins); - auto const fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // - .hash_count = config.hibf_config.number_of_hash_functions, - .t_max = num_technical_bins}); + auto const fpr_correction = + seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + 
.hash_count = config.hibf_config.number_of_hash_functions, + .t_max = num_technical_bins}); std::vector> matrix(num_technical_bins); // rows for (auto & v : matrix) @@ -67,8 +68,8 @@ std::pair determine_split_bins(chopper::configuration const & co { size_t const corrected_ub_cardinality = static_cast(ub_cardinality * fpr_correction[(i - i_prime)]); - size_t score = - std::max(seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i - i_prime), matrix[i_prime][j - 1]); + size_t score = std::max(seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i - i_prime), + matrix[i_prime][j - 1]); // std::cout << "j:" << j << " i:" << i << " i':" << i_prime << " score:" << score << std::endl; @@ -149,4 +150,4 @@ std::pair determine_split_bins(chopper::configuration const & co return {number_of_split_tbs, max_size}; } -} // namespace chopper::layout +} // namespace chopper::fast_layout diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index a07d8807..9e2f4e0b 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -14,28 +14,27 @@ #include #include #include +#include #include #include #include #include #include -#include - -#include #include -#include #include +#include #include #include #include +#include #include #include -#include +#include #include -#include // for compute_relaxed_fpr_correction #include +#include // for compute_relaxed_fpr_correction #include #include #include @@ -76,7 +75,8 @@ auto LSH_fill_hashtable(std::vector const & clusters, for (size_t const user_bin_idx : current.contained_user_bins()) { ++processed_user_bins; - uint64_t const key = lsh_hash_the_sketch(minHash_sketches[user_bin_idx].table[current_sketch_index], current_number_of_sketch_hashes); + uint64_t const key = lsh_hash_the_sketch(minHash_sketches[user_bin_idx].table[current_sketch_index], + current_number_of_sketch_hashes); table[key].push_back(current.id()); // insert representative for all user bins } } @@ -106,7 +106,8 @@ auto LSH_fill_hashtable(std::vector const 
& clusters, for (size_t const user_bin_idx : similarity_cluster) { ++processed_user_bins; - uint64_t const key = lsh_hash_the_sketch(minHash_sketches[user_bin_idx].table[current_sketch_index], current_number_of_sketch_hashes); + uint64_t const key = lsh_hash_the_sketch(minHash_sketches[user_bin_idx].table[current_sketch_index], + current_number_of_sketch_hashes); table[key].push_back(current.id()); // insert representative for all user bins } } @@ -122,11 +123,11 @@ auto LSH_fill_hashtable(std::vector const & clusters, // Vector L3 : minHash_sketche_size (LSH ADD+OR parameter r) std::vector very_similar_LSH_partitioning(std::vector const & minHash_sketches, - std::vector const & positions, - std::vector const & cardinalities, - std::vector const & sketches, - size_t const average_technical_bin_size, - chopper::configuration const & config) + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + size_t const average_technical_bin_size, + chopper::configuration const & config) { assert(!minHash_sketches.empty()); assert(!minHash_sketches[0].table.empty()); @@ -134,12 +135,12 @@ std::vector very_similar_LSH_partitioning(std::vector very_similar_LSH_partitioning(std::vector(number_of_user_bins) > 0.5 &&*/ + //std::cout << "Start clustering with threshold average_technical_bin_size: " << average_technical_bin_size << std::endl; + while (current_max_cluster_size < average_technical_bin_size + && /*number_of_clusters / static_cast(number_of_user_bins) > 0.5 &&*/ current_sketch_index < number_of_max_minHash_sketches) // I want to cluster 10%? 
{ -//std::cout << "Current number of clusters: " << current_number_of_clusters; + //std::cout << "Current number of clusters: " << current_number_of_clusters; // fill LSH collision hashtable robin_hood::unordered_flat_map> table = @@ -226,8 +228,10 @@ std::vector very_similar_LSH_partitioning(std::vector 1); // there should be at least two user bins now assert(representative_cluster.is_valid(representative_cluster_id)); // and it should still be valid - current_cluster_sketches[representative_cluster.id()].merge(current_cluster_sketches[next_cluster.id()]); - current_cluster_cardinality[representative_cluster.id()] = current_cluster_sketches[representative_cluster.id()].estimate(); + current_cluster_sketches[representative_cluster.id()].merge( + current_cluster_sketches[next_cluster.id()]); + current_cluster_cardinality[representative_cluster.id()] = + current_cluster_sketches[representative_cluster.id()].estimate(); --current_number_of_clusters; } @@ -237,7 +241,7 @@ std::vector very_similar_LSH_partitioning(std::vector & clusters, auto cluster_size_cmp = [&cardinalities](auto const & v1, auto const & v2) { if (v2.size() == v1.size() && !v2.empty()) - return cardinalities[v2.contained_user_bins()[0]] < cardinalities[v1.contained_user_bins()[0]]; + return cardinalities[v2.contained_user_bins()[0]] < cardinalities[v1.contained_user_bins()[0]]; return v2.size() < v1.size(); }; - std::partial_sort(clusters.begin(), clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), clusters.end(), cluster_size_cmp); + std::partial_sort(clusters.begin(), + clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), + clusters.end(), + cluster_size_cmp); // after filling up the partitions with the biggest clusters, sort the clusters by cardinality of the biggest ub // s.t. that euqally sizes ub are assigned after each other and the small stuff is added at last. 
@@ -280,18 +287,21 @@ void post_process_clusters(std::vector & clusters, return cardinalities[v2.contained_user_bins()[0]] < cardinalities[v1.contained_user_bins()[0]]; }; - std::sort(clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), clusters.end(), compare_cardinality_and_move_empty_clusters_to_the_end); + std::sort(clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), + clusters.end(), + compare_cardinality_and_move_empty_clusters_to_the_end); assert(clusters[0].size() >= clusters[1].size()); // sanity check // assert(cardinalities[clusters[std::min(clusters.size(), config.hibf_config.tmax)].contained_user_bins()[0]] >= cardinalities[clusters[std::min(clusters.size(), config.hibf_config.tmax) + 1].contained_user_bins()[0]]); // sanity check -// debug + // debug for (size_t cidx = 1; cidx < clusters.size(); ++cidx) { - if (clusters[cidx - 1].empty() && !clusters[cidx].empty()) // once empty - always empty; all empty clusters should be at the end + if (clusters[cidx - 1].empty() + && !clusters[cidx].empty()) // once empty - always empty; all empty clusters should be at the end throw std::runtime_error{"sorting did not work"}; } -// debug + // debug } bool find_best_partition(chopper::configuration const & config, @@ -315,7 +325,7 @@ bool find_best_partition(chopper::configuration const & config, return result; }(); - size_t const max_card = [&cardinalities, &cluster] () + size_t const max_card = [&cardinalities, &cluster]() { size_t max{0}; @@ -337,7 +347,7 @@ bool find_best_partition(chopper::configuration const & config, size_t best_p{0}; bool best_p_found{false}; - auto penalty_lower_level = [&] (size_t const additional_number_of_user_bins, size_t const p) + auto penalty_lower_level = [&](size_t const additional_number_of_user_bins, size_t const p) { size_t min = std::min(max_card, min_partition_cardinality[p]); size_t max = std::max(max_card, max_partition_cardinality[p]); @@ -350,7 +360,7 @@ bool 
find_best_partition(chopper::configuration const & config, // bin needs to be stored additionally size_t const num_ubs_in_merged_bin{positions[p].size() + additional_number_of_user_bins}; double const levels = std::log(num_ubs_in_merged_bin) / std::log(config.hibf_config.tmax); - return static_cast(max_card * levels); + return static_cast(max_card * levels); } else if (positions[p].size() + additional_number_of_user_bins > config.hibf_config.tmax) // now a third level { @@ -386,17 +396,19 @@ bool find_best_partition(chopper::configuration const & config, assert(union_estimate >= current_partition_size); size_t const penalty_current_bin = union_estimate - current_partition_size; - size_t const penalty_current_ibf = config.hibf_config.tmax * ((union_estimate <= corrected_estimate_per_part) ? 0u : union_estimate - corrected_estimate_per_part); + size_t const penalty_current_ibf = + config.hibf_config.tmax + * ((union_estimate <= corrected_estimate_per_part) ? 0u : union_estimate - corrected_estimate_per_part); size_t const change = penalty_current_bin + penalty_current_ibf + penalty_lower_level(cluster.size(), p); // size_t const intersection = current_sketch.estimate() - change; // double const subsume_ratio = static_cast(intersection) / current_partition_size; -//std::cout << "p:" << p << " p-#UBs" << positions[p].size() << " penalty:" << penalty(cluster.size(), p) << " change:" << change << " union-current_p:" << (union_estimate - current_partition_size) << " union:" << union_estimate << " current_p:" << current_partition_size << " t:" << corrected_estimate_per_part << std::endl; + //std::cout << "p:" << p << " p-#UBs" << positions[p].size() << " penalty:" << penalty(cluster.size(), p) << " change:" << change << " union-current_p:" << (union_estimate - current_partition_size) << " union:" << union_estimate << " current_p:" << current_partition_size << " t:" << corrected_estimate_per_part << std::endl; if (change == 0 || /* If there is no penalty at all, this is a 
best fit even if the partition is "full"*/ (smallest_change > change /*&& subsume_ratio > best_subsume_ratio &&*/ - /*current_partition_size < corrected_estimate_per_part*/)) + /*current_partition_size < corrected_estimate_per_part*/)) { -//std::cout << "smaller!" << std::endl; + //std::cout << "smaller!" << std::endl; // best_subsume_ratio = subsume_ratio; smallest_change = change; best_p = p; @@ -407,7 +419,7 @@ bool find_best_partition(chopper::configuration const & config, if (!best_p_found) throw "currently there are no safety measures if a partition is not found because it is very unlikely"; -//std::cout << "best_p:" << best_p << std::endl<< std::endl; + //std::cout << "best_p:" << best_p << std::endl<< std::endl; // now that we know which partition fits best (`best_p`), add those indices to it for (size_t const user_bin_idx : cluster) @@ -417,28 +429,29 @@ bool find_best_partition(chopper::configuration const & config, min_partition_cardinality[best_p] = std::min(min_partition_cardinality[best_p], cardinalities[user_bin_idx]); } partition_sketches[best_p].merge(current_sketch); - corrected_estimate_per_part = std::max(corrected_estimate_per_part, static_cast(partition_sketches[best_p].estimate())); + corrected_estimate_per_part = + std::max(corrected_estimate_per_part, static_cast(partition_sketches[best_p].estimate())); return true; } size_t split_bins(chopper::configuration const & config, - std::vector const & sorted_positions, - std::vector const & cardinalities, - std::vector> & partitions, - size_t const sum_of_cardinalities, - size_t const end_idx) + std::vector const & sorted_positions, + std::vector const & cardinalities, + std::vector> & partitions, + size_t const sum_of_cardinalities, + size_t const end_idx) { // assign split bins to the end. 
makes it easier to process merged bins afterwards size_t pos{partitions.size() - 1}; for (size_t idx = 0; idx < end_idx; ++idx) { size_t const ub_idx{sorted_positions[idx]}; - size_t const number_of_split_tbs = std::max((size_t)1, config.hibf_config.tmax * cardinalities[ub_idx] / sum_of_cardinalities); -std::cout << "number_of_split_tbs: " << number_of_split_tbs - << " cardinalities[ub_idx]:" << cardinalities[ub_idx] - << " sum_of_cardinalities:" << sum_of_cardinalities - << std::endl; + size_t const number_of_split_tbs = + std::max((size_t)1, config.hibf_config.tmax * cardinalities[ub_idx] / sum_of_cardinalities); + std::cout << "number_of_split_tbs: " << number_of_split_tbs + << " cardinalities[ub_idx]:" << cardinalities[ub_idx] + << " sum_of_cardinalities:" << sum_of_cardinalities << std::endl; // fill partitions from behind to ensure an easier layouting for (size_t i = 0; i < number_of_split_tbs; ++i) { @@ -453,9 +466,9 @@ std::cout << "number_of_split_tbs: " << number_of_split_tbs } std::pair find_bins_to_be_split(std::vector const & sorted_positions, - std::vector const & cardinalities, - size_t threshold, - size_t const max_bins) + std::vector const & cardinalities, + size_t threshold, + size_t const max_bins) { size_t idx{0}; size_t sum{0}; @@ -488,14 +501,14 @@ std::pair find_bins_to_be_split(std::vector const & sort } size_t lsh_sim_approach(chopper::configuration const & config, - std::vector const & sorted_positions2, - std::vector const & cardinalities, - std::vector const & sketches, - std::vector const & minHash_sketches, - std::vector> & partitions, - size_t const number_of_remaining_tbs, - size_t const technical_bin_size_threshold, - size_t const sum_of_cardinalities) + std::vector const & sorted_positions2, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + std::vector> & partitions, + size_t const number_of_remaining_tbs, + size_t const technical_bin_size_threshold, + size_t const 
sum_of_cardinalities) { uint8_t const sketch_bits{config.hibf_config.sketch_bits}; //std::cout << "LSH partitioning into " << config.hibf_config.tmax << std::endl; @@ -507,20 +520,25 @@ size_t lsh_sim_approach(chopper::configuration const & config, // initial partitioning using locality sensitive hashing (LSH) config.lsh_algorithm_timer.start(); - std::vector clusters = very_similar_LSH_partitioning(minHash_sketches, sorted_positions2, cardinalities, sketches, technical_bin_size_threshold, config); + std::vector clusters = very_similar_LSH_partitioning(minHash_sketches, + sorted_positions2, + cardinalities, + sketches, + technical_bin_size_threshold, + config); post_process_clusters(clusters, cardinalities, config); config.lsh_algorithm_timer.stop(); -std::ofstream ofs{"/tmp/final.clusters"}; -for (size_t i = 0; i < clusters.size(); ++i) -{ -seqan::hibf::sketch::hyperloglog sketch(config.hibf_config.sketch_bits); -for (size_t j = 0; j < clusters[i].size(); ++j) -{ - sketch.merge(sketches[clusters[i].contained_user_bins()[j]]); -} -ofs << i << ":" << sketch.estimate() << ":" << clusters[i].size() << std::endl; -} + std::ofstream ofs{"/tmp/final.clusters"}; + for (size_t i = 0; i < clusters.size(); ++i) + { + seqan::hibf::sketch::hyperloglog sketch(config.hibf_config.sketch_bits); + for (size_t j = 0; j < clusters[i].size(); ++j) + { + sketch.merge(sketches[clusters[i].contained_user_bins()[j]]); + } + ofs << i << ":" << sketch.estimate() << ":" << clusters[i].size() << std::endl; + } std::vector> remaining_clusters{}; @@ -548,7 +566,8 @@ ofs << i << ":" << sketch.estimate() << ":" << clusters[i].size() << std::endl; size_t const user_bin_idx = cluster[uidx]; // if a single cluster already exceeds the cardinality_per_part, // then the remaining user bins of the cluster must spill over into the next partition - if ((uidx != 0 && (uidx % config.hibf_config.tmax == 0)) || partition_sketches[p].estimate() > technical_bin_size_threshold) + if ((uidx != 0 && (uidx % 
config.hibf_config.tmax == 0)) + || partition_sketches[p].estimate() > technical_bin_size_threshold) { ++p; @@ -590,7 +609,16 @@ ofs << i << ":" << sketch.estimate() << ":" << clusters[i].size() << std::endl; auto const & cluster = remaining_clusters[ridx]; config.search_partition_algorithm_timer.start(); - find_best_partition(config, number_of_remaining_tbs, merged_threshold, cluster, cardinalities, sketches, partitions, partition_sketches, max_partition_cardinality, min_partition_cardinality); + find_best_partition(config, + number_of_remaining_tbs, + merged_threshold, + cluster, + cardinalities, + sketches, + partitions, + partition_sketches, + max_partition_cardinality, + min_partition_cardinality); config.search_partition_algorithm_timer.start(); } @@ -630,20 +658,21 @@ void partition_user_bins(chopper::configuration const & config, sketch.merge(sketches[pos]); } - return std::tuple{sum, max, sketch.estimate()}; + return std::tuple{sum, max, sketch.estimate()}; }(); // If the effective text size is very low, it can happen that the joint_estimate divided by the number of partitions // is lower than the largest single user bin. But of course, we can never reach a smaller max technical bin size // then that of the largest user user bin. Thus we can correct the estimate_per_part beforehand. // This way we make sure there is at least 1 LSH clustering step. 
- size_t const estimate_per_part = std::max(seqan::hibf::divide_and_ceil(joint_estimate, config.hibf_config.tmax), - max_cardinality + 1); + size_t const estimate_per_part = + std::max(seqan::hibf::divide_and_ceil(joint_estimate, config.hibf_config.tmax), max_cardinality + 1); - double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // - .relaxed_fpr = config.hibf_config.relaxed_fpr, - .hash_count = config.hibf_config.number_of_hash_functions}); -//std::cout << "sum_of_cardinalities:" << sum_of_cardinalities << " joint_estimate:" << joint_estimate << std::endl; + double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction( + {.fpr = config.hibf_config.maximum_fpr, // + .relaxed_fpr = config.hibf_config.relaxed_fpr, + .hash_count = config.hibf_config.number_of_hash_functions}); + //std::cout << "sum_of_cardinalities:" << sum_of_cardinalities << " joint_estimate:" << joint_estimate << std::endl; size_t split_threshold{}; size_t idx{0}; // start in sorted positions @@ -658,8 +687,15 @@ void partition_user_bins(chopper::configuration const & config, auto parition_split_bins = [&]() { size_t number_of_potential_split_bins{0}; - std::tie(idx, number_of_potential_split_bins) = find_bins_to_be_split(sorted_positions, cardinalities, split_threshold, config.hibf_config.tmax - 1); - std::tie(number_of_split_tbs, max_split_size) = chopper::layout::determine_split_bins(config, sorted_positions, cardinalities, number_of_potential_split_bins, idx, partitions); + std::tie(idx, number_of_potential_split_bins) = + find_bins_to_be_split(sorted_positions, cardinalities, split_threshold, config.hibf_config.tmax - 1); + std::tie(number_of_split_tbs, max_split_size) = + chopper::layout::determine_split_bins(config, + sorted_positions, + cardinalities, + number_of_potential_split_bins, + idx, + partitions); number_of_merged_tbs = config.hibf_config.tmax - number_of_split_tbs; }; @@ 
-672,25 +708,38 @@ void partition_user_bins(chopper::configuration const & config, size_t const corrected_max_split_size = max_split_size / relaxed_fpr_correction; size_t const merged_threshold = std::max(corrected_max_split_size, split_threshold); - max_merged_size = lsh_sim_approach(config, sorted_positions2, cardinalities, sketches, minHash_sketches, partitions, number_of_merged_tbs, merged_threshold, sum_of_cardinalities); + max_merged_size = lsh_sim_approach(config, + sorted_positions2, + cardinalities, + sketches, + minHash_sketches, + partitions, + number_of_merged_tbs, + merged_threshold, + sum_of_cardinalities); }; parition_split_bins(); partitions_merged_bins(); - int64_t const difference = static_cast(max_merged_size * relaxed_fpr_correction) - static_cast(max_split_size); + int64_t const difference = + static_cast(max_merged_size * relaxed_fpr_correction) - static_cast(max_split_size); -std::cout << "number_of_split_tbs:" << number_of_split_tbs << " difference:" << difference << std::endl; -std::cout << "Reconfiguring threshold. from:" << split_threshold; + std::cout << "number_of_split_tbs:" << number_of_split_tbs << " difference:" << difference << std::endl; + std::cout << "Reconfiguring threshold. 
from:" << split_threshold; if (number_of_split_tbs == 0) split_threshold = (split_threshold + max_merged_size) / 2; // increase threshold - else if (difference > 0) // need more merged bins -> increase threshold - split_threshold = static_cast(split_threshold) * ((static_cast(max_merged_size) * relaxed_fpr_correction) / static_cast(max_split_size)); + else if (difference > 0) // need more merged bins -> increase threshold + split_threshold = + static_cast(split_threshold) + * ((static_cast(max_merged_size) * relaxed_fpr_correction) / static_cast(max_split_size)); else // need more split bins -> decrease threshold - split_threshold = static_cast(split_threshold) * ((static_cast(max_merged_size) * relaxed_fpr_correction) / static_cast(max_split_size)); + split_threshold = + static_cast(split_threshold) + * ((static_cast(max_merged_size) * relaxed_fpr_correction) / static_cast(max_split_size)); -std::cout << " to:" << split_threshold << std::endl; + std::cout << " to:" << split_threshold << std::endl; // reset result partitions.clear(); @@ -751,9 +800,9 @@ std::cout << " to:" << split_threshold << std::endl; } seqan::hibf::layout::layout general_layout(chopper::configuration const & config, - std::vector positions, - std::vector const & cardinalities, - std::vector const & sketches) + std::vector positions, + std::vector const & cardinalities, + std::vector const & sketches) { seqan::hibf::layout::layout hibf_layout; @@ -773,7 +822,9 @@ seqan::hibf::layout::layout general_layout(chopper::configuration const & config return hibf_layout; } -bool do_I_need_a_fast_layout(chopper::configuration const & config, std::vector const & positions, std::vector const & cardinalities) +bool do_I_need_a_fast_layout(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities) { // the fast layout heuristic would greedily merge even if merging only 2 bins at a time // merging only little number of bins is highly disadvantegous for lower 
levels because few bins @@ -814,13 +865,15 @@ void add_level_to_layout(chopper::configuration const & config, size_t max_bin_id{0}; size_t max_size{0}; - auto const split_fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // - .hash_count = config.hibf_config.number_of_hash_functions, - .t_max = partitions.size()}); + auto const split_fpr_correction = + seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .hash_count = config.hibf_config.number_of_hash_functions, + .t_max = partitions.size()}); - double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // - .relaxed_fpr = config.hibf_config.relaxed_fpr, - .hash_count = config.hibf_config.number_of_hash_functions}); + double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction( + {.fpr = config.hibf_config.maximum_fpr, // + .relaxed_fpr = config.hibf_config.relaxed_fpr, + .hash_count = config.hibf_config.number_of_hash_functions}); // we assume here that the user bins have been sorted by user bin id such that pos = idx for (size_t partition_idx{0}; partition_idx < partitions.size(); ++partition_idx) @@ -862,17 +915,17 @@ void add_level_to_layout(chopper::configuration const & config, current_user_bin.storage_TB_id = partition_idx; current_user_bin.number_of_technical_bins = 1; // initialise to 1 - while (partition_idx + 1 < partitions.size() && - partitions[partition_idx].size() == 1 && - partitions[partition_idx + 1].size() == 1 && - partitions[partition_idx][0] == partitions[partition_idx + 1][0]) + while (partition_idx + 1 < partitions.size() && partitions[partition_idx].size() == 1 + && partitions[partition_idx + 1].size() == 1 + && partitions[partition_idx][0] == partitions[partition_idx + 1][0]) { ++current_user_bin.number_of_technical_bins; ++partition_idx; } // update max_bin_id, max_size - size_t const current_size = 
sketches[current_user_bin.idx].estimate() * split_fpr_correction[current_user_bin.number_of_technical_bins]; + size_t const current_size = sketches[current_user_bin.idx].estimate() + * split_fpr_correction[current_user_bin.number_of_technical_bins]; if (current_size > max_size) { max_bin_id = current_user_bin.storage_TB_id; @@ -892,7 +945,9 @@ void update_layout_from_child_layout(seqan::hibf::layout::layout & child_layout, for (auto & max_bin : child_layout.max_bins) { - max_bin.previous_TB_indices.insert(max_bin.previous_TB_indices.begin(), new_previous.begin(), new_previous.end()); + max_bin.previous_TB_indices.insert(max_bin.previous_TB_indices.begin(), + new_previous.begin(), + new_previous.end()); hibf_layout.max_bins.push_back(max_bin); } @@ -901,27 +956,27 @@ void update_layout_from_child_layout(seqan::hibf::layout::layout & child_layout, auto & actual_user_bin = hibf_layout.user_bins[user_bin.idx]; actual_user_bin.previous_TB_indices.insert(actual_user_bin.previous_TB_indices.end(), - user_bin.previous_TB_indices.begin(), - user_bin.previous_TB_indices.end()); + user_bin.previous_TB_indices.begin(), + user_bin.previous_TB_indices.end()); actual_user_bin.number_of_technical_bins = user_bin.number_of_technical_bins; actual_user_bin.storage_TB_id = user_bin.storage_TB_id; } } void fast_layout_recursion(chopper::configuration const & config, - std::vector const & positions, - std::vector const & cardinalities, - std::vector const & sketches, - std::vector const & minHash_sketches, - seqan::hibf::layout::layout & hibf_layout, - std::vector const & previous) + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + seqan::hibf::layout::layout & hibf_layout, + std::vector const & previous) { std::vector> tmax_partitions(config.hibf_config.tmax); // here we assume that we want to start with a fast layout partition_user_bins(config, positions, cardinalities, sketches, 
minHash_sketches, tmax_partitions); - #pragma omp critical +#pragma omp critical { add_level_to_layout(config, hibf_layout, tmax_partitions, sketches, previous); } @@ -929,20 +984,31 @@ void fast_layout_recursion(chopper::configuration const & config, for (size_t partition_idx = 0; partition_idx < tmax_partitions.size(); ++partition_idx) { auto const & partition = tmax_partitions[partition_idx]; - auto const new_previous = [&] () {auto cpy{previous}; cpy.push_back(partition_idx); return cpy; }(); + auto const new_previous = [&]() + { + auto cpy{previous}; + cpy.push_back(partition_idx); + return cpy; + }(); if (partition.empty() || partition.size() == 1) // nothing to merge continue; if (do_I_need_a_fast_layout(config, partition, cardinalities)) { - fast_layout_recursion(config, partition, cardinalities, sketches, minHash_sketches, hibf_layout, new_previous); // recurse fast_layout + fast_layout_recursion(config, + partition, + cardinalities, + sketches, + minHash_sketches, + hibf_layout, + new_previous); // recurse fast_layout } else { auto child_layout = general_layout(config, partition, cardinalities, sketches); - #pragma omp critical +#pragma omp critical { update_layout_from_child_layout(child_layout, hibf_layout, new_previous); } @@ -951,11 +1017,11 @@ void fast_layout_recursion(chopper::configuration const & config, } void fast_layout(chopper::configuration const & config, - std::vector const & positions, - std::vector const & cardinalities, - std::vector const & sketches, - std::vector const & minHash_sketches, - seqan::hibf::layout::layout & hibf_layout) + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + seqan::hibf::layout::layout & hibf_layout) { auto config_copy = config; @@ -966,13 +1032,15 @@ void fast_layout(chopper::configuration const & config, partition_user_bins(config_copy, positions, cardinalities, sketches, minHash_sketches, tmax_partitions); 
config.intital_partition_timer.stop(); - auto const split_fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // - .hash_count = config.hibf_config.number_of_hash_functions, - .t_max = config.hibf_config.tmax}); + auto const split_fpr_correction = + seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .hash_count = config.hibf_config.number_of_hash_functions, + .t_max = config.hibf_config.tmax}); - double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // - .relaxed_fpr = config.hibf_config.relaxed_fpr, - .hash_count = config.hibf_config.number_of_hash_functions}); + double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction( + {.fpr = config.hibf_config.maximum_fpr, // + .relaxed_fpr = config.hibf_config.relaxed_fpr, + .hash_count = config.hibf_config.number_of_hash_functions}); size_t max_bin_id{0}; size_t max_size{0}; @@ -1016,17 +1084,18 @@ void fast_layout(chopper::configuration const & config, .number_of_technical_bins = 1 /*determiend below*/, .idx = user_bin_id}; - while (partition_idx + 1 < tmax_partitions.size() && - tmax_partitions[partition_idx].size() == 1 && - tmax_partitions[partition_idx + 1].size() == 1 && - tmax_partitions[partition_idx][0] == tmax_partitions[partition_idx + 1][0]) + while (partition_idx + 1 < tmax_partitions.size() && tmax_partitions[partition_idx].size() == 1 + && tmax_partitions[partition_idx + 1].size() == 1 + && tmax_partitions[partition_idx][0] == tmax_partitions[partition_idx + 1][0]) { ++hibf_layout.user_bins[user_bin_id].number_of_technical_bins; ++partition_idx; } // update max_bin_id, max_size - size_t const current_size = sketches[user_bin_id].estimate() * split_fpr_correction[hibf_layout.user_bins[user_bin_id].number_of_technical_bins]; + size_t const current_size = + sketches[user_bin_id].estimate() + * 
split_fpr_correction[hibf_layout.user_bins[user_bin_id].number_of_technical_bins]; if (current_size > max_size) { max_bin_id = hibf_layout.user_bins[user_bin_id].storage_TB_id; @@ -1038,10 +1107,10 @@ void fast_layout(chopper::configuration const & config, hibf_layout.top_level_max_bin_id = max_bin_id; config.small_layouts_timer.start(); - #pragma omp parallel - #pragma omp single +#pragma omp parallel +#pragma omp single { - #pragma omp taskloop +#pragma omp taskloop for (size_t partition_idx = 0; partition_idx < tmax_partitions.size(); ++partition_idx) { auto const & partition = tmax_partitions[partition_idx]; @@ -1051,13 +1120,19 @@ void fast_layout(chopper::configuration const & config, if (do_I_need_a_fast_layout(config_copy, partition, cardinalities)) { - fast_layout_recursion(config_copy, partition, cardinalities, sketches, minHash_sketches, hibf_layout, {partition_idx}); // recurse fast_layout + fast_layout_recursion(config_copy, + partition, + cardinalities, + sketches, + minHash_sketches, + hibf_layout, + {partition_idx}); // recurse fast_layout } else { auto small_layout = general_layout(config, partition, cardinalities, sketches); - #pragma omp critical +#pragma omp critical { update_layout_from_child_layout(small_layout, hibf_layout, std::vector{partition_idx}); } @@ -1081,7 +1156,6 @@ int execute(chopper::configuration & config, { seqan::hibf::layout::layout hibf_layout; - if (config.determine_best_tmax) { hibf_layout = determine_best_number_of_technical_bins(config, cardinalities, sketches); @@ -1089,7 +1163,12 @@ int execute(chopper::configuration & config, else { config.dp_algorithm_timer.start(); - fast_layout(config, seqan::hibf::iota_vector(sketches.size()), cardinalities, sketches, minHash_sketches, hibf_layout); + fast_layout(config, + seqan::hibf::iota_vector(sketches.size()), + cardinalities, + sketches, + minHash_sketches, + hibf_layout); config.dp_algorithm_timer.stop(); // hibf_layout = 
seqan::hibf::layout::compute_layout(config.hibf_config, @@ -1102,13 +1181,14 @@ int execute(chopper::configuration & config, // sort records ascending by the number of bin indices (corresponds to the IBF levels) // GCOVR_EXCL_START std::ranges::sort(hibf_layout.max_bins, - [](auto const & r, auto const & l) - { - if (r.previous_TB_indices.size() == l.previous_TB_indices.size()) - return std::ranges::lexicographical_compare(r.previous_TB_indices, l.previous_TB_indices); - else - return r.previous_TB_indices.size() < l.previous_TB_indices.size(); - }); + [](auto const & r, auto const & l) + { + if (r.previous_TB_indices.size() == l.previous_TB_indices.size()) + return std::ranges::lexicographical_compare(r.previous_TB_indices, + l.previous_TB_indices); + else + return r.previous_TB_indices.size() < l.previous_TB_indices.size(); + }); // GCOVR_EXCL_STOP if (config.output_verbose_statistics) diff --git a/src/layout/hibf_statistics.cpp b/src/layout/hibf_statistics.cpp index e171ed93..fdf2bb7d 100644 --- a/src/layout/hibf_statistics.cpp +++ b/src/layout/hibf_statistics.cpp @@ -397,8 +397,8 @@ void hibf_statistics::collect_bins() # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Warray-bounds=" # pragma GCC diagnostic ignored "-Wstringop-overflow=" -#endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV - // ibf.bins.emplace_back(hibf_statistics::bin_kind::merged, 1, std::vector{user_bin_info.idx}); +#endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV \ + // ibf.bins.emplace_back(hibf_statistics::bin_kind::merged, 1, std::vector{user_bin_info.idx}); #if CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV # pragma GCC diagnostic pop #endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV diff --git a/test/api/layout/determine_split_bins_test.cpp b/test/api/layout/determine_split_bins_test.cpp index 2af4c488..4b0c49d2 100644 --- a/test/api/layout/determine_split_bins_test.cpp +++ b/test/api/layout/determine_split_bins_test.cpp @@ -2,12 +2,12 @@ #include // for operator<<, char_traits, basic_ostream, 
basic_stringstream, strings... #include // for allocator, string -#include // for vector +#include // for vector #include -#include // for data_store #include +#include // for data_store #include std::pair determine_split_bins(chopper::configuration const & config, @@ -18,15 +18,16 @@ std::pair determine_split_bins(chopper::configuration const & co std::vector> & partitions) { if (num_user_bins == 0) - return {0,0}; + return {0, 0}; assert(num_technical_bins > 0u); assert(num_user_bins > 0u); assert(num_user_bins <= num_technical_bins); - auto const fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // - .hash_count = config.hibf_config.number_of_hash_functions, - .t_max = num_technical_bins}); + auto const fpr_correction = + seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .hash_count = config.hibf_config.number_of_hash_functions, + .t_max = num_technical_bins}); std::vector> matrix(num_technical_bins); // rows for (auto & v : matrix) @@ -59,8 +60,8 @@ std::pair determine_split_bins(chopper::configuration const & co { size_t const corrected_ub_cardinality = static_cast(ub_cardinality * fpr_correction[(i - i_prime)]); - size_t score = - std::max(seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i - i_prime), matrix[i_prime][j - 1]); + size_t score = std::max(seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i - i_prime), + matrix[i_prime][j - 1]); // std::cout << "j:" << j << " i:" << i << " i':" << i_prime << " score:" << score << std::endl; @@ -160,7 +161,8 @@ TEST(simple_split, first) std::vector> partitions(64); - auto const [num_splits, max_size] = determine_split_bins(config, positions, cardinalities, num_technical_bins, num_user_bins, partitions); + auto const [num_splits, max_size] = + determine_split_bins(config, positions, cardinalities, num_technical_bins, num_user_bins, partitions); EXPECT_EQ(num_splits, 61); EXPECT_EQ(max_size, 1452); @@ -178,8 +180,7 @@ 
TEST(simple_split, first) for (size_t i{partitions.size() - 1}; i > partitions.size() - num_splits - 1; --i) std::cerr << partitions[i][0] << ","; - - for (;tb_idx > partitions.size() - num_splits - 1; --tb_idx) + for (; tb_idx > partitions.size() - num_splits - 1; --tb_idx) { for (size_t three = 0; three < 3; ++three) { diff --git a/test/api/layout/fast_layout_test.cpp b/test/api/layout/fast_layout_test.cpp index 2347f66b..ca2a4a5c 100644 --- a/test/api/layout/fast_layout_test.cpp +++ b/test/api/layout/fast_layout_test.cpp @@ -56,254 +56,255 @@ TEST(execute_test, many_ubs) chopper::layout::execute(config, many_filenames, sketches, minHash_sketches); - std::string const expected_file{ - "@CHOPPER_USER_BINS\n" - "@0 seq0\n" - "@1 seq1\n" - "@2 seq2\n" - "@3 seq3\n" - "@4 seq4\n" - "@5 seq5\n" - "@6 seq6\n" - "@7 seq7\n" - "@8 seq8\n" - "@9 seq9\n" - "@10 seq10\n" - "@11 seq11\n" - "@12 seq12\n" - "@13 seq13\n" - "@14 seq14\n" - "@15 seq15\n" - "@16 seq16\n" - "@17 seq17\n" - "@18 seq18\n" - "@19 seq19\n" - "@20 seq20\n" - "@21 seq21\n" - "@22 seq22\n" - "@23 seq23\n" - "@24 seq24\n" - "@25 seq25\n" - "@26 seq26\n" - "@27 seq27\n" - "@28 seq28\n" - "@29 seq29\n" - "@30 seq30\n" - "@31 seq31\n" - "@32 seq32\n" - "@33 seq33\n" - "@34 seq34\n" - "@35 seq35\n" - "@36 seq36\n" - "@37 seq37\n" - "@38 seq38\n" - "@39 seq39\n" - "@40 seq40\n" - "@41 seq41\n" - "@42 seq42\n" - "@43 seq43\n" - "@44 seq44\n" - "@45 seq45\n" - "@46 seq46\n" - "@47 seq47\n" - "@48 seq48\n" - "@49 seq49\n" - "@50 seq50\n" - "@51 seq51\n" - "@52 seq52\n" - "@53 seq53\n" - "@54 seq54\n" - "@55 seq55\n" - "@56 seq56\n" - "@57 seq57\n" - "@58 seq58\n" - "@59 seq59\n" - "@60 seq60\n" - "@61 seq61\n" - "@62 seq62\n" - "@63 seq63\n" - "@64 seq64\n" - "@65 seq65\n" - "@66 seq66\n" - "@67 seq67\n" - "@68 seq68\n" - "@69 seq69\n" - "@70 seq70\n" - "@71 seq71\n" - "@72 seq72\n" - "@73 seq73\n" - "@74 seq74\n" - "@75 seq75\n" - "@76 seq76\n" - "@77 seq77\n" - "@78 seq78\n" - "@79 seq79\n" - "@80 seq80\n" - 
"@81 seq81\n" - "@82 seq82\n" - "@83 seq83\n" - "@84 seq84\n" - "@85 seq85\n" - "@86 seq86\n" - "@87 seq87\n" - "@88 seq88\n" - "@89 seq89\n" - "@90 seq90\n" - "@91 seq91\n" - "@92 seq92\n" - "@93 seq93\n" - "@94 seq94\n" - "@95 seq95\n" - "@CHOPPER_USER_BINS_END\n" - "@CHOPPER_CONFIG\n" - "@{\n" - "@ \"chopper_config\": {\n" - "@ \"version\": 2,\n" - "@ \"data_file\": {\n" - "@ \"value0\": \"\"\n" - "@ },\n" - "@ \"debug\": false,\n" - "@ \"sketch_directory\": {\n" - "@ \"value0\": \"\"\n" - "@ },\n" - "@ \"k\": 19,\n" - "@ \"window_size\": 19,\n" - "@ \"disable_sketch_output\": true,\n" - "@ \"precomputed_files\": false,\n" - "@ \"output_filename\": {\n" - "@ \"value0\": \"" + layout_file.string() + "\"\n" - "@ },\n" - "@ \"determine_best_tmax\": false,\n" - "@ \"force_all_binnings\": false\n" - "@ }\n" - "@}\n" - "@CHOPPER_CONFIG_END\n" - "@HIBF_CONFIG\n" - "@{\n" - "@ \"hibf_config\": {\n" - "@ \"version\": 2,\n" - "@ \"number_of_user_bins\": 96,\n" - "@ \"number_of_hash_functions\": 2,\n" - "@ \"maximum_fpr\": 0.05,\n" - "@ \"relaxed_fpr\": 0.3,\n" - "@ \"threads\": 1,\n" - "@ \"sketch_bits\": 12,\n" - "@ \"tmax\": 64,\n" - "@ \"empty_bin_fraction\": 0.0,\n" - "@ \"alpha\": 1.2,\n" - "@ \"max_rearrangement_ratio\": 0.5,\n" - "@ \"disable_estimate_union\": true,\n" - "@ \"disable_rearrangement\": true\n" - "@ }\n" - "@}\n" - "@HIBF_CONFIG_END\n" - "#TOP_LEVEL_IBF fullest_technical_bin_idx:23\n" - "#LOWER_LEVEL_IBF_1 fullest_technical_bin_idx:24\n" - "#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:28\n" - "#LOWER_LEVEL_IBF_3 fullest_technical_bin_idx:6\n" - "#LOWER_LEVEL_IBF_4 fullest_technical_bin_idx:15\n" - "#LOWER_LEVEL_IBF_5 fullest_technical_bin_idx:0\n" - "#LOWER_LEVEL_IBF_6 fullest_technical_bin_idx:24\n" - "#LOWER_LEVEL_IBF_8 fullest_technical_bin_idx:24\n" - "#LOWER_LEVEL_IBF_9 fullest_technical_bin_idx:35\n" - "#LOWER_LEVEL_IBF_12 fullest_technical_bin_idx:7\n" - "#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS\n" - "0 5;0 1;2\n" - "1 9;0 
1;2\n" - "2 3;0 1;2\n" - "3 12;0 1;2\n" - "4 12;2 1;2\n" - "5 6;0 1;2\n" - "6 9;2 1;2\n" - "7 9;4 1;2\n" - "8 3;2 1;2\n" - "9 3;4 1;2\n" - "10 4;0 1;2\n" - "11 6;2 1;2\n" - "12 2;0 1;2\n" - "13 2;2 1;2\n" - "14 2;4 1;2\n" - "15 4;2 1;2\n" - "16 12;4 1;3\n" - "17 6;4 1;2\n" - "18 5;2 1;3\n" - "19 9;6 1;2\n" - "20 8;0 1;12\n" - "21 8;12 1;12\n" - "22 6;6 1;9\n" - "23 6;15 1;9\n" - "24 9;8 1;9\n" - "25 9;17 1;9\n" - "26 4;4 1;11\n" - "27 4;15 1;11\n" - "28 1;0 1;12\n" - "29 1;12 1;12\n" - "30 2;6 1;11\n" - "31 2;17 1;11\n" - "32 12;7 1;13\n" - "33 6;24 1;9\n" - "34 5;5 1;14\n" - "35 9;26 1;9\n" - "36 3;6 1;13\n" - "37 22 1\n" - "38 21 1\n" - "39 20 1\n" - "40 15 1\n" - "41 18 1\n" - "42 19 1\n" - "43 13 1\n" - "44 17 1\n" - "45 14 1\n" - "46 10 1\n" - "47 16 1\n" - "48 11 1\n" - "49 8;24 1;40\n" - "50 12;20 1;44\n" - "51 6;33 1;31\n" - "52 5;19 1;45\n" - "53 9;35 1;29\n" - "54 3;19 1;45\n" - "55 4;26 1;38\n" - "56 7 1\n" - "57 1;24 1;40\n" - "58 0 1\n" - "59 2;28 1;36\n" - "60 63 1\n" - "61 62 1\n" - "62 60 1\n" - "63 61 1\n" - "64 59 1\n" - "65 58 1\n" - "66 57 1\n" - "67 55 1\n" - "68 56 1\n" - "69 54 1\n" - "70 52 1\n" - "71 53 1\n" - "72 51 1\n" - "73 49 1\n" - "74 48 1\n" - "75 50 1\n" - "76 45 1\n" - "77 47 1\n" - "78 46 1\n" - "79 44 1\n" - "80 39 1\n" - "81 40 1\n" - "82 41 1\n" - "83 42 1\n" - "84 38 1\n" - "85 37 1\n" - "86 43 1\n" - "87 36 1\n" - "88 35 1\n" - "89 34 1\n" - "90 29 2\n" - "91 31 2\n" - "92 27 2\n" - "93 23 2\n" - "94 33 1\n" - "95 25 2\n"}; + std::string const expected_file{"@CHOPPER_USER_BINS\n" + "@0 seq0\n" + "@1 seq1\n" + "@2 seq2\n" + "@3 seq3\n" + "@4 seq4\n" + "@5 seq5\n" + "@6 seq6\n" + "@7 seq7\n" + "@8 seq8\n" + "@9 seq9\n" + "@10 seq10\n" + "@11 seq11\n" + "@12 seq12\n" + "@13 seq13\n" + "@14 seq14\n" + "@15 seq15\n" + "@16 seq16\n" + "@17 seq17\n" + "@18 seq18\n" + "@19 seq19\n" + "@20 seq20\n" + "@21 seq21\n" + "@22 seq22\n" + "@23 seq23\n" + "@24 seq24\n" + "@25 seq25\n" + "@26 seq26\n" + "@27 seq27\n" + "@28 seq28\n" + "@29 
seq29\n" + "@30 seq30\n" + "@31 seq31\n" + "@32 seq32\n" + "@33 seq33\n" + "@34 seq34\n" + "@35 seq35\n" + "@36 seq36\n" + "@37 seq37\n" + "@38 seq38\n" + "@39 seq39\n" + "@40 seq40\n" + "@41 seq41\n" + "@42 seq42\n" + "@43 seq43\n" + "@44 seq44\n" + "@45 seq45\n" + "@46 seq46\n" + "@47 seq47\n" + "@48 seq48\n" + "@49 seq49\n" + "@50 seq50\n" + "@51 seq51\n" + "@52 seq52\n" + "@53 seq53\n" + "@54 seq54\n" + "@55 seq55\n" + "@56 seq56\n" + "@57 seq57\n" + "@58 seq58\n" + "@59 seq59\n" + "@60 seq60\n" + "@61 seq61\n" + "@62 seq62\n" + "@63 seq63\n" + "@64 seq64\n" + "@65 seq65\n" + "@66 seq66\n" + "@67 seq67\n" + "@68 seq68\n" + "@69 seq69\n" + "@70 seq70\n" + "@71 seq71\n" + "@72 seq72\n" + "@73 seq73\n" + "@74 seq74\n" + "@75 seq75\n" + "@76 seq76\n" + "@77 seq77\n" + "@78 seq78\n" + "@79 seq79\n" + "@80 seq80\n" + "@81 seq81\n" + "@82 seq82\n" + "@83 seq83\n" + "@84 seq84\n" + "@85 seq85\n" + "@86 seq86\n" + "@87 seq87\n" + "@88 seq88\n" + "@89 seq89\n" + "@90 seq90\n" + "@91 seq91\n" + "@92 seq92\n" + "@93 seq93\n" + "@94 seq94\n" + "@95 seq95\n" + "@CHOPPER_USER_BINS_END\n" + "@CHOPPER_CONFIG\n" + "@{\n" + "@ \"chopper_config\": {\n" + "@ \"version\": 2,\n" + "@ \"data_file\": {\n" + "@ \"value0\": \"\"\n" + "@ },\n" + "@ \"debug\": false,\n" + "@ \"sketch_directory\": {\n" + "@ \"value0\": \"\"\n" + "@ },\n" + "@ \"k\": 19,\n" + "@ \"window_size\": 19,\n" + "@ \"disable_sketch_output\": true,\n" + "@ \"precomputed_files\": false,\n" + "@ \"output_filename\": {\n" + "@ \"value0\": \"" + + layout_file.string() + + "\"\n" + "@ },\n" + "@ \"determine_best_tmax\": false,\n" + "@ \"force_all_binnings\": false\n" + "@ }\n" + "@}\n" + "@CHOPPER_CONFIG_END\n" + "@HIBF_CONFIG\n" + "@{\n" + "@ \"hibf_config\": {\n" + "@ \"version\": 2,\n" + "@ \"number_of_user_bins\": 96,\n" + "@ \"number_of_hash_functions\": 2,\n" + "@ \"maximum_fpr\": 0.05,\n" + "@ \"relaxed_fpr\": 0.3,\n" + "@ \"threads\": 1,\n" + "@ \"sketch_bits\": 12,\n" + "@ \"tmax\": 64,\n" + "@ 
\"empty_bin_fraction\": 0.0,\n" + "@ \"alpha\": 1.2,\n" + "@ \"max_rearrangement_ratio\": 0.5,\n" + "@ \"disable_estimate_union\": true,\n" + "@ \"disable_rearrangement\": true\n" + "@ }\n" + "@}\n" + "@HIBF_CONFIG_END\n" + "#TOP_LEVEL_IBF fullest_technical_bin_idx:23\n" + "#LOWER_LEVEL_IBF_1 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:28\n" + "#LOWER_LEVEL_IBF_3 fullest_technical_bin_idx:6\n" + "#LOWER_LEVEL_IBF_4 fullest_technical_bin_idx:15\n" + "#LOWER_LEVEL_IBF_5 fullest_technical_bin_idx:0\n" + "#LOWER_LEVEL_IBF_6 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_8 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_9 fullest_technical_bin_idx:35\n" + "#LOWER_LEVEL_IBF_12 fullest_technical_bin_idx:7\n" + "#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS\n" + "0 5;0 1;2\n" + "1 9;0 1;2\n" + "2 3;0 1;2\n" + "3 12;0 1;2\n" + "4 12;2 1;2\n" + "5 6;0 1;2\n" + "6 9;2 1;2\n" + "7 9;4 1;2\n" + "8 3;2 1;2\n" + "9 3;4 1;2\n" + "10 4;0 1;2\n" + "11 6;2 1;2\n" + "12 2;0 1;2\n" + "13 2;2 1;2\n" + "14 2;4 1;2\n" + "15 4;2 1;2\n" + "16 12;4 1;3\n" + "17 6;4 1;2\n" + "18 5;2 1;3\n" + "19 9;6 1;2\n" + "20 8;0 1;12\n" + "21 8;12 1;12\n" + "22 6;6 1;9\n" + "23 6;15 1;9\n" + "24 9;8 1;9\n" + "25 9;17 1;9\n" + "26 4;4 1;11\n" + "27 4;15 1;11\n" + "28 1;0 1;12\n" + "29 1;12 1;12\n" + "30 2;6 1;11\n" + "31 2;17 1;11\n" + "32 12;7 1;13\n" + "33 6;24 1;9\n" + "34 5;5 1;14\n" + "35 9;26 1;9\n" + "36 3;6 1;13\n" + "37 22 1\n" + "38 21 1\n" + "39 20 1\n" + "40 15 1\n" + "41 18 1\n" + "42 19 1\n" + "43 13 1\n" + "44 17 1\n" + "45 14 1\n" + "46 10 1\n" + "47 16 1\n" + "48 11 1\n" + "49 8;24 1;40\n" + "50 12;20 1;44\n" + "51 6;33 1;31\n" + "52 5;19 1;45\n" + "53 9;35 1;29\n" + "54 3;19 1;45\n" + "55 4;26 1;38\n" + "56 7 1\n" + "57 1;24 1;40\n" + "58 0 1\n" + "59 2;28 1;36\n" + "60 63 1\n" + "61 62 1\n" + "62 60 1\n" + "63 61 1\n" + "64 59 1\n" + "65 58 1\n" + "66 57 1\n" + "67 55 1\n" + "68 56 1\n" + "69 54 1\n" + "70 52 1\n" + "71 53 1\n" 
+ "72 51 1\n" + "73 49 1\n" + "74 48 1\n" + "75 50 1\n" + "76 45 1\n" + "77 47 1\n" + "78 46 1\n" + "79 44 1\n" + "80 39 1\n" + "81 40 1\n" + "82 41 1\n" + "83 42 1\n" + "84 38 1\n" + "85 37 1\n" + "86 43 1\n" + "87 36 1\n" + "88 35 1\n" + "89 34 1\n" + "90 29 2\n" + "91 31 2\n" + "92 27 2\n" + "93 23 2\n" + "94 33 1\n" + "95 25 2\n"}; std::string const actual_file{string_from_file(layout_file)}; EXPECT_EQ(actual_file, expected_file) << actual_file << std::endl; diff --git a/test/api/lsh_test.cpp b/test/api/lsh_test.cpp index 71be4de1..1ebfd552 100644 --- a/test/api/lsh_test.cpp +++ b/test/api/lsh_test.cpp @@ -136,7 +136,6 @@ TEST(LSH_find_representative_cluster_test, multi_cluster_one_move) mclusters[1].move_to(mclusters[0]); EXPECT_EQ(chopper::LSH_find_representative_cluster(mclusters, mclusters[1].id()), mclusters[0].id()); - } TEST(LSH_find_representative_cluster_test, cluster_two_moves) From 7c647ab890d6110bdc34f546d0621597f762212c Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 25 Jul 2025 13:50:51 +0200 Subject: [PATCH 09/18] unused variables --- src/layout/execute.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 9e2f4e0b..e8053800 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -160,7 +160,7 @@ std::vector very_similar_LSH_partitioning(std::vector Date: Fri, 25 Jul 2025 13:59:56 +0200 Subject: [PATCH 10/18] fix linkage --- src/layout/determine_split_bins.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layout/determine_split_bins.cpp b/src/layout/determine_split_bins.cpp index 08e5fdee..013975b4 100644 --- a/src/layout/determine_split_bins.cpp +++ b/src/layout/determine_split_bins.cpp @@ -15,7 +15,7 @@ #include // for data_store #include -namespace chopper::fast_layout +namespace chopper::layout { std::pair determine_split_bins(chopper::configuration const & config, From 53e301f9e93e479a82ebd43082259dbb7ad8785f Mon Sep 17 
00:00:00 2001 From: Enrico Seiler Date: Fri, 25 Jul 2025 14:04:26 +0200 Subject: [PATCH 11/18] [DELETE ME] temporarily make tests compile --- test/api/layout/execute_layout_test.cpp | 15 ++++++++------ .../layout/execute_with_estimation_test.cpp | 20 +++++++++++-------- test/api/layout/hibf_statistics_test.cpp | 10 ++++++---- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/test/api/layout/execute_layout_test.cpp b/test/api/layout/execute_layout_test.cpp index 5449ec12..d77a22f7 100644 --- a/test/api/layout/execute_layout_test.cpp +++ b/test/api/layout/execute_layout_test.cpp @@ -46,9 +46,10 @@ TEST(execute_test, few_ubs) filenames{{"seq0a", "seq0b"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}}; std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, filenames, sketches); + chopper::layout::execute(config, filenames, sketches, minHash_sketches); std::string const expected_file{"@CHOPPER_USER_BINS\n" "@0 seq0a seq0b\n" @@ -142,9 +143,10 @@ TEST(execute_test, set_default_tmax) filenames{{"seq0"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}}; std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, filenames, sketches); + chopper::layout::execute(config, filenames, sketches, minHash_sketches); EXPECT_EQ(config.hibf_config.tmax, 64u); } @@ -178,9 +180,10 @@ TEST(execute_test, many_ubs) config.hibf_config.disable_estimate_union = true; // also disables rearrangement std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + 
seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, many_filenames, sketches); + chopper::layout::execute(config, many_filenames, sketches, minHash_sketches); std::string const expected_file{"@CHOPPER_USER_BINS\n" "@0 seq0\n" diff --git a/test/api/layout/execute_with_estimation_test.cpp b/test/api/layout/execute_with_estimation_test.cpp index c33d0243..9951befe 100644 --- a/test/api/layout/execute_with_estimation_test.cpp +++ b/test/api/layout/execute_with_estimation_test.cpp @@ -53,9 +53,10 @@ TEST(execute_estimation_test, few_ubs) filenames{{"seq0"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}}; std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, filenames, sketches); + chopper::layout::execute(config, filenames, sketches, minHash_sketches); ASSERT_TRUE(std::filesystem::exists(stats_file)); @@ -114,9 +115,10 @@ TEST(execute_estimation_test, many_ubs) config.hibf_config.disable_estimate_union = true; // also disables rearrangement std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, many_filenames, sketches); + chopper::layout::execute(config, many_filenames, sketches, minHash_sketches); ASSERT_TRUE(std::filesystem::exists(stats_file)); @@ -421,9 +423,10 @@ TEST(execute_estimation_test, many_ubs_force_all) config.hibf_config.disable_estimate_union = true; // also disables rearrangement std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, 
minHash_sketches); - chopper::layout::execute(config, many_filenames, sketches); + chopper::layout::execute(config, many_filenames, sketches, minHash_sketches); ASSERT_TRUE(std::filesystem::exists(stats_file)); @@ -523,9 +526,10 @@ TEST(execute_estimation_test, with_rearrangement) config.hibf_config.number_of_user_bins = filenames.size(); std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, filenames, sketches); + chopper::layout::execute(config, filenames, sketches, minHash_sketches); ASSERT_TRUE(std::filesystem::exists(stats_file)); diff --git a/test/api/layout/hibf_statistics_test.cpp b/test/api/layout/hibf_statistics_test.cpp index 718b43dc..054fbcb2 100644 --- a/test/api/layout/hibf_statistics_test.cpp +++ b/test/api/layout/hibf_statistics_test.cpp @@ -133,11 +133,12 @@ TEST(execute_test, chopper_layout_statistics) .disable_estimate_union = true /* also disable rearrangement */}}; std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); testing::internal::CaptureStdout(); testing::internal::CaptureStderr(); - chopper::layout::execute(config, many_filenames, sketches); + chopper::layout::execute(config, many_filenames, sketches, minHash_sketches); std::string layout_result_stdout = testing::internal::GetCapturedStdout(); std::string layout_result_stderr = testing::internal::GetCapturedStderr(); @@ -190,9 +191,10 @@ TEST(execute_test, chopper_layout_statistics_determine_best_bins) .disable_estimate_union = true /* also disable rearrangement */}}; std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + 
seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, filenames, sketches); + chopper::layout::execute(config, filenames, sketches, minHash_sketches); std::string expected_cout = R"expected_cout(## ### Parameters ### From 8003d00b19664628c721f1463694fcb52bcc231a Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 25 Jul 2025 15:40:07 +0200 Subject: [PATCH 12/18] dedup --- include/chopper/configuration.hpp | 5 +- include/chopper/lsh.hpp | 61 ++++++++---------- src/layout/execute.cpp | 103 +++++++++++------------------- 3 files changed, 68 insertions(+), 101 deletions(-) diff --git a/include/chopper/configuration.hpp b/include/chopper/configuration.hpp index 825bc407..4d8e80ff 100644 --- a/include/chopper/configuration.hpp +++ b/include/chopper/configuration.hpp @@ -20,7 +20,7 @@ namespace chopper { -enum partitioning_scheme +enum class partitioning_scheme : uint8_t { blocked, // 0 sorted, // 1 @@ -33,7 +33,8 @@ enum partitioning_scheme struct configuration { - int partitioning_approach{6}; + partitioning_scheme partitioning_approach{partitioning_scheme::lsh_sim}; + /*!\name General Configuration * \{ */ diff --git a/include/chopper/lsh.hpp b/include/chopper/lsh.hpp index 5ef11a75..caa83975 100644 --- a/include/chopper/lsh.hpp +++ b/include/chopper/lsh.hpp @@ -36,17 +36,11 @@ struct Cluster Cluster & operator=(Cluster &&) = default; ~Cluster() = default; - Cluster(size_t const id) - { - representative_id = id; - user_bins = {id}; - } + Cluster(size_t const id, size_t const user_bins_id) : representative_id{id}, user_bins({user_bins_id}) + {} - Cluster(size_t const id, size_t const user_bins_id) - { - representative_id = id; - user_bins = {user_bins_id}; - } + Cluster(size_t const id) : Cluster{id, id} + {} size_t id() const { @@ -76,7 +70,7 @@ struct Cluster bool is_valid(size_t id) const { bool const ids_equal = representative_id == id; - bool const properly_moved = has_been_moved() && empty() 
&& moved_id.has_value(); + bool const properly_moved = has_been_moved() && empty(); bool const not_moved = !has_been_moved() && !empty(); return ids_equal && (properly_moved || not_moved); @@ -98,12 +92,11 @@ struct Cluster void sort_by_cardinality(std::vector const & cardinalities) { - std::sort(user_bins.begin(), - user_bins.end(), - [&cardinalities](auto const & v1, auto const & v2) - { - return cardinalities[v2] < cardinalities[v1]; - }); + std::ranges::sort(user_bins, + [&cardinalities](auto const & v1, auto const & v2) + { + return cardinalities[v1] > cardinalities[v2]; + }); } }; @@ -123,14 +116,11 @@ struct MultiCluster : Cluster MultiCluster(Cluster const & clust) { representative_id = clust.id(); + if (clust.has_been_moved()) - { moved_id = clust.moved_to_cluster_id(); - } else - { user_bins.push_back(clust.contained_user_bins()); - } } std::vector> const & contained_user_bins() const @@ -153,7 +143,7 @@ struct MultiCluster : Cluster bool is_valid(size_t id) const { bool const ids_equal = representative_id == id; - bool const properly_moved = has_been_moved() && empty() && moved_id.has_value(); + bool const properly_moved = has_been_moved() && empty(); bool const not_moved = !has_been_moved() && !empty(); return ids_equal && (properly_moved || not_moved); @@ -169,18 +159,21 @@ struct MultiCluster : Cluster // sort user bins within a Cluster by cardinality and the clusters themselves by size void sort_by_cardinality(std::vector const & cardinalities) { - auto cmp = [&cardinalities](auto const & v1, auto const & v2) - { - return cardinalities[v2] < cardinalities[v1]; - }; - for (auto & user_bin_cluster : user_bins) - std::sort(user_bin_cluster.begin(), user_bin_cluster.end(), cmp); - - auto cmp_clusters = [](auto const & c1, auto const & c2) - { - return c2.size() < c1.size(); - }; - std::sort(user_bins.begin(), user_bins.end(), cmp_clusters); + std::ranges::for_each(user_bins, + [&cardinalities](auto & user_bin_cluster) + { + 
std::ranges::sort(user_bin_cluster, + [&cardinalities](auto const & v1, auto const & v2) + { + return cardinalities[v1] > cardinalities[v2]; + }); + }); + + std::ranges::sort(user_bins, + [](auto const & c1, auto const & c2) + { + return c1.size() > c2.size(); + }); } }; diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index e8053800..5058e1e3 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -47,23 +47,30 @@ namespace chopper::layout uint64_t lsh_hash_the_sketch(std::vector const & sketch, size_t const number_of_hashes_to_consider) { - // lets just compute the sum - uint64_t sum{0}; - - for (size_t i = 0; i < number_of_hashes_to_consider; ++i) - sum += sketch[i]; - - return sum; + return std::reduce(sketch.begin(), sketch.begin() + number_of_hashes_to_consider); } -auto LSH_fill_hashtable(std::vector const & clusters, +template +auto LSH_fill_hashtable(std::vector const & clusters, std::vector const & minHash_sketches, size_t const current_sketch_index, size_t const current_number_of_sketch_hashes) { + static constexpr bool is_multi_cluster = std::same_as; + static_assert(std::same_as || std::same_as); + robin_hood::unordered_flat_map> table; - size_t processed_user_bins{0}; // only for sanity check + [[maybe_unused]] size_t processed_user_bins{0}; // only for sanity check + + auto get_user_bin_idx = [](cluster_type const & cluster) + { + if constexpr (is_multi_cluster) + return std::views::join(cluster.contained_user_bins()); + else + return cluster.contained_user_bins(); + }; + for (size_t pos = 0; pos < clusters.size(); ++pos) { auto const & current = clusters[pos]; @@ -72,7 +79,7 @@ auto LSH_fill_hashtable(std::vector const & clusters, if (current.has_been_moved()) // cluster has been moved somewhere else, don't process continue; - for (size_t const user_bin_idx : current.contained_user_bins()) + for (size_t const user_bin_idx : get_user_bin_idx(current)) { ++processed_user_bins; uint64_t const key = 
lsh_hash_the_sketch(minHash_sketches[user_bin_idx].table[current_sketch_index], @@ -85,42 +92,10 @@ auto LSH_fill_hashtable(std::vector const & clusters, return table; } -auto LSH_fill_hashtable(std::vector const & clusters, - std::vector const & minHash_sketches, - size_t const current_sketch_index, - size_t const current_number_of_sketch_hashes) -{ - robin_hood::unordered_flat_map> table; - - size_t processed_user_bins{0}; // only for sanity check - for (size_t pos = 0; pos < clusters.size(); ++pos) - { - auto const & current = clusters[pos]; - assert(current.is_valid(pos)); - - if (current.has_been_moved()) // cluster has been moved somewhere else, don't process - continue; - - for (auto const & similarity_cluster : current.contained_user_bins()) - { - for (size_t const user_bin_idx : similarity_cluster) - { - ++processed_user_bins; - uint64_t const key = lsh_hash_the_sketch(minHash_sketches[user_bin_idx].table[current_sketch_index], - current_number_of_sketch_hashes); - table[key].push_back(current.id()); // insert representative for all user bins - } - } - } - assert(processed_user_bins == clusters.size()); // all user bins should've been processed by one of the clusters - - return table; -} - // minHash_sketches data structure: // Vector L1 : number of user bins // Vector L2 : number_of_max_minHash_sketches (LSH ADD+OR parameter b) -// Vector L3 : minHash_sketche_size (LSH ADD+OR parameter r) +// Vector L3 : minHash_sketch_size (LSH ADD+OR parameter r) std::vector very_similar_LSH_partitioning(std::vector const & minHash_sketches, std::vector const & positions, @@ -136,11 +111,11 @@ std::vector very_similar_LSH_partitioning(std::vector very_similar_LSH_partitioning(std::vector current_cluster_cardinality(number_of_user_bins); std::vector current_cluster_sketches(number_of_user_bins, empty_sketch); size_t current_max_cluster_size{0}; - size_t current_number_of_sketch_hashes{minHash_sketche_size}; // start with high r but decrease it iteratively + size_t 
current_number_of_sketch_hashes{minHash_sketch_size}; // start with high r but decrease it iteratively size_t current_sketch_index{0}; [[maybe_unused]] size_t current_number_of_clusters{number_of_user_bins}; // initially, each UB is a separate cluster @@ -191,8 +166,8 @@ std::vector very_similar_LSH_partitioning(std::vector very_similar_LSH_partitioning(std::vector & clusters, // push largest p clusters to the front auto cluster_size_cmp = [&cardinalities](auto const & v1, auto const & v2) { - if (v2.size() == v1.size() && !v2.empty()) - return cardinalities[v2.contained_user_bins()[0]] < cardinalities[v1.contained_user_bins()[0]]; - return v2.size() < v1.size(); + if (v1.size() == v2.size() && !v2.empty()) // Note: If v2 is empty, so is v1. + return cardinalities[v1.contained_user_bins().front()] > cardinalities[v2.contained_user_bins().front()]; + + return v1.size() > v2.size(); }; - std::partial_sort(clusters.begin(), - clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), - clusters.end(), - cluster_size_cmp); + std::ranges::partial_sort(clusters, + std::ranges::next(clusters.begin(), config.hibf_config.tmax, clusters.end()), + cluster_size_cmp); // after filling up the partitions with the biggest clusters, sort the clusters by cardinality of the biggest ub // s.t. that euqally sizes ub are assigned after each other and the small stuff is added at last. 
@@ -285,11 +258,11 @@ void post_process_clusters(std::vector & clusters, if (v2.empty()) // and v1 is not, since the first if would catch return true; - return cardinalities[v2.contained_user_bins()[0]] < cardinalities[v1.contained_user_bins()[0]]; + return cardinalities[v1.contained_user_bins().front()] > cardinalities[v2.contained_user_bins().front()]; }; - std::sort(clusters.begin() + std::min(clusters.size(), config.hibf_config.tmax), - clusters.end(), - compare_cardinality_and_move_empty_clusters_to_the_end); + std::ranges::sort(std::ranges::next(clusters.begin(), config.hibf_config.tmax, clusters.end()), + clusters.end(), + compare_cardinality_and_move_empty_clusters_to_the_end); assert(clusters[0].size() >= clusters[1].size()); // sanity check // assert(cardinalities[clusters[std::min(clusters.size(), config.hibf_config.tmax)].contained_user_bins()[0]] >= cardinalities[clusters[std::min(clusters.size(), config.hibf_config.tmax) + 1].contained_user_bins()[0]]); // sanity check @@ -297,8 +270,8 @@ void post_process_clusters(std::vector & clusters, // debug for (size_t cidx = 1; cidx < clusters.size(); ++cidx) { - if (clusters[cidx - 1].empty() - && !clusters[cidx].empty()) // once empty - always empty; all empty clusters should be at the end + // once empty - always empty; all empty clusters should be at the end + if (clusters[cidx - 1].empty() && !clusters[cidx].empty()) throw std::runtime_error{"sorting did not work"}; } // debug From 47e50e2d6ee75bab180369da2d198ce353c02a7a Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 25 Jul 2025 16:03:50 +0200 Subject: [PATCH 13/18] uniquify in LSH_fill_hashtable --- src/layout/execute.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 5058e1e3..dc3c624b 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -89,6 +89,15 @@ auto LSH_fill_hashtable(std::vector const & clusters, } 
assert(processed_user_bins == clusters.size()); // all user bins should've been processed by one of the clusters + // uniquify list. Since I am inserting representative_idx's into the table, the same number can + // be inserted into multiple splots, and multiple times in the same slot. + for (auto & [key, list] : table) + { + std::ranges::sort(list); + auto const ret = std::ranges::unique(list); + list.erase(ret.begin(), ret.end()); + } + return table; } @@ -164,13 +173,7 @@ std::vector very_similar_LSH_partitioning(std::vector very_similar_LSH_partitioning(std::vector Date: Fri, 25 Jul 2025 16:04:07 +0200 Subject: [PATCH 14/18] libc++ sort --- test/api/layout/fast_layout_test.cpp | 113 ++++++++++++++++++++++++++- 1 file changed, 112 insertions(+), 1 deletion(-) diff --git a/test/api/layout/fast_layout_test.cpp b/test/api/layout/fast_layout_test.cpp index ca2a4a5c..9f175672 100644 --- a/test/api/layout/fast_layout_test.cpp +++ b/test/api/layout/fast_layout_test.cpp @@ -198,6 +198,115 @@ TEST(execute_test, many_ubs) "@ }\n" "@}\n" "@HIBF_CONFIG_END\n" +#ifdef _LIBCPP_VERSION + "#TOP_LEVEL_IBF fullest_technical_bin_idx:23\n" + "#LOWER_LEVEL_IBF_1 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:28\n" + "#LOWER_LEVEL_IBF_3 fullest_technical_bin_idx:6\n" + "#LOWER_LEVEL_IBF_4 fullest_technical_bin_idx:15\n" + "#LOWER_LEVEL_IBF_5 fullest_technical_bin_idx:0\n" + "#LOWER_LEVEL_IBF_7 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_8 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_10 fullest_technical_bin_idx:35\n" + "#LOWER_LEVEL_IBF_13 fullest_technical_bin_idx:7\n" + "#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS\n" + "0 5;0 1;2\n" + "1 10;0 1;2\n" + "2 3;0 1;2\n" + "3 13;0 1;2\n" + "4 13;2 1;2\n" + "5 7;0 1;2\n" + "6 10;2 1;2\n" + "7 10;4 1;2\n" + "8 3;2 1;2\n" + "9 3;4 1;2\n" + "10 4;0 1;2\n" + "11 7;2 1;2\n" + "12 2;0 1;2\n" + "13 2;2 1;2\n" + "14 2;4 1;2\n" + "15 4;2 1;2\n" + "16 13;4 1;3\n" + "17 7;4 1;2\n" + 
"18 5;2 1;3\n" + "19 10;6 1;2\n" + "20 8;0 1;12\n" + "21 8;12 1;12\n" + "22 7;6 1;9\n" + "23 7;15 1;9\n" + "24 10;8 1;9\n" + "25 10;17 1;9\n" + "26 4;4 1;11\n" + "27 4;15 1;11\n" + "28 1;0 1;12\n" + "29 1;12 1;12\n" + "30 2;6 1;11\n" + "31 2;17 1;11\n" + "32 13;7 1;13\n" + "33 7;24 1;9\n" + "34 5;5 1;14\n" + "35 10;26 1;9\n" + "36 3;6 1;13\n" + "37 22 1\n" + "38 21 1\n" + "39 20 1\n" + "40 15 1\n" + "41 18 1\n" + "42 19 1\n" + "43 12 1\n" + "44 17 1\n" + "45 16 1\n" + "46 9 1\n" + "47 14 1\n" + "48 11 1\n" + "49 8;24 1;40\n" + "50 13;20 1;44\n" + "51 7;33 1;31\n" + "52 5;19 1;45\n" + "53 10;35 1;29\n" + "54 3;19 1;45\n" + "55 4;26 1;38\n" + "56 6 1\n" + "57 1;24 1;40\n" + "58 0 1\n" + "59 2;28 1;36\n" + "60 63 1\n" + "61 62 1\n" + "62 60 1\n" + "63 61 1\n" + "64 59 1\n" + "65 58 1\n" + "66 57 1\n" + "67 55 1\n" + "68 56 1\n" + "69 54 1\n" + "70 53 1\n" + "71 52 1\n" + "72 51 1\n" + "73 49 1\n" + "74 48 1\n" + "75 50 1\n" + "76 45 1\n" + "77 47 1\n" + "78 46 1\n" + "79 44 1\n" + "80 43 1\n" + "81 42 1\n" + "82 41 1\n" + "83 40 1\n" + "84 39 1\n" + "85 37 1\n" + "86 38 1\n" + "87 36 1\n" + "88 35 1\n" + "89 34 1\n" + "90 29 2\n" + "91 31 2\n" + "92 27 2\n" + "93 23 2\n" + "94 33 1\n" + "95 25 2\n" +#else "#TOP_LEVEL_IBF fullest_technical_bin_idx:23\n" "#LOWER_LEVEL_IBF_1 fullest_technical_bin_idx:24\n" "#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:28\n" @@ -304,7 +413,9 @@ TEST(execute_test, many_ubs) "92 27 2\n" "93 23 2\n" "94 33 1\n" - "95 25 2\n"}; + "95 25 2\n" +#endif + }; std::string const actual_file{string_from_file(layout_file)}; EXPECT_EQ(actual_file, expected_file) << actual_file << std::endl; From 9b2c2dac29772c233258b9d59ee60bdd9585c653 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 25 Jul 2025 16:27:03 +0200 Subject: [PATCH 15/18] very_similar_LSH_partitioning --- src/layout/execute.cpp | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 
dc3c624b..cc1a6ebf 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -144,7 +144,6 @@ std::vector very_similar_LSH_partitioning(std::vector very_similar_LSH_partitioning(std::vector Cluster & + { + size_t const cluster_id = LSH_find_representative_cluster(clusters, current); + return clusters[cluster_id]; + }; + // refine clusters //std::cout << "Start clustering with threshold average_technical_bin_size: " << average_technical_bin_size << std::endl; while (current_max_cluster_size < average_technical_bin_size && /*number_of_clusters / static_cast(number_of_user_bins) > 0.5 &&*/ current_sketch_index < number_of_max_minHash_sketches) // I want to cluster 10%? { - //std::cout << "Current number of clusters: " << current_number_of_clusters; - // fill LSH collision hashtable robin_hood::unordered_flat_map> table = LSH_fill_hashtable(clusters, minHash_sketches, current_sketch_index, current_number_of_sketch_hashes); @@ -183,39 +186,33 @@ std::vector very_similar_LSH_partitioning(std::vector 1); // there should be at least two user bins now - assert(representative_cluster.is_valid(representative_cluster_id)); // and it should still be valid - - current_cluster_sketches[representative_cluster.id()].merge( - current_cluster_sketches[next_cluster.id()]); - current_cluster_cardinality[representative_cluster.id()] = - current_cluster_sketches[representative_cluster.id()].estimate(); - --current_number_of_clusters; + representative_cluster_sketch.merge(current_cluster_sketches[next_cluster.id()]); } - current_max_cluster_size = std::ranges::max(current_cluster_cardinality); + current_cluster_cardinality[representative_cluster.id()] = representative_cluster_sketch.estimate(); } + current_max_cluster_size = std::ranges::max(current_cluster_cardinality); ++current_sketch_index; } From 208739e1161e03d69d5bea9eed758c9aec632d96 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 25 Jul 2025 16:45:50 +0200 Subject: [PATCH 16/18] small stuff --- 
src/layout/execute.cpp | 56 +++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index cc1a6ebf..e4931196 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -228,7 +228,6 @@ void post_process_clusters(std::vector & clusters, // since post processing involves re-ordering the clusters, the moved_to_cluster_id value of a cluster will not // refer to the position of the cluster in the `clusters` vecto anymore but the cluster with the resprive id() // would neet to be found - for (size_t pos = 0; pos < clusters.size(); ++pos) { assert(clusters[pos].is_valid(pos)); @@ -236,35 +235,36 @@ void post_process_clusters(std::vector & clusters, } // push largest p clusters to the front - auto cluster_size_cmp = [&cardinalities](auto const & v1, auto const & v2) - { - if (v1.size() == v2.size() && !v2.empty()) // Note: If v2 is empty, so is v1. - return cardinalities[v1.contained_user_bins().front()] > cardinalities[v2.contained_user_bins().front()]; - - return v1.size() > v2.size(); - }; std::ranges::partial_sort(clusters, std::ranges::next(clusters.begin(), config.hibf_config.tmax, clusters.end()), - cluster_size_cmp); + [&cardinalities](auto const & v1, auto const & v2) + { + // Note: If v2 is empty, so is v1. + if (v1.size() == v2.size() && !v2.empty()) + return cardinalities[v1.contained_user_bins().front()] + > cardinalities[v2.contained_user_bins().front()]; + + return v1.size() > v2.size(); + }); // after filling up the partitions with the biggest clusters, sort the clusters by cardinality of the biggest ub // s.t. that euqally sizes ub are assigned after each other and the small stuff is added at last. // the largest ub is already at the start because of former sorting. 
- auto compare_cardinality_and_move_empty_clusters_to_the_end = [&cardinalities](auto const & v1, auto const & v2) - { - if (v1.empty()) - return false; // v1 can never be larger than v2 then - - if (v2.empty()) // and v1 is not, since the first if would catch - return true; - - return cardinalities[v1.contained_user_bins().front()] > cardinalities[v2.contained_user_bins().front()]; - }; std::ranges::sort(std::ranges::next(clusters.begin(), config.hibf_config.tmax, clusters.end()), clusters.end(), - compare_cardinality_and_move_empty_clusters_to_the_end); + [&cardinalities](auto const & v1, auto const & v2) + { + if (v1.empty()) + return false; // v1 can never be larger than v2 then + + if (v2.empty()) // and v1 is not, since the first if would catch + return true; + + return cardinalities[v1.contained_user_bins().front()] + > cardinalities[v2.contained_user_bins().front()]; + }); - assert(clusters[0].size() >= clusters[1].size()); // sanity check + assert(clusters.size() < 2 || clusters[0].size() >= clusters[1].size()); // sanity check // assert(cardinalities[clusters[std::min(clusters.size(), config.hibf_config.tmax)].contained_user_bins()[0]] >= cardinalities[clusters[std::min(clusters.size(), config.hibf_config.tmax) + 1].contained_user_bins()[0]]); // sanity check // debug @@ -320,10 +320,10 @@ bool find_best_partition(chopper::configuration const & config, size_t best_p{0}; bool best_p_found{false}; - auto penalty_lower_level = [&](size_t const additional_number_of_user_bins, size_t const p) + auto penalty_lower_level = [&](size_t const additional_number_of_user_bins, size_t const p) -> size_t { - size_t min = std::min(max_card, min_partition_cardinality[p]); - size_t max = std::max(max_card, max_partition_cardinality[p]); + size_t const min = std::min(max_card, min_partition_cardinality[p]); + size_t const max = std::max(max_card, max_partition_cardinality[p]); if (positions[p].size() > config.hibf_config.tmax) // already a third level { @@ -357,7 +357,7 @@ 
bool find_best_partition(chopper::configuration const & config, // else, end if-else-block and zero is returned } - return (size_t)0u; + return 0u; }; for (size_t p = 0; p < number_of_partitions; ++p) @@ -402,8 +402,7 @@ bool find_best_partition(chopper::configuration const & config, min_partition_cardinality[best_p] = std::min(min_partition_cardinality[best_p], cardinalities[user_bin_idx]); } partition_sketches[best_p].merge(current_sketch); - corrected_estimate_per_part = - std::max(corrected_estimate_per_part, static_cast(partition_sketches[best_p].estimate())); + corrected_estimate_per_part = std::max(corrected_estimate_per_part, partition_sketches[best_p].estimate()); return true; } @@ -420,8 +419,9 @@ bool find_best_partition(chopper::configuration const & config, for (size_t idx = 0; idx < end_idx; ++idx) { size_t const ub_idx{sorted_positions[idx]}; + // Question: divide and ceil? size_t const number_of_split_tbs = - std::max((size_t)1, config.hibf_config.tmax * cardinalities[ub_idx] / sum_of_cardinalities); + std::max(size_t{1}, config.hibf_config.tmax * cardinalities[ub_idx] / sum_of_cardinalities); std::cout << "number_of_split_tbs: " << number_of_split_tbs << " cardinalities[ub_idx]:" << cardinalities[ub_idx] << " sum_of_cardinalities:" << sum_of_cardinalities << std::endl; From df35411a5fc49284d10d0d39ad80f91dbe4f7848 Mon Sep 17 00:00:00 2001 From: "seqan-actions[bot]" Date: Fri, 25 Jul 2025 20:16:26 +0200 Subject: [MISC] automatic linting --- src/layout/determine_split_bins.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layout/determine_split_bins.cpp b/src/layout/determine_split_bins.cpp index 013975b4..fb4ba653 100644 --- a/src/layout/determine_split_bins.cpp +++ b/src/layout/determine_split_bins.cpp @@ -150,4 +150,4 @@ std::pair determine_split_bins(chopper::configuration const & co return {number_of_split_tbs, max_size}; } -} // namespace chopper::fast_layout +} // namespace chopper::layout From 
0d03e6e80bd4f4ce82eaa8b059590de2bf7d4f7c Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 25 Jul 2025 20:20:08 +0200 Subject: [PATCH 18/18] fix multi-line-comment --- src/layout/hibf_statistics.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/layout/hibf_statistics.cpp b/src/layout/hibf_statistics.cpp index fdf2bb7d..d764b394 100644 --- a/src/layout/hibf_statistics.cpp +++ b/src/layout/hibf_statistics.cpp @@ -397,8 +397,10 @@ void hibf_statistics::collect_bins() # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Warray-bounds=" # pragma GCC diagnostic ignored "-Wstringop-overflow=" -#endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV \ - // ibf.bins.emplace_back(hibf_statistics::bin_kind::merged, 1, std::vector{user_bin_info.idx}); +#endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV + + // ibf.bins.emplace_back(hibf_statistics::bin_kind::merged, 1, std::vector{user_bin_info.idx}); + #if CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV # pragma GCC diagnostic pop #endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV