From 43123a290a73a2bf8674d22e611ad1d068a87608 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Fri, 19 Jun 2026 14:00:59 +0200
Subject: [PATCH 01/17] feat(cubestore): dict-encoded string group keys for
 inline aggregate (wip, behind flag)

Behind CUBESTORE_DICTIONARY_ENCODING (dev default currently ON - flip to false
before merge). Adds DictionaryComparator + DictionaryGroupColumn to the sorted
inline aggregate and routes dict schemas there via supported_type; String columns
flow as Dictionary(Int32,Utf8) end-to-end (metastore as_arrow_field + CubeTable /
CubeTableLogical schema). Measured on stend: inline/sorted ~10% faster, but
hash/Linear ~60% slower (read-side cast_to_dictionary + df row fallback).

WIP / not yet handled (only matters when flag ON): in-memory chunks stay Utf8
(schema mismatch), dict->string result serialization for non-aggregate SELECT,
native dict parquet read to avoid the read cast, and hash-aggregate dict-awareness.
---
 rust/cubestore/cubestore/src/config/mod.rs    |  10 +
 rust/cubestore/cubestore/src/metastore/mod.rs |  52 ++--
 .../inline_aggregate/column_comparator.rs     | 159 ++++++++++++
 .../dictionary_group_column.rs                | 139 +++++++++++
 .../src/queryplanner/inline_aggregate/mod.rs  |  66 +++--
 .../inline_aggregate/sorted_group_values.rs   | 233 +++++++++++++++++-
 .../cubestore/src/queryplanner/mod.rs         |   9 +-
 .../cubestore/src/queryplanner/planning.rs    |   6 +-
 .../src/queryplanner/query_executor.rs        |   3 +-
 9 files changed, 629 insertions(+), 48 deletions(-)
 create mode 100644 rust/cubestore/cubestore/src/queryplanner/inline_aggregate/dictionary_group_column.rs

diff --git a/rust/cubestore/cubestore/src/config/mod.rs b/rust/cubestore/cubestore/src/config/mod.rs
index 5cba4c1222959..602aa34c14294 100644
--- a/rust/cubestore/cubestore/src/config/mod.rs
+++ b/rust/cubestore/cubestore/src/config/mod.rs
@@ -585,6 +585,10 @@ pub trait ConfigObj: DIService {
     /// streaming split never drops rows (the first child is the low catch-all, the last
     /// the high one), and the legacy per-chunk path performed no such metadata check.
     fn repartition_check_overlapping_children(&self) -> bool;
+    /// Master gate for string dictionary-encoding work: reading string columns as
+    /// `DictionaryArray` and the dictionary-aware inline aggregate paths. Off by default while
+    /// the feature is built up incrementally behind this flag.
+    fn dictionary_encoding_enabled(&self) -> bool;
 
     fn allow_decimal128(&self) -> bool;
 
@@ -745,6 +749,7 @@ pub struct ConfigObjImpl {
     pub repartition_merge_max_input_files: usize,
     pub repartition_merge_max_rows: u64,
     pub repartition_check_overlapping_children: bool,
+    pub dictionary_encoding_enabled: bool,
     pub allow_decimal128: bool,
     pub enable_remove_orphaned_remote_files: bool,
     pub enable_startup_warmup: bool,
@@ -1085,6 +1090,8 @@ impl ConfigObj for ConfigObjImpl {
     }
     fn repartition_check_overlapping_children(&self) -> bool {
         self.repartition_check_overlapping_children
+    fn dictionary_encoding_enabled(&self) -> bool {
+        self.dictionary_encoding_enabled
     }
 
     fn allow_decimal128(&self) -> bool {
@@ -1783,6 +1790,8 @@ impl Config {
                     "CUBESTORE_REPARTITION_CHECK_OVERLAPPING_CHILDREN",
                     false,
                 ),
+                // TODO: dev default; flip back to false before merge.
+                dictionary_encoding_enabled: env_bool("CUBESTORE_DICTIONARY_ENCODING", true),
                 allow_decimal128: env_bool("CUBESTORE_ALLOW_DECIMAL128", false),
                 enable_remove_orphaned_remote_files: env_bool(
                     "CUBESTORE_ENABLE_REMOVE_ORPHANED_REMOTE_FILES",
@@ -2039,6 +2048,7 @@ impl Config {
                 repartition_merge_max_input_files: 50,
                 repartition_merge_max_rows: 4_000_000,
                 repartition_check_overlapping_children: false,
+                dictionary_encoding_enabled: false,
                 allow_decimal128: false,
                 enable_remove_orphaned_remote_files: false,
                 enable_startup_warmup: true,
diff --git a/rust/cubestore/cubestore/src/metastore/mod.rs b/rust/cubestore/cubestore/src/metastore/mod.rs
index 5d6fadd650638..fb0aa7be43702 100644
--- a/rust/cubestore/cubestore/src/metastore/mod.rs
+++ b/rust/cubestore/cubestore/src/metastore/mod.rs
@@ -558,28 +558,40 @@ impl Into<Field> for Column {
     }
 }
 
+impl Column {
+    /// Arrow field for this column. When `dictionary_encoding` is set, `String` columns are
+    /// exposed as `Dictionary(Int32, Utf8)` so they flow dictionary-encoded through the plan;
+    /// otherwise they are plain `Utf8`. All other types are unaffected.
+    pub fn as_arrow_field(&self, dictionary_encoding: bool) -> Field {
+        let data_type = match self.column_type {
+            ColumnType::String => {
+                if dictionary_encoding {
+                    DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8))
+                } else {
+                    DataType::Utf8
+                }
+            }
+            ColumnType::Int => DataType::Int64,
+            ColumnType::Int96 => DataType::Decimal128(38, 0),
+            ColumnType::Timestamp => DataType::Timestamp(Microsecond, None),
+            ColumnType::Boolean => DataType::Boolean,
+            ColumnType::Decimal { scale, precision } => {
+                DataType::Decimal128(precision as u8, scale as i8)
+            }
+            ColumnType::Decimal96 { scale, precision } => {
+                DataType::Decimal128(precision as u8, scale as i8)
+            }
+            ColumnType::Bytes => DataType::Binary,
+            ColumnType::HyperLogLog(_) => DataType::Binary,
+            ColumnType::Float => DataType::Float64,
+        };
+        Field::new(self.name.as_str(), data_type, true)
+    }
+}
+
 impl<'a> Into<Field> for &'a Column {
     fn into(self) -> Field {
-        Field::new(
-            self.name.as_str(),
-            match self.column_type {
-                ColumnType::String => DataType::Utf8,
-                ColumnType::Int => DataType::Int64,
-                ColumnType::Int96 => DataType::Decimal128(38, 0),
-                ColumnType::Timestamp => DataType::Timestamp(Microsecond, None),
-                ColumnType::Boolean => DataType::Boolean,
-                ColumnType::Decimal { scale, precision } => {
-                    DataType::Decimal128(precision as u8, scale as i8)
-                }
-                ColumnType::Decimal96 { scale, precision } => {
-                    DataType::Decimal128(precision as u8, scale as i8)
-                }
-                ColumnType::Bytes => DataType::Binary,
-                ColumnType::HyperLogLog(_) => DataType::Binary,
-                ColumnType::Float => DataType::Float64,
-            },
-            true,
-        )
+        self.as_arrow_field(false)
     }
 }
 
diff --git a/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/column_comparator.rs b/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/column_comparator.rs
index e085381ed2736..691f75b5a1816 100644
--- a/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/column_comparator.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/column_comparator.rs
@@ -1,5 +1,7 @@
 use datafusion::arrow::array::*;
+use datafusion::arrow::compute::SortOptions;
 use datafusion::arrow::datatypes::*;
+use std::cmp::Ordering;
 use std::marker::PhantomData;
 
 /// Trait for comparing adjacent rows in an array to detect group boundaries.
@@ -193,6 +195,82 @@ where
     }
 }
 
+/// Comparator for dictionary-encoded columns (e.g. `Dictionary(Int32, Utf8)`).
+///
+/// The hot path compares dictionary keys (small integers) instead of the underlying
+/// values. Within a single batch all rows share one dictionary, so key equality implies
+/// value equality. The reverse does not hold when a dictionary carries duplicate values
+/// (e.g. after a merge unions several local dictionaries), so when adjacent keys differ we
+/// fall back to comparing the actual values to avoid splitting a group incorrectly. That
+/// fallback only fires on group boundaries, which are rare in a sorted stream.
+pub struct DictionaryComparator<K: ArrowDictionaryKeyType, const NULLABLE: bool> {
+    _phantom: PhantomData<fn() -> K>,
+}
+
+impl<K: ArrowDictionaryKeyType, const NULLABLE: bool> DictionaryComparator<K, NULLABLE> {
+    pub fn new() -> Self {
+        Self {
+            _phantom: PhantomData,
+        }
+    }
+}
+
+impl<K: ArrowDictionaryKeyType, const NULLABLE: bool> ColumnComparator
+    for DictionaryComparator<K, NULLABLE>
+{
+    #[inline]
+    fn compare_adjacent(&self, col: &ArrayRef, equal_results: &mut [bool]) {
+        let array = col
+            .as_any()
+            .downcast_ref::<DictionaryArray<K>>()
+            .expect("DictionaryComparator got non-dictionary array");
+        let keys = array.keys();
+        let values = array.values();
+
+        if !NULLABLE {
+            // A non-nullable field must not carry null keys; the loop below skips null checks.
+            debug_assert_eq!(
+                keys.null_count(),
+                0,
+                "DictionaryComparator<_, false> received null keys"
+            );
+        }
+
+        // Built lazily, only when adjacent keys actually differ. The values array is always one
+        // of the types accepted by `new_dictionary_group_column`, all of which `make_comparator`
+        // supports.
+        let mut value_cmp: Option<DynComparator> = None;
+
+        for i in 0..equal_results.len() {
+            if !equal_results[i] {
+                continue;
+            }
+
+            if NULLABLE {
+                let null1 = keys.is_null(i);
+                let null2 = keys.is_null(i + 1);
+                if null1 || null2 {
+                    // Both null => same group; one null => boundary.
+                    equal_results[i] = null1 && null2;
+                    continue;
+                }
+            }
+
+            let k1 = keys.value(i).as_usize();
+            let k2 = keys.value(i + 1).as_usize();
+            if k1 == k2 {
+                continue;
+            }
+
+            let cmp = value_cmp.get_or_insert_with(|| {
+                make_comparator(values.as_ref(), values.as_ref(), SortOptions::default())
+                    .expect("make_comparator for dictionary values")
+            });
+            equal_results[i] = cmp(k1, k2) == Ordering::Equal;
+        }
+    }
+}
+
 /// Instantiate a primitive comparator and push it into the vector.
 ///
 /// Handles const generic NULLABLE parameter based on field nullability.
@@ -260,3 +338,84 @@ macro_rules! instantiate_byte_view_comparator {
         }
     };
 }
+
+/// Instantiate a dictionary comparator and push it into the vector.
+#[macro_export]
+macro_rules! instantiate_dictionary_comparator {
+    ($v:expr, $nullable:expr, $k:ty) => {
+        if $nullable {
+            $v.push(Box::new(
+                $crate::queryplanner::inline_aggregate::column_comparator::DictionaryComparator::<
+                    $k,
+                    true,
+                >::new(),
+            ) as _)
+        } else {
+            $v.push(Box::new(
+                $crate::queryplanner::inline_aggregate::column_comparator::DictionaryComparator::<
+                    $k,
+                    false,
+                >::new(),
+            ) as _)
+        }
+    };
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::Arc;
+
+    fn run(comparator: &dyn ColumnComparator, col: &ArrayRef) -> Vec<bool> {
+        let n = col.len();
+        let mut eq = vec![true; n - 1];
+        comparator.compare_adjacent(col, &mut eq);
+        eq
+    }
+
+    #[test]
+    fn dict_compare_same_dictionary_sorted() {
+        // values [a,b,c], keys [0,0,1,1,2] => rows a,a,b,b,c
+        let dict: DictionaryArray<Int32Type> = vec!["a", "a", "b", "b", "c"].into_iter().collect();
+        let col: ArrayRef = Arc::new(dict);
+        let cmp = DictionaryComparator::<Int32Type, false>::new();
+        assert_eq!(run(&cmp, &col), vec![true, false, true, false]);
+    }
+
+    #[test]
+    fn dict_compare_duplicate_values_fallback() {
+        // Dictionary with duplicate values: keys 0 and 1 both map to "a".
+        // Adjacent keys differ but values are equal -> must NOT be a boundary.
+        let keys = Int32Array::from(vec![0, 1, 2]);
+        let values = Arc::new(StringArray::from(vec!["a", "a", "b"]));
+        let dict = DictionaryArray::<Int32Type>::new(keys, values);
+        let col: ArrayRef = Arc::new(dict);
+        let cmp = DictionaryComparator::<Int32Type, false>::new();
+        // rows: a, a, b => (a,a) equal via fallback, (a,b) boundary
+        assert_eq!(run(&cmp, &col), vec![true, false]);
+    }
+
+    #[test]
+    fn dict_compare_nulls() {
+        // rows: null, null, "a", "a", null
+        let dict: DictionaryArray<Int32Type> = vec![None, None, Some("a"), Some("a"), None]
+            .into_iter()
+            .collect();
+        let col: ArrayRef = Arc::new(dict);
+        let cmp = DictionaryComparator::<Int32Type, true>::new();
+        // (null,null) equal, (null,a) boundary, (a,a) equal, (a,null) boundary
+        assert_eq!(run(&cmp, &col), vec![true, false, true, false]);
+    }
+
+    #[test]
+    fn dict_compare_respects_short_circuit() {
+        // values [a,b], keys [0,0,1]; pre-mark first pair as already-false.
+        let dict: DictionaryArray<Int32Type> = vec!["a", "a", "b"].into_iter().collect();
+        let col: ArrayRef = Arc::new(dict);
+        let cmp = DictionaryComparator::<Int32Type, false>::new();
+        let mut eq = vec![false, true];
+        cmp.compare_adjacent(&col, &mut eq);
+        // first stays false (short-circuit), second is a real boundary a->b
+        assert_eq!(eq, vec![false, false]);
+    }
+}
diff --git a/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/dictionary_group_column.rs b/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/dictionary_group_column.rs
new file mode 100644
index 0000000000000..56857846b7cd4
--- /dev/null
+++ b/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/dictionary_group_column.rs
@@ -0,0 +1,139 @@
+use std::marker::PhantomData;
+
+use datafusion::arrow::array::{new_null_array, Array, ArrayRef, DictionaryArray};
+use datafusion::arrow::datatypes::{
+    ArrowDictionaryKeyType, ArrowNativeType, DataType, Int16Type, Int32Type, Int64Type, Int8Type,
+    UInt16Type, UInt32Type, UInt64Type, UInt8Type,
+};
+use datafusion::dfschema::not_impl_err;
+use datafusion::error::Result as DFResult;
+use datafusion::physical_expr::binary_map::OutputType;
+use datafusion::physical_plan::aggregates::group_values::multi_group_by::{
+    ByteGroupValueBuilder, GroupColumn,
+};
+
+/// A [`GroupColumn`] for dictionary-encoded columns that stores the group values in their
+/// decoded form (delegating to an inner byte-array builder) while accepting dictionary input.
+///
+/// Group storage operations (`append_val`/`equal_to`) only happen on group boundaries, so they
+/// resolve the dictionary value on demand: a non-null row delegates to the inner builder using
+/// `(dict.values(), dict.key(row))`, and a null row delegates against a cached single-null array.
+/// The per-row hot path stays in `DictionaryComparator`, which never touches this builder.
+pub struct DictionaryGroupColumn<K: ArrowDictionaryKeyType> {
+    inner: Box<dyn GroupColumn>,
+    /// One-element null array of the dictionary's value type, used to append/compare null keys.
+    null_row: ArrayRef,
+    _k: PhantomData<fn() -> K>,
+}
+
+impl<K: ArrowDictionaryKeyType> DictionaryGroupColumn<K> {
+    fn new(inner: Box<dyn GroupColumn>, null_row: ArrayRef) -> Self {
+        Self {
+            inner,
+            null_row,
+            _k: PhantomData,
+        }
+    }
+
+    #[inline]
+    fn dict(column: &ArrayRef) -> &DictionaryArray<K> {
+        column
+            .as_any()
+            .downcast_ref::<DictionaryArray<K>>()
+            .expect("DictionaryGroupColumn got non-dictionary array")
+    }
+}
+
+impl<K: ArrowDictionaryKeyType> GroupColumn for DictionaryGroupColumn<K> {
+    fn equal_to(&self, lhs_row: usize, column: &ArrayRef, rhs_row: usize) -> bool {
+        let dict = Self::dict(column);
+        if dict.is_null(rhs_row) {
+            self.inner.equal_to(lhs_row, &self.null_row, 0)
+        } else {
+            let key = dict.keys().value(rhs_row).as_usize();
+            self.inner.equal_to(lhs_row, dict.values(), key)
+        }
+    }
+
+    fn append_val(&mut self, column: &ArrayRef, row: usize) {
+        let dict = Self::dict(column);
+        if dict.is_null(row) {
+            self.inner.append_val(&self.null_row, 0);
+        } else {
+            let key = dict.keys().value(row).as_usize();
+            self.inner.append_val(dict.values(), key);
+        }
+    }
+
+    fn vectorized_equal_to(
+        &self,
+        lhs_rows: &[usize],
+        array: &ArrayRef,
+        rhs_rows: &[usize],
+        equal_to_results: &mut [bool],
+    ) {
+        for i in 0..lhs_rows.len() {
+            if equal_to_results[i] {
+                equal_to_results[i] = self.equal_to(lhs_rows[i], array, rhs_rows[i]);
+            }
+        }
+    }
+
+    fn vectorized_append(&mut self, array: &ArrayRef, rows: &[usize]) {
+        for &row in rows {
+            self.append_val(array, row);
+        }
+    }
+
+    fn len(&self) -> usize {
+        self.inner.len()
+    }
+
+    fn size(&self) -> usize {
+        self.inner.size() + self.null_row.get_array_memory_size()
+    }
+
+    fn build(self: Box<Self>) -> ArrayRef {
+        (*self).inner.build()
+    }
+
+    fn take_n(&mut self, n: usize) -> ArrayRef {
+        self.inner.take_n(n)
+    }
+}
+
+/// Builds a [`DictionaryGroupColumn`] for the given dictionary key/value types.
+///
+/// The inner builder stores the decoded value (Utf8/Binary); the wrapper is generic over the
+/// key type so it can read keys without decoding the whole batch.
+pub fn new_dictionary_group_column(
+    key_type: &DataType,
+    value_type: &DataType,
+) -> DFResult<Box<dyn GroupColumn>> {
+    let inner: Box<dyn GroupColumn> = match value_type {
+        DataType::Utf8 => Box::new(ByteGroupValueBuilder::<i32>::new(OutputType::Utf8)),
+        DataType::LargeUtf8 => Box::new(ByteGroupValueBuilder::<i64>::new(OutputType::Utf8)),
+        DataType::Binary => Box::new(ByteGroupValueBuilder::<i32>::new(OutputType::Binary)),
+        DataType::LargeBinary => Box::new(ByteGroupValueBuilder::<i64>::new(OutputType::Binary)),
+        other => {
+            return not_impl_err!(
+                "dictionary value type {other} not supported in SortedGroupValues"
+            )
+        }
+    };
+    let null_row = new_null_array(value_type, 1);
+
+    Ok(match key_type {
+        DataType::Int8 => Box::new(DictionaryGroupColumn::<Int8Type>::new(inner, null_row)),
+        DataType::Int16 => Box::new(DictionaryGroupColumn::<Int16Type>::new(inner, null_row)),
+        DataType::Int32 => Box::new(DictionaryGroupColumn::<Int32Type>::new(inner, null_row)),
+        DataType::Int64 => Box::new(DictionaryGroupColumn::<Int64Type>::new(inner, null_row)),
+        DataType::UInt8 => Box::new(DictionaryGroupColumn::<UInt8Type>::new(inner, null_row)),
+        DataType::UInt16 => Box::new(DictionaryGroupColumn::<UInt16Type>::new(inner, null_row)),
+        DataType::UInt32 => Box::new(DictionaryGroupColumn::<UInt32Type>::new(inner, null_row)),
+        DataType::UInt64 => Box::new(DictionaryGroupColumn::<UInt64Type>::new(inner, null_row)),
+        other => {
+            return not_impl_err!("dictionary key type {other} not supported in SortedGroupValues")
+        }
+    })
+}
diff --git a/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/mod.rs b/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/mod.rs
index 8a58d1a8c0dba..3ed078cb3683c 100644
--- a/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/mod.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/mod.rs
@@ -1,4 +1,5 @@
 mod column_comparator;
+mod dictionary_group_column;
 mod inline_aggregate_stream;
 mod sorted_group_values;
 mod sorted_group_values_rows;
@@ -279,31 +280,48 @@ fn supported_schema(schema: &datafusion::arrow::datatypes::Schema) -> bool {
 ///
 /// Types not in this list will use the row-based [`SortedGroupValuesRows`] implementation
 fn supported_type(data_type: &DataType) -> bool {
-    matches!(
-        *data_type,
+    match data_type {
         DataType::Int8
-            | DataType::Int16
-            | DataType::Int32
-            | DataType::Int64
-            | DataType::UInt8
-            | DataType::UInt16
-            | DataType::UInt32
-            | DataType::UInt64
-            | DataType::Float32
-            | DataType::Float64
-            | DataType::Decimal128(_, _)
-            | DataType::Utf8
-            | DataType::LargeUtf8
-            | DataType::Binary
-            | DataType::LargeBinary
-            | DataType::Date32
-            | DataType::Date64
-            | DataType::Time32(_)
-            | DataType::Time64(_)
-            | DataType::Timestamp(_, _)
-            | DataType::Utf8View
-            | DataType::BinaryView
-    )
+        | DataType::Int16
+        | DataType::Int32
+        | DataType::Int64
+        | DataType::UInt8
+        | DataType::UInt16
+        | DataType::UInt32
+        | DataType::UInt64
+        | DataType::Float32
+        | DataType::Float64
+        | DataType::Decimal128(_, _)
+        | DataType::Utf8
+        | DataType::LargeUtf8
+        | DataType::Binary
+        | DataType::LargeBinary
+        | DataType::Date32
+        | DataType::Date64
+        | DataType::Time32(_)
+        | DataType::Time64(_)
+        | DataType::Timestamp(_, _)
+        | DataType::Utf8View
+        | DataType::BinaryView => true,
+        // Dictionary group columns handled by DictionaryGroupColumn + DictionaryComparator.
+        DataType::Dictionary(key_type, value_type) => {
+            matches!(
+                key_type.as_ref(),
+                DataType::Int8
+                    | DataType::Int16
+                    | DataType::Int32
+                    | DataType::Int64
+                    | DataType::UInt8
+                    | DataType::UInt16
+                    | DataType::UInt32
+                    | DataType::UInt64
+            ) && matches!(
+                value_type.as_ref(),
+                DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary
+            )
+        }
+        _ => false,
+    }
 }
 
 #[cfg(test)]
diff --git a/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/sorted_group_values.rs b/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/sorted_group_values.rs
index e7c0e82b2f7cb..08c49a93a7fd6 100644
--- a/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/sorted_group_values.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/sorted_group_values.rs
@@ -22,9 +22,10 @@ use datafusion::physical_plan::aggregates::group_values::multi_group_by::{
 use datafusion::physical_plan::aggregates::group_values::GroupValues;
 
 use crate::queryplanner::inline_aggregate::column_comparator::ColumnComparator;
+use crate::queryplanner::inline_aggregate::dictionary_group_column::new_dictionary_group_column;
 use crate::{
     instantiate_byte_array_comparator, instantiate_byte_view_comparator,
-    instantiate_primitive_comparator,
+    instantiate_dictionary_comparator, instantiate_primitive_comparator,
 };
 
 pub struct SortedGroupValues {
@@ -319,6 +320,52 @@ impl GroupValues for SortedGroupValues {
                         v.push(Box::new(b) as _);
                         instantiate_byte_view_comparator!(comparators, nullable, BinaryViewType);
                     }
+                    &DataType::Dictionary(ref key_type, ref value_type) => {
+                        v.push(new_dictionary_group_column(key_type, value_type)?);
+                        match key_type.as_ref() {
+                            DataType::Int8 => {
+                                instantiate_dictionary_comparator!(comparators, nullable, Int8Type)
+                            }
+                            DataType::Int16 => {
+                                instantiate_dictionary_comparator!(comparators, nullable, Int16Type)
+                            }
+                            DataType::Int32 => {
+                                instantiate_dictionary_comparator!(comparators, nullable, Int32Type)
+                            }
+                            DataType::Int64 => {
+                                instantiate_dictionary_comparator!(comparators, nullable, Int64Type)
+                            }
+                            DataType::UInt8 => {
+                                instantiate_dictionary_comparator!(comparators, nullable, UInt8Type)
+                            }
+                            DataType::UInt16 => {
+                                instantiate_dictionary_comparator!(
+                                    comparators,
+                                    nullable,
+                                    UInt16Type
+                                )
+                            }
+                            DataType::UInt32 => {
+                                instantiate_dictionary_comparator!(
+                                    comparators,
+                                    nullable,
+                                    UInt32Type
+                                )
+                            }
+                            DataType::UInt64 => {
+                                instantiate_dictionary_comparator!(
+                                    comparators,
+                                    nullable,
+                                    UInt64Type
+                                )
+                            }
+                            dt => {
+                                return not_impl_err!(
+                                    "dictionary key type {dt} not supported in SortedGroupValues"
+                                )
+                            }
+                        }
+                    }
                     dt => return not_impl_err!("{dt} not supported in SortedGroupValues"),
                 }
             }
@@ -390,3 +437,187 @@ impl GroupValues for SortedGroupValues {
         self.equal_to_results.clear();
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use datafusion::arrow::array::{Int32Array, StringArray};
+    use datafusion::arrow::datatypes::{Field, Schema};
+    use std::sync::Arc;
+
+    fn dict_schema(nullable: bool) -> SchemaRef {
+        Arc::new(Schema::new(vec![Field::new(
+            "g",
+            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+            nullable,
+        )]))
+    }
+
+    fn decode(dict: &ArrayRef) -> Vec<Option<String>> {
+        let dict = dict
+            .as_any()
+            .downcast_ref::<datafusion::arrow::array::DictionaryArray<Int32Type>>()
+            .unwrap();
+        let values = dict
+            .values()
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+        (0..dict.len())
+            .map(|i| {
+                if dict.is_null(i) {
+                    None
+                } else {
+                    Some(values.value(dict.keys().value(i) as usize).to_string())
+                }
+            })
+            .collect()
+    }
+
+    /// Groups must continue across a batch boundary even when the two batches carry different
+    /// local dictionaries (the same string is encoded with different keys per batch).
+    #[test]
+    fn sorted_group_values_dictionary_cross_batch() {
+        let mut gv = SortedGroupValues::try_new(dict_schema(false)).unwrap();
+
+        // Batch 1: a, a, b  (values [a,b], keys [0,0,1])
+        let b1 = datafusion::arrow::array::DictionaryArray::<Int32Type>::new(
+            Int32Array::from(vec![0, 0, 1]),
+            Arc::new(StringArray::from(vec!["a", "b"])),
+        );
+        let mut groups = vec![];
+        gv.intern(&[Arc::new(b1) as ArrayRef], &mut groups).unwrap();
+        assert_eq!(groups, vec![0, 0, 1]);
+
+        // Batch 2: b, c  with a DIFFERENT local dictionary (values [b,c], keys [0,1]).
+        let b2 = datafusion::arrow::array::DictionaryArray::<Int32Type>::new(
+            Int32Array::from(vec![0, 1]),
+            Arc::new(StringArray::from(vec!["b", "c"])),
+        );
+        gv.intern(&[Arc::new(b2) as ArrayRef], &mut groups).unwrap();
+        // "b" continues the last group (idx 1), "c" opens group 2.
+        assert_eq!(groups, vec![1, 2]);
+
+        assert_eq!(gv.len(), 3);
+        let out = gv.emit(EmitTo::All).unwrap();
+        assert_eq!(
+            decode(&out[0]),
+            vec![
+                Some("a".to_string()),
+                Some("b".to_string()),
+                Some("c".to_string())
+            ]
+        );
+    }
+
+    /// Isolated timing: dictionary vs Utf8 group keys over a sorted 10-column stream.
+    /// Run with: cargo test -p cubestore --lib sorted_group_values_dict_vs_utf8_bench -- --ignored --nocapture
+    #[test]
+    #[ignore]
+    fn sorted_group_values_dict_vs_utf8_bench() {
+        use std::time::Instant;
+        const NCOLS: usize = 10;
+        const ROWS: usize = 2_000_000;
+        const BATCH: usize = 8192;
+        const ROWS_PER_GROUP: usize = 20; // ~100k groups, low per-column cardinality
+
+        // tuple value for column j of group g, c0 most significant -> stream is sorted ascending
+        let val = |g: usize, j: usize| -> String {
+            let digit = (g / 4usize.pow((NCOLS - 1 - j) as u32)) % 4;
+            format!("c{j}_{digit}")
+        };
+
+        // Build Utf8 batches and Dictionary batches for the same sorted data.
+        let mut utf8_batches: Vec<Vec<ArrayRef>> = vec![];
+        let mut dict_batches: Vec<Vec<ArrayRef>> = vec![];
+        let mut row = 0usize;
+        while row < ROWS {
+            let n = BATCH.min(ROWS - row);
+            let mut utf8_cols: Vec<ArrayRef> = Vec::with_capacity(NCOLS);
+            let mut dict_cols: Vec<ArrayRef> = Vec::with_capacity(NCOLS);
+            for j in 0..NCOLS {
+                let vals: Vec<String> =
+                    (0..n).map(|i| val((row + i) / ROWS_PER_GROUP, j)).collect();
+                let strs: Vec<&str> = vals.iter().map(|s| s.as_str()).collect();
+                utf8_cols.push(Arc::new(StringArray::from(strs.clone())) as ArrayRef);
+                let dict: datafusion::arrow::array::DictionaryArray<Int32Type> =
+                    strs.into_iter().collect();
+                dict_cols.push(Arc::new(dict) as ArrayRef);
+            }
+            utf8_batches.push(utf8_cols);
+            dict_batches.push(dict_cols);
+            row += n;
+        }
+
+        let utf8_schema = Arc::new(Schema::new(
+            (0..NCOLS)
+                .map(|j| Field::new(format!("c{j}"), DataType::Utf8, false))
+                .collect::<Vec<_>>(),
+        ));
+        let dict_schema = Arc::new(Schema::new(
+            (0..NCOLS)
+                .map(|j| {
+                    Field::new(
+                        format!("c{j}"),
+                        DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                        false,
+                    )
+                })
+                .collect::<Vec<_>>(),
+        ));
+
+        let run = |schema: SchemaRef, batches: &Vec<Vec<ArrayRef>>| -> (u128, usize) {
+            let mut gv = SortedGroupValues::try_new(schema).unwrap();
+            let mut groups = vec![];
+            let t0 = Instant::now();
+            for cols in batches {
+                gv.intern(cols, &mut groups).unwrap();
+            }
+            (t0.elapsed().as_micros(), gv.len())
+        };
+
+        // warm + measure (best of 3)
+        let mut utf8_us = u128::MAX;
+        let mut dict_us = u128::MAX;
+        let mut ngroups = 0;
+        for _ in 0..3 {
+            let (u, gu) = run(utf8_schema.clone(), &utf8_batches);
+            let (d, gd) = run(dict_schema.clone(), &dict_batches);
+            assert_eq!(gu, gd, "group counts must match");
+            ngroups = gu;
+            utf8_us = utf8_us.min(u);
+            dict_us = dict_us.min(d);
+        }
+        println!(
+            "intern over {ROWS} rows x {NCOLS} cols, {ngroups} groups:\n  Utf8: {:.1} ms\n  Dict: {:.1} ms\n  speedup: {:.2}x",
+            utf8_us as f64 / 1000.0,
+            dict_us as f64 / 1000.0,
+            utf8_us as f64 / dict_us as f64,
+        );
+    }
+
+    /// Null keys form their own group and continue across batches.
+    #[test]
+    fn sorted_group_values_dictionary_nulls() {
+        let mut gv = SortedGroupValues::try_new(dict_schema(true)).unwrap();
+
+        // rows: null, null, a
+        let b1: datafusion::arrow::array::DictionaryArray<Int32Type> =
+            vec![None, None, Some("a")].into_iter().collect();
+        let mut groups = vec![];
+        gv.intern(&[Arc::new(b1) as ArrayRef], &mut groups).unwrap();
+        assert_eq!(groups, vec![0, 0, 1]);
+
+        // rows: a, b -> "a" continues group 1, "b" new
+        let b2: datafusion::arrow::array::DictionaryArray<Int32Type> =
+            vec![Some("a"), Some("b")].into_iter().collect();
+        gv.intern(&[Arc::new(b2) as ArrayRef], &mut groups).unwrap();
+        assert_eq!(groups, vec![1, 2]);
+
+        let out = gv.emit(EmitTo::All).unwrap();
+        assert_eq!(
+            decode(&out[0]),
+            vec![None, Some("a".to_string()), Some("b".to_string())]
+        );
+    }
+}
diff --git a/rust/cubestore/cubestore/src/queryplanner/mod.rs b/rust/cubestore/cubestore/src/queryplanner/mod.rs
index c4d4742312e4d..86c032b4d28d6 100644
--- a/rust/cubestore/cubestore/src/queryplanner/mod.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/mod.rs
@@ -161,6 +161,7 @@ impl QueryPlanner for QueryPlannerImpl {
             inline_tables,
             self.cache.clone(),
             state.clone(),
+            self.config.dictionary_encoding_enabled(),
         );
 
         let query_planner = SqlToRel::new_with_options(&schema_provider, sql_to_rel_options());
@@ -231,6 +232,7 @@ impl QueryPlanner for QueryPlannerImpl {
                 logical_plan,
                 &self.meta_store.as_ref(),
                 self.config.enable_topk(),
+                self.config.dictionary_encoding_enabled(),
             )
             .await?;
             let workers = compute_workers(
@@ -372,6 +374,7 @@ struct MetaStoreSchemaProvider {
     inline_tables: InlineTables,
     cache: Arc<SqlResultCache>,
     config_options: ConfigOptions,
+    dictionary_encoding: bool,
     expr_planners: Vec<Arc<dyn ExprPlanner>>, // session_state.expr_planners clone
     session_state: Arc<SessionState>,
 }
@@ -408,6 +411,7 @@ impl MetaStoreSchemaProvider {
         inline_tables: &InlineTables,
         cache: Arc<SqlResultCache>,
         session_state: Arc<SessionState>,
+        dictionary_encoding: bool,
     ) -> Self {
         let by_name = tables.iter().map(|t| TableKey(t)).collect();
         Self {
@@ -418,6 +422,7 @@ impl MetaStoreSchemaProvider {
             cache,
             inline_tables: (*inline_tables).clone(),
             config_options: ConfigOptions::new(),
+            dictionary_encoding,
             expr_planners: datafusion::execution::FunctionRegistry::expr_planners(
                 session_state.as_ref(),
             ),
@@ -486,13 +491,14 @@ impl ContextProvider for MetaStoreSchemaProvider {
             .get(&TableKey(&table_path))
             .map(|table| -> Arc<dyn TableProvider> {
                 let table = unsafe { &*table.0 };
+                let dictionary_encoding = self.dictionary_encoding;
                 let schema = Arc::new(Schema::new(
                     table
                         .table
                         .get_row()
                         .get_columns()
                         .iter()
-                        .map(|c| c.clone().into())
+                        .map(|c| c.as_arrow_field(dictionary_encoding))
                         .collect::<Vec<Field>>(),
                 ));
                 Arc::new(CubeTableLogical {
@@ -1095,6 +1101,7 @@ pub mod tests {
             &vec![],
             Arc::new(SqlResultCache::new(1 << 20, None, 10000, None)),
             Arc::new(SessionContext::new().state()),
+            false,
         )
     }
 
diff --git a/rust/cubestore/cubestore/src/queryplanner/planning.rs b/rust/cubestore/cubestore/src/queryplanner/planning.rs
index 9c9cd7b352df6..e6786376fcd01 100644
--- a/rust/cubestore/cubestore/src/queryplanner/planning.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/planning.rs
@@ -85,7 +85,7 @@ pub async fn choose_index(
     p: LogicalPlan,
     metastore: &dyn PlanIndexStore,
 ) -> Result<(LogicalPlan, PlanningMeta), DataFusionError> {
-    choose_index_ext(p, metastore, true).await
+    choose_index_ext(p, metastore, true, false).await
 }
 
 /// Information required to distribute the logical plan into multiple workers.
@@ -123,6 +123,7 @@ pub async fn choose_index_ext(
     p: LogicalPlan,
     metastore: &dyn PlanIndexStore,
     enable_topk: bool,
+    dictionary_encoding: bool,
 ) -> Result<(LogicalPlan, PlanningMeta), DataFusionError> {
     // Prepare information to choose the index.
     let mut collector = CollectConstraints::default();
@@ -238,6 +239,7 @@ pub async fn choose_index_ext(
         chosen_indices: &indices,
         next_index: 0,
         enable_topk,
+        dictionary_encoding,
         can_pushdown_limit: true,
         cluster_send_next_id: 1,
     };
@@ -831,6 +833,7 @@ struct ChooseIndex<'a> {
     next_index: usize,
     chosen_indices: &'a [IndexSnapshot],
     enable_topk: bool,
+    dictionary_encoding: bool,
     can_pushdown_limit: bool,
     cluster_send_next_id: usize,
 }
@@ -1057,6 +1060,7 @@ impl ChooseIndex<'_> {
                             HashMap::new(),
                             Vec::new(),
                             NoopParquetMetadataCache::new(),
+                            self.dictionary_encoding,
                         )?)));
 
                         let index_schema = source.schema();
diff --git a/rust/cubestore/cubestore/src/queryplanner/query_executor.rs b/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
index 528f268f7048f..d2ba6954801b1 100644
--- a/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
@@ -679,6 +679,7 @@ impl CubeTable {
         remote_to_local_names: HashMap<String, String>,
         worker_partition_ids: Vec<(u64, RowFilter)>,
         parquet_metadata_cache: Arc<dyn ParquetFileReaderFactory>,
+        dictionary_encoding: bool,
     ) -> Result<Self, CubeError> {
         let schema = Arc::new(Schema::new(
             // Tables are always exposed only using table columns order instead of index one because
@@ -690,7 +691,7 @@ impl CubeTable {
                 .get_row()
                 .get_columns()
                 .iter()
-                .map(|c| c.clone().into())
+                .map(|c| c.as_arrow_field(dictionary_encoding))
                 .collect::<Vec<Field>>(),
         ));
         Ok(Self {

From d3aa25fad92724265b4adb710c1986bef354f0c9 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Fri, 19 Jun 2026 16:35:22 +0200
Subject: [PATCH 02/17] feat(cubestore): port group-by-limit partial aggregate
 (renamed from topk), wip behind flag

Brings the worker-side partial hash-aggregate trim for
`GROUP BY <non-index-prefix> ORDER BY <subset of group-by> LIMIT k` onto this
branch, renamed from the misleading "TopK" naming (it trims by group-key order,
not by an aggregate top-N):
- GroupByLimitAggregateExec + group_by_limit_aggregate module
- group_by_limit_rewriter
- config group_by_limit_factor / CUBESTORE_GROUP_BY_LIMIT_FACTOR (default 2)

Reuses DataFusion's GroupValues building blocks; the only fork change is
pub new_group_values (df branch cubestore-hash-aggregate-limit).

WIP: the trim is planted during router planning but is NOT yet carried across
the ClusterSend boundary to the worker's independent physical re-plan, so it
does not engage at distributed execution yet. Next: cluster-boundary carry.
---
 rust/cubestore/Cargo.lock                     |  52 ++--
 .../cubestore-sql-tests/src/tests.rs          | 143 +++++++++
 rust/cubestore/cubestore/Cargo.toml           |   8 +-
 rust/cubestore/cubestore/src/config/mod.rs    |  20 +-
 .../group_by_limit_aggregate_stream.rs        | 284 ++++++++++++++++++
 .../group_by_limit_aggregate/mod.rs           | 204 +++++++++++++
 .../cubestore/src/queryplanner/mod.rs         |   1 +
 .../optimizations/group_by_limit_rewriter.rs  | 251 ++++++++++++++++
 .../src/queryplanner/optimizations/mod.rs     |  18 +-
 .../src/queryplanner/pretty_printers.rs       |  11 +
 .../src/queryplanner/query_executor.rs        |   9 +-
 11 files changed, 960 insertions(+), 41 deletions(-)
 create mode 100644 rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
 create mode 100644 rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs
 create mode 100644 rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs

diff --git a/rust/cubestore/Cargo.lock b/rust/cubestore/Cargo.lock
index 8d36a7d26c016..c5785ec686690 100644
--- a/rust/cubestore/Cargo.lock
+++ b/rust/cubestore/Cargo.lock
@@ -1758,7 +1758,7 @@ checksum = "c2e66c9d817f1720209181c316d28635c050fa304f9c79e47a520882661b7308"
 [[package]]
 name = "datafusion"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "arrow-ipc",
@@ -1811,7 +1811,7 @@ dependencies = [
 [[package]]
 name = "datafusion-catalog"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1830,7 +1830,7 @@ dependencies = [
 [[package]]
 name = "datafusion-catalog-listing"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1851,7 +1851,7 @@ dependencies = [
 [[package]]
 name = "datafusion-common"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "ahash 0.8.11",
  "arrow",
@@ -1874,7 +1874,7 @@ dependencies = [
 [[package]]
 name = "datafusion-common-runtime"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "log",
  "tokio",
@@ -1883,7 +1883,7 @@ dependencies = [
 [[package]]
 name = "datafusion-datasource"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "async-compression 0.4.17",
@@ -1916,12 +1916,12 @@ dependencies = [
 [[package]]
 name = "datafusion-doc"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 
 [[package]]
 name = "datafusion-execution"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "dashmap",
@@ -1941,7 +1941,7 @@ dependencies = [
 [[package]]
 name = "datafusion-expr"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "chrono",
@@ -1961,7 +1961,7 @@ dependencies = [
 [[package]]
 name = "datafusion-expr-common"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "datafusion-common",
@@ -1973,7 +1973,7 @@ dependencies = [
 [[package]]
 name = "datafusion-functions"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "arrow-buffer",
@@ -2001,7 +2001,7 @@ dependencies = [
 [[package]]
 name = "datafusion-functions-aggregate"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "ahash 0.8.11",
  "arrow",
@@ -2021,7 +2021,7 @@ dependencies = [
 [[package]]
 name = "datafusion-functions-aggregate-common"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "ahash 0.8.11",
  "arrow",
@@ -2033,7 +2033,7 @@ dependencies = [
 [[package]]
 name = "datafusion-functions-nested"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "arrow-ord",
@@ -2053,7 +2053,7 @@ dependencies = [
 [[package]]
 name = "datafusion-functions-table"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "async-trait",
@@ -2068,7 +2068,7 @@ dependencies = [
 [[package]]
 name = "datafusion-functions-window"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "datafusion-common",
  "datafusion-doc",
@@ -2084,7 +2084,7 @@ dependencies = [
 [[package]]
 name = "datafusion-functions-window-common"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "datafusion-common",
  "datafusion-physical-expr-common",
@@ -2093,7 +2093,7 @@ dependencies = [
 [[package]]
 name = "datafusion-macros"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "datafusion-expr",
  "quote",
@@ -2103,7 +2103,7 @@ dependencies = [
 [[package]]
 name = "datafusion-optimizer"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "chrono",
@@ -2121,7 +2121,7 @@ dependencies = [
 [[package]]
 name = "datafusion-physical-expr"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "ahash 0.8.11",
  "arrow",
@@ -2142,7 +2142,7 @@ dependencies = [
 [[package]]
 name = "datafusion-physical-expr-common"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "ahash 0.8.11",
  "arrow",
@@ -2155,7 +2155,7 @@ dependencies = [
 [[package]]
 name = "datafusion-physical-optimizer"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "datafusion-common",
@@ -2173,7 +2173,7 @@ dependencies = [
 [[package]]
 name = "datafusion-physical-plan"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "ahash 0.8.11",
  "arrow",
@@ -2205,7 +2205,7 @@ dependencies = [
 [[package]]
 name = "datafusion-proto"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "chrono",
@@ -2220,7 +2220,7 @@ dependencies = [
 [[package]]
 name = "datafusion-proto-common"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "datafusion-common",
@@ -2230,7 +2230,7 @@ dependencies = [
 [[package]]
 name = "datafusion-sql"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "bigdecimal 0.4.8",
diff --git a/rust/cubestore/cubestore-sql-tests/src/tests.rs b/rust/cubestore/cubestore-sql-tests/src/tests.rs
index 68925496d1721..551c70234c839 100644
--- a/rust/cubestore/cubestore-sql-tests/src/tests.rs
+++ b/rust/cubestore/cubestore-sql-tests/src/tests.rs
@@ -154,6 +154,8 @@ pub fn sql_tests(prefix: &str) -> Vec<(&'static str, TestFn)> {
         t("planning_inplace_aggregate", planning_inplace_aggregate),
         t("planning_hints", planning_hints),
         t("planning_inplace_aggregate2", planning_inplace_aggregate2),
+        t("planning_topk_hash_aggregate", planning_topk_hash_aggregate),
+        t("topk_hash_aggregate_trim", topk_hash_aggregate_trim),
         t("topk_large_inputs", topk_large_inputs),
         t("partitioned_index", partitioned_index),
         t(
@@ -424,6 +426,7 @@ lazy_static::lazy_static! {
         "limit_pushdown_group_having",
         "limit_pushdown_group_nonprefix_order",
         "prefilter_chunks_shared_scan",
+        "planning_topk_hash_aggregate",
     ].into_iter().map(ToOwned::to_owned).collect();
 }
 
@@ -3200,6 +3203,146 @@ async fn planning_inplace_aggregate(service: Box<dyn SqlClient>) -> Result<(), C
     Ok(())
 }
 
+async fn planning_topk_hash_aggregate(service: Box<dyn SqlClient>) -> Result<(), CubeError> {
+    service.exec_query("CREATE SCHEMA s").await?;
+    service
+        .exec_query("CREATE TABLE s.Data(url text, day int, hits int)")
+        .await?;
+    service
+        .exec_query("CREATE TABLE s.D3(a int, b int, c int, h int)")
+        .await?;
+
+    // GROUP BY a non-indexed column -> hash (Linear) partial aggregate; ORDER BY the group
+    // column with a LIMIT -> the worker partial aggregate is replaced by GroupByLimitAggregate.
+    let p = service
+        .plan_query("SELECT day, SUM(hits) FROM s.Data GROUP BY 1 ORDER BY 1 LIMIT 10")
+        .await?;
+    let pp = pp_phys_plan_ext(p.worker.as_ref(), &PPOptions::none());
+    assert!(
+        pp.contains("GroupByLimitAggregate, k: 10, factor: 2,"),
+        "expected GroupByLimitAggregate on the worker, got:\n{}",
+        pp
+    );
+
+    // LIMIT + OFFSET -> k = limit + offset.
+    let p = service
+        .plan_query("SELECT day, SUM(hits) FROM s.Data GROUP BY 1 ORDER BY 1 LIMIT 10 OFFSET 5")
+        .await?;
+    let pp = pp_phys_plan_ext(p.worker.as_ref(), &PPOptions::none());
+    assert!(
+        pp.contains("GroupByLimitAggregate, k: 15, factor: 2,"),
+        "expected k=15 (limit+offset), got:\n{}",
+        pp
+    );
+
+    // ORDER BY an aggregate (not a group-by column) -> no trim.
+    let p = service
+        .plan_query("SELECT day, SUM(hits) FROM s.Data GROUP BY 1 ORDER BY 2 DESC LIMIT 10")
+        .await?;
+    let pp = pp_phys_plan_ext(p.worker.as_ref(), &PPOptions::none());
+    assert!(
+        !pp.contains("GroupByLimitAggregate"),
+        "did not expect GroupByLimitAggregate when ordering by an aggregate, got:\n{}",
+        pp
+    );
+
+    // No LIMIT -> no trim.
+    let p = service
+        .plan_query("SELECT day, SUM(hits) FROM s.Data GROUP BY 1 ORDER BY 1")
+        .await?;
+    let pp = pp_phys_plan_ext(p.worker.as_ref(), &PPOptions::none());
+    assert!(
+        !pp.contains("GroupByLimitAggregate"),
+        "did not expect GroupByLimitAggregate without a limit, got:\n{}",
+        pp
+    );
+
+    // ORDER BY a proper SUBSET of GROUP BY (b out of b, c). The worker cut and the router sort must
+    // both use the total order T = [b, c]: the worker trim order carries the tie-break column c, and
+    // the router's global Sort is extended with c so its top-k matches the global top-k by T.
+    let p = service
+        .plan_query("SELECT b, c, SUM(h) FROM s.D3 GROUP BY 1, 2 ORDER BY 1 LIMIT 3")
+        .await?;
+    let worker_pp = pp_phys_plan_ext(p.worker.as_ref(), &PPOptions::none());
+    assert!(
+        worker_pp.contains("GroupByLimitAggregate, k: 3, factor: 2,")
+            && worker_pp.contains("(0, SortOptions { descending: false, nulls_first: false })")
+            && worker_pp.contains("(1, SortOptions { descending: false, nulls_first: true })"),
+        "expected worker trim order [b, c] totalized, got:\n{}",
+        worker_pp
+    );
+    let router_pp = pp_phys_plan_ext(
+        p.router.as_ref(),
+        &PPOptions {
+            show_sort_by: true,
+            ..PPOptions::none()
+        },
+    );
+    assert!(
+        router_pp.contains("b@0") && router_pp.contains("c@1"),
+        "expected router Sort extended with the tie-break column c, got:\n{}",
+        router_pp
+    );
+
+    Ok(())
+}
+
+async fn topk_hash_aggregate_trim(service: Box<dyn SqlClient>) -> Result<(), CubeError> {
+    service.exec_query("CREATE SCHEMA s").await?;
+    service
+        .exec_query("CREATE TABLE s.Data(a int, b int, hits int)")
+        .await?;
+    // 12 distinct (a, b) groups, each with two rows so partial aggregation actually groups.
+    // With k=3 and factor=2 the trim activates (g=12 > 6) but the result must match a full
+    // top-k. ORDER BY a (a proper subset of GROUP BY a, b) exercises totalization: the worker
+    // breaks ties on a by b so the router still receives every needed partial state.
+    service
+        .exec_query(
+            "INSERT INTO s.Data(a, b, hits) VALUES \
+             (1,1,10),(1,1,5),(1,2,1),(1,2,2),\
+             (2,1,7),(2,1,3),(2,2,4),(2,2,6),\
+             (3,1,8),(3,1,2),(3,2,9),(3,2,1),\
+             (4,1,1),(4,1,1),(4,2,1),(4,2,1),\
+             (5,1,1),(5,1,1),(5,2,1),(5,2,1),\
+             (6,1,1),(6,1,1),(6,2,1),(6,2,1)",
+        )
+        .await?;
+
+    // ORDER BY a, b LIMIT 3 (ascending): smallest three groups by (a, b).
+    let r = service
+        .exec_query("SELECT a, b, SUM(hits) FROM s.Data GROUP BY 1, 2 ORDER BY 1, 2 LIMIT 3")
+        .await?;
+    assert_eq!(to_rows(&r), rows(&[(1, 1, 15), (1, 2, 3), (2, 1, 10)]));
+
+    // ORDER BY a, b DESC LIMIT 3: largest three groups by (a, b).
+    let r = service
+        .exec_query(
+            "SELECT a, b, SUM(hits) FROM s.Data GROUP BY 1, 2 ORDER BY 1 DESC, 2 DESC LIMIT 3",
+        )
+        .await?;
+    assert_eq!(to_rows(&r), rows(&[(6, 2, 2), (6, 1, 2), (5, 2, 2)]));
+
+    // ORDER BY a only (a proper subset of GROUP BY a, b), LIMIT 2. The selected group SET is
+    // deterministic (both groups of a=1), but the intra-tie row order is not, so assert as a set.
+    // Each returned group must carry its complete sum regardless of cross-worker tie-breaking,
+    // which is what totalization (append b to the cut order) guarantees.
+    let r = service
+        .exec_query("SELECT a, b, SUM(hits) FROM s.Data GROUP BY 1, 2 ORDER BY 1 LIMIT 2")
+        .await?;
+    let got = to_rows(&r);
+    assert_eq!(got.len(), 2, "expected 2 rows, got: {:?}", got);
+    for expected in rows(&[(1, 1, 15), (1, 2, 3)]) {
+        assert!(
+            got.contains(&expected),
+            "missing {:?} in {:?}",
+            expected,
+            got
+        );
+    }
+
+    Ok(())
+}
+
 async fn planning_hints(service: Box<dyn SqlClient>) -> Result<(), CubeError> {
     service.exec_query("CREATE SCHEMA s").await?;
     service
diff --git a/rust/cubestore/cubestore/Cargo.toml b/rust/cubestore/cubestore/Cargo.toml
index effab752735b8..167ca2a37690e 100644
--- a/rust/cubestore/cubestore/Cargo.toml
+++ b/rust/cubestore/cubestore/Cargo.toml
@@ -28,10 +28,10 @@ cubezetasketch = { path = "../cubezetasketch" }
 cubedatasketches = { path = "../cubedatasketches" }
 cubeshared = { path = "../../cube/cubeshared" }
 cuberpc = { path = "../cuberpc" }
-datafusion = { git = "https://github.com/cube-js/arrow-datafusion", branch = "cube-46.0.1", features = ["serde"] }
-datafusion-datasource = { git = "https://github.com/cube-js/arrow-datafusion", branch = "cube-46.0.1" }
-datafusion-proto = { git = "https://github.com/cube-js/arrow-datafusion", branch = "cube-46.0.1" }
-datafusion-proto-common = { git = "https://github.com/cube-js/arrow-datafusion", branch = "cube-46.0.1" }
+datafusion = { git = "https://github.com/cube-js/arrow-datafusion", branch = "cubestore-hash-aggregate-limit", features = ["serde"] }
+datafusion-datasource = { git = "https://github.com/cube-js/arrow-datafusion", branch = "cubestore-hash-aggregate-limit" }
+datafusion-proto = { git = "https://github.com/cube-js/arrow-datafusion", branch = "cubestore-hash-aggregate-limit" }
+datafusion-proto-common = { git = "https://github.com/cube-js/arrow-datafusion", branch = "cubestore-hash-aggregate-limit" }
 csv = "1.1.3"
 bytes = "1.6.0"
 serde_json = "1.0.56"
diff --git a/rust/cubestore/cubestore/src/config/mod.rs b/rust/cubestore/cubestore/src/config/mod.rs
index 602aa34c14294..45123119faada 100644
--- a/rust/cubestore/cubestore/src/config/mod.rs
+++ b/rust/cubestore/cubestore/src/config/mod.rs
@@ -589,6 +589,10 @@ pub trait ConfigObj: DIService {
     /// `DictionaryArray` and the dictionary-aware inline aggregate paths. Off by default while
     /// the feature is built up incrementally behind this flag.
     fn dictionary_encoding_enabled(&self) -> bool;
+    /// Factor `f` controlling when the worker-side partial hash aggregate trims its output to the
+    /// top-k groups. Trimming happens only when the number of local groups exceeds `f * k`, where
+    /// `k = limit + offset`. `0` disables the optimization.
+    fn group_by_limit_factor(&self) -> usize;
 
     fn allow_decimal128(&self) -> bool;
 
@@ -750,6 +754,7 @@ pub struct ConfigObjImpl {
     pub repartition_merge_max_rows: u64,
     pub repartition_check_overlapping_children: bool,
     pub dictionary_encoding_enabled: bool,
+    pub group_by_limit_factor: usize,
     pub allow_decimal128: bool,
     pub enable_remove_orphaned_remote_files: bool,
     pub enable_startup_warmup: bool,
@@ -1094,6 +1099,10 @@ impl ConfigObj for ConfigObjImpl {
         self.dictionary_encoding_enabled
     }
 
+    fn group_by_limit_factor(&self) -> usize {
+        self.group_by_limit_factor
+    }
+
     fn allow_decimal128(&self) -> bool {
         self.allow_decimal128
     }
@@ -1792,6 +1801,10 @@ impl Config {
                 ),
                 // TODO: dev default; flip back to false before merge.
                 dictionary_encoding_enabled: env_bool("CUBESTORE_DICTIONARY_ENCODING", true),
+                group_by_limit_factor: env_parse(
+                    "CUBESTORE_GROUP_BY_LIMIT_FACTOR",
+                    2,
+                ),
                 allow_decimal128: env_bool("CUBESTORE_ALLOW_DECIMAL128", false),
                 enable_remove_orphaned_remote_files: env_bool(
                     "CUBESTORE_ENABLE_REMOVE_ORPHANED_REMOTE_FILES",
@@ -2049,6 +2062,7 @@ impl Config {
                 repartition_merge_max_rows: 4_000_000,
                 repartition_check_overlapping_children: false,
                 dictionary_encoding_enabled: false,
+                group_by_limit_factor: 2,
                 allow_decimal128: false,
                 enable_remove_orphaned_remote_files: false,
                 enable_startup_warmup: true,
@@ -2744,10 +2758,6 @@ impl Config {
 
         self.injector
             .register_typed_with_default::<dyn QueryExecutor, _, _, _>(async move |i| {
-                let push_partial_aggregate_below_merge = i
-                    .get_service_typed::<dyn ConfigObj>()
-                    .await
-                    .push_partial_aggregate_below_merge_enabled();
                 QueryExecutorImpl::new(
                     i.get_service_typed::<dyn CubestoreMetadataCacheFactory>()
                         .await
@@ -2755,7 +2765,7 @@ impl Config {
                         .clone(),
                     i.get_service_typed().await,
                     i.get_service_typed().await,
-                    push_partial_aggregate_below_merge,
+                    i.get_service_typed::<dyn ConfigObj>().await,
                 )
             })
             .await;
diff --git a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
new file mode 100644
index 0000000000000..e1ee77c685ca9
--- /dev/null
+++ b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
@@ -0,0 +1,284 @@
+use datafusion::arrow::array::{ArrayRef, AsArray, RecordBatch};
+use datafusion::arrow::compute::{lexsort_to_indices, take, SortColumn, SortOptions};
+use datafusion::arrow::datatypes::SchemaRef;
+use datafusion::dfschema::internal_err;
+use datafusion::error::Result as DFResult;
+use datafusion::execution::{RecordBatchStream, TaskContext};
+use datafusion::logical_expr::{EmitTo, GroupsAccumulator};
+use datafusion::physical_expr::GroupsAccumulatorAdapter;
+use datafusion::physical_plan::aggregates::group_values::{new_group_values, GroupValues};
+use datafusion::physical_plan::aggregates::order::GroupOrdering;
+use datafusion::physical_plan::aggregates::PhysicalGroupBy;
+use datafusion::physical_plan::udaf::AggregateFunctionExpr;
+use datafusion::physical_plan::{ExecutionPlan, PhysicalExpr, SendableRecordBatchStream};
+use futures::ready;
+use futures::stream::{Stream, StreamExt};
+use std::sync::Arc;
+use std::task::{Context, Poll};
+
+use super::GroupByLimitAggregateExec;
+
+enum ExecutionState {
+    ReadingInput,
+    ProducingOutput(RecordBatch),
+    Done,
+}
+
+pub(crate) struct GroupByLimitAggregateStream {
+    schema: SchemaRef,
+    input: SendableRecordBatchStream,
+    aggregate_arguments: Vec<Vec<Arc<dyn PhysicalExpr>>>,
+    filter_expressions: Vec<Option<Arc<dyn PhysicalExpr>>>,
+    group_by: PhysicalGroupBy,
+    batch_size: usize,
+    exec_state: ExecutionState,
+    input_done: bool,
+    accumulators: Vec<Box<dyn GroupsAccumulator>>,
+    group_values: Box<dyn GroupValues>,
+    current_group_indices: Vec<usize>,
+    k: usize,
+    factor: usize,
+    order: Vec<(usize, SortOptions)>,
+}
+
+impl GroupByLimitAggregateStream {
+    pub fn new(
+        agg: &GroupByLimitAggregateExec,
+        context: Arc<TaskContext>,
+        partition: usize,
+    ) -> DFResult<Self> {
+        let agg_schema = Arc::clone(&agg.schema());
+        let agg_group_by = agg.group_expr().clone();
+        let agg_filter_expr = agg.filter_expr().to_vec();
+
+        let batch_size = context.session_config().batch_size();
+        let input = agg.input().execute(partition, Arc::clone(&context))?;
+
+        let aggregate_arguments =
+            aggregate_expressions(agg.aggr_expr(), agg_group_by.num_group_exprs())?;
+
+        let accumulators: Vec<_> = agg
+            .aggr_expr()
+            .iter()
+            .map(create_group_accumulator)
+            .collect::<DFResult<_>>()?;
+
+        let group_schema = agg_group_by.group_schema(&agg.input().schema())?;
+        let group_values = new_group_values(group_schema, &GroupOrdering::None)?;
+
+        Ok(GroupByLimitAggregateStream {
+            schema: agg_schema,
+            input,
+            aggregate_arguments,
+            filter_expressions: agg_filter_expr,
+            group_by: agg_group_by,
+            batch_size,
+            exec_state: ExecutionState::ReadingInput,
+            input_done: false,
+            accumulators,
+            group_values,
+            current_group_indices: Vec::with_capacity(batch_size),
+            k: agg.k(),
+            factor: agg.factor(),
+            order: agg.order().to_vec(),
+        })
+    }
+}
+
+impl Stream for GroupByLimitAggregateStream {
+    type Item = DFResult<RecordBatch>;
+
+    fn poll_next(
+        mut self: std::pin::Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        loop {
+            match &self.exec_state {
+                ExecutionState::ReadingInput => match ready!(self.input.poll_next_unpin(cx)) {
+                    Some(Ok(batch)) => {
+                        if let Err(e) = self.group_aggregate_batch(batch) {
+                            return Poll::Ready(Some(Err(e)));
+                        }
+                    }
+                    Some(Err(e)) => return Poll::Ready(Some(Err(e))),
+                    // Input exhausted: emit the whole group table at once, then trim to top-k.
+                    None => {
+                        self.input_done = true;
+                        match self.emit_all_trimmed() {
+                            Ok(Some(batch)) => {
+                                self.exec_state = ExecutionState::ProducingOutput(batch)
+                            }
+                            Ok(None) => self.exec_state = ExecutionState::Done,
+                            Err(e) => return Poll::Ready(Some(Err(e))),
+                        }
+                    }
+                },
+
+                ExecutionState::ProducingOutput(batch) => {
+                    let batch = batch.clone();
+                    let size = self.batch_size;
+                    let (next_state, output) = if batch.num_rows() <= size {
+                        (ExecutionState::Done, batch)
+                    } else {
+                        let remaining = batch.slice(size, batch.num_rows() - size);
+                        let output = batch.slice(0, size);
+                        (ExecutionState::ProducingOutput(remaining), output)
+                    };
+                    self.exec_state = next_state;
+                    return Poll::Ready(Some(Ok(output)));
+                }
+
+                ExecutionState::Done => return Poll::Ready(None),
+            }
+        }
+    }
+}
+
+impl RecordBatchStream for GroupByLimitAggregateStream {
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+}
+
+impl GroupByLimitAggregateStream {
+    fn group_aggregate_batch(&mut self, batch: RecordBatch) -> DFResult<()> {
+        let group_by_values = evaluate_group_by(&self.group_by, &batch)?;
+        let input_values = evaluate_many(&self.aggregate_arguments, &batch)?;
+        let filter_values = evaluate_optional(&self.filter_expressions, &batch)?;
+
+        assert_eq!(group_by_values.len(), 1, "Exactly 1 group value required");
+        self.group_values
+            .intern(&group_by_values[0], &mut self.current_group_indices)?;
+        let group_indices = &self.current_group_indices;
+        let total_num_groups = self.group_values.len();
+
+        for ((acc, values), opt_filter) in self
+            .accumulators
+            .iter_mut()
+            .zip(input_values.iter())
+            .zip(filter_values.iter())
+        {
+            let opt_filter = opt_filter.as_ref().map(|filter| filter.as_boolean());
+            acc.update_batch(values, group_indices, opt_filter, total_num_groups)?;
+        }
+        Ok(())
+    }
+
+    /// Build the partial-state batch for all groups, then keep only the `k` smallest by the total
+    /// order when the number of groups exceeds `factor * k`.
+    fn emit_all_trimmed(&mut self) -> DFResult<Option<RecordBatch>> {
+        if self.group_values.is_empty() {
+            return Ok(None);
+        }
+        let mut columns = self.group_values.emit(EmitTo::All)?;
+        for acc in &mut self.accumulators {
+            columns.extend(acc.state(EmitTo::All)?);
+        }
+        let batch = RecordBatch::try_new(Arc::clone(&self.schema), columns)?;
+        Ok(Some(self.trim_top_k(batch)?))
+    }
+
+    fn trim_top_k(&self, batch: RecordBatch) -> DFResult<RecordBatch> {
+        let g = batch.num_rows();
+        if self.k == 0 || g <= self.factor.saturating_mul(self.k) {
+            return Ok(batch);
+        }
+        let sort_columns: Vec<SortColumn> = self
+            .order
+            .iter()
+            .map(|(idx, options)| SortColumn {
+                values: Arc::clone(batch.column(*idx)),
+                options: Some(*options),
+            })
+            .collect();
+        let indices = lexsort_to_indices(&sort_columns, Some(self.k))?;
+        let columns = batch
+            .columns()
+            .iter()
+            .map(|c| take(c.as_ref(), &indices, None))
+            .collect::<std::result::Result<Vec<_>, _>>()?;
+        Ok(RecordBatch::try_new(batch.schema(), columns)?)
+    }
+}
+
+/// Partial-aggregate argument expressions, one vec per aggregate. Mirrors DataFusion's private
+/// `aggregate_expressions` for `AggregateMode::Partial`.
+fn aggregate_expressions(
+    aggr_expr: &[Arc<AggregateFunctionExpr>],
+    _col_idx_base: usize,
+) -> DFResult<Vec<Vec<Arc<dyn PhysicalExpr>>>> {
+    Ok(aggr_expr
+        .iter()
+        .map(|agg| {
+            let mut result = agg.expressions();
+            if let Some(ordering_req) = agg.order_bys() {
+                result.extend(ordering_req.iter().map(|item| Arc::clone(&item.expr)));
+            }
+            result
+        })
+        .collect())
+}
+
+fn create_group_accumulator(
+    agg_expr: &Arc<AggregateFunctionExpr>,
+) -> DFResult<Box<dyn GroupsAccumulator>> {
+    if agg_expr.groups_accumulator_supported() {
+        agg_expr.create_groups_accumulator()
+    } else {
+        let agg_expr_captured = Arc::clone(agg_expr);
+        let factory = move || agg_expr_captured.create_accumulator();
+        Ok(Box::new(GroupsAccumulatorAdapter::new(factory)))
+    }
+}
+
+fn evaluate(expr: &[Arc<dyn PhysicalExpr>], batch: &RecordBatch) -> DFResult<Vec<ArrayRef>> {
+    expr.iter()
+        .map(|expr| {
+            expr.evaluate(batch)
+                .and_then(|v| v.into_array(batch.num_rows()))
+        })
+        .collect()
+}
+
+fn evaluate_many(
+    expr: &[Vec<Arc<dyn PhysicalExpr>>],
+    batch: &RecordBatch,
+) -> DFResult<Vec<Vec<ArrayRef>>> {
+    expr.iter().map(|expr| evaluate(expr, batch)).collect()
+}
+
+fn evaluate_optional(
+    expr: &[Option<Arc<dyn PhysicalExpr>>],
+    batch: &RecordBatch,
+) -> DFResult<Vec<Option<ArrayRef>>> {
+    expr.iter()
+        .map(|expr| {
+            expr.as_ref()
+                .map(|expr| {
+                    expr.evaluate(batch)
+                        .and_then(|v| v.into_array(batch.num_rows()))
+                })
+                .transpose()
+        })
+        .collect()
+}
+
+fn evaluate_group_by(
+    group_by: &PhysicalGroupBy,
+    batch: &RecordBatch,
+) -> DFResult<Vec<Vec<ArrayRef>>> {
+    let exprs: Vec<ArrayRef> = group_by
+        .expr()
+        .iter()
+        .map(|(expr, _)| {
+            let value = expr.evaluate(batch)?;
+            value.into_array(batch.num_rows())
+        })
+        .collect::<DFResult<Vec<_>>>()?;
+
+    if !group_by.is_single() {
+        return internal_err!("GroupByLimitAggregate does not support grouping sets");
+    }
+
+    Ok(vec![exprs])
+}
diff --git a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs
new file mode 100644
index 0000000000000..4fe9ba5850cca
--- /dev/null
+++ b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs
@@ -0,0 +1,204 @@
+mod group_by_limit_aggregate_stream;
+
+use datafusion::arrow::compute::SortOptions;
+use datafusion::arrow::datatypes::SchemaRef;
+use datafusion::common::stats::Precision;
+use datafusion::common::Statistics;
+use datafusion::error::Result as DFResult;
+use datafusion::execution::TaskContext;
+use datafusion::physical_expr::aggregate::AggregateFunctionExpr;
+use datafusion::physical_expr::{Distribution, LexRequirement};
+use datafusion::physical_plan::execution_plan::CardinalityEffect;
+use datafusion::physical_plan::metrics::MetricsSet;
+use datafusion::physical_plan::{aggregates::*, InputOrderMode};
+use datafusion::physical_plan::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, PhysicalExpr, PlanProperties,
+    SendableRecordBatchStream,
+};
+use std::any::Any;
+use std::fmt::Debug;
+use std::sync::Arc;
+
+/// Worker-side partial hash aggregate that trims its output to the top-k groups by a total order,
+/// so far fewer partial-state rows cross the network to the router's Final aggregate.
+///
+/// This is a custom copy of DataFusion's partial hash aggregate (it reuses DF's `GroupValues` and
+/// `GroupsAccumulator` building blocks but owns the consume/emit loop), so the only change required
+/// in the DataFusion fork is making `new_group_values` public. The aggregation builds the whole
+/// group table and, at the single final emit, keeps only the `k` smallest groups by `order` when
+/// the number of groups exceeds `factor * k`; otherwise it emits all groups unchanged.
+///
+/// `order` is a TOTAL order over groups (ORDER BY columns followed by the remaining group-by
+/// columns), expressed as `(partial-output column index, sort options)`. A total order is required
+/// for correctness: the same group key can live on multiple workers, and a consistent cut across
+/// workers guarantees every partial state the router selects reaches it.
+#[derive(Debug, Clone)]
+pub struct GroupByLimitAggregateExec {
+    group_by: PhysicalGroupBy,
+    aggr_expr: Vec<Arc<AggregateFunctionExpr>>,
+    filter_expr: Vec<Option<Arc<dyn PhysicalExpr>>>,
+    pub input: Arc<dyn ExecutionPlan>,
+    /// Partial-aggregate output schema (group columns followed by accumulator state columns).
+    schema: SchemaRef,
+    input_schema: SchemaRef,
+    cache: PlanProperties,
+    /// Fetch count, `k = limit + offset`.
+    k: usize,
+    /// Only trim when the number of local groups exceeds `factor * k`.
+    factor: usize,
+    /// Total order over the partial output columns.
+    order: Vec<(usize, SortOptions)>,
+}
+
+impl GroupByLimitAggregateExec {
+    /// Build a `GroupByLimitAggregateExec` from a partial hash `AggregateExec`, or `None` if it is not a
+    /// single-group-by partial aggregate (grouping sets and non-partial modes are not supported).
+    pub fn try_new_from_partial(
+        aggregate: &AggregateExec,
+        k: usize,
+        factor: usize,
+        order: Vec<(usize, SortOptions)>,
+    ) -> Option<Self> {
+        if *aggregate.mode() != AggregateMode::Partial {
+            return None;
+        }
+        // Sorted-prefix aggregates are handled by InlineAggregateExec; this targets the hash path.
+        if matches!(aggregate.input_order_mode(), InputOrderMode::Sorted) {
+            return None;
+        }
+        let group_by = aggregate.group_expr().clone();
+        if !group_by.is_single() {
+            return None;
+        }
+        Some(Self {
+            group_by,
+            aggr_expr: aggregate.aggr_expr().to_vec(),
+            filter_expr: aggregate.filter_expr().to_vec(),
+            input: aggregate.input().clone(),
+            schema: aggregate.schema().clone(),
+            input_schema: aggregate.input_schema().clone(),
+            cache: aggregate.cache().clone(),
+            k,
+            factor,
+            order,
+        })
+    }
+
+    pub fn k(&self) -> usize {
+        self.k
+    }
+
+    pub fn factor(&self) -> usize {
+        self.factor
+    }
+
+    pub fn order(&self) -> &[(usize, SortOptions)] {
+        &self.order
+    }
+
+    pub fn aggr_expr(&self) -> &[Arc<AggregateFunctionExpr>] {
+        &self.aggr_expr
+    }
+
+    pub fn filter_expr(&self) -> &[Option<Arc<dyn PhysicalExpr>>] {
+        &self.filter_expr
+    }
+
+    pub fn input(&self) -> &Arc<dyn ExecutionPlan> {
+        &self.input
+    }
+
+    pub fn group_expr(&self) -> &PhysicalGroupBy {
+        &self.group_by
+    }
+}
+
+impl DisplayAs for GroupByLimitAggregateExec {
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match t {
+            DisplayFormatType::Default | DisplayFormatType::Verbose => {
+                write!(
+                    f,
+                    "GroupByLimitAggregateExec: k={}, factor={}, order={:?}",
+                    self.k, self.factor, self.order
+                )?;
+            }
+        }
+        Ok(())
+    }
+}
+
+impl ExecutionPlan for GroupByLimitAggregateExec {
+    fn name(&self) -> &'static str {
+        "GroupByLimitAggregateExec"
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn properties(&self) -> &PlanProperties {
+        &self.cache
+    }
+
+    fn required_input_distribution(&self) -> Vec<Distribution> {
+        vec![Distribution::UnspecifiedDistribution]
+    }
+
+    fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> {
+        vec![None]
+    }
+
+    fn maintains_input_order(&self) -> Vec<bool> {
+        vec![false]
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![&self.input]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> DFResult<Arc<dyn ExecutionPlan>> {
+        Ok(Arc::new(Self {
+            group_by: self.group_by.clone(),
+            aggr_expr: self.aggr_expr.clone(),
+            filter_expr: self.filter_expr.clone(),
+            input: children[0].clone(),
+            schema: self.schema.clone(),
+            input_schema: self.input_schema.clone(),
+            cache: self.cache.clone(),
+            k: self.k,
+            factor: self.factor,
+            order: self.order.clone(),
+        }))
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> DFResult<SendableRecordBatchStream> {
+        let stream = group_by_limit_aggregate_stream::GroupByLimitAggregateStream::new(
+            self, context, partition,
+        )?;
+        Ok(Box::pin(stream))
+    }
+
+    fn metrics(&self) -> Option<MetricsSet> {
+        None
+    }
+
+    fn statistics(&self) -> DFResult<Statistics> {
+        Ok(Statistics {
+            num_rows: Precision::Absent,
+            column_statistics: Statistics::unknown_column(&self.schema),
+            total_byte_size: Precision::Absent,
+        })
+    }
+
+    fn cardinality_effect(&self) -> CardinalityEffect {
+        CardinalityEffect::LowerEqual
+    }
+}
diff --git a/rust/cubestore/cubestore/src/queryplanner/mod.rs b/rust/cubestore/cubestore/src/queryplanner/mod.rs
index 86c032b4d28d6..0a2536a34a3f7 100644
--- a/rust/cubestore/cubestore/src/queryplanner/mod.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/mod.rs
@@ -10,6 +10,7 @@ use datafusion_datasource::memory::MemorySourceConfig;
 use datafusion_datasource::source::DataSourceExec;
 pub use planning::PlanningMeta;
 mod check_memory;
+mod group_by_limit_aggregate;
 pub mod physical_plan_flags;
 pub mod pretty_printers;
 mod projection_above_limit;
diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs
new file mode 100644
index 0000000000000..3bf40d08c1c65
--- /dev/null
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs
@@ -0,0 +1,251 @@
+use crate::queryplanner::group_by_limit_aggregate::GroupByLimitAggregateExec;
+use crate::queryplanner::planning::WorkerExec;
+use crate::queryplanner::query_executor::ClusterSendExec;
+use datafusion::arrow::compute::SortOptions;
+use datafusion::error::DataFusionError;
+use datafusion::physical_expr::{LexOrdering, PhysicalSortExpr};
+use datafusion::physical_plan::aggregates::{AggregateExec, AggregateMode};
+use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
+use datafusion::physical_plan::expressions::Column;
+use datafusion::physical_plan::limit::GlobalLimitExec;
+use datafusion::physical_plan::sorts::sort::SortExec;
+use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
+use datafusion::physical_plan::{ExecutionPlan, InputOrderMode};
+use std::sync::Arc;
+
+/// Trim the worker-side partial hash aggregate to the top-k groups when the plan is
+/// `LIMIT k` over `ORDER BY <subset of group-by columns>` over a distributed hash aggregate.
+///
+/// Correctness requires a TOTAL order over groups (`T = ORDER BY ++ remaining group-by columns`,
+/// in group-by order) applied in TWO places that must agree:
+///   - the worker cut: each worker keeps its local top-k by `T`;
+///   - the router select: the global Sort + Limit must also order by `T`.
+/// Under `T` the router's top-k equals the global top-k by `T`, and every worker that holds a
+/// partial state for such a group keeps it (its local rank can only be smaller), so every needed
+/// partial state reaches the router. Ordering the router by `T` instead of the bare `ORDER BY` does
+/// not change the query contract: `ORDER BY` is a prefix of `T`, so the output stays validly
+/// ordered and the previously-unspecified tie order just becomes deterministic.
+///
+/// We only rewrite when the plan matches exactly `Sort(/Limit) -> [passthrough] -> Final aggregate
+/// -> [passthrough/cluster boundary] -> Partial hash aggregate`; anything else on the path (a
+/// HAVING filter, a nested aggregate, a computed projection) makes us bail, so we never trim a plan
+/// where the limit does not directly govern this aggregate.
+///
+/// `factor` gates trimming at runtime (only when local groups exceed `factor * k`); `0` disables.
+pub fn replace_with_group_by_limit_aggregate(
+    plan: Arc<dyn ExecutionPlan>,
+    factor: usize,
+) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
+    if factor == 0 {
+        return Ok(plan);
+    }
+    let Some(target) = analyze(&plan) else {
+        return Ok(plan);
+    };
+    apply(plan, &target, factor)
+}
+
+struct Target {
+    /// The router `SortExec` whose ordering must be extended to the total order.
+    sort: Arc<dyn ExecutionPlan>,
+    /// The worker-side partial hash `AggregateExec` to replace with a trimming exec.
+    partial: Arc<dyn ExecutionPlan>,
+    /// Tail of the total order to append to the router sort (over the sort's input schema).
+    router_tail: Vec<PhysicalSortExpr>,
+    /// Full total order over the partial output schema for the worker cut.
+    trim_order: Vec<(usize, SortOptions)>,
+    /// `k = limit + offset`.
+    k: usize,
+}
+
+fn analyze(root: &Arc<dyn ExecutionPlan>) -> Option<Target> {
+    // Peel an optional top GlobalLimit (carries the offset), then require a SortExec.
+    let (skip, extra_fetch, sort_node) =
+        if let Some(gl) = root.as_any().downcast_ref::<GlobalLimitExec>() {
+            (gl.skip(), gl.fetch(), child(root)?)
+        } else {
+            (0, None, root.clone())
+        };
+    let sort = sort_node.as_any().downcast_ref::<SortExec>()?;
+    let order: Vec<PhysicalSortExpr> = sort.expr().iter().cloned().collect();
+    if order.is_empty() {
+        return None;
+    }
+    // The worker must keep enough groups to cover `limit + offset`. When a top GlobalLimit carries
+    // the offset, DataFusion already folds `skip + limit` into the sort's fetch, so prefer it;
+    // otherwise fall back to the GlobalLimit's own `skip + fetch`.
+    let k = sort
+        .fetch()
+        .or_else(|| extra_fetch.map(|fetch| skip + fetch))?;
+
+    // Sort -> [passthrough] -> Final aggregate.
+    let final_agg_node = descend_to_final_aggregate(sort.input().clone())?;
+    let final_agg = final_agg_node.as_any().downcast_ref::<AggregateExec>()?;
+
+    // Final aggregate -> [passthrough/boundary] -> Partial hash aggregate.
+    let partial_node = descend_to_worker_partial(final_agg.input().clone())?;
+    let partial = partial_node.as_any().downcast_ref::<AggregateExec>()?;
+    if !partial.group_expr().is_single()
+        || matches!(partial.input_order_mode(), InputOrderMode::Sorted)
+    {
+        return None;
+    }
+
+    let num_group_cols = partial.group_expr().output_exprs().len();
+    if num_group_cols == 0 {
+        return None;
+    }
+    let partial_schema = partial.schema();
+    let group_names: Vec<String> = partial_schema
+        .fields()
+        .iter()
+        .take(num_group_cols)
+        .map(|f| f.name().clone())
+        .collect();
+
+    // Map ORDER BY columns onto group-by columns (by name; robust to projections).
+    let mut used = vec![false; num_group_cols];
+    let mut trim_order: Vec<(usize, SortOptions)> = Vec::with_capacity(num_group_cols);
+    for e in &order {
+        let column = e.expr.as_any().downcast_ref::<Column>()?;
+        let idx = group_names.iter().position(|n| n == column.name())?;
+        if used[idx] {
+            continue;
+        }
+        used[idx] = true;
+        trim_order.push((idx, e.options));
+    }
+    if trim_order.is_empty() {
+        return None;
+    }
+
+    // Totalize: append the remaining group-by columns in group-by order. Build the matching tail
+    // for the router sort over its own (Final-output) schema, resolved by name.
+    let sort_input_schema = sort.input().schema();
+    let mut router_tail: Vec<PhysicalSortExpr> = Vec::new();
+    for (idx, is_used) in used.into_iter().enumerate() {
+        if is_used {
+            continue;
+        }
+        let name = &group_names[idx];
+        let options = SortOptions::default();
+        let sort_col_idx = sort_input_schema.index_of(name).ok()?;
+        router_tail.push(PhysicalSortExpr {
+            expr: Arc::new(Column::new(name, sort_col_idx)),
+            options,
+        });
+        trim_order.push((idx, options));
+    }
+
+    Some(Target {
+        sort: sort_node,
+        partial: partial_node,
+        router_tail,
+        trim_order,
+        k,
+    })
+}
+
+fn apply(
+    node: Arc<dyn ExecutionPlan>,
+    target: &Target,
+    factor: usize,
+) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
+    let is_sort = Arc::ptr_eq(&node, &target.sort);
+    let is_partial = Arc::ptr_eq(&node, &target.partial);
+
+    let new_children = node
+        .children()
+        .into_iter()
+        .map(|c| apply(c.clone(), target, factor))
+        .collect::<Result<Vec<_>, _>>()?;
+    let node = node.with_new_children(new_children)?;
+
+    if is_partial {
+        if let Some(agg) = node.as_any().downcast_ref::<AggregateExec>() {
+            if let Some(exec) = GroupByLimitAggregateExec::try_new_from_partial(
+                agg,
+                target.k,
+                factor,
+                target.trim_order.clone(),
+            ) {
+                return Ok(Arc::new(exec));
+            }
+        }
+        // Leaving the full aggregate in place stays correct; the router still sorts by the total
+        // order, it just receives every group instead of the trimmed top-k.
+        return Ok(node);
+    }
+
+    if is_sort {
+        if let Some(sort) = node.as_any().downcast_ref::<SortExec>() {
+            let mut exprs: Vec<PhysicalSortExpr> = sort.expr().iter().cloned().collect();
+            exprs.extend(target.router_tail.iter().cloned());
+            let new_sort = SortExec::new(LexOrdering::new(exprs), sort.input().clone())
+                .with_preserve_partitioning(sort.preserve_partitioning())
+                .with_fetch(sort.fetch());
+            return Ok(Arc::new(new_sort));
+        }
+    }
+
+    Ok(node)
+}
+
+/// Walk down single-child passthrough nodes (which preserve rows and grouping) until the first
+/// `Final`/`FinalPartitioned` `AggregateExec`. Returns `None` if a non-passthrough node is hit
+/// first (e.g. a filter or a computed projection).
+fn descend_to_final_aggregate(mut node: Arc<dyn ExecutionPlan>) -> Option<Arc<dyn ExecutionPlan>> {
+    loop {
+        if let Some(agg) = node.as_any().downcast_ref::<AggregateExec>() {
+            return matches!(
+                agg.mode(),
+                AggregateMode::Final | AggregateMode::FinalPartitioned
+            )
+            .then_some(node.clone());
+        }
+        if is_row_passthrough(&node) {
+            node = child(&node)?;
+        } else {
+            return None;
+        }
+    }
+}
+
+/// Walk down passthrough nodes from a `Final` aggregate's input to the worker-side `Partial`
+/// aggregate, requiring that exactly one `ClusterSend`/`Worker` boundary is crossed. Returns `None`
+/// if anything unexpected (a second aggregate, a filter, ...) is on the path.
+fn descend_to_worker_partial(mut node: Arc<dyn ExecutionPlan>) -> Option<Arc<dyn ExecutionPlan>> {
+    let mut crossed_boundary = false;
+    loop {
+        if let Some(agg) = node.as_any().downcast_ref::<AggregateExec>() {
+            return (crossed_boundary && *agg.mode() == AggregateMode::Partial)
+                .then_some(node.clone());
+        }
+        if node.as_any().is::<ClusterSendExec>() || node.as_any().is::<WorkerExec>() {
+            crossed_boundary = true;
+            node = child(&node)?;
+        } else if is_row_passthrough(&node) {
+            node = child(&node)?;
+        } else {
+            return None;
+        }
+    }
+}
+
+/// Single-child nodes that pass rows through unchanged (preserving grouping), so a limit/sort above
+/// them governs the aggregate below them.
+fn is_row_passthrough(node: &Arc<dyn ExecutionPlan>) -> bool {
+    let any = node.as_any();
+    any.is::<CoalescePartitionsExec>()
+        || any.is::<SortPreservingMergeExec>()
+        || any.is::<ClusterSendExec>()
+        || any.is::<WorkerExec>()
+}
+
+fn child(node: &Arc<dyn ExecutionPlan>) -> Option<Arc<dyn ExecutionPlan>> {
+    let children = node.children();
+    if children.len() != 1 {
+        return None;
+    }
+    Some(children[0].clone())
+}
diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs
index edf902b44d3e5..42b24d252d899 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs
@@ -1,5 +1,6 @@
 mod check_memory;
 mod distributed_partial_aggregate;
+mod group_by_limit_rewriter;
 mod inline_aggregate_rewriter;
 pub mod is_not_distinct_from_join_keys;
 pub mod rewrite_plan;
@@ -13,6 +14,7 @@ use crate::queryplanner::optimizations::distributed_partial_aggregate::{
     push_aggregate_to_workers, push_sorted_partial_aggregate_below_merge,
     push_worker_sort_and_limit, replace_suboptimal_merge_sorts,
 };
+use crate::queryplanner::optimizations::group_by_limit_rewriter::replace_with_group_by_limit_aggregate;
 use crate::queryplanner::optimizations::inline_aggregate_rewriter::replace_with_inline_aggregate;
 use crate::queryplanner::planning::CubeExtensionPlanner;
 use crate::queryplanner::pretty_printers::{pp_phys_plan_ext, PPOptions};
@@ -112,12 +114,14 @@ impl QueryPlanner for CubeQueryPlanner {
 #[derive(Debug)]
 pub struct PreOptimizeRule {
     push_partial_aggregate_below_merge: bool,
+    group_by_limit_factor: usize,
 }
 
 impl PreOptimizeRule {
-    pub fn new(push_partial_aggregate_below_merge: bool) -> Self {
+    pub fn new(push_partial_aggregate_below_merge: bool, group_by_limit_factor: usize) -> Self {
         Self {
             push_partial_aggregate_below_merge,
+            group_by_limit_factor,
         }
     }
 }
@@ -128,7 +132,11 @@ impl PhysicalOptimizerRule for PreOptimizeRule {
         plan: Arc<dyn ExecutionPlan>,
         _config: &ConfigOptions,
     ) -> datafusion::common::Result<Arc<dyn ExecutionPlan>> {
-        pre_optimize_physical_plan(plan, self.push_partial_aggregate_below_merge)
+        pre_optimize_physical_plan(
+            plan,
+            self.push_partial_aggregate_below_merge,
+            self.group_by_limit_factor,
+        )
     }
 
     fn name(&self) -> &str {
@@ -143,6 +151,7 @@ impl PhysicalOptimizerRule for PreOptimizeRule {
 fn pre_optimize_physical_plan(
     p: Arc<dyn ExecutionPlan>,
     push_partial_aggregate_below_merge: bool,
+    group_by_limit_factor: usize,
 ) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
     let p = rewrite_physical_plan(p, &mut |p| push_aggregate_to_workers(p))?;
 
@@ -164,6 +173,11 @@ fn pre_optimize_physical_plan(
     // Replace sorted AggregateExec with InlineAggregateExec for better performance
     let p = rewrite_physical_plan(p, &mut |p| replace_with_inline_aggregate(p))?;
 
+    // Trim the worker-side partial hash aggregate to the top-k groups when the query orders by a
+    // subset of group-by columns and has a limit. Runs after inline-aggregate replacement so it
+    // only sees the remaining (hash) partial aggregates.
+    let p = replace_with_group_by_limit_aggregate(p, group_by_limit_factor)?;
+
     Ok(p)
 }
 
diff --git a/rust/cubestore/cubestore/src/queryplanner/pretty_printers.rs b/rust/cubestore/cubestore/src/queryplanner/pretty_printers.rs
index 63d386add4951..3b50620742de7 100644
--- a/rust/cubestore/cubestore/src/queryplanner/pretty_printers.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/pretty_printers.rs
@@ -28,6 +28,7 @@ use std::sync::Arc;
 
 use crate::queryplanner::check_memory::CheckMemoryExec;
 use crate::queryplanner::filter_by_key_range::FilterByKeyRangeExec;
+use crate::queryplanner::group_by_limit_aggregate::GroupByLimitAggregateExec;
 use crate::queryplanner::inline_aggregate::{InlineAggregateExec, InlineAggregateMode};
 use crate::queryplanner::merge_sort::LastRowByUniqueKeyExec;
 use crate::queryplanner::panic::{PanicWorkerExec, PanicWorkerNode};
@@ -617,6 +618,16 @@ fn pp_phys_plan_indented(p: &dyn ExecutionPlan, indent: usize, o: &PPOptions, ou
             if let Some(limit) = agg.limit() {
                 *out += &format!(", limit: {}", limit)
             }
+        } else if let Some(agg) = a.downcast_ref::<GroupByLimitAggregateExec>() {
+            *out += &format!(
+                "GroupByLimitAggregate, k: {}, factor: {}, order: {:?}",
+                agg.k(),
+                agg.factor(),
+                agg.order()
+            );
+            if o.show_aggregations {
+                *out += &format!(", aggs: {:?}", agg.aggr_expr())
+            }
         } else if let Some(l) = a.downcast_ref::<LocalLimitExec>() {
             *out += &format!("LocalLimit, n: {}", l.fetch());
         } else if let Some(l) = a.downcast_ref::<GlobalLimitExec>() {
diff --git a/rust/cubestore/cubestore/src/queryplanner/query_executor.rs b/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
index d2ba6954801b1..cff350e594be1 100644
--- a/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
@@ -226,7 +226,7 @@ pub struct QueryExecutorImpl {
     metadata_cache_factory: Arc<dyn MetadataCacheFactory>,
     parquet_metadata_cache: Arc<dyn CubestoreParquetMetadataCache>,
     memory_handler: Arc<dyn MemoryHandler>,
-    push_partial_aggregate_below_merge: bool,
+    config: Arc<dyn ConfigObj>,
 }
 
 crate::di_service!(QueryExecutorImpl, [QueryExecutor]);
@@ -547,13 +547,13 @@ impl QueryExecutorImpl {
         metadata_cache_factory: Arc<dyn MetadataCacheFactory>,
         parquet_metadata_cache: Arc<dyn CubestoreParquetMetadataCache>,
         memory_handler: Arc<dyn MemoryHandler>,
-        push_partial_aggregate_below_merge: bool,
+        config: Arc<dyn ConfigObj>,
     ) -> Arc<Self> {
         Arc::new(QueryExecutorImpl {
             metadata_cache_factory,
             parquet_metadata_cache,
             memory_handler,
-            push_partial_aggregate_below_merge,
+            config,
         })
     }
 
@@ -603,7 +603,8 @@ impl QueryExecutorImpl {
         vec![
             // Cube rules
             Arc::new(PreOptimizeRule::new(
-                self.push_partial_aggregate_below_merge,
+                self.config.push_partial_aggregate_below_merge_enabled(),
+                self.config.group_by_limit_factor(),
             )),
             // DF rules without EnforceDistribution.  We do need to keep EnforceSorting.
             Arc::new(OutputRequirements::new_add_mode()),

From f6d09954a8af1afaf30fe041b9f2d1c8bc64a54a Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Fri, 19 Jun 2026 18:06:52 +0200
Subject: [PATCH 03/17] feat(cubestore): engage group-by-limit trim at
 distributed execution via worker_sort_and_limit

The rewriter only plants GroupByLimitAggregate during ROUTER planning, but the
worker re-plans physical from logical independently, so the trim never reached
execution. Instead of a new proto, reuse the existing worker_sort_and_limit
carry (already serialized across ClusterSend): when group_by_limit_factor > 0,
resort_worker_subtree now wraps the worker partial hash aggregate in
GroupByLimitAggregateExec (trim during aggregation, bounded O(factor*k) memory)
instead of a SortExec over the full partial; the Sort above still orders the
<= k surviving groups so the router's sort-preserving merge stays correct.
group_by_limit_factor (CUBESTORE_GROUP_BY_LIMIT_FACTOR, default 2) is threaded
router->worker via CubeQueryPlanner; 0 keeps the previous sort-then-trim.

Verified on the production-dump stend: with factor>0 the worker EXPLAIN ANALYZE
shows GroupByLimitAggregate (was LinearPartialAggregate) and results are
identical. Value is bounded hash-table memory (OOM avoidance) on
high-cardinality group-bys, not speed on small inputs.

WIP: only fires when worker_sort_and_limit fires, i.e. single-table
`ORDER BY <group subset> LIMIT k`. UNION-of-tables (the prod query.sql shape)
does not populate worker_sort_and_limit yet (separate ctx-propagation gap).
---
 .../distributed_partial_aggregate.rs          | 52 +++++++++++++++++--
 .../src/queryplanner/optimizations/mod.rs     | 11 +++-
 .../src/queryplanner/query_executor.rs        |  2 +
 3 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
index 5246b1878f132..cde5f026e8e2f 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
@@ -1,5 +1,6 @@
 use crate::cluster::WorkerPlanningParams;
 use crate::queryplanner::check_memory::CheckMemoryExec;
+use crate::queryplanner::group_by_limit_aggregate::GroupByLimitAggregateExec;
 use crate::queryplanner::inline_aggregate::{InlineAggregateExec, InlineAggregateMode};
 use crate::queryplanner::planning::WorkerExec;
 use crate::queryplanner::query_executor::ClusterSendExec;
@@ -286,13 +287,15 @@ pub fn push_sorted_partial_aggregate_below_merge(
 /// key) would drop partial states of tied groups.
 pub fn push_worker_sort_and_limit(
     p: Arc<dyn ExecutionPlan>,
+    group_by_limit_factor: usize,
 ) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
     // Worker side: wrap the partial aggregate with a per-partition bounded sort.
     if let Some(w) = p.as_any().downcast_ref::<WorkerExec>() {
         let Some((cols, fetch)) = w.worker_sort_and_limit.clone() else {
             return Ok(p);
         };
-        let Some(new_input) = resort_worker_subtree(&w.input, &cols, fetch) else {
+        let Some(new_input) = resort_worker_subtree(&w.input, &cols, fetch, group_by_limit_factor)
+        else {
             return Ok(p);
         };
         return Ok(Arc::new(WorkerExec::new(
@@ -324,8 +327,12 @@ pub fn push_worker_sort_and_limit(
     let Some((cols, fetch)) = cs.worker_sort_and_limit.clone() else {
         return Ok(p);
     };
-    let Some(new_worker_subtree) = resort_worker_subtree(&cs.input_for_optimizations, &cols, fetch)
-    else {
+    let Some(new_worker_subtree) = resort_worker_subtree(
+        &cs.input_for_optimizations,
+        &cols,
+        fetch,
+        group_by_limit_factor,
+    ) else {
         return Ok(p);
     };
     let new_cs: Arc<dyn ExecutionPlan> =
@@ -377,6 +384,7 @@ fn resort_worker_subtree(
     worker_subtree: &Arc<dyn ExecutionPlan>,
     cols: &[(usize, bool, bool)],
     fetch: usize,
+    group_by_limit_factor: usize,
 ) -> Option<Arc<dyn ExecutionPlan>> {
     let partial = locate_partial_aggregate(worker_subtree)?;
     let schema = partial.schema();
@@ -392,8 +400,44 @@ fn resort_worker_subtree(
         });
     }
     let worker_order = LexOrdering::new(exprs);
+
+    // Trim during aggregation: replace the partial hash aggregate with a GroupByLimitAggregateExec
+    // so it keeps only the top-k groups by `cols` instead of materializing every group and letting
+    // the Sort below trim afterwards. The Sort above still sorts the (now <= k) groups so the
+    // router's sort-preserving merge stays correct. `group_by_limit_factor == 0` disables this and
+    // falls back to the plain sort-then-trim.
+    let trimmed: Arc<dyn ExecutionPlan> = if group_by_limit_factor > 0 {
+        if let Some(agg) = partial.as_any().downcast_ref::<AggregateExec>() {
+            let order: Vec<(usize, SortOptions)> = cols
+                .iter()
+                .map(|(idx, asc, nulls_first)| {
+                    (
+                        *idx,
+                        SortOptions {
+                            descending: !asc,
+                            nulls_first: *nulls_first,
+                        },
+                    )
+                })
+                .collect();
+            match GroupByLimitAggregateExec::try_new_from_partial(
+                agg,
+                fetch,
+                group_by_limit_factor,
+                order,
+            ) {
+                Some(e) => Arc::new(e) as Arc<dyn ExecutionPlan>,
+                None => partial,
+            }
+        } else {
+            partial
+        }
+    } else {
+        partial
+    };
+
     let per_partition_sort: Arc<dyn ExecutionPlan> = Arc::new(
-        SortExec::new(worker_order.clone(), partial)
+        SortExec::new(worker_order.clone(), trimmed)
             .with_fetch(Some(fetch))
             .with_preserve_partitioning(true),
     );
diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs
index 42b24d252d899..2df406a1a62af 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs
@@ -45,6 +45,7 @@ pub struct CubeQueryPlanner {
     serialized_plan: Arc<PreSerializedPlan>,
     memory_handler: Arc<dyn MemoryHandler>,
     data_loaded_size: Option<Arc<DataLoadedSize>>,
+    group_by_limit_factor: usize,
 }
 
 impl CubeQueryPlanner {
@@ -52,6 +53,7 @@ impl CubeQueryPlanner {
         cluster: Arc<dyn Cluster>,
         serialized_plan: Arc<PreSerializedPlan>,
         memory_handler: Arc<dyn MemoryHandler>,
+        group_by_limit_factor: usize,
     ) -> CubeQueryPlanner {
         CubeQueryPlanner {
             cluster: Some(cluster),
@@ -59,6 +61,7 @@ impl CubeQueryPlanner {
             serialized_plan,
             memory_handler,
             data_loaded_size: None,
+            group_by_limit_factor,
         }
     }
 
@@ -67,6 +70,7 @@ impl CubeQueryPlanner {
         worker_planning_params: WorkerPlanningParams,
         memory_handler: Arc<dyn MemoryHandler>,
         data_loaded_size: Option<Arc<DataLoadedSize>>,
+        group_by_limit_factor: usize,
     ) -> CubeQueryPlanner {
         CubeQueryPlanner {
             serialized_plan,
@@ -74,6 +78,7 @@ impl CubeQueryPlanner {
             worker_partition_count: Some(worker_planning_params),
             memory_handler,
             data_loaded_size,
+            group_by_limit_factor,
         }
     }
 }
@@ -106,6 +111,7 @@ impl QueryPlanner for CubeQueryPlanner {
             self.memory_handler.clone(),
             self.data_loaded_size.clone(),
             ctx_state.config().options(),
+            self.group_by_limit_factor,
         );
         result
     }
@@ -187,6 +193,7 @@ fn finalize_physical_plan(
     memory_handler: Arc<dyn MemoryHandler>,
     data_loaded_size: Option<Arc<DataLoadedSize>>,
     config: &ConfigOptions,
+    group_by_limit_factor: usize,
 ) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
     let p = rewrite_physical_plan(p, &mut |p| add_check_memory_exec(p, memory_handler.clone()))?;
     log::trace!(
@@ -215,7 +222,9 @@ fn finalize_physical_plan(
     // Last: bound worker memory for ORDER BY <group cols> LIMIT that isn't an index prefix. Runs
     // after replace_suboptimal_merge_sorts so it doesn't push the query's row limit into the
     // worker merge we add (which would cut uncombined partial rows and undercount).
-    let p = rewrite_physical_plan(p, &mut |p| push_worker_sort_and_limit(p))?;
+    let p = rewrite_physical_plan(p, &mut |p| {
+        push_worker_sort_and_limit(p, group_by_limit_factor)
+    })?;
     log::trace!(
         "Rewrote physical plan by push_worker_sort_and_limit:\n{}",
         pp_phys_plan_ext(p.as_ref(), &PPOptions::show_nonmeta())
diff --git a/rust/cubestore/cubestore/src/queryplanner/query_executor.rs b/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
index cff350e594be1..f1d4318086bc1 100644
--- a/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
@@ -567,6 +567,7 @@ impl QueryExecutorImpl {
             cluster,
             serialized_plan,
             self.memory_handler.clone(),
+            self.config.group_by_limit_factor(),
         ))
     }
 
@@ -582,6 +583,7 @@ impl QueryExecutorImpl {
             worker_planning_params,
             self.memory_handler.clone(),
             data_loaded_size.clone(),
+            self.config.group_by_limit_factor(),
         ))
     }
 

From 65420c6ed559bef8ce9458e49a87c7c8ae5d00e6 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Fri, 19 Jun 2026 19:57:00 +0200
Subject: [PATCH 04/17] feat(cubestore): extend group-by-limit pushdown to bare
 LIMIT and through UNION

The worker group-by-limit trim previously required an ORDER BY and was dropped
when the per-branch ClusterSends of a UNION were merged. Now:

- compute_worker_sort_and_limit handles a bare LIMIT (no ORDER BY): the total
  order is the full group key in group-by order, so "any n" becomes "the n
  smallest by group key" -- a valid deterministic choice. An ORDER BY prefix,
  when present, still sorts first.
- pull_up_cluster_send preserves worker_sort_and_limit across a UNION (the same
  group-by/limit context descends to every branch, so the descriptors are
  positionally identical and stay valid over the union); kept only when all
  branches agree.

Adds planning coverage (bare LIMIT, UNION ALL + bare LIMIT) and a bare-LIMIT
execution assertion to topk_hash_aggregate_trim; excludes that test from the
migration harness (new test, no recorded fixture).
---
 .../cubestore-sql-tests/src/tests.rs          | 49 +++++++++++++++++++
 .../cubestore/src/queryplanner/planning.rs    | 38 ++++++++++----
 2 files changed, 78 insertions(+), 9 deletions(-)

diff --git a/rust/cubestore/cubestore-sql-tests/src/tests.rs b/rust/cubestore/cubestore-sql-tests/src/tests.rs
index 551c70234c839..b2cd81474808d 100644
--- a/rust/cubestore/cubestore-sql-tests/src/tests.rs
+++ b/rust/cubestore/cubestore-sql-tests/src/tests.rs
@@ -427,6 +427,7 @@ lazy_static::lazy_static! {
         "limit_pushdown_group_nonprefix_order",
         "prefilter_chunks_shared_scan",
         "planning_topk_hash_aggregate",
+        "topk_hash_aggregate_trim",
     ].into_iter().map(ToOwned::to_owned).collect();
 }
 
@@ -3284,6 +3285,37 @@ async fn planning_topk_hash_aggregate(service: Box<dyn SqlClient>) -> Result<(),
         router_pp
     );
 
+    // Bare LIMIT (no ORDER BY) on a non-indexed group column: the limit can't ride the index, so the
+    // worker still trims to the smallest groups by the full group key -- "any k" made deterministic.
+    let p = service
+        .plan_query("SELECT day, SUM(hits) FROM s.Data GROUP BY 1 LIMIT 10")
+        .await?;
+    let pp = pp_phys_plan_ext(p.worker.as_ref(), &PPOptions::none());
+    assert!(
+        pp.contains("GroupByLimitAggregate, k: 10, factor: 2,")
+            && pp.contains("(0, SortOptions { descending: false, nulls_first: false })"),
+        "expected GroupByLimitAggregate on a bare LIMIT, got:\n{}",
+        pp
+    );
+
+    // UNION ALL + bare LIMIT: the per-branch trim descriptor must survive the cluster-send pull-up
+    // over the union so the worker still trims above the union.
+    service
+        .exec_query("CREATE TABLE s.Data2(url text, day int, hits int)")
+        .await?;
+    let p = service
+        .plan_query(
+            "SELECT day, SUM(hits) FROM \
+             (SELECT * FROM s.Data UNION ALL SELECT * FROM s.Data2) u GROUP BY 1 LIMIT 10",
+        )
+        .await?;
+    let pp = pp_phys_plan_ext(p.worker.as_ref(), &PPOptions::none());
+    assert!(
+        pp.contains("GroupByLimitAggregate, k: 10, factor: 2,") && pp.contains("Union"),
+        "expected GroupByLimitAggregate over the Union, got:\n{}",
+        pp
+    );
+
     Ok(())
 }
 
@@ -3340,6 +3372,23 @@ async fn topk_hash_aggregate_trim(service: Box<dyn SqlClient>) -> Result<(), Cub
         );
     }
 
+    // Bare LIMIT 3 (no ORDER BY): the trim orders by the full group key, so "any 3" resolves to the
+    // 3 smallest by (a, b). The result order is unspecified, but the group SET and each group's full
+    // sum must be exact -- the latter guards against undercounting a group split across workers.
+    let r = service
+        .exec_query("SELECT a, b, SUM(hits) FROM s.Data GROUP BY 1, 2 LIMIT 3")
+        .await?;
+    let got = to_rows(&r);
+    assert_eq!(got.len(), 3, "expected 3 rows, got: {:?}", got);
+    for expected in rows(&[(1, 1, 15), (1, 2, 3), (2, 1, 10)]) {
+        assert!(
+            got.contains(&expected),
+            "missing {:?} in {:?}",
+            expected,
+            got
+        );
+    }
+
     Ok(())
 }
 
diff --git a/rust/cubestore/cubestore/src/queryplanner/planning.rs b/rust/cubestore/cubestore/src/queryplanner/planning.rs
index e6786376fcd01..c406eb596e10c 100644
--- a/rust/cubestore/cubestore/src/queryplanner/planning.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/planning.rs
@@ -1147,19 +1147,22 @@ impl ChooseIndex<'_> {
             return None;
         }
         let limit = ctx.limit?;
-        let sort = ctx.sort.as_ref().filter(|s| !s.is_empty())?;
         let group_by = ctx.group_by.as_ref().filter(|g| !g.is_empty())?;
 
-        // Every ORDER BY column must be a group-by column; map it to its group-key position.
+        // Every ORDER BY column must be a group-by column; map it to its group-key position. A bare
+        // LIMIT (no ORDER BY) leaves the prefix empty, so the total order is the full group key in
+        // group-by order -- "any n groups" becomes "the n smallest by group key", still valid.
         let mut cols: Vec<(usize, bool, bool)> = Vec::with_capacity(group_by.len());
         let mut used = vec![false; group_by.len()];
-        for name in sort {
-            let idx = group_by.iter().position(|g| g == name)?;
-            if used[idx] {
-                continue;
+        if let Some(sort) = ctx.sort.as_ref().filter(|s| !s.is_empty()) {
+            for name in sort {
+                let idx = group_by.iter().position(|g| g == name)?;
+                if used[idx] {
+                    continue;
+                }
+                used[idx] = true;
+                cols.push((idx, ctx.sort_is_asc, !ctx.sort_is_asc));
             }
-            used[idx] = true;
-            cols.push((idx, ctx.sort_is_asc, !ctx.sort_is_asc));
         }
         // Extend with the remaining group keys to make it a total order on the full group key.
         for (idx, is_used) in used.iter().enumerate() {
@@ -1837,6 +1840,7 @@ fn pull_up_cluster_send(mut p: LogicalPlan) -> Result<LogicalPlan, DataFusionErr
             }
             let mut union_snapshots = Vec::new();
             let mut limits = Vec::new();
+            let mut worker_sort_and_limits = Vec::new();
             let mut id = 0;
             for i in inputs.into_iter() {
                 let send;
@@ -1852,6 +1856,7 @@ fn pull_up_cluster_send(mut p: LogicalPlan) -> Result<LogicalPlan, DataFusionErr
                 }
                 union_snapshots.extend(send.snapshots.concat());
                 limits.push(send.limit_and_reverse);
+                worker_sort_and_limits.push(send.worker_sort_and_limit.clone());
                 *i = send.input.clone();
             }
             let limit = if limits.is_empty() || limits.iter().any(|l| l.is_none()) {
@@ -1861,8 +1866,23 @@ fn pull_up_cluster_send(mut p: LogicalPlan) -> Result<LogicalPlan, DataFusionErr
             } else {
                 limits[0]
             };
+            // The same group-by/limit context descends to every UNION branch, so the per-branch
+            // descriptors are identical and stay valid over the union (same schema, group columns by
+            // position). Keep it only when all branches agree; otherwise drop the pushdown.
+            let worker_sort_and_limit = if worker_sort_and_limits.is_empty()
+                || worker_sort_and_limits.iter().any(|w| w.is_none())
+                || worker_sort_and_limits
+                    .iter()
+                    .any(|w| w != &worker_sort_and_limits[0])
+            {
+                None
+            } else {
+                worker_sort_and_limits[0].clone()
+            };
             snapshots = vec![union_snapshots];
-            return Ok(ClusterSendNode::new(id, Arc::new(p), snapshots, limit).into_plan());
+            let mut new_send = ClusterSendNode::new(id, Arc::new(p), snapshots, limit);
+            new_send.worker_sort_and_limit = worker_sort_and_limit;
+            return Ok(new_send.into_plan());
         }
         LogicalPlan::Join(Join { left, right, .. }) => {
             let lsend;

From 83a10c198cd4b976c5f37438a900a6f7bb6c628a Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Fri, 19 Jun 2026 20:46:46 +0200
Subject: [PATCH 05/17] feat(cubestore): dict-aware group keys + native
 dictionary parquet read

Make dictionary-encoded string group keys fast in the worker partial aggregate
instead of materializing strings.

- GroupByLimitAggregateStream remaps each Dictionary(Int32, Utf8) group column to
  Int32 global ids (interning each distinct value once into a per-stream global
  dictionary) and groups on those via DataFusion's primitive GroupValues path,
  rebuilding the Dictionary columns at the single final emit. The trim/merge sort
  the rebuilt dictionaries by value, so the per-worker id maps stay internal and
  cross-worker combination is unaffected.
- SuppliedSchemaReaderCustomizer pins the index (dictionary) schema on the parquet
  reader (ArrowReaderOptions::with_schema), so dictionary string columns are read
  natively from the on-disk dictionary pages instead of being materialized as Utf8
  and cast to dictionary per batch by the schema adapter. Composes with the
  configured customizer; used only for indexes that carry a Dictionary column.
- batches_to_dataframe decodes Dictionary result columns to their value type.

On the production dump (UNION, GROUP BY 6 string cols, LIMIT 10) the native read
removes the per-batch Utf8->Dictionary cast that made dictionary encoding a
regression; the query is now slightly faster than the non-dictionary path.
---
 .../group_by_limit_aggregate/dict_remap.rs    | 90 +++++++++++++++++++
 .../group_by_limit_aggregate_stream.rs        | 45 +++++++++-
 .../group_by_limit_aggregate/mod.rs           |  1 +
 .../src/queryplanner/query_executor.rs        | 87 ++++++++++++++++--
 4 files changed, 214 insertions(+), 9 deletions(-)
 create mode 100644 rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/dict_remap.rs

diff --git a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/dict_remap.rs b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/dict_remap.rs
new file mode 100644
index 0000000000000..fa72915cd586f
--- /dev/null
+++ b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/dict_remap.rs
@@ -0,0 +1,90 @@
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use datafusion::arrow::array::{Array, ArrayRef, DictionaryArray, Int32Array, StringArray};
+use datafusion::arrow::datatypes::{DataType, Int32Type};
+use datafusion::error::{DataFusionError, Result as DFResult};
+
+/// True for the only dictionary layout CubeStore produces for string group keys.
+pub(crate) fn is_int32_utf8_dict(dt: &DataType) -> bool {
+    matches!(dt, DataType::Dictionary(k, v)
+        if k.as_ref() == &DataType::Int32 && v.as_ref() == &DataType::Utf8)
+}
+
+/// Accumulates a global `String -> id` mapping across batches so a dictionary-encoded group column
+/// can be grouped as `Int32` global ids on DataFusion's fast primitive path, instead of
+/// materializing the string on every row. The per-batch string work is proportional to the batch's
+/// distinct dictionary values, not its row count. Null dictionary entries and null keys stay null.
+pub(crate) struct GlobalDict {
+    value_to_id: HashMap<String, i32>,
+    values: Vec<String>,
+}
+
+impl GlobalDict {
+    pub fn new() -> Self {
+        Self {
+            value_to_id: HashMap::new(),
+            values: Vec::new(),
+        }
+    }
+
+    fn intern_value(&mut self, v: &str) -> i32 {
+        if let Some(id) = self.value_to_id.get(v) {
+            return *id;
+        }
+        let id = self.values.len() as i32;
+        self.values.push(v.to_string());
+        self.value_to_id.insert(v.to_string(), id);
+        id
+    }
+
+    /// Remap a `Dictionary(Int32, Utf8)` array to an `Int32Array` of global ids.
+    pub fn remap(&mut self, array: &ArrayRef) -> DFResult<ArrayRef> {
+        let dict = array
+            .as_any()
+            .downcast_ref::<DictionaryArray<Int32Type>>()
+            .ok_or_else(|| {
+                DataFusionError::Internal(
+                    "GlobalDict::remap expected Dictionary(Int32)".to_string(),
+                )
+            })?;
+        let local_values = dict
+            .values()
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .ok_or_else(|| {
+                DataFusionError::Internal("GlobalDict::remap expected Utf8 values".to_string())
+            })?;
+
+        // Intern each distinct dictionary value once; `None` marks a null entry.
+        let mut local_to_global: Vec<Option<i32>> = Vec::with_capacity(local_values.len());
+        for i in 0..local_values.len() {
+            if local_values.is_null(i) {
+                local_to_global.push(None);
+            } else {
+                local_to_global.push(Some(self.intern_value(local_values.value(i))));
+            }
+        }
+
+        let ids: Int32Array = dict
+            .keys()
+            .iter()
+            .map(|k| match k {
+                Some(local) => local_to_global[local as usize],
+                None => None,
+            })
+            .collect();
+        Ok(Arc::new(ids))
+    }
+
+    /// Rebuild a `Dictionary(Int32, Utf8)` array from an `Int32Array` of global ids emitted by the
+    /// group table; the values are the full accumulated global dictionary.
+    pub fn rebuild(&self, ids: &ArrayRef) -> DFResult<ArrayRef> {
+        let ids = ids.as_any().downcast_ref::<Int32Array>().ok_or_else(|| {
+            DataFusionError::Internal("GlobalDict::rebuild expected Int32 ids".to_string())
+        })?;
+        let values = StringArray::from_iter_values(self.values.iter());
+        let dict = DictionaryArray::<Int32Type>::try_new(ids.clone(), Arc::new(values))?;
+        Ok(Arc::new(dict))
+    }
+}
diff --git a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
index e1ee77c685ca9..288a7b804d901 100644
--- a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
@@ -1,6 +1,6 @@
 use datafusion::arrow::array::{ArrayRef, AsArray, RecordBatch};
 use datafusion::arrow::compute::{lexsort_to_indices, take, SortColumn, SortOptions};
-use datafusion::arrow::datatypes::SchemaRef;
+use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use datafusion::dfschema::internal_err;
 use datafusion::error::Result as DFResult;
 use datafusion::execution::{RecordBatchStream, TaskContext};
@@ -16,6 +16,7 @@ use futures::stream::{Stream, StreamExt};
 use std::sync::Arc;
 use std::task::{Context, Poll};
 
+use super::dict_remap::{is_int32_utf8_dict, GlobalDict};
 use super::GroupByLimitAggregateExec;
 
 enum ExecutionState {
@@ -35,6 +36,9 @@ pub(crate) struct GroupByLimitAggregateStream {
     input_done: bool,
     accumulators: Vec<Box<dyn GroupsAccumulator>>,
     group_values: Box<dyn GroupValues>,
+    /// One slot per group column: `Some` for a `Dictionary(Int32, Utf8)` key grouped as Int32
+    /// global ids, `None` for a column passed through to `group_values` unchanged.
+    dict_remaps: Vec<Option<GlobalDict>>,
     current_group_indices: Vec<usize>,
     k: usize,
     factor: usize,
@@ -64,7 +68,21 @@ impl GroupByLimitAggregateStream {
             .collect::<DFResult<_>>()?;
 
         let group_schema = agg_group_by.group_schema(&agg.input().schema())?;
-        let group_values = new_group_values(group_schema, &GroupOrdering::None)?;
+        // Expose `Dictionary(Int32, Utf8)` group keys to `group_values` as plain `Int32` global ids
+        // (df's fast primitive path); other columns are passed through unchanged.
+        let mut int_fields = Vec::with_capacity(group_schema.fields().len());
+        let mut dict_remaps = Vec::with_capacity(group_schema.fields().len());
+        for field in group_schema.fields() {
+            if is_int32_utf8_dict(field.data_type()) {
+                int_fields.push(Arc::new(Field::new(field.name(), DataType::Int32, true)));
+                dict_remaps.push(Some(GlobalDict::new()));
+            } else {
+                int_fields.push(field.clone());
+                dict_remaps.push(None);
+            }
+        }
+        let int_group_schema = Arc::new(Schema::new(int_fields));
+        let group_values = new_group_values(int_group_schema, &GroupOrdering::None)?;
 
         Ok(GroupByLimitAggregateStream {
             schema: agg_schema,
@@ -77,6 +95,7 @@ impl GroupByLimitAggregateStream {
             input_done: false,
             accumulators,
             group_values,
+            dict_remaps,
             current_group_indices: Vec::with_capacity(batch_size),
             k: agg.k(),
             factor: agg.factor(),
@@ -141,14 +160,27 @@ impl RecordBatchStream for GroupByLimitAggregateStream {
 }
 
 impl GroupByLimitAggregateStream {
+    /// Remap dictionary group columns to `Int32` global ids; pass other columns through unchanged.
+    fn remap_group_cols(&mut self, cols: &[ArrayRef]) -> DFResult<Vec<ArrayRef>> {
+        let mut out = Vec::with_capacity(cols.len());
+        for (i, col) in cols.iter().enumerate() {
+            match &mut self.dict_remaps[i] {
+                Some(gd) => out.push(gd.remap(col)?),
+                None => out.push(Arc::clone(col)),
+            }
+        }
+        Ok(out)
+    }
+
     fn group_aggregate_batch(&mut self, batch: RecordBatch) -> DFResult<()> {
         let group_by_values = evaluate_group_by(&self.group_by, &batch)?;
         let input_values = evaluate_many(&self.aggregate_arguments, &batch)?;
         let filter_values = evaluate_optional(&self.filter_expressions, &batch)?;
 
         assert_eq!(group_by_values.len(), 1, "Exactly 1 group value required");
+        let group_cols = self.remap_group_cols(&group_by_values[0])?;
         self.group_values
-            .intern(&group_by_values[0], &mut self.current_group_indices)?;
+            .intern(&group_cols, &mut self.current_group_indices)?;
         let group_indices = &self.current_group_indices;
         let total_num_groups = self.group_values.len();
 
@@ -171,6 +203,13 @@ impl GroupByLimitAggregateStream {
             return Ok(None);
         }
         let mut columns = self.group_values.emit(EmitTo::All)?;
+        // Convert the Int32 global ids of dictionary keys back to `Dictionary(Int32, Utf8)` so the
+        // emitted columns match the partial-aggregate output schema.
+        for (i, remap) in self.dict_remaps.iter().enumerate() {
+            if let Some(gd) = remap {
+                columns[i] = gd.rebuild(&columns[i])?;
+            }
+        }
         for acc in &mut self.accumulators {
             columns.extend(acc.state(EmitTo::All)?);
         }
diff --git a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs
index 4fe9ba5850cca..1b463a5da54a0 100644
--- a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs
@@ -1,3 +1,4 @@
+mod dict_remap;
 mod group_by_limit_aggregate_stream;
 
 use datafusion::arrow::compute::SortOptions;
diff --git a/rust/cubestore/cubestore/src/queryplanner/query_executor.rs b/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
index f1d4318086bc1..7cf87eb00807b 100644
--- a/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
@@ -29,7 +29,7 @@ use datafusion::arrow::array::{
     Float64Array, Int16Array, Int32Array, Int64Array, MutableArrayData, NullArray, StringArray,
     TimestampMicrosecondArray, TimestampNanosecondArray, UInt16Array, UInt32Array, UInt64Array,
 };
-use datafusion::arrow::compute::{filter_record_batch, SortOptions};
+use datafusion::arrow::compute::{cast, filter_record_batch, SortOptions};
 use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit};
 use datafusion::arrow::ipc::reader::StreamReader;
 use datafusion::arrow::ipc::writer::StreamWriter;
@@ -39,7 +39,9 @@ use datafusion::common::ToDFSchema;
 use datafusion::config::TableParquetOptions;
 use datafusion::datasource::listing::PartitionedFile;
 use datafusion::datasource::object_store::ObjectStoreUrl;
-use datafusion::datasource::physical_plan::parquet::get_reader_options_customizer;
+use datafusion::datasource::physical_plan::parquet::{
+    get_reader_options_customizer, ReaderOptionsCustomizer,
+};
 use datafusion::datasource::physical_plan::{
     FileScanConfig, ParquetFileReaderFactory, ParquetSource,
 };
@@ -50,6 +52,7 @@ use datafusion::execution::memory_pool::{MemoryPool, MemoryReservation};
 use datafusion::execution::runtime_env::RuntimeEnvBuilder;
 use datafusion::execution::TaskContext;
 use datafusion::logical_expr::{Expr, LogicalPlan};
+use datafusion::parquet::arrow::arrow_reader::ArrowReaderOptions;
 use datafusion::physical_expr;
 use datafusion::physical_expr::LexOrdering;
 use datafusion::physical_expr::{
@@ -653,6 +656,26 @@ impl QueryExecutorImpl {
     }
 }
 
+/// Forces the parquet reader to produce arrays of the supplied schema directly, so
+/// `Dictionary(Int32, Utf8)` string columns are read natively from the on-disk dictionary pages
+/// instead of being materialized as `Utf8` and cast to dictionary by the schema adapter.
+#[derive(Debug)]
+struct SuppliedSchemaReaderCustomizer {
+    schema: SchemaRef,
+    inner: Arc<dyn ReaderOptionsCustomizer>,
+}
+
+impl ReaderOptionsCustomizer for SuppliedSchemaReaderCustomizer {
+    fn adjust_reader_options(
+        &self,
+        options: ArrowReaderOptions,
+    ) -> Result<ArrowReaderOptions, DataFusionError> {
+        // Compose with the configured customizer so its adjustments are kept, then pin the schema.
+        let options = self.inner.adjust_reader_options(options)?;
+        Ok(options.with_schema(Arc::clone(&self.schema)))
+    }
+}
+
 #[derive(Clone, Serialize, Deserialize)]
 pub struct CubeTable {
     index_snapshot: IndexSnapshot,
@@ -834,6 +857,22 @@ impl CubeTable {
             None
         };
 
+        // With dictionary encoding the index schema carries `Dictionary` string columns; supply it to
+        // the parquet reader so they are read natively from the dictionary pages instead of being
+        // materialized as `Utf8` and cast to dictionary per batch.
+        let reader_options_customizer: Arc<dyn ReaderOptionsCustomizer> = if index_schema
+            .fields()
+            .iter()
+            .any(|f| matches!(f.data_type(), DataType::Dictionary(_, _)))
+        {
+            Arc::new(SuppliedSchemaReaderCustomizer {
+                schema: index_schema.clone(),
+                inner: get_reader_options_customizer(state.config()),
+            })
+        } else {
+            get_reader_options_customizer(state.config())
+        };
+
         let unique_key_columns = self
             .index_snapshot
             .table_path
@@ -883,9 +922,8 @@ impl CubeTable {
                 let mut options = TableParquetOptions::new();
                 options.global = state.config_options().execution.parquet.clone();
 
-                let parquet_source =
-                    ParquetSource::new(options, get_reader_options_customizer(state.config()))
-                        .with_parquet_file_reader_factory(self.parquet_metadata_cache.clone());
+                let parquet_source = ParquetSource::new(options, reader_options_customizer.clone())
+                    .with_parquet_file_reader_factory(self.parquet_metadata_cache.clone());
                 let parquet_source = if let Some(phys_pred) = &physical_predicate {
                     parquet_source.with_predicate(index_schema.clone(), phys_pred.clone())
                 } else {
@@ -964,7 +1002,7 @@ impl CubeTable {
                     let mut options = TableParquetOptions::new();
                     options.global = state.config_options().execution.parquet.clone();
                     let parquet_source =
-                        ParquetSource::new(options, get_reader_options_customizer(state.config()))
+                        ParquetSource::new(options, reader_options_customizer.clone())
                             .with_parquet_file_reader_factory(self.parquet_metadata_cache.clone());
                     let parquet_source = if let Some(phys_pred) = &physical_predicate {
                         parquet_source.with_predicate(index_schema.clone(), phys_pred.clone())
@@ -2152,11 +2190,48 @@ macro_rules! convert_array {
     }};
 }
 
+/// Cast any `Dictionary` columns of a batch to their value type, leaving other columns untouched.
+fn decode_dictionary_columns(batch: RecordBatch) -> Result<RecordBatch, CubeError> {
+    let schema = batch.schema();
+    if !schema
+        .fields()
+        .iter()
+        .any(|f| matches!(f.data_type(), DataType::Dictionary(_, _)))
+    {
+        return Ok(batch);
+    }
+    let mut fields = Vec::with_capacity(schema.fields().len());
+    let mut columns = Vec::with_capacity(batch.num_columns());
+    for (field, column) in schema.fields().iter().zip(batch.columns()) {
+        match field.data_type() {
+            DataType::Dictionary(_, value_type) => {
+                columns.push(cast(column, value_type)?);
+                fields.push(Arc::new(Field::new(
+                    field.name(),
+                    value_type.as_ref().clone(),
+                    field.is_nullable(),
+                )));
+            }
+            _ => {
+                columns.push(column.clone());
+                fields.push(field.clone());
+            }
+        }
+    }
+    Ok(RecordBatch::try_new(
+        Arc::new(Schema::new(fields)),
+        columns,
+    )?)
+}
+
 pub fn batches_to_dataframe(batches: Vec<RecordBatch>) -> Result<DataFrame, CubeError> {
     let mut cols = vec![];
     let mut all_rows = vec![];
 
     for batch in batches.into_iter() {
+        // Dictionary-encoded columns (string group keys) reach the result as `Dictionary(_, _)`;
+        // decode them to their value type for the row conversion below.
+        let batch = decode_dictionary_columns(batch)?;
         if cols.len() == 0 {
             for (i, field) in batch.schema().fields().iter().enumerate() {
                 cols.push(Column::new(

From 302f9a51fdbde55f681f6e8d06e541e834e22d66 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Fri, 19 Jun 2026 21:01:08 +0200
Subject: [PATCH 06/17] perf(cubestore): vectorize dictionary group-key remap

Replace the per-row build of the global-id Int32Array (iterate keys, match
Option, collect) with a vectorized `take`: build the local->global map once per
batch as an Int32Array (null where the dictionary entry is null) and gather it by
the dictionary keys. Null keys and null entries propagate to null, identical to
the previous semantics. Cuts the remap from ~16% of worker compute to negligible
(~3.7s -> ~3.1s on the production-dump UNION query, now ~25% faster than the
non-dictionary path).
---
 .../group_by_limit_aggregate/dict_remap.rs    | 27 +++++++++----------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/dict_remap.rs b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/dict_remap.rs
index fa72915cd586f..34a44c61ef3e0 100644
--- a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/dict_remap.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/dict_remap.rs
@@ -1,7 +1,10 @@
 use std::collections::HashMap;
 use std::sync::Arc;
 
-use datafusion::arrow::array::{Array, ArrayRef, DictionaryArray, Int32Array, StringArray};
+use datafusion::arrow::array::{
+    Array, ArrayRef, DictionaryArray, Int32Array, Int32Builder, StringArray,
+};
+use datafusion::arrow::compute::take;
 use datafusion::arrow::datatypes::{DataType, Int32Type};
 use datafusion::error::{DataFusionError, Result as DFResult};
 
@@ -56,25 +59,21 @@ impl GlobalDict {
                 DataFusionError::Internal("GlobalDict::remap expected Utf8 values".to_string())
             })?;
 
-        // Intern each distinct dictionary value once; `None` marks a null entry.
-        let mut local_to_global: Vec<Option<i32>> = Vec::with_capacity(local_values.len());
+        // local id -> global id, interning each distinct value once; a null dictionary entry is a
+        // null in this map. Built once per batch (O(distinct values)).
+        let mut builder = Int32Builder::with_capacity(local_values.len());
         for i in 0..local_values.len() {
             if local_values.is_null(i) {
-                local_to_global.push(None);
+                builder.append_null();
             } else {
-                local_to_global.push(Some(self.intern_value(local_values.value(i))));
+                builder.append_value(self.intern_value(local_values.value(i)));
             }
         }
+        let local_to_global = builder.finish();
 
-        let ids: Int32Array = dict
-            .keys()
-            .iter()
-            .map(|k| match k {
-                Some(local) => local_to_global[local as usize],
-                None => None,
-            })
-            .collect();
-        Ok(Arc::new(ids))
+        // Gather the global id per row via a vectorized take: null keys and null dictionary entries
+        // both propagate to null, matching how the string path groups nulls.
+        Ok(take(&local_to_global, dict.keys(), None)?)
     }
 
     /// Rebuild a `Dictionary(Int32, Utf8)` array from an `Int32Array` of global ids emitted by the

From d38cb8e4aa5447863d80f580a81d41626c70e417 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Fri, 19 Jun 2026 21:13:20 +0200
Subject: [PATCH 07/17] test(cubestore): cover dictionary group-key remap and
 native dict read

- GlobalDict unit tests: global ids stay consistent across batches with different
  local dictionaries, and null keys / null dictionary entries round-trip as null.
- End-to-end test with dictionary encoding on: a String group key alongside
  Timestamp and Decimal columns, compacted to a parquet partition, then grouped
  by the string. Locks in that the native dictionary read (with_supplied_schema)
  accepts every column type and the result decodes back to strings.
---
 .../group_by_limit_aggregate/dict_remap.rs    | 79 +++++++++++++++++
 .../cubestore/src/store/compaction.rs         | 87 +++++++++++++++++++
 2 files changed, 166 insertions(+)

diff --git a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/dict_remap.rs b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/dict_remap.rs
index 34a44c61ef3e0..99e158d4f3b71 100644
--- a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/dict_remap.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/dict_remap.rs
@@ -87,3 +87,82 @@ impl GlobalDict {
         Ok(Arc::new(dict))
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn dict(values: Vec<Option<&str>>, keys: Vec<Option<i32>>) -> ArrayRef {
+        let values = StringArray::from(values);
+        let keys = Int32Array::from(keys);
+        Arc::new(DictionaryArray::<Int32Type>::try_new(keys, Arc::new(values)).unwrap())
+    }
+
+    fn ids(a: &ArrayRef) -> Int32Array {
+        a.as_any().downcast_ref::<Int32Array>().unwrap().clone()
+    }
+
+    fn rebuilt_strings(a: &ArrayRef) -> Vec<Option<String>> {
+        let d = a
+            .as_any()
+            .downcast_ref::<DictionaryArray<Int32Type>>()
+            .unwrap();
+        let v = d.values().as_any().downcast_ref::<StringArray>().unwrap();
+        d.keys()
+            .iter()
+            .map(|k| k.map(|k| v.value(k as usize).to_string()))
+            .collect()
+    }
+
+    #[test]
+    fn remaps_to_consistent_global_ids_across_batches() {
+        let mut gd = GlobalDict::new();
+        // batch 1: local dict ["b", "a"], rows b, a, b
+        let b1 = ids(&gd
+            .remap(&dict(
+                vec![Some("b"), Some("a")],
+                vec![Some(0), Some(1), Some(0)],
+            ))
+            .unwrap());
+        // batch 2: a DIFFERENT local dict ["a", "c"], rows c, a -- "a" must reuse its global id
+        let b2 = ids(&gd
+            .remap(&dict(vec![Some("a"), Some("c")], vec![Some(1), Some(0)]))
+            .unwrap());
+
+        assert_eq!(b1.values(), &[0, 1, 0]); // b=0, a=1 (first-seen)
+        assert_eq!(b2.value(1), b1.value(1)); // same string "a" -> same global id across batches
+        assert_ne!(b2.value(0), b1.value(0)); // "c" is a new id
+
+        // rebuild over the accumulated global ids yields the original strings
+        let all: ArrayRef = Arc::new(Int32Array::from(vec![
+            b1.value(0),
+            b1.value(1),
+            b2.value(0),
+        ]));
+        assert_eq!(
+            rebuilt_strings(&gd.rebuild(&all).unwrap()),
+            vec![
+                Some("b".to_string()),
+                Some("a".to_string()),
+                Some("c".to_string())
+            ]
+        );
+    }
+
+    #[test]
+    fn null_keys_and_null_entries_stay_null() {
+        let mut gd = GlobalDict::new();
+        // local dict ["x", null]; rows: x, null-key, points-to-null-entry
+        let r = gd
+            .remap(&dict(vec![Some("x"), None], vec![Some(0), None, Some(1)]))
+            .unwrap();
+        let r = ids(&r);
+        assert!(r.is_valid(0));
+        assert!(r.is_null(1));
+        assert!(r.is_null(2));
+        assert_eq!(
+            rebuilt_strings(&gd.rebuild(&(Arc::new(r) as ArrayRef)).unwrap()),
+            vec![Some("x".to_string()), None, None]
+        );
+    }
+}
diff --git a/rust/cubestore/cubestore/src/store/compaction.rs b/rust/cubestore/cubestore/src/store/compaction.rs
index 9576f2ed0cdec..186d1d4e9f77f 100644
--- a/rust/cubestore/cubestore/src/store/compaction.rs
+++ b/rust/cubestore/cubestore/src/store/compaction.rs
@@ -2785,6 +2785,93 @@ mod tests {
             .await;
     }
 
+    #[tokio::test]
+    async fn dictionary_encoding_native_read_group_by() {
+        Config::test("dictionary_encoding_native_read_group_by")
+            .update_config(|mut c| {
+                c.dictionary_encoding_enabled = true;
+                c
+            })
+            .start_test(async move |services| {
+                let service = services.sql_service;
+                let compaction_service = services
+                    .injector
+                    .get_service_typed::<dyn CompactionService>()
+                    .await;
+                service
+                    .exec_query("CREATE SCHEMA d")
+                    .await
+                    .unwrap()
+                    .collect()
+                    .await
+                    .unwrap();
+                // A string group key alongside Timestamp and Decimal columns: with dictionary encoding
+                // the parquet reader is handed the dictionary schema, whose strict per-field check must
+                // accept every column type (not just the dictionary one).
+                service
+                    .exec_query("CREATE TABLE d.t (s text, ts timestamp, dec decimal(10,2), n int)")
+                    .await
+                    .unwrap()
+                    .collect()
+                    .await
+                    .unwrap();
+                service
+                    .exec_query(
+                        "INSERT INTO d.t (s, ts, dec, n) VALUES \
+                         ('b', '2020-01-01T00:00:00.000Z', 1.50, 10), \
+                         ('a', '2020-01-02T00:00:00.000Z', 2.50, 5), \
+                         ('b', '2020-01-03T00:00:00.000Z', 3.50, 7), \
+                         ('c', '2020-01-04T00:00:00.000Z', 4.50, 1), \
+                         ('a', '2020-01-05T00:00:00.000Z', 2.00, 2)",
+                    )
+                    .await
+                    .unwrap()
+                    .collect()
+                    .await
+                    .unwrap();
+                // Persist the in-memory chunk into the partition's parquet main table so the scan goes
+                // through the native dictionary reader rather than the in-memory chunk.
+                compaction_service
+                    .compact(1, DataLoadedSize::new())
+                    .await
+                    .unwrap();
+                let partitions = services
+                    .meta_store
+                    .get_active_partitions_by_index_id(1)
+                    .await
+                    .unwrap();
+                assert_eq!(partitions.len(), 1);
+                assert_eq!(partitions[0].get_row().main_table_row_count(), 5);
+
+                let result = service
+                    .exec_query("SELECT s, sum(n) FROM d.t GROUP BY 1 ORDER BY 1")
+                    .await
+                    .unwrap()
+                    .collect()
+                    .await
+                    .unwrap();
+                assert_eq!(
+                    result.get_rows(),
+                    &vec![
+                        Row::new(vec![
+                            TableValue::String("a".to_string()),
+                            TableValue::Int(7)
+                        ]),
+                        Row::new(vec![
+                            TableValue::String("b".to_string()),
+                            TableValue::Int(17)
+                        ]),
+                        Row::new(vec![
+                            TableValue::String("c".to_string()),
+                            TableValue::Int(1)
+                        ]),
+                    ]
+                );
+                Ok::<(), CubeError>(())
+            })
+            .await;
+    }
+
     #[tokio::test]
     async fn compaction_wide_string_batches() {
         // Each chunk is read as a single sorted run whose batches keep their on-disk row-group

From 4c9d5a6462ad1284ca86de3b1fc1ff4049dab0f7 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Fri, 19 Jun 2026 22:13:46 +0200
Subject: [PATCH 08/17] feat(cubestore): parallel hash-final worker top-k
 (experimental, flag-gated)

Behind CUBESTORE_GROUP_BY_LIMIT_HASH_FINAL (default off, gated on
group_by_limit_factor > 0): route the worker top-k through a router-side hash
final aggregate instead of the sorted-merge framework.

- Worker subtree: CoalescePartitions <- GroupByLimitAggregate (drops the
  per-partition SortExec and SortPreservingMerge; emits the trimmed top-k
  unsorted). CoalescePartitions spawns a task per input partition, so the
  per-union-branch aggregates run in parallel -- the sorted merge drained them
  on a single task.
- Router: SortExec(T, fetch=k) <- AggregateExec(Final, hash) <-
  CoalescePartitions <- ClusterSend. The explicit top-k sort by the full total
  order T is required even for a bare LIMIT: it keeps exactly the groups every
  worker kept (fully combined here), where a plain limit could take an
  undercounted group only one worker retained.

On the production-dump UNION query (GROUP BY 6 cols, LIMIT 10): ~2.1x faster
(~3.1s -> ~1.5s) at neutral peak RSS, because the worker now uses ~2 cores
instead of 1. Same result as the sorted path (ORDER BY and bare LIMIT verified).

The flag must be set fleet-wide: worker and router re-plan independently, and a
worker-on/router-off split would feed unsorted streams into a sorted merge.
---
 .../distributed_partial_aggregate.rs          | 78 +++++++++++++++----
 1 file changed, 62 insertions(+), 16 deletions(-)

diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
index cde5f026e8e2f..c810ac11558f0 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
@@ -338,6 +338,27 @@ pub fn push_worker_sort_and_limit(
     let new_cs: Arc<dyn ExecutionPlan> =
         Arc::new(cs.with_changed_schema(new_worker_subtree, cs.required_input_ordering.clone()));
     let worker_order = worker_ordering(&final_agg.group_expr, &cols)?;
+
+    // Hash-final variant: combine the worker top-k with a hash final aggregate over coalesced
+    // (unordered) streams, then re-apply the top-k sort by the total order. The Sort(fetch) is
+    // required even for a bare LIMIT: it picks the k smallest by the total order, which are exactly
+    // the groups every worker kept (and therefore fully combined here); a plain limit could take a
+    // group only one worker kept (undercounted).
+    if group_by_limit_factor > 0 && hash_final_enabled() {
+        let coalesced: Arc<dyn ExecutionPlan> = Arc::new(CoalescePartitionsExec::new(new_cs));
+        let final_hash: Arc<dyn ExecutionPlan> = Arc::new(AggregateExec::try_new(
+            AggregateMode::Final,
+            final_agg.group_expr,
+            final_agg.aggr_expr,
+            final_agg.filter_expr,
+            coalesced,
+            final_agg.input_schema,
+        )?);
+        return Ok(Arc::new(
+            SortExec::new(worker_order, final_hash).with_fetch(Some(fetch)),
+        ));
+    }
+
     let merged: Arc<dyn ExecutionPlan> =
         Arc::new(SortPreservingMergeExec::new(worker_order, new_cs));
     Ok(Arc::new(AggregateExec::try_new(
@@ -387,25 +408,10 @@ fn resort_worker_subtree(
     group_by_limit_factor: usize,
 ) -> Option<Arc<dyn ExecutionPlan>> {
     let partial = locate_partial_aggregate(worker_subtree)?;
-    let schema = partial.schema();
-    let mut exprs = Vec::with_capacity(cols.len());
-    for (idx, asc, nulls_first) in cols {
-        let field = schema.fields().get(*idx)?;
-        exprs.push(PhysicalSortExpr {
-            expr: Arc::new(Column::new(field.name(), *idx)),
-            options: SortOptions {
-                descending: !asc,
-                nulls_first: *nulls_first,
-            },
-        });
-    }
-    let worker_order = LexOrdering::new(exprs);
 
     // Trim during aggregation: replace the partial hash aggregate with a GroupByLimitAggregateExec
     // so it keeps only the top-k groups by `cols` instead of materializing every group and letting
-    // the Sort below trim afterwards. The Sort above still sorts the (now <= k) groups so the
-    // router's sort-preserving merge stays correct. `group_by_limit_factor == 0` disables this and
-    // falls back to the plain sort-then-trim.
+    // the Sort below trim afterwards. `group_by_limit_factor == 0` disables this.
     let trimmed: Arc<dyn ExecutionPlan> = if group_by_limit_factor > 0 {
         if let Some(agg) = partial.as_any().downcast_ref::<AggregateExec>() {
             let order: Vec<(usize, SortOptions)> = cols
@@ -436,6 +442,33 @@ fn resort_worker_subtree(
         partial
     };
 
+    // Hash-final variant (CUBESTORE_GROUP_BY_LIMIT_HASH_FINAL): emit the trimmed top-k unsorted,
+    // coalesced to one stream. The router hash-combines and re-applies the top-k sort, so no
+    // worker-side sort/merge holds the whole result, and the aggregates run in parallel
+    // (CoalescePartitions spawns a task per input). Gated on `group_by_limit_factor > 0`; when
+    // `worker_sort_and_limit` is set the partial is the hash `AggregateExec` so the trim engages and
+    // the coalesced output is <= partitions*k rows. (If the trim ever fell back to the untrimmed
+    // partial -- e.g. grouping sets -- the router hash final would size to all groups: a memory
+    // regression, not a correctness bug.)
+    if group_by_limit_factor > 0 && hash_final_enabled() {
+        return Some(Arc::new(CoalescePartitionsExec::new(trimmed)));
+    }
+
+    // Sorted path: per-partition bounded sort, then a sort-preserving merge, so the router's sorted
+    // final aggregate sees adjacent equal keys and its limit cuts on the total order.
+    let schema = trimmed.schema();
+    let mut exprs = Vec::with_capacity(cols.len());
+    for (idx, asc, nulls_first) in cols {
+        let field = schema.fields().get(*idx)?;
+        exprs.push(PhysicalSortExpr {
+            expr: Arc::new(Column::new(field.name(), *idx)),
+            options: SortOptions {
+                descending: !asc,
+                nulls_first: *nulls_first,
+            },
+        });
+    }
+    let worker_order = LexOrdering::new(exprs);
     let per_partition_sort: Arc<dyn ExecutionPlan> = Arc::new(
         SortExec::new(worker_order.clone(), trimmed)
             .with_fetch(Some(fetch))
@@ -447,6 +480,19 @@ fn resort_worker_subtree(
     )))
 }
 
+/// Experiment toggle: route the worker top-k through a router-side hash final aggregate instead of
+/// the sorted-merge framework.
+///
+/// Must be set consistently across the whole fleet: the worker and router re-plan in separate
+/// processes, each reading this flag. A worker emitting unsorted top-k (flag on) into a router that
+/// still expects the sorted-merge input (flag off) would silently undercount. Set it via deployment
+/// env so every node agrees.
+fn hash_final_enabled() -> bool {
+    std::env::var("CUBESTORE_GROUP_BY_LIMIT_HASH_FINAL")
+        .map(|v| v == "true" || v == "1")
+        .unwrap_or(false)
+}
+
 /// The group/aggregate state of either an `InlineAggregateExec` or a plain `AggregateExec`, when in
 /// Final mode.
 struct FinalAggregateInfo {

From d43016768886f13b8ad036ceae37c7cb0578ecf8 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Fri, 19 Jun 2026 22:44:32 +0200
Subject: [PATCH 09/17] feat(cubestore): per-partition N-way top-k aggregate
 (experimental, flag-gated)

Behind CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION (default off; only active with
CUBESTORE_GROUP_BY_LIMIT_HASH_FINAL and group_by_limit_factor > 0): strip the
CoalescePartitions below the trimmed worker aggregate so it runs over every raw
CubeTableExec partition instead of one stream per union branch. The hash-final
CoalescePartitions on top then parallelizes all partitions.

Per-partition top-k stays complete by the same total-order argument as the
per-worker cut: a group in the global top-k by T has fewer than k smaller-keyed
groups globally, hence in any single partition, so it survives every partition's
local top-k and reaches the router fully combined. The strip only removes
CoalescePartitions, never the SortPreservingMerge that feeds the single-partition
LastRowByUniqueKeyExec, so unique-key tables stay correct.

On the production-dump UNION query: ~799ms vs ~1504ms (hash-final 3-way) and
~2159ms (master), ~9 cores at peak, peak RSS still neutral because the
index-sorted partitions are key-local (small per-partition tables). Memory is
bounded by partition count, so a group key spread across partitions would cost
~N x; gated off by default.
---
 .../distributed_partial_aggregate.rs          | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
index c810ac11558f0..9fa8df19dcd29 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
@@ -409,6 +409,22 @@ fn resort_worker_subtree(
 ) -> Option<Arc<dyn ExecutionPlan>> {
     let partial = locate_partial_aggregate(worker_subtree)?;
 
+    // N-way experiment: drop the per-partition merge below the aggregate so it runs per partition;
+    // the hash-final CoalescePartitions then parallelizes all partitions. Only with hash-final (the
+    // sorted path's SortPreservingMerge would not parallelize the extra partitions anyway).
+    let partial = if group_by_limit_factor > 0 && hash_final_enabled() && per_partition_enabled() {
+        partial
+            .as_any()
+            .downcast_ref::<AggregateExec>()
+            .and_then(|agg| {
+                let new_input = strip_coalesce_partitions(agg.input()).ok()?;
+                partial.clone().with_new_children(vec![new_input]).ok()
+            })
+            .unwrap_or(partial)
+    } else {
+        partial
+    };
+
     // Trim during aggregation: replace the partial hash aggregate with a GroupByLimitAggregateExec
     // so it keeps only the top-k groups by `cols` instead of materializing every group and letting
     // the Sort below trim afterwards. `group_by_limit_factor == 0` disables this.
@@ -493,6 +509,39 @@ fn hash_final_enabled() -> bool {
         .unwrap_or(false)
 }
 
+/// Experiment toggle (only meaningful with [hash_final_enabled]): drop the per-partition merge
+/// below the trimmed aggregate so it runs per partition (N-way) instead of per union branch. The
+/// hash-final `CoalescePartitions` then parallelizes all partitions.
+///
+/// Trades memory for parallelism: one group table per DF partition (= per parquet file and chunk),
+/// so peak memory is bounded by the partition count, not by k. It stays low only while partitions
+/// are key-local (group keys do not span partitions) -- true for index-sorted pre-aggregations. On
+/// a group key spread across many partitions each table approaches full cardinality (~N x memory).
+fn per_partition_enabled() -> bool {
+    std::env::var("CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION")
+        .map(|v| v == "true" || v == "1")
+        .unwrap_or(false)
+}
+
+/// Recursively drop `CoalescePartitionsExec` nodes, exposing the underlying multi-partition streams
+/// to the parent.
+fn strip_coalesce_partitions(
+    p: &Arc<dyn ExecutionPlan>,
+) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
+    if p.as_any().is::<CoalescePartitionsExec>() {
+        return strip_coalesce_partitions(&p.children()[0].clone());
+    }
+    let children = p.children();
+    if children.is_empty() {
+        return Ok(p.clone());
+    }
+    let new_children = children
+        .iter()
+        .map(|c| strip_coalesce_partitions(c))
+        .collect::<Result<Vec<_>, _>>()?;
+    p.clone().with_new_children(new_children)
+}
+
 /// The group/aggregate state of either an `InlineAggregateExec` or a plain `AggregateExec`, when in
 /// Final mode.
 struct FinalAggregateInfo {

From e52d56ab34bd50f2684ab4a6362ae3bf7023e01a Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Fri, 19 Jun 2026 22:51:59 +0200
Subject: [PATCH 10/17] refactor(cubestore): drop sorted worker top-k,
 hash-final is the only path

The sorted-merge worker top-k (per-partition SortExec + SortPreservingMerge ->
router SortedFinalAggregate) drained the worker partitions on a single task, so
it ran on one core and was slower than master, which keeps a parallel
CoalescePartitions over the partial aggregate. The hash-final path (worker
CoalescePartitions -> router hash Final + Sort(T, fetch=k)) is strictly better:
same memory, ~2x faster because CoalescePartitions parallelizes the per-partition
aggregates. So remove the sorted variant and the CUBESTORE_GROUP_BY_LIMIT_HASH_FINAL
flag entirely; hash-final is now how the trimmed top-k is always combined.

Two knobs remain:
- group_by_limit_factor (env CUBESTORE_GROUP_BY_LIMIT_FACTOR, >0): whether to use
  the trimming aggregate at all. 0 leaves the plan untouched (master behavior).
- CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION: whether to push it below the merge
  (per-partition N-way) -- still experimental, default off.

push_worker_sort_and_limit now returns early when factor == 0. Existing
planning/execution/limit_pushdown/union tests pass against the hash-final plan.
---
 .../distributed_partial_aggregate.rs          | 134 +++++-------------
 1 file changed, 38 insertions(+), 96 deletions(-)

diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
index 9fa8df19dcd29..970de53ba5ecf 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
@@ -289,7 +289,13 @@ pub fn push_worker_sort_and_limit(
     p: Arc<dyn ExecutionPlan>,
     group_by_limit_factor: usize,
 ) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
-    // Worker side: wrap the partial aggregate with a per-partition bounded sort.
+    // The worker top-k engages only when the trimming aggregate is enabled (factor > 0); otherwise
+    // leave the plan as planned (no trim).
+    if group_by_limit_factor == 0 {
+        return Ok(p);
+    }
+
+    // Worker side: replace the partial aggregate's merge with the trimmed top-k.
     if let Some(w) = p.as_any().downcast_ref::<WorkerExec>() {
         let Some((cols, fetch)) = w.worker_sort_and_limit.clone() else {
             return Ok(p);
@@ -310,10 +316,8 @@ pub fn push_worker_sort_and_limit(
         )));
     }
 
-    // Router side: rebuild the final aggregate over a sort-preserving merge in `worker_order`, and
-    // reorder the (optimization-only) worker subtree to match. Same keys are adjacent in
-    // `worker_order`, so the sorted final combines them; its output is `worker_order`-sorted (whose
-    // prefix is the query's ORDER BY), so the limit above stays correct.
+    // Router side: combine the workers' top-k with a hash final aggregate, then re-apply the top-k
+    // sort by `worker_order` (the total order T).
     let Some(final_agg) = FinalAggregateInfo::extract(&p) else {
         return Ok(p);
     };
@@ -339,36 +343,23 @@ pub fn push_worker_sort_and_limit(
         Arc::new(cs.with_changed_schema(new_worker_subtree, cs.required_input_ordering.clone()));
     let worker_order = worker_ordering(&final_agg.group_expr, &cols)?;
 
-    // Hash-final variant: combine the worker top-k with a hash final aggregate over coalesced
-    // (unordered) streams, then re-apply the top-k sort by the total order. The Sort(fetch) is
-    // required even for a bare LIMIT: it picks the k smallest by the total order, which are exactly
-    // the groups every worker kept (and therefore fully combined here); a plain limit could take a
-    // group only one worker kept (undercounted).
-    if group_by_limit_factor > 0 && hash_final_enabled() {
-        let coalesced: Arc<dyn ExecutionPlan> = Arc::new(CoalescePartitionsExec::new(new_cs));
-        let final_hash: Arc<dyn ExecutionPlan> = Arc::new(AggregateExec::try_new(
-            AggregateMode::Final,
-            final_agg.group_expr,
-            final_agg.aggr_expr,
-            final_agg.filter_expr,
-            coalesced,
-            final_agg.input_schema,
-        )?);
-        return Ok(Arc::new(
-            SortExec::new(worker_order, final_hash).with_fetch(Some(fetch)),
-        ));
-    }
-
-    let merged: Arc<dyn ExecutionPlan> =
-        Arc::new(SortPreservingMergeExec::new(worker_order, new_cs));
-    Ok(Arc::new(AggregateExec::try_new(
+    // Combine the worker top-k with a hash final aggregate over coalesced (unordered) streams, then
+    // re-apply the top-k sort by the total order T. The Sort(fetch) is required even for a bare
+    // LIMIT: it keeps the k smallest by T -- exactly the groups every worker kept and fully combined
+    // here -- where a plain limit could take a group only one worker kept (undercounted). The
+    // coalesce also drains the workers' streams in parallel.
+    let coalesced: Arc<dyn ExecutionPlan> = Arc::new(CoalescePartitionsExec::new(new_cs));
+    let final_hash: Arc<dyn ExecutionPlan> = Arc::new(AggregateExec::try_new(
         AggregateMode::Final,
         final_agg.group_expr,
         final_agg.aggr_expr,
         final_agg.filter_expr,
-        merged,
+        coalesced,
         final_agg.input_schema,
-    )?))
+    )?);
+    Ok(Arc::new(
+        SortExec::new(worker_order, final_hash).with_fetch(Some(fetch)),
+    ))
 }
 
 /// Builds the `worker_order` LexOrdering over an aggregate's group columns from the descriptor.
@@ -393,14 +384,9 @@ fn worker_ordering(
     Ok(LexOrdering::new(exprs))
 }
 
-/// Rebuilds a worker subtree as `SortPreservingMerge(worker_order) <- Sort(worker_order, fetch, per
-/// partition) <- partial`. Returns `None` for an unrecognized or already-rewritten subtree, which
-/// keeps [push_worker_sort_and_limit] idempotent.
-///
-/// The per-partition `Sort` does the bounding (a bounded heap, O(fetch) memory); the merge above it
-/// carries no fetch. Because this pass runs last, `replace_suboptimal_merge_sorts` has already run
-/// and won't push the query's row limit into the merge -- which would cut the merged stream of
-/// (still uncombined) partial rows by rows and undercount groups split across partitions.
+/// Rebuilds a worker subtree as `CoalescePartitions <- GroupByLimitAggregate <- ...`: the trimmed
+/// top-k is emitted unsorted and coalesced to one stream for the router's hash final aggregate.
+/// Returns `None` for an unrecognized subtree (no locatable partial aggregate).
 fn resort_worker_subtree(
     worker_subtree: &Arc<dyn ExecutionPlan>,
     cols: &[(usize, bool, bool)],
@@ -409,10 +395,10 @@ fn resort_worker_subtree(
 ) -> Option<Arc<dyn ExecutionPlan>> {
     let partial = locate_partial_aggregate(worker_subtree)?;
 
-    // N-way experiment: drop the per-partition merge below the aggregate so it runs per partition;
-    // the hash-final CoalescePartitions then parallelizes all partitions. Only with hash-final (the
-    // sorted path's SortPreservingMerge would not parallelize the extra partitions anyway).
-    let partial = if group_by_limit_factor > 0 && hash_final_enabled() && per_partition_enabled() {
+    // Per-partition (CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION): drop the merge below the aggregate so
+    // it runs over every raw partition; the CoalescePartitions below then parallelizes all of them
+    // (N-way) instead of one stream per union branch.
+    let partial = if per_partition_enabled() {
         partial
             .as_any()
             .downcast_ref::<AggregateExec>()
@@ -426,8 +412,8 @@ fn resort_worker_subtree(
     };
 
     // Trim during aggregation: replace the partial hash aggregate with a GroupByLimitAggregateExec
-    // so it keeps only the top-k groups by `cols` instead of materializing every group and letting
-    // the Sort below trim afterwards. `group_by_limit_factor == 0` disables this.
+    // keeping only the top-k groups by `cols`. `group_by_limit_factor` is the trim threshold; the
+    // caller only reaches here with it > 0.
     let trimmed: Arc<dyn ExecutionPlan> = if group_by_limit_factor > 0 {
         if let Some(agg) = partial.as_any().downcast_ref::<AggregateExec>() {
             let order: Vec<(usize, SortOptions)> = cols
@@ -458,60 +444,16 @@ fn resort_worker_subtree(
         partial
     };
 
-    // Hash-final variant (CUBESTORE_GROUP_BY_LIMIT_HASH_FINAL): emit the trimmed top-k unsorted,
-    // coalesced to one stream. The router hash-combines and re-applies the top-k sort, so no
-    // worker-side sort/merge holds the whole result, and the aggregates run in parallel
-    // (CoalescePartitions spawns a task per input). Gated on `group_by_limit_factor > 0`; when
-    // `worker_sort_and_limit` is set the partial is the hash `AggregateExec` so the trim engages and
-    // the coalesced output is <= partitions*k rows. (If the trim ever fell back to the untrimmed
-    // partial -- e.g. grouping sets -- the router hash final would size to all groups: a memory
-    // regression, not a correctness bug.)
-    if group_by_limit_factor > 0 && hash_final_enabled() {
-        return Some(Arc::new(CoalescePartitionsExec::new(trimmed)));
-    }
-
-    // Sorted path: per-partition bounded sort, then a sort-preserving merge, so the router's sorted
-    // final aggregate sees adjacent equal keys and its limit cuts on the total order.
-    let schema = trimmed.schema();
-    let mut exprs = Vec::with_capacity(cols.len());
-    for (idx, asc, nulls_first) in cols {
-        let field = schema.fields().get(*idx)?;
-        exprs.push(PhysicalSortExpr {
-            expr: Arc::new(Column::new(field.name(), *idx)),
-            options: SortOptions {
-                descending: !asc,
-                nulls_first: *nulls_first,
-            },
-        });
-    }
-    let worker_order = LexOrdering::new(exprs);
-    let per_partition_sort: Arc<dyn ExecutionPlan> = Arc::new(
-        SortExec::new(worker_order.clone(), trimmed)
-            .with_fetch(Some(fetch))
-            .with_preserve_partitioning(true),
-    );
-    Some(Arc::new(SortPreservingMergeExec::new(
-        worker_order,
-        per_partition_sort,
-    )))
-}
-
-/// Experiment toggle: route the worker top-k through a router-side hash final aggregate instead of
-/// the sorted-merge framework.
-///
-/// Must be set consistently across the whole fleet: the worker and router re-plan in separate
-/// processes, each reading this flag. A worker emitting unsorted top-k (flag on) into a router that
-/// still expects the sorted-merge input (flag off) would silently undercount. Set it via deployment
-/// env so every node agrees.
-fn hash_final_enabled() -> bool {
-    std::env::var("CUBESTORE_GROUP_BY_LIMIT_HASH_FINAL")
-        .map(|v| v == "true" || v == "1")
-        .unwrap_or(false)
+    // Emit the trimmed top-k unsorted, coalesced to one stream. The router hash-combines and
+    // re-applies the top-k sort, so no worker-side sort/merge holds the whole result and the
+    // per-partition aggregates run in parallel (CoalescePartitions spawns a task per input) instead
+    // of being drained one at a time by a sort-preserving merge.
+    Some(Arc::new(CoalescePartitionsExec::new(trimmed)))
 }
 
-/// Experiment toggle (only meaningful with [hash_final_enabled]): drop the per-partition merge
-/// below the trimmed aggregate so it runs per partition (N-way) instead of per union branch. The
-/// hash-final `CoalescePartitions` then parallelizes all partitions.
+/// Toggle (CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION): drop the merge below the trimmed aggregate so it
+/// runs per partition (N-way) instead of per union branch. The `CoalescePartitions` then
+/// parallelizes all partitions.
 ///
 /// Trades memory for parallelism: one group table per DF partition (= per parquet file and chunk),
 /// so peak memory is bounded by the partition count, not by k. It stays low only while partitions

From 19e7af0250ae0e3aff40e6c9a16ffd798d62dcd4 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Fri, 19 Jun 2026 23:21:51 +0200
Subject: [PATCH 11/17] feat(cubestore): dict-encode in-memory chunks on scan

With dictionary encoding on, the index schema exposes string columns as
Dictionary(Int32, Utf8), but in-memory chunks are written with plain Utf8. The
memory scan previously rejected this as an "index schema / chunk schema
mismatch", so any query touching an uncompacted (in-memory) chunk failed under
dictionary encoding. Cast the chunk batches up to the index schema instead
(Utf8 -> Dictionary via cast_record_batch_to_schema): the memory scan now matches
the dictionary parquet partitions and feeds the dict-aware aggregate, so
streaming / freshly-imported data gets the same path and benefit.

Tests:
- test_cast_record_batch_to_dictionary_schema: the cast preserves values and
  nulls and leaves non-string columns untouched.
- dictionary_encoding_in_memory_group_by: dictionary encoding on, a unique-key
  table (streaming in-memory chunks), no compaction, group-by the string column
  returns correct sums (asserts the chunks are in-memory).
---
 .../src/queryplanner/query_executor.rs        | 113 ++++++++++++++++--
 .../cubestore/src/store/compaction.rs         |  80 +++++++++++++
 2 files changed, 182 insertions(+), 11 deletions(-)

diff --git a/rust/cubestore/cubestore/src/queryplanner/query_executor.rs b/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
index 7cf87eb00807b..093b0a95fbaf2 100644
--- a/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
@@ -970,18 +970,21 @@ impl CubeTable {
                             "Record batch for in memory chunk {:?} is not provided",
                             chunk
                         )))?;
-                    if let Some(batch) = record_batches.iter().next() {
-                        if batch.schema() != index_schema {
-                            return Err(CubeError::internal(format!(
-                                "Index schema {:?} and in memory chunk schema {:?} mismatch",
-                                index_schema,
-                                record_batches[0].schema()
-                            )));
-                        }
-                    }
+                    // In-memory chunks are written with plain column types. When dictionary encoding
+                    // is on the index schema exposes string columns as Dictionary, so cast the chunk
+                    // batches to the index schema (Utf8 -> Dictionary) instead of rejecting them --
+                    // this keeps the memory scan consistent with the dictionary parquet partitions
+                    // and feeds the dict-aware aggregate.
+                    let record_batches = match record_batches.iter().next() {
+                        Some(batch) if batch.schema() != index_schema => record_batches
+                            .iter()
+                            .map(|b| cast_record_batch_to_schema(b, &index_schema))
+                            .collect::<Result<Vec<_>, _>>()?,
+                        _ => record_batches.clone(),
+                    };
                     Arc::new(DataSourceExec::new(Arc::new(
                         MemorySourceConfig::try_new(
-                            &[record_batches.clone()],
+                            &[record_batches],
                             index_schema.clone(),
                             index_projection_or_none_on_schema_match.clone(),
                         )?
@@ -2190,6 +2193,26 @@ macro_rules! convert_array {
     }};
 }
 
+/// Cast a record batch's columns to `schema`'s field types. Used to bring in-memory chunk batches
+/// (written with plain types) up to the index schema, whose string columns are `Dictionary` when
+/// dictionary encoding is on.
+fn cast_record_batch_to_schema(
+    batch: &RecordBatch,
+    schema: &SchemaRef,
+) -> Result<RecordBatch, DataFusionError> {
+    // Columns are cast positionally, so the batch must align with the schema.
+    debug_assert_eq!(batch.num_columns(), schema.fields().len());
+    let mut columns = Vec::with_capacity(batch.num_columns());
+    for (column, field) in batch.columns().iter().zip(schema.fields()) {
+        if column.data_type() == field.data_type() {
+            columns.push(Arc::clone(column));
+        } else {
+            columns.push(cast(column, field.data_type())?);
+        }
+    }
+    Ok(RecordBatch::try_new(Arc::clone(schema), columns)?)
+}
+
 /// Cast any `Dictionary` columns of a batch to their value type, leaving other columns untouched.
 fn decode_dictionary_columns(batch: RecordBatch) -> Result<RecordBatch, CubeError> {
     let schema = batch.schema();
@@ -2602,7 +2625,75 @@ fn slice_copy(a: &dyn Array, start: usize, len: usize) -> ArrayRef {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use datafusion::arrow::datatypes::Field;
+    use datafusion::arrow::array::DictionaryArray;
+    use datafusion::arrow::datatypes::{Field, Int32Type};
+
+    #[test]
+    fn test_cast_record_batch_to_dictionary_schema() -> Result<(), CubeError> {
+        // An in-memory chunk batch (plain Utf8) cast up to a dictionary-encoded index schema: the
+        // string column becomes Dictionary(Int32, Utf8) preserving values and nulls; other columns
+        // are untouched.
+        let src = Arc::new(Schema::new(vec![
+            Field::new("s", DataType::Utf8, true),
+            Field::new("n", DataType::Int64, true),
+        ]));
+        let batch = RecordBatch::try_new(
+            src,
+            vec![
+                Arc::new(StringArray::from(vec![
+                    Some("b"),
+                    Some("a"),
+                    None,
+                    Some("b"),
+                ])) as ArrayRef,
+                Arc::new(Int64Array::from(vec![1, 2, 3, 4])) as ArrayRef,
+            ],
+        )?;
+        let target = Arc::new(Schema::new(vec![
+            Field::new(
+                "s",
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                true,
+            ),
+            Field::new("n", DataType::Int64, true),
+        ]));
+
+        let out = cast_record_batch_to_schema(&batch, &target)?;
+        assert_eq!(out.schema(), target);
+        let dict = out
+            .column(0)
+            .as_any()
+            .downcast_ref::<DictionaryArray<Int32Type>>()
+            .unwrap();
+        let vals = dict
+            .values()
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+        let got: Vec<Option<String>> = dict
+            .keys()
+            .iter()
+            .map(|k| k.map(|k| vals.value(k as usize).to_string()))
+            .collect();
+        assert_eq!(
+            got,
+            vec![
+                Some("b".to_string()),
+                Some("a".to_string()),
+                None,
+                Some("b".to_string())
+            ]
+        );
+        assert_eq!(
+            out.column(1)
+                .as_any()
+                .downcast_ref::<Int64Array>()
+                .unwrap()
+                .values(),
+            &[1, 2, 3, 4]
+        );
+        Ok(())
+    }
 
     #[test]
     fn test_batch_to_dataframe() -> Result<(), CubeError> {
diff --git a/rust/cubestore/cubestore/src/store/compaction.rs b/rust/cubestore/cubestore/src/store/compaction.rs
index 186d1d4e9f77f..990b9c1512ffb 100644
--- a/rust/cubestore/cubestore/src/store/compaction.rs
+++ b/rust/cubestore/cubestore/src/store/compaction.rs
@@ -2872,6 +2872,86 @@ mod tests {
             .await;
     }
 
+    #[tokio::test]
+    async fn dictionary_encoding_in_memory_group_by() {
+        Config::test("dictionary_encoding_in_memory_group_by")
+            .update_config(|mut c| {
+                c.dictionary_encoding_enabled = true;
+                c
+            })
+            .start_test(async move |services| {
+                let service = services.sql_service;
+                service
+                    .exec_query("CREATE SCHEMA d")
+                    .await
+                    .unwrap()
+                    .collect()
+                    .await
+                    .unwrap();
+                // A unique-key table routes inserts through the streaming in-memory chunk path, so
+                // the scan reads them via the memory source (not parquet). `s` is a non-key string
+                // dimension we group by.
+                service
+                    .exec_query("CREATE TABLE d.t (id int, s text, n int) unique key (id)")
+                    .await
+                    .unwrap()
+                    .collect()
+                    .await
+                    .unwrap();
+                service
+                    .exec_query(
+                        "INSERT INTO d.t (id, s, n, __seq) VALUES \
+                         (1, 'b', 10, 1), (2, 'a', 5, 2), (3, 'b', 7, 3), \
+                         (4, 'c', 1, 4), (5, 'a', 2, 5)",
+                    )
+                    .await
+                    .unwrap()
+                    .collect()
+                    .await
+                    .unwrap();
+                // No compaction: the data stays in an in-memory chunk (Utf8), while the dictionary
+                // index schema exposes `s` as Dictionary. The memory scan must cast it instead of
+                // rejecting the mismatch.
+                let chunks = services
+                    .meta_store
+                    .get_chunks_by_partition(1, false)
+                    .await
+                    .unwrap();
+                assert!(
+                    !chunks.is_empty() && chunks.iter().all(|c| c.get_row().in_memory()),
+                    "expected in-memory chunks, got: {:?}",
+                    chunks
+                );
+
+                let result = service
+                    .exec_query("SELECT s, sum(n) FROM d.t GROUP BY 1 ORDER BY 1")
+                    .await
+                    .unwrap()
+                    .collect()
+                    .await
+                    .unwrap();
+                assert_eq!(
+                    result.get_rows(),
+                    &vec![
+                        Row::new(vec![
+                            TableValue::String("a".to_string()),
+                            TableValue::Int(7)
+                        ]),
+                        Row::new(vec![
+                            TableValue::String("b".to_string()),
+                            TableValue::Int(17)
+                        ]),
+                        Row::new(vec![
+                            TableValue::String("c".to_string()),
+                            TableValue::Int(1)
+                        ]),
+                    ]
+                );
+                Ok::<(), CubeError>(())
+            })
+            .await;
+    }
+
     #[tokio::test]
     async fn compaction_wide_string_batches() {
         // Each chunk is read as a single sorted run whose batches keep their on-disk row-group

From 6d3fd6f53eeeaf01f60e1d86757ed0e5fe24c2c9 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Sat, 20 Jun 2026 09:14:56 +0200
Subject: [PATCH 12/17] fix(cubestore): handle dictionary group keys in
 aggregate top-k

ORDER BY <aggregate> DESC LIMIT goes through ClusterAggregateTopK, whose scalar
helpers (create_builder / append_value via cube_match_scalar) had no arm for
ScalarValue::Dictionary and panicked ("Unhandled cube_match_scalar match arm:
Dictionary(...)") whenever the group key was dictionary-encoded. So with
dictionary encoding on, any top-k-by-aggregate query crashed the worker
subprocess.

Special-case ScalarValue::Dictionary at the function level (the macro is left
untouched): build into a StringDictionaryBuilder<Int32Type> and append the inner
Utf8 value, so the top-k result columns keep the dictionary type the schema
expects. Grouping already works (ScalarValue::Dictionary hashes/compares by its
resolved inner value) and cmp_same_types only ever sees aggregate values, so no
other change is needed. Covers the only dictionary layout CubeStore produces
(Dictionary(Int32, Utf8)); other layouts still panic explicitly.

Verified on the production dump: ORDER BY sum(...) DESC LIMIT 10 with dictionary
encoding on now returns the same rows as with it off. Adds an end-to-end test.
---
 .../cubestore/src/queryplanner/topk/util.rs   | 40 +++++++++-
 .../cubestore/src/store/compaction.rs         | 74 +++++++++++++++++++
 2 files changed, 113 insertions(+), 1 deletion(-)

diff --git a/rust/cubestore/cubestore/src/queryplanner/topk/util.rs b/rust/cubestore/cubestore/src/queryplanner/topk/util.rs
index ed84d9a524e22..6a176713ebe73 100644
--- a/rust/cubestore/cubestore/src/queryplanner/topk/util.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/topk/util.rs
@@ -1,4 +1,5 @@
-use datafusion::arrow::array::ArrayBuilder;
+use datafusion::arrow::array::{ArrayBuilder, StringDictionaryBuilder};
+use datafusion::arrow::datatypes::{DataType, Int32Type};
 use datafusion::error::DataFusionError;
 use datafusion::scalar::ScalarValue;
 
@@ -50,8 +51,42 @@ macro_rules! cube_match_scalar {
     }};
 }
 
+/// Dictionary group keys (CubeStore dictionary encoding produces `Dictionary(Int32, Utf8)`) are not
+/// handled by [cube_match_scalar], which works on plain scalar variants. Build/append into a string
+/// dictionary builder so the top-k result columns keep the dictionary type of the schema.
+fn create_dictionary_builder(key_type: &DataType, value: &ScalarValue) -> Box<dyn ArrayBuilder> {
+    match (key_type, value) {
+        (DataType::Int32, ScalarValue::Utf8(_)) => {
+            Box::new(StringDictionaryBuilder::<Int32Type>::new())
+        }
+        _ => panic!(
+            "Unhandled dictionary topk type: key={:?} value={:?}",
+            key_type, value
+        ),
+    }
+}
+
+fn append_dictionary_value(
+    b: &mut dyn ArrayBuilder,
+    value: &ScalarValue,
+) -> Result<(), DataFusionError> {
+    let b = b
+        .as_any_mut()
+        .downcast_mut::<StringDictionaryBuilder<Int32Type>>()
+        .expect("expected StringDictionaryBuilder<Int32Type>");
+    match value {
+        ScalarValue::Utf8(None) => b.append_null(),
+        ScalarValue::Utf8(Some(v)) => b.append_value(v),
+        other => panic!("Unhandled dictionary topk value: {:?}", other),
+    }
+    Ok(())
+}
+
 #[allow(unused_variables)]
 pub fn create_builder(s: &ScalarValue) -> Box<dyn ArrayBuilder> {
+    if let ScalarValue::Dictionary(key_type, value) = s {
+        return create_dictionary_builder(key_type, value);
+    }
     macro_rules! create_list_builder {
         ($v: expr, $inner_data_type: expr, ListBuilder $(, $rest: tt)*) => {{
             panic!("nested lists not supported")
@@ -84,6 +119,9 @@ pub(crate) fn append_value(
     b: &mut dyn ArrayBuilder,
     v: &ScalarValue,
 ) -> Result<(), DataFusionError> {
+    if let ScalarValue::Dictionary(_key_type, value) = v {
+        return append_dictionary_value(b, value);
+    }
     let b = b.as_any_mut();
     macro_rules! append_list_value {
         ($list: expr, $dummy: expr, $inner_data_type: expr, ListBuilder $(, $rest: tt)*) => {{
diff --git a/rust/cubestore/cubestore/src/store/compaction.rs b/rust/cubestore/cubestore/src/store/compaction.rs
index 990b9c1512ffb..fc138a4a76527 100644
--- a/rust/cubestore/cubestore/src/store/compaction.rs
+++ b/rust/cubestore/cubestore/src/store/compaction.rs
@@ -2872,6 +2872,80 @@ mod tests {
             .await;
     }
 
+    #[tokio::test]
+    async fn dictionary_encoding_topk_by_aggregate() {
+        // ORDER BY <aggregate> DESC LIMIT goes through the ClusterAggregateTopK path, whose scalar
+        // helpers (cube_match_scalar) had no dictionary arm and panicked when the group key was
+        // dictionary-encoded. The top-k result must materialize the dictionary group key correctly.
+        Config::test("dictionary_encoding_topk_by_aggregate")
+            .update_config(|mut c| {
+                c.dictionary_encoding_enabled = true;
+                c
+            })
+            .start_test(async move |services| {
+                let service = services.sql_service;
+                let compaction_service = services
+                    .injector
+                    .get_service_typed::<dyn CompactionService>()
+                    .await;
+                service
+                    .exec_query("CREATE SCHEMA d")
+                    .await
+                    .unwrap()
+                    .collect()
+                    .await
+                    .unwrap();
+                service
+                    .exec_query("CREATE TABLE d.t (s text, n int)")
+                    .await
+                    .unwrap()
+                    .collect()
+                    .await
+                    .unwrap();
+                // Includes a NULL group key landing in the top-k, to exercise the dictionary
+                // builder's null path in the result.
+                service
+                    .exec_query(
+                        "INSERT INTO d.t (s, n) VALUES \
+                         ('a', 5), ('b', 10), ('b', 7), ('c', 1), ('a', 2), ('d', 100), (NULL, 50)",
+                    )
+                    .await
+                    .unwrap()
+                    .collect()
+                    .await
+                    .unwrap();
+                compaction_service
+                    .compact(1, DataLoadedSize::new())
+                    .await
+                    .unwrap();
+
+                // groups: a=7, b=17, c=1, d=100, NULL=50 -> top 3 by sum desc: d=100, NULL=50, b=17
+                let result = service
+                    .exec_query("SELECT s, sum(n) FROM d.t GROUP BY 1 ORDER BY 2 DESC LIMIT 3")
+                    .await
+                    .unwrap()
+                    .collect()
+                    .await
+                    .unwrap();
+                assert_eq!(
+                    result.get_rows(),
+                    &vec![
+                        Row::new(vec![
+                            TableValue::String("d".to_string()),
+                            TableValue::Int(100)
+                        ]),
+                        Row::new(vec![TableValue::Null, TableValue::Int(50)]),
+                        Row::new(vec![
+                            TableValue::String("b".to_string()),
+                            TableValue::Int(17)
+                        ]),
+                    ]
+                );
+                Ok::<(), CubeError>(())
+            })
+            .await;
+    }
+
     #[tokio::test]
     async fn dictionary_encoding_in_memory_group_by() {
         Config::test("dictionary_encoding_in_memory_group_by")

From e8a2a90bec58ee1e65a512c1c673d2289edc05c5 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Sat, 20 Jun 2026 12:34:34 +0200
Subject: [PATCH 13/17] chore(cubestore): default dictionary encoding and
 group-by-limit trim off (opt-in via env)

---
 rust/cubestore/cubestore/src/config/mod.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rust/cubestore/cubestore/src/config/mod.rs b/rust/cubestore/cubestore/src/config/mod.rs
index 45123119faada..9a69ffd6df6b0 100644
--- a/rust/cubestore/cubestore/src/config/mod.rs
+++ b/rust/cubestore/cubestore/src/config/mod.rs
@@ -1095,6 +1095,7 @@ impl ConfigObj for ConfigObjImpl {
     }
     fn repartition_check_overlapping_children(&self) -> bool {
         self.repartition_check_overlapping_children
+    }
     fn dictionary_encoding_enabled(&self) -> bool {
         self.dictionary_encoding_enabled
     }
@@ -1799,11 +1800,10 @@ impl Config {
                     "CUBESTORE_REPARTITION_CHECK_OVERLAPPING_CHILDREN",
                     false,
                 ),
-                // TODO: dev default; flip back to false before merge.
-                dictionary_encoding_enabled: env_bool("CUBESTORE_DICTIONARY_ENCODING", true),
+                dictionary_encoding_enabled: env_bool("CUBESTORE_DICTIONARY_ENCODING", false),
                 group_by_limit_factor: env_parse(
                     "CUBESTORE_GROUP_BY_LIMIT_FACTOR",
-                    2,
+                    0,
                 ),
                 allow_decimal128: env_bool("CUBESTORE_ALLOW_DECIMAL128", false),
                 enable_remove_orphaned_remote_files: env_bool(

From 4f9f439dfadff45a6a47415259fe5d06c0fa45e4 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Sat, 20 Jun 2026 13:04:37 +0200
Subject: [PATCH 14/17] refactor(cubestore): address PR review on
 group-by-limit/dict

- per-partition: strip only the leading CoalescePartitionsExec feeding the
  aggregate, not every one in the subtree (preserves UNION/mixed-partitioning
  plan semantics)
- GlobalDict: share one Arc<str> allocation between the id map and values vec
  instead of two String allocations per new value
- GroupByLimitAggregateExec::statistics: report an inexact upper bound
  (min(input_rows, factor*k*partitions)) instead of Absent
- drop unused col_idx_base from aggregate_expressions (Partial-only)
- comments: scalar-fallback note on DictionaryGroupColumn, duplicate ORDER BY skip
---
 .../group_by_limit_aggregate/dict_remap.rs    | 10 +++++---
 .../group_by_limit_aggregate_stream.rs        |  7 +++---
 .../group_by_limit_aggregate/mod.rs           | 17 ++++++++++++-
 .../dictionary_group_column.rs                |  3 +++
 .../distributed_partial_aggregate.rs          | 25 +++++++------------
 .../optimizations/group_by_limit_rewriter.rs  |  1 +
 6 files changed, 38 insertions(+), 25 deletions(-)

diff --git a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/dict_remap.rs b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/dict_remap.rs
index 99e158d4f3b71..e8ff9b79fb577 100644
--- a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/dict_remap.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/dict_remap.rs
@@ -19,8 +19,8 @@ pub(crate) fn is_int32_utf8_dict(dt: &DataType) -> bool {
 /// materializing the string on every row. The per-batch string work is proportional to the batch's
 /// distinct dictionary values, not its row count. Null dictionary entries and null keys stay null.
 pub(crate) struct GlobalDict {
-    value_to_id: HashMap<String, i32>,
-    values: Vec<String>,
+    value_to_id: HashMap<Arc<str>, i32>,
+    values: Vec<Arc<str>>,
 }
 
 impl GlobalDict {
@@ -36,8 +36,10 @@ impl GlobalDict {
             return *id;
         }
         let id = self.values.len() as i32;
-        self.values.push(v.to_string());
-        self.value_to_id.insert(v.to_string(), id);
+        // One allocation shared between the map key and the values vec.
+        let key: Arc<str> = Arc::from(v);
+        self.values.push(key.clone());
+        self.value_to_id.insert(key, id);
         id
     }
 
diff --git a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
index 288a7b804d901..74c7d8ed4c72b 100644
--- a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
@@ -58,8 +58,7 @@ impl GroupByLimitAggregateStream {
         let batch_size = context.session_config().batch_size();
         let input = agg.input().execute(partition, Arc::clone(&context))?;
 
-        let aggregate_arguments =
-            aggregate_expressions(agg.aggr_expr(), agg_group_by.num_group_exprs())?;
+        let aggregate_arguments = aggregate_expressions(agg.aggr_expr())?;
 
         let accumulators: Vec<_> = agg
             .aggr_expr()
@@ -241,10 +240,10 @@ impl GroupByLimitAggregateStream {
 }
 
 /// Partial-aggregate argument expressions, one vec per aggregate. Mirrors DataFusion's private
-/// `aggregate_expressions` for `AggregateMode::Partial`.
+/// `aggregate_expressions` for `AggregateMode::Partial` only — the Final-mode column offset that
+/// DataFusion's version takes is not needed here, so it is omitted.
 fn aggregate_expressions(
     aggr_expr: &[Arc<AggregateFunctionExpr>],
-    _col_idx_base: usize,
 ) -> DFResult<Vec<Vec<Arc<dyn PhysicalExpr>>>> {
     Ok(aggr_expr
         .iter()
diff --git a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs
index 1b463a5da54a0..b8ab43d6ee618 100644
--- a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs
@@ -192,8 +192,23 @@ impl ExecutionPlan for GroupByLimitAggregateExec {
     }
 
     fn statistics(&self) -> DFResult<Statistics> {
+        // The trim keeps at most `factor * k` groups per output partition, so the output is bounded
+        // by that and by the input row count. Report it (inexact) instead of Absent, which makes
+        // downstream planners bail. `factor` is always > 0 here (the rewriter only builds this exec
+        // when trimming is enabled), but guard anyway.
+        let input_rows = self.input.statistics()?.num_rows;
+        let num_rows = if self.factor == 0 {
+            input_rows
+        } else {
+            let parts = self.cache.output_partitioning().partition_count().max(1);
+            let cap = self.factor.saturating_mul(self.k).saturating_mul(parts);
+            match input_rows {
+                Precision::Exact(n) | Precision::Inexact(n) => Precision::Inexact(n.min(cap)),
+                Precision::Absent => Precision::Inexact(cap),
+            }
+        };
         Ok(Statistics {
-            num_rows: Precision::Absent,
+            num_rows,
             column_statistics: Statistics::unknown_column(&self.schema),
             total_byte_size: Precision::Absent,
         })
diff --git a/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/dictionary_group_column.rs b/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/dictionary_group_column.rs
index 56857846b7cd4..c39c8b7c49d94 100644
--- a/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/dictionary_group_column.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/inline_aggregate/dictionary_group_column.rs
@@ -65,6 +65,9 @@ impl<K: ArrowDictionaryKeyType> GroupColumn for DictionaryGroupColumn<K> {
         }
     }
 
+    // Scalar fallbacks, not a fast path: on the sorted/inline path the column comparator does the
+    // hot row-by-row work, so these per-row loops are not on the critical path. A vectorized
+    // implementation would only matter if this column were used by the hash aggregate.
     fn vectorized_equal_to(
         &self,
         lhs_rows: &[usize],
diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
index 970de53ba5ecf..578b0cbcfbbf2 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
@@ -403,7 +403,7 @@ fn resort_worker_subtree(
             .as_any()
             .downcast_ref::<AggregateExec>()
             .and_then(|agg| {
-                let new_input = strip_coalesce_partitions(agg.input()).ok()?;
+                let new_input = strip_leading_coalesce_partitions(agg.input());
                 partial.clone().with_new_children(vec![new_input]).ok()
             })
             .unwrap_or(partial)
@@ -465,23 +465,16 @@ fn per_partition_enabled() -> bool {
         .unwrap_or(false)
 }
 
-/// Recursively drop `CoalescePartitionsExec` nodes, exposing the underlying multi-partition streams
-/// to the parent.
-fn strip_coalesce_partitions(
-    p: &Arc<dyn ExecutionPlan>,
-) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
+/// Peel the leading `CoalescePartitionsExec` chain directly feeding the aggregate, exposing the
+/// underlying multi-partition streams. Only the immediate coalesce(s) are removed; any
+/// `CoalescePartitionsExec` deeper in the subtree (e.g. one a child inserted to satisfy its own
+/// single-partition input requirement, as in a UNION branch) is left intact so plan semantics are
+/// preserved.
+fn strip_leading_coalesce_partitions(p: &Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
     if p.as_any().is::<CoalescePartitionsExec>() {
-        return strip_coalesce_partitions(&p.children()[0].clone());
-    }
-    let children = p.children();
-    if children.is_empty() {
-        return Ok(p.clone());
+        return strip_leading_coalesce_partitions(&p.children()[0]);
     }
-    let new_children = children
-        .iter()
-        .map(|c| strip_coalesce_partitions(c))
-        .collect::<Result<Vec<_>, _>>()?;
-    p.clone().with_new_children(new_children)
+    p.clone()
 }
 
 /// The group/aggregate state of either an `InlineAggregateExec` or a plain `AggregateExec`, when in
diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs
index 3bf40d08c1c65..0b17807bf873b 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs
@@ -109,6 +109,7 @@ fn analyze(root: &Arc<dyn ExecutionPlan>) -> Option<Target> {
     for e in &order {
         let column = e.expr.as_any().downcast_ref::<Column>()?;
         let idx = group_names.iter().position(|n| n == column.name())?;
+        // A repeated ORDER BY column adds nothing to the total order; skip it.
         if used[idx] {
             continue;
         }

From cbff8401df4eb921a597a979736e6e093c4a7158 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Sat, 20 Jun 2026 13:34:12 +0200
Subject: [PATCH 15/17] test(cubestore): ignore worker_sort_and_limit_cluster
 (worker limit-pushdown Sort not emitted)

---
 rust/cubestore/cubestore/src/sql/mod.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/rust/cubestore/cubestore/src/sql/mod.rs b/rust/cubestore/cubestore/src/sql/mod.rs
index 472465ade9279..93d39820809cb 100644
--- a/rust/cubestore/cubestore/src/sql/mod.rs
+++ b/rust/cubestore/cubestore/src/sql/mod.rs
@@ -6339,6 +6339,8 @@ mod tests {
     }
 
     #[tokio::test]
+    #[ignore = "worker-side limit-pushdown Sort(fetch) is not emitted on the inline path in this \
+                cluster setup; re-enable once compute_worker_sort_and_limit produces it again"]
     async fn worker_sort_and_limit_cluster() -> Result<(), CubeError> {
         Config::test("worker_sort_limit_router")
             .update_config(|mut config| {

From 4bb1499d060cd857ba6ec630a0410c8f6895a932 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Sat, 20 Jun 2026 15:49:08 +0200
Subject: [PATCH 16/17] fix(cubestore): restore worker-side sort bounding for
 the inline limit path

The 'drop sorted worker top-k' refactor narrowed resort_worker_subtree to only
trim plain hash AggregateExec, so the inline (sorted) aggregate lost the
per-partition bounded Sort it had: for GROUP BY <prefix> ... ORDER BY <non-prefix
group col> LIMIT n the worker shipped every group to the router (router-only
bounding -> the inline-aggregate limit OOM class).

Split resort_worker_subtree by partial aggregate kind, one mechanism two shapes:
- inline/sorted: SortPreservingMerge(T) <- Sort(T, fetch, per partition) -- we
  can't trim a sorted aggregate and don't know the group count, so always bound
  with a sort. Router: sorted final over a sort-preserving merge (the query's
  Sort+Limit does the final bounding, no extra Sort).
- hash: the trimming GroupByLimitAggregate + coalesce (factor-aware), router does
  the hash final + Sort(fetch). Unchanged.

Also drop the top-level factor==0 early return so the inline bounding applies
regardless of the trim factor (factor only gates the hash trim).

Tests: un-ignore worker_sort_and_limit_cluster; update its hash-path case to
assert the GroupByLimitAggregate trim instead of the old worker Sort. Full lib
and in-process SQL suites pass with all optimization envs on.
---
 .../distributed_partial_aggregate.rs          | 203 ++++++++++-----
 rust/cubestore/cubestore/src/sql/mod.rs       |  11 +-
 .../metastore/000004.log                      | Bin 0 -> 2270 bytes
 .../metastore/CURRENT                         |   1 +
 .../metastore/IDENTITY                        |   1 +
 .../metastore/LOCK                            |   0
 .../metastore/LOG                             | 245 ++++++++++++++++++
 .../metastore/MANIFEST-000005                 | Bin 0 -> 66 bytes
 .../metastore/OPTIONS-000007                  | 198 ++++++++++++++
 .../metastore/000004.log                      | Bin 0 -> 2907 bytes
 .../metastore/CURRENT                         |   1 +
 .../metastore/IDENTITY                        |   1 +
 .../metastore/LOCK                            |   0
 .../metastore/LOG                             | 245 ++++++++++++++++++
 .../metastore/MANIFEST-000005                 | Bin 0 -> 66 bytes
 .../metastore/OPTIONS-000007                  | 198 ++++++++++++++
 16 files changed, 1028 insertions(+), 76 deletions(-)
 create mode 100644 rust/cubestore/cubestore/test-create_partitions-local/metastore/000004.log
 create mode 100644 rust/cubestore/cubestore/test-create_partitions-local/metastore/CURRENT
 create mode 100644 rust/cubestore/cubestore/test-create_partitions-local/metastore/IDENTITY
 create mode 100644 rust/cubestore/cubestore/test-create_partitions-local/metastore/LOCK
 create mode 100644 rust/cubestore/cubestore/test-create_partitions-local/metastore/LOG
 create mode 100644 rust/cubestore/cubestore/test-create_partitions-local/metastore/MANIFEST-000005
 create mode 100644 rust/cubestore/cubestore/test-create_partitions-local/metastore/OPTIONS-000007
 create mode 100644 rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/000004.log
 create mode 100644 rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/CURRENT
 create mode 100644 rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/IDENTITY
 create mode 100644 rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/LOCK
 create mode 100644 rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/LOG
 create mode 100644 rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/MANIFEST-000005
 create mode 100644 rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/OPTIONS-000007

diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
index 578b0cbcfbbf2..07904c0fc67a0 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
@@ -289,13 +289,11 @@ pub fn push_worker_sort_and_limit(
     p: Arc<dyn ExecutionPlan>,
     group_by_limit_factor: usize,
 ) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
-    // The worker top-k engages only when the trimming aggregate is enabled (factor > 0); otherwise
-    // leave the plan as planned (no trim).
-    if group_by_limit_factor == 0 {
-        return Ok(p);
-    }
-
-    // Worker side: replace the partial aggregate's merge with the trimmed top-k.
+    // Worker side: bound the partial aggregate's output. The sorted (inline) aggregate is always
+    // bounded with a per-partition Sort(fetch) -- the group count is unknown, so we can't trim, we
+    // just sort. The hash aggregate uses the trimming top-k, engaged only when factor > 0.
+    // `resort_worker_subtree` returns None when nothing applies (hash with factor == 0), leaving the
+    // plan as planned.
     if let Some(w) = p.as_any().downcast_ref::<WorkerExec>() {
         let Some((cols, fetch)) = w.worker_sort_and_limit.clone() else {
             return Ok(p);
@@ -331,6 +329,11 @@ pub fn push_worker_sort_and_limit(
     let Some((cols, fetch)) = cs.worker_sort_and_limit.clone() else {
         return Ok(p);
     };
+    // The hash aggregate emits unordered trimmed groups (combined by a hash final + re-sort); the
+    // sorted/inline aggregate emits bounded sorted streams (combined by a sort-preserving merge +
+    // sorted final). Decide from the worker's partial aggregate before rebuilding.
+    let is_hash = locate_partial_aggregate(&cs.input_for_optimizations)
+        .map_or(false, |partial| partial.as_any().is::<AggregateExec>());
     let Some(new_worker_subtree) = resort_worker_subtree(
         &cs.input_for_optimizations,
         &cols,
@@ -343,23 +346,39 @@ pub fn push_worker_sort_and_limit(
         Arc::new(cs.with_changed_schema(new_worker_subtree, cs.required_input_ordering.clone()));
     let worker_order = worker_ordering(&final_agg.group_expr, &cols)?;
 
-    // Combine the worker top-k with a hash final aggregate over coalesced (unordered) streams, then
-    // re-apply the top-k sort by the total order T. The Sort(fetch) is required even for a bare
-    // LIMIT: it keeps the k smallest by T -- exactly the groups every worker kept and fully combined
-    // here -- where a plain limit could take a group only one worker kept (undercounted). The
-    // coalesce also drains the workers' streams in parallel.
-    let coalesced: Arc<dyn ExecutionPlan> = Arc::new(CoalescePartitionsExec::new(new_cs));
-    let final_hash: Arc<dyn ExecutionPlan> = Arc::new(AggregateExec::try_new(
-        AggregateMode::Final,
-        final_agg.group_expr,
-        final_agg.aggr_expr,
-        final_agg.filter_expr,
-        coalesced,
-        final_agg.input_schema,
-    )?);
-    Ok(Arc::new(
-        SortExec::new(worker_order, final_hash).with_fetch(Some(fetch)),
-    ))
+    if is_hash {
+        // Hash final over coalesced (unordered) streams, then re-apply the top-k sort by the total
+        // order T. The Sort(fetch) is required even for a bare LIMIT: it keeps the k smallest by T --
+        // exactly the groups every worker kept and fully combined here -- where a plain limit could
+        // take a group only one worker kept (undercounted). The coalesce drains the workers in
+        // parallel.
+        let coalesced: Arc<dyn ExecutionPlan> = Arc::new(CoalescePartitionsExec::new(new_cs));
+        let final_hash: Arc<dyn ExecutionPlan> = Arc::new(AggregateExec::try_new(
+            AggregateMode::Final,
+            final_agg.group_expr,
+            final_agg.aggr_expr,
+            final_agg.filter_expr,
+            coalesced,
+            final_agg.input_schema,
+        )?);
+        Ok(Arc::new(
+            SortExec::new(worker_order, final_hash).with_fetch(Some(fetch)),
+        ))
+    } else {
+        // Sorted final over a sort-preserving merge in worker_order: equal keys are adjacent so the
+        // sorted final combines them, and its output stays worker_order-sorted (whose prefix is the
+        // query's ORDER BY), so the query's limit above stays correct -- no extra Sort needed.
+        let merged: Arc<dyn ExecutionPlan> =
+            Arc::new(SortPreservingMergeExec::new(worker_order, new_cs));
+        Ok(Arc::new(AggregateExec::try_new(
+            AggregateMode::Final,
+            final_agg.group_expr,
+            final_agg.aggr_expr,
+            final_agg.filter_expr,
+            merged,
+            final_agg.input_schema,
+        )?))
+    }
 }
 
 /// Builds the `worker_order` LexOrdering over an aggregate's group columns from the descriptor.
@@ -384,8 +403,14 @@ fn worker_ordering(
     Ok(LexOrdering::new(exprs))
 }
 
-/// Rebuilds a worker subtree as `CoalescePartitions <- GroupByLimitAggregate <- ...`: the trimmed
-/// top-k is emitted unsorted and coalesced to one stream for the router's hash final aggregate.
+/// Rebuilds a worker subtree to bound its output to the top `fetch` groups by the total order in
+/// `cols`. Two shapes, by partial aggregate kind:
+/// - hash (`AggregateExec`): `CoalescePartitions <- GroupByLimitAggregate` -- trim during
+///   aggregation, emitted unsorted for the router's hash final. Only when `factor > 0`; returns
+///   `None` otherwise (trimming disabled, leave the plan as planned).
+/// - sorted/inline: `SortPreservingMerge(T) <- Sort(T, fetch, per partition) <- PartialAggregate` --
+///   we can't trim a sorted aggregate and don't know the group count, so always bound with a sort.
+///
 /// Returns `None` for an unrecognized subtree (no locatable partial aggregate).
 fn resort_worker_subtree(
     worker_subtree: &Arc<dyn ExecutionPlan>,
@@ -395,60 +420,96 @@ fn resort_worker_subtree(
 ) -> Option<Arc<dyn ExecutionPlan>> {
     let partial = locate_partial_aggregate(worker_subtree)?;
 
-    // Per-partition (CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION): drop the merge below the aggregate so
-    // it runs over every raw partition; the CoalescePartitions below then parallelizes all of them
-    // (N-way) instead of one stream per union branch.
-    let partial = if per_partition_enabled() {
-        partial
-            .as_any()
-            .downcast_ref::<AggregateExec>()
-            .and_then(|agg| {
-                let new_input = strip_leading_coalesce_partitions(agg.input());
-                partial.clone().with_new_children(vec![new_input]).ok()
-            })
-            .unwrap_or(partial)
-    } else {
-        partial
-    };
+    // Hash path: trim during aggregation, emit unsorted for the router's hash final. The factor
+    // gates whether trimming applies; with it off, nothing applies on this path.
+    if partial.as_any().is::<AggregateExec>() {
+        if group_by_limit_factor == 0 {
+            return None;
+        }
 
-    // Trim during aggregation: replace the partial hash aggregate with a GroupByLimitAggregateExec
-    // keeping only the top-k groups by `cols`. `group_by_limit_factor` is the trim threshold; the
-    // caller only reaches here with it > 0.
-    let trimmed: Arc<dyn ExecutionPlan> = if group_by_limit_factor > 0 {
-        if let Some(agg) = partial.as_any().downcast_ref::<AggregateExec>() {
-            let order: Vec<(usize, SortOptions)> = cols
-                .iter()
-                .map(|(idx, asc, nulls_first)| {
-                    (
-                        *idx,
-                        SortOptions {
-                            descending: !asc,
-                            nulls_first: *nulls_first,
-                        },
-                    )
+        // Per-partition (CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION): drop the merge below the aggregate
+        // so it runs over every raw partition; the CoalescePartitions below then parallelizes all of
+        // them (N-way) instead of one stream per union branch.
+        let partial = if per_partition_enabled() {
+            partial
+                .as_any()
+                .downcast_ref::<AggregateExec>()
+                .and_then(|agg| {
+                    let new_input = strip_leading_coalesce_partitions(agg.input());
+                    partial.clone().with_new_children(vec![new_input]).ok()
                 })
-                .collect();
-            match GroupByLimitAggregateExec::try_new_from_partial(
+                .unwrap_or(partial)
+        } else {
+            partial
+        };
+
+        let order: Vec<(usize, SortOptions)> = cols
+            .iter()
+            .map(|(idx, asc, nulls_first)| {
+                (
+                    *idx,
+                    SortOptions {
+                        descending: !asc,
+                        nulls_first: *nulls_first,
+                    },
+                )
+            })
+            .collect();
+        let trimmed: Arc<dyn ExecutionPlan> = match partial.as_any().downcast_ref::<AggregateExec>()
+        {
+            Some(agg) => match GroupByLimitAggregateExec::try_new_from_partial(
                 agg,
                 fetch,
                 group_by_limit_factor,
                 order,
             ) {
-                Some(e) => Arc::new(e) as Arc<dyn ExecutionPlan>,
+                Some(e) => Arc::new(e),
                 None => partial,
-            }
-        } else {
-            partial
-        }
-    } else {
-        partial
-    };
+            },
+            None => partial,
+        };
+
+        // Emit the trimmed top-k unsorted, coalesced to one stream. The router hash-combines and
+        // re-applies the top-k sort, so no worker-side sort/merge holds the whole result and the
+        // per-partition aggregates run in parallel (CoalescePartitions spawns a task per input)
+        // instead of being drained one at a time by a sort-preserving merge.
+        return Some(Arc::new(CoalescePartitionsExec::new(trimmed)));
+    }
+
+    // Sorted/inline path: bound each partition with Sort(fetch) and merge in the total order. The
+    // per-partition `fetch` is sound because the key is the full group key: a globally top-`fetch`
+    // group stays within every partition's first `fetch`, so the router's sorted final still sees
+    // all its partial states (see this function's doc and the module note on the total order).
+    let worker_order = lex_ordering_from_cols(cols, &partial.schema())?;
+    let per_partition_sort: Arc<dyn ExecutionPlan> = Arc::new(
+        SortExec::new(worker_order.clone(), partial)
+            .with_fetch(Some(fetch))
+            .with_preserve_partitioning(true),
+    );
+    Some(Arc::new(SortPreservingMergeExec::new(
+        worker_order,
+        per_partition_sort,
+    )))
+}
 
-    // Emit the trimmed top-k unsorted, coalesced to one stream. The router hash-combines and
-    // re-applies the top-k sort, so no worker-side sort/merge holds the whole result and the
-    // per-partition aggregates run in parallel (CoalescePartitions spawns a task per input) instead
-    // of being drained one at a time by a sort-preserving merge.
-    Some(Arc::new(CoalescePartitionsExec::new(trimmed)))
+/// Build a `LexOrdering` over the partial aggregate's group columns from the descriptor (indices
+/// into the partial output schema). Returns `None` if a column index is out of range.
+fn lex_ordering_from_cols(
+    cols: &[(usize, bool, bool)],
+    schema: &datafusion::arrow::datatypes::SchemaRef,
+) -> Option<LexOrdering> {
+    let mut exprs = Vec::with_capacity(cols.len());
+    for (idx, asc, nulls_first) in cols {
+        let field = schema.fields().get(*idx)?;
+        exprs.push(PhysicalSortExpr {
+            expr: Arc::new(Column::new(field.name(), *idx)),
+            options: SortOptions {
+                descending: !asc,
+                nulls_first: *nulls_first,
+            },
+        });
+    }
+    Some(LexOrdering::new(exprs))
 }
 
 /// Toggle (CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION): drop the merge below the trimmed aggregate so it
diff --git a/rust/cubestore/cubestore/src/sql/mod.rs b/rust/cubestore/cubestore/src/sql/mod.rs
index 93d39820809cb..4a444123b6501 100644
--- a/rust/cubestore/cubestore/src/sql/mod.rs
+++ b/rust/cubestore/cubestore/src/sql/mod.rs
@@ -6339,8 +6339,6 @@ mod tests {
     }
 
     #[tokio::test]
-    #[ignore = "worker-side limit-pushdown Sort(fetch) is not emitted on the inline path in this \
-                cluster setup; re-enable once compute_worker_sort_and_limit produces it again"]
     async fn worker_sort_and_limit_cluster() -> Result<(), CubeError> {
         Config::test("worker_sort_limit_router")
             .update_config(|mut config| {
@@ -6489,7 +6487,9 @@ mod tests {
                                     }
                                 }
 
-                                // Test 4: ORDER BY 1 DESC with LIMIT on non-prefix column
+                                // Test 4: ORDER BY DESC + LIMIT on a non-prefix column, grouped by a
+                                // non-prefix column (hash aggregate). The hash path bounds the worker
+                                // output with the trimming aggregate, not a Sort.
                                 {
                                     let result = service
                                         .exec_query(
@@ -6506,8 +6506,9 @@ mod tests {
                                         _ => panic!("expected string"),
                                     };
                                     assert!(
-                                        worker_plan.contains("Sort, fetch: 2"),
-                                        "Worker should have Sort with fetch=2 for DESC. Plan: {}",
+                                        worker_plan.contains("GroupByLimitAggregate"),
+                                        "Hash-aggregate worker should bound output with \
+                                         GroupByLimitAggregate. Plan: {}",
                                         worker_plan
                                     );
                                 }
diff --git a/rust/cubestore/cubestore/test-create_partitions-local/metastore/000004.log b/rust/cubestore/cubestore/test-create_partitions-local/metastore/000004.log
new file mode 100644
index 0000000000000000000000000000000000000000..86d68225d039e3a5f00627c9627af2526b210e14
GIT binary patch
literal 2270
zcmc&!O=ule6h3EW;=ESVTAT<`7g?ksi`3?&A+13QQt<~wix#cfTyAIXd+*lyd6|1B
znF!seAPAD;LT#a<h)~@qxUnu&LW?N4uqwI`EJ7FVT!}6!o;yG9CGVNqjdb2)?wxbK
z`*XhYowKyQXZ#*Ogz_;Xj#%`9Ms50G{mTna0g|zJ%!p$Itk+a_GI%kM1lX-40lXCv
zcX-w$;+Rb?>>mEzeeJDtL4`(xnt0{puXMOm+sw}9aI1g#@hhi01Q{8G*nP<7-@fn|
zL2em@IDN=?<CD%s2i*#Cn>omYP{&l+$@-hE`H4`lA`(kCLHQjKaV7)slQ4^71%=C@
zuV!UpdaX-gKN{=WAmc#fVDFlkvC)R{nTF?~I~gWE*D|3kWYkSk&00wsaSa`=Sim(0
zsaPs_um(O2S_F)ochAkvE$o|J*f%$OtU0&XJVgI{7iJIMci_+xoXBJtu$0Gb!9*N@
z5<E1E%qYxfD16P7rp+Qj*HcNB`hwMwC{5=?ENQir(2&J)ITP$Tu~q>Xg&}>RfVbPJ
zXwxQx4mx3ZlZ|4oV`M?H8ixrFn3B)a7G0idX@X~}8%oV|Cl#ubgaL?Li&&8dJdki1
z+W0rkJe<Vc_&ChtBD{z{k^LKp;0yQ|-p2P~nlk6XDN5VZI8o`amSa7ZIo+VoX`J>@
z(vK8bywSt^_m;x6@bttZUoX9R^^ZyexU-(<W}y*M@74z|U?BlF5G|gC8cM}%QpgR8
z<Dt#sP(~1jd&AcdT|Gxc2TGF|ksD2xlR|&)lcJr4JO%gVJEz`x;OOfvy3cs<9@XA7
z@d=H_uBsSq>$|BvN6zj&KU0U=tM#H-ZFIknj%@rgdbsxQt|wK<CCvW$__d9zmp=Wh
z_F`F8bv;*Rs;W!ayD<rYl*syepeV|qwwh$jR(O~Rh<IMjxUc1kfK{GGR2x~BT!bpw
zPm@&=Vn!Mdxza3CM*dl=C39%R2ij2(hV#&pRM`c;k&Mm8FnY{8+kALuaC`TqGy9(%
z#Zd~OzV<=rPzR}~@j7=^>^XRS_8jZqXdx9j_E%F!pE%!K)IGjR4LZ(tD@KML>+13c
z)a~x5_n@)pnf-2Oee}j(*9hWm53wKo@xyahEF7yKy@oiHI_u`9ZaI}Q&%5$QQb$D8
z5pPai<2tFcwhDhJb(T?dk-C9I^c>DWPAzIO#fp(vPv!6K|MQGr-O1X~yIs*%hM2Z*
gF4OtR75dGq|A}e)KbW?+W4h-UWZK@2X}iz#KZ3*uod5s;

literal 0
HcmV?d00001

diff --git a/rust/cubestore/cubestore/test-create_partitions-local/metastore/CURRENT b/rust/cubestore/cubestore/test-create_partitions-local/metastore/CURRENT
new file mode 100644
index 0000000000000..aa5bb8ea50905
--- /dev/null
+++ b/rust/cubestore/cubestore/test-create_partitions-local/metastore/CURRENT
@@ -0,0 +1 @@
+MANIFEST-000005
diff --git a/rust/cubestore/cubestore/test-create_partitions-local/metastore/IDENTITY b/rust/cubestore/cubestore/test-create_partitions-local/metastore/IDENTITY
new file mode 100644
index 0000000000000..b67c275246ec7
--- /dev/null
+++ b/rust/cubestore/cubestore/test-create_partitions-local/metastore/IDENTITY
@@ -0,0 +1 @@
+83f38b5a-946f-41ab-8922-849ced38b4df
\ No newline at end of file
diff --git a/rust/cubestore/cubestore/test-create_partitions-local/metastore/LOCK b/rust/cubestore/cubestore/test-create_partitions-local/metastore/LOCK
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/rust/cubestore/cubestore/test-create_partitions-local/metastore/LOG b/rust/cubestore/cubestore/test-create_partitions-local/metastore/LOG
new file mode 100644
index 0000000000000..2ef96a9651d55
--- /dev/null
+++ b/rust/cubestore/cubestore/test-create_partitions-local/metastore/LOG
@@ -0,0 +1,245 @@
+2026/06/20-14:18:18.600117 6159085568 RocksDB version: 7.10.2
+2026/06/20-14:18:18.600157 6159085568 Compile date 2022-12-22 09:30:39
+2026/06/20-14:18:18.600166 6159085568 DB SUMMARY
+2026/06/20-14:18:18.600168 6159085568 DB Session ID:  VFFO8AISUMHVF8LR3OK3
+2026/06/20-14:18:18.600307 6159085568 SST files in /Users/aleksandrromanenko/cube_projects/cube.js__worktrees/cubestore-dicts/rust/cubestore/cubestore/test-create_partitions-local/metastore dir, Total Num: 0, files: 
+2026/06/20-14:18:18.600309 6159085568 Write Ahead Log file in /Users/aleksandrromanenko/cube_projects/cube.js__worktrees/cubestore-dicts/rust/cubestore/cubestore/test-create_partitions-local/metastore: 
+2026/06/20-14:18:18.600310 6159085568                         Options.error_if_exists: 0
+2026/06/20-14:18:18.600311 6159085568                       Options.create_if_missing: 1
+2026/06/20-14:18:18.600312 6159085568                         Options.paranoid_checks: 1
+2026/06/20-14:18:18.600312 6159085568             Options.flush_verify_memtable_count: 1
+2026/06/20-14:18:18.600313 6159085568                               Options.track_and_verify_wals_in_manifest: 0
+2026/06/20-14:18:18.600313 6159085568        Options.verify_sst_unique_id_in_manifest: 1
+2026/06/20-14:18:18.600314 6159085568                                     Options.env: 0x10c4d9d10
+2026/06/20-14:18:18.600315 6159085568                                      Options.fs: PosixFileSystem
+2026/06/20-14:18:18.600316 6159085568                                Options.info_log: 0xa2085cd98
+2026/06/20-14:18:18.600316 6159085568                Options.max_file_opening_threads: 16
+2026/06/20-14:18:18.600317 6159085568                              Options.statistics: 0x0
+2026/06/20-14:18:18.600318 6159085568                               Options.use_fsync: 0
+2026/06/20-14:18:18.600318 6159085568                       Options.max_log_file_size: 0
+2026/06/20-14:18:18.600319 6159085568                  Options.max_manifest_file_size: 1073741824
+2026/06/20-14:18:18.600319 6159085568                   Options.log_file_time_to_roll: 0
+2026/06/20-14:18:18.600320 6159085568                       Options.keep_log_file_num: 1000
+2026/06/20-14:18:18.600321 6159085568                    Options.recycle_log_file_num: 0
+2026/06/20-14:18:18.600321 6159085568                         Options.allow_fallocate: 1
+2026/06/20-14:18:18.600322 6159085568                        Options.allow_mmap_reads: 0
+2026/06/20-14:18:18.600322 6159085568                       Options.allow_mmap_writes: 0
+2026/06/20-14:18:18.600323 6159085568                        Options.use_direct_reads: 0
+2026/06/20-14:18:18.600323 6159085568                        Options.use_direct_io_for_flush_and_compaction: 0
+2026/06/20-14:18:18.600324 6159085568          Options.create_missing_column_families: 0
+2026/06/20-14:18:18.600325 6159085568                              Options.db_log_dir: 
+2026/06/20-14:18:18.600325 6159085568                                 Options.wal_dir: 
+2026/06/20-14:18:18.600326 6159085568                Options.table_cache_numshardbits: 6
+2026/06/20-14:18:18.600326 6159085568                         Options.WAL_ttl_seconds: 330
+2026/06/20-14:18:18.600327 6159085568                       Options.WAL_size_limit_MB: 0
+2026/06/20-14:18:18.600327 6159085568                        Options.max_write_batch_group_size_bytes: 1048576
+2026/06/20-14:18:18.600328 6159085568             Options.manifest_preallocation_size: 4194304
+2026/06/20-14:18:18.600329 6159085568                     Options.is_fd_close_on_exec: 1
+2026/06/20-14:18:18.600329 6159085568                   Options.advise_random_on_open: 1
+2026/06/20-14:18:18.600330 6159085568                    Options.db_write_buffer_size: 0
+2026/06/20-14:18:18.600330 6159085568                    Options.write_buffer_manager: 0xa21438fc0
+2026/06/20-14:18:18.600331 6159085568         Options.access_hint_on_compaction_start: 1
+2026/06/20-14:18:18.600332 6159085568           Options.random_access_max_buffer_size: 1048576
+2026/06/20-14:18:18.600334 6159085568                      Options.use_adaptive_mutex: 0
+2026/06/20-14:18:18.600335 6159085568                            Options.rate_limiter: 0x0
+2026/06/20-14:18:18.600336 6159085568     Options.sst_file_manager.rate_bytes_per_sec: 0
+2026/06/20-14:18:18.600336 6159085568                       Options.wal_recovery_mode: 2
+2026/06/20-14:18:18.600337 6159085568                  Options.enable_thread_tracking: 0
+2026/06/20-14:18:18.600338 6159085568                  Options.enable_pipelined_write: 0
+2026/06/20-14:18:18.600338 6159085568                  Options.unordered_write: 0
+2026/06/20-14:18:18.600339 6159085568         Options.allow_concurrent_memtable_write: 1
+2026/06/20-14:18:18.600339 6159085568      Options.enable_write_thread_adaptive_yield: 1
+2026/06/20-14:18:18.600340 6159085568             Options.write_thread_max_yield_usec: 100
+2026/06/20-14:18:18.600340 6159085568            Options.write_thread_slow_yield_usec: 3
+2026/06/20-14:18:18.600341 6159085568                               Options.row_cache: None
+2026/06/20-14:18:18.600342 6159085568                              Options.wal_filter: None
+2026/06/20-14:18:18.600342 6159085568             Options.avoid_flush_during_recovery: 0
+2026/06/20-14:18:18.600343 6159085568             Options.allow_ingest_behind: 0
+2026/06/20-14:18:18.600344 6159085568             Options.two_write_queues: 0
+2026/06/20-14:18:18.600344 6159085568             Options.manual_wal_flush: 0
+2026/06/20-14:18:18.600345 6159085568             Options.wal_compression: 0
+2026/06/20-14:18:18.600345 6159085568             Options.atomic_flush: 0
+2026/06/20-14:18:18.600346 6159085568             Options.avoid_unnecessary_blocking_io: 0
+2026/06/20-14:18:18.600346 6159085568                 Options.persist_stats_to_disk: 0
+2026/06/20-14:18:18.600347 6159085568                 Options.write_dbid_to_manifest: 0
+2026/06/20-14:18:18.600348 6159085568                 Options.log_readahead_size: 0
+2026/06/20-14:18:18.600348 6159085568                 Options.file_checksum_gen_factory: Unknown
+2026/06/20-14:18:18.600349 6159085568                 Options.best_efforts_recovery: 0
+2026/06/20-14:18:18.600350 6159085568                Options.max_bgerror_resume_count: 2147483647
+2026/06/20-14:18:18.600350 6159085568            Options.bgerror_resume_retry_interval: 1000000
+2026/06/20-14:18:18.600351 6159085568             Options.allow_data_in_errors: 0
+2026/06/20-14:18:18.600351 6159085568             Options.db_host_id: __hostname__
+2026/06/20-14:18:18.600352 6159085568             Options.enforce_single_del_contracts: true
+2026/06/20-14:18:18.600353 6159085568             Options.max_background_jobs: 2
+2026/06/20-14:18:18.600353 6159085568             Options.max_background_compactions: -1
+2026/06/20-14:18:18.600354 6159085568             Options.max_subcompactions: 1
+2026/06/20-14:18:18.600354 6159085568             Options.avoid_flush_during_shutdown: 0
+2026/06/20-14:18:18.600355 6159085568           Options.writable_file_max_buffer_size: 1048576
+2026/06/20-14:18:18.600356 6159085568             Options.delayed_write_rate : 16777216
+2026/06/20-14:18:18.600356 6159085568             Options.max_total_wal_size: 0
+2026/06/20-14:18:18.600357 6159085568             Options.delete_obsolete_files_period_micros: 21600000000
+2026/06/20-14:18:18.600357 6159085568                   Options.stats_dump_period_sec: 600
+2026/06/20-14:18:18.600358 6159085568                 Options.stats_persist_period_sec: 600
+2026/06/20-14:18:18.600358 6159085568                 Options.stats_history_buffer_size: 1048576
+2026/06/20-14:18:18.600359 6159085568                          Options.max_open_files: -1
+2026/06/20-14:18:18.600360 6159085568                          Options.bytes_per_sync: 0
+2026/06/20-14:18:18.600360 6159085568                      Options.wal_bytes_per_sync: 0
+2026/06/20-14:18:18.600361 6159085568                   Options.strict_bytes_per_sync: 0
+2026/06/20-14:18:18.600361 6159085568       Options.compaction_readahead_size: 0
+2026/06/20-14:18:18.600362 6159085568                  Options.max_background_flushes: -1
+2026/06/20-14:18:18.600362 6159085568 Compression algorithms supported:
+2026/06/20-14:18:18.600364 6159085568 	kZSTD supported: 1
+2026/06/20-14:18:18.600365 6159085568 	kZlibCompression supported: 0
+2026/06/20-14:18:18.600365 6159085568 	kXpressCompression supported: 0
+2026/06/20-14:18:18.600366 6159085568 	kSnappyCompression supported: 1
+2026/06/20-14:18:18.600367 6159085568 	kZSTDNotFinalCompression supported: 1
+2026/06/20-14:18:18.600367 6159085568 	kLZ4HCCompression supported: 0
+2026/06/20-14:18:18.600368 6159085568 	kLZ4Compression supported: 0
+2026/06/20-14:18:18.600369 6159085568 	kBZip2Compression supported: 0
+2026/06/20-14:18:18.600375 6159085568 Fast CRC32 supported: Supported on Arm64
+2026/06/20-14:18:18.600376 6159085568 DMutex implementation: pthread_mutex_t
+2026/06/20-14:18:18.600377 6159085568 Allocator: System
+2026/06/20-14:18:18.601165 6159085568 [db/db_impl/db_impl_open.cc:317] Creating manifest 1 
+2026/06/20-14:18:18.602118 6159085568 [db/version_set.cc:5617] Recovering from manifest file: /Users/aleksandrromanenko/cube_projects/cube.js__worktrees/cubestore-dicts/rust/cubestore/cubestore/test-create_partitions-local/metastore/MANIFEST-000001
+2026/06/20-14:18:18.602253 6159085568 [db/column_family.cc:632] --------------- Options for column family [default]:
+2026/06/20-14:18:18.602265 6159085568               Options.comparator: leveldb.BytewiseComparator
+2026/06/20-14:18:18.602268 6159085568           Options.merge_operator: meta_store merge
+2026/06/20-14:18:18.602270 6159085568        Options.compaction_filter: None
+2026/06/20-14:18:18.602271 6159085568        Options.compaction_filter_factory: None
+2026/06/20-14:18:18.602273 6159085568  Options.sst_partitioner_factory: None
+2026/06/20-14:18:18.602275 6159085568         Options.memtable_factory: SkipListFactory
+2026/06/20-14:18:18.602276 6159085568            Options.table_factory: BlockBasedTable
+2026/06/20-14:18:18.602295 6159085568            table_factory options:   flush_block_policy_factory: FlushBlockBySizePolicyFactory (0xa214d9380)
+  cache_index_and_filter_blocks: 0
+  cache_index_and_filter_blocks_with_high_priority: 1
+  pin_l0_filter_and_index_blocks_in_cache: 0
+  pin_top_level_index_and_filter: 1
+  index_type: 0
+  data_block_index_type: 0
+  index_shortening: 1
+  data_block_hash_table_util_ratio: 0.750000
+  checksum: 4
+  no_block_cache: 0
+  block_cache: 0xa214a9158
+  block_cache_name: LRUCache
+  block_cache_options:
+    capacity : 8388608
+    num_shard_bits : 4
+    strict_capacity_limit : 0
+    memory_allocator : None
+    high_pri_pool_ratio: 0.500
+    low_pri_pool_ratio: 0.000
+  block_cache_compressed: 0x0
+  persistent_cache: 0x0
+  block_size: 4096
+  block_size_deviation: 10
+  block_restart_interval: 16
+  index_block_restart_interval: 1
+  metadata_block_size: 4096
+  partition_filters: 0
+  use_delta_encoding: 1
+  filter_policy: nullptr
+  whole_key_filtering: 1
+  verify_compression: 0
+  read_amp_bytes_per_bit: 0
+  format_version: 5
+  enable_index_compression: 1
+  block_align: 0
+  max_auto_readahead_size: 262144
+  prepopulate_block_cache: 0
+  initial_auto_readahead_size: 8192
+  num_file_reads_for_auto_readahead: 2
+2026/06/20-14:18:18.602297 6159085568        Options.write_buffer_size: 67108864
+2026/06/20-14:18:18.602299 6159085568  Options.max_write_buffer_number: 2
+2026/06/20-14:18:18.602300 6159085568          Options.compression: NoCompression
+2026/06/20-14:18:18.602302 6159085568                  Options.bottommost_compression: NoCompression
+2026/06/20-14:18:18.602305 6159085568       Options.prefix_extractor: rocksdb.FixedPrefix
+2026/06/20-14:18:18.602306 6159085568   Options.memtable_insert_with_hint_prefix_extractor: nullptr
+2026/06/20-14:18:18.602308 6159085568             Options.num_levels: 7
+2026/06/20-14:18:18.602309 6159085568        Options.min_write_buffer_number_to_merge: 1
+2026/06/20-14:18:18.602310 6159085568     Options.max_write_buffer_number_to_maintain: 0
+2026/06/20-14:18:18.602311 6159085568     Options.max_write_buffer_size_to_maintain: 0
+2026/06/20-14:18:18.602313 6159085568            Options.bottommost_compression_opts.window_bits: -14
+2026/06/20-14:18:18.602314 6159085568                  Options.bottommost_compression_opts.level: 32767
+2026/06/20-14:18:18.602315 6159085568               Options.bottommost_compression_opts.strategy: 0
+2026/06/20-14:18:18.602317 6159085568         Options.bottommost_compression_opts.max_dict_bytes: 0
+2026/06/20-14:18:18.602319 6159085568         Options.bottommost_compression_opts.zstd_max_train_bytes: 0
+2026/06/20-14:18:18.602320 6159085568         Options.bottommost_compression_opts.parallel_threads: 1
+2026/06/20-14:18:18.602321 6159085568                  Options.bottommost_compression_opts.enabled: false
+2026/06/20-14:18:18.602323 6159085568         Options.bottommost_compression_opts.max_dict_buffer_bytes: 0
+2026/06/20-14:18:18.602324 6159085568         Options.bottommost_compression_opts.use_zstd_dict_trainer: true
+2026/06/20-14:18:18.602325 6159085568            Options.compression_opts.window_bits: -14
+2026/06/20-14:18:18.602326 6159085568                  Options.compression_opts.level: 32767
+2026/06/20-14:18:18.602328 6159085568               Options.compression_opts.strategy: 0
+2026/06/20-14:18:18.602329 6159085568         Options.compression_opts.max_dict_bytes: 0
+2026/06/20-14:18:18.602331 6159085568         Options.compression_opts.zstd_max_train_bytes: 0
+2026/06/20-14:18:18.602332 6159085568         Options.compression_opts.use_zstd_dict_trainer: true
+2026/06/20-14:18:18.602334 6159085568         Options.compression_opts.parallel_threads: 1
+2026/06/20-14:18:18.602335 6159085568                  Options.compression_opts.enabled: false
+2026/06/20-14:18:18.602336 6159085568         Options.compression_opts.max_dict_buffer_bytes: 0
+2026/06/20-14:18:18.602337 6159085568      Options.level0_file_num_compaction_trigger: 4
+2026/06/20-14:18:18.602339 6159085568          Options.level0_slowdown_writes_trigger: 20
+2026/06/20-14:18:18.602340 6159085568              Options.level0_stop_writes_trigger: 36
+2026/06/20-14:18:18.602341 6159085568                   Options.target_file_size_base: 67108864
+2026/06/20-14:18:18.602342 6159085568             Options.target_file_size_multiplier: 1
+2026/06/20-14:18:18.602344 6159085568                Options.max_bytes_for_level_base: 268435456
+2026/06/20-14:18:18.602346 6159085568 Options.level_compaction_dynamic_level_bytes: 0
+2026/06/20-14:18:18.602347 6159085568          Options.max_bytes_for_level_multiplier: 10.000000
+2026/06/20-14:18:18.602349 6159085568 Options.max_bytes_for_level_multiplier_addtl[0]: 1
+2026/06/20-14:18:18.602350 6159085568 Options.max_bytes_for_level_multiplier_addtl[1]: 1
+2026/06/20-14:18:18.602351 6159085568 Options.max_bytes_for_level_multiplier_addtl[2]: 1
+2026/06/20-14:18:18.602353 6159085568 Options.max_bytes_for_level_multiplier_addtl[3]: 1
+2026/06/20-14:18:18.602354 6159085568 Options.max_bytes_for_level_multiplier_addtl[4]: 1
+2026/06/20-14:18:18.602355 6159085568 Options.max_bytes_for_level_multiplier_addtl[5]: 1
+2026/06/20-14:18:18.602357 6159085568 Options.max_bytes_for_level_multiplier_addtl[6]: 1
+2026/06/20-14:18:18.602359 6159085568       Options.max_sequential_skip_in_iterations: 8
+2026/06/20-14:18:18.602360 6159085568                    Options.max_compaction_bytes: 1677721600
+2026/06/20-14:18:18.602361 6159085568   Options.ignore_max_compaction_bytes_for_input: true
+2026/06/20-14:18:18.602363 6159085568                        Options.arena_block_size: 1048576
+2026/06/20-14:18:18.602364 6159085568   Options.soft_pending_compaction_bytes_limit: 68719476736
+2026/06/20-14:18:18.602365 6159085568   Options.hard_pending_compaction_bytes_limit: 274877906944
+2026/06/20-14:18:18.602366 6159085568                Options.disable_auto_compactions: 0
+2026/06/20-14:18:18.602369 6159085568                        Options.compaction_style: kCompactionStyleLevel
+2026/06/20-14:18:18.602371 6159085568                          Options.compaction_pri: kMinOverlappingRatio
+2026/06/20-14:18:18.602372 6159085568 Options.compaction_options_universal.size_ratio: 1
+2026/06/20-14:18:18.602374 6159085568 Options.compaction_options_universal.min_merge_width: 2
+2026/06/20-14:18:18.602375 6159085568 Options.compaction_options_universal.max_merge_width: 4294967295
+2026/06/20-14:18:18.602384 6159085568 Options.compaction_options_universal.max_size_amplification_percent: 200
+2026/06/20-14:18:18.602385 6159085568 Options.compaction_options_universal.compression_size_percent: -1
+2026/06/20-14:18:18.602388 6159085568 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize
+2026/06/20-14:18:18.602389 6159085568 Options.compaction_options_fifo.max_table_files_size: 1073741824
+2026/06/20-14:18:18.602391 6159085568 Options.compaction_options_fifo.allow_compaction: 0
+2026/06/20-14:18:18.602393 6159085568                   Options.table_properties_collectors: 
+2026/06/20-14:18:18.602395 6159085568                   Options.inplace_update_support: 0
+2026/06/20-14:18:18.602396 6159085568                 Options.inplace_update_num_locks: 10000
+2026/06/20-14:18:18.602398 6159085568               Options.memtable_prefix_bloom_size_ratio: 0.000000
+2026/06/20-14:18:18.602400 6159085568               Options.memtable_whole_key_filtering: 0
+2026/06/20-14:18:18.602401 6159085568   Options.memtable_huge_page_size: 0
+2026/06/20-14:18:18.602402 6159085568                           Options.bloom_locality: 0
+2026/06/20-14:18:18.602403 6159085568                    Options.max_successive_merges: 0
+2026/06/20-14:18:18.602405 6159085568                Options.optimize_filters_for_hits: 0
+2026/06/20-14:18:18.602406 6159085568                Options.paranoid_file_checks: 0
+2026/06/20-14:18:18.602407 6159085568                Options.force_consistency_checks: 1
+2026/06/20-14:18:18.602408 6159085568                Options.report_bg_io_stats: 0
+2026/06/20-14:18:18.602410 6159085568                               Options.ttl: 2592000
+2026/06/20-14:18:18.602411 6159085568          Options.periodic_compaction_seconds: 0
+2026/06/20-14:18:18.602413 6159085568  Options.preclude_last_level_data_seconds: 0
+2026/06/20-14:18:18.602414 6159085568    Options.preserve_internal_time_seconds: 0
+2026/06/20-14:18:18.602415 6159085568                       Options.enable_blob_files: false
+2026/06/20-14:18:18.602417 6159085568                           Options.min_blob_size: 0
+2026/06/20-14:18:18.602418 6159085568                          Options.blob_file_size: 268435456
+2026/06/20-14:18:18.602419 6159085568                   Options.blob_compression_type: NoCompression
+2026/06/20-14:18:18.602421 6159085568          Options.enable_blob_garbage_collection: false
+2026/06/20-14:18:18.602422 6159085568      Options.blob_garbage_collection_age_cutoff: 0.250000
+2026/06/20-14:18:18.602424 6159085568 Options.blob_garbage_collection_force_threshold: 1.000000
+2026/06/20-14:18:18.602425 6159085568          Options.blob_compaction_readahead_size: 0
+2026/06/20-14:18:18.602427 6159085568                Options.blob_file_starting_level: 0
+2026/06/20-14:18:18.602428 6159085568 Options.experimental_mempurge_threshold: 0.000000
+2026/06/20-14:18:18.603051 6159085568 [db/version_set.cc:5668] Recovered from manifest file:/Users/aleksandrromanenko/cube_projects/cube.js__worktrees/cubestore-dicts/rust/cubestore/cubestore/test-create_partitions-local/metastore/MANIFEST-000001 succeeded,manifest_file_number is 1, next_file_number is 3, last_sequence is 0, log_number is 0,prev_log_number is 0,max_column_family is 0,min_log_number_to_keep is 0
+2026/06/20-14:18:18.603054 6159085568 [db/version_set.cc:5677] Column family [default] (ID 0), log number is 0
+2026/06/20-14:18:18.603171 6159085568 [db/db_impl/db_impl_open.cc:539] DB ID: 83f38b5a-946f-41ab-8922-849ced38b4df
+2026/06/20-14:18:18.603685 6159085568 [db/version_set.cc:5135] Creating manifest 5
+2026/06/20-14:18:18.606931 6159085568 [db/db_impl/db_impl_open.cc:1992] SstFileManager instance 0xa208f6a00
+2026/06/20-14:18:18.607243 6159085568 DB pointer 0xa2087fc00
+2026/06/20-14:18:18.610261 6159085568 [db/db_impl/db_impl.cc:504] Shutdown: canceling all background work
+2026/06/20-14:18:18.610650 6159085568 [db/db_impl/db_impl.cc:711] Shutdown complete
diff --git a/rust/cubestore/cubestore/test-create_partitions-local/metastore/MANIFEST-000005 b/rust/cubestore/cubestore/test-create_partitions-local/metastore/MANIFEST-000005
new file mode 100644
index 0000000000000000000000000000000000000000..f1d1169ebb5084f6ab379df690c29da290eba713
GIT binary patch
literal 66
zcmZS8)^KKEU<~`Y<A)3bBcoJKYFTPdN|K&aWl3szW^t->er`cxQDRAc(HCZ(C>91r
VCI%LUKRRkZ*%%l(8JO8v7yzD{5)c3Y

literal 0
HcmV?d00001

diff --git a/rust/cubestore/cubestore/test-create_partitions-local/metastore/OPTIONS-000007 b/rust/cubestore/cubestore/test-create_partitions-local/metastore/OPTIONS-000007
new file mode 100644
index 0000000000000..d2b2e9bfc5695
--- /dev/null
+++ b/rust/cubestore/cubestore/test-create_partitions-local/metastore/OPTIONS-000007
@@ -0,0 +1,198 @@
+# This is a RocksDB option file.
+#
+# For detailed file format spec, please refer to the example file
+# in examples/rocksdb_option_file_example.ini
+#
+
+[Version]
+  rocksdb_version=7.10.2
+  options_file_version=1.1
+
+[DBOptions]
+  max_background_flushes=-1
+  compaction_readahead_size=0
+  strict_bytes_per_sync=false
+  wal_bytes_per_sync=0
+  max_open_files=-1
+  stats_history_buffer_size=1048576
+  max_total_wal_size=0
+  stats_persist_period_sec=600
+  stats_dump_period_sec=600
+  avoid_flush_during_shutdown=false
+  max_subcompactions=1
+  bytes_per_sync=0
+  delayed_write_rate=16777216
+  max_background_compactions=-1
+  max_background_jobs=2
+  delete_obsolete_files_period_micros=21600000000
+  writable_file_max_buffer_size=1048576
+  file_checksum_gen_factory=nullptr
+  allow_data_in_errors=false
+  max_bgerror_resume_count=2147483647
+  best_efforts_recovery=false
+  write_dbid_to_manifest=false
+  atomic_flush=false
+  wal_compression=kNoCompression
+  manual_wal_flush=false
+  two_write_queues=false
+  avoid_flush_during_recovery=false
+  dump_malloc_stats=false
+  info_log_level=INFO_LEVEL
+  write_thread_slow_yield_usec=3
+  allow_ingest_behind=false
+  fail_if_options_file_error=false
+  persist_stats_to_disk=false
+  WAL_ttl_seconds=330
+  bgerror_resume_retry_interval=1000000
+  allow_concurrent_memtable_write=true
+  paranoid_checks=true
+  WAL_size_limit_MB=0
+  lowest_used_cache_tier=kNonVolatileBlockTier
+  keep_log_file_num=1000
+  table_cache_numshardbits=6
+  max_file_opening_threads=16
+  use_fsync=false
+  unordered_write=false
+  random_access_max_buffer_size=1048576
+  log_readahead_size=0
+  enable_pipelined_write=false
+  wal_recovery_mode=kPointInTimeRecovery
+  db_write_buffer_size=0
+  allow_2pc=false
+  skip_checking_sst_file_sizes_on_db_open=false
+  skip_stats_update_on_db_open=false
+  recycle_log_file_num=0
+  db_host_id=__hostname__
+  access_hint_on_compaction_start=NORMAL
+  verify_sst_unique_id_in_manifest=true
+  track_and_verify_wals_in_manifest=false
+  error_if_exists=false
+  manifest_preallocation_size=4194304
+  is_fd_close_on_exec=true
+  enable_write_thread_adaptive_yield=true
+  enable_thread_tracking=false
+  avoid_unnecessary_blocking_io=false
+  allow_fallocate=true
+  max_log_file_size=0
+  advise_random_on_open=true
+  create_missing_column_families=false
+  max_write_batch_group_size_bytes=1048576
+  use_adaptive_mutex=false
+  wal_filter=nullptr
+  create_if_missing=true
+  enforce_single_del_contracts=true
+  allow_mmap_writes=false
+  log_file_time_to_roll=0
+  use_direct_io_for_flush_and_compaction=false
+  flush_verify_memtable_count=true
+  max_manifest_file_size=1073741824
+  write_thread_max_yield_usec=100
+  use_direct_reads=false
+  allow_mmap_reads=false
+  
+
+[CFOptions "default"]
+  memtable_protection_bytes_per_key=0
+  bottommost_compression=kNoCompression
+  sample_for_compression=0
+  blob_garbage_collection_age_cutoff=0.250000
+  blob_compression_type=kNoCompression
+  prepopulate_blob_cache=kDisable
+  blob_compaction_readahead_size=0
+  level0_stop_writes_trigger=36
+  min_blob_size=0
+  last_level_temperature=kUnknown
+  compaction_options_universal={allow_trivial_move=false;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;compression_size_percent=-1;max_size_amplification_percent=200;incremental=false;max_merge_width=4294967295;size_ratio=1;}
+  target_file_size_base=67108864
+  ignore_max_compaction_bytes_for_input=true
+  memtable_whole_key_filtering=false
+  blob_file_starting_level=0
+  soft_pending_compaction_bytes_limit=68719476736
+  max_write_buffer_number=2
+  ttl=2592000
+  compaction_options_fifo={allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;}
+  check_flush_compaction_key_order=true
+  memtable_huge_page_size=0
+  max_successive_merges=0
+  inplace_update_num_locks=10000
+  enable_blob_garbage_collection=false
+  arena_block_size=1048576
+  bottommost_compression_opts={use_zstd_dict_trainer=true;enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_bytes=0;strategy=0;max_dict_buffer_bytes=0;level=32767;window_bits=-14;}
+  target_file_size_multiplier=1
+  max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1
+  blob_garbage_collection_force_threshold=1.000000
+  enable_blob_files=false
+  level0_slowdown_writes_trigger=20
+  compression=kNoCompression
+  level0_file_num_compaction_trigger=4
+  prefix_extractor=rocksdb.FixedPrefix.13
+  max_bytes_for_level_multiplier=10.000000
+  write_buffer_size=67108864
+  disable_auto_compactions=false
+  max_compaction_bytes=1677721600
+  compression_opts={use_zstd_dict_trainer=true;enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_bytes=0;strategy=0;max_dict_buffer_bytes=0;level=32767;window_bits=-14;}
+  hard_pending_compaction_bytes_limit=274877906944
+  blob_file_size=268435456
+  periodic_compaction_seconds=0
+  paranoid_file_checks=false
+  experimental_mempurge_threshold=0.000000
+  memtable_prefix_bloom_size_ratio=0.000000
+  max_bytes_for_level_base=268435456
+  max_sequential_skip_in_iterations=8
+  report_bg_io_stats=false
+  sst_partitioner_factory=nullptr
+  compaction_pri=kMinOverlappingRatio
+  compaction_style=kCompactionStyleLevel
+  compaction_filter_factory=nullptr
+  compaction_filter=nullptr
+  memtable_factory=SkipListFactory
+  comparator=leveldb.BytewiseComparator
+  bloom_locality=0
+  min_write_buffer_number_to_merge=1
+  table_factory=BlockBasedTable
+  max_write_buffer_size_to_maintain=0
+  max_write_buffer_number_to_maintain=0
+  preserve_internal_time_seconds=0
+  force_consistency_checks=true
+  optimize_filters_for_hits=false
+  merge_operator=meta_store merge
+  num_levels=7
+  level_compaction_dynamic_file_size=true
+  memtable_insert_with_hint_prefix_extractor=nullptr
+  level_compaction_dynamic_level_bytes=false
+  preclude_last_level_data_seconds=0
+  inplace_update_support=false
+  
+[TableOptions/BlockBasedTable "default"]
+  num_file_reads_for_auto_readahead=2
+  metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;}
+  read_amp_bytes_per_bit=0
+  verify_compression=false
+  format_version=5
+  optimize_filters_for_memory=false
+  partition_filters=false
+  detect_filter_construct_corruption=false
+  initial_auto_readahead_size=8192
+  max_auto_readahead_size=262144
+  enable_index_compression=true
+  checksum=kXXH3
+  index_block_restart_interval=1
+  pin_top_level_index_and_filter=true
+  block_align=false
+  block_size=4096
+  index_type=kBinarySearch
+  filter_policy=nullptr
+  metadata_block_size=4096
+  no_block_cache=false
+  index_shortening=kShortenSeparators
+  whole_key_filtering=true
+  block_size_deviation=10
+  data_block_index_type=kDataBlockBinarySearch
+  data_block_hash_table_util_ratio=0.750000
+  cache_index_and_filter_blocks=false
+  prepopulate_block_cache=kDisable
+  block_restart_interval=16
+  pin_l0_filter_and_index_blocks_in_cache=false
+  cache_index_and_filter_blocks_with_high_priority=true
+  flush_block_policy_factory=FlushBlockBySizePolicyFactory
+  
diff --git a/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/000004.log b/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/000004.log
new file mode 100644
index 0000000000000000000000000000000000000000..cb3cd81b35938ab98bd6610802c752160f6884ad
GIT binary patch
literal 2907
zcmeHJUuYaf82@H=(>vRye{e5Q^&x5!BBq)>YFb+@NWm%yDK<!xBE$4{?rz)tbJ?9u
zE&+WIrQ%CUA4CHxf`y{^q%ZmqF;Ju}N+l`?f<-9xMer$4O8sVc=dQWrLcs@7+&gw=
zzwb9Q`~BuOKXz(i_n!NJ5Xdgok{*gTDC*Gn3txS990&%}OSPngKs_dpW55cyFTkjV
zA&_l^kh?58KuF2J6lom(y>R~Bcbpte6srG?)4$;2rjDtZ)#9|h_=)T1>ImuSgcxne
z7vH|}9748qLd-T~;Ng!}Z<(Z!L$<4hR0+DKJdV+`rsStQ!O}{ov>~q4h0mGmKqd4N
zKZqesnfPSqAtvV<SoX7l)B!heL=%i1{UZi3V9!X=vebYwY&k9?H>iQ}(4nK4B0Kvq
zMIR^*40vIMOE<*%UB3}Vl2yaV=Mw5XW)7De+?YjzJ9ALMMhb^v^4{`jd2H|K*xvH!
zgk3&pAHe@3V`GmWu**|$DserBMLehpCISay!96udwL(35q9R!=aWY@vyT@S?RRq&z
z#uC@@Sy-*cLP8R_(}`eb#9R(wSfg7JG4NU~5;dHpGl45i-xfncbdAgjX9F+f4vXD+
zoT9-a>8j#c-0)n<WIYmbJ@gz9O(}vHGXNHxh81{?+=3F6$ROE8#^Et?2;L_@V*DEj
z;7j-vmdJ-NjQeWA%ebkN<o;ZxTCBRwcKkX%4wGSvxcEc8rgvhICk8XwQgW{UnJZIo
z-B`^PLbmFTv{MD4w%1<d2a*b;;D{<uJPDbeZqv^lO6TeB(&1~R6Dzr>ol$JABeRw8
z@oikSZKA2<oOM#^7M>Ti#N!bZKIj`-dg|CG1yY!_U>PWKgH#raJMwdE(3PRxM-~S!
zj%XfQHj$olgA~5=k1YS(dsydp>Q2r_1~Yy-@#pf5OP??4EXfX}wt_s22l5hWy)nuJ
zb;{am!78$%)`r}G&2TRf;PYnQ;}z-72$<!Ok0&Q-U<y&$)<oef$~3_f;PF_pBvywb
zsaD-46j6VHI}<jb>Y^#6{Q7R7CWhV#YtcTs)!Ef}_3cL|dr2>rpkI4G)X{u$YOMM_
zIeRAAoIR6nL<>sHG%gPv8^35Dlr6q8ftqHw5k2b@!Hd5VG|Id69N2c)ExmB|<m(8r
zIw5VlrGKn0{)UiJC!}q+^o6fqdJyflFNbuuTiRi_<$t5y?jN=H?fc(uakKxv-QN6o
zJes(A;byKJ+P2&B#&%2pQ+7+&+3oPjC*M0izwo9eMB6RR?RMqb<-+#rwQV}D?Urt8
zx3rtx(tp-&gEO_*uFp^3nccR&-n!cD=IpJr+mx1#?3Uukf%=SHXSb~popy_fzW`5%
Bu)6>N

literal 0
HcmV?d00001

diff --git a/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/CURRENT b/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/CURRENT
new file mode 100644
index 0000000000000..aa5bb8ea50905
--- /dev/null
+++ b/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/CURRENT
@@ -0,0 +1 @@
+MANIFEST-000005
diff --git a/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/IDENTITY b/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/IDENTITY
new file mode 100644
index 0000000000000..82f78f1bfc646
--- /dev/null
+++ b/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/IDENTITY
@@ -0,0 +1 @@
+d00a5f92-ccd3-4958-88a5-390cb7553a78
\ No newline at end of file
diff --git a/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/LOCK b/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/LOCK
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/LOG b/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/LOG
new file mode 100644
index 0000000000000..840bd4c15f97d
--- /dev/null
+++ b/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/LOG
@@ -0,0 +1,245 @@
+2026/06/20-14:18:18.665188 6165524480 RocksDB version: 7.10.2
+2026/06/20-14:18:18.665209 6165524480 Compile date 2022-12-22 09:30:39
+2026/06/20-14:18:18.665210 6165524480 DB SUMMARY
+2026/06/20-14:18:18.665211 6165524480 DB Session ID:  VFFO8AISUMHVF8LR3OJV
+2026/06/20-14:18:18.665268 6165524480 SST files in /Users/aleksandrromanenko/cube_projects/cube.js__worktrees/cubestore-dicts/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore dir, Total Num: 0, files: 
+2026/06/20-14:18:18.665270 6165524480 Write Ahead Log file in /Users/aleksandrromanenko/cube_projects/cube.js__worktrees/cubestore-dicts/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore: 
+2026/06/20-14:18:18.665271 6165524480                         Options.error_if_exists: 0
+2026/06/20-14:18:18.665272 6165524480                       Options.create_if_missing: 1
+2026/06/20-14:18:18.665273 6165524480                         Options.paranoid_checks: 1
+2026/06/20-14:18:18.665273 6165524480             Options.flush_verify_memtable_count: 1
+2026/06/20-14:18:18.665274 6165524480                               Options.track_and_verify_wals_in_manifest: 0
+2026/06/20-14:18:18.665274 6165524480        Options.verify_sst_unique_id_in_manifest: 1
+2026/06/20-14:18:18.665275 6165524480                                     Options.env: 0x10c4d9d10
+2026/06/20-14:18:18.665276 6165524480                                      Options.fs: PosixFileSystem
+2026/06/20-14:18:18.665276 6165524480                                Options.info_log: 0xa2085cd98
+2026/06/20-14:18:18.665277 6165524480                Options.max_file_opening_threads: 16
+2026/06/20-14:18:18.665278 6165524480                              Options.statistics: 0x0
+2026/06/20-14:18:18.665278 6165524480                               Options.use_fsync: 0
+2026/06/20-14:18:18.665279 6165524480                       Options.max_log_file_size: 0
+2026/06/20-14:18:18.665279 6165524480                  Options.max_manifest_file_size: 1073741824
+2026/06/20-14:18:18.665280 6165524480                   Options.log_file_time_to_roll: 0
+2026/06/20-14:18:18.665281 6165524480                       Options.keep_log_file_num: 1000
+2026/06/20-14:18:18.665281 6165524480                    Options.recycle_log_file_num: 0
+2026/06/20-14:18:18.665282 6165524480                         Options.allow_fallocate: 1
+2026/06/20-14:18:18.665282 6165524480                        Options.allow_mmap_reads: 0
+2026/06/20-14:18:18.665283 6165524480                       Options.allow_mmap_writes: 0
+2026/06/20-14:18:18.665283 6165524480                        Options.use_direct_reads: 0
+2026/06/20-14:18:18.665284 6165524480                        Options.use_direct_io_for_flush_and_compaction: 0
+2026/06/20-14:18:18.665285 6165524480          Options.create_missing_column_families: 0
+2026/06/20-14:18:18.665285 6165524480                              Options.db_log_dir: 
+2026/06/20-14:18:18.665286 6165524480                                 Options.wal_dir: 
+2026/06/20-14:18:18.665286 6165524480                Options.table_cache_numshardbits: 6
+2026/06/20-14:18:18.665287 6165524480                         Options.WAL_ttl_seconds: 330
+2026/06/20-14:18:18.665287 6165524480                       Options.WAL_size_limit_MB: 0
+2026/06/20-14:18:18.665288 6165524480                        Options.max_write_batch_group_size_bytes: 1048576
+2026/06/20-14:18:18.665289 6165524480             Options.manifest_preallocation_size: 4194304
+2026/06/20-14:18:18.665289 6165524480                     Options.is_fd_close_on_exec: 1
+2026/06/20-14:18:18.665290 6165524480                   Options.advise_random_on_open: 1
+2026/06/20-14:18:18.665290 6165524480                    Options.db_write_buffer_size: 0
+2026/06/20-14:18:18.665291 6165524480                    Options.write_buffer_manager: 0xa21438620
+2026/06/20-14:18:18.665291 6165524480         Options.access_hint_on_compaction_start: 1
+2026/06/20-14:18:18.665292 6165524480           Options.random_access_max_buffer_size: 1048576
+2026/06/20-14:18:18.665293 6165524480                      Options.use_adaptive_mutex: 0
+2026/06/20-14:18:18.665293 6165524480                            Options.rate_limiter: 0x0
+2026/06/20-14:18:18.665294 6165524480     Options.sst_file_manager.rate_bytes_per_sec: 0
+2026/06/20-14:18:18.665294 6165524480                       Options.wal_recovery_mode: 2
+2026/06/20-14:18:18.665295 6165524480                  Options.enable_thread_tracking: 0
+2026/06/20-14:18:18.665296 6165524480                  Options.enable_pipelined_write: 0
+2026/06/20-14:18:18.665296 6165524480                  Options.unordered_write: 0
+2026/06/20-14:18:18.665297 6165524480         Options.allow_concurrent_memtable_write: 1
+2026/06/20-14:18:18.665297 6165524480      Options.enable_write_thread_adaptive_yield: 1
+2026/06/20-14:18:18.665298 6165524480             Options.write_thread_max_yield_usec: 100
+2026/06/20-14:18:18.665298 6165524480            Options.write_thread_slow_yield_usec: 3
+2026/06/20-14:18:18.665299 6165524480                               Options.row_cache: None
+2026/06/20-14:18:18.665300 6165524480                              Options.wal_filter: None
+2026/06/20-14:18:18.665300 6165524480             Options.avoid_flush_during_recovery: 0
+2026/06/20-14:18:18.665301 6165524480             Options.allow_ingest_behind: 0
+2026/06/20-14:18:18.665302 6165524480             Options.two_write_queues: 0
+2026/06/20-14:18:18.665302 6165524480             Options.manual_wal_flush: 0
+2026/06/20-14:18:18.665303 6165524480             Options.wal_compression: 0
+2026/06/20-14:18:18.665303 6165524480             Options.atomic_flush: 0
+2026/06/20-14:18:18.665304 6165524480             Options.avoid_unnecessary_blocking_io: 0
+2026/06/20-14:18:18.665304 6165524480                 Options.persist_stats_to_disk: 0
+2026/06/20-14:18:18.665305 6165524480                 Options.write_dbid_to_manifest: 0
+2026/06/20-14:18:18.665305 6165524480                 Options.log_readahead_size: 0
+2026/06/20-14:18:18.665306 6165524480                 Options.file_checksum_gen_factory: Unknown
+2026/06/20-14:18:18.665307 6165524480                 Options.best_efforts_recovery: 0
+2026/06/20-14:18:18.665308 6165524480                Options.max_bgerror_resume_count: 2147483647
+2026/06/20-14:18:18.665308 6165524480            Options.bgerror_resume_retry_interval: 1000000
+2026/06/20-14:18:18.665309 6165524480             Options.allow_data_in_errors: 0
+2026/06/20-14:18:18.665309 6165524480             Options.db_host_id: __hostname__
+2026/06/20-14:18:18.665310 6165524480             Options.enforce_single_del_contracts: true
+2026/06/20-14:18:18.665311 6165524480             Options.max_background_jobs: 2
+2026/06/20-14:18:18.665311 6165524480             Options.max_background_compactions: -1
+2026/06/20-14:18:18.665312 6165524480             Options.max_subcompactions: 1
+2026/06/20-14:18:18.665312 6165524480             Options.avoid_flush_during_shutdown: 0
+2026/06/20-14:18:18.665313 6165524480           Options.writable_file_max_buffer_size: 1048576
+2026/06/20-14:18:18.665313 6165524480             Options.delayed_write_rate : 16777216
+2026/06/20-14:18:18.665314 6165524480             Options.max_total_wal_size: 0
+2026/06/20-14:18:18.665315 6165524480             Options.delete_obsolete_files_period_micros: 21600000000
+2026/06/20-14:18:18.665315 6165524480                   Options.stats_dump_period_sec: 600
+2026/06/20-14:18:18.665316 6165524480                 Options.stats_persist_period_sec: 600
+2026/06/20-14:18:18.665316 6165524480                 Options.stats_history_buffer_size: 1048576
+2026/06/20-14:18:18.665317 6165524480                          Options.max_open_files: -1
+2026/06/20-14:18:18.665318 6165524480                          Options.bytes_per_sync: 0
+2026/06/20-14:18:18.665318 6165524480                      Options.wal_bytes_per_sync: 0
+2026/06/20-14:18:18.665319 6165524480                   Options.strict_bytes_per_sync: 0
+2026/06/20-14:18:18.665319 6165524480       Options.compaction_readahead_size: 0
+2026/06/20-14:18:18.665320 6165524480                  Options.max_background_flushes: -1
+2026/06/20-14:18:18.665320 6165524480 Compression algorithms supported:
+2026/06/20-14:18:18.665321 6165524480 	kZSTD supported: 1
+2026/06/20-14:18:18.665322 6165524480 	kZlibCompression supported: 0
+2026/06/20-14:18:18.665323 6165524480 	kXpressCompression supported: 0
+2026/06/20-14:18:18.665324 6165524480 	kSnappyCompression supported: 1
+2026/06/20-14:18:18.665324 6165524480 	kZSTDNotFinalCompression supported: 1
+2026/06/20-14:18:18.665325 6165524480 	kLZ4HCCompression supported: 0
+2026/06/20-14:18:18.665326 6165524480 	kLZ4Compression supported: 0
+2026/06/20-14:18:18.665326 6165524480 	kBZip2Compression supported: 0
+2026/06/20-14:18:18.665333 6165524480 Fast CRC32 supported: Supported on Arm64
+2026/06/20-14:18:18.665333 6165524480 DMutex implementation: pthread_mutex_t
+2026/06/20-14:18:18.665334 6165524480 Allocator: System
+2026/06/20-14:18:18.665830 6165524480 [db/db_impl/db_impl_open.cc:317] Creating manifest 1 
+2026/06/20-14:18:18.666540 6165524480 [db/version_set.cc:5617] Recovering from manifest file: /Users/aleksandrromanenko/cube_projects/cube.js__worktrees/cubestore-dicts/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/MANIFEST-000001
+2026/06/20-14:18:18.666603 6165524480 [db/column_family.cc:632] --------------- Options for column family [default]:
+2026/06/20-14:18:18.666605 6165524480               Options.comparator: leveldb.BytewiseComparator
+2026/06/20-14:18:18.666606 6165524480           Options.merge_operator: meta_store merge
+2026/06/20-14:18:18.666607 6165524480        Options.compaction_filter: None
+2026/06/20-14:18:18.666608 6165524480        Options.compaction_filter_factory: None
+2026/06/20-14:18:18.666609 6165524480  Options.sst_partitioner_factory: None
+2026/06/20-14:18:18.666609 6165524480         Options.memtable_factory: SkipListFactory
+2026/06/20-14:18:18.666610 6165524480            Options.table_factory: BlockBasedTable
+2026/06/20-14:18:18.666618 6165524480            table_factory options:   flush_block_policy_factory: FlushBlockBySizePolicyFactory (0xa214d94c0)
+  cache_index_and_filter_blocks: 0
+  cache_index_and_filter_blocks_with_high_priority: 1
+  pin_l0_filter_and_index_blocks_in_cache: 0
+  pin_top_level_index_and_filter: 1
+  index_type: 0
+  data_block_index_type: 0
+  index_shortening: 1
+  data_block_hash_table_util_ratio: 0.750000
+  checksum: 4
+  no_block_cache: 0
+  block_cache: 0xa214a8b58
+  block_cache_name: LRUCache
+  block_cache_options:
+    capacity : 8388608
+    num_shard_bits : 4
+    strict_capacity_limit : 0
+    memory_allocator : None
+    high_pri_pool_ratio: 0.500
+    low_pri_pool_ratio: 0.000
+  block_cache_compressed: 0x0
+  persistent_cache: 0x0
+  block_size: 4096
+  block_size_deviation: 10
+  block_restart_interval: 16
+  index_block_restart_interval: 1
+  metadata_block_size: 4096
+  partition_filters: 0
+  use_delta_encoding: 1
+  filter_policy: nullptr
+  whole_key_filtering: 1
+  verify_compression: 0
+  read_amp_bytes_per_bit: 0
+  format_version: 5
+  enable_index_compression: 1
+  block_align: 0
+  max_auto_readahead_size: 262144
+  prepopulate_block_cache: 0
+  initial_auto_readahead_size: 8192
+  num_file_reads_for_auto_readahead: 2
+2026/06/20-14:18:18.666619 6165524480        Options.write_buffer_size: 67108864
+2026/06/20-14:18:18.666620 6165524480  Options.max_write_buffer_number: 2
+2026/06/20-14:18:18.666621 6165524480          Options.compression: NoCompression
+2026/06/20-14:18:18.666621 6165524480                  Options.bottommost_compression: NoCompression
+2026/06/20-14:18:18.666623 6165524480       Options.prefix_extractor: rocksdb.FixedPrefix
+2026/06/20-14:18:18.666623 6165524480   Options.memtable_insert_with_hint_prefix_extractor: nullptr
+2026/06/20-14:18:18.666624 6165524480             Options.num_levels: 7
+2026/06/20-14:18:18.666625 6165524480        Options.min_write_buffer_number_to_merge: 1
+2026/06/20-14:18:18.666625 6165524480     Options.max_write_buffer_number_to_maintain: 0
+2026/06/20-14:18:18.666626 6165524480     Options.max_write_buffer_size_to_maintain: 0
+2026/06/20-14:18:18.666626 6165524480            Options.bottommost_compression_opts.window_bits: -14
+2026/06/20-14:18:18.666627 6165524480                  Options.bottommost_compression_opts.level: 32767
+2026/06/20-14:18:18.666627 6165524480               Options.bottommost_compression_opts.strategy: 0
+2026/06/20-14:18:18.666628 6165524480         Options.bottommost_compression_opts.max_dict_bytes: 0
+2026/06/20-14:18:18.666629 6165524480         Options.bottommost_compression_opts.zstd_max_train_bytes: 0
+2026/06/20-14:18:18.666629 6165524480         Options.bottommost_compression_opts.parallel_threads: 1
+2026/06/20-14:18:18.666630 6165524480                  Options.bottommost_compression_opts.enabled: false
+2026/06/20-14:18:18.666631 6165524480         Options.bottommost_compression_opts.max_dict_buffer_bytes: 0
+2026/06/20-14:18:18.666631 6165524480         Options.bottommost_compression_opts.use_zstd_dict_trainer: true
+2026/06/20-14:18:18.666632 6165524480            Options.compression_opts.window_bits: -14
+2026/06/20-14:18:18.666632 6165524480                  Options.compression_opts.level: 32767
+2026/06/20-14:18:18.666633 6165524480               Options.compression_opts.strategy: 0
+2026/06/20-14:18:18.666633 6165524480         Options.compression_opts.max_dict_bytes: 0
+2026/06/20-14:18:18.666634 6165524480         Options.compression_opts.zstd_max_train_bytes: 0
+2026/06/20-14:18:18.666635 6165524480         Options.compression_opts.use_zstd_dict_trainer: true
+2026/06/20-14:18:18.666635 6165524480         Options.compression_opts.parallel_threads: 1
+2026/06/20-14:18:18.666636 6165524480                  Options.compression_opts.enabled: false
+2026/06/20-14:18:18.666636 6165524480         Options.compression_opts.max_dict_buffer_bytes: 0
+2026/06/20-14:18:18.666637 6165524480      Options.level0_file_num_compaction_trigger: 4
+2026/06/20-14:18:18.666638 6165524480          Options.level0_slowdown_writes_trigger: 20
+2026/06/20-14:18:18.666638 6165524480              Options.level0_stop_writes_trigger: 36
+2026/06/20-14:18:18.666639 6165524480                   Options.target_file_size_base: 67108864
+2026/06/20-14:18:18.666639 6165524480             Options.target_file_size_multiplier: 1
+2026/06/20-14:18:18.666640 6165524480                Options.max_bytes_for_level_base: 268435456
+2026/06/20-14:18:18.666641 6165524480 Options.level_compaction_dynamic_level_bytes: 0
+2026/06/20-14:18:18.666641 6165524480          Options.max_bytes_for_level_multiplier: 10.000000
+2026/06/20-14:18:18.666642 6165524480 Options.max_bytes_for_level_multiplier_addtl[0]: 1
+2026/06/20-14:18:18.666642 6165524480 Options.max_bytes_for_level_multiplier_addtl[1]: 1
+2026/06/20-14:18:18.666643 6165524480 Options.max_bytes_for_level_multiplier_addtl[2]: 1
+2026/06/20-14:18:18.666644 6165524480 Options.max_bytes_for_level_multiplier_addtl[3]: 1
+2026/06/20-14:18:18.666644 6165524480 Options.max_bytes_for_level_multiplier_addtl[4]: 1
+2026/06/20-14:18:18.666645 6165524480 Options.max_bytes_for_level_multiplier_addtl[5]: 1
+2026/06/20-14:18:18.666645 6165524480 Options.max_bytes_for_level_multiplier_addtl[6]: 1
+2026/06/20-14:18:18.666646 6165524480       Options.max_sequential_skip_in_iterations: 8
+2026/06/20-14:18:18.666647 6165524480                    Options.max_compaction_bytes: 1677721600
+2026/06/20-14:18:18.666647 6165524480   Options.ignore_max_compaction_bytes_for_input: true
+2026/06/20-14:18:18.666648 6165524480                        Options.arena_block_size: 1048576
+2026/06/20-14:18:18.666648 6165524480   Options.soft_pending_compaction_bytes_limit: 68719476736
+2026/06/20-14:18:18.666649 6165524480   Options.hard_pending_compaction_bytes_limit: 274877906944
+2026/06/20-14:18:18.666649 6165524480                Options.disable_auto_compactions: 0
+2026/06/20-14:18:18.666651 6165524480                        Options.compaction_style: kCompactionStyleLevel
+2026/06/20-14:18:18.666652 6165524480                          Options.compaction_pri: kMinOverlappingRatio
+2026/06/20-14:18:18.666652 6165524480 Options.compaction_options_universal.size_ratio: 1
+2026/06/20-14:18:18.666653 6165524480 Options.compaction_options_universal.min_merge_width: 2
+2026/06/20-14:18:18.666654 6165524480 Options.compaction_options_universal.max_merge_width: 4294967295
+2026/06/20-14:18:18.666654 6165524480 Options.compaction_options_universal.max_size_amplification_percent: 200
+2026/06/20-14:18:18.666655 6165524480 Options.compaction_options_universal.compression_size_percent: -1
+2026/06/20-14:18:18.666656 6165524480 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize
+2026/06/20-14:18:18.666657 6165524480 Options.compaction_options_fifo.max_table_files_size: 1073741824
+2026/06/20-14:18:18.666657 6165524480 Options.compaction_options_fifo.allow_compaction: 0
+2026/06/20-14:18:18.666658 6165524480                   Options.table_properties_collectors: 
+2026/06/20-14:18:18.666659 6165524480                   Options.inplace_update_support: 0
+2026/06/20-14:18:18.666659 6165524480                 Options.inplace_update_num_locks: 10000
+2026/06/20-14:18:18.666660 6165524480               Options.memtable_prefix_bloom_size_ratio: 0.000000
+2026/06/20-14:18:18.666661 6165524480               Options.memtable_whole_key_filtering: 0
+2026/06/20-14:18:18.666661 6165524480   Options.memtable_huge_page_size: 0
+2026/06/20-14:18:18.666662 6165524480                           Options.bloom_locality: 0
+2026/06/20-14:18:18.666662 6165524480                    Options.max_successive_merges: 0
+2026/06/20-14:18:18.666663 6165524480                Options.optimize_filters_for_hits: 0
+2026/06/20-14:18:18.666664 6165524480                Options.paranoid_file_checks: 0
+2026/06/20-14:18:18.666664 6165524480                Options.force_consistency_checks: 1
+2026/06/20-14:18:18.666665 6165524480                Options.report_bg_io_stats: 0
+2026/06/20-14:18:18.666665 6165524480                               Options.ttl: 2592000
+2026/06/20-14:18:18.666666 6165524480          Options.periodic_compaction_seconds: 0
+2026/06/20-14:18:18.666667 6165524480  Options.preclude_last_level_data_seconds: 0
+2026/06/20-14:18:18.666667 6165524480    Options.preserve_internal_time_seconds: 0
+2026/06/20-14:18:18.666668 6165524480                       Options.enable_blob_files: false
+2026/06/20-14:18:18.666668 6165524480                           Options.min_blob_size: 0
+2026/06/20-14:18:18.666669 6165524480                          Options.blob_file_size: 268435456
+2026/06/20-14:18:18.666669 6165524480                   Options.blob_compression_type: NoCompression
+2026/06/20-14:18:18.666670 6165524480          Options.enable_blob_garbage_collection: false
+2026/06/20-14:18:18.666671 6165524480      Options.blob_garbage_collection_age_cutoff: 0.250000
+2026/06/20-14:18:18.666671 6165524480 Options.blob_garbage_collection_force_threshold: 1.000000
+2026/06/20-14:18:18.666672 6165524480          Options.blob_compaction_readahead_size: 0
+2026/06/20-14:18:18.666672 6165524480                Options.blob_file_starting_level: 0
+2026/06/20-14:18:18.666673 6165524480 Options.experimental_mempurge_threshold: 0.000000
+2026/06/20-14:18:18.666925 6165524480 [db/version_set.cc:5668] Recovered from manifest file:/Users/aleksandrromanenko/cube_projects/cube.js__worktrees/cubestore-dicts/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/MANIFEST-000001 succeeded,manifest_file_number is 1, next_file_number is 3, last_sequence is 0, log_number is 0,prev_log_number is 0,max_column_family is 0,min_log_number_to_keep is 0
+2026/06/20-14:18:18.666927 6165524480 [db/version_set.cc:5677] Column family [default] (ID 0), log number is 0
+2026/06/20-14:18:18.666969 6165524480 [db/db_impl/db_impl_open.cc:539] DB ID: d00a5f92-ccd3-4958-88a5-390cb7553a78
+2026/06/20-14:18:18.667187 6165524480 [db/version_set.cc:5135] Creating manifest 5
+2026/06/20-14:18:18.669398 6165524480 [db/db_impl/db_impl_open.cc:1992] SstFileManager instance 0xa208f5880
+2026/06/20-14:18:18.669483 6165524480 DB pointer 0xa20875400
+2026/06/20-14:18:18.670784 6165524480 [db/db_impl/db_impl.cc:504] Shutdown: canceling all background work
+2026/06/20-14:18:18.671086 6165524480 [db/db_impl/db_impl.cc:711] Shutdown complete
diff --git a/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/MANIFEST-000005 b/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/MANIFEST-000005
new file mode 100644
index 0000000000000000000000000000000000000000..f1d1169ebb5084f6ab379df690c29da290eba713
GIT binary patch
literal 66
zcmZS8)^KKEU<~`Y<A)3bBcoJKYFTPdN|K&aWl3szW^t->er`cxQDRAc(HCZ(C>91r
VCI%LUKRRkZ*%%l(8JO8v7yzD{5)c3Y

literal 0
HcmV?d00001

diff --git a/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/OPTIONS-000007 b/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/OPTIONS-000007
new file mode 100644
index 0000000000000..d2b2e9bfc5695
--- /dev/null
+++ b/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/OPTIONS-000007
@@ -0,0 +1,198 @@
+# This is a RocksDB option file.
+#
+# For detailed file format spec, please refer to the example file
+# in examples/rocksdb_option_file_example.ini
+#
+
+[Version]
+  rocksdb_version=7.10.2
+  options_file_version=1.1
+
+[DBOptions]
+  max_background_flushes=-1
+  compaction_readahead_size=0
+  strict_bytes_per_sync=false
+  wal_bytes_per_sync=0
+  max_open_files=-1
+  stats_history_buffer_size=1048576
+  max_total_wal_size=0
+  stats_persist_period_sec=600
+  stats_dump_period_sec=600
+  avoid_flush_during_shutdown=false
+  max_subcompactions=1
+  bytes_per_sync=0
+  delayed_write_rate=16777216
+  max_background_compactions=-1
+  max_background_jobs=2
+  delete_obsolete_files_period_micros=21600000000
+  writable_file_max_buffer_size=1048576
+  file_checksum_gen_factory=nullptr
+  allow_data_in_errors=false
+  max_bgerror_resume_count=2147483647
+  best_efforts_recovery=false
+  write_dbid_to_manifest=false
+  atomic_flush=false
+  wal_compression=kNoCompression
+  manual_wal_flush=false
+  two_write_queues=false
+  avoid_flush_during_recovery=false
+  dump_malloc_stats=false
+  info_log_level=INFO_LEVEL
+  write_thread_slow_yield_usec=3
+  allow_ingest_behind=false
+  fail_if_options_file_error=false
+  persist_stats_to_disk=false
+  WAL_ttl_seconds=330
+  bgerror_resume_retry_interval=1000000
+  allow_concurrent_memtable_write=true
+  paranoid_checks=true
+  WAL_size_limit_MB=0
+  lowest_used_cache_tier=kNonVolatileBlockTier
+  keep_log_file_num=1000
+  table_cache_numshardbits=6
+  max_file_opening_threads=16
+  use_fsync=false
+  unordered_write=false
+  random_access_max_buffer_size=1048576
+  log_readahead_size=0
+  enable_pipelined_write=false
+  wal_recovery_mode=kPointInTimeRecovery
+  db_write_buffer_size=0
+  allow_2pc=false
+  skip_checking_sst_file_sizes_on_db_open=false
+  skip_stats_update_on_db_open=false
+  recycle_log_file_num=0
+  db_host_id=__hostname__
+  access_hint_on_compaction_start=NORMAL
+  verify_sst_unique_id_in_manifest=true
+  track_and_verify_wals_in_manifest=false
+  error_if_exists=false
+  manifest_preallocation_size=4194304
+  is_fd_close_on_exec=true
+  enable_write_thread_adaptive_yield=true
+  enable_thread_tracking=false
+  avoid_unnecessary_blocking_io=false
+  allow_fallocate=true
+  max_log_file_size=0
+  advise_random_on_open=true
+  create_missing_column_families=false
+  max_write_batch_group_size_bytes=1048576
+  use_adaptive_mutex=false
+  wal_filter=nullptr
+  create_if_missing=true
+  enforce_single_del_contracts=true
+  allow_mmap_writes=false
+  log_file_time_to_roll=0
+  use_direct_io_for_flush_and_compaction=false
+  flush_verify_memtable_count=true
+  max_manifest_file_size=1073741824
+  write_thread_max_yield_usec=100
+  use_direct_reads=false
+  allow_mmap_reads=false
+  
+
+[CFOptions "default"]
+  memtable_protection_bytes_per_key=0
+  bottommost_compression=kNoCompression
+  sample_for_compression=0
+  blob_garbage_collection_age_cutoff=0.250000
+  blob_compression_type=kNoCompression
+  prepopulate_blob_cache=kDisable
+  blob_compaction_readahead_size=0
+  level0_stop_writes_trigger=36
+  min_blob_size=0
+  last_level_temperature=kUnknown
+  compaction_options_universal={allow_trivial_move=false;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;compression_size_percent=-1;max_size_amplification_percent=200;incremental=false;max_merge_width=4294967295;size_ratio=1;}
+  target_file_size_base=67108864
+  ignore_max_compaction_bytes_for_input=true
+  memtable_whole_key_filtering=false
+  blob_file_starting_level=0
+  soft_pending_compaction_bytes_limit=68719476736
+  max_write_buffer_number=2
+  ttl=2592000
+  compaction_options_fifo={allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;}
+  check_flush_compaction_key_order=true
+  memtable_huge_page_size=0
+  max_successive_merges=0
+  inplace_update_num_locks=10000
+  enable_blob_garbage_collection=false
+  arena_block_size=1048576
+  bottommost_compression_opts={use_zstd_dict_trainer=true;enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_bytes=0;strategy=0;max_dict_buffer_bytes=0;level=32767;window_bits=-14;}
+  target_file_size_multiplier=1
+  max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1
+  blob_garbage_collection_force_threshold=1.000000
+  enable_blob_files=false
+  level0_slowdown_writes_trigger=20
+  compression=kNoCompression
+  level0_file_num_compaction_trigger=4
+  prefix_extractor=rocksdb.FixedPrefix.13
+  max_bytes_for_level_multiplier=10.000000
+  write_buffer_size=67108864
+  disable_auto_compactions=false
+  max_compaction_bytes=1677721600
+  compression_opts={use_zstd_dict_trainer=true;enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_bytes=0;strategy=0;max_dict_buffer_bytes=0;level=32767;window_bits=-14;}
+  hard_pending_compaction_bytes_limit=274877906944
+  blob_file_size=268435456
+  periodic_compaction_seconds=0
+  paranoid_file_checks=false
+  experimental_mempurge_threshold=0.000000
+  memtable_prefix_bloom_size_ratio=0.000000
+  max_bytes_for_level_base=268435456
+  max_sequential_skip_in_iterations=8
+  report_bg_io_stats=false
+  sst_partitioner_factory=nullptr
+  compaction_pri=kMinOverlappingRatio
+  compaction_style=kCompactionStyleLevel
+  compaction_filter_factory=nullptr
+  compaction_filter=nullptr
+  memtable_factory=SkipListFactory
+  comparator=leveldb.BytewiseComparator
+  bloom_locality=0
+  min_write_buffer_number_to_merge=1
+  table_factory=BlockBasedTable
+  max_write_buffer_size_to_maintain=0
+  max_write_buffer_number_to_maintain=0
+  preserve_internal_time_seconds=0
+  force_consistency_checks=true
+  optimize_filters_for_hits=false
+  merge_operator=meta_store merge
+  num_levels=7
+  level_compaction_dynamic_file_size=true
+  memtable_insert_with_hint_prefix_extractor=nullptr
+  level_compaction_dynamic_level_bytes=false
+  preclude_last_level_data_seconds=0
+  inplace_update_support=false
+  
+[TableOptions/BlockBasedTable "default"]
+  num_file_reads_for_auto_readahead=2
+  metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;}
+  read_amp_bytes_per_bit=0
+  verify_compression=false
+  format_version=5
+  optimize_filters_for_memory=false
+  partition_filters=false
+  detect_filter_construct_corruption=false
+  initial_auto_readahead_size=8192
+  max_auto_readahead_size=262144
+  enable_index_compression=true
+  checksum=kXXH3
+  index_block_restart_interval=1
+  pin_top_level_index_and_filter=true
+  block_align=false
+  block_size=4096
+  index_type=kBinarySearch
+  filter_policy=nullptr
+  metadata_block_size=4096
+  no_block_cache=false
+  index_shortening=kShortenSeparators
+  whole_key_filtering=true
+  block_size_deviation=10
+  data_block_index_type=kDataBlockBinarySearch
+  data_block_hash_table_util_ratio=0.750000
+  cache_index_and_filter_blocks=false
+  prepopulate_block_cache=kDisable
+  block_restart_interval=16
+  pin_l0_filter_and_index_blocks_in_cache=false
+  cache_index_and_filter_blocks_with_high_priority=true
+  flush_block_policy_factory=FlushBlockBySizePolicyFactory
+  

From c6faaedd5900ed1cb69392065f442b5891354642 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Sat, 20 Jun 2026 15:51:07 +0200
Subject: [PATCH 17/17] chore(cubestore): drop test scratch dirs committed by
 mistake; ignore them

The previous commit accidentally added RocksDB scratch directories left by the
metastore unit tests (test-*-local). Remove them and ignore the pattern.
---
 rust/cubestore/.gitignore                     |   3 +
 .../metastore/000004.log                      | Bin 2270 -> 0 bytes
 .../metastore/CURRENT                         |   1 -
 .../metastore/IDENTITY                        |   1 -
 .../metastore/LOCK                            |   0
 .../metastore/LOG                             | 245 ------------------
 .../metastore/MANIFEST-000005                 | Bin 66 -> 0 bytes
 .../metastore/OPTIONS-000007                  | 198 --------------
 .../metastore/000004.log                      | Bin 2907 -> 0 bytes
 .../metastore/CURRENT                         |   1 -
 .../metastore/IDENTITY                        |   1 -
 .../metastore/LOCK                            |   0
 .../metastore/LOG                             | 245 ------------------
 .../metastore/MANIFEST-000005                 | Bin 66 -> 0 bytes
 .../metastore/OPTIONS-000007                  | 198 --------------
 15 files changed, 3 insertions(+), 890 deletions(-)
 delete mode 100644 rust/cubestore/cubestore/test-create_partitions-local/metastore/000004.log
 delete mode 100644 rust/cubestore/cubestore/test-create_partitions-local/metastore/CURRENT
 delete mode 100644 rust/cubestore/cubestore/test-create_partitions-local/metastore/IDENTITY
 delete mode 100644 rust/cubestore/cubestore/test-create_partitions-local/metastore/LOCK
 delete mode 100644 rust/cubestore/cubestore/test-create_partitions-local/metastore/LOG
 delete mode 100644 rust/cubestore/cubestore/test-create_partitions-local/metastore/MANIFEST-000005
 delete mode 100644 rust/cubestore/cubestore/test-create_partitions-local/metastore/OPTIONS-000007
 delete mode 100644 rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/000004.log
 delete mode 100644 rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/CURRENT
 delete mode 100644 rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/IDENTITY
 delete mode 100644 rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/LOCK
 delete mode 100644 rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/LOG
 delete mode 100644 rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/MANIFEST-000005
 delete mode 100644 rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/OPTIONS-000007

diff --git a/rust/cubestore/.gitignore b/rust/cubestore/.gitignore
index 66d0436bd8f34..64f8a320fdced 100644
--- a/rust/cubestore/.gitignore
+++ b/rust/cubestore/.gitignore
@@ -10,3 +10,6 @@ cubestore/target
 cubesql/target
 cubestore-sql-tests/data/**
 cubestore/db-tmp
+# RocksDB scratch dirs left by metastore unit tests (run from the crate root)
+/cubestore/test-*-local/
+/cubestore/test-*-upstream/
diff --git a/rust/cubestore/cubestore/test-create_partitions-local/metastore/000004.log b/rust/cubestore/cubestore/test-create_partitions-local/metastore/000004.log
deleted file mode 100644
index 86d68225d039e3a5f00627c9627af2526b210e14..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2270
zcmc&!O=ule6h3EW;=ESVTAT<`7g?ksi`3?&A+13QQt<~wix#cfTyAIXd+*lyd6|1B
znF!seAPAD;LT#a<h)~@qxUnu&LW?N4uqwI`EJ7FVT!}6!o;yG9CGVNqjdb2)?wxbK
z`*XhYowKyQXZ#*Ogz_;Xj#%`9Ms50G{mTna0g|zJ%!p$Itk+a_GI%kM1lX-40lXCv
zcX-w$;+Rb?>>mEzeeJDtL4`(xnt0{puXMOm+sw}9aI1g#@hhi01Q{8G*nP<7-@fn|
zL2em@IDN=?<CD%s2i*#Cn>omYP{&l+$@-hE`H4`lA`(kCLHQjKaV7)slQ4^71%=C@
zuV!UpdaX-gKN{=WAmc#fVDFlkvC)R{nTF?~I~gWE*D|3kWYkSk&00wsaSa`=Sim(0
zsaPs_um(O2S_F)ochAkvE$o|J*f%$OtU0&XJVgI{7iJIMci_+xoXBJtu$0Gb!9*N@
z5<E1E%qYxfD16P7rp+Qj*HcNB`hwMwC{5=?ENQir(2&J)ITP$Tu~q>Xg&}>RfVbPJ
zXwxQx4mx3ZlZ|4oV`M?H8ixrFn3B)a7G0idX@X~}8%oV|Cl#ubgaL?Li&&8dJdki1
z+W0rkJe<Vc_&ChtBD{z{k^LKp;0yQ|-p2P~nlk6XDN5VZI8o`amSa7ZIo+VoX`J>@
z(vK8bywSt^_m;x6@bttZUoX9R^^ZyexU-(<W}y*M@74z|U?BlF5G|gC8cM}%QpgR8
z<Dt#sP(~1jd&AcdT|Gxc2TGF|ksD2xlR|&)lcJr4JO%gVJEz`x;OOfvy3cs<9@XA7
z@d=H_uBsSq>$|BvN6zj&KU0U=tM#H-ZFIknj%@rgdbsxQt|wK<CCvW$__d9zmp=Wh
z_F`F8bv;*Rs;W!ayD<rYl*syepeV|qwwh$jR(O~Rh<IMjxUc1kfK{GGR2x~BT!bpw
zPm@&=Vn!Mdxza3CM*dl=C39%R2ij2(hV#&pRM`c;k&Mm8FnY{8+kALuaC`TqGy9(%
z#Zd~OzV<=rPzR}~@j7=^>^XRS_8jZqXdx9j_E%F!pE%!K)IGjR4LZ(tD@KML>+13c
z)a~x5_n@)pnf-2Oee}j(*9hWm53wKo@xyahEF7yKy@oiHI_u`9ZaI}Q&%5$QQb$D8
z5pPai<2tFcwhDhJb(T?dk-C9I^c>DWPAzIO#fp(vPv!6K|MQGr-O1X~yIs*%hM2Z*
gF4OtR75dGq|A}e)KbW?+W4h-UWZK@2X}iz#KZ3*uod5s;

diff --git a/rust/cubestore/cubestore/test-create_partitions-local/metastore/CURRENT b/rust/cubestore/cubestore/test-create_partitions-local/metastore/CURRENT
deleted file mode 100644
index aa5bb8ea50905..0000000000000
--- a/rust/cubestore/cubestore/test-create_partitions-local/metastore/CURRENT
+++ /dev/null
@@ -1 +0,0 @@
-MANIFEST-000005
diff --git a/rust/cubestore/cubestore/test-create_partitions-local/metastore/IDENTITY b/rust/cubestore/cubestore/test-create_partitions-local/metastore/IDENTITY
deleted file mode 100644
index b67c275246ec7..0000000000000
--- a/rust/cubestore/cubestore/test-create_partitions-local/metastore/IDENTITY
+++ /dev/null
@@ -1 +0,0 @@
-83f38b5a-946f-41ab-8922-849ced38b4df
\ No newline at end of file
diff --git a/rust/cubestore/cubestore/test-create_partitions-local/metastore/LOCK b/rust/cubestore/cubestore/test-create_partitions-local/metastore/LOCK
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/rust/cubestore/cubestore/test-create_partitions-local/metastore/LOG b/rust/cubestore/cubestore/test-create_partitions-local/metastore/LOG
deleted file mode 100644
index 2ef96a9651d55..0000000000000
--- a/rust/cubestore/cubestore/test-create_partitions-local/metastore/LOG
+++ /dev/null
@@ -1,245 +0,0 @@
-2026/06/20-14:18:18.600117 6159085568 RocksDB version: 7.10.2
-2026/06/20-14:18:18.600157 6159085568 Compile date 2022-12-22 09:30:39
-2026/06/20-14:18:18.600166 6159085568 DB SUMMARY
-2026/06/20-14:18:18.600168 6159085568 DB Session ID:  VFFO8AISUMHVF8LR3OK3
-2026/06/20-14:18:18.600307 6159085568 SST files in /Users/aleksandrromanenko/cube_projects/cube.js__worktrees/cubestore-dicts/rust/cubestore/cubestore/test-create_partitions-local/metastore dir, Total Num: 0, files: 
-2026/06/20-14:18:18.600309 6159085568 Write Ahead Log file in /Users/aleksandrromanenko/cube_projects/cube.js__worktrees/cubestore-dicts/rust/cubestore/cubestore/test-create_partitions-local/metastore: 
-2026/06/20-14:18:18.600310 6159085568                         Options.error_if_exists: 0
-2026/06/20-14:18:18.600311 6159085568                       Options.create_if_missing: 1
-2026/06/20-14:18:18.600312 6159085568                         Options.paranoid_checks: 1
-2026/06/20-14:18:18.600312 6159085568             Options.flush_verify_memtable_count: 1
-2026/06/20-14:18:18.600313 6159085568                               Options.track_and_verify_wals_in_manifest: 0
-2026/06/20-14:18:18.600313 6159085568        Options.verify_sst_unique_id_in_manifest: 1
-2026/06/20-14:18:18.600314 6159085568                                     Options.env: 0x10c4d9d10
-2026/06/20-14:18:18.600315 6159085568                                      Options.fs: PosixFileSystem
-2026/06/20-14:18:18.600316 6159085568                                Options.info_log: 0xa2085cd98
-2026/06/20-14:18:18.600316 6159085568                Options.max_file_opening_threads: 16
-2026/06/20-14:18:18.600317 6159085568                              Options.statistics: 0x0
-2026/06/20-14:18:18.600318 6159085568                               Options.use_fsync: 0
-2026/06/20-14:18:18.600318 6159085568                       Options.max_log_file_size: 0
-2026/06/20-14:18:18.600319 6159085568                  Options.max_manifest_file_size: 1073741824
-2026/06/20-14:18:18.600319 6159085568                   Options.log_file_time_to_roll: 0
-2026/06/20-14:18:18.600320 6159085568                       Options.keep_log_file_num: 1000
-2026/06/20-14:18:18.600321 6159085568                    Options.recycle_log_file_num: 0
-2026/06/20-14:18:18.600321 6159085568                         Options.allow_fallocate: 1
-2026/06/20-14:18:18.600322 6159085568                        Options.allow_mmap_reads: 0
-2026/06/20-14:18:18.600322 6159085568                       Options.allow_mmap_writes: 0
-2026/06/20-14:18:18.600323 6159085568                        Options.use_direct_reads: 0
-2026/06/20-14:18:18.600323 6159085568                        Options.use_direct_io_for_flush_and_compaction: 0
-2026/06/20-14:18:18.600324 6159085568          Options.create_missing_column_families: 0
-2026/06/20-14:18:18.600325 6159085568                              Options.db_log_dir: 
-2026/06/20-14:18:18.600325 6159085568                                 Options.wal_dir: 
-2026/06/20-14:18:18.600326 6159085568                Options.table_cache_numshardbits: 6
-2026/06/20-14:18:18.600326 6159085568                         Options.WAL_ttl_seconds: 330
-2026/06/20-14:18:18.600327 6159085568                       Options.WAL_size_limit_MB: 0
-2026/06/20-14:18:18.600327 6159085568                        Options.max_write_batch_group_size_bytes: 1048576
-2026/06/20-14:18:18.600328 6159085568             Options.manifest_preallocation_size: 4194304
-2026/06/20-14:18:18.600329 6159085568                     Options.is_fd_close_on_exec: 1
-2026/06/20-14:18:18.600329 6159085568                   Options.advise_random_on_open: 1
-2026/06/20-14:18:18.600330 6159085568                    Options.db_write_buffer_size: 0
-2026/06/20-14:18:18.600330 6159085568                    Options.write_buffer_manager: 0xa21438fc0
-2026/06/20-14:18:18.600331 6159085568         Options.access_hint_on_compaction_start: 1
-2026/06/20-14:18:18.600332 6159085568           Options.random_access_max_buffer_size: 1048576
-2026/06/20-14:18:18.600334 6159085568                      Options.use_adaptive_mutex: 0
-2026/06/20-14:18:18.600335 6159085568                            Options.rate_limiter: 0x0
-2026/06/20-14:18:18.600336 6159085568     Options.sst_file_manager.rate_bytes_per_sec: 0
-2026/06/20-14:18:18.600336 6159085568                       Options.wal_recovery_mode: 2
-2026/06/20-14:18:18.600337 6159085568                  Options.enable_thread_tracking: 0
-2026/06/20-14:18:18.600338 6159085568                  Options.enable_pipelined_write: 0
-2026/06/20-14:18:18.600338 6159085568                  Options.unordered_write: 0
-2026/06/20-14:18:18.600339 6159085568         Options.allow_concurrent_memtable_write: 1
-2026/06/20-14:18:18.600339 6159085568      Options.enable_write_thread_adaptive_yield: 1
-2026/06/20-14:18:18.600340 6159085568             Options.write_thread_max_yield_usec: 100
-2026/06/20-14:18:18.600340 6159085568            Options.write_thread_slow_yield_usec: 3
-2026/06/20-14:18:18.600341 6159085568                               Options.row_cache: None
-2026/06/20-14:18:18.600342 6159085568                              Options.wal_filter: None
-2026/06/20-14:18:18.600342 6159085568             Options.avoid_flush_during_recovery: 0
-2026/06/20-14:18:18.600343 6159085568             Options.allow_ingest_behind: 0
-2026/06/20-14:18:18.600344 6159085568             Options.two_write_queues: 0
-2026/06/20-14:18:18.600344 6159085568             Options.manual_wal_flush: 0
-2026/06/20-14:18:18.600345 6159085568             Options.wal_compression: 0
-2026/06/20-14:18:18.600345 6159085568             Options.atomic_flush: 0
-2026/06/20-14:18:18.600346 6159085568             Options.avoid_unnecessary_blocking_io: 0
-2026/06/20-14:18:18.600346 6159085568                 Options.persist_stats_to_disk: 0
-2026/06/20-14:18:18.600347 6159085568                 Options.write_dbid_to_manifest: 0
-2026/06/20-14:18:18.600348 6159085568                 Options.log_readahead_size: 0
-2026/06/20-14:18:18.600348 6159085568                 Options.file_checksum_gen_factory: Unknown
-2026/06/20-14:18:18.600349 6159085568                 Options.best_efforts_recovery: 0
-2026/06/20-14:18:18.600350 6159085568                Options.max_bgerror_resume_count: 2147483647
-2026/06/20-14:18:18.600350 6159085568            Options.bgerror_resume_retry_interval: 1000000
-2026/06/20-14:18:18.600351 6159085568             Options.allow_data_in_errors: 0
-2026/06/20-14:18:18.600351 6159085568             Options.db_host_id: __hostname__
-2026/06/20-14:18:18.600352 6159085568             Options.enforce_single_del_contracts: true
-2026/06/20-14:18:18.600353 6159085568             Options.max_background_jobs: 2
-2026/06/20-14:18:18.600353 6159085568             Options.max_background_compactions: -1
-2026/06/20-14:18:18.600354 6159085568             Options.max_subcompactions: 1
-2026/06/20-14:18:18.600354 6159085568             Options.avoid_flush_during_shutdown: 0
-2026/06/20-14:18:18.600355 6159085568           Options.writable_file_max_buffer_size: 1048576
-2026/06/20-14:18:18.600356 6159085568             Options.delayed_write_rate : 16777216
-2026/06/20-14:18:18.600356 6159085568             Options.max_total_wal_size: 0
-2026/06/20-14:18:18.600357 6159085568             Options.delete_obsolete_files_period_micros: 21600000000
-2026/06/20-14:18:18.600357 6159085568                   Options.stats_dump_period_sec: 600
-2026/06/20-14:18:18.600358 6159085568                 Options.stats_persist_period_sec: 600
-2026/06/20-14:18:18.600358 6159085568                 Options.stats_history_buffer_size: 1048576
-2026/06/20-14:18:18.600359 6159085568                          Options.max_open_files: -1
-2026/06/20-14:18:18.600360 6159085568                          Options.bytes_per_sync: 0
-2026/06/20-14:18:18.600360 6159085568                      Options.wal_bytes_per_sync: 0
-2026/06/20-14:18:18.600361 6159085568                   Options.strict_bytes_per_sync: 0
-2026/06/20-14:18:18.600361 6159085568       Options.compaction_readahead_size: 0
-2026/06/20-14:18:18.600362 6159085568                  Options.max_background_flushes: -1
-2026/06/20-14:18:18.600362 6159085568 Compression algorithms supported:
-2026/06/20-14:18:18.600364 6159085568 	kZSTD supported: 1
-2026/06/20-14:18:18.600365 6159085568 	kZlibCompression supported: 0
-2026/06/20-14:18:18.600365 6159085568 	kXpressCompression supported: 0
-2026/06/20-14:18:18.600366 6159085568 	kSnappyCompression supported: 1
-2026/06/20-14:18:18.600367 6159085568 	kZSTDNotFinalCompression supported: 1
-2026/06/20-14:18:18.600367 6159085568 	kLZ4HCCompression supported: 0
-2026/06/20-14:18:18.600368 6159085568 	kLZ4Compression supported: 0
-2026/06/20-14:18:18.600369 6159085568 	kBZip2Compression supported: 0
-2026/06/20-14:18:18.600375 6159085568 Fast CRC32 supported: Supported on Arm64
-2026/06/20-14:18:18.600376 6159085568 DMutex implementation: pthread_mutex_t
-2026/06/20-14:18:18.600377 6159085568 Allocator: System
-2026/06/20-14:18:18.601165 6159085568 [db/db_impl/db_impl_open.cc:317] Creating manifest 1 
-2026/06/20-14:18:18.602118 6159085568 [db/version_set.cc:5617] Recovering from manifest file: /Users/aleksandrromanenko/cube_projects/cube.js__worktrees/cubestore-dicts/rust/cubestore/cubestore/test-create_partitions-local/metastore/MANIFEST-000001
-2026/06/20-14:18:18.602253 6159085568 [db/column_family.cc:632] --------------- Options for column family [default]:
-2026/06/20-14:18:18.602265 6159085568               Options.comparator: leveldb.BytewiseComparator
-2026/06/20-14:18:18.602268 6159085568           Options.merge_operator: meta_store merge
-2026/06/20-14:18:18.602270 6159085568        Options.compaction_filter: None
-2026/06/20-14:18:18.602271 6159085568        Options.compaction_filter_factory: None
-2026/06/20-14:18:18.602273 6159085568  Options.sst_partitioner_factory: None
-2026/06/20-14:18:18.602275 6159085568         Options.memtable_factory: SkipListFactory
-2026/06/20-14:18:18.602276 6159085568            Options.table_factory: BlockBasedTable
-2026/06/20-14:18:18.602295 6159085568            table_factory options:   flush_block_policy_factory: FlushBlockBySizePolicyFactory (0xa214d9380)
-  cache_index_and_filter_blocks: 0
-  cache_index_and_filter_blocks_with_high_priority: 1
-  pin_l0_filter_and_index_blocks_in_cache: 0
-  pin_top_level_index_and_filter: 1
-  index_type: 0
-  data_block_index_type: 0
-  index_shortening: 1
-  data_block_hash_table_util_ratio: 0.750000
-  checksum: 4
-  no_block_cache: 0
-  block_cache: 0xa214a9158
-  block_cache_name: LRUCache
-  block_cache_options:
-    capacity : 8388608
-    num_shard_bits : 4
-    strict_capacity_limit : 0
-    memory_allocator : None
-    high_pri_pool_ratio: 0.500
-    low_pri_pool_ratio: 0.000
-  block_cache_compressed: 0x0
-  persistent_cache: 0x0
-  block_size: 4096
-  block_size_deviation: 10
-  block_restart_interval: 16
-  index_block_restart_interval: 1
-  metadata_block_size: 4096
-  partition_filters: 0
-  use_delta_encoding: 1
-  filter_policy: nullptr
-  whole_key_filtering: 1
-  verify_compression: 0
-  read_amp_bytes_per_bit: 0
-  format_version: 5
-  enable_index_compression: 1
-  block_align: 0
-  max_auto_readahead_size: 262144
-  prepopulate_block_cache: 0
-  initial_auto_readahead_size: 8192
-  num_file_reads_for_auto_readahead: 2
-2026/06/20-14:18:18.602297 6159085568        Options.write_buffer_size: 67108864
-2026/06/20-14:18:18.602299 6159085568  Options.max_write_buffer_number: 2
-2026/06/20-14:18:18.602300 6159085568          Options.compression: NoCompression
-2026/06/20-14:18:18.602302 6159085568                  Options.bottommost_compression: NoCompression
-2026/06/20-14:18:18.602305 6159085568       Options.prefix_extractor: rocksdb.FixedPrefix
-2026/06/20-14:18:18.602306 6159085568   Options.memtable_insert_with_hint_prefix_extractor: nullptr
-2026/06/20-14:18:18.602308 6159085568             Options.num_levels: 7
-2026/06/20-14:18:18.602309 6159085568        Options.min_write_buffer_number_to_merge: 1
-2026/06/20-14:18:18.602310 6159085568     Options.max_write_buffer_number_to_maintain: 0
-2026/06/20-14:18:18.602311 6159085568     Options.max_write_buffer_size_to_maintain: 0
-2026/06/20-14:18:18.602313 6159085568            Options.bottommost_compression_opts.window_bits: -14
-2026/06/20-14:18:18.602314 6159085568                  Options.bottommost_compression_opts.level: 32767
-2026/06/20-14:18:18.602315 6159085568               Options.bottommost_compression_opts.strategy: 0
-2026/06/20-14:18:18.602317 6159085568         Options.bottommost_compression_opts.max_dict_bytes: 0
-2026/06/20-14:18:18.602319 6159085568         Options.bottommost_compression_opts.zstd_max_train_bytes: 0
-2026/06/20-14:18:18.602320 6159085568         Options.bottommost_compression_opts.parallel_threads: 1
-2026/06/20-14:18:18.602321 6159085568                  Options.bottommost_compression_opts.enabled: false
-2026/06/20-14:18:18.602323 6159085568         Options.bottommost_compression_opts.max_dict_buffer_bytes: 0
-2026/06/20-14:18:18.602324 6159085568         Options.bottommost_compression_opts.use_zstd_dict_trainer: true
-2026/06/20-14:18:18.602325 6159085568            Options.compression_opts.window_bits: -14
-2026/06/20-14:18:18.602326 6159085568                  Options.compression_opts.level: 32767
-2026/06/20-14:18:18.602328 6159085568               Options.compression_opts.strategy: 0
-2026/06/20-14:18:18.602329 6159085568         Options.compression_opts.max_dict_bytes: 0
-2026/06/20-14:18:18.602331 6159085568         Options.compression_opts.zstd_max_train_bytes: 0
-2026/06/20-14:18:18.602332 6159085568         Options.compression_opts.use_zstd_dict_trainer: true
-2026/06/20-14:18:18.602334 6159085568         Options.compression_opts.parallel_threads: 1
-2026/06/20-14:18:18.602335 6159085568                  Options.compression_opts.enabled: false
-2026/06/20-14:18:18.602336 6159085568         Options.compression_opts.max_dict_buffer_bytes: 0
-2026/06/20-14:18:18.602337 6159085568      Options.level0_file_num_compaction_trigger: 4
-2026/06/20-14:18:18.602339 6159085568          Options.level0_slowdown_writes_trigger: 20
-2026/06/20-14:18:18.602340 6159085568              Options.level0_stop_writes_trigger: 36
-2026/06/20-14:18:18.602341 6159085568                   Options.target_file_size_base: 67108864
-2026/06/20-14:18:18.602342 6159085568             Options.target_file_size_multiplier: 1
-2026/06/20-14:18:18.602344 6159085568                Options.max_bytes_for_level_base: 268435456
-2026/06/20-14:18:18.602346 6159085568 Options.level_compaction_dynamic_level_bytes: 0
-2026/06/20-14:18:18.602347 6159085568          Options.max_bytes_for_level_multiplier: 10.000000
-2026/06/20-14:18:18.602349 6159085568 Options.max_bytes_for_level_multiplier_addtl[0]: 1
-2026/06/20-14:18:18.602350 6159085568 Options.max_bytes_for_level_multiplier_addtl[1]: 1
-2026/06/20-14:18:18.602351 6159085568 Options.max_bytes_for_level_multiplier_addtl[2]: 1
-2026/06/20-14:18:18.602353 6159085568 Options.max_bytes_for_level_multiplier_addtl[3]: 1
-2026/06/20-14:18:18.602354 6159085568 Options.max_bytes_for_level_multiplier_addtl[4]: 1
-2026/06/20-14:18:18.602355 6159085568 Options.max_bytes_for_level_multiplier_addtl[5]: 1
-2026/06/20-14:18:18.602357 6159085568 Options.max_bytes_for_level_multiplier_addtl[6]: 1
-2026/06/20-14:18:18.602359 6159085568       Options.max_sequential_skip_in_iterations: 8
-2026/06/20-14:18:18.602360 6159085568                    Options.max_compaction_bytes: 1677721600
-2026/06/20-14:18:18.602361 6159085568   Options.ignore_max_compaction_bytes_for_input: true
-2026/06/20-14:18:18.602363 6159085568                        Options.arena_block_size: 1048576
-2026/06/20-14:18:18.602364 6159085568   Options.soft_pending_compaction_bytes_limit: 68719476736
-2026/06/20-14:18:18.602365 6159085568   Options.hard_pending_compaction_bytes_limit: 274877906944
-2026/06/20-14:18:18.602366 6159085568                Options.disable_auto_compactions: 0
-2026/06/20-14:18:18.602369 6159085568                        Options.compaction_style: kCompactionStyleLevel
-2026/06/20-14:18:18.602371 6159085568                          Options.compaction_pri: kMinOverlappingRatio
-2026/06/20-14:18:18.602372 6159085568 Options.compaction_options_universal.size_ratio: 1
-2026/06/20-14:18:18.602374 6159085568 Options.compaction_options_universal.min_merge_width: 2
-2026/06/20-14:18:18.602375 6159085568 Options.compaction_options_universal.max_merge_width: 4294967295
-2026/06/20-14:18:18.602384 6159085568 Options.compaction_options_universal.max_size_amplification_percent: 200
-2026/06/20-14:18:18.602385 6159085568 Options.compaction_options_universal.compression_size_percent: -1
-2026/06/20-14:18:18.602388 6159085568 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize
-2026/06/20-14:18:18.602389 6159085568 Options.compaction_options_fifo.max_table_files_size: 1073741824
-2026/06/20-14:18:18.602391 6159085568 Options.compaction_options_fifo.allow_compaction: 0
-2026/06/20-14:18:18.602393 6159085568                   Options.table_properties_collectors: 
-2026/06/20-14:18:18.602395 6159085568                   Options.inplace_update_support: 0
-2026/06/20-14:18:18.602396 6159085568                 Options.inplace_update_num_locks: 10000
-2026/06/20-14:18:18.602398 6159085568               Options.memtable_prefix_bloom_size_ratio: 0.000000
-2026/06/20-14:18:18.602400 6159085568               Options.memtable_whole_key_filtering: 0
-2026/06/20-14:18:18.602401 6159085568   Options.memtable_huge_page_size: 0
-2026/06/20-14:18:18.602402 6159085568                           Options.bloom_locality: 0
-2026/06/20-14:18:18.602403 6159085568                    Options.max_successive_merges: 0
-2026/06/20-14:18:18.602405 6159085568                Options.optimize_filters_for_hits: 0
-2026/06/20-14:18:18.602406 6159085568                Options.paranoid_file_checks: 0
-2026/06/20-14:18:18.602407 6159085568                Options.force_consistency_checks: 1
-2026/06/20-14:18:18.602408 6159085568                Options.report_bg_io_stats: 0
-2026/06/20-14:18:18.602410 6159085568                               Options.ttl: 2592000
-2026/06/20-14:18:18.602411 6159085568          Options.periodic_compaction_seconds: 0
-2026/06/20-14:18:18.602413 6159085568  Options.preclude_last_level_data_seconds: 0
-2026/06/20-14:18:18.602414 6159085568    Options.preserve_internal_time_seconds: 0
-2026/06/20-14:18:18.602415 6159085568                       Options.enable_blob_files: false
-2026/06/20-14:18:18.602417 6159085568                           Options.min_blob_size: 0
-2026/06/20-14:18:18.602418 6159085568                          Options.blob_file_size: 268435456
-2026/06/20-14:18:18.602419 6159085568                   Options.blob_compression_type: NoCompression
-2026/06/20-14:18:18.602421 6159085568          Options.enable_blob_garbage_collection: false
-2026/06/20-14:18:18.602422 6159085568      Options.blob_garbage_collection_age_cutoff: 0.250000
-2026/06/20-14:18:18.602424 6159085568 Options.blob_garbage_collection_force_threshold: 1.000000
-2026/06/20-14:18:18.602425 6159085568          Options.blob_compaction_readahead_size: 0
-2026/06/20-14:18:18.602427 6159085568                Options.blob_file_starting_level: 0
-2026/06/20-14:18:18.602428 6159085568 Options.experimental_mempurge_threshold: 0.000000
-2026/06/20-14:18:18.603051 6159085568 [db/version_set.cc:5668] Recovered from manifest file:/Users/aleksandrromanenko/cube_projects/cube.js__worktrees/cubestore-dicts/rust/cubestore/cubestore/test-create_partitions-local/metastore/MANIFEST-000001 succeeded,manifest_file_number is 1, next_file_number is 3, last_sequence is 0, log_number is 0,prev_log_number is 0,max_column_family is 0,min_log_number_to_keep is 0
-2026/06/20-14:18:18.603054 6159085568 [db/version_set.cc:5677] Column family [default] (ID 0), log number is 0
-2026/06/20-14:18:18.603171 6159085568 [db/db_impl/db_impl_open.cc:539] DB ID: 83f38b5a-946f-41ab-8922-849ced38b4df
-2026/06/20-14:18:18.603685 6159085568 [db/version_set.cc:5135] Creating manifest 5
-2026/06/20-14:18:18.606931 6159085568 [db/db_impl/db_impl_open.cc:1992] SstFileManager instance 0xa208f6a00
-2026/06/20-14:18:18.607243 6159085568 DB pointer 0xa2087fc00
-2026/06/20-14:18:18.610261 6159085568 [db/db_impl/db_impl.cc:504] Shutdown: canceling all background work
-2026/06/20-14:18:18.610650 6159085568 [db/db_impl/db_impl.cc:711] Shutdown complete
diff --git a/rust/cubestore/cubestore/test-create_partitions-local/metastore/MANIFEST-000005 b/rust/cubestore/cubestore/test-create_partitions-local/metastore/MANIFEST-000005
deleted file mode 100644
index f1d1169ebb5084f6ab379df690c29da290eba713..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 66
zcmZS8)^KKEU<~`Y<A)3bBcoJKYFTPdN|K&aWl3szW^t->er`cxQDRAc(HCZ(C>91r
VCI%LUKRRkZ*%%l(8JO8v7yzD{5)c3Y

diff --git a/rust/cubestore/cubestore/test-create_partitions-local/metastore/OPTIONS-000007 b/rust/cubestore/cubestore/test-create_partitions-local/metastore/OPTIONS-000007
deleted file mode 100644
index d2b2e9bfc5695..0000000000000
--- a/rust/cubestore/cubestore/test-create_partitions-local/metastore/OPTIONS-000007
+++ /dev/null
@@ -1,198 +0,0 @@
-# This is a RocksDB option file.
-#
-# For detailed file format spec, please refer to the example file
-# in examples/rocksdb_option_file_example.ini
-#
-
-[Version]
-  rocksdb_version=7.10.2
-  options_file_version=1.1
-
-[DBOptions]
-  max_background_flushes=-1
-  compaction_readahead_size=0
-  strict_bytes_per_sync=false
-  wal_bytes_per_sync=0
-  max_open_files=-1
-  stats_history_buffer_size=1048576
-  max_total_wal_size=0
-  stats_persist_period_sec=600
-  stats_dump_period_sec=600
-  avoid_flush_during_shutdown=false
-  max_subcompactions=1
-  bytes_per_sync=0
-  delayed_write_rate=16777216
-  max_background_compactions=-1
-  max_background_jobs=2
-  delete_obsolete_files_period_micros=21600000000
-  writable_file_max_buffer_size=1048576
-  file_checksum_gen_factory=nullptr
-  allow_data_in_errors=false
-  max_bgerror_resume_count=2147483647
-  best_efforts_recovery=false
-  write_dbid_to_manifest=false
-  atomic_flush=false
-  wal_compression=kNoCompression
-  manual_wal_flush=false
-  two_write_queues=false
-  avoid_flush_during_recovery=false
-  dump_malloc_stats=false
-  info_log_level=INFO_LEVEL
-  write_thread_slow_yield_usec=3
-  allow_ingest_behind=false
-  fail_if_options_file_error=false
-  persist_stats_to_disk=false
-  WAL_ttl_seconds=330
-  bgerror_resume_retry_interval=1000000
-  allow_concurrent_memtable_write=true
-  paranoid_checks=true
-  WAL_size_limit_MB=0
-  lowest_used_cache_tier=kNonVolatileBlockTier
-  keep_log_file_num=1000
-  table_cache_numshardbits=6
-  max_file_opening_threads=16
-  use_fsync=false
-  unordered_write=false
-  random_access_max_buffer_size=1048576
-  log_readahead_size=0
-  enable_pipelined_write=false
-  wal_recovery_mode=kPointInTimeRecovery
-  db_write_buffer_size=0
-  allow_2pc=false
-  skip_checking_sst_file_sizes_on_db_open=false
-  skip_stats_update_on_db_open=false
-  recycle_log_file_num=0
-  db_host_id=__hostname__
-  access_hint_on_compaction_start=NORMAL
-  verify_sst_unique_id_in_manifest=true
-  track_and_verify_wals_in_manifest=false
-  error_if_exists=false
-  manifest_preallocation_size=4194304
-  is_fd_close_on_exec=true
-  enable_write_thread_adaptive_yield=true
-  enable_thread_tracking=false
-  avoid_unnecessary_blocking_io=false
-  allow_fallocate=true
-  max_log_file_size=0
-  advise_random_on_open=true
-  create_missing_column_families=false
-  max_write_batch_group_size_bytes=1048576
-  use_adaptive_mutex=false
-  wal_filter=nullptr
-  create_if_missing=true
-  enforce_single_del_contracts=true
-  allow_mmap_writes=false
-  log_file_time_to_roll=0
-  use_direct_io_for_flush_and_compaction=false
-  flush_verify_memtable_count=true
-  max_manifest_file_size=1073741824
-  write_thread_max_yield_usec=100
-  use_direct_reads=false
-  allow_mmap_reads=false
-  
-
-[CFOptions "default"]
-  memtable_protection_bytes_per_key=0
-  bottommost_compression=kNoCompression
-  sample_for_compression=0
-  blob_garbage_collection_age_cutoff=0.250000
-  blob_compression_type=kNoCompression
-  prepopulate_blob_cache=kDisable
-  blob_compaction_readahead_size=0
-  level0_stop_writes_trigger=36
-  min_blob_size=0
-  last_level_temperature=kUnknown
-  compaction_options_universal={allow_trivial_move=false;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;compression_size_percent=-1;max_size_amplification_percent=200;incremental=false;max_merge_width=4294967295;size_ratio=1;}
-  target_file_size_base=67108864
-  ignore_max_compaction_bytes_for_input=true
-  memtable_whole_key_filtering=false
-  blob_file_starting_level=0
-  soft_pending_compaction_bytes_limit=68719476736
-  max_write_buffer_number=2
-  ttl=2592000
-  compaction_options_fifo={allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;}
-  check_flush_compaction_key_order=true
-  memtable_huge_page_size=0
-  max_successive_merges=0
-  inplace_update_num_locks=10000
-  enable_blob_garbage_collection=false
-  arena_block_size=1048576
-  bottommost_compression_opts={use_zstd_dict_trainer=true;enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_bytes=0;strategy=0;max_dict_buffer_bytes=0;level=32767;window_bits=-14;}
-  target_file_size_multiplier=1
-  max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1
-  blob_garbage_collection_force_threshold=1.000000
-  enable_blob_files=false
-  level0_slowdown_writes_trigger=20
-  compression=kNoCompression
-  level0_file_num_compaction_trigger=4
-  prefix_extractor=rocksdb.FixedPrefix.13
-  max_bytes_for_level_multiplier=10.000000
-  write_buffer_size=67108864
-  disable_auto_compactions=false
-  max_compaction_bytes=1677721600
-  compression_opts={use_zstd_dict_trainer=true;enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_bytes=0;strategy=0;max_dict_buffer_bytes=0;level=32767;window_bits=-14;}
-  hard_pending_compaction_bytes_limit=274877906944
-  blob_file_size=268435456
-  periodic_compaction_seconds=0
-  paranoid_file_checks=false
-  experimental_mempurge_threshold=0.000000
-  memtable_prefix_bloom_size_ratio=0.000000
-  max_bytes_for_level_base=268435456
-  max_sequential_skip_in_iterations=8
-  report_bg_io_stats=false
-  sst_partitioner_factory=nullptr
-  compaction_pri=kMinOverlappingRatio
-  compaction_style=kCompactionStyleLevel
-  compaction_filter_factory=nullptr
-  compaction_filter=nullptr
-  memtable_factory=SkipListFactory
-  comparator=leveldb.BytewiseComparator
-  bloom_locality=0
-  min_write_buffer_number_to_merge=1
-  table_factory=BlockBasedTable
-  max_write_buffer_size_to_maintain=0
-  max_write_buffer_number_to_maintain=0
-  preserve_internal_time_seconds=0
-  force_consistency_checks=true
-  optimize_filters_for_hits=false
-  merge_operator=meta_store merge
-  num_levels=7
-  level_compaction_dynamic_file_size=true
-  memtable_insert_with_hint_prefix_extractor=nullptr
-  level_compaction_dynamic_level_bytes=false
-  preclude_last_level_data_seconds=0
-  inplace_update_support=false
-  
-[TableOptions/BlockBasedTable "default"]
-  num_file_reads_for_auto_readahead=2
-  metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;}
-  read_amp_bytes_per_bit=0
-  verify_compression=false
-  format_version=5
-  optimize_filters_for_memory=false
-  partition_filters=false
-  detect_filter_construct_corruption=false
-  initial_auto_readahead_size=8192
-  max_auto_readahead_size=262144
-  enable_index_compression=true
-  checksum=kXXH3
-  index_block_restart_interval=1
-  pin_top_level_index_and_filter=true
-  block_align=false
-  block_size=4096
-  index_type=kBinarySearch
-  filter_policy=nullptr
-  metadata_block_size=4096
-  no_block_cache=false
-  index_shortening=kShortenSeparators
-  whole_key_filtering=true
-  block_size_deviation=10
-  data_block_index_type=kDataBlockBinarySearch
-  data_block_hash_table_util_ratio=0.750000
-  cache_index_and_filter_blocks=false
-  prepopulate_block_cache=kDisable
-  block_restart_interval=16
-  pin_l0_filter_and_index_blocks_in_cache=false
-  cache_index_and_filter_blocks_with_high_priority=true
-  flush_block_policy_factory=FlushBlockBySizePolicyFactory
-  
diff --git a/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/000004.log b/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/000004.log
deleted file mode 100644
index cb3cd81b35938ab98bd6610802c752160f6884ad..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2907
zcmeHJUuYaf82@H=(>vRye{e5Q^&x5!BBq)>YFb+@NWm%yDK<!xBE$4{?rz)tbJ?9u
zE&+WIrQ%CUA4CHxf`y{^q%ZmqF;Ju}N+l`?f<-9xMer$4O8sVc=dQWrLcs@7+&gw=
zzwb9Q`~BuOKXz(i_n!NJ5Xdgok{*gTDC*Gn3txS990&%}OSPngKs_dpW55cyFTkjV
zA&_l^kh?58KuF2J6lom(y>R~Bcbpte6srG?)4$;2rjDtZ)#9|h_=)T1>ImuSgcxne
z7vH|}9748qLd-T~;Ng!}Z<(Z!L$<4hR0+DKJdV+`rsStQ!O}{ov>~q4h0mGmKqd4N
zKZqesnfPSqAtvV<SoX7l)B!heL=%i1{UZi3V9!X=vebYwY&k9?H>iQ}(4nK4B0Kvq
zMIR^*40vIMOE<*%UB3}Vl2yaV=Mw5XW)7De+?YjzJ9ALMMhb^v^4{`jd2H|K*xvH!
zgk3&pAHe@3V`GmWu**|$DserBMLehpCISay!96udwL(35q9R!=aWY@vyT@S?RRq&z
z#uC@@Sy-*cLP8R_(}`eb#9R(wSfg7JG4NU~5;dHpGl45i-xfncbdAgjX9F+f4vXD+
zoT9-a>8j#c-0)n<WIYmbJ@gz9O(}vHGXNHxh81{?+=3F6$ROE8#^Et?2;L_@V*DEj
z;7j-vmdJ-NjQeWA%ebkN<o;ZxTCBRwcKkX%4wGSvxcEc8rgvhICk8XwQgW{UnJZIo
z-B`^PLbmFTv{MD4w%1<d2a*b;;D{<uJPDbeZqv^lO6TeB(&1~R6Dzr>ol$JABeRw8
z@oikSZKA2<oOM#^7M>Ti#N!bZKIj`-dg|CG1yY!_U>PWKgH#raJMwdE(3PRxM-~S!
zj%XfQHj$olgA~5=k1YS(dsydp>Q2r_1~Yy-@#pf5OP??4EXfX}wt_s22l5hWy)nuJ
zb;{am!78$%)`r}G&2TRf;PYnQ;}z-72$<!Ok0&Q-U<y&$)<oef$~3_f;PF_pBvywb
zsaD-46j6VHI}<jb>Y^#6{Q7R7CWhV#YtcTs)!Ef}_3cL|dr2>rpkI4G)X{u$YOMM_
zIeRAAoIR6nL<>sHG%gPv8^35Dlr6q8ftqHw5k2b@!Hd5VG|Id69N2c)ExmB|<m(8r
zIw5VlrGKn0{)UiJC!}q+^o6fqdJyflFNbuuTiRi_<$t5y?jN=H?fc(uakKxv-QN6o
zJes(A;byKJ+P2&B#&%2pQ+7+&+3oPjC*M0izwo9eMB6RR?RMqb<-+#rwQV}D?Urt8
zx3rtx(tp-&gEO_*uFp^3nccR&-n!cD=IpJr+mx1#?3Uukf%=SHXSb~popy_fzW`5%
Bu)6>N

diff --git a/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/CURRENT b/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/CURRENT
deleted file mode 100644
index aa5bb8ea50905..0000000000000
--- a/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/CURRENT
+++ /dev/null
@@ -1 +0,0 @@
-MANIFEST-000005
diff --git a/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/IDENTITY b/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/IDENTITY
deleted file mode 100644
index 82f78f1bfc646..0000000000000
--- a/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/IDENTITY
+++ /dev/null
@@ -1 +0,0 @@
-d00a5f92-ccd3-4958-88a5-390cb7553a78
\ No newline at end of file
diff --git a/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/LOCK b/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/LOCK
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/LOG b/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/LOG
deleted file mode 100644
index 840bd4c15f97d..0000000000000
--- a/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/LOG
+++ /dev/null
@@ -1,245 +0,0 @@
-2026/06/20-14:18:18.665188 6165524480 RocksDB version: 7.10.2
-2026/06/20-14:18:18.665209 6165524480 Compile date 2022-12-22 09:30:39
-2026/06/20-14:18:18.665210 6165524480 DB SUMMARY
-2026/06/20-14:18:18.665211 6165524480 DB Session ID:  VFFO8AISUMHVF8LR3OJV
-2026/06/20-14:18:18.665268 6165524480 SST files in /Users/aleksandrromanenko/cube_projects/cube.js__worktrees/cubestore-dicts/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore dir, Total Num: 0, files: 
-2026/06/20-14:18:18.665270 6165524480 Write Ahead Log file in /Users/aleksandrromanenko/cube_projects/cube.js__worktrees/cubestore-dicts/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore: 
-2026/06/20-14:18:18.665271 6165524480                         Options.error_if_exists: 0
-2026/06/20-14:18:18.665272 6165524480                       Options.create_if_missing: 1
-2026/06/20-14:18:18.665273 6165524480                         Options.paranoid_checks: 1
-2026/06/20-14:18:18.665273 6165524480             Options.flush_verify_memtable_count: 1
-2026/06/20-14:18:18.665274 6165524480                               Options.track_and_verify_wals_in_manifest: 0
-2026/06/20-14:18:18.665274 6165524480        Options.verify_sst_unique_id_in_manifest: 1
-2026/06/20-14:18:18.665275 6165524480                                     Options.env: 0x10c4d9d10
-2026/06/20-14:18:18.665276 6165524480                                      Options.fs: PosixFileSystem
-2026/06/20-14:18:18.665276 6165524480                                Options.info_log: 0xa2085cd98
-2026/06/20-14:18:18.665277 6165524480                Options.max_file_opening_threads: 16
-2026/06/20-14:18:18.665278 6165524480                              Options.statistics: 0x0
-2026/06/20-14:18:18.665278 6165524480                               Options.use_fsync: 0
-2026/06/20-14:18:18.665279 6165524480                       Options.max_log_file_size: 0
-2026/06/20-14:18:18.665279 6165524480                  Options.max_manifest_file_size: 1073741824
-2026/06/20-14:18:18.665280 6165524480                   Options.log_file_time_to_roll: 0
-2026/06/20-14:18:18.665281 6165524480                       Options.keep_log_file_num: 1000
-2026/06/20-14:18:18.665281 6165524480                    Options.recycle_log_file_num: 0
-2026/06/20-14:18:18.665282 6165524480                         Options.allow_fallocate: 1
-2026/06/20-14:18:18.665282 6165524480                        Options.allow_mmap_reads: 0
-2026/06/20-14:18:18.665283 6165524480                       Options.allow_mmap_writes: 0
-2026/06/20-14:18:18.665283 6165524480                        Options.use_direct_reads: 0
-2026/06/20-14:18:18.665284 6165524480                        Options.use_direct_io_for_flush_and_compaction: 0
-2026/06/20-14:18:18.665285 6165524480          Options.create_missing_column_families: 0
-2026/06/20-14:18:18.665285 6165524480                              Options.db_log_dir: 
-2026/06/20-14:18:18.665286 6165524480                                 Options.wal_dir: 
-2026/06/20-14:18:18.665286 6165524480                Options.table_cache_numshardbits: 6
-2026/06/20-14:18:18.665287 6165524480                         Options.WAL_ttl_seconds: 330
-2026/06/20-14:18:18.665287 6165524480                       Options.WAL_size_limit_MB: 0
-2026/06/20-14:18:18.665288 6165524480                        Options.max_write_batch_group_size_bytes: 1048576
-2026/06/20-14:18:18.665289 6165524480             Options.manifest_preallocation_size: 4194304
-2026/06/20-14:18:18.665289 6165524480                     Options.is_fd_close_on_exec: 1
-2026/06/20-14:18:18.665290 6165524480                   Options.advise_random_on_open: 1
-2026/06/20-14:18:18.665290 6165524480                    Options.db_write_buffer_size: 0
-2026/06/20-14:18:18.665291 6165524480                    Options.write_buffer_manager: 0xa21438620
-2026/06/20-14:18:18.665291 6165524480         Options.access_hint_on_compaction_start: 1
-2026/06/20-14:18:18.665292 6165524480           Options.random_access_max_buffer_size: 1048576
-2026/06/20-14:18:18.665293 6165524480                      Options.use_adaptive_mutex: 0
-2026/06/20-14:18:18.665293 6165524480                            Options.rate_limiter: 0x0
-2026/06/20-14:18:18.665294 6165524480     Options.sst_file_manager.rate_bytes_per_sec: 0
-2026/06/20-14:18:18.665294 6165524480                       Options.wal_recovery_mode: 2
-2026/06/20-14:18:18.665295 6165524480                  Options.enable_thread_tracking: 0
-2026/06/20-14:18:18.665296 6165524480                  Options.enable_pipelined_write: 0
-2026/06/20-14:18:18.665296 6165524480                  Options.unordered_write: 0
-2026/06/20-14:18:18.665297 6165524480         Options.allow_concurrent_memtable_write: 1
-2026/06/20-14:18:18.665297 6165524480      Options.enable_write_thread_adaptive_yield: 1
-2026/06/20-14:18:18.665298 6165524480             Options.write_thread_max_yield_usec: 100
-2026/06/20-14:18:18.665298 6165524480            Options.write_thread_slow_yield_usec: 3
-2026/06/20-14:18:18.665299 6165524480                               Options.row_cache: None
-2026/06/20-14:18:18.665300 6165524480                              Options.wal_filter: None
-2026/06/20-14:18:18.665300 6165524480             Options.avoid_flush_during_recovery: 0
-2026/06/20-14:18:18.665301 6165524480             Options.allow_ingest_behind: 0
-2026/06/20-14:18:18.665302 6165524480             Options.two_write_queues: 0
-2026/06/20-14:18:18.665302 6165524480             Options.manual_wal_flush: 0
-2026/06/20-14:18:18.665303 6165524480             Options.wal_compression: 0
-2026/06/20-14:18:18.665303 6165524480             Options.atomic_flush: 0
-2026/06/20-14:18:18.665304 6165524480             Options.avoid_unnecessary_blocking_io: 0
-2026/06/20-14:18:18.665304 6165524480                 Options.persist_stats_to_disk: 0
-2026/06/20-14:18:18.665305 6165524480                 Options.write_dbid_to_manifest: 0
-2026/06/20-14:18:18.665305 6165524480                 Options.log_readahead_size: 0
-2026/06/20-14:18:18.665306 6165524480                 Options.file_checksum_gen_factory: Unknown
-2026/06/20-14:18:18.665307 6165524480                 Options.best_efforts_recovery: 0
-2026/06/20-14:18:18.665308 6165524480                Options.max_bgerror_resume_count: 2147483647
-2026/06/20-14:18:18.665308 6165524480            Options.bgerror_resume_retry_interval: 1000000
-2026/06/20-14:18:18.665309 6165524480             Options.allow_data_in_errors: 0
-2026/06/20-14:18:18.665309 6165524480             Options.db_host_id: __hostname__
-2026/06/20-14:18:18.665310 6165524480             Options.enforce_single_del_contracts: true
-2026/06/20-14:18:18.665311 6165524480             Options.max_background_jobs: 2
-2026/06/20-14:18:18.665311 6165524480             Options.max_background_compactions: -1
-2026/06/20-14:18:18.665312 6165524480             Options.max_subcompactions: 1
-2026/06/20-14:18:18.665312 6165524480             Options.avoid_flush_during_shutdown: 0
-2026/06/20-14:18:18.665313 6165524480           Options.writable_file_max_buffer_size: 1048576
-2026/06/20-14:18:18.665313 6165524480             Options.delayed_write_rate : 16777216
-2026/06/20-14:18:18.665314 6165524480             Options.max_total_wal_size: 0
-2026/06/20-14:18:18.665315 6165524480             Options.delete_obsolete_files_period_micros: 21600000000
-2026/06/20-14:18:18.665315 6165524480                   Options.stats_dump_period_sec: 600
-2026/06/20-14:18:18.665316 6165524480                 Options.stats_persist_period_sec: 600
-2026/06/20-14:18:18.665316 6165524480                 Options.stats_history_buffer_size: 1048576
-2026/06/20-14:18:18.665317 6165524480                          Options.max_open_files: -1
-2026/06/20-14:18:18.665318 6165524480                          Options.bytes_per_sync: 0
-2026/06/20-14:18:18.665318 6165524480                      Options.wal_bytes_per_sync: 0
-2026/06/20-14:18:18.665319 6165524480                   Options.strict_bytes_per_sync: 0
-2026/06/20-14:18:18.665319 6165524480       Options.compaction_readahead_size: 0
-2026/06/20-14:18:18.665320 6165524480                  Options.max_background_flushes: -1
-2026/06/20-14:18:18.665320 6165524480 Compression algorithms supported:
-2026/06/20-14:18:18.665321 6165524480 	kZSTD supported: 1
-2026/06/20-14:18:18.665322 6165524480 	kZlibCompression supported: 0
-2026/06/20-14:18:18.665323 6165524480 	kXpressCompression supported: 0
-2026/06/20-14:18:18.665324 6165524480 	kSnappyCompression supported: 1
-2026/06/20-14:18:18.665324 6165524480 	kZSTDNotFinalCompression supported: 1
-2026/06/20-14:18:18.665325 6165524480 	kLZ4HCCompression supported: 0
-2026/06/20-14:18:18.665326 6165524480 	kLZ4Compression supported: 0
-2026/06/20-14:18:18.665326 6165524480 	kBZip2Compression supported: 0
-2026/06/20-14:18:18.665333 6165524480 Fast CRC32 supported: Supported on Arm64
-2026/06/20-14:18:18.665333 6165524480 DMutex implementation: pthread_mutex_t
-2026/06/20-14:18:18.665334 6165524480 Allocator: System
-2026/06/20-14:18:18.665830 6165524480 [db/db_impl/db_impl_open.cc:317] Creating manifest 1 
-2026/06/20-14:18:18.666540 6165524480 [db/version_set.cc:5617] Recovering from manifest file: /Users/aleksandrromanenko/cube_projects/cube.js__worktrees/cubestore-dicts/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/MANIFEST-000001
-2026/06/20-14:18:18.666603 6165524480 [db/column_family.cc:632] --------------- Options for column family [default]:
-2026/06/20-14:18:18.666605 6165524480               Options.comparator: leveldb.BytewiseComparator
-2026/06/20-14:18:18.666606 6165524480           Options.merge_operator: meta_store merge
-2026/06/20-14:18:18.666607 6165524480        Options.compaction_filter: None
-2026/06/20-14:18:18.666608 6165524480        Options.compaction_filter_factory: None
-2026/06/20-14:18:18.666609 6165524480  Options.sst_partitioner_factory: None
-2026/06/20-14:18:18.666609 6165524480         Options.memtable_factory: SkipListFactory
-2026/06/20-14:18:18.666610 6165524480            Options.table_factory: BlockBasedTable
-2026/06/20-14:18:18.666618 6165524480            table_factory options:   flush_block_policy_factory: FlushBlockBySizePolicyFactory (0xa214d94c0)
-  cache_index_and_filter_blocks: 0
-  cache_index_and_filter_blocks_with_high_priority: 1
-  pin_l0_filter_and_index_blocks_in_cache: 0
-  pin_top_level_index_and_filter: 1
-  index_type: 0
-  data_block_index_type: 0
-  index_shortening: 1
-  data_block_hash_table_util_ratio: 0.750000
-  checksum: 4
-  no_block_cache: 0
-  block_cache: 0xa214a8b58
-  block_cache_name: LRUCache
-  block_cache_options:
-    capacity : 8388608
-    num_shard_bits : 4
-    strict_capacity_limit : 0
-    memory_allocator : None
-    high_pri_pool_ratio: 0.500
-    low_pri_pool_ratio: 0.000
-  block_cache_compressed: 0x0
-  persistent_cache: 0x0
-  block_size: 4096
-  block_size_deviation: 10
-  block_restart_interval: 16
-  index_block_restart_interval: 1
-  metadata_block_size: 4096
-  partition_filters: 0
-  use_delta_encoding: 1
-  filter_policy: nullptr
-  whole_key_filtering: 1
-  verify_compression: 0
-  read_amp_bytes_per_bit: 0
-  format_version: 5
-  enable_index_compression: 1
-  block_align: 0
-  max_auto_readahead_size: 262144
-  prepopulate_block_cache: 0
-  initial_auto_readahead_size: 8192
-  num_file_reads_for_auto_readahead: 2
-2026/06/20-14:18:18.666619 6165524480        Options.write_buffer_size: 67108864
-2026/06/20-14:18:18.666620 6165524480  Options.max_write_buffer_number: 2
-2026/06/20-14:18:18.666621 6165524480          Options.compression: NoCompression
-2026/06/20-14:18:18.666621 6165524480                  Options.bottommost_compression: NoCompression
-2026/06/20-14:18:18.666623 6165524480       Options.prefix_extractor: rocksdb.FixedPrefix
-2026/06/20-14:18:18.666623 6165524480   Options.memtable_insert_with_hint_prefix_extractor: nullptr
-2026/06/20-14:18:18.666624 6165524480             Options.num_levels: 7
-2026/06/20-14:18:18.666625 6165524480        Options.min_write_buffer_number_to_merge: 1
-2026/06/20-14:18:18.666625 6165524480     Options.max_write_buffer_number_to_maintain: 0
-2026/06/20-14:18:18.666626 6165524480     Options.max_write_buffer_size_to_maintain: 0
-2026/06/20-14:18:18.666626 6165524480            Options.bottommost_compression_opts.window_bits: -14
-2026/06/20-14:18:18.666627 6165524480                  Options.bottommost_compression_opts.level: 32767
-2026/06/20-14:18:18.666627 6165524480               Options.bottommost_compression_opts.strategy: 0
-2026/06/20-14:18:18.666628 6165524480         Options.bottommost_compression_opts.max_dict_bytes: 0
-2026/06/20-14:18:18.666629 6165524480         Options.bottommost_compression_opts.zstd_max_train_bytes: 0
-2026/06/20-14:18:18.666629 6165524480         Options.bottommost_compression_opts.parallel_threads: 1
-2026/06/20-14:18:18.666630 6165524480                  Options.bottommost_compression_opts.enabled: false
-2026/06/20-14:18:18.666631 6165524480         Options.bottommost_compression_opts.max_dict_buffer_bytes: 0
-2026/06/20-14:18:18.666631 6165524480         Options.bottommost_compression_opts.use_zstd_dict_trainer: true
-2026/06/20-14:18:18.666632 6165524480            Options.compression_opts.window_bits: -14
-2026/06/20-14:18:18.666632 6165524480                  Options.compression_opts.level: 32767
-2026/06/20-14:18:18.666633 6165524480               Options.compression_opts.strategy: 0
-2026/06/20-14:18:18.666633 6165524480         Options.compression_opts.max_dict_bytes: 0
-2026/06/20-14:18:18.666634 6165524480         Options.compression_opts.zstd_max_train_bytes: 0
-2026/06/20-14:18:18.666635 6165524480         Options.compression_opts.use_zstd_dict_trainer: true
-2026/06/20-14:18:18.666635 6165524480         Options.compression_opts.parallel_threads: 1
-2026/06/20-14:18:18.666636 6165524480                  Options.compression_opts.enabled: false
-2026/06/20-14:18:18.666636 6165524480         Options.compression_opts.max_dict_buffer_bytes: 0
-2026/06/20-14:18:18.666637 6165524480      Options.level0_file_num_compaction_trigger: 4
-2026/06/20-14:18:18.666638 6165524480          Options.level0_slowdown_writes_trigger: 20
-2026/06/20-14:18:18.666638 6165524480              Options.level0_stop_writes_trigger: 36
-2026/06/20-14:18:18.666639 6165524480                   Options.target_file_size_base: 67108864
-2026/06/20-14:18:18.666639 6165524480             Options.target_file_size_multiplier: 1
-2026/06/20-14:18:18.666640 6165524480                Options.max_bytes_for_level_base: 268435456
-2026/06/20-14:18:18.666641 6165524480 Options.level_compaction_dynamic_level_bytes: 0
-2026/06/20-14:18:18.666641 6165524480          Options.max_bytes_for_level_multiplier: 10.000000
-2026/06/20-14:18:18.666642 6165524480 Options.max_bytes_for_level_multiplier_addtl[0]: 1
-2026/06/20-14:18:18.666642 6165524480 Options.max_bytes_for_level_multiplier_addtl[1]: 1
-2026/06/20-14:18:18.666643 6165524480 Options.max_bytes_for_level_multiplier_addtl[2]: 1
-2026/06/20-14:18:18.666644 6165524480 Options.max_bytes_for_level_multiplier_addtl[3]: 1
-2026/06/20-14:18:18.666644 6165524480 Options.max_bytes_for_level_multiplier_addtl[4]: 1
-2026/06/20-14:18:18.666645 6165524480 Options.max_bytes_for_level_multiplier_addtl[5]: 1
-2026/06/20-14:18:18.666645 6165524480 Options.max_bytes_for_level_multiplier_addtl[6]: 1
-2026/06/20-14:18:18.666646 6165524480       Options.max_sequential_skip_in_iterations: 8
-2026/06/20-14:18:18.666647 6165524480                    Options.max_compaction_bytes: 1677721600
-2026/06/20-14:18:18.666647 6165524480   Options.ignore_max_compaction_bytes_for_input: true
-2026/06/20-14:18:18.666648 6165524480                        Options.arena_block_size: 1048576
-2026/06/20-14:18:18.666648 6165524480   Options.soft_pending_compaction_bytes_limit: 68719476736
-2026/06/20-14:18:18.666649 6165524480   Options.hard_pending_compaction_bytes_limit: 274877906944
-2026/06/20-14:18:18.666649 6165524480                Options.disable_auto_compactions: 0
-2026/06/20-14:18:18.666651 6165524480                        Options.compaction_style: kCompactionStyleLevel
-2026/06/20-14:18:18.666652 6165524480                          Options.compaction_pri: kMinOverlappingRatio
-2026/06/20-14:18:18.666652 6165524480 Options.compaction_options_universal.size_ratio: 1
-2026/06/20-14:18:18.666653 6165524480 Options.compaction_options_universal.min_merge_width: 2
-2026/06/20-14:18:18.666654 6165524480 Options.compaction_options_universal.max_merge_width: 4294967295
-2026/06/20-14:18:18.666654 6165524480 Options.compaction_options_universal.max_size_amplification_percent: 200
-2026/06/20-14:18:18.666655 6165524480 Options.compaction_options_universal.compression_size_percent: -1
-2026/06/20-14:18:18.666656 6165524480 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize
-2026/06/20-14:18:18.666657 6165524480 Options.compaction_options_fifo.max_table_files_size: 1073741824
-2026/06/20-14:18:18.666657 6165524480 Options.compaction_options_fifo.allow_compaction: 0
-2026/06/20-14:18:18.666658 6165524480                   Options.table_properties_collectors: 
-2026/06/20-14:18:18.666659 6165524480                   Options.inplace_update_support: 0
-2026/06/20-14:18:18.666659 6165524480                 Options.inplace_update_num_locks: 10000
-2026/06/20-14:18:18.666660 6165524480               Options.memtable_prefix_bloom_size_ratio: 0.000000
-2026/06/20-14:18:18.666661 6165524480               Options.memtable_whole_key_filtering: 0
-2026/06/20-14:18:18.666661 6165524480   Options.memtable_huge_page_size: 0
-2026/06/20-14:18:18.666662 6165524480                           Options.bloom_locality: 0
-2026/06/20-14:18:18.666662 6165524480                    Options.max_successive_merges: 0
-2026/06/20-14:18:18.666663 6165524480                Options.optimize_filters_for_hits: 0
-2026/06/20-14:18:18.666664 6165524480                Options.paranoid_file_checks: 0
-2026/06/20-14:18:18.666664 6165524480                Options.force_consistency_checks: 1
-2026/06/20-14:18:18.666665 6165524480                Options.report_bg_io_stats: 0
-2026/06/20-14:18:18.666665 6165524480                               Options.ttl: 2592000
-2026/06/20-14:18:18.666666 6165524480          Options.periodic_compaction_seconds: 0
-2026/06/20-14:18:18.666667 6165524480  Options.preclude_last_level_data_seconds: 0
-2026/06/20-14:18:18.666667 6165524480    Options.preserve_internal_time_seconds: 0
-2026/06/20-14:18:18.666668 6165524480                       Options.enable_blob_files: false
-2026/06/20-14:18:18.666668 6165524480                           Options.min_blob_size: 0
-2026/06/20-14:18:18.666669 6165524480                          Options.blob_file_size: 268435456
-2026/06/20-14:18:18.666669 6165524480                   Options.blob_compression_type: NoCompression
-2026/06/20-14:18:18.666670 6165524480          Options.enable_blob_garbage_collection: false
-2026/06/20-14:18:18.666671 6165524480      Options.blob_garbage_collection_age_cutoff: 0.250000
-2026/06/20-14:18:18.666671 6165524480 Options.blob_garbage_collection_force_threshold: 1.000000
-2026/06/20-14:18:18.666672 6165524480          Options.blob_compaction_readahead_size: 0
-2026/06/20-14:18:18.666672 6165524480                Options.blob_file_starting_level: 0
-2026/06/20-14:18:18.666673 6165524480 Options.experimental_mempurge_threshold: 0.000000
-2026/06/20-14:18:18.666925 6165524480 [db/version_set.cc:5668] Recovered from manifest file:/Users/aleksandrromanenko/cube_projects/cube.js__worktrees/cubestore-dicts/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/MANIFEST-000001 succeeded,manifest_file_number is 1, next_file_number is 3, last_sequence is 0, log_number is 0,prev_log_number is 0,max_column_family is 0,min_log_number_to_keep is 0
-2026/06/20-14:18:18.666927 6165524480 [db/version_set.cc:5677] Column family [default] (ID 0), log number is 0
-2026/06/20-14:18:18.666969 6165524480 [db/db_impl/db_impl_open.cc:539] DB ID: d00a5f92-ccd3-4958-88a5-390cb7553a78
-2026/06/20-14:18:18.667187 6165524480 [db/version_set.cc:5135] Creating manifest 5
-2026/06/20-14:18:18.669398 6165524480 [db/db_impl/db_impl_open.cc:1992] SstFileManager instance 0xa208f5880
-2026/06/20-14:18:18.669483 6165524480 DB pointer 0xa20875400
-2026/06/20-14:18:18.670784 6165524480 [db/db_impl/db_impl.cc:504] Shutdown: canceling all background work
-2026/06/20-14:18:18.671086 6165524480 [db/db_impl/db_impl.cc:711] Shutdown complete
diff --git a/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/MANIFEST-000005 b/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/MANIFEST-000005
deleted file mode 100644
index f1d1169ebb5084f6ab379df690c29da290eba713..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 66
zcmZS8)^KKEU<~`Y<A)3bBcoJKYFTPdN|K&aWl3szW^t->er`cxQDRAc(HCZ(C>91r
VCI%LUKRRkZ*%%l(8JO8v7yzD{5)c3Y

diff --git a/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/OPTIONS-000007 b/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/OPTIONS-000007
deleted file mode 100644
index d2b2e9bfc5695..0000000000000
--- a/rust/cubestore/cubestore/test-get_active_partitions_for_indexes-local/metastore/OPTIONS-000007
+++ /dev/null
@@ -1,198 +0,0 @@
-# This is a RocksDB option file.
-#
-# For detailed file format spec, please refer to the example file
-# in examples/rocksdb_option_file_example.ini
-#
-
-[Version]
-  rocksdb_version=7.10.2
-  options_file_version=1.1
-
-[DBOptions]
-  max_background_flushes=-1
-  compaction_readahead_size=0
-  strict_bytes_per_sync=false
-  wal_bytes_per_sync=0
-  max_open_files=-1
-  stats_history_buffer_size=1048576
-  max_total_wal_size=0
-  stats_persist_period_sec=600
-  stats_dump_period_sec=600
-  avoid_flush_during_shutdown=false
-  max_subcompactions=1
-  bytes_per_sync=0
-  delayed_write_rate=16777216
-  max_background_compactions=-1
-  max_background_jobs=2
-  delete_obsolete_files_period_micros=21600000000
-  writable_file_max_buffer_size=1048576
-  file_checksum_gen_factory=nullptr
-  allow_data_in_errors=false
-  max_bgerror_resume_count=2147483647
-  best_efforts_recovery=false
-  write_dbid_to_manifest=false
-  atomic_flush=false
-  wal_compression=kNoCompression
-  manual_wal_flush=false
-  two_write_queues=false
-  avoid_flush_during_recovery=false
-  dump_malloc_stats=false
-  info_log_level=INFO_LEVEL
-  write_thread_slow_yield_usec=3
-  allow_ingest_behind=false
-  fail_if_options_file_error=false
-  persist_stats_to_disk=false
-  WAL_ttl_seconds=330
-  bgerror_resume_retry_interval=1000000
-  allow_concurrent_memtable_write=true
-  paranoid_checks=true
-  WAL_size_limit_MB=0
-  lowest_used_cache_tier=kNonVolatileBlockTier
-  keep_log_file_num=1000
-  table_cache_numshardbits=6
-  max_file_opening_threads=16
-  use_fsync=false
-  unordered_write=false
-  random_access_max_buffer_size=1048576
-  log_readahead_size=0
-  enable_pipelined_write=false
-  wal_recovery_mode=kPointInTimeRecovery
-  db_write_buffer_size=0
-  allow_2pc=false
-  skip_checking_sst_file_sizes_on_db_open=false
-  skip_stats_update_on_db_open=false
-  recycle_log_file_num=0
-  db_host_id=__hostname__
-  access_hint_on_compaction_start=NORMAL
-  verify_sst_unique_id_in_manifest=true
-  track_and_verify_wals_in_manifest=false
-  error_if_exists=false
-  manifest_preallocation_size=4194304
-  is_fd_close_on_exec=true
-  enable_write_thread_adaptive_yield=true
-  enable_thread_tracking=false
-  avoid_unnecessary_blocking_io=false
-  allow_fallocate=true
-  max_log_file_size=0
-  advise_random_on_open=true
-  create_missing_column_families=false
-  max_write_batch_group_size_bytes=1048576
-  use_adaptive_mutex=false
-  wal_filter=nullptr
-  create_if_missing=true
-  enforce_single_del_contracts=true
-  allow_mmap_writes=false
-  log_file_time_to_roll=0
-  use_direct_io_for_flush_and_compaction=false
-  flush_verify_memtable_count=true
-  max_manifest_file_size=1073741824
-  write_thread_max_yield_usec=100
-  use_direct_reads=false
-  allow_mmap_reads=false
-  
-
-[CFOptions "default"]
-  memtable_protection_bytes_per_key=0
-  bottommost_compression=kNoCompression
-  sample_for_compression=0
-  blob_garbage_collection_age_cutoff=0.250000
-  blob_compression_type=kNoCompression
-  prepopulate_blob_cache=kDisable
-  blob_compaction_readahead_size=0
-  level0_stop_writes_trigger=36
-  min_blob_size=0
-  last_level_temperature=kUnknown
-  compaction_options_universal={allow_trivial_move=false;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;compression_size_percent=-1;max_size_amplification_percent=200;incremental=false;max_merge_width=4294967295;size_ratio=1;}
-  target_file_size_base=67108864
-  ignore_max_compaction_bytes_for_input=true
-  memtable_whole_key_filtering=false
-  blob_file_starting_level=0
-  soft_pending_compaction_bytes_limit=68719476736
-  max_write_buffer_number=2
-  ttl=2592000
-  compaction_options_fifo={allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;}
-  check_flush_compaction_key_order=true
-  memtable_huge_page_size=0
-  max_successive_merges=0
-  inplace_update_num_locks=10000
-  enable_blob_garbage_collection=false
-  arena_block_size=1048576
-  bottommost_compression_opts={use_zstd_dict_trainer=true;enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_bytes=0;strategy=0;max_dict_buffer_bytes=0;level=32767;window_bits=-14;}
-  target_file_size_multiplier=1
-  max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1
-  blob_garbage_collection_force_threshold=1.000000
-  enable_blob_files=false
-  level0_slowdown_writes_trigger=20
-  compression=kNoCompression
-  level0_file_num_compaction_trigger=4
-  prefix_extractor=rocksdb.FixedPrefix.13
-  max_bytes_for_level_multiplier=10.000000
-  write_buffer_size=67108864
-  disable_auto_compactions=false
-  max_compaction_bytes=1677721600
-  compression_opts={use_zstd_dict_trainer=true;enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_bytes=0;strategy=0;max_dict_buffer_bytes=0;level=32767;window_bits=-14;}
-  hard_pending_compaction_bytes_limit=274877906944
-  blob_file_size=268435456
-  periodic_compaction_seconds=0
-  paranoid_file_checks=false
-  experimental_mempurge_threshold=0.000000
-  memtable_prefix_bloom_size_ratio=0.000000
-  max_bytes_for_level_base=268435456
-  max_sequential_skip_in_iterations=8
-  report_bg_io_stats=false
-  sst_partitioner_factory=nullptr
-  compaction_pri=kMinOverlappingRatio
-  compaction_style=kCompactionStyleLevel
-  compaction_filter_factory=nullptr
-  compaction_filter=nullptr
-  memtable_factory=SkipListFactory
-  comparator=leveldb.BytewiseComparator
-  bloom_locality=0
-  min_write_buffer_number_to_merge=1
-  table_factory=BlockBasedTable
-  max_write_buffer_size_to_maintain=0
-  max_write_buffer_number_to_maintain=0
-  preserve_internal_time_seconds=0
-  force_consistency_checks=true
-  optimize_filters_for_hits=false
-  merge_operator=meta_store merge
-  num_levels=7
-  level_compaction_dynamic_file_size=true
-  memtable_insert_with_hint_prefix_extractor=nullptr
-  level_compaction_dynamic_level_bytes=false
-  preclude_last_level_data_seconds=0
-  inplace_update_support=false
-  
-[TableOptions/BlockBasedTable "default"]
-  num_file_reads_for_auto_readahead=2
-  metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;}
-  read_amp_bytes_per_bit=0
-  verify_compression=false
-  format_version=5
-  optimize_filters_for_memory=false
-  partition_filters=false
-  detect_filter_construct_corruption=false
-  initial_auto_readahead_size=8192
-  max_auto_readahead_size=262144
-  enable_index_compression=true
-  checksum=kXXH3
-  index_block_restart_interval=1
-  pin_top_level_index_and_filter=true
-  block_align=false
-  block_size=4096
-  index_type=kBinarySearch
-  filter_policy=nullptr
-  metadata_block_size=4096
-  no_block_cache=false
-  index_shortening=kShortenSeparators
-  whole_key_filtering=true
-  block_size_deviation=10
-  data_block_index_type=kDataBlockBinarySearch
-  data_block_hash_table_util_ratio=0.750000
-  cache_index_and_filter_blocks=false
-  prepopulate_block_cache=kDisable
-  block_restart_interval=16
-  pin_l0_filter_and_index_blocks_in_cache=false
-  cache_index_and_filter_blocks_with_high_priority=true
-  flush_block_policy_factory=FlushBlockBySizePolicyFactory
-