diff --git a/be/src/vec/aggregate_functions/aggregate_function_entropy.cpp b/be/src/vec/aggregate_functions/aggregate_function_entropy.cpp new file mode 100644 index 00000000000000..146c6f842a14d2 --- /dev/null +++ b/be/src/vec/aggregate_functions/aggregate_function_entropy.cpp @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/AggregateFunctions/AggregateFunctionEntropy.cpp +// and modified by Doris + +#include "vec/aggregate_functions/aggregate_function_entropy.h" + +#include "runtime/define_primitive_type.h" +#include "vec/aggregate_functions/aggregate_function_simple_factory.h" +#include "vec/aggregate_functions/helpers.h" + +namespace doris::vectorized { +#include "common/compile_check_begin.h" + +AggregateFunctionPtr create_aggregate_function_entropy(const std::string& name, + const DataTypes& argument_types, + const DataTypePtr& result_type, + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { + if (argument_types.size() == 1) { + auto res = creator_with_type_list< + TYPE_BOOLEAN, TYPE_TINYINT, TYPE_SMALLINT, TYPE_INT, TYPE_BIGINT, TYPE_LARGEINT, + TYPE_DECIMAL32, TYPE_DECIMAL64, TYPE_DECIMAL128I, TYPE_DECIMAL256, TYPE_DECIMALV2, + TYPE_FLOAT, TYPE_DOUBLE, TYPE_DATE, TYPE_DATETIME, TYPE_DATEV2, TYPE_DATETIMEV2, + TYPE_TIME, TYPE_TIMEV2, TYPE_TIMESTAMPTZ>:: + create( + argument_types, result_is_nullable, attr); + if (res) { + return res; + } + + auto type = argument_types[0]->get_primitive_type(); + if (is_string_type(type) || is_varbinary(type) || type == TYPE_JSONB) { + res = creator_without_type::create< + AggregateFunctionEntropy>( + argument_types, result_is_nullable, attr); + return res; + } + } + + return creator_without_type::create< + AggregateFunctionEntropy>( + argument_types, result_is_nullable, attr); +} + +void register_aggregate_function_entropy(AggregateFunctionSimpleFactory& factory) { + factory.register_function_both("entropy", create_aggregate_function_entropy); +} + +} // namespace doris::vectorized diff --git a/be/src/vec/aggregate_functions/aggregate_function_entropy.h b/be/src/vec/aggregate_functions/aggregate_function_entropy.h new file mode 100644 index 00000000000000..edcf7b228aae75 --- /dev/null +++ b/be/src/vec/aggregate_functions/aggregate_function_entropy.h @@ -0,0 +1,199 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/AggregateFunctions/AggregateFunctionEntropy.cpp +// and modified by Doris + +#pragma once + +#include "vec/aggregate_functions/aggregate_function.h" +#include "vec/columns/column.h" +#include "vec/columns/column_decimal.h" +#include "vec/common/assert_cast.h" +#include "vec/common/hash_table/hash.h" +#include "vec/common/hash_table/phmap_fwd_decl.h" +#include "vec/common/string_ref.h" +#include "vec/common/uint128.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_number.h" + +namespace doris::vectorized { +#include "common/compile_check_begin.h" + +class Arena; +class BufferReadable; +class BufferWritable; +template +class ColumnVector; + +/** Calculates Shannon Entropy, using HashMap and computing empirical distribution function. + * Entropy is measured in bits (base-2 logarithm is used). + */ +template > +struct AggregateFunctionEntropyData { + using Container = flat_hash_map; + using Self = AggregateFunctionEntropyData; + Container frequency_map; + uint64_t total_count = 0; + + void clear() { + frequency_map.clear(); + total_count = 0; + } + + void add(const Value& elem) { + ++frequency_map[elem]; + ++total_count; + } + + void merge(const Self& rhs) { + frequency_map.reserve(frequency_map.size() + rhs.frequency_map.size()); + for (const auto& [elem, count] : rhs.frequency_map) { + frequency_map[elem] += count; + } + total_count += rhs.total_count; + } + + void write(BufferWritable& buf) const { + buf.write_var_uint(frequency_map.size()); + for (const auto& [elem, count] : frequency_map) { + buf.write_binary(elem); + buf.write_binary(count); + } + } + + void read(BufferReadable& buf) { + uint64_t new_size = 0; + buf.read_var_uint(new_size); + frequency_map.reserve(frequency_map.size() + new_size); + + Value elem; + uint64_t count; + for (size_t i = 0; i < new_size; ++i) { + buf.read_binary(elem); + buf.read_binary(count); + frequency_map[elem] += count; + total_count += count; + } + } + + Float64 get_result() const { + if (total_count == 0) { + return 0; + } + Float64 entropy = 0; + for (const auto& [_, count] : frequency_map) { + Float64 p = static_cast(count) / static_cast(total_count); + entropy -= p * std::log2(p); + } + return entropy; + } + + static String get_name() { return "entropy"; } +}; + +template +struct AggregateFunctionEntropySingleNumericData + : public AggregateFunctionEntropyData::CppType> { + using Base = AggregateFunctionEntropyData::CppType>; + + void add(const IColumn** columns, size_t /* columns_num */, size_t row_num, Arena&) { + const auto& vec = assert_cast::ColumnType&, + TypeCheckOnRelease::DISABLE>(*columns[0]) + .get_data(); + Base::add(vec[row_num]); + } +}; + +struct AggregateFunctionEntropySingleStringData + : public AggregateFunctionEntropyData { + using Base = AggregateFunctionEntropyData; + + void add(const IColumn** columns, size_t /* columns_num */, size_t row_num, Arena&) { + auto key = columns[0]->get_data_at(row_num); + auto hash_value = XXH_INLINE_XXH128(key.data, key.size, 0); + Base::add(UInt128 {hash_value.high64, hash_value.low64}); + } +}; + +struct AggregateFunctionEntropyGenericData + : public AggregateFunctionEntropyData { + using Base = AggregateFunctionEntropyData; + + void add(const IColumn** columns, size_t columns_num, size_t row_num, Arena& arena) { + const char* begin = nullptr; + StringRef key(begin, 0); + for (size_t i = 0; i < columns_num; ++i) { + auto cur_ref = columns[i]->serialize_value_into_arena(row_num, arena, begin); + key.data = cur_ref.data - key.size; + key.size += cur_ref.size; + } + auto hash_value = XXH_INLINE_XXH128(key.data, key.size, 0); + Base::add(UInt128 {hash_value.high64, hash_value.low64}); + } +}; + +template +class AggregateFunctionEntropy final + : public IAggregateFunctionDataHelper>, + VarargsExpression, + NullableAggregateFunction { +private: + size_t arguments_num; + +public: + AggregateFunctionEntropy(const DataTypes& arguments) + : IAggregateFunctionDataHelper>(arguments), + arguments_num(arguments.size()) {} + + void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, + Arena& arena) const override { + this->data(place).add(columns, arguments_num, row_num, arena); + } + + void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, + Arena&) const override { + this->data(place).merge(this->data(rhs)); + } + + void serialize(ConstAggregateDataPtr __restrict place, BufferWritable& buf) const override { + this->data(place).write(buf); + } + + void deserialize(AggregateDataPtr __restrict place, BufferReadable& buf, + Arena&) const override { + this->data(place).read(buf); + } + + void insert_result_into(ConstAggregateDataPtr place, IColumn& to) const override { + auto& column = assert_cast(to); + column.get_data().push_back(this->data(place).get_result()); + } + + void reset(AggregateDataPtr place) const override { this->data(place).clear(); } + + String get_name() const override { return Data::get_name(); } + + DataTypePtr get_return_type() const override { + return std::make_shared>(); + } +}; + +} // namespace doris::vectorized + +#include "common/compile_check_end.h" diff --git a/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp b/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp index 8ba0696b667535..9aa68a3c3dbb85 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp @@ -79,6 +79,7 @@ void register_aggregate_function_percentile_reservoir(AggregateFunctionSimpleFac void register_aggregate_function_ai_agg(AggregateFunctionSimpleFactory& factory); void register_aggregate_function_bool_union(AggregateFunctionSimpleFactory& factory); void register_aggregate_function_sem(AggregateFunctionSimpleFactory& factory); +void register_aggregate_function_entropy(AggregateFunctionSimpleFactory& factory); AggregateFunctionSimpleFactory& AggregateFunctionSimpleFactory::instance() { static std::once_flag oc; @@ -135,6 +136,7 @@ AggregateFunctionSimpleFactory& AggregateFunctionSimpleFactory::instance() { register_aggregate_function_ai_agg(instance); register_aggregate_function_bool_union(instance); register_aggregate_function_sem(instance); + register_aggregate_function_entropy(instance); // Register foreach and foreachv2 functions register_aggregate_function_combinator_foreach(instance); register_aggregate_function_combinator_foreachv2(instance); diff --git a/be/test/testutil/column_helper.h b/be/test/testutil/column_helper.h index d297b7cdd40fd8..164ca7b65bd148 100644 --- a/be/test/testutil/column_helper.h +++ b/be/test/testutil/column_helper.h @@ -127,14 +127,15 @@ struct ColumnHelper { return block; } - template - static Block create_block(const std::vector& data1, - const std::vector& data2) { - auto column1 = create_column(data1); - auto column2 = create_column(data2); - auto data_type = std::make_shared(); - Block block({ColumnWithTypeAndName(column1, data_type, "column1"), - ColumnWithTypeAndName(column2, data_type, "column2")}); + template + static Block create_block(const std::vector& data1, + const std::vector& data2) { + auto column1 = create_column(data1); + auto column2 = create_column(data2); + auto data_type1 = std::make_shared(); + auto data_type2 = std::make_shared(); + Block block({ColumnWithTypeAndName(column1, data_type1, "column1"), + ColumnWithTypeAndName(column2, data_type2, "column2")}); return block; } diff --git a/be/test/vec/aggregate_functions/agg_entropy_test.cpp b/be/test/vec/aggregate_functions/agg_entropy_test.cpp new file mode 100644 index 00000000000000..d74f49e812881d --- /dev/null +++ b/be/test/vec/aggregate_functions/agg_entropy_test.cpp @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "agg_function_test.h" + +namespace doris::vectorized { + +struct AggregateFunctionEntropyTest : public AggregateFunctiontest {}; + +// ------------------------------------------------------------ +// 1. Numeric entropy test +// ------------------------------------------------------------ +TEST_F(AggregateFunctionEntropyTest, test_numeric_entropy) { + create_agg("entropy", false, {std::make_shared()}, + std::make_shared()); + + // values: 1,1,2,2,3 + Block block = ColumnHelper::create_block({1, 1, 2, 2, 3}); + + double p1 = 2.0 / 5; + double p2 = 2.0 / 5; + double p3 = 1.0 / 5; + double expected = -(p1 * log2(p1) + p2 * log2(p2) + p3 * log2(p3)); + + execute(block, ColumnHelper::create_column_with_name({expected})); +} + +// ------------------------------------------------------------ +// 2. String entropy test +// ------------------------------------------------------------ +TEST_F(AggregateFunctionEntropyTest, test_string_entropy) { + create_agg("entropy", false, {std::make_shared()}, + std::make_shared()); + + Block block = ColumnHelper::create_block({"a", "a", "b"}); + + double p1 = 2.0 / 3; + double p2 = 1.0 / 3; + double expected = -(p1 * log2(p1) + p2 * log2(p2)); + + execute(block, ColumnHelper::create_column_with_name({expected})); +} + +// ------------------------------------------------------------ +// 3. Generic entropy test +// ------------------------------------------------------------ +TEST_F(AggregateFunctionEntropyTest, test_generic_entropy) { + create_agg("entropy", false, + {std::make_shared(), std::make_shared()}, + std::make_shared()); + + // rows: + // (1, "a") + // (1, "a") + // (2, "b") + Block block = + ColumnHelper::create_block({1, 1, 2}, {"a", "a", "b"}); + + double p1 = 2.0 / 3; + double p2 = 1.0 / 3; + double expected = -(p1 * log2(p1) + p2 * log2(p2)); + + execute(block, ColumnHelper::create_column_with_name({expected})); +} + +// ------------------------------------------------------------ +// 4. NULL entropy test +// ------------------------------------------------------------ +TEST_F(AggregateFunctionEntropyTest, test_nullable_entropy) { + create_agg("entropy", false, + {std::make_shared(std::make_shared())}, + std::make_shared()); + + // values: 1,1,NULL,2,NULL + Block block = + ColumnHelper::create_nullable_block({1, 1, 0, 2, 0}, {0, 0, 1, 0, 1}); + + // only non-null values: 1,1,2 + double p1 = 2.0 / 3; + double p2 = 1.0 / 3; + double expected = -(p1 * log2(p1) + p2 * log2(p2)); + + execute(block, ColumnHelper::create_column_with_name({expected})); +} + +// ------------------------------------------------------------ +// 5. Empty input test +// ------------------------------------------------------------ +TEST_F(AggregateFunctionEntropyTest, test_empty) { + create_agg("entropy", false, {std::make_shared()}, + std::make_shared()); + + Block block = ColumnHelper::create_block({}); + + // entropy of empty set = 0 + execute(block, ColumnHelper::create_column_with_name({0.0})); +} + +} // namespace doris::vectorized diff --git a/be/test/vec/aggregate_functions/agg_function_test.h b/be/test/vec/aggregate_functions/agg_function_test.h index 47577d58a45d1b..b58375e61776d6 100644 --- a/be/test/vec/aggregate_functions/agg_function_test.h +++ b/be/test/vec/aggregate_functions/agg_function_test.h @@ -179,8 +179,10 @@ struct AggregateFunctiontest : public testing::Test { agg_fn->create(place); Defer defer([&]() { agg_fn->destroy(place); }); - agg_fn->function()->deserialize_and_merge_from_column_range( - place, *serialize_column, 0, block.rows() - 1, arena); + if (block.rows() != 0) { + agg_fn->function()->deserialize_and_merge_from_column_range( + place, *serialize_column, 0, block.rows() - 1, arena); + } check_result(place); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinAggregateFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinAggregateFunctions.java index 9a7ba40a709eaa..e7e134652f6f25 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinAggregateFunctions.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinAggregateFunctions.java @@ -38,6 +38,7 @@ import org.apache.doris.nereids.trees.expressions.functions.agg.CountByEnum; import org.apache.doris.nereids.trees.expressions.functions.agg.Covar; import org.apache.doris.nereids.trees.expressions.functions.agg.CovarSamp; +import org.apache.doris.nereids.trees.expressions.functions.agg.Entropy; import org.apache.doris.nereids.trees.expressions.functions.agg.GroupArrayIntersect; import org.apache.doris.nereids.trees.expressions.functions.agg.GroupArrayUnion; import org.apache.doris.nereids.trees.expressions.functions.agg.GroupBitAnd; @@ -138,6 +139,7 @@ private BuiltinAggregateFunctions() { agg(CountByEnum.class, "count_by_enum"), agg(Covar.class, "covar", "covar_pop"), agg(CovarSamp.class, "covar_samp"), + agg(Entropy.class, "entropy"), agg(GroupArrayIntersect.class, "group_array_intersect"), agg(GroupArrayUnion.class, "group_array_union"), agg(GroupBitAnd.class, "group_bit_and"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/agg/Entropy.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/agg/Entropy.java new file mode 100644 index 00000000000000..2eea03bac2a31e --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/agg/Entropy.java @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions.functions.agg; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.DoubleType; +import org.apache.doris.nereids.types.coercion.AnyDataType; +import org.apache.doris.nereids.util.ExpressionUtils; + +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * AggregateFunction 'entropy'. Calculates Shannon Entropy of the input values. + */ +public class Entropy extends NullableAggregateFunction implements ExplicitlyCastableSignature { + public static final List SIGNATURES = ImmutableList.of( + FunctionSignature.ret(DoubleType.INSTANCE).varArgs(AnyDataType.INSTANCE_WITHOUT_INDEX) + ); + + /** + * constructor with 1 or more arguments. + */ + public Entropy(Expression arg0, Expression... varArgs) { + this(false, false, arg0, varArgs); + } + + /** + * constructor with distinct flag and 1 or more arguments. + */ + public Entropy(boolean distinct, Expression arg0, Expression... varArgs) { + this(distinct, false, arg0, varArgs); + } + + private Entropy(boolean distinct, boolean alwaysNullable, Expression arg0, Expression... varArgs) { + super("entropy", distinct, alwaysNullable, ExpressionUtils.mergeArguments(arg0, varArgs)); + } + + /** constructor for withChildren and reuse signature */ + private Entropy(NullableAggregateFunctionParams functionParams) { + super(functionParams); + } + + @Override + public Entropy withDistinctAndChildren(boolean distinct, List children) { + return new Entropy(getFunctionParams(distinct, children)); + } + + @Override + public Entropy withAlwaysNullable(boolean alwaysNullable) { + return new Entropy(getAlwaysNullableFunctionParams(alwaysNullable)); + } + + @Override + public R accept(ExpressionVisitor visitor, C context) { + return visitor.visitEntropy(this, context); + } + + @Override + public List getSignatures() { + return SIGNATURES; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/AggregateFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/AggregateFunctionVisitor.java index bb4be4ffffaff6..523e0672e2dc6b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/AggregateFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/AggregateFunctionVisitor.java @@ -39,6 +39,7 @@ import org.apache.doris.nereids.trees.expressions.functions.agg.CountByEnum; import org.apache.doris.nereids.trees.expressions.functions.agg.Covar; import org.apache.doris.nereids.trees.expressions.functions.agg.CovarSamp; +import org.apache.doris.nereids.trees.expressions.functions.agg.Entropy; import org.apache.doris.nereids.trees.expressions.functions.agg.GroupArrayIntersect; import org.apache.doris.nereids.trees.expressions.functions.agg.GroupArrayUnion; import org.apache.doris.nereids.trees.expressions.functions.agg.GroupBitAnd; @@ -188,6 +189,10 @@ default R visitCovarSamp(CovarSamp covarSamp, C context) { return visitNullableAggregateFunction(covarSamp, context); } + default R visitEntropy(Entropy entropy, C context) { + return visitNullableAggregateFunction(entropy, context); + } + default R visitMultiDistinctCount(MultiDistinctCount multiDistinctCount, C context) { return visitAggregateFunction(multiDistinctCount, context); } diff --git a/regression-test/data/query_p0/aggregate/aggregate_function_entropy.out b/regression-test/data/query_p0/aggregate/aggregate_function_entropy.out new file mode 100644 index 00000000000000..a5397039353e9a --- /dev/null +++ b/regression-test/data/query_p0/aggregate/aggregate_function_entropy.out @@ -0,0 +1,115 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !entropy_null -- +\N + +-- !entropy_literal -- +0 + +-- !entropy_boolean -- +0 + +-- !entropy_tinyint -- +2.321928094887362 + +-- !entropy_smallint -- +2.321928094887362 + +-- !entropy_int -- +2.321928094887362 + +-- !entropy_bigint -- +2.321928094887362 + +-- !entropy_largeint -- +2.321928094887362 + +-- !entropy_float -- +2.321928094887362 + +-- !entropy_double -- +2.321928094887362 + +-- !entropy_char -- +2.321928094887362 + +-- !entropy_varchar -- +2.321928094887362 + +-- !entropy_string -- +2.321928094887362 + +-- !entropy_date -- +2.321928094887362 + +-- !entropy_datetime -- +2.321928094887362 + +-- !entropy_decimal32 -- +2.321928094887362 + +-- !entropy_decimal64 -- +2.321928094887362 + +-- !entropy_decimal128 -- +2.321928094887362 + +-- !entropy_decimal256 -- +2.321928094887362 + +-- !entropy_ipv4 -- +2.321928094887362 + +-- !entropy_ipv6 -- +2.321928094887362 + +-- !entropy_array -- +2.321928094887362 + +-- !entropy_map -- +2.321928094887362 + +-- !entropy_struct -- +2.321928094887362 + +-- !entropy_multicol -- +2.321928094887362 + +-- !entropy_groupby -- +1 +1.584962500721156 + +-- !entropy_window -- +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1.584962500721156 +1.584962500721156 +1.584962500721156 +1.584962500721156 +1.584962500721156 +1.584962500721156 +1.584962500721156 +1.584962500721156 +1.584962500721156 +1.584962500721156 +1.584962500721156 +1.584962500721156 +1.584962500721156 +1.584962500721156 +1.584962500721156 + +-- !entropy_empty -- +\N + diff --git a/regression-test/suites/query_p0/aggregate/aggregate_function_entropy.groovy b/regression-test/suites/query_p0/aggregate/aggregate_function_entropy.groovy new file mode 100644 index 00000000000000..24018a77a4554b --- /dev/null +++ b/regression-test/suites/query_p0/aggregate/aggregate_function_entropy.groovy @@ -0,0 +1,185 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_entropy") { + + sql """set enable_nereids_planner=true""" + sql """set enable_fallback_to_original_planner=false""" + sql """set enable_decimal256=true""" + + qt_entropy_null """select entropy(NULL)""" + qt_entropy_literal """select entropy(1)""" + + sql """drop table if exists test_entropy_base""" + sql """ + create table test_entropy_base ( + k1 int null, + col_boolean boolean null, + col_tinyint tinyint null, + col_smallint smallint null, + col_int int null, + col_bigint bigint null, + col_largeint largeint null, + col_float float null, + col_double double null, + col_char char(10) null, + col_varchar varchar(100) null, + col_string string null, + col_date date null, + col_datetime datetime null, + col_decimal32 decimal(9,2) null, + col_decimal64 decimal(18,4) null, + col_decimal128 decimal(38,8) null, + col_decimal256 decimal(76,8) null, + col_ipv4 ipv4 null, + col_ipv6 ipv6 null, + col_array array null, + col_map map null, + col_struct struct null, + col_bitmap bitmap not null, + col_hll hll not null, + col_quantile_state quantile_state not null + ) + duplicate key (k1) + distributed BY hash(k1) buckets 3 + properties("replication_num" = "1") + """ + + sql """ + insert into test_entropy_base VALUES + (0, null, + null, null, null, null, null, + null, null, + null, null, null, + null, null, + null, null, null, null, + null, null, + null, null, null, + to_bitmap(0), hll_hash(0), to_quantile_state(0,2048) + ), + + (1, true, + 10, 1000, 1000000, 10000000000, 100000000000000000000, + 3.14, 1.718281828, + 'char1', 'varchar1', 'string1', + '2023-01-01', '2023-01-01 12:00:00', + 123.45, 12345.6789, 123456789.12345678, 123456789123456789123456789123456789123456789123456789123456789.12345678, + '192.168.1.1', '2001:db8::1', + [1,2,3], {'key1':100, 'key2':200}, named_struct('name','John1','age',21), + to_bitmap(1), hll_hash(1), to_quantile_state(1,2048) + ), + + (2, true, + 20, 2000, 2000000, 20000000000, 200000000000000000000, + 3.24, 2.718281828, + 'char2', 'varchar2', 'string2', + '2024-01-02', '2024-01-02 12:00:00', + 223.45, 22345.6789, 223456789.12345678, 223456789123456789123456789123456789123456789123456789123456789.12345678, + '192.168.1.2', '2001:db8::2', + [4,5,6], {'key1':200, 'key2':400}, named_struct('name','John2','age',22), + to_bitmap(2), hll_hash(2), to_quantile_state(2,2048) + ), + + (3, true, + 30, 3000, 3000000, 30000000000, 300000000000000000000, + 3.34, 3.718281828, + 'char3', 'varchar3', 'string3', + '2025-01-03', '2025-01-03 12:00:00', + 323.45, 32345.6789, 323456789.12345678, 323456789123456789123456789123456789123456789123456789123456789.12345678, + '192.168.1.3', '2001:db8::3', + [7,8,9], {'key1':300, 'key2':600}, named_struct('name','John3','age',23), + to_bitmap(3), hll_hash(3), to_quantile_state(3,2048) + ), + + (4, true, + 40, 4000, 4000000, 40000000000, 400000000000000000000, + 3.44, 4.718281828, + 'char4', 'varchar4', 'string4', + '2026-01-04', '2026-01-04 12:00:00', + 423.45, 42345.6789, 423456789.12345678, 423456789123456789123456789123456789123456789123456789123456789.12345678, + '192.168.1.4', '2001:db8::4', + [10,11,12], {'key1':400, 'key2':800}, named_struct('name','John4','age',24), + to_bitmap(4), hll_hash(4), to_quantile_state(4,2048) + ), + + (5, true, + 50, 5000, 5000000, 50000000000, 500000000000000000000, + 3.54, 5.718281828, + 'char5', 'varchar5', 'string5', + '2027-01-05', '2027-01-05 12:00:00', + 523.45, 52345.6789, 523456789.12345678, 523456789123456789123456789123456789123456789123456789123456789.12345678, + '192.168.1.5', '2001:db8::5', + [13,14,15], {'key1':500, 'key2':1000}, named_struct('name','John5','age',25), + to_bitmap(5), hll_hash(5), to_quantile_state(5,2048) + )""" + + sql """drop table if exists test_entropy""" + sql """ + create table test_entropy + duplicate key (k1) + distributed BY hash(k1) buckets 3 + properties("replication_num" = "1") + as + select test_entropy_base.* from test_entropy_base + lateral view explode( + array( + 1,2,2,2,5,0, + 1,4,4,3,3,0, + 5,5,3,5,1,0, + 1,1,2,3,4,0, + 5,3,4,4,2,0 + ) + ) test_entropy_seq as k + where test_entropy_seq.k = test_entropy_base.k1 + """ + + qt_entropy_boolean """select entropy(col_boolean) from test_entropy""" + qt_entropy_tinyint """select entropy(col_tinyint) from test_entropy""" + qt_entropy_smallint """select entropy(col_smallint) from test_entropy""" + qt_entropy_int """select entropy(col_int) from test_entropy""" + qt_entropy_bigint """select entropy(col_bigint) from test_entropy""" + qt_entropy_largeint """select entropy(col_largeint) from test_entropy""" + qt_entropy_float """select entropy(col_float) from test_entropy""" + qt_entropy_double """select entropy(col_double) from test_entropy""" + + qt_entropy_char """select entropy(col_char) from test_entropy""" + qt_entropy_varchar """select entropy(col_varchar) from test_entropy""" + qt_entropy_string """select entropy(col_string) from test_entropy""" + + qt_entropy_date """select entropy(col_date) from test_entropy""" + qt_entropy_datetime """select entropy(col_datetime) from test_entropy""" + + qt_entropy_decimal32 """select entropy(col_decimal32) from test_entropy""" + qt_entropy_decimal64 """select entropy(col_decimal64) from test_entropy""" + qt_entropy_decimal128 """select entropy(col_decimal128) from test_entropy""" + qt_entropy_decimal256 """select entropy(col_decimal256) from test_entropy""" + + qt_entropy_ipv4 """select entropy(col_ipv4) from test_entropy""" + qt_entropy_ipv6 """select entropy(col_ipv6) from test_entropy""" + + qt_entropy_array """select entropy(col_array) from test_entropy""" + qt_entropy_map """select entropy(col_map) from test_entropy""" + qt_entropy_struct """select entropy(col_struct) from test_entropy""" + + qt_entropy_multicol """select entropy(col_int, col_string, col_datetime) from test_entropy""" + + qt_entropy_groupby """select entropy(col_int) from test_entropy group by k1 % 2 order by k1%2""" + qt_entropy_window """select entropy(col_int) over (partition by k1%2) from test_entropy order by k1%2""" + + qt_entropy_empty """select entropy(col_int) from test_entropy where 1=0""" + +}