diff --git a/be/src/vec/common/variant_util.cpp b/be/src/vec/common/variant_util.cpp index 069a64798d062a..92553ba90783d9 100644 --- a/be/src/vec/common/variant_util.cpp +++ b/be/src/vec/common/variant_util.cpp @@ -136,9 +136,9 @@ struct GlobRegexCacheEntry { std::list::iterator lru_it; }; -std::mutex g_glob_regex_cache_mutex; -std::list g_glob_regex_cache_lru; -std::unordered_map g_glob_regex_cache; +static std::mutex g_glob_regex_cache_mutex; +static std::list g_glob_regex_cache_lru; +static std::unordered_map g_glob_regex_cache; std::shared_ptr get_or_build_re2(const std::string& glob_pattern) { { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CheckAfterRewrite.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CheckAfterRewrite.java index 9b641b81755be0..c033bb69d17d21 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CheckAfterRewrite.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CheckAfterRewrite.java @@ -217,7 +217,7 @@ private boolean containsVariantTypeOutsideCast(Expression expr) { } private boolean containsVariantTypeOutsideCast(Expression expr, boolean underCast) { - boolean nextUnderCast = underCast || expr instanceof Cast; + boolean nextUnderCast = underCast || (expr instanceof Cast && !expr.getDataType().isVariantType()); if (!nextUnderCast && expr.getDataType().isVariantType()) { return true; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/ExpressionAnalyzer.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/ExpressionAnalyzer.java index 134610b5c345b1..3ef9941946d67a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/ExpressionAnalyzer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/ExpressionAnalyzer.java @@ -735,10 +735,7 @@ private boolean isEnableVariantSchemaAutoCast(ExpressionRewriteContext context) return false; } SessionVariable sessionVariable = context.cascadesContext.getConnectContext().getSessionVariable(); - if (sessionVariable == null || !sessionVariable.isEnableVariantSchemaAutoCast()) { - return false; - } - return sessionVariable.isEnableVariantSchemaAutoCast(); + return sessionVariable != null && sessionVariable.isEnableVariantSchemaAutoCast(); } private Expression wrapVariantElementAtWithCast(Expression expr) { @@ -808,6 +805,9 @@ private Expression maybeCastAliasExpression(Alias alias, ExpressionRewriteContex return alias; } Expression child = alias.child(); + if (!(child instanceof ElementAt)) { + return alias; + } Expression casted = wrapVariantElementAtWithCast(child); if (casted == child) { return alias; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/CheckMatchExpression.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/CheckMatchExpression.java index 623c3085962b47..aefd8070ad9392 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/CheckMatchExpression.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/CheckMatchExpression.java @@ -49,10 +49,7 @@ private Plan checkChildren(LogicalFilter filter) { for (Expression expr : expressions) { if (expr instanceof Match) { Match matchExpression = (Match) expr; - boolean isSlotReference = matchExpression.left() instanceof SlotReference; - boolean isCastChildWithSlotReference = (matchExpression.left() instanceof Cast - && matchExpression.left().child(0) instanceof SlotReference); - if (!(isSlotReference || isCastChildWithSlotReference) + if (!isSlotOrCastChainOnSlot(matchExpression.left()) || !(matchExpression.right() instanceof Literal)) { throw new AnalysisException(String.format("Only support match left operand is SlotRef," + " right operand is Literal. But meet expression %s", matchExpression)); @@ -61,4 +58,12 @@ private Plan checkChildren(LogicalFilter filter) { } return filter; } + + private boolean isSlotOrCastChainOnSlot(Expression expression) { + Expression current = expression; + while (current instanceof Cast) { + current = current.child(0); + } + return current instanceof SlotReference; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index fc8bcbb9eab302..d70d8793d51f7d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -2442,7 +2442,7 @@ public boolean isEnableHboNonStrictMatchingMode() { + "因为外表会存在表的 schema 中 char 或者 varchar 列的最大长度和底层 parquet 或者 orc 文件中的 schema 不一致" + "的情况。此时开启改选项,会按照表的 schema 中的最大长度进行截断。", "Whether to truncate char or varchar columns according to the table's schema. " - + "The default is false.\n" + + "The default is true.\n" + "Because the maximum length of the char or varchar column in the schema of the table" + " is inconsistent with the schema in the underlying parquet or orc file." + " At this time, if the option is turned on, it will be truncated according to the maximum length" @@ -3297,12 +3297,12 @@ public boolean isEnableESParallelScroll() { needForward = true, affectQueryResultInExecution = true, description = { - "是否启用基于 schema template 的 variant 自动 cast,默认关闭。", + "是否启用基于 schema template 的 variant 自动 cast,默认开启。", "Whether to enable schema-template-based auto cast for variant expressions. " - + "The default is false." + + "The default is true." } ) - public boolean enableVariantSchemaAutoCast = false; + public boolean enableVariantSchemaAutoCast = true; @VariableMgr.VarAttr( name = DEFAULT_VARIANT_ENABLE_TYPED_PATHS_TO_SPARSE, diff --git a/regression-test/data/variant_p0/predefine/test_schema_template_auto_cast.out b/regression-test/data/variant_p0/predefine/test_schema_template_auto_cast.out index 3b9ecdd4580947..a1df514a0cf327 100644 --- a/regression-test/data/variant_p0/predefine/test_schema_template_auto_cast.out +++ b/regression-test/data/variant_p0/predefine/test_schema_template_auto_cast.out @@ -299,3 +299,44 @@ charlie 50 -- !leaf_having_mixed -- 3033333 3033333 4044444 4044444 + +-- !glob_wild_match -- +2 3 + +-- !glob_literal_match -- +2 + +-- !nonleaf_auto_cast_on -- +\\N + +-- !nonleaf_auto_cast_off -- +{"level1_num_1":1011111,"level1_num_2":102} + +-- !explicit_cast_chain_select_2 -- +10 +30 +50 +15 + +-- !explicit_cast_chain_where_3 -- +2 +3 + +-- !explicit_cast_chain_order_by_4 -- +3 +2 +4 +1 + +-- !explicit_cast_chain_group_having_4 -- +15 1 +30 1 +50 1 + +-- !explicit_cast_chain_match_2 -- +1 +4 + +-- !explicit_cast_chain_match_4 -- +1 +4 diff --git a/regression-test/suites/variant_p0/predefine/predefined_typed_to_sparse.groovy b/regression-test/suites/variant_p0/predefine/predefined_typed_to_sparse.groovy index 7f6ee1974b2daf..21efbd9584c904 100644 --- a/regression-test/suites/variant_p0/predefine/predefined_typed_to_sparse.groovy +++ b/regression-test/suites/variant_p0/predefine/predefined_typed_to_sparse.groovy @@ -17,6 +17,7 @@ suite("test_predefine_typed_to_sparse", "p0"){ sql """ set enable_common_expr_pushdown = true """ sql """ set default_variant_enable_doc_mode = false """ + sql """ set enable_variant_schema_auto_cast = false """ def count = new Random().nextInt(10) + 1 def load_json_data = {table_name, file_name -> diff --git a/regression-test/suites/variant_p0/predefine/test_schema_template_auto_cast.groovy b/regression-test/suites/variant_p0/predefine/test_schema_template_auto_cast.groovy index c8e80b03bdd4a2..0e5a90ed9c9939 100644 --- a/regression-test/suites/variant_p0/predefine/test_schema_template_auto_cast.groovy +++ b/regression-test/suites/variant_p0/predefine/test_schema_template_auto_cast.groovy @@ -307,4 +307,108 @@ suite("test_schema_template_auto_cast", "p0") { ORDER BY data['int_nested.level1_num_1'] """ sql "DROP TABLE IF EXISTS ${leafTable}" -} + + // Test 16: backslash escaping in schema template pattern + def globWildTable = "test_variant_schema_auto_cast_glob_wild" + def globLiteralTable = "test_variant_schema_auto_cast_glob_literal" + def globLiteralPattern = "a\\*b" // SQL sees a\*b, glob sees a\*b (literal *) + + sql "DROP TABLE IF EXISTS ${globWildTable}" + sql "DROP TABLE IF EXISTS ${globLiteralTable}" + + sql """CREATE TABLE ${globWildTable} ( + `id` bigint NULL, + `data` variant<'a*b': BIGINT> NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """CREATE TABLE ${globLiteralTable} ( + `id` bigint NULL, + `data` variant<'${globLiteralPattern}': BIGINT> NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${globWildTable} values(1, '{\"a*b\": 1, \"axb\": 2}')""" + sql """insert into ${globLiteralTable} values(1, '{\"a*b\": 1, \"axb\": 2}')""" + + // wildcard a*b matches both a*b and axb + qt_glob_wild_match """ SELECT data['a*b'] + 1 AS v1, data['axb'] + 1 AS v2 + FROM ${globWildTable} ORDER BY id """ + + // literal a\*b matches only a*b + qt_glob_literal_match """ SELECT data['a*b'] + 1 AS v1 FROM ${globLiteralTable} ORDER BY id """ + test { + sql """ SELECT data['axb'] + 1 FROM ${globLiteralTable} """ + exception "Cannot cast from variant" + } + + sql "DROP TABLE IF EXISTS ${globWildTable}" + sql "DROP TABLE IF EXISTS ${globLiteralTable}" + + + // Test 17: non-leaf path auto cast limitation + def nonleafTable = "test_variant_schema_auto_cast_nonleaf_limit" + sql "DROP TABLE IF EXISTS ${nonleafTable}" + sql """CREATE TABLE ${nonleafTable} ( + `id` int NULL, + `data` variant<'int_*': INT> NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${nonleafTable} values( + 1, '{"int_1": 1, "int_nested": {"level1_num_1": 1011111, "level1_num_2": 102}}')""" + + // auto cast enabled: non-leaf path matches int_* and returns NULL + sql "set enable_variant_schema_auto_cast = true" + qt_nonleaf_auto_cast_on """ SELECT data['int_nested'] FROM ${nonleafTable} ORDER BY id """ + + // auto cast disabled: return original object + sql "set enable_variant_schema_auto_cast = false" + qt_nonleaf_auto_cast_off """ SELECT data['int_nested'] FROM ${nonleafTable} ORDER BY id """ + + // restore default + sql "set enable_variant_schema_auto_cast = true" + sql "DROP TABLE IF EXISTS ${nonleafTable}" + + + // Test 18: multi-layer explicit cast chain (2~4), including MATCH clause + def castChainTable = "test_variant_schema_auto_cast_cast_chain" + sql "DROP TABLE IF EXISTS ${castChainTable}" + sql """CREATE TABLE ${castChainTable} ( + `id` bigint NULL, + `data` variant<'num_*': BIGINT, 'str_*': STRING> NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${castChainTable} values(1, '{\"num_a\": 10, \"num_b\": 20, \"str_name\": \"alice\"}')""" + sql """insert into ${castChainTable} values(2, '{\"num_a\": 30, \"num_b\": 40, \"str_name\": \"bob\"}')""" + sql """insert into ${castChainTable} values(3, '{\"num_a\": 50, \"num_b\": 60, \"str_name\": \"charlie\"}')""" + sql """insert into ${castChainTable} values(4, '{\"num_a\": 15, \"num_b\": 25, \"str_name\": \"alice\"}')""" + + qt_explicit_cast_chain_select_2 """ SELECT CAST(CAST(data['num_a'] AS BIGINT) AS BIGINT) + FROM ${castChainTable} ORDER BY id """ + qt_explicit_cast_chain_where_3 """ SELECT id FROM ${castChainTable} + WHERE CAST(CAST(CAST(data['num_a'] AS BIGINT) AS BIGINT) AS BIGINT) > 20 ORDER BY id """ + qt_explicit_cast_chain_order_by_4 """ SELECT id FROM ${castChainTable} + ORDER BY CAST(CAST(CAST(CAST(data['num_b'] AS BIGINT) AS BIGINT) AS BIGINT) AS BIGINT) DESC, id """ + qt_explicit_cast_chain_group_having_4 """ SELECT + CAST(CAST(CAST(CAST(data['num_a'] AS BIGINT) AS BIGINT) AS BIGINT) AS BIGINT) AS v, COUNT(*) + FROM ${castChainTable} + GROUP BY CAST(CAST(CAST(CAST(data['num_a'] AS BIGINT) AS BIGINT) AS BIGINT) AS BIGINT) + HAVING CAST(CAST(CAST(CAST(data['num_a'] AS BIGINT) AS BIGINT) AS BIGINT) AS BIGINT) >= 15 + ORDER BY v """ + + sql """ set enable_match_without_inverted_index = true """ + qt_explicit_cast_chain_match_2 """ SELECT id FROM ${castChainTable} + WHERE CAST(CAST(data['str_name'] AS STRING) AS VARCHAR) MATCH 'alice' ORDER BY id """ + qt_explicit_cast_chain_match_4 """ SELECT id FROM ${castChainTable} + WHERE CAST(CAST(CAST(CAST(data['str_name'] AS STRING) AS VARCHAR) AS STRING) AS VARCHAR) MATCH 'alice' ORDER BY id """ + sql """ set enable_match_without_inverted_index = false """ + + sql "DROP TABLE IF EXISTS ${castChainTable}" + +} \ No newline at end of file