diff --git a/be/src/vec/common/variant_util.cpp b/be/src/vec/common/variant_util.cpp index 39e720630678ae..069a64798d062a 100644 --- a/be/src/vec/common/variant_util.cpp +++ b/be/src/vec/common/variant_util.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include @@ -38,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +63,7 @@ #include "olap/tablet.h" #include "olap/tablet_fwd.h" #include "olap/tablet_schema.h" +#include "re2/re2.h" #include "runtime/client_cache.h" #include "runtime/define_primitive_type.h" #include "runtime/exec_env.h" @@ -102,6 +103,162 @@ namespace doris::vectorized::variant_util { #include "common/compile_check_begin.h" +inline void append_escaped_regex_char(std::string* regex_output, char ch) { + switch (ch) { + case '.': + case '^': + case '$': + case '+': + case '*': + case '?': + case '(': + case ')': + case '|': + case '{': + case '}': + case '[': + case ']': + case '\\': + regex_output->push_back('\\'); + regex_output->push_back(ch); + break; + default: + regex_output->push_back(ch); + break; + } +} + +// Small LRU to cap compiled glob patterns +constexpr size_t kGlobRegexCacheCapacity = 256; + +struct GlobRegexCacheEntry { + std::shared_ptr re2; + std::list::iterator lru_it; +}; + +std::mutex g_glob_regex_cache_mutex; +std::list g_glob_regex_cache_lru; +std::unordered_map g_glob_regex_cache; + +std::shared_ptr get_or_build_re2(const std::string& glob_pattern) { + { + std::lock_guard lock(g_glob_regex_cache_mutex); + auto it = g_glob_regex_cache.find(glob_pattern); + if (it != g_glob_regex_cache.end()) { + g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), g_glob_regex_cache_lru, + it->second.lru_it); + return it->second.re2; + } + } + std::string regex_pattern; + Status st = glob_to_regex(glob_pattern, ®ex_pattern); + if (!st.ok()) { + return nullptr; + } + auto compiled = std::make_shared(regex_pattern); + if (!compiled->ok()) { + return nullptr; + } + { + 
std::lock_guard lock(g_glob_regex_cache_mutex); + auto it = g_glob_regex_cache.find(glob_pattern); + if (it != g_glob_regex_cache.end()) { + g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), g_glob_regex_cache_lru, + it->second.lru_it); + return it->second.re2; + } + g_glob_regex_cache_lru.push_front(glob_pattern); + g_glob_regex_cache.emplace(glob_pattern, + GlobRegexCacheEntry {compiled, g_glob_regex_cache_lru.begin()}); + if (g_glob_regex_cache.size() > kGlobRegexCacheCapacity) { + const std::string& evict_key = g_glob_regex_cache_lru.back(); + g_glob_regex_cache.erase(evict_key); + g_glob_regex_cache_lru.pop_back(); + } + } + return compiled; +} + +// Convert a restricted glob pattern into a regex. +// Supported: '*', '?', '[...]', '\\' escape. Others are treated as literals. +Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern) { + regex_pattern->clear(); + regex_pattern->append("^"); + bool is_escaped = false; + size_t pattern_length = glob_pattern.size(); + for (size_t index = 0; index < pattern_length; ++index) { + char current_char = glob_pattern[index]; + if (is_escaped) { + append_escaped_regex_char(regex_pattern, current_char); + is_escaped = false; + continue; + } + if (current_char == '\\') { + is_escaped = true; + continue; + } + if (current_char == '*') { + regex_pattern->append(".*"); + continue; + } + if (current_char == '?') { + regex_pattern->append("."); + continue; + } + if (current_char == '[') { + size_t class_index = index + 1; + bool class_closed = false; + bool is_class_escaped = false; + std::string class_buffer; + if (class_index < pattern_length && + (glob_pattern[class_index] == '!' 
|| glob_pattern[class_index] == '^')) { + class_buffer.push_back('^'); + ++class_index; + } + for (; class_index < pattern_length; ++class_index) { + char class_char = glob_pattern[class_index]; + if (is_class_escaped) { + class_buffer.push_back(class_char); + is_class_escaped = false; + continue; + } + if (class_char == '\\') { + is_class_escaped = true; + continue; + } + if (class_char == ']') { + class_closed = true; + break; + } + class_buffer.push_back(class_char); + } + if (!class_closed) { + return Status::InvalidArgument("Unclosed character class in glob pattern: {}", + glob_pattern); + } + regex_pattern->append("["); + regex_pattern->append(class_buffer); + regex_pattern->append("]"); + index = class_index; + continue; + } + append_escaped_regex_char(regex_pattern, current_char); + } + if (is_escaped) { + append_escaped_regex_char(regex_pattern, '\\'); + } + regex_pattern->append("$"); + return Status::OK(); +} + +bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path) { + auto compiled = get_or_build_re2(glob_pattern); + if (compiled == nullptr) { + return false; + } + return RE2::FullMatch(candidate_path, *compiled); +} + size_t get_number_of_dimensions(const IDataType& type) { if (const auto* type_array = typeid_cast(&type)) { return type_array->get_number_of_dimensions(); @@ -1307,8 +1464,7 @@ bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id, break; } case PatternTypePB::MATCH_NAME_GLOB: { - int result = fnmatch(pattern, path.c_str(), FNM_PATHNAME); - if (result == 0) { + if (glob_match_re2(pattern, path)) { generate_result_column(*sub_column, &sub_column_info->column); generate_index(sub_column->name()); return true; @@ -1788,8 +1944,6 @@ std::unordered_map materialize_docs_ return subcolumns; } -namespace { - Status _parse_and_materialize_variant_columns(Block& block, const std::vector& variant_pos, const std::vector& configs) { @@ -1864,8 +2018,6 @@ Status 
_parse_and_materialize_variant_columns(Block& block, return Status::OK(); } -} // namespace - Status parse_and_materialize_variant_columns(Block& block, const std::vector& variant_pos, const std::vector& configs) { RETURN_IF_CATCH_EXCEPTION( diff --git a/be/src/vec/common/variant_util.h b/be/src/vec/common/variant_util.h index 37dc452a3a2f62..a36179ac0fbf50 100644 --- a/be/src/vec/common/variant_util.h +++ b/be/src/vec/common/variant_util.h @@ -64,6 +64,13 @@ using JsonParser = JSONDataParser; const std::string SPARSE_COLUMN_PATH = "__DORIS_VARIANT_SPARSE__"; const std::string DOC_VALUE_COLUMN_PATH = "__DORIS_VARIANT_DOC_VALUE__"; namespace doris::vectorized::variant_util { + +// Convert a restricted glob pattern into a regex (for tests/internal use). +Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern); + +// Match a glob pattern against a path using RE2. +bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path); + using PathToNoneNullValues = std::unordered_map; using PathToDataTypes = std::unordered_map, PathInData::Hash>; diff --git a/be/test/olap/rowset/segment_v2/variant_util_test.cpp b/be/test/olap/rowset/segment_v2/variant_util_test.cpp index 78eacd6b3ac91c..bb87ee0ebd7d78 100644 --- a/be/test/olap/rowset/segment_v2/variant_util_test.cpp +++ b/be/test/olap/rowset/segment_v2/variant_util_test.cpp @@ -209,4 +209,136 @@ TEST(VariantUtilTest, ParseVariantColumns_DocModeRejectOnlySubcolumnsConfig) { EXPECT_TRUE(st.ok()) << st.to_string(); } -} // namespace doris::vectorized::variant_util \ No newline at end of file +TEST(VariantUtilTest, GlobToRegex) { + struct Case { + std::string glob; + std::string expected_regex; + }; + const std::vector cases = { + {"*", "^.*$"}, + {"?", "^.$"}, + {"a?b", "^a.b$"}, + {"a*b", "^a.*b$"}, + {"a**b", "^a.*.*b$"}, + {"a??b", "^a..b$"}, + {"?*", "^..*$"}, + {"*?", "^.*.$"}, + {"a.b", "^a\\.b$"}, + {"a+b", "^a\\+b$"}, + {"a{b}", "^a\\{b\\}$"}, + {R"(a\*b)", R"(^a\*b$)"}, 
+ {"a\\?b", "^a\\?b$"}, + {"a\\[b", "^a\\[b$"}, + {"abc\\", "^abc\\\\$"}, + {"a|b", "^a\\|b$"}, + {"a(b)c", "^a\\(b\\)c$"}, + {"a^b", "^a\\^b$"}, + {"a$b", "^a\\$b$"}, + {"int_[0-9]", "^int_[0-9]$"}, + {"int_[!0-9]", "^int_[^0-9]$"}, + {"int_[^0-9]", "^int_[^0-9]$"}, + {"a[\\-]b", "^a[-]b$"}, + {"a[b-d]e", "^a[b-d]e$"}, + {"a[\\]]b", "^a[]]b$"}, + {"a[\\!]b", "^a[!]b$"}, + {"", "^$"}, + {"a[[]b", "^a[[]b$"}, + {"a[]b", "^a[]b$"}, + {"[]", "^[]$"}, + {"[!]", "^[^]$"}, + {"[^]", "^[^]$"}, + {"\\", "^\\\\$"}, + {"\\*", "^\\*$"}, + {"a\\*b", "^a\\*b$"}, + {"a[!\\]]b", "^a[^]]b$"}, + }; + + for (const auto& test_case : cases) { + std::string regex; + Status st = glob_to_regex(test_case.glob, ®ex); + EXPECT_TRUE(st.ok()) << st.to_string() << " pattern=" << test_case.glob; + EXPECT_EQ(regex, test_case.expected_regex) << "pattern=" << test_case.glob; + } + + std::string regex; + Status st = glob_to_regex("int_[0-9", ®ex); + EXPECT_FALSE(st.ok()); + + st = glob_to_regex("a[\\]b", ®ex); + EXPECT_FALSE(st.ok()); +} + +TEST(VariantUtilTest, GlobMatchRe2) { + struct Case { + std::string glob; + std::string candidate; + bool expected; + }; + const std::vector cases = { + {"*", "", true}, + {"*", "abc", true}, + {"?", "a", true}, + {"?", "", false}, + {"a?b", "acb", true}, + {"a?b", "ab", false}, + {"a*b", "ab", true}, + {"a*b", "axxxb", true}, + {"a**b", "ab", true}, + {"a**b", "axxxb", true}, + {"?*", "", false}, + {"?*", "a", true}, + {"*?", "", false}, + {"*?", "a", true}, + {"a*b", "a/b", true}, + {"a.b", "a.b", true}, + {"a.b", "acb", false}, + {"a+b", "a+b", true}, + {"a{b}", "a{b}", true}, + {"a|b", "a|b", true}, + {"a|b", "ab", false}, + {"a(b)c", "a(b)c", true}, + {"a(b)c", "abc", false}, + {"a^b", "a^b", true}, + {"a^b", "ab", false}, + {"a$b", "a$b", true}, + {"a$b", "ab", false}, + {"a[b-d]e", "ace", true}, + {"a[b-d]e", "aee", false}, + {"a[\\]]b", "a]b", true}, + {"a[\\]]b", "a[b", false}, + {"a[\\!]b", "a!b", true}, + {"a[\\!]b", "a]b", false}, + {"[]", "a", 
false}, + {"[!]", "]", false}, + {"\\", "\\", true}, + {"\\*", "\\abc", false}, + {"a[!\\]]b", "aXb", true}, + {"a[!\\]]b", "a]b", false}, + {"a[]b", "aXb", false}, + {"a[[]b", "a[b", true}, + {R"(a\*b)", "a*b", true}, + {R"(a\?b)", "a?b", true}, + {R"(a\[b)", "a[b", true}, + {R"(abc\)", R"(abc\)", true}, + {"int_[0-9]", "int_1", true}, + {"int_[0-9]", "int_a", false}, + {"int_[!0-9]", "int_a", true}, + {"int_[!0-9]", "int_1", false}, + {"int_[^0-9]", "int_b", true}, + {"int_[^0-9]", "int_2", false}, + {R"(a[\-]b)", "a-b", true}, + {"", "", true}, + {"", "a", false}, + }; + + for (const auto& test_case : cases) { + bool matched = glob_match_re2(test_case.glob, test_case.candidate); + EXPECT_EQ(matched, test_case.expected) + << "pattern=" << test_case.glob << " candidate=" << test_case.candidate; + } + + EXPECT_FALSE(glob_match_re2("int_[0-9", "int_1")); + EXPECT_FALSE(glob_match_re2("a[\\]b", "a]b")); +} + +} // namespace doris::vectorized::variant_util diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java index 3eb6fd0e3c01da..07c550a639721d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -41,6 +41,7 @@ import org.apache.doris.common.ErrorCode; import org.apache.doris.common.ErrorReport; import org.apache.doris.common.FeConstants; +import org.apache.doris.common.GlobRegexUtil; import org.apache.doris.common.Pair; import org.apache.doris.common.UserException; import org.apache.doris.common.io.DeepCopy; @@ -3750,12 +3751,11 @@ public Index getInvertedIndex(Column column, List subPath, String analyz String childName = child.getName(); if (child.getFieldPatternType() == TPatternType.MATCH_NAME_GLOB) { try { - java.nio.file.PathMatcher matcher = java.nio.file.FileSystems.getDefault() - .getPathMatcher("glob:" + childName); - if 
(matcher.matches(java.nio.file.Paths.get(subPathString))) { + com.google.re2j.Pattern compiled = GlobRegexUtil.getOrCompilePattern(childName); + if (compiled.matcher(subPathString).matches()) { fieldPattern = childName; } - } catch (Exception e) { + } catch (com.google.re2j.PatternSyntaxException | IllegalArgumentException e) { continue; } } else if (child.getFieldPatternType() == TPatternType.MATCH_NAME) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/GlobRegexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/common/GlobRegexUtil.java new file mode 100644 index 00000000000000..ff0687d5cc10f1 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/common/GlobRegexUtil.java @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.common; + +import com.google.re2j.Pattern; + +import java.util.LinkedHashMap; +import java.util.Map; + +/** + * Utility to convert a restricted glob pattern into a regex. + * + * Supported glob syntax: + * - '*' matches any sequence of characters + * - '?' 
matches any single character + * - '[...]' matches any character in the brackets + * - '[!...]' matches any character not in the brackets + * - '\\' escapes the next character + */ +public final class GlobRegexUtil { + // Small LRU to cap compiled pattern memory + private static final int REGEX_CACHE_CAPACITY = 256; + private static final Map REGEX_CACHE = new LinkedHashMap( + REGEX_CACHE_CAPACITY, 0.75f, true) { + @Override + protected boolean removeEldestEntry(Map.Entry eldest) { + return size() > REGEX_CACHE_CAPACITY; + } + }; + + private GlobRegexUtil() { + } + + public static Pattern getOrCompilePattern(String globPattern) { + synchronized (REGEX_CACHE) { + Pattern cached = REGEX_CACHE.get(globPattern); + if (cached != null) { + return cached; + } + String regex = globToRegex(globPattern); + Pattern compiled = Pattern.compile(regex); + REGEX_CACHE.put(globPattern, compiled); + return compiled; + } + } + + public static String globToRegex(String pattern) { + StringBuilder regexBuilder = new StringBuilder(); + regexBuilder.append("^"); + boolean isEscaped = false; + int patternLength = pattern.length(); + for (int index = 0; index < patternLength; index++) { + char currentChar = pattern.charAt(index); + if (isEscaped) { + appendEscapedRegexChar(regexBuilder, currentChar); + isEscaped = false; + continue; + } + if (currentChar == '\\') { + isEscaped = true; + continue; + } + if (currentChar == '*') { + regexBuilder.append(".*"); + continue; + } + if (currentChar == '?') { + regexBuilder.append('.'); + continue; + } + if (currentChar == '[') { + int classIndex = index + 1; + boolean classClosed = false; + boolean isClassEscaped = false; + StringBuilder classBuffer = new StringBuilder(); + if (classIndex < patternLength + && (pattern.charAt(classIndex) == '!' 
|| pattern.charAt(classIndex) == '^')) { + classBuffer.append('^'); + classIndex++; + } + for (; classIndex < patternLength; classIndex++) { + char classChar = pattern.charAt(classIndex); + if (isClassEscaped) { + classBuffer.append(classChar); + isClassEscaped = false; + continue; + } + if (classChar == '\\') { + isClassEscaped = true; + continue; + } + if (classChar == ']') { + classClosed = true; + break; + } + classBuffer.append(classChar); + } + if (!classClosed) { + throw new IllegalArgumentException("Unclosed character class in glob pattern: " + pattern); + } + regexBuilder.append('[').append(classBuffer).append(']'); + index = classIndex; + continue; + } + appendEscapedRegexChar(regexBuilder, currentChar); + } + if (isEscaped) { + appendEscapedRegexChar(regexBuilder, '\\'); + } + regexBuilder.append("$"); + return regexBuilder.toString(); + } + + private static void appendEscapedRegexChar(StringBuilder regexBuilder, char ch) { + switch (ch) { + case '.': + case '^': + case '$': + case '+': + case '*': + case '?': + case '(': + case ')': + case '|': + case '{': + case '}': + case '[': + case ']': + case '\\': + regexBuilder.append('\\').append(ch); + break; + default: + regexBuilder.append(ch); + break; + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CheckAfterRewrite.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CheckAfterRewrite.java index 915ac92a9b5aa3..9b641b81755be0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CheckAfterRewrite.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CheckAfterRewrite.java @@ -23,6 +23,7 @@ import org.apache.doris.nereids.rules.Rule; import org.apache.doris.nereids.rules.RuleType; import org.apache.doris.nereids.trees.expressions.Alias; +import org.apache.doris.nereids.trees.expressions.Cast; import org.apache.doris.nereids.trees.expressions.Expression; import 
org.apache.doris.nereids.trees.expressions.Match; import org.apache.doris.nereids.trees.expressions.Slot; @@ -193,7 +194,7 @@ private void checkMetricTypeIsUsedCorrectly(Plan plan) { } else if (plan instanceof LogicalJoin) { LogicalJoin join = (LogicalJoin) plan; for (Expression conjunct : join.getHashJoinConjuncts()) { - if (conjunct.anyMatch(e -> ((Expression) e).getDataType().isVariantType())) { + if (containsVariantTypeOutsideCast(conjunct)) { throw new AnalysisException("variant type could not in join equal conditions: " + conjunct.toSql()); } else if (conjunct.anyMatch(e -> ((Expression) e).getDataType().isVarBinaryType())) { throw new AnalysisException( @@ -201,7 +202,7 @@ private void checkMetricTypeIsUsedCorrectly(Plan plan) { } } for (Expression conjunct : join.getMarkJoinConjuncts()) { - if (conjunct.anyMatch(e -> ((Expression) e).getDataType().isVariantType())) { + if (containsVariantTypeOutsideCast(conjunct)) { throw new AnalysisException("variant type could not in join equal conditions: " + conjunct.toSql()); } else if (conjunct.anyMatch(e -> ((Expression) e).getDataType().isVarBinaryType())) { throw new AnalysisException( @@ -211,6 +212,23 @@ private void checkMetricTypeIsUsedCorrectly(Plan plan) { } } + private boolean containsVariantTypeOutsideCast(Expression expr) { + return containsVariantTypeOutsideCast(expr, false); + } + + private boolean containsVariantTypeOutsideCast(Expression expr, boolean underCast) { + boolean nextUnderCast = underCast || expr instanceof Cast; + if (!nextUnderCast && expr.getDataType().isVariantType()) { + return true; + } + for (Expression child : expr.children()) { + if (containsVariantTypeOutsideCast(child, nextUnderCast)) { + return true; + } + } + return false; + } + private void checkMatchIsUsedCorrectly(Plan plan) { for (Expression expression : plan.getExpressions()) { if (expression instanceof Match) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/ExpressionAnalyzer.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/ExpressionAnalyzer.java index a4fb7f3ae76593..134610b5c345b1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/ExpressionAnalyzer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/ExpressionAnalyzer.java @@ -84,6 +84,7 @@ import org.apache.doris.nereids.trees.expressions.literal.IntegerLikeLiteral; import org.apache.doris.nereids.trees.expressions.literal.Literal; import org.apache.doris.nereids.trees.expressions.literal.NullLiteral; +import org.apache.doris.nereids.trees.expressions.literal.StringLikeLiteral; import org.apache.doris.nereids.trees.expressions.literal.StringLiteral; import org.apache.doris.nereids.trees.expressions.typecoercion.ImplicitCastInputTypes; import org.apache.doris.nereids.trees.plans.PlaceholderId; @@ -99,6 +100,8 @@ import org.apache.doris.nereids.types.StructField; import org.apache.doris.nereids.types.StructType; import org.apache.doris.nereids.types.TinyIntType; +import org.apache.doris.nereids.types.VariantField; +import org.apache.doris.nereids.types.VariantType; import org.apache.doris.nereids.util.ExpressionUtils; import org.apache.doris.nereids.util.TypeCoercionUtils; import org.apache.doris.nereids.util.Utils; @@ -155,6 +158,7 @@ protected Expression processCompoundNewChildren(CompoundPredicate cp, List outerScope = getScope().getOuterScope(); @@ -316,6 +343,9 @@ public Expression visitUnboundSlot(UnboundSlot unboundSlot, ExpressionRewriteCon } else if (firstBound.containsType(ElementAt.class, StructElement.class)) { context.cascadesContext.getStatementContext().setHasNestedColumns(true); } + if (firstBound instanceof Alias) { + return maybeCastAliasExpression((Alias) firstBound, context); + } return firstBound; default: if (enableExactMatch) { @@ -700,6 +730,91 @@ protected Expression processCompoundNewChildren(CompoundPredicate cp, List 0) { + return elementAt; + } + Optional path = 
resolveVariantElementAtPath(elementAt); + if (!path.isPresent()) { + return expr; + } + VariantType variantType = (VariantType) path.get().root.getDataType(); + Optional matchingField = variantType.findMatchingField(path.get().path); + if (!matchingField.isPresent()) { + return expr; + } + DataType targetType = matchingField.get().getDataType(); + return new Cast(elementAt, targetType); + } + + private Optional resolveVariantElementAtPath(ElementAt elementAt) { + List segments = new ArrayList<>(); + Expression current = elementAt; + Expression root = null; + while (current instanceof ElementAt) { + ElementAt currentElementAt = (ElementAt) current; + Optional key = getVariantPathKey(currentElementAt.right()); + if (!key.isPresent()) { + return Optional.empty(); + } + segments.add(0, key.get()); + Expression left = currentElementAt.left(); + if (left instanceof Cast && !((Cast) left).isExplicitType()) { + left = ((Cast) left).child(); + } + current = left; + root = left; + } + if (root == null || !(root.getDataType() instanceof VariantType)) { + return Optional.empty(); + } + if (segments.isEmpty()) { + return Optional.empty(); + } + return Optional.of(new VariantElementAtPath(root, String.join(".", segments))); + } + + private Optional getVariantPathKey(Expression expr) { + if (expr instanceof StringLikeLiteral) { + return Optional.of(((StringLikeLiteral) expr).getStringValue()); + } + return Optional.empty(); + } + + private static final class VariantElementAtPath { + private final Expression root; + private final String path; + + private VariantElementAtPath(Expression root, String path) { + this.root = root; + this.path = path; + } + } + + private Expression maybeCastAliasExpression(Alias alias, ExpressionRewriteContext context) { + if (suppressVariantElementAtCastDepth > 0 || !isEnableVariantSchemaAutoCast(context)) { + return alias; + } + Expression child = alias.child(); + Expression casted = wrapVariantElementAtWithCast(child); + if (casted == child) { + 
return alias; + } + return alias.withChildren(ImmutableList.of(casted)); + } + @Override public Expression visitNot(Not not, ExpressionRewriteContext context) { // maybe is `not subquery`, we should bind it first diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/ElementAt.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/ElementAt.java index 1716ae1f91e714..dd715c3c54d065 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/ElementAt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/ElementAt.java @@ -104,10 +104,10 @@ public FunctionSignature computeSignature(FunctionSignature signature) { DataType expressionType = arguments.get(0).getDataType(); DataType sigType = signature.argumentsTypes.get(0); if (expressionType instanceof VariantType && sigType instanceof VariantType) { - // only keep the variant max subcolumns count - VariantType variantType = new VariantType(((VariantType) expressionType).getVariantMaxSubcolumnsCount()); - signature = signature.withArgumentType(0, variantType); - signature = signature.withReturnType(variantType); + // Preserve predefinedFields for schema template matching + VariantType originalType = (VariantType) expressionType; + signature = signature.withArgumentType(0, originalType); + signature = signature.withReturnType(originalType); } return super.computeSignature(signature); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantField.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantField.java index 5faed6893be958..a8e3bd9ded136b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantField.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantField.java @@ -17,9 +17,13 @@ package org.apache.doris.nereids.types; +import org.apache.doris.common.GlobRegexUtil; import 
org.apache.doris.nereids.util.Utils; import org.apache.doris.thrift.TPatternType; +import com.google.re2j.Pattern; +import com.google.re2j.PatternSyntaxException; + import java.util.Objects; /** @@ -67,6 +71,35 @@ public String getComment() { return comment; } + /** + * Check if the given field name matches this field's pattern. + * This method uses a restricted glob syntax converted to regex. + * + * Supported glob syntax: + * - '*' matches any sequence of characters + * - '?' matches any single character + * - '[...]' matches any character in the brackets + * - '[!...]' matches any character not in the brackets + * - '\\' escapes the next character + * + * @param fieldName the field name to check + * @return true if the field name matches the pattern + */ + public boolean matches(String fieldName) { + if (patternType == TPatternType.MATCH_NAME) { + return pattern.equals(fieldName); + } + if (patternType != TPatternType.MATCH_NAME_GLOB) { + return false; + } + try { + Pattern compiled = GlobRegexUtil.getOrCompilePattern(pattern); + return compiled.matcher(fieldName).matches(); + } catch (PatternSyntaxException | IllegalArgumentException e) { + return false; + } + } + public org.apache.doris.catalog.VariantField toCatalogDataType() { return new org.apache.doris.catalog.VariantField( pattern, dataType.toCatalogDataType(), comment, patternType); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java index 337658520e4123..af25e1f9061f2f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java @@ -26,6 +26,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Objects; +import java.util.Optional; import java.util.stream.Collectors; /** @@ -232,6 +233,22 @@ public List getPredefinedFields() { return predefinedFields; } + /** + * Find the 
first matching VariantField for the given field name. + * The matching is done in definition order, so the first matching pattern wins. + * + * @param fieldName the field name to match + * @return Optional containing the matching VariantField, or empty if no match + */ + public Optional findMatchingField(String fieldName) { + for (VariantField field : predefinedFields) { + if (field.matches(fieldName)) { + return Optional.of(field); + } + } + return Optional.empty(); + } + public int getVariantMaxSubcolumnsCount() { return variantMaxSubcolumnsCount; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index e30207880b18b7..6f7de872afd0de 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -784,6 +784,7 @@ public class SessionVariable implements Serializable, Writable { // enable variant flatten nested as session variable, default is false, // which means do not flatten nested when create table public static final String ENABLE_VARIANT_FLATTEN_NESTED = "enable_variant_flatten_nested"; + public static final String ENABLE_VARIANT_SCHEMA_AUTO_CAST = "enable_variant_schema_auto_cast"; // CLOUD_VARIABLES_BEGIN public static final String CLOUD_CLUSTER = "cloud_cluster"; @@ -3286,6 +3287,18 @@ public boolean isEnableESParallelScroll() { ) public int defaultVariantMaxSubcolumnsCount = 2048; + @VariableMgr.VarAttr( + name = ENABLE_VARIANT_SCHEMA_AUTO_CAST, + needForward = true, + affectQueryResultInExecution = true, + description = { + "是否启用基于 schema template 的 variant 自动 cast,默认关闭。", + "Whether to enable schema-template-based auto cast for variant expressions. " + + "The default is false." 
+ } + ) + public boolean enableVariantSchemaAutoCast = false; + @VariableMgr.VarAttr( name = DEFAULT_VARIANT_ENABLE_TYPED_PATHS_TO_SPARSE, needForward = true, @@ -5904,6 +5917,10 @@ public boolean getEnableVariantFlattenNested() { return enableVariantFlattenNested; } + public boolean isEnableVariantSchemaAutoCast() { + return enableVariantSchemaAutoCast; + } + public void setProfileLevel(String profileLevel) { int profileLevelTmp = Integer.valueOf(profileLevel); if (profileLevelTmp < 1 || profileLevelTmp > 3) { diff --git a/fe/fe-core/src/test/java/org/apache/doris/common/GlobRegexUtilTest.java b/fe/fe-core/src/test/java/org/apache/doris/common/GlobRegexUtilTest.java new file mode 100644 index 00000000000000..48f86c4e70c21a --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/common/GlobRegexUtilTest.java @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.common; + +import com.google.re2j.Pattern; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + + +public class GlobRegexUtilTest { + + private void assertGlobToRegex(String globPattern, String expectedRegex) { + String regex = GlobRegexUtil.globToRegex(globPattern); + Assertions.assertEquals(expectedRegex, regex, "pattern: " + globPattern); + } + + @Test + public void testGlobToRegexBasicTokens() { + assertGlobToRegex("*", "^.*$"); + assertGlobToRegex("?", "^.$"); + assertGlobToRegex("a?b", "^a.b$"); + assertGlobToRegex("a*b", "^a.*b$"); + } + + @Test + public void testGlobToRegexRepeatedWildcards() { + assertGlobToRegex("a**b", "^a.*.*b$"); + assertGlobToRegex("a??b", "^a..b$"); + assertGlobToRegex("?*", "^..*$"); + assertGlobToRegex("*?", "^.*.$"); + } + + + @Test + public void testGlobToRegexEscaping() { + assertGlobToRegex("a.b", "^a\\.b$"); + assertGlobToRegex("a+b", "^a\\+b$"); + assertGlobToRegex("a{b}", "^a\\{b\\}$"); + assertGlobToRegex("a\\*b", "^a\\*b$"); + assertGlobToRegex("a\\?b", "^a\\?b$"); + assertGlobToRegex("a\\[b", "^a\\[b$"); + assertGlobToRegex("abc\\", "^abc\\\\$"); + assertGlobToRegex("a|b", "^a\\|b$"); + assertGlobToRegex("a(b)c", "^a\\(b\\)c$"); + assertGlobToRegex("a^b", "^a\\^b$"); + assertGlobToRegex("a$b", "^a\\$b$"); + } + + @Test + public void testGlobToRegexCharacterClasses() { + assertGlobToRegex("int_[0-9]", "^int_[0-9]$"); + assertGlobToRegex("int_[!0-9]", "^int_[^0-9]$"); + assertGlobToRegex("int_[^0-9]", "^int_[^0-9]$"); + assertGlobToRegex("a[\\-]b", "^a[-]b$"); + assertGlobToRegex("a[b-d]e", "^a[b-d]e$"); + assertGlobToRegex("a[\\]]b", "^a[]]b$"); + assertGlobToRegex("a[\\!]b", "^a[!]b$"); + } + + @Test + public void testGlobToRegexEmptyPattern() { + assertGlobToRegex("", "^$"); + } + + + @Test + public void testGlobToRegexWeirdClasses() { + assertGlobToRegex("a[[]b", "^a[[]b$"); + assertGlobToRegex("a[]b", "^a[]b$"); + 
Assertions.assertThrows(IllegalArgumentException.class, + () -> GlobRegexUtil.globToRegex("a[\\]b")); + } + + @Test + public void testGlobToRegexUnclosedClass() { + Assertions.assertThrows(IllegalArgumentException.class, + () -> GlobRegexUtil.globToRegex("int_[0-9")); + } + + + @Test + public void testGlobToRegexMoreWeirdCases() { + assertGlobToRegex("[]", "^[]$"); + assertGlobToRegex("[!]", "^[^]$"); + assertGlobToRegex("[^]", "^[^]$"); + assertGlobToRegex("\\", "^\\\\$"); + assertGlobToRegex("\\*", "^\\*$"); + assertGlobToRegex("a\\*b", "^a\\*b$"); + assertGlobToRegex("a[!\\]]b", "^a[^]]b$"); + } + + @Test + public void testGetOrCompilePatternCache() { + Pattern first = GlobRegexUtil.getOrCompilePattern("num_*"); + Pattern second = GlobRegexUtil.getOrCompilePattern("num_*"); + Assertions.assertSame(first, second); + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/rules/analysis/ExpressionAnalyzerVariantAutoCastTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/rules/analysis/ExpressionAnalyzerVariantAutoCastTest.java new file mode 100644 index 00000000000000..4ab38d7606dcf8 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/rules/analysis/ExpressionAnalyzerVariantAutoCastTest.java @@ -0,0 +1,322 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.rules.analysis; + +import org.apache.doris.nereids.CascadesContext; +import org.apache.doris.nereids.analyzer.Scope; +import org.apache.doris.nereids.analyzer.UnboundSlot; +import org.apache.doris.nereids.trees.expressions.Alias; +import org.apache.doris.nereids.trees.expressions.Between; +import org.apache.doris.nereids.trees.expressions.Cast; +import org.apache.doris.nereids.trees.expressions.ExprId; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.GreaterThan; +import org.apache.doris.nereids.trees.expressions.InPredicate; +import org.apache.doris.nereids.trees.expressions.SlotReference; +import org.apache.doris.nereids.trees.expressions.functions.agg.Avg; +import org.apache.doris.nereids.trees.expressions.functions.agg.Count; +import org.apache.doris.nereids.trees.expressions.functions.agg.Max; +import org.apache.doris.nereids.trees.expressions.functions.agg.Min; +import org.apache.doris.nereids.trees.expressions.functions.agg.Sum; +import org.apache.doris.nereids.trees.expressions.functions.scalar.ElementAt; +import org.apache.doris.nereids.trees.expressions.literal.BigIntLiteral; +import org.apache.doris.nereids.trees.expressions.literal.StringLiteral; +import org.apache.doris.nereids.types.BigIntType; +import org.apache.doris.nereids.types.IntegerType; +import org.apache.doris.nereids.types.StringType; +import org.apache.doris.nereids.types.VariantField; +import org.apache.doris.nereids.types.VariantType; +import org.apache.doris.qe.ConnectContext; + +import com.google.common.collect.ImmutableList; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +public class ExpressionAnalyzerVariantAutoCastTest { + + @AfterEach + public void cleanup() { + ConnectContext.remove(); + } + + private 
CascadesContext createContext(boolean enableAutoCast) { + ConnectContext ctx = new ConnectContext(); + ctx.getSessionVariable().enableVariantSchemaAutoCast = enableAutoCast; + ctx.setThreadLocalInfo(); + return CascadesContext.initTempContext(); + } + + private Expression analyze(Expression expr, Scope scope, boolean enableAutoCast) { + CascadesContext cascadesContext = createContext(enableAutoCast); + ExpressionAnalyzer analyzer = new ExpressionAnalyzer(null, scope, cascadesContext, true, true); + return analyzer.analyze(expr); + } + + private SlotReference buildVariantSlot(VariantType variantType) { + return new SlotReference(new ExprId(1), + "data", variantType, true, ImmutableList.of()); + } + + private VariantType buildVariantType() { + VariantField numField = new VariantField("num_*", BigIntType.INSTANCE, ""); + VariantField strField = new VariantField("str_*", StringType.INSTANCE, ""); + return new VariantType(ImmutableList.of(numField, strField)); + } + + private void assertCastElementAt(Expression expr) { + Assertions.assertTrue(expr instanceof Cast, "expect Cast wrapping ElementAt"); + Cast cast = (Cast) expr; + Assertions.assertTrue(cast.child() instanceof ElementAt, "cast child should be ElementAt"); + } + + @Test + public void testSelectAutoCastElementAt() { + VariantType variantType = buildVariantType(); + SlotReference slot = buildVariantSlot(variantType); + Scope scope = new Scope(ImmutableList.of(slot)); + + ElementAt elementAt = new ElementAt(slot, new StringLiteral("num_a")); + Expression result = analyze(elementAt, scope, true); + assertCastElementAt(result); + } + + @Test + public void testSelectDotSyntaxAutoCast() { + VariantType variantType = buildVariantType(); + SlotReference slot = buildVariantSlot(variantType); + Scope scope = new Scope(ImmutableList.of(slot)); + + UnboundSlot unbound = new UnboundSlot("data", "num_a"); + Expression result = analyze(unbound, scope, true); +
Assertions.assertTrue(result instanceof Alias); + Alias alias = (Alias) result; + assertCastElementAt(alias.child()); + } + + @Test + public void testWhereAutoCastComparison() { + VariantType variantType = buildVariantType(); + SlotReference slot = buildVariantSlot(variantType); + Scope scope = new Scope(ImmutableList.of(slot)); + + ElementAt elementAt = new ElementAt(slot, new StringLiteral("num_a")); + GreaterThan predicate = new GreaterThan(elementAt, new BigIntLiteral(10)); + Expression result = analyze(predicate, scope, true); + + Assertions.assertTrue(result instanceof GreaterThan); + GreaterThan gt = (GreaterThan) result; + assertCastElementAt(gt.left()); + } + + @Test + public void testOrderByExpressionAutoCast() { + VariantType variantType = buildVariantType(); + SlotReference slot = buildVariantSlot(variantType); + Scope scope = new Scope(ImmutableList.of(slot)); + + ElementAt elementAt = new ElementAt(slot, new StringLiteral("num_a")); + Expression result = analyze(elementAt, scope, true); + assertCastElementAt(result); + } + + @Test + public void testGroupByExpressionAutoCast() { + VariantType variantType = buildVariantType(); + SlotReference slot = buildVariantSlot(variantType); + Scope scope = new Scope(ImmutableList.of(slot)); + + ElementAt elementAt = new ElementAt(slot, new StringLiteral("str_name")); + Expression result = analyze(elementAt, scope, true); + assertCastElementAt(result); + } + + @Test + public void testAggregateFunctionAutoCast() { + VariantType variantType = buildVariantType(); + SlotReference slot = buildVariantSlot(variantType); + Scope scope = new Scope(ImmutableList.of(slot)); + + ElementAt elementAt = new ElementAt(slot, new StringLiteral("num_a")); + Sum sum = new Sum(elementAt); + Expression result = analyze(sum, scope, true); + + Assertions.assertTrue(result instanceof Sum); + Sum analyzedSum = (Sum) result; + assertCastElementAt(analyzedSum.child()); + } + + @Test + public void testHavingAutoCastWithAggregate() { + 
VariantType variantType = buildVariantType(); + SlotReference slot = buildVariantSlot(variantType); + Scope scope = new Scope(ImmutableList.of(slot)); + + ElementAt elementAt = new ElementAt(slot, new StringLiteral("num_a")); + Sum sum = new Sum(elementAt); + GreaterThan having = new GreaterThan(sum, new BigIntLiteral(100)); + Expression result = analyze(having, scope, true); + + Assertions.assertTrue(result instanceof GreaterThan); + GreaterThan gt = (GreaterThan) result; + Assertions.assertTrue(gt.left() instanceof Sum); + Sum analyzedSum = (Sum) gt.left(); + assertCastElementAt(analyzedSum.child()); + } + + @Test + public void testNonLiteralKeyNoAutoCast() { + VariantType variantType = buildVariantType(); + SlotReference slot = buildVariantSlot(variantType); + SlotReference keySlot = new SlotReference(new ExprId(2), "col", StringType.INSTANCE, true, ImmutableList.of()); + Scope scope = new Scope(ImmutableList.of(slot, keySlot)); + + ElementAt elementAt = new ElementAt(slot, keySlot); + Expression result = analyze(elementAt, scope, true); + Assertions.assertTrue(result instanceof ElementAt); + Assertions.assertFalse(result instanceof Cast); + } + + @Test + public void testNoMatchingTemplateNoAutoCast() { + VariantType variantType = buildVariantType(); + SlotReference slot = buildVariantSlot(variantType); + Scope scope = new Scope(ImmutableList.of(slot)); + + ElementAt elementAt = new ElementAt(slot, new StringLiteral("unknown")); + Expression result = analyze(elementAt, scope, true); + Assertions.assertTrue(result instanceof ElementAt); + Assertions.assertFalse(result instanceof Cast); + } + + @Test + public void testChainedPathOnlyOuterCast() { + VariantField nestedField = new VariantField("int_nested.level1_num_1", BigIntType.INSTANCE, ""); + VariantType variantType = new VariantType(ImmutableList.of(nestedField)); + SlotReference slot = buildVariantSlot(variantType); + Scope scope = new Scope(ImmutableList.of(slot)); + + ElementAt inner = new ElementAt(slot, 
new StringLiteral("int_nested")); + ElementAt outer = new ElementAt(inner, new StringLiteral("level1_num_1")); + Expression result = analyze(outer, scope, true); + + Assertions.assertTrue(result instanceof Cast); + Cast cast = (Cast) result; + Assertions.assertTrue(cast.child() instanceof ElementAt); + ElementAt castChild = (ElementAt) cast.child(); + Assertions.assertTrue(castChild.left() instanceof ElementAt); + Assertions.assertFalse(castChild.left() instanceof Cast); + } + + @Test + public void testDotPathMergedAliasCast() { + VariantField nestedField = new VariantField("int_nested.level1_num_1", BigIntType.INSTANCE, ""); + VariantType variantType = new VariantType(ImmutableList.of(nestedField)); + SlotReference slot = buildVariantSlot(variantType); + Scope scope = new Scope(ImmutableList.of(slot)); + + UnboundSlot unbound = new UnboundSlot("data", "int_nested", "level1_num_1"); + Expression result = analyze(unbound, scope, true); + Assertions.assertTrue(result instanceof Alias); + Alias alias = (Alias) result; + assertCastElementAt(alias.child()); + } + + @Test + public void testExplicitCastStillAutoCastsInner() { + VariantType variantType = buildVariantType(); + SlotReference slot = buildVariantSlot(variantType); + Scope scope = new Scope(ImmutableList.of(slot)); + + ElementAt elementAt = new ElementAt(slot, new StringLiteral("num_a")); + Cast explicit = new Cast(elementAt, IntegerType.INSTANCE); + Expression result = analyze(explicit, scope, true); + + Assertions.assertTrue(result instanceof Cast); + Cast outer = (Cast) result; + Assertions.assertTrue(outer.child() instanceof Cast); + Cast inner = (Cast) outer.child(); + Assertions.assertTrue(inner.child() instanceof ElementAt); + } + + @Test + public void testWhereBetweenAndIn() { + VariantType variantType = buildVariantType(); + SlotReference slot = buildVariantSlot(variantType); + Scope scope = new Scope(ImmutableList.of(slot)); + + ElementAt elementAt = new ElementAt(slot, new StringLiteral("num_a")); + 
Between between = new Between(elementAt, new BigIntLiteral(10), new BigIntLiteral(20)); + Expression betweenResult = analyze(between, scope, true); + Assertions.assertTrue(betweenResult.containsType(Cast.class)); + Assertions.assertTrue(betweenResult.containsType(ElementAt.class)); + Assertions.assertTrue(betweenResult.collectFirst( + expr -> expr instanceof Cast && ((Cast) expr).child() instanceof ElementAt).isPresent()); + + ElementAt elementAtStr = new ElementAt(slot, new StringLiteral("str_name")); + InPredicate inPredicate = new InPredicate(elementAtStr, + ImmutableList.of(new StringLiteral("alice"), new StringLiteral("bob"))); + Expression inResult = analyze(inPredicate, scope, true); + Assertions.assertTrue(inResult.containsType(Cast.class)); + Assertions.assertTrue(inResult.containsType(ElementAt.class)); + Assertions.assertTrue(inResult.collectFirst( + expr -> expr instanceof Cast && ((Cast) expr).child() instanceof ElementAt).isPresent()); + } + + @Test + public void testAggregateMinMaxAvgCountDistinct() { + VariantType variantType = buildVariantType(); + SlotReference slot = buildVariantSlot(variantType); + Scope scope = new Scope(ImmutableList.of(slot)); + + ElementAt elementAt = new ElementAt(slot, new StringLiteral("num_a")); + Min min = new Min(elementAt); + Max max = new Max(elementAt); + Avg avg = new Avg(elementAt); + Count countDistinct = new Count(true, elementAt); + + Expression minResult = analyze(min, scope, true); + Assertions.assertTrue(minResult instanceof Min); + assertCastElementAt(((Min) minResult).child()); + + Expression maxResult = analyze(max, scope, true); + Assertions.assertTrue(maxResult instanceof Max); + assertCastElementAt(((Max) maxResult).child()); + + Expression avgResult = analyze(avg, scope, true); + Assertions.assertTrue(avgResult instanceof Avg); + assertCastElementAt(((Avg) avgResult).child()); + + Expression countResult = analyze(countDistinct, scope, true); + Assertions.assertTrue(countResult instanceof Count); + 
assertCastElementAt(((Count) countResult).child(0)); + } + + @Test + public void testAutoCastDisabled() { + VariantType variantType = buildVariantType(); + SlotReference slot = buildVariantSlot(variantType); + Scope scope = new Scope(ImmutableList.of(slot)); + + ElementAt elementAt = new ElementAt(slot, new StringLiteral("num_a")); + Expression result = analyze(elementAt, scope, false); + + Assertions.assertTrue(result instanceof ElementAt); + Assertions.assertFalse(result instanceof Cast); + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/types/VariantFieldMatchTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/types/VariantFieldMatchTest.java new file mode 100644 index 00000000000000..66289238e86414 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/types/VariantFieldMatchTest.java @@ -0,0 +1,365 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.types; + +import org.apache.doris.thrift.TPatternType; + +import com.google.common.collect.ImmutableList; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.Optional; + +/** + * Unit tests for VariantField pattern matching and VariantType field lookup. + */ +public class VariantFieldMatchTest { + + // ==================== VariantField.matches() tests ==================== + + @Test + public void testExactMatch() { + VariantField field = new VariantField("number_latency", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME.name()); + + Assertions.assertTrue(field.matches("number_latency")); + Assertions.assertFalse(field.matches("number_latency_ms")); + Assertions.assertFalse(field.matches("other_field")); + } + + @Test + public void testRegexMetaLiteralPatterns() { + VariantField pipe = new VariantField("a|b", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertTrue(pipe.matches("a|b")); + Assertions.assertFalse(pipe.matches("ab")); + + VariantField paren = new VariantField("a(b)c", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertTrue(paren.matches("a(b)c")); + Assertions.assertFalse(paren.matches("abc")); + + VariantField caret = new VariantField("a^b", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertTrue(caret.matches("a^b")); + Assertions.assertFalse(caret.matches("ab")); + + VariantField dollar = new VariantField("a$b", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertTrue(dollar.matches("a$b")); + Assertions.assertFalse(dollar.matches("ab")); + + VariantField range = new VariantField("a[b-d]e", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertTrue(range.matches("ace")); + Assertions.assertFalse(range.matches("aee")); + + VariantField escapedRight = new VariantField("a[\\]]b", BigIntType.INSTANCE, "", + 
TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertTrue(escapedRight.matches("a]b")); + Assertions.assertFalse(escapedRight.matches("a[b")); + + VariantField escapedBang = new VariantField("a[\\!]b", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertTrue(escapedBang.matches("a!b")); + Assertions.assertFalse(escapedBang.matches("a]b")); + } + + @Test + public void testExactMatchDoesNotTreatGlob() { + VariantField field = new VariantField("num_*", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME.name()); + + Assertions.assertTrue(field.matches("num_*")); + Assertions.assertFalse(field.matches("num_a")); + } + + @Test + public void testGlobMatchSuffix() { + // Pattern: number_* should match number_latency, number_count, etc. + VariantField field = new VariantField("number_*", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + + Assertions.assertTrue(field.matches("number_latency")); + Assertions.assertTrue(field.matches("number_count")); + Assertions.assertTrue(field.matches("number_")); + Assertions.assertFalse(field.matches("string_message")); + Assertions.assertFalse(field.matches("numbering")); + } + + @Test + public void testGlobMatchPrefix() { + // Pattern: *_latency should match number_latency, string_latency, etc. + VariantField field = new VariantField("*_latency", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + + Assertions.assertTrue(field.matches("number_latency")); + Assertions.assertTrue(field.matches("string_latency")); + Assertions.assertTrue(field.matches("_latency")); + Assertions.assertFalse(field.matches("latency_ms")); + } + + @Test + public void testGlobMatchMiddle() { + // Pattern: num_*_ms should match num_latency_ms, num_count_ms, etc. 
+ VariantField field = new VariantField("num_*_ms", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + + Assertions.assertTrue(field.matches("num_latency_ms")); + Assertions.assertTrue(field.matches("num_count_ms")); + Assertions.assertTrue(field.matches("num__ms")); + Assertions.assertFalse(field.matches("num_latency")); + Assertions.assertFalse(field.matches("number_latency_ms")); + } + + @Test + public void testGlobMatchAll() { + // Pattern: * should match everything + VariantField field = new VariantField("*", StringType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + + Assertions.assertTrue(field.matches("anything")); + Assertions.assertTrue(field.matches("")); + Assertions.assertTrue(field.matches("a.b.c")); + } + + @Test + public void testRepeatedWildcardPatterns() { + VariantField doubleStar = new VariantField("a**b", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertTrue(doubleStar.matches("ab")); + Assertions.assertTrue(doubleStar.matches("axxxb")); + + VariantField questionStar = new VariantField("?*", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertFalse(questionStar.matches("")); + Assertions.assertTrue(questionStar.matches("a")); + + VariantField starQuestion = new VariantField("*?", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertFalse(starQuestion.matches("")); + Assertions.assertTrue(starQuestion.matches("a")); + } + + @Test + public void testGlobMatchWithDot() { + // Pattern: metrics.* should match metrics.score, metrics.count, etc. 
+ VariantField field = new VariantField("metrics.*", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + + Assertions.assertTrue(field.matches("metrics.score")); + Assertions.assertTrue(field.matches("metrics.count")); + Assertions.assertFalse(field.matches("metricsXscore")); + Assertions.assertFalse(field.matches("metrics")); + } + + @Test + public void testGlobMatchDotLiteral() { + // '.' should be treated as literal in glob and escaped in regex + VariantField field = new VariantField("a.b", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + + Assertions.assertTrue(field.matches("a.b")); + Assertions.assertFalse(field.matches("acb")); + } + + @Test + public void testDefaultPatternTypeIsGlob() { + // Default constructor should use MATCH_NAME_GLOB + VariantField field = new VariantField("number_*", BigIntType.INSTANCE, ""); + + Assertions.assertTrue(field.matches("number_latency")); + } + + // ==================== VariantType.findMatchingField() tests ==================== + + @Test + public void testFindMatchingFieldSinglePattern() { + VariantField field = new VariantField("number_*", BigIntType.INSTANCE, ""); + VariantType variantType = new VariantType(ImmutableList.of(field)); + + Optional<VariantField> result = variantType.findMatchingField("number_latency"); + Assertions.assertTrue(result.isPresent()); + Assertions.assertEquals(BigIntType.INSTANCE, result.get().getDataType()); + } + + @Test + public void testFindMatchingFieldMultiplePatterns() { + VariantField numberField = new VariantField("number_*", BigIntType.INSTANCE, ""); + VariantField stringField = new VariantField("string_*", StringType.INSTANCE, ""); + VariantType variantType = new VariantType(ImmutableList.of(numberField, stringField)); + + // Test number pattern + Optional<VariantField> numberResult = variantType.findMatchingField("number_latency"); + Assertions.assertTrue(numberResult.isPresent()); + Assertions.assertEquals(BigIntType.INSTANCE, numberResult.get().getDataType()); + + // Test
string pattern + Optional<VariantField> stringResult = variantType.findMatchingField("string_message"); + Assertions.assertTrue(stringResult.isPresent()); + Assertions.assertEquals(StringType.INSTANCE, stringResult.get().getDataType()); + } + + @Test + public void testFindMatchingFieldNoMatch() { + VariantField field = new VariantField("number_*", BigIntType.INSTANCE, ""); + VariantType variantType = new VariantType(ImmutableList.of(field)); + + Optional<VariantField> result = variantType.findMatchingField("string_message"); + Assertions.assertFalse(result.isPresent()); + } + + @Test + public void testFindMatchingFieldFirstMatchWins() { + // When multiple patterns match, the first one should win + VariantField field1 = new VariantField("num*", BigIntType.INSTANCE, ""); + VariantField field2 = new VariantField("number_*", DoubleType.INSTANCE, ""); + VariantType variantType = new VariantType(ImmutableList.of(field1, field2)); + + Optional<VariantField> result = variantType.findMatchingField("number_latency"); + Assertions.assertTrue(result.isPresent()); + // First pattern "num*" should match, returning BigIntType + Assertions.assertEquals(BigIntType.INSTANCE, result.get().getDataType()); + } + + @Test + public void testFindMatchingFieldEmptyPredefinedFields() { + VariantType variantType = new VariantType(0); + + Optional<VariantField> result = variantType.findMatchingField("any_field"); + Assertions.assertFalse(result.isPresent()); + } + + // ==================== Escape sequence tests (aligning with fnmatch behavior) ==================== + + @Test + public void testGlobEscapeAsterisk() { + // Pattern: int_\* should match literal "int_*", not "int_" followed by anything + VariantField field = new VariantField("int_\\*", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + + Assertions.assertTrue(field.matches("int_*")); + Assertions.assertFalse(field.matches("int_nested")); + Assertions.assertFalse(field.matches("int_")); + } + + @Test + public void testGlobEscapeQuestionMark() { + // Pattern: int_\?
should match literal "int_?", not "int_" followed by any single char + VariantField field = new VariantField("int_\\?", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + + Assertions.assertTrue(field.matches("int_?")); + Assertions.assertFalse(field.matches("int_1")); + Assertions.assertFalse(field.matches("int_")); + } + + @Test + public void testGlobEscapeBracket() { + // Pattern: int_\[ should match literal "int_[" + VariantField field = new VariantField("int_\\[", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + + Assertions.assertTrue(field.matches("int_[")); + Assertions.assertFalse(field.matches("int_a")); + } + + @Test + public void testGlobEscapeBackslash() { + // Pattern: int_\\ should match literal "int_\" + VariantField field = new VariantField("int_\\\\", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + + Assertions.assertTrue(field.matches("int_\\")); + Assertions.assertFalse(field.matches("int_")); + } + + @Test + public void testGlobUnclosedBracket() { + // No closing bracket: invalid glob (GlobRegexUtil.globToRegex rejects it), expect no match + VariantField field = new VariantField("int_[0-9", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + + Assertions.assertFalse(field.matches("int_[0-9")); + Assertions.assertFalse(field.matches("int_1")); + } + + @Test + public void testWeirdGlobPatterns() { + VariantField emptyClass = new VariantField("a[]b", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertFalse(emptyClass.matches("aXb")); + + VariantField escapedBracket = new VariantField("a[[]b", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertTrue(escapedBracket.matches("a[b")); + } + + @Test + public void testMoreWeirdGlobPatterns() { + VariantField emptyClass = new VariantField("[]", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertFalse(emptyClass.matches("a")); +
Assertions.assertFalse(emptyClass.matches("")); + + VariantField negatedEmpty = new VariantField("[!]", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertFalse(negatedEmpty.matches("]")); + + VariantField escapedBackslash = new VariantField("\\", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertTrue(escapedBackslash.matches("\\")); + + VariantField escapedStar = new VariantField("\\*", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertTrue(escapedStar.matches("*")); + Assertions.assertFalse(escapedStar.matches("\\\\abc")); + + VariantField escapedCharInClass = new VariantField("a[!\\]]b", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertTrue(escapedCharInClass.matches("aXb")); + Assertions.assertFalse(escapedCharInClass.matches("a]b")); + } + + @Test + public void testGlobWithSlashSeparator() { + // With glob->regex, '*' should match '/' + VariantField field = new VariantField("int_*", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + + Assertions.assertTrue(field.matches("int_nested")); + Assertions.assertTrue(field.matches("int_nested.level1")); // '.' is matched by '*' + Assertions.assertTrue(field.matches("int_nested/level1")); // '/' is matched by '*' + } + + @Test + public void testGlobCharacterClass() { + // Character class tests + VariantField field1 = new VariantField("int_[0-9]", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertTrue(field1.matches("int_1")); + Assertions.assertFalse(field1.matches("int_a")); + + // Negated character class with ! 
+ VariantField field2 = new VariantField("int_[!0-9]", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + Assertions.assertTrue(field2.matches("int_a")); + Assertions.assertFalse(field2.matches("int_1")); + + } +} diff --git a/regression-test/data/variant_p0/predefine/test_schema_template_auto_cast.out b/regression-test/data/variant_p0/predefine/test_schema_template_auto_cast.out new file mode 100644 index 00000000000000..3b9ecdd4580947 --- /dev/null +++ b/regression-test/data/variant_p0/predefine/test_schema_template_auto_cast.out @@ -0,0 +1,301 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !where_simple -- +2 +3 + +-- !where_and -- +2 +4 + +-- !where_or -- +1 +3 +4 + +-- !where_between -- +2 +4 + +-- !where_in -- +1 +3 +4 + +-- !order_by -- +3 50 +2 30 +4 15 +1 10 + +-- !order_by_expr -- +3 51 +2 31 +4 16 +1 11 + +-- !topn -- +3 50 +2 30 + +-- !select_arithmetic -- +1 30 +2 70 +3 110 +4 40 + +-- !case_when -- +1 low +2 high +3 high +4 low + +-- !order_by_alias_expr -- +30 +40 +70 +110 + +-- !explicit_cast_select -- +10 +30 +50 +15 + +-- !explicit_cast_where -- +2 +3 + +-- !explicit_cast_order_by -- +3 +2 +4 +1 + +-- !group_by -- +alice 25 +bob 30 +charlie 50 + +-- !group_by_multi_agg -- +alice 10 15 2 +bob 30 30 1 +charlie 50 50 1 + +-- !having -- +alice 25 +bob 30 +charlie 50 + +-- !having_min -- +bob 30 +charlie 50 + +-- !having_non_agg -- +bob 30 +charlie 50 + +-- !order_by_alias -- +10 +15 +30 +50 + +-- !order_by_alias_subquery -- +1 10 +4 15 +2 30 +3 50 + +-- !group_by_alias_subquery -- +10 1 +15 1 +30 1 +50 1 + +-- !order_by_alias_nested -- +10 +15 +30 +50 + +-- !group_by_alias_nested -- +10 1 +15 1 +30 1 +50 1 + +-- !window_partition_order -- +1 1 +2 1 +3 1 +4 2 + +-- !window_sum -- +1 25 +2 30 +3 50 +4 25 + +-- !window_sum_order -- +1 10 +2 30 +3 50 +4 25 + +-- !agg_min_max -- +10 50 + +-- !agg_count_distinct -- +3 + +-- !join_on -- +1 first +2 second + +-- !join_on_alias_subquery -- 
+1 first +2 second + +-- !match_name_exact_where -- +2 + +-- !match_name_glob_where -- +1 + +-- !match_name_exact_order -- +1 +2 + +-- !match_name_glob_order -- +1 +2 + +-- !leaf_int1_select -- +1 +2 +1 +3 + +-- !leaf_int1_add -- +2 +3 +2 +4 + +-- !leaf_int_nested_nonleaf -- +\\N +\\N +\\N +\\N + +-- !leaf_int_nested_chain_select -- +1011111 +2022222 +3033333 +4044444 + +-- !leaf_int_nested_dot_select -- +1011111 +2022222 +3033333 +4044444 + +-- !leaf_int_nested_deref_select -- +1011111 +2022222 +3033333 +4044444 + +-- !leaf_int_nested_chain_add -- +1011112 +2022223 +3033334 +4044445 + +-- !leaf_int_nested_dot_add -- +1011112 +2022223 +3033334 +4044445 + +-- !leaf_int_nested_deref_add -- +1011112 +2022223 +3033334 +4044445 + +-- !leaf_where_ok -- +1 +2 +3 +4 + +-- !leaf_where_nonleaf -- + +-- !leaf_where_mixed_1 -- +2 +3 +4 + +-- !leaf_where_mixed_2 -- +2 +3 +4 + +-- !leaf_where_mixed_3 -- +2 +3 +4 + +-- !leaf_order_by_ok -- +1 +3 +2 +4 + +-- !leaf_order_by_nonleaf -- +1 +2 +3 +4 + +-- !leaf_order_by_mixed_1 -- +1 +2 +3 +4 + +-- !leaf_order_by_mixed_2 -- +1 +2 +3 +4 + +-- !leaf_order_by_paren_root -- +1 +2 +3 +4 + +-- !leaf_group_by_ok -- +1 2 +2 1 +3 1 + +-- !leaf_group_by_nonleaf -- +\\N 4 + +-- !leaf_group_by_mixed -- +1011111 1 +2022222 1 +3033333 1 +4044444 1 + +-- !leaf_having_ok -- +1 2 +2 2 +3 3 + +-- !leaf_having_mixed -- +3033333 3033333 +4044444 4044444 diff --git a/regression-test/suites/variant_p0/predefine/test_schema_template_auto_cast.groovy b/regression-test/suites/variant_p0/predefine/test_schema_template_auto_cast.groovy new file mode 100644 index 00000000000000..c8e80b03bdd4a2 --- /dev/null +++ b/regression-test/suites/variant_p0/predefine/test_schema_template_auto_cast.groovy @@ -0,0 +1,310 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_schema_template_auto_cast", "p0") { + sql """ set describe_extend_variant_column = true """ + sql """ set enable_match_without_inverted_index = false """ + sql """ set enable_common_expr_pushdown = true """ + sql """ set default_variant_enable_typed_paths_to_sparse = false """ + sql """ set default_variant_enable_doc_mode = false """ + sql """ set enable_variant_schema_auto_cast = true """ + + def tableName = "test_variant_schema_auto_cast" + + // Test 1: WHERE clause with auto-cast + sql "DROP TABLE IF EXISTS ${tableName}" + sql """CREATE TABLE ${tableName} ( + `id` bigint NULL, + `data` variant<'num_*': BIGINT, 'str_*': STRING> NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${tableName} values(1, '{"num_a": 10, "num_b": 20, "str_name": "alice"}')""" + sql """insert into ${tableName} values(2, '{"num_a": 30, "num_b": 40, "str_name": "bob"}')""" + sql """insert into ${tableName} values(3, '{"num_a": 50, "num_b": 60, "str_name": "charlie"}')""" + sql """insert into ${tableName} values(4, '{"num_a": 15, "num_b": 25, "str_name": "alice"}')""" + + // Simple WHERE + qt_where_simple """ SELECT id FROM ${tableName} + WHERE data['num_a'] > 20 ORDER BY id """ + + // AND condition + qt_where_and """ SELECT id FROM ${tableName} + WHERE data['num_a'] > 
10 AND data['num_b'] < 50 + ORDER BY id """ + + // OR condition + qt_where_or """ SELECT id FROM ${tableName} + WHERE data['num_a'] > 40 OR data['str_name'] = 'alice' + ORDER BY id """ + + // BETWEEN condition + qt_where_between """ SELECT id FROM ${tableName} + WHERE data['num_a'] BETWEEN 15 AND 30 + ORDER BY id """ + + // IN condition + qt_where_in """ SELECT id FROM ${tableName} + WHERE data['str_name'] IN ('alice', 'charlie') + ORDER BY id """ + + // Test 2: ORDER BY with auto-cast + qt_order_by """ SELECT id, data['num_a'] FROM ${tableName} + ORDER BY data['num_a'] DESC """ + + // ORDER BY expression + qt_order_by_expr """ SELECT id, data['num_a'] + 1 AS n FROM ${tableName} + ORDER BY data['num_a'] + 1 DESC """ + + // Test 3: TopN (ORDER BY + LIMIT) + qt_topn """ SELECT id, data['num_a'] FROM ${tableName} + ORDER BY data['num_a'] DESC LIMIT 2 """ + + // Test 4: SELECT with auto-cast (arithmetic operations) + qt_select_arithmetic """ SELECT id, data['num_a'] + data['num_b'] as sum_val + FROM ${tableName} ORDER BY id """ + + // CASE WHEN with auto-cast + qt_case_when """ SELECT id, + CASE WHEN data['num_a'] > 20 THEN 'high' ELSE 'low' END AS level + FROM ${tableName} ORDER BY id """ + + // ORDER BY alias from expression + qt_order_by_alias_expr """ SELECT data['num_a'] + data['num_b'] AS sum_val FROM ${tableName} + ORDER BY sum_val """ + + // Explicit CAST should still trigger schema template auto cast + qt_explicit_cast_select """ SELECT CAST(data['num_a'] AS INT) FROM ${tableName} ORDER BY id """ + qt_explicit_cast_where """ SELECT id FROM ${tableName} + WHERE CAST(data['num_a'] AS INT) > 20 ORDER BY id """ + qt_explicit_cast_order_by """ SELECT id FROM ${tableName} + ORDER BY CAST(data['num_a'] AS INT) DESC """ + + // Test 5: GROUP BY with auto-cast + qt_group_by """ SELECT data['str_name'], SUM(data['num_a']) as total + FROM ${tableName} GROUP BY data['str_name'] ORDER BY data['str_name'] """ + + // GROUP BY with multiple aggregates + qt_group_by_multi_agg 
""" SELECT data['str_name'], + MIN(data['num_a']) AS min_a, MAX(data['num_a']) AS max_a, COUNT(*) AS cnt + FROM ${tableName} GROUP BY data['str_name'] ORDER BY data['str_name'] """ + + // Test 6: HAVING with auto-cast + qt_having """ SELECT data['str_name'], SUM(data['num_a']) as total + FROM ${tableName} GROUP BY data['str_name'] + HAVING SUM(data['num_a']) > 20 ORDER BY data['str_name'] """ + + // HAVING with MIN + qt_having_min """ SELECT data['str_name'], MIN(data['num_a']) AS min_a + FROM ${tableName} GROUP BY data['str_name'] + HAVING MIN(data['num_a']) >= 15 ORDER BY data['str_name'] """ + + // HAVING with non-aggregate expression on group key + qt_having_non_agg """ SELECT data['str_name'], SUM(data['num_a']) AS total + FROM ${tableName} GROUP BY data['str_name'] + HAVING data['str_name'] != 'alice' ORDER BY data['str_name'] """ + + // Test 7: ORDER BY with alias from project + qt_order_by_alias """ SELECT data['num_a'] AS num_a FROM ${tableName} + ORDER BY num_a """ + + // Test 8: ORDER BY with alias from subquery + qt_order_by_alias_subquery """ SELECT * FROM (SELECT id, data['num_a'] AS num_a FROM ${tableName}) t + ORDER BY num_a, id """ + + // Test 9: GROUP BY with alias from subquery + qt_group_by_alias_subquery """ SELECT num_a, COUNT(*) AS cnt + FROM (SELECT data['num_a'] AS num_a FROM ${tableName}) t + GROUP BY num_a ORDER BY num_a """ + + // ORDER BY with nested alias + qt_order_by_alias_nested """ SELECT * FROM ( + SELECT num_a FROM (SELECT data['num_a'] AS num_a FROM ${tableName}) s1 + ) s2 ORDER BY num_a """ + + // GROUP BY with nested alias + qt_group_by_alias_nested """ SELECT num_a, COUNT(*) AS cnt FROM ( + SELECT num_a FROM (SELECT data['num_a'] AS num_a FROM ${tableName}) s1 + ) s2 GROUP BY num_a ORDER BY num_a """ + + // Test 10: WINDOW partition/order by with auto-cast + qt_window_partition_order """ SELECT id, + row_number() OVER (PARTITION BY data['str_name'] ORDER BY data['num_a']) AS rn + FROM ${tableName} ORDER BY id """ + + // 
WINDOW aggregate + qt_window_sum """ SELECT id, + SUM(data['num_a']) OVER (PARTITION BY data['str_name']) AS s + FROM ${tableName} ORDER BY id """ + + // WINDOW partition + order by with both paths + qt_window_sum_order """ SELECT id, + SUM(data['num_a']) OVER (PARTITION BY data['str_name'] ORDER BY data['num_a']) AS s + FROM ${tableName} ORDER BY id """ + + // Aggregates without GROUP BY + qt_agg_min_max """ SELECT MIN(data['num_a']), MAX(data['num_a']) FROM ${tableName} """ + qt_agg_count_distinct """ SELECT COUNT(DISTINCT data['str_name']) FROM ${tableName} """ + + // Test 11: disable auto-cast should error in ORDER BY + sql """ set enable_variant_schema_auto_cast = false """ + test { + sql """ SELECT id FROM ${tableName} ORDER BY data['num_a'] """ + exception "Doris hll, bitmap, array, map, struct, jsonb, variant column must use with specific function" + } + sql """ set enable_variant_schema_auto_cast = true """ + + sql "DROP TABLE IF EXISTS ${tableName}" + + // Test 12: JOIN ON with auto-cast + def leftTable = "test_variant_join_left" + def rightTable = "test_variant_join_right" + + sql "DROP TABLE IF EXISTS ${leftTable}" + sql "DROP TABLE IF EXISTS ${rightTable}" + + sql """CREATE TABLE ${leftTable} ( + `id` bigint NULL, + `data` variant<'key_*': BIGINT> NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """CREATE TABLE ${rightTable} ( + `id` bigint NULL, + `info` variant<'key_*': BIGINT, 'name_*': STRING> NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${leftTable} values(1, '{"key_id": 100}')""" + sql """insert into ${leftTable} values(2, '{"key_id": 200}')""" + sql """insert into ${leftTable} values(3, '{"key_id": 300}')""" + + sql """insert into ${rightTable} values(1, '{"key_id": 100, "name_val": "first"}')""" + sql 
"""insert into ${rightTable} values(2, '{"key_id": 200, "name_val": "second"}')""" + sql """insert into ${rightTable} values(3, '{"key_id": 400, "name_val": "fourth"}')""" + + qt_join_on """ SELECT l.id, r.info['name_val'] + FROM ${leftTable} l JOIN ${rightTable} r + ON l.data['key_id'] = r.info['key_id'] + ORDER BY l.id """ + + // Test 13: JOIN ON with alias from subquery + qt_join_on_alias_subquery """ SELECT l.id, r.name_val + FROM (SELECT id, data['key_id'] AS key_id FROM ${leftTable}) l + JOIN (SELECT id, info['key_id'] AS key_id, info['name_val'] AS name_val FROM ${rightTable}) r + ON l.key_id = r.key_id + ORDER BY l.id """ + + sql "DROP TABLE IF EXISTS ${leftTable}" + sql "DROP TABLE IF EXISTS ${rightTable}" + + // Test 14: MATCH_NAME and MATCH_NAME_GLOB + def exactTable = "test_variant_schema_auto_cast_exact" + sql "DROP TABLE IF EXISTS ${exactTable}" + sql """CREATE TABLE ${exactTable} ( + `id` bigint NULL, + `data` variant<'exact_key': BIGINT, 'glob_*': BIGINT> NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${exactTable} values(1, '{"exact_key": 10, "glob_1": 20, "glob_2": 5}')""" + sql """insert into ${exactTable} values(2, '{"exact_key": 30, "glob_2": 40}')""" + + qt_match_name_exact_where """ SELECT id FROM ${exactTable} + WHERE data['exact_key'] > 10 ORDER BY id """ + qt_match_name_glob_where """ SELECT id FROM ${exactTable} + WHERE data['glob_1'] >= 20 ORDER BY id """ + qt_match_name_exact_order """ SELECT id FROM ${exactTable} + ORDER BY data['exact_key'] """ + qt_match_name_glob_order """ SELECT id FROM ${exactTable} + ORDER BY data['glob_2'], id """ + + sql "DROP TABLE IF EXISTS ${exactTable}" + + // Test 15: leaf vs non-leaf path auto cast limitation + def leafTable = "test_variant_schema_auto_cast_leaf" + sql "DROP TABLE IF EXISTS ${leafTable}" + sql """CREATE TABLE ${leafTable} ( + `id` bigint NULL, + `data` 
variant<'int_*': BIGINT> NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${leafTable} values + (1, '{"int_1": 1, "int_nested": {"level1_num_1": 1011111, "level1_num_2": 102}}'), + (2, '{"int_1": 2, "int_nested": {"level1_num_1": 2022222, "level1_num_2": 202}}'), + (3, '{"int_1": 1, "int_nested": {"level1_num_1": 3033333, "level1_num_2": 302}}'), + (4, '{"int_1": 3, "int_nested": {"level1_num_1": 4044444, "level1_num_2": 402}}')""" + + qt_leaf_int1_select """ SELECT data['int_1'] FROM ${leafTable} ORDER BY id """ + qt_leaf_int1_add """ SELECT data['int_1'] + 1 FROM ${leafTable} ORDER BY id """ + // still fails: FE can't distinguish leaf/non-leaf, may cast int_nested to int + qt_leaf_int_nested_nonleaf """ SELECT data['int_nested'] FROM ${leafTable} ORDER BY id """ + qt_leaf_int_nested_chain_select """ SELECT data['int_nested']['level1_num_1'] + FROM ${leafTable} ORDER BY id """ + qt_leaf_int_nested_dot_select """ SELECT data['int_nested.level1_num_1'] FROM ${leafTable} ORDER BY id """ + qt_leaf_int_nested_deref_select """ SELECT data.int_nested.level1_num_1 FROM ${leafTable} ORDER BY id """ + qt_leaf_int_nested_chain_add """ SELECT data['int_nested']['level1_num_1'] + 1 + FROM ${leafTable} ORDER BY id """ + qt_leaf_int_nested_dot_add """ SELECT data['int_nested.level1_num_1'] + 1 + FROM ${leafTable} ORDER BY id """ + qt_leaf_int_nested_deref_add """ SELECT data.int_nested.level1_num_1 + 1 + FROM ${leafTable} ORDER BY id """ + + // Non-select clauses: leaf vs non-leaf + qt_leaf_where_ok """ SELECT id FROM ${leafTable} + WHERE data['int_1'] > 0 ORDER BY id """ + qt_leaf_where_nonleaf """ SELECT id FROM ${leafTable} + WHERE data['int_nested'] > 0 ORDER BY id """ + qt_leaf_where_mixed_1 """ SELECT id FROM ${leafTable} + WHERE data['int_nested']['level1_num_1'] > 2000000 ORDER BY id """ + qt_leaf_where_mixed_2 """ SELECT id FROM ${leafTable} + 
WHERE data['int_nested.level1_num_1'] > 2000000 ORDER BY id """ + qt_leaf_where_mixed_3 """ SELECT id FROM ${leafTable} + WHERE data.int_nested.level1_num_1 > 2000000 ORDER BY id """ + qt_leaf_order_by_ok """ SELECT id FROM ${leafTable} + ORDER BY data['int_1'], id """ + qt_leaf_order_by_nonleaf """ SELECT id FROM ${leafTable} + ORDER BY data['int_nested'], id """ + qt_leaf_order_by_mixed_1 """ SELECT id FROM ${leafTable} + ORDER BY data['int_nested']['level1_num_1'] """ + qt_leaf_order_by_mixed_2 """ SELECT id FROM ${leafTable} + ORDER BY data['int_nested.level1_num_1'] """ + qt_leaf_order_by_paren_root """ SELECT id FROM ${leafTable} + ORDER BY data.int_nested.level1_num_1 """ + qt_leaf_group_by_ok """ SELECT data['int_1'], COUNT(*) AS cnt + FROM ${leafTable} GROUP BY data['int_1'] ORDER BY data['int_1'] """ + qt_leaf_group_by_nonleaf """ SELECT data['int_nested'], COUNT(*) AS cnt + FROM ${leafTable} GROUP BY data['int_nested'] ORDER BY data['int_nested'] """ + qt_leaf_group_by_mixed """ SELECT data['int_nested.level1_num_1'], COUNT(*) AS cnt + FROM ${leafTable} GROUP BY data['int_nested.level1_num_1'] + ORDER BY data['int_nested.level1_num_1'] """ + qt_leaf_having_ok """ SELECT data['int_1'], SUM(data['int_1']) AS total + FROM ${leafTable} GROUP BY data['int_1'] + HAVING SUM(data['int_1']) > 0 ORDER BY data['int_1'] """ + qt_leaf_having_mixed """ SELECT data['int_nested.level1_num_1'], SUM(data['int_nested.level1_num_1']) AS total + FROM ${leafTable} GROUP BY data['int_nested.level1_num_1'] + HAVING SUM(data['int_nested.level1_num_1']) > 3000000 + ORDER BY data['int_nested.level1_num_1'] """ + + sql "DROP TABLE IF EXISTS ${leafTable}" +}