Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
70c3db2
implement schema template cast
gary-cloud Jan 27, 2026
a365d06
fix and test
gary-cloud Jan 28, 2026
2fb3545
fix pipline
gary-cloud Jan 29, 2026
3d6e7c2
add slot ref and delete join case
gary-cloud Jan 30, 2026
947bc8c
add join, select, group, having tests
gary-cloud Jan 30, 2026
da360f6
cover more
gary-cloud Jan 30, 2026
c502501
all tests pass
gary-cloud Jan 30, 2026
cd777b7
maybe done
gary-cloud Jan 30, 2026
c21e118
use processBoundFunction
gary-cloud Jan 31, 2026
b7f1825
enhance fe ut
gary-cloud Jan 31, 2026
72c0f66
reapply VariantSchemaCast rules
gary-cloud Jan 31, 2026
60b0555
may be aborted
gary-cloud Jan 31, 2026
0a04b03
enable_variant_schema_auto_cast_in_select is very complex
gary-cloud Jan 31, 2026
f2e44c2
hope last commit
gary-cloud Jan 31, 2026
0892813
use processBoundFunction
gary-cloud Feb 1, 2026
7de43f9
remove alias map
gary-cloud Feb 2, 2026
06fedbd
simplify code and enhance tests
gary-cloud Feb 2, 2026
11d0df9
revert multi cast of SlotReference
gary-cloud Feb 2, 2026
d137225
fix review
gary-cloud Feb 3, 2026
668a981
fix FE UT
gary-cloud Feb 3, 2026
c75364b
use PathMatcher to replace fnmatch
gary-cloud Feb 4, 2026
62927b2
delete shouldSuppressVariantElementAtCast
gary-cloud Feb 4, 2026
151452d
Revert "use PathMatcher to replace fnmatch"
gary-cloud Feb 4, 2026
2661c43
glob -> regex
gary-cloud Feb 4, 2026
7675aff
fix regex to pass ut
gary-cloud Feb 4, 2026
3f2fa07
fix format and enhance test
gary-cloud Feb 5, 2026
0d23a24
fix format and BE UT
gary-cloud Feb 5, 2026
566e82c
Merge branch 'master' into schema-cast
gary-cloud Feb 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 159 additions & 7 deletions be/src/vec/common/variant_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

#include <assert.h>
#include <fmt/format.h>
#include <fnmatch.h>
#include <gen_cpp/FrontendService.h>
#include <gen_cpp/FrontendService_types.h>
#include <gen_cpp/HeartbeatService_types.h>
Expand All @@ -38,6 +37,7 @@
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <list>
#include <memory>
#include <mutex>
#include <optional>
Expand All @@ -63,6 +63,7 @@
#include "olap/tablet.h"
#include "olap/tablet_fwd.h"
#include "olap/tablet_schema.h"
#include "re2/re2.h"
#include "runtime/client_cache.h"
#include "runtime/define_primitive_type.h"
#include "runtime/exec_env.h"
Expand Down Expand Up @@ -102,6 +103,162 @@
namespace doris::vectorized::variant_util {
#include "common/compile_check_begin.h"

// Appends `ch` to `*regex_output` so that a regex engine matches it literally:
// the regex metacharacters . ^ $ + * ? ( ) | { } [ ] and backslash are preceded
// by a backslash, every other character is appended unchanged.
inline void append_escaped_regex_char(std::string* regex_output, char ch) {
    // Characters that carry special meaning in a regex and therefore need a '\' prefix.
    static constexpr std::string_view kRegexMetaChars = ".^$+*?()|{}[]\\";
    const bool needs_escape = kRegexMetaChars.find(ch) != std::string_view::npos;
    if (needs_escape) {
        regex_output->push_back('\\');
    }
    regex_output->push_back(ch);
}

// Small LRU cache that bounds how many compiled glob patterns are kept alive.
// When an insertion pushes the cache above this size, the least recently used
// entry is evicted.
constexpr size_t kGlobRegexCacheCapacity = 256;

// One cache slot: the compiled matcher plus the position of its key in the LRU
// list, so a cache hit can be moved to the front of the list without a scan.
struct GlobRegexCacheEntry {
    std::shared_ptr<RE2> re2;
    std::list<std::string>::iterator lru_it;
};

// Cache state. These are implementation details of this translation unit, so they
// are given internal linkage to avoid exporting symbols and clashing with other files.
// g_glob_regex_cache_mutex must be held while touching either container.
static std::mutex g_glob_regex_cache_mutex;
// Glob patterns ordered from most recently used (front) to least recently used (back).
static std::list<std::string> g_glob_regex_cache_lru;
// Glob pattern -> compiled matcher and its LRU list position.
static std::unordered_map<std::string, GlobRegexCacheEntry> g_glob_regex_cache;
Comment on lines +139 to +141
Copy link

Copilot AI Feb 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The glob regex cache globals have external linkage (g_glob_regex_cache_mutex, g_glob_regex_cache_lru, g_glob_regex_cache). Since they're only used in this translation unit, consider marking them static or moving them into an anonymous namespace to avoid unintended symbol exports and reduce the chance of name collisions.

Suggested change
std::mutex g_glob_regex_cache_mutex;
std::list<std::string> g_glob_regex_cache_lru;
std::unordered_map<std::string, GlobRegexCacheEntry> g_glob_regex_cache;
static std::mutex g_glob_regex_cache_mutex;
static std::list<std::string> g_glob_regex_cache_lru;
static std::unordered_map<std::string, GlobRegexCacheEntry> g_glob_regex_cache;

Copilot uses AI. Check for mistakes.

std::shared_ptr<RE2> get_or_build_re2(const std::string& glob_pattern) {
{
std::lock_guard<std::mutex> lock(g_glob_regex_cache_mutex);
auto it = g_glob_regex_cache.find(glob_pattern);
if (it != g_glob_regex_cache.end()) {
g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), g_glob_regex_cache_lru,
it->second.lru_it);
return it->second.re2;
}
}
std::string regex_pattern;
Status st = glob_to_regex(glob_pattern, &regex_pattern);
if (!st.ok()) {
return nullptr;
}
auto compiled = std::make_shared<RE2>(regex_pattern);
if (!compiled->ok()) {
return nullptr;
}
{
std::lock_guard<std::mutex> lock(g_glob_regex_cache_mutex);
auto it = g_glob_regex_cache.find(glob_pattern);
if (it != g_glob_regex_cache.end()) {
g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), g_glob_regex_cache_lru,
it->second.lru_it);
return it->second.re2;
}
g_glob_regex_cache_lru.push_front(glob_pattern);
g_glob_regex_cache.emplace(glob_pattern,
GlobRegexCacheEntry {compiled, g_glob_regex_cache_lru.begin()});
if (g_glob_regex_cache.size() > kGlobRegexCacheCapacity) {
const std::string& evict_key = g_glob_regex_cache_lru.back();
g_glob_regex_cache.erase(evict_key);
g_glob_regex_cache_lru.pop_back();
}
}
return compiled;
}

// Convert a restricted glob pattern into an anchored regex ("^...$").
//
// Supported glob syntax:
//   '*'     -> ".*"
//   '?'     -> "."
//   '[...]' -> regex character class; a leading '!' or '^' negates the class
//   '\\'    -> escape: the following character is taken literally
// Every other character is a literal and is regex-escaped when needed. A trailing
// lone backslash is emitted as a literal backslash.
//
// Inside '[...]' unescaped characters are copied verbatim (so ranges such as a-z keep
// working). A backslash-escaped character is meant literally, so it gets a regex
// backslash wherever the bare character would be read as class syntax: '\\' and '['
// always, ']' and '-' when they are not the first character of the class body, and
// '^' when it would otherwise be the very first character of the class. In the
// remaining positions the bare character is emitted, which keeps the output for
// patterns like "[\\]]" ("[]]") and "[\\-]" ("[-]") unchanged.
// NOTE(review): the FE-side glob translation (GlobRegexUtil) presumably needs the same
// rule to keep FE and BE matching identical for such patterns - confirm.
//
// @param glob_pattern   glob to translate
// @param regex_pattern  output; cleared first, then filled with the regex
// @return Status::OK() on success, InvalidArgument when a '[' has no closing ']'
Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern) {
    regex_pattern->clear();
    regex_pattern->append("^");
    bool is_escaped = false;
    const size_t pattern_length = glob_pattern.size();
    for (size_t index = 0; index < pattern_length; ++index) {
        const char current_char = glob_pattern[index];
        if (is_escaped) {
            append_escaped_regex_char(regex_pattern, current_char);
            is_escaped = false;
            continue;
        }
        if (current_char == '\\') {
            is_escaped = true;
            continue;
        }
        if (current_char == '*') {
            regex_pattern->append(".*");
            continue;
        }
        if (current_char == '?') {
            regex_pattern->append(".");
            continue;
        }
        if (current_char == '[') {
            size_t class_index = index + 1;
            bool class_closed = false;
            bool is_class_escaped = false;
            std::string class_buffer;
            if (class_index < pattern_length &&
                (glob_pattern[class_index] == '!' || glob_pattern[class_index] == '^')) {
                class_buffer.push_back('^');
                ++class_index;
            }
            // Size of the optional negation prefix; the class body starts right after it.
            const size_t class_body_start = class_buffer.size();
            for (; class_index < pattern_length; ++class_index) {
                const char class_char = glob_pattern[class_index];
                if (is_class_escaped) {
                    // The glob asked for a literal character: add a regex escape where
                    // the bare character would be parsed as class syntax instead.
                    const bool at_body_start = class_buffer.size() == class_body_start;
                    bool needs_backslash = false;
                    switch (class_char) {
                    case '\\':
                    case '[':
                        needs_backslash = true;
                        break;
                    case ']':
                    case '-':
                        // Literal when first in the class body, syntax anywhere else.
                        needs_backslash = !at_body_start;
                        break;
                    case '^':
                        // Only a '^' at the very start of the class would negate it.
                        needs_backslash = class_buffer.empty();
                        break;
                    default:
                        break;
                    }
                    if (needs_backslash) {
                        class_buffer.push_back('\\');
                    }
                    class_buffer.push_back(class_char);
                    is_class_escaped = false;
                    continue;
                }
                if (class_char == '\\') {
                    is_class_escaped = true;
                    continue;
                }
                if (class_char == ']') {
                    class_closed = true;
                    break;
                }
                class_buffer.push_back(class_char);
            }
            if (!class_closed) {
                return Status::InvalidArgument("Unclosed character class in glob pattern: {}",
                                               glob_pattern);
            }
            regex_pattern->append("[");
            regex_pattern->append(class_buffer);
            regex_pattern->append("]");
            // Resume after the closing ']' (the loop increment steps past it).
            index = class_index;
            continue;
        }
        append_escaped_regex_char(regex_pattern, current_char);
    }
    // A dangling backslash at the end of the glob has nothing to escape; keep it literal.
    if (is_escaped) {
        append_escaped_regex_char(regex_pattern, '\\');
    }
    regex_pattern->append("$");
    return Status::OK();
}

bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path) {
auto compiled = get_or_build_re2(glob_pattern);
if (compiled == nullptr) {
return false;
}
return RE2::FullMatch(candidate_path, *compiled);
}

size_t get_number_of_dimensions(const IDataType& type) {
if (const auto* type_array = typeid_cast<const DataTypeArray*>(&type)) {
return type_array->get_number_of_dimensions();
Expand Down Expand Up @@ -1307,8 +1464,7 @@ bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id,
break;
}
case PatternTypePB::MATCH_NAME_GLOB: {
int result = fnmatch(pattern, path.c_str(), FNM_PATHNAME);
if (result == 0) {
if (glob_match_re2(pattern, path)) {
generate_result_column(*sub_column, &sub_column_info->column);
generate_index(sub_column->name());
return true;
Expand Down Expand Up @@ -1788,8 +1944,6 @@ std::unordered_map<std::string_view, ColumnVariant::Subcolumn> materialize_docs_
return subcolumns;
}

namespace {

Status _parse_and_materialize_variant_columns(Block& block,
const std::vector<uint32_t>& variant_pos,
const std::vector<ParseConfig>& configs) {
Expand Down Expand Up @@ -1864,8 +2018,6 @@ Status _parse_and_materialize_variant_columns(Block& block,
return Status::OK();
}

} // namespace

Status parse_and_materialize_variant_columns(Block& block, const std::vector<uint32_t>& variant_pos,
const std::vector<ParseConfig>& configs) {
RETURN_IF_CATCH_EXCEPTION(
Expand Down
7 changes: 7 additions & 0 deletions be/src/vec/common/variant_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,13 @@ using JsonParser = JSONDataParser<SimdJSONParser>;
const std::string SPARSE_COLUMN_PATH = "__DORIS_VARIANT_SPARSE__";
const std::string DOC_VALUE_COLUMN_PATH = "__DORIS_VARIANT_DOC_VALUE__";
namespace doris::vectorized::variant_util {

// Convert a restricted glob pattern into a regex (for tests/internal use).
Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern);

// Match a glob pattern against a path using RE2.
bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path);

using PathToNoneNullValues = std::unordered_map<std::string, int64_t>;
using PathToDataTypes = std::unordered_map<PathInData, std::vector<DataTypePtr>, PathInData::Hash>;

Expand Down
134 changes: 133 additions & 1 deletion be/test/olap/rowset/segment_v2/variant_util_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,4 +209,136 @@ TEST(VariantUtilTest, ParseVariantColumns_DocModeRejectOnlySubcolumnsConfig) {
EXPECT_TRUE(st.ok()) << st.to_string();
}

} // namespace doris::vectorized::variant_util
// Checks the exact regex text glob_to_regex() produces for a table of glob patterns,
// and that malformed globs (unclosed character class) are rejected.
// Note that the non-raw literals below are C++-escaped: "a\\?b" is the glob a\?b.
TEST(VariantUtilTest, GlobToRegex) {
struct Case {
std::string glob;
std::string expected_regex;
};
// Each entry: {glob pattern, expected regex}.
const std::vector<Case> cases = {
{"*", "^.*$"},
{"?", "^.$"},
{"a?b", "^a.b$"},
{"a*b", "^a.*b$"},
{"a**b", "^a.*.*b$"},
{"a??b", "^a..b$"},
{"?*", "^..*$"},
{"*?", "^.*.$"},
{"a.b", "^a\\.b$"},
{"a+b", "^a\\+b$"},
{"a{b}", "^a\\{b\\}$"},
{R"(a\*b)", R"(^a\*b$)"},
{"a\\?b", "^a\\?b$"},
{"a\\[b", "^a\\[b$"},
{"abc\\", "^abc\\\\$"},
{"a|b", "^a\\|b$"},
{"a(b)c", "^a\\(b\\)c$"},
{"a^b", "^a\\^b$"},
{"a$b", "^a\\$b$"},
{"int_[0-9]", "^int_[0-9]$"},
{"int_[!0-9]", "^int_[^0-9]$"},
{"int_[^0-9]", "^int_[^0-9]$"},
{"a[\\-]b", "^a[-]b$"},
{"a[b-d]e", "^a[b-d]e$"},
{"a[\\]]b", "^a[]]b$"},
{"a[\\!]b", "^a[!]b$"},
{"", "^$"},
{"a[[]b", "^a[[]b$"},
{"a[]b", "^a[]b$"},
{"[]", "^[]$"},
{"[!]", "^[^]$"},
{"[^]", "^[^]$"},
{"\\", "^\\\\$"},
{"\\*", "^\\*$"},
{"a\\*b", "^a\\*b$"},
{"a[!\\]]b", "^a[^]]b$"},
};

// Every table entry must translate successfully and produce the expected regex.
for (const auto& test_case : cases) {
std::string regex;
Status st = glob_to_regex(test_case.glob, &regex);
EXPECT_TRUE(st.ok()) << st.to_string() << " pattern=" << test_case.glob;
EXPECT_EQ(regex, test_case.expected_regex) << "pattern=" << test_case.glob;
}

// A '[' without a closing ']' is an error.
std::string regex;
Status st = glob_to_regex("int_[0-9", &regex);
EXPECT_FALSE(st.ok());

// Here the only ']' is escaped, so the class is never closed.
st = glob_to_regex("a[\\]b", &regex);
EXPECT_FALSE(st.ok());
}

// Checks the match result of glob_match_re2() for a table of glob patterns and
// candidate strings, and that malformed globs never match.
// Note that the non-raw literals below are C++-escaped: "a[\\]]b" is the glob a[\]]b.
TEST(VariantUtilTest, GlobMatchRe2) {
struct Case {
std::string glob;
std::string candidate;
bool expected;
};
// Each entry: {glob pattern, candidate string, expected match result}.
const std::vector<Case> cases = {
{"*", "", true},
{"*", "abc", true},
{"?", "a", true},
{"?", "", false},
{"a?b", "acb", true},
{"a?b", "ab", false},
{"a*b", "ab", true},
{"a*b", "axxxb", true},
{"a**b", "ab", true},
{"a**b", "axxxb", true},
{"?*", "", false},
{"?*", "a", true},
{"*?", "", false},
{"*?", "a", true},
// '*' is expected to match across '/' as well.
{"a*b", "a/b", true},
{"a.b", "a.b", true},
{"a.b", "acb", false},
{"a+b", "a+b", true},
{"a{b}", "a{b}", true},
{"a|b", "a|b", true},
{"a|b", "ab", false},
{"a(b)c", "a(b)c", true},
{"a(b)c", "abc", false},
{"a^b", "a^b", true},
{"a^b", "ab", false},
{"a$b", "a$b", true},
{"a$b", "ab", false},
{"a[b-d]e", "ace", true},
{"a[b-d]e", "aee", false},
{"a[\\]]b", "a]b", true},
{"a[\\]]b", "a[b", false},
{"a[\\!]b", "a!b", true},
{"a[\\!]b", "a]b", false},
// Empty character classes are expected not to match.
{"[]", "a", false},
{"[!]", "]", false},
{"\\", "\\", true},
{"\\*", "\\abc", false},
{"a[!\\]]b", "aXb", true},
{"a[!\\]]b", "a]b", false},
{"a[]b", "aXb", false},
{"a[[]b", "a[b", true},
{R"(a\*b)", "a*b", true},
{R"(a\?b)", "a?b", true},
{R"(a\[b)", "a[b", true},
{R"(abc\)", R"(abc\)", true},
{"int_[0-9]", "int_1", true},
{"int_[0-9]", "int_a", false},
{"int_[!0-9]", "int_a", true},
{"int_[!0-9]", "int_1", false},
{"int_[^0-9]", "int_b", true},
{"int_[^0-9]", "int_2", false},
{R"(a[\-]b)", "a-b", true},
{"", "", true},
{"", "a", false},
};

for (const auto& test_case : cases) {
bool matched = glob_match_re2(test_case.glob, test_case.candidate);
EXPECT_EQ(matched, test_case.expected)
<< "pattern=" << test_case.glob << " candidate=" << test_case.candidate;
}

// Globs with an unclosed character class must not match anything.
EXPECT_FALSE(glob_match_re2("int_[0-9", "int_1"));
EXPECT_FALSE(glob_match_re2("a[\\]b", "a]b"));
}

} // namespace doris::vectorized::variant_util
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import org.apache.doris.common.ErrorCode;
import org.apache.doris.common.ErrorReport;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.GlobRegexUtil;
import org.apache.doris.common.Pair;
import org.apache.doris.common.UserException;
import org.apache.doris.common.io.DeepCopy;
Expand Down Expand Up @@ -3750,12 +3751,11 @@ public Index getInvertedIndex(Column column, List<String> subPath, String analyz
String childName = child.getName();
if (child.getFieldPatternType() == TPatternType.MATCH_NAME_GLOB) {
try {
java.nio.file.PathMatcher matcher = java.nio.file.FileSystems.getDefault()
.getPathMatcher("glob:" + childName);
if (matcher.matches(java.nio.file.Paths.get(subPathString))) {
com.google.re2j.Pattern compiled = GlobRegexUtil.getOrCompilePattern(childName);
if (compiled.matcher(subPathString).matches()) {
fieldPattern = childName;
}
} catch (Exception e) {
} catch (com.google.re2j.PatternSyntaxException | IllegalArgumentException e) {
continue;
}
} else if (child.getFieldPatternType() == TPatternType.MATCH_NAME) {
Expand Down
Loading
Loading