From 9c258bf5b36698e534ba3b72e0ef2b7536512e06 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Thu, 23 Apr 2026 19:09:00 +0300 Subject: [PATCH 1/2] upgrade filepath and datetime, change func normalizeByScanner, add test cases for changes --- plugin/action/hash/normalize/token_normalizer.go | 8 ++++---- plugin/action/hash/normalize/token_normalizer_test.go | 6 ++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/plugin/action/hash/normalize/token_normalizer.go b/plugin/action/hash/normalize/token_normalizer.go index 526b12331..8b2adb0c8 100644 --- a/plugin/action/hash/normalize/token_normalizer.go +++ b/plugin/action/hash/normalize/token_normalizer.go @@ -268,7 +268,7 @@ func (n *tokenNormalizer) normalizeByScanner(out []byte, scanner *lexmachine.Sca prevEnd := 0 for tokRaw, err, eos := scanner.Next(); !eos; tokRaw, err, eos = scanner.Next() { if ui, is := err.(*machines.UnconsumedInput); is { - scanner.TC = ui.FailTC // skip + scanner.TC = max(scanner.TC+1, ui.FailTC-1) // skip continue } else if err != nil { out = out[:0] @@ -484,7 +484,7 @@ var builtinTokenPatterns = []TokenPattern{ }, { Placeholder: placeholderByPattern[pFilepath], - RE: `(/[a-zA-Z0-9-_.]+)+`, + RE: `(/[a-zA-Z-_.][a-zA-Z0-9-_.]*)+`, mask: pFilepath, }, { @@ -511,10 +511,10 @@ var builtinTokenPatterns = []TokenPattern{ mask: pHash, }, { - // RFC3339, RFC3339Nano, DateTime, DateOnly, TimeOnly, Go time with monotonic clock + // RFC3339, RFC3339Nano, DateTime, DateOnly, TimeOnly, Go time with optional monotonic clock Placeholder: placeholderByPattern[pDatetime], RE: fmt.Sprintf(`(%s)|(%s)|(%s)|(%s)`, - `\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d\.\d+ [+\-]\d\d\d\d [A-Z]+ m=[+\-]\d+\.\d+`, + `\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d\.\d+ [+\-]\d\d\d\d [A-Z]+( m=[+\-]\d+\.\d+)?`, `\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?(Z|[\+\-]\d\d:\d\d)`, `\d\d:\d\d:\d\d`, `\d\d\d\d-\d\d-\d\d( \d\d:\d\d:\d\d)?`, diff --git a/plugin/action/hash/normalize/token_normalizer_test.go b/plugin/action/hash/normalize/token_normalizer_test.go index 552e8ee8d..51a401440 100644 --- a/plugin/action/hash/normalize/token_normalizer_test.go +++ b/plugin/action/hash/normalize/token_normalizer_test.go @@ -265,6 +265,8 @@ func TestTokenNormalizerBuiltin(t *testing.T) { "some 2025-01-13 20:58:04.019973588 +0000 UTC m=+1417512.275697914 here", "some 2025-01-13 20:58:04.019973588 -0700 MST m=-123.456789012 here", "some 2025-01-13 20:58:04.019973588 +0300 MSK m=+0.123456789 here", + "some 2025-01-13 20:58:04.019973588 -0700 MST here", + "some 2025-01-13 20:58:04.019973588 +0300 MSK here", "some 2025-01-13T10:20:40Z here", "some 2025-01-13T10:20:40.999999999Z here", "some 2025-01-13T10:20:40-06:00 here", @@ -470,9 +472,9 @@ func TestTokenNormalizerCustom(t *testing.T) { }, }, inputs: []string{ - `2006/01/02 15:04:05 error occurred, client: 10.125.172.251, upstream: "http://10.117.246.15:84/download", host: "mpm-youtube-downloader-38.name.com:84"`, + `2006/01/02 15:04:05 error occurred, client: 10.125.172.251, upstream: "http://10.117.246.15:84/download", host: "mpm-youtube-downloader-38.name.com:84", part/offset: 10117/2461584`, }, - want: " error occurred, client: , upstream: , host: ", + want: " error occurred, client: , upstream: , host: , part/offset: /", }, { name: "empty_patterns", From 697804779358690ac63bed0e5d90386fdb2d4a93 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Thu, 23 Apr 2026 19:28:28 +0300 Subject: [PATCH 2/2] add examples in genBenchInput --- .../hash/normalize/token_normalizer_test.go | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/plugin/action/hash/normalize/token_normalizer_test.go b/plugin/action/hash/normalize/token_normalizer_test.go index 51a401440..08a313eed 100644 --- a/plugin/action/hash/normalize/token_normalizer_test.go +++ b/plugin/action/hash/normalize/token_normalizer_test.go @@ -517,19 +517,21 @@ func TestTokenNormalizerCustom(t *testing.T) { func genBenchInput(count int) []byte { var examples = []string{ - "s1mple falsehood", // no match - "test@host1.host2.com", // email - "http://some.host.com/page1?a=1", // url - "hello-world-123.COM", // host - "7c1811ed-e98f-4c9c-a9f9-58c757ff494f", // uuid - "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", // sha1 - "098f6bcd4621d373cade4e832627b4f6", // md5 - "2025-01-13T10:20:40Z", // datetime - "1.2.3.4", // ip - "-1.2m5s", // duration - "0x13eb85e69dfbc0758b12acdaae36287d", // hex - "-4.56", // float - "123", // int + "48757ec9f04efe7faacec8722f3476339b125a6b6172b8a69ff3aa329e0bd0ff", // hash(sha256) + "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", // hash(sha1) + "098f6bcd4621d373cade4e832627b4f6", // hash(md5) + "s1mple falsehood", // no match + "test@host1.host2.com", // email + "http://some.host.com/page1?a=1", // url + "hello-world-123.COM", // host + "7c1811ed-e98f-4c9c-a9f9-58c757ff494f", // uuid + "/home/user/photos", // filepath + "2025-01-13T10:20:40Z", // datetime + "1.2.3.4", // ip + "-1.2m5s", // duration + "0x13eb85e69dfbc0758b12acdaae36287d", // hex + "-4.56", // float + "123", // int "truE faLse", }