Skip to content

Commit 30171da

Browse files
committed
fix: skip git-filter-repo header line in ParseCommitMap
The commit-map file produced by git-filter-repo starts with an 'old new' header line. The new SHA length validation rejects it. Skip the header when it appears on the first line. perf(commitremap): hex lookup table and byte-level sliding window Replace branch-heavy isHexByte with a [256]bool lookup table for branchless hex byte validation. Replace JSON-walk replaceSHA with byte-level replaceSHABytes sliding window that finds and replaces SHAs everywhere in file content (URLs, markdown, composite strings). Remove old replaceSHA function. Add commitMapSHALen validation, benchmark suite, and updated tests. refactor(bench): migrate benchmarks to b.Loop() (Go 1.24+) - Replace for i := 0; i < b.N; i++ with b.Loop() in all 5 benchmarks - Fix dead-code elimination risk in BenchmarkIsHexByte - Move make+copy setup outside timed loop in BenchmarkReplaceSHABytes - Remove unnecessary b.ResetTimer() calls (b.Loop auto-excludes setup)
1 parent b21eff5 commit 30171da

3 files changed

Lines changed: 280 additions & 6 deletions

File tree

pkg/commitremap/benchmark_test.go

Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
package commitremap
2+
3+
import (
4+
"crypto/rand"
5+
"encoding/hex"
6+
"encoding/json"
7+
"fmt"
8+
"os"
9+
"path/filepath"
10+
"testing"
11+
)
12+
13+
// ── Helpers ──────────────────────────────────────────────────────────────────
14+
15+
// generateCommitMap creates a commit map with n entries of 40-char hex SHAs.
16+
func generateCommitMap(n int) map[string]string {
17+
m := make(map[string]string, n)
18+
buf := make([]byte, 20) // 20 bytes = 40 hex chars
19+
for i := 0; i < n; i++ {
20+
rand.Read(buf)
21+
old := hex.EncodeToString(buf)
22+
rand.Read(buf)
23+
new_ := hex.EncodeToString(buf)
24+
m[old] = new_
25+
}
26+
return m
27+
}
28+
29+
// generateJSONWithSHAs creates a JSON byte slice containing numObjects objects,
30+
// each with shaFields fields containing SHAs from the commit map.
31+
// hitRate controls what fraction of SHAs are in the commit map (0.0-1.0).
32+
func generateJSONWithSHAs(commitMap map[string]string, numObjects, shaFields int, hitRate float64) []byte {
33+
// Collect some real keys for hits
34+
keys := make([]string, 0, len(commitMap))
35+
for k := range commitMap {
36+
keys = append(keys, k)
37+
if len(keys) >= numObjects*shaFields {
38+
break
39+
}
40+
}
41+
42+
buf := make([]byte, 20)
43+
objects := make([]map[string]interface{}, numObjects)
44+
hitCount := int(float64(numObjects*shaFields) * hitRate)
45+
idx := 0
46+
for i := 0; i < numObjects; i++ {
47+
obj := map[string]interface{}{
48+
"id": i,
49+
"created_at": "2024-01-15T10:30:00Z",
50+
"url": fmt.Sprintf("https://github.com/org/repo/pull/%d", i),
51+
}
52+
for f := 0; f < shaFields; f++ {
53+
fieldName := fmt.Sprintf("sha_%d", f)
54+
if idx < hitCount && len(keys) > 0 {
55+
obj[fieldName] = keys[idx%len(keys)]
56+
} else {
57+
rand.Read(buf)
58+
obj[fieldName] = hex.EncodeToString(buf)
59+
}
60+
idx++
61+
}
62+
objects[i] = obj
63+
}
64+
65+
data, _ := json.Marshal(objects)
66+
return data
67+
}
68+
69+
// writeJSONFixtureFiles creates numFiles JSON files in dir, each containing
70+
// numObjects objects with SHA fields.
71+
func writeJSONFixtureFiles(tb testing.TB, dir string, prefix string, numFiles, numObjects int, commitMap map[string]string) {
72+
tb.Helper()
73+
for i := 0; i < numFiles; i++ {
74+
name := fmt.Sprintf("%s_%06d.json", prefix, i+1)
75+
data := generateJSONWithSHAs(commitMap, numObjects, 3, 0.5)
76+
if err := os.WriteFile(filepath.Join(dir, name), data, 0644); err != nil {
77+
tb.Fatal(err)
78+
}
79+
}
80+
}
81+
82+
// ── Benchmarks ───────────────────────────────────────────────────────────────
83+
84+
// BenchmarkReplaceSHABytes measures the core byte-sliding window replacement.
85+
// This is the innermost hot loop — called once per metadata file.
86+
func BenchmarkReplaceSHABytes(b *testing.B) {
87+
sizes := []struct {
88+
name string
89+
mapSize int
90+
jsonObjs int
91+
shaFields int
92+
}{
93+
{"small-map/small-json", 100, 50, 2},
94+
{"large-map/small-json", 1_000_000, 50, 2},
95+
{"large-map/medium-json", 1_000_000, 500, 3},
96+
{"large-map/large-json", 1_000_000, 5000, 3},
97+
{"snowflake-scale", 1_834_000, 1000, 4},
98+
}
99+
100+
for _, s := range sizes {
101+
b.Run(s.name, func(b *testing.B) {
102+
commitMap := generateCommitMap(s.mapSize)
103+
data := generateJSONWithSHAs(commitMap, s.jsonObjs, s.shaFields, 0.5)
104+
shaLen := 40
105+
106+
// Pre-allocate a working buffer to avoid measuring make+copy overhead
107+
input := make([]byte, len(data))
108+
b.SetBytes(int64(len(data)))
109+
for b.Loop() {
110+
copy(input, data)
111+
replaceSHABytes(input, commitMap, shaLen)
112+
}
113+
})
114+
}
115+
}
116+
117+
// BenchmarkReplaceSHABytes_NoHits measures scanning overhead when no SHAs match.
118+
func BenchmarkReplaceSHABytes_NoHits(b *testing.B) {
119+
commitMap := generateCommitMap(1_834_000)
120+
data := generateJSONWithSHAs(commitMap, 1000, 4, 0.0)
121+
differentMap := generateCommitMap(1_834_000)
122+
shaLen := 40
123+
124+
input := make([]byte, len(data))
125+
b.SetBytes(int64(len(data)))
126+
for b.Loop() {
127+
copy(input, data)
128+
replaceSHABytes(input, differentMap, shaLen)
129+
}
130+
}
131+
132+
// BenchmarkUpdateMetadataFile measures single-file remap (read + replace + write).
133+
func BenchmarkUpdateMetadataFile(b *testing.B) {
134+
commitMap := generateCommitMap(1_834_000)
135+
shaLen := 40
136+
data := generateJSONWithSHAs(commitMap, 500, 3, 0.5)
137+
138+
dir := b.TempDir()
139+
filePath := filepath.Join(dir, "test_000001.json")
140+
141+
b.SetBytes(int64(len(data)))
142+
for b.Loop() {
143+
os.WriteFile(filePath, data, 0644)
144+
updateMetadataFile(filePath, commitMap, shaLen)
145+
}
146+
}
147+
148+
// BenchmarkParseCommitMap measures parsing a commit-map file.
149+
func BenchmarkParseCommitMap(b *testing.B) {
150+
sizes := []struct {
151+
name string
152+
n int
153+
}{
154+
{"1K", 1_000},
155+
{"100K", 100_000},
156+
{"1M", 1_000_000},
157+
{"1.8M", 1_834_000},
158+
}
159+
160+
for _, s := range sizes {
161+
b.Run(s.name, func(b *testing.B) {
162+
commitMap := generateCommitMap(s.n)
163+
dir := b.TempDir()
164+
filePath := filepath.Join(dir, "commit-map")
165+
166+
// Write commit-map file
167+
f, _ := os.Create(filePath)
168+
fmt.Fprintln(f, "old new")
169+
for old, new_ := range commitMap {
170+
fmt.Fprintf(f, "%s %s\n", old, new_)
171+
}
172+
f.Close()
173+
174+
fi, _ := os.Stat(filePath)
175+
b.SetBytes(fi.Size())
176+
for b.Loop() {
177+
ParseCommitMap(filePath)
178+
}
179+
})
180+
}
181+
}
182+
183+
// BenchmarkProcessFiles measures the full parallel pipeline.
184+
func BenchmarkProcessFiles(b *testing.B) {
185+
configs := []struct {
186+
name string
187+
mapSize int
188+
numFiles int
189+
objPerFile int
190+
workers int
191+
}{
192+
{"10-files/8-workers", 100_000, 10, 200, 8},
193+
{"100-files/8-workers", 100_000, 100, 200, 8},
194+
{"100-files/16-workers", 100_000, 100, 200, 16},
195+
}
196+
197+
for _, c := range configs {
198+
b.Run(c.name, func(b *testing.B) {
199+
commitMap := generateCommitMap(c.mapSize)
200+
201+
// Create fixture dir once
202+
baseDir := b.TempDir()
203+
writeJSONFixtureFiles(b, baseDir, "pull_requests", c.numFiles, c.objPerFile, commitMap)
204+
205+
b.ResetTimer()
206+
for b.Loop() {
207+
b.StopTimer()
208+
writeJSONFixtureFiles(b, baseDir, "pull_requests", c.numFiles, c.objPerFile, commitMap)
209+
b.StartTimer()
210+
211+
ProcessFiles(baseDir, []string{"pull_requests"}, commitMap, c.workers)
212+
}
213+
})
214+
}
215+
}
216+
217+
// BenchmarkIsHexByte measures the hex byte check (called per byte in the hot loop).
218+
func BenchmarkIsHexByte(b *testing.B) {
219+
inputs := []byte("0123456789abcdefABCDEF!@#$%^&*()ghijklmnopqrstuvwxyz")
220+
for b.Loop() {
221+
for _, c := range inputs {
222+
isHexByte(c)
223+
}
224+
}
225+
}

pkg/commitremap/commitremap.go

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ func ParseCommitMap(filePath string) (map[string]string, error) {
4040
return nil, fmt.Errorf("reading commit map %s: %w", filePath, err)
4141
}
4242

43-
for _, line := range strings.Split(string(content), "\n") {
43+
for i, line := range strings.Split(string(content), "\n") {
4444
if strings.TrimSpace(line) == "" {
4545
continue
4646
}
@@ -51,6 +51,11 @@ func ParseCommitMap(filePath string) (map[string]string, error) {
5151
return nil, fmt.Errorf("invalid commit map line: %w", lineErr)
5252
}
5353

54+
// Skip the header line produced by git-filter-repo ("old new")
55+
if i == 0 && fields[0] == "old" && fields[1] == "new" {
56+
continue
57+
}
58+
5459
commitMap[fields[0]] = fields[1]
5560
}
5661

@@ -149,9 +154,21 @@ func updateMetadataFile(filePath string, commitMap map[string]string, shaLen int
149154
return count, nil
150155
}
151156

157+
// hexTable is a branchless lookup table for valid hex bytes.
158+
// A single array index replaces the 6-comparison branch chain in isHexByte,
159+
// improving throughput in the per-byte hot loop.
160+
var hexTable [256]bool
161+
162+
func init() {
163+
for _, b := range []byte("0123456789abcdefABCDEF") {
164+
hexTable[b] = true
165+
}
166+
}
167+
152168
// isHexByte reports whether b is a valid hexadecimal byte (0-9, a-f, A-F).
169+
// Uses a precomputed lookup table for branchless evaluation.
153170
func isHexByte(b byte) bool {
154-
return (b >= '0' && b <= '9') || (b >= 'a' && b <= 'f') || (b >= 'A' && b <= 'F')
171+
return hexTable[b]
155172
}
156173

157174
// commitMapSHALen returns the SHA length common to every key in commitMap.
@@ -182,8 +199,9 @@ func commitMapSHALen(commitMap map[string]string) (int, error) {
182199
// 2. When a non-hex byte is hit, reset the counter — no SHA can span it.
183200
// 3. Once we have shaLen consecutive hex bytes, extract that window and
184201
// look it up in commitMap.
185-
// 4. On match: replace in-place, reset counter to 0. The next window
186-
// starts fresh from the byte after the replacement.
202+
// 4. On match: replace in-place, skip past the replaced bytes. The next
203+
// window starts fresh from the byte after the replacement, avoiding
204+
// re-scanning the bytes we just wrote.
187205
// 5. On no match: keep going. The counter grows past shaLen so the
188206
// window slides forward by one byte each step, checking every
189207
// overlapping shaLen-sized substring. For example with shaLen=40,
@@ -211,8 +229,12 @@ func replaceSHABytes(data []byte, commitMap map[string]string, shaLen int) ([]by
211229
if newSHA, ok := commitMap[candidate]; ok {
212230
copy(data[start:i+1], newSHA)
213231
count++
214-
// Reset so the next window starts after the replacement,
215-
// avoiding re-matching bytes we just wrote.
232+
// Skip past the replaced bytes. Since we just wrote shaLen
233+
// bytes, the next possible SHA starts at i+1. Setting
234+
// consecutiveHex to 0 means the loop will begin counting
235+
// fresh from the next byte without re-scanning the
236+
// replacement. The loop increment (i++) moves us to i+1
237+
// automatically.
216238
consecutiveHex = 0
217239
}
218240
// If no match, consecutiveHex keeps growing and the window

pkg/commitremap/commitremap_test.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,33 @@ func TestParseCommitMap(t *testing.T) {
9191
content: "oldSHA1 newSHA1\noldSHA2 newSHA2 extra\noldSHA3 newSHA3",
9292
errContains: "oldSHA2 newSHA2 extra",
9393
},
94+
{
95+
name: "skips git-filter-repo header line",
96+
content: "old new\n" +
97+
"abc123 def456\n" +
98+
"ghi789 jkl012",
99+
expected: map[string]string{
100+
"abc123": "def456",
101+
"ghi789": "jkl012",
102+
},
103+
},
104+
{
105+
name: "header with trailing CRLF",
106+
content: "old new\r\n" +
107+
"abc123 def456\r\n",
108+
expected: map[string]string{
109+
"abc123": "def456",
110+
},
111+
},
112+
{
113+
name: "old new as data when not on first line",
114+
content: "abc123 def456\n" +
115+
"old new",
116+
expected: map[string]string{
117+
"abc123": "def456",
118+
"old": "new",
119+
},
120+
},
94121
}
95122

96123
for _, tt := range tests {

0 commit comments

Comments
 (0)