|
| 1 | +package commitremap |
| 2 | + |
| 3 | +import ( |
| 4 | + "crypto/rand" |
| 5 | + "encoding/hex" |
| 6 | + "encoding/json" |
| 7 | + "fmt" |
| 8 | + "os" |
| 9 | + "path/filepath" |
| 10 | + "testing" |
| 11 | +) |
| 12 | + |
| 13 | +// ── Helpers ────────────────────────────────────────────────────────────────── |
| 14 | + |
// generateCommitMap builds a synthetic old-SHA -> new-SHA mapping for use as
// benchmark input. It performs n insertions; every key and value is 20 random
// bytes rendered as a 40-character lowercase hex string, so the result has n
// entries unless two random keys collide.
func generateCommitMap(n int) map[string]string {
	commits := make(map[string]string, n)
	raw := make([]byte, 20) // 20 bytes = 40 hex chars

	// randomSHA refills the shared scratch buffer and hex-encodes it.
	randomSHA := func() string {
		rand.Read(raw)
		return hex.EncodeToString(raw)
	}

	for remaining := n; remaining > 0; remaining-- {
		oldSHA := randomSHA()
		newSHA := randomSHA()
		commits[oldSHA] = newSHA
	}
	return commits
}
| 28 | + |
// generateJSONWithSHAs returns the JSON encoding of an array of numObjects
// objects. Each object carries fixed "id", "created_at" and "url" fields plus
// shaFields fields named "sha_0", "sha_1", ... holding 40-char hex SHAs.
//
// hitRate (0.0-1.0) is the fraction of all SHA fields that are filled with
// keys taken from commitMap. Hits are not interleaved: the first
// int(numObjects*shaFields*hitRate) fields (in object order) are hits and all
// remaining fields get freshly generated random SHAs. If commitMap holds
// fewer keys than there are hits, keys are reused round-robin; if it is empty
// (or nil) every field is random regardless of hitRate.
//
// The function panics if the fixture cannot be marshalled, which would
// indicate a bug in this helper rather than a runtime condition.
func generateJSONWithSHAs(commitMap map[string]string, numObjects, shaFields int, hitRate float64) []byte {
	totalFields := numObjects * shaFields

	// At most max(totalFields, 1) keys are ever collected below, so reserve
	// only that much instead of room for the whole (possibly multi-million
	// entry) commit map.
	keyCap := totalFields
	if keyCap < 1 {
		keyCap = 1
	}
	if keyCap > len(commitMap) {
		keyCap = len(commitMap)
	}

	// Collect some real keys for hits.
	keys := make([]string, 0, keyCap)
	for k := range commitMap {
		keys = append(keys, k)
		if len(keys) >= totalFields {
			break
		}
	}

	buf := make([]byte, 20) // 20 bytes = 40 hex chars
	objects := make([]map[string]any, numObjects)
	hitCount := int(float64(totalFields) * hitRate)
	idx := 0 // running index over every SHA field across all objects
	for i := 0; i < numObjects; i++ {
		obj := map[string]any{
			"id":         i,
			"created_at": "2024-01-15T10:30:00Z",
			"url":        fmt.Sprintf("https://github.com/org/repo/pull/%d", i),
		}
		for f := 0; f < shaFields; f++ {
			fieldName := fmt.Sprintf("sha_%d", f)
			if idx < hitCount && len(keys) > 0 {
				obj[fieldName] = keys[idx%len(keys)]
			} else {
				rand.Read(buf)
				obj[fieldName] = hex.EncodeToString(buf)
			}
			idx++
		}
		objects[i] = obj
	}

	data, err := json.Marshal(objects)
	if err != nil {
		// Only ints and strings are marshalled, so this should be unreachable;
		// fail loudly rather than hand back an empty fixture.
		panic(fmt.Sprintf("generateJSONWithSHAs: marshal fixture: %v", err))
	}
	return data
}
| 68 | + |
| 69 | +// writeJSONFixtureFiles creates numFiles JSON files in dir, each containing |
| 70 | +// numObjects objects with SHA fields. |
| 71 | +func writeJSONFixtureFiles(tb testing.TB, dir string, prefix string, numFiles, numObjects int, commitMap map[string]string) { |
| 72 | + tb.Helper() |
| 73 | + for i := 0; i < numFiles; i++ { |
| 74 | + name := fmt.Sprintf("%s_%06d.json", prefix, i+1) |
| 75 | + data := generateJSONWithSHAs(commitMap, numObjects, 3, 0.5) |
| 76 | + if err := os.WriteFile(filepath.Join(dir, name), data, 0644); err != nil { |
| 77 | + tb.Fatal(err) |
| 78 | + } |
| 79 | + } |
| 80 | +} |
| 81 | + |
| 82 | +// ── Benchmarks ─────────────────────────────────────────────────────────────── |
| 83 | + |
| 84 | +// BenchmarkReplaceSHABytes measures the core byte-sliding window replacement. |
| 85 | +// This is the innermost hot loop — called once per metadata file. |
| 86 | +func BenchmarkReplaceSHABytes(b *testing.B) { |
| 87 | + sizes := []struct { |
| 88 | + name string |
| 89 | + mapSize int |
| 90 | + jsonObjs int |
| 91 | + shaFields int |
| 92 | + }{ |
| 93 | + {"small-map/small-json", 100, 50, 2}, |
| 94 | + {"large-map/small-json", 1_000_000, 50, 2}, |
| 95 | + {"large-map/medium-json", 1_000_000, 500, 3}, |
| 96 | + {"large-map/large-json", 1_000_000, 5000, 3}, |
| 97 | + {"snowflake-scale", 1_834_000, 1000, 4}, |
| 98 | + } |
| 99 | + |
| 100 | + for _, s := range sizes { |
| 101 | + b.Run(s.name, func(b *testing.B) { |
| 102 | + commitMap := generateCommitMap(s.mapSize) |
| 103 | + data := generateJSONWithSHAs(commitMap, s.jsonObjs, s.shaFields, 0.5) |
| 104 | + shaLen := 40 |
| 105 | + |
| 106 | + // Pre-allocate a working buffer to avoid measuring make+copy overhead |
| 107 | + input := make([]byte, len(data)) |
| 108 | + b.SetBytes(int64(len(data))) |
| 109 | + for b.Loop() { |
| 110 | + copy(input, data) |
| 111 | + replaceSHABytes(input, commitMap, shaLen) |
| 112 | + } |
| 113 | + }) |
| 114 | + } |
| 115 | +} |
| 116 | + |
| 117 | +// BenchmarkReplaceSHABytes_NoHits measures scanning overhead when no SHAs match. |
| 118 | +func BenchmarkReplaceSHABytes_NoHits(b *testing.B) { |
| 119 | + commitMap := generateCommitMap(1_834_000) |
| 120 | + data := generateJSONWithSHAs(commitMap, 1000, 4, 0.0) |
| 121 | + differentMap := generateCommitMap(1_834_000) |
| 122 | + shaLen := 40 |
| 123 | + |
| 124 | + input := make([]byte, len(data)) |
| 125 | + b.SetBytes(int64(len(data))) |
| 126 | + for b.Loop() { |
| 127 | + copy(input, data) |
| 128 | + replaceSHABytes(input, differentMap, shaLen) |
| 129 | + } |
| 130 | +} |
| 131 | + |
| 132 | +// BenchmarkUpdateMetadataFile measures single-file remap (read + replace + write). |
| 133 | +func BenchmarkUpdateMetadataFile(b *testing.B) { |
| 134 | + commitMap := generateCommitMap(1_834_000) |
| 135 | + shaLen := 40 |
| 136 | + data := generateJSONWithSHAs(commitMap, 500, 3, 0.5) |
| 137 | + |
| 138 | + dir := b.TempDir() |
| 139 | + filePath := filepath.Join(dir, "test_000001.json") |
| 140 | + |
| 141 | + b.SetBytes(int64(len(data))) |
| 142 | + for b.Loop() { |
| 143 | + os.WriteFile(filePath, data, 0644) |
| 144 | + updateMetadataFile(filePath, commitMap, shaLen) |
| 145 | + } |
| 146 | +} |
| 147 | + |
| 148 | +// BenchmarkParseCommitMap measures parsing a commit-map file. |
| 149 | +func BenchmarkParseCommitMap(b *testing.B) { |
| 150 | + sizes := []struct { |
| 151 | + name string |
| 152 | + n int |
| 153 | + }{ |
| 154 | + {"1K", 1_000}, |
| 155 | + {"100K", 100_000}, |
| 156 | + {"1M", 1_000_000}, |
| 157 | + {"1.8M", 1_834_000}, |
| 158 | + } |
| 159 | + |
| 160 | + for _, s := range sizes { |
| 161 | + b.Run(s.name, func(b *testing.B) { |
| 162 | + commitMap := generateCommitMap(s.n) |
| 163 | + dir := b.TempDir() |
| 164 | + filePath := filepath.Join(dir, "commit-map") |
| 165 | + |
| 166 | + // Write commit-map file |
| 167 | + f, _ := os.Create(filePath) |
| 168 | + fmt.Fprintln(f, "old new") |
| 169 | + for old, new_ := range commitMap { |
| 170 | + fmt.Fprintf(f, "%s %s\n", old, new_) |
| 171 | + } |
| 172 | + f.Close() |
| 173 | + |
| 174 | + fi, _ := os.Stat(filePath) |
| 175 | + b.SetBytes(fi.Size()) |
| 176 | + for b.Loop() { |
| 177 | + ParseCommitMap(filePath) |
| 178 | + } |
| 179 | + }) |
| 180 | + } |
| 181 | +} |
| 182 | + |
| 183 | +// BenchmarkProcessFiles measures the full parallel pipeline. |
| 184 | +func BenchmarkProcessFiles(b *testing.B) { |
| 185 | + configs := []struct { |
| 186 | + name string |
| 187 | + mapSize int |
| 188 | + numFiles int |
| 189 | + objPerFile int |
| 190 | + workers int |
| 191 | + }{ |
| 192 | + {"10-files/8-workers", 100_000, 10, 200, 8}, |
| 193 | + {"100-files/8-workers", 100_000, 100, 200, 8}, |
| 194 | + {"100-files/16-workers", 100_000, 100, 200, 16}, |
| 195 | + } |
| 196 | + |
| 197 | + for _, c := range configs { |
| 198 | + b.Run(c.name, func(b *testing.B) { |
| 199 | + commitMap := generateCommitMap(c.mapSize) |
| 200 | + |
| 201 | + // Create fixture dir once |
| 202 | + baseDir := b.TempDir() |
| 203 | + writeJSONFixtureFiles(b, baseDir, "pull_requests", c.numFiles, c.objPerFile, commitMap) |
| 204 | + |
| 205 | + b.ResetTimer() |
| 206 | + for b.Loop() { |
| 207 | + b.StopTimer() |
| 208 | + writeJSONFixtureFiles(b, baseDir, "pull_requests", c.numFiles, c.objPerFile, commitMap) |
| 209 | + b.StartTimer() |
| 210 | + |
| 211 | + ProcessFiles(baseDir, []string{"pull_requests"}, commitMap, c.workers) |
| 212 | + } |
| 213 | + }) |
| 214 | + } |
| 215 | +} |
| 216 | + |
| 217 | +// BenchmarkIsHexByte measures the hex byte check (called per byte in the hot loop). |
| 218 | +func BenchmarkIsHexByte(b *testing.B) { |
| 219 | + inputs := []byte("0123456789abcdefABCDEF!@#$%^&*()ghijklmnopqrstuvwxyz") |
| 220 | + for b.Loop() { |
| 221 | + for _, c := range inputs { |
| 222 | + isHexByte(c) |
| 223 | + } |
| 224 | + } |
| 225 | +} |
0 commit comments