Skip to content

Commit b21eff5

Browse files
committed
Improve SHA replacement logic with a sliding byte-window comparison; add tests for the new functionality
1 parent f66d96e commit b21eff5

2 files changed

Lines changed: 296 additions & 111 deletions

File tree

pkg/commitremap/commitremap.go

Lines changed: 78 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
package commitremap
33

44
import (
5-
"encoding/json"
65
"fmt"
76
"os"
87
"path/filepath"
@@ -60,15 +59,20 @@ func ParseCommitMap(filePath string) (map[string]string, error) {
6059

6160
// ProcessFiles rewrites SHAs in JSON metadata files matching <prefix>_*.json inside archiveDir.
6261
//
63-
// Each file is walked once, replacing string values that exactly match a key in
64-
// commitMap. Only whole-string SHA values are replaced. SHAs embedded in URLs,
65-
// markdown, or composite strings are not rewritten.
62+
// Each file is scanned byte-by-byte using a sliding window that matches
63+
// SHA-length hex sequences against the commit map. SHAs are replaced
64+
// wherever they appear — including inside URLs, markdown, or composite strings.
6665
//
6766
// numWorkers controls how many goroutines process files in parallel.
6867
// If numWorkers <= 0, it defaults to runtime.NumCPU().
6968
func ProcessFiles(archiveDir string, prefixes []string, commitMap map[string]string, numWorkers int) (Stats, error) {
7069
stats := Stats{PerFile: make(map[string]int)}
7170

71+
shaLen, err := commitMapSHALen(commitMap)
72+
if err != nil {
73+
return stats, fmt.Errorf("validating commit map: %w", err)
74+
}
75+
7276
if numWorkers <= 0 {
7377
numWorkers = runtime.NumCPU()
7478
}
@@ -105,7 +109,7 @@ func ProcessFiles(archiveDir string, prefixes []string, commitMap map[string]str
105109
go func() {
106110
defer wg.Done()
107111
for idx := range workCh {
108-
n, err := updateMetadataFile(allFiles[idx], commitMap)
112+
n, err := updateMetadataFile(allFiles[idx], commitMap, shaLen)
109113
results[idx] = fileResult{file: allFiles[idx], count: n, err: err}
110114
}
111115
}()
@@ -126,67 +130,97 @@ func ProcessFiles(archiveDir string, prefixes []string, commitMap map[string]str
126130
return stats, firstErr
127131
}
128132

129-
func updateMetadataFile(filePath string, commitMap map[string]string) (int, error) {
133+
func updateMetadataFile(filePath string, commitMap map[string]string, shaLen int) (int, error) {
130134
data, err := os.ReadFile(filePath)
131135
if err != nil {
132136
return 0, fmt.Errorf("reading data: %w", err)
133137
}
134138

135-
var dataMap interface{}
136-
err = json.Unmarshal(data, &dataMap)
137-
if err != nil {
138-
return 0, fmt.Errorf("unmarshaling data: %w", err)
139-
}
140-
141-
count := replaceSHA(dataMap, commitMap)
139+
data, count := replaceSHABytes(data, commitMap, shaLen)
142140
if count == 0 {
143141
return 0, nil
144142
}
145143

146-
updatedData, err := json.MarshalIndent(dataMap, "", " ")
147-
if err != nil {
148-
return count, fmt.Errorf("marshaling updated data: %w", err)
149-
}
150-
151-
err = os.WriteFile(filePath, updatedData, 0644)
144+
err = os.WriteFile(filePath, data, 0644)
152145
if err != nil {
153146
return count, fmt.Errorf("writing updated data: %w", err)
154147
}
155148

156149
return count, nil
157150
}
158151

159-
// replaceSHA walks data in place, rewriting whole-string values that match a
160-
// key in commitMap. It returns the number of replacements performed.
161-
func replaceSHA(data interface{}, commitMap map[string]string) int {
162-
count := 0
163-
switch v := data.(type) {
164-
case map[string]interface{}:
165-
for key, value := range v {
166-
if str, ok := value.(string); ok {
167-
if newSHA, hit := commitMap[str]; hit {
168-
v[key] = newSHA
169-
count++
170-
}
171-
continue
152+
// isHexByte reports whether b is an ASCII hexadecimal digit: 0-9, a-f, or
// A-F.
func isHexByte(b byte) bool {
	switch {
	case '0' <= b && b <= '9':
		return true
	case 'a' <= b && b <= 'f':
		return true
	case 'A' <= b && b <= 'F':
		return true
	default:
		return false
	}
}
156+
157+
// commitMapSHALen returns the byte length shared by every old SHA (key) and
// new SHA (value) in commitMap.
//
// It returns an error when commitMap is empty, when any key is the empty
// string, or when any key or value differs in length from the first entry
// visited. Every key is checked for emptiness before its length is compared,
// so an empty key in an otherwise consistent map is always reported as an
// empty key rather than as a length mismatch, regardless of map iteration
// order.
func commitMapSHALen(commitMap map[string]string) (int, error) {
	if len(commitMap) == 0 {
		return 0, fmt.Errorf("commit map is empty")
	}

	shaLen := 0
	for oldSHA, newSHA := range commitMap {
		if len(oldSHA) == 0 {
			return 0, fmt.Errorf("commit map contains an empty key")
		}
		if shaLen == 0 {
			// The first entry visited establishes the expected length.
			shaLen = len(oldSHA)
		}
		if len(oldSHA) != shaLen || len(newSHA) != shaLen {
			return 0, fmt.Errorf("commit map SHAs have inconsistent lengths: expected %d, got key len %d / value len %d", shaLen, len(oldSHA), len(newSHA))
		}
	}

	return shaLen, nil
}
173177

174-
count += replaceSHA(value, commitMap)
178+
// replaceSHABytes scans data byte-by-byte using a sliding window of shaLen.
179+
//
180+
// Algorithm:
181+
// 1. Walk each byte, counting consecutive valid hex (SHA) bytes.
182+
// 2. When a non-hex byte is hit, reset the counter — no SHA can span it.
183+
// 3. Once we have shaLen consecutive hex bytes, extract that window and
184+
// look it up in commitMap.
185+
// 4. On match: replace in-place, reset counter to 0. The next window
186+
// starts fresh from the byte after the replacement.
187+
// 5. On no match: keep going. The counter grows past shaLen so the
188+
// window slides forward by one byte each step, checking every
189+
// overlapping shaLen-sized substring. For example with shaLen=40,
190+
// if bytes 0–39 don't match, bytes 1–40 are checked next, etc.
191+
//
192+
// Returns the (potentially modified) byte slice and the replacement count.
193+
func replaceSHABytes(data []byte, commitMap map[string]string, shaLen int) ([]byte, int) {
194+
count := 0
195+
consecutiveHex := 0
196+
197+
for i := 0; i < len(data); i++ {
198+
if isHexByte(data[i]) {
199+
consecutiveHex++
200+
} else {
201+
// Non-hex byte breaks any potential SHA sequence.
202+
consecutiveHex = 0
203+
continue
175204
}
176-
case []interface{}:
177-
for i, value := range v {
178-
if str, ok := value.(string); ok {
179-
if newSHA, hit := commitMap[str]; hit {
180-
v[i] = newSHA
181-
count++
182-
}
183-
continue
184-
}
185205

186-
count += replaceSHA(value, commitMap)
206+
// Once we have enough consecutive hex bytes, check if the last
207+
// shaLen bytes match an entry in the commit map.
208+
if consecutiveHex >= shaLen {
209+
start := i - shaLen + 1
210+
candidate := string(data[start : i+1])
211+
if newSHA, ok := commitMap[candidate]; ok {
212+
copy(data[start:i+1], newSHA)
213+
count++
214+
// Reset so the next window starts after the replacement,
215+
// avoiding re-matching bytes we just wrote.
216+
consecutiveHex = 0
217+
}
218+
// If no match, consecutiveHex keeps growing and the window
219+
// slides forward on the next iteration.
187220
}
188221
}
189-
return count
222+
223+
return data, count
190224
}
191225

192226
// summarize the work performed by a ProcessFiles call.

0 commit comments

Comments
 (0)