22package commitremap
33
44import (
5- "encoding/json"
65 "fmt"
76 "os"
87 "path/filepath"
@@ -60,15 +59,20 @@ func ParseCommitMap(filePath string) (map[string]string, error) {
6059
6160// ProcessFiles rewrites SHAs in JSON metadata files matching <prefix>_*.json inside archiveDir.
6261//
63- // Each file is walked once, replacing string values that exactly match a key in
64- // commitMap. Only whole-string SHA values are replaced . SHAs embedded in URLs,
65- // markdown, or composite strings are not rewritten .
62+ // Each file is scanned byte-by-byte using a sliding window that matches
63+ // SHA-length hex sequences against the commit map . SHAs are replaced
64+ // wherever they appear — including inside URLs, markdown, or composite strings.
6665//
6766// numWorkers controls how many goroutines process files in parallel.
6867// If numWorkers <= 0, it defaults to runtime.NumCPU().
6968func ProcessFiles (archiveDir string , prefixes []string , commitMap map [string ]string , numWorkers int ) (Stats , error ) {
7069 stats := Stats {PerFile : make (map [string ]int )}
7170
71+ shaLen , err := commitMapSHALen (commitMap )
72+ if err != nil {
73+ return stats , fmt .Errorf ("validating commit map: %w" , err )
74+ }
75+
7276 if numWorkers <= 0 {
7377 numWorkers = runtime .NumCPU ()
7478 }
@@ -105,7 +109,7 @@ func ProcessFiles(archiveDir string, prefixes []string, commitMap map[string]str
105109 go func () {
106110 defer wg .Done ()
107111 for idx := range workCh {
108- n , err := updateMetadataFile (allFiles [idx ], commitMap )
112+ n , err := updateMetadataFile (allFiles [idx ], commitMap , shaLen )
109113 results [idx ] = fileResult {file : allFiles [idx ], count : n , err : err }
110114 }
111115 }()
@@ -126,67 +130,97 @@ func ProcessFiles(archiveDir string, prefixes []string, commitMap map[string]str
126130 return stats , firstErr
127131}
128132
129- func updateMetadataFile (filePath string , commitMap map [string ]string ) (int , error ) {
133+ func updateMetadataFile (filePath string , commitMap map [string ]string , shaLen int ) (int , error ) {
130134 data , err := os .ReadFile (filePath )
131135 if err != nil {
132136 return 0 , fmt .Errorf ("reading data: %w" , err )
133137 }
134138
135- var dataMap interface {}
136- err = json .Unmarshal (data , & dataMap )
137- if err != nil {
138- return 0 , fmt .Errorf ("unmarshaling data: %w" , err )
139- }
140-
141- count := replaceSHA (dataMap , commitMap )
139+ data , count := replaceSHABytes (data , commitMap , shaLen )
142140 if count == 0 {
143141 return 0 , nil
144142 }
145143
146- updatedData , err := json .MarshalIndent (dataMap , "" , " " )
147- if err != nil {
148- return count , fmt .Errorf ("marshaling updated data: %w" , err )
149- }
150-
151- err = os .WriteFile (filePath , updatedData , 0644 )
144+ err = os .WriteFile (filePath , data , 0644 )
152145 if err != nil {
153146 return count , fmt .Errorf ("writing updated data: %w" , err )
154147 }
155148
156149 return count , nil
157150}
158151
159- // replaceSHA walks data in place, rewriting whole-string values that match a
160- // key in commitMap. It returns the number of replacements performed.
161- func replaceSHA (data interface {}, commitMap map [string ]string ) int {
162- count := 0
163- switch v := data .(type ) {
164- case map [string ]interface {}:
165- for key , value := range v {
166- if str , ok := value .(string ); ok {
167- if newSHA , hit := commitMap [str ]; hit {
168- v [key ] = newSHA
169- count ++
170- }
171- continue
// isHexByte reports whether b is an ASCII hexadecimal digit: '0'-'9',
// 'a'-'f', or 'A'-'F'. Both letter cases are accepted.
func isHexByte(b byte) bool {
	switch {
	case b >= '0' && b <= '9':
		return true
	case b >= 'a' && b <= 'f':
		return true
	case b >= 'A' && b <= 'F':
		return true
	default:
		return false
	}
}
156+
// commitMapSHALen returns the SHA length shared by every key and value in
// commitMap.
//
// It returns an error when the map is empty (or nil), when the first key
// visited is the empty string, or when any key or value differs in length
// from the first key visited. Map iteration order is unspecified, so for a
// map with mixed lengths the "expected" length quoted in the error may vary
// between runs; an error is returned either way.
func commitMapSHALen(commitMap map[string]string) (int, error) {
	if len(commitMap) == 0 {
		return 0, fmt.Errorf("commit map is empty")
	}

	shaLen := -1 // -1 means no entry has been inspected yet
	for oldSHA, newSHA := range commitMap {
		if shaLen < 0 {
			// The first entry visited fixes the reference length.
			if oldSHA == "" {
				return 0, fmt.Errorf("commit map contains an empty key")
			}
			shaLen = len(oldSHA)
		}
		if len(oldSHA) != shaLen || len(newSHA) != shaLen {
			return 0, fmt.Errorf("commit map SHAs have inconsistent lengths: expected %d, got key len %d / value len %d", shaLen, len(oldSHA), len(newSHA))
		}
	}
	return shaLen, nil
}
173177
174- count += replaceSHA (value , commitMap )
178+ // replaceSHABytes scans data byte-by-byte using a sliding window of shaLen.
179+ //
180+ // Algorithm:
181+ // 1. Walk each byte, counting consecutive valid hex (SHA) bytes.
182+ // 2. When a non-hex byte is hit, reset the counter — no SHA can span it.
183+ // 3. Once we have shaLen consecutive hex bytes, extract that window and
184+ // look it up in commitMap.
185+ // 4. On match: replace in-place, reset counter to 0. The next window
186+ // starts fresh from the byte after the replacement.
187+ // 5. On no match: keep going. The counter grows past shaLen so the
188+ // window slides forward by one byte each step, checking every
189+ // overlapping shaLen-sized substring. For example with shaLen=40,
190+ // if bytes 0–39 don't match, bytes 1–40 are checked next, etc.
191+ //
192+ // Returns the (potentially modified) byte slice and the replacement count.
193+ func replaceSHABytes (data []byte , commitMap map [string ]string , shaLen int ) ([]byte , int ) {
194+ count := 0
195+ consecutiveHex := 0
196+
197+ for i := 0 ; i < len (data ); i ++ {
198+ if isHexByte (data [i ]) {
199+ consecutiveHex ++
200+ } else {
201+ // Non-hex byte breaks any potential SHA sequence.
202+ consecutiveHex = 0
203+ continue
175204 }
176- case []interface {}:
177- for i , value := range v {
178- if str , ok := value .(string ); ok {
179- if newSHA , hit := commitMap [str ]; hit {
180- v [i ] = newSHA
181- count ++
182- }
183- continue
184- }
185205
186- count += replaceSHA (value , commitMap )
206+ // Once we have enough consecutive hex bytes, check if the last
207+ // shaLen bytes match an entry in the commit map.
208+ if consecutiveHex >= shaLen {
209+ start := i - shaLen + 1
210+ candidate := string (data [start : i + 1 ])
211+ if newSHA , ok := commitMap [candidate ]; ok {
212+ copy (data [start :i + 1 ], newSHA )
213+ count ++
214+ // Reset so the next window starts after the replacement,
215+ // avoiding re-matching bytes we just wrote.
216+ consecutiveHex = 0
217+ }
218+ // If no match, consecutiveHex keeps growing and the window
219+ // slides forward on the next iteration.
187220 }
188221 }
189- return count
222+
223+ return data , count
190224}
191225
192226// summarize the work performed by a ProcessFiles call.
0 commit comments