Skip to content

Commit 724695d

Browse files
committed
Added tar streaming and parllel gzip
1 parent 30171da commit 724695d

6 files changed

Lines changed: 530 additions & 2 deletions

File tree

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ module github.com/mona-actions/gh-commit-remap
33
go 1.25
44

55
require (
6+
github.com/klauspost/pgzip v1.2.6
67
github.com/pterm/pterm v0.12.83
78
github.com/spf13/cobra v1.8.1
89
)
@@ -15,6 +16,7 @@ require (
1516
github.com/containerd/console v1.0.5 // indirect
1617
github.com/gookit/color v1.6.0 // indirect
1718
github.com/inconshreveable/mousetrap v1.1.0 // indirect
19+
github.com/klauspost/compress v1.18.6 // indirect
1820
github.com/lithammer/fuzzysearch v1.1.8 // indirect
1921
github.com/mattn/go-runewidth v0.0.20 // indirect
2022
github.com/spf13/pflag v1.0.5 // indirect

go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,15 @@ github.com/gookit/color v1.6.0 h1:JjJXBTk1ETNyqyilJhkTXJYYigHG24TM9Xa2M1xAhRA=
3333
github.com/gookit/color v1.6.0/go.mod h1:9ACFc7/1IpHGBW8RwuDm/0YEnhg3dwwXpoMsmtyHfjs=
3434
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
3535
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
36+
github.com/klauspost/compress v1.18.6 h1:2jupLlAwFm95+YDR+NwD2MEfFO9d4z4Prjl1XXDjuao=
37+
github.com/klauspost/compress v1.18.6/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ=
3638
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
3739
github.com/klauspost/cpuid/v2 v2.0.10/go.mod h1:g2LTdtYhdyuGPqyWyv7qRAmj1WBqxuObKfj5c0PQa7c=
3840
github.com/klauspost/cpuid/v2 v2.0.12/go.mod h1:g2LTdtYhdyuGPqyWyv7qRAmj1WBqxuObKfj5c0PQa7c=
3941
github.com/klauspost/cpuid/v2 v2.2.3 h1:sxCkb+qR91z4vsqw4vGGZlDgPz3G7gjaLyK3V8y70BU=
4042
github.com/klauspost/cpuid/v2 v2.2.3/go.mod h1:RVVoqg1df56z8g3pUjL/3lE5UfnlrJX8tyFgg4nqhuY=
43+
github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU=
44+
github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
4145
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
4246
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
4347
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=

pkg/archive/archive.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@ import (
88
"io/fs"
99
"os"
1010
"path/filepath"
11+
"runtime"
1112
"strings"
13+
14+
pgzip "github.com/klauspost/pgzip"
1215
)
1316

1417
// UnTar decompresses a .tar.gz file into destDir, returning the directory containing the extracted contents.
@@ -147,7 +150,8 @@ func ReTarDir(srcDir, outPath string) (retErr error) {
147150
if err != nil {
148151
return fmt.Errorf("failed to create archive: %w", err)
149152
}
150-
gzipWriter := gzip.NewWriter(outFile)
153+
gzipWriter, _ := pgzip.NewWriterLevel(outFile, pgzip.BestSpeed)
154+
gzipWriter.SetConcurrency(256<<10, runtime.NumCPU())
151155
tarWriter := tar.NewWriter(gzipWriter)
152156

153157
// The success path closes each writer explicitly (in tar -> gzip -> file order)

pkg/commitremap/benchmark_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ func BenchmarkReplaceSHABytes(b *testing.B) {
9494
{"large-map/small-json", 1_000_000, 50, 2},
9595
{"large-map/medium-json", 1_000_000, 500, 3},
9696
{"large-map/large-json", 1_000_000, 5000, 3},
97-
{"snowflake-scale", 1_834_000, 1000, 4},
97+
{"monorepo-scale", 1_834_000, 1000, 4},
9898
}
9999

100100
for _, s := range sizes {

pkg/commitremap/stream.go

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
package commitremap
2+
3+
import (
4+
"archive/tar"
5+
"compress/gzip"
6+
"fmt"
7+
"io"
8+
"os"
9+
"path/filepath"
10+
"runtime"
11+
"strings"
12+
13+
pgzip "github.com/klauspost/pgzip"
14+
)
15+
16+
const maxMatchedFileSize = 2 << 30 // 2 GB guard for matched entries
17+
18+
// StreamRemap reads a .tar.gz migration archive, remaps SHAs in matching
19+
// JSON metadata files in-flight, and writes the result to a new .tar.gz.
20+
// This eliminates the extract→modify→retar cycle entirely.
21+
//
22+
// Files whose base name matches "<prefix>_<digits>.json" for any prefix in
23+
// prefixes are read into memory, processed by replaceSHABytes, and written
24+
// back. All other entries are streamed through unchanged.
25+
//
26+
// The output archive uses parallel gzip (pgzip) at BestSpeed for fast
27+
// compression. Tar entry order from the input is preserved.
28+
func StreamRemap(inArchive, outArchive string, commitMap map[string]string, prefixes []string) (Stats, error) {
29+
stats := Stats{PerFile: make(map[string]int)}
30+
31+
if len(commitMap) == 0 {
32+
return stats, fmt.Errorf("commit map is empty; nothing to remap")
33+
}
34+
35+
shaLen, err := commitMapSHALen(commitMap)
36+
if err != nil {
37+
return stats, fmt.Errorf("validating commit map: %w", err)
38+
}
39+
40+
prefixSet := make(map[string]bool, len(prefixes))
41+
for _, p := range prefixes {
42+
prefixSet[p] = true
43+
}
44+
45+
// Open input tar.gz
46+
inFile, err := os.Open(inArchive)
47+
if err != nil {
48+
return stats, fmt.Errorf("open input archive: %w", err)
49+
}
50+
defer inFile.Close()
51+
52+
gzReader, err := gzip.NewReader(inFile)
53+
if err != nil {
54+
return stats, fmt.Errorf("create gzip reader: %w", err)
55+
}
56+
defer gzReader.Close()
57+
58+
tarReader := tar.NewReader(gzReader)
59+
60+
// Open output tar.gz
61+
outFile, err := os.Create(outArchive)
62+
if err != nil {
63+
return stats, fmt.Errorf("create output archive: %w", err)
64+
}
65+
66+
gzWriter, _ := pgzip.NewWriterLevel(outFile, pgzip.BestSpeed)
67+
gzWriter.SetConcurrency(256<<10, runtime.NumCPU())
68+
tarWriter := tar.NewWriter(gzWriter)
69+
70+
var tarClosed, gzClosed, fileClosed bool
71+
cleanup := func(retErr error) {
72+
if !tarClosed {
73+
_ = tarWriter.Close()
74+
}
75+
if !gzClosed {
76+
_ = gzWriter.Close()
77+
}
78+
if !fileClosed {
79+
_ = outFile.Close()
80+
}
81+
if retErr != nil {
82+
_ = os.Remove(outArchive)
83+
}
84+
}
85+
86+
for {
87+
header, err := tarReader.Next()
88+
if err == io.EOF {
89+
break
90+
}
91+
if err != nil {
92+
cleanup(err)
93+
return stats, fmt.Errorf("reading tar entry: %w", err)
94+
}
95+
96+
// Safety validation (matching UnTar/ReTarDir behavior)
97+
if filepath.IsAbs(header.Name) {
98+
err := fmt.Errorf("archive entry %q has absolute path", header.Name)
99+
cleanup(err)
100+
return stats, err
101+
}
102+
if pathHasParentRef(header.Name) {
103+
err := fmt.Errorf("archive entry %q escapes destination", header.Name)
104+
cleanup(err)
105+
return stats, err
106+
}
107+
if header.Typeflag != tar.TypeDir && header.Typeflag != tar.TypeReg {
108+
err := fmt.Errorf("unsupported tar entry type %d for %q", header.Typeflag, header.Name)
109+
cleanup(err)
110+
return stats, err
111+
}
112+
113+
// Clone the header to avoid aliasing
114+
hdr := *header
115+
116+
if hdr.Typeflag == tar.TypeDir {
117+
if err := tarWriter.WriteHeader(&hdr); err != nil {
118+
cleanup(err)
119+
return stats, fmt.Errorf("writing dir header %q: %w", hdr.Name, err)
120+
}
121+
continue
122+
}
123+
124+
// Check if this file matches a SHA-bearing prefix
125+
if hdr.Size >= 0 && shouldRemap(hdr.Name, prefixSet) {
126+
if hdr.Size > maxMatchedFileSize {
127+
err := fmt.Errorf("matched file %q is too large (%d bytes, max %d)", hdr.Name, hdr.Size, maxMatchedFileSize)
128+
cleanup(err)
129+
return stats, err
130+
}
131+
132+
data, err := io.ReadAll(tarReader)
133+
if err != nil {
134+
cleanup(err)
135+
return stats, fmt.Errorf("reading matched file %q: %w", hdr.Name, err)
136+
}
137+
138+
data, count := replaceSHABytes(data, commitMap, shaLen)
139+
stats.FilesScanned++
140+
if count > 0 {
141+
stats.PerFile[hdr.Name] = count
142+
}
143+
144+
// Size is unchanged (same-length SHA replacement), but set it
145+
// from actual data length for correctness.
146+
hdr.Size = int64(len(data))
147+
148+
if err := tarWriter.WriteHeader(&hdr); err != nil {
149+
cleanup(err)
150+
return stats, fmt.Errorf("writing header for %q: %w", hdr.Name, err)
151+
}
152+
if _, err := tarWriter.Write(data); err != nil {
153+
cleanup(err)
154+
return stats, fmt.Errorf("writing data for %q: %w", hdr.Name, err)
155+
}
156+
} else {
157+
// Pass through unchanged
158+
if err := tarWriter.WriteHeader(&hdr); err != nil {
159+
cleanup(err)
160+
return stats, fmt.Errorf("writing passthrough header %q: %w", hdr.Name, err)
161+
}
162+
if hdr.Size > 0 {
163+
if _, err := io.Copy(tarWriter, tarReader); err != nil {
164+
cleanup(err)
165+
return stats, fmt.Errorf("copying passthrough file %q: %w", hdr.Name, err)
166+
}
167+
}
168+
}
169+
}
170+
171+
// Close in order: tar → gzip → file
172+
if err := tarWriter.Close(); err != nil {
173+
cleanup(err)
174+
return stats, fmt.Errorf("closing tar writer: %w", err)
175+
}
176+
tarClosed = true
177+
if err := gzWriter.Close(); err != nil {
178+
cleanup(err)
179+
return stats, fmt.Errorf("closing gzip writer: %w", err)
180+
}
181+
gzClosed = true
182+
if err := outFile.Close(); err != nil {
183+
cleanup(err)
184+
return stats, fmt.Errorf("closing output file: %w", err)
185+
}
186+
fileClosed = true
187+
188+
return stats, nil
189+
}
190+
191+
// shouldRemap checks if a tar entry name matches "<prefix>_<digits>.json"
192+
// for any prefix in the set.
193+
func shouldRemap(name string, prefixSet map[string]bool) bool {
194+
base := filepath.Base(name)
195+
if !strings.HasSuffix(base, ".json") {
196+
return false
197+
}
198+
stem := strings.TrimSuffix(base, ".json")
199+
idx := strings.LastIndex(stem, "_")
200+
if idx <= 0 {
201+
return false
202+
}
203+
suffix := stem[idx+1:]
204+
if len(suffix) == 0 {
205+
return false
206+
}
207+
for _, r := range suffix {
208+
if r < '0' || r > '9' {
209+
return false
210+
}
211+
}
212+
prefix := stem[:idx]
213+
return prefixSet[prefix]
214+
}
215+
216+
// pathHasParentRef checks for ".." path components.
217+
func pathHasParentRef(name string) bool {
218+
for _, part := range strings.Split(filepath.ToSlash(name), "/") {
219+
if part == ".." {
220+
return true
221+
}
222+
}
223+
return false
224+
}

0 commit comments

Comments
 (0)