Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ func init() {
rootCmd.Flags().StringP("migration-archive", "m", "", "Path to the migration archive Example: /path/to/migration-archive.tar.gz")
rootCmd.MarkFlagRequired("migration-archive")

rootCmd.Flags().IntP("threads", "t", 0, "Number of parallel goroutines for metadata processing (default: number of CPUs)")

rootCmd.SilenceErrors = true
rootCmd.SilenceUsage = true
}
Expand Down Expand Up @@ -94,7 +96,8 @@ var rootCmd = &cobra.Command{

pterm.DefaultSection.Println("Remap")
remapSpinner, _ := pterm.DefaultSpinner.Start("Remapping SHAs...")
stats, err := commitremap.ProcessFiles(extractedDir, commitremap.DefaultPrefixes(), commitMap)
threads, _ := cmd.Flags().GetInt("threads")
stats, err := commitremap.ProcessFiles(extractedDir, commitremap.DefaultPrefixes(), commitMap, threads)
if err != nil {
remapSpinner.Fail("Remap failed")
renderSummaryTable(stats, extractedDir)
Expand Down
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/mona-actions/gh-commit-remap
go 1.25

require (
github.com/klauspost/pgzip v1.2.6
github.com/pterm/pterm v0.12.83
github.com/spf13/cobra v1.8.1
)
Expand All @@ -15,6 +16,7 @@ require (
github.com/containerd/console v1.0.5 // indirect
github.com/gookit/color v1.6.0 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/klauspost/compress v1.18.6 // indirect
github.com/lithammer/fuzzysearch v1.1.8 // indirect
github.com/mattn/go-runewidth v0.0.20 // indirect
github.com/spf13/pflag v1.0.5 // indirect
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,15 @@ github.com/gookit/color v1.6.0 h1:JjJXBTk1ETNyqyilJhkTXJYYigHG24TM9Xa2M1xAhRA=
github.com/gookit/color v1.6.0/go.mod h1:9ACFc7/1IpHGBW8RwuDm/0YEnhg3dwwXpoMsmtyHfjs=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/klauspost/compress v1.18.6 h1:2jupLlAwFm95+YDR+NwD2MEfFO9d4z4Prjl1XXDjuao=
github.com/klauspost/compress v1.18.6/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ=
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/klauspost/cpuid/v2 v2.0.10/go.mod h1:g2LTdtYhdyuGPqyWyv7qRAmj1WBqxuObKfj5c0PQa7c=
github.com/klauspost/cpuid/v2 v2.0.12/go.mod h1:g2LTdtYhdyuGPqyWyv7qRAmj1WBqxuObKfj5c0PQa7c=
github.com/klauspost/cpuid/v2 v2.2.3 h1:sxCkb+qR91z4vsqw4vGGZlDgPz3G7gjaLyK3V8y70BU=
github.com/klauspost/cpuid/v2 v2.2.3/go.mod h1:RVVoqg1df56z8g3pUjL/3lE5UfnlrJX8tyFgg4nqhuY=
github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU=
github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
Expand Down
6 changes: 5 additions & 1 deletion pkg/archive/archive.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ import (
"io/fs"
"os"
"path/filepath"
"runtime"
"strings"

pgzip "github.com/klauspost/pgzip"
)

// UnTar decompresses a .tar.gz file into destDir, returning the directory containing the extracted contents.
Expand Down Expand Up @@ -147,7 +150,8 @@ func ReTarDir(srcDir, outPath string) (retErr error) {
if err != nil {
return fmt.Errorf("failed to create archive: %w", err)
}
gzipWriter := gzip.NewWriter(outFile)
gzipWriter, _ := pgzip.NewWriterLevel(outFile, pgzip.BestSpeed)
gzipWriter.SetConcurrency(256<<10, runtime.NumCPU())
tarWriter := tar.NewWriter(gzipWriter)

// The success path closes each writer explicitly (in tar -> gzip -> file order)
Expand Down
225 changes: 225 additions & 0 deletions pkg/commitremap/benchmark_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
package commitremap

import (
"crypto/rand"
"encoding/hex"
"encoding/json"
"fmt"
"os"
"path/filepath"
"testing"
)

// ── Helpers ──────────────────────────────────────────────────────────────────

// generateCommitMap builds a synthetic commit map from n randomly generated
// old-SHA/new-SHA pairs. Each key and each value is the 40-character lowercase
// hex encoding of 20 bytes read from crypto/rand.
func generateCommitMap(n int) map[string]string {
	pairs := make(map[string]string, n)

	var raw [20]byte // 20 bytes encode to 40 hex characters
	randomSHA := func() string {
		rand.Read(raw[:])
		return hex.EncodeToString(raw[:])
	}

	for count := 0; count < n; count++ {
		oldSHA := randomSHA()
		pairs[oldSHA] = randomSHA()
	}
	return pairs
}

// generateJSONWithSHAs returns the JSON encoding of an array of numObjects
// objects. Every object carries fixed "id", "created_at" and "url" fields plus
// shaFields fields named sha_0, sha_1, ... that each hold a 40-character hex SHA.
//
// hitRate (0.0-1.0) is the fraction of all SHA fields that are filled with keys
// taken from commitMap. The hits occupy the first fields generated; every
// remaining field gets a freshly generated random SHA. When commitMap is empty
// all fields get random SHAs.
func generateJSONWithSHAs(commitMap map[string]string, numObjects, shaFields int, hitRate float64) []byte {
	totalFields := numObjects * shaFields

	// Sample keys from the map to serve as hits; one per SHA field is enough.
	known := make([]string, 0, len(commitMap))
	for sha := range commitMap {
		known = append(known, sha)
		if len(known) >= totalFields {
			break
		}
	}

	hits := int(float64(totalFields) * hitRate)
	var raw [20]byte
	objects := make([]map[string]interface{}, numObjects)
	for i := range objects {
		entry := map[string]interface{}{
			"id":         i,
			"created_at": "2024-01-15T10:30:00Z",
			"url":        fmt.Sprintf("https://github.com/org/repo/pull/%d", i),
		}
		for f := 0; f < shaFields; f++ {
			// Running position of this field across all objects.
			pos := i*shaFields + f

			var sha string
			if pos < hits && len(known) > 0 {
				sha = known[pos%len(known)]
			} else {
				rand.Read(raw[:])
				sha = hex.EncodeToString(raw[:])
			}
			entry[fmt.Sprintf("sha_%d", f)] = sha
		}
		objects[i] = entry
	}

	// The error is ignored on purpose: the objects hold only ints and strings,
	// which json.Marshal can always encode.
	encoded, _ := json.Marshal(objects)
	return encoded
}

// writeJSONFixtureFiles populates dir with numFiles JSON fixtures named
// <prefix>_000001.json, <prefix>_000002.json, and so on. Each file holds
// numObjects generated objects with three SHA fields apiece and a 0.5 hit rate
// against commitMap. The first write failure aborts the test via tb.Fatal.
func writeJSONFixtureFiles(tb testing.TB, dir string, prefix string, numFiles, numObjects int, commitMap map[string]string) {
	tb.Helper()
	for seq := 1; seq <= numFiles; seq++ {
		payload := generateJSONWithSHAs(commitMap, numObjects, 3, 0.5)
		target := filepath.Join(dir, fmt.Sprintf("%s_%06d.json", prefix, seq))
		err := os.WriteFile(target, payload, 0644)
		if err != nil {
			tb.Fatal(err)
		}
	}
}

// ── Benchmarks ───────────────────────────────────────────────────────────────

// BenchmarkReplaceSHABytes times replaceSHABytes over generated JSON payloads,
// one sub-benchmark per combination of commit-map size and payload size. Half
// of the SHA fields in every payload are keys of the commit map.
func BenchmarkReplaceSHABytes(b *testing.B) {
	const shaLen = 40

	type scenario struct {
		label   string
		commits int // entries in the commit map
		objects int // JSON objects in the payload
		fields  int // SHA fields per object
	}
	scenarios := []scenario{
		{"small-map/small-json", 100, 50, 2},
		{"large-map/small-json", 1_000_000, 50, 2},
		{"large-map/medium-json", 1_000_000, 500, 3},
		{"large-map/large-json", 1_000_000, 5000, 3},
		{"monorepo-scale", 1_834_000, 1000, 4},
	}

	for i := range scenarios {
		sc := scenarios[i]
		b.Run(sc.label, func(b *testing.B) {
			commitMap := generateCommitMap(sc.commits)
			payload := generateJSONWithSHAs(commitMap, sc.objects, sc.fields, 0.5)

			// One scratch buffer is allocated up front and refilled from the
			// pristine payload on every iteration, so no allocation is timed.
			scratch := make([]byte, len(payload))
			b.SetBytes(int64(len(payload)))
			for b.Loop() {
				copy(scratch, payload)
				replaceSHABytes(scratch, commitMap, shaLen)
			}
		})
	}
}

// BenchmarkReplaceSHABytes_NoHits measures scanning overhead when no SHAs match.
func BenchmarkReplaceSHABytes_NoHits(b *testing.B) {
commitMap := generateCommitMap(1_834_000)
data := generateJSONWithSHAs(commitMap, 1000, 4, 0.0)
differentMap := generateCommitMap(1_834_000)
shaLen := 40

input := make([]byte, len(data))
b.SetBytes(int64(len(data)))
for b.Loop() {
copy(input, data)
replaceSHABytes(input, differentMap, shaLen)
}
}

// BenchmarkUpdateMetadataFile measures a single-file remap through
// updateMetadataFile against a 1,834,000-entry commit map. The fixture holds
// 500 objects with three SHA fields each, half of which are keys of the map.
func BenchmarkUpdateMetadataFile(b *testing.B) {
	commitMap := generateCommitMap(1_834_000)
	shaLen := 40
	data := generateJSONWithSHAs(commitMap, 500, 3, 0.5)

	filePath := filepath.Join(b.TempDir(), "test_000001.json")

	b.SetBytes(int64(len(data)))
	for b.Loop() {
		// Restore the pristine fixture with the timer stopped so that only
		// updateMetadataFile is measured, matching BenchmarkProcessFiles.
		// NOTE(review): presumably updateMetadataFile rewrites the file in
		// place, which is why it is reset every iteration — confirm.
		b.StopTimer()
		if err := os.WriteFile(filePath, data, 0644); err != nil {
			b.Fatal(err)
		}
		b.StartTimer()

		updateMetadataFile(filePath, commitMap, shaLen)
	}
}

// BenchmarkParseCommitMap measures ParseCommitMap on generated commit-map files
// of increasing size. Each file starts with an "old new" header line followed
// by one "<old-sha> <new-sha>" line per entry.
func BenchmarkParseCommitMap(b *testing.B) {
	sizes := []struct {
		name string
		n    int
	}{
		{"1K", 1_000},
		{"100K", 100_000},
		{"1M", 1_000_000},
		{"1.8M", 1_834_000},
	}

	for _, s := range sizes {
		b.Run(s.name, func(b *testing.B) {
			commitMap := generateCommitMap(s.n)
			filePath := filepath.Join(b.TempDir(), "commit-map")

			// Assemble the whole file in memory and write it once, rather than
			// issuing one unbuffered write per entry. Every entry line is
			// 40 + 1 + 40 + 1 = 82 bytes, so the buffer can be sized exactly.
			const header = "old new\n"
			content := make([]byte, 0, len(header)+len(commitMap)*82)
			content = append(content, header...)
			for oldSHA, newSHA := range commitMap {
				content = append(content, oldSHA...)
				content = append(content, ' ')
				content = append(content, newSHA...)
				content = append(content, '\n')
			}
			if err := os.WriteFile(filePath, content, 0644); err != nil {
				b.Fatal(err)
			}

			// Throughput is reported against the number of bytes written.
			b.SetBytes(int64(len(content)))
			for b.Loop() {
				ParseCommitMap(filePath)
			}
		})
	}
}

// BenchmarkProcessFiles measures ProcessFiles end to end over a directory of
// generated "pull_requests" fixtures, for several file counts and worker
// counts. The benchmark fails if ProcessFiles reports an error, so timings are
// never reported for a failed run.
func BenchmarkProcessFiles(b *testing.B) {
	configs := []struct {
		name       string
		mapSize    int
		numFiles   int
		objPerFile int
		workers    int
	}{
		{"10-files/8-workers", 100_000, 10, 200, 8},
		{"100-files/16-workers", 100_000, 100, 200, 16},
		{"100-files/8-workers", 100_000, 100, 200, 8},
	}

	for _, c := range configs {
		b.Run(c.name, func(b *testing.B) {
			commitMap := generateCommitMap(c.mapSize)
			baseDir := b.TempDir()

			// No explicit ResetTimer or up-front fixture write is needed:
			// b.Loop resets the timer on its first call, and every iteration
			// regenerates the fixtures before processing them.
			for b.Loop() {
				// Regenerate the fixtures with the timer stopped so that only
				// ProcessFiles is measured.
				b.StopTimer()
				writeJSONFixtureFiles(b, baseDir, "pull_requests", c.numFiles, c.objPerFile, commitMap)
				b.StartTimer()

				if _, err := ProcessFiles(baseDir, []string{"pull_requests"}, commitMap, c.workers); err != nil {
					b.Fatal(err)
				}
			}
		})
	}
}

// BenchmarkIsHexByte times isHexByte over a fixed sample of bytes that mixes hex
// digits in both cases with punctuation and non-hex letters.
func BenchmarkIsHexByte(b *testing.B) {
	const sample = "0123456789abcdefABCDEF!@#$%^&*()ghijklmnopqrstuvwxyz"
	probe := []byte(sample)
	for b.Loop() {
		for i := 0; i < len(probe); i++ {
			isHexByte(probe[i])
		}
	}
}
Loading
Loading