diff --git a/.github/agents/copilot-instructions.md b/.github/agents/copilot-instructions.md index dcb66be..508c403 100644 --- a/.github/agents/copilot-instructions.md +++ b/.github/agents/copilot-instructions.md @@ -19,7 +19,7 @@ You are **Aegis**, a Security Architect and Golang Sentinel. Your job is to fort ## Critical Developer Workflows -- **Install:** `go install github.com/BlackVectorOps/semantic_firewall/v3/cmd/sfw@latest` +- **Install:** `go install github.com/BlackVectorOps/semantic_firewall/v4/cmd/sfw@latest` - **Check file:** `sfw check ./main.go` - **Semantic diff:** `sfw diff old.go new.go` - **Index malware:** `sfw index malware.go --name "Beacon_v1" --severity CRITICAL` diff --git a/README.md b/README.md index 7da67c6..5fd6217 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ ### Next-Gen Code Integrity & Malware Detection for Go -[![Go Reference](https://pkg.go.dev/badge/github.com/BlackVectorOps/semantic_firewall.svg)](https://pkg.go.dev/github.com/BlackVectorOps/semantic_firewall/v3) +[![Go Reference](https://pkg.go.dev/badge/github.com/BlackVectorOps/semantic_firewall.svg)](https://pkg.go.dev/github.com/BlackVectorOps/semantic_firewall/v4) [![License: MIT](https://img.shields.io/badge/License-MIT-00d4aa.svg)](LICENSE) [![Marketplace](https://img.shields.io/badge/Marketplace-Semantic_Firewall-7c3aed.svg)](https://github.com/marketplace/actions/semantic-firewall) [![Semantic Check](https://github.com/BlackVectorOps/semantic_firewall/actions/workflows/semantic-check.yml/badge.svg)](https://github.com/BlackVectorOps/semantic_firewall/actions/workflows/semantic-check.yml) @@ -68,7 +68,7 @@ ## Getting Started ```bash -go install github.com/BlackVectorOps/semantic_firewall/v3/cmd/sfw@latest +go install github.com/BlackVectorOps/semantic_firewall/v4/cmd/sfw@latest ``` @@ -270,7 +270,7 @@ jobs: - uses: actions/checkout@v3 - name: Run Semantic Firewall run: | - go install github.com/BlackVectorOps/semantic_firewall/v3/cmd/sfw@latest + go install github.com/BlackVectorOps/semantic_firewall/v4/cmd/sfw@latest sfw diff old.go new.go sfw scan . --deps ``` @@ -731,7 +731,7 @@ Functions are matched by their **structural fingerprint** (block count, call pro ### Fingerprinting ```go -import semanticfw "github.com/BlackVectorOps/semantic_firewall/v3" +import semanticfw "github.com/BlackVectorOps/semantic_firewall/v4" src := `package main func Add(a, b int) int { return a + b } @@ -750,7 +750,7 @@ for _, r := range results { ### Malware Scanning with PebbleDB ```go -import semanticfw "github.com/BlackVectorOps/semantic_firewall/v3" +import semanticfw "github.com/BlackVectorOps/semantic_firewall/v4" // Open the signature database scanner, err := semanticfw.NewPebbleScanner("signatures.db", semanticfw.DefaultPebbleScannerOptions()) @@ -778,7 +778,7 @@ for _, alert := range alerts { ### Topology Extraction ```go -import semanticfw "github.com/BlackVectorOps/semantic_firewall/v3" +import semanticfw "github.com/BlackVectorOps/semantic_firewall/v4" // Extract structural features from an SSA function topo := semanticfw.ExtractTopology(ssaFunction) diff --git a/action.yml b/action.yml index b5efec3..c192730 100644 --- a/action.yml +++ b/action.yml @@ -372,6 +372,17 @@ runs: ;; audit) + # sfw resolves the audit API key from OPENAI_API_KEY or + # GEMINI_API_KEY (selected by the model name) -- it never reads + # SFW_API_KEY. Re-export the supplied key under the name the CLI + # actually consumes, otherwise audit always fails "API Key required". + if [[ -n "${SFW_API_KEY:-}" ]]; then + if [[ "${INPUT_MODEL,,}" == gemini* ]]; then + export GEMINI_API_KEY="$SFW_API_KEY" + else + export OPENAI_API_KEY="$SFW_API_KEY" + fi + fi readonly WORKTREE_DIR="${PREP_WORKTREE_DIR:-.sfw_base_worktree}" readonly DIFF_STREAM_FILE=".sfw_diff_stream.bin" diff --git a/cmd/sfw/main.go b/cmd/sfw/main.go index a6679eb..faa9d77 100644 --- a/cmd/sfw/main.go +++ b/cmd/sfw/main.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/BlackVectorOps/semantic_firewall/v3/internal/cli" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/models" - version "github.com/BlackVectorOps/semantic_firewall/v3/pkg/version" + "github.com/BlackVectorOps/semantic_firewall/v4/internal/cli" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/models" + version "github.com/BlackVectorOps/semantic_firewall/v4/pkg/version" ) // Package main provides the sfw CLI tool for semantic fingerprinting and malware scanning of Go source files. @@ -53,6 +53,7 @@ Commands: --api-key API Key (OpenAI or Gemini). REQUIRED. --model LLM Model (default: gpt-4o, supports gemini-1.5-pro) --api-base Custom API Base URL (for testing/proxying) + --no-sandbox Disable gVisor/Namespace isolation index Index a reference malware sample (Lab Phase) scan Scan target code for malware signatures (Hunter Phase) @@ -92,6 +93,7 @@ Examples: // Default updated to gpt-4o per 2026 standards (Reasoning Optimized) auditModel := auditCmd.String("model", "gpt-4o", "LLM Model to use") auditApiBase := auditCmd.String("api-base", "", "Custom API Base URL") + auditNoSandbox := auditCmd.Bool("no-sandbox", false, "Disable gVisor/Namespace isolation") indexCmd := flag.NewFlagSet("index", flag.ExitOnError) indexName := indexCmd.String("name", "", "Signature name (required)") @@ -166,7 +168,7 @@ Examples: os.Exit(1) } - exitCode, err := cli.RunAudit(os.Stdout, auditCmd.Arg(0), auditCmd.Arg(1), auditCmd.Arg(2), apiKey, *auditModel, *auditApiBase) + exitCode, err := cli.RunAudit(os.Stdout, auditCmd.Arg(0), auditCmd.Arg(1), auditCmd.Arg(2), apiKey, *auditModel, *auditApiBase, *auditNoSandbox) if err != nil { cli.ExitError(err) } @@ -269,19 +271,11 @@ func runWorker(args []string) error { return cli.RunCheckLogic(fsys, *target, *strict, *scan, resolvedDB) case "diff": - // FIXED: Support both standard 3-arg usage [diff, old, new] and legacy 5-arg usage - // 3-arg usage: sfw internal-worker diff - if len(args) == 3 { - return cli.RunDiffLogic(fsys, args[1], args[2]) + // sfw internal-worker diff + if len(args) != 3 { + return fmt.Errorf("diff worker requires arguments (old new ); got %d", len(args)-1) } - // 5-arg usage (hypothetical/legacy): sfw internal-worker diff -old -new - if len(args) >= 5 { - // Assuming indices 2 and 4 based on previous code - oldFile := args[2] - newFile := args[4] - return cli.RunDiffLogic(fsys, oldFile, newFile) - } - return fmt.Errorf("diff worker requires arguments (old new )") + return cli.RunDiffLogic(fsys, os.Stdout, args[1], args[2]) case "scan": fs := flag.NewFlagSet("scan", flag.ExitOnError) diff --git a/go.mod b/go.mod index ccd0b54..f1d32f9 100644 --- a/go.mod +++ b/go.mod @@ -1,7 +1,7 @@ //go.mod go 1.24.0 -module github.com/BlackVectorOps/semantic_firewall/v3 +module github.com/BlackVectorOps/semantic_firewall/v4 require ( github.com/cockroachdb/pebble v1.1.5 diff --git a/internal/cli/audit.go b/internal/cli/audit.go index 6fe7b21..44c16ab 100644 --- a/internal/cli/audit.go +++ b/internal/cli/audit.go @@ -10,24 +10,36 @@ import ( "path/filepath" "strings" - "github.com/BlackVectorOps/semantic_firewall/v3/internal/llm" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/models" + "github.com/BlackVectorOps/semantic_firewall/v4/internal/llm" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/models" ) // -- AUDIT COMMAND -- -func RunAudit(w io.Writer, oldFile, newFile, commitMsg, apiKey, model, apiBase string) (int, error) { +func RunAudit(w io.Writer, oldFile, newFile, commitMsg, apiKey, model, apiBase string, noSandbox bool) (int, error) { cleanOld := filepath.Clean(oldFile) cleanNew := filepath.Clean(newFile) - args := []string{cleanOld, cleanNew} sb := RealSandboxer{} var outputBuf bytes.Buffer - // FIX: Pass os.Stderr instead of nil to capture sandbox runtime errors. - err := SandboxExec(sb, &outputBuf, os.Stderr, "diff", args, cleanOld, cleanNew) - if err != nil { - // FAIL-CLOSED: Infrastructure error must not allow bypass. - return 1, fmt.Errorf("audit failed during sandboxed diff: %w", err) + + // When the caller opted out of sandboxing -- or when we're already running + // inside one (nested CI containers, runsc-in-runsc) -- skip SandboxExec and + // invoke the diff logic directly. The previous unconditional SandboxExec + // failed closed with "process is already sandboxed; nested sandboxing is + // not supported" and left audit unusable in any pre-sandboxed environment. + if noSandbox || sb.IsSandboxed() { + if err := RunDiffLogic(RealFileSystem{}, &outputBuf, cleanOld, cleanNew); err != nil { + return 1, fmt.Errorf("audit failed during diff: %w", err) + } + } else { + args := []string{cleanOld, cleanNew} + // FIX: Pass os.Stderr instead of nil to capture sandbox runtime errors. + err := SandboxExec(sb, &outputBuf, os.Stderr, "diff", args, cleanOld, cleanNew) + if err != nil { + // FAIL-CLOSED: Infrastructure error must not allow bypass. + return 1, fmt.Errorf("audit failed during sandboxed diff: %w", err) + } } var diffOutput models.DiffOutput @@ -95,9 +107,12 @@ func RunAudit(w io.Writer, oldFile, newFile, commitMsg, apiKey, model, apiBase s return 1, fmt.Errorf("json encode failed: %w", err) } - // FAIL-CLOSED: Strict Verdict Enforcement - switch output.Output.Verdict { - case models.VerdictMatch, models.StatusPreserved: + // FAIL-CLOSED: Strict Verdict Enforcement. + // Normalize case to stay consistent with llm.validateOutput, which accepts + // verdicts case-insensitively; otherwise a valid lowercase "match" would + // fall through to the default branch and be reported as an error. + switch strings.ToUpper(output.Output.Verdict) { + case models.VerdictMatch: return 0, nil case models.VerdictLie, models.VerdictSuspicious, models.VerdictError: return 1, nil diff --git a/internal/cli/check.go b/internal/cli/check.go index aba5bd2..d89b04e 100644 --- a/internal/cli/check.go +++ b/internal/cli/check.go @@ -10,12 +10,12 @@ import ( "runtime" "sync" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/ir" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/topology" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/diff" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/models" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/storage/jsondb" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/storage/pebbledb" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/ir" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/topology" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/diff" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/models" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/storage/jsondb" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/storage/pebbledb" "golang.org/x/sync/errgroup" ) @@ -148,10 +148,21 @@ func ProcessFilesParallel(fsys FileSystem, files []string, strictMode bool, scan return ctx.Err() } - // Robustness: Recover from panics in SSA generation to protect the run + // Robustness: Recover from panics in SSA generation to protect the run. + // Record the panic as a per-file error so it shows up in the JSON output + // and -- critically -- so strict mode still fails closed. Previously a + // panic produced a zero-value FileOutput, left hasErrors false, and let + // --strict silently succeed on a run that crashed mid-analysis. defer func() { if r := recover(); r != nil { fmt.Fprintf(os.Stderr, "warning: panic recovered analyzing %s: %v\n", f, r) + mu.Lock() + results[idx] = models.FileOutput{ + File: f, + ErrorMessage: fmt.Sprintf("panic during analysis: %v", r), + } + hasErrors = true + mu.Unlock() } }() diff --git a/internal/cli/cli_test.go b/internal/cli/cli_test.go index fe45d84..53ad2e8 100644 --- a/internal/cli/cli_test.go +++ b/internal/cli/cli_test.go @@ -11,8 +11,8 @@ import ( "testing" "time" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/topology" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/detection" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/topology" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/detection" "golang.org/x/tools/go/packages" ) @@ -243,6 +243,56 @@ func TestRunCheckLogic_Isolation(t *testing.T) { } } +// panickingFileSystem wraps MockFileSystem but panics on Stat for a specific +// path. Used to exercise the panic-recovery path in ProcessFilesParallel +// without needing to corrupt SSA generation. +type panickingFileSystem struct { + *MockFileSystem + panicOn string +} + +func (p *panickingFileSystem) Stat(name string) (os.FileInfo, error) { + if name == p.panicOn { + panic("synthetic panic for test") + } + return p.MockFileSystem.Stat(name) +} + +// TestProcessFilesParallel_PanicSurfacedAsError ensures a panic during +// per-file analysis is recorded as an ErrorMessage on the FileOutput and +// flips the hasErrors flag, so --strict fails closed instead of silently +// passing a crashed run. +func TestProcessFilesParallel_PanicSurfacedAsError(t *testing.T) { + mockFS := &panickingFileSystem{ + MockFileSystem: &MockFileSystem{ + Files: map[string][]byte{ + "/app/boom.go": []byte("package main"), + }, + }, + panicOn: "/app/boom.go", + } + + results, hasErrors, err := ProcessFilesParallel(mockFS, []string{"/app/boom.go"}, true, nil) + if err != nil { + t.Fatalf("ProcessFilesParallel returned err: %v", err) + } + if !hasErrors { + t.Fatal("expected hasErrors=true after a recovered panic; got false") + } + if len(results) != 1 { + t.Fatalf("expected 1 result, got %d", len(results)) + } + if results[0].ErrorMessage == "" { + t.Error("expected ErrorMessage to be populated for the panicking file") + } + if !strings.Contains(results[0].ErrorMessage, "panic") { + t.Errorf("ErrorMessage should mention panic, got: %q", results[0].ErrorMessage) + } + if results[0].File != "/app/boom.go" { + t.Errorf("File field lost; got %q", results[0].File) + } +} + // TestFileSizeGuard verifies strict size limits are enforced. func TestFileSizeGuard(t *testing.T) { mockFS := &MockFileSystem{ diff --git a/internal/cli/diff_logic.go b/internal/cli/diff_logic.go index 3d668a1..b3fad05 100644 --- a/internal/cli/diff_logic.go +++ b/internal/cli/diff_logic.go @@ -3,15 +3,11 @@ package cli import ( "encoding/json" - "fmt" + "io" "os" "path/filepath" - "strings" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/ir" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/topology" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/diff" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/models" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/api" ) // -- Public API -- @@ -27,283 +23,22 @@ func RunDiff(oldFile, newFile string, noSandbox bool) error { return SandboxExec(sb, os.Stdout, os.Stderr, "diff", args, cleanOld, cleanNew) } - return RunDiffLogic(fsys, cleanOld, cleanNew) + return RunDiffLogic(fsys, os.Stdout, cleanOld, cleanNew) } // -- Core Logic -- -func RunDiffLogic(fsys FileSystem, oldFile, newFile string) error { - diffOutput, err := ComputeDiff(fsys, oldFile, newFile) +// RunDiffLogic is the CLI shim around api.DiffWithFS. The diff +// computation itself lives in pkg/api so external integrations (the +// MCP server, third-party Go callers) can invoke it without importing +// internal/cli. +func RunDiffLogic(fsys FileSystem, w io.Writer, oldFile, newFile string) error { + diffOutput, err := api.DiffWithFS(fsys, oldFile, newFile) if err != nil { return err } - encoder := json.NewEncoder(os.Stdout) + encoder := json.NewEncoder(w) encoder.SetIndent("", " ") return encoder.Encode(diffOutput) } - -func ComputeDiff(fsys FileSystem, oldFile, newFile string) (*models.DiffOutput, error) { - processFile := func(path string) ([]diff.FingerprintResult, error) { - if path == "" { - return []diff.FingerprintResult{}, nil - } - info, statErr := fsys.Stat(path) - if os.IsNotExist(statErr) { - return []diff.FingerprintResult{}, nil - } - if info.Size() > MaxSourceFileSize { - return nil, fmt.Errorf("file %s exceeds maximum analysis size of %d bytes", path, MaxSourceFileSize) - } - content, err := fsys.ReadFile(path) - if err != nil { - return nil, err - } - absPath, err := fsys.Abs(path) - if err != nil { - absPath = path - } - return diff.FingerprintSource(absPath, string(content), ir.DefaultLiteralPolicy) - } - - oldResults, err := processFile(oldFile) - if err != nil { - return nil, fmt.Errorf("failed to process old file: %w", err) - } - - newResults, err := processFile(newFile) - if err != nil { - return nil, fmt.Errorf("failed to process new file: %w", err) - } - - matched, addedFuncs, removedFuncs := diff.MatchFunctionsByTopology( - oldResults, newResults, models.DefaultTopologyMatchThreshold, - ) - - var functionDiffs []models.FunctionDiff - var topologyMatches []models.TopologyMatchInfo - preserved, modified, renamed, highRisk := 0, 0, 0, 0 - - for _, m := range matched { - oldShort := ShortFunctionName(m.OldResult.FunctionName) - newShort := ShortFunctionName(m.NewResult.FunctionName) - - d := CompareFunctions(oldShort, m.OldResult, m.NewResult) - - if d.Status == models.StatusModified && m.OldTopology != nil && m.NewTopology != nil { - delta, riskScore := CalculateTopologyDelta(m.OldTopology, m.NewTopology) - d.TopologyDelta = delta - d.RiskScore = riskScore - if riskScore >= models.RiskScoreHigh { - highRisk++ - } - } - - if !m.ByName { - d.Function = fmt.Sprintf("%s → %s", oldShort, newShort) - d.Status = models.StatusRenamed - renamed++ - } - - functionDiffs = append(functionDiffs, d) - - if d.Status == models.StatusPreserved { - preserved++ - } else { - modified++ - } - - oldTopoStr := "" - if m.OldTopology != nil { - oldTopoStr = topology.TopologyFingerprint(m.OldTopology) - } - newTopoStr := "" - if m.NewTopology != nil { - newTopoStr = topology.TopologyFingerprint(m.NewTopology) - } - - topologyMatches = append(topologyMatches, models.TopologyMatchInfo{ - OldFunction: oldShort, - NewFunction: newShort, - Similarity: m.Similarity, - MatchedByName: m.ByName, - OldTopology: oldTopoStr, - NewTopology: newTopoStr, - }) - } - - for _, r := range addedFuncs { - risk := models.BaseRiskAddedFunc - delta := models.TopoDeltaNew - - fn := r.GetSSAFunction() - if fn != nil { - topo := topology.ExtractTopology(fn) - if topo != nil { - d, s := CalculateTopologyDelta(nil, topo) - delta = d - risk += s - } - } - - if risk >= models.RiskScoreHigh { - highRisk++ - } - - functionDiffs = append(functionDiffs, models.FunctionDiff{ - Function: ShortFunctionName(r.FunctionName), - Status: models.StatusAdded, - NewFingerprint: r.Fingerprint, - RiskScore: risk, - TopologyDelta: delta, - }) - } - - for _, r := range removedFuncs { - functionDiffs = append(functionDiffs, models.FunctionDiff{ - Function: ShortFunctionName(r.FunctionName), - Status: models.StatusRemoved, - OldFingerprint: r.Fingerprint, - }) - } - - added := len(addedFuncs) - removed := len(removedFuncs) - total := len(matched) + added + removed - matchPct := 0.0 - topoMatchPct := 0.0 - if total > 0 { - matchPct = float64(preserved) / float64(total) * 100.0 - } - if len(matched) > 0 { - topoMatchPct = float64(len(matched)) / float64(total) * 100.0 - } - - return &models.DiffOutput{ - OldFile: oldFile, - NewFile: newFile, - Summary: models.DiffSummary{ - TotalFunctions: total, - Preserved: preserved, - Modified: modified, - Added: added, - Removed: removed, - SemanticMatchPct: matchPct, - TopologyMatchedPct: topoMatchPct, - RenamedFunctions: renamed, - HighRiskChanges: highRisk, - }, - Functions: functionDiffs, - TopologyMatches: topologyMatches, - }, nil -} - -func CalculateTopologyDelta(oldT, newT *topology.FunctionTopology) (string, int) { - if newT == nil { - return models.TopoDeltaUnknown, 0 - } - if oldT == nil { - oldT = &topology.FunctionTopology{} - } - - var deltas []string - riskScore := 0 - - callDiff := len(newT.CallSignatures) - len(oldT.CallSignatures) - if callDiff > 0 { - deltas = append(deltas, fmt.Sprintf("Calls+%d", callDiff)) - riskScore += callDiff * 5 - } else if callDiff < 0 { - deltas = append(deltas, fmt.Sprintf("Calls%d", callDiff)) - } - - loopDiff := newT.LoopCount - oldT.LoopCount - if loopDiff > 0 { - deltas = append(deltas, fmt.Sprintf("Loops+%d", loopDiff)) - riskScore += loopDiff * 10 - } else if loopDiff < 0 { - deltas = append(deltas, fmt.Sprintf("Loops%d", loopDiff)) - } - - branchDiff := newT.BranchCount - oldT.BranchCount - if branchDiff > 0 { - deltas = append(deltas, fmt.Sprintf("Branches+%d", branchDiff)) - riskScore += branchDiff * 2 - } else if branchDiff < 0 { - deltas = append(deltas, fmt.Sprintf("Branches%d", branchDiff)) - } - - if newT.HasGo && !oldT.HasGo { - deltas = append(deltas, models.TopoDeltaGoroutine) - riskScore += 15 - } - - if newT.HasDefer && !oldT.HasDefer { - deltas = append(deltas, models.TopoDeltaDefer) - riskScore += 3 - } - - if newT.HasPanic && !oldT.HasPanic { - deltas = append(deltas, models.TopoDeltaPanic) - riskScore += 5 - } - - entropyDiff := newT.EntropyScore - oldT.EntropyScore - if entropyDiff > 1.0 { - deltas = append(deltas, fmt.Sprintf("Entropy+%.1f", entropyDiff)) - riskScore += int(entropyDiff * 3) - } - - if len(deltas) == 0 { - return models.TopoDeltaNone, 0 - } - - return strings.Join(deltas, ", "), riskScore -} - -func CompareFunctions(funcName string, oldResult, newResult diff.FingerprintResult) models.FunctionDiff { - d := models.FunctionDiff{ - Function: funcName, - OldFingerprint: oldResult.Fingerprint, - NewFingerprint: newResult.Fingerprint, - } - - if oldResult.Fingerprint == newResult.Fingerprint { - d.Status = models.StatusPreserved - d.FingerprintMatch = true - return d - } - - d.FingerprintMatch = false - oldFn := oldResult.GetSSAFunction() - newFn := newResult.GetSSAFunction() - - if oldFn == nil || newFn == nil { - d.Status = models.StatusModified - return d - } - - zipper, err := diff.NewZipper(oldFn, newFn, ir.DefaultLiteralPolicy) - if err != nil { - d.Status = models.StatusModified - return d - } - - artifacts, err := zipper.ComputeDiff() - if err != nil { - d.Status = models.StatusModified - return d - } - - d.MatchedNodes = artifacts.MatchedNodes - d.AddedOps = artifacts.Added - d.RemovedOps = artifacts.Removed - - if artifacts.Preserved { - d.Status = models.StatusPreserved - } else { - d.Status = models.StatusModified - } - - return d -} diff --git a/internal/cli/index.go b/internal/cli/index.go index 39d4f34..4942516 100644 --- a/internal/cli/index.go +++ b/internal/cli/index.go @@ -9,11 +9,11 @@ import ( "strings" "time" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/topology" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/detection" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/diff" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/storage/jsondb" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/storage/pebbledb" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/topology" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/detection" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/diff" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/storage/jsondb" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/storage/pebbledb" ) func RunIndex(target, name, severity, category, dbPath string) error { diff --git a/internal/cli/interfaces.go b/internal/cli/interfaces.go index c51ac67..1e07836 100644 --- a/internal/cli/interfaces.go +++ b/internal/cli/interfaces.go @@ -4,24 +4,17 @@ package cli import ( "context" "io" - "io/fs" - "os" - "github.com/BlackVectorOps/semantic_firewall/v3/internal/sandbox" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/topology" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/detection" + "github.com/BlackVectorOps/semantic_firewall/v4/internal/sandbox" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/topology" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/api" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/detection" "golang.org/x/tools/go/packages" ) -// FileSystem abstracts OS file operations to enable hermetic testing. -type FileSystem interface { - Stat(name string) (os.FileInfo, error) - Open(name string) (fs.File, error) - Getwd() (string, error) - Abs(path string) (string, error) - WalkDir(root string, fn fs.WalkDirFunc) error - ReadFile(name string) ([]byte, error) -} +// FileSystem is re-exported from pkg/api so external integrations +// (e.g. the MCP server) and internal callers see the same type. +type FileSystem = api.FileSystem // Sandboxer abstracts the process isolation mechanism. type Sandboxer interface { diff --git a/internal/cli/migrate.go b/internal/cli/migrate.go index 90c3c87..fee89b8 100644 --- a/internal/cli/migrate.go +++ b/internal/cli/migrate.go @@ -6,7 +6,7 @@ import ( "fmt" "os" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/storage/pebbledb" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/storage/pebbledb" ) func RunMigrate(fromPath, toPath string) error { diff --git a/internal/cli/sandbox_adapter.go b/internal/cli/sandbox_adapter.go index c57ea8a..ee459e8 100644 --- a/internal/cli/sandbox_adapter.go +++ b/internal/cli/sandbox_adapter.go @@ -13,7 +13,7 @@ import ( "strings" "syscall" - "github.com/BlackVectorOps/semantic_firewall/v3/internal/sandbox" + "github.com/BlackVectorOps/semantic_firewall/v4/internal/sandbox" ) // Implements the Sandboxer interface using the internal sandbox package. diff --git a/internal/cli/scan.go b/internal/cli/scan.go index c4f3638..47a9eb1 100644 --- a/internal/cli/scan.go +++ b/internal/cli/scan.go @@ -13,12 +13,12 @@ import ( "strings" "sync" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/topology" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/detection" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/diff" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/models" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/storage/jsondb" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/storage/pebbledb" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/topology" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/detection" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/diff" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/models" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/storage/jsondb" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/storage/pebbledb" "golang.org/x/sync/errgroup" "golang.org/x/tools/go/packages" "golang.org/x/tools/go/ssa" diff --git a/internal/cli/stats.go b/internal/cli/stats.go index 1ef6ea8..4ef9009 100644 --- a/internal/cli/stats.go +++ b/internal/cli/stats.go @@ -5,8 +5,8 @@ import ( "encoding/json" "os" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/storage/jsondb" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/storage/pebbledb" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/storage/jsondb" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/storage/pebbledb" ) func RunStats(dbPath string) error { diff --git a/internal/cli/utils.go b/internal/cli/utils.go index 0295f5e..3bc69e5 100644 --- a/internal/cli/utils.go +++ b/internal/cli/utils.go @@ -3,59 +3,23 @@ package cli import ( "fmt" - "io" "io/fs" "os" "path/filepath" "strings" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/ir" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/diff" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/models" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/ir" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/api" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/diff" "golang.org/x/tools/go/packages" ) // -- Real Implementations -- -// RealFileSystem implements FileSystem using the actual OS. -type RealFileSystem struct{} - -func (fs RealFileSystem) Stat(name string) (os.FileInfo, error) { return os.Stat(name) } -func (fs RealFileSystem) Open(name string) (fs.File, error) { return os.Open(name) } -func (fs RealFileSystem) Getwd() (string, error) { return os.Getwd() } -func (fs RealFileSystem) Abs(path string) (string, error) { return filepath.Abs(path) } -func (fs RealFileSystem) WalkDir(root string, fn fs.WalkDirFunc) error { - return filepath.WalkDir(root, fn) -} -func (fs RealFileSystem) ReadFile(name string) ([]byte, error) { - // Re-implement safety logic here to ensure it applies to all users - f, err := os.Open(name) - if err != nil { - return nil, err - } - defer f.Close() - - info, err := f.Stat() - if err != nil { - return nil, err - } - if info.IsDir() { - return nil, fmt.Errorf("path is a directory: %s", name) - } - if info.Size() > models.MaxSourceFileSize { - return nil, fmt.Errorf("file exceeds maximum supported size of %d bytes", models.MaxSourceFileSize) - } - - limit := int64(models.MaxSourceFileSize + 1) - content, err := io.ReadAll(io.LimitReader(f, limit)) - if err != nil { - return nil, err - } - if len(content) > models.MaxSourceFileSize { - return nil, fmt.Errorf("file exceeds maximum supported size of %d bytes", models.MaxSourceFileSize) - } - return content, nil -} +// RealFileSystem is re-exported from pkg/api so internal callers stay +// on the existing name while external integrations import the +// canonical version from pkg/api directly. +type RealFileSystem = api.RealFileSystem // RealPackageLoader wraps packages.Load type RealPackageLoader struct{} @@ -239,107 +203,12 @@ func LoadAndFingerprint(fsys FileSystem, filename string) ([]diff.FingerprintRes return diff.FingerprintSource(absPath, string(src), ir.DefaultLiteralPolicy) } +// ShortFunctionName delegates to pkg/api so the CLI and external +// integrations share a single implementation. Kept as a wrapper rather +// than a re-export to avoid forcing every internal/cli caller to +// import pkg/api directly. func ShortFunctionName(fullName string) string { - // Robust parsing for methods with receivers (e.g. (*pkg.Type).Method) - // Will detect the receiver parens and recursively strip the package from the type inside. - if strings.HasPrefix(fullName, "(") { - depth := 0 - closeIndex := -1 - for i, c := range fullName { - if c == '(' { - depth++ - } else if c == ')' { - depth-- - if depth == 0 { - closeIndex = i - break - } - } - } - - if closeIndex > 1 { - receiver := fullName[1:closeIndex] // e.g. "*pkg.Type" - rest := fullName[closeIndex+1:] // e.g. ".Method" - - // Preserve pointer indicator - prefix := "" - if strings.HasPrefix(receiver, "*") { - prefix = "*" - receiver = receiver[1:] - } - - // Recursively clean the inner type (strips path and package) - cleanReceiver := ShortFunctionName(receiver) - - return fmt.Sprintf("(%s%s)%s", prefix, cleanReceiver, rest) - } - } - - // 1. Backward Scan: Strip package path - // e.g. "github.com/pkg.Func" -> "pkg.Func" - // Must respect brackets [] and parens () to avoid splitting inside generics - end := len(fullName) - 1 - depthBrackets := 0 - depthParens := 0 - splitIndex := -1 - - for i := end; i >= 0; i-- { - b := fullName[i] - switch b { - case ']': - depthBrackets++ - case '[': - depthBrackets-- - case ')': - depthParens++ - case '(': - depthParens-- - case '/': - if depthBrackets == 0 && depthParens == 0 { - splitIndex = i - goto FoundSplit - } - } - } -FoundSplit: - name := fullName - if splitIndex >= 0 { - name = fullName[splitIndex+1:] - } - - // 2. Forward Scan: Strip package name from qualified identifier - // We scan for the first dot at depth 0. - // Heuristic: If the prefix before the dot contains brackets or parens, - // it's likely a receiver type (e.g. Type[T]) and NOT a package name. - depth := 0 - for i, ch := range name { - switch ch { - case '(': - depth++ - case ')': - depth-- - case '[': - depth++ - case ']': - depth-- - case '.': - if depth == 0 { - prefix := name[:i] - // If prefix has special characters, assume it is part of the type signature - // (e.g. Type[T].Method) and preserve it. - // If prefix is clean (e.g. pkg.Func), strip it. - if !containsSpecial(prefix) { - return name[i+1:] - } - return name - } - } - } - return name -} - -func containsSpecial(s string) bool { - return strings.ContainsAny(s, "[]()") + return api.ShortFunctionName(fullName) } func IsJSON(path string) bool { diff --git a/internal/llm/client.go b/internal/llm/client.go index b9edf6c..081da2c 100644 --- a/internal/llm/client.go +++ b/internal/llm/client.go @@ -19,7 +19,7 @@ import ( "time" "unicode/utf8" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/models" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/models" "google.golang.org/genai" ) diff --git a/internal/llm/client_test.go b/internal/llm/client_test.go index 3f2b496..e1eef8e 100644 --- a/internal/llm/client_test.go +++ b/internal/llm/client_test.go @@ -11,7 +11,7 @@ import ( "testing" "time" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/models" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/models" ) // -- Helpers -- diff --git a/pkg/analysis/ir/benchmark_test.go b/pkg/analysis/ir/benchmark_test.go index 03a5f24..a4aa0fb 100644 --- a/pkg/analysis/ir/benchmark_test.go +++ b/pkg/analysis/ir/benchmark_test.go @@ -7,7 +7,7 @@ import ( "strings" "testing" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/ir" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/ir" "golang.org/x/tools/go/packages" "golang.org/x/tools/go/ssa" ) diff --git a/pkg/analysis/ir/canonicalizer.go b/pkg/analysis/ir/canonicalizer.go index eeaf1c4..756bf89 100644 --- a/pkg/analysis/ir/canonicalizer.go +++ b/pkg/analysis/ir/canonicalizer.go @@ -10,7 +10,7 @@ import ( "strings" "sync" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/loop" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/loop" "golang.org/x/tools/go/ssa" ) diff --git a/pkg/analysis/ir/export_test.go b/pkg/analysis/ir/export_test.go index 54a8983..8cd8ce8 100644 --- a/pkg/analysis/ir/export_test.go +++ b/pkg/analysis/ir/export_test.go @@ -1,7 +1,7 @@ package ir import ( - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/loop" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/loop" "golang.org/x/tools/go/ssa" ) diff --git a/pkg/analysis/ir/ir_test.go b/pkg/analysis/ir/ir_test.go index 9265cad..1d3ce1f 100644 --- a/pkg/analysis/ir/ir_test.go +++ b/pkg/analysis/ir/ir_test.go @@ -8,9 +8,9 @@ import ( "testing" "time" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/ir" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/loop" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/testutil" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/ir" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/loop" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/testutil" "golang.org/x/tools/go/ssa" ) @@ -376,6 +376,48 @@ func TestCanonicalizer_KitchenSink(t *testing.T) { } } +func TestCanonicalizer_DerivedInductionVariable(t *testing.T) { + t.Parallel() + + // A value derived affinely from the loop counter (here i*2) must be + // recognised as a scalar-evolution recurrence and virtualised, instead of + // surviving as an opaque BinOp MUL. This is what lets loops that compute a + // strided value two different ways canonicalize to the same fingerprint. + src := `package main + func derived(n int) int { + total := 0 + for i := 0; i < n; i++ { + total += i * 2 + } + return total + }` + + fn := testutil.CompileAndGetFunction(t, src, "derived") + c := ir.NewCanonicalizer(ir.DefaultLiteralPolicy) + defer ir.ReleaseCanonicalizer(c) + + out := c.CanonicalizeFunction(fn) + + // {start, +, step} notation is only produced by an SCEVAddRec. + if !strings.Contains(out, ", +, ") { + t.Errorf("expected an affine recurrence {start, +, step} in canonical IR:\n%s", out) + } + // The derived multiplication must not survive as an opaque operation. + if strings.Contains(out, "BinOp *") { + t.Errorf("derived induction variable left as opaque BinOp MUL:\n%s", out) + } + + // Canonicalization must remain deterministic with the new virtualization. + for i := 0; i < 5; i++ { + c2 := ir.NewCanonicalizer(ir.DefaultLiteralPolicy) + out2 := c2.CanonicalizeFunction(fn) + ir.ReleaseCanonicalizer(c2) + if out != out2 { + t.Fatalf("derived-IV canonical IR is not stable across runs:\n%s\n--- vs ---\n%s", out, out2) + } + } +} + func TestCanonicalizer_LoopDepthLimit(t *testing.T) { t.Parallel() diff --git a/pkg/analysis/loop/scev.go b/pkg/analysis/loop/scev.go index f3e8b8c..8ef1076 100644 --- a/pkg/analysis/loop/scev.go +++ b/pkg/analysis/loop/scev.go @@ -676,6 +676,139 @@ func SCEVFromConst(c *ssa.Const) SCEV { return &SCEVUnknown{Value: c, IsInvariant: true} } +// foldSCEV combines two scalar-evolution expressions under an arithmetic +// operator, simplifying the result into a canonical SCEVAddRec whenever the +// combination remains affine in the loop counter. This is what lets a derived +// induction variable (e.g. j := i*2 + 3) be recognised as the recurrence +// {3, +, 2} rather than an opaque expression, so that semantically equivalent +// loops canonicalize to the same fingerprint. +// +// The affine identities applied (each verified at iteration k, where the +// recurrence {a, +, b} denotes a + b*k): +// +// {a,+,b} ± X = {a±X, +, b} (X invariant in the recurrence's loop) +// X - {a,+,b} = {X-a, +, -b} +// {a,+,b} ± {c,+,d} = {a±c, +, b±d} (same loop) +// {a,+,b} * X = {a*X, +, b*X} (X invariant) +// +// A recurrence multiplied by another recurrence is quadratic, not affine, so it +// is intentionally left as an opaque SCEVGenericExpr. Anything that does not +// match a rule also falls through to SCEVGenericExpr, preserving the previous +// behaviour. Recursion terminates because every recursive call operates on a +// strictly smaller sub-expression of the input trees. func foldSCEV(op token.Token, left, right SCEV, loop *Loop) SCEV { + // Constant op Constant collapses immediately. + if lc, ok := left.(*SCEVConstant); ok { + if rc, ok := right.(*SCEVConstant); ok { + if folded := foldConstants(op, lc.Value, rc.Value); folded != nil { + return folded + } + } + } + + lRec, lIsRec := left.(*SCEVAddRec) + rRec, rIsRec := right.(*SCEVAddRec) + + switch op { + case token.ADD: + if lIsRec && rIsRec && lRec.Loop == rRec.Loop { + return &SCEVAddRec{ + Start: foldSCEV(token.ADD, lRec.Start, rRec.Start, loop), + Step: foldSCEV(token.ADD, lRec.Step, rRec.Step, loop), + Loop: lRec.Loop, + } + } + if lIsRec && right.IsLoopInvariant(lRec.Loop) { + return &SCEVAddRec{ + Start: foldSCEV(token.ADD, lRec.Start, right, loop), + Step: lRec.Step, + Loop: lRec.Loop, + } + } + if rIsRec && left.IsLoopInvariant(rRec.Loop) { + return &SCEVAddRec{ + Start: foldSCEV(token.ADD, left, rRec.Start, loop), + Step: rRec.Step, + Loop: rRec.Loop, + } + } + + case token.SUB: + if lIsRec && rIsRec && lRec.Loop == rRec.Loop { + return &SCEVAddRec{ + Start: foldSCEV(token.SUB, lRec.Start, rRec.Start, loop), + Step: foldSCEV(token.SUB, lRec.Step, rRec.Step, loop), + Loop: lRec.Loop, + } + } + if lIsRec && right.IsLoopInvariant(lRec.Loop) { + return &SCEVAddRec{ + Start: foldSCEV(token.SUB, lRec.Start, right, loop), + Step: lRec.Step, + Loop: lRec.Loop, + } + } + if rIsRec && left.IsLoopInvariant(rRec.Loop) { + return &SCEVAddRec{ + Start: foldSCEV(token.SUB, left, rRec.Start, loop), + Step: negateSCEV(rRec.Step), + Loop: rRec.Loop, + } + } + + case token.MUL: + // Scaling a recurrence by a loop-invariant factor stays affine; scaling + // one recurrence by another does not. + if lIsRec && !rIsRec && right.IsLoopInvariant(lRec.Loop) { + return &SCEVAddRec{ + Start: foldSCEV(token.MUL, lRec.Start, right, loop), + Step: foldSCEV(token.MUL, lRec.Step, right, loop), + Loop: lRec.Loop, + } + } + if rIsRec && !lIsRec && left.IsLoopInvariant(rRec.Loop) { + return &SCEVAddRec{ + Start: foldSCEV(token.MUL, left, rRec.Start, loop), + Step: foldSCEV(token.MUL, left, rRec.Step, loop), + Loop: rRec.Loop, + } + } + } + return &SCEVGenericExpr{Op: op, X: left, Y: right} } + +// foldConstants evaluates an arithmetic operator over two integer constants. +// It returns nil for operators it does not fold (e.g. QUO, which would hide +// truncation behaviour) so the caller can keep an opaque expression. +func foldConstants(op token.Token, x, y *big.Int) *SCEVConstant { + if x == nil || y == nil { + return nil + } + res := new(big.Int) + switch op { + case token.ADD: + res.Add(x, y) + case token.SUB: + res.Sub(x, y) + case token.MUL: + res.Mul(x, y) + default: + return nil + } + return &SCEVConstant{Value: res} +} + +// negateSCEV returns the arithmetic negation of a scalar-evolution expression, +// pushing the negation into constants and recurrences so the result stays in +// canonical form rather than wrapping everything in a multiply-by-minus-one. +func negateSCEV(s SCEV) SCEV { + switch v := s.(type) { + case *SCEVConstant: + return &SCEVConstant{Value: new(big.Int).Neg(v.Value)} + case *SCEVAddRec: + return &SCEVAddRec{Start: negateSCEV(v.Start), Step: negateSCEV(v.Step), Loop: v.Loop} + default: + return &SCEVGenericExpr{Op: token.MUL, X: s, Y: &SCEVConstant{Value: big.NewInt(-1)}} + } +} diff --git a/pkg/analysis/loop/scev_fold_test.go b/pkg/analysis/loop/scev_fold_test.go new file mode 100644 index 0000000..6464fea --- /dev/null +++ b/pkg/analysis/loop/scev_fold_test.go @@ -0,0 +1,130 @@ +package loop + +import ( + "go/token" + "math/big" + "testing" +) + +// scevConst is a small constructor helper for readable test expressions. +func scevConst(n int64) *SCEVConstant { return &SCEVConstant{Value: big.NewInt(n)} } + +// evalAt evaluates a SCEV at iteration k, failing the test if it is unknown. +func evalAt(t *testing.T, s SCEV, k int64) *big.Int { + t.Helper() + v := s.EvaluateAt(big.NewInt(k), nil) + if v == nil { + t.Fatalf("SCEV %s did not evaluate at k=%d", s.String(), k) + } + return v +} + +// applyOp mirrors foldSCEV's arithmetic so tests can assert that folding is +// value-preserving rather than just structurally plausible. +func applyOp(op token.Token, x, y *big.Int) *big.Int { + res := new(big.Int) + switch op { + case token.ADD: + return res.Add(x, y) + case token.SUB: + return res.Sub(x, y) + case token.MUL: + return res.Mul(x, y) + } + panic("unsupported op") +} + +// assertValuePreserving checks that folding left op right yields a SCEV whose +// value matches the operator applied pointwise across several iterations. +func assertValuePreserving(t *testing.T, op token.Token, left, right SCEV, l *Loop) SCEV { + t.Helper() + folded := foldSCEV(op, left, right, l) + for k := int64(0); k < 6; k++ { + got := evalAt(t, folded, k) + want := applyOp(op, evalAt(t, left, k), evalAt(t, right, k)) + if got.Cmp(want) != 0 { + t.Fatalf("foldSCEV(%s) at k=%d: got %s, want %s (folded=%s)", + op, k, got, want, folded.String()) + } + } + return folded +} + +func TestFoldConstants(t *testing.T) { + t.Parallel() + l := &Loop{} + got := foldSCEV(token.ADD, scevConst(7), scevConst(5), l) + c, ok := got.(*SCEVConstant) + if !ok { + t.Fatalf("constant ADD did not fold to a constant: %T", got) + } + if c.Value.Int64() != 12 { + t.Errorf("7 + 5: got %s, want 12", c.Value) + } + // QUO is intentionally not folded so truncation never hides in the IR. + if _, ok := foldSCEV(token.QUO, scevConst(8), scevConst(2), l).(*SCEVGenericExpr); !ok { + t.Error("QUO of constants should remain an opaque expression") + } +} + +func TestFoldAddRecWithInvariant(t *testing.T) { + t.Parallel() + l := &Loop{} + rec := &SCEVAddRec{Start: scevConst(1), Step: scevConst(2), Loop: l} // {1,+,2} + + // {1,+,2} + 5 -> {6,+,2} + folded := assertValuePreserving(t, token.ADD, rec, scevConst(5), l) + ar, ok := folded.(*SCEVAddRec) + if !ok { + t.Fatalf("AddRec + const did not stay affine: %T", folded) + } + if ar.Loop != l { + t.Error("folded recurrence lost its loop association") + } + + // 5 + {1,+,2} -> {6,+,2} (commuted) + if _, ok := assertValuePreserving(t, token.ADD, scevConst(5), rec, l).(*SCEVAddRec); !ok { + t.Error("const + AddRec did not stay affine") + } + + // {1,+,2} - 5 -> {-4,+,2} + if _, ok := assertValuePreserving(t, token.SUB, rec, scevConst(5), l).(*SCEVAddRec); !ok { + t.Error("AddRec - const did not stay affine") + } + + // 10 - {1,+,2} -> {9,+,-2} (step negated) + folded = assertValuePreserving(t, token.SUB, scevConst(10), rec, l) + ar, ok = folded.(*SCEVAddRec) + if !ok { + t.Fatalf("const - AddRec did not stay affine: %T", folded) + } + if evalAt(t, ar.Step, 0).Int64() != -2 { + t.Errorf("const - AddRec: step should be negated, got %s", ar.Step.String()) + } + + // {1,+,2} * 3 -> {3,+,6} + if _, ok := assertValuePreserving(t, token.MUL, rec, scevConst(3), l).(*SCEVAddRec); !ok { + t.Error("AddRec * const did not stay affine") + } +} + +func TestFoldAddRecWithAddRec(t *testing.T) { + t.Parallel() + l := &Loop{} + a := &SCEVAddRec{Start: scevConst(1), Step: scevConst(2), Loop: l} // {1,+,2} + b := &SCEVAddRec{Start: scevConst(3), Step: scevConst(4), Loop: l} // {3,+,4} + + // {1,+,2} + {3,+,4} -> {4,+,6} + if _, ok := assertValuePreserving(t, token.ADD, a, b, l).(*SCEVAddRec); !ok { + t.Error("AddRec + AddRec (same loop) did not stay affine") + } + // {1,+,2} - {3,+,4} -> {-2,+,-2} + if _, ok := assertValuePreserving(t, token.SUB, a, b, l).(*SCEVAddRec); !ok { + t.Error("AddRec - AddRec (same loop) did not stay affine") + } + + // Recurrence * recurrence is quadratic and must stay opaque. + if _, ok := foldSCEV(token.MUL, a, b, l).(*SCEVGenericExpr); !ok { + t.Error("AddRec * AddRec should not be folded into an affine recurrence") + } +} diff --git a/pkg/analysis/loop/scev_test.go b/pkg/analysis/loop/scev_test.go index abeee02..5aae0fa 100644 --- a/pkg/analysis/loop/scev_test.go +++ b/pkg/analysis/loop/scev_test.go @@ -9,8 +9,8 @@ import ( "go/token" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/loop" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/testutil" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/loop" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/testutil" ) func TestSCEVPatterns(t *testing.T) { diff --git a/pkg/analysis/topology/bench_test.go b/pkg/analysis/topology/bench_test.go index 57559b3..86437f2 100644 --- a/pkg/analysis/topology/bench_test.go +++ b/pkg/analysis/topology/bench_test.go @@ -3,7 +3,7 @@ package topology_test import ( "testing" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/topology" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/topology" ) func BenchmarkEntropyCalculation(b *testing.B) { diff --git a/pkg/analysis/topology/entropy_test.go b/pkg/analysis/topology/entropy_test.go index 87691d8..dfc769d 100644 --- a/pkg/analysis/topology/entropy_test.go +++ b/pkg/analysis/topology/entropy_test.go @@ -4,7 +4,7 @@ import ( "math" "testing" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/topology" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/topology" ) func TestCalculateEntropy(t *testing.T) { diff --git a/pkg/analysis/topology/string_extraction_test.go b/pkg/analysis/topology/string_extraction_test.go index b3d581e..250625f 100644 --- a/pkg/analysis/topology/string_extraction_test.go +++ b/pkg/analysis/topology/string_extraction_test.go @@ -3,8 +3,8 @@ package topology_test import ( "testing" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/topology" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/testutil" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/topology" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/testutil" ) // TestStringExtraction verifies that string literals are extracted without quotes. diff --git a/pkg/analysis/topology/topology.go b/pkg/analysis/topology/topology.go index c81c2b0..1495138 100644 --- a/pkg/analysis/topology/topology.go +++ b/pkg/analysis/topology/topology.go @@ -12,7 +12,7 @@ import ( "sync" "unicode/utf8" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/loop" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/loop" "golang.org/x/tools/go/ssa" ) diff --git a/pkg/analysis/topology/topology_test.go b/pkg/analysis/topology/topology_test.go index 92f891a..aec391c 100644 --- a/pkg/analysis/topology/topology_test.go +++ b/pkg/analysis/topology/topology_test.go @@ -5,8 +5,8 @@ import ( "sync" "testing" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/topology" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/testutil" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/topology" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/testutil" ) func TestExtractTopology(t *testing.T) { diff --git a/pkg/api/api_test.go b/pkg/api/api_test.go new file mode 100644 index 0000000..f1012f7 --- /dev/null +++ b/pkg/api/api_test.go @@ -0,0 +1,112 @@ +package api_test + +import ( + "os" + "path/filepath" + "testing" + + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/api" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/models" +) + +// TestShortFunctionName covers the cases that used to live in +// internal/cli's test suite, pinning the moved implementation against +// regressions. +func TestShortFunctionName(t *testing.T) { + cases := []struct { + in, want string + }{ + {"fmt.Println", "Println"}, + {"main.main", "main"}, + {"github.com/user/repo/pkg.Func", "Func"}, + {"pkg.(*Type).Method", "(*Type).Method"}, + {"(*pkg.Type).Method", "(*Type).Method"}, + {"pkg.Func[int]", "Func[int]"}, + {"pkg.Func[a/b.T]", "Func[a/b.T]"}, + {"pkg.Type[sub.T].Method", "Type[sub.T].Method"}, + {"Type[sub.T].Method", "Type[sub.T].Method"}, + } + for _, tc := range cases { + if got := api.ShortFunctionName(tc.in); got != tc.want { + t.Errorf("ShortFunctionName(%q) = %q; want %q", tc.in, got, tc.want) + } + } +} + +const srcA = `package x + +func Add(a, b int) int { + return a + b +} +` + +const srcB = `package x + +func Add(a, b int) int { + if a < 0 { + return -1 + } + return a + b +} +` + +// TestDiff_DetectsModification exercises the public Diff entry point +// on real files, confirming the wiring (RealFileSystem, SSA build, +// topology match, summary aggregation) survives the move out of +// internal/cli. +func TestDiff_DetectsModification(t *testing.T) { + dir := t.TempDir() + oldPath := filepath.Join(dir, "a.go") + newPath := filepath.Join(dir, "b.go") + if err := os.WriteFile(oldPath, []byte(srcA), 0o644); err != nil { + t.Fatalf("write a: %v", err) + } + if err := os.WriteFile(newPath, []byte(srcB), 0o644); err != nil { + t.Fatalf("write b: %v", err) + } + + out, err := api.Diff(oldPath, newPath) + if err != nil { + t.Fatalf("Diff: %v", err) + } + if out.OldFile != oldPath || out.NewFile != newPath { + t.Errorf("paths not echoed: %+v", out) + } + if out.Summary.TotalFunctions == 0 { + t.Fatal("expected at least one function") + } + + var found bool + for _, f := range out.Functions { + if f.Function == "Add" { + found = true + if f.Status != models.StatusModified { + t.Errorf("Add status = %q; want modified", f.Status) + } + } + } + if !found { + t.Fatal("Add not present in diff output") + } +} + +// TestDiff_MissingFileTreatedAsEmpty verifies that an absent path is +// interpreted as "no functions" (the added/removed-file case) rather +// than an error, which the GitHub Action depends on when one side of +// the diff doesn't exist yet. +func TestDiff_MissingFileTreatedAsEmpty(t *testing.T) { + dir := t.TempDir() + present := filepath.Join(dir, "present.go") + missing := filepath.Join(dir, "missing.go") + if err := os.WriteFile(present, []byte(srcA), 0o644); err != nil { + t.Fatalf("write: %v", err) + } + + out, err := api.Diff(missing, present) + if err != nil { + t.Fatalf("Diff: %v", err) + } + if out.Summary.Added == 0 { + t.Errorf("expected added > 0 when old side missing; got %+v", out.Summary) + } +} diff --git a/pkg/api/diff.go b/pkg/api/diff.go new file mode 100644 index 0000000..98badb8 --- /dev/null +++ b/pkg/api/diff.go @@ -0,0 +1,302 @@ +package api + +import ( + "fmt" + "os" + "strings" + + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/ir" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/topology" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/diff" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/models" +) + +// Diff is the convenience entry point: it computes a semantic diff +// between two on-disk Go source files using the real filesystem. +// Either path may be empty or non-existent to represent an added or +// removed file. +func Diff(oldPath, newPath string) (*models.DiffOutput, error) { + return DiffWithFS(RealFileSystem{}, oldPath, newPath) +} + +// DiffWithFS performs a semantic diff against the supplied FileSystem +// so tests can drive the pipeline without touching disk. Behaviour is +// identical to Diff otherwise. +func DiffWithFS(fsys FileSystem, oldPath, newPath string) (*models.DiffOutput, error) { + processFile := func(path string) ([]diff.FingerprintResult, error) { + if path == "" { + return []diff.FingerprintResult{}, nil + } + info, statErr := fsys.Stat(path) + if statErr != nil { + // A missing file is a valid "no functions" case (added/removed + // file). Any other stat error must surface so we never + // dereference the nil FileInfo below. + if os.IsNotExist(statErr) { + return []diff.FingerprintResult{}, nil + } + return nil, fmt.Errorf("failed to stat %s: %w", path, statErr) + } + if info.Size() > models.MaxSourceFileSize { + return nil, fmt.Errorf("file %s exceeds maximum analysis size of %d bytes", path, models.MaxSourceFileSize) + } + content, err := fsys.ReadFile(path) + if err != nil { + return nil, err + } + absPath, err := fsys.Abs(path) + if err != nil { + absPath = path + } + return diff.FingerprintSource(absPath, string(content), ir.DefaultLiteralPolicy) + } + + oldResults, err := processFile(oldPath) + if err != nil { + return nil, fmt.Errorf("failed to process old file: %w", err) + } + newResults, err := processFile(newPath) + if err != nil { + return nil, fmt.Errorf("failed to process new file: %w", err) + } + + matched, addedFuncs, removedFuncs := diff.MatchFunctionsByTopology( + oldResults, newResults, models.DefaultTopologyMatchThreshold, + ) + + var functionDiffs []models.FunctionDiff + var topologyMatches []models.TopologyMatchInfo + preserved, modified, renamed, highRisk := 0, 0, 0, 0 + + for _, m := range matched { + oldShort := ShortFunctionName(m.OldResult.FunctionName) + newShort := ShortFunctionName(m.NewResult.FunctionName) + + d := CompareFunctions(oldShort, m.OldResult, m.NewResult) + + if d.Status == models.StatusModified && m.OldTopology != nil && m.NewTopology != nil { + delta, riskScore := CalculateTopologyDelta(m.OldTopology, m.NewTopology) + d.TopologyDelta = delta + d.RiskScore = riskScore + if riskScore >= models.RiskScoreHigh { + highRisk++ + } + } + + if !m.ByName { + d.Function = fmt.Sprintf("%s → %s", oldShort, newShort) + d.Status = models.StatusRenamed + renamed++ + } + + functionDiffs = append(functionDiffs, d) + + if d.Status == models.StatusPreserved { + preserved++ + } else { + modified++ + } + + oldTopoStr := "" + if m.OldTopology != nil { + oldTopoStr = topology.TopologyFingerprint(m.OldTopology) + } + newTopoStr := "" + if m.NewTopology != nil { + newTopoStr = topology.TopologyFingerprint(m.NewTopology) + } + + topologyMatches = append(topologyMatches, models.TopologyMatchInfo{ + OldFunction: oldShort, + NewFunction: newShort, + Similarity: m.Similarity, + MatchedByName: m.ByName, + OldTopology: oldTopoStr, + NewTopology: newTopoStr, + }) + } + + for _, r := range addedFuncs { + risk := models.BaseRiskAddedFunc + delta := models.TopoDeltaNew + + fn := r.GetSSAFunction() + if fn != nil { + topo := topology.ExtractTopology(fn) + if topo != nil { + d, s := CalculateTopologyDelta(nil, topo) + delta = d + risk += s + } + } + + if risk >= models.RiskScoreHigh { + highRisk++ + } + + functionDiffs = append(functionDiffs, models.FunctionDiff{ + Function: ShortFunctionName(r.FunctionName), + Status: models.StatusAdded, + NewFingerprint: r.Fingerprint, + RiskScore: risk, + TopologyDelta: delta, + }) + } + + for _, r := range removedFuncs { + functionDiffs = append(functionDiffs, models.FunctionDiff{ + Function: ShortFunctionName(r.FunctionName), + Status: models.StatusRemoved, + OldFingerprint: r.Fingerprint, + }) + } + + added := len(addedFuncs) + removed := len(removedFuncs) + total := len(matched) + added + removed + matchPct := 0.0 + topoMatchPct := 0.0 + if total > 0 { + matchPct = float64(preserved) / float64(total) * 100.0 + } + if len(matched) > 0 { + topoMatchPct = float64(len(matched)) / float64(total) * 100.0 + } + + return &models.DiffOutput{ + OldFile: oldPath, + NewFile: newPath, + Summary: models.DiffSummary{ + TotalFunctions: total, + Preserved: preserved, + Modified: modified, + Added: added, + Removed: removed, + SemanticMatchPct: matchPct, + TopologyMatchedPct: topoMatchPct, + RenamedFunctions: renamed, + HighRiskChanges: highRisk, + }, + Functions: functionDiffs, + TopologyMatches: topologyMatches, + }, nil +} + +// CalculateTopologyDelta diffs two function topologies and returns a +// short human label plus a heuristic risk score. Adding goroutines, +// loops, or external calls weighs the score up; removing structure +// is neutral. +func CalculateTopologyDelta(oldT, newT *topology.FunctionTopology) (string, int) { + if newT == nil { + return models.TopoDeltaUnknown, 0 + } + if oldT == nil { + oldT = &topology.FunctionTopology{} + } + + var deltas []string + riskScore := 0 + + callDiff := len(newT.CallSignatures) - len(oldT.CallSignatures) + if callDiff > 0 { + deltas = append(deltas, fmt.Sprintf("Calls+%d", callDiff)) + riskScore += callDiff * 5 + } else if callDiff < 0 { + deltas = append(deltas, fmt.Sprintf("Calls%d", callDiff)) + } + + loopDiff := newT.LoopCount - oldT.LoopCount + if loopDiff > 0 { + deltas = append(deltas, fmt.Sprintf("Loops+%d", loopDiff)) + riskScore += loopDiff * 10 + } else if loopDiff < 0 { + deltas = append(deltas, fmt.Sprintf("Loops%d", loopDiff)) + } + + branchDiff := newT.BranchCount - oldT.BranchCount + if branchDiff > 0 { + deltas = append(deltas, fmt.Sprintf("Branches+%d", branchDiff)) + riskScore += branchDiff * 2 + } else if branchDiff < 0 { + deltas = append(deltas, fmt.Sprintf("Branches%d", branchDiff)) + } + + if newT.HasGo && !oldT.HasGo { + deltas = append(deltas, models.TopoDeltaGoroutine) + riskScore += 15 + } + + if newT.HasDefer && !oldT.HasDefer { + deltas = append(deltas, models.TopoDeltaDefer) + riskScore += 3 + } + + if newT.HasPanic && !oldT.HasPanic { + deltas = append(deltas, models.TopoDeltaPanic) + riskScore += 5 + } + + entropyDiff := newT.EntropyScore - oldT.EntropyScore + if entropyDiff > 1.0 { + deltas = append(deltas, fmt.Sprintf("Entropy+%.1f", entropyDiff)) + riskScore += int(entropyDiff * 3) + } + + if len(deltas) == 0 { + return models.TopoDeltaNone, 0 + } + + return strings.Join(deltas, ", "), riskScore +} + +// CompareFunctions reduces two FingerprintResult values to a +// FunctionDiff. When the canonical IR fingerprints match the function +// is preserved verbatim; otherwise the Zipper algorithm computes the +// structural delta, falling back to "modified" if SSA reconstruction +// fails. +func CompareFunctions(funcName string, oldResult, newResult diff.FingerprintResult) models.FunctionDiff { + d := models.FunctionDiff{ + Function: funcName, + OldFingerprint: oldResult.Fingerprint, + NewFingerprint: newResult.Fingerprint, + } + + if oldResult.Fingerprint == newResult.Fingerprint { + d.Status = models.StatusPreserved + d.FingerprintMatch = true + return d + } + + d.FingerprintMatch = false + oldFn := oldResult.GetSSAFunction() + newFn := newResult.GetSSAFunction() + + if oldFn == nil || newFn == nil { + d.Status = models.StatusModified + return d + } + + zipper, err := diff.NewZipper(oldFn, newFn, ir.DefaultLiteralPolicy) + if err != nil { + d.Status = models.StatusModified + return d + } + + artifacts, err := zipper.ComputeDiff() + if err != nil { + d.Status = models.StatusModified + return d + } + + d.MatchedNodes = artifacts.MatchedNodes + d.AddedOps = artifacts.Added + d.RemovedOps = artifacts.Removed + + if artifacts.Preserved { + d.Status = models.StatusPreserved + } else { + d.Status = models.StatusModified + } + + return d +} diff --git a/pkg/api/doc.go b/pkg/api/doc.go new file mode 100644 index 0000000..bc09ef8 --- /dev/null +++ b/pkg/api/doc.go @@ -0,0 +1,10 @@ +// Package api is the public, library-shaped surface of Semantic +// Firewall. The CLI in cmd/sfw and the MCP server in the sibling +// semantic_firewall_mcp repo both consume it; everything in +// internal/cli is implementation glue that orchestrates flag parsing +// and process boundaries on top of the entry points defined here. +// +// The entry points return the same JSON-serialisable types from +// pkg/models that the CLI prints, so callers can either marshal them +// directly or inspect the structured values in-process. +package api diff --git a/pkg/api/filesystem.go b/pkg/api/filesystem.go new file mode 100644 index 0000000..73d035a --- /dev/null +++ b/pkg/api/filesystem.go @@ -0,0 +1,68 @@ +package api + +import ( + "fmt" + "io" + "io/fs" + "os" + "path/filepath" + + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/models" +) + +// FileSystem abstracts the OS file operations the analysis pipeline +// needs. Production code uses RealFileSystem; tests can substitute a +// mock to drive the pipeline without touching disk. +type FileSystem interface { + Stat(name string) (os.FileInfo, error) + Open(name string) (fs.File, error) + Getwd() (string, error) + Abs(path string) (string, error) + WalkDir(root string, fn fs.WalkDirFunc) error + ReadFile(name string) ([]byte, error) +} + +// RealFileSystem is the production FileSystem implementation backed +// by the os and filepath packages. ReadFile bounds the read at +// models.MaxSourceFileSize so a hostile/oversize input cannot exhaust +// memory. +type RealFileSystem struct{} + +func (RealFileSystem) Stat(name string) (os.FileInfo, error) { return os.Stat(name) } +func (RealFileSystem) Open(name string) (fs.File, error) { return os.Open(name) } +func (RealFileSystem) Getwd() (string, error) { return os.Getwd() } +func (RealFileSystem) Abs(path string) (string, error) { return filepath.Abs(path) } +func (RealFileSystem) WalkDir(root string, fn fs.WalkDirFunc) error { + return filepath.WalkDir(root, fn) +} + +func (RealFileSystem) ReadFile(name string) ([]byte, error) { + f, err := os.Open(name) + if err != nil { + return nil, err + } + defer f.Close() + + info, err := f.Stat() + if err != nil { + return nil, err + } + if info.IsDir() { + return nil, fmt.Errorf("path is a directory: %s", name) + } + if info.Size() > models.MaxSourceFileSize { + return nil, fmt.Errorf("file exceeds maximum supported size of %d bytes", models.MaxSourceFileSize) + } + + // Read one byte past the cap so an under-reported size can still be + // detected and rejected, rather than silently truncated. + limit := int64(models.MaxSourceFileSize + 1) + content, err := io.ReadAll(io.LimitReader(f, limit)) + if err != nil { + return nil, err + } + if len(content) > models.MaxSourceFileSize { + return nil, fmt.Errorf("file exceeds maximum supported size of %d bytes", models.MaxSourceFileSize) + } + return content, nil +} diff --git a/pkg/api/names.go b/pkg/api/names.go new file mode 100644 index 0000000..79b0a97 --- /dev/null +++ b/pkg/api/names.go @@ -0,0 +1,114 @@ +package api + +import ( + "fmt" + "strings" +) + +// ShortFunctionName strips package paths and qualifying identifiers +// from a Go SSA function name so it is readable in diffs and reports. +// +// It handles three shapes: +// - "fmt.Println" -> "Println" +// - "pkg.(*Type).Method" / "(*pkg.Type).Method" -> "(*Type).Method" +// - Generics like "pkg.Func[a/b.T]" -> "Func[a/b.T]" +// +// Brackets and parens are tracked so qualified types inside generic +// parameters or receiver positions are preserved verbatim. +func ShortFunctionName(fullName string) string { + // Receiver form: "(*pkg.Type).Method" or "(pkg.Type).Method". + // Recurse into the receiver type so its package qualifier is + // stripped while the pointer marker and method tail survive. + if strings.HasPrefix(fullName, "(") { + depth := 0 + closeIndex := -1 + for i, c := range fullName { + if c == '(' { + depth++ + } else if c == ')' { + depth-- + if depth == 0 { + closeIndex = i + break + } + } + } + + if closeIndex > 1 { + receiver := fullName[1:closeIndex] // e.g. "*pkg.Type" + rest := fullName[closeIndex+1:] // e.g. ".Method" + + prefix := "" + if strings.HasPrefix(receiver, "*") { + prefix = "*" + receiver = receiver[1:] + } + + cleanReceiver := ShortFunctionName(receiver) + return fmt.Sprintf("(%s%s)%s", prefix, cleanReceiver, rest) + } + } + + // Backward scan to strip "github.com/...../pkg." style import paths. + // Brackets and parens prevent splitting inside generics like + // "Func[a/b.T]" where the slash is part of a type argument. + end := len(fullName) - 1 + depthBrackets := 0 + depthParens := 0 + splitIndex := -1 + + for i := end; i >= 0; i-- { + b := fullName[i] + switch b { + case ']': + depthBrackets++ + case '[': + depthBrackets-- + case ')': + depthParens++ + case '(': + depthParens-- + case '/': + if depthBrackets == 0 && depthParens == 0 { + splitIndex = i + goto FoundSplit + } + } + } +FoundSplit: + name := fullName + if splitIndex >= 0 { + name = fullName[splitIndex+1:] + } + + // Forward scan for the first top-level dot. If the prefix carries + // brackets/parens it is part of the type signature (e.g. + // "Type[T].Method") and must be preserved; otherwise it is a bare + // package name that can be stripped. + depth := 0 + for i, ch := range name { + switch ch { + case '(': + depth++ + case ')': + depth-- + case '[': + depth++ + case ']': + depth-- + case '.': + if depth == 0 { + prefix := name[:i] + if !containsSpecial(prefix) { + return name[i+1:] + } + return name + } + } + } + return name +} + +func containsSpecial(s string) bool { + return strings.ContainsAny(s, "[]()") +} diff --git a/pkg/detection/bench_test.go b/pkg/detection/bench_test.go index 0d49d0d..e056715 100644 --- a/pkg/detection/bench_test.go +++ b/pkg/detection/bench_test.go @@ -4,7 +4,7 @@ import ( "fmt" "testing" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/topology" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/topology" ) func BenchmarkGenerateTopologyHash(b *testing.B) { diff --git a/pkg/detection/engine.go b/pkg/detection/engine.go index 9d13564..ba6d761 100644 --- a/pkg/detection/engine.go +++ b/pkg/detection/engine.go @@ -7,7 +7,7 @@ import ( "strconv" "strings" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/topology" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/topology" ) // MatchSignature checks a function topology against a signature. diff --git a/pkg/detection/engine_test.go b/pkg/detection/engine_test.go index fcf289e..7fc5fcc 100644 --- a/pkg/detection/engine_test.go +++ b/pkg/detection/engine_test.go @@ -3,8 +3,8 @@ package detection_test import ( "testing" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/topology" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/detection" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/topology" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/detection" ) func TestMatchSignature(t *testing.T) { diff --git a/pkg/diff/fingerprinter.go b/pkg/diff/fingerprinter.go index 1dd16a5..0ad8b84 100644 --- a/pkg/diff/fingerprinter.go +++ b/pkg/diff/fingerprinter.go @@ -11,7 +11,7 @@ import ( "sort" "strings" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/ir" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/ir" "golang.org/x/tools/go/packages" "golang.org/x/tools/go/ssa" ) diff --git a/pkg/diff/fingerprinter_test.go b/pkg/diff/fingerprinter_test.go index d8ea205..fd4214a 100644 --- a/pkg/diff/fingerprinter_test.go +++ b/pkg/diff/fingerprinter_test.go @@ -5,9 +5,9 @@ import ( "strings" "testing" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/ir" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/diff" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/testutil" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/ir" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/diff" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/testutil" ) func TestRegression_LogicCorruption(t *testing.T) { diff --git a/pkg/diff/fuzz_test.go b/pkg/diff/fuzz_test.go index 12db831..fa5fd7d 100644 --- a/pkg/diff/fuzz_test.go +++ b/pkg/diff/fuzz_test.go @@ -5,9 +5,9 @@ import ( "path/filepath" "testing" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/ir" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/diff" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/testutil" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/ir" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/diff" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/testutil" ) func FuzzFingerprintSource(f *testing.F) { diff --git a/pkg/diff/topology_match.go b/pkg/diff/topology_match.go index 19af373..59f1217 100644 --- a/pkg/diff/topology_match.go +++ b/pkg/diff/topology_match.go @@ -4,7 +4,7 @@ import ( "sort" "strings" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/topology" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/topology" ) type TopologyMatch struct { diff --git a/pkg/diff/zipper.go b/pkg/diff/zipper.go index ca7fd50..adb0dc9 100644 --- a/pkg/diff/zipper.go +++ b/pkg/diff/zipper.go @@ -9,7 +9,7 @@ import ( "strconv" "strings" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/ir" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/ir" "golang.org/x/tools/go/ssa" ) diff --git a/pkg/models/types.go b/pkg/models/types.go index 9a78ebf..105d685 100644 --- a/pkg/models/types.go +++ b/pkg/models/types.go @@ -3,7 +3,7 @@ package models import ( "encoding/json" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/detection" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/detection" ) // -- Diff & Fingerprinting -- diff --git a/pkg/storage/jsondb/json_store.go b/pkg/storage/jsondb/json_store.go index 8feeb52..dd1f468 100644 --- a/pkg/storage/jsondb/json_store.go +++ b/pkg/storage/jsondb/json_store.go @@ -12,8 +12,8 @@ import ( "sort" "sync" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/topology" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/detection" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/topology" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/detection" ) const ( @@ -64,8 +64,13 @@ func (s *Scanner) LoadDatabase(path string) error { // Verify file existence explicitly. // While Open handles this, a Stat check lets us give a more useful error message. info, err := os.Stat(cleanPath) - if os.IsNotExist(err) { - return fmt.Errorf("signature database file does not exist at %s", cleanPath) + if err != nil { + if os.IsNotExist(err) { + return fmt.Errorf("signature database file does not exist at %s", cleanPath) + } + // Any other stat error (permission denied, EIO, etc.) must surface as an + // error rather than dereferencing the nil FileInfo below. + return fmt.Errorf("failed to stat %s: %w", cleanPath, err) } // Refuse to read named pipes or devices. @@ -248,6 +253,9 @@ func (s *Scanner) AddSignatures(sigs []detection.Signature) error { Description: "Semantic Firewall Malware Signature Database", } } + if s.sigMap == nil { + s.sigMap = make(map[string]int, len(sigs)) + } for i := range sigs { sig := &sigs[i] @@ -257,6 +265,11 @@ func (s *Scanner) AddSignatures(sigs []detection.Signature) error { } s.db.Signatures = append(s.db.Signatures, *sig) + // Keep the ID index in sync. Without this, GetSignature returns + // "not found" for every signature added via a batch until the + // database is reloaded -- the bulk-import path of MigrateFromJSON + // is the realistic trigger. + s.sigMap[sig.ID] = len(s.db.Signatures) - 1 } return nil } @@ -303,9 +316,14 @@ func (s *Scanner) ScanCandidates(topo *topology.FunctionTopology) ([]*detection. match := sig.TopologyHash == topoHash || (sig.FuzzyHash != "" && sig.FuzzyHash == fuzzyHash) if match { - // Respect the signature's tolerance. - // If it demands 0.0 variance, we give it 0.0 variance. + // Respect the signature's tolerance, falling back to the scanner + // default when the signature leaves it unset (0). A zero value means + // "unspecified", not "demand an exact entropy match" -- this matches + // the behaviour of MatchSignature and the PebbleDB scanner. effectiveTol := sig.EntropyTolerance + if effectiveTol == 0 { + effectiveTol = s.entropyTolerance + } if math.Abs(sig.EntropyScore-topo.EntropyScore) <= effectiveTol { // Deep copy allows the caller to mutate their candidate list safely. diff --git a/pkg/storage/jsondb/json_store_test.go b/pkg/storage/jsondb/json_store_test.go index b1be317..056c696 100644 --- a/pkg/storage/jsondb/json_store_test.go +++ b/pkg/storage/jsondb/json_store_test.go @@ -2,11 +2,41 @@ package jsondb import ( "fmt" + "os" + "path/filepath" + "strings" "testing" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/detection" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/detection" ) +func TestLoadDatabase_PermissionDeniedDoesNotPanic(t *testing.T) { + if os.Geteuid() == 0 { + t.Skip("running as root bypasses unix permission checks") + } + + dir := t.TempDir() + dbPath := filepath.Join(dir, "sigs.json") + if err := os.WriteFile(dbPath, []byte(`{"signatures":[]}`), 0o600); err != nil { + t.Fatalf("write db: %v", err) + } + // Strip read+execute from the containing directory so stat fails with EACCES + // rather than ENOENT. This is the case the old code crashed on. + if err := os.Chmod(dir, 0o000); err != nil { + t.Fatalf("chmod dir: %v", err) + } + t.Cleanup(func() { _ = os.Chmod(dir, 0o700) }) + + s := NewScanner() + err := s.LoadDatabase(dbPath) + if err == nil { + t.Fatal("expected LoadDatabase to error on permission denied, got nil") + } + if strings.Contains(err.Error(), "does not exist") { + t.Errorf("permission error misreported as missing file: %v", err) + } +} + func TestAddSignatures(t *testing.T) { s := NewScanner() sigs := []detection.Signature{ @@ -29,6 +59,30 @@ func TestAddSignatures(t *testing.T) { } } +func TestAddSignatures_UpdatesSigMap(t *testing.T) { + s := NewScanner() + sigs := []detection.Signature{ + {ID: "SFW-BATCH-1", Name: "Sig1"}, + {ID: "SFW-BATCH-2", Name: "Sig2"}, + } + if err := s.AddSignatures(sigs); err != nil { + t.Fatalf("AddSignatures failed: %v", err) + } + + // GetSignature relies on sigMap; if the batch path skipped indexing it + // returns "not found" even though the signature is present in the slice. + for _, want := range sigs { + got, err := s.GetSignature(want.ID) + if err != nil { + t.Errorf("GetSignature(%q) after batch insert: %v", want.ID, err) + continue + } + if got.Name != want.Name { + t.Errorf("GetSignature(%q).Name = %q, want %q", want.ID, got.Name, want.Name) + } + } +} + const count = 10000 func BenchmarkAddSignatureLoop(b *testing.B) { diff --git a/pkg/storage/pebbledb/store.go b/pkg/storage/pebbledb/store.go index 4db1941..6271bc6 100644 --- a/pkg/storage/pebbledb/store.go +++ b/pkg/storage/pebbledb/store.go @@ -18,8 +18,8 @@ import ( "sync" "time" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/topology" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/detection" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/topology" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/detection" "github.com/cockroachdb/pebble" ) @@ -39,8 +39,12 @@ const ( // CurrentDBVersion tracks the semantic version of the data format. CurrentDBVersion = "3.0.0" - // CurrentSchemaVersion enforces binary compatibility. - // Increment this only if the fundamental serialization format (e.g. Gob struct shape) changes. + // CurrentSchemaVersion enforces database compatibility. A database whose + // stored version differs from this constant is rejected: a newer version + // cannot be understood, and an older version holds fingerprints produced by + // an incompatible algorithm. Increment this whenever the serialization + // format (Gob struct shape) OR the fingerprint/canonicalization algorithm + // changes, since either makes previously stored signatures unmatchable. CurrentSchemaVersion = 3 // BatchSizeLimitBytes limits the memory usage of a batch before commit (10MB). @@ -166,6 +170,17 @@ func NewPebbleScanner(dbPath string, opts PebbleScannerOptions) (*PebbleScanner, db.Close() return nil, fmt.Errorf("database schema version %d is newer than binary supported version %d; please upgrade sfw", dbVer, CurrentSchemaVersion) } + if dbVer < CurrentSchemaVersion { + // Fail closed: signatures stored under an older schema were + // fingerprinted by a prior canonicalization algorithm and will + // silently fail to match code scanned by this binary. Rebuilding + // the indexes cannot fix this -- the stored TopologyHash values + // themselves are stale -- so the database must be regenerated by + // re-indexing the original samples with this version of sfw. + db.Close() + return nil, fmt.Errorf("database schema version %d is older than binary version %d; "+ + "its fingerprints are incompatible -- rebuild the database by re-indexing your samples with this version of sfw", dbVer, CurrentSchemaVersion) + } } } else if !opts.ReadOnly { // Initialize schema version for new/legacy databases diff --git a/pkg/storage/pebbledb/store_test.go b/pkg/storage/pebbledb/store_test.go index d24ba22..03ecd29 100644 --- a/pkg/storage/pebbledb/store_test.go +++ b/pkg/storage/pebbledb/store_test.go @@ -2,13 +2,35 @@ package pebbledb_test import ( "fmt" + "strings" "testing" "time" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/detection" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/storage/pebbledb" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/detection" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/storage/pebbledb" ) +func TestPebbleScanner_RejectsOlderSchema(t *testing.T) { + dbPath := t.TempDir() + + s, err := pebbledb.NewPebbleScanner(dbPath, pebbledb.DefaultPebbleScannerOptions()) + if err != nil { + t.Fatalf("NewPebbleScanner failed: %v", err) + } + // Simulate a database written by an older sfw whose fingerprint algorithm + // predates the current canonicalizer. + if err := s.SetMetadata("schema_version", fmt.Sprintf("%d", pebbledb.CurrentSchemaVersion-1)); err != nil { + t.Fatalf("SetMetadata failed: %v", err) + } + s.Close() + + if _, err := pebbledb.NewPebbleScanner(dbPath, pebbledb.DefaultPebbleScannerOptions()); err == nil { + t.Fatal("expected NewPebbleScanner to reject a database with an older schema version") + } else if !strings.Contains(err.Error(), "older") { + t.Errorf("expected an 'older schema' error, got: %v", err) + } +} + func TestPebbleScanner_CRUD(t *testing.T) { dbPath := t.TempDir() diff --git a/pkg/storage/provider.go b/pkg/storage/provider.go index 6ab60f4..2a95262 100644 --- a/pkg/storage/provider.go +++ b/pkg/storage/provider.go @@ -1,8 +1,8 @@ package storage import ( - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/topology" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/detection" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/topology" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/detection" ) // SignatureProvider defines the contract for signature persistence and retrieval. diff --git a/pkg/testutil/helpers.go b/pkg/testutil/helpers.go index 3c56c83..e6af9bf 100644 --- a/pkg/testutil/helpers.go +++ b/pkg/testutil/helpers.go @@ -5,10 +5,10 @@ import ( "strings" "testing" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/topology" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/detection" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/diff" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/storage/pebbledb" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/topology" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/detection" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/diff" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/storage/pebbledb" ) // FindResult searches for a FingerprintResult by function name. diff --git a/pkg/testutil/setup.go b/pkg/testutil/setup.go index 18f6cf3..f88a8b5 100644 --- a/pkg/testutil/setup.go +++ b/pkg/testutil/setup.go @@ -7,7 +7,7 @@ import ( "strings" "testing" - "github.com/BlackVectorOps/semantic_firewall/v3/pkg/analysis/ir" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/ir" "golang.org/x/tools/go/packages" "golang.org/x/tools/go/ssa" )