From f34f051dc48b0af067c5548cc900bd76f082eb58 Mon Sep 17 00:00:00 2001 From: xkilldash9x Date: Sat, 23 May 2026 23:48:07 -0700 Subject: [PATCH 1/2] feat(diff): add FingerprintTree for directory-mode loads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FingerprintSource loads a single file and cannot resolve symbols defined in sibling files. Pre-module trees whose functions call sibling helpers (e.g. envconfig's Process() calling lookupEnv from env_syscall.go) fail to type-check under FingerprintSource and the fingerprint is unusable. FingerprintTree loads the tree via packages.Load with full sibling resolution. When the tree has no go.mod, the loader synthesizes one through packages.Config.Overlay so resolution proceeds through a canonical module path rather than falling back to "command-line-arguments". LoadMeta exposes HadGoMod, SynthesizedGoMod, ModulePath, and LoadErrors so callers can distinguish the three load regimes. The synthetic module path is a stable constant ("synthetic.local/anonymous"), not basename-derived. The first draft of this fix derived the synthetic path from filepath.Base(rootDir), which reintroduced qualifier asymmetry in a new form: pairwise comparisons load each side from its own temp directory, so basename-derived paths differ across sides, deflating types.Type.String()-based similarity on any signature containing user-defined types. exec_v2 head-to-head showed -0.0968 deflation under basename-derived paths; the identical-basename diagnostic recovered the baseline bit-identically (0.8046 vs 0.8046); a three-way comparison against real-go.mod confirmed agreement across all three load regimes on the synthetic corpus (exec 0.8046, net 0.5946, syscall 0.5978 — nine measurements all bit-identical). 
Also bump the go directive 1.24.0 -> 1.26.3 to match the installed toolchain (the directive is a floor, not a target — the toolchain was already 1.26.3 functionally, so this only raises self-declared minimum). FingerprintSource fingerprints remain bit-identical on the synthetic corpus before and after the bump, verified under GOWORK=off against v4.0.0. Add /semantic_firewall to .gitignore so the stray built binary at repo root cannot ride along into future commits. --- .gitignore | 4 + go.mod | 2 +- pkg/diff/fingerprinter.go | 156 ++++++++++++++++++++++++++++ pkg/diff/fingerprinter_tree_test.go | 137 ++++++++++++++++++++++++ 4 files changed, 298 insertions(+), 1 deletion(-) create mode 100644 pkg/diff/fingerprinter_tree_test.go diff --git a/.gitignore b/.gitignore index cb6a179..72f5287 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,10 @@ *.dylib sfw cmd/sfw/sfw +# Root-anchored: `go build` at repo root produces a binary named after +# the module's last path segment (semantic_firewall). Anchor with / +# so we don't accidentally ignore any nested path of the same name. +/semantic_firewall # Test binary, built with `go test -c` *.test diff --git a/go.mod b/go.mod index f1d32f9..3cbc1ee 100644 --- a/go.mod +++ b/go.mod @@ -1,5 +1,5 @@ //go.mod -go 1.24.0 +go 1.26.3 module github.com/BlackVectorOps/semantic_firewall/v4 diff --git a/pkg/diff/fingerprinter.go b/pkg/diff/fingerprinter.go index 0ad8b84..efa3d44 100644 --- a/pkg/diff/fingerprinter.go +++ b/pkg/diff/fingerprinter.go @@ -296,3 +296,159 @@ func processFunctionAndAnons(fn *ssa.Function, policy ir.LiteralPolicy, strictMo processFunctionAndAnons(anon, policy, strictMode, results, visited) } } + +// LoadMeta records what the tree loader did for a particular FingerprintTree +// invocation. Carried alongside results so callers can tag analysis output +// with whether a real go.mod was found or whether one was synthesized. 
+type LoadMeta struct { + HadGoMod bool // real go.mod found at or above rootDir + SynthesizedGoMod bool // loader supplied a synthetic go.mod via overlay + ModulePath string // module path used for resolution (real or synthetic) + LoadErrors []string // per-package errors encountered during Load +} + +// FingerprintTree fingerprints Go source under rootDir using a tree-mode load. +// If a real go.mod is found at or above rootDir, it's used directly. Otherwise +// a synthetic go.mod is supplied via packages.Config.Overlay (no disk write) +// so the loader has a canonical module path to resolve through — this fixes +// the qualifier-corruption case where types.Type.String() would otherwise +// carry a temp-dir-synthesized path. +// +// fileFilter, if non-nil, keeps only function fingerprints whose source file +// satisfies the predicate. Use it to avoid fingerprinting the whole tree when +// the caller only cares about a subset (e.g., changed files in a diff). +// +// GOPROXY=off is preserved via GetHardenedEnv(); files that import external +// modules without resolvable deps will still parse-fail by design. +func FingerprintTree(rootDir string, fileFilter func(string) bool, policy ir.LiteralPolicy) ([]FingerprintResult, LoadMeta, error) { + return FingerprintTreeAdvanced(rootDir, fileFilter, policy, false) +} + +// FingerprintTreeAdvanced is the strict-mode variant of FingerprintTree. 
+func FingerprintTreeAdvanced(rootDir string, fileFilter func(string) bool, policy ir.LiteralPolicy, strictMode bool) ([]FingerprintResult, LoadMeta, error) { + absRoot, err := filepath.Abs(rootDir) + if err != nil { + return nil, LoadMeta{}, fmt.Errorf("resolve absolute path for %s: %w", rootDir, err) + } + + pkgs, meta, err := loadPackagesFromTree(absRoot) + if err != nil { + return nil, meta, err + } + if len(pkgs) == 0 { + return nil, meta, fmt.Errorf("no packages loaded under %s", absRoot) + } + + results, err := FingerprintPackages(pkgs, policy, strictMode) + if err != nil { + return nil, meta, err + } + + if fileFilter != nil { + filtered := results[:0] + for _, r := range results { + if fileFilter(r.Filename) { + filtered = append(filtered, r) + } + } + results = filtered + } + + return results, meta, nil +} + +func loadPackagesFromTree(rootDir string) ([]*packages.Package, LoadMeta, error) { + var meta LoadMeta + + cfg := &packages.Config{ + Mode: packages.LoadAllSyntax, + Env: GetHardenedEnv(), + } + + modDir, modPath := findGoMod(rootDir) + if modDir != "" { + meta.HadGoMod = true + meta.ModulePath = modPath + cfg.Dir = modDir + } else { + // Synthesize a go.mod via overlay so the loader has a canonical + // module path. Fixes the qualifier-corruption case where the loader + // would otherwise synthesize a path from the temp directory. + meta.HadGoMod = false + meta.SynthesizedGoMod = true + meta.ModulePath = syntheticModulePath() + cfg.Dir = rootDir + cfg.Overlay = map[string][]byte{ + filepath.Join(rootDir, "go.mod"): []byte(fmt.Sprintf("module %s\n\ngo 1.21\n", meta.ModulePath)), + } + } + + // Load the tree. "./..." pulls all Go files in the tree as packages, + // which addresses the sibling-symbol-missing class of failures from the + // pilot — multi-file packages now resolve cleanly. 
+ pkgs, err := packages.Load(cfg, "./...") + if err != nil { + return nil, meta, fmt.Errorf("failed to execute loader: %w", err) + } + + packages.Visit(pkgs, nil, func(pkg *packages.Package) { + for _, e := range pkg.Errors { + meta.LoadErrors = append(meta.LoadErrors, e.Error()) + } + }) + + return pkgs, meta, nil +} + +// findGoMod walks up from dir looking for a go.mod file. Returns the directory +// containing it and the module path, or ("", "") if none found. +func findGoMod(dir string) (modDir, modPath string) { + for { + modFile := filepath.Join(dir, "go.mod") + if data, err := os.ReadFile(modFile); err == nil { + if mp := parseModuleLine(data); mp != "" { + return dir, mp + } + } + parent := filepath.Dir(dir) + if parent == dir { + return "", "" + } + dir = parent + } +} + +// parseModuleLine extracts the module path from go.mod's `module ` line. +// Lightweight string scan — sufficient for the path-only field. +func parseModuleLine(data []byte) string { + for _, line := range strings.Split(string(data), "\n") { + line = strings.TrimSpace(line) + if !strings.HasPrefix(line, "module") { + continue + } + rest := strings.TrimSpace(strings.TrimPrefix(line, "module")) + // Strip surrounding quotes if present (rare but valid). + rest = strings.Trim(rest, "\"`") + // Strip an inline comment. + if i := strings.Index(rest, "//"); i >= 0 { + rest = strings.TrimSpace(rest[:i]) + } + if rest != "" { + return rest + } + } + return "" +} + +// syntheticModulePath returns the stable module path used by the +// overlay-synthesized go.mod when a tree has no real go.mod. The path +// MUST be stable across loads — pairwise diff comparisons load each +// side from its own temp directory, and per-load variation in the +// module path makes type qualifiers (e.g. on user-defined types like +// "synthetic.local/A.Foo" vs "synthetic.local/B.Foo") asymmetric across +// sides, deflating types.Type.String()-based similarity. 
A constant +// prevents that — both halves of any pairwise comparison see identical +// qualifiers for identical types. +func syntheticModulePath() string { + return "synthetic.local/anonymous" +} diff --git a/pkg/diff/fingerprinter_tree_test.go b/pkg/diff/fingerprinter_tree_test.go new file mode 100644 index 0000000..5182e44 --- /dev/null +++ b/pkg/diff/fingerprinter_tree_test.go @@ -0,0 +1,137 @@ +package diff_test + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/analysis/ir" + "github.com/BlackVectorOps/semantic_firewall/v4/pkg/diff" +) + +// TestFingerprintTree_SyntheticGoMod covers the no-real-go.mod path: loader +// synthesizes a go.mod via overlay so the package resolves through a canonical +// module path instead of falling back to "command-line-arguments". +func TestFingerprintTree_SyntheticGoMod(t *testing.T) { + t.Parallel() + dir := t.TempDir() + src := `package configloader + +import "fmt" + +func Greet(name string) string { + return fmt.Sprintf("hello, %s", name) +} +` + if err := os.WriteFile(filepath.Join(dir, "fixture.go"), []byte(src), 0o644); err != nil { + t.Fatalf("write fixture: %v", err) + } + + results, meta, err := diff.FingerprintTree(dir, nil, ir.DefaultLiteralPolicy) + if err != nil { + t.Fatalf("FingerprintTree: %v", err) + } + + if meta.HadGoMod { + t.Errorf("expected HadGoMod=false; got true (ModulePath=%q)", meta.ModulePath) + } + if !meta.SynthesizedGoMod { + t.Errorf("expected SynthesizedGoMod=true; got false") + } + if !strings.HasPrefix(meta.ModulePath, "synthetic.local/") { + t.Errorf("expected synthetic ModulePath to start with synthetic.local/; got %q", meta.ModulePath) + } + + if !containsFunctionLike(results, "Greet") { + t.Errorf("expected results to include Greet; got %v", funcNamesOf(results)) + } +} + +// TestFingerprintTree_RealGoMod covers the path where rootDir has a real +// go.mod: loader uses the declared module path, no synthesis. 
+func TestFingerprintTree_RealGoMod(t *testing.T) { + t.Parallel() + dir := t.TempDir() + const modulePath = "example.com/treetestreal" + if err := os.WriteFile(filepath.Join(dir, "go.mod"), + []byte("module "+modulePath+"\n\ngo 1.21\n"), 0o644); err != nil { + t.Fatalf("write go.mod: %v", err) + } + src := `package realmod + +func Hi() string { return "hi" } +` + if err := os.WriteFile(filepath.Join(dir, "fixture.go"), []byte(src), 0o644); err != nil { + t.Fatalf("write fixture: %v", err) + } + + results, meta, err := diff.FingerprintTree(dir, nil, ir.DefaultLiteralPolicy) + if err != nil { + t.Fatalf("FingerprintTree: %v", err) + } + + if !meta.HadGoMod { + t.Errorf("expected HadGoMod=true; got false") + } + if meta.SynthesizedGoMod { + t.Errorf("expected SynthesizedGoMod=false; got true (ModulePath=%q)", meta.ModulePath) + } + if meta.ModulePath != modulePath { + t.Errorf("expected ModulePath=%q; got %q", modulePath, meta.ModulePath) + } + + if !containsFunctionLike(results, "Hi") { + t.Errorf("expected results to include Hi; got %v", funcNamesOf(results)) + } +} + +// TestFingerprintTree_SiblingFiles confirms tree-mode load pulls in siblings, +// addressing the sibling-symbol-missing failure category from the pilot. 
+func TestFingerprintTree_SiblingFiles(t *testing.T) { + t.Parallel() + dir := t.TempDir() + mainSrc := `package multi + +func PublicEntry() int { return helper() } +` + siblingSrc := `package multi + +func helper() int { return 42 } +` + if err := os.WriteFile(filepath.Join(dir, "main.go"), []byte(mainSrc), 0o644); err != nil { + t.Fatalf("write main: %v", err) + } + if err := os.WriteFile(filepath.Join(dir, "helper.go"), []byte(siblingSrc), 0o644); err != nil { + t.Fatalf("write helper: %v", err) + } + + results, meta, err := diff.FingerprintTree(dir, nil, ir.DefaultLiteralPolicy) + if err != nil { + t.Fatalf("FingerprintTree: %v (loadErrors=%v)", err, meta.LoadErrors) + } + + if len(meta.LoadErrors) > 0 { + t.Errorf("expected no load errors; got %v", meta.LoadErrors) + } + if !containsFunctionLike(results, "PublicEntry") || !containsFunctionLike(results, "helper") { + t.Errorf("expected both PublicEntry and helper in results; got %v", funcNamesOf(results)) + } +} + +func containsFunctionLike(results []diff.FingerprintResult, needle string) bool { + for _, r := range results { + if strings.Contains(r.FunctionName, needle) { + return true + } + } + return false +} + +func funcNamesOf(results []diff.FingerprintResult) []string { + names := make([]string, 0, len(results)) + for _, r := range results { + names = append(names, r.FunctionName) + } + return names +} From 37ae723083fa88b0068075bb7be4661e59b3b7ec Mon Sep 17 00:00:00 2001 From: xkilldash9x Date: Sun, 24 May 2026 00:33:10 -0700 Subject: [PATCH 2/2] docs(diff): annotate FingerprintTree's same-module sub-package limitation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The committed FingerprintTree synthesizes a go.mod with the stable constant module path "synthetic.local/anonymous" when no real go.mod is found. 
That works for self-contained single-package trees (the synthetic-corpus shape the fix was first validated against) but does not resolve same-module sub-package imports in real multi-package modules — the synthetic module identity does not match the real module path the source code imports. Real-corpus triage of the 3 genuine same-package-sibling commits in the pilot (go-cmp 8ebdfab3, x/text c8872a1a, x/text db455d00) showed each one's failing sub-package directory exists on-disk at the path the real import declares, so a synthesized go.mod declaring the REAL module name at the worktree root would resolve the imports. The fix shape (moduleNameHint parameter + load from tree root) is mechanism- verified but implementation-deferred — the 3-commit payoff did not justify the engine-API + bench-runner refactor at this stage. This commit only documents the limitation in the FingerprintTree and syntheticModulePath doc comments so the boundary lives in the code, not just in conversation. No behavior change. --- pkg/diff/fingerprinter.go | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/pkg/diff/fingerprinter.go b/pkg/diff/fingerprinter.go index efa3d44..f080123 100644 --- a/pkg/diff/fingerprinter.go +++ b/pkg/diff/fingerprinter.go @@ -320,6 +320,30 @@ type LoadMeta struct { // // GOPROXY=off is preserved via GetHardenedEnv(); files that import external // modules without resolvable deps will still parse-fail by design. +// +// KNOWN LIMITATION — same-module sub-package imports in real multi-package +// trees: when no real go.mod is found, the synthetic go.mod declares module +// "synthetic.local/anonymous" (see syntheticModulePath). For self-contained +// single-package trees (the synthetic-corpus shape this code was first +// validated against) this is fine — no imports need to resolve through the +// module path. 
For real multi-package modules whose internal files import +// other sub-packages of the same module (e.g., github.com/google/go-cmp's +// cmp/compare.go importing github.com/google/go-cmp/cmp/internal/diff), +// the synthetic module identity does NOT match the import paths declared in +// source, so the sub-package lookup fails with +// "cannot find module providing package {real-module}/{sub-package}" even +// though the sub-package's source is present on disk in the tree. +// +// Verified by real-corpus triage of the 3 genuine same-package-sibling +// commits in the pilot (go-cmp 8ebdfab3, x/text c8872a1a, x/text db455d00): +// in each case the failing sub-package directory EXISTS at the worktree- +// root-relative path that the real import declares, so a synthesized go.mod +// declaring the REAL module name placed at the worktree root would resolve +// the imports correctly. The fix shape — adding a moduleNameHint parameter +// and loading the target package(s) by module-relative path from the tree +// root — is mechanism-verified but implementation-deferred. Affected +// corpus: pre-modules-era multi-package trees (modern commits carry their +// own go.mod and don't go through this synthesis path). func FingerprintTree(rootDir string, fileFilter func(string) bool, policy ir.LiteralPolicy) ([]FingerprintResult, LoadMeta, error) { return FingerprintTreeAdvanced(rootDir, fileFilter, policy, false) } @@ -449,6 +473,13 @@ func parseModuleLine(data []byte) string { // sides, deflating types.Type.String()-based similarity. A constant // prevents that — both halves of any pairwise comparison see identical // qualifiers for identical types. +// +// See FingerprintTree's KNOWN LIMITATION note: this constant works for +// self-contained single-package trees but does not resolve same-module +// sub-package imports in real multi-package trees, where source imports +// the real module path and the synthetic "synthetic.local/anonymous" +// identity cannot satisfy those lookups. 
The verified-deferred fix is a +// moduleNameHint parameter on FingerprintTreeAdvanced. func syntheticModulePath() string { return "synthetic.local/anonymous" }