diff --git a/README.md b/README.md index e20678c..a72b88f 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,15 @@ geiger --live ./leaked-repo geiger --live --from-trufflehog trufflehog.json geiger --live --from-gitleaks gitleaks-report.json +# external recon: pipe a nuclei exposure scan straight in. Its templates pull the +# leaked value out of each exposed endpoint (/.env, phpinfo, instance metadata); +# geiger types, validates, and ranks it, and records the URL it leaked from. It +# also parses the response body when present, reassembling multi-field creds (an +# AWS key+secret pair, a connection string) the flat extracted-results can't — +# run nuclei with -irr to include the body. +# Stream over a pipe so live secrets never land on disk (add -o only if you must). +nuclei -t exposures/ -l targets.txt -j -irr | geiger --live --from-nuclei - + # rank by YOUR crown jewels (boost anything touching these to HIGH+) geiger --live --context '1234567890,acme-prod,billing-service' ./repo @@ -132,9 +141,9 @@ geiger --ssh-correlate ~/.ssh ## Where geiger fits geiger is not a scanner — it starts where they stop. Detection finds the secret; -geiger triages it. Point gitleaks or TruffleHog at the haystack, then pipe the -report in (`--from-gitleaks` / `--from-trufflehog`) to learn which hits actually -reach prod. +geiger triages it. Point gitleaks or TruffleHog at the haystack, or nuclei at +internet-exposed endpoints, then pipe the report in (`--from-gitleaks` / +`--from-trufflehog` / `--from-nuclei -`) to learn which hits actually reach prod. 
| | gitleaks | TruffleHog | GitGuardian | **geiger** | |---|:---:|:---:|:---:|:---:| @@ -179,6 +188,7 @@ answers the question they leave open: *now that you found it, how bad is it?* | `--no-reverse` | keep highest-impact findings first; by default an interactive terminal reverses them to the bottom (above the summary) so the worst don't scroll off the top | | `--only TYPES` / `--skip TYPES` | scope by module name or category (`databases`,`cloud`,`secrets`,`ai`,`vcs`,`kubernetes`,`identity`,`backup`,`endpoint`) | | `--from-gitleaks F` / `--from-trufflehog F` | triage each finding in a scanner report | +| `--from-nuclei F` | triage each value extracted by a nuclei JSONL (`-j`) scan; `F` = `-` reads stdin (stream over a pipe) | | `--ssh-correlate` | SSH: read local hints for candidate target hosts | | `--trace` | print the raw request + response of each call (secrets masked) | | `--user-agent UA` | User-Agent for recon calls (default `geiger/`) | diff --git a/cmd/geiger/main.go b/cmd/geiger/main.go index f347b83..f02d81d 100644 --- a/cmd/geiger/main.go +++ b/cmd/geiger/main.go @@ -34,7 +34,7 @@ var version = "dev" // writers (and a test can prove stdout is independent of the stderr status). 
type config struct { live, intrusive, minFootprint, useEnv, correlate, trace, asJSON, verbose, stream, quiet, noReverse, useMetadata bool - endpoint, proxy, fromGitleaks, fromTrufflehog, contextTerms, colorMode, only, skip string + endpoint, proxy, fromGitleaks, fromTrufflehog, fromNuclei, contextTerms, colorMode, only, skip string userAgent, minSeverity, output string timeout time.Duration concurrency, minSevRank int @@ -53,6 +53,7 @@ func main() { flag.StringVar(&c.proxy, "proxy", "", "route HTTP recon through a proxy (http/https/socks5 URL)") flag.StringVar(&c.fromGitleaks, "from-gitleaks", "", "ingest a gitleaks JSON report and triage each finding") flag.StringVar(&c.fromTrufflehog, "from-trufflehog", "", "ingest a TruffleHog v3 JSON report and triage each finding") + flag.StringVar(&c.fromNuclei, "from-nuclei", "", "ingest nuclei JSONL (-j) output and triage each extracted value; '-' reads stdin") flag.StringVar(&c.contextTerms, "context", "", "comma-separated crown-jewel terms (account ids, prod hosts, critical repos) that raise a credential's tier when matched") flag.BoolVar(&c.correlate, "ssh-correlate", false, "for SSH keys, read local hints (~/.ssh/config, known_hosts, shell history) to list candidate target hosts") flag.BoolVar(&c.trace, "trace", false, "print the raw request and response of each call (secrets masked); implies showing all calls") @@ -275,6 +276,8 @@ func header(c config) string { target = "gitleaks report " + c.fromGitleaks case c.fromTrufflehog != "": target = "trufflehog report " + c.fromTrufflehog + case c.fromNuclei != "": + target = "nuclei JSONL " + c.fromNuclei case len(c.args) > 0: target = "scanning " + c.args[0] } @@ -672,6 +675,9 @@ func readSources(c config, st *status) ([]pipeline.Source, error) { if c.fromTrufflehog != "" { return pipeline.FromTrufflehog(c.fromTrufflehog) } + if c.fromNuclei != "" { + return pipeline.FromNuclei(c.fromNuclei) + } if len(c.args) > 0 { // Multiple paths (files, dirs, or scanner reports) are 
// nucleiFinding is the subset of a nuclei JSONL result we consume. nuclei's
// detection templates locate where a secret leaks (an exposed /.env, phpinfo,
// instance metadata) and pull the literal value(s) into extracted-results;
// geiger re-types and validates each value itself. We also consume the raw
// response body when present: it's a superset of extracted-results and lets
// geiger's parsers reassemble multi-field credentials (an AWS key+secret pair, a
// ~/.aws/credentials INI, a connection string) that the flat value array can't.
type nucleiFinding struct {
	TemplateID       string   `json:"template-id"`       // names the template; used for the fallback label
	MatchedAt        string   `json:"matched-at"`        // URL the value leaked from; becomes the Source label
	ExtractedResults []string `json:"extracted-results"` // literal values the template's extractors pulled out
	Response         string   `json:"response"`          // raw response text; see nucleiBody for the expected layout
}

// nucleiBody returns the body of nuclei's raw `response` field (status line +
// headers + blank line + body). Returns "" when there's no header/body
// separator, so the caller falls back to the extracted values.
//
// The header block ends at the FIRST blank line, whichever line ending it uses.
// Both separators are located and the earlier one wins: searching for CRLF-CRLF
// across the whole text before considering LF-LF would, for a response with
// LF-only headers, cut at a CRLF blank line *inside* the body and silently drop
// everything before it.
func nucleiBody(raw string) string {
	const (
		crlfSep = "\r\n\r\n"
		lfSep   = "\n\n"
	)
	// The two separators can never match at the same offset (one starts with
	// '\r', the other with '\n'), so a strict comparison is enough.
	crlf := strings.Index(raw, crlfSep)
	lf := strings.Index(raw, lfSep)
	switch {
	case crlf >= 0 && (lf < 0 || crlf < lf):
		return raw[crlf+len(crlfSep):]
	case lf >= 0:
		return raw[lf+len(lfSep):]
	}
	return "" // no header/body separator — nothing reliable to ingest
}

// trufflehogFinding is the subset of a TruffleHog v3 JSON finding we consume.
// TruffleHog emits newline-delimited JSON (one object per line).
type trufflehogFinding struct { @@ -241,7 +268,7 @@ func FromTrufflehog(path string) ([]Source, error) { var findings []trufflehogFinding if json.Unmarshal(data, &findings) != nil { // fall back to newline-delimited JSON - for _, line := range strings.Split(string(data), "\n") { + for line := range strings.SplitSeq(string(data), "\n") { line = strings.TrimSpace(line) if line == "" || !strings.HasPrefix(line, "{") { continue @@ -280,6 +307,75 @@ func FromTrufflehog(path string) ([]Source, error) { return out, nil } +// FromNuclei ingests nuclei JSONL output (the `-j`/`-jsonl` stream, or a JSON +// array) and yields one Source per extracted credential value. nuclei casts the +// wide net — its templates extract any value that *looks* like a secret from an +// exposed endpoint — and geiger is the authority: each value flows through the +// same recognizer as every other source, so over-matches that aren't real +// credentials are dropped here. path "-" reads stdin, so the intended use is a +// streaming pipe (`nuclei … -j | geiger --from-nuclei - --live`) that never +// lands secrets on disk. The matched-at URL becomes the Source label, which +// drives the title provenance ("from https://host/.env"), the cross-source +// dedup/"also exposed in" rollup (one key exposed at many URLs collapses to one +// finding), and the internet-exposed-endpoint exposure class. 
+func FromNuclei(path string) ([]Source, error) { + var data []byte + if path == "-" { + b, err := io.ReadAll(io.LimitReader(os.Stdin, 32<<20)) // cap at 32 MiB + if err != nil { + return nil, err + } + data = b + } else { + b, err := os.ReadFile(path) + if err != nil { + return nil, err + } + data = b + } + var findings []nucleiFinding + if json.Unmarshal(data, &findings) != nil { + // fall back to newline-delimited JSON (the default -j stream) + findings = findings[:0] + for line := range strings.SplitSeq(string(data), "\n") { + line = strings.TrimSpace(line) + if line == "" || !strings.HasPrefix(line, "{") { + continue + } + var f nucleiFinding + if json.Unmarshal([]byte(line), &f) == nil { + findings = append(findings, f) + } + } + } + var out []Source + for _, f := range findings { + label := f.MatchedAt + if label == "" { + label = "nuclei:" + f.TemplateID + } + // Prefer the full response body: it's a superset of extracted-results and + // lets geiger reassemble multi-field credentials. Then add only those + // extracted values not already in the body (e.g. a value a template pulled + // from a response header). When there's no body, fall back to the values. + if body := nucleiBody(f.Response); strings.TrimSpace(body) != "" { + out = append(out, Source{Label: label, Blob: parse.Parse(body, label)}) + for _, v := range f.ExtractedResults { + if v = strings.TrimSpace(v); v != "" && !strings.Contains(body, v) { + out = append(out, Source{Label: label, Blob: parse.Parse(v, label)}) + } + } + continue + } + for _, v := range f.ExtractedResults { + if v = strings.TrimSpace(v); v != "" { + out = append(out, Source{Label: label, Blob: parse.Parse(v, label)}) + } + } + } + return out, nil +} + // FromGitleaks ingests a gitleaks JSON report and yields one Source per // finding, so a prior scanner run can feed Geiger's triage directly. 
func FromGitleaks(path string) ([]Source, error) { diff --git a/internal/pipeline/batch_test.go b/internal/pipeline/batch_test.go index 53fcbf4..ff6d476 100644 --- a/internal/pipeline/batch_test.go +++ b/internal/pipeline/batch_test.go @@ -1,11 +1,15 @@ package pipeline import ( + "encoding/json" "os" "path/filepath" + "strings" "testing" "github.com/puck-security/geiger/internal/module" + "github.com/puck-security/geiger/internal/parse" + "github.com/puck-security/geiger/internal/recognize" "github.com/puck-security/geiger/internal/score" ) @@ -93,6 +97,179 @@ func TestFromTrufflehogJSONArray(t *testing.T) { } } +func TestFromNuclei(t *testing.T) { + dir := t.TempDir() + // nuclei -j / -jsonl emits one result object per line. Two extractable + // values, an all-blank extracted-results to skip, and a junk line to ignore. + ndjson := `{"template-id":"exposed-env","matched-at":"https://victim.example/.env","extracted-results":["AKIAIOSFODNN7EXAMPLE"]} +{"template-id":"phpinfo","matched-at":"https://victim.example/phpinfo.php","extracted-results":["ghp_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"]} +{"template-id":"empty","matched-at":"https://victim.example/x","extracted-results":["",""]} +not json, ignore me` + p := filepath.Join(dir, "nuclei.jsonl") + if err := os.WriteFile(p, []byte(ndjson), 0o600); err != nil { + t.Fatal(err) + } + srcs, err := FromNuclei(p) + if err != nil { + t.Fatal(err) + } + if len(srcs) != 2 { // two real values; blank values and junk line skipped + t.Fatalf("expected 2 sources, got %d: %+v", len(srcs), srcs) + } + // label = matched-at URL (the provenance that drives title/dedup/exposure), + // raw = the literal extracted value (geiger re-types it downstream). 
+ if srcs[0].Label != "https://victim.example/.env" || srcs[0].Blob.Raw != "AKIAIOSFODNN7EXAMPLE" { + t.Errorf("unexpected source: %+v", srcs[0]) + } +} + +func TestFromNucleiArrayAndFallbackLabel(t *testing.T) { + dir := t.TempDir() + // JSON-array form; matched-at absent → label falls back to nuclei:. + arr := `[{"template-id":"some-detector","extracted-results":["sk_live_fallbacklabel"]}]` + p := filepath.Join(dir, "arr.json") + if err := os.WriteFile(p, []byte(arr), 0o600); err != nil { + t.Fatal(err) + } + srcs, err := FromNuclei(p) + if err != nil { + t.Fatal(err) + } + if len(srcs) != 1 || srcs[0].Blob.Raw != "sk_live_fallbacklabel" { + t.Fatalf("array parse failed: %+v", srcs) + } + if srcs[0].Label != "nuclei:some-detector" { + t.Errorf("expected nuclei:template-id fallback label, got %q", srcs[0].Label) + } +} + +func TestFromNucleiCorrelatesAcrossURLs(t *testing.T) { + // The same key extracted from two exposed URLs should collapse to one + // reconned finding that records both locations — the "one prod key exposed at + // N URLs" signal — and the matched-at URLs drive the internet-exposed-endpoint + // exposure class (both on the kept finding and in the "also exposed in" rollup). 
+ const sec = "nuclei-corr-token-abc123xyz" + reg := module.NewRegistry() + reg.Register(fakeBearer{}) + recognize.RegisterRecognizer(func(b parse.Blob, _ string, _ *module.Registry) []recognize.Match { + if b.Raw == sec { + return []recognize.Match{{Module: "fake", Fields: module.Fields{"token": sec}, Secret: sec, Label: "nuclei"}} + } + return nil + }) + dir := t.TempDir() + ndjson := `{"template-id":"exposed-env","matched-at":"https://a.example/.env","extracted-results":["` + sec + `"]} +{"template-id":"exposed-env","matched-at":"https://b.example/backup.env","extracted-results":["` + sec + `"]}` + p := filepath.Join(dir, "nuclei.jsonl") + if err := os.WriteFile(p, []byte(ndjson), 0o600); err != nil { + t.Fatal(err) + } + srcs, err := FromNuclei(p) + if err != nil { + t.Fatal(err) + } + if len(srcs) != 2 { + t.Fatalf("expected 2 sources, got %d", len(srcs)) + } + // Run sequentially on one Batch so dedup state is shared and the first URL is + // the kept finding (deterministic), matching TestBatchDedupesSecretAcrossSources. + bt := NewBatch(reg, Options{Live: false}) + var all []Result + for _, s := range srcs { + all = append(all, bt.Run(s.Blob)...) 
+ } + bt.AnnotateDuplicates(all) + + kept, exposed, correlated := 0, false, false + for _, r := range all { + if r.secret != sec { + continue + } + kept++ + for _, f := range r.Note.Findings { + if f.Key == "exposure" && strings.Contains(f.Value, "served over HTTP") { + exposed = true + } + if f.Key == "also exposed in" && strings.Contains(f.Value, "internet-exposed endpoint") && len(f.Detail) == 1 { + correlated = true + } + } + } + if kept != 1 { + t.Fatalf("same key at 2 URLs should recon once, got %d", kept) + } + if !exposed { + t.Errorf("kept result missing internet-exposed-endpoint exposure finding: %+v", all) + } + if !correlated { + t.Errorf("kept result not correlated with the other URL: %+v", all) + } +} + +func TestFromNucleiIngestsResponseBody(t *testing.T) { + // nuclei's JSON carries the full response body. Ingesting it (not just the + // flat extracted-results) lets geiger's parsers reassemble multi-field + // credentials — here an AWS access-key + secret-key pair the bare array, + // which only carries the access key, can't represent. + dir := t.TempDir() + body := "AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE\nAWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\n" + rec := map[string]any{ + "template-id": "exposed-dotenv", + "matched-at": "https://victim.example/.env", + "extracted-results": []string{"AKIAIOSFODNN7EXAMPLE"}, // nuclei only got the key + "response": "HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\n\r\n" + body, + } + line, _ := json.Marshal(rec) + p := filepath.Join(dir, "n.jsonl") + if err := os.WriteFile(p, line, 0o600); err != nil { + t.Fatal(err) + } + srcs, err := FromNuclei(p) + if err != nil { + t.Fatal(err) + } + // The body is a superset of the bare key, so the bare value is not re-emitted. 
+ if len(srcs) != 1 { + t.Fatalf("expected 1 (body) source, got %d: %+v", len(srcs), srcs) + } + if srcs[0].Label != "https://victim.example/.env" { + t.Errorf("label should be the matched-at URL, got %q", srcs[0].Label) + } + v := srcs[0].Blob.Vars + if v["AWS_ACCESS_KEY_ID"] != "AKIAIOSFODNN7EXAMPLE" || v["AWS_SECRET_ACCESS_KEY"] == "" { + t.Errorf("response body not parsed into a full AWS pair: %+v", v) + } +} + +func TestFromNucleiKeepsNonBodyExtracted(t *testing.T) { + // A value extracted from a part other than the body (so not present in the + // body) must still be ingested alongside the body source. + dir := t.TempDir() + body := "AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE\n" + rec := map[string]any{ + "template-id": "x", + "matched-at": "https://victim.example/.env", + "extracted-results": []string{"AKIAIOSFODNN7EXAMPLE", "ghp_headertokennotinbody000000000000000"}, + "response": "HTTP/1.1 200 OK\r\n\r\n" + body, + } + line, _ := json.Marshal(rec) + p := filepath.Join(dir, "n.jsonl") + if err := os.WriteFile(p, line, 0o600); err != nil { + t.Fatal(err) + } + srcs, err := FromNuclei(p) + if err != nil { + t.Fatal(err) + } + // body source + the one extracted value that isn't in the body. 
+ if len(srcs) != 2 { + t.Fatalf("expected body source + 1 non-body value, got %d: %+v", len(srcs), srcs) + } + if srcs[1].Blob.Raw != "ghp_headertokennotinbody000000000000000" { + t.Errorf("non-body extracted value not kept: %+v", srcs[1]) + } +} + func TestWalkDirSkipsNoiseDirs(t *testing.T) { dir := t.TempDir() os.MkdirAll(filepath.Join(dir, "node_modules"), 0o755) diff --git a/internal/pipeline/exposure.go b/internal/pipeline/exposure.go index 13cc041..2740d25 100644 --- a/internal/pipeline/exposure.go +++ b/internal/pipeline/exposure.go @@ -33,6 +33,17 @@ func classifyExposure(path string) (class, note string, flag module.FlagLevel) { case strings.HasPrefix(path, "gitleaks:"), strings.HasPrefix(path, "trufflehog:"): return "scanner finding", "from a secret-scanner report — confirm the on-disk source", module.FlagInfo + // A nuclei matched-at is a URL, not a filesystem path: the secret was served + // over HTTP. Checked before the path-based classes below so a web URL like + // /.git/config or /logs/app.log reads as internet-exposed, not a git object + // or log file. A secret reachable by anyone who can hit the URL is a worse + // exposure than the same value in a local file — assume mass scanners already + // have it. 
+ case strings.HasPrefix(p, "http://"), strings.HasPrefix(p, "https://"): + return "internet-exposed endpoint", + "served over HTTP — reachable by anyone who can hit the URL; assume already harvested by mass scanners", + module.FlagWarn + case strings.Contains(p, "/crashpad/"), strings.Contains(p, "/cores/"), hasSuffixAny(base, ".dmp", ".mdmp", ".hprof", ".core"), strings.HasPrefix(base, "core."): return "crash dump", diff --git a/internal/pipeline/exposure_test.go b/internal/pipeline/exposure_test.go index 3828470..b2a578a 100644 --- a/internal/pipeline/exposure_test.go +++ b/internal/pipeline/exposure_test.go @@ -22,8 +22,11 @@ func TestClassifyExposure(t *testing.T) { {"/proj/.git/objects/ab/cdef", "git object", module.FlagInfo}, {"harvested via ai_ide_store: vscdb:cursorAuth/accessToken", "harvested secret", module.FlagInfo}, {"trufflehog:GitHub", "scanner finding", module.FlagInfo}, - {"/proj/.env", "", module.FlagInfo}, // ordinary file → no class - {"/proj/config/settings.py", "", module.FlagInfo}, // ordinary file → no class + {"https://victim.example/.env", "internet-exposed endpoint", module.FlagWarn}, // nuclei matched-at URL + {"http://victim.example/logs/app.log", "internet-exposed endpoint", module.FlagWarn}, // URL beats the .log/logs class + {"https://victim.example/.git/config", "internet-exposed endpoint", module.FlagWarn}, // URL beats the git-object class + {"/proj/.env", "", module.FlagInfo}, // ordinary file → no class + {"/proj/config/settings.py", "", module.FlagInfo}, // ordinary file → no class } for _, c := range cases { class, note, flag := classifyExposure(c.path)