Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,15 @@ geiger --live ./leaked-repo
geiger --live --from-trufflehog trufflehog.json
geiger --live --from-gitleaks gitleaks-report.json

# external recon: pipe a nuclei exposure scan straight in. Its templates pull the
# leaked value out of each exposed endpoint (/.env, phpinfo, instance metadata);
# geiger types, validates, and ranks it, and records the URL it leaked from. It
# also parses the response body when present, reassembling multi-field creds (an
# AWS key+secret pair, a connection string) that the flat extracted-results
# can't represent — run nuclei with -irr to include the body.
# Stream over a pipe so live secrets never land on disk (add -o only if you must).
nuclei -t exposures/ -l targets.txt -j -irr | geiger --live --from-nuclei -

# rank by YOUR crown jewels (boost anything touching these to HIGH+)
geiger --live --context '1234567890,acme-prod,billing-service' ./repo

Expand Down Expand Up @@ -132,9 +141,9 @@ geiger --ssh-correlate ~/.ssh
## Where geiger fits

geiger is not a scanner — it starts where they stop. Detection finds the secret;
geiger triages it. Point gitleaks or TruffleHog at the haystack, then pipe the
report in (`--from-gitleaks` / `--from-trufflehog`) to learn which hits actually
reach prod.
geiger triages it. Point gitleaks or TruffleHog at the haystack, or nuclei at
internet-exposed endpoints, then pipe the report in (`--from-gitleaks` /
`--from-trufflehog` / `--from-nuclei -`) to learn which hits actually reach prod.

| | gitleaks | TruffleHog | GitGuardian | **geiger** |
|---|:---:|:---:|:---:|:---:|
Expand Down Expand Up @@ -179,6 +188,7 @@ answers the question they leave open: *now that you found it, how bad is it?*
| `--no-reverse` | keep highest-impact findings first; by default an interactive terminal reverses them to the bottom (above the summary) so the worst don't scroll off the top |
| `--only TYPES` / `--skip TYPES` | scope by module name or category (`databases`,`cloud`,`secrets`,`ai`,`vcs`,`kubernetes`,`identity`,`backup`,`endpoint`) |
| `--from-gitleaks F` / `--from-trufflehog F` | triage each finding in a scanner report |
| `--from-nuclei F` | triage each value extracted by a nuclei JSONL (`-j`) scan; `F` = `-` reads stdin (stream over a pipe) |
| `--ssh-correlate` | SSH: read local hints for candidate target hosts |
| `--trace` | print the raw request + response of each call (secrets masked) |
| `--user-agent UA` | User-Agent for recon calls (default `geiger/<version>`) |
Expand Down
8 changes: 7 additions & 1 deletion cmd/geiger/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ var version = "dev"
// writers (and a test can prove stdout is independent of the stderr status).
type config struct {
live, intrusive, minFootprint, useEnv, correlate, trace, asJSON, verbose, stream, quiet, noReverse, useMetadata bool
endpoint, proxy, fromGitleaks, fromTrufflehog, contextTerms, colorMode, only, skip string
endpoint, proxy, fromGitleaks, fromTrufflehog, fromNuclei, contextTerms, colorMode, only, skip string
userAgent, minSeverity, output string
timeout time.Duration
concurrency, minSevRank int
Expand All @@ -53,6 +53,7 @@ func main() {
flag.StringVar(&c.proxy, "proxy", "", "route HTTP recon through a proxy (http/https/socks5 URL)")
flag.StringVar(&c.fromGitleaks, "from-gitleaks", "", "ingest a gitleaks JSON report and triage each finding")
flag.StringVar(&c.fromTrufflehog, "from-trufflehog", "", "ingest a TruffleHog v3 JSON report and triage each finding")
flag.StringVar(&c.fromNuclei, "from-nuclei", "", "ingest nuclei JSONL (-j) output and triage each extracted value; '-' reads stdin")
flag.StringVar(&c.contextTerms, "context", "", "comma-separated crown-jewel terms (account ids, prod hosts, critical repos) that raise a credential's tier when matched")
flag.BoolVar(&c.correlate, "ssh-correlate", false, "for SSH keys, read local hints (~/.ssh/config, known_hosts, shell history) to list candidate target hosts")
flag.BoolVar(&c.trace, "trace", false, "print the raw request and response of each call (secrets masked); implies showing all calls")
Expand Down Expand Up @@ -275,6 +276,8 @@ func header(c config) string {
target = "gitleaks report " + c.fromGitleaks
case c.fromTrufflehog != "":
target = "trufflehog report " + c.fromTrufflehog
case c.fromNuclei != "":
target = "nuclei JSONL " + c.fromNuclei
case len(c.args) > 0:
target = "scanning " + c.args[0]
}
Expand Down Expand Up @@ -672,6 +675,9 @@ func readSources(c config, st *status) ([]pipeline.Source, error) {
if c.fromTrufflehog != "" {
return pipeline.FromTrufflehog(c.fromTrufflehog)
}
if c.fromNuclei != "" {
return pipeline.FromNuclei(c.fromNuclei)
}
if len(c.args) > 0 {
// Multiple paths (files, dirs, or scanner reports) are merged, so a deeper
// second pass can target just the few files that mattered.
Expand Down
98 changes: 97 additions & 1 deletion internal/pipeline/batch.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,33 @@ type gitleaksFinding struct {
File string `json:"File"`
}

// nucleiFinding holds the fields geiger reads from a single nuclei result
// object; every other key in the record is ignored by the decoder. nuclei's
// templates only locate the leak (an exposed /.env, phpinfo, instance metadata)
// and copy out candidate values — typing and validating those values is
// geiger's job.
type nucleiFinding struct {
	// TemplateID is the nuclei template that fired ("template-id").
	TemplateID string `json:"template-id"`

	// MatchedAt is the URL/location nuclei matched on ("matched-at").
	MatchedAt string `json:"matched-at"`

	// ExtractedResults is the flat list of literal values the template's
	// extractors pulled out ("extracted-results").
	ExtractedResults []string `json:"extracted-results"`

	// Response is the raw response dump (status line + headers + blank line +
	// body), when nuclei was asked to include it. The body is a superset of the
	// extracted values and allows multi-field credentials (an AWS key+secret
	// pair, a ~/.aws/credentials INI, a connection string) to be reassembled,
	// which the flat value list cannot express.
	Response string `json:"response"`
}

// nucleiBody returns the body portion of nuclei's raw `response` field, which
// is laid out as status line + headers + blank line + body.
//
// The header/body boundary is the FIRST blank line in raw, whether it is
// written as CRLF CRLF or as LF LF. Picking the earliest separator matters:
// an LF-delimited response whose body happens to contain "\r\n\r\n" must be
// split at the end of its headers, not inside the body.
//
// It returns "" when raw contains no blank-line separator at all, so the caller
// can fall back to the extracted values instead of ingesting header text.
func nucleiBody(raw string) string {
	const (
		crlfSep = "\r\n\r\n"
		lfSep   = "\n\n"
	)
	crlfAt := strings.Index(raw, crlfSep)
	lfAt := strings.Index(raw, lfSep)

	switch {
	case crlfAt < 0 && lfAt < 0:
		return "" // no header/body separator — nothing reliable to ingest
	case lfAt < 0 || (crlfAt >= 0 && crlfAt < lfAt):
		return raw[crlfAt+len(crlfSep):]
	default:
		return raw[lfAt+len(lfSep):]
	}
}

// trufflehogFinding is the subset of a TruffleHog v3 JSON finding we consume.
// TruffleHog emits newline-delimited JSON (one object per line).
type trufflehogFinding struct {
Expand Down Expand Up @@ -241,7 +268,7 @@ func FromTrufflehog(path string) ([]Source, error) {
var findings []trufflehogFinding
if json.Unmarshal(data, &findings) != nil {
// fall back to newline-delimited JSON
for _, line := range strings.Split(string(data), "\n") {
for line := range strings.SplitSeq(string(data), "\n") {
line = strings.TrimSpace(line)
if line == "" || !strings.HasPrefix(line, "{") {
continue
Expand Down Expand Up @@ -280,6 +307,75 @@ func FromTrufflehog(path string) ([]Source, error) {
return out, nil
}

// FromNuclei turns nuclei output into Sources for triage. The input may be the
// newline-delimited `-j`/`-jsonl` stream or a single JSON array of results.
//
// path is a file to read, or "-" for stdin; the stdin form exists so a scan can
// be piped straight in (`nuclei … -j | geiger --from-nuclei - --live`) without
// the secrets ever being written to disk. Stdin is read through a 32 MiB
// limit; bytes beyond that limit are not read and no error is reported for them.
//
// Every Source is labelled with the record's matched-at URL, or with
// "nuclei:<template-id>" when matched-at is empty. Per record:
//
//   - if the raw response has a non-blank body, the body is emitted as one
//     Source, followed by one Source for each non-blank extracted value that
//     does not occur in that body (e.g. a value pulled from a header);
//   - otherwise each non-blank extracted value is emitted as its own Source.
//
// nuclei deliberately over-matches; nothing is filtered here beyond blank
// values. Each value is handed to parse.Parse as-is.
//
// The only errors returned are read errors. Lines that are not JSON objects,
// or that fail to decode, are skipped silently.
func FromNuclei(path string) ([]Source, error) {
	var (
		raw []byte
		err error
	)
	if path == "-" {
		raw, err = io.ReadAll(io.LimitReader(os.Stdin, 32<<20)) // cap at 32 MiB
	} else {
		raw, err = os.ReadFile(path)
	}
	if err != nil {
		return nil, err
	}

	// First attempt: the whole input is one JSON array. If that fails, treat it
	// as newline-delimited JSON (the default -j stream).
	var records []nucleiFinding
	if arrErr := json.Unmarshal(raw, &records); arrErr != nil {
		records = records[:0]
		for row := range strings.SplitSeq(string(raw), "\n") {
			row = strings.TrimSpace(row)
			// Blank rows fail this prefix test too, so they are skipped here.
			if !strings.HasPrefix(row, "{") {
				continue
			}
			var rec nucleiFinding
			if json.Unmarshal([]byte(row), &rec) != nil {
				continue
			}
			records = append(records, rec)
		}
	}

	var out []Source
	for _, rec := range records {
		label := rec.MatchedAt
		if label == "" {
			label = "nuclei:" + rec.TemplateID
		}

		// The body, when present, goes first: it is a superset of the extracted
		// values and lets the parsers reassemble multi-field credentials.
		body := nucleiBody(rec.Response)
		hasBody := strings.TrimSpace(body) != ""
		if hasBody {
			out = append(out, Source{Label: label, Blob: parse.Parse(body, label)})
		}

		for _, val := range rec.ExtractedResults {
			val = strings.TrimSpace(val)
			if val == "" {
				continue
			}
			// Already covered by the body Source — don't emit it twice.
			if hasBody && strings.Contains(body, val) {
				continue
			}
			out = append(out, Source{Label: label, Blob: parse.Parse(val, label)})
		}
	}
	return out, nil
}

// FromGitleaks ingests a gitleaks JSON report and yields one Source per
// finding, so a prior scanner run can feed Geiger's triage directly.
func FromGitleaks(path string) ([]Source, error) {
Expand Down
177 changes: 177 additions & 0 deletions internal/pipeline/batch_test.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
package pipeline

import (
"encoding/json"
"os"
"path/filepath"
"strings"
"testing"

"github.com/puck-security/geiger/internal/module"
"github.com/puck-security/geiger/internal/parse"
"github.com/puck-security/geiger/internal/recognize"
"github.com/puck-security/geiger/internal/score"
)

Expand Down Expand Up @@ -93,6 +97,179 @@ func TestFromTrufflehogJSONArray(t *testing.T) {
}
}

// TestFromNuclei feeds FromNuclei a newline-delimited report (the shape nuclei
// -j / -jsonl emits) and checks that only records carrying a real extracted
// value become Sources.
func TestFromNuclei(t *testing.T) {
	// Two records with an extractable value, one whose extracted-results are
	// all blank (must be skipped), and a trailing junk line (must be ignored).
	lines := []string{
		`{"template-id":"exposed-env","matched-at":"https://victim.example/.env","extracted-results":["AKIAIOSFODNN7EXAMPLE"]}`,
		`{"template-id":"phpinfo","matched-at":"https://victim.example/phpinfo.php","extracted-results":["ghp_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"]}`,
		`{"template-id":"empty","matched-at":"https://victim.example/x","extracted-results":["",""]}`,
		`not json, ignore me`,
	}
	report := filepath.Join(t.TempDir(), "nuclei.jsonl")
	if err := os.WriteFile(report, []byte(strings.Join(lines, "\n")), 0o600); err != nil {
		t.Fatal(err)
	}

	got, err := FromNuclei(report)
	if err != nil {
		t.Fatal(err)
	}
	if len(got) != 2 { // two real values; blank values and junk line skipped
		t.Fatalf("expected 2 sources, got %d: %+v", len(got), got)
	}

	// The label is the matched-at URL (provenance); Raw is the literal
	// extracted value, left for geiger to re-type downstream.
	first := got[0]
	if !(first.Label == "https://victim.example/.env" && first.Blob.Raw == "AKIAIOSFODNN7EXAMPLE") {
		t.Errorf("unexpected source: %+v", first)
	}
}

// TestFromNucleiArrayAndFallbackLabel covers the JSON-array input form and the
// label used when a record has no matched-at: "nuclei:<template-id>".
func TestFromNucleiArrayAndFallbackLabel(t *testing.T) {
	const report = `[{"template-id":"some-detector","extracted-results":["sk_live_fallbacklabel"]}]`

	reportPath := filepath.Join(t.TempDir(), "arr.json")
	if err := os.WriteFile(reportPath, []byte(report), 0o600); err != nil {
		t.Fatal(err)
	}

	got, err := FromNuclei(reportPath)
	if err != nil {
		t.Fatal(err)
	}
	if len(got) != 1 || got[0].Blob.Raw != "sk_live_fallbacklabel" {
		t.Fatalf("array parse failed: %+v", got)
	}
	if label := got[0].Label; label != "nuclei:some-detector" {
		t.Errorf("expected nuclei:template-id fallback label, got %q", label)
	}
}

// TestFromNucleiCorrelatesAcrossURLs checks the "one prod key exposed at N
// URLs" signal: the same secret extracted from two exposed URLs must yield a
// single reconned result that (a) carries an exposure finding derived from its
// matched-at URL and (b) lists the other URL under "also exposed in".
func TestFromNucleiCorrelatesAcrossURLs(t *testing.T) {
	const sec = "nuclei-corr-token-abc123xyz"

	reg := module.NewRegistry()
	reg.Register(fakeBearer{})
	// Recognize exactly our token so the test doesn't depend on real modules.
	recognize.RegisterRecognizer(func(b parse.Blob, _ string, _ *module.Registry) []recognize.Match {
		if b.Raw != sec {
			return nil
		}
		return []recognize.Match{{Module: "fake", Fields: module.Fields{"token": sec}, Secret: sec, Label: "nuclei"}}
	})

	// Same template, same secret, two different exposed URLs.
	record := func(matchedAt string) string {
		return `{"template-id":"exposed-env","matched-at":"` + matchedAt + `","extracted-results":["` + sec + `"]}`
	}
	report := record("https://a.example/.env") + "\n" + record("https://b.example/backup.env")

	reportPath := filepath.Join(t.TempDir(), "nuclei.jsonl")
	if err := os.WriteFile(reportPath, []byte(report), 0o600); err != nil {
		t.Fatal(err)
	}
	sources, err := FromNuclei(reportPath)
	if err != nil {
		t.Fatal(err)
	}
	if len(sources) != 2 {
		t.Fatalf("expected 2 sources, got %d", len(sources))
	}

	// Feed the sources one after another through a single Batch: dedup state is
	// shared, and the first URL is deterministically the finding that is kept.
	batch := NewBatch(reg, Options{Live: false})
	var all []Result
	for _, src := range sources {
		all = append(all, batch.Run(src.Blob)...)
	}
	batch.AnnotateDuplicates(all)

	var (
		kept       int
		exposed    bool
		correlated bool
	)
	for _, res := range all {
		if res.secret != sec {
			continue
		}
		kept++
		for _, finding := range res.Note.Findings {
			switch finding.Key {
			case "exposure":
				if strings.Contains(finding.Value, "served over HTTP") {
					exposed = true
				}
			case "also exposed in":
				if len(finding.Detail) == 1 && strings.Contains(finding.Value, "internet-exposed endpoint") {
					correlated = true
				}
			}
		}
	}

	if kept != 1 {
		t.Fatalf("same key at 2 URLs should recon once, got %d", kept)
	}
	if !exposed {
		t.Errorf("kept result missing internet-exposed-endpoint exposure finding: %+v", all)
	}
	if !correlated {
		t.Errorf("kept result not correlated with the other URL: %+v", all)
	}
}

// TestFromNucleiIngestsResponseBody verifies that when a record carries the raw
// response, its body is what gets parsed. Here the body holds an AWS access-key
// + secret-key pair while extracted-results holds only the access key, so only
// the body can produce the full pair.
func TestFromNucleiIngestsResponseBody(t *testing.T) {
	const (
		accessKey = "AKIAIOSFODNN7EXAMPLE"
		endpoint  = "https://victim.example/.env"
	)
	envBody := "AWS_ACCESS_KEY_ID=" + accessKey + "\nAWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\n"

	record, _ := json.Marshal(map[string]any{
		"template-id":       "exposed-dotenv",
		"matched-at":        endpoint,
		"extracted-results": []string{accessKey}, // nuclei only got the key
		"response":          "HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\n\r\n" + envBody,
	})
	reportPath := filepath.Join(t.TempDir(), "n.jsonl")
	if err := os.WriteFile(reportPath, record, 0o600); err != nil {
		t.Fatal(err)
	}

	got, err := FromNuclei(reportPath)
	if err != nil {
		t.Fatal(err)
	}
	// The bare key already appears in the body, so it must not be emitted again.
	if len(got) != 1 {
		t.Fatalf("expected 1 (body) source, got %d: %+v", len(got), got)
	}
	if label := got[0].Label; label != endpoint {
		t.Errorf("label should be the matched-at URL, got %q", label)
	}
	vars := got[0].Blob.Vars
	if vars["AWS_ACCESS_KEY_ID"] != accessKey || vars["AWS_SECRET_ACCESS_KEY"] == "" {
		t.Errorf("response body not parsed into a full AWS pair: %+v", vars)
	}
}

// TestFromNucleiKeepsNonBodyExtracted verifies that an extracted value which
// does not occur in the response body (i.e. it came from some other part of the
// response) is still emitted, after the body Source.
func TestFromNucleiKeepsNonBodyExtracted(t *testing.T) {
	const (
		inBody    = "AKIAIOSFODNN7EXAMPLE"
		notInBody = "ghp_headertokennotinbody000000000000000"
	)

	record, _ := json.Marshal(map[string]any{
		"template-id":       "x",
		"matched-at":        "https://victim.example/.env",
		"extracted-results": []string{inBody, notInBody},
		"response":          "HTTP/1.1 200 OK\r\n\r\n" + "AWS_ACCESS_KEY_ID=" + inBody + "\n",
	})
	reportPath := filepath.Join(t.TempDir(), "n.jsonl")
	if err := os.WriteFile(reportPath, record, 0o600); err != nil {
		t.Fatal(err)
	}

	got, err := FromNuclei(reportPath)
	if err != nil {
		t.Fatal(err)
	}
	// Expect the body Source plus the single value that is absent from the body.
	if len(got) != 2 {
		t.Fatalf("expected body source + 1 non-body value, got %d: %+v", len(got), got)
	}
	if extra := got[1]; extra.Blob.Raw != notInBody {
		t.Errorf("non-body extracted value not kept: %+v", extra)
	}
}

func TestWalkDirSkipsNoiseDirs(t *testing.T) {
dir := t.TempDir()
os.MkdirAll(filepath.Join(dir, "node_modules"), 0o755)
Expand Down
Loading
Loading