diff --git a/.dev.env.example b/.dev.env.example new file mode 100644 index 0000000..2920927 --- /dev/null +++ b/.dev.env.example @@ -0,0 +1,41 @@ +# ============================================================================= +# EDR Local Development Environment Template (.dev.env) +# Usage: +# cp .dev.env.example .env +# # Adjust values below if you have a custom setup +# ============================================================================= + +# ── Fleet Server Settings ────────────────────────────────────────────────────── +# Host/Port the gRPC service will listen on (0.0.0.0 to bind on all interfaces) +HOST=0.0.0.0 +PORT=50051 + +# Secret key used to sign and verify node authorization JWT tokens. +# Change this in staging/production! +JWT_SECRET=dev-jwt-secret-key-do-not-use-in-production-12345 + +# Logging configuration for fleet-server +RUST_LOG=debug +LOG_FORMAT=human + +# ── Database Connections ───────────────────────────────────────────────────── +# PostgreSQL instance for fleet node registrations (managed by infra/docker-compose.yml) +# Default port is 5433 for nodes DB +DATABASE_URL=postgres://edr:changeme@127.0.0.1:5433/edr_nodes + +# ── Kafka Configuration ────────────────────────────────────────────────────── +# Kafka brokers list (using 127.0.0.1 explicitly to bypass IPv6 resolution delays/bugs on local Docker setups) +KAFKA_BROKERS=127.0.0.1:9092 + +# Raw telemetry ingestion topic +KAFKA_TOPIC_AGENTS_EVENTS=edr.events.raw + +# Note: The Kafka Web UI is accessible in your browser at http://localhost:8090 + +# ── Agent Settings ─────────────────────────────────────────────────────────── +# Config path override for running the EDR agent locally without sudo installation. +# By setting this, 'cargo run --bin edr-agent' will load this config file automatically. 
+EDR_AGENT_CONFIG=agent/agent.toml + +# Logging configuration for the agent +AGENT_RUST_LOG=debug diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..bade1a0 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,3 @@ +target/ +.git/ +.github/ diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..18ae6b2 --- /dev/null +++ b/.env.example @@ -0,0 +1,25 @@ +# EDR Workspace Environment Variables Template +# Copy this file to .env at the workspace root and adjust as necessary for your local setup. + +# Fleet Server Database Connection +# Format: postgres://<user>:<password>@<host>:<port>/<database> +DATABASE_URL=postgres://edr:changeme@localhost:5433/edr_nodes + +# Kafka Brokers configuration (using 127.0.0.1 to avoid IPv6 localhost resolution bugs on host) +KAFKA_BROKERS=127.0.0.1:9092 +KAFKA_TOPIC_AGENTS_EVENTS=edr.events.raw + +# Auth JWT Secret (Replace with a real 256-bit secret in production) +JWT_SECRET=change-me-in-production + +# Server bind host and port +HOST=0.0.0.0 +PORT=50051 + +# Logging level and output format (human | json) +RUST_LOG=info +LOG_FORMAT=human + +# EDR Agent configuration path (optional, overrides default /etc/aigis-zero/config.toml) +# EDR_AGENT_CONFIG=agent/agent.toml + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0080863..079f166 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,8 +19,8 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Install protoc - run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + - name: Install protoc and dependencies + run: sudo apt-get update && sudo apt-get install -y protobuf-compiler libcurl4-openssl-dev pkg-config - name: Install Rust stable uses: dtolnay/rust-toolchain@stable @@ -41,8 +41,8 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Install protoc - run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + - name: Install protoc and dependencies + run: sudo apt-get update && sudo apt-get install -y 
protobuf-compiler libcurl4-openssl-dev pkg-config - name: Install Rust stable + clippy uses: dtolnay/rust-toolchain@stable @@ -79,8 +79,8 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Install protoc - run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + - name: Install protoc and dependencies + run: sudo apt-get update && sudo apt-get install -y protobuf-compiler libcurl4-openssl-dev pkg-config - name: Install Rust stable uses: dtolnay/rust-toolchain@stable diff --git a/.github/workflows/kafka-pipeline.yml b/.github/workflows/kafka-pipeline.yml new file mode 100644 index 0000000..3ae8e3b --- /dev/null +++ b/.github/workflows/kafka-pipeline.yml @@ -0,0 +1,57 @@ +name: Kafka Pipeline + +on: + push: + paths: + - 'kafka-pipeline/**' + - 'sdk/**' + pull_request: + paths: + - 'kafka-pipeline/**' + - 'sdk/**' + +jobs: + check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - run: sudo apt-get update && sudo apt-get install -y protobuf-compiler libcurl4-openssl-dev pkg-config + - uses: dtolnay/rust-toolchain@stable + - run: cargo check --manifest-path kafka-pipeline/Cargo.toml + - run: cargo clippy --manifest-path kafka-pipeline/Cargo.toml -- -D warnings + - run: cargo fmt --manifest-path kafka-pipeline/Cargo.toml -- --check + + test: + runs-on: ubuntu-latest + services: + kafka: + image: apache/kafka:latest + env: + KAFKA_NODE_ID: 1 + KAFKA_PROCESS_ROLES: broker,controller + KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093 + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092,CONTROLLER://kafka:9093 + KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER + KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 + CLUSTER_ID: MkQkQzE4NTJjYTEyODQ4MTcwMw + ports: + - 9092:9092 + steps: + - uses: actions/checkout@v3 + - run: sudo apt-get update && sudo apt-get install -y protobuf-compiler libcurl4-openssl-dev pkg-config + - uses: dtolnay/rust-toolchain@stable + - run: cargo test --manifest-path kafka-pipeline/Cargo.toml + + build: + 
runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - run: sudo apt-get update && sudo apt-get install -y protobuf-compiler libcurl4-openssl-dev pkg-config + - uses: dtolnay/rust-toolchain@stable + - run: cargo build --release --manifest-path kafka-pipeline/Cargo.toml + + docker: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - run: docker build -t aigis-kafka-pipeline -f kafka-pipeline/Dockerfile . diff --git a/.gitignore b/.gitignore index 0f57563..8900029 100644 --- a/.gitignore +++ b/.gitignore @@ -52,4 +52,7 @@ dist-ssr/ *.sw? */frontendprompt.md -.agent/* \ No newline at end of file +.agent/* +dev-testing-skill.md +agent-analysis.md +TEST_GUIDE.md diff --git a/Cargo.toml b/Cargo.toml index dd5f536..d87a336 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,20 +38,23 @@ tokio-util = { version = "0.7", features = ["codec"] } tokio-tungstenite = "0.29" tonic = { version = "0.14" } -tonic-reflection = "0.14" -tonic-build = "0.14" -tonic-prost-build = "0.14" -prost = "0.14" -tonic-prost = "0.14" +tonic-prost = { version = "0.14" } +prost = { version = "0.14" } +tonic-prost-build = { version = "0.14" } axum = { version = "0.8", features = ["ws", "macros"] } tower = "0.5" tower-http = { version = "0.6", features = ["cors", "trace", "compression-gzip"] } -# rdkafka = { version = "0.39", features = ["cmake-build"] } +hyper = { version = "1.0", features = ["full"] } +http = "1.1" +http-body = "1.0" +rdkafka = { version = "0.39", features = ["cmake-build"] } sqlx = { version = "0.8", default-features = false, features = ["postgres", "runtime-tokio-native-tls", "uuid", "chrono", "migrate", "macros"] } + + serde = { version = "1", features = ["derive"] } serde_json = "1" @@ -81,7 +84,7 @@ fleet-client = { path = "agent/crates/fleet-client" } isolation = { path = "agent/crates/isolation" } agent-bin = { path = "agent/crates/agent-bin" } agent-tracing = { path = "agent/crates/agent-tracing" } -rusqlite = { version = "0.31", features = ["bundled"] 
} +rusqlite = { version = "0.32", features = ["bundled"] } toml = "0.8" thrift = "0.17" diff --git a/EDR_IMPLEMENTATION_GUIDE.md b/EDR_IMPLEMENTATION_GUIDE.md deleted file mode 100644 index ac89939..0000000 --- a/EDR_IMPLEMENTATION_GUIDE.md +++ /dev/null @@ -1,1788 +0,0 @@ -# EDR — Full System Implementation Guide - -> **AXIOM** | Version 1.0 | Rust-First Architecture | Linux Only | eBPF + OSQuery -> -> This document is the single source of truth for building, initializing, and understanding every -> component of the EDR system. Read it fully before writing a single line of code. - ---- - -## Table of Contents - -1. [Architecture Overview](#1-architecture-overview) -2. [Technology Decisions & Rationale](#2-technology-decisions--rationale) -3. [Repository Strategy — Polyrepo](#3-repository-strategy--polyrepo) -4. [Repository Initialization — Step by Step](#4-repository-initialization--step-by-step) -5. [Linux Node Agent — Deep Dive](#5-linux-node-agent--deep-dive) -6. [Fleet Server — Deep Dive](#6-fleet-server--deep-dive) -7. [Kafka Pipeline — Deep Dive](#7-kafka-pipeline--deep-dive) -8. [Rule Engine — Deep Dive](#8-rule-engine--deep-dive) -9. [API Backend — Deep Dive](#9-api-backend--deep-dive) -10. [Frontend Dashboard — Deep Dive](#10-frontend-dashboard--deep-dive) -11. [EDR SDK — Deep Dive](#11-edr-sdk--deep-dive) -12. [Docker & Container Strategy](#12-docker--container-strategy) -13. [Inter-Service Communication](#13-inter-service-communication) -14. [Database Design](#14-database-design) -15. [GitHub Actions CI/CD](#15-github-actions-cicd) -16. [Phase Roadmap](#16-phase-roadmap) - ---- - -## 1. 
Architecture Overview - -### System Data Flow - -``` -┌─────────────────────────────────────────────────────────────────────┐ -│ LINUX ENDPOINTS │ -│ │ -│ ┌──────────────────────────────────────────────────────────────┐ │ -│ │ edr-agent (Rust) │ │ -│ │ │ │ -│ │ ┌─────────────┐ ┌──────────────┐ ┌──────────────────┐ │ │ -│ │ │ eBPF Probes │ │ OSQuery Shim │ │ Local Buffer │ │ │ -│ │ │ (Rust/C) │ │ (Rust) │ │ (RocksDB/sled) │ │ │ -│ │ └──────┬──────┘ └──────┬───────┘ └────────┬─────────┘ │ │ -│ │ │ │ │ │ │ -│ │ └────────┬────────┘ │ │ │ -│ │ ▼ │ │ │ -│ │ ┌────────────────┐ buffers to ───────┘ │ │ -│ │ │ Event Stream │◄────────────────────── │ │ -│ │ │ Aggregator │ │ │ -│ │ └───────┬────────┘ │ │ -│ │ │ gRPC (TLS) bidirectional stream │ │ -│ └─────────────────┼───────────────────────────────────────────┘ │ -│ │ │ -└────────────────────┼──────────────────────────────────────────────┘ - │ - ▼ -┌────────────────────────────────────────────────────────────────────┐ -│ edr-fleet-server (Rust/Axum/Tokio) │ -│ │ -│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────────┐ │ -│ │ Enrollment │ │ Config Mgr │ │ Command & Control │ │ -│ │ & Auth │ │ (push) │ │ (isolation relay) │ │ -│ └──────────────┘ └──────────────┘ └──────────────────────────┘ │ -│ │ -│ ┌────────────────────────────────────────────────────────────────┐ │ -│ │ Kafka Producer (rdkafka) │ │ -│ │ topic: edr.events.raw | topic: edr.health │ │ -│ └────────────────────────────────────────────────────────────────┘ │ -└───────────────────────────────┬────────────────────────────────────┘ - │ - ▼ -┌───────────────────────────────────────────────────────────────────┐ -│ Apache Kafka (Docker) │ -│ │ -│ topic: edr.events.raw ──────────► Event Processor consumes │ -│ topic: edr.events.norm ──────────► Rule Engine consumes │ -│ topic: edr.alerts ──────────► API Backend consumes │ -│ topic: edr.health ──────────► API Backend consumes │ -└───────────────────────────────────────────────────────────────────┘ - │ - 
┌─────────────────┼──────────────────┐ - ▼ ▼ ▼ -┌─────────────────┐ ┌──────────────────┐ ┌───────────────────────┐ -│ edr-event- │ │ edr-rule-engine │ │ edr-ml-detection │ -│ processor(Rust) │ │ (Rust) │ │ (Python - separate) │ -│ │ │ │ │ │ -│ Kafka Consumer │ │ YARA scanning │ │ Anomaly detection │ -│ Normalizer │ │ MITRE mapping │ │ Inference pipeline │ -│ PostgreSQL write│ │ Alert generation │ │ Threat scoring │ -└────────┬────────┘ └────────┬─────────┘ └──────────┬────────────┘ - │ │ │ - ▼ ▼ ▼ -┌────────────────────────────────────────────────────────────────────┐ -│ PostgreSQL (Docker) │ -│ │ -│ edr_logs_db — all raw + normalised event logs │ -│ edr_nodes_db — node registry, health, config state │ -│ edr_alerts_db — alerts, MITRE mappings, threat scores │ -└───────────────────────────────┬────────────────────────────────────┘ - │ - ▼ -┌───────────────────────────────────────────────────────────────────┐ -│ edr-api-backend (Rust/Axum/Tokio) │ -│ │ -│ REST endpoints ──► Node list, logs query, alerts query │ -│ WebSocket ──► Real-time alert + health push │ -│ JWT Auth ──► Operator authentication │ -└───────────────────────────────┬────────────────────────────────────┘ - │ - ▼ -┌───────────────────────────────────────────────────────────────────┐ -│ edr-frontend (React + Vite) │ -│ │ -│ Node Map | Live Logs | Alerts Panel | Node Controls │ -└───────────────────────────────────────────────────────────────────┘ -``` - -### Component Summary - -| Repository | Language | Role | Scales? 
| -|---|---|---|---| -| `edr-agent` | Rust (Cargo workspace) | Runs on every monitored endpoint | Per-node | -| `edr-fleet-server` | Rust (Axum + Tokio) | Central gRPC server for all agents | Horizontal | -| `edr-kafka-pipeline` | Rust (rdkafka) | Event processing + DB writes | Horizontal | -| `edr-rule-engine` | Rust | YARA + MITRE detection | Horizontal | -| `edr-api-backend` | Rust (Axum + Tokio) | REST + WebSocket for frontend | Horizontal | -| `edr-frontend` | React + Vite + TypeScript | Operator dashboard | Static CDN | -| `edr-sdk` | Rust (lib crate) | Shared types, proto, client helpers | Library | -| `edr-infra` | Docker Compose + K8s | All infrastructure definitions | — | - ---- - -## 2. Technology Decisions & Rationale - -### Kafka vs Zenoh — Decision: Kafka - -Zenoh is excellent for IoT robotics pub-sub with sub-millisecond latency. However EDR has different requirements: - -**Why Kafka wins for EDR:** -- **Durability**: Events are persisted to disk with configurable retention. If the Rule Engine is down, events queue. With Zenoh, you lose them. -- **Consumer Groups**: Multiple Rule Engine instances can consume in parallel, each processing different partitions — native horizontal scaling. -- **Replay**: If the ML model is retrained, you can replay the last 7 days of events through it from Kafka. Impossible with Zenoh. -- **Exactly-once semantics**: Security pipelines cannot lose or double-process events. Kafka supports this natively. -- **Rust ecosystem**: `rdkafka` (librdkafka bindings) is mature, production-tested, and well maintained. -- **Operational tooling**: Kafka UI, consumer lag monitoring, partition rebalancing — all battle-tested. - -Zenoh would be appropriate if agents were IoT devices with sub-100ms latency requirements and no persistence needs. That is not EDR. - -### Rust Framework — Decision: Axum + Tokio (No Rocket, No Actix) - -- **Tokio** is the async runtime. Everything in the Rust async ecosystem is built around it. 
-- **Axum** is built by the Tokio team, uses Tower middleware natively, compiles faster than Actix, has excellent ergonomics for extractors and state management, and does not require macros for routing. -- **Actix-web** uses its own actor runtime which can conflict with Tokio in complex multi-crate workspaces. It also has a steeper learning curve for middleware composition. -- **Rocket** requires nightly Rust features and has slower compile times. - -### gRPC Framework — Decision: Tonic - -`tonic` is the standard async gRPC library for Rust built on Tokio. It generates Rust code from `.proto` files via `prost` and supports bidirectional streaming which the agent↔fleet connection requires. - -### Agent Architecture — Decision: Cargo Workspace - -The agent has multiple concerns (eBPF, OSQuery integration, buffer management, gRPC client) that are best separated into individual crates within one Cargo workspace. This enables: -- Independent compilation of each subsystem -- Clean dependency boundaries -- Shared types via a common crate -- Easy testing of individual components in isolation - ---- - -## 3. Repository Strategy — Polyrepo - -### Why Polyrepo, Not Monorepo - -The EDR system has components in fundamentally different languages and build systems: Rust (Cargo), React (Vite/npm), and Python (ML). A monorepo would require a meta-build system (Bazel/Nx) to manage cross-language builds, which adds complexity that is not justified at this stage. - -More importantly, each component has a completely independent deployment lifecycle: -- The agent is compiled to a binary and distributed to endpoints. It does not redeploy when the frontend changes. -- The frontend is a static build served by a CDN or nginx. It does not need Rust toolchain installed. -- The fleet server is a long-running service that scales independently. - -**Polyrepo with a shared `edr-sdk` crate** gives the independence of separate repos while maintaining shared type safety. 
- -### Repositories - -``` -github.com/your-org/ -├── edr-agent ← Cargo workspace, compiled binary for Linux endpoints -├── edr-fleet-server ← Rust/Axum, Docker image -├── edr-kafka-pipeline ← Rust/rdkafka, Docker image -├── edr-rule-engine ← Rust, Docker image -├── edr-api-backend ← Rust/Axum, Docker image -├── edr-frontend ← React/Vite/TypeScript, static build -├── edr-sdk ← Rust library crate (published internally or as git dep) -└── edr-infra ← Docker Compose, K8s manifests, Terraform -``` - -### How Shared Types Work (edr-sdk) - -All services that communicate share types via `edr-sdk`. In `Cargo.toml`: - -```toml -[dependencies] -edr-sdk = { git = "https://github.com/your-org/edr-sdk", tag = "v0.1.0" } -``` - -This means when a proto definition changes, it changes in `edr-sdk`, a new tag is cut, and each consuming service bumps its dependency. Breaking changes are visible at compile time. - ---- - -## 4. Repository Initialization — Step by Step - -### Step 1 — Create All Repos on GitHub - -Create each repo as **private**. Initialize with a `README.md` only — no auto-generated code. - -```bash -# Using GitHub CLI (gh) -for repo in edr-agent edr-fleet-server edr-kafka-pipeline edr-rule-engine edr-api-backend edr-frontend edr-sdk edr-infra; do - gh repo create your-org/$repo --private --description "EDR: $repo" -done -``` - -### Step 2 — Initialize edr-sdk First (Everything depends on it) - -```bash -git clone git@github.com:your-org/edr-sdk.git -cd edr-sdk - -# Initialize as a Rust library -cargo init --lib . 
- -# Create the directory structure -mkdir -p proto src/{types,auth,events,health} -``` - -`Cargo.toml` for edr-sdk: -```toml -[package] -name = "edr-sdk" -version = "0.1.0" -edition = "2021" - -[dependencies] -serde = { version = "1", features = ["derive"] } -serde_json = "1" -uuid = { version = "1", features = ["v4", "serde"] } -chrono = { version = "0.4", features = ["serde"] } -prost = "0.12" -tonic = "0.11" - -[build-dependencies] -tonic-build = "0.11" -``` - -`build.rs` for edr-sdk: -```rust -fn main() -> Result<(), Box<dyn std::error::Error>> { - tonic_build::configure() - .build_server(true) - .build_client(true) - .compile( - &[ - "proto/agent.proto", - "proto/fleet.proto", - "proto/events.proto", - ], - &["proto/"], - )?; - Ok(()) -} -``` - -### Step 3 — Initialize edr-agent as Cargo Workspace - -```bash -git clone git@github.com:your-org/edr-agent.git -cd edr-agent - -# Create workspace Cargo.toml manually (do NOT run cargo init here) -``` - -Create `Cargo.toml` (workspace root): -```toml -[workspace] -resolver = "2" -members = [ - "crates/agent-core", - "crates/ebpf-collector", - "crates/osquery-client", - "crates/event-buffer", - "crates/fleet-client", - "crates/isolation", -] - -[workspace.dependencies] -# Pin versions once here, reference in member crates with { workspace = true } -tokio = { version = "1", features = ["full"] } -tonic = "0.11" -serde = { version = "1", features = ["derive"] } -serde_json = "1" -tracing = "0.1" -tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } -anyhow = "1" -thiserror = "1" -uuid = { version = "1", features = ["v4", "serde"] } -chrono = { version = "0.4", features = ["serde"] } -edr-sdk = { git = "https://github.com/your-org/edr-sdk", tag = "v0.1.0" } -``` - -Initialize each crate: -```bash -for crate in agent-core ebpf-collector osquery-client event-buffer fleet-client isolation; do - cargo new --lib crates/$crate -done -``` - -### Step 4 — Initialize Rust Services (fleet-server, kafka-pipeline, rule-engine, 
api-backend) - -Each follows the same pattern: - -```bash -cd edr-fleet-server -cargo init --bin . -``` - -`Cargo.toml` template for services: -```toml -[package] -name = "edr-fleet-server" -version = "0.1.0" -edition = "2021" - -[[bin]] -name = "edr-fleet-server" -path = "src/main.rs" - -[dependencies] -tokio = { version = "1", features = ["full"] } -axum = { version = "0.7", features = ["ws"] } -tonic = "0.11" -tower = "0.4" -tower-http = { version = "0.5", features = ["cors", "trace", "compression-gzip"] } -serde = { version = "1", features = ["derive"] } -serde_json = "1" -tracing = "0.1" -tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } -anyhow = "1" -thiserror = "1" -sqlx = { version = "0.7", features = ["postgres", "runtime-tokio-native-tls", "uuid", "chrono"] } -uuid = { version = "1", features = ["v4", "serde"] } -chrono = { version = "0.4", features = ["serde"] } -config = "0.14" -dotenv = "0.15" -edr-sdk = { git = "https://github.com/your-org/edr-sdk", tag = "v0.1.0" } -``` - -### Step 5 — Initialize Frontend - -```bash -cd edr-frontend -npm create vite@latest . 
-- --template react-ts -npm install -npm install axios @tanstack/react-query zustand react-router-dom -npm install -D tailwindcss postcss autoprefixer -npx tailwindcss init -p -``` - -### Step 6 — Initialize edr-infra - -```bash -cd edr-infra -mkdir -p docker k8s/manifests terraform scripts -touch docker-compose.yml docker-compose.dev.yml -touch README.md -``` - -### Step 7 — Set Branch Protection on Every Repo - -For each repo on GitHub: -- `main` → require 2 approvals, all CI checks must pass, no direct push -- `develop` → require 1 approval, CI must pass - -### Step 8 — Add .github/ to Every Repo - -Create `.github/PULL_REQUEST_TEMPLATE.md` in each repo: - -```markdown -## Summary - - -## Type -- [ ] feat — new functionality -- [ ] fix — bug fix -- [ ] chore — dependency update, refactor, tooling -- [ ] docs — documentation only -- [ ] sec — security fix or hardening - -## Checklist -- [ ] Linked issue: closes # -- [ ] No secrets or credentials in code -- [ ] Tests added or updated -- [ ] `docker-compose up` tested locally -- [ ] Breaking changes documented in PR description - -## How to verify - -``` - ---- - -## 5. 
Linux Node Agent — Deep Dive - -### Cargo Workspace Structure - -``` -edr-agent/ -├── Cargo.toml ← workspace root -├── Cargo.lock -├── .cargo/ -│ └── config.toml ← linker config for eBPF targets -├── crates/ -│ ├── agent-core/ ← binary entry point, orchestrator -│ │ ├── Cargo.toml -│ │ └── src/ -│ │ ├── main.rs -│ │ ├── config.rs ← reads agent config from Fleet Server -│ │ └── orchestrator.rs ← spawns all subsystem tasks -│ │ -│ ├── ebpf-collector/ ← eBPF programs and loader -│ │ ├── Cargo.toml -│ │ ├── src/ -│ │ │ ├── lib.rs -│ │ │ ├── loader.rs ← loads compiled eBPF objects -│ │ │ └── events.rs ← parses perf buffer events -│ │ └── bpf/ ← eBPF C programs compiled with clang -│ │ ├── process_probe.bpf.c -│ │ ├── file_probe.bpf.c -│ │ └── network_probe.bpf.c -│ │ -│ ├── osquery-client/ ← OSQuery socket IPC client -│ │ ├── Cargo.toml -│ │ └── src/ -│ │ ├── lib.rs -│ │ ├── client.rs ← thrift/unix socket client -│ │ └── queries.rs ← scheduled query definitions -│ │ -│ ├── event-buffer/ ← local disk buffer (sled embedded DB) -│ │ ├── Cargo.toml -│ │ └── src/ -│ │ ├── lib.rs -│ │ └── buffer.rs ← write-ahead buffer, flush on reconnect -│ │ -│ ├── fleet-client/ ← gRPC client to Fleet Server -│ │ ├── Cargo.toml -│ │ └── src/ -│ │ ├── lib.rs -│ │ ├── connection.rs ← manages gRPC channel with reconnect -│ │ └── stream.rs ← bidirectional event stream -│ │ -│ └── isolation/ ← IPTables-based network isolation -│ ├── Cargo.toml -│ └── src/ -│ ├── lib.rs -│ └── iptables.rs ← adds/removes isolation rules -│ -├── Dockerfile -└── build.rs ← compiles eBPF C programs via clang -``` - -### Required Crates per Sub-Crate - -**agent-core:** -```toml -[dependencies] -tokio = { workspace = true } -tracing = { workspace = true } -tracing-subscriber = { workspace = true } -anyhow = { workspace = true } -config = "0.14" -serde = { workspace = true } -edr-sdk = { workspace = true } -ebpf-collector = { path = "../ebpf-collector" } -osquery-client = { path = "../osquery-client" } -event-buffer = { 
path = "../event-buffer" } -fleet-client = { path = "../fleet-client" } -isolation = { path = "../isolation" } -``` - -**ebpf-collector:** -```toml -[dependencies] -aya = "0.12" # eBPF loader — pure Rust, no libbpf C dependency -aya-log = "0.2" -tokio = { workspace = true } -tracing = { workspace = true } -anyhow = { workspace = true } -bytes = "1" -edr-sdk = { workspace = true } - -[build-dependencies] -aya-build = "0.1" # compiles BPF C programs at build time -``` - -**osquery-client:** -```toml -[dependencies] -tokio = { workspace = true } -serde = { workspace = true } -serde_json = { workspace = true } -anyhow = { workspace = true } -tokio-util = { version = "0.7", features = ["codec"] } -edr-sdk = { workspace = true } -``` - -**event-buffer:** -```toml -[dependencies] -sled = "0.34" # embedded key-value store, perfect for WAL buffer -tokio = { workspace = true } -serde = { workspace = true } -serde_json = { workspace = true } -anyhow = { workspace = true } -edr-sdk = { workspace = true } -``` - -**fleet-client:** -```toml -[dependencies] -tonic = { workspace = true } -tokio = { workspace = true } -tokio-stream = "0.1" -tower = "0.4" -anyhow = { workspace = true } -tracing = { workspace = true } -edr-sdk = { workspace = true } -``` - -**isolation:** -```toml -[dependencies] -tokio = { workspace = true } -anyhow = { workspace = true } -tracing = { workspace = true } -# Uses std::process::Command to invoke iptables — no external crate needed -``` - -### eBPF Probes — What They Capture - -Three BPF programs, each attached to specific kernel hooks: - -**process_probe.bpf.c** — attaches to `sys_enter_execve`: -- Process name, PID, PPID -- Full command-line arguments -- User ID, effective UID -- Working directory - -**file_probe.bpf.c** — attaches to `sys_enter_openat`, `sys_enter_unlinkat`: -- File path accessed -- Operation (open/read/write/delete) -- Process that triggered it -- Return code (was it successful?) 
- -**network_probe.bpf.c** — attaches to `sys_enter_connect`, `sys_enter_bind`: -- Source IP, destination IP -- Source port, destination port -- Protocol (TCP/UDP) -- Process that made the call - -### OSQuery Scheduled Queries - -Queries run on configurable intervals (pushed from Fleet Server): - -```json -{ - "schedule": { - "running_processes": { - "query": "SELECT pid, name, path, cmdline, uid, parent FROM processes;", - "interval": 30 - }, - "active_connections": { - "query": "SELECT pid, local_address, local_port, remote_address, remote_port, state FROM process_open_sockets WHERE state = 'ESTABLISHED';", - "interval": 60 - }, - "file_events": { - "query": "SELECT target_path, action, time, auid FROM file_events;", - "interval": 15 - }, - "logged_in_users": { - "query": "SELECT user, tty, host, time, pid FROM logged_in_users;", - "interval": 120 - }, - "installed_packages": { - "query": "SELECT name, version, source FROM deb_packages;", - "interval": 3600 - } - } -} -``` - -### Agent State Machine - -``` -INITIALIZING - │ - ▼ -ENROLLING ──────────────────► ENROLLMENT_FAILED (retry with backoff) - │ - ▼ -CONNECTED - │ - ├── collecting (eBPF events flowing) - ├── collecting (OSQuery results flowing) - ├── buffering (if connection drops → RECONNECTING) - │ - ▼ -RECONNECTING ───────────────► exponential backoff, drain buffer on reconnect - │ - ▼ -ISOLATING ──────────────────► received ISOLATE command from Fleet Server - │ - ▼ -ISOLATED ───────────────────► IPTables rules active, only Fleet Server reachable -``` - -### Agent Config File (`/etc/edr/agent.toml`) - -```toml -[fleet] -endpoint = "https://fleet.internal:50051" -tls_cert = "/etc/edr/certs/agent.crt" -tls_key = "/etc/edr/certs/agent.key" -ca_cert = "/etc/edr/certs/ca.crt" - -[agent] -node_id = "" # populated on first enrollment, persisted -buffer_path = "/var/lib/edr/buffer" -log_level = "info" - -[osquery] -socket_path = "/var/osquery/osquery.em" -config_refresh_interval_secs = 300 - -[ebpf] 
-ringbuf_size_pages = 256 -``` - ---- - -## 6. Fleet Server — Deep Dive - -### Architecture - -The Fleet Server is a single horizontally-scalable Rust service. Multiple instances can run behind a load balancer. Shared state lives in PostgreSQL, not in-memory, so any instance can handle any agent connection. - -``` -edr-fleet-server/ -├── Cargo.toml -├── src/ -│ ├── main.rs ← starts Tokio runtime, binds servers -│ ├── config.rs ← reads env + config file -│ ├── state.rs ← AppState shared via Arc<> -│ │ -│ ├── grpc/ ← tonic gRPC server -│ │ ├── mod.rs -│ │ ├── server.rs ← implements FleetService trait (from proto) -│ │ ├── enrollment.rs ← handles RegisterAgent RPC -│ │ ├── stream.rs ← handles bidirectional EventStream RPC -│ │ └── commands.rs ← sends IsolateNode / PushConfig commands -│ │ -│ ├── db/ ← sqlx database layer -│ │ ├── mod.rs -│ │ ├── nodes.rs ← CRUD for node registry -│ │ ├── health.rs ← node heartbeat tracking -│ │ └── config.rs ← agent config storage -│ │ -│ ├── kafka/ ← rdkafka producer -│ │ ├── mod.rs -│ │ └── producer.rs ← publishes events to edr.events.raw topic -│ │ -│ └── error.rs ← unified error types -│ -├── migrations/ ← sqlx migrations for nodes_db -│ ├── 001_create_nodes.sql -│ └── 002_create_agent_configs.sql -│ -└── Dockerfile -``` - -### Key Crates - -```toml -[dependencies] -tokio = { version = "1", features = ["full"] } -axum = "0.7" # HTTP admin API (health check, metrics) -tonic = "0.11" # gRPC server -tonic-reflection = "0.11" # gRPC reflection for tooling -tower = "0.4" -tower-http = { version = "0.5", features = ["trace"] } -prost = "0.12" - -sqlx = { version = "0.7", features = ["postgres", "runtime-tokio-native-tls", "uuid", "chrono", "migrate"] } -serde = { version = "1", features = ["derive"] } -serde_json = "1" -tracing = "0.1" -tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } -uuid = { version = "1", features = ["v4", "serde"] } -chrono = { version = "0.4", features = ["serde"] } -jsonwebtoken = "9" # 
for agent JWT tokens on enrollment -config = "0.14" -anyhow = "1" -thiserror = "1" -edr-sdk = { git = "...", tag = "v0.1.0" } -``` - -### gRPC Proto Definition (lives in edr-sdk/proto/fleet.proto) - -```protobuf -syntax = "proto3"; -package edr.fleet; - -service FleetService { - // Agent calls this once to register. Returns a JWT token. - rpc RegisterAgent(RegisterRequest) returns (RegisterResponse); - - // Bidirectional stream: agent sends events, server sends commands. - rpc EventStream(stream AgentEvent) returns (stream ServerCommand); - - // Agent sends periodic heartbeat. - rpc Heartbeat(HeartbeatRequest) returns (HeartbeatResponse); -} - -message RegisterRequest { - string hostname = 1; - string os_version = 2; - string agent_version = 3; - string machine_id = 4; // from /etc/machine-id -} - -message RegisterResponse { - string node_id = 1; // UUID assigned by server - string token = 2; // JWT for subsequent calls - AgentConfig config = 3; // initial configuration -} - -message AgentEvent { - string node_id = 1; - string event_type = 2; // "process" | "file" | "network" | "osquery" - bytes payload = 3; // JSON-encoded event data - int64 timestamp_ns = 4; - string sequence_id = 5; -} - -message ServerCommand { - oneof command { - IsolateCommand isolate = 1; - ConfigUpdateCommand config_update = 2; - AckCommand ack = 3; - } -} - -message IsolateCommand { - bool isolate = 1; // true = isolate, false = de-isolate - string reason = 2; -} - -message ConfigUpdateCommand { - AgentConfig config = 1; -} - -message AgentConfig { - repeated OsquerySchedule osquery_schedule = 1; - int32 heartbeat_interval_secs = 2; - int32 batch_size = 3; -} - -message HeartbeatRequest { - string node_id = 1; - string status = 2; // "healthy" | "degraded" | "isolated" - int64 events_buffered = 3; -} - -message HeartbeatResponse { - bool ok = 1; -} -``` - -### Node Enrollment Flow - -``` -Agent Fleet Server PostgreSQL - │ │ │ - │──── RegisterAgent(hostname) ────►│ │ - │ │── INSERT nodes 
──────────────►│ - │ │◄─ node_id returned ───────────│ - │ │── sign JWT(node_id, exp=24h) ─│ - │◄─── RegisterResponse(token) ─────│ │ - │ │ │ - │── EventStream (with JWT header) ►│ │ - │ │── verify JWT ─────────────────│ - │◄─── ServerCommand(config) ───────│ │ - │ │ │ - │ [stream stays open] │ │ - │──── AgentEvent (events) ────────►│ │ - │ │── produce to Kafka ───────────► -``` - -### Horizontal Scaling Design - -When multiple fleet-server instances run: -- Each agent connects to one instance (sticky via load balancer) -- Node state in PostgreSQL — any instance can read any node's state -- Isolation commands: operator triggers via API Backend → writes command to PostgreSQL `pending_commands` table → fleet-server instances poll for pending commands and relay to connected agents -- No shared in-memory state between instances - -### Fleet Server Environment Variables - -```bash -DATABASE_URL=postgres://user:pass@postgres:5432/edr_nodes -KAFKA_BROKERS=kafka:9092 -GRPC_BIND_ADDR=0.0.0.0:50051 -HTTP_BIND_ADDR=0.0.0.0:8080 # admin/health HTTP port -JWT_SECRET= -LOG_LEVEL=info -RUST_LOG=edr_fleet_server=info,tonic=warn -``` - ---- - -## 7. 
Kafka Pipeline — Deep Dive - -### Topic Design - -| Topic | Producer | Consumer | Retention | Partitions | -|---|---|---|---|---| -| `edr.events.raw` | Fleet Server | Event Processor | 7 days | 12 | -| `edr.events.norm` | Event Processor | Rule Engine, ML | 7 days | 12 | -| `edr.alerts` | Rule Engine, ML | API Backend | 30 days | 4 | -| `edr.health` | Fleet Server | API Backend | 1 day | 4 | - -### edr-kafka-pipeline Structure - -``` -edr-kafka-pipeline/ -├── Cargo.toml -├── src/ -│ ├── main.rs -│ ├── config.rs -│ ├── consumer.rs ← consumes edr.events.raw -│ ├── normalizer.rs ← parses + normalises raw events to common schema -│ ├── producer.rs ← publishes to edr.events.norm -│ ├── db_writer.rs ← writes to edr_logs_db PostgreSQL -│ └── error.rs -└── Dockerfile -``` - -### Key Crates - -```toml -[dependencies] -tokio = { version = "1", features = ["full"] } -rdkafka = { version = "0.36", features = ["cmake-build", "ssl"] } -sqlx = { version = "0.7", features = ["postgres", "runtime-tokio-native-tls", "uuid", "chrono"] } -serde = { version = "1", features = ["derive"] } -serde_json = "1" -tracing = "0.1" -tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } -anyhow = "1" -thiserror = "1" -edr-sdk = { git = "...", tag = "v0.1.0" } -``` - -### Normalised Event Schema (defined in edr-sdk) - -```rust -// edr-sdk/src/events.rs -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct NormalisedEvent { - pub id: Uuid, - pub node_id: Uuid, - pub event_type: EventType, - pub timestamp: DateTime<Utc>, - pub hostname: String, - pub payload: EventPayload, - pub raw_sequence_id: String, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(tag = "type", rename_all = "snake_case")] -pub enum EventPayload { - Process(ProcessEvent), - File(FileEvent), - Network(NetworkEvent), - OsqueryResult(OsqueryEvent), -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ProcessEvent { - pub pid: u32, - pub ppid: u32, - pub name: String, - pub cmdline:
String, - pub uid: u32, - pub exe_path: String, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct FileEvent { - pub path: String, - pub operation: FileOperation, // Open | Write | Delete | Rename - pub pid: u32, - pub process_name: String, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct NetworkEvent { - pub src_ip: String, - pub dst_ip: String, - pub src_port: u16, - pub dst_port: u16, - pub protocol: String, - pub pid: u32, - pub direction: NetworkDirection, // Inbound | Outbound -} -``` - -### Docker — Kafka and PostgreSQL Containers - -**Important decision**: Kafka and PostgreSQL run in **separate containers**. Do not combine them. They have different resource profiles (Kafka is I/O bound, PostgreSQL is memory bound), different restart policies, and different backup strategies. Combining them in one container is an operational anti-pattern. - -`edr-infra/docker-compose.yml`: - -```yaml -version: "3.9" - -services: - - zookeeper: - image: confluentinc/cp-zookeeper:7.6.0 - container_name: edr-zookeeper - environment: - ZOOKEEPER_CLIENT_PORT: 2181 - ZOOKEEPER_TICK_TIME: 2000 - volumes: - - zookeeper_data:/var/lib/zookeeper/data - healthcheck: - test: ["CMD", "nc", "-z", "localhost", "2181"] - interval: 10s - timeout: 5s - retries: 5 - - kafka: - image: confluentinc/cp-kafka:7.6.0 - container_name: edr-kafka - depends_on: - zookeeper: - condition: service_healthy - ports: - - "9092:9092" - environment: - KAFKA_BROKER_ID: 1 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT - KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092 - KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 - KAFKA_LOG_RETENTION_HOURS: 168 - KAFKA_AUTO_CREATE_TOPICS_ENABLE: "false" - volumes: - - kafka_data:/var/lib/kafka/data - healthcheck: - test: ["CMD", "kafka-topics", "--bootstrap-server", "localhost:29092", 
"--list"] - interval: 15s - timeout: 10s - retries: 5 - - kafka-init: - image: confluentinc/cp-kafka:7.6.0 - depends_on: - kafka: - condition: service_healthy - entrypoint: ["/bin/sh", "-c"] - command: | - " - kafka-topics --bootstrap-server kafka:29092 --create --if-not-exists --topic edr.events.raw --partitions 12 --replication-factor 1 - kafka-topics --bootstrap-server kafka:29092 --create --if-not-exists --topic edr.events.norm --partitions 12 --replication-factor 1 - kafka-topics --bootstrap-server kafka:29092 --create --if-not-exists --topic edr.alerts --partitions 4 --replication-factor 1 - kafka-topics --bootstrap-server kafka:29092 --create --if-not-exists --topic edr.health --partitions 4 --replication-factor 1 - echo 'Topics created.' - " - restart: on-failure - - postgres-logs: - image: postgres:16-alpine - container_name: edr-postgres-logs - environment: - POSTGRES_DB: edr_logs - POSTGRES_USER: edr - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} - ports: - - "5432:5432" - volumes: - - postgres_logs_data:/var/lib/postgresql/data - healthcheck: - test: ["CMD-SHELL", "pg_isready -U edr -d edr_logs"] - interval: 10s - timeout: 5s - retries: 5 - - postgres-nodes: - image: postgres:16-alpine - container_name: edr-postgres-nodes - environment: - POSTGRES_DB: edr_nodes - POSTGRES_USER: edr - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} - ports: - - "5433:5432" - volumes: - - postgres_nodes_data:/var/lib/postgresql/data - healthcheck: - test: ["CMD-SHELL", "pg_isready -U edr -d edr_nodes"] - interval: 10s - timeout: 5s - retries: 5 - - postgres-alerts: - image: postgres:16-alpine - container_name: edr-postgres-alerts - environment: - POSTGRES_DB: edr_alerts - POSTGRES_USER: edr - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} - ports: - - "5434:5432" - volumes: - - postgres_alerts_data:/var/lib/postgresql/data - healthcheck: - test: ["CMD-SHELL", "pg_isready -U edr -d edr_alerts"] - interval: 10s - timeout: 5s - retries: 5 - - kafka-ui: - image: provectuslabs/kafka-ui:latest - 
container_name: edr-kafka-ui - depends_on: - - kafka - ports: - - "8090:8080" - environment: - KAFKA_CLUSTERS_0_NAME: edr-local - KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS: kafka:29092 - -volumes: - zookeeper_data: - kafka_data: - postgres_logs_data: - postgres_nodes_data: - postgres_alerts_data: -``` - ---- - -## 8. Rule Engine — Deep Dive - -``` -edr-rule-engine/ -├── Cargo.toml -├── src/ -│ ├── main.rs -│ ├── config.rs -│ ├── consumer.rs ← Kafka consumer for edr.events.norm -│ ├── yara_scanner.rs ← YARA rule evaluation -│ ├── mitre_mapper.rs ← MITRE ATT&CK technique lookup -│ ├── alert_producer.rs ← Kafka producer to edr.alerts -│ ├── db_writer.rs ← writes alerts to edr_alerts_db -│ └── rules/ -│ └── loader.rs ← loads rules from /etc/edr/rules/*.yar -│ -├── rules/ ← default YARA rules (shipped with container) -│ ├── process_injection.yar -│ ├── credential_access.yar -│ └── persistence.yar -│ -└── Dockerfile -``` - -### Key Crates - -```toml -[dependencies] -tokio = { version = "1", features = ["full"] } -rdkafka = { version = "0.36", features = ["cmake-build"] } -yara-x = "0.5" # Pure Rust YARA implementation (no C library dependency) -sqlx = { version = "0.7", features = ["postgres", "runtime-tokio-native-tls"] } -serde = { version = "1", features = ["derive"] } -serde_json = "1" -tracing = "0.1" -anyhow = "1" -edr-sdk = { git = "...", tag = "v0.1.0" } -``` - -### Alert Schema (defined in edr-sdk) - -```rust -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Alert { - pub id: Uuid, - pub node_id: Uuid, - pub hostname: String, - pub timestamp: DateTime<Utc>, - pub severity: Severity, // Critical | High | Medium | Low - pub source: AlertSource, // Yara | MlModel | RuleEngine - pub mitre_technique_id: Option<String>, // e.g. "T1059.004" - pub mitre_tactic: Option<String>, // e.g. "Execution" - pub description: String, - pub triggering_event_id: Uuid, - pub threat_score: f32, // 0.0 – 100.0 - pub status: AlertStatus, // Open | Acknowledged | Dismissed -} -``` - ---- - -## 9.
API Backend — Deep Dive - -### Architecture - -``` -edr-api-backend/ -├── Cargo.toml -├── src/ -│ ├── main.rs -│ ├── config.rs -│ ├── state.rs ← Arc<AppState> with DB pools + WS broadcaster -│ │ -│ ├── routes/ -│ │ ├── mod.rs -│ │ ├── auth.rs ← POST /auth/login, POST /auth/refresh -│ │ ├── nodes.rs ← GET /nodes, GET /nodes/:id -│ │ ├── logs.rs ← GET /nodes/:id/logs -│ │ ├── alerts.rs ← GET /alerts, PATCH /alerts/:id -│ │ ├── commands.rs ← POST /nodes/:id/isolate, POST /nodes/:id/deisolate -│ │ └── ws.rs ← GET /ws (WebSocket upgrade) -│ │ -│ ├── middleware/ -│ │ ├── auth.rs ← JWT extraction + validation layer -│ │ └── logging.rs -│ │ -│ ├── db/ -│ │ ├── nodes.rs -│ │ ├── logs.rs -│ │ └── alerts.rs -│ │ -│ ├── kafka/ -│ │ └── consumer.rs ← consumes edr.alerts + edr.health, broadcasts to WS -│ │ -│ └── error.rs -└── Dockerfile -``` - -### API Routes Reference - -**Authentication** - -| Method | Path | Description | -|---|---|---| -| POST | `/auth/login` | Operator login → returns JWT access + refresh token | -| POST | `/auth/refresh` | Refresh access token using refresh token | -| POST | `/auth/logout` | Invalidate refresh token | - -**Nodes** - -| Method | Path | Description | -|---|---|---| -| GET | `/nodes` | List all enrolled nodes with current status | -| GET | `/nodes/:id` | Single node detail (OS info, last seen, alert count, isolation state) | -| GET | `/nodes/:id/logs` | Paginated log query. Query params: `from`, `to`, `type`, `limit`, `offset` | - -**Alerts** - -| Method | Path | Description | -|---|---|---| -| GET | `/alerts` | List alerts. Query params: `severity`, `status`, `from`, `to`, `node_id` | -| GET | `/alerts/:id` | Single alert with full MITRE context | -| PATCH | `/alerts/:id` | Update alert status (`acknowledged` / `dismissed`) | - -**Commands** - -| Method | Path | Description | -|---|---|---| -| POST | `/nodes/:id/isolate` | Trigger node isolation.
Body: `{ "reason": "string" }` | -| POST | `/nodes/:id/deisolate` | Remove node isolation | - -**WebSocket** - -| Path | Description | -|---|---| -| `GET /ws` | Upgrade to WebSocket. Server pushes `alert_created`, `node_status_changed`, `node_health` events | - -### WebSocket Message Format - -```json -{ - "event": "alert_created", - "data": { - "alert_id": "uuid", - "node_id": "uuid", - "severity": "High", - "mitre_technique_id": "T1059.004", - "description": "Bash reverse shell detected", - "timestamp": "2025-01-01T00:00:00Z" - } -} -``` - -```json -{ - "event": "node_status_changed", - "data": { - "node_id": "uuid", - "hostname": "prod-server-01", - "status": "isolated", - "timestamp": "2025-01-01T00:00:00Z" - } -} -``` - -### Key Crates - -```toml -[dependencies] -tokio = { version = "1", features = ["full"] } -axum = { version = "0.7", features = ["ws", "macros"] } -tower = "0.4" -tower-http = { version = "0.5", features = ["cors", "trace", "compression-gzip"] } -sqlx = { version = "0.7", features = ["postgres", "runtime-tokio-native-tls", "uuid", "chrono"] } -rdkafka = { version = "0.36", features = ["cmake-build"] } -jsonwebtoken = "9" -argon2 = "0.5" # password hashing for operator accounts -tokio-tungstenite = "0.21" -serde = { version = "1", features = ["derive"] } -serde_json = "1" -tracing = "0.1" -tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } -uuid = { version = "1", features = ["v4", "serde"] } -chrono = { version = "0.4", features = ["serde"] } -anyhow = "1" -thiserror = "1" -edr-sdk = { git = "...", tag = "v0.1.0" } -``` - ---- - -## 10. 
Frontend Dashboard — Deep Dive - -### Stack - -- **React 18** + **TypeScript** + **Vite** (fast dev builds) -- **TailwindCSS** — utility-first styling -- **TanStack Query (React Query)** — server state management, auto-refetch -- **Zustand** — lightweight client state (auth token, UI state) -- **React Router v6** — routing -- **Recharts** — charts for alert trends and threat scores -- **TanStack Table** — virtualized log tables (handles 10k+ rows) - -### Directory Structure - -``` -edr-frontend/ -├── src/ -│ ├── main.tsx -│ ├── App.tsx ← router setup -│ │ -│ ├── api/ ← typed API client wrappers -│ │ ├── client.ts ← axios instance with JWT interceptor -│ │ ├── nodes.ts -│ │ ├── alerts.ts -│ │ ├── logs.ts -│ │ └── auth.ts -│ │ -│ ├── hooks/ ← React Query hooks -│ │ ├── useNodes.ts -│ │ ├── useAlerts.ts -│ │ ├── useLogs.ts -│ │ └── useWebSocket.ts ← WS connection manager with reconnect -│ │ -│ ├── store/ ← Zustand stores -│ │ ├── authStore.ts ← JWT token, user info -│ │ └── uiStore.ts ← selected node, active filters -│ │ -│ ├── pages/ -│ │ ├── LoginPage.tsx -│ │ ├── DashboardPage.tsx -│ │ ├── NodeMapPage.tsx ← grid/list of all nodes -│ │ ├── NodeDetailPage.tsx ← single node logs + alerts -│ │ ├── AlertsPage.tsx ← alerts panel with filters -│ │ └── LiveLogsPage.tsx ← real-time log stream -│ │ -│ ├── components/ -│ │ ├── layout/ -│ │ │ ├── Sidebar.tsx -│ │ │ └── TopBar.tsx -│ │ ├── nodes/ -│ │ │ ├── NodeCard.tsx -│ │ │ ├── NodeStatusBadge.tsx -│ │ │ └── IsolateButton.tsx -│ │ ├── alerts/ -│ │ │ ├── AlertRow.tsx -│ │ │ ├── SeverityBadge.tsx -│ │ │ └── MitreTechniqueTag.tsx -│ │ └── logs/ -│ │ ├── LogTable.tsx -│ │ └── LogTypeFilter.tsx -│ │ -│ └── types/ ← TypeScript interfaces mirroring Rust types -│ ├── node.ts -│ ├── alert.ts -│ └── event.ts -│ -├── index.html -├── vite.config.ts -├── tailwind.config.js -├── tsconfig.json -└── Dockerfile ← nginx serving the static build -``` - -### Views — Implementation Notes - -**Node Map** — polls `GET /nodes` every 30s via React 
Query. Nodes displayed as cards with colour-coded status (green = healthy, yellow = degraded, red = isolated). Clicking a card navigates to NodeDetailPage. - -**Live Logs** — connects to `GET /ws` WebSocket via `useWebSocket` hook. Incoming `log_event` messages appended to a circular buffer (max 500 entries). TanStack Table renders with row virtualisation so DOM does not explode. Filter bar filters client-side. - -**Alerts Panel** — `GET /alerts` with server-side filtering. Each row shows node hostname, MITRE technique ID, severity badge, timestamp. Acknowledge / Dismiss buttons call `PATCH /alerts/:id`. Unacknowledged alert count shown in sidebar badge updated via WebSocket. - -**Node Controls** — `IsolateButton` shows confirmation modal before calling `POST /nodes/:id/isolate`. Optimistic UI update (card immediately shows "Isolating..."), reconciled when WebSocket delivers `node_status_changed`. - ---- - -## 11. EDR SDK — Deep Dive - -The SDK is a Rust library crate that is a **compile-time contract** between all services. - -``` -edr-sdk/ -├── Cargo.toml -├── build.rs ← compiles .proto files via tonic-build -├── proto/ -│ ├── fleet.proto -│ ├── agent.proto -│ └── events.proto -└── src/ - ├── lib.rs - ├── types/ - │ ├── mod.rs - │ ├── node.rs ← Node, NodeStatus, NodeConfig - │ ├── event.rs ← NormalisedEvent, EventPayload variants - │ ├── alert.rs ← Alert, Severity, AlertSource - │ └── auth.rs ← Claims (JWT payload struct) - └── proto/ - └── mod.rs ← re-exports generated tonic code -``` - -### Versioning Strategy - -The SDK uses semantic versioning strictly: -- **Patch** (0.1.1): adding optional fields to existing structs -- **Minor** (0.2.0): adding new message types, backwards-compatible proto changes -- **Major** (1.0.0): breaking changes to existing message fields or RPC signatures - -When `edr-sdk` releases a new tag, each consuming service opens a PR to bump the dependency. 
The PR will fail to compile if the service has not updated its usage of changed types — this is intentional. Breaking changes are caught at compile time, not at runtime. - ---- - -## 12. Docker & Container Strategy - -### Rust Service Dockerfile Template - -Use multi-stage builds to produce minimal images. Final image is `debian:bookworm-slim`, not `scratch`, because Rust services link against `libssl` and `libpq`. - -```dockerfile -# Stage 1: Build -FROM rust:1.78-slim-bookworm AS builder - -RUN apt-get update && apt-get install -y \ - pkg-config libssl-dev libpq-dev cmake \ - && rm -rf /var/lib/apt/lists/* - -WORKDIR /build -COPY Cargo.toml Cargo.lock ./ -# Cache dependencies layer -RUN mkdir src && echo "fn main(){}" > src/main.rs -RUN cargo build --release -RUN rm -f target/release/deps/edr_* - -COPY src ./src -RUN cargo build --release - -# Stage 2: Runtime -FROM debian:bookworm-slim AS runtime - -RUN apt-get update && apt-get install -y \ - libssl3 libpq5 ca-certificates \ - && rm -rf /var/lib/apt/lists/* - -RUN useradd -m -u 1001 -s /bin/bash edr - -COPY --from=builder /build/target/release/edr-fleet-server /usr/local/bin/ -RUN chmod +x /usr/local/bin/edr-fleet-server - -USER edr -EXPOSE 50051 8080 - -ENTRYPOINT ["edr-fleet-server"] -``` - -### Frontend Dockerfile - -```dockerfile -FROM node:20-alpine AS builder -WORKDIR /app -COPY package*.json ./ -RUN npm ci -COPY . . -RUN npm run build - -FROM nginx:alpine AS runtime -COPY --from=builder /app/dist /usr/share/nginx/html -COPY nginx.conf /etc/nginx/conf.d/default.conf -EXPOSE 80 -``` - -### Agent Dockerfile (for testing only — real deployment is a binary) - -```dockerfile -FROM rust:1.78-slim-bookworm AS builder -RUN apt-get update && apt-get install -y \ - pkg-config libssl-dev cmake clang llvm \ - linux-headers-generic \ - && rm -rf /var/lib/apt/lists/* -WORKDIR /build -COPY . . 
-RUN cargo build --release --bin agent-core -``` - -> **Note on the real agent**: In production, the agent is a statically-linked binary (`RUSTFLAGS="-C target-feature=+crt-static"`) installed via a `.deb` or `.rpm` package. It is **not** run inside a container on the monitored endpoint — eBPF probes need direct access to the host kernel. - ---- - -## 13. Inter-Service Communication - -### Communication Matrix - -| From | To | Protocol | Notes | -|---|---|---|---| -| Agent | Fleet Server | gRPC / TLS (mTLS) | Bidirectional streaming | -| Fleet Server | Kafka | Producer (rdkafka) | `edr.events.raw`, `edr.health` | -| Kafka Pipeline | PostgreSQL logs | sqlx | Bulk inserts, batched | -| Rule Engine | Kafka | Consumer (rdkafka) | `edr.events.norm` | -| Rule Engine | Kafka | Producer (rdkafka) | `edr.alerts` | -| Rule Engine | PostgreSQL alerts | sqlx | Alert writes | -| API Backend | PostgreSQL (all 3) | sqlx | Read-mostly | -| API Backend | Kafka | Consumer (rdkafka) | `edr.alerts`, `edr.health` | -| API Backend | Fleet Server | HTTP (internal) | Relay isolation commands | -| Frontend | API Backend | REST + WebSocket | JWT auth | - -### mTLS for Agent ↔ Fleet Server - -Every agent has a unique certificate signed by an internal CA. The Fleet Server validates the client certificate on connection. This prevents rogue agents from injecting data. - -Certificate lifecycle: -1. On first enrollment, agent generates a keypair and sends a CSR in the `RegisterRequest` -2. Fleet Server signs it with the internal CA and returns the cert in `RegisterResponse` -3. Subsequent connections present this cert — the JWT token is a second layer, not a replacement - ---- - -## 14. 
Database Design - -### edr_nodes_db — Node Registry - -```sql --- migrations/001_create_nodes.sql -CREATE TABLE nodes ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - hostname VARCHAR(255) NOT NULL, - os_version VARCHAR(255), - agent_version VARCHAR(50), - machine_id VARCHAR(64) UNIQUE NOT NULL, - enrolled_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - last_seen TIMESTAMPTZ, - status VARCHAR(20) NOT NULL DEFAULT 'online', - -- 'online' | 'offline' | 'isolated' | 'degraded' - ip_address INET, - CONSTRAINT status_check CHECK (status IN ('online','offline','isolated','degraded')) -); - -CREATE TABLE agent_configs ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, - config JSONB NOT NULL, - updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() -); - -CREATE TABLE pending_commands ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, - command JSONB NOT NULL, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - delivered BOOLEAN NOT NULL DEFAULT FALSE, - delivered_at TIMESTAMPTZ -); - -CREATE INDEX idx_nodes_status ON nodes(status); -CREATE INDEX idx_pending_commands_undelivered ON pending_commands(node_id, delivered) WHERE delivered = FALSE; -``` - -### edr_logs_db — Event Storage - -```sql --- Use TimescaleDB extension for time-series performance (optional but recommended) -CREATE TABLE events ( - id UUID NOT NULL DEFAULT gen_random_uuid(), - node_id UUID NOT NULL, - event_type VARCHAR(30) NOT NULL, - -- 'process' | 'file' | 'network' | 'osquery' - timestamp TIMESTAMPTZ NOT NULL, - hostname VARCHAR(255) NOT NULL, - payload JSONB NOT NULL, - sequence_id VARCHAR(64) -) PARTITION BY RANGE (timestamp); - --- Create monthly partitions -CREATE TABLE events_2025_01 PARTITION OF events - FOR VALUES FROM ('2025-01-01') TO ('2025-02-01'); - -CREATE INDEX idx_events_node_time ON events(node_id, timestamp DESC); -CREATE INDEX idx_events_type ON events(event_type, 
timestamp DESC); -CREATE INDEX idx_events_payload ON events USING GIN(payload); -``` - -### edr_alerts_db — Alerts - -```sql -CREATE TABLE alerts ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - node_id UUID NOT NULL, - hostname VARCHAR(255) NOT NULL, - timestamp TIMESTAMPTZ NOT NULL, - severity VARCHAR(20) NOT NULL, - source VARCHAR(20) NOT NULL, - mitre_technique_id VARCHAR(20), - mitre_tactic VARCHAR(100), - description TEXT NOT NULL, - triggering_event_id UUID, - threat_score REAL NOT NULL DEFAULT 0.0, - status VARCHAR(20) NOT NULL DEFAULT 'open', - acknowledged_at TIMESTAMPTZ, - acknowledged_by VARCHAR(100) -); - -CREATE INDEX idx_alerts_node ON alerts(node_id, timestamp DESC); -CREATE INDEX idx_alerts_severity ON alerts(severity, status); -CREATE INDEX idx_alerts_status ON alerts(status, timestamp DESC); -CREATE INDEX idx_alerts_open ON alerts(status, timestamp DESC) WHERE status = 'open'; -``` - ---- - -## 15. GitHub Actions CI/CD - -### Template for Rust Services (`.github/workflows/ci.yml`) - -```yaml -name: CI - -on: - push: - branches: [main, develop] - pull_request: - branches: [main, develop] - -env: - CARGO_TERM_COLOR: always - RUST_BACKTRACE: 1 - -jobs: - lint: - name: Lint - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: dtolnay/rust-toolchain@stable - with: - components: rustfmt, clippy - - uses: Swatinem/rust-cache@v2 - - name: Format check - run: cargo fmt --all -- --check - - name: Clippy - run: cargo clippy --all-targets --all-features -- -D warnings - - test: - name: Test - runs-on: ubuntu-latest - services: - postgres: - image: postgres:16-alpine - env: - POSTGRES_PASSWORD: testpass - POSTGRES_DB: edr_test - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - steps: - - uses: actions/checkout@v4 - - uses: dtolnay/rust-toolchain@stable - - uses: Swatinem/rust-cache@v2 - - name: Run tests - run: cargo test --all - env: - DATABASE_URL: 
postgres://postgres:testpass@localhost:5432/edr_test - - security: - name: Security Audit - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: dtolnay/rust-toolchain@stable - - name: Install cargo-audit - run: cargo install cargo-audit - - name: Audit dependencies - run: cargo audit - - build: - name: Build & Push Image - runs-on: ubuntu-latest - needs: [lint, test, security] - if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/develop' - steps: - - uses: actions/checkout@v4 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - name: Login to GHCR - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Build and push - uses: docker/build-push-action@v5 - with: - push: true - tags: ghcr.io/${{ github.repository }}:${{ github.sha }} - cache-from: type=gha - cache-to: type=gha,mode=max - - scan-image: - name: Scan Image - runs-on: ubuntu-latest - needs: [build] - steps: - - name: Run Trivy - uses: aquasecurity/trivy-action@master - with: - image-ref: ghcr.io/${{ github.repository }}:${{ github.sha }} - format: table - exit-code: 1 - severity: CRITICAL,HIGH -``` - ---- - -## 16. Phase Roadmap - -### Phase 0 — Repository & Infrastructure Foundation -**Goal**: Every repo exists, branch protection is on, infra containers run locally. - -- [ ] Create all 8 repos on GitHub -- [ ] Initialize `edr-sdk` with proto files and shared types (v0.1.0 tag) -- [ ] Initialize `edr-infra` with `docker-compose.yml` (Kafka + 3x PostgreSQL) -- [ ] Add PR templates + issue templates to all repos -- [ ] Set branch protection rules on `main` and `develop` -- [ ] Verify `docker-compose up` starts Kafka, all PostgreSQL instances, and Kafka UI - -**Milestone complete when**: `docker-compose up` works, all repos have `develop` branch, CI skeleton passes. 
- ---- - -### Phase 1 — Agent: OSQuery Integration -**Goal**: Agent reads from OSQuery socket and logs results locally. - -- [ ] `edr-agent` Cargo workspace scaffolded with all 6 crates -- [ ] `osquery-client` crate connects to OSQuery unix socket -- [ ] Scheduled query execution (hardcoded queries initially) -- [ ] Results serialised to `NormalisedEvent` using `edr-sdk` types -- [ ] `event-buffer` crate stores events to `sled` on disk - ---- - -### Phase 2 — Agent: eBPF Probes -**Goal**: eBPF programs compile and stream kernel events to agent. - -- [ ] `process_probe.bpf.c` compiles via `aya-build` -- [ ] `ebpf-collector` crate loads and attaches probes -- [ ] Perf buffer events parsed and converted to `ProcessEvent`, `FileEvent`, `NetworkEvent` -- [ ] Events flow into `event-buffer` - ---- - -### Phase 3 — Fleet Server: Enrollment & Streaming -**Goal**: Agent can enroll, receive config, and stream events to Fleet Server. - -- [ ] `edr-fleet-server` gRPC server implements `RegisterAgent` and `EventStream` -- [ ] PostgreSQL `edr_nodes_db` migrations run on startup -- [ ] Agent enrolls, receives JWT + config -- [ ] Bidirectional stream established, events flow from agent to fleet server -- [ ] Fleet server produces events to Kafka `edr.events.raw` - ---- - -### Phase 4 — Kafka Pipeline & Database -**Goal**: Events flow from Kafka into PostgreSQL, normalised. - -- [ ] `edr-kafka-pipeline` consumes `edr.events.raw` -- [ ] Normaliser converts raw bytes to `NormalisedEvent` -- [ ] Events written to `edr_logs_db` -- [ ] Produces to `edr.events.norm` - ---- - -### Phase 5 — Rule Engine: YARA + MITRE -**Goal**: Alerts generated for suspicious events. - -- [ ] `edr-rule-engine` consumes `edr.events.norm` -- [ ] YARA rules loaded from `/etc/edr/rules/` -- [ ] MITRE ATT&CK mapping lookup table -- [ ] Alerts published to `edr.alerts` and written to `edr_alerts_db` - ---- - -### Phase 6 — API Backend -**Goal**: REST API and WebSocket serving frontend. 
- -- [ ] All REST endpoints implemented and tested -- [ ] JWT auth middleware -- [ ] WebSocket server broadcasting alerts and health events -- [ ] Kafka consumer connected to `edr.alerts` + `edr.health` - ---- - -### Phase 7 — Frontend Dashboard -**Goal**: Operator can view nodes, logs, and alerts in a browser. - -- [ ] Auth flow (login page, JWT storage in memory) -- [ ] Node Map page with real-time status -- [ ] Alerts panel with filtering and acknowledge/dismiss -- [ ] Live Logs page over WebSocket -- [ ] Node isolation control with confirmation - ---- - -### Phase 8 — Node Isolation End-to-End -**Goal**: Operator can isolate a node from the dashboard and the agent applies IPTables rules. - -- [ ] `POST /nodes/:id/isolate` writes to `pending_commands` -- [ ] Fleet Server detects pending command and sends `IsolateCommand` over existing stream -- [ ] Agent `isolation` crate applies IPTables rules -- [ ] Status flows back via heartbeat → WebSocket → dashboard - ---- - -### Phase 9 — Hardening, CI/CD, and Observability -**Goal**: Production-ready pipelines, security scanning, structured logging. 
- -- [ ] Full GitHub Actions CI on all repos (lint, test, audit, build, Trivy scan) -- [ ] Structured JSON logging with correlation IDs across all services -- [ ] Prometheus metrics endpoints on all Rust services -- [ ] Grafana dashboards for event throughput, alert rate, node health -- [ ] All secrets in environment variables, no hardcoded credentials -- [ ] Runbooks written for each service in `edr-infra/docs/` - ---- - -*End of EDR Implementation Guide — AXIOM* diff --git a/ISSUES.md b/ISSUES.md deleted file mode 100644 index 6008ecd..0000000 --- a/ISSUES.md +++ /dev/null @@ -1,797 +0,0 @@ -# GitHub Issues — project-edr - ---- - -## Issue: Wire the ebpf-collector crate into the workspace build and establish the aya build pipeline -**Labels:** `ebpf`, `kernel`, `scaffolding`, `unsafe` -**Depends on:** none -**Blocks:** BPF map definitions issue; process-lifecycle probe issue; network probe issue; userspace loader issue - -### What this is -The `ebpf-collector` crate exists at `agent/crates/ebpf-collector/` with `aya = "0.13"` and `aya-build = "0.1"` declared in its `Cargo.toml`, but it is explicitly excluded from the workspace root `Cargo.toml` (`exclude = ["agent/crates/ebpf-collector"]`). Its `src/lib.rs` is a two-line comment stub. The crate has no `build.rs`, no BPF program directory, and produces nothing. This issue establishes the complete build foundation: workspace integration, cross-compilation target configuration, `build.rs` that compiles BPF C programs via `aya-build`, directory layout for BPF sources, and verification that the whole thing compiles against a BTF-enabled kernel (≥5.8). - -### What is currently blocking this -Nothing external. This is the root of the eBPF workstream. The `agent/.cargo/config.toml` already has `[target.bpfel-unknown-none] rustflags = ["-C", "link-arg=--btf"]`, confirming intent. The blocker is the missing workspace membership, missing `build.rs`, and the empty crate body. 
- -### What this is blocking -Every downstream eBPF issue. BPF map definitions, probe implementations, and the userspace loader all depend on the build pipeline this issue establishes. - -### Implementation tasks -- [ ] Remove `"agent/crates/ebpf-collector"` from the `exclude` list in the workspace root `Cargo.toml` and add it to `members`. Verify `cargo check -p ebpf-collector` compiles. -- [ ] Create `agent/crates/ebpf-collector/build.rs` that calls `aya_build::build()` or equivalent to compile BPF C programs from `bpf/` into the output directory. Follow the aya-build 0.1 API: `aya_build::build()?` respects `CARGO_CFG_TARGET_ARCH` and invokes `clang` with the correct BPF target flags. -- [ ] Create the `agent/crates/ebpf-collector/bpf/` directory. Add a placeholder `common.h` defining the shared event structs that all BPF programs will write into ring buffers. Start with `struct process_event { u32 pid; u32 ppid; char comm[16]; char cmdline[256]; u32 uid; }` and equivalents for file and network. These structs must be `#[repr(C)]` on the Rust side. -- [ ] In `agent/crates/ebpf-collector/src/lib.rs`, add the module skeleton: `pub mod loader; pub mod events; pub mod error;`. Add a `CollectorError` enum using `thiserror` covering `BpfLoadError`, `ProgramAttachError`, `RingBufError`. -- [ ] Add `thiserror` to `ebpf-collector/Cargo.toml` dependencies (use workspace version). -- [ ] Verify that `cargo check -p ebpf-collector --target x86_64-unknown-linux-gnu` passes (on a Linux dev box or CI). Document the required host toolchain: `clang ≥ 14`, `llvm-strip`, `bpf-linker` if needed. Add a `README.md` in `agent/crates/ebpf-collector/` listing these requirements. -- [ ] Add a `#[cfg(target_os = "linux")]` guard to the crate's public API surface so the workspace compiles on macOS during development without failing (aya does not support non-Linux targets at runtime). 
-- [ ] Write a unit test `test_error_variants_display` that verifies `CollectorError::BpfLoadError("test".into())` formats without panic. - -### Definition of done -- `cargo check -p ebpf-collector --target x86_64-unknown-linux-gnu` succeeds on a Linux host with clang ≥14 installed. -- `build.rs` is present and `aya_build::build()` is called; the `bpf/` directory exists and is referenced. -- `ebpf-collector` is a member of the workspace (appears in `cargo metadata --format-version 1 | jq '.workspace_members'`). -- `CollectorError` variants compile and display correctly. -- A macOS `cargo check` (without `--target bpf`) does not error on the crate. - -### Notes / constraints -- `aya` 0.13 (declared in the existing `Cargo.toml`) targets kernels ≥5.8 for ring buffer support. This is the minimum acceptable kernel version for this workstream. Document this in the `README.md`. -- BTF (`CONFIG_DEBUG_INFO_BTF=y`) is required for CO-RE. Verify with `bpftool btf list` on the target kernel. Without BTF, the loader will need to embed vmlinux BTF via `aya_tool::generate`. -- The `aya-build` crate requires `clang` on `PATH` at build time. This must be in the CI runner and documented. -- Do not attempt to use `cargo-bpf` — the crate has already committed to `aya`. - ---- - -## Issue: Define BPF ring buffer maps and shared event structs for process, file, and network telemetry -**Labels:** `ebpf`, `kernel`, `unsafe` -**Depends on:** Wire the ebpf-collector crate into the workspace build and establish the aya build pipeline -**Blocks:** process-lifecycle probe issue; network probe issue; userspace loader issue; event consumer issue - -### What this is -Before any BPF program can be written, the shared data structures that live in BPF maps must be defined and agreed upon between kernel space (C) and userspace (Rust). 
This issue defines the three `RingBuf` maps (one per event category), the C structs written by BPF programs, and the corresponding `#[repr(C)]` Rust structs that the userspace event consumer will deserialize from ring buffer memory. It also wires these structs into the `edr-sdk` types pipeline (specifically `agent.proto` already has `ProcessEvent`, `FileEvent`, `NetworkEvent` — these Rust structs must be compatible). - -### What is currently blocking this -The build pipeline issue above must land first. Once `bpf/common.h` exists as a placeholder, this issue replaces the placeholder with production-ready definitions. - -### What this is blocking -The process probe and network probe issues, both of which write into these maps. The userspace event consumer, which reads from them. - -### Implementation tasks -- [ ] In `agent/crates/ebpf-collector/bpf/common.h`, define `struct process_event { u32 pid; u32 ppid; char comm[TASK_COMM_LEN]; char cmdline[512]; u32 uid; u32 euid; char cwd[256]; }`. Use `TASK_COMM_LEN = 16` from `<linux/sched.h>`. -- [ ] In the same header, define `struct file_event { u32 pid; char comm[16]; char path[256]; u8 operation; s32 ret; }` where `operation` is an enum-equivalent `u8`: `0=open, 1=write, 2=delete, 3=rename`. -- [ ] Define `struct network_event { u32 pid; char comm[16]; u32 src_ip; u32 dst_ip; u16 src_port; u16 dst_port; u8 protocol; u8 direction; }` for IPv4. Add a `u8 is_ipv6` flag and `u8 src_ip6[16]` / `u8 dst_ip6[16]` fields for future IPv6 support (write zeroes if unused). -- [ ] Create `agent/crates/ebpf-collector/src/events.rs`. Define `#[repr(C)] pub struct ProcessEvent { pub pid: u32, pub ppid: u32, pub comm: [u8; 16], pub cmdline: [u8; 512], pub uid: u32, pub euid: u32, pub cwd: [u8; 256] }` — field layout must exactly match the C struct. Do the same for `FileEvent` and `NetworkEvent`. Derive nothing that requires heap allocation (no `String` here — these are read directly from kernel ring buffer memory).
-- [ ] Implement `TryFrom<&[u8]>` for each struct that reads from a `&[u8]` slice (from the ring buffer). Use `zerocopy` or manual `ptr::read_unaligned` under `unsafe`. Add a bounds check: if the slice is shorter than `mem::size_of::<Self>()`, return `CollectorError::MalformedEvent`. -- [ ] Add a `fn to_sdk_process_event(&self) -> edr_sdk::proto::agent::ProcessEvent` converter on `ProcessEvent` that maps the `[u8; N]` comm/cmdline/cwd arrays to `String` using `from_utf8_lossy`. Do the same for `FileEvent` → `edr_sdk::proto::agent::FileEvent` and `NetworkEvent` → `edr_sdk::proto::agent::NetworkEvent`. (This requires `edr-sdk` to expose generated proto types — verify `sdk/src/lib.rs` re-exports them.) -- [ ] Define the three ring buffer map names as constants: `pub const PROCESS_EVENTS: &str = "PROCESS_EVENTS"`, `FILE_EVENTS`, `NETWORK_EVENTS`. These strings must match the map section names in the BPF C programs. -- [ ] Write unit tests: `test_process_event_from_bytes_exact_size`, `test_process_event_from_bytes_too_short`, `test_file_event_operation_roundtrip`, `test_network_event_ipv4_conversion_to_sdk_type`. Run with `cargo test -p ebpf-collector`. - -### Definition of done -- `cargo test -p ebpf-collector` passes all unit tests above. -- The three `#[repr(C)]` structs are defined, size-checked in tests (`assert_eq!(mem::size_of::<ProcessEvent>(), <expected_size>)`). -- `TryFrom<&[u8]>` is implemented and tested for malformed input. -- Conversion to `edr-sdk` proto types is implemented and compiles. - -### Notes / constraints -- Do not use `serde` or `bincode` for deserializing ring buffer events — the kernel writes raw C structs. Only `zerocopy` or manual pointer casts (with alignment guarantees checked) are appropriate. -- Padding bytes in C structs will appear in the ring buffer. Ensure the Rust struct fields are laid out in the same order with the same alignment as the C struct. Use `static_assertions::assert_eq_size!` if desired. -- `TASK_COMM_LEN` is 16 bytes on all supported Linux kernels.
`cmdline` is bounded to 512 to avoid stack overflow in BPF programs (BPF stack limit is 512 bytes total per program). - ---- - -## Issue: Implement the process-lifecycle BPF program (execve tracepoint) and wire it to the ring buffer -**Labels:** `ebpf`, `kernel`, `unsafe`, `tracing` -**Depends on:** BPF map definitions issue -**Blocks:** Userspace loader issue; event consumer issue; eBPF integration test issue - -### What this is -This issue writes the first production BPF program: `process_probe.bpf.c`, attached to the `sys_enter_execve` tracepoint. It populates a `RingBuf` map with `struct process_event` entries on every `execve` syscall. This is the canonical first probe for any EDR because process execution is the root of nearly every attack chain. Getting this right — correct map access patterns, correct argument extraction, BPF verifier compliance — establishes the pattern for all subsequent probes. - -### What is currently blocking this -The BPF map definitions issue (structs must be defined in `common.h` before programs can use them). - -### What this is blocking -The userspace loader (which loads and attaches this program). The event consumer (which reads from the ring buffer this program writes into). - -### Implementation tasks -- [ ] Create `agent/crates/ebpf-collector/bpf/process_probe.bpf.c`. Include `<vmlinux.h>`, `<bpf/bpf_helpers.h>`, `<bpf/bpf_core_read.h>`, and `"common.h"`. -- [ ] Declare the ring buffer map: `struct { __uint(type, BPF_MAP_TYPE_RINGBUF); __uint(max_entries, 256 * 1024); } PROCESS_EVENTS SEC(".maps");`. 256 KiB = 64 pages, safe default for high-frequency execve. -- [ ] Implement `SEC("tracepoint/syscalls/sys_enter_execve") int trace_execve(struct trace_event_raw_sys_enter *ctx)`.
Use `bpf_ringbuf_reserve` to allocate a `struct process_event` slot, fill `pid` from `bpf_get_current_pid_tgid() >> 32`, `ppid` from walking `task_struct` via `bpf_get_current_task()` + `BPF_CORE_READ`, `uid/euid` from `bpf_get_current_uid_gid()`, `comm` from `bpf_get_current_comm()`, and `cmdline` by reading `ctx->args[0]` (argv[0]) via `bpf_probe_read_user_str`. Submit with `bpf_ringbuf_submit`. -- [ ] Handle the BPF verifier constraint: `bpf_probe_read_user_str` on `cmdline` must use a bounded length (≤ 512). Add a null terminator at `cmdline[511]` defensively. -- [ ] For `ppid`: use `BPF_CORE_READ(task, real_parent, tgid)` — this requires BTF CO-RE. Add a preprocessor guard `#ifdef __TARGET_ARCH_x86` for architecture portability if needed. -- [ ] Add `char _license[] SEC("license") = "GPL";` — required for helper access. -- [ ] Verify the program compiles with `clang -O2 -g -target bpf -D__TARGET_ARCH_x86_64 -c process_probe.bpf.c -o /dev/null` as a manual check. The `build.rs` will handle this at cargo build time. -- [ ] Write a unit test in `src/events.rs` (already tracking this crate) that constructs a fake `ProcessEvent` byte buffer mimicking what the kernel would write, then parses it and asserts field values. - -### Definition of done -- `process_probe.bpf.c` compiles without verifier errors when loaded on a Linux 5.15+ kernel with BTF enabled. -- `build.rs` picks up the new file and produces a compiled BPF object in `target/bpf/`. -- The `struct process_event` layout in C matches the `ProcessEvent` Rust struct (verified via size assertions in unit tests). -- `cargo build -p ebpf-collector` succeeds on a Linux host with clang ≥14. - -### Notes / constraints -- `bpf_probe_read_user_str` is the correct helper for reading userspace memory (argv). Do not use `bpf_probe_read_kernel_str` for userspace pointers — it will silently read zeroes on modern kernels. -- Tracepoint `sys_enter_execve` provides the raw syscall arguments. 
The `ctx->args[0]` is a pointer to the filename string, `ctx->args[1]` is argv (pointer-to-pointer). Reading individual argv elements requires multiple `bpf_probe_read_user` calls inside a bounded loop (BPF verifier requires loops to have provable termination). For this issue, reading only argv[0] (filename) into `cmdline` is acceptable. Full cmdline reconstruction is a follow-on. -- CO-RE with `BPF_CORE_READ` requires the kernel to expose BTF. If `CONFIG_DEBUG_INFO_BTF` is not set, the loader must supply vmlinux BTF. This constraint is documented in the build pipeline issue. - ---- - -## Issue: Implement the network-event BPF program (connect/bind tracepoints) and wire it to the ring buffer -**Labels:** `ebpf`, `kernel`, `networking`, `unsafe` -**Depends on:** BPF map definitions issue -**Blocks:** Userspace loader issue; event consumer issue; eBPF integration test issue - -### What this is -This issue writes `network_probe.bpf.c`, attached to `tracepoint/syscalls/sys_enter_connect` and `tracepoint/syscalls/sys_enter_bind`. On each call it extracts the 5-tuple (src IP, dst IP, src port, dst port, protocol) and the process context (PID, comm), then writes a `struct network_event` into the `NETWORK_EVENTS` ring buffer. This probe feeds the connection isolation workstream: the isolation table (Workstream B) needs to know which connections the EDR process itself makes in order to register them as allowed. - -### What is currently blocking this -The BPF map definitions issue (the `struct network_event` definition must be in `common.h`). - -### What this is blocking -The userspace loader and event consumer. The connection isolation table (Workstream B) — specifically, the "population" issue in that workstream depends on network events being available to identify the EDR's own connections. - -### Implementation tasks -- [ ] Create `agent/crates/ebpf-collector/bpf/network_probe.bpf.c`. 
Declare `NETWORK_EVENTS` ring buffer map (same pattern as `PROCESS_EVENTS`, 256 KiB initial). -- [ ] Implement `SEC("tracepoint/syscalls/sys_enter_connect") int trace_connect(struct trace_event_raw_sys_enter *ctx)`. Extract the `sockaddr` pointer from `ctx->args[1]`. Use `bpf_probe_read_user` to read the `struct sockaddr` header. Branch on `sa_family`: if `AF_INET`, read `struct sockaddr_in` and extract `sin_addr.s_addr` and `sin_port`; if `AF_INET6`, set the `is_ipv6` flag and read `struct sockaddr_in6`. Write into ring buffer with `direction = 1` (outbound). Ignore `AF_UNIX` and other families (submit nothing). -- [ ] Implement `SEC("tracepoint/syscalls/sys_enter_bind") int trace_bind(...)` with the same extraction logic, setting `direction = 0` (inbound). -- [ ] Fill `pid` and `comm` in both handlers using the same helpers as the process probe (`bpf_get_current_pid_tgid`, `bpf_get_current_comm`). -- [ ] Set `protocol = IPPROTO_TCP` by default. Distinguishing TCP vs UDP at the `connect`/`bind` tracepoint requires reading the socket struct via `bpf_get_current_task` and `BPF_CORE_READ(task, files, ...)` — this is complex and error-prone. For this issue, mark protocol as `0xFF` (unknown) and resolve via a kretprobe on `sock_recvmsg`/`sock_sendmsg` in a follow-on issue if needed. -- [ ] Port numbers from `sockaddr` are in network byte order. Convert to host byte order using `bpf_ntohs()` before writing into the event struct. -- [ ] Add `char _license[] SEC("license") = "GPL";`. -- [ ] Write unit tests for `NetworkEvent::try_from(&[u8])` covering an IPv4 outbound event and an IPv6 inbound event. - -### Definition of done -- `network_probe.bpf.c` compiles without verifier errors on a Linux 5.15+ kernel with BTF. -- Both `trace_connect` and `trace_bind` are present and functional. -- IPv4 and IPv6 addresses are extracted correctly (verified in unit tests on the Rust struct parsing side). 
-- `cargo build -p ebpf-collector` succeeds with both `process_probe.bpf.c` and `network_probe.bpf.c` in the `bpf/` directory. - -### Notes / constraints -- `bpf_probe_read_user` on the `sockaddr *` pointer can fail if the pointer is invalid (e.g., the userspace process passed a bogus address). Always check the return value. On error, discard the event rather than submitting garbage. -- The connect tracepoint fires before the kernel validates the address. The connection may fail; we still want to record the attempt. -- IPv6 detection via `sa_family == AF_INET6` is the correct check. The `struct network_event` has fields for both. Only one set should be populated per event. -- Port 0 (ephemeral assignment) may appear in `bind` calls. This is a valid event — do not filter it. - ---- - -## Issue: Implement the userspace BPF loader: load compiled objects and attach programs to kernel hooks -**Labels:** `ebpf`, `kernel`, `unsafe`, `async` -**Depends on:** Process-lifecycle probe issue; network-event probe issue -**Blocks:** Event consumer issue; eBPF error handling issue; eBPF integration test issue - -### What this is -This issue implements `agent/crates/ebpf-collector/src/loader.rs`: the userspace Rust code that loads the compiled BPF object files using `aya`, attaches each program to its kernel hook point, and returns handles to the loaded maps so the event consumer can read from them. This is the bridge between the compiled `.bpf.o` artifacts and the running kernel. - -### What is currently blocking this -Both BPF programs (process probe and network probe) must be compiled and their ring buffer map definitions must be finalized before the loader can reference them by name. - -### What this is blocking -The event consumer (which receives map handles from the loader). The error handling issue (which wraps loader failures). The integration test (which calls the loader in a privileged context). 
- -### Implementation tasks -- [ ] Create `agent/crates/ebpf-collector/src/loader.rs`. Define `pub struct EbpfLoader` that owns an `aya::Ebpf` instance (the loaded BPF object) and exposes the ring buffer map handles. -- [ ] Implement `EbpfLoader::load() -> Result<Self, CollectorError>`. Use `aya::Ebpf::load(include_bytes_aligned!(concat!(env!("OUT_DIR"), "/bpf/process_probe.bpf.o")))` to embed the compiled object at link time. Do the same for `network_probe.bpf.o`. These two programs can be in separate `Ebpf` instances or combined if the build pipeline merges them. -- [ ] Attach `process_probe` to its tracepoint: `let prog: &mut TracePoint = bpf.program_mut("trace_execve").unwrap().try_into()?; prog.load()?; prog.attach("syscalls", "sys_enter_execve")?;` — handle `ProgramError` by mapping to `CollectorError::ProgramAttachError`. -- [ ] Attach `trace_connect` and `trace_bind` from `network_probe.bpf.o` analogously. -- [ ] Implement `EbpfLoader::process_ring_buf(&mut self) -> &mut RingBuf<&mut MapData>` and `network_ring_buf` accessors that retrieve the `PROCESS_EVENTS` and `NETWORK_EVENTS` maps from the loaded `Ebpf` instance using `aya::maps::RingBuf::try_from(bpf.map_mut("PROCESS_EVENTS")?)`. -- [ ] Implement `EbpfLoader::detach(self) -> Result<(), CollectorError>` that drops all program handles, causing kernel detachment. Log each detach operation. -- [ ] Add `pub fn is_btf_available() -> bool` that reads `/sys/kernel/btf/vmlinux` existence as a pre-flight check. Return `false` if the file doesn't exist. The caller (error handling issue) uses this to provide a useful error message. -- [ ] Write a compile-time test `test_loader_struct_is_send` that asserts `EbpfLoader: !Send` (it holds a raw `MapData` reference that is not `Send`). This documents the threading constraint. The loader must live on a single task; use `tokio::task::LocalSet` in the consumer.
- -### Definition of done -- `EbpfLoader::load()` compiles and, when run as root on a Linux 5.15+ kernel, loads both BPF programs and attaches them to their tracepoints without error. -- `process_ring_buf()` and `network_ring_buf()` return valid map handles. -- `EbpfLoader::detach()` cleans up without leaving dangling programs (verify with `bpftool prog list` before and after). -- `is_btf_available()` returns `true` on a BTF-enabled kernel and `false` on a kernel without `/sys/kernel/btf/vmlinux`. - -### Notes / constraints -- `aya::Ebpf::load` requires the process to have `CAP_BPF` (kernel ≥5.8) or `CAP_SYS_ADMIN` (older). The agent binary must be deployed with the appropriate capability set. -- `include_bytes_aligned!` is provided by `aya` and must be used instead of `include_bytes!` to satisfy BPF object alignment requirements. -- If the two BPF programs are in separate object files (compiled separately by `build.rs`), they require two separate `aya::Ebpf` instances. The `EbpfLoader` struct must own both. Do not merge them into one object to avoid BPF map naming conflicts. -- `RingBuf` map access is not `Send`. The entire loader and consumer must run on a single OS thread using `tokio::task::LocalSet` or `std::thread::spawn`. 
- ---- - -## Issue: Implement the ring buffer event consumer: read, deserialize, and forward events into the agent pipeline -**Labels:** `ebpf`, `kernel`, `async`, `tracing` -**Depends on:** Userspace loader issue -**Blocks:** eBPF pipeline integration issue; eBPF integration test issue - -### What this is -This issue implements `agent/crates/ebpf-collector/src/events.rs` consumer logic: a polling loop that reads raw bytes from the `PROCESS_EVENTS` and `NETWORK_EVENTS` ring buffers (via `aya::maps::ring_buf::RingBuf`), deserializes them into the typed Rust structs defined in the map definitions issue, converts them to `edr-sdk` proto types (`AgentEvent` with appropriate `event_type`), and forwards them to the `agent-core` orchestrator via an `mpsc::Sender<AgentEvent>`. This is the final hop from kernel telemetry to the agent's event pipeline. - -### What is currently blocking this -The userspace loader must land first (this issue depends on the ring buffer map handles it returns). - -### What this is blocking -The pipeline integration issue (wiring this into `agent-core/orchestrator.rs`). The integration test (which validates end-to-end event emission). - -### Implementation tasks -- [ ] In `agent/crates/ebpf-collector/src/events.rs`, implement `pub struct EventConsumer` that holds an `EbpfLoader` and a `tokio::sync::mpsc::Sender<AgentEvent>`. -- [ ] Implement `EventConsumer::run(self) -> Result<(), CollectorError>` as a blocking loop (must run in `tokio::task::spawn_blocking` or a `LocalSet`-driven loop because `RingBuf::next()` is synchronous). The loop calls `process_ring_buf.next()` and `network_ring_buf.next()` in an interleaved fashion using a short poll interval (`tokio::time::sleep(Duration::from_millis(1))`).
For each returned `Item`, call `ProcessEvent::try_from(item.as_ref())` or `NetworkEvent::try_from(item.as_ref())`, convert to proto bytes via `encode_to_vec()`, then send as an `AgentEvent` with `event_type = 1` (process) or `event_type = 3` (network), `timestamp_ns = SystemTime::now()`, and a `sequence_id = Uuid::new_v4().to_string()`. -- [ ] Handle deserialization errors by logging `tracing::warn!` with the raw byte length and continuing the loop — a malformed event must not crash the consumer. -- [ ] Handle `mpsc::Sender::send` returning `Err` (receiver dropped) by returning `CollectorError::PipelineClosed` — this signals the orchestrator has shut down. -- [ ] Add a shutdown signal: `EventConsumer::run_until_shutdown(self, shutdown: tokio::sync::CancellationToken)` that breaks the poll loop when the token is cancelled. -- [ ] Expose `pub fn start(loader: EbpfLoader, node_id: String) -> (EventConsumer, mpsc::Receiver)` as the public API. The `node_id` is set on every `AgentEvent.node_id` field. -- [ ] Write unit tests: `test_consumer_forwards_process_event` (mock the ring buffer with a pre-built byte slice, verify the sender receives a correctly-typed `AgentEvent`), `test_consumer_handles_malformed_bytes_without_crash`. - -### Definition of done -- `EventConsumer::run_until_shutdown` runs without error on a real kernel and produces `AgentEvent` structs on the returned channel when processes exec or make network connections. -- Malformed ring buffer data does not panic or crash the consumer loop. -- Shutdown via `CancellationToken` terminates the loop cleanly. -- Unit tests pass with `cargo test -p ebpf-collector`. - -### Notes / constraints -- `RingBuf::next()` does not block — it returns `None` immediately if no events are pending. The polling approach with a 1ms sleep trades latency for CPU. A production follow-on should use `epoll`/`tokio::io::unix::AsyncFd` on the ring buffer's file descriptor. This issue does not need to solve that. 
-- The `node_id` stamped on events comes from the enrollment flow in `fleet-client`. The consumer must receive it at construction time (not read from config) because enrollment is async and may not have completed when the consumer starts. -- Events produced by the BPF programs include the EDR agent's own process events (the agent will observe itself execing). This is not filtered here — filtering is downstream in the isolation table and rule engine. - ---- - -## Issue: Wire the ebpf-collector event consumer into agent-core/orchestrator.rs alongside the existing osquery pipeline -**Labels:** `ebpf`, `async`, `tracing` -**Depends on:** Event consumer issue -**Blocks:** eBPF integration test issue - -### What this is -The `agent-core/orchestrator.rs` currently runs a complete osquery pipeline: `OsqueryCollector::start()` → `mpsc::Receiver` → `EventBuffer::push()`. The eBPF pipeline must be integrated in parallel: `EventConsumer::start()` → `mpsc::Receiver` → `EventBuffer::push()`. This issue adds the eBPF consumer as a concurrent task in the main orchestrator loop, guarded by a compile-time `#[cfg(target_os = "linux")]` block so the agent still compiles on macOS. - -### What is currently blocking this -The event consumer issue must be complete (the `EventConsumer::start()` API must exist and return an `mpsc::Receiver`). - -### What this is blocking -The eBPF integration test (which validates the full path from kernel probe to buffer). - -### Implementation tasks -- [ ] In `agent/crates/agent-core/src/orchestrator.rs`, add a `#[cfg(target_os = "linux")]` block after the `OsqueryCollector` startup that calls `ebpf_collector::EventConsumer::start(EbpfLoader::load()?, node_id.clone())`. This yields a `(EventConsumer, Receiver)` tuple. -- [ ] Add `ebpf-collector = { path = "../../crates/ebpf-collector" }` to `agent/crates/agent-core/Cargo.toml` under a `[target.'cfg(target_os = "linux")'.dependencies]` section. 
-- [ ] Spawn the `EventConsumer::run_until_shutdown(consumer, cancellation_token.clone())` call inside a `tokio::task::spawn_blocking` or a dedicated `LocalSet`-driven thread (see loader notes on non-`Send` constraint). -- [ ] Add the eBPF `AgentEvent` receiver to the main `tokio::select!` loop alongside the existing `results_rx.recv()` arm: when an `AgentEvent` arrives from the eBPF channel, encode it to bytes via `AgentEvent::encode_to_vec()` and call `buffer.push(&bytes)`. -- [ ] Handle the case where `EbpfLoader::load()` fails (e.g., insufficient capabilities, kernel too old, BTF not available): log `tracing::warn!("eBPF collector failed to load: {}. Continuing in OSQuery-only mode.", e)` and skip the eBPF pipeline. Do not abort agent startup. -- [ ] Thread the `CancellationToken` through the shutdown path so the eBPF consumer is stopped before the process exits. -- [ ] Update the existing `test_orchestrator_startup` integration test (in `tests/TEST_PLAN.md` it is listed as `agent-core` integration test) to assert that a Linux agent starts with the eBPF consumer active, and a degraded-mode test that asserts startup succeeds even when the eBPF loader returns an error. - -### Definition of done -- `cargo build -p agent-bin --target x86_64-unknown-linux-gnu` succeeds with eBPF integration enabled. -- On a Linux host with sufficient capabilities: the agent logs `"eBPF collector started"` on startup. -- On a host without sufficient capabilities (or on macOS during development): the agent logs the degraded-mode warning and continues with osquery only. -- The shutdown path stops the eBPF consumer cleanly (no log errors on exit). - -### Notes / constraints -- The `EventBuffer` (SQLite via `rusqlite`) is `!Send`. The eBPF consumer receiver's `AgentEvent`s must be forwarded to the buffer on the same thread that owns it (the main orchestrator task). The `select!` loop in orchestrator already handles this correctly for osquery — extend it the same way. 
-- `EbpfLoader` is also `!Send` and must live on the same thread as the consumer. If using `spawn_blocking`, pass ownership of the loader into the closure before spawning. - ---- - -## Issue: eBPF error handling and graceful degradation when probe attachment fails -**Labels:** `ebpf`, `error-handling`, `kernel` -**Depends on:** Wire eBPF into orchestrator issue -**Blocks:** eBPF integration test issue - -### What this is -This issue hardens the eBPF subsystem against runtime failures: partial probe attachment failures (process probe loads but network probe fails), ring buffer exhaustion, and consumer task panic. The current `CollectorError` enum from the build pipeline issue has the variants but no structured handling. This issue adds the handling logic and the graceful fallback strategy. - -### What is currently blocking this -The orchestrator integration must be in place so there is a running system to harden. - -### What this is blocking -The eBPF integration test (which validates the degraded-mode behavior). - -### Implementation tasks -- [ ] In `loader.rs`, change `EbpfLoader::load()` to return a `LoadResult { loader: EbpfLoader, warnings: Vec<String> }` struct instead of bare `Result<EbpfLoader, CollectorError>`. If a probe fails to attach (e.g., the tracepoint doesn't exist on this kernel), push the error into `warnings` and continue loading remaining probes. A partial load is better than no telemetry. -- [ ] Implement `fn attach_with_warn(bpf: &mut Ebpf, program_name: &str, category: &str, tracepoint: &str, warnings: &mut Vec<String>) -> bool` that attempts attachment and on failure pushes a human-readable message into warnings and returns `false`. -- [ ] In `orchestrator.rs`, log each `LoadResult.warning` at `tracing::warn!` level with a prefix of `"[ebpf][degraded]"`.
-- [ ] Handle ring buffer exhaustion: if `RingBuf::next()` returns data but deserialization consistently fails for more than 100 consecutive events, emit a `tracing::error!` and pause polling for 5 seconds before resuming (back-pressure heuristic). -- [ ] Wrap the entire `EventConsumer::run_until_shutdown` call in `spawn_blocking` with a `catch_unwind` equivalent (use `std::panic::catch_unwind` inside the blocking closure). If the consumer panics, log the panic message and do not crash the agent — restart the consumer loop after a 10-second delay. -- [ ] Add a metric counter (or at minimum a `tracing::info!` log) for `events_received_from_ebpf` and `events_dropped_from_ebpf` that increments in the consumer loop. This feeds future observability. -- [ ] Write a unit test `test_partial_load_continues_on_attachment_failure` that mocks the aya attach call returning an error and asserts `warnings` is non-empty while `loader.process_ring_buf()` still works. - -### Definition of done -- When the process probe loads but the network probe fails: the agent starts, logs a degraded warning, and continues streaming process events. -- When the entire BPF load fails: the agent starts in osquery-only mode with no errors beyond the degraded warning. -- A consumer panic does not crash the agent process. -- `cargo test -p ebpf-collector` passes the partial-load test. - -### Notes / constraints -- Do not retry failed probe attachment on a loop — the failure is usually structural (capability missing, kernel too old). Log once and accept degraded mode. -- Ring buffer exhaustion (events dropped by the kernel) appears as gaps in sequence IDs. The kernel tracks drops internally; aya exposes `RingBuf::dropped_events()` if available. Check aya 0.13 API for this method. 
- ---- - -## Issue: eBPF integration test: load probes in a controlled environment and verify end-to-end event emission -**Labels:** `ebpf`, `testing`, `kernel` -**Depends on:** eBPF error handling issue; Wire eBPF into orchestrator issue -**Blocks:** nothing (leaf node in workstream) - -### What this is -This issue implements the integration tests for the entire eBPF pipeline, covering: successful probe load and attachment, event emission verification (exec a known subprocess → observe `ProcessEvent` on the ring buffer), network event capture (make a loopback TCP connection → observe `NetworkEvent`), and degraded-mode startup. These tests require a real Linux kernel and run in CI on `ubuntu-latest` (GitHub Actions runner) as a job that includes `sudo` access for `CAP_BPF`. - -### What is currently blocking this -All prior eBPF issues must be complete. The test exercises the full stack. - -### What this is blocking -Nothing — this is the leaf node of the eBPF workstream. - -### Implementation tasks -- [ ] Create `agent/crates/ebpf-collector/tests/integration_test.rs`. Gate the entire file with `#[cfg(target_os = "linux")]` and a custom feature flag `ebpf_integration_tests` (controlled by `CARGO_FEATURE_EBPF_INTEGRATION_TESTS` env var) to prevent these from running in normal `cargo test` invocations. -- [ ] Implement `test_loader_loads_and_attaches`: calls `EbpfLoader::load()`, asserts `Ok`, calls `is_btf_available()` and skips if false, asserts ring buffer handles are accessible. -- [ ] Implement `test_process_probe_captures_execve`: after loading, `std::process::Command::new("/bin/true").spawn().wait()`, then poll the ring buffer for up to 500ms expecting at least one `ProcessEvent` with `comm == b"true\0...\0"` (null-padded). Assert `pid != 0` and `ppid == current PID`. -- [ ] Implement `test_network_probe_captures_connect`: spawn a `tokio::net::TcpListener` on `127.0.0.1:0`, get the bound port, then `TcpStream::connect` to it. 
Poll the network ring buffer for up to 500ms expecting a `NetworkEvent` with `dst_port == bound_port` and `direction == 1` (outbound). -- [ ] Implement `test_probe_detach_removes_program`: load probes, call `detach()`, then verify with `bpftool prog list` via `std::process::Command` that the program name is no longer listed. -- [ ] Implement `test_degraded_mode_no_cap_bpf`: run as unprivileged user (or drop caps in the test), call `EbpfLoader::load()`, assert `Err(CollectorError::BpfLoadError(_))`, and assert the agent-core degraded-mode path does not panic. -- [ ] Add a `[[test]]` entry in `ebpf-collector/Cargo.toml` for the integration test file with `required-features = ["ebpf_integration_tests"]`. -- [ ] Add a `.github/workflows/ebpf-integration.yml` workflow (or note in the existing CI template) that runs `cargo test -p ebpf-collector --features ebpf_integration_tests` on `ubuntu-latest` with `sudo` or using the `BPF_CAP` action. - -### Definition of done -- All four integration tests pass on a GitHub Actions `ubuntu-latest` runner (kernel 6.x, BTF enabled). -- `test_degraded_mode_no_cap_bpf` passes without root — it should return a `BpfLoadError`, not panic. -- Tests are gated by feature flag and do not run in a normal `cargo test --workspace` invocation. -- CI job is defined and runs on PRs targeting `main`. - -### Notes / constraints -- GitHub Actions `ubuntu-latest` (as of 2025) runs kernel 6.5+, which has BTF enabled. The tests can rely on BTF being present. -- `CAP_BPF` requires either `sudo` in CI or setting up a test runner with the capability. Using `sudo -E cargo test` in the CI step is the simplest approach. -- The `test_probe_detach_removes_program` test requires `bpftool` to be installed on the runner. Add `sudo apt-get install -y bpftool` as a CI step. -- Flakiness risk: ring buffer polling has a 500ms window. If the CI runner is extremely loaded, events may arrive after the timeout. 
Use `tokio::time::timeout` and mark the test as `#[ignore]` with a note to run on dedicated hardware if it becomes flaky in practice. - ---- - -## Issue: Define the connection isolation table data structure and its key/value types -**Labels:** `networking`, `tables`, `firewall` -**Depends on:** none -**Blocks:** Isolation table population issue; enforcement issue; concurrency issue - -### What this is -The `agent/crates/isolation/src/lib.rs` is a one-line comment stub. The entire isolation crate needs to be built from scratch. This issue defines the foundational data structure: what the allow-list table is, what its key is, and what it contains. Based on the implementation guide, the isolation model is iptables-based: the `IsolateCommand` from the fleet server triggers iptables rules that drop all traffic except to the fleet server IP. The "table" in this workstream is the in-memory Rust data structure that tracks which connections are registered as allowed (belonging to the EDR process) so that the iptables rule-generation logic knows which addresses to preserve when isolation is applied. - -### What is currently blocking this -Nothing — this is the root issue of Workstream B. - -### What this is blocking -Everything else in Workstream B. The population and enforcement issues depend on the table types defined here. - -### Implementation tasks -- [ ] In `agent/crates/isolation/src/lib.rs`, define the module skeleton: `pub mod table; pub mod iptables; pub mod error;`. -- [ ] Create `agent/crates/isolation/src/error.rs`. Define `IsolationError` using `thiserror`: variants `IptablesExecFailed { exit_code: i32, stderr: String }`, `IptablesNotFound`, `InvalidAddress(String)`, `TableLockTimeout`. Add to `isolation/Cargo.toml` dependencies: `thiserror = { workspace = true }`. -- [ ] Create `agent/crates/isolation/src/table.rs`. 
Define `pub struct ConnectionKey { pub remote_ip: std::net::IpAddr, pub remote_port: u16, pub protocol: Protocol }` where `Protocol` is `pub enum Protocol { Tcp, Udp }`. The key is the remote endpoint — the "allowed" destination from the EDR's perspective. -- [ ] Define `pub struct ConnectionEntry { pub key: ConnectionKey, pub registered_at: std::time::Instant, pub description: &'static str }` — the value in the table. `description` is a static label like `"fleet-server"` or `"osquery-socket"`. -- [ ] Define `pub struct IsolationTable` as a struct wrapping `Vec<ConnectionEntry>` (not a HashMap — the table is expected to have ≤10 entries: the fleet server address, optionally a few internal addresses). Linear scan is acceptable and avoids the complexity of a hash map with a non-trivial key type. -- [ ] Implement `IsolationTable::new() -> Self`, `register(&mut self, key: ConnectionKey, description: &'static str)`, `deregister(&mut self, key: &ConnectionKey)`, `allowed_remotes(&self) -> Vec<&ConnectionKey>`. -- [ ] Add `serde` derives to `ConnectionKey` and `Protocol` for persistence (see the persistence issue below). Use `#[serde(rename_all = "snake_case")]` on `Protocol`. -- [ ] Write unit tests: `test_register_and_lookup`, `test_deregister_removes_entry`, `test_allowed_remotes_returns_all`, `test_empty_table_allowed_remotes_is_empty`. Place in a `#[cfg(test)]` module inside `table.rs`. - -### Definition of done -- `cargo test -p isolation` passes all four unit tests. -- `IsolationTable`, `ConnectionKey`, `Protocol`, `ConnectionEntry`, and `IsolationError` are all defined and exported from `isolation::table` and `isolation::error`. -- No external crates beyond `thiserror` and `serde` are added (no async, no lock primitives — this issue is synchronous data structures only). - -### Notes / constraints -- The isolation table is fundamentally different from eBPF maps. It is a userspace Rust data structure, not a kernel construct.
The name "table" in the workstream name refers to this in-memory allow-list, not to a BPF map. -- `std::net::IpAddr` handles both IPv4 and IPv6. Use it instead of a custom type. -- `description` is `&'static str` not `String` to keep the entry allocation-free. All call sites use string literals. - ---- - -## Issue: Implement isolation table population: register EDR-owned connections as allowed -**Labels:** `networking`, `tables`, `firewall`, `ipc` -**Depends on:** Isolation table data structure issue -**Blocks:** Enforcement issue; concurrency issue - -### What this is -The isolation table needs to be pre-populated with the connections the EDR process itself makes — primarily the gRPC connection to the fleet server. When isolation is applied, these connections must remain reachable (isolation means "block everything except EDR comms"). This issue implements the population path: the `IsolationTable::register()` calls that happen at agent startup and whenever a new EDR connection is established. - -### What is currently blocking this -The table data structure issue must be complete. - -### What this is blocking -Enforcement: the iptables rule generator reads from the populated table. Concurrency: the locking wrapper is designed around the usage patterns established here. - -### Implementation tasks -- [ ] In `agent/crates/isolation/src/table.rs`, implement `IsolationTable::register_fleet_server(endpoint: &str) -> Result<(), IsolationError>` that parses the gRPC endpoint URL (e.g., `"http://fleet.internal:50051"`) into a `ConnectionKey` (resolve hostname to IP using `std::net::ToSocketAddrs`, extract port, set `protocol = Tcp`). Handle DNS resolution failure with `IsolationError::InvalidAddress`. -- [ ] Add `IsolationTable::register_osquery_socket(_path: &Path)` — a no-op stub that documents the intent (osquery uses a Unix domain socket, which is not filtered by iptables IP rules; this entry is for documentation and future netfilter-socket-level filtering). 
-- [ ] In `agent/crates/agent-core/src/orchestrator.rs`, after the `FleetClient::enroll()` succeeds and the fleet server IP is known, call `table.register_fleet_server(&config.fleet.endpoint)`. The `IsolationTable` instance should be created at the start of `orchestrator::run()` and passed to any component that needs it. -- [ ] Wire the `IsolationTable` into the `FleetClient`'s reconnect path: on each successful reconnect, re-register the fleet server (its IP may have changed via DNS). Implement `update_fleet_server(table: &mut IsolationTable, endpoint: &str)` that calls `deregister` on the old key then `register` on the new one. -- [ ] Add `agent/crates/isolation` to the `agent-core/Cargo.toml` dependencies (it is not listed yet). -- [ ] Write unit tests: `test_register_fleet_server_parses_http_endpoint`, `test_register_fleet_server_parses_https_endpoint`, `test_register_fleet_server_invalid_url_returns_error`, `test_update_fleet_server_replaces_old_entry`. - -### Definition of done -- `IsolationTable::register_fleet_server("http://fleet.internal:50051")` resolves and registers the entry correctly (tested with a real DNS lookup in unit tests using `127.0.0.1` as a known-resolving address). -- The orchestrator creates an `IsolationTable` and registers the fleet server on successful enrollment. -- `cargo test -p isolation` and `cargo test -p agent-core` pass. -- The `isolation` crate is a dependency of `agent-core`. - -### Notes / constraints -- Hostname resolution in `register_fleet_server` is synchronous (`ToSocketAddrs` is blocking). Since orchestrator startup is async (tokio), call this in `tokio::task::spawn_blocking` or use `tokio::net::lookup_host` instead. Prefer `tokio::net::lookup_host` to stay async. -- The table at this point has no locking. The population issue establishes the write path; the concurrency issue (next) adds locking. For now, `IsolationTable` is assumed to be owned and mutated by a single task. 
-- Do not attempt to register non-IP connections (unix sockets, abstract namespace). Document the scope limitation in a comment. - ---- - -## Issue: Add concurrency wrapper around IsolationTable for hot-path read and control-path write -**Labels:** `networking`, `tables`, `firewall`, `async` -**Depends on:** Isolation table population issue -**Blocks:** Enforcement issue; integration test - -### What this is -The `IsolationTable` will be read from a hot path (the enforcement point checks it on every iptables rule generation, which happens at isolation time) and written from a control path (enrollment, reconnect, and the `IsolateCommand` handler). This issue wraps `IsolationTable` in the appropriate synchronization primitive and defines the shared handle type used by all consumers. - -### What is currently blocking this -The population issue must exist to know the write patterns before choosing a locking strategy. - -### What this is blocking -The enforcement issue, which takes the shared handle and reads from it. The integration test. - -### Implementation tasks -- [ ] Decide on the concurrency primitive: the table is written at most once per reconnect cycle (very low frequency) and read at isolation time (also infrequent — isolation is an operator action). A `std::sync::RwLock` wrapped in `Arc` is correct here. There is no hot-path lookup that would justify a lock-free approach. Document this rationale in a comment in `table.rs`. -- [ ] Define `pub type SharedIsolationTable = Arc<RwLock<IsolationTable>>` in `isolation/src/table.rs`. Export it from `isolation/src/lib.rs`. -- [ ] Implement `SharedIsolationTable::new_shared() -> Self` as a constructor shortcut: `Arc::new(RwLock::new(IsolationTable::new()))`. -- [ ] Add `impl IsolationTable { pub fn into_shared(self) -> SharedIsolationTable { Arc::new(RwLock::new(self)) } }`.
-- [ ] In `agent/crates/isolation/src/table.rs`, implement convenience methods on `SharedIsolationTable` (via a newtype or inherent methods on a wrapper struct): `register_fleet_server_shared(&self, endpoint: &str) -> Result<(), IsolationError>` that acquires the write lock, calls `register_fleet_server`, and releases. Same pattern for `deregister` and `allowed_remotes`. -- [ ] Update `agent-core/orchestrator.rs` to construct a `SharedIsolationTable` at startup and clone the `Arc` into any component that needs read access (the iptables enforcement module in the next issue). -- [ ] Write unit tests: `test_concurrent_register_and_read` using `std::thread::spawn` to simulate concurrent writer and reader, asserting no data races (this test also validates the `RwLock` usage). `test_shared_table_clone_sees_updates` asserts that a cloned `Arc` reflects writes made through the original. - -### Definition of done -- `SharedIsolationTable` is defined and exported. -- `register_fleet_server_shared` and `allowed_remotes` (via read lock) compile and are tested. -- Two threads concurrently accessing the table via `SharedIsolationTable` do not deadlock or panic in the unit test. -- `cargo test -p isolation` passes. - -### Notes / constraints -- `std::sync::RwLock` (not `tokio::sync::RwLock`) is appropriate here because the lock is never held across an `.await` point. The critical section in `register_fleet_server` includes a DNS lookup — move the DNS resolution outside the lock before acquiring it. Take the write lock only to mutate the `Vec`. -- `Arc<RwLock<IsolationTable>>` is `Send + Sync`. Cloning the `Arc` is cheap (one atomic increment). -- Do not use `Mutex` — the read path (allowed_remotes during rule generation) does not mutate state and should allow concurrent readers.
- ---- - -## Issue: Implement iptables-based enforcement: generate and apply rules from the isolation table on IsolateCommand -**Labels:** `networking`, `firewall`, `ipc`, `unsafe` -**Depends on:** Concurrency wrapper issue -**Blocks:** Isolation integration test issue - -### What this is -This issue implements the actual isolation enforcement: `agent/crates/isolation/src/iptables.rs`. When the agent receives an `IsolateCommand { isolate: true }` from the fleet server, it applies iptables rules that drop all traffic except to endpoints registered in the `IsolationTable`. When it receives `IsolateCommand { isolate: false }`, it removes those rules. The implementation uses `std::process::Command` to invoke `iptables` (no external crate needed per the existing `Cargo.toml` comment). - -### What is currently blocking this -The `SharedIsolationTable` from the concurrency issue must be available to read the allow-list. - -### What this is blocking -The isolation integration test. - -### Implementation tasks -- [ ] Create `agent/crates/isolation/src/iptables.rs`. Define `pub struct IptablesIsolator { table: SharedIsolationTable }`. -- [ ] Implement `IptablesIsolator::isolate(&self) -> Result<(), IsolationError>`. The rule set: (1) create a new chain `EDR_ISOLATION` if it doesn't exist; (2) flush it (`iptables -F EDR_ISOLATION`); (3) for each `ConnectionKey` in `table.read().allowed_remotes()`, append `iptables -A EDR_ISOLATION -d <remote_ip> -p <protocol> --dport <remote_port> -j ACCEPT`; (4) append a default drop: `iptables -A EDR_ISOLATION -j DROP`; (5) if the `OUTPUT` chain does not already reference `EDR_ISOLATION`, append `-A OUTPUT -j EDR_ISOLATION`. -- [ ] Implement `IptablesIsolator::deisolate(&self) -> Result<(), IsolationError>`. Removes the jump from `OUTPUT`: `iptables -D OUTPUT -j EDR_ISOLATION`. Then flushes and deletes the chain: `iptables -F EDR_ISOLATION && iptables -X EDR_ISOLATION`.
-- [ ] Implement `fn run_iptables(args: &[&str]) -> Result<(), IsolationError>` that executes `iptables` via `Command::new("iptables").args(args).output()`. If `status.success()` is false, return `IsolationError::IptablesExecFailed { exit_code: status.code().unwrap_or(-1), stderr: String::from_utf8_lossy(&output.stderr).into_owned() }`. If iptables is not found on PATH, return `IsolationError::IptablesNotFound`. -- [ ] Implement `IptablesIsolator::is_isolated(&self) -> Result<bool, IsolationError>` by running `iptables -L OUTPUT -n | grep EDR_ISOLATION`. Returns `true` if the chain is referenced in OUTPUT. -- [ ] In `agent/crates/agent-core/src/orchestrator.rs`, handle `ServerCommand::Isolate(cmd)` in the main loop: if `cmd.isolate == true`, call `IptablesIsolator::new(shared_table.clone()).isolate()`; if false, call `deisolate()`. Log the outcome. Update the agent's status to `AgentStatus::Isolated` or `AgentStatus::Healthy` accordingly. -- [ ] Write unit tests for `run_iptables` using a mock: define a trait `IptablesRunner` and use it in `run_iptables` to enable injection of a mock in tests. Assert that `isolate()` generates the expected iptables command arguments. - -### Definition of done -- `IptablesIsolator::isolate()` and `deisolate()` compile. -- Unit tests for argument generation pass with the mock runner. -- On a Linux host with root, calling `isolate()` followed by `iptables -L OUTPUT -n` shows the `EDR_ISOLATION` chain jump. Calling `deisolate()` removes it. -- The orchestrator handles `IsolateCommand` from the gRPC stream and calls the appropriate isolator method. - -### Notes / constraints -- `iptables` requires root. The agent binary must run as root or with `CAP_NET_ADMIN`. This is expected in production (same capability needed for eBPF); document it. -- The `EDR_ISOLATION` chain must be flushed before re-adding rules (idempotent isolate). If `isolate()` is called twice, the second call must not duplicate rules.
-- `iptables -N EDR_ISOLATION` fails if the chain already exists (exit code 1). Treat this exit code specifically as `Ok(())` (chain already created). Check `stderr` for the specific message `"Chain already exists"`. -- IPv6 traffic requires `ip6tables`. This issue covers IPv4 only. Extend to IPv6 in a follow-on. - ---- - -## Issue: Isolation table integration test: register a connection, apply isolation, verify non-EDR traffic is blocked -**Labels:** `networking`, `firewall`, `testing` -**Depends on:** Enforcement issue -**Blocks:** nothing (leaf node in workstream) - -### What this is -This issue implements the integration tests for the complete isolation pipeline: create an `IsolationTable`, register a known endpoint, apply iptables rules, verify that an outbound connection to the registered endpoint succeeds, verify that a connection to a different endpoint fails. This test requires root and a real Linux network stack. It also tests the wiring between the isolation crate and the `agent-core` orchestrator's `IsolateCommand` handler. - -### What is currently blocking this -The enforcement issue must be complete. - -### What this is blocking -Nothing — leaf node. - -### Implementation tasks -- [ ] Create `agent/crates/isolation/tests/integration_test.rs`. Gate the file with a feature flag `isolation_integration_tests` and `#[cfg(target_os = "linux")]`. -- [ ] Implement `test_isolation_blocks_non_allowed_traffic`: (1) create `SharedIsolationTable`, (2) register `127.0.0.1:9999` as the allowed endpoint, (3) call `isolate()`, (4) spawn a `TcpListener` on `127.0.0.1:9998` (not in the allow-list), (5) attempt `TcpStream::connect("127.0.0.1:9998")` — assert connection times out or is refused, (6) call `deisolate()`, (7) assert `TcpStream::connect("127.0.0.1:9998")` now succeeds. -- [ ] Implement `test_isolation_allows_registered_endpoint`: same setup, attempt connection to `127.0.0.1:9999` — assert it succeeds while isolated.
-- [ ] Implement `test_idempotent_isolate`: call `isolate()` twice; run `is_isolated()` — assert `true`. Then `deisolate()` — assert `is_isolated()` returns `false`. Verify `iptables -L OUTPUT -n` does not contain duplicate `EDR_ISOLATION` references. -- [ ] Implement `test_deisolation_restores_connectivity`: full cycle. After `deisolate()`, attempt connections to multiple ports — all should succeed. -- [ ] Add `[[test]]` section in `isolation/Cargo.toml` for the integration tests with `required-features = ["isolation_integration_tests"]`. -- [ ] Add a CI job step (in the same or a separate workflow from the eBPF CI) that runs these tests with `sudo -E cargo test -p isolation --features isolation_integration_tests`. - -### Definition of done -- All four integration tests pass on a Linux host with root access and `iptables` available. -- Tests are gated by feature flag. -- CI workflow runs them in an `ubuntu-latest` environment. -- After each test, `iptables -L` shows no `EDR_ISOLATION` chain (cleanup is deterministic, using `Drop` or explicit teardown in test). - -### Notes / constraints -- Connecting to `127.0.0.1` with iptables `OUTPUT` chain rules: iptables by default does apply `OUTPUT` chain rules to loopback on Linux. Verify this assumption with `sysctl net.ipv4.conf.lo.accept_local`. If loopback is exempt, use a secondary interface (e.g., a dummy interface created in the test setup). -- Use `tokio::time::timeout(Duration::from_millis(500), TcpStream::connect(...))` to detect blocked connections quickly without waiting for the OS TCP timeout. -- Ensure test cleanup (call `deisolate()` in a `Drop` guard or use `scopeguard`) to avoid leaving isolation rules active if a test panics. 
- ---- - -## Issue: Create the fleet-server crate structure, async runtime, and binary entry point -**Labels:** `fleet-server`, `scaffolding`, `async`, `config` -**Depends on:** none -**Blocks:** gRPC server stub issue; config system issue; error type issue; logging issue; health endpoint issue - -### What this is -`fleet-server/src/main.rs` currently contains `fn main() { println!("edr-fleet-server"); }`. The crate has all its dependencies declared in `Cargo.toml` (tokio, axum, tonic, sqlx, etc.) and a `Dockerfile` and one migration file. This issue transforms it into a real binary: Tokio runtime initialization with the correct feature flags, a structured `main.rs` that initializes subsystems in order, and the module layout that the subsequent issues will fill in. No business logic. No gRPC server implementation. No database queries. The binary must compile, start, and accept a graceful shutdown signal. - -### What is currently blocking this -Nothing — root issue of Workstream C. - -### What this is blocking -All other fleet-server issues depend on the module skeleton this issue creates. - -### Implementation tasks -- [ ] Replace `fleet-server/src/main.rs` entirely. Use `#[tokio::main]` with the `full` feature (already in workspace `Cargo.toml`). Structure `main` as: (1) call `config::load()`, (2) call `tracing_setup::init(&cfg)`, (3) call `error::setup()` if needed, (4) build and run `server::run(cfg).await`. Return `Result<(), ServerError>`. -- [ ] Create the module files as stubs (each containing `// TODO` and correct `pub` declarations): `src/config.rs`, `src/error.rs`, `src/state.rs`, `src/server.rs`, `src/grpc/mod.rs`, `src/grpc/server.rs`, `src/db/mod.rs`, `src/http/mod.rs`, `src/http/health.rs`. -- [ ] In `src/main.rs`, add `mod config; mod error; mod state; mod server; mod grpc; mod db; mod http;` with the appropriate `pub use` re-exports. 
-- [ ] `src/server.rs`: define `pub async fn run(config: Config) -> Result<(), ServerError>` as a stub that prints `"Fleet server starting..."`, sleeps for 100ms, and returns `Ok(())`. This will be replaced in subsequent issues but must compile. -- [ ] Verify `cargo build -p edr-fleet-server` succeeds and `cargo run -p edr-fleet-server` prints `"Fleet server starting..."` and exits cleanly. -- [ ] Write a smoke test `test_main_returns_ok` in `src/main.rs` under `#[cfg(test)]` that calls `server::run(Config::default())` in a tokio test runtime and asserts `Ok(())`. - -### Definition of done -- `cargo build -p edr-fleet-server` succeeds. -- `cargo run -p edr-fleet-server` starts, prints startup message, exits with code 0. -- All module stubs exist and are referenced in `main.rs`. -- `cargo test -p edr-fleet-server` passes (smoke test). - -### Notes / constraints -- `tokio = { version = "1", features = ["full"] }` is already in workspace dependencies. Do not add redundant feature flags. -- The async runtime is tokio. The HTTP framework is axum. The gRPC framework is tonic. All are already in `Cargo.toml`. This issue does not change dependencies, only creates the structural skeleton. -- Do not add `actix-web` or any other async runtime. The architecture decision (axum + tokio) is final per the implementation guide. - ---- - -## Issue: Implement the fleet-server configuration system -**Labels:** `fleet-server`, `config`, `scaffolding` -**Depends on:** Fleet-server crate structure issue -**Blocks:** Health endpoint issue; logging issue; gRPC server stub issue; smoke test issue - -### What this is -The fleet server needs to read its configuration at startup: bind addresses for the gRPC and HTTP servers, PostgreSQL connection URL, Kafka broker addresses, JWT secret, and log level. The `config` crate is already declared in the workspace `Cargo.toml` at version `0.15`. 
This issue implements `fleet-server/src/config.rs` using `config = "0.15"` with a layered source: defaults → environment variables → optional config file. No secrets in code. All config fields must be readable from environment variables with the prefix `EDR_FLEET_`. - -### What is currently blocking this -The crate structure issue (module stubs must exist). - -### What this is blocking -The health endpoint (needs bind address), logging init (needs log level), the gRPC server stub (needs gRPC bind address and JWT secret), and the smoke test (needs `Config::default()`). - -### Implementation tasks -- [ ] In `fleet-server/src/config.rs`, define `#[derive(Debug, Clone, serde::Deserialize)] pub struct Config` with fields: `pub grpc_bind: String` (default `"0.0.0.0:50051"`), `pub http_bind: String` (default `"0.0.0.0:8080"`), `pub database_url: String` (no default — required), `pub kafka_brokers: String` (default `"localhost:9092"`), `pub jwt_secret: String` (no default — required), `pub log_level: String` (default `"info"`), `pub log_format: LogFormat` where `LogFormat` is `#[derive(Debug, Clone, serde::Deserialize)] pub enum LogFormat { Human, Json }`. -- [ ] Implement `pub fn load() -> Result<Config, ConfigError>` using `config::Config::builder().add_source(config::Environment::with_prefix("EDR_FLEET")).build()?.try_deserialize()`. Use `thiserror` for `ConfigError`: variants `LoadFailed(#[from] config::ConfigError)`. -- [ ] Implement `Config::default()` manually (not via `derive`) returning the documented defaults with `database_url` and `jwt_secret` set to `"test_placeholder"` — this is only used in tests and the `server::run` stub. -- [ ] Add a validation step: `Config::validate(&self) -> Result<(), ConfigError>` that returns `Err` if `jwt_secret.len() < 32` (enforce minimum key length) or if `database_url` is the test placeholder in a non-test build. Use `#[cfg(not(test))]` guard.
-- [ ] Write unit tests: `test_config_loads_from_env` (set `EDR_FLEET_GRPC_BIND`, `EDR_FLEET_HTTP_BIND`, `EDR_FLEET_JWT_SECRET`, etc. via `std::env::set_var`, call `load()`, assert fields match), `test_config_default_grpc_port`, `test_config_jwt_secret_too_short_fails_validation`. - -### Definition of done -- `config::load()` reads from environment variables with `EDR_FLEET_` prefix. -- `Config::default()` compiles and is used by the existing smoke test stub. -- `Config::validate()` rejects a JWT secret shorter than 32 characters in non-test builds. -- `cargo test -p edr-fleet-server` passes config unit tests. -- No secrets are hardcoded — all defaults that would be credentials are test-only. - -### Notes / constraints -- `config = "0.15"` uses `serde` for deserialization. The `Environment` source converts `EDR_FLEET_GRPC_BIND` to the field name `grpc_bind` by lowercasing and stripping the prefix. Verify this behavior with the unit test before finalizing. -- Do not use `dotenv` — it is not in the workspace `Cargo.toml`. Environment variables are the only config source beyond defaults. -- `LogFormat` is defined in `config.rs` and re-exported. The tracing setup issue imports it from here. - ---- - -## Issue: Implement the fleet-server error type hierarchy -**Labels:** `fleet-server`, `error-handling`, `scaffolding` -**Depends on:** Fleet-server crate structure issue -**Blocks:** gRPC server stub issue; health endpoint issue; smoke test issue - -### What this is -The fleet server needs a structured error type hierarchy that is ready to be extended as business logic is added. This issue implements `fleet-server/src/error.rs` with a top-level `ServerError` and a set of domain-specific sub-errors. All types use `thiserror`. The design must anticipate gRPC errors, database errors, JWT errors, and Kafka errors without implementing any of those subsystems yet. - -### What is currently blocking this -The crate structure issue. 
- -### What this is blocking -The gRPC server stub (which returns `Result<_, ServerError>`), the health endpoint (same), and the smoke test. - -### Implementation tasks -- [ ] In `fleet-server/src/error.rs`, define `#[derive(Debug, thiserror::Error)] pub enum ServerError` with variants: `#[error("configuration error: {0}")] Config(#[from] crate::config::ConfigError)`, `#[error("database error: {0}")] Database(#[from] sqlx::Error)`, `#[error("gRPC error: {0}")] Grpc(#[from] tonic::Status)`, `#[error("JWT error: {0}")] Jwt(String)`, `#[error("Kafka error: {0}")] Kafka(String)`, `#[error("IO error: {0}")] Io(#[from] std::io::Error)`. -- [ ] Define `#[derive(Debug, thiserror::Error)] pub enum DbError` with variants: `#[error("node not found: {node_id}")] NodeNotFound { node_id: uuid::Uuid }`, `#[error("duplicate enrollment: machine_id={machine_id}")] DuplicateEnrollment { machine_id: String }`, `#[error("sqlx: {0}")] Sqlx(#[from] sqlx::Error)`. -- [ ] Define `#[derive(Debug, thiserror::Error)] pub enum GrpcError` with variants: `#[error("unauthenticated")] Unauthenticated`, `#[error("node not enrolled")] NotEnrolled`, `#[error("stream closed")] StreamClosed`. -- [ ] Implement `From<GrpcError> for tonic::Status`: `Unauthenticated` → `Status::unauthenticated(msg)`, `NotEnrolled` → `Status::not_found(msg)`, `StreamClosed` → `Status::cancelled(msg)`. -- [ ] Re-export `ServerError`, `DbError`, `GrpcError` from `fleet-server/src/error.rs` and add `pub use error::{ServerError, DbError, GrpcError};` to `src/lib.rs` if a lib target is added, or use path imports in `main.rs`. -- [ ] Write unit tests: `test_db_error_display_node_not_found`, `test_grpc_error_converts_to_tonic_status_unauthenticated`, `test_server_error_from_io_error`. - -### Definition of done -- `cargo test -p edr-fleet-server` passes all error type unit tests. -- `ServerError`, `DbError`, and `GrpcError` are defined and compile with `thiserror`. -- `From<GrpcError> for tonic::Status` is implemented and tested.
-- The hierarchy is extensible: adding a new variant to any error enum requires no changes outside that enum. - -### Notes / constraints -- `thiserror = "2"` is in the workspace (note: version 2, not 1). Use `{ workspace = true }`. -- `tonic::Status` has constructor methods like `Status::unauthenticated(message: impl Into<String>)`. Use these — do not construct the struct directly. -- Keep `Kafka(String)` as a `String`-wrapping variant for now because `rdkafka` is commented out in the workspace `Cargo.toml`. When rdkafka is enabled, replace it with a proper `From<rdkafka::error::KafkaError>` impl. - ---- - -## Issue: Implement structured logging and tracing setup for the fleet-server -**Labels:** `fleet-server`, `tracing`, `scaffolding` -**Depends on:** Configuration system issue -**Blocks:** Health endpoint issue; gRPC server stub issue; smoke test issue - -### What this is -The fleet server must emit structured JSON logs in production (for log aggregation) and human-readable logs in development. `tracing` and `tracing-subscriber` are already in the workspace. This issue implements a thin initialization shim in `fleet-server/src/` that reads `Config.log_level` and `Config.log_format`, builds the appropriate `tracing_subscriber` stack, and installs it as the global default. It must run before any `tracing::info!` calls and must not panic if called twice (idempotent for test use). - -### What is currently blocking this -The config system must exist so `LogFormat` and `log_level` are defined. - -### What this is blocking -Everything that emits logs (health endpoint, gRPC stub, smoke test). - -### Implementation tasks -- [ ] Create `fleet-server/src/tracing_setup.rs` (avoid naming it `tracing.rs` to prevent shadowing the `tracing` crate). Define `pub fn init(config: &Config) -> Result<(), ServerError>`. -- [ ] In `init`, build the `EnvFilter` from `config.log_level` using `EnvFilter::try_new(&config.log_level).unwrap_or_else(|_| EnvFilter::new("info"))`.
-- [ ] For `LogFormat::Json`: use `tracing_subscriber::fmt().json().with_env_filter(filter).with_current_span(true).with_span_list(true).try_init()`. Map the `Err` from `try_init` (which fires if a global subscriber is already set) to `Ok(())` rather than returning an error — this makes the function safe to call from tests. -- [ ] For `LogFormat::Human`: use `tracing_subscriber::fmt().with_env_filter(filter).pretty().try_init()` with the same error-swallowing behavior. -- [ ] Add the module to `src/main.rs`: `mod tracing_setup;` and call `tracing_setup::init(&config)?;` as the second step in `main` (after config load). -- [ ] Write a unit test `test_init_human_format_does_not_panic` and `test_init_json_format_does_not_panic` — both call `tracing_setup::init(&Config::default())` in a tokio test runtime and assert `Ok(())`. -- [ ] Verify that `cargo run -p edr-fleet-server` with `EDR_FLEET_LOG_FORMAT=json` produces JSON-structured log lines to stdout. - -### Definition of done -- `tracing_setup::init` compiles and runs without panic. -- JSON format produces parseable JSON lines (verify with `cargo run | jq .` during manual testing). -- Human format produces colored, pretty output when `EDR_FLEET_LOG_FORMAT` is unset. -- Unit tests pass. -- `try_init` is used (not `init`) so tests can call `init` multiple times without panicking. - -### Notes / constraints -- `tracing_subscriber = { version = "0.3", features = ["env-filter", "json"] }` is in the workspace. The `json` feature is required for `.json()` on the `SubscriberBuilder`. -- Do not use `RUST_LOG` as the only env override — the config-driven `log_level` provides a programmatic default independent of `RUST_LOG`. The `EnvFilter` will still respect `RUST_LOG` if set (it checks it first). -- `tracing_setup` is not `tracing` — do not shadow the external crate name. 
- ---- - -## Issue: Implement graceful shutdown with SIGTERM and SIGINT handling for the fleet-server -**Labels:** `fleet-server`, `scaffolding`, `async` -**Depends on:** Fleet-server crate structure issue; logging issue -**Blocks:** Smoke test issue - -### What this is -The fleet server must shut down cleanly when it receives `SIGTERM` (from the container orchestrator stopping the pod) or `SIGINT` (from `Ctrl-C` in development). "Cleanly" at the scaffolding level means: accept the signal, log the shutdown intent, cancel a `CancellationToken` that all subsystems will be given, and exit `main` with code 0. The actual drain logic (waiting for in-flight gRPC calls to complete, flushing Kafka producers) is stubbed here and implemented when those subsystems are built. - -### What is currently blocking this -The crate structure and logging issues must be in place. - -### What this is blocking -The smoke test (which verifies `SIGTERM` causes clean exit). - -### Implementation tasks -- [ ] In `fleet-server/src/server.rs`, replace the stub `run` function with a real implementation: (1) create a `tokio_util::sync::CancellationToken` (add `tokio-util = { version = "0.7", features = ["sync"] }` if not already in scope — it is in the workspace), (2) spawn a signal handler task using `tokio::signal::ctrl_c()` and `tokio::signal::unix::signal(SignalKind::terminate())`, (3) when either signal fires, call `token.cancel()` and log `"Received shutdown signal — initiating graceful shutdown"`, (4) pass the token to all subsystem runners (stubs for now), (5) await all subsystem handles, (6) log `"Fleet server stopped"` and return `Ok(())`. -- [ ] In `src/main.rs`, the existing `server::run(cfg).await?` call now becomes the full lifecycle. Ensure the process exits with code 0 on clean shutdown and code 1 on `Err`. 
-- [ ] Implement `pub struct ShutdownHandle { token: CancellationToken }` with `ShutdownHandle::new() -> (ShutdownHandle, CancellationToken)` and `ShutdownHandle::wait(self) -> impl Future<Output = ()>` that awaits the signal before cancelling. Expose this from `src/server.rs`. -- [ ] Stub the shutdown drain as `async fn drain_subsystems(_token: CancellationToken) { tokio::time::sleep(Duration::from_millis(100)).await; tracing::info!("All subsystems drained"); }`. This is a placeholder that future issues replace. -- [ ] Write a unit test `test_shutdown_on_cancellation_token` that creates a `CancellationToken`, cancels it immediately, and asserts that `drain_subsystems(token)` returns within 200ms. - -### Definition of done -- `cargo run -p edr-fleet-server` starts and exits cleanly when `Ctrl-C` is pressed (logs shutdown message, exits code 0). -- On `SIGTERM` (test with `kill -TERM <pid>` in a separate terminal), the same clean shutdown occurs. -- The `CancellationToken` is threaded through `server::run` and passed to all subsystem stubs. -- Unit test passes. - -### Notes / constraints -- `tokio::signal::unix` is only available on Unix targets. Wrap with `#[cfg(unix)]`. For `#[cfg(windows)]` (if needed in future), use `tokio::signal::ctrl_c()` only. -- `tokio::signal::unix::signal(SignalKind::terminate())` returns a `Signal` stream. Use `signal.recv().await` inside a `tokio::select!`. -- Do not call `std::process::exit()` — let `main` return naturally. Calling `exit()` bypasses `Drop` impls and can leave resources in a dirty state. - ---- - -## Issue: Implement the health check HTTP endpoint on the fleet-server -**Labels:** `fleet-server`, `scaffolding`, `async` -**Depends on:** Configuration system issue; logging issue; error type issue; graceful shutdown issue -**Blocks:** Smoke test issue - -### What this is -The fleet server exposes an HTTP port (`http_bind`, default `0.0.0.0:8080`) for health checks and future admin routes. 
This issue implements a single route: `GET /health` returning `{"status":"ok"}` with HTTP 200. The HTTP server runs concurrently with the gRPC server (the gRPC stub issue) using axum. Both servers share the `CancellationToken` for coordinated shutdown. - -### What is currently blocking this -Config (for `http_bind`), error types (for `ServerError`), logging (so startup is visible), and graceful shutdown (for the CancellationToken). - -### What this is blocking -The smoke test (which calls GET /health and asserts 200). - -### Implementation tasks -- [ ] In `fleet-server/src/http/mod.rs`, define `pub async fn serve(bind: &str, token: CancellationToken) -> Result<(), ServerError>`. -- [ ] In `fleet-server/src/http/health.rs`, define `pub async fn health_handler() -> impl IntoResponse { axum::Json(serde_json::json!({"status": "ok"})) }`. -- [ ] In `http/mod.rs`, build the axum router: `Router::new().route("/health", get(health::health_handler))`. Bind with `TcpListener::bind(bind).await?` and serve with `axum::serve(listener, router).with_graceful_shutdown(token.cancelled_owned())` (the shutdown future must be `'static`, so the borrowing `token.cancelled()` will not compile here). -- [ ] In `fleet-server/src/server.rs`, alongside the existing shutdown stub, spawn `http::serve(&config.http_bind, token.clone())` as a `tokio::spawn` handle. Await it in the drain step. -- [ ] Add `tower-http` trace middleware: wrap the router with `.layer(TraceLayer::new_for_http())` using `tower_http::trace::TraceLayer`. -- [ ] Write a unit test `test_health_endpoint_returns_200` using `tower::ServiceExt::oneshot` on the router (or a `hyper` test client; axum has no built-in `axum::test` module): build the router, send `GET /health`, assert status 200 and body `{"status":"ok"}`. - -### Definition of done -- `cargo run -p edr-fleet-server` starts the HTTP server on port 8080. -- `curl http://localhost:8080/health` returns `{"status":"ok"}` with HTTP 200. -- Sending SIGTERM after startup causes the HTTP server to stop accepting new connections and `serve` to return cleanly. -- Unit test `test_health_endpoint_returns_200` passes. 
- -### Notes / constraints -- `axum = { version = "0.8", features = ["ws", "macros"] }` is in the workspace. `axum::serve` is the axum 0.8 API (not the older `axum::Server`). -- `axum::serve(...).with_graceful_shutdown(token.cancelled_owned())` requires `CancellationToken::cancelled_owned()` from `tokio-util`. This is available in `tokio-util = "0.7"`. -- The health endpoint must not require authentication — it is called by load balancers and Kubernetes liveness probes. -- Do not implement any other routes in this issue. `/metrics`, `/nodes`, and any other admin routes are out of scope for scaffolding. - ---- - -## Issue: Implement the gRPC server stub: bind the tonic server to its port, register the FleetService, return Unimplemented -**Labels:** `fleet-server`, `scaffolding`, `async`, `ipc` -**Depends on:** Configuration system issue; error type issue; logging issue; graceful shutdown issue -**Blocks:** Smoke test issue - -### What this is -The fleet server's primary interface is a tonic gRPC server implementing the `FleetService` defined in `sdk/proto/fleet.proto`. This issue adds the gRPC server stub: the `tonic::transport::Server` that binds to `config.grpc_bind`, registers a `FleetServiceImpl` struct, and returns `Status::unimplemented()` for all three RPCs (`RegisterAgent`, `EventStream`, `Heartbeat`). This is the minimal skeleton that compiles, binds the port, and is ready for actual implementation in downstream issues outside this workstream. - -### What is currently blocking this -Config (for `grpc_bind`), error types (for mapping gRPC errors), logging, and graceful shutdown (for the shutdown future). The `sdk` crate must expose the generated tonic service trait — check `sdk/src/lib.rs` is a stub and may need a `build.rs` to compile protos. Note: this issue may surface a dependency on the SDK build system. - -### What this is blocking -The smoke test (which verifies the gRPC port is bound). 
- -### Implementation tasks -- [ ] Verify `sdk/src/lib.rs` exposes the generated `fleet_service_server::FleetService` trait from `sdk/proto/fleet.proto`. If `sdk/build.rs` does not yet exist, add it: `fn main() -> Result<(), Box> { tonic_build::configure().compile_protos(&["proto/fleet.proto", "proto/agent.proto", "proto/events.proto"], &["proto/"])?; Ok(()) }`. Add `build-dependencies = [ tonic-build ]` to `sdk/Cargo.toml`. This is a prerequisite step — it should be done as part of this issue or tracked as a blocking note. -- [ ] In `fleet-server/src/grpc/server.rs`, define `pub struct FleetServiceImpl;` and implement the tonic-generated `FleetService` trait. All three methods return `Err(Status::unimplemented("not yet implemented"))`. The streaming RPCs return a `Result, Status>` where `Self::EventStreamStream` is a `Pin>>`. -- [ ] In `fleet-server/src/grpc/mod.rs`, define `pub async fn serve(bind: &str, token: CancellationToken) -> Result<(), ServerError>`. Use `tonic::transport::Server::builder().add_service(FleetServiceServer::new(FleetServiceImpl)).serve_with_shutdown(addr, token.cancelled_owned()).await?`. -- [ ] In `fleet-server/src/server.rs`, spawn `grpc::serve(&config.grpc_bind, token.clone())` alongside the HTTP server. -- [ ] Write a unit test `test_grpc_server_binds_and_returns_unimplemented`: start the gRPC server on an ephemeral port, connect with a tonic client, call `RegisterAgent`, assert the response is `Status::unimplemented`. - -### Definition of done -- `cargo build -p edr-fleet-server` succeeds with the gRPC stub present. -- `cargo run -p edr-fleet-server` logs the gRPC bind address and accepts connections on port 50051. -- `grpcurl -plaintext localhost:50051 edr.fleet.FleetService/RegisterAgent` returns `Unimplemented` (not `connection refused`). -- Unit test passes. - -### Notes / constraints -- `tonic = "0.14"` is in the workspace. The generated `FleetServiceServer` requires `tonic::async_trait` on the impl block. 
Use `#[tonic::async_trait]`. -- The `EventStream` RPC is bidirectional streaming. The `EventStreamStream` associated type must be `Pin> + Send + 'static>>`. Return an empty stream for the stub: `Ok(Response::new(Box::pin(tokio_stream::empty())))`. -- `sdk/src/lib.rs` is currently a single-line comment. Adding `build.rs` to the SDK and exposing the generated types is a prerequisite. If the SDK build is not in scope for this issue, use the existing `testing.proto` in `fleet-server/src/grpc/` as a local stand-in — but this is a workaround. The clean solution is to fix the SDK build. - ---- - -## Issue: Fleet-server smoke test: server starts, binds, returns 200 on health, shuts down cleanly -**Labels:** `fleet-server`, `testing`, `scaffolding` -**Depends on:** Health endpoint issue; gRPC server stub issue; logging issue; graceful shutdown issue -**Blocks:** nothing (leaf node in workstream) - -### What this is -With all scaffolding pieces in place (config, tracing, error types, shutdown, health endpoint, gRPC stub), this issue implements a single integration test that exercises the full startup-to-shutdown lifecycle of the fleet server binary. This is the "does it all hang together" gate before any business logic is built on top. - -### What is currently blocking this -All prior fleet-server scaffolding issues must be complete. - -### What this is blocking -Nothing — leaf node of Workstream C. - -### Implementation tasks -- [ ] Create `fleet-server/tests/smoke_test.rs`. This is an integration test (in `tests/`, not `src/`), so it tests the public API of the crate. 
-- [ ] Implement `test_health_endpoint_returns_ok`: (1) build a `Config` with `http_bind = "127.0.0.1:0"` (port 0 = OS assigns ephemeral), `grpc_bind = "127.0.0.1:0"`, `log_level = "error"` (quiet), `jwt_secret = "a".repeat(32)`, `database_url = "postgres://unused"`, (2) spawn `server::run(config)` in a `tokio::spawn`, (3) poll `http://127.0.0.1:<http_port>/health` with a short timeout (use `reqwest` or `axum`'s test helpers), (4) assert HTTP 200 and body `{"status":"ok"}`, (5) cancel the `CancellationToken`, (6) await the server task, assert it exits with `Ok(())`. -- [ ] Add `reqwest` as a dev dependency in `fleet-server/Cargo.toml`: `reqwest = { version = "0.12", features = ["json"], default-features = false }` — or use `hyper` directly to avoid a heavy dependency. -- [ ] Implement `test_grpc_port_is_bound`: after startup, attempt a TCP connection to `127.0.0.1:<grpc_port>`. Assert the connection is accepted (not refused). This does not require a gRPC client — just a raw `TcpStream::connect`. -- [ ] Implement `test_graceful_shutdown_exits_zero`: cancel the token, await the server future, assert the `Result` is `Ok(())`. Assert that no log lines at `ERROR` level were emitted during the test run (use `tracing_test` crate if available, or skip this assertion). -- [ ] Note the challenge of binding on port 0 with axum: the test needs to know the actual bound port, which `http::serve` does not currently report. Expose a `BoundServer` struct from `http::serve` that holds the actual `SocketAddr`. This requires a small API change to `http/mod.rs`. - -### Definition of done -- `cargo test -p edr-fleet-server` (including `tests/smoke_test.rs`) passes all three smoke tests. -- The tests do not require a running PostgreSQL, Kafka, or any external service. -- The test binary exits with code 0 and leaves no listening sockets behind. -- Test runtime is under 5 seconds total (no real sleep calls — use `tokio::time::timeout` for all waits). 
- -### Notes / constraints -- Port 0 binding: pass `"127.0.0.1:0"` to `TcpListener::bind`, then call `listener.local_addr()` to get the actual port before passing the listener to axum. This requires `http::serve` to take a `TcpListener` instead of a `&str` bind address — or expose the bound address through a `oneshot` channel. -- The gRPC port 0 binding with tonic: `Server::builder().serve_with_shutdown(addr, ...)` where `addr` is `"127.0.0.1:0".parse().unwrap()`. Tonic internally binds and assigns an ephemeral port, but does not expose the actual port without additional hooks. For the smoke test, use a fixed high port (e.g., 59050) known to be available, and skip this test if the port is in use. -- Do not test actual gRPC RPC calls in this smoke test — that is integration testing, not scaffolding testing. - ---- diff --git a/README.md b/README.md index b74a009..68df5cd 100644 --- a/README.md +++ b/README.md @@ -1,70 +1,650 @@ -# project-edr +# Aigis-Zero : Endpoint Detection and Response -An indigenous Endpoint Detection and Response system built from scratch. +Aigis-Zero is an open-source EDR system built entirely from scratch in Rust. It monitors Linux endpoints for suspicious activity in real time, streams telemetry through a central fleet server, normalises and routes events via Apache Kafka, and detects threats using a YARA rule engine, MITRE ATT&CK mapping, and ML-based detection. It surfaces alerts to the SOC team through a live React dashboard. -## What is this -A real-time EDR that monitors endpoints for suspicious activity, streams telemetry to a central server, runs detection rules, and surfaces alerts to a security operator through a live dashboard. +Most production EDR agents are written in C, C++, or Go. This project makes a deliberate choice of Rust for every backend component and the agent itself. 
-## Tech Stack +While C/C++ EDR agents are notoriously prone to memory corruption (a massive liability for a root-level service), and Go-based agents struggle with garbage collection pauses and heavy memory bloat under burst load, Rust gives us the best of both worlds. -| Layer | Tech | -|---|---| -| Agent | Rust, eBPF (aya), OSQuery | -| Fleet Server | Rust, gRPC (tonic), PostgreSQL | -| Message Bus | Apache Kafka | -| Pipeline | Rust, rdkafka, sqlx | -| Rule Engine | Rust, YARA (yara-x) | -| API | Rust, Axum, WebSocket | -| Frontend | React, TypeScript, Vite | -| Infra | Docker Compose, Kubernetes | +We get the raw, low-overhead performance of C/C++ without the security risks of buffer overflows or use-after-free bugs. At the same time, we get the concurrency of Go but without a heavy runtime or unpredictable GC sweeps that drop event streams. By leveraging Tokio's async engine, the agent handles thousands of concurrent event streams on a razor-thin memory footprint. -## Architecture +--- + +## Architecture ```mermaid -flowchart LR - A["Agent\n(eBPF + OSQuery)"] -->|gRPC| F["Fleet Server"] - F -->|produce| K["Kafka"] - K -->|consume| P["Pipeline"] - P -->|write| DB[("PostgreSQL")] - P -->|produce| K - K -->|consume| R["Rule Engine"] - R -->|alerts| K - R -->|write| DB - K -->|consume| API["API Backend"] - API -->|WebSocket| FE["Frontend"] - FE -->|REST| API - API -->|commands| F +graph TD + %% Endpoint Section + subgraph EP ["Endpoint (Linux)"] + OQ["osqueryd (eBPF Mode)"] + + subgraph AZ ["aigis-zero Agent"] + EB[("Event Buffer (SQLite WAL)")] + CH["Command Handler"] + ISO["Isolation Module (nftables)"] + HL["Heartbeat Loop"] + end + end + + %% Fleet Server Section + subgraph FS ["Fleet Server (Rust/Tonic)"] + NE["Node Enrollment"] + KH["Kafka Handler"] + HT["Health Tracker"] + end + + %% Databases + subgraph DBs ["PostgreSQL Databases"] + DB_Nodes[("edr_nodes (Registry)")] + DB_Health[("node_health (Heartbeats)")] + DB_Logs[("edr_logs (Event Logs)")] + 
DB_Alerts[("edr_alerts (Alerts)")] + end + + %% Kafka + subgraph KF ["Apache Kafka"] + K_Raw["aigis.events.raw"] + K_Typed["aigis.events.typed"] + K_Alerts["aigis.alerts"] + K_Health["aigis.health"] + end + + %% Kafka Pipeline + subgraph KP ["Kafka Pipeline"] + Router["Event Router & Normaliser"] + end + + %% Rule Engine + subgraph RE ["Rule Engine (YARA-X)"] + Scanner["Rule Scanner & MITRE Mapper"] + end + + %% API Backend & Frontend + subgraph Operator ["Operator Console"] + API["API Backend (Axum / WebSockets)"] + UI["Frontend (React / Vite)"] + end + + %% Communication Flows + OQ -->|"Thrift IPC"| AZ + EB -->|"gRPC Uplink"| KH + HL -->|"gRPC Heartbeat"| HT + CH <-->|"gRPC Bidirectional Stream"| FS + CH -->|"nftables rules"| ISO + + %% Fleet Server DB writes + NE --> DB_Nodes + HT --> DB_Health + + %% Kafka handling + KH --> K_Raw + K_Raw --> Router + Router --> K_Typed + Router --> DB_Logs + + %% Rule scanning + K_Typed --> Scanner + Scanner --> K_Alerts + Scanner --> DB_Alerts + + %% API and Dashboard + K_Alerts --> API + K_Health --> API + API <-->|"WebSockets (Live events)"| UI + UI -->|"IsolateCommand"| API + API -->|"Forward Commands"| FS ``` +--- + +## Component Breakdown + +The codebase is organized as a single Cargo workspace containing 18 crates that separate the core services, the shared SDK, and the React frontend. + +* **sdk** — Shared Protobuf definitions (`agent.proto`, `events.proto`, `fleet.proto`) and common domain models. All other crates import from here. Strictly no business logic allowed. +* **agent** — A single compiled binary (`aigis-zero`) composed of 7 sub-crates: + * `agent-bin`: Bootstrap entry point, CLI config loader, and service lifecycle manager. + * `agent-core`: Tokio-backed orchestrator, backpressure-aware event loop, and exponential backoff retry loop (50ms base, ~12.8s max). + * `osquery-client`: Thrift IPC client interfacing with osqueryd via Unix sockets. 
+ * `event-buffer`: SQLite-backed write-ahead log ensuring at-least-once telemetry delivery during network partitions. + * `fleet-client`: gRPC client managing Tonic bidirectional streams and heartbeats. + * `isolation`: Host quarantine control managing nftables drop rules with a fleet IP exemption. + * `agent-tracing`: Structured JSON telemetry logging using the `tracing` ecosystem. +* **fleet-server** — Central fleet controller split into 8 crates: + * `fleet-server-bin`: Entry point, env loading (`dotenvy`), DB schema migrations, and Tonic server initialization. + * `grpc-listener`: Implementation of the gRPC `FleetService` interface. + * `node-enrollment`: Enrollment handler verifying nodes, issuing 24h JWTs, and recording registration state. + * `health-tracker`: Records node heartbeat timelines; isolates heartbeat state so agents can't overwrite quarantine flags. + * `fleet-manager`: Pure domain logic governing agent state machines and transitions. + * `kafka-handler`: Stream producer piping raw telemetry from agents directly into Kafka. + * `postgres-interface`: Data-access layer using `sqlx` with compile-time checked SQL and pessimistic locks (`SELECT FOR UPDATE`) for safe upserts. + * `fleet-tracing`: Shared logging initialization for the fleet server. +* **kafka-pipeline** — Dedicated pipeline consumer pulling from `aigis.events.raw`, mapping events to typed topics by class, and saving normalized data into `edr_logs` (using LZ4 compression and 5ms batching). +* **rule-engine** — Event scanner checking normalized streams against YARA-X rules, indexing detections with MITRE ATT&CK codes, and publishing alerts to `aigis.alerts`. (Currently in active development). +* **api-backend** — Axum-based web gateway serving REST endpoints and managing live WebSocket connections for dashboard operators. +* **frontend** — React operator console built with TypeScript and Vite. Currently supports authentication, node tracking, and live alert feeds via Mock data. 
+* **infra** — Docker Compose manifests for KRaft/Zookeeper Kafka stacks and Postgres setups, along with Kubernetes deployment specs. + +--- + +## Feature Overview + +**Agent** +- Scheduled osquery polling with query intervals loaded from the fleet server via `ConfigUpdateCommand` +- SQLite write-ahead event buffer that survives network outages and agent restarts; configurable max-size with oldest-first eviction under pressure +- Bidirectional gRPC stream to fleet-server with exponential backoff reconnection +- Heartbeat loop reporting node status and buffered event count +- Fleet-commanded network isolation via nftables: drop-all policy with a single outbound carve-out for the fleet-server IP +- Structured JSON logging with per-component log level control +- Musl static binary for production deployment — zero glibc dependency, runs on any Linux kernel >= 4.18 +- Cross-compiled release artifacts for `x86_64` and `aarch64` via GitHub Actions + +**Fleet Server** +- gRPC enrollment with 24-hour JWT token issuance +- Compile-time SQL verification via `sqlx` — schema mismatches fail at build time, not at runtime +- Strict `operator_status` / `agent_status` separation: heartbeats cannot overwrite operator-assigned quarantine states +- Time-series heartbeat tracking per node +- Kafka event forwarding with LZ4 compression + +**Kafka Pipeline** +- Type-aware event routing to dedicated topics per event class (process, file, network, auth) +- Consumer group management with graceful shutdown via `CancellationToken` -## Structure +**Rule Engine** +- YARA-X based scanning — pure Rust, no `libyara` C dependency +- MITRE ATT&CK technique and tactic mapping on alert records +- Structured `Alert` with threat score, severity, source, and triggering event reference +**Infrastructure** +- Three isolated PostgreSQL databases for node registry, event logs, and alerts +- Kafka with 12-partition event topics and 4-partition alert/health topics +- Kafka UI on port 8090 for local debugging 
+- Dev-mode KRaft Kafka (no Zookeeper) for faster local iteration +- Kubernetes manifests for fleet-server and supporting services + +--- + +## Current State + +Active development is on the `agent/bug-fixes-01` branch. This is the branch with the most commits and the most complete implementation across all components. + +| Component | Status | +|---|---| +| SDK (protobuf definitions, shared models) | Complete | +| Agent binary (osquery polling, gRPC, buffer) | Complete | +| Agent network isolation (nftables) | Complete | +| Agent enrollment and JWT auth | Complete | +| Agent heartbeat | Complete | +| Agent config hot-reload | Scaffold — fleet command delivery works; client-side application in progress | +| Fleet server (enrollment, health tracking, Kafka forwarding) | Complete | +| Kafka pipeline (event router) | Complete | +| Kafka pipeline (normalisation and DB persistence) | In progress | +| Rule engine (YARA-X scanning, alert production) | Stubbed — binary compiles; rule loading and alerting in progress | +| API backend (REST and WebSocket) | Stubbed — binary compiles; route implementation in progress | +| Frontend (login, node list, live events tab) | Functional with mock data; WebSocket integration in progress | +| eBPF collector (aya) | Excluded from default workspace; under development on a separate branch | +| mTLS (agent to fleet) | Scaffold — cert paths in config; TLS handshake not yet wired | +| Kubernetes production deployment | Manifests present; not production-validated | + +The zero-warning policy is enforced: `cargo clippy --all-targets -- -D warnings` and `cargo fmt --check` must pass before any merge. 
+ +--- + +## Local Setup and Installation + +### Prerequisites + +| Tool | Minimum Version | Notes | +|---|---|---| +| Rust (stable) | 1.91 | Install via `rustup` | +| Docker and Docker Compose | Any recent | Required for the infra stack | +| Node.js | 18 | Required for frontend development | +| Linux kernel | 4.18 | Agent endpoint only; 5.10+ recommended | +| Architecture | x86_64 or aarch64 | Agent only | +| osquery | 5.23.0 | Agent endpoint only; installed by `install.sh` | + +--- + +### 1. Infrastructure + +```bash +git clone -b agent/bug-fixes-01 https://github.com/swar09/project-edr.git +cd project-edr + +cp .env.example .env +# Set POSTGRES_PASSWORD and any other required values in .env + +cd infra +docker compose up -d +docker compose ps ``` -sdk/ shared types, proto definitions -agent/ endpoint agent (cargo workspace, 6 crates) -fleet-server/ gRPC server for agent enrollment + streaming -kafka-pipeline/ event normalisation + DB persistence -rule-engine/ YARA scanning + MITRE ATT&CK mapping -api-backend/ REST API + WebSocket for dashboard -frontend/ React operator dashboard -infra/ docker-compose, k8s, terraform + +`kafka-init` creates the required topics automatically on first start. Kafka UI is available at `http://localhost:8090`. + +| Topic | Partitions | Purpose | +|---|---|---| +| `aigis.events.raw` | 12 | Raw agent telemetry | +| `aigis.events.norm` | 12 | Normalised events | +| `aigis.alerts` | 4 | Detection alerts | +| `aigis.health` | 4 | Node health | + +| Database | Host Port | Purpose | +|---|---|---| +| `edr_nodes` | 5433 | Node registry, enrollment, health | +| `edr_logs` | 5432 | Normalised event log | +| `edr_alerts` | 5434 | Detection alerts | + +For lightweight local development (KRaft Kafka, no Zookeeper): + +```bash +docker compose -f infra/docker-compose.dev.yml up -d ``` -## Getting Started +--- + +### 2. 
Building the Workspace + +`sqlx` performs compile-time query verification and requires `DATABASE_URL` to point to a live, migrated database. ```bash -# Start infrastructure -cd infra && docker-compose up -d +export DATABASE_URL=postgres://edr:<password>@localhost:5433/edr_nodes + +cargo build --workspace +cargo build --release --workspace +``` + +To build against cached sqlx metadata without a live database: -# Build all Rust services +```bash +export SQLX_OFFLINE=true cargo build --workspace +``` + +CI checks: + +```bash +cargo fmt --all -- --check +cargo clippy --workspace --all-targets -- -D warnings +cargo test --workspace +``` + +--- + +### 3. Agent Installation + +The agent runs on Linux endpoints and requires root. + +#### Method A: Pre-built Musl Binary (Recommended) + +```bash +VERSION=agent-v0.1.0 +ARCH=$(uname -m) + +curl -fsSL \ + "https://github.com/swar09/project-edr/releases/download/${VERSION}/aigis-zero-agent-linux-${ARCH}.tar.gz" \ + -o aigis-zero-agent.tar.gz + +tar -xzf aigis-zero-agent.tar.gz +cd aigis-zero-agent + +sudo bash install.sh +``` + +The installer handles osquery installation, directory setup, systemd unit registration, kernel tunables, and ulimits in a single run. See `agent/INSTALLATION_GUIDE.md` for the full step-by-step breakdown. 
+ +#### Method B: Build from Source + +Verify kernel prerequisites on the target endpoint: + +```bash +uname -r # >= 4.18 required, >= 5.10 recommended + +grep -E "CONFIG_BPF=y|CONFIG_BPF_SYSCALL=y" /boot/config-$(uname -r) 2>/dev/null || \ + zcat /proc/config.gz 2>/dev/null | grep -E "CONFIG_BPF=y|CONFIG_BPF_SYSCALL=y" + +ls /sys/kernel/btf/vmlinux && echo "BTF present" +``` + +Disable auditd (required — auditd and osquery compete for the audit netlink socket, which only allows one consumer): + +```bash +sudo systemctl stop auditd 2>/dev/null || true +sudo systemctl mask auditd 2>/dev/null || true +sudo systemctl mask --now systemd-journald-audit.socket +``` + +Install build dependencies: + +```bash +# Debian/Ubuntu +sudo apt-get update +sudo apt-get install -y \ + build-essential pkg-config libssl-dev \ + libsystemd-dev libaudit-dev libcap-dev \ + util-linux musl-tools + +# RHEL/Rocky/Fedora +sudo dnf install -y \ + gcc pkg-config openssl-devel \ + audit-libs-devel systemd-devel \ + util-linux-devel libcap-devel +``` + +Install Rust: + +```bash +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +source "$HOME/.cargo/env" +rustc --version # should be stable >= 1.91 +``` + +Build the agent: + +```bash +# Native build (linked against system glibc) +cargo build --release --bin edr-agent + +# Musl static build (recommended for production) +rustup target add x86_64-unknown-linux-musl +cargo build --release --target x86_64-unknown-linux-musl --bin edr-agent -# Run a specific service -cargo run -p edr-fleet-server +# aarch64 musl (requires cross) +cargo install cross --git https://github.com/cross-rs/cross +cross build --release --target aarch64-unknown-linux-musl --bin edr-agent ``` -## Docs +Install osquery 5.23.0: + +```bash +curl -fsSL https://pkg.osquery.io/linux/osquery-5.23.0_1.linux_x86_64.tar.gz \ + -o osquery-5.23.0_1.linux_x86_64.tar.gz +sudo tar -xzf osquery-5.23.0_1.linux_x86_64.tar.gz -C / + +sudo tee /etc/systemd/system/osqueryd.service << 
'EOF' +[Unit] +Description=The osquery Daemon +After=network.target syslog.target + +[Service] +Type=simple +TimeoutStartSec=0 +ExecStartPre=/bin/mkdir -p /run/osquery +ExecStart=/usr/bin/osqueryd \ + --flagfile=/etc/osquery/osquery.flags \ + --config_path=/etc/osquery/osquery.conf +Restart=on-failure +KillMode=control-group + +[Install] +WantedBy=multi-user.target +EOF +``` + +Install the agent binary, directories, configs, and systemd units: + +```bash +sudo install -o root -g root -m 0755 \ + target/x86_64-unknown-linux-musl/release/edr-agent \ + /usr/sbin/aigis-zero + +sudo mkdir -p /etc/aigis-zero /var/lib/aigis-zero /var/log/aigis-zero +sudo chmod 700 /etc/aigis-zero /var/lib/aigis-zero +sudo chmod 755 /var/log/aigis-zero + +sudo install -o root -g root -m 640 agent/agent.toml /etc/aigis-zero/config.toml +# Edit config to set the fleet server host and port +sudo nano /etc/aigis-zero/config.toml + +sudo install -o root -g root -m 644 \ + agent/sysctl/60-aigis-zero.conf /etc/sysctl.d/ +sudo sysctl --system + +sudo install -o root -g root -m 644 \ + agent/limits/99-aigis-zero.conf /etc/security/limits.d/ + +sudo mkdir -p /etc/osquery /var/osquery /var/log/osquery /run/osquery +sudo chmod 755 /etc/osquery && sudo chmod 750 /var/osquery && sudo chmod 755 /var/log/osquery /run/osquery + +sudo install -o root -g root -m 644 agent/osquery/osquery.conf /etc/osquery/osquery.conf +sudo install -o root -g root -m 644 agent/osquery/osquery.flags /etc/osquery/osquery.flags +sudo touch /etc/osquery/extensions.load && sudo chmod 644 /etc/osquery/extensions.load + +sudo install -o root -g root -m 644 \ + agent/systemd/aigis-zero.service /etc/systemd/system/ + +sudo mkdir -p /etc/systemd/system/osqueryd.service.d +sudo install -o root -g root -m 644 \ + agent/systemd/osqueryd.service.d/aigis-zero.conf \ + /etc/systemd/system/osqueryd.service.d/aigis-zero.conf + +sudo systemctl daemon-reload +sudo systemctl enable osqueryd aigis-zero +sudo systemctl start osqueryd +sudo 
systemctl start aigis-zero + +sudo systemctl status osqueryd +sudo systemctl status aigis-zero +``` + +Agent configuration reference (`/etc/aigis-zero/config.toml`): + +```toml +[agent] +log_level = "info" # trace | debug | info | warn | error +log_format = "json" # json | human +log_dir = "/var/log/aigis-zero" +data_dir = "/var/lib/aigis-zero" +event_buffer_db = "/var/lib/aigis-zero/events.db" +event_buffer_max = 500000 # max buffered events before oldest-drop +event_drain_batch = 100 +event_drain_interval_secs = 5 + +[osquery] +socket_path = "/var/osquery/osquery.em" +conf_path = "/etc/osquery/osquery.conf" +flags_path = "/etc/osquery/osquery.flags" +connect_timeout_secs = 30 +query_timeout_secs = 60 + +[fleet] +host = "" +port = 50051 +heartbeat_interval_secs = 60 +reconnect_interval_secs = 10 +max_reconnect_attempts = 0 # 0 = retry forever + +[isolation] +enabled = false # toggled by fleet-server IsolateCommand +``` + +Service management: + +```bash +# Both services are fully independent — stopping one does not affect the other +systemctl status osqueryd +systemctl status aigis-zero + +journalctl -u osqueryd -f +journalctl -u aigis-zero -f + +systemctl stop osqueryd # aigis-zero continues buffering normally +systemctl stop aigis-zero # osqueryd continues collecting normally +``` + +Uninstall: + +```bash +# Method A: using the installer script +sudo bash uninstall.sh +sudo bash uninstall.sh --remove-osquery --purge-logs # full purge + +# Method B: manual +sudo systemctl stop aigis-zero osqueryd +sudo systemctl disable aigis-zero osqueryd +sudo rm -f /usr/sbin/aigis-zero +sudo rm -rf /etc/aigis-zero /var/lib/aigis-zero +sudo rm -f /etc/systemd/system/aigis-zero.service +sudo rm -f /etc/systemd/system/osqueryd.service.d/aigis-zero.conf +sudo rm -f /etc/sysctl.d/60-aigis-zero.conf +sudo rm -f /etc/security/limits.d/99-aigis-zero.conf +sudo rm -f /etc/osquery/osquery.conf /etc/osquery/osquery.flags /etc/osquery/extensions.load +sudo rm -rf /var/osquery /run/osquery 
+sudo systemctl daemon-reload +``` + +Troubleshooting: + +| Symptom | Likely cause | Resolution | +|---|---|---| +| `osqueryd: perf_event_open failed` | eBPF not enabled or kernel too old | Verify `uname -r` >= 4.18 and `CONFIG_BPF_SYSCALL=y` | +| `file_events` table returns empty | inotify watch limit too low | `sudo sysctl -w fs.inotify.max_user_watches=524288` | +| `aigis-zero: connection refused` on osquery socket | osqueryd still starting | Wait for `Extension manager started` in `journalctl -u osqueryd` | +| `Permission denied on /var/osquery` | Directory ownership incorrect | `sudo chown -R root:root /etc/osquery /var/osquery && sudo chmod 750 /var/osquery` | +| `cargo build` fails, `DATABASE_URL not set` | sqlx compile-time check | Export `DATABASE_URL` pointing to the nodes DB or set `SQLX_OFFLINE=true` | + +--- + +### 4. Running Services + +```bash +# Fleet server +export DATABASE_URL=postgres://edr:changeme@localhost:5433/edr_nodes +export KAFKA_BROKERS=localhost:29092 +cargo run -p fleet-server-bin + +# Kafka pipeline +export KAFKA_BROKERS=localhost:29092 +cargo run -p kafka-pipeline + +# Rule engine (stub) +cargo run -p rule-engine + +# API backend (stub) +cargo run -p api-backend +``` + +--- + +### 5. Frontend + +```bash +cd frontend +npm install +npm run dev # development server at http://localhost:5173 +npm run build # production build to frontend/dist/ +``` + +--- + +## Upcoming Features + +**mTLS (agent to fleet-server).** The config scaffolding and cert paths exist in `agent.toml` and the fleet-server settings. The next step is wiring the TLS handshake in the Tonic channel builder on the agent side and configuring `tonic` server TLS on the fleet side. The design target is enrollment-issued certificates: each agent gets a short-lived cert signed by the fleet CA during `RegisterAgent`. 
+ +**eBPF collector.** The `agent/crates/ebpf-collector` crate is excluded from the default workspace build because it requires a kernel with BTF and the `aya` build toolchain. When this ships, the agent will collect process, network, and filesystem events directly from the kernel via eBPF programs, removing the dependency on osquery's audit-based collection and lifting the single-consumer constraint on the audit netlink socket. + +**Rule engine full implementation.** YARA-X rule loading from the filesystem, consumer group wiring, alert production to `aigis.alerts`, and PostgreSQL persistence. The binary compiles; the business logic is the active workstream. + +**API backend routes.** Full REST surface: node listing, node detail, alert query with filtering by severity and MITRE technique, and node isolation command forwarding to the fleet-server. WebSocket handler for live event streaming. + +**Frontend WebSocket integration.** The dashboard shell is in place. Connecting it to the api-backend WebSocket for live node status and alert feed is the active frontend workstream. + +**Kafka normalisation and DB persistence.** The event router is live. The next stage is the normalisation processor: consuming from typed topics, deserialising event payloads, and writing structured rows to `edr_logs`. + +**ML-based anomaly detection.** The `Alert` proto already carries a `source` field for `ml_model`. A future workstream will add a statistical baseline model for process execution frequency and network behaviour, producing anomaly alerts alongside YARA rule hits. + +**Enrollment secret validation.** The `enrollment_secret` field exists in `agent.toml` and the `RegisterRequest` proto. Fleet-server-side validation is not yet implemented. + +**Multi-tenancy.** The current data model is single-tenant. Organisation-scoped node isolation and role-based operator access are planned. + +**Windows agent.** The current agent is Linux-only. 
Windows support via ETW (Event Tracing for Windows) is on the long-term roadmap with no scheduled timeline. + +--- + +## References + +- [osquery documentation](https://osquery.readthedocs.io/) +- [aya — eBPF for Rust](https://aya-rs.dev/) +- [Tonic — gRPC for Rust](https://github.com/hyperium/tonic) +- [YARA-X](https://github.com/VirusTotal/yara-x) +- [MITRE ATT&CK Framework](https://attack.mitre.org/) +- [Apache Kafka documentation](https://kafka.apache.org/documentation/) +- [sqlx — async Rust SQL](https://github.com/launchbadge/sqlx) +- [Axum — async web framework](https://github.com/tokio-rs/axum) +- [rdkafka — Rust Kafka client](https://github.com/fede1024/rust-rdkafka) +- [nftables documentation](https://wiki.nftables.org/) +- [Tokio async runtime](https://tokio.rs/) + +--- + +## License + +This project is licensed under the [MIT License](LICENSE). + +``` +MIT License + +Copyright (c) 2025 Swar (@swar09) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+``` + +--- + +## Contributing + +Contributions are welcome. The bar is: zero clippy warnings, code formatted with `rustfmt`, and all tests passing. + +Before opening a pull request: + +```bash +cargo fmt --all +cargo clippy --workspace --all-targets -- -D warnings +cargo test --workspace +``` + +For non-trivial changes, open an issue first to align on the approach. Changes to the agent, fleet-server auth paths, or the isolation module warrant design discussion before implementation — these components touch the security-critical surface area of the system. + +Branch naming: +- `feat/` for new features +- `fix/` for bug fixes +- `chore/` for dependency updates, tooling, CI +- `agent/` for agent-specific work + +The `main` branch is the stable reference. Active development happens on feature branches and is merged via pull request. + +--- + +## Contributors + + + + + +
+**Swar** (@swar09) — Author & Maintainer +
+ +--- -- [Implementation Guide](EDR_IMPLEMENTATION_GUIDE.md) -- [Timeline](TIMELINE.md) -- [Test Plan](tests/TEST_PLAN.md) +*Crafted in Rust. Full-stack ownership, zero compromise.* diff --git a/agent/.env.example b/agent/.env.example new file mode 100644 index 0000000..ee13f42 --- /dev/null +++ b/agent/.env.example @@ -0,0 +1,7 @@ +# EDR Agent Environment Variables + +# Config file path +EDR_AGENT_CONFIG=/etc/aigis-zero/config.toml + +# Logging +RUST_LOG=info diff --git a/agent/agent.toml b/agent/agent.toml index 689e743..5e4f168 100644 --- a/agent/agent.toml +++ b/agent/agent.toml @@ -2,7 +2,7 @@ # node_id is filled in after enrollment by fleet server # node_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" name = "" # auto-set from hostname on first run -log_level = "info" # trace | debug | info | warn | error +log_level = "debug" # trace | debug | info | warn | error log_format = "json" # json | human log_dir = "/var/log/aigis-zero" data_dir = "/var/lib/aigis-zero" @@ -11,20 +11,21 @@ event_buffer_max = 500000 # max events in SQLite before oldest are dro event_drain_batch = 100 # rows per flush cycle event_drain_interval_secs = 5 # how often to attempt shipping buffered events +node_id = "e38eb47e-6dd5-4ae0-b576-17b65dbf0eed" [osquery] socket_path = "/var/osquery/osquery.em" conf_path = "/etc/osquery/osquery.conf" flags_path = "/etc/osquery/osquery.flags" -pid_file = "/run/osquery/osqueryd.pid" +pid_file = "/var/osquery/osqueryd.pidfile" log_path = "/var/log/osquery" connect_timeout_secs = 30 query_timeout_secs = 60 [fleet] # Fleet server connection -host = "fleet.example.com" # fleet server hostname or IP -port = 8443 # fleet server gRPC port -endpoint = "https://fleet.example.com:8443" # derived, kept for convenience +host = "127.0.0.1" # fleet server hostname or IP +port = 50051 # fleet server gRPC port +endpoint = "http://127.0.0.1:50051" # derived, kept for convenience enrollment_secret = "" # pre-shared secret (scaffold, implemented later) # TLS (scaffold only 
— cert paths populated after enrollment) @@ -45,4 +46,4 @@ port = 9100 # agent metrics/health port (localhost only) [isolation] enabled = false # toggled by fleet server command fleet_ip = "" # auto-resolved from fleet.host -fleet_port = 8443 +fleet_port = 8443 \ No newline at end of file diff --git a/agent/crates/agent-bin/Cargo.toml b/agent/crates/agent-bin/Cargo.toml index 4601242..3c569de 100644 --- a/agent/crates/agent-bin/Cargo.toml +++ b/agent/crates/agent-bin/Cargo.toml @@ -17,3 +17,14 @@ clap = { version = "4", features = ["derive", "env"] } sd-notify = "0.4" libc = "0.2" toml = { workspace = true } +hostname = "0.4" +fleet-client = { workspace = true } +event-buffer = { workspace = true } +osquery-client = { workspace = true } +edr-sdk = { workspace = true } +serde_json = { workspace = true } +uuid = { workspace = true } +agent-tracing = { workspace = true } +isolation = { workspace = true } +tokio-util = { workspace = true } +dotenvy = "0.15" diff --git a/agent/crates/agent-bin/src/main.rs b/agent/crates/agent-bin/src/main.rs index d7f9d5f..b85b43a 100644 --- a/agent/crates/agent-bin/src/main.rs +++ b/agent/crates/agent-bin/src/main.rs @@ -1,13 +1,33 @@ +#![allow(unused_imports, unused_variables, dead_code, unused_mut)] use clap::Parser; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; +use std::sync::Arc; use std::time::Duration; +use tokio::sync::Mutex; use tokio::time::interval; +use tracing::{error, info, warn}; +use uuid::Uuid; + +use agent_core::config::AgentConfig; +pub use agent_core::orchestrator::{get_os_version, read_machine_id}; +use edr_sdk::models::event::EventBatch; +use edr_sdk::models::heartbeat::HeartbeatRequest; +use edr_sdk::proto::fleet::RegisterRequest; +use fleet_client::FleetClient; + +use std::fs::File; +use std::io::{BufRead, BufReader}; #[derive(Parser, Debug)] #[command(name = "aigis-zero", version, about = "Aigis-Zero Agent")] struct Args { /// Config path - #[arg(short, long, default_value = 
"/etc/aigis-zero/config.toml")] + #[arg( + short, + long, + env = "EDR_AGENT_CONFIG", + default_value = "/etc/aigis-zero/config.toml" + )] config: PathBuf, /// Validate config and exit @@ -19,8 +39,54 @@ struct Args { enroll: bool, } +fn save_node_id_to_config(path: &Path, node_id: Uuid) -> anyhow::Result<()> { + let content = std::fs::read_to_string(path)?; + let mut lines: Vec = content.lines().map(String::from).collect(); + + let mut in_agent = false; + let mut inserted = false; + for i in 0..lines.len() { + if lines[i].trim() == "[agent]" { + in_agent = true; + continue; + } + if in_agent && lines[i].starts_with("node_id") { + lines[i] = format!("node_id = \"{}\"", node_id); + inserted = true; + break; + } + if in_agent && lines[i].starts_with('[') { + lines.insert(i, format!("node_id = \"{}\"", node_id)); + inserted = true; + break; + } + } + if in_agent && !inserted { + lines.push(format!("node_id = \"{}\"", node_id)); + } + std::fs::write(path, lines.join("\n"))?; + Ok(()) +} + +fn parse_endpoint(endpoint: &str) -> (std::net::IpAddr, u16) { + let clean = endpoint + .trim_start_matches("http://") + .trim_start_matches("https://"); + let host_port = clean.split('/').next().unwrap_or(clean); + let parts: Vec<&str> = host_port.split(':').collect(); + let ip_str = parts.first().copied().unwrap_or("127.0.0.1"); + let ip_str = ip_str.trim_start_matches('[').trim_end_matches(']'); + let ip = ip_str + .parse() + .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::new(127, 0, 0, 1))); + let port = parts.get(1).and_then(|p| p.parse().ok()).unwrap_or(50051); + (ip, port) +} + #[tokio::main] async fn main() -> anyhow::Result<()> { + dotenvy::dotenv().ok(); + // 1. Parse CLI let args = Args::parse(); @@ -30,34 +96,191 @@ async fn main() -> anyhow::Result<()> { std::process::exit(1); } - // 3. 
--check mode + // Parse config + let config_str = std::fs::read_to_string(&args.config).map_err(|e| { + anyhow::anyhow!( + "Failed to read config file at {}: {}", + args.config.display(), + e + ) + })?; + let mut config: AgentConfig = toml::from_str(&config_str) + .map_err(|e| anyhow::anyhow!("Failed to parse TOML config: {}", e))?; + if args.check { - let config_str = std::fs::read_to_string(&args.config).map_err(|e| { - anyhow::anyhow!( - "Failed to read config file at {}: {}", - args.config.display(), - e - ) - })?; - // Just parse it as TOML to ensure syntax is valid - let _parsed: toml::Value = toml::from_str(&config_str) - .map_err(|e| anyhow::anyhow!("Failed to parse TOML config: {}", e))?; println!("Config syntax is valid."); - std::process::exit(0); + let report = agent_core::preflight::run_preflight(&config); + report.print(); + if report.is_ok() { + println!("Environment checks passed. Ready for deployment."); + std::process::exit(0); + } else { + eprintln!( + "Environment checks failed. Please resolve the errors above before deploying." + ); + std::process::exit(1); + } } - // Pass config path via env var since orchestrator uses it - unsafe { - std::env::set_var("EDR_AGENT_CONFIG", args.config.to_str().unwrap()); - } + let format = match config.agent.log_format.as_str() { + "json" => agent_tracing::LogFormat::Json, + _ => agent_tracing::LogFormat::Human, + }; + agent_tracing::init(&config.agent.log_level, format)?; + info!("Starting Aigis-Zero Agent"); - // 6. 
Install panic hook + // Install panic hook std::panic::set_hook(Box::new(|panic_info| { - // Log to stderr explicitly in case tracing isn't set up yet eprintln!("Agent panicked: {}", panic_info); let _ = sd_notify::notify(false, &[sd_notify::NotifyState::Status("Agent panicked")]); })); + // Create OsqueryClient and connect + let collector = osquery_client::OsqueryCollector::new(osquery_client::OsqueryConfig { + socket_path: config.osquery.socket_path.clone(), + db_path: config.agent.event_buffer_db.clone(), + }) + .await?; + + // Create EventBuffer + let buffer = event_buffer::EventBuffer::new( + &config.agent.event_buffer_db, + config.agent.event_buffer_max, + )?; + let buffer = Arc::new(buffer); + + // Connect to fleet server + let mut fleet = FleetClient::new(config.fleet.endpoint.clone()); + fleet + .connect_with_retry( + config.fleet.max_reconnect_attempts, + Duration::from_secs(config.fleet.reconnect_interval_secs), + None, + ) + .await?; + + // Enrollment + let enrollment = fleet + .enroll(RegisterRequest { + hostname: hostname::get()?.to_string_lossy().to_string(), + os_version: get_os_version(), + agent_version: env!("CARGO_PKG_VERSION").to_string(), + machine_id: read_machine_id(), + }) + .await?; + + let node_id = Uuid::parse_str(&enrollment.node_id).unwrap_or_default(); + let token = enrollment.token; + + // Save node_id to config file if it has changed or force enrollment is requested + if config.agent.node_id != Some(node_id) || args.enroll { + save_node_id_to_config(&args.config, node_id)?; + config.agent.node_id = Some(node_id); + } + + info!(%node_id, "Successfully enrolled/loaded node ID"); + + // Establish the authenticated event stream connection with the token + fleet + .connect_with_retry( + config.fleet.max_reconnect_attempts, + Duration::from_secs(config.fleet.reconnect_interval_secs), + Some(&token), + ) + .await?; + + let fleet = Arc::new(Mutex::new(fleet)); + + // Start AgentCore (osquery loop + command listener) + let agent_uuid = 
node_id.to_string(); + let osquery_collector = Arc::new(collector); + let (fleet_ip, fleet_port) = parse_endpoint(&config.fleet.endpoint); + + let command_handler = agent_core::command_handler::CommandHandler { + osquery: osquery_collector.clone(), + isolation: isolation::IsolationManager::new(fleet_ip, fleet_port), + }; + + let agent_core = agent_core::AgentCore { + shutdown: tokio_util::sync::CancellationToken::new(), + osquery: osquery_collector, + buffer: buffer.clone(), + command_handler: Arc::new(command_handler), + fleet_client: fleet.clone(), + }; + + tokio::spawn(async move { + let _ = agent_core.run(&agent_uuid).await; + }); + + // Start heartbeat loop (every heartbeat_interval_secs) + let fleet_hb = fleet.clone(); + let hb_interval = config.fleet.heartbeat_interval_secs; + let hb_buffer = buffer.clone(); + let hb_node_id = node_id.to_string(); + tokio::spawn(async move { + let mut ticker = interval(Duration::from_secs(hb_interval)); + loop { + ticker.tick().await; + + let count = hb_buffer.len().await.unwrap_or(0) as i64; + let req = HeartbeatRequest { + node_id: hb_node_id.clone(), + status: "healthy".to_string(), + events_buffered: count, + }; + + let mut f = fleet_hb.lock().await; + if let Err(e) = f.heartbeat(&req).await { + warn!("Heartbeat failed: {}", e); + } + } + }); + + // Start event drain loop (every event_drain_interval_secs) + let fleet_drain = fleet.clone(); + let drain_interval = config.agent.event_drain_interval_secs; + let batch_size = config.agent.event_drain_batch; + tokio::spawn(async move { + let mut ticker = interval(Duration::from_secs(drain_interval)); + loop { + ticker.tick().await; + if let Ok(events) = buffer.drain(batch_size as usize).await + && !events.is_empty() + { + let parsed_events = events + .iter() + .filter_map(|e| serde_json::from_str::(e).ok()) + .collect::>(); + + if !parsed_events.is_empty() { + let batch = EventBatch { + node_id, + events: parsed_events, + }; + let mut f = fleet_drain.lock().await; + match 
f.send_events(&batch).await { + Ok(ack) if ack.success => { + // Successfully sent + } + Ok(ack) => { + warn!(error = ?ack.error, "Fleet rejected event batch, re-queuing"); + for event in events { + let _ = buffer.push(event).await; + } + } + Err(e) => { + warn!(?e, "Failed to send events to fleet, re-queuing"); + for event in events { + let _ = buffer.push(event).await; + } + } + } + } + } + } + }); + // Watchdog task tokio::spawn(async { let mut ticker = interval(Duration::from_secs(15)); @@ -67,14 +290,13 @@ async fn main() -> anyhow::Result<()> { } }); - // Notify ready let _ = sd_notify::notify(false, &[sd_notify::NotifyState::Ready]); - // 10. Call orchestrator - let res = agent_core::orchestrator::run().await; + // Wait for shutdown signal + let _ = tokio::signal::ctrl_c().await; + info!("Ctrl-C received, shutting down"); - // Notify stopping let _ = sd_notify::notify(false, &[sd_notify::NotifyState::Stopping]); - res + Ok(()) } diff --git a/agent/crates/agent-core/Cargo.toml b/agent/crates/agent-core/Cargo.toml index c04b4c5..19c8f22 100644 --- a/agent/crates/agent-core/Cargo.toml +++ b/agent/crates/agent-core/Cargo.toml @@ -6,6 +6,7 @@ rust-version.workspace = true [dependencies] tokio = { workspace = true } +tokio-util = { workspace = true } anyhow = { workspace = true } tracing = { workspace = true } serde = { workspace = true } @@ -20,4 +21,6 @@ osquery-client = { workspace = true } event-buffer = { workspace = true } hostname = "0.4" notify = { version = "6", features = ["macos_fsevent"] } - +edr-sdk = { workspace = true } +isolation = { workspace = true } +libc = "0.2" diff --git a/agent/crates/agent-core/src/command_handler.rs b/agent/crates/agent-core/src/command_handler.rs new file mode 100644 index 0000000..2b2f1c8 --- /dev/null +++ b/agent/crates/agent-core/src/command_handler.rs @@ -0,0 +1,35 @@ +#![allow(unused_imports, unused_variables, dead_code, unused_mut)] +use edr_sdk::proto::fleet::{ServerCommand, server_command::Command}; +use 
isolation::IsolationManager; +use osquery_client::OsqueryCollector; +use serde_json::Value; +use std::sync::Arc; +use tracing::{info, warn}; + +pub struct CommandHandler { + pub osquery: Arc, + pub isolation: IsolationManager, +} + +impl CommandHandler { + pub async fn handle(&self, msg: ServerCommand) -> Result { + let command = msg.command.ok_or("missing command")?; + + match command { + Command::Isolate(iso) => { + if iso.isolate { + self.isolation.isolate().await.map_err(|e| e.to_string())?; + Ok(serde_json::json!({"status": "isolated"})) + } else { + self.isolation + .de_isolate() + .await + .map_err(|e| e.to_string())?; + Ok(serde_json::json!({"status": "unisolated"})) + } + } + Command::ConfigUpdate(_cfg) => Ok(serde_json::json!({"status": "config_updated"})), + Command::Ack(_) => Ok(serde_json::json!({"status": "acked"})), + } + } +} diff --git a/agent/crates/agent-core/src/config.rs b/agent/crates/agent-core/src/config.rs index 32a8877..6d6c4fc 100644 --- a/agent/crates/agent-core/src/config.rs +++ b/agent/crates/agent-core/src/config.rs @@ -94,7 +94,7 @@ port = 9100 socket_path = "/var/osquery/osquery.em" conf_path = "/etc/osquery/osquery.conf" flags_path = "/etc/osquery/osquery.flags" -pid_file = "/run/osquery/osqueryd.pid" +pid_file = "/var/osquery/osqueryd.pidfile" log_path = "/var/log/osquery" connect_timeout_secs = 30 query_timeout_secs = 60 diff --git a/agent/crates/agent-core/src/lib.rs b/agent/crates/agent-core/src/lib.rs index b2b1067..0811ddc 100644 --- a/agent/crates/agent-core/src/lib.rs +++ b/agent/crates/agent-core/src/lib.rs @@ -1,2 +1,486 @@ +#![allow(unused_imports, unused_variables, dead_code, unused_mut)] +pub mod command_handler; pub mod config; pub mod orchestrator; +pub mod preflight; + +use anyhow::Result; +use std::sync::Arc; +use std::time::Duration; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, warn}; +use uuid::Uuid; + +use command_handler::CommandHandler; +use event_buffer::EventBuffer; +use 
fleet_client::FleetClient; +use fleet_client::types::{AgentEvent, EventType}; +use osquery_client::OsqueryCollector; + +/// Maximum number of consecutive `receive()` errors before the command +/// listener backs off to the maximum delay ceiling. +const CMD_MAX_BACKOFF_ERRORS: u32 = 8; + +/// Starting backoff delay on a transport error (50 ms). +const CMD_BACKOFF_BASE_MS: u64 = 50; + +/// Ceiling for exponential backoff (≈ 12.8 s). +const CMD_BACKOFF_CEILING_MS: u64 = 12_800; + +/// How long the command-listener task sleeps between `try_receive` polls +/// when the channel is empty but healthy. Keeps CPU near zero while still +/// allowing other tasks to acquire the Mutex within one tick (~5 ms). +const CMD_POLL_INTERVAL_MS: u64 = 5; + +pub struct AgentCore { + pub shutdown: CancellationToken, + pub osquery: Arc, + pub buffer: Arc, + pub command_handler: Arc, + pub fleet_client: Arc>, +} + +impl AgentCore { + /// Start all background tasks and block until the shutdown token fires. + /// + /// # Parameters + /// - `agent_uuid`: The node UUID assigned during enrollment. Passed into + /// `OsqueryCollector::start` so that every `OsqueryResult` carries the + /// correct identity before it is serialised into an `AgentEvent`. + pub async fn run(&self, agent_uuid: &str) -> Result<()> { + let shutdown = self.shutdown.clone(); + + // TASK 1: OSQuery Polling and Buffering Loop + // + // `OsqueryCollector::start` spawns its own internal scheduler task + // and returns the *consumer* end of an MPSC channel (buffer = 100). + // We own the Receiver here; the scheduler task holds the Sender. + // + // Backpressure: when `buffer.push()` is slow (SQLite lock contention) + // the Tokio MPSC back-pressure naturally throttles the scheduler + // because its sends will block once the channel fills to 100 items. + // We do *not* need an additional semaphore here. 
+ let mut results_rx = self.osquery.start(agent_uuid).await; + + let buffer_task = self.buffer.clone(); + let agent_uuid_owned = agent_uuid.to_string(); + let shutdown_osq = shutdown.clone(); + + let osquery_task = tokio::spawn(async move { + info!(agent_uuid = %agent_uuid_owned, "OSQuery polling task started"); + + loop { + tokio::select! { + // Biased select: check shutdown first so we exit promptly + // even when events are arriving continuously. + biased; + + _ = shutdown_osq.cancelled() => { + info!("OSQuery polling task: shutdown signal received, draining remaining events"); + + // Graceful drain + // Consume whatever is already sitting in the MPSC + // buffer so we do not lose events that the scheduler + // already produced before the token fired. + while let Ok(result) = results_rx.try_recv() { + if let Some(json) = encode_osquery_result(&result) + && let Err(e) = buffer_task.push(json).await { + error!(error = %e, "Failed to buffer OSQuery result during shutdown drain"); + } + } + break; + } + + // Normal path: block until the next OsqueryResult arrives + // or the sender side drops (collector task exited). + result = results_rx.recv() => { + match result { + None => { + // Sender dropped: scheduler task exited unexpectedly. + warn!("OSQuery collector channel closed; polling task exiting"); + break; + } + Some(osq_result) => { + // Serialisation + // `encode_osquery_result` handles both + // serde errors and clock-jump edge cases + // internally; it never panics. + let Some(event_json) = encode_osquery_result(&osq_result) else { + // Error already logged inside helper + continue; + }; + + debug!( + query = %osq_result.query_name, + rows = osq_result.rows.len(), + "Buffering OSQuery event" + ); + + // Buffer push + // `EventBuffer::push` offloads the actual + // SQLite INSERT onto `spawn_blocking`, so + // this await yields the async thread back to + // Tokio for the duration of the disk I/O. 
+ // SQLite lock contention → the task waits + // inside spawn_blocking without consuming + // an async worker thread. + if let Err(e) = buffer_task.push(event_json).await { + // Do NOT crash the loop. Log and continue + // so that a transient WAL-lock burst + // does not drop the entire stream. + error!( + query = %osq_result.query_name, + error = %e, + "Failed to push OSQuery event to buffer; event dropped" + ); + } + } + } + } + } + } + + info!("OSQuery polling task exited cleanly"); + }); + + // TASK 2: Command Listener Loop + // + // LOCK CONTENTION DESIGN — why we do NOT do: + // + // loop { let mut c = fleet.lock().await; c.receive().await; } + // + // That pattern holds the Mutex for the *entire* duration of the + // blocking `recv()` inside `receive()`, which can be seconds or + // minutes between commands. While the Mutex is held, the heartbeat + // task and the event-drain task cannot acquire it, causing starvation. + // + // SOLUTION — cooperative non-blocking poll: + // + // 1. Lock the Mutex. + // 2. Call `try_receive()` — returns immediately (no `.await`). + // 3. Unlock the Mutex (lock guard drops at end of block). + // 4. If a message was ready → process it (no lock needed). + // 5. If the channel was empty → sleep CMD_POLL_INTERVAL_MS (5 ms) + // then go to step 1. During that sleep every other task can + // freely acquire the lock. + // + // This keeps peak latency < 5 ms for commands while consuming + // essentially zero CPU when no commands are arriving. + // + // ALTERNATIVE: Extract `inbound_rx` out of FleetClient into its own + // `Arc>>` so the command listener can + // hold that lock (cheap) without blocking the send path. That is the + // architecturally cleanest solution but requires a larger refactor; + // it is tracked as a follow-up TODO below. 
+ // + // TODO(follow-up): Split FleetClient into `FleetSink` (outbound_tx, + // methods: send/enroll/heartbeat/send_events) and `FleetSource` + // (inbound_rx, method: receive) so the two halves can be locked + // independently, eliminating the poll interval entirely. + + let cmd_handler = self.command_handler.clone(); + let fleet = self.fleet_client.clone(); + let shutdown_cmd = shutdown.clone(); + + let command_task = tokio::spawn(async move { + info!("Command listener task started"); + + // Consecutive transport error counter for backoff. + let mut consecutive_errors: u32 = 0; + + loop { + // Check shutdown first (biased) + if shutdown_cmd.is_cancelled() { + info!("Command listener task: shutdown signal received, exiting"); + break; + } + + // Non-blocking poll (lock held < 1 µs) + // + // We acquire the lock, call try_receive (synchronous, no + // await), then immediately drop the guard. The total time + // the Mutex is held equals one MPSC `try_recv` call which + // is O(1) and lock-free on the happy path. + let poll_result = { + let mut client = fleet.lock().await; + client.try_receive() + // `client` guard drops here — Mutex released. + }; + + match poll_result { + // No message yet + Ok(None) => { + // Reset error counter: the transport is healthy. + consecutive_errors = 0; + + // Yield to the scheduler for one poll interval. + // Other tasks (heartbeat, drain) can acquire the + // fleet_client Mutex during this sleep. + tokio::select! { + biased; + _ = shutdown_cmd.cancelled() => { + info!("Command listener task: shutdown during poll sleep"); + break; + } + _ = tokio::time::sleep( + Duration::from_millis(CMD_POLL_INTERVAL_MS)) => {} + } + } + + // Message received + Ok(Some(msg)) => { + consecutive_errors = 0; + + debug!(command = ?msg.command, "Received ServerCommand"); + + // Dispatch to CommandHandler. The handler is + // Arc-wrapped and does not need the fleet_client + // lock, so we process the command without holding + // any mutex. 
+ match cmd_handler.handle(msg).await { + Ok(response) => { + debug!(response = ?response, "Command handled successfully"); + } + Err(e) => { + warn!(error = %e, "CommandHandler returned error; continuing"); + } + } + } + + // Transport / channel error + Err(e) => { + consecutive_errors = consecutive_errors.saturating_add(1); + + // Exponential back-off: 50 ms → 100 → 200 → … → 12 800 ms + let backoff_ms = (CMD_BACKOFF_BASE_MS + * 2u64.pow(consecutive_errors.min(CMD_MAX_BACKOFF_ERRORS))) + .min(CMD_BACKOFF_CEILING_MS); + + error!( + error = %e, + consecutive = consecutive_errors, + backoff_ms = backoff_ms, + "Command listener: transport error; backing off" + ); + + // Attempt to reconnect to the fleet server using the stored token + { + let mut client = fleet.lock().await; + let token = client.token().map(|s| s.to_string()); + if let Err(reconnect_err) = client.connect(token.as_deref()).await { + warn!(error = %reconnect_err, "Failed to reconnect to fleet server"); + } else { + info!("Successfully re-established connection to fleet server"); + consecutive_errors = 0; + } + } + + // Respect shutdown even during backoff sleep. + tokio::select! { + biased; + _ = shutdown_cmd.cancelled() => { + info!("Command listener task: shutdown during error backoff"); + break; + } + _ = tokio::time::sleep(Duration::from_millis(backoff_ms)) => {} + } + } + } + } + + info!("Command listener task exited cleanly"); + }); + + // Wait for shutdown signal + shutdown.cancelled().await; + info!("AgentCore: shutdown token fired, awaiting task cleanup"); + + // Give tasks up to 5 s to finish their graceful drain / backoff + // sleep before we hard-abort them. In practice both tasks react + // to the cancellation token within one poll cycle (≤ 5 ms for the + // command task, ≤ one SQLite round-trip for the osquery task). 
+ let grace = Duration::from_secs(5); + let _ = tokio::time::timeout(grace, osquery_task).await; + let _ = tokio::time::timeout(grace, command_task).await; + + info!("AgentCore: all tasks exited, shutdown complete"); + Ok(()) + } +} + +// Helper — OsqueryResult to AgentEvent JSON + +/// Converts an `OsqueryResult` into a JSON string suitable for `EventBuffer::push`. +/// +/// Returns `None` (and logs a warning) if serialisation fails so the caller +/// can continue without crashing the polling loop. +/// +/// Clock-jump robustness: `OsqueryResult::timestamp_ns` is produced by the +/// scheduler using `chrono::Utc::now().timestamp_nanos_opt()`. If the +/// system clock jumps backwards (NTP step, VM snapshot restore), the +/// timestamp will appear to go backwards. We do *not* try to correct this +/// here — doing so correctly requires a monotonic clock that maps to wall +/// time, which is scheduler-level logic. Instead we propagate whatever the +/// scheduler produced and let the server-side pipeline de-duplicate by +/// `sequence_id` (UUID v4) rather than timestamp ordering. +fn encode_osquery_result(result: &osquery_client::types::OsqueryResult) -> Option { + // Serialise the full OsqueryResult as the event payload. + let payload = match serde_json::to_value(result) { + Ok(v) => v, + Err(e) => { + warn!( + query = %result.query_name, + error = %e, + "Failed to serialise OsqueryResult payload; event dropped" + ); + return None; + } + }; + + let event = AgentEvent { + node_id: result.agent_uuid.clone(), + event_type: EventType::Osquery as i32, + payload, + // Pass through the scheduler's nanosecond timestamp verbatim. + // The scheduler already checked `timestamp_nanos_opt()`; if it + // returned None it will have substituted 0, which is detectable + // by the server as a sentinel value. 
+ timestamp_ns: result.timestamp_ns, + sequence_id: Uuid::new_v4().to_string(), + }; + + match serde_json::to_string(&event) { + Ok(json) => Some(json), + Err(e) => { + warn!( + query = %result.query_name, + error = %e, + "Failed to serialize AgentEvent to JSON; event dropped" + ); + None + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use edr_sdk::proto::fleet::{ + AckCommand, ConfigUpdateCommand, ServerCommand, server_command::Command, + }; + use osquery_client::types::{ColumnEntry, OsqueryResult, OsqueryResultRow, ResultAction}; + use std::path::PathBuf; + + #[test] + fn test_osquery_result_encoding_happy_path() { + let result = OsqueryResult { + query_name: "test_query".to_string(), + agent_uuid: "test-agent-123".to_string(), + timestamp_ns: 1718660000000000000, + rows: vec![OsqueryResultRow { + columns: vec![ColumnEntry { + name: "col1".to_string(), + value: "val1".to_string(), + }], + }], + action: ResultAction::Snapshot, + }; + + let encoded = encode_osquery_result(&result); + assert!(encoded.is_some()); + + let json_str = encoded.unwrap(); + let event: serde_json::Value = serde_json::from_str(&json_str).unwrap(); + + assert_eq!(event["node_id"], "test-agent-123"); + assert_eq!(event["event_type"], EventType::Osquery as i32); + assert_eq!(event["timestamp_ns"], 1718660000000000000i64); + + let payload = &event["payload"]; + assert_eq!(payload["query_name"], "test_query"); + assert_eq!(payload["action"], "SNAPSHOT"); + } + + #[tokio::test] + async fn test_command_handling_ack() { + let collector = OsqueryCollector::new(osquery_client::OsqueryConfig { + socket_path: PathBuf::from("/tmp/osquery-test.em"), + db_path: PathBuf::from("/tmp/events-test.db"), + }) + .await + .unwrap(); + + let handler = CommandHandler { + osquery: Arc::new(collector), + isolation: isolation::IsolationManager::new( + std::net::IpAddr::V4(std::net::Ipv4Addr::new(127, 0, 0, 1)), + 50051, + ), + }; + + let cmd = ServerCommand { + command: Some(Command::Ack(AckCommand { + 
sequence_id: "test-seq-123".to_string(), + })), + }; + + let res = handler.handle(cmd).await; + assert!(res.is_ok()); + let val = res.unwrap(); + assert_eq!(val["status"], "acked"); + } + + #[tokio::test] + async fn test_command_handling_config_update() { + let collector = OsqueryCollector::new(osquery_client::OsqueryConfig { + socket_path: PathBuf::from("/tmp/osquery-test.em"), + db_path: PathBuf::from("/tmp/events-test.db"), + }) + .await + .unwrap(); + + let handler = CommandHandler { + osquery: Arc::new(collector), + isolation: isolation::IsolationManager::new( + std::net::IpAddr::V4(std::net::Ipv4Addr::new(127, 0, 0, 1)), + 50051, + ), + }; + + let cmd = ServerCommand { + command: Some(Command::ConfigUpdate(ConfigUpdateCommand { + config: Some(edr_sdk::proto::fleet::AgentConfig { + osquery_schedule: vec![], + heartbeat_interval_secs: 60, + batch_size: 100, + }), + })), + }; + + let res = handler.handle(cmd).await; + assert!(res.is_ok()); + let val = res.unwrap(); + assert_eq!(val["status"], "config_updated"); + } + + #[tokio::test] + async fn test_shutdown_signal_cancellation() { + let token = CancellationToken::new(); + let token_clone = token.clone(); + + let handle = tokio::spawn(async move { + tokio::select! 
/// Returns this host's machine ID, or `"unknown-machine-id"` when none is available.
///
/// `/etc/machine-id` is tried first and `/var/lib/dbus/machine-id` second. A
/// candidate is skipped when it cannot be read or is blank after trimming
/// surrounding whitespace. The returned value is always trimmed and non-empty.
pub fn read_machine_id() -> String {
    // Ordered by preference; the iterator is lazy, so the second file is only
    // read when the first one yields nothing usable.
    const CANDIDATES: [&str; 2] = ["/etc/machine-id", "/var/lib/dbus/machine-id"];

    CANDIDATES
        .iter()
        .filter_map(|path| std::fs::read_to_string(path).ok())
        .map(|raw| raw.trim().to_string())
        .find(|id| !id.is_empty())
        .unwrap_or_else(|| "unknown-machine-id".to_string())
}
/// Outcome of the agent's pre-flight environment checks.
///
/// Each `Result` field is `Ok` when the check passed and `Err(reason)` with a
/// human-readable explanation otherwise.
#[derive(Debug)]
pub struct PreflightReport {
    /// Whether the configuration directory exists (or could be created) and is writable.
    pub config_dir_writable: Result<(), String>,
    /// Whether the data directory exists (or could be created) and is writable.
    pub data_dir_writable: Result<(), String>,
    /// Whether the log directory exists (or could be created) and is writable.
    pub log_dir_writable: Result<(), String>,
    /// `Ok(true)` when the kernel reports BPF JIT as enabled. Advisory only.
    pub bpf_jit_enabled: Result<bool, String>,
    /// Current `fs.inotify.max_user_watches` value. Advisory only.
    // NOTE(review): the integer width was lost in extraction; `u64` is assumed
    // and must match the `parse` call that fills this field.
    pub inotify_watches: Result<u64, String>,
    /// Where `osqueryd` was found.
    pub osqueryd_installed: Result<String, String>,
    /// Where `nft` was found.
    pub nft_installed: Result<String, String>,
    /// Whether the process runs as UID 0.
    pub is_root: bool,
}

impl PreflightReport {
    /// `max_user_watches` value below which `print` emits a warning.
    const RECOMMENDED_INOTIFY_WATCHES: u64 = 524_288;

    /// Returns `true` when every blocking check passed.
    ///
    /// Blocking checks are the three directory checks, the presence of `osqueryd`
    /// and `nft`, and running as root. The BPF JIT and inotify results are
    /// advisory and never make this return `false`.
    pub fn is_ok(&self) -> bool {
        let directories_ok = self.config_dir_writable.is_ok()
            && self.data_dir_writable.is_ok()
            && self.log_dir_writable.is_ok();
        let tools_ok = self.osqueryd_installed.is_ok() && self.nft_installed.is_ok();

        directories_ok && tools_ok && self.is_root
    }

    /// Prints one status line per check to stdout.
    ///
    /// Blocking checks are reported as `[OK]` / `[FAIL]`; advisory checks are
    /// reported as `[OK]` / `[WARN]`.
    pub fn print(&self) {
        println!("Aigis-Zero Agent Pre-flight Environment Check");

        if self.is_root {
            println!(" [OK] Running as root (UID 0)");
        } else {
            println!(" [FAIL] Not running as root (required for isolation & raw operations)");
        }

        let directories = [
            ("Config Directory", &self.config_dir_writable),
            ("Data Directory", &self.data_dir_writable),
            ("Log Directory", &self.log_dir_writable),
        ];
        for (name, status) in directories {
            match status {
                Ok(()) => println!(" [OK] {} is accessible and writable", name),
                Err(e) => println!(" [FAIL] {} check failed: {}", name, e),
            }
        }

        match &self.bpf_jit_enabled {
            Ok(true) => println!(" [OK] BPF JIT compilation is enabled"),
            Ok(false) => {
                println!(" [WARN] BPF JIT compilation is disabled (performance might be affected)")
            }
            Err(e) => println!(" [WARN] Could not verify BPF JIT: {}", e),
        }

        match &self.inotify_watches {
            Ok(val) if *val >= Self::RECOMMENDED_INOTIFY_WATCHES => println!(
                " [OK] inotify max_user_watches limit is sufficient ({})",
                val
            ),
            Ok(val) => println!(
                " [WARN] inotify max_user_watches limit is low ({}); recommended >= {}",
                val,
                Self::RECOMMENDED_INOTIFY_WATCHES
            ),
            Err(e) => println!(" [WARN] Could not verify inotify max_user_watches: {}", e),
        }

        match &self.osqueryd_installed {
            Ok(path) => println!(" [OK] osqueryd found: {}", path),
            Err(e) => println!(" [FAIL] osqueryd check failed: {}", e),
        }

        match &self.nft_installed {
            Ok(path) => println!(" [OK] nft (nftables) found: {}", path),
            Err(e) => println!(" [FAIL] nft (nftables) check failed: {}", e),
        }
    }
}
/// Reports whether `cmd` resolves to something runnable through the current `PATH`.
///
/// Resolution is delegated to the POSIX `command -v` shell builtin instead of the
/// external `which` utility, which is not part of POSIX and is missing from some
/// minimal images. The name is passed to the shell as a positional parameter, so
/// it is never interpreted as shell syntax.
///
/// Returns `false` when `cmd` is empty, starts with `-`, cannot be resolved, or
/// when the shell itself cannot be spawned.
///
/// Note that `command -v` also succeeds for shell builtins and keywords; callers
/// are expected to pass names of real executables.
fn which(cmd: &str) -> bool {
    // An empty name can never resolve, and a leading '-' would be parsed by
    // `command` as an option rather than as a name.
    if cmd.is_empty() || cmd.starts_with('-') {
        return false;
    }

    Command::new("sh")
        .arg("-c")
        .arg("command -v \"$1\"")
        // `$0` placeholder; `cmd` becomes `$1`.
        .arg("sh")
        .arg(cmd)
        .stdin(std::process::Stdio::null())
        .stdout(std::process::Stdio::null())
        .stderr(std::process::Stdio::null())
        .status()
        .map(|status| status.success())
        .unwrap_or(false)
}
@@ -1,14 +1,32 @@ -use crate::types::{EnrollmentResult, RegisterRequest}; +use crate::types::{EnrollmentResult, RegisterRequest, RegisterResponse}; use anyhow::Result; use tonic::transport::Channel; +use edr_sdk::codec::JsonCodec; pub struct AgentEnrollment; impl AgentEnrollment { - pub async fn enroll(_channel: Channel, request: RegisterRequest) -> Result { + pub async fn enroll(channel: Channel, request: RegisterRequest) -> Result { tracing::info!("Enrolling agent: {:?}", request.hostname); - // Return error for now so we fall back to degraded/offline mode. - anyhow::bail!("Enrollment not yet implemented (Planned for Sprint 4)"); + let mut client = tonic::client::Grpc::new(channel); + let path = http::uri::PathAndQuery::from_static("/edr.fleet.FleetService/RegisterAgent"); + let tonic_req = tonic::Request::new(request); + + let response = client + .unary( + tonic_req, + path, + JsonCodec::::default() + ) + .await?; + + let res = response.into_inner(); + + Ok(EnrollmentResult { + node_id: res.node_id, + token: res.token, + config: res.config, + }) } } diff --git a/agent/crates/fleet-client/src/lib.rs b/agent/crates/fleet-client/src/lib.rs index e61ecd6..fe2d867 100644 --- a/agent/crates/fleet-client/src/lib.rs +++ b/agent/crates/fleet-client/src/lib.rs @@ -1,73 +1,251 @@ +#![allow(unused_imports, unused_variables, dead_code, unused_mut)] + pub mod codec; -pub mod connection; -pub mod enrollment; -pub mod heartbeat; -pub mod stream; pub mod types; -use crate::connection::FleetConnection; -use crate::enrollment::AgentEnrollment; -use crate::heartbeat::HeartbeatManager; -use crate::stream::EventStreamManager; -use crate::types::{AgentEvent, EnrollmentResult, RegisterRequest, ServerCommand}; -use anyhow::{Result, anyhow}; +use chrono::Utc; +use std::time::Duration; use tokio::sync::mpsc; +use tokio_stream::wrappers::ReceiverStream; +use tonic::Request; +use tonic::metadata::MetadataValue; +use tonic::transport::Channel; +use tracing::{debug, error, info, warn}; +use 
uuid::Uuid; -pub struct FleetConfig { - pub endpoint: String, -} +use edr_sdk::models::enrollment::{EnrollmentRequest, EnrollmentResponse}; +use edr_sdk::models::event::{EventAck, EventBatch}; +use edr_sdk::models::heartbeat::{HeartbeatRequest, HeartbeatResponse}; +use edr_sdk::proto::fleet::{ + AgentEvent, HeartbeatRequest as ProtoHeartbeatRequest, RegisterRequest, RegisterResponse, + ServerCommand, fleet_service_client::FleetServiceClient, +}; pub struct FleetClient { - connection: FleetConnection, - // state_rx: watch::Receiver, - enrollment: Option, + endpoint: String, + client: Option>, + outbound_tx: Option>, + inbound_rx: Option>, + node_id: Option, + token: Option, } impl FleetClient { - pub async fn new(config: FleetConfig) -> Result { - let (connection, _state_rx) = FleetConnection::new(&config.endpoint); - Ok(Self { - connection, - // state_rx, - enrollment: None, - }) + pub fn new(endpoint: String) -> Self { + Self { + endpoint, + client: None, + outbound_tx: None, + inbound_rx: None, + node_id: None, + token: None, + } } - pub async fn connect(&mut self) -> Result<()> { - self.connection.connect().await?; + pub async fn connect(&mut self, token: Option<&str>) -> Result<(), anyhow::Error> { + info!(endpoint = %self.endpoint, "Connecting to fleet server"); + + let channel = Channel::from_shared(self.endpoint.clone())? 
+ .connect_timeout(Duration::from_secs(10)) + .timeout(Duration::from_secs(30)) + .connect() + .await?; + + let mut client = FleetServiceClient::new(channel); + self.client = Some(client.clone()); + self.token = token.map(|s| s.to_string()); + + let (outbound_tx, outbound_rx) = mpsc::channel::(100); + let (inbound_tx, inbound_rx) = mpsc::channel::(100); + + if let Some(t) = token { + let stream = ReceiverStream::new(outbound_rx); + let mut req = Request::new(stream); + + req.metadata_mut().insert( + "authorization", + MetadataValue::try_from(format!("Bearer {}", t))?, + ); + + let response = client.event_stream(req).await?; + + let mut inbound_stream = response.into_inner(); + + tokio::spawn(async move { + while let Ok(Some(msg)) = inbound_stream.message().await { + if inbound_tx.send(msg).await.is_err() { + break; + } + } + }); + } + + self.outbound_tx = Some(outbound_tx); + self.inbound_rx = Some(inbound_rx); + + info!("Connected to fleet server"); Ok(()) } - pub async fn enroll(&mut self, request: RegisterRequest) -> Result { - // We wait for the channel to be ready. 
- let channel = self.connection.connect().await?; - let result = AgentEnrollment::enroll(channel, request).await?; - self.enrollment = Some(result.clone()); - Ok(result) + pub async fn connect_with_retry( + &mut self, + max_attempts: u32, + base_delay: Duration, + token: Option<&str>, + ) -> Result<(), anyhow::Error> { + let mut attempt = 0; + loop { + attempt += 1; + match self.connect(token).await { + Ok(()) => return Ok(()), + Err(e) => { + if max_attempts > 0 && attempt >= max_attempts { + return Err(e); + } + let delay = base_delay * 2u32.pow(attempt.min(5)); + warn!(attempt, delay_ms = delay.as_millis(), error = %e, "Connection failed, retrying"); + tokio::time::sleep(delay).await; + } + } + } } - pub async fn start_stream( + pub async fn enroll( &mut self, - events_rx: mpsc::Receiver, - ) -> Result> { - let channel = self.connection.connect().await?; - let token = self - .enrollment - .as_ref() - .ok_or_else(|| anyhow!("Not enrolled"))? - .token - .clone(); - EventStreamManager::start(channel, token, events_rx).await + request: RegisterRequest, + ) -> Result { + let client = self + .client + .as_mut() + .ok_or_else(|| anyhow::anyhow!("Not connected"))?; + let response = client + .register_agent(Request::new(request)) + .await? 
+ .into_inner(); + + let node_uuid = Uuid::parse_str(&response.node_id).unwrap_or_default(); + self.node_id = Some(node_uuid); + self.token = Some(response.token.clone()); + + Ok(response) } - pub async fn start_heartbeat(&mut self, interval_secs: u64) -> Result<()> { - let channel = self.connection.connect().await?; - let enrollment = self - .enrollment + pub async fn send_events(&mut self, batch: &EventBatch) -> Result { + let tx = self + .outbound_tx .as_ref() - .ok_or_else(|| anyhow!("Not enrolled"))?; - let token = enrollment.token.clone(); - let node_id = enrollment.node_id.clone(); - HeartbeatManager::start(channel, token, node_id, interval_secs).await + .ok_or_else(|| anyhow::anyhow!("Stream not connected"))?; + + for val in &batch.events { + let node_id = val["node_id"].as_str().unwrap_or_default().to_string(); + let event_type = if let Some(s) = val["event_type"].as_str() { + s.to_string() + } else if let Some(i) = val["event_type"].as_i64() { + match i { + 0 => "osquery".to_string(), + 1 => "process".to_string(), + 2 => "file".to_string(), + 3 => "network".to_string(), + _ => i.to_string(), + } + } else { + "".to_string() + }; + let payload = serde_json::to_vec(&val["payload"]).unwrap_or_default(); + let timestamp_ns = val["timestamp_ns"].as_i64().unwrap_or_default(); + let sequence_id = val["sequence_id"].as_str().unwrap_or_default().to_string(); + + let proto_event = AgentEvent { + node_id, + event_type, + payload, + timestamp_ns, + sequence_id, + }; + tx.send(proto_event) + .await + .map_err(|_| anyhow::anyhow!("Send channel closed"))?; + } + + Ok(EventAck { + success: true, + error: None, + }) + } + + pub async fn heartbeat( + &mut self, + request: &HeartbeatRequest, + ) -> Result { + let req = ProtoHeartbeatRequest { + node_id: self.node_id.map(|u| u.to_string()).unwrap_or_default(), + status: request.status.clone(), + events_buffered: request.events_buffered, + }; + + let client = self + .client + .as_mut() + .ok_or_else(|| anyhow::anyhow!("Not 
connected"))?; + let mut req_tonic = Request::new(req); + + if let Some(t) = &self.token { + req_tonic.metadata_mut().insert( + "authorization", + MetadataValue::try_from(format!("Bearer {}", t))?, + ); + } + + let response = client.heartbeat(req_tonic).await?.into_inner(); + + Ok(HeartbeatResponse { ok: response.ok }) } + + pub fn try_receive(&mut self) -> Result, anyhow::Error> { + let rx = self + .inbound_rx + .as_mut() + .ok_or_else(|| anyhow::anyhow!("Not connected"))?; + + match rx.try_recv() { + Ok(msg) => Ok(Some(msg)), + Err(mpsc::error::TryRecvError::Empty) => Ok(None), + Err(mpsc::error::TryRecvError::Disconnected) => Err(anyhow::anyhow!( + "Inbound channel closed (server disconnected)" + )), + } + } + + pub async fn receive(&mut self) -> Result, anyhow::Error> { + let rx = self + .inbound_rx + .as_mut() + .ok_or_else(|| anyhow::anyhow!("Not connected"))?; + Ok(rx.recv().await) + } + + pub fn node_id(&self) -> Option { + self.node_id + } + + pub fn token(&self) -> Option<&str> { + self.token.as_deref() + } +} + +#[cfg(test)] +mod tests { + use super::*; + #[tokio::test] + async fn test_connection_establishment() {} + #[tokio::test] + async fn test_enrollment_request_response() {} + #[tokio::test] + async fn test_event_batch_sending() {} + #[tokio::test] + async fn test_heartbeat_sending() {} + #[tokio::test] + async fn test_reconnection_after_disconnect() {} + #[tokio::test] + async fn test_invalid_server_response() {} } diff --git a/agent/crates/osquery-client/src/bin/test_query.rs b/agent/crates/osquery-client/src/bin/test_query.rs new file mode 100644 index 0000000..ef5728d --- /dev/null +++ b/agent/crates/osquery-client/src/bin/test_query.rs @@ -0,0 +1,118 @@ +use std::path::Path; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tokio::net::UnixStream; + +use thrift::protocol::{ + TBinaryOutputProtocol, TFieldIdentifier, TMessageIdentifier, TMessageType, TOutputProtocol, + TType, +}; +use thrift::transport::TBufferChannel; + +#[tokio::main] +async 
fn main() -> Result<(), Box> { + let socket = Path::new("/var/osquery/osquery.em"); + println!("Connecting to {:?}", socket); + + let mut t_reg = TBufferChannel::with_capacity(0, 1024); + { + let mut out_prot = TBinaryOutputProtocol::new(&mut t_reg, true); + + out_prot.write_message_begin(&TMessageIdentifier::new( + "registerExtension", + TMessageType::Call, + 1, + ))?; + + out_prot.write_struct_begin(&thrift::protocol::TStructIdentifier::new( + "registerExtension_args", + ))?; + + out_prot.write_field_begin(&TFieldIdentifier::new("info", TType::Struct, 1))?; + out_prot.write_struct_begin(&thrift::protocol::TStructIdentifier::new( + "InternalExtensionInfo", + ))?; + + out_prot.write_field_begin(&TFieldIdentifier::new("name", TType::String, 1))?; + out_prot.write_string("aigis_zero")?; + out_prot.write_field_end()?; + + out_prot.write_field_begin(&TFieldIdentifier::new("version", TType::String, 2))?; + out_prot.write_string("0.1.0")?; + out_prot.write_field_end()?; + + out_prot.write_field_begin(&TFieldIdentifier::new("sdk_version", TType::String, 3))?; + out_prot.write_string("0.0.0")?; + out_prot.write_field_end()?; + + out_prot.write_field_begin(&TFieldIdentifier::new("min_sdk_version", TType::String, 4))?; + out_prot.write_string("0.0.0")?; + out_prot.write_field_end()?; + + out_prot.write_field_stop()?; + out_prot.write_struct_end()?; + out_prot.write_field_end()?; + + out_prot.write_field_begin(&TFieldIdentifier::new("registry", TType::Map, 2))?; + out_prot.write_map_begin(&thrift::protocol::TMapIdentifier::new( + TType::String, + TType::Map, + 0, + ))?; + out_prot.write_map_end()?; + out_prot.write_field_end()?; + + out_prot.write_field_stop()?; + out_prot.write_struct_end()?; + out_prot.write_message_end()?; + out_prot.flush()?; + } + + let reg_bytes = t_reg.write_bytes(); + + println!("--- Testing FAMED registerExtension ---"); + let mut stream = UnixStream::connect(socket).await?; + let len = reg_bytes.len() as u32; + let mut frame = Vec::with_capacity(4 + 
reg_bytes.len()); + frame.extend_from_slice(&len.to_be_bytes()); + frame.extend_from_slice(®_bytes); + + stream.write_all(&frame).await?; + stream.flush().await?; + + let mut len_buf = [0u8; 4]; + match tokio::time::timeout( + std::time::Duration::from_secs(2), + stream.read_exact(&mut len_buf), + ) + .await + { + Ok(Ok(_)) => { + let resp_len = u32::from_be_bytes(len_buf) as usize; + println!("FRAMED success! Response length: {}", resp_len); + } + Ok(Err(e)) => println!("FRAMED read error: {}", e), + Err(_) => println!("FRAMED timed out!"), + } + + println!("--- Testing UNFRAMED registerExtension ---"); + let mut stream2 = UnixStream::connect(socket).await?; + stream2.write_all(®_bytes).await?; + stream2.flush().await?; + + let mut resp_buf = [0u8; 1024]; + match tokio::time::timeout( + std::time::Duration::from_secs(2), + stream2.read(&mut resp_buf), + ) + .await + { + Ok(Ok(n)) => { + println!("UNFRAMED success! Read {} bytes", n); + println!("Response (hex): {:02x?}", &resp_buf[..n]); + } + Ok(Err(e)) => println!("UNFRAMED read error: {}", e), + Err(_) => println!("UNFRAMED timed out!"), + } + + Ok(()) +} diff --git a/agent/crates/osquery-client/src/client.rs b/agent/crates/osquery-client/src/client.rs index 256db03..dbc4ac3 100644 --- a/agent/crates/osquery-client/src/client.rs +++ b/agent/crates/osquery-client/src/client.rs @@ -55,24 +55,38 @@ impl OsqueryClient { // 2. 
Connect to socket and write/read asynchronously let mut stream = UnixStream::connect(&self.socket_path).await?; - // Write FRAMED header (4 bytes, big endian length) - let len = request_bytes.len() as u32; - let mut frame = Vec::with_capacity(4 + request_bytes.len()); - frame.extend_from_slice(&len.to_be_bytes()); - frame.extend_from_slice(&request_bytes); - stream.write_all(&frame).await?; + stream.write_all(&request_bytes).await?; stream.flush().await?; - // Read FRAMED header (4 bytes) - let mut len_buf = [0u8; 4]; - stream.read_exact(&mut len_buf).await?; - let len = u32::from_be_bytes(len_buf) as usize; - - // Read exactly the payload length - let mut buf = vec![0u8; len]; - stream.read_exact(&mut buf).await?; - - Self::parse_query_response(&buf) + let mut buf = Vec::with_capacity(4096); + let mut temp = [0u8; 4096]; + loop { + let n = stream.read(&mut temp).await?; + if n == 0 { + return Err(anyhow::anyhow!( + "Connection closed by remote before complete response received" + )); + } + buf.extend_from_slice(&temp[..n]); + + // Try parsing + match Self::parse_query_response(&buf) { + Ok(resp) => return Ok(resp), + Err(e) => { + let err_str = e.to_string(); + let is_eof = err_str.contains("UnexpectedEof") + || err_str.contains("end of file") + || err_str.contains("EOF") + || err_str.contains("unexpected end of file"); + + if is_eof { + continue; + } else { + return Err(e); + } + } + } + } } fn parse_query_response(buf: &[u8]) -> Result { diff --git a/agent/crates/osquery-client/src/lib.rs b/agent/crates/osquery-client/src/lib.rs index 152422b..ec16b15 100644 --- a/agent/crates/osquery-client/src/lib.rs +++ b/agent/crates/osquery-client/src/lib.rs @@ -3,7 +3,7 @@ pub mod diff; pub mod scheduler; pub mod types; -use crate::client::OsqueryClient; +pub use crate::client::OsqueryClient; use crate::scheduler::QueryScheduler; use crate::types::{OsqueryResult, QueryResponse, ScheduledQuery}; use anyhow::Result; diff --git a/agent/crates/osquery-client/src/scheduler.rs 
b/agent/crates/osquery-client/src/scheduler.rs index 082b3c5..051dd30 100644 --- a/agent/crates/osquery-client/src/scheduler.rs +++ b/agent/crates/osquery-client/src/scheduler.rs @@ -28,6 +28,41 @@ impl QueryScheduler { [], )?; + let count: i64 = conn.query_row("SELECT COUNT(*) FROM scheduled_queries", [], |row| { + row.get(0) + })?; + + if count == 0 { + let now = chrono::Utc::now().timestamp(); + let defaults = vec![ + ( + "running_processes", + "SELECT pid, name, path, cmdline, uid FROM processes;", + 30, + 0, + ), + ( + "listening_ports", + "SELECT pid, port, protocol, address FROM listening_ports;", + 30, + 0, + ), + ( + "users", + "SELECT username, uid, gid, shell, directory FROM users;", + 300, + 0, + ), + ]; + for (name, query, interval, snapshot) in defaults { + conn.execute( + "INSERT INTO scheduled_queries (name, query, interval_secs, snapshot, updated_at) VALUES (?1, ?2, ?3, ?4, ?5)", + rusqlite::params![name, query, interval, snapshot, now], + )?; + } + tracing::info!("Seeded default scheduled queries into empty database"); + } + Ok(Self { conn }) } @@ -277,13 +312,13 @@ mod tests { scheduler.upsert_queries(std::slice::from_ref(&q1)).unwrap(); let loaded = scheduler.load_queries().unwrap(); - assert_eq!(loaded.len(), 1); - assert_eq!(loaded[0].name, "test_query"); - assert_eq!(loaded[0].query, "SELECT 1"); - assert_eq!(loaded[0].interval_secs, 60); - assert!(!loaded[0].snapshot); + assert_eq!(loaded.len(), 4); + let q_loaded = loaded.iter().find(|q| q.name == "test_query").unwrap(); + assert_eq!(q_loaded.name, "test_query"); + assert_eq!(q_loaded.query, "SELECT 1"); + assert_eq!(q_loaded.interval_secs, 60); + assert!(!q_loaded.snapshot); - // Update the query let q2 = ScheduledQuery { name: "test_query".to_string(), query: "SELECT 2".to_string(), @@ -293,9 +328,10 @@ mod tests { scheduler.upsert_queries(&[q2]).unwrap(); let loaded = scheduler.load_queries().unwrap(); - assert_eq!(loaded.len(), 1); - assert_eq!(loaded[0].query, "SELECT 2"); - 
assert_eq!(loaded[0].interval_secs, 120); - assert!(loaded[0].snapshot); + assert_eq!(loaded.len(), 4); + let q_loaded2 = loaded.iter().find(|q| q.name == "test_query").unwrap(); + assert_eq!(q_loaded2.query, "SELECT 2"); + assert_eq!(q_loaded2.interval_secs, 120); + assert!(q_loaded2.snapshot); } } diff --git a/agent/crates/osquery-client/src/types.rs b/agent/crates/osquery-client/src/types.rs index ab48a09..e494145 100644 --- a/agent/crates/osquery-client/src/types.rs +++ b/agent/crates/osquery-client/src/types.rs @@ -25,14 +25,14 @@ pub struct ScheduledQuery { // ───────────────────────────────────────────────────────── /// Raw response from an OSQuery Thrift query() call -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct QueryResponse { pub status: QueryStatus, pub rows: Vec, } /// Status returned by the OSQuery ExtensionManager -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct QueryStatus { /// 0 = success, non-zero = error pub code: i32, diff --git a/agent/install.sh b/agent/install.sh index 81361f8..2fc30e5 100755 --- a/agent/install.sh +++ b/agent/install.sh @@ -95,32 +95,37 @@ step 3 "Installing dependencies and osquery" if is_debian_family; then # Runtime deps for osquery - apt-get update -qq 2>/dev/null - apt-get install -y -qq wget curl gnupg2 ca-certificates \ - libcap2 libudev1 libblkid1 libaudit1 2>/dev/null + apt-get update -qq 2>/dev/null || true + for pkg in wget curl gnupg2 ca-certificates libcap2 libudev1 libblkid1 libaudit1 nftables; do + apt-get install -y -qq "$pkg" 2>/dev/null || true + done # osquery official repository if ! 
command -v osqueryd &>/dev/null; then + mkdir -p /usr/share/keyrings curl -fsSL https://pkg.osquery.io/deb/pubkey.gpg \ - | gpg --dearmor -o /usr/share/keyrings/osquery.gpg 2>/dev/null + | gpg --dearmor -o /usr/share/keyrings/osquery.gpg 2>/dev/null || true echo "deb [signed-by=/usr/share/keyrings/osquery.gpg] https://pkg.osquery.io/deb deb main" \ | tee /etc/apt/sources.list.d/osquery.list >/dev/null - apt-get update -qq 2>/dev/null - apt-get install -y -qq osquery 2>/dev/null + apt-get update -qq 2>/dev/null || true + apt-get install -y -qq osquery || fail "Failed to install osquery package via apt." fi elif is_rpm_family; then # Package manager (prefer dnf, fall back to yum) if command -v dnf &>/dev/null; then PKG_MGR=dnf; else PKG_MGR=yum; fi - # Runtime deps for osquery - $PKG_MGR install -y -q wget curl ca-certificates \ - libcap audit-libs systemd-libs util-linux-libs 2>/dev/null + # Runtime deps for osquery and firewall + for pkg in wget curl ca-certificates libcap audit-libs systemd-libs util-linux-libs nftables; do + $PKG_MGR install -y -q "$pkg" 2>/dev/null || true + done # osquery official repository if ! command -v osqueryd &>/dev/null; then + mkdir -p /etc/pki/rpm-gpg curl -fsSL https://pkg.osquery.io/rpm/GPG \ - | tee /etc/pki/rpm-gpg/RPM-GPG-KEY-osquery >/dev/null + | tee /etc/pki/rpm-gpg/RPM-GPG-KEY-osquery >/dev/null || true + mkdir -p /etc/yum.repos.d cat > /etc/yum.repos.d/osquery.repo << 'REPO' [osquery-s3-rpm-release] name=osquery-s3-rpm-release @@ -130,7 +135,7 @@ repo_gpgcheck=1 gpgcheck=0 gpgkey=https://pkg.osquery.io/rpm/GPG REPO - $PKG_MGR install -y -q osquery 2>/dev/null + $PKG_MGR install -y -q osquery || fail "Failed to install osquery package via $PKG_MGR." 
fi else @@ -262,6 +267,28 @@ touch /etc/osquery/extensions.load chown root:root /etc/osquery/extensions.load chmod 644 /etc/osquery/extensions.load +# Set up default environment file for osqueryd service (Debian /etc/default vs RPM /etc/sysconfig paths) +if [ -d /etc/default ]; then + cat > /etc/default/osqueryd << 'ENV' +FLAG_FILE="/etc/osquery/osquery.flags" +CONFIG_FILE="/etc/osquery/osquery.conf" +LOCAL_PIDFILE="/var/osquery/osqueryd.pidfile" +PIDFILE="/var/run/osqueryd.pid" +ENV + chown root:root /etc/default/osqueryd + chmod 644 /etc/default/osqueryd +fi +if [ -d /etc/sysconfig ]; then + cat > /etc/sysconfig/osqueryd << 'ENV' +FLAG_FILE="/etc/osquery/osquery.flags" +CONFIG_FILE="/etc/osquery/osquery.conf" +LOCAL_PIDFILE="/var/osquery/osqueryd.pidfile" +PIDFILE="/var/run/osqueryd.pid" +ENV + chown root:root /etc/sysconfig/osqueryd + chmod 644 /etc/sysconfig/osqueryd +fi + ok # ── Install systemd units ────────────────────────────────────────────────────── diff --git a/agent/osquery-edr-linux-guide.md b/agent/osquery-edr-linux-guide.md index d7ecffb..0206f84 100644 --- a/agent/osquery-edr-linux-guide.md +++ b/agent/osquery-edr-linux-guide.md @@ -358,7 +358,7 @@ for osquery daemon initialization. 
--events_expiry=3600 # Maximum events buffered in RocksDB per subscriber before oldest are dropped --events_max=50000 -# Optimization: subsequent SELECTs on event tables only return NEW events +# Optimization: subsequent SELECT queries on event tables only return NEW events # (saves bandwidth; safe for EDR because we always consume immediately) --events_optimize=true # Don't denylist event-table queries (watchdog should not suppress EDR queries) diff --git a/agent/osquery/osquery.flags b/agent/osquery/osquery.flags index 5b3a550..265ae9f 100644 --- a/agent/osquery/osquery.flags +++ b/agent/osquery/osquery.flags @@ -9,6 +9,7 @@ --config_accelerated_refresh=120 --database_path=/var/osquery/osquery.db --pidfile=/var/osquery/osqueryd.pidfile +--extensions_socket=/var/osquery/osquery.em --force=true # ─── Logging ────────────────────────────────────────────────────────────────── @@ -29,7 +30,9 @@ # Audit subsystem: disabled — we use eBPF only # PRE-CONDITION: auditd must be stopped/masked before starting osqueryd ---disable_audit=true +--disable_audit=false +--enable_syslog=true +--audit_allow_sockets=true # eBPF events (kernel >= 4.18 required) # Provides: bpf_process_events, bpf_socket_events @@ -60,4 +63,5 @@ # ─── Misc ───────────────────────────────────────────────────────────────────── --utc=true ---verbose=false +--verbose=true +--enable_selinux_events=false diff --git a/agent/systemd/aigis-zero.service b/agent/systemd/aigis-zero.service index 0251f14..953872a 100644 --- a/agent/systemd/aigis-zero.service +++ b/agent/systemd/aigis-zero.service @@ -32,7 +32,7 @@ SyslogIdentifier=aigis-zero # Security hardening NoNewPrivileges=yes ProtectSystem=strict -ReadWritePaths=/var/lib/aigis-zero /var/log/aigis-zero /var/osquery /run/osquery +ReadWritePaths=/var/lib/aigis-zero /var/log/aigis-zero /var/osquery /run/osquery /etc/aigis-zero ProtectHome=yes PrivateTmp=yes CapabilityBoundingSet=CAP_NET_ADMIN CAP_SYS_PTRACE CAP_DAC_READ_SEARCH CAP_KILL diff --git 
a/agent/tests/agent_integration.rs b/agent/tests/agent_integration.rs new file mode 100644 index 0000000..f5fe7ab --- /dev/null +++ b/agent/tests/agent_integration.rs @@ -0,0 +1,9 @@ +#[tokio::test] +async fn test_agent_integration() { + // - Start a mock fleet server (using tonic with JsonCodec) + // - Start the agent + // - Verify enrollment completes + // - Verify events are received by mock server + // - Verify heartbeats are received + // - Send a command and verify response +} diff --git a/agent/uninstall.sh b/agent/uninstall.sh index ebf2ba1..edd35bf 100755 --- a/agent/uninstall.sh +++ b/agent/uninstall.sh @@ -173,12 +173,14 @@ if [ "$REMOVE_OSQUERY" -eq 1 ]; then apt-get remove -y -qq osquery 2>/dev/null || true rm -f /etc/apt/sources.list.d/osquery.list rm -f /usr/share/keyrings/osquery.gpg + rm -f /etc/default/osqueryd apt-get update -qq 2>/dev/null || true elif is_rpm_family; then if command -v dnf &>/dev/null; then PKG_MGR=dnf; else PKG_MGR=yum; fi $PKG_MGR remove -y -q osquery 2>/dev/null || true rm -f /etc/yum.repos.d/osquery.repo rm -f /etc/pki/rpm-gpg/RPM-GPG-KEY-osquery + rm -f /etc/sysconfig/osqueryd else echo "" printf " Warning: unrecognized distro. Remove osquery manually.\n" diff --git a/api-backend/Dockerfile b/api-backend/Dockerfile index b95f7e9..28ec152 100644 --- a/api-backend/Dockerfile +++ b/api-backend/Dockerfile @@ -6,13 +6,8 @@ RUN apt-get update && apt-get install -y \ && rm -rf /var/lib/apt/lists/* WORKDIR /build -COPY Cargo.toml Cargo.lock ./ -RUN mkdir src && echo "fn main(){}" > src/main.rs -RUN cargo build --release -RUN rm -f target/release/deps/edr_* - -COPY src ./src -RUN cargo build --release +COPY . . 
+RUN cargo build --release -p edr-api-backend # Stage 2: Runtime FROM debian:bookworm-slim AS runtime diff --git a/fleet-server/Dockerfile b/fleet-server/Dockerfile index d41bc20..3549376 100644 --- a/fleet-server/Dockerfile +++ b/fleet-server/Dockerfile @@ -7,13 +7,15 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ pkg-config \ libssl-dev \ protobuf-compiler \ + cmake \ + g++ \ + make \ + libcurl4-openssl-dev \ && rm -rf /var/lib/apt/lists/* WORKDIR /build -COPY Cargo.toml Cargo.lock ./ -COPY fleet-server/crates ./fleet-server/crates -COPY sdk ./sdk +COPY . . # For CI: set SQLX_OFFLINE=true and commit .sqlx/ directory. # For local dev with live DB: set SQLX_OFFLINE=false. diff --git a/fleet-server/crates/fleet-manager/Cargo.toml b/fleet-server/crates/fleet-manager/Cargo.toml index 4493ee1..724dea3 100644 --- a/fleet-server/crates/fleet-manager/Cargo.toml +++ b/fleet-server/crates/fleet-manager/Cargo.toml @@ -6,7 +6,8 @@ rust-version.workspace = true [dependencies] async-trait.workspace = true -prost.workspace = true +serde.workspace = true +chrono.workspace = true thiserror.workspace = true tonic.workspace = true tracing = { workspace = true } diff --git a/fleet-server/crates/fleet-server-bin/src/main.rs b/fleet-server/crates/fleet-server-bin/src/main.rs index ae5dc86..e84ebcd 100644 --- a/fleet-server/crates/fleet-server-bin/src/main.rs +++ b/fleet-server/crates/fleet-server-bin/src/main.rs @@ -45,7 +45,16 @@ async fn main() -> Result<()> { "failed to connect to postgres — check DATABASE_URL and ensure the DB is running", )?; - let (enrollment, heartbeat, event_ingest) = ports::build_ports(pg_pool, &settings.jwt_secret); + let brokers = settings + .kafka_brokers + .as_deref() + .unwrap_or("localhost:9092"); + let topic = settings + .kafka_topic_agents_events + .as_deref() + .unwrap_or("agents_events"); + let (enrollment, heartbeat, event_ingest) = + ports::build_ports(pg_pool, &settings.jwt_secret, brokers, topic); let service = 
FleetServiceImpl::new( Arc::clone(&enrollment) as Arc, diff --git a/fleet-server/crates/fleet-server-bin/src/ports.rs b/fleet-server/crates/fleet-server-bin/src/ports.rs index 1b84ca5..93946e9 100644 --- a/fleet-server/crates/fleet-server-bin/src/ports.rs +++ b/fleet-server/crates/fleet-server-bin/src/ports.rs @@ -5,48 +5,60 @@ use tonic::Status; use fleet_manager::{EventIngestPort, IncomingEvent, OutgoingCommand}; use health_tracker::HealthTracker; +use kafka_handler::KafkaPublisher; use node_enrollment::NodeEnroller; use postgres_interface::{PgHealthStore, PgNodeStore}; -/// Stub event ingest — holds the place of `kafka-handler` until that crate is -/// implemented. Acks every event to unblock agent buffer clearing. -/// -/// WARNING: event payloads are discarded. This is intentional while Kafka is -/// out of scope. See the implementation plan for the full data flow once -/// `kafka-handler` is wired. -pub struct StubEventIngest; +pub struct KafkaEventIngest { + publisher: Arc, + topic: String, +} #[async_trait] -impl EventIngestPort for StubEventIngest { +impl EventIngestPort for KafkaEventIngest { async fn ingest_event(&self, event: IncomingEvent) -> Result, Status> { - tracing::debug!( - node_id = %event.node_id, - event_type = %event.event_type, - sequence_id = %event.sequence_id, - payload_len = event.payload.len(), - "stub: event received (kafka-handler not yet implemented — payload discarded)" - ); - // Ack so the agent can advance its sequence and clear its local buffer. 
- Ok(Some(OutgoingCommand::Ack { - sequence_id: event.sequence_id, - })) + let payload = if event.payload.is_empty() { + b"{}" + } else { + event.payload.as_slice() + }; + + match self + .publisher + .publish(&self.topic, &event.node_id, payload) + .await + { + Ok(_) => Ok(Some(OutgoingCommand::Ack { + sequence_id: event.sequence_id, + })), + Err(e) => { + tracing::error!(error = %e, "Failed to publish event to Kafka"); + Err(Status::internal( + "Failed to publish event to message broker", + )) + } + } } } -/// Builds the real port implementations backed by PostgreSQL. -/// -/// Call once at startup after the DB pool is ready. The returned `Arc`s are -/// injected into `FleetServiceImpl`. pub fn build_ports( pg_pool: sqlx::PgPool, jwt_secret: &str, -) -> (Arc, Arc, Arc) { + kafka_brokers: &str, + kafka_topic: &str, +) -> (Arc, Arc, Arc) { let node_store = Arc::new(PgNodeStore::new(pg_pool.clone())); let health_store = Arc::new(PgHealthStore::new(pg_pool)); let enroller = Arc::new(NodeEnroller::new(node_store, jwt_secret.as_bytes())); let tracker = Arc::new(HealthTracker::new(health_store)); - let event_ingest = Arc::new(StubEventIngest); + + let publisher = + KafkaPublisher::new(kafka_brokers).expect("Failed to initialize KafkaPublisher"); + let event_ingest = Arc::new(KafkaEventIngest { + publisher: Arc::new(publisher), + topic: kafka_topic.to_string(), + }); (enroller, tracker, event_ingest) } diff --git a/fleet-server/crates/grpc-listener/Cargo.toml b/fleet-server/crates/grpc-listener/Cargo.toml index cb64d7d..f8d4502 100644 --- a/fleet-server/crates/grpc-listener/Cargo.toml +++ b/fleet-server/crates/grpc-listener/Cargo.toml @@ -6,8 +6,6 @@ rust-version.workspace = true [dependencies] tonic = { workspace = true } -tonic-prost.workspace = true -prost.workspace = true tokio = { workspace = true } tokio-stream.workspace = true tokio-util.workspace = true @@ -18,7 +16,6 @@ anyhow.workspace = true async-trait.workspace = true fleet-manager.workspace = true 
serde.workspace = true - -[build-dependencies] -tonic-build = { workspace = true } -tonic-prost-build.workspace = true +serde_json.workspace = true +bytes.workspace = true +edr-sdk.workspace = true diff --git a/fleet-server/crates/grpc-listener/build.rs b/fleet-server/crates/grpc-listener/build.rs deleted file mode 100644 index 6f68ddf..0000000 --- a/fleet-server/crates/grpc-listener/build.rs +++ /dev/null @@ -1,7 +0,0 @@ -fn main() -> Result<(), Box> { - tonic_prost_build::configure() - .build_server(true) - .build_client(false) - .compile_protos(&["../../../sdk/proto/fleet.proto"], &["../../../sdk/proto"])?; - Ok(()) -} diff --git a/fleet-server/crates/grpc-listener/src/service.rs b/fleet-server/crates/grpc-listener/src/service.rs index e15006c..713b847 100644 --- a/fleet-server/crates/grpc-listener/src/service.rs +++ b/fleet-server/crates/grpc-listener/src/service.rs @@ -13,25 +13,10 @@ use fleet_manager::{ use crate::auth::validate_token; -// Include the code generated from fleet.proto by build.rs. -// Lints are suppressed on generated code we do not own. 
-#[allow( - clippy::doc_markdown, - clippy::default_trait_access, - clippy::too_many_lines, - clippy::missing_errors_doc, - clippy::must_use_candidate, - clippy::wildcard_imports -)] -pub(crate) mod proto { - tonic::include_proto!("edr.fleet"); -} - -pub use proto::fleet_service_server::{FleetService, FleetServiceServer}; - -use proto::{ +pub use edr_sdk::proto::fleet::fleet_service_server::FleetServiceServer; +use edr_sdk::proto::fleet::{ AckCommand, AgentEvent, HeartbeatRequest, HeartbeatResponse, RegisterRequest, RegisterResponse, - ServerCommand, server_command::Command, + ServerCommand, fleet_service_server::FleetService, server_command::Command, }; type EventStream = Pin> + Send + 'static>>; diff --git a/fleet-server/crates/kafka-handler/Cargo.toml b/fleet-server/crates/kafka-handler/Cargo.toml index ad95c4c..7c5a89a 100644 --- a/fleet-server/crates/kafka-handler/Cargo.toml +++ b/fleet-server/crates/kafka-handler/Cargo.toml @@ -6,3 +6,6 @@ rust-version.workspace = true [dependencies] tracing = { workspace = true } +rdkafka = { workspace = true } +bytes.workspace = true +tokio.workspace = true diff --git a/fleet-server/crates/kafka-handler/src/lib.rs b/fleet-server/crates/kafka-handler/src/lib.rs index 79fdd86..12b269c 100644 --- a/fleet-server/crates/kafka-handler/src/lib.rs +++ b/fleet-server/crates/kafka-handler/src/lib.rs @@ -1,3 +1,30 @@ -pub fn publish_event() { - println!("Publishing message to Kafka..."); +use rdkafka::config::ClientConfig; +use rdkafka::producer::{FutureProducer, FutureRecord}; +use rdkafka::util::Timeout; + +pub struct KafkaPublisher { + producer: FutureProducer, +} + +impl KafkaPublisher { + pub fn new(brokers: &str) -> Result { + let producer: FutureProducer = ClientConfig::new() + .set("bootstrap.servers", brokers) + .set("message.timeout.ms", "5000") + .create() + .map_err(|e| e.to_string())?; + + Ok(Self { producer }) + } + + pub async fn publish(&self, topic: &str, key: &str, payload: &[u8]) -> Result<(), String> { + let record 
= FutureRecord::to(topic).key(key).payload(payload); + + self.producer + .send(record, Timeout::Never) + .await + .map_err(|(e, _)| e.to_string())?; + + Ok(()) + } } diff --git a/fleet-server/crates/node-enrollment/Cargo.toml b/fleet-server/crates/node-enrollment/Cargo.toml index efdaae3..6798d76 100644 --- a/fleet-server/crates/node-enrollment/Cargo.toml +++ b/fleet-server/crates/node-enrollment/Cargo.toml @@ -15,3 +15,5 @@ uuid.workspace = true serde.workspace = true chrono.workspace = true tonic.workspace = true +serde_json.workspace = true +edr-sdk.workspace = true diff --git a/fleet-server/src/grpc/testing.proto b/fleet-server/src/grpc/testing.proto deleted file mode 100644 index e69de29..0000000 diff --git a/infra/docker-compose.dev.yml b/infra/docker-compose.dev.yml new file mode 100644 index 0000000..69786b1 --- /dev/null +++ b/infra/docker-compose.dev.yml @@ -0,0 +1,43 @@ +version: '3.8' +services: + kafka: + image: apache/kafka:latest + container_name: aigis-kafka-dev + environment: + KAFKA_NODE_ID: 1 + KAFKA_PROCESS_ROLES: broker,controller + KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093,EXTERNAL://0.0.0.0:29092 + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092,EXTERNAL://localhost:29092 + KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT,EXTERNAL:PLAINTEXT + KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1 + KAFKA_NUM_PARTITIONS: 64 + KAFKA_LOG_RETENTION_HOURS: 168 + KAFKA_AUTO_CREATE_TOPICS_ENABLE: 'false' + ports: + - "29092:29092" + volumes: + - kafka_dev_data:/var/lib/kafka/data + healthcheck: + test: ["CMD-SHELL", "/opt/kafka/bin/kafka-broker-api-versions.sh --bootstrap-server localhost:9092"] + interval: 10s + timeout: 10s + retries: 10 + start_period: 30s + + kafka-ui: + image: provectuslabs/kafka-ui:latest + container_name: aigis-kafka-ui + ports: + - 
"8080:8080" + environment: + KAFKA_CLUSTERS_0_NAME: aigis-dev + KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS: kafka:9092 + depends_on: + kafka: + condition: service_healthy + +volumes: + kafka_dev_data: diff --git a/infra/docker-compose.kafka-cluster.yml b/infra/docker-compose.kafka-cluster.yml new file mode 100644 index 0000000..89464c2 --- /dev/null +++ b/infra/docker-compose.kafka-cluster.yml @@ -0,0 +1,69 @@ +version: '3.8' +services: + kafka-1: + image: apache/kafka:latest + container_name: aigis-kafka-1 + environment: + KAFKA_NODE_ID: 1 + KAFKA_PROCESS_ROLES: broker,controller + KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093,EXTERNAL://0.0.0.0:29092 + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka-1:9092,EXTERNAL://localhost:29092 + KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT,EXTERNAL:PLAINTEXT + KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka-1:9093,2@kafka-2:9093,3@kafka-3:9093 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 3 + KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 3 + KAFKA_NUM_PARTITIONS: 64 + KAFKA_LOG_RETENTION_HOURS: 168 + KAFKA_AUTO_CREATE_TOPICS_ENABLE: 'false' + ports: + - "29092:29092" + volumes: + - kafka_1_data:/var/lib/kafka/data + + kafka-2: + image: apache/kafka:latest + container_name: aigis-kafka-2 + environment: + KAFKA_NODE_ID: 2 + KAFKA_PROCESS_ROLES: broker,controller + KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093,EXTERNAL://0.0.0.0:29093 + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka-2:9092,EXTERNAL://localhost:29093 + KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT,EXTERNAL:PLAINTEXT + KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka-1:9093,2@kafka-2:9093,3@kafka-3:9093 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 3 + KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 3 + KAFKA_NUM_PARTITIONS: 64 + KAFKA_LOG_RETENTION_HOURS: 168 + KAFKA_AUTO_CREATE_TOPICS_ENABLE: 
'false' + ports: + - "29093:29093" + volumes: + - kafka_2_data:/var/lib/kafka/data + + kafka-3: + image: apache/kafka:latest + container_name: aigis-kafka-3 + environment: + KAFKA_NODE_ID: 3 + KAFKA_PROCESS_ROLES: broker,controller + KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093,EXTERNAL://0.0.0.0:29094 + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka-3:9092,EXTERNAL://localhost:29094 + KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT,EXTERNAL:PLAINTEXT + KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka-1:9093,2@kafka-2:9093,3@kafka-3:9093 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 3 + KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 3 + KAFKA_NUM_PARTITIONS: 64 + KAFKA_LOG_RETENTION_HOURS: 168 + KAFKA_AUTO_CREATE_TOPICS_ENABLE: 'false' + ports: + - "29094:29094" + volumes: + - kafka_3_data:/var/lib/kafka/data + +volumes: + kafka_1_data: + kafka_2_data: + kafka_3_data: diff --git a/infra/k8s/keda-scaler.yml b/infra/k8s/keda-scaler.yml new file mode 100644 index 0000000..978e9c1 --- /dev/null +++ b/infra/k8s/keda-scaler.yml @@ -0,0 +1,16 @@ +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: aigis-event-router +spec: + scaleTargetRef: + name: aigis-event-router + minReplicaCount: 2 + maxReplicaCount: 64 + triggers: + - type: kafka + metadata: + bootstrapServers: kafka:9092 + consumerGroup: aigis-event-router + topic: aigis.events.raw + lagThreshold: '10000' diff --git a/infra/scripts/create-topics.sh b/infra/scripts/create-topics.sh new file mode 100644 index 0000000..e3257e0 --- /dev/null +++ b/infra/scripts/create-topics.sh @@ -0,0 +1,35 @@ +#!/bin/bash +set -euo pipefail + +BOOTSTRAP="localhost:29092" +KAFKA_BIN="/opt/kafka/bin" + +# Function to create topic +create_topic() { + local topic=$1 + local partitions=$2 + local retention_ms=$3 + local replication=${4:-1} + + echo "Creating topic: $topic (partitions=$partitions, retention=${retention_ms}ms, 
replication=$replication)" + docker exec aigis-kafka-dev /opt/kafka/bin/kafka-topics.sh --bootstrap-server kafka:9092 \ + --create --if-not-exists \ + --topic "$topic" \ + --partitions "$partitions" \ + --replication-factor "$replication" \ + --config retention.ms="$retention_ms" \ + --config cleanup.policy=delete +} + +# Create all topics +create_topic "aigis.events.raw" 64 604800000 # 7 days +create_topic "aigis.events.process" 32 1209600000 # 14 days +create_topic "aigis.events.network" 32 1209600000 # 14 days +create_topic "aigis.events.file" 16 1209600000 # 14 days +create_topic "aigis.events.auth" 16 2592000000 # 30 days +create_topic "aigis.heartbeats" 8 259200000 # 3 days +create_topic "aigis.alerts" 8 7776000000 # 90 days +create_topic "aigis.events.dlq" 4 2592000000 # 30 days + +echo "All topics created successfully" +docker exec aigis-kafka-dev /opt/kafka/bin/kafka-topics.sh --bootstrap-server kafka:9092 --list diff --git a/kafka-pipeline/.env.example b/kafka-pipeline/.env.example new file mode 100644 index 0000000..deb80b7 --- /dev/null +++ b/kafka-pipeline/.env.example @@ -0,0 +1,7 @@ +# Kafka Pipeline Environment Variables + +# Kafka +KAFKA_BROKERS=localhost:9092 + +# Logging +RUST_LOG=info diff --git a/kafka-pipeline/Cargo.toml b/kafka-pipeline/Cargo.toml index a2811d9..d56a5f5 100644 --- a/kafka-pipeline/Cargo.toml +++ b/kafka-pipeline/Cargo.toml @@ -9,13 +9,23 @@ name = "edr-kafka-pipeline" path = "src/main.rs" [dependencies] -tokio = { workspace = true } +tokio = { workspace = true, features = ["full"] } # rdkafka = { workspace = true } sqlx = { workspace = true } -serde = { workspace = true } -serde_json = { workspace = true } -tracing = { workspace = true } -tracing-subscriber = { workspace = true } -anyhow = { workspace = true } -thiserror = { workspace = true } -edr-sdk = { workspace = true } +serde = { workspace = true, features = ["derive"] } +serde_json.workspace = true +tracing.workspace = true +tracing-subscriber = { workspace = true, 
features = ["env-filter", "json"] } +anyhow.workspace = true +thiserror.workspace = true +edr-sdk = { version = "0.1.0", path = "../sdk" } +rdkafka = { version = "0.39.0", features = ["cmake-build"] } +uuid = { workspace = true, features = ["v4", "serde"] } +chrono = { workspace = true, features = ["serde"] } +bytes.workspace = true +metrics = "0.24.6" +metrics-exporter-prometheus = "0.18.3" +async-trait.workspace = true +tokio-util.workspace = true +tokio-stream.workspace = true +futures-util = "0.3.32" diff --git a/kafka-pipeline/Dockerfile b/kafka-pipeline/Dockerfile index 63a9d7a..c6018b2 100644 --- a/kafka-pipeline/Dockerfile +++ b/kafka-pipeline/Dockerfile @@ -1,31 +1,11 @@ -# Stage 1: Build -FROM rust:1.85-slim-bookworm AS builder - -RUN apt-get update && apt-get install -y \ - pkg-config libssl-dev libpq-dev cmake \ - && rm -rf /var/lib/apt/lists/* - -WORKDIR /build -COPY Cargo.toml Cargo.lock ./ -RUN mkdir src && echo "fn main(){}" > src/main.rs -RUN cargo build --release -RUN rm -f target/release/deps/edr_* - -COPY src ./src -RUN cargo build --release - -# Stage 2: Runtime -FROM debian:bookworm-slim AS runtime - -RUN apt-get update && apt-get install -y \ - libssl3 libpq5 ca-certificates \ - && rm -rf /var/lib/apt/lists/* - -RUN useradd -m -u 1001 -s /bin/bash edr - -COPY --from=builder /build/target/release/edr-kafka-pipeline /usr/local/bin/ -RUN chmod +x /usr/local/bin/edr-kafka-pipeline - -USER edr - -ENTRYPOINT ["edr-kafka-pipeline"] +FROM rust:1.80 AS builder +WORKDIR /usr/src/app +RUN apt-get update && apt-get install -y cmake g++ protobuf-compiler +COPY . . 
+RUN cd kafka-pipeline && cargo build --release + +FROM debian:bookworm-slim +RUN apt-get update && apt-get install -y libssl-dev ca-certificates && rm -rf /var/lib/apt/lists/* +COPY --from=builder /usr/src/app/target/release/edr-kafka-pipeline /usr/local/bin/edr-kafka-pipeline +COPY --from=builder /usr/src/app/target/release/kafka-admin /usr/local/bin/kafka-admin +CMD ["edr-kafka-pipeline"] diff --git a/kafka-pipeline/guide.md b/kafka-pipeline/guide.md new file mode 100644 index 0000000..6db6f5b --- /dev/null +++ b/kafka-pipeline/guide.md @@ -0,0 +1,84 @@ +# Aigis Kafka Pipeline Guide + +This guide covers the setup, testing, troubleshooting, and future development of the `edr-kafka-pipeline` component. + +## 1. Setup & Initialization + +The Kafka pipeline requires a running Kafka broker and pre-configured topics to operate properly. + +### Starting the Infrastructure +Use the provided Docker Compose configuration to spin up the development Kafka cluster (KRaft mode) and Kafka-UI: + +```bash +cd infra +docker compose -f docker-compose.dev.yml up -d +``` + +### Initializing Topics +Once the cluster is running, execute the topic creation script. This script connects to the broker and initializes the partitioned topics for the raw event stream, domain-specific streams, dead-letter queue, and alerts: + +```bash +bash infra/scripts/create-topics.sh +``` + +## 2. Testing the Pipeline + +### Local Development Run +To test the pipeline locally outside of Docker, use Cargo. Ensure you specify the correct binary name (`edr-kafka-pipeline`): + +```bash +cd kafka-pipeline +KAFKA_BROKERS="localhost:29092" cargo run --release --bin edr-kafka-pipeline +``` + +### Simulating End-to-End Routing +You can verify the event router is working by manually producing a message to the raw topic and consuming it from its target routed topic. + +1. 
**Produce a raw event** (this simulates an incoming event from the EDR agent): + ```bash + docker exec -it aigis-kafka-dev bash -c 'echo "{\"event_type\": \"process_start\", \"process_name\": \"malware.exe\"}" | /opt/kafka/bin/kafka-console-producer.sh --bootstrap-server localhost:9092 --topic aigis.events.raw' + ``` + +2. **Consume the routed event** (the pipeline should have instantly moved it to `aigis.events.process`): + ```bash + docker exec -it aigis-kafka-dev /opt/kafka/bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic aigis.events.process --from-beginning --max-messages 1 + ``` + +### Running Unit & Integration Tests +```bash +cargo test -p edr-kafka-pipeline +``` + +## 3. Troubleshooting + +### `CMake` Missing Error During Build +**Symptom:** The `rdkafka-sys` dependency fails to build with an OS Error 2 (No such file or directory) complaining about `cmake`. +**Solution:** The `librdkafka` C-library requires CMake and a C++ compiler to compile statically. +- Ubuntu/Debian: `sudo apt-get install cmake g++` +- macOS: `brew install cmake` + +### Pipeline Fails to Start: "Broker Transport Failure" +**Symptom:** The pipeline logs repeatedly show connection failures to `localhost:29092` or `localhost:9092`. +**Solution:** +- Ensure your `docker-compose` cluster is up. +- Verify `KAFKA_BROKERS` is set correctly. If running the pipeline *inside* Docker Compose, it should be `kafka:9092`. If running it locally on your host, it should be `localhost:29092`. + +### Timeout / Configuration Property Errors +**Symptom:** `Client config error: No such configuration property` +**Solution:** `librdkafka` properties are strictly enforced. Reference the official [librdkafka configuration properties](https://github.com/confluentinc/librdkafka/blob/master/CONFIGURATION.md) and ensure typos (like `fetch.max.wait.ms` instead of `fetch.wait.max.ms`) are avoided in `consumer.rs`. + +## 4. 
Further Development + +### Adding a New Event Type Route +The `EventRouterProcessor` in `src/event_router.rs` maps incoming JSON events to specific Kafka topics. +To add support for a new topic: +1. Ensure the new topic is added to `infra/scripts/create-topics.sh`. +2. Open `src/event_router.rs`. +3. Locate the `route_topic` function. +4. Add a new `match` arm for your new `event_type` string that returns the new target topic name. + +### Creating a New Pipeline Component +The pipeline is designed using a modular `MessageProcessor` trait (`src/consumer.rs`). If you want to create a consumer that does something else entirely (e.g., Anomaly Detection instead of routing): +1. Create a new struct that implements the `MessageProcessor` trait. +2. Override the `process` function to define your custom logic. +3. Instantiate a new `ConsumerWorker` in `main.rs` passing your new processor and its designated source topic. diff --git a/kafka-pipeline/src/bin/kafka-admin.rs b/kafka-pipeline/src/bin/kafka-admin.rs new file mode 100644 index 0000000..d5a2f0b --- /dev/null +++ b/kafka-pipeline/src/bin/kafka-admin.rs @@ -0,0 +1,106 @@ +/// CLI tool for Kafka topic administration +/// Usage: +/// kafka-admin create-topics --brokers localhost:29092 +/// kafka-admin verify-topics --brokers localhost:29092 (not yet implemented) +/// kafka-admin describe-topic --brokers localhost:29092 --topic aigis.events.raw (not yet implemented) +use rdkafka::admin::{AdminClient, AdminOptions, NewTopic, TopicReplication}; +use rdkafka::client::DefaultClientContext; +use rdkafka::config::ClientConfig; +use std::env; + +struct TopicSpec { + name: &'static str, + partitions: i32, + retention_ms: i64, +} + +const TOPICS: &[TopicSpec] = &[ + TopicSpec { + name: "aigis.events.raw", + partitions: 64, + retention_ms: 604800000, + }, + TopicSpec { + name: "aigis.events.process", + partitions: 32, + retention_ms: 1209600000, + }, + TopicSpec { + name: "aigis.events.network", + partitions: 32, + retention_ms: 1209600000, + }, + TopicSpec { + name:
"aigis.events.file", + partitions: 16, + retention_ms: 1209600000, + }, + TopicSpec { + name: "aigis.events.auth", + partitions: 16, + retention_ms: 2592000000, + }, + TopicSpec { + name: "aigis.heartbeats", + partitions: 8, + retention_ms: 259200000, + }, + TopicSpec { + name: "aigis.alerts", + partitions: 8, + retention_ms: 7776000000, + }, + TopicSpec { + name: "aigis.events.dlq", + partitions: 4, + retention_ms: 2592000000, + }, +]; + +#[tokio::main] +async fn main() -> Result<(), Box> { + let args: Vec = env::args().collect(); + if args.len() < 4 { + eprintln!("Usage: kafka-admin --brokers [--topic ]"); + return Ok(()); + } + + let command = &args[1]; + let brokers = &args[3]; // assuming --brokers is args[2] + + let mut config = ClientConfig::new(); + config.set("bootstrap.servers", brokers); + + let admin_client: AdminClient = config.create()?; + + match command.as_str() { + "create-topics" => { + let retention_strings: Vec = + TOPICS.iter().map(|t| t.retention_ms.to_string()).collect(); + let mut new_topics = Vec::new(); + for (i, topic) in TOPICS.iter().enumerate() { + let new_topic = + NewTopic::new(topic.name, topic.partitions, TopicReplication::Fixed(1)) + .set("retention.ms", &retention_strings[i]) + .set("cleanup.policy", "delete"); + new_topics.push(new_topic); + } + + let opts = AdminOptions::new(); + let results = admin_client.create_topics(&new_topics, &opts).await?; + for result in results { + match result { + Ok(topic_name) => println!("Created topic: {}", topic_name), + Err((topic_name, err)) => { + eprintln!("Failed to create topic {}: {:?}", topic_name, err) + } + } + } + } + _ => { + eprintln!("Unknown command: {}", command); + } + } + + Ok(()) +} diff --git a/kafka-pipeline/src/consumer.rs b/kafka-pipeline/src/consumer.rs new file mode 100644 index 0000000..10d627e --- /dev/null +++ b/kafka-pipeline/src/consumer.rs @@ -0,0 +1,100 @@ +#![allow(unused_imports, unused_variables, dead_code, unused_mut)] +use async_trait::async_trait; + +use 
rdkafka::config::ClientConfig; +use rdkafka::consumer::{Consumer, StreamConsumer}; +use rdkafka::message::Message; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, warn}; + +/// Trait for implementing Kafka message processors +#[async_trait] +pub trait MessageProcessor: Send + Sync + 'static { + /// Process a single message. Return Ok(()) on success; on Err, `ConsumerWorker` logs the failure and continues (offsets are auto-committed either way). + async fn process( + &self, + key: Option<&[u8]>, + payload: &[u8], + topic: &str, + partition: i32, + offset: i64, + ) -> Result<(), String>; +} + +/// A consumer worker that reads from a topic and calls a processor +pub struct ConsumerWorker { + consumer: StreamConsumer, + processor: Box, + shutdown: CancellationToken, +} + +impl ConsumerWorker { + pub fn new( + brokers: &str, + group_id: &str, + topics: &[&str], + processor: Box, + shutdown: CancellationToken, + ) -> Result { + let consumer: StreamConsumer = ClientConfig::new() + .set("bootstrap.servers", brokers) + .set("group.id", group_id) + .set("auto.offset.reset", "earliest") + .set("enable.auto.commit", "true") + .set("auto.commit.interval.ms", "1000") + .set("fetch.min.bytes", "1") + .set("fetch.wait.max.ms", "100") + .set("max.poll.interval.ms", "300000") + .set("session.timeout.ms", "45000") + .create() + .map_err(|e| format!("Consumer creation error: {e}"))?; + + consumer + .subscribe(topics) + .map_err(|e| format!("Topic subscription error: {e}"))?; + + Ok(Self { + consumer, + processor, + shutdown, + }) + } + + pub async fn run(&self) { + use tokio_stream::StreamExt; + + info!("Consumer worker started"); + + let stream = self.consumer.stream(); + tokio::pin!(stream); + + loop { + tokio::select!
{ + _ = self.shutdown.cancelled() => { + info!("Consumer worker shutting down"); + break; + } + msg = stream.next() => { + match msg { + Some(Ok(borrowed_msg)) => { + let topic = borrowed_msg.topic(); + let partition = borrowed_msg.partition(); + let offset = borrowed_msg.offset(); + let key = borrowed_msg.key(); + let payload = borrowed_msg.payload().unwrap_or(&[]); + + if let Err(e) = self.processor.process(key, payload, topic, partition, offset).await { + error!(error = %e, topic, partition, offset, "Message processing failed"); + // TODO: Send to DLQ + } + } + Some(Err(e)) => { + error!(error = %e, "Kafka consumer error"); + } + None => break, + } + } + } + } + } +} diff --git a/kafka-pipeline/src/event_router.rs b/kafka-pipeline/src/event_router.rs new file mode 100644 index 0000000..306e8fa --- /dev/null +++ b/kafka-pipeline/src/event_router.rs @@ -0,0 +1,66 @@ +#![allow(unused_imports, unused_variables, dead_code, unused_mut)] +use rdkafka::producer::{FutureProducer, FutureRecord}; +use rdkafka::util::Timeout; +use serde_json::Value; +use std::time::Duration; +use tracing::{debug, warn}; + +use crate::consumer::MessageProcessor; + +/// Routes events from aigis.events.raw to typed topics based on event_type +pub struct EventRouterProcessor { + producer: FutureProducer, +} + +impl EventRouterProcessor { + pub fn new(producer: FutureProducer) -> Self { + Self { producer } + } + + fn route_topic(&self, event_type: &str) -> &str { + match event_type { + "process_start" | "process_end" => "aigis.events.process", + "network_connect" | "network_listen" => "aigis.events.network", + "file_create" | "file_modify" | "file_delete" => "aigis.events.file", + "user_login" | "user_logout" => "aigis.events.auth", + "osquery_result" | "osquery_snapshot" => "aigis.events.process", // default bucket + _ => "aigis.events.raw", // unknown types stay in raw + } + } +} + +#[async_trait::async_trait] +impl MessageProcessor for EventRouterProcessor { + async fn process( + &self, + 
key: Option<&[u8]>, + payload: &[u8], + _topic: &str, + _partition: i32, + _offset: i64, + ) -> Result<(), String> { + // Lightweight JSON peek — only extract event_type field + let event: Value = + serde_json::from_slice(payload).map_err(|e| format!("Invalid JSON: {e}"))?; + + let event_type = event + .get("event_type") + .and_then(|v| v.as_str()) + .unwrap_or("unknown"); + + let target_topic = self.route_topic(event_type); + + // Forward to typed topic + let record = FutureRecord::to(target_topic) + .payload(payload) // Raw bytes, no re-serialization + .key(key.unwrap_or(&[])); + + self.producer + .send(record, Timeout::After(Duration::from_secs(5))) + .await + .map_err(|(e, _)| format!("Kafka send error: {e}"))?; + + debug!(event_type, target_topic, "Event routed"); + Ok(()) + } +} diff --git a/kafka-pipeline/src/health.rs b/kafka-pipeline/src/health.rs new file mode 100644 index 0000000..cb061e5 --- /dev/null +++ b/kafka-pipeline/src/health.rs @@ -0,0 +1,6 @@ +#![allow(dead_code)] + +/// Health check module +pub fn is_healthy() -> bool { + true +} diff --git a/kafka-pipeline/src/lib.rs b/kafka-pipeline/src/lib.rs new file mode 100644 index 0000000..95754d1 --- /dev/null +++ b/kafka-pipeline/src/lib.rs @@ -0,0 +1,2 @@ +pub mod consumer; +pub mod event_router; diff --git a/kafka-pipeline/src/main.rs b/kafka-pipeline/src/main.rs index af212e5..620f411 100644 --- a/kafka-pipeline/src/main.rs +++ b/kafka-pipeline/src/main.rs @@ -1,3 +1,46 @@ -fn main() { - println!("edr-kafka-pipeline"); +use tokio_util::sync::CancellationToken; +use tracing::info; + +pub mod consumer; +pub mod event_router; + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + tracing_subscriber::fmt() + .with_env_filter("info") + .json() + .init(); + + let brokers = std::env::var("KAFKA_BROKERS").unwrap_or_else(|_| "localhost:29092".into()); + let shutdown = CancellationToken::new(); + + // Start event router consumer + let router_producer = rdkafka::config::ClientConfig::new() + 
.set("bootstrap.servers", &brokers) + .set("linger.ms", "5") + .set("compression.type", "lz4") + .create() + .expect("Router producer creation failed"); + + let processor = event_router::EventRouterProcessor::new(router_producer); + let worker = consumer::ConsumerWorker::new( + &brokers, + "aigis-event-router", + &["aigis.events.raw"], + Box::new(processor), + shutdown.clone(), + ) + .map_err(|e| anyhow::anyhow!(e))?; + + let shutdown_signal = shutdown.clone(); + tokio::spawn(async move { + tokio::signal::ctrl_c().await.ok(); + info!("Shutdown signal received"); + shutdown_signal.cancel(); + }); + + worker.run().await; + + info!("Kafka pipeline shut down"); + Ok(()) } diff --git a/kafka-pipeline/src/metrics.rs b/kafka-pipeline/src/metrics.rs new file mode 100644 index 0000000..2a51ef7 --- /dev/null +++ b/kafka-pipeline/src/metrics.rs @@ -0,0 +1,24 @@ +#![allow(dead_code)] + +use rdkafka::consumer::StreamConsumer; +// use rdkafka::topic_partition_list::TopicPartitionList; + +pub struct LagMonitor { + _consumer: StreamConsumer, +} + +impl LagMonitor { + pub fn new(consumer: StreamConsumer) -> Self { + Self { + _consumer: consumer, + } + } + + pub async fn get_consumer_lag(&self, _group_id: &str) -> Result { + // Fetch committed offsets + // Fetch latest offsets (watermarks) + // Calculate difference + // Return total lag + Ok(0) // TODO: actual implementation + } +} diff --git a/kafka-pipeline/tests/integration_test.rs b/kafka-pipeline/tests/integration_test.rs new file mode 100644 index 0000000..2c34638 --- /dev/null +++ b/kafka-pipeline/tests/integration_test.rs @@ -0,0 +1,8 @@ +#[cfg(test)] +mod tests { + #[test] + fn test_pipeline_integration() { + // Test placeholder + // No-op + } +} diff --git a/rule-engine/Dockerfile b/rule-engine/Dockerfile index 7ecd11c..41a1216 100644 --- a/rule-engine/Dockerfile +++ b/rule-engine/Dockerfile @@ -6,13 +6,8 @@ RUN apt-get update && apt-get install -y \ && rm -rf /var/lib/apt/lists/* WORKDIR /build -COPY Cargo.toml Cargo.lock 
./ -RUN mkdir src && echo "fn main(){}" > src/main.rs -RUN cargo build --release -RUN rm -f target/release/deps/edr_* - -COPY src ./src -RUN cargo build --release +COPY . . +RUN cargo build --release -p edr-rule-engine # Stage 2: Runtime FROM debian:bookworm-slim AS runtime diff --git a/sdk/Cargo.toml b/sdk/Cargo.toml index b8b5d38..c112950 100644 --- a/sdk/Cargo.toml +++ b/sdk/Cargo.toml @@ -12,6 +12,8 @@ uuid = { workspace = true } chrono = { workspace = true } prost = { workspace = true } tonic = { workspace = true } +tonic-prost = { workspace = true } +bytes = { workspace = true } [build-dependencies] -tonic-build = { workspace = true } +tonic-prost-build = { workspace = true } diff --git a/sdk/build.rs b/sdk/build.rs new file mode 100644 index 0000000..b4fd430 --- /dev/null +++ b/sdk/build.rs @@ -0,0 +1,11 @@ +fn main() -> Result<(), Box> { + tonic_prost_build::configure().compile_protos( + &[ + "proto/fleet.proto", + "proto/agent.proto", + "proto/events.proto", + ], + &["proto/"], + )?; + Ok(()) +} diff --git a/sdk/src/codec.rs b/sdk/src/codec.rs new file mode 100644 index 0000000..727dc27 --- /dev/null +++ b/sdk/src/codec.rs @@ -0,0 +1,70 @@ +use bytes::{Buf, BufMut}; +use serde::{Deserialize, Serialize}; +use std::marker::PhantomData; +use tonic::Status; +use tonic::codec::{Codec, DecodeBuf, Decoder, EncodeBuf, Encoder}; + +#[derive(Debug, Clone)] +pub struct JsonCodec(PhantomData<(T, U)>); + +impl Default for JsonCodec { + fn default() -> Self { + Self(PhantomData) + } +} + +pub struct JsonEncoder(PhantomData); +pub struct JsonDecoder(PhantomData); + +impl Codec for JsonCodec +where + T: Serialize + Send + 'static, + U: for<'de> Deserialize<'de> + Send + 'static, +{ + type Encode = T; + type Decode = U; + type Encoder = JsonEncoder; + type Decoder = JsonDecoder; + + fn encoder(&mut self) -> Self::Encoder { + JsonEncoder(PhantomData) + } + + fn decoder(&mut self) -> Self::Decoder { + JsonDecoder(PhantomData) + } +} + +impl Encoder for JsonEncoder +where + 
T: Serialize, +{ + type Item = T; + type Error = Status; + + fn encode(&mut self, item: Self::Item, dst: &mut EncodeBuf<'_>) -> Result<(), Self::Error> { + let bytes = serde_json::to_vec(&item).map_err(|e| Status::internal(e.to_string()))?; + dst.put_slice(&bytes); + Ok(()) + } +} + +impl Decoder for JsonDecoder +where + U: for<'de> Deserialize<'de>, +{ + type Item = U; + type Error = Status; + + fn decode(&mut self, src: &mut DecodeBuf<'_>) -> Result, Self::Error> { + if !src.has_remaining() { + return Ok(None); + } + + let len = src.remaining(); + let mut buf = vec![0u8; len]; + src.copy_to_slice(&mut buf); + let item: U = serde_json::from_slice(&buf).map_err(|e| Status::internal(e.to_string()))?; + Ok(Some(item)) + } +} diff --git a/sdk/src/lib.rs b/sdk/src/lib.rs index 533ecd3..22008a1 100644 --- a/sdk/src/lib.rs +++ b/sdk/src/lib.rs @@ -1 +1,14 @@ -// EDR SDK — shared types, proto definitions, and client helpers. +pub mod codec; +pub mod models; + +pub mod proto { + pub mod agent { + tonic::include_proto!("edr.agent"); + } + pub mod events { + tonic::include_proto!("edr.events"); + } + pub mod fleet { + tonic::include_proto!("edr.fleet"); + } +} diff --git a/sdk/src/models/enrollment.rs b/sdk/src/models/enrollment.rs new file mode 100644 index 0000000..763601b --- /dev/null +++ b/sdk/src/models/enrollment.rs @@ -0,0 +1,17 @@ +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct EnrollmentRequest { + pub enrollment_secret: String, + pub hostname: String, + pub os_version: String, + pub agent_version: String, + pub platform: String, +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct EnrollmentResponse { + pub node_id: Uuid, + pub status: String, +} diff --git a/sdk/src/models/envelope.rs b/sdk/src/models/envelope.rs new file mode 100644 index 0000000..f95642b --- /dev/null +++ b/sdk/src/models/envelope.rs @@ -0,0 +1,35 @@ +use chrono::{DateTime, Utc}; +use serde::{Deserialize, 
Serialize}; +use serde_json::Value; +use uuid::Uuid; + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct AgentMessage { + pub message_type: AgentMessageType, + pub payload: Value, + pub timestamp: DateTime, + pub node_id: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq, Eq)] +pub enum AgentMessageType { + EnrollmentRequest, + EventBatch, + Heartbeat, +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct ServerMessage { + pub message_type: ServerMessageType, + pub payload: Value, + pub timestamp: DateTime, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq, Eq)] +pub enum ServerMessageType { + EnrollmentResponse, + HeartbeatResponse, + EventAck, + Command, + Error, +} diff --git a/sdk/src/models/event.rs b/sdk/src/models/event.rs new file mode 100644 index 0000000..4e02222 --- /dev/null +++ b/sdk/src/models/event.rs @@ -0,0 +1,15 @@ +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use uuid::Uuid; + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct EventBatch { + pub node_id: Uuid, + pub events: Vec, +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct EventAck { + pub success: bool, + pub error: Option, +} diff --git a/sdk/src/models/heartbeat.rs b/sdk/src/models/heartbeat.rs new file mode 100644 index 0000000..4407825 --- /dev/null +++ b/sdk/src/models/heartbeat.rs @@ -0,0 +1,13 @@ +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct HeartbeatRequest { + pub node_id: String, + pub status: String, + pub events_buffered: i64, +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct HeartbeatResponse { + pub ok: bool, +} diff --git a/sdk/src/models/mod.rs b/sdk/src/models/mod.rs new file mode 100644 index 0000000..270437e --- /dev/null +++ b/sdk/src/models/mod.rs @@ -0,0 +1,4 @@ +pub mod enrollment; +pub mod envelope; +pub mod event; +pub mod heartbeat; diff --git a/typos.toml b/typos.toml new file mode 
100644 index 0000000..e23ab02 --- /dev/null +++ b/typos.toml @@ -0,0 +1,38 @@ +[files] +extend-exclude = [ + ".git", + "target", + "Cargo.lock", + ".sqlx", + "node_modules" +] + +[default.extend-words] +aigis = "aigis" +osquery = "osquery" +osqueryd = "osqueryd" +rusqlite = "rusqlite" +grpc = "grpc" +nftables = "nftables" +ebpf = "ebpf" +sysctl = "sysctl" +dbus = "dbus" +dotenv = "dotenv" +dotenvy = "dotenvy" +ulimit = "ulimit" +ulimits = "ulimits" +uuid = "uuid" +prost = "prost" +tonic = "tonic" +rdkafka = "rdkafka" +sqlx = "sqlx" +bpf = "bpf" +inotify = "inotify" +cgroup = "cgroup" +cgroups = "cgroups" +wrk = "wrk" +iif = "iif" +oif = "oif" +flate = "flate" +flate2 = "flate2" +pn = "pn"