diff --git a/.github/workflows/terraform.yml b/.github/workflows/terraform.yml
new file mode 100644
index 00000000..7cad8d1b
--- /dev/null
+++ b/.github/workflows/terraform.yml
@@ -0,0 +1,58 @@
+name: Terraform
+
+on:
+  pull_request:
+    branches: [main]
+    paths:
+      - "infra/terraform/**"
+      - ".github/workflows/terraform.yml"
+  push:
+    branches: [main]
+    paths:
+      - "infra/terraform/**"
+      - ".github/workflows/terraform.yml"
+
+# Validation only reads the repository, so restrict GITHUB_TOKEN to read access.
+permissions:
+  contents: read
+
+env:
+  # Never block a CI run waiting on an interactive Terraform prompt.
+  TF_INPUT: "0"
+  TF_IN_AUTOMATION: "true"
+
+jobs:
+  terraform:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Terraform
+        uses: hashicorp/setup-terraform@v3
+        with:
+          # Quoted so the version is always read as a string.
+          terraform_version: "1.5.7"
+
+      - name: Terraform fmt check
+        run: terraform -chdir=infra/terraform fmt -check -recursive
+
+      - name: Terraform init (root)
+        run: terraform -chdir=infra/terraform init -backend=false
+
+      - name: Terraform validate (root)
+        run: terraform -chdir=infra/terraform validate
+
+      - name: Terraform validate modules
+        run: |
+          for module in network runner_vm secrets observability; do
+            terraform -chdir="infra/terraform/modules/${module}" init -backend=false
+            terraform -chdir="infra/terraform/modules/${module}" validate
+          done
+
+      - name: Terraform validate environments
+        run: |
+          for env in dev prod; do
+            terraform -chdir="infra/terraform/environments/${env}" init -backend=false
+            terraform -chdir="infra/terraform/environments/${env}" validate
+          done
diff --git a/docs/architecture/start-here.mdx b/docs/architecture/start-here.mdx
index 926d84d9..e4abc71a 100644
--- a/docs/architecture/start-here.mdx
+++ b/docs/architecture/start-here.mdx
@@ -48,7 +48,8 @@ Data flows top to bottom: you ingest markets (and optional context), transform t
 1. **Bootstrap the environment** – follow `getting-started` to install, load credentials, and smoke test the APIs.
 2. **Run the quickstart bot** – `workflows/build-first-bot` glues market fetch → signal → paper execution so you see the system end-to-end.
 3. 
**Deep dive where needed** – branch into the stack-specific docs from the table above. -4. **Promote to production** – use `workflows/promotion-checklist` once your strategy, risk settings, and monitoring are dialed in. +4. **Plan infrastructure topology** – use `basics/infrastructure` and `workflows/terraform-runbook` for OSS infra baseline and environment operations. +5. **Promote to production** – use `workflows/promotion-checklist` once your strategy, risk settings, and monitoring are dialed in. ## Tips for exploring @@ -63,4 +64,4 @@ Bookmark this page and `getting-started`; together they give you both the 10,000 - Get hands-on immediately: `getting-started` - Review infrastructure dependencies: `basics/infrastructure` - Jump to the quickstart bot: `workflows/build-first-bot` - +- Manage IaC operations: `workflows/terraform-runbook` diff --git a/docs/basics/infrastructure.mdx b/docs/basics/infrastructure.mdx index 71169ce4..e8705ced 100644 --- a/docs/basics/infrastructure.mdx +++ b/docs/basics/infrastructure.mdx @@ -14,7 +14,16 @@ Summarize the external services Neural touches (REST, WebSocket, FIX), their lat | FIX API | `fix.elections.kalshi.com:8228` | Ultra-low-latency order entry and execution reports | ✅ operational | | WebSocket | `/trade-api/ws/v2` | Real-time market data stream | ⚠️ requires Kalshi approval | Latency reference: REST polling at 1s intervals, FIX round-trips ~5–10 ms, WebSocket delivers pushes \<100 ms once enabled. + +## Deployment split model + +Neural infrastructure can be split cleanly by responsibility: + +- **Open-source baseline**: Terraform modules for network, runner VM, secrets, and observability. +- **Private runtime**: Environment-specific deployment providers (for example, Daytona-based runtimes) that plug into the shared deployment interface. 
+ +This keeps infrastructure reproducible in OSS while preserving proprietary runtime orchestration logic in private repositories. ## Deployment runtime model @@ -64,3 +73,4 @@ REST polling (baseline) ─┬─> Strategy / Aggregator ──> TradingClient - Review execution options: `trading/overview` - Plan deployment workflows: `workflows/promotion-checklist` - Build custom runtime integrations: `workflows/deployment-providers` +- Operate Terraform environments: `workflows/terraform-runbook` diff --git a/docs/mint.json b/docs/mint.json index db846e0e..e4ac2996 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -107,7 +107,8 @@ "workflows/build-first-bot", "workflows/promotion-checklist", "workflows/data-pipeline", - "workflows/deployment-providers" + "workflows/deployment-providers", + "workflows/terraform-runbook" ] }, { diff --git a/docs/workflows/promotion-checklist.mdx b/docs/workflows/promotion-checklist.mdx index 8ce5841a..3322e997 100644 --- a/docs/workflows/promotion-checklist.mdx +++ b/docs/workflows/promotion-checklist.mdx @@ -53,4 +53,4 @@ Consistently iterating through this loop keeps strategies resilient as Kalshi mi - Wire monitoring pipelines: `workflows/data-pipeline` - Review execution surfaces: `trading/trading-client` - Keep iterating on strategy design: `analysis/strategy-foundations` - +- Use Terraform operations runbook: `workflows/terraform-runbook` diff --git a/docs/workflows/terraform-runbook.mdx b/docs/workflows/terraform-runbook.mdx new file mode 100644 index 00000000..2e0a492a --- /dev/null +++ b/docs/workflows/terraform-runbook.mdx @@ -0,0 +1,91 @@ +--- +title: 'Terraform Deployment Runbook' +description: 'Bootstrap, plan, apply, and troubleshoot the GCP Terraform environments for Neural.' +--- + +Use this runbook when operating the reference Terraform stack under `infra/terraform`. + +## 1. 
Prerequisites + +- Terraform `1.5.x` +- GCP project with billing enabled +- `gcloud` authenticated to the target project +- IAM permissions for VPC, Compute Engine, Secret Manager, and Monitoring resources + +## 2. Bootstrap remote state + +Create a dedicated state bucket (one-time): + +```bash +gcloud storage buckets create gs://neural-tf-state-prod \ + --project "${PROJECT_ID}" \ + --location us-central1 \ + --uniform-bucket-level-access + +gcloud storage buckets update gs://neural-tf-state-prod --versioning +``` + +Initialize the production environment with backend config: + +```bash +cd infra/terraform/environments/prod +terraform init \ + -backend-config="bucket=neural-tf-state-prod" \ + -backend-config="prefix=neural/prod" +``` + +Repeat with a different bucket/prefix for `dev`. + +## 3. Plan and apply + +Create `terraform.tfvars` from `terraform.tfvars.example`, then run: + +```bash +terraform fmt -check -recursive +terraform validate +terraform plan -out=tfplan +terraform apply tfplan +``` + +Recommended safety controls: + +1. Keep `plan` output in PR artifacts before apply. +2. Require manual approval for `prod` applies. +3. Use separate service accounts for `dev` and `prod`. + +## 4. Secret injection model + +The reference stack provisions Secret Manager containers and grants runner access. +Add secret **versions** outside Terraform to avoid storing secret values in state: + +```bash +echo -n "${KALSHI_API_KEY_ID}" | gcloud secrets versions add kalshi-api-key-id --data-file=- +gcloud secrets versions add kalshi-private-key-pem --data-file="${KALSHI_PRIVATE_KEY_PEM_PATH}" +``` + +In runtime bootstrap scripts, resolve secrets at startup using the runner service account. + +## 5. Destroy workflow + +```bash +terraform plan -destroy -out=tfdestroy +terraform apply tfdestroy +``` + +Before destroy: + +1. Drain/disable bot workloads. +2. Export required logs and runtime artifacts. +3. Confirm no shared resources are referenced by other environments. + +## 6. 
Troubleshooting + +- **`terraform init` backend errors**: verify bucket name, region, and IAM access. +- **Provider auth failures**: run `gcloud auth application-default login` or configure workload identity. +- **Secret access denied**: verify `roles/secretmanager.secretAccessor` on the runner service account. +- **No alert notifications**: ensure `notification_channels` are valid Monitoring channel resource IDs. + +## Next + +- Module and contract reference: `basics/infrastructure` +- Production promotion checklist: `workflows/promotion-checklist` diff --git a/infra/terraform/README.md b/infra/terraform/README.md index fdbfc148..0069750e 100644 --- a/infra/terraform/README.md +++ b/infra/terraform/README.md @@ -22,6 +22,7 @@ Inputs: - `subnet_cidr` (string) - `enable_private_google_access` (bool, default `true`) - `allow_ssh_cidrs` (list(string), default `[]`) +- `ssh_target_tags` (list(string), default `["neural-runner"]`) - `internal_tcp_ports` (list(string), default `[]`) - `internal_udp_ports` (list(string), default `[]`) diff --git a/infra/terraform/environments/dev/.terraform.lock.hcl b/infra/terraform/environments/dev/.terraform.lock.hcl new file mode 100644 index 00000000..3acc57c2 --- /dev/null +++ b/infra/terraform/environments/dev/.terraform.lock.hcl @@ -0,0 +1,22 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/hashicorp/google" { + version = "5.45.2" + constraints = "~> 5.0" + hashes = [ + "h1:iy2Q9VcnMu4z/bH3v/NmI/nEpgYY7bXgJmT/hVTAUS4=", + "zh:0d09c8f20b556305192cdbe0efa6d333ceebba963a8ba91f9f1714b5a20c4b7a", + "zh:117143fc91be407874568df416b938a6896f94cb873f26bba279cedab646a804", + "zh:16ccf77d18dd2c5ef9c0625f9cf546ebdf3213c0a452f432204c69feed55081e", + "zh:3e555cf22a570a4bd247964671f421ed7517970cd9765ceb46f335edc2c6f392", + "zh:688bd5b05a75124da7ae6e885b2b92bd29f4261808b2b78bd5f51f525c1052ca", + "zh:6db3ef37a05010d82900bfffb3261c59a0c247e0692049cb3eb8c2ef16c9d7bf", + "zh:70316fde75f6a15d72749f66d994ccbdde5f5ed4311b6d06b99850f698c9bbf9", + "zh:84b8e583771a4f2bd514e519d98ed7fd28dce5efe0634e973170e1cfb5556fb4", + "zh:9d4b8ef0a9b6677935c604d94495042e68ff5489932cfd1ec41052e094a279d3", + "zh:a2089dd9bd825c107b148dd12d6b286f71aa37dfd4ca9c35157f2dcba7bc19d8", + "zh:f03d795c0fd9721e59839255ee7ba7414173017dc530b4ce566daf3802a0d6dd", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + ] +} diff --git a/infra/terraform/environments/dev/main.tf b/infra/terraform/environments/dev/main.tf new file mode 100644 index 00000000..ceb814dd --- /dev/null +++ b/infra/terraform/environments/dev/main.tf @@ -0,0 +1,64 @@ +terraform { + required_version = ">= 1.5.0, < 2.0.0" + + required_providers { + google = { + source = "hashicorp/google" + version = "~> 5.0" + } + } + + # Configure this with backend config flags or a backend.hcl file during init. 
+ # Example: + # terraform init -backend-config="bucket=neural-tf-state-dev" -backend-config="prefix=neural/dev" + backend "gcs" {} +} + +provider "google" { + project = var.project_id + region = var.region + zone = var.zone +} + +module "network" { + source = "../../modules/network" + + project_id = var.project_id + region = var.region + network_name = "${var.stack_name}-vpc" + subnet_name = "${var.stack_name}-subnet" + subnet_cidr = var.subnet_cidr + + allow_ssh_cidrs = var.allow_ssh_cidrs +} + +module "runner" { + source = "../../modules/runner_vm" + + project_id = var.project_id + zone = var.zone + instance_name = "${var.stack_name}-runner" + machine_type = var.machine_type + network_self_link = module.network.network_self_link + subnetwork_self_link = module.network.subnetwork_self_link + + startup_script = var.startup_script + tags = ["neural-runner", "env-dev"] +} + +module "secrets" { + source = "../../modules/secrets" + + project_id = var.project_id + secret_ids = var.secret_ids + runner_service_account_email = module.runner.service_account_email +} + +module "observability" { + source = "../../modules/observability" + + project_id = var.project_id + instance_name = module.runner.instance_name + enable_alert_policy = var.enable_alert_policy + notification_channels = var.notification_channels +} diff --git a/infra/terraform/environments/dev/outputs.tf b/infra/terraform/environments/dev/outputs.tf new file mode 100644 index 00000000..795c15b7 --- /dev/null +++ b/infra/terraform/environments/dev/outputs.tf @@ -0,0 +1,24 @@ +output "runner_instance_name" { + value = module.runner.instance_name + description = "Compute instance name" +} + +output "runner_external_ip" { + value = module.runner.instance_external_ip + description = "External IP of runner instance" +} + +output "runner_service_account_email" { + value = module.runner.service_account_email + description = "Runner service account email" +} + +output "secret_ids" { + value = module.secrets.secret_ids 
+ description = "Provisioned Secret Manager IDs" +} + +output "log_metric_type" { + value = module.observability.log_metric_type + description = "Log metric used for error alerting" +} diff --git a/infra/terraform/environments/dev/terraform.tfvars.example b/infra/terraform/environments/dev/terraform.tfvars.example new file mode 100644 index 00000000..67d262b5 --- /dev/null +++ b/infra/terraform/environments/dev/terraform.tfvars.example @@ -0,0 +1,9 @@ +project_id = "my-gcp-project" +region = "us-central1" +zone = "us-central1-a" + +allow_ssh_cidrs = ["35.235.240.0/20"] # IAP TCP tunnel range + +# Optional: enable alerting and wire notification channels +# enable_alert_policy = true +# notification_channels = ["projects/my-gcp-project/notificationChannels/1234567890"] diff --git a/infra/terraform/environments/dev/variables.tf b/infra/terraform/environments/dev/variables.tf new file mode 100644 index 00000000..62456a14 --- /dev/null +++ b/infra/terraform/environments/dev/variables.tf @@ -0,0 +1,77 @@ +variable "project_id" { + description = "GCP project ID" + type = string +} + +variable "region" { + description = "GCP region" + type = string + default = "us-central1" +} + +variable "zone" { + description = "GCP zone" + type = string + default = "us-central1-a" +} + +variable "stack_name" { + description = "Prefix for resource naming" + type = string + default = "neural-dev" +} + +variable "subnet_cidr" { + description = "Subnet CIDR range" + type = string + default = "10.30.0.0/24" +} + +variable "allow_ssh_cidrs" { + description = "CIDR blocks allowed for SSH" + type = list(string) + default = ["35.235.240.0/20"] +} + +variable "machine_type" { + description = "Runner machine type" + type = string + default = "e2-standard-2" +} + +variable "secret_ids" { + description = "Secret IDs to provision in Secret Manager" + type = set(string) + default = [ + "kalshi-api-key-id", + "kalshi-private-key-pem", + ] +} + +variable "enable_alert_policy" { + description = "Whether 
to create an alert policy" + type = bool + default = false +} + +variable "notification_channels" { + description = "Alert notification channel IDs" + type = list(string) + default = [] +} + +variable "startup_script" { + description = "Startup script for docker bootstrap" + type = string + default = <<-EOT + #!/usr/bin/env bash + set -euo pipefail + + apt-get update + apt-get install -y docker.io + systemctl enable docker + systemctl start docker + + echo "Neural dev runner bootstrap complete" > /var/log/neural-bootstrap.log + EOT +} diff --git a/infra/terraform/environments/prod/.terraform.lock.hcl b/infra/terraform/environments/prod/.terraform.lock.hcl new file mode 100644 index 00000000..3acc57c2 --- /dev/null +++ b/infra/terraform/environments/prod/.terraform.lock.hcl @@ -0,0 +1,22 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. + +provider "registry.terraform.io/hashicorp/google" { + version = "5.45.2" + constraints = "~> 5.0" + hashes = [ + "h1:iy2Q9VcnMu4z/bH3v/NmI/nEpgYY7bXgJmT/hVTAUS4=", + "zh:0d09c8f20b556305192cdbe0efa6d333ceebba963a8ba91f9f1714b5a20c4b7a", + "zh:117143fc91be407874568df416b938a6896f94cb873f26bba279cedab646a804", + "zh:16ccf77d18dd2c5ef9c0625f9cf546ebdf3213c0a452f432204c69feed55081e", + "zh:3e555cf22a570a4bd247964671f421ed7517970cd9765ceb46f335edc2c6f392", + "zh:688bd5b05a75124da7ae6e885b2b92bd29f4261808b2b78bd5f51f525c1052ca", + "zh:6db3ef37a05010d82900bfffb3261c59a0c247e0692049cb3eb8c2ef16c9d7bf", + "zh:70316fde75f6a15d72749f66d994ccbdde5f5ed4311b6d06b99850f698c9bbf9", + "zh:84b8e583771a4f2bd514e519d98ed7fd28dce5efe0634e973170e1cfb5556fb4", + "zh:9d4b8ef0a9b6677935c604d94495042e68ff5489932cfd1ec41052e094a279d3", + "zh:a2089dd9bd825c107b148dd12d6b286f71aa37dfd4ca9c35157f2dcba7bc19d8", + "zh:f03d795c0fd9721e59839255ee7ba7414173017dc530b4ce566daf3802a0d6dd", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + ] +} diff --git 
a/infra/terraform/environments/prod/main.tf b/infra/terraform/environments/prod/main.tf new file mode 100644 index 00000000..42fc160b --- /dev/null +++ b/infra/terraform/environments/prod/main.tf @@ -0,0 +1,64 @@ +terraform { + required_version = ">= 1.5.0, < 2.0.0" + + required_providers { + google = { + source = "hashicorp/google" + version = "~> 5.0" + } + } + + # Configure this with backend config flags or a backend.hcl file during init. + # Example: + # terraform init -backend-config="bucket=neural-tf-state-prod" -backend-config="prefix=neural/prod" + backend "gcs" {} +} + +provider "google" { + project = var.project_id + region = var.region + zone = var.zone +} + +module "network" { + source = "../../modules/network" + + project_id = var.project_id + region = var.region + network_name = "${var.stack_name}-vpc" + subnet_name = "${var.stack_name}-subnet" + subnet_cidr = var.subnet_cidr + + allow_ssh_cidrs = var.allow_ssh_cidrs +} + +module "runner" { + source = "../../modules/runner_vm" + + project_id = var.project_id + zone = var.zone + instance_name = "${var.stack_name}-runner" + machine_type = var.machine_type + network_self_link = module.network.network_self_link + subnetwork_self_link = module.network.subnetwork_self_link + + startup_script = var.startup_script + tags = ["neural-runner", "env-prod"] +} + +module "secrets" { + source = "../../modules/secrets" + + project_id = var.project_id + secret_ids = var.secret_ids + runner_service_account_email = module.runner.service_account_email +} + +module "observability" { + source = "../../modules/observability" + + project_id = var.project_id + instance_name = module.runner.instance_name + enable_alert_policy = var.enable_alert_policy + notification_channels = var.notification_channels +} diff --git a/infra/terraform/environments/prod/outputs.tf b/infra/terraform/environments/prod/outputs.tf new file mode 100644 index 00000000..795c15b7 --- /dev/null +++ b/infra/terraform/environments/prod/outputs.tf @@ -0,0 
+1,24 @@ +output "runner_instance_name" { + value = module.runner.instance_name + description = "Compute instance name" +} + +output "runner_external_ip" { + value = module.runner.instance_external_ip + description = "External IP of runner instance" +} + +output "runner_service_account_email" { + value = module.runner.service_account_email + description = "Runner service account email" +} + +output "secret_ids" { + value = module.secrets.secret_ids + description = "Provisioned Secret Manager IDs" +} + +output "log_metric_type" { + value = module.observability.log_metric_type + description = "Log metric used for error alerting" +} diff --git a/infra/terraform/environments/prod/terraform.tfvars.example b/infra/terraform/environments/prod/terraform.tfvars.example new file mode 100644 index 00000000..58cdf14f --- /dev/null +++ b/infra/terraform/environments/prod/terraform.tfvars.example @@ -0,0 +1,8 @@ +project_id = "my-gcp-project" +region = "us-central1" +zone = "us-central1-f" + +allow_ssh_cidrs = ["35.235.240.0/20"] # IAP TCP tunnel range + +enable_alert_policy = true +notification_channels = ["projects/my-gcp-project/notificationChannels/1234567890"] diff --git a/infra/terraform/environments/prod/variables.tf b/infra/terraform/environments/prod/variables.tf new file mode 100644 index 00000000..ff7ba985 --- /dev/null +++ b/infra/terraform/environments/prod/variables.tf @@ -0,0 +1,78 @@ +variable "project_id" { + description = "GCP project ID" + type = string +} + +variable "region" { + description = "GCP region" + type = string + default = "us-central1" +} + +variable "zone" { + description = "GCP zone" + type = string + default = "us-central1-f" +} + +variable "stack_name" { + description = "Prefix for resource naming" + type = string + default = "neural-prod" +} + +variable "subnet_cidr" { + description = "Subnet CIDR range" + type = string + default = "10.40.0.0/24" +} + +variable "allow_ssh_cidrs" { + description = "CIDR blocks allowed for SSH" + type = 
list(string) + default = ["35.235.240.0/20"] +} + +variable "machine_type" { + description = "Runner machine type" + type = string + default = "e2-standard-4" +} + +variable "secret_ids" { + description = "Secret IDs to provision in Secret Manager" + type = set(string) + default = [ + "kalshi-api-key-id", + "kalshi-private-key-pem", + "neural-runtime-env", + ] +} + +variable "enable_alert_policy" { + description = "Whether to create an alert policy" + type = bool + default = true +} + +variable "notification_channels" { + description = "Alert notification channel IDs" + type = list(string) + default = [] +} + +variable "startup_script" { + description = "Startup script for docker bootstrap" + type = string + default = <<-EOT + #!/usr/bin/env bash + set -euo pipefail + + apt-get update + apt-get install -y docker.io + systemctl enable docker + systemctl start docker + + echo "Neural prod runner bootstrap complete" > /var/log/neural-bootstrap.log + EOT +} diff --git a/infra/terraform/modules/network/main.tf b/infra/terraform/modules/network/main.tf index 01a02be6..d7bae1e5 100644 --- a/infra/terraform/modules/network/main.tf +++ b/infra/terraform/modules/network/main.tf @@ -55,4 +55,5 @@ resource "google_compute_firewall" "allow_ssh" { } source_ranges = var.allow_ssh_cidrs + target_tags = var.ssh_target_tags } diff --git a/infra/terraform/modules/network/variables.tf b/infra/terraform/modules/network/variables.tf index 2e8d6503..eb7e1d5f 100644 --- a/infra/terraform/modules/network/variables.tf +++ b/infra/terraform/modules/network/variables.tf @@ -35,6 +35,12 @@ variable "allow_ssh_cidrs" { default = [] } +variable "ssh_target_tags" { + description = "Network tags that SSH access rules apply to" + type = list(string) + default = ["neural-runner"] +} + variable "internal_tcp_ports" { description = "TCP ports allowed for east-west traffic inside the subnet" type = list(string)