diff --git a/.gitignore b/.gitignore
index 78bfbe9c..69a6ee80 100644
--- a/.gitignore
+++ b/.gitignore
@@ -70,6 +70,8 @@ dmyp.json
 
 # Secrets and Credentials (CRITICAL - NEVER COMMIT)
 secrets/
+!infra/terraform/modules/secrets/
+!infra/terraform/modules/secrets/*.tf
 *.pem
 *.key
 *_private_key.pem
@@ -121,4 +123,4 @@ site/
 # Temporary files
 tmp/
 temp/
-*.tmp
\ No newline at end of file
+*.tmp
diff --git a/infra/terraform/.terraform.lock.hcl b/infra/terraform/.terraform.lock.hcl
new file mode 100644
index 00000000..3acc57c2
--- /dev/null
+++ b/infra/terraform/.terraform.lock.hcl
@@ -0,0 +1,22 @@
+# This file is maintained automatically by "terraform init".
+# Manual edits may be lost in future updates.
+
+provider "registry.terraform.io/hashicorp/google" {
+  version     = "5.45.2"
+  constraints = "~> 5.0"
+  hashes = [
+    "h1:iy2Q9VcnMu4z/bH3v/NmI/nEpgYY7bXgJmT/hVTAUS4=",
+    "zh:0d09c8f20b556305192cdbe0efa6d333ceebba963a8ba91f9f1714b5a20c4b7a",
+    "zh:117143fc91be407874568df416b938a6896f94cb873f26bba279cedab646a804",
+    "zh:16ccf77d18dd2c5ef9c0625f9cf546ebdf3213c0a452f432204c69feed55081e",
+    "zh:3e555cf22a570a4bd247964671f421ed7517970cd9765ceb46f335edc2c6f392",
+    "zh:688bd5b05a75124da7ae6e885b2b92bd29f4261808b2b78bd5f51f525c1052ca",
+    "zh:6db3ef37a05010d82900bfffb3261c59a0c247e0692049cb3eb8c2ef16c9d7bf",
+    "zh:70316fde75f6a15d72749f66d994ccbdde5f5ed4311b6d06b99850f698c9bbf9",
+    "zh:84b8e583771a4f2bd514e519d98ed7fd28dce5efe0634e973170e1cfb5556fb4",
+    "zh:9d4b8ef0a9b6677935c604d94495042e68ff5489932cfd1ec41052e094a279d3",
+    "zh:a2089dd9bd825c107b148dd12d6b286f71aa37dfd4ca9c35157f2dcba7bc19d8",
+    "zh:f03d795c0fd9721e59839255ee7ba7414173017dc530b4ce566daf3802a0d6dd",
+    "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c",
+  ]
+}
diff --git a/infra/terraform/README.md b/infra/terraform/README.md
new file mode 100644
index 00000000..fdbfc148
--- /dev/null
+++ b/infra/terraform/README.md
@@ -0,0 +1,94 @@
+# Neural Terraform Baseline (GCP)
+
+This directory contains a reference Terraform baseline for running Neural bots on GCP.
+It is designed as a starting point for teams that want reproducible infrastructure around
+Docker-based execution.
+
+## Modules
+
+- `modules/network`: VPC, subnet, and baseline firewall rules.
+- `modules/runner_vm`: Compute Engine runner VM with Docker-friendly bootstrap hooks.
+- `modules/secrets`: Secret Manager secret containers and runner service account access grants.
+- `modules/observability`: Log-based metric and optional alert policy for runtime errors.
+
+## Module contracts
+
+### `network`
+Inputs:
+- `project_id` (string)
+- `region` (string)
+- `network_name` (string)
+- `subnet_name` (string)
+- `subnet_cidr` (string)
+- `enable_private_google_access` (bool, default `true`)
+- `allow_ssh_cidrs` (list(string), default `[]`)
+- `internal_tcp_ports` (list(string), default `[]`)
+- `internal_udp_ports` (list(string), default `[]`)
+
+Outputs:
+- `network_name`
+- `network_self_link`
+- `subnetwork_name`
+- `subnetwork_self_link`
+
+### `runner_vm`
+Inputs:
+- `project_id` (string)
+- `zone` (string)
+- `instance_name` (string)
+- `machine_type` (string, default `e2-standard-2`)
+- `network_self_link` (string)
+- `subnetwork_self_link` (string)
+- `create_service_account` (bool, default `true`)
+- `service_account_id` (string, default `neural-runner`)
+- `service_account_email` (string, required when `create_service_account=false`)
+- `service_account_scopes` (list(string), default logging/monitoring/container-pull scopes; add Secret Manager scope if needed)
+- `assign_public_ip` (bool, default `true`)
+- `startup_script` (string, default installs and enables Docker via apt)
+- `metadata` (map(string), default `{}`)
+- `tags` (list(string), default `["neural-runner"]`)
+- `boot_image` (string, default Debian 12 family image)
+- `boot_disk_size_gb` (number, default `50`)
+- `boot_disk_type` (string, default `pd-balanced`)
+
+Outputs:
+- `instance_name`
+- `instance_self_link`
+- `instance_external_ip`
+- `service_account_email`
+
+### `secrets`
+Inputs:
+- `project_id` (string)
+- `secret_ids` (set(string))
+- `runner_service_account_email` (string)
+
+Outputs:
+- `secret_ids`
+- `secret_resource_ids`
+
+### `observability`
+Inputs:
+- `project_id` (string)
+- `metric_name` (string, default `neural_runner_error_count`)
+- `instance_name` (string)
+- `enable_alert_policy` (bool, default `false`)
+- `notification_channels` (list(string), default `[]`)
+
+Outputs:
+- `log_metric_name`
+- `log_metric_type`
+- `alert_policy_id`
+
+## Usage
+
+Wire these modules from environment stacks (added in PR-3) and run:
+
+```bash
+terraform init
+terraform fmt -check -recursive
+terraform validate
+```
+
+This baseline intentionally avoids provider-specific app deployment logic so teams can swap
+the runtime bootstrap (Docker, private providers, or orchestrators) without rewriting core IaC.
diff --git a/infra/terraform/modules/network/main.tf b/infra/terraform/modules/network/main.tf
new file mode 100644
index 00000000..01a02be6
--- /dev/null
+++ b/infra/terraform/modules/network/main.tf
@@ -0,0 +1,63 @@
+# Network module: custom-mode VPC, one regional subnet, and baseline firewall rules.
+
+resource "google_compute_network" "this" {
+  project                 = var.project_id
+  name                    = var.network_name
+  auto_create_subnetworks = false
+  routing_mode            = "GLOBAL"
+}
+
+resource "google_compute_subnetwork" "this" {
+  project                  = var.project_id
+  region                   = var.region
+  name                     = var.subnet_name
+  ip_cidr_range            = var.subnet_cidr
+  network                  = google_compute_network.this.id
+  private_ip_google_access = var.enable_private_google_access
+}
+
+# East-west traffic sourced from the subnet CIDR: ICMP always, TCP/UDP only on the listed ports.
+resource "google_compute_firewall" "allow_internal" {
+  project = var.project_id
+  name    = "${var.network_name}-allow-internal"
+  network = google_compute_network.this.name
+
+  dynamic "allow" {
+    for_each = length(var.internal_tcp_ports) > 0 ? [1] : []
+    content {
+      protocol = "tcp"
+      ports    = var.internal_tcp_ports
+    }
+  }
+
+  dynamic "allow" {
+    for_each = length(var.internal_udp_ports) > 0 ? [1] : []
+    content {
+      protocol = "udp"
+      ports    = var.internal_udp_ports
+    }
+  }
+
+  allow {
+    protocol = "icmp"
+  }
+
+  source_ranges = [var.subnet_cidr]
+}
+
+# Optional SSH ingress, created only when allow_ssh_cidrs is non-empty. No target_tags
+# are set, so the rule applies to every instance on this network, not only tagged ones.
+resource "google_compute_firewall" "allow_ssh" {
+  count = length(var.allow_ssh_cidrs) > 0 ? 1 : 0
+
+  project = var.project_id
+  name    = "${var.network_name}-allow-ssh"
+  network = google_compute_network.this.name
+
+  allow {
+    protocol = "tcp"
+    ports    = ["22"]
+  }
+
+  source_ranges = var.allow_ssh_cidrs
+}
diff --git a/infra/terraform/modules/network/outputs.tf b/infra/terraform/modules/network/outputs.tf
new file mode 100644
index 00000000..aa8b4555
--- /dev/null
+++ b/infra/terraform/modules/network/outputs.tf
@@ -0,0 +1,19 @@
+output "network_name" {
+  description = "VPC network name"
+  value       = google_compute_network.this.name
+}
+
+output "network_self_link" {
+  description = "VPC network self link"
+  value       = google_compute_network.this.self_link
+}
+
+output "subnetwork_name" {
+  description = "Subnet name"
+  value       = google_compute_subnetwork.this.name
+}
+
+output "subnetwork_self_link" {
+  description = "Subnet self link"
+  value       = google_compute_subnetwork.this.self_link
+}
diff --git a/infra/terraform/modules/network/variables.tf b/infra/terraform/modules/network/variables.tf
new file mode 100644
index 00000000..2e8d6503
--- /dev/null
+++ b/infra/terraform/modules/network/variables.tf
@@ -0,0 +1,48 @@
+variable "project_id" {
+  description = "GCP project ID"
+  type        = string
+}
+
+variable "region" {
+  description = "GCP region for the subnet"
+  type        = string
+}
+
+variable "network_name" {
+  description = "VPC network name"
+  type        = string
+}
+
+variable "subnet_name" {
+  description = "Subnet name"
+  type        = string
+}
+
+variable "subnet_cidr" {
+  description = "CIDR range for the subnet"
+  type        = string
+}
+
+variable "enable_private_google_access" {
+  description = "Whether Private Google Access is enabled on the subnet"
+  type        = bool
+  default     = true
+}
+
+variable "allow_ssh_cidrs" {
+  description = "CIDR blocks allowed to SSH (tcp/22) to instances on this network; an empty list disables the rule"
+  type        = list(string)
+  default     = []
+}
+
+variable "internal_tcp_ports" {
+  description = "TCP ports allowed for east-west traffic inside the subnet"
+  type        = list(string)
+  default     = []
+}
+
+variable "internal_udp_ports" {
+  description = "UDP ports allowed for east-west traffic inside the subnet"
+  type        = list(string)
+  default     = []
+}
diff --git a/infra/terraform/modules/observability/main.tf b/infra/terraform/modules/observability/main.tf
new file mode 100644
index 00000000..9a54d0b3
--- /dev/null
+++ b/infra/terraform/modules/observability/main.tf
@@ -0,0 +1,61 @@
+# Observability module: log-based error counter for the runner VM plus an optional alert policy.
+
+resource "google_logging_metric" "runner_errors" {
+  project = var.project_id
+  name    = var.metric_name
+
+  filter = <<-EOT
+    resource.type="gce_instance"
+    resource.labels.instance_id:*
+    labels."compute.googleapis.com/resource_name"="${var.instance_name}"
+    severity>=ERROR
+  EOT
+
+  metric_descriptor {
+    metric_kind = "DELTA"
+    value_type  = "INT64"
+    unit        = "1"
+    labels {
+      key         = "instance_name"
+      value_type  = "STRING"
+      description = "Runner instance name"
+    }
+  }
+
+  label_extractors = {
+    instance_name = "EXTRACT(labels.\"compute.googleapis.com/resource_name\")"
+  }
+}
+
+resource "google_monitoring_alert_policy" "runner_error_alert" {
+  count   = var.enable_alert_policy ? 1 : 0
+  project = var.project_id
+
+  display_name = "Neural Runner Error Alert"
+  combiner     = "OR"
+  enabled      = true
+
+  conditions {
+    display_name = "Runner emits error logs"
+
+    condition_threshold {
+      # Threshold filters must restrict both metric.type and resource.type; the log
+      # metric above only matches gce_instance entries, so that is the resource type.
+      filter          = "metric.type=\"logging.googleapis.com/user/${google_logging_metric.runner_errors.name}\" AND resource.type=\"gce_instance\""
+      comparison      = "COMPARISON_GT"
+      threshold_value = 0
+      duration        = "60s"
+
+      aggregations {
+        alignment_period   = "60s"
+        per_series_aligner = "ALIGN_DELTA"
+      }
+
+      trigger {
+        count = 1
+      }
+    }
+  }
+
+  notification_channels = var.notification_channels
+}
diff --git a/infra/terraform/modules/observability/outputs.tf b/infra/terraform/modules/observability/outputs.tf
new file mode 100644
index 00000000..55a9928d
--- /dev/null
+++ b/infra/terraform/modules/observability/outputs.tf
@@ -0,0 +1,14 @@
+output "log_metric_name" {
+  description = "Name of the log-based metric"
+  value       = google_logging_metric.runner_errors.name
+}
+
+output "log_metric_type" {
+  description = "Fully qualified metric type"
+  value       = "logging.googleapis.com/user/${google_logging_metric.runner_errors.name}"
+}
+
+output "alert_policy_id" {
+  description = "Alert policy ID when enabled"
+  value       = var.enable_alert_policy ? google_monitoring_alert_policy.runner_error_alert[0].id : null
+}
diff --git a/infra/terraform/modules/observability/variables.tf b/infra/terraform/modules/observability/variables.tf
new file mode 100644
index 00000000..aa6f7d20
--- /dev/null
+++ b/infra/terraform/modules/observability/variables.tf
@@ -0,0 +1,27 @@
+variable "project_id" {
+  description = "GCP project ID"
+  type        = string
+}
+
+variable "metric_name" {
+  description = "Log-based metric name"
+  type        = string
+  default     = "neural_runner_error_count"
+}
+
+variable "instance_name" {
+  description = "Runner VM instance name used in log filters"
+  type        = string
+}
+
+variable "enable_alert_policy" {
+  description = "Whether to create an alert policy for runtime errors"
+  type        = bool
+  default     = false
+}
+
+variable "notification_channels" {
+  description = "Notification channel IDs used by alert policy"
+  type        = list(string)
+  default     = []
+}
diff --git a/infra/terraform/modules/runner_vm/main.tf b/infra/terraform/modules/runner_vm/main.tf
new file mode 100644
index 00000000..22ccc3e9
--- /dev/null
+++ b/infra/terraform/modules/runner_vm/main.tf
@@ -0,0 +1,63 @@
+# Runner VM module: optional dedicated service account plus a Compute Engine instance with Shielded VM options enabled.
+
+locals {
+  resolved_service_account_email = var.create_service_account ? google_service_account.runner[0].email : var.service_account_email
+}
+
+resource "google_service_account" "runner" {
+  count = var.create_service_account ? 1 : 0
+
+  project      = var.project_id
+  account_id   = var.service_account_id
+  display_name = "Neural runner service account"
+}
+
+resource "google_compute_instance" "runner" {
+  project      = var.project_id
+  zone         = var.zone
+  name         = var.instance_name
+  machine_type = var.machine_type
+  tags         = var.tags
+
+  boot_disk {
+    initialize_params {
+      image = var.boot_image
+      size  = var.boot_disk_size_gb
+      type  = var.boot_disk_type
+    }
+  }
+
+  network_interface {
+    network    = var.network_self_link
+    subnetwork = var.subnetwork_self_link
+
+    dynamic "access_config" {
+      for_each = var.assign_public_ip ? [1] : []
+      content {}
+    }
+  }
+
+  metadata = var.metadata
+
+  metadata_startup_script = var.startup_script
+
+  service_account {
+    email  = local.resolved_service_account_email
+    scopes = var.service_account_scopes
+  }
+
+  shielded_instance_config {
+    enable_secure_boot          = true
+    enable_vtpm                 = true
+    enable_integrity_monitoring = true
+  }
+
+  lifecycle {
+    precondition {
+      # && and || do not short-circuit on every Terraform version allowed by versions.tf, so a
+      # bare trimspace(null) could fail even when create_service_account=true; try() absorbs it.
+      condition     = var.create_service_account || try(trimspace(var.service_account_email) != "", false)
+      error_message = "service_account_email must be provided when create_service_account=false."
+    }
+  }
+}
diff --git a/infra/terraform/modules/runner_vm/outputs.tf b/infra/terraform/modules/runner_vm/outputs.tf
new file mode 100644
index 00000000..6f61c6fc
--- /dev/null
+++ b/infra/terraform/modules/runner_vm/outputs.tf
@@ -0,0 +1,19 @@
+output "instance_name" {
+  description = "Runner instance name"
+  value       = google_compute_instance.runner.name
+}
+
+output "instance_self_link" {
+  description = "Runner instance self link"
+  value       = google_compute_instance.runner.self_link
+}
+
+output "instance_external_ip" {
+  description = "External IP address for the runner VM"
+  value       = length(google_compute_instance.runner.network_interface[0].access_config) > 0 ? google_compute_instance.runner.network_interface[0].access_config[0].nat_ip : null
+}
+
+output "service_account_email" {
+  description = "Runner service account email"
+  value       = local.resolved_service_account_email
+}
diff --git a/infra/terraform/modules/runner_vm/variables.tf b/infra/terraform/modules/runner_vm/variables.tf
new file mode 100644
index 00000000..a3d7ddb8
--- /dev/null
+++ b/infra/terraform/modules/runner_vm/variables.tf
@@ -0,0 +1,112 @@
+variable "project_id" {
+  description = "GCP project ID"
+  type        = string
+}
+
+variable "zone" {
+  description = "GCP zone for the runner VM"
+  type        = string
+}
+
+variable "instance_name" {
+  description = "Runner VM instance name"
+  type        = string
+}
+
+variable "machine_type" {
+  description = "Runner VM machine type"
+  type        = string
+  default     = "e2-standard-2"
+}
+
+variable "network_self_link" {
+  description = "Self link of the VPC network"
+  type        = string
+}
+
+variable "subnetwork_self_link" {
+  description = "Self link of the subnet"
+  type        = string
+}
+
+variable "create_service_account" {
+  description = "Whether to create a service account for the runner"
+  type        = bool
+  default     = true
+}
+
+variable "service_account_id" {
+  description = "Service account ID when create_service_account=true"
+  type        = string
+  default     = "neural-runner"
+}
+
+variable "service_account_email" {
+  description = "Existing service account email when create_service_account=false"
+  type        = string
+  default     = null
+}
+
+variable "service_account_scopes" {
+  description = "OAuth scopes granted to the runner VM service account"
+  type        = list(string)
+  default = [
+    "https://www.googleapis.com/auth/logging.write",
+    "https://www.googleapis.com/auth/monitoring.write",
+    "https://www.googleapis.com/auth/devstorage.read_only",
+  ]
+}
+
+variable "assign_public_ip" {
+  description = "Whether to assign an ephemeral public IP to the runner VM"
+  type        = bool
+  default     = true
+}
+
+variable "startup_script" {
+  description = "Startup script for VM bootstrap"
+  type        = string
+  default     = <<-EOT
+    #!/usr/bin/env bash
+    set -euo pipefail
+
+    apt-get update
+    apt-get install -y docker.io
+    systemctl enable docker
+    systemctl start docker
+
+    # Allow common default VM users to run Docker without sudo.
+    id -u ubuntu >/dev/null 2>&1 && usermod -aG docker ubuntu || true
+    id -u debian >/dev/null 2>&1 && usermod -aG docker debian || true
+  EOT
+}
+
+variable "metadata" {
+  description = "Metadata map for instance configuration"
+  type        = map(string)
+  default     = {}
+}
+
+variable "tags" {
+  description = "Network tags assigned to the instance"
+  type        = list(string)
+  default     = ["neural-runner"]
+}
+
+variable "boot_image" {
+  description = "Boot image for the runner VM"
+  type        = string
+  default     = "projects/debian-cloud/global/images/family/debian-12"
+}
+
+variable "boot_disk_size_gb" {
+  description = "Boot disk size in GB"
+  type        = number
+  default     = 50
+}
+
+variable "boot_disk_type" {
+  description = "Boot disk type"
+  type        = string
+  default     = "pd-balanced"
+}
diff --git a/infra/terraform/modules/secrets/main.tf b/infra/terraform/modules/secrets/main.tf
new file mode 100644
index 00000000..8fa01c96
--- /dev/null
+++ b/infra/terraform/modules/secrets/main.tf
@@ -0,0 +1,21 @@
+# Secrets module: Secret Manager secrets (no versions are created here) with accessor grants for the runner.
+
+resource "google_secret_manager_secret" "this" {
+  for_each = var.secret_ids
+
+  project   = var.project_id
+  secret_id = each.value
+
+  replication {
+    auto {}
+  }
+}
+
+resource "google_secret_manager_secret_iam_member" "runner_accessor" {
+  for_each = google_secret_manager_secret.this
+
+  project   = var.project_id
+  secret_id = each.value.secret_id
+  role      = "roles/secretmanager.secretAccessor"
+  member    = "serviceAccount:${var.runner_service_account_email}"
+}
diff --git a/infra/terraform/modules/secrets/outputs.tf b/infra/terraform/modules/secrets/outputs.tf
new file mode 100644
index 00000000..4fdb7d5b
--- /dev/null
+++ b/infra/terraform/modules/secrets/outputs.tf
@@ -0,0 +1,9 @@
+output "secret_ids" {
+  description = "Secret IDs created by this module"
+  value       = [for secret in google_secret_manager_secret.this : secret.secret_id]
+}
+
+output "secret_resource_ids" {
+  description = "Secret resource IDs"
+  value       = [for secret in google_secret_manager_secret.this : secret.id]
+}
diff --git a/infra/terraform/modules/secrets/variables.tf b/infra/terraform/modules/secrets/variables.tf
new file mode 100644
index 00000000..40244a8a
--- /dev/null
+++ b/infra/terraform/modules/secrets/variables.tf
@@ -0,0 +1,14 @@
+variable "project_id" {
+  description = "GCP project ID"
+  type        = string
+}
+
+variable "secret_ids" {
+  description = "Set of secret IDs to provision"
+  type        = set(string)
+}
+
+variable "runner_service_account_email" {
+  description = "Runner service account email granted secret access"
+  type        = string
+}
diff --git a/infra/terraform/versions.tf b/infra/terraform/versions.tf
new file mode 100644
index 00000000..44aab936
--- /dev/null
+++ b/infra/terraform/versions.tf
@@ -0,0 +1,10 @@
+terraform {
+  required_version = ">= 1.5.0, < 2.0.0"
+
+  required_providers {
+    google = {
+      source  = "hashicorp/google"
+      version = "~> 5.0"
+    }
+  }
+}