From 8b12711f3bd4b53d05ed8f7219885f214674f83a Mon Sep 17 00:00:00 2001 From: Gal Amado Date: Wed, 11 Feb 2026 21:48:05 +0200 Subject: [PATCH 1/9] Support Instance type of c5.metal --- deploy/aws-hypervisor/scripts/create.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/aws-hypervisor/scripts/create.sh b/deploy/aws-hypervisor/scripts/create.sh index 24211fc..75e85b7 100755 --- a/deploy/aws-hypervisor/scripts/create.sh +++ b/deploy/aws-hypervisor/scripts/create.sh @@ -79,7 +79,7 @@ else fi ec2Type="VirtualMachine" -if [[ "$EC2_INSTANCE_TYPE" =~ c[0-9]+[gn].metal ]]; then +if [[ "$EC2_INSTANCE_TYPE" =~ c[0-9]+[a-z]*\.metal ]]; then ec2Type="MetalMachine" fi From 7075bb5bf09efb6990582bedd975b96162af80be Mon Sep 17 00:00:00 2001 From: Gal Amado Date: Sun, 15 Feb 2026 18:42:44 +0200 Subject: [PATCH 2/9] Add assisted installer deployment method for spoke TNF clusters Deploy a spoke TNF cluster via ACM/MCE assisted installer on an existing hub cluster. This adds a third deployment path alongside dev-scripts (IPI) and kcli methods.
New Ansible roles: - assisted/acm-install: installs ACM operator, MultiClusterHub, AgentServiceConfig with auto-detected RHCOS ISO, enables TNF support, and configures provisioning for external BMH management - assisted/assisted-spoke: creates spoke libvirt network and VMs, deploys cluster resources (ClusterDeployment, AgentClusterInstall, InfraEnv, BareMetalHost with fencing credentials), monitors installation, and extracts spoke credentials Usage: make deploy fencing-assisted Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 26 +++- deploy/Makefile | 5 + .../openshift-clusters/assisted-install.yml | 108 ++++++++++++++++ .../collections/requirements.yml | 2 + .../assisted/acm-install/defaults/main.yml | 28 +++++ .../tasks/agent-service-config.yml | 118 ++++++++++++++++++ .../assisted/acm-install/tasks/enable-tnf.yml | 45 +++++++ .../tasks/enable-watch-all-namespaces.yml | 46 +++++++ .../acm-install/tasks/install-operator.yml | 115 +++++++++++++++++ .../roles/assisted/acm-install/tasks/main.yml | 23 ++++ .../assisted/acm-install/tasks/storage.yml | 62 +++++++++ .../assisted/acm-install/tasks/validate.yml | 36 ++++++ .../templates/agentserviceconfig.yml.j2 | 22 ++++ .../templates/multiclusterhub.yml.j2 | 7 ++ .../templates/operator-subscription.yml.j2 | 11 ++ .../roles/assisted/acm-install/vars/main.yml | 23 ++++ .../assisted/assisted-spoke/defaults/main.yml | 42 +++++++ .../assisted/assisted-spoke/tasks/cleanup.yml | 48 +++++++ .../assisted-spoke/tasks/create-bmh.yml | 63 ++++++++++ .../tasks/create-cluster-resources.yml | 101 +++++++++++++++ .../tasks/create-spoke-network.yml | 48 +++++++ .../assisted-spoke/tasks/create-spoke-vms.yml | 75 +++++++++++ .../assisted/assisted-spoke/tasks/main.yml | 30 +++++ .../tasks/retrieve-credentials.yml | 100 +++++++++++++++ .../assisted-spoke/tasks/setup-ksushy.yml | 34 +++++ .../assisted-spoke/tasks/wait-for-install.yml | 103 +++++++++++++++ .../templates/agentclusterinstall.yml.j2 | 22 ++++ .../templates/clusterdeployment.yml.j2 
| 20 +++ .../templates/clusterimageset.yml.j2 | 6 + .../assisted-spoke/templates/infraenv.yml.j2 | 15 +++ .../templates/spoke-network.xml.j2 | 29 +++++ .../assisted/assisted-spoke/vars/main.yml | 21 ++++ .../scripts/deploy-fencing-assisted.sh | 53 ++++++++ .../vars/assisted.yml.template | 48 +++++++ 34 files changed, 1532 insertions(+), 3 deletions(-) create mode 100644 deploy/openshift-clusters/assisted-install.yml create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/tasks/main.yml create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2 create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2 create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml create mode 
100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/main.yml create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/agentclusterinstall.yml.j2 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterdeployment.yml.j2 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterimageset.yml.j2 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/infraenv.yml.j2 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/spoke-network.xml.j2 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml create mode 100755 deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh create mode 100644 deploy/openshift-clusters/vars/assisted.yml.template diff --git a/CLAUDE.md b/CLAUDE.md index ea34093..93645e6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -13,8 +13,9 @@ Two-Node Toolbox (TNF) is a comprehensive deployment automation framework for Op # From the deploy/ directory: # Deploy AWS hypervisor and cluster in one command -make deploy arbiter-ipi # Deploy arbiter topology cluster +make deploy arbiter-ipi # Deploy arbiter topology cluster make deploy fencing-ipi # Deploy fencing topology cluster +make deploy fencing-assisted # Deploy hub + spoke TNF via assisted installer # Instance lifecycle management make create # Create new EC2 instance @@ -70,6 +71,15 @@ 
ansible-playbook kcli-install.yml -i inventory.ini -e "test_cluster_name=my-clus ansible-playbook kcli-install.yml -i inventory.ini -e "force_cleanup=true" ``` +#### Assisted Installer Method (Spoke TNF via ACM) +```bash +# Copy and customize the configuration template +cp vars/assisted.yml.template vars/assisted.yml + +# Deploy hub + spoke TNF cluster via assisted installer +make deploy fencing-assisted +``` + ### Linting and Validation ```bash # Shell script linting (from repository root) @@ -88,14 +98,17 @@ make shellcheck - Automatic inventory management for Ansible integration 2. **OpenShift Cluster Deployment** (`deploy/openshift-clusters/`) - - Two deployment methods: dev-scripts (traditional) and kcli (modern) + - Three deployment methods: dev-scripts (traditional), kcli (modern), and assisted installer (spoke via ACM) - Ansible roles for complete cluster automation - Support for both arbiter and fencing topologies + - Assisted installer deploys spoke TNF clusters on an existing hub via ACM/MCE - Proxy configuration for external cluster access 3. 
**Ansible Roles Architecture**: - `dev-scripts/install-dev`: Traditional deployment using openshift-metal3/dev-scripts - `kcli/kcli-install`: Modern deployment using kcli virtualization management + - `assisted/acm-install`: Install ACM/MCE + assisted service + enable TNF on hub + - `assisted/assisted-spoke`: Deploy spoke TNF cluster via assisted installer + BMH - `proxy-setup`: Squid proxy for cluster external access - `redfish`: Automated stonith configuration for fencing topology - `config`: SSH key and git configuration @@ -119,8 +132,15 @@ make shellcheck - `roles/kcli/kcli-install/files/pull-secret.json`: OpenShift pull secret - SSH key automatically read from `~/.ssh/id_ed25519.pub` on ansible controller +#### Assisted Installer Method +- `vars/assisted.yml`: Variable override file (copy from `vars/assisted.yml.template`) +- Hub cluster must be deployed first via dev-scripts (`make deploy fencing-ipi`) +- Spoke credentials output to `~/{{ spoke_cluster_name }}/auth/` on hypervisor +- Hub proxy preserved as `hub-proxy.env` + #### Generated Files - `proxy.env`: Generated proxy configuration (source this to access cluster) +- `hub-proxy.env`: Hub proxy config (preserved when spoke proxy is configured) - `kubeconfig`: OpenShift cluster kubeconfig - `kubeadmin-password`: Default admin password @@ -128,7 +148,7 @@ make shellcheck 1. **Environment Setup**: Use `deploy/aws-hypervisor/` tools or bring your own RHEL 9 host 2. **Configuration**: Edit inventory and config files based on chosen deployment method -3. **Deployment**: Run appropriate Ansible playbook (setup.yml or kcli-install.yml) +3. **Deployment**: Run appropriate Ansible playbook (setup.yml, kcli-install.yml, or assisted-install.yml) 4. **Access**: Source `proxy.env` and use `oc` commands or WebUI through proxy 5.
**Cleanup**: Use cleanup make targets or Ansible playbooks diff --git a/deploy/Makefile b/deploy/Makefile index 1e8e8ec..3e42e61 100644 --- a/deploy/Makefile +++ b/deploy/Makefile @@ -76,6 +76,10 @@ arbiter-kcli: fencing-kcli: @./openshift-clusters/scripts/deploy-cluster.sh --topology fencing --method kcli +fencing-assisted: + @$(MAKE) fencing-ipi + @./openshift-clusters/scripts/deploy-fencing-assisted.sh + patch-nodes: @./openshift-clusters/scripts/patch-nodes.sh get-tnf-logs: @@ -107,6 +111,7 @@ help: @echo " arbiter-agent - Deploy arbiter Agent cluster (non-interactive)" @echo " arbiter-kcli - Deploy arbiter cluster using kcli (non-interactive)" @echo " fencing-kcli - Deploy fencing cluster using kcli (non-interactive)" + @echo " fencing-assisted - Deploy hub + spoke TNF cluster via assisted installer" @echo "" @echo "OpenShift Cluster Management:" @echo " redeploy-cluster - Redeploy OpenShift cluster using dev-scripts make redeploy" diff --git a/deploy/openshift-clusters/assisted-install.yml b/deploy/openshift-clusters/assisted-install.yml new file mode 100644 index 0000000..85e54dc --- /dev/null +++ b/deploy/openshift-clusters/assisted-install.yml @@ -0,0 +1,108 @@ +--- +# Deploy a spoke TNF cluster via ACM/assisted installer on an existing hub cluster. +# +# Prerequisites: +# - vars/assisted.yml exists (copy from vars/assisted.yml.template) +# +# Usage: +# make deploy fencing-assisted + +- hosts: metal_machine + gather_facts: yes + + vars: + topology: fencing + interactive_mode: false + + vars_files: + - vars/assisted.yml + + pre_tasks: + - name: Check that proxy.env exists (hub must be deployed first) + stat: + path: "{{ playbook_dir }}/proxy.env" + delegate_to: localhost + register: proxy_env_check + + - name: Fail if proxy.env is missing + fail: + msg: >- + proxy.env not found. The hub cluster must be deployed first + using 'make deploy fencing-ipi'. proxy.env is required for + cluster access. 
+ when: not proxy_env_check.stat.exists + + - name: Check that hub kubeconfig exists + stat: + path: "{{ ansible_user_dir }}/auth/kubeconfig" + register: hub_kubeconfig_check + + - name: Fail if hub kubeconfig is missing + fail: + msg: >- + Hub kubeconfig not found at ~/auth/kubeconfig. + The hub cluster must be deployed first. + when: not hub_kubeconfig_check.stat.exists + + - name: Set hub KUBECONFIG path + set_fact: + hub_kubeconfig: "{{ ansible_user_dir }}/auth/kubeconfig" + + - name: Preserve hub proxy.env as hub-proxy.env + copy: + src: "{{ playbook_dir }}/proxy.env" + dest: "{{ playbook_dir }}/hub-proxy.env" + remote_src: no + backup: no + delegate_to: localhost + + - name: Display assisted installer configuration + debug: + msg: | + Assisted Installer Configuration: + Hub operator: {{ hub_operator }} + ACM/MCE channel: {{ acm_channel if hub_operator == 'acm' else mce_channel }} + Spoke cluster: {{ spoke_cluster_name }}.{{ spoke_base_domain }} + Spoke release image: {{ spoke_release_image }} + Spoke VMs: {{ spoke_ctlplanes }}x ({{ spoke_vm_vcpus }} vCPUs, {{ spoke_vm_memory }}MB RAM, {{ spoke_vm_disk_size }}GB disk) + Spoke network: {{ spoke_network_cidr }} + API VIP: {{ spoke_api_vip }} + Ingress VIP: {{ spoke_ingress_vip }} + Storage method: {{ assisted_storage_method }} + Force cleanup: {{ force_cleanup }} + + roles: + - role: assisted/acm-install + - role: assisted/assisted-spoke + + post_tasks: + - name: Setup proxy access for spoke cluster + include_role: + name: proxy-setup + vars: + kubeconfig_path: "{{ spoke_kubeconfig_path }}" + kubeadmin_password_path: "{{ spoke_kubeadmin_password_path }}" + + - name: Update cluster inventory with spoke VMs + include_role: + name: common + tasks_from: update-cluster-inventory + vars: + test_cluster_name: "{{ spoke_cluster_name }}" + + - name: Display deployment summary + debug: + msg: | + Spoke TNF cluster deployed successfully! 
+ + Spoke credentials: + Kubeconfig: {{ spoke_kubeconfig_path }} + Admin password: {{ spoke_kubeadmin_password_path }} + + Access spoke cluster: + source proxy.env + KUBECONFIG={{ spoke_kubeconfig_path }} oc get nodes + + Access hub cluster: + source hub-proxy.env + KUBECONFIG=~/auth/kubeconfig oc get nodes diff --git a/deploy/openshift-clusters/collections/requirements.yml b/deploy/openshift-clusters/collections/requirements.yml index 291137f..4f4bdfd 100644 --- a/deploy/openshift-clusters/collections/requirements.yml +++ b/deploy/openshift-clusters/collections/requirements.yml @@ -13,3 +13,5 @@ collections: version: ">=2.0" - name: community.general version: ">=5.0.0" + - name: ansible.utils + version: ">=2.0.0" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml b/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml new file mode 100644 index 0000000..2e078f4 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml @@ -0,0 +1,28 @@ +--- +# Default variables for acm-install role + +# Hub kubeconfig path (set by playbook pre_tasks, fallback to ansible_user_dir) +hub_kubeconfig: "{{ ansible_user_dir }}/auth/kubeconfig" + +# Hub operator to install: "acm" or "mce" +hub_operator: acm + +# ACM/MCE channel: "auto" detects from packagemanifest +acm_channel: "auto" +mce_channel: "auto" + +# Storage method for assisted service: "hostpath" +assisted_storage_method: "hostpath" + +# hostPath directories on hub nodes +assisted_images_path: /var/lib/assisted-images +assisted_db_path: /var/lib/assisted-db +assisted_images_size: 50Gi +assisted_db_size: 10Gi +assisted_storage_class: assisted-service + +# Timeouts (seconds) +acm_csv_timeout: 900 +multiclusterhub_timeout: 1800 +assisted_service_timeout: 600 +metal3_stabilize_timeout: 300 \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml 
b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml new file mode 100644 index 0000000..2f127f8 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml @@ -0,0 +1,118 @@ +--- +# Create AgentServiceConfig with RHCOS ISO auto-extracted from release image + +- name: Get hub release image + shell: | + oc get clusterversion version -o jsonpath='{.status.desired.image}' + register: hub_release_image + changed_when: false + +- name: Get hub OCP version + shell: | + oc get clusterversion version -o jsonpath='{.status.desired.version}' \ + | cut -d. -f1-2 + register: hub_ocp_version + changed_when: false + +- name: Determine spoke release image + set_fact: + effective_release_image: >- + {{ hub_release_image.stdout if spoke_release_image == 'auto' + else spoke_release_image }} + +- name: Extract RHCOS ISO URL from release image + shell: | + # Get the machine-os-images reference from the release image + RHCOS_REF=$(oc adm release info "{{ effective_release_image }}" \ + --registry-config="{{ pull_secret_path }}" \ + --image-for=machine-os-images 2>/dev/null) + if [ -z "$RHCOS_REF" ]; then + echo "FAILED: Could not extract machine-os-images from release image" + exit 1 + fi + # Extract the RHCOS ISO URL from the image labels/annotations (prints NEEDS_FALLBACK when labels are absent; a non-zero exit fails the task) + oc image info "$RHCOS_REF" --registry-config="{{ pull_secret_path }}" \ + -o json 2>/dev/null \ + | python3 -c " + import json, sys + data = json.load(sys.stdin) + labels = data.get('config', {}).get('config', {}).get('Labels', {}) + stream = labels.get('coreos.stream', '') + version = labels.get('version', '') + if stream and version: + url = f'https://rhcos.mirror.openshift.com/art/storage/prod/streams/{stream}/builds/{version}/x86_64/rhcos-{version}-live-iso.x86_64.iso' + print(url) + else: + print('NEEDS_FALLBACK') + " + register: rhcos_iso_extraction + changed_when: false + failed_when: rhcos_iso_extraction.rc != 0 or 'FAILED' in rhcos_iso_extraction.stdout + +- name: Try fallback
RHCOS ISO extraction via coreos print-stream-json + shell: | + rm -rf /tmp/oc-extract && mkdir -p /tmp/oc-extract + RHCOS_URL=$(oc adm release extract "{{ effective_release_image }}" \ + --registry-config="{{ pull_secret_path }}" \ + --command=openshift-install --to=/tmp/oc-extract 2>/dev/null && \ + /tmp/oc-extract/openshift-install coreos print-stream-json 2>/dev/null \ + | python3 -c " + import json, sys + data = json.load(sys.stdin) + iso = data['architectures']['x86_64']['artifacts']['metal']['formats']['iso']['disk'] + print(iso['location']) + " 2>/dev/null) || true + rm -rf /tmp/oc-extract + if [ -n "$RHCOS_URL" ]; then + echo "$RHCOS_URL" + else + echo "FAILED" + fi + register: rhcos_iso_fallback + changed_when: false + when: "'NEEDS_FALLBACK' in rhcos_iso_extraction.stdout" + +- name: Set RHCOS ISO URL fact + set_fact: + rhcos_iso_url: >- + {{ rhcos_iso_fallback.stdout | default(rhcos_iso_extraction.stdout) | trim }} + failed_when: rhcos_iso_url == 'FAILED' or rhcos_iso_url == 'NEEDS_FALLBACK' + +- name: Display RHCOS ISO URL + debug: + msg: "RHCOS ISO URL: {{ rhcos_iso_url }}" + +- name: Get RHCOS version from ISO URL + set_fact: + rhcos_version: "{{ rhcos_iso_url | regex_search('rhcos-([\\d.]+-\\d+)-live', '\\1') | first }}" + +- name: Create AgentServiceConfig + template: + src: agentserviceconfig.yml.j2 + dest: /tmp/agentserviceconfig.yml + mode: '0644' + +- name: Apply AgentServiceConfig + shell: | + oc apply -f /tmp/agentserviceconfig.yml + register: asc_result + changed_when: "'created' in asc_result.stdout" + +- name: Wait for assisted-service pod to be Running (2/2) + shell: | + oc get pods -n {{ assisted_service_namespace }} -l app=assisted-service \ + --no-headers 2>/dev/null | grep -q '2/2.*Running' + register: assisted_pod + until: assisted_pod.rc == 0 + retries: "{{ (assisted_service_timeout / 15) | int }}" + delay: 15 + +- name: Display assisted-service pod status + shell: | + oc get pods -n {{ assisted_service_namespace }} -l 
app=assisted-service + register: pod_status + changed_when: false + +- name: Show assisted-service pod + debug: + msg: "{{ pod_status.stdout }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml new file mode 100644 index 0000000..00dbd95 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml @@ -0,0 +1,45 @@ +--- +# Enable TNF cluster support in assisted service +# Requires both a ConfigMap AND an annotation on AgentServiceConfig + +- name: Create assisted-unsupported-config ConfigMap + shell: | + oc apply -f - <<'EOF' + apiVersion: v1 + kind: ConfigMap + metadata: + name: assisted-unsupported-config + namespace: {{ assisted_service_namespace }} + data: + TNF_CLUSTERS_SUPPORT: "true" + EOF + register: cm_result + changed_when: "'created' in cm_result.stdout" + +- name: Annotate AgentServiceConfig to mount unsupported config + shell: | + oc annotate agentserviceconfig agent \ + unsupported.agent-install.openshift.io/assisted-service-configmap=assisted-unsupported-config \ + --overwrite + register: annotate_result + changed_when: "'annotated' in annotate_result.stdout" + +- name: Wait for assisted-service rollout after annotation + shell: | + oc rollout status deployment/assisted-service \ + -n {{ assisted_service_namespace }} --timeout=120s + register: rollout_result + changed_when: false + +- name: Verify TNF support is enabled + shell: | + oc exec -n {{ assisted_service_namespace }} \ + $(oc get pod -n {{ assisted_service_namespace }} -l app=assisted-service -o name | head -1) \ + -c assisted-service -- env | grep -i TNF_CLUSTERS_SUPPORT + register: tnf_verify + changed_when: false + failed_when: "'TNF_CLUSTERS_SUPPORT=true' not in tnf_verify.stdout" + +- name: Display TNF support status + debug: + msg: "{{ tnf_verify.stdout | trim }}" \ No newline at end of file diff --git 
a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml new file mode 100644 index 0000000..ba932f3 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml @@ -0,0 +1,46 @@ +--- +# Patch Provisioning CR to: +# 1. Enable watchAllNamespaces (BMO processes BMHs in all namespaces) +# 2. Disable provisioning network (spoke VMs not on provisioning network) +# 3. Remove leftover provisioning fields that cause ironic to hang (merge patch with nulls, so absent fields are a no-op) + +- name: Patch Provisioning CR - enable watchAllNamespaces and disable provisioning network + shell: | + oc patch provisioning provisioning-configuration --type=merge \ + -p '{"spec":{"provisioningNetwork":"Disabled","watchAllNamespaces":true}}' + register: patch_result + changed_when: "'patched' in patch_result.stdout" + +- name: Remove leftover provisioning fields from Provisioning CR + shell: | + oc patch provisioning provisioning-configuration --type=merge \ + -p '{"spec":{ + "provisioningIP":null, + "provisioningDHCPRange":null, + "provisioningNetworkCIDR":null, + "provisioningInterface":null + }}' 2>&1 || echo "Could not clear leftover provisioning fields, continuing" + register: remove_result + changed_when: "'patched' in remove_result.stdout and 'no change' not in remove_result.stdout" + failed_when: false + +- name: Wait for metal3 pod to stabilize after provisioning change + shell: | + oc get pods -n openshift-machine-api \ + -l baremetal.openshift.io/cluster-baremetal-operator=metal3-state \ + --no-headers 2>/dev/null | grep -v Terminating | grep -q Running + register: metal3_pod + until: metal3_pod.rc == 0 + retries: "{{ (metal3_stabilize_timeout / 15) | int }}" + delay: 15 + +- name: Display metal3 pod status + shell: | + oc get pods -n openshift-machine-api \ + -l
baremetal.openshift.io/cluster-baremetal-operator=metal3-state + register: metal3_status + changed_when: false + +- name: Show metal3 pod + debug: + msg: "{{ metal3_status.stdout }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml new file mode 100644 index 0000000..18362ea --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml @@ -0,0 +1,115 @@ +--- +# Install ACM or MCE operator with auto-detected channel + +- name: Set operator configuration + set_fact: + op_config: "{{ operator_config[hub_operator] }}" + +- name: Create operator namespace + shell: | + oc create namespace {{ op_config.namespace }} 2>/dev/null || echo "Namespace already exists" + register: ns_result + changed_when: "'created' in ns_result.stdout" + +- name: Create OperatorGroup + shell: | + oc apply -f - <<'EOF' + apiVersion: operators.coreos.com/v1 + kind: OperatorGroup + metadata: + name: {{ op_config.subscription_name }} + namespace: {{ op_config.namespace }} + spec: + targetNamespaces: + - {{ op_config.namespace }} + EOF + register: og_result + changed_when: "'created' in og_result.stdout" + +- name: Determine operator channel + block: + - name: Auto-detect channel from packagemanifest + shell: | + oc get packagemanifest {{ op_config.package_name }} \ + -o jsonpath='{.status.defaultChannel}' + register: detected_channel + changed_when: false + + - name: Set operator channel fact + set_fact: + operator_channel: "{{ detected_channel.stdout }}" + when: (hub_operator == 'acm' and acm_channel == 'auto') or + (hub_operator == 'mce' and mce_channel == 'auto') + +- name: Use user-specified channel + set_fact: + operator_channel: "{{ acm_channel if hub_operator == 'acm' else mce_channel }}" + when: (hub_operator == 'acm' and acm_channel != 'auto') or + (hub_operator == 'mce' and mce_channel != 'auto') + 
+- name: Display operator channel + debug: + msg: "Installing {{ hub_operator | upper }} with channel: {{ operator_channel }}" + +- name: Create operator Subscription + template: + src: operator-subscription.yml.j2 + dest: /tmp/operator-subscription.yml + mode: '0644' + +- name: Apply operator Subscription + shell: | + oc apply -f /tmp/operator-subscription.yml + register: sub_result + changed_when: "'created' in sub_result.stdout" + +- name: Wait for operator CSV to succeed + shell: | + oc get csv -n {{ op_config.namespace }} --no-headers 2>/dev/null \ + | grep {{ op_config.package_name }} \ + | grep -q Succeeded + register: csv_result + until: csv_result.rc == 0 + retries: "{{ (acm_csv_timeout / 15) | int }}" + delay: 15 + +- name: Display operator install result + shell: | + oc get csv -n {{ op_config.namespace }} --no-headers \ + | grep {{ op_config.package_name }} + register: csv_info + changed_when: false + +- name: Show installed operator + debug: + msg: "{{ csv_info.stdout }}" + +# Create MultiClusterHub (for ACM) or MultiClusterEngine (for MCE) +- name: Create MultiClusterHub CR + template: + src: multiclusterhub.yml.j2 + dest: /tmp/multiclusterhub.yml + mode: '0644' + when: hub_operator == 'acm' + +- name: Apply MultiClusterHub CR + shell: | + oc apply -f /tmp/multiclusterhub.yml + register: mch_result + changed_when: "'created' in mch_result.stdout" + when: hub_operator == 'acm' + +- name: Wait for MultiClusterHub to reach Running phase + shell: | + oc get multiclusterhub multiclusterhub -n {{ op_config.namespace }} \ + -o jsonpath='{.status.phase}' + register: mch_phase + until: mch_phase.stdout == 'Running' + retries: "{{ (multiclusterhub_timeout / 30) | int }}" + delay: 30 + when: hub_operator == 'acm' + +- name: Display MultiClusterHub status + debug: + msg: "MultiClusterHub phase: {{ mch_phase.stdout }}" + when: hub_operator == 'acm' \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/main.yml 
b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/main.yml new file mode 100644 index 0000000..9299e93 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/main.yml @@ -0,0 +1,23 @@ +--- +# Install ACM/MCE + assisted service + enable TNF support on hub cluster + +- block: + - name: Validate hub cluster health + include_tasks: validate.yml + + - name: Provision storage for assisted service + include_tasks: storage.yml + + - name: Install {{ hub_operator | upper }} operator + include_tasks: install-operator.yml + + - name: Create AgentServiceConfig + include_tasks: agent-service-config.yml + + - name: Enable TNF cluster support in assisted service + include_tasks: enable-tnf.yml + + - name: Enable BMO watch all namespaces and disable provisioning network + include_tasks: enable-watch-all-namespaces.yml + environment: + KUBECONFIG: "{{ hub_kubeconfig }}" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml new file mode 100644 index 0000000..4506150 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml @@ -0,0 +1,62 @@ +--- +# Provision hostPath storage for assisted service +# Requires chmod 777 (non-root containers) + SELinux context fix on both nodes + +- name: Create StorageClass and PersistentVolumes for assisted service + shell: | + oc apply -f - <<'EOF' + apiVersion: v1 + kind: PersistentVolume + metadata: + name: assisted-pv-images + spec: + capacity: + storage: {{ assisted_images_size }} + accessModes: [ReadWriteOnce] + hostPath: + path: {{ assisted_images_path }} + storageClassName: {{ assisted_storage_class }} + --- + apiVersion: v1 + kind: PersistentVolume + metadata: + name: assisted-pv-db + spec: + capacity: + storage: {{ assisted_db_size }} + accessModes: [ReadWriteOnce] + hostPath: + path: {{ assisted_db_path }} + storageClassName: {{ assisted_storage_class }} + --- + 
apiVersion: storage.k8s.io/v1 + kind: StorageClass + metadata: + name: {{ assisted_storage_class }} + provisioner: kubernetes.io/no-provisioner + volumeBindingMode: WaitForFirstConsumer + EOF + register: storage_result + changed_when: "'created' in storage_result.stdout" + +- name: Get hub cluster node names + shell: | + oc get nodes --no-headers -o custom-columns=NAME:.metadata.name + register: hub_nodes + changed_when: false + +- name: Fix hostPath permissions and SELinux context on each hub node + shell: | + oc debug node/{{ item }} -- chroot /host bash -c " + mkdir -p {{ assisted_images_path }} {{ assisted_db_path }} + rm -rf {{ assisted_images_path }}/* {{ assisted_db_path }}/* + chmod 777 {{ assisted_images_path }} {{ assisted_db_path }} + chcon -Rt container_file_t {{ assisted_images_path }} {{ assisted_db_path }} + " + loop: "{{ hub_nodes.stdout_lines }}" + register: selinux_fix + changed_when: true + +- name: Display storage setup result + debug: + msg: "Storage provisioned: hostPath PVs with permissions and SELinux fix on {{ hub_nodes.stdout_lines | length }} nodes" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml new file mode 100644 index 0000000..57be824 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml @@ -0,0 +1,36 @@ +--- +# Validate hub cluster is healthy before proceeding + +- name: Check hub cluster nodes are Ready + shell: | + oc get nodes --no-headers | awk '{print $2}' | sort -u + register: node_statuses + changed_when: false + failed_when: "'NotReady' in node_statuses.stdout" + +- name: Check hub cluster node count + shell: | + oc get nodes --no-headers | wc -l + register: node_count + changed_when: false + failed_when: node_count.stdout | int < 2 + +- name: Check for degraded cluster operators + shell: | + oc get co -o json | python3 -c " + import json, sys + cos = 
json.load(sys.stdin)['items'] + degraded = [c['metadata']['name'] for c in cos + if any(cond['type'] == 'Degraded' and cond['status'] == 'True' + for cond in c['status']['conditions'])] + if degraded: + print('Degraded operators: ' + ', '.join(degraded)) + sys.exit(1) + print('All cluster operators healthy') + " + register: co_check + changed_when: false + +- name: Display hub cluster status + debug: + msg: "Hub cluster healthy: {{ node_count.stdout | trim }} nodes Ready, {{ co_check.stdout | trim }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 b/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 new file mode 100644 index 0000000..8b71e4b --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 @@ -0,0 +1,22 @@ +apiVersion: agent-install.openshift.io/v1beta1 +kind: AgentServiceConfig +metadata: + name: agent +spec: + databaseStorage: + storageClassName: {{ assisted_storage_class }} + accessModes: [ReadWriteOnce] + resources: + requests: + storage: {{ assisted_db_size }} + filesystemStorage: + storageClassName: {{ assisted_storage_class }} + accessModes: [ReadWriteOnce] + resources: + requests: + storage: {{ assisted_images_size }} + osImages: + - cpuArchitecture: x86_64 + openshiftVersion: "{{ hub_ocp_version.stdout }}" + url: "{{ rhcos_iso_url }}" + version: "{{ rhcos_version }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2 b/deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2 new file mode 100644 index 0000000..2b68364 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2 @@ -0,0 +1,7 @@ +apiVersion: operator.open-cluster-management.io/v1 +kind: MultiClusterHub +metadata: + name: multiclusterhub + namespace: {{ 
op_config.namespace }} +spec: + availabilityConfig: Basic \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2 b/deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2 new file mode 100644 index 0000000..f6c3109 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2 @@ -0,0 +1,11 @@ +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + name: {{ op_config.subscription_name }} + namespace: {{ op_config.namespace }} +spec: + channel: {{ operator_channel }} + installPlanApproval: Automatic + name: {{ op_config.subscription_name }} + source: {{ op_config.source }} + sourceNamespace: openshift-marketplace \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml b/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml new file mode 100644 index 0000000..a32a832 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml @@ -0,0 +1,23 @@ +--- +# Role-internal variables (not user-overridable) + +acm_namespace: open-cluster-management +mce_namespace: multicluster-engine + +operator_config: + acm: + namespace: "{{ acm_namespace }}" + package_name: advanced-cluster-management + subscription_name: advanced-cluster-management + source: redhat-operators + mce: + namespace: "{{ mce_namespace }}" + package_name: multicluster-engine + subscription_name: multicluster-engine + source: redhat-operators + +# The MCE namespace is always multicluster-engine regardless of hub_operator +assisted_service_namespace: multicluster-engine + +# Pull secret location (dev-scripts standard path) +pull_secret_path: /opt/dev-scripts/pull_secret.json \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml 
b/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml new file mode 100644 index 0000000..668375f --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml @@ -0,0 +1,42 @@ +--- +# Default variables for assisted-spoke role + +# Hub kubeconfig path (set by playbook pre_tasks, fallback to ansible_user_dir) +hub_kubeconfig: "{{ ansible_user_dir }}/auth/kubeconfig" + +# Spoke cluster identity +spoke_cluster_name: spoke-tnf +spoke_base_domain: example.com + +# Spoke OCP version - "auto" uses hub release image +spoke_release_image: "auto" + +# Spoke VM specifications +spoke_vm_memory: 32768 +spoke_vm_vcpus: 4 +spoke_vm_disk_size: 120 +spoke_ctlplanes: 2 + +# Spoke network +spoke_network_cidr: "192.168.125.0/24" +spoke_api_vip: "192.168.125.5" +spoke_ingress_vip: "192.168.125.10" +spoke_cluster_network_cidr: "10.132.0.0/14" +spoke_service_network_cidr: "172.31.0.0/16" + +# BMC / sushy-tools +spoke_bmc_user: admin +spoke_bmc_password: password +spoke_ksushy_ip: "192.168.111.1" +spoke_ksushy_port: 8000 + +# Deployment options +force_cleanup: false + +# Timeouts (seconds) +spoke_install_timeout: 3600 +spoke_agent_register_timeout: 900 +spoke_credentials_timeout: 1800 + +# Hub network CIDR (for cross-bridge nftables rules) +hub_network_cidr: "192.168.111.0/24" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml new file mode 100644 index 0000000..55cc58a --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml @@ -0,0 +1,48 @@ +--- +# Remove existing spoke resources for re-deployment + +- name: Delete spoke namespace (removes all cluster resources) + shell: | + oc delete namespace {{ spoke_cluster_name }} --ignore-not-found --timeout=120s + register: ns_delete + changed_when: "'deleted' in ns_delete.stdout" + failed_when: false + +- name: Delete 
ClusterImageSet + shell: | + OCP_VER=$(oc get clusterversion version -o jsonpath='{.status.desired.version}' | cut -d. -f1-2) + oc delete clusterimageset "${OCP_VER}.0" --ignore-not-found + changed_when: false + failed_when: false + +- name: Destroy spoke VMs + shell: | + for i in $(seq 0 {{ spoke_ctlplanes - 1 }}); do + sudo virsh destroy {{ spoke_cluster_name }}-master-${i} 2>/dev/null || true + sudo virsh undefine {{ spoke_cluster_name }}-master-${i} --remove-all-storage 2>/dev/null || true + done + changed_when: true + failed_when: false + +- name: Remove spoke libvirt network + shell: | + sudo virsh net-destroy {{ spoke_network_name }} 2>/dev/null || true + sudo virsh net-undefine {{ spoke_network_name }} 2>/dev/null || true + changed_when: true + failed_when: false + +- name: Remove spoke credential directory + file: + path: "{{ spoke_auth_dir }}" + state: absent + +- name: Remove spoke /etc/hosts entry + lineinfile: + path: /etc/hosts + regexp: "api.{{ spoke_cluster_name }}.{{ spoke_base_domain }}" + state: absent + become: true + +- name: Display cleanup result + debug: + msg: "Spoke cluster '{{ spoke_cluster_name }}' resources cleaned up" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml new file mode 100644 index 0000000..fb3ceb1 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml @@ -0,0 +1,63 @@ +--- +# Create BareMetalHost resources with BMC secrets and fencing credentials for each spoke node + +- name: Create BMH resources for each spoke node + shell: | + oc apply -f - <<'EOF' + --- + apiVersion: v1 + data: + password: {{ spoke_bmc_password | b64encode }} + username: {{ spoke_bmc_user | b64encode }} + kind: Secret + metadata: + name: {{ item.name }}-bmc-secret + namespace: {{ spoke_cluster_name }} + type: Opaque + --- + apiVersion: metal3.io/v1alpha1 + kind: 
BareMetalHost + metadata: + name: {{ item.name }}-bmh + namespace: {{ spoke_cluster_name }} + annotations: + bmac.agent-install.openshift.io/hostname: "{{ item.name }}" + bmac.agent-install.openshift.io/role: "master" + bmac.agent-install.openshift.io/fencing-credentials-secret-name: "{{ item.name }}-fencing-credentials" + labels: + infraenvs.agent-install.openshift.io: "{{ spoke_cluster_name }}" + spec: + architecture: x86_64 + bmc: + address: redfish-virtualmedia+https://{{ spoke_ksushy_ip }}:{{ spoke_ksushy_port }}/redfish/v1/Systems/{{ item.uuid }} + credentialsName: {{ item.name }}-bmc-secret + disableCertificateVerification: true + bootMACAddress: {{ item.mac }} + automatedCleaningMode: disabled + online: true + --- + apiVersion: v1 + stringData: + address: https://{{ spoke_ksushy_ip }}:{{ spoke_ksushy_port }}/redfish/v1/Systems/{{ item.uuid }} + certificateVerification: Disabled + username: {{ spoke_bmc_user }} + password: {{ spoke_bmc_password }} + kind: Secret + metadata: + name: {{ item.name }}-fencing-credentials + namespace: {{ spoke_cluster_name }} + type: Opaque + EOF + loop: "{{ spoke_vms }}" + register: bmh_result + changed_when: "'created' in bmh_result.stdout" + +- name: Display BMH status + shell: | + oc get bmh -n {{ spoke_cluster_name }} + register: bmh_status + changed_when: false + +- name: Show BMH resources + debug: + msg: "{{ bmh_status.stdout }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml new file mode 100644 index 0000000..f6276a4 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml @@ -0,0 +1,101 @@ +--- +# Create spoke cluster resources on hub: namespace, secrets, ClusterDeployment, +# AgentClusterInstall, InfraEnv, ClusterImageSet + +- name: Get hub release image for spoke + shell: | + oc get 
clusterversion version -o jsonpath='{.status.desired.image}' + register: hub_release_image + changed_when: false + +- name: Get hub OCP version + shell: | + oc get clusterversion version -o jsonpath='{.status.desired.version}' | cut -d. -f1-2 + register: hub_ocp_version + changed_when: false + +- name: Set effective spoke release image + set_fact: + effective_release_image: >- + {{ hub_release_image.stdout if spoke_release_image == 'auto' + else spoke_release_image }} + effective_ocp_version: "{{ hub_ocp_version.stdout }}" + +- name: Get SSH public key + shell: | + cat ~/.ssh/id_rsa.pub + register: ssh_pub_key + changed_when: false + +- name: Create spoke namespace + shell: | + oc create namespace {{ spoke_cluster_name }} 2>/dev/null || echo "Namespace already exists" + register: ns_result + changed_when: "'created' in ns_result.stdout" + +- name: Create spoke pull secret + shell: | + oc get secret {{ spoke_cluster_name }}-pull-secret -n {{ spoke_cluster_name }} 2>/dev/null \ + && echo "Already exists" \ + || oc create secret generic {{ spoke_cluster_name }}-pull-secret \ + -n {{ spoke_cluster_name }} \ + --from-file=.dockerconfigjson={{ pull_secret_path }} \ + --type=kubernetes.io/dockerconfigjson + register: ps_result + changed_when: "'created' in ps_result.stdout" + +- name: Create ClusterImageSet + template: + src: clusterimageset.yml.j2 + dest: /tmp/clusterimageset.yml + mode: '0644' + +- name: Apply ClusterImageSet + shell: | + oc apply -f /tmp/clusterimageset.yml + register: cis_result + changed_when: "'created' in cis_result.stdout" + +- name: Create ClusterDeployment + template: + src: clusterdeployment.yml.j2 + dest: /tmp/clusterdeployment.yml + mode: '0644' + +- name: Apply ClusterDeployment + shell: | + oc apply -f /tmp/clusterdeployment.yml + register: cd_result + changed_when: "'created' in cd_result.stdout" + +- name: Create AgentClusterInstall + template: + src: agentclusterinstall.yml.j2 + dest: /tmp/agentclusterinstall.yml + mode: '0644' + +- 
name: Apply AgentClusterInstall + shell: | + oc apply -f /tmp/agentclusterinstall.yml + register: aci_result + changed_when: "'created' in aci_result.stdout" + +- name: Create InfraEnv + template: + src: infraenv.yml.j2 + dest: /tmp/infraenv.yml + mode: '0644' + +- name: Apply InfraEnv + shell: | + oc apply -f /tmp/infraenv.yml + register: ie_result + changed_when: "'created' in ie_result.stdout" + +- name: Display cluster resources status + debug: + msg: >- + Spoke cluster resources created: ClusterImageSet={{ effective_ocp_version }}.0, + ClusterDeployment={{ spoke_cluster_name }}, + AgentClusterInstall={{ spoke_cluster_name }}, + InfraEnv={{ spoke_cluster_name }} \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml new file mode 100644 index 0000000..4384bdf --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml @@ -0,0 +1,48 @@ +--- +# Create dedicated libvirt network for spoke cluster with DNS for api/apps VIPs +# Then add cross-bridge nftables FORWARD rules for spoke<->hub connectivity + +- name: Check if spoke network already exists + shell: | + sudo virsh net-info {{ spoke_network_name }} 2>/dev/null && echo "EXISTS" || echo "NOT_FOUND" + register: net_check + changed_when: false + +- name: Create spoke libvirt network definition + template: + src: spoke-network.xml.j2 + dest: /tmp/spoke-network.xml + mode: '0644' + when: "'NOT_FOUND' in net_check.stdout" + +- name: Define spoke libvirt network + shell: | + sudo virsh net-define /tmp/spoke-network.xml + when: "'NOT_FOUND' in net_check.stdout" + +- name: Start spoke libvirt network + shell: | + sudo virsh net-start {{ spoke_network_name }} 2>/dev/null || true + changed_when: true + +- name: Set spoke libvirt network to autostart + shell: | + sudo virsh net-autostart {{ spoke_network_name }} + 
changed_when: true + +- name: Add cross-bridge nftables FORWARD rules for spoke<->hub connectivity + shell: | + # Check if rules already exist + if sudo nft list chain ip filter FORWARD 2>/dev/null | grep -q "{{ spoke_network_cidr }}.*{{ hub_network_cidr }}"; then + echo "Rules already exist" + else + sudo nft insert rule ip filter FORWARD ip saddr {{ spoke_network_cidr }} ip daddr {{ hub_network_cidr }} accept + sudo nft insert rule ip filter FORWARD ip saddr {{ hub_network_cidr }} ip daddr {{ spoke_network_cidr }} accept + echo "Rules added" + fi + register: nft_result + changed_when: "'added' in nft_result.stdout" + +- name: Display network setup result + debug: + msg: "Spoke network '{{ spoke_network_name }}' on {{ spoke_network_cidr }} ready, cross-bridge rules: {{ nft_result.stdout | trim }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml new file mode 100644 index 0000000..e29487e --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml @@ -0,0 +1,75 @@ +--- +# Create empty libvirt VMs for spoke cluster, capture UUID/MAC + +- name: Create spoke VM disks + shell: | + sudo qemu-img create -f qcow2 \ + {{ spoke_vm_image_dir }}/{{ spoke_cluster_name }}-master-{{ item }}.qcow2 \ + {{ spoke_vm_disk_size }}G + args: + creates: "{{ spoke_vm_image_dir }}/{{ spoke_cluster_name }}-master-{{ item }}.qcow2" + loop: "{{ range(spoke_ctlplanes) | list }}" + +- name: Check if spoke VMs already exist + shell: | + sudo virsh dominfo {{ spoke_cluster_name }}-master-{{ item }} 2>/dev/null && echo "EXISTS" || echo "NOT_FOUND" + loop: "{{ range(spoke_ctlplanes) | list }}" + register: vm_check + changed_when: false + +- name: Create spoke VMs (defined but not started) + shell: | + sudo virt-install \ + --name {{ spoke_cluster_name }}-master-{{ item.item }} \ + --ram {{ 
spoke_vm_memory }} \ + --vcpus {{ spoke_vm_vcpus }} \ + --disk {{ spoke_vm_image_dir }}/{{ spoke_cluster_name }}-master-{{ item.item }}.qcow2,bus=virtio \ + --network network={{ spoke_network_name }},model=virtio \ + --os-variant rhel9.0 \ + --graphics none \ + --noautoconsole \ + --boot hd,network \ + --noreboot \ + --import + loop: "{{ vm_check.results }}" + when: "'NOT_FOUND' in item.stdout" + +- name: Ensure spoke VMs are shut off + shell: | + sudo virsh destroy {{ spoke_cluster_name }}-master-{{ item }} 2>/dev/null || true + loop: "{{ range(spoke_ctlplanes) | list }}" + changed_when: false + failed_when: false + +- name: Capture spoke VM UUIDs + shell: | + sudo virsh domuuid {{ spoke_cluster_name }}-master-{{ item }} + loop: "{{ range(spoke_ctlplanes) | list }}" + register: vm_uuids + changed_when: false + +- name: Capture spoke VM MAC addresses + shell: | + sudo virsh domiflist {{ spoke_cluster_name }}-master-{{ item }} \ + | grep {{ spoke_network_name }} | awk '{print $5}' + loop: "{{ range(spoke_ctlplanes) | list }}" + register: vm_macs + changed_when: false + +- name: Build spoke VM info list + set_fact: + spoke_vms: >- + {{ spoke_vms | default([]) + [ + { + 'index': item.item, + 'name': spoke_cluster_name ~ '-master-' ~ item.item, + 'uuid': vm_uuids.results[item.item].stdout | trim, + 'mac': vm_macs.results[item.item].stdout | trim + } + ] }} + loop: "{{ vm_uuids.results }}" + +- name: Display spoke VM info + debug: + msg: "VM {{ item.name }}: UUID={{ item.uuid }}, MAC={{ item.mac }}" + loop: "{{ spoke_vms }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/main.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/main.yml new file mode 100644 index 0000000..43a420c --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/main.yml @@ -0,0 +1,30 @@ +--- +# Deploy spoke TNF cluster via assisted installer + BMH + +- block: + - name: Cleanup existing spoke resources 
+ include_tasks: cleanup.yml + when: force_cleanup | bool + + - name: Create dedicated libvirt network for spoke cluster + include_tasks: create-spoke-network.yml + + - name: Create spoke VMs + include_tasks: create-spoke-vms.yml + + - name: Verify sushy-tools is running + include_tasks: setup-ksushy.yml + + - name: Create spoke cluster resources on hub + include_tasks: create-cluster-resources.yml + + - name: Create BareMetalHost resources + include_tasks: create-bmh.yml + + - name: Wait for spoke cluster installation to complete + include_tasks: wait-for-install.yml + + - name: Retrieve spoke cluster credentials + include_tasks: retrieve-credentials.yml + environment: + KUBECONFIG: "{{ hub_kubeconfig }}" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml new file mode 100644 index 0000000..15b15e4 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml @@ -0,0 +1,100 @@ +--- +# Extract spoke cluster credentials and set up DNS for hypervisor access + +- name: Create spoke credential directory + file: + path: "{{ spoke_auth_dir }}" + state: directory + mode: '0700' + +- name: Wait for admin-kubeconfig secret + shell: | + oc get secret {{ spoke_cluster_name }}-admin-kubeconfig \ + -n {{ spoke_cluster_name }} -o name 2>/dev/null + register: kubeconfig_secret + until: kubeconfig_secret.rc == 0 + retries: 10 + delay: 15 + +- name: Extract spoke kubeconfig + shell: | + oc get secret {{ spoke_cluster_name }}-admin-kubeconfig \ + -n {{ spoke_cluster_name }} \ + -o jsonpath='{.data.kubeconfig}' | base64 -d + register: spoke_kubeconfig + changed_when: false + +- name: Save spoke kubeconfig + copy: + content: "{{ spoke_kubeconfig.stdout }}" + dest: "{{ spoke_auth_dir }}/kubeconfig" + mode: '0600' + +- name: Wait for admin-password secret + shell: | + oc get secret {{ spoke_cluster_name 
}}-admin-password \ + -n {{ spoke_cluster_name }} -o name 2>/dev/null + register: password_secret + until: password_secret.rc == 0 + retries: "{{ (spoke_credentials_timeout / 30) | int }}" + delay: 30 + +- name: Extract spoke admin password + shell: | + oc get secret {{ spoke_cluster_name }}-admin-password \ + -n {{ spoke_cluster_name }} \ + -o jsonpath='{.data.password}' | base64 -d + register: spoke_password + changed_when: false + +- name: Save spoke admin password + copy: + content: "{{ spoke_password.stdout }}" + dest: "{{ spoke_auth_dir }}/kubeadmin-password" + mode: '0600' + +- name: Add spoke API DNS to hypervisor /etc/hosts + lineinfile: + path: /etc/hosts + regexp: "api.{{ spoke_cluster_name }}.{{ spoke_base_domain }}" + line: "{{ spoke_api_vip }} api.{{ spoke_cluster_name }}.{{ spoke_base_domain }} api-int.{{ spoke_cluster_name }}.{{ spoke_base_domain }}" + state: present + become: true + +- name: Ensure spoke VMs are running + shell: | + STATE=$(virsh domstate {{ spoke_cluster_name }}-master-{{ item }} 2>/dev/null) + if [ "$STATE" != "running" ]; then + virsh start {{ spoke_cluster_name }}-master-{{ item }} + echo "STARTED" + else + echo "ALREADY_RUNNING" + fi + loop: "{{ range(spoke_ctlplanes) | list }}" + register: vm_start_result + changed_when: "'STARTED' in vm_start_result.stdout" + failed_when: false + become: true + +- name: Wait for spoke VMs to boot + pause: + seconds: 120 + when: vm_start_result.results | selectattr('stdout', 'search', 'STARTED') | list | length > 0 + +- name: Verify spoke cluster access + shell: | + KUBECONFIG={{ spoke_auth_dir }}/kubeconfig oc get nodes + register: spoke_nodes + changed_when: false + retries: 20 + delay: 30 + until: spoke_nodes.rc == 0 + +- name: Display spoke cluster nodes + debug: + msg: "{{ spoke_nodes.stdout }}" + +- name: Set spoke kubeconfig path as fact for post-deployment tasks + set_fact: + spoke_kubeconfig_path: "{{ spoke_auth_dir }}/kubeconfig" + spoke_kubeadmin_password_path: "{{ spoke_auth_dir 
}}/kubeadmin-password" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml new file mode 100644 index 0000000..88a4026 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml @@ -0,0 +1,34 @@ +--- +# Verify sushy-tools is running on the hypervisor (should already exist from dev-scripts) + +- name: Check if sushy-tools container is running + shell: | + sudo podman ps --format '{{ '{{' }}.Names{{ '}}' }}' | grep -q sushy-tools + register: sushy_check + changed_when: false + failed_when: false + +- name: Fail if sushy-tools is not running + fail: + msg: >- + sushy-tools container is not running. It should be started by dev-scripts. + Ensure the hub was deployed with 'make deploy fencing-ipi' before running + the assisted installer. + when: sushy_check.rc != 0 + +- name: Verify spoke VMs are visible via sushy-tools + shell: | + curl -sk https://{{ spoke_ksushy_ip }}:{{ spoke_ksushy_port }}/redfish/v1/Systems/ \ + -u {{ spoke_bmc_user }}:{{ spoke_bmc_password }} \ + | python3 -c "import json,sys; d=json.load(sys.stdin); print(d['Members@odata.count'])" + register: sushy_systems + changed_when: false + +- name: Verify expected number of systems visible + assert: + that: + - sushy_systems.stdout | int >= (spoke_ctlplanes + 2) + fail_msg: >- + Expected at least {{ spoke_ctlplanes + 2 }} systems in sushy-tools + ({{ spoke_ctlplanes }} spoke + 2 hub), but found {{ sushy_systems.stdout }}. 
+ success_msg: "sushy-tools has {{ sushy_systems.stdout }} systems visible ({{ spoke_ctlplanes }} spoke + 2 hub)" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml new file mode 100644 index 0000000..e0dd662 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml @@ -0,0 +1,103 @@ +--- +# Monitor BMH provisioning, agent registration, and installation progress + +- name: Wait for agents to register + shell: | + oc get agents -n {{ spoke_cluster_name }} --no-headers 2>/dev/null | wc -l + register: agent_count + until: agent_count.stdout | int >= spoke_ctlplanes + retries: "{{ (spoke_agent_register_timeout / 30) | int }}" + delay: 30 + +- name: Display registered agents + shell: | + oc get agents -n {{ spoke_cluster_name }} + register: agents_info + changed_when: false + +- name: Show registered agents + debug: + msg: "{{ agents_info.stdout }}" + +- name: Wait for spoke cluster installation to complete + shell: | + ACI_STATE=$(oc get agentclusterinstall {{ spoke_cluster_name }} \ + -n {{ spoke_cluster_name }} \ + -o jsonpath='{.status.debugInfo.state}' 2>/dev/null) + echo "$ACI_STATE" + case "$ACI_STATE" in + adding-hosts|installed) + exit 0 + ;; + error|failed) + echo "INSTALL FAILED" + oc get agentclusterinstall {{ spoke_cluster_name }} \ + -n {{ spoke_cluster_name }} \ + -o jsonpath='{.status.conditions}' 2>/dev/null | python3 -m json.tool + exit 2 + ;; + *) + exit 1 + ;; + esac + register: install_state + until: install_state.rc == 0 + retries: "{{ (spoke_install_timeout / 30) | int }}" + delay: 30 + failed_when: install_state.rc == 2 + +- name: Display final installation state + debug: + msg: "Spoke cluster installation state: {{ install_state.stdout_lines[0] }}" + +- name: Wait for all agents to reach Done stage + shell: | + oc get agents -n {{ 
spoke_cluster_name }} -o json 2>/dev/null \ + | python3 -c " + import json + import sys + + data = json.load(sys.stdin) + agents = data.get('items', []) + total = len(agents) + done = 0 + stuck = [] + for a in agents: + stage = a.get('status', {}).get('progress', {}).get('currentStage', 'unknown') + if stage == 'Done': + done += 1 + else: + state = a.get('status', {}).get('debugInfo', {}).get('state', 'unknown') + hostname = a.get('spec', {}).get('hostname', 'unknown') + stuck.append(f'{hostname}: state={state}, stage={stage}') + + print(f'Agents Done: {done} / {total}') + for s in stuck: + print(f' {s}') + sys.exit(0 if done == total else 1) + " + register: agents_done + until: agents_done.rc == 0 + retries: "{{ (spoke_install_timeout / 30) | int }}" + delay: 30 + changed_when: false + +- name: Display final agent status + shell: | + oc get agents -n {{ spoke_cluster_name }} + register: final_agents + changed_when: false + +- name: Show final agents + debug: + msg: "{{ final_agents.stdout }}" + +- name: Display final BMH status + shell: | + oc get bmh -n {{ spoke_cluster_name }} + register: final_bmh + changed_when: false + +- name: Show final BMH + debug: + msg: "{{ final_bmh.stdout }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/agentclusterinstall.yml.j2 b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/agentclusterinstall.yml.j2 new file mode 100644 index 0000000..0e18b80 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/agentclusterinstall.yml.j2 @@ -0,0 +1,22 @@ +apiVersion: extensions.hive.openshift.io/v1beta1 +kind: AgentClusterInstall +metadata: + name: {{ spoke_cluster_name }} + namespace: {{ spoke_cluster_name }} +spec: + clusterDeploymentRef: + name: {{ spoke_cluster_name }} + imageSetRef: + name: "{{ effective_ocp_version }}.0" + apiVIP: "{{ spoke_api_vip }}" + ingressVIP: "{{ spoke_ingress_vip }}" + platformType: BareMetal + 
networking: + clusterNetwork: + - cidr: "{{ spoke_cluster_network_cidr }}" + hostPrefix: 23 + serviceNetwork: + - "{{ spoke_service_network_cidr }}" + provisionRequirements: + controlPlaneAgents: {{ spoke_ctlplanes }} + sshPublicKey: "{{ ssh_pub_key.stdout }}" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterdeployment.yml.j2 b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterdeployment.yml.j2 new file mode 100644 index 0000000..a31289f --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterdeployment.yml.j2 @@ -0,0 +1,20 @@ +apiVersion: hive.openshift.io/v1 +kind: ClusterDeployment +metadata: + name: {{ spoke_cluster_name }} + namespace: {{ spoke_cluster_name }} +spec: + baseDomain: {{ spoke_base_domain }} + clusterName: {{ spoke_cluster_name }} + clusterInstallRef: + group: extensions.hive.openshift.io + kind: AgentClusterInstall + name: {{ spoke_cluster_name }} + version: v1beta1 + platform: + agentBareMetal: + agentSelector: + matchLabels: + cluster: tnf + pullSecretRef: + name: {{ spoke_cluster_name }}-pull-secret diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterimageset.yml.j2 b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterimageset.yml.j2 new file mode 100644 index 0000000..82eed09 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterimageset.yml.j2 @@ -0,0 +1,6 @@ +apiVersion: hive.openshift.io/v1 +kind: ClusterImageSet +metadata: + name: "{{ effective_ocp_version }}.0" +spec: + releaseImage: {{ effective_release_image }} diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/infraenv.yml.j2 b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/infraenv.yml.j2 new file mode 100644 index 0000000..4945f81 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/infraenv.yml.j2 @@ -0,0 +1,15 @@ 
+apiVersion: agent-install.openshift.io/v1beta1 +kind: InfraEnv +metadata: + name: {{ spoke_cluster_name }} + namespace: {{ spoke_cluster_name }} +spec: + clusterRef: + name: {{ spoke_cluster_name }} + namespace: {{ spoke_cluster_name }} + sshAuthorizedKey: "{{ ssh_pub_key.stdout }}" + agentLabels: + cluster: tnf + cpuArchitecture: x86_64 + pullSecretRef: + name: {{ spoke_cluster_name }}-pull-secret diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/spoke-network.xml.j2 b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/spoke-network.xml.j2 new file mode 100644 index 0000000..63f2e23 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/spoke-network.xml.j2 @@ -0,0 +1,29 @@ + + {{ spoke_network_name }} + + + + + + + + + + apps.{{ spoke_cluster_name }}.{{ spoke_base_domain }} + + + api.{{ spoke_cluster_name }}.{{ spoke_base_domain }} + + + + + + + + + + + + + + diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml new file mode 100644 index 0000000..3712692 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml @@ -0,0 +1,21 @@ +--- +# Role-internal variables + +# Derived from spoke_network_cidr +spoke_network_gateway: "{{ spoke_network_cidr | ansible.utils.ipaddr('1') | ansible.utils.ipaddr('address') }}" +spoke_dhcp_start: "{{ spoke_network_cidr | ansible.utils.ipaddr('50') | ansible.utils.ipaddr('address') }}" +spoke_dhcp_end: "{{ spoke_network_cidr | ansible.utils.ipaddr('150') | ansible.utils.ipaddr('address') }}" +spoke_network_prefix: "{{ spoke_network_cidr | ansible.utils.ipaddr('prefix') }}" + +# Libvirt network name (derived from spoke cluster name) +spoke_network_name: "{{ spoke_cluster_name }}" + +# Pull secret location (dev-scripts standard path) +pull_secret_path: /opt/dev-scripts/pull_secret.json + +# VM image path +spoke_vm_image_dir: 
/var/lib/libvirt/images + +# Credential output paths +spoke_auth_dir: "{{ ansible_user_dir }}/{{ spoke_cluster_name }}/auth" +hub_auth_dir: "{{ ansible_user_dir }}/auth" \ No newline at end of file diff --git a/deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh b/deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh new file mode 100755 index 0000000..eba97c4 --- /dev/null +++ b/deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Get the directory where this script is located +SCRIPT_DIR=$(dirname "$0") +# Get the deploy directory (two levels up from scripts) +DEPLOY_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +set -o nounset +set -o errexit +set -o pipefail + +# Check if instance data exists +if [[ ! -f "${DEPLOY_DIR}/aws-hypervisor/instance-data/aws-instance-id" ]]; then + echo "Error: No instance found. Please run 'make deploy' first." + exit 1 +fi + +# Check if inventory.ini exists +if [[ ! -f "${DEPLOY_DIR}/openshift-clusters/inventory.ini" ]]; then + echo "Error: inventory.ini not found in ${DEPLOY_DIR}/openshift-clusters/" + echo "Please ensure the inventory file is properly configured." + echo "You can run 'make inventory' to update it with current instance information." + exit 1 +fi + +# Check if vars/assisted.yml exists +if [[ ! -f "${DEPLOY_DIR}/openshift-clusters/vars/assisted.yml" ]]; then + echo "Error: vars/assisted.yml not found." + echo "Copy the template and customize it:" + echo " cp ${DEPLOY_DIR}/openshift-clusters/vars/assisted.yml.template ${DEPLOY_DIR}/openshift-clusters/vars/assisted.yml" + exit 1 +fi + +echo "Deploying spoke TNF cluster via assisted installer..." + +cd "${DEPLOY_DIR}/openshift-clusters" + +if ansible-playbook assisted-install.yml -i inventory.ini; then + echo "" + echo "OpenShift spoke TNF cluster deployment via assisted installer completed successfully!" + echo "" + echo "Next steps:" + echo "1. 
Access spoke cluster:" + echo " source ${DEPLOY_DIR}/openshift-clusters/proxy.env" + echo " KUBECONFIG=~/spoke-tnf/auth/kubeconfig oc get nodes" + echo "2. Access hub cluster:" + echo " source ${DEPLOY_DIR}/openshift-clusters/hub-proxy.env" + echo " KUBECONFIG=~/auth/kubeconfig oc get nodes" +else + echo "Error: Spoke cluster deployment failed!" + echo "Check the Ansible logs for more details." + exit 1 +fi diff --git a/deploy/openshift-clusters/vars/assisted.yml.template b/deploy/openshift-clusters/vars/assisted.yml.template new file mode 100644 index 0000000..dc4f0d0 --- /dev/null +++ b/deploy/openshift-clusters/vars/assisted.yml.template @@ -0,0 +1,48 @@ +# Assisted Installer Configuration +# Copy this file to vars/assisted.yml and customize as needed +# +# Usage: After deploying a hub cluster with 'make deploy fencing-ipi', +# run 'make fencing-assisted' to deploy a spoke TNF cluster via ACM/assisted installer. + +# Hub operator: "acm" or "mce" +hub_operator: acm + +# ACM/MCE channel: "auto" detects from packagemanifest (recommended) +# Override with specific channel like "release-2.15" if needed +acm_channel: "auto" +mce_channel: "auto" + +# Spoke cluster identity +spoke_cluster_name: spoke-tnf +spoke_base_domain: example.com + +# Spoke OCP version +# "auto" uses the same release image as the hub (recommended) +# Or specify an explicit release image URL +spoke_release_image: "auto" + +# Spoke VM specifications +spoke_vm_memory: 32768 # MB (32GB) +spoke_vm_vcpus: 4 +spoke_vm_disk_size: 120 # GB +spoke_ctlplanes: 2 # Always 2 for TNF + +# Spoke network configuration +spoke_network_cidr: "192.168.125.0/24" +spoke_api_vip: "192.168.125.5" +spoke_ingress_vip: "192.168.125.10" +spoke_cluster_network_cidr: "10.132.0.0/14" +spoke_service_network_cidr: "172.31.0.0/16" + +# BMC / sushy-tools (defaults match dev-scripts deployment) +spoke_bmc_user: admin +spoke_bmc_password: password +spoke_ksushy_ip: "192.168.111.1" +spoke_ksushy_port: 8000 + +# Storage for assisted 
service on hub +# Currently only "hostpath" is supported +assisted_storage_method: "hostpath" + +# Deployment options +force_cleanup: false \ No newline at end of file From 8719ccc5e0c66625b8b287a442ee802e09db9bd9 Mon Sep 17 00:00:00 2001 From: Gal Amado Date: Wed, 18 Feb 2026 13:09:50 +0200 Subject: [PATCH 3/9] Fix critical review issues for assisted installer roles - C1: SSH key detection now tries ed25519 first, falls back to rsa/ecdsa - C2: Deduplicate pull_secret_path and hub_kubeconfig to playbook level - C3: Move hub release image extraction to playbook pre_tasks (run once) - C4: RHCOS ISO extraction checks rc and catches empty string failures - C5: Explicit disk cleanup prevents stale qcow2 reuse on re-deploy - Remove unused hub_auth_dir variable (I11) Co-Authored-By: Claude Opus 4.6 --- .../openshift-clusters/assisted-install.yml | 31 ++++++++++++++++--- .../assisted/acm-install/defaults/main.yml | 3 -- .../tasks/agent-service-config.yml | 31 ++++++------------- .../templates/agentserviceconfig.yml.j2 | 2 +- .../roles/assisted/acm-install/vars/main.yml | 5 +-- .../assisted/assisted-spoke/defaults/main.yml | 3 -- .../assisted/assisted-spoke/tasks/cleanup.yml | 7 +++++ .../tasks/create-cluster-resources.yml | 26 ++++------------ .../assisted/assisted-spoke/vars/main.yml | 6 +--- 9 files changed, 52 insertions(+), 62 deletions(-) diff --git a/deploy/openshift-clusters/assisted-install.yml b/deploy/openshift-clusters/assisted-install.yml index 85e54dc..71a3027 100644 --- a/deploy/openshift-clusters/assisted-install.yml +++ b/deploy/openshift-clusters/assisted-install.yml @@ -13,6 +13,8 @@ vars: topology: fencing interactive_mode: false + pull_secret_path: /opt/dev-scripts/pull_secret.json + hub_kubeconfig: "{{ ansible_user_dir }}/auth/kubeconfig" vars_files: - vars/assisted.yml @@ -44,10 +46,6 @@ The hub cluster must be deployed first. 
when: not hub_kubeconfig_check.stat.exists - - name: Set hub KUBECONFIG path - set_fact: - hub_kubeconfig: "{{ ansible_user_dir }}/auth/kubeconfig" - - name: Preserve hub proxy.env as hub-proxy.env copy: src: "{{ playbook_dir }}/proxy.env" @@ -56,6 +54,31 @@ backup: no delegate_to: localhost + - name: Get hub release image + shell: | + oc get clusterversion version -o jsonpath='{.status.desired.image}' + register: hub_release_image_raw + changed_when: false + environment: + KUBECONFIG: "{{ hub_kubeconfig }}" + + - name: Get hub OCP version + shell: | + oc get clusterversion version -o jsonpath='{.status.desired.version}' | cut -d. -f1-2 + register: hub_ocp_version_raw + changed_when: false + environment: + KUBECONFIG: "{{ hub_kubeconfig }}" + + - name: Set hub release facts + set_fact: + hub_release_image: "{{ hub_release_image_raw.stdout }}" + hub_ocp_version: "{{ hub_ocp_version_raw.stdout }}" + effective_release_image: >- + {{ hub_release_image_raw.stdout if spoke_release_image == 'auto' + else spoke_release_image }} + effective_ocp_version: "{{ hub_ocp_version_raw.stdout }}" + - name: Display assisted installer configuration debug: msg: | diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml b/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml index 2e078f4..4e933c1 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml @@ -1,9 +1,6 @@ --- # Default variables for acm-install role -# Hub kubeconfig path (set by playbook pre_tasks, fallback to ansible_user_dir) -hub_kubeconfig: "{{ ansible_user_dir }}/auth/kubeconfig" - # Hub operator to install: "acm" or "mce" hub_operator: acm diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml index 2f127f8..6273479 100644 --- 
a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml @@ -1,24 +1,6 @@ --- # Create AgentServiceConfig with RHCOS ISO auto-extracted from release image - -- name: Get hub release image - shell: | - oc get clusterversion version -o jsonpath='{.status.desired.image}' - register: hub_release_image - changed_when: false - -- name: Get hub OCP version - shell: | - oc get clusterversion version -o jsonpath='{.status.desired.version}' \ - | cut -d. -f1-2 - register: hub_ocp_version - changed_when: false - -- name: Determine spoke release image - set_fact: - effective_release_image: >- - {{ hub_release_image.stdout if spoke_release_image == 'auto' - else spoke_release_image }} +# hub_release_image, hub_ocp_version, effective_release_image are set by playbook pre_tasks - name: Extract RHCOS ISO URL from release image shell: | @@ -47,7 +29,9 @@ " register: rhcos_iso_extraction changed_when: false - failed_when: "'FAILED' in rhcos_iso_extraction.stdout" + failed_when: >- + rhcos_iso_extraction.rc != 0 or + 'FAILED' in rhcos_iso_extraction.stdout - name: Try fallback RHCOS ISO extraction via coreos print-stream-json shell: | @@ -75,8 +59,11 @@ - name: Set RHCOS ISO URL fact set_fact: rhcos_iso_url: >- - {{ rhcos_iso_fallback.stdout | default(rhcos_iso_extraction.stdout) | trim }} - failed_when: rhcos_iso_url == 'FAILED' or rhcos_iso_url == 'NEEDS_FALLBACK' + {{ (rhcos_iso_fallback.stdout | default(rhcos_iso_extraction.stdout)) | trim }} + failed_when: >- + rhcos_iso_url == 'FAILED' or + rhcos_iso_url == 'NEEDS_FALLBACK' or + rhcos_iso_url == '' - name: Display RHCOS ISO URL debug: diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 b/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 index 8b71e4b..0d8527e 100644 --- 
a/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 +++ b/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 @@ -17,6 +17,6 @@ spec: storage: {{ assisted_images_size }} osImages: - cpuArchitecture: x86_64 - openshiftVersion: "{{ hub_ocp_version.stdout }}" + openshiftVersion: "{{ hub_ocp_version }}" url: "{{ rhcos_iso_url }}" version: "{{ rhcos_version }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml b/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml index a32a832..8f1a561 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml @@ -17,7 +17,4 @@ operator_config: source: redhat-operators # The MCE namespace is always multicluster-engine regardless of hub_operator -assisted_service_namespace: multicluster-engine - -# Pull secret location (dev-scripts standard path) -pull_secret_path: /opt/dev-scripts/pull_secret.json \ No newline at end of file +assisted_service_namespace: multicluster-engine \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml index 668375f..86a371e 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml @@ -1,9 +1,6 @@ --- # Default variables for assisted-spoke role -# Hub kubeconfig path (set by playbook pre_tasks, fallback to ansible_user_dir) -hub_kubeconfig: "{{ ansible_user_dir }}/auth/kubeconfig" - # Spoke cluster identity spoke_cluster_name: spoke-tnf spoke_base_domain: example.com diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml index 
55cc58a..2dbaa36 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml @@ -24,6 +24,13 @@ changed_when: true failed_when: false +- name: Remove spoke VM disk images + file: + path: "{{ spoke_vm_image_dir }}/{{ spoke_cluster_name }}-master-{{ item }}.qcow2" + state: absent + loop: "{{ range(spoke_ctlplanes) | list }}" + become: true + - name: Remove spoke libvirt network shell: | sudo virsh net-destroy {{ spoke_network_name }} 2>/dev/null || true diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml index f6276a4..fa1962c 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml @@ -1,29 +1,15 @@ --- # Create spoke cluster resources on hub: namespace, secrets, ClusterDeployment, # AgentClusterInstall, InfraEnv, ClusterImageSet - -- name: Get hub release image for spoke - shell: | - oc get clusterversion version -o jsonpath='{.status.desired.image}' - register: hub_release_image - changed_when: false - -- name: Get hub OCP version - shell: | - oc get clusterversion version -o jsonpath='{.status.desired.version}' | cut -d. 
-f1-2 - register: hub_ocp_version - changed_when: false - -- name: Set effective spoke release image - set_fact: - effective_release_image: >- - {{ hub_release_image.stdout if spoke_release_image == 'auto' - else spoke_release_image }} - effective_ocp_version: "{{ hub_ocp_version.stdout }}" +# hub_release_image, hub_ocp_version, effective_release_image, effective_ocp_version +# are set by playbook pre_tasks - name: Get SSH public key shell: | - cat ~/.ssh/id_rsa.pub + for key in ~/.ssh/id_ed25519.pub ~/.ssh/id_rsa.pub ~/.ssh/id_ecdsa.pub; do + [ -f "$key" ] && cat "$key" && exit 0 + done + echo "ERROR: No SSH public key found" >&2 && exit 1 register: ssh_pub_key changed_when: false diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml index 3712692..6434d77 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml @@ -10,12 +10,8 @@ spoke_network_prefix: "{{ spoke_network_cidr | ansible.utils.ipaddr('prefix') }} # Libvirt network name (derived from spoke cluster name) spoke_network_name: "{{ spoke_cluster_name }}" -# Pull secret location (dev-scripts standard path) -pull_secret_path: /opt/dev-scripts/pull_secret.json - # VM image path spoke_vm_image_dir: /var/lib/libvirt/images # Credential output paths -spoke_auth_dir: "{{ ansible_user_dir }}/{{ spoke_cluster_name }}/auth" -hub_auth_dir: "{{ ansible_user_dir }}/auth" \ No newline at end of file +spoke_auth_dir: "{{ ansible_user_dir }}/{{ spoke_cluster_name }}/auth" \ No newline at end of file From bc768c38a5f79c34918d2215903c2a2a47c6dc89 Mon Sep 17 00:00:00 2001 From: Gal Amado Date: Wed, 18 Feb 2026 19:53:09 +0200 Subject: [PATCH 4/9] Fix important review issues and add READMEs for assisted installer roles - I6: Parse spoke_cluster_name from vars/assisted.yml in deploy script instead of hardcoding 
~/spoke-tnf/auth/kubeconfig - I8: Add cluster state tracking (deploying/deployed) to assisted-install.yml using common/cluster-state.yml, consistent with dev-scripts and kcli methods - I9: Replace blanket failed_when:false with ignore_errors:true in cleanup, conditional error checking in enable-watch-all-namespaces, and remove failed_when:false from virsh start in retrieve-credentials - I10: Add block/rescue diagnostic handlers to all 9 wait loops across install-operator, agent-service-config, wait-for-install, and retrieve-credentials, dumping relevant status on timeout - I13: Add README.md for acm-install and assisted-spoke roles - S16: Document DHCP range constraints in assisted.yml.template - S17: Expose hub_network_cidr in assisted.yml.template - S18: Add trailing newlines to all 22 new files Co-Authored-By: Claude Opus 4.6 --- .../openshift-clusters/assisted-install.yml | 23 ++ .../roles/assisted/acm-install/README.md | 79 +++++++ .../assisted/acm-install/defaults/main.yml | 2 +- .../tasks/agent-service-config.yml | 41 +++- .../assisted/acm-install/tasks/enable-tnf.yml | 2 +- .../tasks/enable-watch-all-namespaces.yml | 9 +- .../acm-install/tasks/install-operator.yml | 79 +++++-- .../assisted/acm-install/tasks/storage.yml | 2 +- .../assisted/acm-install/tasks/validate.yml | 2 +- .../templates/agentserviceconfig.yml.j2 | 2 +- .../templates/multiclusterhub.yml.j2 | 2 +- .../templates/operator-subscription.yml.j2 | 2 +- .../roles/assisted/acm-install/vars/main.yml | 2 +- .../roles/assisted/assisted-spoke/README.md | 132 +++++++++++ .../assisted/assisted-spoke/defaults/main.yml | 2 +- .../assisted/assisted-spoke/tasks/cleanup.yml | 10 +- .../assisted-spoke/tasks/create-bmh.yml | 2 +- .../tasks/create-cluster-resources.yml | 2 +- .../tasks/create-spoke-network.yml | 2 +- .../assisted-spoke/tasks/create-spoke-vms.yml | 2 +- .../tasks/retrieve-credentials.yml | 113 +++++++--- .../assisted-spoke/tasks/setup-ksushy.yml | 2 +- 
.../assisted-spoke/tasks/wait-for-install.yml | 209 ++++++++++++------ .../assisted/assisted-spoke/vars/main.yml | 2 +- .../scripts/deploy-fencing-assisted.sh | 8 +- .../vars/assisted.yml.template | 8 +- 26 files changed, 600 insertions(+), 141 deletions(-) create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/README.md create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/README.md diff --git a/deploy/openshift-clusters/assisted-install.yml b/deploy/openshift-clusters/assisted-install.yml index 71a3027..ccff642 100644 --- a/deploy/openshift-clusters/assisted-install.yml +++ b/deploy/openshift-clusters/assisted-install.yml @@ -15,6 +15,9 @@ interactive_mode: false pull_secret_path: /opt/dev-scripts/pull_secret.json hub_kubeconfig: "{{ ansible_user_dir }}/auth/kubeconfig" + method: assisted + cluster_state_dir: "../aws-hypervisor/instance-data" + cluster_state_filename: "cluster-vm-state.json" vars_files: - vars/assisted.yml @@ -94,6 +97,16 @@ Storage method: {{ assisted_storage_method }} Force cleanup: {{ force_cleanup }} + - name: Update cluster state to deploying + include_role: + name: common + tasks_from: cluster-state + vars: + cluster_state_phase: 'deploying' + default_playbook_name: 'assisted-install.yml' + num_masters: "{{ spoke_ctlplanes }}" + num_workers: 0 + roles: - role: assisted/acm-install - role: assisted/assisted-spoke @@ -113,6 +126,16 @@ vars: test_cluster_name: "{{ spoke_cluster_name }}" + - name: Update cluster state to deployed + include_role: + name: common + tasks_from: cluster-state + vars: + cluster_state_phase: 'deployed' + default_playbook_name: 'assisted-install.yml' + num_masters: "{{ spoke_ctlplanes }}" + num_workers: 0 + - name: Display deployment summary debug: msg: | diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/README.md b/deploy/openshift-clusters/roles/assisted/acm-install/README.md new file mode 100644 index 0000000..8e6a125 --- /dev/null +++ 
b/deploy/openshift-clusters/roles/assisted/acm-install/README.md @@ -0,0 +1,79 @@ +# acm-install Role + +Installs ACM or MCE operator on a hub cluster and configures the assisted installer service for spoke TNF cluster deployment. + +## Description + +This role prepares an existing hub OpenShift cluster to deploy spoke TNF clusters via the assisted installer. It: + +1. Validates hub cluster health and prerequisites +2. Provisions hostPath storage for the assisted service +3. Installs the ACM or MCE operator (auto-detects channel) +4. Creates the AgentServiceConfig with RHCOS ISO auto-extracted from the hub release image +5. Enables TNF cluster support in the assisted service +6. Configures BMO to watch all namespaces and disables the provisioning network + +## Requirements + +- A running hub OpenShift cluster (deployed via `make deploy fencing-ipi` or equivalent) +- Hub kubeconfig accessible at `~/auth/kubeconfig` +- Pull secret with access to required registries +- `oc` CLI available on the hypervisor + +## Role Variables + +### Configurable Variables (defaults/main.yml) + +- `hub_operator`: Operator to install - `"acm"` or `"mce"` (default: `"acm"`) +- `acm_channel`: ACM operator channel - `"auto"` detects from packagemanifest (default: `"auto"`) +- `mce_channel`: MCE operator channel (default: `"auto"`) +- `assisted_storage_method`: Storage backend - currently only `"hostpath"` (default: `"hostpath"`) +- `assisted_images_path`: Host directory for ISO images (default: `/var/lib/assisted-images`) +- `assisted_db_path`: Host directory for database (default: `/var/lib/assisted-db`) +- `assisted_images_size`: PV size for images (default: `50Gi`) +- `assisted_db_size`: PV size for database (default: `10Gi`) +- `assisted_storage_class`: StorageClass name (default: `assisted-service`) + +### Timeout Variables + +- `acm_csv_timeout`: Operator CSV install timeout in seconds (default: `900`) +- `multiclusterhub_timeout`: MultiClusterHub readiness timeout (default: `1800`) 
+- `assisted_service_timeout`: Assisted service pod readiness timeout (default: `600`) +- `metal3_stabilize_timeout`: Metal3 pod stabilization timeout after provisioning changes (default: `300`) + +### Variables Set by Playbook + +These are set in `assisted-install.yml` and passed to the role: + +- `hub_kubeconfig`: Path to hub cluster kubeconfig +- `pull_secret_path`: Path to pull secret on the hypervisor +- `hub_release_image`: Hub cluster release image (extracted in playbook pre_tasks) +- `hub_ocp_version`: Hub OCP version major.minor (extracted in playbook pre_tasks) +- `effective_release_image`: Release image to use for the spoke (hub image or user override) + +## Task Flow + +1. **validate.yml** - Checks hub cluster health, node readiness, and API access +2. **storage.yml** - Creates hostPath PVs, StorageClass, and fixes permissions/SELinux on hub nodes +3. **install-operator.yml** - Installs ACM/MCE operator subscription, waits for CSV, creates MultiClusterHub +4. **agent-service-config.yml** - Extracts RHCOS ISO URL from release image, creates AgentServiceConfig +5. **enable-tnf.yml** - Enables TNF support in assisted service configuration +6. **enable-watch-all-namespaces.yml** - Patches Provisioning CR to enable BMO in all namespaces + +## Usage + +This role is not called directly. 
It is invoked via `assisted-install.yml`: + +```bash +make deploy fencing-assisted +# or +ansible-playbook assisted-install.yml -i inventory.ini +``` + +## Troubleshooting + +- Check operator CSV status: `oc get csv -n open-cluster-management` +- Check MultiClusterHub status: `oc get multiclusterhub -n open-cluster-management` +- Check assisted service pods: `oc get pods -n multicluster-engine -l app=assisted-service` +- Check AgentServiceConfig: `oc get agentserviceconfig agent -o yaml` +- Check events: `oc get events -n multicluster-engine --sort-by='.lastTimestamp'` \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml b/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml index 4e933c1..0c4d760 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml @@ -22,4 +22,4 @@ assisted_storage_class: assisted-service acm_csv_timeout: 900 multiclusterhub_timeout: 1800 assisted_service_timeout: 600 -metal3_stabilize_timeout: 300 \ No newline at end of file +metal3_stabilize_timeout: 300 diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml index 6273479..0cd9fc7 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml @@ -85,14 +85,37 @@ register: asc_result changed_when: "'created' in asc_result.stdout" -- name: Wait for assisted-service pod to be Running (2/2) - shell: | - oc get pods -n {{ assisted_service_namespace }} -l app=assisted-service \ - --no-headers 2>/dev/null | grep -q '2/2.*Running' - register: assisted_pod - until: assisted_pod.rc == 0 - retries: "{{ (assisted_service_timeout / 15) | int }}" - delay: 15 +- block: + - name: 
Wait for assisted-service pod to be Running (2/2) + shell: | + oc get pods -n {{ assisted_service_namespace }} -l app=assisted-service \ + --no-headers 2>/dev/null | grep -q '2/2.*Running' + register: assisted_pod + until: assisted_pod.rc == 0 + retries: "{{ (assisted_service_timeout / 15) | int }}" + delay: 15 + rescue: + - name: Collect assisted-service timeout diagnostics + shell: | + echo "=== Assisted Service Pods ===" + oc get pods -n {{ assisted_service_namespace }} 2>/dev/null + echo "" + echo "=== Pod Details ===" + oc describe pods -n {{ assisted_service_namespace }} -l app=assisted-service 2>/dev/null | tail -40 + echo "" + echo "=== Recent Events ===" + oc get events -n {{ assisted_service_namespace }} --sort-by='.lastTimestamp' 2>/dev/null | tail -20 + register: assisted_diag + changed_when: false + failed_when: false + + - name: Display assisted-service timeout diagnostics + debug: + msg: "{{ assisted_diag.stdout }}" + + - name: Fail after assisted-service timeout + fail: + msg: "assisted-service pod did not reach Running (2/2) state within timeout" - name: Display assisted-service pod status shell: | @@ -102,4 +125,4 @@ - name: Show assisted-service pod debug: - msg: "{{ pod_status.stdout }}" \ No newline at end of file + msg: "{{ pod_status.stdout }}" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml index 00dbd95..3a00c4c 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml @@ -42,4 +42,4 @@ - name: Display TNF support status debug: - msg: "{{ tnf_verify.stdout | trim }}" \ No newline at end of file + msg: "{{ tnf_verify.stdout | trim }}" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml 
b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml index ba932f3..cb7dd4f 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml @@ -19,10 +19,13 @@ {"op":"remove","path":"/spec/provisioningDHCPRange"}, {"op":"remove","path":"/spec/provisioningNetworkCIDR"}, {"op":"remove","path":"/spec/provisioningInterface"} - ]' 2>&1 || echo "Some provisioning fields may not exist, continuing" + ]' 2>&1 register: remove_result changed_when: "'patched' in remove_result.stdout" - failed_when: false + failed_when: >- + remove_result.rc != 0 and + 'does not exist' not in remove_result.stderr and + 'does not exist' not in remove_result.stdout - name: Wait for metal3 pod to stabilize after provisioning change shell: | @@ -43,4 +46,4 @@ - name: Show metal3 pod debug: - msg: "{{ metal3_status.stdout }}" \ No newline at end of file + msg: "{{ metal3_status.stdout }}" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml index 18362ea..caeafae 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml @@ -63,15 +63,35 @@ register: sub_result changed_when: "'created' in sub_result.stdout" -- name: Wait for operator CSV to succeed - shell: | - oc get csv -n {{ op_config.namespace }} --no-headers 2>/dev/null \ - | grep {{ op_config.package_name }} \ - | grep -q Succeeded - register: csv_result - until: csv_result.rc == 0 - retries: "{{ (acm_csv_timeout / 15) | int }}" - delay: 15 +- block: + - name: Wait for operator CSV to succeed + shell: | + oc get csv -n {{ op_config.namespace }} --no-headers 2>/dev/null \ + | grep {{ op_config.package_name }} \ + | grep -q Succeeded 
+ register: csv_result + until: csv_result.rc == 0 + retries: "{{ (acm_csv_timeout / 15) | int }}" + delay: 15 + rescue: + - name: Collect CSV timeout diagnostics + shell: | + echo "=== CSV Status ===" + oc get csv -n {{ op_config.namespace }} 2>/dev/null + echo "" + echo "=== Recent Events ===" + oc get events -n {{ op_config.namespace }} --sort-by='.lastTimestamp' 2>/dev/null | tail -20 + register: csv_diag + changed_when: false + failed_when: false + + - name: Display CSV timeout diagnostics + debug: + msg: "{{ csv_diag.stdout }}" + + - name: Fail after CSV timeout + fail: + msg: "Operator CSV did not reach Succeeded state within timeout" - name: Display operator install result shell: | @@ -99,17 +119,40 @@ changed_when: "'created' in mch_result.stdout" when: hub_operator == 'acm' -- name: Wait for MultiClusterHub to reach Running phase - shell: | - oc get multiclusterhub multiclusterhub -n {{ op_config.namespace }} \ - -o jsonpath='{.status.phase}' - register: mch_phase - until: mch_phase.stdout == 'Running' - retries: "{{ (multiclusterhub_timeout / 30) | int }}" - delay: 30 +- block: + - name: Wait for MultiClusterHub to reach Running phase + shell: | + oc get multiclusterhub multiclusterhub -n {{ op_config.namespace }} \ + -o jsonpath='{.status.phase}' + register: mch_phase + until: mch_phase.stdout == 'Running' + retries: "{{ (multiclusterhub_timeout / 30) | int }}" + delay: 30 + rescue: + - name: Collect MCH timeout diagnostics + shell: | + echo "=== MultiClusterHub Status ===" + oc get multiclusterhub multiclusterhub -n {{ op_config.namespace }} -o yaml 2>/dev/null | grep -A 50 'status:' + echo "" + echo "=== Non-Running Pods ===" + oc get pods -n {{ op_config.namespace }} --no-headers 2>/dev/null | grep -v Running + echo "" + echo "=== Recent Events ===" + oc get events -n {{ op_config.namespace }} --sort-by='.lastTimestamp' 2>/dev/null | tail -20 + register: mch_diag + changed_when: false + failed_when: false + + - name: Display MCH timeout diagnostics + 
debug: + msg: "{{ mch_diag.stdout }}" + + - name: Fail after MCH timeout + fail: + msg: "MultiClusterHub did not reach Running phase within timeout" when: hub_operator == 'acm' - name: Display MultiClusterHub status debug: msg: "MultiClusterHub phase: {{ mch_phase.stdout }}" - when: hub_operator == 'acm' \ No newline at end of file + when: hub_operator == 'acm' diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml index 4506150..47c715d 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml @@ -59,4 +59,4 @@ - name: Display storage setup result debug: - msg: "Storage provisioned: hostPath PVs with permissions and SELinux fix on {{ hub_nodes.stdout_lines | length }} nodes" \ No newline at end of file + msg: "Storage provisioned: hostPath PVs with permissions and SELinux fix on {{ hub_nodes.stdout_lines | length }} nodes" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml index 57be824..a5c103a 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml @@ -33,4 +33,4 @@ - name: Display hub cluster status debug: - msg: "Hub cluster healthy: {{ node_count.stdout | trim }} nodes Ready, {{ co_check.stdout | trim }}" \ No newline at end of file + msg: "Hub cluster healthy: {{ node_count.stdout | trim }} nodes Ready, {{ co_check.stdout | trim }}" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 b/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 index 0d8527e..dc97a08 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 
+++ b/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 @@ -19,4 +19,4 @@ spec: - cpuArchitecture: x86_64 openshiftVersion: "{{ hub_ocp_version }}" url: "{{ rhcos_iso_url }}" - version: "{{ rhcos_version }}" \ No newline at end of file + version: "{{ rhcos_version }}" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2 b/deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2 index 2b68364..fce239b 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2 +++ b/deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2 @@ -4,4 +4,4 @@ metadata: name: multiclusterhub namespace: {{ op_config.namespace }} spec: - availabilityConfig: Basic \ No newline at end of file + availabilityConfig: Basic diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2 b/deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2 index f6c3109..6bec2ad 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2 +++ b/deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2 @@ -8,4 +8,4 @@ spec: installPlanApproval: Automatic name: {{ op_config.subscription_name }} source: {{ op_config.source }} - sourceNamespace: openshift-marketplace \ No newline at end of file + sourceNamespace: openshift-marketplace diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml b/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml index 8f1a561..b4679a9 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml @@ -17,4 +17,4 @@ operator_config: source: redhat-operators # The MCE namespace is always multicluster-engine regardless of 
hub_operator -assisted_service_namespace: multicluster-engine \ No newline at end of file +assisted_service_namespace: multicluster-engine diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/README.md b/deploy/openshift-clusters/roles/assisted/assisted-spoke/README.md new file mode 100644 index 0000000..067b3d8 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/README.md @@ -0,0 +1,132 @@ +# assisted-spoke Role + +Deploys a spoke TNF (Two-Node with Fencing) cluster on a hub via the assisted installer and BareMetalHost resources. + +## Description + +This role creates and installs a spoke TNF cluster on an existing hub that has ACM/MCE and the assisted service configured (via the `acm-install` role). It: + +1. Optionally cleans up existing spoke resources (when `force_cleanup=true`) +2. Creates a dedicated libvirt network for the spoke cluster +3. Creates spoke VMs with the specified resources +4. Verifies sushy-tools (Redfish BMC simulator) is running +5. Creates cluster resources on the hub (ClusterDeployment, AgentClusterInstall, InfraEnv, ClusterImageSet) +6. Creates BareMetalHost resources to trigger agent-based installation +7. Monitors agent registration, cluster installation, and agent completion +8. 
Retrieves spoke cluster credentials (kubeconfig, admin password) + +## Requirements + +- Hub cluster with ACM/MCE and assisted service configured (run `acm-install` role first) +- Hub kubeconfig accessible at `~/auth/kubeconfig` +- libvirt/KVM available on the hypervisor +- sushy-tools installed for Redfish BMC simulation +- `oc` and `virsh` CLIs available on the hypervisor + +## Role Variables + +### Spoke Cluster Identity + +- `spoke_cluster_name`: Cluster name, must be DNS-safe (default: `"spoke-tnf"`) +- `spoke_base_domain`: Base domain for the spoke cluster (default: `"example.com"`) +- `spoke_release_image`: Release image - `"auto"` uses the hub release image (default: `"auto"`) + +### VM Specifications + +- `spoke_vm_memory`: Memory per node in MB (default: `32768`) +- `spoke_vm_vcpus`: CPU cores per node (default: `4`) +- `spoke_vm_disk_size`: Disk size per node in GB (default: `120`) +- `spoke_ctlplanes`: Number of control plane nodes, must be 2 for TNF (default: `2`) + +### Network Configuration + +- `spoke_network_cidr`: Spoke cluster network CIDR (default: `"192.168.125.0/24"`) +- `spoke_api_vip`: API VIP address (default: `"192.168.125.5"`) +- `spoke_ingress_vip`: Ingress VIP address (default: `"192.168.125.10"`) +- `spoke_cluster_network_cidr`: Pod network CIDR (default: `"10.132.0.0/14"`) +- `spoke_service_network_cidr`: Service network CIDR (default: `"172.31.0.0/16"`) +- `hub_network_cidr`: Hub network CIDR for cross-bridge nftables rules (default: `"192.168.111.0/24"`) + +### BMC / sushy-tools + +- `spoke_bmc_user`: BMC username (default: `"admin"`) +- `spoke_bmc_password`: BMC password (default: `"password"`) +- `spoke_ksushy_ip`: sushy-tools listen IP (default: `"192.168.111.1"`) +- `spoke_ksushy_port`: sushy-tools port (default: `8000`) + +### Deployment Options + +- `force_cleanup`: Remove existing spoke resources before deployment (default: `false`) + +### Timeout Variables + +- `spoke_install_timeout`: Cluster installation timeout in seconds 
(default: `3600`) +- `spoke_agent_register_timeout`: Agent registration timeout (default: `900`) +- `spoke_credentials_timeout`: Credential retrieval timeout (default: `1800`) + +### Computed Variables (vars/main.yml) + +These are derived automatically and should not be overridden: + +- `spoke_network_gateway`: First IP in spoke CIDR +- `spoke_dhcp_start` / `spoke_dhcp_end`: DHCP range within spoke CIDR +- `spoke_network_name`: Libvirt network name (matches `spoke_cluster_name`) +- `spoke_vm_image_dir`: VM disk image directory (`/var/lib/libvirt/images`) +- `spoke_auth_dir`: Credential output directory (`~/{{ spoke_cluster_name }}/auth`) + +## Task Flow + +1. **cleanup.yml** - Removes existing spoke namespace, VMs, network, credentials (when `force_cleanup=true`) +2. **create-spoke-network.yml** - Creates dedicated libvirt network with DHCP for spoke VMs +3. **create-spoke-vms.yml** - Creates spoke VM disk images and defines libvirt domains +4. **setup-ksushy.yml** - Verifies sushy-tools is running for Redfish BMC +5. **create-cluster-resources.yml** - Creates ClusterDeployment, AgentClusterInstall, InfraEnv, ClusterImageSet on hub +6. **create-bmh.yml** - Creates BareMetalHost resources that trigger spoke installation +7. **wait-for-install.yml** - Monitors agent registration, installation progress, and agent completion +8. **retrieve-credentials.yml** - Extracts kubeconfig and admin password, configures DNS, verifies access + +## Usage + +This role is not called directly. 
It is invoked via `assisted-install.yml`: + +```bash +make deploy fencing-assisted +# or +ansible-playbook assisted-install.yml -i inventory.ini +``` + +### Configuration + +Copy and customize the variables template: + +```bash +cp vars/assisted.yml.template vars/assisted.yml +# Edit vars/assisted.yml with desired spoke configuration +``` + +### Accessing the Spoke Cluster + +After deployment: + +```bash +source proxy.env +KUBECONFIG=~/spoke-tnf/auth/kubeconfig oc get nodes +``` + +### Redeployment + +To redeploy with cleanup of existing resources: + +```bash +ansible-playbook assisted-install.yml -i inventory.ini -e "force_cleanup=true" +``` + +## Troubleshooting + +- Check spoke VMs: `sudo virsh list --all | grep spoke` +- Check agents: `oc get agents -n {{ spoke_cluster_name }}` +- Check BMH status: `oc get bmh -n {{ spoke_cluster_name }}` +- Check installation progress: `oc get agentclusterinstall -n {{ spoke_cluster_name }} -o yaml` +- Check spoke events: `oc get events -n {{ spoke_cluster_name }} --sort-by='.lastTimestamp'` +- Check sushy-tools: `sudo systemctl status ksushy` +- Check spoke network: `sudo virsh net-list | grep {{ spoke_cluster_name }}` \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml index 86a371e..bdcc277 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml @@ -36,4 +36,4 @@ spoke_agent_register_timeout: 900 spoke_credentials_timeout: 1800 # Hub network CIDR (for cross-bridge nftables rules) -hub_network_cidr: "192.168.111.0/24" \ No newline at end of file +hub_network_cidr: "192.168.111.0/24" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml index 2dbaa36..9ff463d 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml +++ 
b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml @@ -6,14 +6,14 @@ oc delete namespace {{ spoke_cluster_name }} --ignore-not-found --timeout=120s register: ns_delete changed_when: "'deleted' in ns_delete.stdout" - failed_when: false + ignore_errors: true - name: Delete ClusterImageSet shell: | OCP_VER=$(oc get clusterversion version -o jsonpath='{.status.desired.version}' | cut -d. -f1-2) oc delete clusterimageset "${OCP_VER}.0" --ignore-not-found changed_when: false - failed_when: false + ignore_errors: true - name: Destroy spoke VMs shell: | @@ -22,7 +22,7 @@ sudo virsh undefine {{ spoke_cluster_name }}-master-${i} --remove-all-storage 2>/dev/null || true done changed_when: true - failed_when: false + ignore_errors: true - name: Remove spoke VM disk images file: @@ -36,7 +36,7 @@ sudo virsh net-destroy {{ spoke_network_name }} 2>/dev/null || true sudo virsh net-undefine {{ spoke_network_name }} 2>/dev/null || true changed_when: true - failed_when: false + ignore_errors: true - name: Remove spoke credential directory file: @@ -52,4 +52,4 @@ - name: Display cleanup result debug: - msg: "Spoke cluster '{{ spoke_cluster_name }}' resources cleaned up" \ No newline at end of file + msg: "Spoke cluster '{{ spoke_cluster_name }}' resources cleaned up" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml index fb3ceb1..42cfffc 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml @@ -60,4 +60,4 @@ - name: Show BMH resources debug: - msg: "{{ bmh_status.stdout }}" \ No newline at end of file + msg: "{{ bmh_status.stdout }}" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml 
b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml index fa1962c..285b20c 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml @@ -84,4 +84,4 @@ Spoke cluster resources created: ClusterImageSet={{ effective_ocp_version }}.0, ClusterDeployment={{ spoke_cluster_name }}, AgentClusterInstall={{ spoke_cluster_name }}, - InfraEnv={{ spoke_cluster_name }} \ No newline at end of file + InfraEnv={{ spoke_cluster_name }} diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml index 4384bdf..15347a6 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml @@ -45,4 +45,4 @@ - name: Display network setup result debug: - msg: "Spoke network '{{ spoke_network_name }}' on {{ spoke_network_cidr }} ready, cross-bridge rules: {{ nft_result.stdout | trim }}" \ No newline at end of file + msg: "Spoke network '{{ spoke_network_name }}' on {{ spoke_network_cidr }} ready, cross-bridge rules: {{ nft_result.stdout | trim }}" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml index e29487e..e59a541 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml @@ -72,4 +72,4 @@ - name: Display spoke VM info debug: msg: "VM {{ item.name }}: UUID={{ item.uuid }}, MAC={{ item.mac }}" - loop: "{{ spoke_vms }}" \ No newline at end of file + loop: "{{ spoke_vms }}" diff --git 
a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml index 15b15e4..9b72882 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml @@ -7,14 +7,35 @@ state: directory mode: '0700' -- name: Wait for admin-kubeconfig secret - shell: | - oc get secret {{ spoke_cluster_name }}-admin-kubeconfig \ - -n {{ spoke_cluster_name }} -o name 2>/dev/null - register: kubeconfig_secret - until: kubeconfig_secret.rc == 0 - retries: 10 - delay: 15 +- block: + - name: Wait for admin-kubeconfig secret + shell: | + oc get secret {{ spoke_cluster_name }}-admin-kubeconfig \ + -n {{ spoke_cluster_name }} -o name 2>/dev/null + register: kubeconfig_secret + until: kubeconfig_secret.rc == 0 + retries: 10 + delay: 15 + rescue: + - name: Collect kubeconfig secret timeout diagnostics + shell: | + echo "=== Secrets in namespace ===" + oc get secrets -n {{ spoke_cluster_name }} 2>/dev/null + echo "" + echo "=== AgentClusterInstall State ===" + oc get agentclusterinstall {{ spoke_cluster_name }} -n {{ spoke_cluster_name }} \ + -o jsonpath='{.status.debugInfo.state}' 2>/dev/null + register: kube_secret_diag + changed_when: false + failed_when: false + + - name: Display kubeconfig secret timeout diagnostics + debug: + msg: "{{ kube_secret_diag.stdout }}" + + - name: Fail after kubeconfig secret timeout + fail: + msg: "admin-kubeconfig secret not found within timeout" - name: Extract spoke kubeconfig shell: | @@ -30,14 +51,35 @@ dest: "{{ spoke_auth_dir }}/kubeconfig" mode: '0600' -- name: Wait for admin-password secret - shell: | - oc get secret {{ spoke_cluster_name }}-admin-password \ - -n {{ spoke_cluster_name }} -o name 2>/dev/null - register: password_secret - until: password_secret.rc == 0 - retries: "{{ (spoke_credentials_timeout / 30) 
| int }}" - delay: 30 +- block: + - name: Wait for admin-password secret + shell: | + oc get secret {{ spoke_cluster_name }}-admin-password \ + -n {{ spoke_cluster_name }} -o name 2>/dev/null + register: password_secret + until: password_secret.rc == 0 + retries: "{{ (spoke_credentials_timeout / 30) | int }}" + delay: 30 + rescue: + - name: Collect password secret timeout diagnostics + shell: | + echo "=== Secrets in namespace ===" + oc get secrets -n {{ spoke_cluster_name }} 2>/dev/null + echo "" + echo "=== AgentClusterInstall State ===" + oc get agentclusterinstall {{ spoke_cluster_name }} -n {{ spoke_cluster_name }} \ + -o jsonpath='{.status.debugInfo.state}' 2>/dev/null + register: pwd_secret_diag + changed_when: false + failed_when: false + + - name: Display password secret timeout diagnostics + debug: + msg: "{{ pwd_secret_diag.stdout }}" + + - name: Fail after password secret timeout + fail: + msg: "admin-password secret not found within timeout" - name: Extract spoke admin password shell: | @@ -73,7 +115,6 @@ loop: "{{ range(spoke_ctlplanes) | list }}" register: vm_start_result changed_when: "'STARTED' in vm_start_result.stdout" - failed_when: false become: true - name: Wait for spoke VMs to boot @@ -81,14 +122,34 @@ seconds: 120 when: vm_start_result.results | selectattr('stdout', 'search', 'STARTED') | list | length > 0 -- name: Verify spoke cluster access - shell: | - KUBECONFIG={{ spoke_auth_dir }}/kubeconfig oc get nodes - register: spoke_nodes - changed_when: false - retries: 20 - delay: 30 - until: spoke_nodes.rc == 0 +- block: + - name: Verify spoke cluster access + shell: | + KUBECONFIG={{ spoke_auth_dir }}/kubeconfig oc get nodes + register: spoke_nodes + changed_when: false + retries: 20 + delay: 30 + until: spoke_nodes.rc == 0 + rescue: + - name: Collect spoke access timeout diagnostics + shell: | + echo "=== VM Status ===" + sudo virsh list --all 2>/dev/null | grep {{ spoke_cluster_name }} || echo "No spoke VMs found" + echo "" + echo "=== 
Last oc error ===" + KUBECONFIG={{ spoke_auth_dir }}/kubeconfig oc get nodes 2>&1 || true + register: spoke_access_diag + changed_when: false + failed_when: false + + - name: Display spoke access timeout diagnostics + debug: + msg: "{{ spoke_access_diag.stdout }}" + + - name: Fail after spoke access timeout + fail: + msg: "Could not access spoke cluster within timeout" - name: Display spoke cluster nodes debug: @@ -97,4 +158,4 @@ - name: Set spoke kubeconfig path as fact for post-deployment tasks set_fact: spoke_kubeconfig_path: "{{ spoke_auth_dir }}/kubeconfig" - spoke_kubeadmin_password_path: "{{ spoke_auth_dir }}/kubeadmin-password" \ No newline at end of file + spoke_kubeadmin_password_path: "{{ spoke_auth_dir }}/kubeadmin-password" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml index 88a4026..49ce70a 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml @@ -31,4 +31,4 @@ fail_msg: >- Expected at least {{ spoke_ctlplanes + 2 }} systems in sushy-tools ({{ spoke_ctlplanes }} spoke + 2 hub), but found {{ sushy_systems.stdout }}. 
- success_msg: "sushy-tools has {{ sushy_systems.stdout }} systems visible ({{ spoke_ctlplanes }} spoke + 2 hub)" \ No newline at end of file + success_msg: "sushy-tools has {{ sushy_systems.stdout }} systems visible ({{ spoke_ctlplanes }} spoke + 2 hub)" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml index e0dd662..a5bae64 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml @@ -1,13 +1,39 @@ --- # Monitor BMH provisioning, agent registration, and installation progress -- name: Wait for agents to register - shell: | - oc get agents -n {{ spoke_cluster_name }} --no-headers 2>/dev/null | wc -l - register: agent_count - until: agent_count.stdout | int >= spoke_ctlplanes - retries: "{{ (spoke_agent_register_timeout / 30) | int }}" - delay: 30 +- block: + - name: Wait for agents to register + shell: | + oc get agents -n {{ spoke_cluster_name }} --no-headers 2>/dev/null | wc -l + register: agent_count + until: agent_count.stdout | int >= spoke_ctlplanes + retries: "{{ (spoke_agent_register_timeout / 30) | int }}" + delay: 30 + rescue: + - name: Collect agent registration timeout diagnostics + shell: | + echo "=== Agents ({{ agent_count.stdout | default('0') }} / {{ spoke_ctlplanes }} registered) ===" + oc get agents -n {{ spoke_cluster_name }} 2>/dev/null || echo "No agents found" + echo "" + echo "=== BareMetalHosts ===" + oc get bmh -n {{ spoke_cluster_name }} 2>/dev/null + echo "" + echo "=== InfraEnv Status ===" + oc get infraenv -n {{ spoke_cluster_name }} -o yaml 2>/dev/null | grep -A 20 'status:' + echo "" + echo "=== Recent Events ===" + oc get events -n {{ spoke_cluster_name }} --sort-by='.lastTimestamp' 2>/dev/null | tail -20 + register: agent_reg_diag + changed_when: false + failed_when: false + + 
- name: Display agent registration timeout diagnostics + debug: + msg: "{{ agent_reg_diag.stdout }}" + + - name: Fail after agent registration timeout + fail: + msg: "Expected {{ spoke_ctlplanes }} agents but only {{ agent_count.stdout | default('0') }} registered within timeout" - name: Display registered agents shell: | @@ -19,68 +45,125 @@ debug: msg: "{{ agents_info.stdout }}" -- name: Wait for spoke cluster installation to complete - shell: | - ACI_STATE=$(oc get agentclusterinstall {{ spoke_cluster_name }} \ - -n {{ spoke_cluster_name }} \ - -o jsonpath='{.status.debugInfo.state}' 2>/dev/null) - echo "$ACI_STATE" - case "$ACI_STATE" in - adding-hosts|installed) - exit 0 - ;; - error|failed) - echo "INSTALL FAILED" - oc get agentclusterinstall {{ spoke_cluster_name }} \ +- block: + - name: Wait for spoke cluster installation to complete + shell: | + ACI_STATE=$(oc get agentclusterinstall {{ spoke_cluster_name }} \ -n {{ spoke_cluster_name }} \ - -o jsonpath='{.status.conditions}' 2>/dev/null | python3 -m json.tool - exit 2 - ;; - *) - exit 1 - ;; - esac - register: install_state - until: install_state.rc == 0 - retries: "{{ (spoke_install_timeout / 30) | int }}" - delay: 30 - failed_when: install_state.rc == 2 + -o jsonpath='{.status.debugInfo.state}' 2>/dev/null) + echo "$ACI_STATE" + case "$ACI_STATE" in + adding-hosts|installed) + exit 0 + ;; + error|failed) + echo "INSTALL FAILED" + oc get agentclusterinstall {{ spoke_cluster_name }} \ + -n {{ spoke_cluster_name }} \ + -o jsonpath='{.status.conditions}' 2>/dev/null | python3 -m json.tool + exit 2 + ;; + *) + exit 1 + ;; + esac + register: install_state + until: install_state.rc == 0 + retries: "{{ (spoke_install_timeout / 30) | int }}" + delay: 30 + failed_when: install_state.rc == 2 + rescue: + - name: Collect installation timeout diagnostics + shell: | + echo "=== AgentClusterInstall Status ===" + oc get agentclusterinstall {{ spoke_cluster_name }} -n {{ spoke_cluster_name }} \ + -o yaml 2>/dev/null | 
grep -A 30 'status:' + echo "" + echo "=== Agents ===" + oc get agents -n {{ spoke_cluster_name }} 2>/dev/null + echo "" + echo "=== ClusterDeployment ===" + oc get clusterdeployment {{ spoke_cluster_name }} -n {{ spoke_cluster_name }} \ + -o yaml 2>/dev/null | grep -A 20 'status:' + echo "" + echo "=== Recent Events ===" + oc get events -n {{ spoke_cluster_name }} --sort-by='.lastTimestamp' 2>/dev/null | tail -20 + register: install_diag + changed_when: false + failed_when: false + + - name: Display installation timeout diagnostics + debug: + msg: "{{ install_diag.stdout }}" + + - name: Fail after installation timeout + fail: + msg: "Spoke cluster installation did not complete within timeout. Last state: {{ install_state.stdout_lines[0] | default('unknown') }}" - name: Display final installation state debug: msg: "Spoke cluster installation state: {{ install_state.stdout_lines[0] }}" -- name: Wait for all agents to reach Done stage - shell: | - oc get agents -n {{ spoke_cluster_name }} -o json 2>/dev/null \ - | python3 -c " - import json - import sys - - data = json.load(sys.stdin) - agents = data.get('items', []) - total = len(agents) - done = 0 - stuck = [] - for a in agents: - stage = a.get('status', {}).get('progress', {}).get('currentStage', 'unknown') - if stage == 'Done': - done += 1 - else: +- block: + - name: Wait for all agents to reach Done stage + shell: | + oc get agents -n {{ spoke_cluster_name }} -o json 2>/dev/null \ + | python3 -c " + import json + import sys + + data = json.load(sys.stdin) + agents = data.get('items', []) + total = len(agents) + done = 0 + stuck = [] + for a in agents: + stage = a.get('status', {}).get('progress', {}).get('currentStage', 'unknown') + if stage == 'Done': + done += 1 + else: + state = a.get('status', {}).get('debugInfo', {}).get('state', 'unknown') + hostname = a.get('spec', {}).get('hostname', 'unknown') + stuck.append(f'{hostname}: state={state}, stage={stage}') + + print(f'Agents Done: {done} / {total}') + for s 
in stuck: + print(f' {s}') + sys.exit(0 if done == total else 1) + " + register: agents_done + until: agents_done.rc == 0 + retries: "{{ (spoke_install_timeout / 30) | int }}" + delay: 30 + changed_when: false + rescue: + - name: Collect agent completion diagnostics + shell: | + echo "=== Agent Details ===" + oc get agents -n {{ spoke_cluster_name }} -o wide 2>/dev/null + echo "" + echo "=== Agent Progress ===" + oc get agents -n {{ spoke_cluster_name }} -o json 2>/dev/null \ + | python3 -c " + import json, sys + data = json.load(sys.stdin) + for a in data.get('items', []): + name = a.get('spec', {}).get('hostname', 'unknown') + stage = a.get('status', {}).get('progress', {}).get('currentStage', 'unknown') state = a.get('status', {}).get('debugInfo', {}).get('state', 'unknown') - hostname = a.get('spec', {}).get('hostname', 'unknown') - stuck.append(f'{hostname}: state={state}, stage={stage}') - - print(f'Agents Done: {done} / {total}') - for s in stuck: - print(f' {s}') - sys.exit(0 if done == total else 1) - " - register: agents_done - until: agents_done.rc == 0 - retries: "{{ (spoke_install_timeout / 30) | int }}" - delay: 30 - changed_when: false + print(f'{name}: state={state}, stage={stage}') + " 2>/dev/null || echo "Failed to parse agent details" + register: agents_done_diag + changed_when: false + failed_when: false + + - name: Display agent completion diagnostics + debug: + msg: "{{ agents_done_diag.stdout }}" + + - name: Fail after agent completion timeout + fail: + msg: "Not all agents reached Done stage within timeout" - name: Display final agent status shell: | @@ -100,4 +183,4 @@ - name: Show final BMH debug: - msg: "{{ final_bmh.stdout }}" \ No newline at end of file + msg: "{{ final_bmh.stdout }}" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml index 6434d77..ce4bd5d 100644 --- 
a/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml @@ -14,4 +14,4 @@ spoke_network_name: "{{ spoke_cluster_name }}" spoke_vm_image_dir: /var/lib/libvirt/images # Credential output paths -spoke_auth_dir: "{{ ansible_user_dir }}/{{ spoke_cluster_name }}/auth" \ No newline at end of file +spoke_auth_dir: "{{ ansible_user_dir }}/{{ spoke_cluster_name }}/auth" diff --git a/deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh b/deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh index eba97c4..980511b 100755 --- a/deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh +++ b/deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh @@ -35,6 +35,12 @@ echo "Deploying spoke TNF cluster via assisted installer..." cd "${DEPLOY_DIR}/openshift-clusters" +# Parse spoke_cluster_name from vars/assisted.yml +SPOKE_CLUSTER_NAME=$(grep '^spoke_cluster_name:' vars/assisted.yml | awk '{print $2}' | tr -d '"' | tr -d "'") +if [[ -z "${SPOKE_CLUSTER_NAME}" ]]; then + SPOKE_CLUSTER_NAME="spoke-tnf" +fi + if ansible-playbook assisted-install.yml -i inventory.ini; then echo "" echo "OpenShift spoke TNF cluster deployment via assisted installer completed successfully!" @@ -42,7 +48,7 @@ if ansible-playbook assisted-install.yml -i inventory.ini; then echo "Next steps:" echo "1. Access spoke cluster:" echo " source ${DEPLOY_DIR}/openshift-clusters/proxy.env" - echo " KUBECONFIG=~/spoke-tnf/auth/kubeconfig oc get nodes" + echo " KUBECONFIG=~/${SPOKE_CLUSTER_NAME}/auth/kubeconfig oc get nodes" echo "2. 
Access hub cluster:" echo " source ${DEPLOY_DIR}/openshift-clusters/hub-proxy.env" echo " KUBECONFIG=~/auth/kubeconfig oc get nodes" diff --git a/deploy/openshift-clusters/vars/assisted.yml.template b/deploy/openshift-clusters/vars/assisted.yml.template index dc4f0d0..6c33ce5 100644 --- a/deploy/openshift-clusters/vars/assisted.yml.template +++ b/deploy/openshift-clusters/vars/assisted.yml.template @@ -28,12 +28,18 @@ spoke_vm_disk_size: 120 # GB spoke_ctlplanes: 2 # Always 2 for TNF # Spoke network configuration +# DHCP range is auto-computed as .50-.150 of the CIDR. +# VIPs must be outside that range to avoid conflicts. spoke_network_cidr: "192.168.125.0/24" spoke_api_vip: "192.168.125.5" spoke_ingress_vip: "192.168.125.10" spoke_cluster_network_cidr: "10.132.0.0/14" spoke_service_network_cidr: "172.31.0.0/16" +# Hub network CIDR (for cross-bridge nftables rules between hub and spoke) +# Must match the hub cluster's libvirt network. Default matches dev-scripts. +hub_network_cidr: "192.168.111.0/24" + # BMC / sushy-tools (defaults match dev-scripts deployment) spoke_bmc_user: admin spoke_bmc_password: password @@ -45,4 +51,4 @@ spoke_ksushy_port: 8000 assisted_storage_method: "hostpath" # Deployment options -force_cleanup: false \ No newline at end of file +force_cleanup: false From 6b85d701bf6d97d40618765b321248f6c773257f Mon Sep 17 00:00:00 2001 From: Gal Amado Date: Thu, 9 Apr 2026 12:00:29 +0300 Subject: [PATCH 5/9] Fix spoke package layering and ACM channel detection race conditions - Add OPENSHIFT_INSTALL_EXPERIMENTAL_DISABLE_IMAGE_POLICY to assisted-unsupported-config to prevent spoke MCD signature verification failures during package layering - Add retries (30x30s) to packagemanifest auto-detection to handle catalogsource sync delays (up to 25 minutes observed) - Add fencing-assisted to VALID_CLUSTER_TYPES in Makefile --- deploy/Makefile | 2 +- .../roles/assisted/acm-install/tasks/enable-tnf.yml | 1 + 
.../roles/assisted/acm-install/tasks/install-operator.yml | 3 +++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/deploy/Makefile b/deploy/Makefile index 3e42e61..5635354 100644 --- a/deploy/Makefile +++ b/deploy/Makefile @@ -1,5 +1,5 @@ # Valid cluster types for 'make deploy ' -VALID_CLUSTER_TYPES := fencing-ipi fencing-agent arbiter-ipi arbiter-agent arbiter-kcli fencing-kcli +VALID_CLUSTER_TYPES := fencing-ipi fencing-agent fencing-assisted arbiter-ipi arbiter-agent arbiter-kcli fencing-kcli # Handle 'make deploy ' pattern # When 'deploy' is first, validate any following arguments are valid cluster types diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml index 3a00c4c..c31e044 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml @@ -12,6 +12,7 @@ namespace: {{ assisted_service_namespace }} data: TNF_CLUSTERS_SUPPORT: "true" + OPENSHIFT_INSTALL_EXPERIMENTAL_DISABLE_IMAGE_POLICY: "true" EOF register: cm_result changed_when: "'created' in cm_result.stdout" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml index caeafae..566c68a 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml @@ -34,6 +34,9 @@ -o jsonpath='{.status.defaultChannel}' register: detected_channel changed_when: false + retries: 30 + delay: 30 + until: detected_channel.rc == 0 and detected_channel.stdout != "" - name: Set operator channel fact set_fact: From 625462b3cd8d18ddfa952c6d909c5c31098c4aa6 Mon Sep 17 00:00:00 2001 From: Gal Amado Date: Thu, 9 Apr 2026 12:53:17 +0300 Subject: [PATCH 6/9] Add spoke input 
validation and remove destructive storage cleanup - I7: Add validate.yml for assisted-spoke role that checks cluster name is DNS-safe, ctlplanes >= 2, VIPs are within spoke CIDR and outside DHCP range, and API/Ingress VIPs are different - I12: Remove rm -rf from storage.yml that wiped assisted-service data on every run. Directories are now created idempotently with mkdir -p without destroying existing data Co-Authored-By: Claude Opus 4.6 --- .../assisted/acm-install/tasks/storage.yml | 1 - .../assisted/assisted-spoke/tasks/main.yml | 3 + .../assisted-spoke/tasks/validate.yml | 65 +++++++++++++++++++ 3 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/validate.yml diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml index 47c715d..9576064 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml @@ -49,7 +49,6 @@ shell: | oc debug node/{{ item }} -- chroot /host bash -c " mkdir -p {{ assisted_images_path }} {{ assisted_db_path }} - rm -rf {{ assisted_images_path }}/* {{ assisted_db_path }}/* chmod 777 {{ assisted_images_path }} {{ assisted_db_path }} chcon -Rt container_file_t {{ assisted_images_path }} {{ assisted_db_path }} " diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/main.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/main.yml index 43a420c..5fed125 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/main.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/main.yml @@ -2,6 +2,9 @@ # Deploy spoke TNF cluster via assisted installer + BMH - block: + - name: Validate spoke input variables + include_tasks: validate.yml + - name: Cleanup existing spoke resources include_tasks: cleanup.yml when: 
force_cleanup | bool diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/validate.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/validate.yml new file mode 100644 index 0000000..9fd40c8 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/validate.yml @@ -0,0 +1,65 @@ +--- +# Validate spoke cluster input variables before deployment + +- name: Validate spoke_cluster_name is non-empty and DNS-safe + assert: + that: + - spoke_cluster_name is defined + - spoke_cluster_name | length > 0 + - spoke_cluster_name is match('^[a-z0-9]([a-z0-9-]*[a-z0-9])?$') + fail_msg: >- + spoke_cluster_name '{{ spoke_cluster_name | default("") }}' is invalid. + Must be non-empty, lowercase, start/end with alphanumeric, and contain only [a-z0-9-]. + +- name: Validate spoke_ctlplanes is at least 2 + assert: + that: + - spoke_ctlplanes | int >= 2 + fail_msg: >- + spoke_ctlplanes must be at least 2 for TNF topology, got {{ spoke_ctlplanes }}. + +- name: Validate spoke_network_cidr is a valid CIDR + assert: + that: + - spoke_network_cidr | ansible.utils.ipaddr('network/prefix') is not none + fail_msg: >- + spoke_network_cidr '{{ spoke_network_cidr }}' is not a valid CIDR notation. + +- name: Validate spoke_api_vip is within spoke_network_cidr + assert: + that: + - spoke_api_vip | ansible.utils.ipaddr(spoke_network_cidr) is not none + fail_msg: >- + spoke_api_vip '{{ spoke_api_vip }}' is not within spoke_network_cidr '{{ spoke_network_cidr }}'. + +- name: Validate spoke_ingress_vip is within spoke_network_cidr + assert: + that: + - spoke_ingress_vip | ansible.utils.ipaddr(spoke_network_cidr) is not none + fail_msg: >- + spoke_ingress_vip '{{ spoke_ingress_vip }}' is not within spoke_network_cidr '{{ spoke_network_cidr }}'. 
+ +- name: Validate VIPs are not in DHCP range (.50-.150) + assert: + that: + - spoke_api_vip | ansible.utils.ipaddr('int') | int < spoke_dhcp_start | ansible.utils.ipaddr('int') | int or + spoke_api_vip | ansible.utils.ipaddr('int') | int > spoke_dhcp_end | ansible.utils.ipaddr('int') | int + - spoke_ingress_vip | ansible.utils.ipaddr('int') | int < spoke_dhcp_start | ansible.utils.ipaddr('int') | int or + spoke_ingress_vip | ansible.utils.ipaddr('int') | int > spoke_dhcp_end | ansible.utils.ipaddr('int') | int + fail_msg: >- + VIPs must be outside the DHCP range ({{ spoke_dhcp_start }} - {{ spoke_dhcp_end }}). + API VIP: {{ spoke_api_vip }}, Ingress VIP: {{ spoke_ingress_vip }}. + +- name: Validate API and Ingress VIPs are different + assert: + that: + - spoke_api_vip != spoke_ingress_vip + fail_msg: >- + spoke_api_vip and spoke_ingress_vip must be different, both are '{{ spoke_api_vip }}'. + +- name: Display spoke input validation result + debug: + msg: >- + Spoke inputs validated: cluster={{ spoke_cluster_name }}.{{ spoke_base_domain }}, + nodes={{ spoke_ctlplanes }}, network={{ spoke_network_cidr }}, + API VIP={{ spoke_api_vip }}, Ingress VIP={{ spoke_ingress_vip }} \ No newline at end of file From 7b4ec7616436ee10d4615d72018f24f5c2fffdcd Mon Sep 17 00:00:00 2001 From: Gal Amado Date: Thu, 9 Apr 2026 15:49:00 +0300 Subject: [PATCH 7/9] Fix ipaddr validation to use truthiness instead of 'is not none' The ipaddr() filter returns False (not None) for invalid inputs, so 'is not none' always passes. Use truthiness check instead. 
Co-Authored-By: Claude Opus 4.6 --- .../roles/assisted/assisted-spoke/tasks/validate.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/validate.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/validate.yml index 9fd40c8..3ed03a2 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/validate.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/validate.yml @@ -21,21 +21,21 @@ - name: Validate spoke_network_cidr is a valid CIDR assert: that: - - spoke_network_cidr | ansible.utils.ipaddr('network/prefix') is not none + - spoke_network_cidr | ansible.utils.ipaddr('network/prefix') fail_msg: >- spoke_network_cidr '{{ spoke_network_cidr }}' is not a valid CIDR notation. - name: Validate spoke_api_vip is within spoke_network_cidr assert: that: - - spoke_api_vip | ansible.utils.ipaddr(spoke_network_cidr) is not none + - spoke_api_vip | ansible.utils.ipaddr(spoke_network_cidr) fail_msg: >- spoke_api_vip '{{ spoke_api_vip }}' is not within spoke_network_cidr '{{ spoke_network_cidr }}'. - name: Validate spoke_ingress_vip is within spoke_network_cidr assert: that: - - spoke_ingress_vip | ansible.utils.ipaddr(spoke_network_cidr) is not none + - spoke_ingress_vip | ansible.utils.ipaddr(spoke_network_cidr) fail_msg: >- spoke_ingress_vip '{{ spoke_ingress_vip }}' is not within spoke_network_cidr '{{ spoke_network_cidr }}'. From 5dfa87e3cc934e71dc9c6ec764a6addb7c526e0b Mon Sep 17 00:00:00 2001 From: Gal Amado Date: Thu, 9 Apr 2026 21:48:17 +0300 Subject: [PATCH 8/9] Fix ipaddr validation to cast results to bool for assert Ansible assert requires boolean conditionals. The ipaddr() filter returns a string on success and False on failure. Adding '| bool' converts both cases to proper booleans. 
Co-Authored-By: Claude Opus 4.6 --- .../roles/assisted/assisted-spoke/tasks/validate.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/validate.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/validate.yml index 3ed03a2..54fc7c4 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/validate.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/validate.yml @@ -21,21 +21,21 @@ - name: Validate spoke_network_cidr is a valid CIDR assert: that: - - spoke_network_cidr | ansible.utils.ipaddr('network/prefix') + - spoke_network_cidr | ansible.utils.ipaddr('network/prefix') | bool fail_msg: >- spoke_network_cidr '{{ spoke_network_cidr }}' is not a valid CIDR notation. - name: Validate spoke_api_vip is within spoke_network_cidr assert: that: - - spoke_api_vip | ansible.utils.ipaddr(spoke_network_cidr) + - spoke_api_vip | ansible.utils.ipaddr(spoke_network_cidr) | bool fail_msg: >- spoke_api_vip '{{ spoke_api_vip }}' is not within spoke_network_cidr '{{ spoke_network_cidr }}'. - name: Validate spoke_ingress_vip is within spoke_network_cidr assert: that: - - spoke_ingress_vip | ansible.utils.ipaddr(spoke_network_cidr) + - spoke_ingress_vip | ansible.utils.ipaddr(spoke_network_cidr) | bool fail_msg: >- spoke_ingress_vip '{{ spoke_ingress_vip }}' is not within spoke_network_cidr '{{ spoke_network_cidr }}'. From 8c0cf33cbd3a1504eaa5a1c00edd6f007294b6ae Mon Sep 17 00:00:00 2001 From: Gal Amado Date: Thu, 9 Apr 2026 21:54:03 +0300 Subject: [PATCH 9/9] Fix ipaddr validation: use != false instead of | bool The ipaddr() filter returns a string on valid input. '| bool' coerces strings like "192.168.125.0/24" to False. Use '!= false' to correctly distinguish valid results (strings) from invalid results (False). 
Co-Authored-By: Claude Opus 4.6 --- .../roles/assisted/assisted-spoke/tasks/validate.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/validate.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/validate.yml index 54fc7c4..eb6e8c0 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/validate.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/validate.yml @@ -21,21 +21,21 @@ - name: Validate spoke_network_cidr is a valid CIDR assert: that: - - spoke_network_cidr | ansible.utils.ipaddr('network/prefix') | bool + - spoke_network_cidr | ansible.utils.ipaddr('network/prefix') != false fail_msg: >- spoke_network_cidr '{{ spoke_network_cidr }}' is not a valid CIDR notation. - name: Validate spoke_api_vip is within spoke_network_cidr assert: that: - - spoke_api_vip | ansible.utils.ipaddr(spoke_network_cidr) | bool + - spoke_api_vip | ansible.utils.ipaddr(spoke_network_cidr) != false fail_msg: >- spoke_api_vip '{{ spoke_api_vip }}' is not within spoke_network_cidr '{{ spoke_network_cidr }}'. - name: Validate spoke_ingress_vip is within spoke_network_cidr assert: that: - - spoke_ingress_vip | ansible.utils.ipaddr(spoke_network_cidr) | bool + - spoke_ingress_vip | ansible.utils.ipaddr(spoke_network_cidr) != false fail_msg: >- spoke_ingress_vip '{{ spoke_ingress_vip }}' is not within spoke_network_cidr '{{ spoke_network_cidr }}'.