From fc1b8f3a5d3749a463767c4b72cf0312bccb1201 Mon Sep 17 00:00:00 2001
From: Sergey Yedrikov
Date: Fri, 5 Jun 2026 13:39:33 -0400
Subject: [PATCH 1/2] OLS-3235: Make lightspeed-rag-content a multi-arch image

---
Review notes (free text between "---" and the first diff; not part of the commit):

  * Pipeline shape introduced here: prefetch-dependencies -> embed-rag (GPU
    embedding on a remote amd64 VM, publishes rag-export as a trusted
    artifact) -> build-container (amd64, Containerfile.pack) and
    build-container-arm64 (Containerfile.arm64) -> build-image-index.
  * .tekton/embed-rag.yaml: the embed-remote step now reads its BUILDER_IMAGE
    env from $(params.BUILDER_IMAGE). The param was declared with a default
    but the step hard-coded the same image, so overriding the param had no
    effect. The default value is unchanged.
  * scripts/embed-rag-content.sh: the non-hermetic model download now uses
    "curl --fail" so an HTTP error aborts the script (set -e) instead of
    saving the error body as model.safetensors.
  * Both edits replace one line with one line, so hunk sizes and the diffstat
    are unchanged. The blob ids on the "index" lines are the originally
    generated ones and no longer match the edited content.

 .tekton/embed-rag.yaml                        | 198 ++++++++++++++++++
 ...p-lightspeed-rag-content-pull-request.yaml |  96 +++++++--
 .../own-app-lightspeed-rag-content-push.yaml  |  96 +++++++--
 Containerfile                                 |   2 +
 Containerfile.arm64                           |  22 ++
 Containerfile.pack                            |  23 ++
 scripts/embed-rag-content.sh                  |  62 ++++++
 scripts/embed-remote-setup.sh                 |  64 ++++++
 8 files changed, 531 insertions(+), 32 deletions(-)
 create mode 100644 .tekton/embed-rag.yaml
 create mode 100644 Containerfile.arm64
 create mode 100644 Containerfile.pack
 create mode 100755 scripts/embed-rag-content.sh
 create mode 100755 scripts/embed-remote-setup.sh

diff --git a/.tekton/embed-rag.yaml b/.tekton/embed-rag.yaml
new file mode 100644
index 000000000..f391d6005
--- /dev/null
+++ b/.tekton/embed-rag.yaml
@@ -0,0 +1,198 @@
+apiVersion: tekton.dev/v1
+kind: Task
+metadata:
+  name: embed-rag
+  annotations:
+    tekton.dev/pipelines.minVersion: "0.12.1"
+    build.appstudio.redhat.com/expires-on: "2026-08-22T00:00:00Z"
+  labels:
+    build.appstudio.redhat.com/build_type: docker
+spec:
+  description: >-
+    Run GPU embedding on a remote amd64 VM and publish rag/ as a trusted artifact
+    for amd64 and arm64 packaging builds.
+  params:
+    - name: SOURCE_ARTIFACT
+      type: string
+    - name: CACHI2_ARTIFACT
+      type: string
+      default: ""
+    - name: ociStorage
+      type: string
+    - name: ociArtifactExpiresAfter
+      type: string
+      default: ""
+    - name: PLATFORM
+      type: string
+      default: linux-g64xlarge/amd64
+    - name: HERMETIC
+      type: string
+      default: "true"
+    - name: HTTP_PROXY
+      type: string
+      default: ""
+    - name: NO_PROXY
+      type: string
+      default: ""
+    - name: EMBEDDING_MODEL
+      type: string
+      default: sentence-transformers/all-mpnet-base-v2
+    - name: BUILDER_IMAGE
+      type: string
+      default: registry.redhat.io/rhai/base-image-cuda-12.9-rhel9:3.3
+    - name: caTrustConfigMapName
+      type: string
+      default: trusted-ca
+    - name: caTrustConfigMapKey
+      type: string
+      default: ca-bundle.crt
+  results:
+    - name: SOURCE_ARTIFACT
+      description: Trusted artifact with rag/, LICENSE, Containerfile.arm64
+  volumes:
+    - name: workdir
+      emptyDir: {}
+    - name: ssh
+      secret:
+        optional: false
+        secretName: multi-platform-ssh-$(context.taskRun.name)
+    - name: trusted-ca
+      configMap:
+        items:
+          - key: $(params.caTrustConfigMapKey)
+            path: ca-bundle.crt
+        name: $(params.caTrustConfigMapName)
+        optional: true
+  stepTemplate:
+    volumeMounts:
+      - mountPath: /var/workdir
+        name: workdir
+  steps:
+    - name: use-source
+      image: quay.io/konflux-ci/build-trusted-artifacts:latest@sha256:adf22f3ec90bfa3f7e2c832a7d52febd1ea31aa9fff6db21324c965d7d622327
+      args:
+        - use
+        - $(params.SOURCE_ARTIFACT)=/var/workdir/source
+        - $(params.CACHI2_ARTIFACT)=/var/workdir/cachi2
+      volumeMounts:
+        - mountPath: /etc/pki/tls/certs/ca-custom-bundle.crt
+          name: trusted-ca
+          readOnly: true
+          subPath: ca-bundle.crt
+    - name: embed-remote
+      image: quay.io/konflux-ci/buildah-task:latest@sha256:4c470b5a153c4acd14bf4f8731b5e36c61d7faafe09c2bf376bb81ce84aa5709
+      workingDir: /var/workdir
+      env:
+        - name: HOME
+          value: /root
+        - name: HERMETIC
+          value: $(params.HERMETIC)
+        - name: EMBEDDING_MODEL
+          value: $(params.EMBEDDING_MODEL)
+        - name: PLATFORM
+          value: $(params.PLATFORM)
+        - name: BUILDER_IMAGE
+          value: $(params.BUILDER_IMAGE)
+        - name: BUILDAH_HTTP_PROXY
+          value: $(params.HTTP_PROXY)
+        - name: BUILDAH_NO_PROXY
+          value: $(params.NO_PROXY)
+      script: |
+        #!/bin/bash
+        set -e
+        set -o verbose
+
+        echo "[$(date --utc -Ins)] Prepare connection"
+
+        mkdir -p ~/.ssh
+        if [ -e "/ssh/error" ]; then
+          cat /ssh/error
+          exit 1
+        fi
+        export SSH_HOST
+        SSH_HOST=$(cat /ssh/host)
+
+        if [ "$SSH_HOST" = "localhost" ]; then
+          echo "Localhost MPC host: running embed in-cluster (requires GPU)" >&2
+          exit 1
+        elif [ -e "/ssh/otp" ]; then
+          if ! curl --fail --cacert /ssh/otp-ca -XPOST -d @/ssh/otp "$(cat /ssh/otp-server)" >~/.ssh/id_rsa; then
+            echo "Failed to retrieve SSH key from the OTP server. This can happen when the PipelineRun retry option re-runs a task whose one-time credential was already consumed. Please, start a new build, and if problem persists, please report it as an MPC bug." >&2
+            exit 1
+          fi
+          echo "" >>~/.ssh/id_rsa
+        else
+          cp /ssh/id_rsa ~/.ssh
+        fi
+
+        if [[ "${BUILDAH_HTTP_PROXY}" =~ .+\.cluster\.local ]]; then
+          echo "[$(date --utc -Ins)] Ignoring cluster local proxy for remote build"
+          unset BUILDAH_HTTP_PROXY BUILDAH_NO_PROXY
+        fi
+
+        chmod 0400 ~/.ssh/id_rsa
+        test -s ~/.ssh/id_rsa
+
+        export BUILD_DIR
+        BUILD_DIR=$(cat /ssh/user-dir)
+        export SSH_ARGS="-o StrictHostKeyChecking=no -o ServerAliveInterval=60 -o ServerAliveCountMax=10"
+
+        echo "[$(date --utc -Ins)] Setup VM"
+        # shellcheck disable=SC2086
+        ssh $SSH_ARGS "$SSH_HOST" mkdir -p "${BUILD_DIR@Q}/volumes/workdir" "${BUILD_DIR@Q}/volumes/trusted-ca" "${BUILD_DIR@Q}/.docker"
+
+        echo "[$(date --utc -Ins)] Rsync data"
+        rsync -razW /var/workdir/ "$SSH_HOST:$BUILD_DIR/volumes/workdir/"
+        rsync -razW /mnt/trusted-ca/ "$SSH_HOST:$BUILD_DIR/volumes/trusted-ca/"
+        rsync -razW "$HOME/.docker/" "$SSH_HOST:$BUILD_DIR/.docker/" 2>/dev/null || true
+
+        echo "[$(date --utc -Ins)] Verify remote context"
+        # shellcheck disable=SC2086
+        ssh $SSH_ARGS "$SSH_HOST" test -f "${BUILD_DIR@Q}/volumes/workdir/source/scripts/embed-rag-content.sh"
+
+        echo "[$(date --utc -Ins)] Run GPU embed"
+        : "${BUILDER_IMAGE:?BUILDER_IMAGE is empty}"
+        echo "Builder image: ${BUILDER_IMAGE}"
+        # shellcheck disable=SC2086
+        ssh $SSH_ARGS "$SSH_HOST" podman run --rm \
+          --device nvidia.com/gpu=all \
+          --security-opt label=disable \
+          -e "HERMETIC=${HERMETIC@Q}" \
+          -e "EMBEDDING_MODEL=${EMBEDDING_MODEL@Q}" \
+          -e "FLAVOR=gpu" \
+          -e "CACHI2_ROOT=/var/workdir/cachi2" \
+          -v "${BUILD_DIR@Q}/volumes/workdir:/var/workdir:Z" \
+          -v "${BUILD_DIR@Q}/volumes/trusted-ca:/mnt/trusted-ca:Z" \
+          -v "${BUILD_DIR@Q}/.docker:/root/.docker:Z" \
+          --user=0 \
+          --entrypoint='' \
+          "${BUILDER_IMAGE@Q}" \
+          /bin/bash /var/workdir/source/scripts/embed-remote-setup.sh
+
+        echo "[$(date --utc -Ins)] Rsync results"
+        rsync -razW "$SSH_HOST:$BUILD_DIR/volumes/workdir/source/" /var/workdir/source/
+
+        test -d /var/workdir/source/rag-export/rag/vector_db/ocp_product_docs
+        test -f /var/workdir/source/rag-export/Containerfile.arm64
+      volumeMounts:
+        - mountPath: /ssh
+          name: ssh
+          readOnly: true
+        - mountPath: /mnt/trusted-ca
+          name: trusted-ca
+          readOnly: true
+    - name: create-trusted-artifact
+      image: quay.io/konflux-ci/build-trusted-artifacts:latest@sha256:adf22f3ec90bfa3f7e2c832a7d52febd1ea31aa9fff6db21324c965d7d622327
+      args:
+        - create
+        - --store
+        - $(params.ociStorage)
+        - $(results.SOURCE_ARTIFACT.path)=/var/workdir/source/rag-export
+      env:
+        - name: IMAGE_EXPIRES_AFTER
+          value: $(params.ociArtifactExpiresAfter)
+      volumeMounts:
+        - mountPath: /etc/pki/tls/certs/ca-custom-bundle.crt
+          name: trusted-ca
+          readOnly: true
+          subPath: ca-bundle.crt
diff --git a/.tekton/own-app-lightspeed-rag-content-pull-request.yaml b/.tekton/own-app-lightspeed-rag-content-pull-request.yaml
index e8462d03c..db189561a 100644
--- a/.tekton/own-app-lightspeed-rag-content-pull-request.yaml
+++ b/.tekton/own-app-lightspeed-rag-content-pull-request.yaml
@@ -9,6 +9,7 @@ metadata:
     pipelinesascode.tekton.dev/cancel-in-progress: "true"
     pipelinesascode.tekton.dev/max-keep-runs: "3"
     pipelinesascode.tekton.dev/on-cel-expression: event == "pull_request" && target_branch == "main"
+    pipelinesascode.tekton.dev/task: "[.tekton/embed-rag.yaml]"
   creationTimestamp:
   labels:
     appstudio.openshift.io/application: lightspeed-rag-content
@@ -27,7 +28,7 @@ spec:
   - name: image-expires-after
     value: 5d
   - name: dockerfile
-    value: Containerfile
+    value: Containerfile.pack
   - name: path-context
     value: .
   - name: build-source-image
@@ -37,11 +38,9 @@ spec:
   - name: hermetic
     value: "true"
   - name: build-args
-    value:
-    - FLAVOR=gpu
-    - HERMETIC=true
+    value: []
   - name: build-image-index
-    value: "false"
+    value: "true"
   pipelineSpec:
     description: |
       This pipeline is ideal for building container images from a Containerfile while maintaining trust after pipeline customization.
@@ -202,37 +201,96 @@ spec:
         workspace: git-auth
       - name: netrc
         workspace: netrc
-    - name: build-container
+    - name: embed-rag
       params:
+      - name: SOURCE_ARTIFACT
+        value: $(tasks.prefetch-dependencies.results.SOURCE_ARTIFACT)
+      - name: CACHI2_ARTIFACT
+        value: $(tasks.prefetch-dependencies.results.CACHI2_ARTIFACT)
       - name: PLATFORM
         value: linux-g64xlarge/amd64
+      - name: HERMETIC
+        value: $(params.hermetic)
+      - name: HTTP_PROXY
+        value: $(tasks.init.results.http-proxy)
+      - name: NO_PROXY
+        value: $(tasks.init.results.no-proxy)
+      - name: ociStorage
+        value: $(params.output-image).rag-export
+      - name: ociArtifactExpiresAfter
+        value: $(params.image-expires-after)
+      runAfter:
+      - prefetch-dependencies
+      taskRef:
+        kind: Task
+        name: embed-rag
+    - name: build-container
+      params:
+      - name: PLATFORM
+        value: linux/amd64
       - name: IMAGE
         value: $(params.output-image)
       - name: DOCKERFILE
-        value: $(params.dockerfile)
+        value: Containerfile.pack
       - name: CONTEXT
-        value: $(params.path-context)
+        value: .
       - name: HERMETIC
-        value: $(params.hermetic)
+        value: "false"
       - name: PREFETCH_INPUT
-        value: $(params.prefetch-input)
+        value: ""
       - name: IMAGE_EXPIRES_AFTER
         value: $(params.image-expires-after)
       - name: COMMIT_SHA
         value: $(tasks.clone-repository.results.commit)
       - name: BUILD_ARGS
-        value:
-        - $(params.build-args[*])
+        value: []
       - name: BUILD_ARGS_FILE
         value: $(params.build-args-file)
       - name: SOURCE_ARTIFACT
-        value: $(tasks.prefetch-dependencies.results.SOURCE_ARTIFACT)
-      - name: CACHI2_ARTIFACT
-        value: $(tasks.prefetch-dependencies.results.CACHI2_ARTIFACT)
+        value: $(tasks.embed-rag.results.SOURCE_ARTIFACT)
+      - name: HTTP_PROXY
+        value: $(tasks.init.results.http-proxy)
+      - name: NO_PROXY
+        value: $(tasks.init.results.no-proxy)
       - name: BUILDAH_FORMAT
         value: $(params.buildah-format)
       runAfter:
-      - prefetch-dependencies
+      - embed-rag
+      taskRef:
+        params:
+        - name: name
+          value: buildah-remote-oci-ta
+        - name: bundle
+          value: quay.io/konflux-ci/tekton-catalog/task-buildah-remote-oci-ta:0.9@sha256:77007259cc87f32d63d2c201226aadaab98313cfd4e02b46abc243c4d2cc27bd
+        - name: kind
+          value: task
+        resolver: bundles
+    - name: build-container-arm64
+      params:
+      - name: PLATFORM
+        value: linux/arm64
+      - name: IMAGE
+        value: $(params.output-image)
+      - name: DOCKERFILE
+        value: Containerfile.arm64
+      - name: CONTEXT
+        value: .
+      - name: HERMETIC
+        value: "false"
+      - name: PREFETCH_INPUT
+        value: ""
+      - name: IMAGE_EXPIRES_AFTER
+        value: $(params.image-expires-after)
+      - name: COMMIT_SHA
+        value: $(tasks.clone-repository.results.commit)
+      - name: SOURCE_ARTIFACT
+        value: $(tasks.embed-rag.results.SOURCE_ARTIFACT)
+      - name: IMAGE_APPEND_PLATFORM
+        value: "true"
+      - name: BUILDAH_FORMAT
+        value: $(params.buildah-format)
+      runAfter:
+      - embed-rag
       taskRef:
         params:
         - name: name
@@ -251,11 +309,13 @@ spec:
       - name: IMAGES
         value:
         - $(tasks.build-container.results.IMAGE_URL)@$(tasks.build-container.results.IMAGE_DIGEST)
+        - $(tasks.build-container-arm64.results.IMAGE_URL)@$(tasks.build-container-arm64.results.IMAGE_DIGEST)
       - name: BUILDAH_FORMAT
         value: $(params.buildah-format)
       retries: 5
       runAfter:
       - build-container
+      - build-container-arm64
       taskRef:
         params:
         - name: name
@@ -609,6 +669,10 @@ spec:
     - name: netrc
       optional: true
   taskRunSpecs:
+  - pipelineTaskName: build-container
+    computeResources:
+      limits:
+        memory: 8Gi
   - pipelineTaskName: build-source-image
     computeResources:
       limits:
diff --git a/.tekton/own-app-lightspeed-rag-content-push.yaml b/.tekton/own-app-lightspeed-rag-content-push.yaml
index a80e105f3..40c3fc185 100644
--- a/.tekton/own-app-lightspeed-rag-content-push.yaml
+++ b/.tekton/own-app-lightspeed-rag-content-push.yaml
@@ -8,6 +8,7 @@ metadata:
     pipelinesascode.tekton.dev/cancel-in-progress: "false"
     pipelinesascode.tekton.dev/max-keep-runs: "3"
     pipelinesascode.tekton.dev/on-cel-expression: event == "push" && target_branch == "main"
+    pipelinesascode.tekton.dev/task: "[.tekton/embed-rag.yaml]"
   creationTimestamp:
   labels:
     appstudio.openshift.io/application: lightspeed-rag-content
@@ -24,7 +25,7 @@ spec:
   - name: output-image
     value: quay.io/redhat-user-workloads/crt-nshift-lightspeed-tenant/own-app-lightspeed-rag-content:{{revision}}
   - name: dockerfile
-    value: Containerfile
+    value: Containerfile.pack
   - name: path-context
     value: .
   - name: build-source-image
@@ -34,11 +35,9 @@ spec:
   - name: hermetic
     value: "true"
   - name: build-args
-    value:
-    - FLAVOR=gpu
-    - HERMETIC=true
+    value: []
   - name: build-image-index
-    value: "false"
+    value: "true"
   pipelineSpec:
     description: |
       This pipeline is ideal for building container images from a Containerfile while maintaining trust after pipeline customization.
@@ -199,37 +198,96 @@ spec:
         workspace: git-auth
       - name: netrc
         workspace: netrc
-    - name: build-container
+    - name: embed-rag
       params:
+      - name: SOURCE_ARTIFACT
+        value: $(tasks.prefetch-dependencies.results.SOURCE_ARTIFACT)
+      - name: CACHI2_ARTIFACT
+        value: $(tasks.prefetch-dependencies.results.CACHI2_ARTIFACT)
       - name: PLATFORM
         value: linux-g64xlarge/amd64
+      - name: HERMETIC
+        value: $(params.hermetic)
+      - name: HTTP_PROXY
+        value: $(tasks.init.results.http-proxy)
+      - name: NO_PROXY
+        value: $(tasks.init.results.no-proxy)
+      - name: ociStorage
+        value: $(params.output-image).rag-export
+      - name: ociArtifactExpiresAfter
+        value: $(params.image-expires-after)
+      runAfter:
+      - prefetch-dependencies
+      taskRef:
+        kind: Task
+        name: embed-rag
+    - name: build-container
+      params:
+      - name: PLATFORM
+        value: linux/amd64
       - name: IMAGE
         value: $(params.output-image)
       - name: DOCKERFILE
-        value: $(params.dockerfile)
+        value: Containerfile.pack
       - name: CONTEXT
-        value: $(params.path-context)
+        value: .
       - name: HERMETIC
-        value: $(params.hermetic)
+        value: "false"
       - name: PREFETCH_INPUT
-        value: $(params.prefetch-input)
+        value: ""
       - name: IMAGE_EXPIRES_AFTER
         value: $(params.image-expires-after)
       - name: COMMIT_SHA
         value: $(tasks.clone-repository.results.commit)
       - name: BUILD_ARGS
-        value:
-        - $(params.build-args[*])
+        value: []
       - name: BUILD_ARGS_FILE
         value: $(params.build-args-file)
       - name: SOURCE_ARTIFACT
-        value: $(tasks.prefetch-dependencies.results.SOURCE_ARTIFACT)
-      - name: CACHI2_ARTIFACT
-        value: $(tasks.prefetch-dependencies.results.CACHI2_ARTIFACT)
+        value: $(tasks.embed-rag.results.SOURCE_ARTIFACT)
+      - name: HTTP_PROXY
+        value: $(tasks.init.results.http-proxy)
+      - name: NO_PROXY
+        value: $(tasks.init.results.no-proxy)
       - name: BUILDAH_FORMAT
         value: $(params.buildah-format)
       runAfter:
-      - prefetch-dependencies
+      - embed-rag
+      taskRef:
+        params:
+        - name: name
+          value: buildah-remote-oci-ta
+        - name: bundle
+          value: quay.io/konflux-ci/tekton-catalog/task-buildah-remote-oci-ta:0.9@sha256:77007259cc87f32d63d2c201226aadaab98313cfd4e02b46abc243c4d2cc27bd
+        - name: kind
+          value: task
+        resolver: bundles
+    - name: build-container-arm64
+      params:
+      - name: PLATFORM
+        value: linux/arm64
+      - name: IMAGE
+        value: $(params.output-image)
+      - name: DOCKERFILE
+        value: Containerfile.arm64
+      - name: CONTEXT
+        value: .
+      - name: HERMETIC
+        value: "false"
+      - name: PREFETCH_INPUT
+        value: ""
+      - name: IMAGE_EXPIRES_AFTER
+        value: $(params.image-expires-after)
+      - name: COMMIT_SHA
+        value: $(tasks.clone-repository.results.commit)
+      - name: SOURCE_ARTIFACT
+        value: $(tasks.embed-rag.results.SOURCE_ARTIFACT)
+      - name: IMAGE_APPEND_PLATFORM
+        value: "true"
+      - name: BUILDAH_FORMAT
+        value: $(params.buildah-format)
+      runAfter:
+      - embed-rag
       taskRef:
         params:
         - name: name
@@ -248,11 +306,13 @@ spec:
       - name: IMAGES
         value:
         - $(tasks.build-container.results.IMAGE_URL)@$(tasks.build-container.results.IMAGE_DIGEST)
+        - $(tasks.build-container-arm64.results.IMAGE_URL)@$(tasks.build-container-arm64.results.IMAGE_DIGEST)
       - name: BUILDAH_FORMAT
         value: $(params.buildah-format)
       retries: 5
       runAfter:
       - build-container
+      - build-container-arm64
       taskRef:
         params:
         - name: name
@@ -606,6 +666,10 @@ spec:
     - name: netrc
       optional: true
   taskRunSpecs:
+  - pipelineTaskName: build-container
+    computeResources:
+      limits:
+        memory: 8Gi
   - pipelineTaskName: build-source-image
     computeResources:
       limits:
diff --git a/Containerfile b/Containerfile
index 1b4a7f1b1..142bd75a1 100644
--- a/Containerfile
+++ b/Containerfile
@@ -2,6 +2,8 @@ ARG EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2
 ARG FLAVOR=cpu
 ARG HERMETIC=false
 
+# Full GPU/CPU build for local development. Konflux CI uses embed-rag + Containerfile.pack/arm64.
+
 FROM registry.access.redhat.com/ubi9/python-312 as cpu-base
 ARG EMBEDDING_MODEL
 ARG FLAVOR
diff --git a/Containerfile.arm64 b/Containerfile.arm64
new file mode 100644
index 000000000..fb111b21f
--- /dev/null
+++ b/Containerfile.arm64
@@ -0,0 +1,22 @@
+# Final-stage-only image for arm64. Build context comes from embed-rag trusted artifact.
+FROM registry.access.redhat.com/ubi9/ubi-minimal@sha256:12db9874bd753eb98b1ab3d840e75de5d6842ac0604fbd68c012adefe97140be
+COPY rag/ /rag/
+COPY LICENSE /licenses/LICENSE
+
+# Labels for enterprise contract
+LABEL com.redhat.component=openshift-lightspeed-rag-content
+LABEL cpe="cpe:/a:redhat:openshift_lightspeed:1::el9"
+LABEL description="Red Hat OpenShift Lightspeed RAG content"
+LABEL distribution-scope=private
+LABEL io.k8s.description="Red Hat OpenShift Lightspeed RAG content"
+LABEL io.k8s.display-name="Openshift Lightspeed RAG content"
+LABEL io.openshift.tags="openshift,lightspeed,ai,assistant,rag"
+LABEL name="openshift-lightspeed/lightspeed-rag-content-rhel9"
+LABEL release=0.0.1
+LABEL url="https://github.com/openshift/lightspeed-rag-content"
+LABEL vendor="Red Hat, Inc."
+LABEL version=0.0.1
+LABEL summary="Red Hat OpenShift Lightspeed RAG content"
+LABEL konflux.additional-tags="latest"
+
+USER 65532:65532
diff --git a/Containerfile.pack b/Containerfile.pack
new file mode 100644
index 000000000..edd4db159
--- /dev/null
+++ b/Containerfile.pack
@@ -0,0 +1,23 @@
+# Package pre-built RAG content into the amd64 runtime image.
+# GPU embedding runs in the embed-rag Tekton task; this file only COPYs rag/.
+FROM registry.access.redhat.com/ubi9/ubi-minimal@sha256:12db9874bd753eb98b1ab3d840e75de5d6842ac0604fbd68c012adefe97140be
+COPY rag/ /rag/
+COPY LICENSE /licenses/LICENSE
+
+# Labels for enterprise contract
+LABEL com.redhat.component=openshift-lightspeed-rag-content
+LABEL cpe="cpe:/a:redhat:openshift_lightspeed:1::el9"
+LABEL description="Red Hat OpenShift Lightspeed RAG content"
+LABEL distribution-scope=private
+LABEL io.k8s.description="Red Hat OpenShift Lightspeed RAG content"
+LABEL io.k8s.display-name="Openshift Lightspeed RAG content"
+LABEL io.openshift.tags="openshift,lightspeed,ai,assistant,rag"
+LABEL name="openshift-lightspeed/lightspeed-rag-content-rhel9"
+LABEL release=0.0.1
+LABEL url="https://github.com/openshift/lightspeed-rag-content"
+LABEL vendor="Red Hat, Inc."
+LABEL version=0.0.1
+LABEL summary="Red Hat OpenShift Lightspeed RAG content"
+LABEL konflux.additional-tags="latest"
+
+USER 65532:65532
diff --git a/scripts/embed-rag-content.sh b/scripts/embed-rag-content.sh
new file mode 100755
index 000000000..d8b09349b
--- /dev/null
+++ b/scripts/embed-rag-content.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+# Generate RAG vector DB on a GPU host and export rag/ for multi-arch packaging.
+# Mirrors the lightspeed-rag-builder stage in Containerfile (Konflux embed-rag task).
+set -euo pipefail
+
+cd /var/workdir/source
+
+EMBEDDING_MODEL="${EMBEDDING_MODEL:-sentence-transformers/all-mpnet-base-v2}"
+HERMETIC="${HERMETIC:-true}"
+FLAVOR="${FLAVOR:-gpu}"
+RAG_EXPORT_DIR="${RAG_EXPORT_DIR:-/var/workdir/source/rag-export}"
+CACHI2_ROOT="${CACHI2_ROOT:-/cachi2}"
+
+if [ "${HERMETIC}" = "true" ] && [ -f "${CACHI2_ROOT}/cachi2.env" ]; then
+    # shellcheck disable=SC1091
+    source "${CACHI2_ROOT}/cachi2.env"
+fi
+
+if [ "${FLAVOR}" != "gpu" ]; then
+    echo "embed-rag-content requires FLAVOR=gpu" >&2
+    exit 1
+fi
+
+pip3.12 install --no-cache-dir -r requirements.gpu.txt
+ln -sf /usr/local/lib/python3.12/site-packages/llama_index/core/_static/nltk_cache /root/nltk_data
+
+cd embeddings_model
+if [ "${HERMETIC}" = "true" ]; then
+    cp "${CACHI2_ROOT}/output/deps/generic/model.safetensors" model.safetensors
+else
+    curl --fail -L -O "https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/9a3225965996d404b775526de6dbfe85d3368642/model.safetensors"
+fi
+cd ..
+
+python3.12 -c "import torch; print('cuda', torch.version.cuda, torch.cuda.is_available())"
+
+for OCP_VERSION in $(ls -1 ocp-product-docs-plaintext); do
+    python3.12 scripts/generate_embeddings.py \
+        -f "ocp-product-docs-plaintext/${OCP_VERSION}" \
+        -r runbooks/alerts \
+        -md embeddings_model \
+        -mn "${EMBEDDING_MODEL}" \
+        -o "vector_db/ocp_product_docs/${OCP_VERSION}" \
+        -i "ocp-product-docs-$(echo "${OCP_VERSION}" | sed 's/\./_/g')" \
+        -v "${OCP_VERSION}" \
+        -hb "${HERMETIC}"
+done
+
+LATEST_VERSION=$(ls -1 vector_db/ocp_product_docs/ | sort -V | tail -n 1)
+ln -sf "${LATEST_VERSION}" "vector_db/ocp_product_docs/latest"
+
+rm -rf "${RAG_EXPORT_DIR}"
+mkdir -p "${RAG_EXPORT_DIR}/rag"
+cp -a vector_db "${RAG_EXPORT_DIR}/rag/"
+cp -a embeddings_model "${RAG_EXPORT_DIR}/rag/"
+cp LICENSE "${RAG_EXPORT_DIR}/"
+cp Containerfile.arm64 "${RAG_EXPORT_DIR}/"
+
+test -d "${RAG_EXPORT_DIR}/rag/vector_db/ocp_product_docs"
+test -d "${RAG_EXPORT_DIR}/rag/embeddings_model"
+test -f "${RAG_EXPORT_DIR}/LICENSE"
+test -f "${RAG_EXPORT_DIR}/Containerfile.arm64"
diff --git a/scripts/embed-remote-setup.sh b/scripts/embed-remote-setup.sh
new file mode 100755
index 000000000..4107602be
--- /dev/null
+++ b/scripts/embed-remote-setup.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+# GPU host setup + embed entrypoint (Konflux embed-rag remote podman step).
+set -euo pipefail
+
+HERMETIC="${HERMETIC:-true}"
+CACHI2_SRC="${CACHI2_ROOT:-/var/workdir/cachi2}"
+CACHI2_ROOT="/cachi2"
+
+if [ -f /mnt/trusted-ca/ca-bundle.crt ]; then
+    cp /mnt/trusted-ca/ca-bundle.crt /etc/pki/ca-trust/source/anchors/
+    update-ca-trust
+fi
+
+prepare_cachi2() {
+    if [ ! -f "${CACHI2_SRC}/cachi2.env" ]; then
+        echo "Missing cachi2 prefetch: ${CACHI2_SRC}/cachi2.env" >&2
+        return 1
+    fi
+
+    # Match buildah-remote-oci-ta: cachi2.env references file:///cachi2/...
+    rm -rf /tmp/cachi2
+    cp -a "${CACHI2_SRC}" /tmp/cachi2
+    chmod -R go+rwX /tmp/cachi2
+
+    arch=$(uname -m)
+    rpm_prefetch_dir="/tmp/cachi2/output/deps/rpm"
+    if [ -d "${rpm_prefetch_dir}" ]; then
+        for path in "${rpm_prefetch_dir}"/*; do
+            [ -e "${path}" ] || continue
+            if [ "$(basename "${path}")" != "${arch}" ]; then
+                rm -rf "${path}"
+            fi
+        done
+    fi
+
+    rm -rf /cachi2
+    ln -sf /tmp/cachi2 /cachi2
+}
+
+install_rpms() {
+    local packages=(python3.12 python3.12-pip libcudnn9 libnccl libcusparselt0)
+
+    if [ "${HERMETIC}" = "true" ]; then
+        # Prevent the CUDA base image from using host/RHSM repos.
+        find /usr/share/rhel/secrets -type l -exec unlink {} \; 2>/dev/null || true
+
+        prepare_cachi2
+
+        # shellcheck disable=SC1091
+        . /cachi2/cachi2.env
+
+        dnf install -y "${packages[@]}"
+    else
+        dnf install -y "${packages[@]}"
+    fi
+}
+
+install_rpms
+
+test -d /var/workdir/source/scripts
+cd /var/workdir/source
+chmod +x ./scripts/embed-rag-content.sh ./scripts/embed-remote-setup.sh
+export CACHI2_ROOT
+exec ./scripts/embed-rag-content.sh

From d618a3bf3c167b6211fed67c1a6aac7062b34c2c Mon Sep 17 00:00:00 2001
From: Sergey Yedrikov
Date: Sun, 7 Jun 2026 22:22:42 -0400
Subject: [PATCH 2/2] Fix hermetic embed-rag setup and GPU torch import on Konflux MPC host.

Prepare cachi2 prefetch repos for dnf, skip already-installed RPMs, and set
LD_LIBRARY_PATH for pip-shipped NVIDIA libs before running GPU embedding.
---
 .tekton/embed-rag.yaml        |  3 +-
 scripts/embed-rag-content.sh  | 45 ++++++++++++++++++-
 scripts/embed-remote-setup.sh | 84 ++++++++++++++++++++++++++++++-----
 3 files changed, 119 insertions(+), 13 deletions(-)

diff --git a/.tekton/embed-rag.yaml b/.tekton/embed-rag.yaml
index f391d6005..4389582fb 100644
--- a/.tekton/embed-rag.yaml
+++ b/.tekton/embed-rag.yaml
@@ -48,7 +48,7 @@ spec:
       default: ca-bundle.crt
   results:
     - name: SOURCE_ARTIFACT
-      description: Trusted artifact with rag/, LICENSE, Containerfile.arm64
+      description: Trusted artifact with rag/, LICENSE, Containerfile.pack, Containerfile.arm64
   volumes:
     - name: workdir
       emptyDir: {}
@@ -173,6 +173,7 @@ spec:
         rsync -razW "$SSH_HOST:$BUILD_DIR/volumes/workdir/source/" /var/workdir/source/
 
         test -d /var/workdir/source/rag-export/rag/vector_db/ocp_product_docs
+        test -f /var/workdir/source/rag-export/Containerfile.pack
         test -f /var/workdir/source/rag-export/Containerfile.arm64
       volumeMounts:
         - mountPath: /ssh
diff --git a/scripts/embed-rag-content.sh b/scripts/embed-rag-content.sh
index d8b09349b..09ea1f13e 100755
--- a/scripts/embed-rag-content.sh
+++ b/scripts/embed-rag-content.sh
@@ -11,19 +11,58 @@ FLAVOR="${FLAVOR:-gpu}"
 RAG_EXPORT_DIR="${RAG_EXPORT_DIR:-/var/workdir/source/rag-export}"
 CACHI2_ROOT="${CACHI2_ROOT:-/cachi2}"
 
-if [ "${HERMETIC}" = "true" ] && [ -f "${CACHI2_ROOT}/cachi2.env" ]; then
+if [ -f "${CACHI2_ROOT}/prefetch.env" ]; then
+    # shellcheck disable=SC1091
+    source "${CACHI2_ROOT}/prefetch.env"
+elif [ -f "${CACHI2_ROOT}/cachi2.env" ]; then
     # shellcheck disable=SC1091
     source "${CACHI2_ROOT}/cachi2.env"
 fi
 
+if [ "${HERMETIC}" = "true" ] && [ ! -f "${CACHI2_ROOT}/prefetch.env" ] && [ ! -f "${CACHI2_ROOT}/cachi2.env" ]; then
+    echo "Hermetic embed requires cachi2 prefetch at ${CACHI2_ROOT}" >&2
+    exit 1
+fi
+
 if [ "${FLAVOR}" != "gpu" ]; then
     echo "embed-rag-content requires FLAVOR=gpu" >&2
     exit 1
 fi
 
-pip3.12 install --no-cache-dir -r requirements.gpu.txt
+pip_install_args=(--no-cache-dir)
+req_file="requirements.gpu.txt"
+if [ "${PIP_NO_REQUIRE_HASHES:-false}" = "true" ]; then
+    req_file=$(mktemp)
+    awk '!/^[[:space:]]*#/ && !/^[[:space:]]*--hash=/ {
+        sub(/[[:space:]]*\\[[:space:]]*$/, "");
+        gsub(/^[[:space:]]+/, "");
+        if ($0) print
+    }' requirements.gpu.txt >"${req_file}"
+    trap 'rm -f "${req_file}"' EXIT
+fi
+pip3.12 install "${pip_install_args[@]}" -r "${req_file}"
 ln -sf /usr/local/lib/python3.12/site-packages/llama_index/core/_static/nltk_cache /root/nltk_data
 
+# Pip wheels ship CUDA libs under site-packages/nvidia/*/lib; on devel images they must
+# precede /usr/local/cuda-12/compat so import torch finds libnvshmem_host.so, etc.
+pip_nvidia_libdirs=$(
+    python3.12 -c "import glob, os, site; print(':'.join(sorted({p for r in site.getsitepackages() if os.path.isdir(r) for p in glob.glob(os.path.join(r, 'nvidia', '*', 'lib')) if os.path.isdir(p)})))"
+)
+ld_paths=()
+if [ -n "${pip_nvidia_libdirs}" ]; then
+    ld_paths+=("${pip_nvidia_libdirs}")
+fi
+if [ -d /usr/local/cuda-12/compat ]; then
+    ld_paths+=("/usr/local/cuda-12/compat")
+fi
+if [ -n "${LD_LIBRARY_PATH:-}" ]; then
+    ld_paths+=("${LD_LIBRARY_PATH}")
+fi
+if [ "${#ld_paths[@]}" -gt 0 ]; then
+    LD_LIBRARY_PATH=$(IFS=:; echo "${ld_paths[*]}")
+    export LD_LIBRARY_PATH
+fi
+
 cd embeddings_model
 if [ "${HERMETIC}" = "true" ]; then
     cp "${CACHI2_ROOT}/output/deps/generic/model.safetensors" model.safetensors
@@ -54,9 +93,11 @@ mkdir -p "${RAG_EXPORT_DIR}/rag"
 cp -a vector_db "${RAG_EXPORT_DIR}/rag/"
 cp -a embeddings_model "${RAG_EXPORT_DIR}/rag/"
 cp LICENSE "${RAG_EXPORT_DIR}/"
+cp Containerfile.pack "${RAG_EXPORT_DIR}/"
 cp Containerfile.arm64 "${RAG_EXPORT_DIR}/"
 
 test -d "${RAG_EXPORT_DIR}/rag/vector_db/ocp_product_docs"
 test -d "${RAG_EXPORT_DIR}/rag/embeddings_model"
 test -f "${RAG_EXPORT_DIR}/LICENSE"
+test -f "${RAG_EXPORT_DIR}/Containerfile.pack"
 test -f "${RAG_EXPORT_DIR}/Containerfile.arm64"
diff --git a/scripts/embed-remote-setup.sh b/scripts/embed-remote-setup.sh
index 4107602be..42feada9e 100755
--- a/scripts/embed-remote-setup.sh
+++ b/scripts/embed-remote-setup.sh
@@ -11,13 +11,20 @@ if [ -f /mnt/trusted-ca/ca-bundle.crt ]; then
     update-ca-trust
 fi
 
+pkg_installed() {
+    case "$1" in
+        libcudnn9) rpm -qa | grep -q '^libcudnn9' ;;
+        *) rpm -q "$1" &>/dev/null ;;
+    esac
+}
+
 prepare_cachi2() {
-    if [ ! -f "${CACHI2_SRC}/cachi2.env" ]; then
+    if [ ! -f "${CACHI2_SRC}/cachi2.env" ] && [ ! -f "${CACHI2_SRC}/prefetch.env" ]; then
         echo "Missing cachi2 prefetch: ${CACHI2_SRC}/cachi2.env" >&2
         return 1
     fi
 
-    # Match buildah-remote-oci-ta: cachi2.env references file:///cachi2/...
+    # Match buildah-remote-oci-ta: prefetched paths use file:///cachi2/...
     rm -rf /tmp/cachi2
     cp -a "${CACHI2_SRC}" /tmp/cachi2
     chmod -R go+rwX /tmp/cachi2
@@ -37,26 +44,83 @@ prepare_cachi2() {
     ln -sf /tmp/cachi2 /cachi2
 }
 
+source_prefetch_env() {
+    if [ -f /cachi2/prefetch.env ]; then
+        # shellcheck disable=SC1091
+        . /cachi2/prefetch.env
+    elif [ -f /cachi2/cachi2.env ]; then
+        # shellcheck disable=SC1091
+        . /cachi2/cachi2.env
+    fi
+}
+
+enable_prefetched_repos() {
+    local arch repos_src repo_file
+
+    arch=$(uname -m)
+    repos_src="/cachi2/output/deps/rpm/${arch}/repos.d"
+    if [ ! -d "${repos_src}" ]; then
+        echo "Missing prefetched rpm repos: ${repos_src}" >&2
+        return 1
+    fi
+
+    # Same as buildah-remote: inject only regular *.repo files into /etc/yum.repos.d.
+    find /usr/share/rhel/secrets -type l -exec unlink {} \; 2>/dev/null || true
+    mkdir -p /etc/yum.repos.d
+    rm -f /etc/yum.repos.d/*.repo
+
+    shopt -s nullglob
+    for repo_file in "${repos_src}"/*.repo; do
+        cp "${repo_file}" /etc/yum.repos.d/
+    done
+    shopt -u nullglob
+
+    if ! compgen -G "/etc/yum.repos.d/*.repo" >/dev/null; then
+        echo "No prefetched .repo files found in ${repos_src}" >&2
+        return 1
+    fi
+}
+
 install_rpms() {
     local packages=(python3.12 python3.12-pip libcudnn9 libnccl libcusparselt0)
+    local missing=()
+    local pkg
 
-    if [ "${HERMETIC}" = "true" ]; then
-        # Prevent the CUDA base image from using host/RHSM repos.
-        find /usr/share/rhel/secrets -type l -exec unlink {} \; 2>/dev/null || true
+    if [ -n "${TEST_INSTALL_PACKAGES:-}" ]; then
+        # shellcheck disable=SC2206
+        packages=(${TEST_INSTALL_PACKAGES})
+    fi
 
-        prepare_cachi2
+    for pkg in "${packages[@]}"; do
+        if ! pkg_installed "${pkg}"; then
+            missing+=("${pkg}")
+        fi
+    done
 
-        # shellcheck disable=SC1091
-        . /cachi2/cachi2.env
+    if [ "${#missing[@]}" -eq 0 ]; then
+        echo "Required RPMs already present in builder image"
+        return 0
+    fi
 
-        dnf install -y "${packages[@]}"
+    echo "Installing missing RPMs: ${missing[*]}"
+
+    if [ "${HERMETIC}" = "true" ]; then
+        prepare_cachi2
+        enable_prefetched_repos
+        source_prefetch_env
+        dnf install -y "${missing[@]}"
     else
-        dnf install -y "${packages[@]}"
+        dnf install -y "${missing[@]}"
     fi
 }
 
 install_rpms
 
+if [ "${EMBED_REMOTE_SETUP_ONLY:-false}" = "true" ]; then
+    echo "EMBED_REMOTE_SETUP_ONLY: skipping GPU embed"
+    exit 0
+fi
+
 test -d /var/workdir/source/scripts
 cd /var/workdir/source
 chmod +x ./scripts/embed-rag-content.sh ./scripts/embed-remote-setup.sh