diff --git a/.github/actions/prepare-gitlab-repo/action.yml b/.github/actions/prepare-gitlab-repo/action.yml
new file mode 100644
index 0000000..f714733
--- /dev/null
+++ b/.github/actions/prepare-gitlab-repo/action.yml
@@ -0,0 +1,57 @@
+name: Prepare GitLab repository settings
+description: Validate and split the scheme-less GitLab repository secret.
+
+inputs:
+  gitlab-repo:
+    description: Scheme-less GitLab repository path, for example gitlab.example.com/group/project.git.
+    required: true
+
+outputs:
+  host-path:
+    description: GitLab repository host/path with optional .git suffix.
+    value: ${{ steps.prepare.outputs.host-path }}
+  host:
+    description: GitLab host name.
+    value: ${{ steps.prepare.outputs.host }}
+  project-path:
+    description: GitLab project path without .git suffix.
+    value: ${{ steps.prepare.outputs.project-path }}
+
+runs:
+  using: composite
+  steps:
+    - id: prepare
+      shell: bash
+      env:
+        GITLAB_REPO: ${{ inputs.gitlab-repo }}
+      run: |
+        set -euo pipefail
+
+        if [ -z "${GITLAB_REPO}" ]; then
+          echo "GITLAB_REPO secret is required."
+          exit 1
+        fi
+
+        # Callers build "https://oauth2:<token>@<host-path>" themselves, so reject
+        # any value that already carries a URL scheme or a user-info part.
+        case "${GITLAB_REPO}" in
+          *://*|*@*)
+            echo "GITLAB_REPO must use host/path format, for example gitlab.example.com/group/project.git."
+            exit 1
+            ;;
+        esac
+
+        # Strip an optional .git suffix, then split at the first "/".
+        repo="${GITLAB_REPO%.git}"
+        host="${repo%%/*}"
+        project_path="${repo#*/}"
+
+        # project_path equals repo when the value contains no "/" at all.
+        if [ -z "${host}" ] || [ "${project_path}" = "${repo}" ] || [ -z "${project_path}" ]; then
+          echo "GITLAB_REPO must include a host and project path, for example gitlab.example.com/group/project.git."
+ exit 1 + fi + + echo "host-path=${GITLAB_REPO}" >> "${GITHUB_OUTPUT}" + echo "host=${host}" >> "${GITHUB_OUTPUT}" + echo "project-path=${project_path}" >> "${GITHUB_OUTPUT}" diff --git a/.github/workflows/gitlab-manual-ci.yml b/.github/workflows/gitlab-manual-ci.yml index feb6b76..e97610c 100644 --- a/.github/workflows/gitlab-manual-ci.yml +++ b/.github/workflows/gitlab-manual-ci.yml @@ -50,15 +50,21 @@ jobs: fetch-depth: 0 ref: ${{ inputs.target_ref }} + - name: Prepare GitLab repository settings + id: gitlab-repo + uses: ./.github/actions/prepare-gitlab-repo + with: + gitlab-repo: ${{ secrets.GITLAB_REPO }} + - name: Push target ref to GitLab test branch env: GITLAB_TOKEN: ${{ secrets.GITLAB_TOKEN }} - GITLAB_REPO: ${{ secrets.GITLAB_REPO }} + GITLAB_REPO_HOST_PATH: ${{ steps.gitlab-repo.outputs.host-path }} TARGET_REF: ${{ inputs.target_ref }} run: | set -euo pipefail - if [ -z "${GITLAB_TOKEN}" ] || [ -z "${GITLAB_REPO}" ]; then + if [ -z "${GITLAB_TOKEN}" ] || [ -z "${GITLAB_REPO_HOST_PATH}" ]; then echo "GITLAB_TOKEN and GITLAB_REPO secrets are required." 
exit 1 fi @@ -71,14 +77,14 @@ jobs: git config user.name "github-bot" git config user.email "bot@example.com" - git remote add gitlab "https://oauth2:${GITLAB_TOKEN}@${GITLAB_REPO}" + git remote add gitlab "https://oauth2:${GITLAB_TOKEN}@${GITLAB_REPO_HOST_PATH}" git push -o ci.skip gitlab "HEAD:refs/heads/${branch}" --force - name: Trigger and wait for GitLab pipeline env: GITLAB_TOKEN: ${{ secrets.GITLAB_TOKEN }} - GITLAB_REPO: ${{ secrets.GITLAB_REPO }} - GITLAB_TEST_BRANCH: ${{ env.GITLAB_TEST_BRANCH }} + GITLAB_HOST: ${{ steps.gitlab-repo.outputs.host }} + GITLAB_PROJECT_PATH: ${{ steps.gitlab-repo.outputs.project-path }} CODE_FILTER: ${{ inputs.code }} SYSTEM_FILTER: ${{ inputs.system }} BENCHPARK_APP: ${{ inputs.app }} @@ -88,58 +94,72 @@ jobs: run: | set -euo pipefail - repo="${GITLAB_REPO#https://}" - repo="${repo#http://}" - host="${repo%%/*}" - project_path="${repo#*/}" - project_path="${project_path%.git}" - - project_encoded="$(PROJECT_PATH="${project_path}" python3 -c 'import os, urllib.parse; print(urllib.parse.quote(os.environ["PROJECT_PATH"], safe=""))')" - api="https://${host}/api/v4/projects/${project_encoded}/pipeline" - - curl_args=( - --fail - --silent - --request POST - --header "PRIVATE-TOKEN: ${GITLAB_TOKEN}" - --form "ref=${GITLAB_TEST_BRANCH}" - ) - - var_index=0 - add_variable() { - local key="$1" - local value="$2" - if [ -n "${value}" ]; then - echo "GitLab variable: ${key}=${value}" - curl_args+=(--form "variables[${var_index}][key]=${key}") - curl_args+=(--form "variables[${var_index}][value]=${value}") - var_index=$((var_index + 1)) - fi - } - - add_variable "code" "${CODE_FILTER}" - add_variable "system" "${SYSTEM_FILTER}" - add_variable "app" "${BENCHPARK_APP}" - - if [ "${BENCHPARK}" = "true" ]; then - add_variable "benchpark" "true" - fi - if [ "${PARK_ONLY}" = "true" ]; then - add_variable "park_only" "true" - fi - if [ "${PARK_SEND}" = "true" ]; then - add_variable "park_send" "true" + 
project_encoded="$(PROJECT_PATH="${GITLAB_PROJECT_PATH}" python3 -c 'import os, urllib.parse; print(urllib.parse.quote(os.environ["PROJECT_PATH"], safe=""))')" + api="https://${GITLAB_HOST}/api/v4/projects/${project_encoded}/pipeline" + + payload_file="$(mktemp)" + PAYLOAD_FILE="${payload_file}" python3 - <<'PY' + import json + import os + + variables = [] + + def add_variable(key, value): + if value: + print(f"GitLab variable: {key}={value}") + variables.append({ + "key": key, + "value": value, + "variable_type": "env_var", + }) + + add_variable("code", os.environ.get("CODE_FILTER", "")) + add_variable("system", os.environ.get("SYSTEM_FILTER", "")) + add_variable("app", os.environ.get("BENCHPARK_APP", "")) + + if os.environ.get("BENCHPARK") == "true": + add_variable("benchpark", "true") + if os.environ.get("PARK_ONLY") == "true": + add_variable("park_only", "true") + if os.environ.get("PARK_SEND") == "true": + add_variable("park_send", "true") + + with open(os.environ["PAYLOAD_FILE"], "w", encoding="utf-8") as f: + json.dump({ + "ref": os.environ["GITLAB_TEST_BRANCH"], + "variables": variables, + }, f) + PY + + response_file="$(mktemp)" + http_status="$(curl --show-error --silent --output "${response_file}" --write-out "%{http_code}" --request POST --header "PRIVATE-TOKEN: ${GITLAB_TOKEN}" --header "Content-Type: application/json" --data @"${payload_file}" "${api}")" + response="$(cat "${response_file}")" + rm -f "${response_file}" "${payload_file}" + + if [ "${http_status}" -lt 200 ] || [ "${http_status}" -ge 300 ]; then + echo "GitLab pipeline trigger failed with HTTP ${http_status}." 
+ echo "${response}" + exit 1 fi - response="$(curl "${curl_args[@]}" "${api}")" pipeline_id="$(PIPELINE="${response}" python3 -c 'import json, os; print(json.loads(os.environ["PIPELINE"])["id"])')" pipeline_url="$(PIPELINE="${response}" python3 -c 'import json, os; print(json.loads(os.environ["PIPELINE"]).get("web_url", ""))')" echo "GitLab pipeline: ${pipeline_url}" - pipeline_api="https://${host}/api/v4/projects/${project_encoded}/pipelines/${pipeline_id}" + pipeline_api="https://${GITLAB_HOST}/api/v4/projects/${project_encoded}/pipelines/${pipeline_id}" for _ in $(seq 1 180); do - response="$(curl --fail --silent --header "PRIVATE-TOKEN: ${GITLAB_TOKEN}" "${pipeline_api}")" + response_file="$(mktemp)" + http_status="$(curl --show-error --silent --output "${response_file}" --write-out "%{http_code}" --header "PRIVATE-TOKEN: ${GITLAB_TOKEN}" "${pipeline_api}")" + response="$(cat "${response_file}")" + rm -f "${response_file}" + + if [ "${http_status}" -lt 200 ] || [ "${http_status}" -ge 300 ]; then + echo "GitLab pipeline status request failed with HTTP ${http_status}." 
+ echo "${response}" + exit 1 + fi + status="$(PIPELINE="${response}" python3 -c 'import json, os; print(json.loads(os.environ["PIPELINE"])["status"])')" echo "GitLab pipeline status: ${status}" @@ -166,11 +186,10 @@ jobs: if: always() env: GITLAB_TOKEN: ${{ secrets.GITLAB_TOKEN }} - GITLAB_REPO: ${{ secrets.GITLAB_REPO }} - GITLAB_TEST_BRANCH: ${{ env.GITLAB_TEST_BRANCH }} + GITLAB_REPO_HOST_PATH: ${{ steps.gitlab-repo.outputs.host-path }} run: | set -euo pipefail - if [ -n "${GITLAB_TEST_BRANCH:-}" ]; then - git remote add gitlab-cleanup "https://oauth2:${GITLAB_TOKEN}@${GITLAB_REPO}" || true + if [ -n "${GITLAB_TEST_BRANCH:-}" ] && [ -n "${GITLAB_REPO_HOST_PATH:-}" ]; then + git remote add gitlab-cleanup "https://oauth2:${GITLAB_TOKEN}@${GITLAB_REPO_HOST_PATH}" || true git push gitlab-cleanup --delete "${GITLAB_TEST_BRANCH}" || true fi diff --git a/.github/workflows/result-server-tests.yml b/.github/workflows/result-server-tests.yml index 099dd61..0bf433b 100644 --- a/.github/workflows/result-server-tests.yml +++ b/.github/workflows/result-server-tests.yml @@ -4,10 +4,15 @@ on: pull_request: paths: - "result_server/**" - - "scripts/test_result_server.py" - - "scripts/validate_result_quality.py" - - "scripts/result_server/send_results.sh" - - "config/result_quality_policy.json" + - "scripts/bk_functions.sh" + - "scripts/result.sh" + - "scripts/result_server/**" + - "scripts/tests/test_bk_profiler.sh" + - "scripts/tests/test_result_profile_data.sh" + - "scripts/tests/test_send_results_profile_data.sh" + - "config/system.csv" + - "config/queue.csv" + - "config/system_info.csv" - "requirements-result-server.txt" - ".github/workflows/result-server-tests.yml" push: @@ -15,10 +20,15 @@ on: - "**" paths: - "result_server/**" - - "scripts/test_result_server.py" - - "scripts/validate_result_quality.py" - - "scripts/result_server/send_results.sh" - - "config/result_quality_policy.json" + - "scripts/bk_functions.sh" + - "scripts/result.sh" + - "scripts/result_server/**" + - 
"scripts/tests/test_bk_profiler.sh" + - "scripts/tests/test_result_profile_data.sh" + - "scripts/tests/test_send_results_profile_data.sh" + - "config/system.csv" + - "config/queue.csv" + - "config/system_info.csv" - "requirements-result-server.txt" - ".github/workflows/result-server-tests.yml" workflow_dispatch: @@ -30,8 +40,8 @@ jobs: fail-fast: false matrix: python-version: - - "3.9" - "3.12" + - "3.13" steps: - name: Check out repository @@ -49,5 +59,21 @@ jobs: python -m pip install --upgrade pip python -m pip install -r requirements-result-server.txt + - name: Run site configuration preflight + run: python result_server/tests/check_site_config.py + - name: Run result_server pytest suite - run: python scripts/test_result_server.py + run: python result_server/tests/run_result_server_tests.py + + profile-data-shell: + runs-on: ubuntu-latest + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Run profiler and profile-data shell tests + run: | + bash scripts/tests/test_bk_profiler.sh + bash scripts/tests/test_result_profile_data.sh + bash scripts/tests/test_send_results_profile_data.sh diff --git a/.github/workflows/sync-to-gitlab.yml b/.github/workflows/sync-to-gitlab.yml index ca21233..6885bea 100644 --- a/.github/workflows/sync-to-gitlab.yml +++ b/.github/workflows/sync-to-gitlab.yml @@ -11,20 +11,31 @@ jobs: sync: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Prepare GitLab repository settings + id: gitlab-repo + uses: ./.github/actions/prepare-gitlab-repo + with: + gitlab-repo: ${{ secrets.GITLAB_REPO }} + - name: Push protected branches and tags to GitLab env: GITLAB_TOKEN: ${{ secrets.GITLAB_TOKEN }} - GITLAB_REPO: ${{ secrets.GITLAB_REPO }} + GITLAB_REPO_HOST_PATH: ${{ steps.gitlab-repo.outputs.host-path }} run: | set -euo pipefail + if [ -z "${GITLAB_TOKEN}" ] || [ -z "${GITLAB_REPO_HOST_PATH}" ]; then + echo "GITLAB_TOKEN and GITLAB_REPO secrets are 
required." + exit 1 + fi + git config user.name "github-bot" git config user.email "bot@example.com" - git remote add gitlab https://oauth2:${GITLAB_TOKEN}@${GITLAB_REPO} + git remote add gitlab "https://oauth2:${GITLAB_TOKEN}@${GITLAB_REPO_HOST_PATH}" git fetch origin +refs/heads/develop:refs/remotes/origin/develop +refs/heads/main:refs/remotes/origin/main git push -o ci.skip gitlab refs/remotes/origin/develop:refs/heads/develop refs/remotes/origin/main:refs/heads/main --force git push -o ci.skip gitlab --tags --force diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d4f98ce..c4c5bf1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -7,8 +7,9 @@ # Effect: Pipeline is created but jobs are skipped based on file changes # # Files that automatically skip CI: -# - README.md, ADD_APP.md (documentation) -# - result_server/**/* (portal/server code and templates) +# - root Markdown files and docs/**/* (documentation) +# - .github/**/* (GitHub-only workflow/action files) +# - result_server/**/* and config/system_info.csv (portal/server code, templates, and display metadata) # # Important Notes: # - trigger_child_pipeline depends on generate_matrix, so we use needs: optional: true @@ -33,7 +34,7 @@ variables: # Extract system and code filters from API variables or commit message .filters: &filters - | - # Priority 1: API trigger variables (from curl -F "variables[system]=...") + # Priority 1: API trigger variables (from GitLab Pipeline API variables) if [[ -n "$system" ]]; then echo "System filter from API variable: $system" else @@ -47,7 +48,7 @@ variables: fi fi - | - # Priority 1: API trigger variables (from curl -F "variables[code]=...") + # Priority 1: API trigger variables (from GitLab Pipeline API variables) if [[ -n "$code" ]]; then echo "Code filter from API variable: $code" else @@ -85,16 +86,22 @@ generate_matrix: - if: '$CI_COMMIT_MESSAGE =~ /\[park-send\]/' when: never # [park-send]コミットメッセージでは無効 - changes: - - ".gitlab-ci.yml" - - ".github/workflows/*.yml" - 
- "programs/**/*" - - "scripts/**/*" - - "config/system.csv" - - "config/queue.csv" + compare_to: "refs/heads/$CI_DEFAULT_BRANCH" + paths: + - ".gitlab-ci.yml" + - "programs/**/*" + - "scripts/**/*" + - "config/system.csv" + - "config/queue.csv" when: always - changes: - - "*.md" - - "result_server/**/*" + compare_to: "refs/heads/$CI_DEFAULT_BRANCH" + paths: + - ".github/**/*" + - "*.md" + - "docs/**/*" + - "result_server/**/*" + - "config/system_info.csv" when: never - when: always @@ -120,16 +127,22 @@ trigger_child_pipeline: - if: '$CI_COMMIT_MESSAGE =~ /\[park-send\]/' when: never # [park-send]コミットメッセージでは無効 - changes: - - ".gitlab-ci.yml" - - ".github/workflows/*.yml" - - "programs/**/*" - - "scripts/**/*" - - "config/system.csv" - - "config/queue.csv" + compare_to: "refs/heads/$CI_DEFAULT_BRANCH" + paths: + - ".gitlab-ci.yml" + - "programs/**/*" + - "scripts/**/*" + - "config/system.csv" + - "config/queue.csv" when: always - changes: - - "*.md" - - "result_server/**/*" + compare_to: "refs/heads/$CI_DEFAULT_BRANCH" + paths: + - ".github/**/*" + - "*.md" + - "docs/**/*" + - "result_server/**/*" + - "config/system_info.csv" when: never - when: always # BenchPark Monitor Jobs @@ -161,7 +174,9 @@ generate_benchpark_matrix: - if: '$CI_COMMIT_MESSAGE =~ /\[benchpark\]/' when: always - changes: - - "benchpark-bridge/**/*" + compare_to: "refs/heads/$CI_DEFAULT_BRANCH" + paths: + - "benchpark-bridge/**/*" when: always - when: never @@ -195,7 +210,9 @@ trigger_benchpark_pipeline: - if: '$CI_COMMIT_MESSAGE =~ /\[benchpark\]/' when: always - changes: - - "benchpark-bridge/**/*" + compare_to: "refs/heads/$CI_DEFAULT_BRANCH" + paths: + - "benchpark-bridge/**/*" when: always - when: never diff --git a/README.md b/README.md index 48f12b7..ca3ab2e 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,10 @@ This includes: - system-specific execution environments - runtime requirements +## Runtime Requirements + +- `result_server` requires Python 3.12 or later. 
+ ## License This project is licensed under the BSD 3-Clause License. See [LICENSE](LICENSE). diff --git a/config/queue.csv b/config/queue.csv index 1900325..681bbd0 100644 --- a/config/queue.csv +++ b/config/queue.csv @@ -1,5 +1,10 @@ queue,submit_cmd,template FJ,pjsub,"-L rscunit=rscunit_ft01,rscgrp=${queue_group},elapse=${elapse},node=${nodes} --mpi max-proc-per-node=${numproc_node} -x PJM_LLIO_GFSCACHE=/vol0002:/vol0003:/vol0004:/vol0005" +PJM_GENKAI,pjsub,"-L rscgrp=${queue_group},elapse=${elapse},node=${nodes} --mpi proc=${proc}" SLURM_RC,sbatch,"-p ${queue_group} -t ${elapse} -N ${nodes} --ntasks-per-node=${numproc_node} --cpus-per-task=${nthreads}" -PBS_Miyabi,qsub,"-q ${queue_group} -l select=${nodes} -l walltime=${elapse} -W group_list=jh260034" +PBS_Miyabi,qsub,"-q ${queue_group} -l select=${nodes}:mpiprocs=${numproc_node}:ompthreads=${nthreads} -l walltime=${elapse} -W group_list=jh260034" +PBS_Grand_C,qsub,"-q ${queue_group} -l select=${nodes}:nsockets=${cpu_per_node},walltime=${elapse} -W group_list=d30992" +PBS_Grand_G,qsub,"-q ${queue_group} -l select=${nodes}:ngpus=1,walltime=${elapse} -W group_list=d30992" +NQSV_AOBA_VE,qsub,"-Z -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY -q ${queue_group} -T necmpi --venode ${proc} -l elapstim_req=${elapse}" +NQSV_AOBA_B,qsub,"-Z -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY -q ${queue_group} -T intmpi -b ${nodes} -l elapstim_req=${elapse}" none,none,none diff --git a/config/result_quality_policy.json b/config/result_quality_policy.json deleted file mode 100644 index 67741a9..0000000 --- a/config/result_quality_policy.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "version": 1, - "default_tier": "relaxed", - "tiers": { - "strict": { - "fail_candidates": [ - "FOM present", - "source_info present", - "recognized source_info.source_type", - "complete source_info fields" - ], - "fail_warnings": [] - }, - "standard": { - "fail_candidates": [ - "FOM present", - "recognized source_info.source_type" - ], - 
"fail_warnings": [] - }, - "relaxed": { - "fail_candidates": [], - "fail_warnings": [] - } - }, - "apps": { - "qws": "strict", - "genesis": "standard", - "scale-letkf": "standard" - } -} diff --git a/config/system.csv b/config/system.csv index 618f62b..9513980 100644 --- a/config/system.csv +++ b/config/system.csv @@ -8,4 +8,12 @@ RC_GENOA,native,,cloud_jacamar,SLURM_RC,genoa RC_FX700,native,,cloud_jacamar,SLURM_RC,fx700 MiyabiG,cross,miyabi_g_login,miyabi_g_jacamar,PBS_Miyabi,debug-g MiyabiC,cross,miyabi_c_login,miyabi_c_jacamar,PBS_Miyabi,debug-c +GenkaiA,cross,genkai_login,genkai_jacamar,PJM_GENKAI,a-batch +GenkaiB,cross,genkai_login,genkai_jacamar,PJM_GENKAI,b-batch +GenkaiC,cross,genkai_login,genkai_jacamar,PJM_GENKAI,c-batch +Grand_C,cross,grand_login,grand_jacamar,PBS_Grand_C,lc +Grand_G,cross,grand_login,grand_jacamar,PBS_Grand_G,eg +AOBA_A,cross,aoba_ab_login,aoba_ab_jacamar,NQSV_AOBA_VE,sx +AOBA_B,cross,aoba_ab_login,aoba_ab_jacamar,NQSV_AOBA_B,lx +AOBA_S,cross,aoba_s_login,aoba_s_jacamar,NQSV_AOBA_VE,sxs FNCX,native,,fncx-curl-jq,none,small diff --git a/config/system_info.csv b/config/system_info.csv index 4d35ae7..0273370 100644 --- a/config/system_info.csv +++ b/config/system_info.csv @@ -8,3 +8,11 @@ RC_GH200,RC_GH200,NVIDIA Grace CPU,1,72,NVIDIA Hopper H100 GPU,1,120GB,6 RC_DGXSP,RC_DGXSP,ARM Cortex-X925 / Cortex-A725,1,20,NVIDIA GB10,1,128GB,7 RC_GENOA,RC_GENOA,AMD EPYC 9684X,2,96,-,-,768GB,8 RC_FX700,RC_FX700,A64FX,1,48,-,-,32GB,9 +GenkaiA,GenkaiA,Intel Xeon Platinum 8490H (Sapphire Rapids),2,60,-,-,512GiB,10 +GenkaiB,GenkaiB,Intel Xeon Platinum 8490H (Sapphire Rapids),2,60,NVIDIA H100 (Hopper),4,1024GiB,11 +GenkaiC,GenkaiC,Intel Xeon Platinum 8480+ (Sapphire Rapids),2,56,NVIDIA H100 (Hopper),8,8TiB,12 +Grand_C,Grand_C,Intel Xeon Gold 6548Y+ (Emerald Rapids),2,32,-,-,512GiB,13 +Grand_G,Grand_G,Intel Xeon Gold 6548Y+ (Emerald Rapids),2,32,NVIDIA H100 (Hopper),4,512GiB,14 +AOBA_A,AOBA_A,SX-Aurora TSUBASA VH,1,24,NEC SX-Aurora TSUBASA Type 20B 
VE,8,640GB,15 +AOBA_B,AOBA_B,AMD EPYC 7702,2,64,-,-,256GB,16 +AOBA_S,AOBA_S,SX-Aurora TSUBASA VH,1,64,NEC SX-Aurora TSUBASA Type 30A VE,8,256GB + 768GB,17 diff --git a/docs/ci.md b/docs/ci.md index e1501bb..1980da5 100644 --- a/docs/ci.md +++ b/docs/ci.md @@ -15,7 +15,26 @@ BenchKit uses GitHub as the public development repository and GitLab CI for benc | `GitLab Manual CI` | Manual `workflow_dispatch` / 手動実行 | Runs GitLab benchmark CI for a selected repository ref / 指定したrefに対してGitLabベンチマークCIを実行する | | `Sync protected branches to GitLab` | Pushes to `develop` or `main` / `develop`または`main`へのpush | Mirrors protected branches to GitLab without starting GitLab CI / GitLab CIを発火させずに保護ブランチをGitLabへ同期する | | `Guard main PR source` | Pull requests to `main` / `main`宛PR | Allows only upstream `develop` to target `main` / upstreamの`develop`から`main`へのPRだけを許可する | -| `Result Server Tests` | Result server related changes / result server関連変更 | Runs result server tests / result serverのテストを実行する | +| `Result Server Tests` | Result server, portal metadata, site config, or portal upload helper changes / result server、portal metadata、site config、portal upload helper関連変更 | Runs site config preflight and result server tests / site config preflightとresult serverのテストを実行する | + +## GitLab Secrets / GitLab secret + +GitHub ActionsからGitLabへpushまたはpipeline triggerを行うworkflowでは、以下のsecretを使います。 + +Workflows that push to GitLab or trigger GitLab pipelines use these secrets: + +| Secret | Format / 形式 | Purpose / 目的 | +|---|---|---| +| `GITLAB_TOKEN` | GitLab token with push and pipeline API access / pushとpipeline APIに使えるGitLab token | Authenticates Git operations and Pipeline API calls / Git操作とPipeline API呼び出しを認証する | +| `GITLAB_REPO` | Scheme-less `host/path` such as `gitlab.example.com/group/project.git` / `gitlab.example.com/group/project.git` のようなschemeなし`host/path` | Selects the GitLab project used by sync and manual CI / syncとmanual CIが使うGitLab projectを指定する | + +`GITLAB_REPO` に `https://` や 
`http://` は付けません。`GitLab Manual CI` と `Sync protected branches to GitLab` は同じ形式を検証して使います。 + +Do not include `https://` or `http://` in `GITLAB_REPO`. `GitLab Manual CI` and `Sync protected branches to GitLab` validate and use the same format. + +この検証は `.github/actions/prepare-gitlab-repo` に集約しています。 + +This validation is centralized in `.github/actions/prepare-gitlab-repo`. ## Pull Request Policy / Pull Request方針 @@ -74,6 +93,10 @@ The workflow: - 実行後、一時GitLabブランチを削除します。 - Removes the temporary GitLab branch after the run. +この経路は、`qws` / `MiyabiG` の最小実行で、GitLab pipeline 起動から推定まで動作確認済みです。 + +This path has been smoke-tested with a minimal `qws` / `MiyabiG` run through GitLab pipeline execution and estimation. + `target_ref`はupstreamリポジトリ内のbranch、tag、SHAを指定する想定です。forkからのpull requestをGitLab CIで試す場合は、maintainerがまずupstreamリポジトリ側に`ci/pr-123`のような信頼済み一時ブランチを作り、そのブランチに対して`GitLab Manual CI`を実行します。 `target_ref` is intended to refer to a branch, tag, or SHA in the upstream repository. For fork pull requests, a maintainer should first create a trusted temporary branch in the upstream repository, such as `ci/pr-123`, and then run `GitLab Manual CI` against that branch. @@ -95,6 +118,10 @@ Feature branch pushes do not trigger GitLab synchronization. The sync workflow mirrors `develop`, `main`, and tags to GitLab with `ci.skip`. This keeps GitLab history aligned without starting GitLab CI automatically. +`ci.skip` により GitLab CI が自動起動しないことは、保護ブランチ同期の運用で確認済みです。 + +The protected-branch synchronization path has been confirmed to update the GitLab mirror without automatically starting GitLab CI because it uses `ci.skip`. + 通常の運用では、pull requestが`develop`へmergeされた後、または`develop`が`main`へmergeされた後に同期が行われます。 In the normal workflow, synchronization happens after a pull request is merged into `develop`, or after `develop` is merged into `main`. 
@@ -191,12 +218,55 @@ GitLab pipelineは、変更がドキュメントやresult server関連ファイ The GitLab pipeline avoids benchmark execution when changes are limited to documentation or result server files, according to the active rules in `.gitlab-ci.yml`. +保護ブランチ同期は`ci.skip`付きでGitLabへpushするため、これらのskip rulesは主にGitLab上で直接pipelineを起動した場合、または`GitLab Manual CI`がPipeline APIで明示起動した場合の保険として効きます。 + +Protected-branch synchronization pushes to GitLab with `ci.skip`, so these skip rules mainly matter when a GitLab pipeline is started directly on GitLab or explicitly through the Pipeline API by `GitLab Manual CI`. + 現在のskip寄りのpatternには以下があります。 Current skip-oriented patterns include: - `*.md` +- `docs/**/*` - `result_server/**/*` +- `config/system_info.csv` + +`system_info.csv` is the public portal catalog. Every system listed there must also be registered in `system.csv` and reference a queue defined in `queue.csv`. The reverse is intentionally not required: private or development-only systems may exist in `system.csv` / `queue.csv` without being exposed in `system_info.csv`. + +`system_info.csv` はportalでユーザーに見える公開catalogです。そこに載せたsystemは必ず `system.csv` に登録され、`queue.csv` に定義されたqueueを参照する必要があります。逆方向は必須ではありません。開発用・非公開用のsystemやqueueは、`system_info.csv` に公開せず `system.csv` / `queue.csv` にだけ存在してよいです。 + +The app support matrix, partial support, missing app entrypoints, and unknown systems in `list.csv` are shown in `/results/usage` for operational visibility, but they are not CI-failing checks at this stage because application readiness varies by app and rollout phase. 
+ +app support matrix、partial support、app entrypoint不足、`list.csv` 内の未知systemは、運用 visibility のため `/results/usage` に表示します。ただしアプリごとの準備状況や導入段階がばらばらなため、現時点では CI failure にはしません。 + +## Expected CI Behavior by Change Type / 変更種別ごとの期待CI動作 + +| Change type / 変更種別 | GitHub Actions | GitLab benchmark CI | Notes / 補足 | +|---|---|---|---| +| Root Markdown or `docs/**/*` only / root Markdownまたは`docs/**/*`のみ | No benchmark-specific GitHub workflow / ベンチマーク用GitHub workflowなし | Skipped by `.gitlab-ci.yml` rules / `.gitlab-ci.yml` rulesでskip | Keep docs-only changes separate from benchmark logic changes / docsのみの変更はbenchmark logic変更と分ける | +| `result_server/**/*` / `result_server/**/*` | `Result Server Tests` | Skipped by `.gitlab-ci.yml` rules / `.gitlab-ci.yml` rulesでskip | Portal regressions should be caught by lightweight Python tests / portal回帰はlightweight Python testで捕捉する | +| Public site config or portal metadata `config/system.csv`, `config/queue.csv`, `config/system_info.csv` / 公開site configまたはportal表示メタデータ`config/system.csv`、`config/queue.csv`、`config/system_info.csv` | `Result Server Tests`, including site config preflight / site config preflightを含む`Result Server Tests` | `config/system.csv` and `config/queue.csv` run by `.gitlab-ci.yml`; `config/system_info.csv` is skipped / `config/system.csv`と`config/queue.csv`は`.gitlab-ci.yml`で実行、`config/system_info.csv`はskip | Public systems listed in `system_info.csv` must also exist in `system.csv` and reference a queue defined in `queue.csv` / `system_info.csv`に載せる公開systemは`system.csv`にも存在し、`queue.csv`定義済みqueueを参照する必要がある | +| Portal upload or profile-data helper `scripts/bk_functions.sh`, `scripts/result.sh`, `scripts/result_server/**` / portal uploadまたはprofile-data helper `scripts/bk_functions.sh`、`scripts/result.sh`、`scripts/result_server/**` | `Result Server Tests` when covered by its path filter / path filter対象なら`Result Server Tests` | Not automatic for pull requests; runs only if a maintainer starts `GitLab Manual CI` / pull 
requestでは自動起動せず、maintainerが`GitLab Manual CI`を起動した場合のみ実行 | These helpers shape result JSON / upload behavior without requiring a full benchmark by default / これらのhelperはfull benchmarkを既定で要求せずにResult JSONやupload挙動へ影響する | +| Benchmark app code or other shared scripts / benchmark appコードまたはその他の共通script | Normal GitHub review checks only / 通常のGitHub review checkのみ | Run through `GitLab Manual CI` when maintainer starts it / maintainerが`GitLab Manual CI`を起動した場合に実行 | Use `code` and `system` filters when broad validation is unnecessary / 広範な検証が不要なら`code`と`system`を指定する | +| GitHub workflow/action `.github/**/*` / GitHub workflow/action `.github/**/*` | Workflow-specific checks when paths match / path一致時にworkflowごとのcheck | Skipped by `.gitlab-ci.yml` rules / `.gitlab-ci.yml` rulesでskip | GitHub workflow/action changes affect API-calling or sync control logic. Validate them on the GitHub side; they are pushed to GitLab with `ci.skip` during protected-branch sync / GitHub workflow/action変更はAPI呼び出しやsync制御に影響する。GitHub側で確認する。protected-branch syncでは`ci.skip`付きでGitLabへpushされる | +| `.gitlab-ci.yml` / `.gitlab-ci.yml` | Normal GitHub review checks only / 通常のGitHub review checkのみ | Run through `GitLab Manual CI` when a maintainer needs to validate GitLab pipeline behavior / GitLab pipeline挙動の検証が必要な場合にmaintainerが`GitLab Manual CI`で実行 | This file defines GitLab benchmark pipeline behavior / このファイルはGitLab benchmark pipeline挙動を定義する | + +## Representative Change Sets / 代表的な変更セット + +以下は、pull requestを分けるか、GitLab benchmark CIを手動実行するかを判断するための代表例です。 + +Use these examples when deciding whether to split a pull request or start GitLab benchmark CI manually. + +| Example change set / 変更例 | Expected checks / 期待される確認 | GitLab benchmark expectation / GitLab benchmark期待値 | +|---|---|---| +| `docs/ci.md` only / `docs/ci.md`のみ | Review the documentation diff / docs差分をreview | No benchmark run. 
Direct/manual GitLab pipelines should skip by rules / benchmark不要。直接/手動GitLab pipelineではrulesでskipされる想定 | +| `result_server/routes/usage.py` and `result_server/templates/*.html` / `result_server/routes/usage.py`と`result_server/templates/*.html` | `Result Server Tests` should run / `Result Server Tests`が動く | No benchmark run unless a maintainer intentionally starts one / maintainerが意図して起動しない限りbenchmark不要 | +| `config/system_info.csv` only / `config/system_info.csv`のみ | `Result Server Tests` should verify public site config consistency / 公開site config整合性を`Result Server Tests`で確認 | No benchmark run because this file is portal display metadata / portal表示metadataなのでbenchmark不要 | +| `config/system.csv` or `config/queue.csv` for a public system / 公開system向けの`config/system.csv`または`config/queue.csv` | `Result Server Tests` should run the site config preflight / `Result Server Tests`でsite config preflightを実行 | Start `GitLab Manual CI` too when benchmark execution behavior needs validation / benchmark実行挙動の検証が必要なら`GitLab Manual CI`も起動 | +| `scripts/bk_functions.sh`, `scripts/result.sh`, or `scripts/result_server/**` only / `scripts/bk_functions.sh`、`scripts/result.sh`、または`scripts/result_server/**`のみ | `Result Server Tests` should run when the path filter matches / path filter対象なら`Result Server Tests`が動く | Manual GitLab CI is optional and only needed if upload behavior affects benchmark operation / upload挙動がbenchmark運用に影響する場合だけ手動GitLab CIを検討 | +| `programs/qws/**/*` or `scripts/job/**/*` / `programs/qws/**/*`または`scripts/job/**/*` | Normal GitHub review checks / 通常のGitHub review check | Start `GitLab Manual CI` when benchmark validation is needed, preferably with explicit `code` and `system` filters / benchmark検証が必要なら`code`と`system`を明示して`GitLab Manual CI`を起動 | +| `.github/workflows/sync-to-gitlab.yml` or `.github/actions/prepare-gitlab-repo/action.yml` / `.github/workflows/sync-to-gitlab.yml`または`.github/actions/prepare-gitlab-repo/action.yml` | Validate on the GitHub Actions 
side / GitHub Actions側で確認 | Skipped by `.gitlab-ci.yml` rules when changed alone; protected-branch sync pushes it with `ci.skip` / 単独変更なら`.gitlab-ci.yml` rulesでskip。protected-branch syncでは`ci.skip`付きでpushされる | +| `.gitlab-ci.yml` / `.gitlab-ci.yml` | Review the GitLab rule diff carefully / GitLab rule差分を慎重にreview | Start `GitLab Manual CI` if rule behavior itself needs validation / rule挙動そのものの検証が必要なら`GitLab Manual CI`を起動 | ## Contributor Guidance / コントリビュータ向け注意 diff --git a/docs/cx/BENCHKIT_GAP_ANALYSIS.md b/docs/cx/BENCHKIT_GAP_ANALYSIS.md index d94f00e..af7654e 100644 --- a/docs/cx/BENCHKIT_GAP_ANALYSIS.md +++ b/docs/cx/BENCHKIT_GAP_ANALYSIS.md @@ -58,8 +58,8 @@ Continuous estimation has now moved beyond a mere entry point: a common estimati However, estimation is still not yet broadly deployed across multiple applications, and AI-driven optimization integration remains mostly at the integration-point stage. As of the current repository survey, BenchKit has six benchmark applications with `build.sh`/`run.sh`, but only `qws` has an `estimate.sh`. -The result portal also already has a meaningful test base (`result_server/tests`: 27, `scripts/tests`: 3), and the repository now has a repo-local Python dependency manifest, a standard portal test entrypoint, and a lightweight GitHub Actions verification path for portal-oriented changes. -The main GitLab pipeline still intentionally skips heavy benchmark execution when changes are limited to `result_server/**/*`, which is appropriate, so the dedicated lightweight path should continue to be kept in sync as portal- or validator-side files evolve. +The result portal also already has a meaningful test base (`result_server/tests`: 27), and the repository now has a repo-local Python dependency manifest, a standard portal test entrypoint under `result_server/tests`, and a lightweight GitHub Actions verification path for portal-oriented changes. 
+The main GitLab pipeline still intentionally skips heavy benchmark execution when a direct or manually triggered GitLab pipeline sees changes limited to `result_server/**/*` or portal display metadata such as `config/system_info.csv`. Protected-branch synchronization itself uses `ci.skip`, so the dedicated lightweight GitHub Actions path should continue to be kept in sync as portal-side files evolve. ## 2.1 現時点で明示しておく設計負債 / Explicit Design Debts to Keep Visible @@ -73,13 +73,13 @@ The main GitLab pipeline still intentionally skips heavy benchmark execution whe - 推定結果 compare UI: detail 画面で current / future breakdown、fallback、applicability は見えるようになったが、同一 `code/exp` 間の差分把握 UI はまだ後回しにしている。 - 結果 quality の扱い: - portal 上の quality badge、detail view、latest-result current-state summary は実装済みだが、CI を fail させる validator にはしていない。品質評価は現在も visibility-first である。 + portal 上の quality badge、detail view、latest-result current-state summary は実装済みだが、PR の必須基準は基本的に FOM を持つことに留める。品質評価は内部管理・visibility-first で扱う。 - site capability checker: - `/results/usage` に lightweight configuration checks は入ったが、CI 実行可否と直接結び付く自動 checker にはまだしていない。 + `/results/usage` の lightweight configuration checks のうち、公開 `system_info.csv` が `system.csv` / `queue.csv` に対応していることは CI preflight へ昇格済みである。一方、app support matrix はアプリごとの準備状況差が大きいため、現時点では CI failure にせず portal visibility に留める。 - app/system coverage 判定: - 現在の coverage matrix は `list.csv` と `build.sh` / `run.sh` の structured shell-branch detection に基づくが、完全な execution-contract checker にはまだなっていない。 + 現在の coverage matrix は `list.csv` と `build.sh` / `run.sh` の structured shell-branch detection に基づくが、完全な execution-contract checker にはしていない。これは未実装というより、現段階では gate 化しない運用判断である。 - result portal の検証導線: - `result_server` にはまとまったテスト資産がある一方で、repo-local な Python 依存関係定義、標準の test entrypoint、portal-only 変更時に自動で走る lightweight CI はまだ固定されていない。 + repo-local な Python 依存関係定義、標準の test entrypoint、portal-oriented 変更時に走る lightweight CI は固定済みである。一方で、upload helper や portal 表示メタデータなど portal 
周辺の対象が増えるたびに path filter を追従させる必要は残る。 At the current stage, several issues are not simply "missing implementations" but rather intentionally deferred or not yet fully fixed as design boundaries. This section keeps those visible so they are not forgotten later. @@ -91,35 +91,36 @@ This section keeps those visible so they are not forgotten later. - estimation compare UI: detail views now expose current / future breakdown, fallback, and applicability, but same-`code`/`exp` comparison remains intentionally deferred. - result quality handling: - quality badges, detail views, and latest-result current-state summaries are implemented, but they are not yet enforced as CI-failing validation; the current approach remains visibility-first. + quality badges, detail views, and latest-result current-state summaries are implemented, but PR requirements should generally stay limited to having a FOM value. Quality scoring remains internal and visibility-first. - site capability checker: - `/results/usage` now has lightweight configuration checks, but there is still no automatic checker directly tied to CI execution readiness. + the `/results/usage` configuration check that public `system_info.csv` entries resolve to `system.csv` / `queue.csv` is now promoted to CI preflight, while the app support matrix remains portal visibility rather than a CI failure because application readiness intentionally varies. - app/system coverage evaluation: - the current coverage matrix uses `list.csv` plus structured shell-branch detection in `build.sh` / `run.sh`, but it still stops short of a full execution-contract checker. + the current coverage matrix uses `list.csv` plus structured shell-branch detection in `build.sh` / `run.sh`, but it is intentionally not a full execution-contract gate at this stage. 
- result portal verification path: - the portal now has a fixed repo-local Python dependency manifest, a standard test entrypoint, and a lightweight CI path for portal-oriented changes, but the path filter coverage must continue to track validator scripts, policy files, and nearby operational entrypoints as the surface area expands. + the portal now has a fixed repo-local Python dependency manifest, a standard test entrypoint, and a lightweight CI path for portal-oriented changes, but the path filter coverage must continue to track upload helpers, portal display metadata, and nearby operational entrypoints as the surface area expands. -### 5.2.1 結果 quality validator の段階導入 / Phased Introduction of Result Quality Validation +### 2.1.1 結果 quality の内部可視化 / Internal Result Quality Visibility -result quality については、現時点では visibility-first を維持しつつ、段階的に validator を運用へ入れるのが妥当である。 +result quality については、PR の必須基準にはせず、portal 内の内部管理・可視化用途として visibility-first を維持するのが妥当である。 +外部 contributor や通常 PR に対しては、基本的に `FOM` 値を持つ有効な Result JSON であることを最低限の基準とし、`source_info`、`fom_breakdown`、artifact 参照などの品質項目は改善候補として扱う。 -短期的には、次の 3 段階を推奨する。 +短期的には、次の運用に留める。 -1. report-only: - `scripts/validate_result_quality.py` をローカルおよび upload 前に実行するが、`--fail-on none` を使い、警告は可視化だけに留める。 -2. candidate review: - usage report の `Validator Candidates` と CLI 出力を見ながら、繰り返し出る structural issue を収集し、安定した rule だけを gate 候補へ昇格させる。 -3. selective fail: - `FOM present` や `recognized source_info.source_type` のように解釈の余地が少ないものから、限定的に CI / upload fail へ昇格させる。 +1. portal-only visibility: + quality badge、detail view、usage report の current-state view に限定して品質状態を表示する。 +2. internal candidate review: + usage report の `Improvement Candidates` を見ながら、繰り返し出る structural issue を内部改善キューとして収集する。 +3. 
no PR quality gate: + `source_info` や `fom_breakdown` の不足を通常 PR の blocking rule にしない。必要があれば内部 workflow や staging 相当でのみ観測する。 -逆に、`fom_breakdown present` や artifact 参照のように app 間差分や導入順序の影響を受けやすいものは、すぐに blocking rule にせず、しばらくは warning / candidate のまま保持するのが安全である。 +`fom_breakdown present` や artifact 参照のように app 間差分や導入順序の影響を受けやすいものは、warning / candidate のまま保持するのが安全である。 ## 3. 機能別ギャップ分析 / Function-by-Function Gap Analysis | 機能 | 仕様要求 | 現状実装 | 不足・課題 | 他機能への影響 | 優先度 | |---|---|---|---|---|---| | ベンチマーク実行定義 | アプリごとの build/run/list を保持し、継続実行可能であること | `programs/*` に `build.sh` `run.sh` `list.csv`、一部 `estimate.sh` がある | 追加や修正がまだ人手中心。雛形生成や申請導線がない | 申請・承認・AI 連携の前提になる | 高 | -| CI ジョブ生成 | system と queue 情報を使って CI 実行を生成すること | `matrix_generate.sh` と `job_functions.sh` が実装済み。`add-site.md` に `system.csv` / `queue.csv` / `system_info.csv` の責務分担、接続確認順序、障害切り分け、onboarding checklist も整理された。portal の `/results/usage` では queue 定義抜けや `system_info.csv` 未登録を軽く確認できる | site capability を CI に結び付けた自動 checker や、site ごとの capability summary は未実装 | 拠点追加、予算管理、申請フォームの自動化に影響 | 高 | +| CI ジョブ生成 | system と queue 情報を使って CI 実行を生成すること | `matrix_generate.sh` と `job_functions.sh` が実装済み。`add-site.md` に `system.csv` / `queue.csv` / `system_info.csv` の責務分担、接続確認順序、障害切り分け、onboarding checklist も整理された。portal の `/results/usage` では queue 定義抜けや `system_info.csv` 未登録を軽く確認できる。さらに公開 `system_info.csv` が `system.csv` と `queue.csv` に対応することは `result_server/tests/check_site_config.py` で CI preflight 化済み | app support matrix はアプリごとの準備状況差が大きいため CI failure にせず、site ごとの capability summary も未実装 | 拠点追加、予算管理、申請フォームの自動化に影響 | 高 | | 結果正規化 | `run.sh` 出力を Result JSON に正規化すること | `bk_emit_result`、`bk_emit_section`、`bk_emit_overlap`、`result.sh` が実装済み。portal 側では一覧の quality badge と詳細の `Quality` セクションで `source_info`、`fom_breakdown`、推定入力参照の有無を軽く確認でき、`/results/usage` では最新 result ベースの current-state も見られる | app ごとの差異を体系的に検証する validator や、履歴横断の quality 集計・基準化はまだ弱い | 推定、可視化、AI 診断の入力品質に直結 | 高 | | 性能推定 | Result JSON から Estimate JSON を生成し、可視化可能であること | 
`scripts/estimation/common.sh`、`scripts/estimation/run.sh`、`scripts/result_server/send_estimate.sh`、`estimated` 画面あり。`qws` では `weakscaling` と詳細ダミー推定、section ごとの package 指定、補助データ参照、section-level fallback、requested/applied package 識別、top-level applicability end state、推定元 result と推定結果自体の UUID / timestamp 保持まで動作する | 横展開はまだ `qws` 中心。複数 detailed package の本実装、再推定比較運用、他 app への適用が未完成 | AI 駆動、将来機評価、継続的フィードバックの基盤になる | 最優先 | | 推定結果表示 | Estimate JSON を一覧・詳細で表示できること | `result_server/routes/estimated.py` とテンプレートが実装済み。requested/applied package、applicability、estimate UUID の基本表示に加えて、HTML detail で current / future breakdown、section / overlap 単位の fallback / package applicability まで表示できる。home からの導線も整理済みで、未認証時は login required であることも入口で分かる | compare UI、`not_applicable` の説明補助、複数 estimate 間の差分把握はまだ弱い | 推定運用を本格化すると重要度が上がる | 高 | @@ -295,19 +296,35 @@ Once the estimation specification is clarified, many other design decisions beco 今回のコードベース調査では、性能推定に次ぐ実務上の詰まりどころとして、`result_server` の検証導線が見えた。 -- `result_server/tests` には 26 本の pytest ベースのテストがあり、portal 側はすでに「検証すべき対象」になっている -- その一方で、repo 直下には `pyproject.toml` や `requirements.txt` などの repo-local な依存関係定義が見当たらない -- 調査環境でも `pytest` コマンドは即利用できず、ローカル再現性の入口が弱い -- `.gitlab-ci.yml` は `result_server/**/*` 変更時に重い benchmark pipeline を skip するが、portal 用 lightweight CI は別途まだ用意されていない +- `result_server/tests` には 27 本の pytest ベースのテストがあり、portal 側はすでに「検証すべき対象」になっている +- repo-local な依存関係定義として `requirements-result-server.txt` があり、`result_server/tests/run_result_server_tests.py` が標準 test entrypoint として使える +- portal-oriented 変更向けの lightweight GitHub Actions として `.github/workflows/result-server-tests.yml` が用意されている +- `.gitlab-ci.yml` は直接または手動起動されたGitLab pipelineで `result_server/**/*` や `config/system_info.csv` 変更時に重い benchmark pipeline を skip する。保護ブランチ同期自体は `ci.skip` を使うため、GitHub Actions 側の path filter を portal 周辺の実ファイルに追従させ続ける必要がある -したがって短期的には、性能推定の横展開と並行して、次を固める価値が高い。 +したがって短期的には、性能推定の横展開と並行して、次を維持・拡張する価値が高い。 -1. repo-local な Python 依存関係定義を置く -2. 
`result_server` を対象にした標準 test entrypoint を決める -3. portal-only 変更時に走る lightweight CI を追加する +1. `requirements-result-server.txt` を portal の依存追加に追従させる +2. `result_server/tests/run_result_server_tests.py` を標準 test entrypoint として保つ +3. `.github/workflows/result-server-tests.yml` の path filter を portal-oriented 変更に追従させる これは性能推定の優先度を下げるためではなく、推定結果表示・認証・比較 UI の変更が増えるほど、portal 回帰の検出が重要になるためである。 +#### 5.2.1 CI 関連 GAP 解消タスク / CI Gap Closure Tasks + +CI 関連の残 GAP は、「仕組みを新規に置く」段階から「対象範囲を運用に耐える形へ広げ、古くならないようにする」段階へ移っている。 +短期的な実装・確認は次の状態まで進んでいる。 + +1. `result-server-tests.yml` の path filter は、`result_server/**/*`、`scripts/bk_functions.sh`、`scripts/result.sh`、`scripts/result_server/**`、profile-data shell tests、`config/system.csv`、`config/queue.csv`、`config/system_info.csv`、`requirements-result-server.txt` を対象にする形へ更新済みである。 +2. `.gitlab-ci.yml` の heavy benchmark skip rules と `docs/ci.md` の説明は、root Markdown、`docs/**/*`、`result_server/**/*`、public site config / profile-data helper 周辺の lightweight verification 経路の扱いが一致するよう同期済みである。 +3. 手動 GitLab CI は、`qws` / `MiyabiG` の最小実行で GitLab pipeline 起動から推定まで確認済みである。Pipeline API variables は JSON payload で渡す。 +4. 
protected branch sync は、`ci.skip` により GitLab mirror 更新時に GitLab CI が自動起動しないことを運用上確認済みである。 + +docs-only / portal-only / benchmark-code / CI-config の代表的な変更セットは、`docs/ci.md` の examples として整理済みである。 +公開 `system_info.csv` に載せた system が `system.csv` と `queue.csv` に到達できることは、`result_server/tests/check_site_config.py` による CI preflight として整理済みである。逆に、開発用・非公開用の `system.csv` / `queue.csv` 定義が `system_info.csv` に載っていないことは許容する。 +app support matrix、partial support、app entrypoint 欠落、`list.csv` の未知 system などは、アプリごとの準備状況や導入段階に依存するため、現時点では CI failure ではなく `/results/usage` の visibility として扱う。 + +完了条件は、変更種別ごとの期待 CI 経路が文書化され、path filter と skip rules がその期待に一致し、portal 実装変更が heavy benchmark を起動せず lightweight verification で捕捉されることである。Result JSON quality は portal 内の内部管理として可視化し、通常 PR の blocking rule にはしない。 + ### 5.3 次点: AI 駆動最適化連携 / Next Priority: AI-Driven Optimization AI 駆動最適化連携は、PoC を含む試行錯誤を前提として早めに始めてよい。 @@ -328,8 +345,8 @@ AI 駆動最適化連携は、PoC を含む試行錯誤を前提として早め 1. `estimate.sh` の宣言ブロックと共通補助を整える 2. `bk_run_estimation_data_collection` の共通入口を整える 3. `qws` 以外に 1 から 2 本の app へ推定を横展開する -4. `result_server` 用の lightweight CI と標準 test entrypoint を整える -5. repo-local な Python 依存関係定義とローカル再現手順を整える +4. `result_server` 用の lightweight CI、標準 test entrypoint、依存 manifest を portal 周辺の変更に追従させる +5. ローカル再現手順と CI path filter の対象範囲を定期的に見直す 6. Estimate JSON と portal 表示を section / overlap 詳細まで整える 7. 再推定比較の UI / API を整える 8. 
package metadata discovery を portal や比較導線に活かす diff --git a/docs/guides/add-app.md b/docs/guides/add-app.md index df58446..402a2cb 100644 --- a/docs/guides/add-app.md +++ b/docs/guides/add-app.md @@ -328,7 +328,7 @@ tar -czf ../results/padata0.tgz ./pa ### Fugaku で `fapp` を使う場合 Fugaku 系アプリでは、アプリ側が profiler tool を内部で選び、BenchKit 共通の `bk_profiler` helper に渡す形が扱いやすいです。 -`bk_profiler` は profiler ごとの raw data / postprocess report をまとめて `results/padata*.tgz` に保存し、archive の root に `meta.json` を入れます。BenchKit や推定 package はこの `meta.json` を見て、tool、level、report kind を機械的に判断できます。 +`bk_profiler` は profiler ごとの raw data / postprocess report をまとめて `results/padata*.tgz` に保存し、archive 内の `bk_profiler_artifact/meta.json` に metadata を入れます。BenchKit や推定 package はこの `meta.json` を見て、tool、level、report kind を機械的に判断できます。 `fapp` では共通 level として次を扱います。 @@ -340,7 +340,7 @@ Fugaku 系アプリでは、アプリ側が profiler tool を内部で選び、B `single` は既定で text summary、`simple/standard/detailed` は既定で text + CSV report を保存します。CSV は `fapp` 固有の report として扱い、ほかの profiler が同じ形式を持つ必要はありません。 ```bash -# qws は Fugaku 系 build / run の内部で fapp + single を利用 +# qws は Fugaku 系 build / run の内部で fapp + detailed を利用 bash programs/qws/build.sh Fugaku bash programs/qws/run.sh Fugaku 1 4 12 ``` @@ -365,7 +365,8 @@ bk_profiler_artifact/ meta.json raw/ rep1/ - rep2/ + ... 
+ rep17/ reports/ fapp_A_rep1.txt cpu_pa_rep1.csv @@ -374,6 +375,26 @@ bk_profiler_artifact/ より一般的な profiler helper の設計方針は [Profiler Support Guide](profiler-support.md) を参照してください。 level の早見表と portal 上の見え方は [Profiler Level Reference](profiler-level-reference.md) にまとめています。 +### GPU アプリで `ncu` を使う場合 + +NVIDIA GPU 向けアプリでは、Nsight Compute CLI (`ncu`) を `bk_profiler` 経由で使えます。 +MPI launcher 経由のアプリでは、`bk_profiler ncu` が既定で `--target-processes all` を付け、child process の CUDA kernel も採取対象にします。 +MiyabiG と RC_GH200 のように計算ノード構成が同じ Grace-Hopper GPU 系の場合は、ジョブ投入方式だけを system 設定に任せ、アプリ側の build/run と profiler 採取は共通化するのが自然です。 + +```bash +BK_PROFILER_ARGS="--set full --kernel-name regex:your_kernel" \ +bk_profiler ncu --level single --archive ../results/padata0.tgz --raw-dir ncu -- \ + mpirun -np 1 ./your_gpu_app input.inp +``` + +`ncu` の既定 level は `single` です。最初は採取時間を抑えるため、`single` または `simple` から始めてください。 +raw report は `padata*.tgz` 内の `bk_profiler_artifact/raw/rep1/` に保存され、可能な場合は `bk_profiler_artifact/reports/ncu_import_rep1.txt` に text report が保存されます。 +site の既定 module に `ncu` が含まれない場合は、アプリ側で module を load するか、system 固有の module 変数を用意してください。 +Genesis GH200 参照実装では `GENESIS_MIYABIG_MODULE` / `GENESIS_GH200_MODULE` で module を上書きできます。 +既定の `ncu` が PATH にない場合は warning を出して profiler なしで benchmark 本体を実行しますが、`GENESIS_MIYABIG_PROFILER_TOOL=ncu`、`GENESIS_GH200_PROFILER_TOOL=ncu`、または `GENESIS_PROFILER_TOOL=ncu` を明示した場合は採取不能として失敗します。 +profiler なしを明示する場合は `GENESIS_MIYABIG_PROFILER_TOOL=none`、`GENESIS_GH200_PROFILER_TOOL=none`、または `GENESIS_PROFILER_TOOL=none` を使えます。 +level は `GENESIS_MIYABIG_PROFILER_LEVEL` / `GENESIS_GH200_PROFILER_LEVEL`、または共通の `GENESIS_PROFILER_LEVEL` で上書きできます。 + --- ## 6. ローカルテスト diff --git a/docs/guides/add-site.md b/docs/guides/add-site.md index fb33759..db185ad 100644 --- a/docs/guides/add-site.md +++ b/docs/guides/add-site.md @@ -6,16 +6,17 @@ GitLab Runner と Jacamar-CI をユーザ権限でセットアップし、CI/CD ## 目次 1. [前提条件](#1-前提条件) -2. [ディレクトリ構成](#2-ディレクトリ構成) -3. 
[GitLab Runner のインストール](#3-gitlab-runner-のインストール) -4. [Jacamar-CI のビルド・インストール](#4-jacamar-ci-のビルドインストール) -5. [カスタムランナースクリプトの作成](#5-カスタムランナースクリプトの作成) -6. [ランナーの登録](#6-ランナーの登録) -7. [Jacamar 用ランナーの設定](#7-jacamar-用ランナーの設定) -8. [config.toml の設定](#8-configtoml-の設定) -9. [BenchKit への拠点登録](#9-benchkit-への拠点登録) -10. [ランナーの常駐化(systemd user mode)](#10-ランナーの常駐化systemd-user-mode) -11. [トラブルシューティング](#11-トラブルシューティング) +2. [クイックセットアップ(推奨)](#クイックセットアップ推奨) +3. [ディレクトリ構成](#2-ディレクトリ構成) +4. [GitLab Runner のインストール](#3-gitlab-runner-のインストール) +5. [Jacamar-CI のビルド・インストール](#4-jacamar-ci-のビルドインストール) +6. [カスタムランナースクリプトの作成](#5-カスタムランナースクリプトの作成) +7. [ランナーの登録](#6-ランナーの登録) +8. [Jacamar 用ランナーの設定](#7-jacamar-用ランナーの設定) +9. [config.toml の設定](#8-configtoml-の設定) +10. [BenchKit への拠点登録](#9-benchkit-への拠点登録) +11. [ランナーの常駐化(systemd user mode)](#10-ランナーの常駐化systemd-user-mode) +12. [トラブルシューティング](#11-トラブルシューティング) --- @@ -30,6 +31,88 @@ GitLab Runner と Jacamar-CI をユーザ権限でセットアップし、CI/CD --- +## クイックセットアップ(推奨) + +通常は `scripts/setup_site_runner.sh` を使えば、GitLab Runner の取得、Jacamar-CI のビルド、frontend runner と Jacamar runner の登録、`custom-config.toml` / `config.toml` 相当の設定、systemd user service の作成までまとめて実行できます。 + +`--login-token` と `--jacamar-token` には、GitLab で作成した各 runner の authentication token を指定します。URL は両 runner で共通です。 + +### 実行前の疎通確認 + +セットアップ前に、対象ログインノードから GitLab サーバへ到達できるか確認します。GitLab Runner は GitLab 側から接続されるのではなく、ログインノード上の常駐プロセスが GitLab へ job を取りに行きます。 + +```bash +GITLAB_URL="https://YOUR_GITLAB_SERVER" + +hostname -s +getent hosts "$(printf '%s\n' "$GITLAB_URL" | sed -E 's#^https?://([^/]+).*#\1#')" + +env | grep -Ei '^(http_proxy|https_proxy|HTTP_PROXY|HTTPS_PROXY|no_proxy|NO_PROXY)=' || true +grep -Rihn -i proxy ~/.bashrc ~/.bash_profile ~/.profile /etc/profile /etc/profile.d 2>/dev/null || true + +env -u http_proxy -u https_proxy -u HTTP_PROXY -u HTTPS_PROXY \ + curl -I --connect-timeout 5 "$GITLAB_URL" +``` + +direct 接続が timeout し、サイト側で proxy が指定されている場合は、その proxy で疎通確認します。 + +```bash 
+RUNNER_PROXY="http://PROXY_HOST:PORT" + +curl -I --connect-timeout 5 -x "$RUNNER_PROXY" "$GITLAB_URL" +``` + +proxy 経由でだけ成功する場合は、`setup_site_runner.sh` 実行時に `--proxy "$RUNNER_PROXY"` を指定します。`systemd --user` のサービスはログインシェルの proxy 環境変数を継承しないことがあるため、proxy は runner の systemd unit に明示しておくのが安全です。 + +AMD64 ログインノードの例: + +```bash +curl -fsSL https://raw.githubusercontent.com/RIKEN-RCCS/benchkit/main/scripts/setup_site_runner.sh \ + | bash -s -- \ + --arch amd64 \ + --site your_site \ + --gitlab-url https://YOUR_GITLAB_SERVER \ + --login-token "$LOGIN_RUNNER_TOKEN" \ + --jacamar-token "$JACAMAR_RUNNER_TOKEN" \ + --scheduler pbs \ + --service-host "$(hostname -s)" +``` + +proxy が必要な拠点では、上のコマンドに `--proxy "$RUNNER_PROXY"` を追加します。 + +ARM64 ログインノードでは `--arch arm64` を指定します。 + +よく使う指定: + +- `--site your_site` + - Runner description と、期待する tag 表示に使います +- `--login-tag` / `--jacamar-tag` + - Runner authentication token workflow では tag は GitLab 側で設定します。このオプションはスクリプト末尾の確認表示用です +- `--scheduler pbs|slurm|pjm` + - Jacamar の executor を指定します +- `--jacamar-repo URL` + - Jacamar-CI の clone 元を明示します。省略時は `--scheduler pjm` の場合だけ PJM 対応 fork `https://gitlab.com/yoshifuminakamura/jacamar-ci.git` を使い、それ以外は upstream `https://gitlab.com/ecp-ci/jacamar-ci.git` を使います +- `--base-dir /path/to/gitlab-runner_jacamar-ci_amd` + - 既定は `$HOME/gitlab-runner_jacamar-ci_amd` または `$HOME/gitlab-runner_jacamar-ci_arm` +- `--libseccomp auto|system|local|none` + - 既定は `auto` です。利用可能な system libseccomp があれば使い、なければ gperf と libseccomp をローカルビルドします +- `--with-libseccomp` + - `--libseccomp local` の短縮形です。常に gperf と libseccomp をローカルビルドします +- `--jacamar-pbs-tools tools.go` + - PBS の完了判定にサイト固有パッチが必要な場合に使います +- `--unrestricted-cmd-line` + - Jacamar の `GIT_ASKPASS` credential helper が効かず、`get_sources` で `fatal: unable to get password from user` になる場合の回避策です。runner generated command line に job token が現れる可能性があるため、単一ユーザ運用や `/proc` の閲覧制限がある環境で使ってください +- `--proxy http://PROXY_HOST:PORT` + - runner の systemd user service に `http_proxy` / 
`https_proxy` / `HTTP_PROXY` / `HTTPS_PROXY` を設定します。`http://` または `https://` を省略した場合は `http://` を補います +- `--no-proxy LIST` + - runner の systemd user service に `no_proxy` / `NO_PROXY` を設定します +- `--no-systemd` / `--no-start` + - systemd user service を作らない、または作るだけで起動しない場合に使います + +このスクリプトは `config.toml` の `environment` に `PATH=$BASE_DIR/bin:...` を登録時点で入れるため、アーティファクト保存時に `gitlab-runner` が見つからない問題も避けられます。以下の手動手順は、スクリプトが失敗した場合の切り分けや、サイト固有に調整したい場合の参照として使ってください。 + +--- + ## 2. ディレクトリ構成 ARM系とx86系で共有ボリュームをマウントしている環境では、アーキテクチャ別にディレクトリを分けます。 @@ -140,6 +223,8 @@ rm -rf jacamar-ci go - `Exit_status` の取得: `-H -f` オプションで履歴から抽出(テキスト形式) - ジョブが履歴に残らない場合は正常終了と見なす +AOBA/NQSV のように `qstat -H` が使えない環境では、ジョブが `qstat` から消えた後に終了コードを後追い取得できない場合があります。この場合は `qsub` 直後から `qwait -w exited JOB_ID` で待ち、出力される `exited N` を parse して `N != 0` を GitLab job failure として扱うパッチが必要です。`qwait` 自体の戻り値が 0 でも、ジョブの終了コードは `exited N` 側に入る点に注意してください。 + パッチの適用方法: ```bash git clone https://gitlab.com/ecp-ci/jacamar-ci.git @@ -264,7 +349,7 @@ exit 0 ### `run.sh` - ジョブ実行 ```bash -#!/usr/bin/bash +#!/usr/bin/env bash source ~/.bashrc set -eo pipefail exec "$@" @@ -272,18 +357,23 @@ exec "$@" ### `cleanup.sh` - ジョブ後片付け ```bash -#!/bin/bash -set -e +#!/usr/bin/env bash +set -euo pipefail -LOGFILE="${CUSTOM_DIR}/custom_cleanup.log" +BASE_DIR="/path/to/gitlab-runner_jacamar-ci_amd" # ← 実際のパスに変更 +LOGFILE="${CUSTOM_DIR:-${BASE_DIR}}/custom_cleanup.log" echo "CLEANUP STARTED at $(date)" >> "$LOGFILE" -echo "CUSTOM_ENV_CI_JOB_ID=$CUSTOM_ENV_CI_JOB_ID" >> "$LOGFILE" -BUILD_DIR="${CUSTOM_UNIQUE_BUILD_DIR}" -CACHE_DIR="${CUSTOM_UNIQUE_CACHE_DIR}" +BUILD_DIR="${CUSTOM_UNIQUE_BUILD_DIR:-}" +CACHE_DIR="${CUSTOM_UNIQUE_CACHE_DIR:-}" -[ -n "$BUILD_DIR" ] && [ -d "$BUILD_DIR" ] && rm -rf "$BUILD_DIR" -[ -n "$CACHE_DIR" ] && [ -d "$CACHE_DIR" ] && rm -rf "$CACHE_DIR" +case "$BUILD_DIR" in + "${BASE_DIR}/builds/"*) [[ -d "$BUILD_DIR" ]] && rm -rf -- "$BUILD_DIR" ;; +esac + +case "$CACHE_DIR" in + "${BASE_DIR}/cache/"*) [[ -d "$CACHE_DIR" ]] && rm 
-rf -- "$CACHE_DIR" ;; +esac echo "CLEANUP DONE at $(date)" >> "$LOGFILE" ``` @@ -309,12 +399,9 @@ chmod +x "$BASE_DIR"/{config,prepare,run,cleanup}.sh "$BASE_DIR/bin/gitlab-runner" register \ --non-interactive \ --url "https://YOUR_GITLAB_SERVER" \ - --registration-token "YOUR_TOKEN" \ + --token "YOUR_TOKEN" \ --executor "custom" \ --description "site-login" \ - --tag-list "your_site_login" \ - --run-untagged="false" \ - --locked="false" \ --builds-dir "$BASE_DIR/builds" \ --cache-dir "$BASE_DIR/cache" \ --config "$BASE_DIR/config.toml" \ @@ -329,12 +416,9 @@ chmod +x "$BASE_DIR"/{config,prepare,run,cleanup}.sh "$BASE_DIR/bin/gitlab-runner" register \ --non-interactive \ --url "https://YOUR_GITLAB_SERVER" \ - --registration-token "YOUR_TOKEN" \ + --token "YOUR_TOKEN" \ --executor "custom" \ --description "site-jacamar" \ - --tag-list "your_site_jacamar" \ - --run-untagged="false" \ - --locked="false" \ --builds-dir "$BASE_DIR/builds" \ --cache-dir "$BASE_DIR/cache" \ --config "$BASE_DIR/config.toml" \ @@ -345,6 +429,7 @@ chmod +x "$BASE_DIR"/{config,prepare,run,cleanup}.sh ``` > **Note**: Jacamar 用ランナーの `--custom-*-exec` は登録時のプレースホルダです。実際の引数は `config.toml` で設定します(次セクション参照)。 +> **Note**: Runner authentication token を使う GitLab Runner 18 系の workflow では、tag、locked、run-untagged などは GitLab server 側で設定します。register コマンドに `--tag-list` や `--locked` を渡すと失敗します。 --- @@ -439,7 +524,9 @@ queue,submit_cmd,template PBS_NewSystem,qsub,"-q ${queue_group} -l select=${nodes} -l walltime=${elapse} -W group_list=your_group" ``` -テンプレート内で使える変数:`${queue_group}`, `${nodes}`, `${numproc_node}`, `${nthreads}`, `${elapse}` +テンプレート内で使える変数:`${queue_group}`, `${nodes}`, `${numproc_node}`, `${nthreads}`, `${elapse}`, `${proc}`(`nodes * numproc_node`), `${cpu_per_node}`, `${gpu_per_node}`, `${cpu_sockets}`(`nodes * cpu_per_node`), `${gpu_cards}`(`nodes * gpu_per_node`) + +`${cpu_per_node}` と `${gpu_per_node}` は `config/system_info.csv` から取得します。CPU socket 数や GPU card 数を scheduler 
に明示するサイトでは、`system_info.csv` の値も投入条件に使われます。 ### `config/system_info.csv` に表示用メタデータを追加 @@ -547,7 +634,8 @@ mkdir -p ~/.config/systemd/user [Unit] Description=GitLab Runner service (user mode, amd64) After=network.target -ConditionHost=your-login-node # ← 実際のホスト名に変更 +# 実際のホスト名に変更 +ConditionHost=your-login-node [Service] ExecStart=%h/gitlab-runner_jacamar-ci_amd/bin/gitlab-runner run --config %h/gitlab-runner_jacamar-ci_amd/config.toml --working-directory %h @@ -565,7 +653,8 @@ WantedBy=default.target [Unit] Description=GitLab Runner service (user mode, arm64) After=network.target -ConditionHost=your-arm-login-node # ← 実際のホスト名に変更 +# 実際のホスト名に変更 +ConditionHost=your-arm-login-node [Service] ExecStart=%h/gitlab-runner_jacamar-ci_arm/bin/gitlab-runner run --config %h/gitlab-runner_jacamar-ci_arm/config.toml --working-directory %h @@ -578,7 +667,7 @@ StandardError=append:%h/gitlab-runner_jacamar-ci_arm/gitlab-runner.err WantedBy=default.target ``` -`ConditionHost=` を設定することで、同じホームディレクトリを複数ノードで共有していても、指定したホストでのみサービスが起動します。 +`ConditionHost=` を設定することで、同じホームディレクトリを複数ノードで共有していても、指定したホストでのみサービスが起動します。systemd の unit file では行末コメントを値として解釈するので、コメントは別行に置いてください。 ### サービスの有効化・起動 @@ -629,7 +718,125 @@ environment = ["PATH=/path/to/gitlab-runner_jacamar-ci_amd/bin:..."] ### ランナーが GitLab に接続できない - ログインノードから GitLab サーバへの HTTPS 通信が可能か確認 -- プロキシ設定が必要な場合は `config.toml` の `environment` に `https_proxy` を追加 +- プロキシ設定が必要な場合は `setup_site_runner.sh --proxy` で systemd user service に proxy を明示 + +ログインシェルでは `curl -I https://gitlab.swc.r-ccs.riken.jp` が成功するのに、常駐ランナーが +`Checking for jobs... 
failed` や `lookup gitlab.swc.r-ccs.riken.jp on [::1]:53` で失敗する場合は、 +`systemd --user` のサービスがログインシェルの proxy 環境変数を継承していない可能性があります。 + +```bash +env | grep -Ei 'proxy|http|https|no_proxy' +curl -I https://gitlab.swc.r-ccs.riken.jp +systemctl --user show gitlab-runner-YOUR_SITE-amd.service -p Environment +``` + +`systemctl --user show` の `Environment=` が空なら、サービスに proxy を明示します。 + +```bash +systemctl --user edit gitlab-runner-YOUR_SITE-amd.service +``` + +```ini +[Service] +Environment="http_proxy=http://PROXY_HOST:PORT" +Environment="https_proxy=http://PROXY_HOST:PORT" +Environment="HTTP_PROXY=http://PROXY_HOST:PORT" +Environment="HTTPS_PROXY=http://PROXY_HOST:PORT" +``` + +```bash +systemctl --user daemon-reload +systemctl --user restart gitlab-runner-YOUR_SITE-amd.service +systemctl --user show gitlab-runner-YOUR_SITE-amd.service -p Environment +``` + +### 計算ノードに `git` がない場合 + +一部の計算ノードでは、ログインノードやフロントエンドランナーでは `git` が使えても、バッチジョブ内では `git: コマンドが見つかりません` になることがあります。アプリの `run.sh` が実行時に外部ソースを clone する場合は、計算ノード側でも `git` 相当のコマンドが必要です。 + +Singularity/Apptainer が計算ノードで使える場合は、共有ファイルシステム上に `git` 入りコンテナと wrapper を置く方法が有効です。 + +```bash +BASE=/uhome/YOUR_USER/gitlab-runner_jacamar-ci_amd +SING=/path/to/singularity + +mkdir -p "$BASE/containers" "$BASE/bin" +"$SING" build --sandbox "$BASE/containers/git" docker://alpine/git:latest +``` + +`$BASE/bin/git`: + +```bash +#!/bin/bash +set -e + +# GitLab Runner の get_sources はログインノード上の認証 helper を使うため、 +# ホストの git がある場合はそちらへ委譲する。 +if [[ -x /usr/bin/git ]]; then + exec /usr/bin/git "$@" +fi + +SING=/path/to/singularity +IMG=/uhome/YOUR_USER/gitlab-runner_jacamar-ci_amd/containers/git + +exec "$SING" exec \ + --bind /mnt:/mnt \ + --bind /uhome:/uhome \ + --pwd "$PWD" \ + "$IMG" \ + git "$@" +``` + +```bash +chmod +x "$BASE/bin/git" +``` + +Jacamar 用ランナーの `config.toml` では wrapper のある `bin` を `PATH` に入れます。ただし `get_sources` までコンテナ内 `git` に置き換えると、GitLab Runner/Jacamar が生成する credential helper をコンテナ内から実行できず、`fatal: cannot exec .../pass` で失敗することがあります。上記のように、ログインノードでは `/usr/bin/git` に委譲し、計算ノードでだけコンテナ内 
`git` を使う wrapper にしてください。 + +確認: + +```bash +git --version +git ls-remote https://github.com/RIKEN-LQCD/qws.git HEAD +``` + +### 計算ノードから外部 proxy に届かない場合 + +`queue.csv` で `qsub -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY` を指定していても、計算ノードからその proxy に TCP 接続できるとは限りません。例えば `git` や `curl` が `Trying PROXY_HOST...` のまま進まず、ジョブが経過時間超過で kill される場合は、proxy 変数ではなくネットワーク到達性を疑います。 + +計算ノードで確認: + +```bash +hostname -I +ip route +cat /etc/resolv.conf 2>/dev/null || true +cat /etc/hosts 2>/dev/null || true +env | sort | grep -Ei '^(http_proxy|https_proxy|HTTP_PROXY|HTTPS_PROXY|no_proxy|NO_PROXY)=' || true + +timeout 5 bash -c '_*_run) + cache=/uhome/YOUR_USER/gitlab-runner_jacamar-ci_amd/site-cache/qws + if [ ! -d qws ]; then + echo "[site pre_build] copying qws source from $cache" + cp -a "$cache" qws + fi + test -d qws || { echo "[site pre_build] qws source is missing" >&2; exit 1; } + ;; +esac +""" +``` ### ARM/x86 混在環境での注意 同じ共有ボリュームを異なるアーキテクチャのマシンがマウントしている場合、必ずアーキテクチャ別のディレクトリ(`_amd` / `_arm`)を使い分けてください。バイナリの混在はランタイムエラーの原因になります。 @@ -658,10 +865,14 @@ qstat -f # ジョブ履歴の確認 qstat -H -f + +# NQSV で qstat -H がない場合は、投入直後に qwait で終了コードを確認 +qwait -w exited JOB_ID ``` **解決方法:** JSON形式の `qstat` がサポートされていない場合は、セクション4「サイト固有パッチについて」に記載の `tools.go` パッチを適用してください。 +NQSV のように `qstat -H` がない場合は、`qwait -w exited JOB_ID` の出力を使って終了コードを返す `tools.go` パッチを適用してください。 ### NFS同期でファイルが見えない diff --git a/docs/guides/developer-reference.md b/docs/guides/developer-reference.md index 5058cbc..1277f89 100644 --- a/docs/guides/developer-reference.md +++ b/docs/guides/developer-reference.md @@ -197,7 +197,7 @@ Typical requirements include: - Bash and standard shell tooling - GitLab CI runner support - site-specific scheduler/runtime support -- Python for result shaping, estimation support, and portal components +- Python 3.12 or later for result shaping, estimation support, and portal components - Flask-related Python packages for `result_server` - optional profiler tools depending on system support @@ -208,43 +208,26 @@ For local 
portal work, see the route, template, and utility layout under `result For the lightweight `result_server` verification path: - install dependencies with `python -m pip install -r requirements-result-server.txt` -- run the portal test suite with `python scripts/test_result_server.py` +- run the portal test suite with `python result_server/tests/run_result_server_tests.py` - CI coverage for portal-only changes is provided by `.github/workflows/result-server-tests.yml` -### Result Quality Validation +For production portal deployments: -BenchKit currently keeps result-quality validation visibility-first. +- Set `FLASK_SECRET_KEY` to a strong secret and run `result_server/app.py`, not `app_dev.py`. +- `app.py` binds to `127.0.0.1:8800` by default; set `RESULT_SERVER_HOST` and `RESULT_SERVER_PORT` explicitly when the deployment requires a different bind address. +- Set runner-scoped ingest keys with `RESULT_SERVER_KEYS=runner-a:key-a,runner-b:key-b`. +- The legacy `RESULT_SERVER_KEY` variable is still accepted as runner `default` for compatibility, but should be rotated to `RESULT_SERVER_KEYS`. +- `REDIS_URL` must point to a monitored Redis instance; production authentication refuses login when Redis is unavailable. +- `app_dev.py` is localhost-only, uses ephemeral development secrets when none are provided, and enables the Werkzeug debugger only with `RESULT_SERVER_DEV_DEBUG=1`. -- run `python scripts/validate_result_quality.py ` -- use `--format json` for machine-readable output -- use `--fail-on warning` or `--fail-on candidate` only when you explicitly want non-zero exit behavior +### Result Quality Visibility -This validator is intended as a lightweight pre-CI gate and a future stepping stone toward stricter result validation. +BenchKit keeps result-quality scoring inside the portal. Normal pull requests should not be blocked on quality scoring beyond producing valid result JSON with a FOM value. 
-Internally, the validator can also read `config/result_quality_policy.json` to assign app-specific tiers such as `strict`, `standard`, and `relaxed`. -This policy is currently intended for internal operation, and it does not need to be reflected in public UI or external contracts. +Portal quality visibility currently lives in: -If you want to manage tier overrides locally without editing the JSON file, the validator can also read a Redis hash. +- result list quality badges +- result detail quality rows +- `/results/usage` current-state quality summaries -- set `BK_RESULT_QUALITY_REDIS_URL=redis://localhost:6379/0` -- optionally set `BK_RESULT_QUALITY_REDIS_KEY=benchkit:result_quality:app_tiers` -- store entries such as `qws -> strict` or `genesis -> relaxed` in that hash - -Redis overrides are merged on top of `config/result_quality_policy.json` and are intended for internal/local control only. - -### Result Upload Hook - -`scripts/result_server/send_results.sh` can invoke the validator before upload. - -- set `BK_RESULT_QUALITY_VALIDATE=true` to enable the pre-upload validator -- leave `BK_RESULT_QUALITY_FAIL_ON=none` for report-only execution -- use `BK_RESULT_QUALITY_FAIL_ON=warning` or `BK_RESULT_QUALITY_FAIL_ON=candidate` only when you intentionally want upload-time failure behavior -- use `BK_RESULT_QUALITY_FAIL_ON=policy` only for internal experiments with app-specific quality tiers - -Recommended rollout: - -- phase 1: `BK_RESULT_QUALITY_VALIDATE=true`, `BK_RESULT_QUALITY_FAIL_ON=none` -- phase 2: keep CI/report-only, but review repeated validator candidates in usage reports -- phase 3: promote a small subset of stable rules to `candidate`-based failure - -If you experiment with app-specific tiers, prefer starting with `--fail-on policy` only in private or staging-like internal workflows. +Treat missing `source_info`, `fom_breakdown`, or artifact references as internal improvement candidates, not upload-time or pull-request gates. 
diff --git a/docs/guides/profiler-level-reference.md b/docs/guides/profiler-level-reference.md index 06e2abc..32add7d 100644 --- a/docs/guides/profiler-level-reference.md +++ b/docs/guides/profiler-level-reference.md @@ -5,15 +5,16 @@ This note complements `bk_profiler` and focuses on the shared level names used b ## Shared Levels - `single` - - one measurement run + - minimal profile coverage - `simple` - - five measurement runs + - lightweight profile coverage - `standard` - - eleven measurement runs + - standard profile coverage - `detailed` - - seventeen measurement runs + - detailed profile coverage These names are BenchKit-level presets. Each profiler adapter defines the concrete behavior behind them. +For example, `fapp` maps these levels to multiple event-set runs, while `ncu` maps them to Nsight Compute options such as section set, launch count, and NVTX filtering. ## Current `fapp` Mapping @@ -39,9 +40,23 @@ Default report behavior for `fapp` is: Here `both` means text summaries plus CSV reports. +## Current `ncu` Mapping + +- `single` + - `--set basic --launch-count 1` +- `simple` + - `--set basic --launch-count 5` +- `standard` + - `--set full --launch-count 1` +- `detailed` + - `--set full --nvtx` + +Default report behavior for `ncu` is `text`. +BenchKit stores the Nsight Compute raw report under `bk_profiler_artifact/raw/rep1/` and, when import succeeds, a text details page under `bk_profiler_artifact/reports/ncu_import_rep1.txt`. + ## Portal Summary -BenchKit stores profiler metadata in `meta.json` inside `padata.tgz`, and also copies a compact summary into `result.json` as `profile_data`. +BenchKit stores profiler metadata in `bk_profiler_artifact/meta.json` inside `padata.tgz`, and also copies a compact summary into `result.json` as `profile_data`. This makes it possible to inspect profiler coverage without downloading the archive first. 
@@ -49,7 +64,7 @@ This makes it possible to inspect profiler coverage without downloading the arch - `Profiler` shows `tool / level` - the secondary line shows `report_format` and run count - Result detail - - `PA Data Summary` shows tool-specific events, explicit events, and report kinds + - `PA Data Summary` shows tool-specific details, explicit events when applicable, NCU options when present, and report kinds ## Why This Helps diff --git a/docs/guides/profiler-support.md b/docs/guides/profiler-support.md index 220f81e..4c97f67 100644 --- a/docs/guides/profiler-support.md +++ b/docs/guides/profiler-support.md @@ -55,7 +55,7 @@ bk_profiler [options] -- `single/simple/standard/detailed` は BenchKit の共通語彙として扱う。 ただし、その具体的意味は profiler tool ごとに adapter が定義する。 -このため、ある tool では 4 段階すべてを持ってもよいし、別の tool では 1 段階だけでもよい。 +このため、ある tool では複数の測定 run に対応し、別の tool では単一 run の profiler option や採取範囲に対応してよい。 ## 4. `fapp` の level 定義 @@ -76,7 +76,22 @@ bk_profiler [options] -- ここでいう CSV は `fapp` 固有の CPU performance analysis report を指す。 BenchKit は「CSV があること」を共通必須にはしない。 -## 5. Archive の考え方 +## 5. `ncu` の level 定義 + +`ncu` では現在、次の対応を採る。 + +- `single` → `--set basic --launch-count 1` +- `simple` → `--set basic --launch-count 5` +- `standard` → `--set full --launch-count 1` +- `detailed` → `--set full --nvtx` + +既定の report format は `text` とする。 +raw report は archive 内の `bk_profiler_artifact/raw/rep1/profile*.ncu-rep` または Nsight Compute の出力形式に従う report file として保存し、可能な場合は `ncu --import ... --page details` の出力を `bk_profiler_artifact/reports/ncu_import_rep1.txt` に保存する。 + +MPI launcher 経由の GPU application では、既定で `--target-processes all` を付けて child process も採取対象にする。 +追加の kernel filter、section set、NVTX filter などは `BK_PROFILER_ARGS` で `ncu` に渡す。 + +## 6. Archive の考え方 `bk_profiler` は archive の中に少なくとも次を置く。 @@ -104,7 +119,7 @@ bk_profiler_artifact/ cpu_pa_rep2.csv ``` -## 6. `meta.json` の役割 +## 7. 
`meta.json` の役割 `meta.json` は、archive の内容を BenchKit や推定 package が機械的に判断するための最小 metadata とする。 @@ -138,7 +153,7 @@ bk_profiler_artifact/ を見て、その artifact が適用可能かどうかを判断できる。 -## 7. アプリ側の責務 +## 8. アプリ側の責務 アプリ側は profiler helper を直接一般化しすぎず、次だけを持てばよい。 @@ -150,11 +165,34 @@ bk_profiler_artifact/ 例として `qws` では、 - Fugaku 系 build で `profiler=fapp` を渡す -- Fugaku 系 run で `bk_profiler fapp --level single -- ...` を呼ぶ +- Fugaku 系 run で `bk_profiler fapp --level detailed -- ...` を呼ぶ だけを持つ。 -## 8. 今は固定しないこと +`genesis` では、MiyabiG と RC_GH200 を同じ Grace-Hopper GPU 系の計算ノードとして扱い、GPU build / run に対して、 + +- build で `--enable-gpu`、`--enable-openmp`、`--with-gpuarch=sm_90` を指定する +- MiyabiG の既定 build では外部 LAPACK を要求せず、必要な場合だけ `GENESIS_MIYABIG_LAPACK_LIBS` で有効化する +- `.fpp` 前処理では GENESIS の traditional cpp flags を保持しつつ、GPU/single/MPI/OpenMP/FFTE の define を `PPFLAGS` 経由で明示する +- CUDA 12.9 以降向けに `src/spdyn/gpu_sp_energy.cu` の `nvToolsExt.h` include を `nvtx3/nvToolsExt.h` に補正する +- `mpif90` の実体が `nvfortran` の環境向けに、GENESIS の compiler 判定を NVHPC/PGI 系として補正する +- GENESIS の古い PGI flag (`-Mcuda` など) は `configure.ac` 側で NVHPC 25.x/aarch64 向けの `-cuda -gpu=cc90` へ補正する +- NVHPC 25.x では古い PGI pinned-array 経路の `PGICUDA` define を外し、GPU kernel 用の domain fields を保持する +- run では、`ncu` が PATH にある場合に `bk_profiler ncu --level single -- ...` を呼ぶ + +形を参照実装とする。ジョブ投入方式は MiyabiG が PBS、RC_GH200 が SLURM で異なるが、アプリ側の実行方法と profiler 採取方法は共通化する。 + +CUDA prefix、compiler wrapper、module、profiler tool は site 側の module 構成に合わせて上書きできる。 + +- build/run 共通の module: `GENESIS_MIYABIG_MODULE`, `GENESIS_GH200_MODULE` +- build 時の CUDA/compiler/config: `GENESIS_MIYABIG_CUDA_PATH`, `GENESIS_MIYABIG_FC`, `GENESIS_MIYABIG_CC`, `GENESIS_MIYABIG_CONFIG_ARGS` +- run 時の profiler: `GENESIS_MIYABIG_PROFILER_TOOL`, `GENESIS_GH200_PROFILER_TOOL`, `GENESIS_MIYABIG_PROFILER_LEVEL`, `GENESIS_GH200_PROFILER_LEVEL`, または共通の `GENESIS_PROFILER_TOOL` / `GENESIS_PROFILER_LEVEL` + +Genesis GH200 run の profiler 既定値は `ncu` だが、これは暗黙の既定値としてだけ扱う。`ncu` が PATH にない環境では warning を出して 
profiler なしで benchmark 本体を実行する。 +一方、`GENESIS_PROFILER_TOOL=ncu` または system 固有の `GENESIS_*_PROFILER_TOOL=ncu` を明示した場合は、`ncu` が見つからなければ失敗させる。 +profiler なしを明示したい場合は、`GENESIS_PROFILER_TOOL=none` または system 固有の `GENESIS_*_PROFILER_TOOL=none` を指定する。 + +## 9. 今は固定しないこと 現時点では、次は固定しない。 diff --git a/programs/genesis/build.sh b/programs/genesis/build.sh index a87c4f1..b398880 100644 --- a/programs/genesis/build.sh +++ b/programs/genesis/build.sh @@ -4,8 +4,8 @@ set -x system="$1" REPO_DIR="genesis" -REPO_URL="https://github.com/genesis-release-r-ccs/${REPO_DIR}.git" -BRANCH="main" +REPO_URL="${GENESIS_REPO_URL:-https://github.com/genesis-release-r-ccs/${REPO_DIR}.git}" +BRANCH="${GENESIS_BRANCH:-main}" echo "[${REPO_DIR}] Building on system: $system" mkdir -p artifacts @@ -18,6 +18,242 @@ cd ${REPO_DIR} || { exit 1 } +# Append flags without losing values that a site module or CI variable already +# set before this build script runs. +append_env_flags() { + local var_name="$1" + local new_flags="$2" + local current_flags="${!var_name:-}" + + if [ -n "$new_flags" ]; then + if [ -n "$current_flags" ]; then + export "${var_name}=${current_flags} ${new_flags}" + else + export "${var_name}=${new_flags}" + fi + fi +} + +# Resolve the CUDA root for NVHPC/Grace-Hopper builds. NVHPC often puts nvcc in +# a compiler bin directory while libcudart lives in a sibling cuda directory, so +# command -v nvcc alone is not enough for GENESIS configure. +detect_cuda_path() { + local nvcc_path="" + local nvcc_prefix="" + local nvhpc_root="" + local cuda_candidate="" + + if [ -n "${CUDA_HOME:-}" ]; then + printf '%s\n' "$CUDA_HOME" + return 0 + fi + if [ -n "${CUDA_PATH:-}" ]; then + printf '%s\n' "$CUDA_PATH" + return 0 + fi + if ! 
nvcc_path=$(command -v nvcc 2>/dev/null); then + return 1 + fi + + nvcc_prefix=$(cd "$(dirname "$(dirname "$nvcc_path")")" && pwd) + nvhpc_root=$(cd "$(dirname "$nvcc_prefix")" && pwd) + for cuda_candidate in "${nvhpc_root}"/cuda/* "${nvhpc_root}"/cuda; do + if [ -f "${cuda_candidate}/lib64/libcudart.so" ] || [ -f "${cuda_candidate}/targets/sbsa-linux/lib/libcudart.so" ]; then + printf '%s\n' "$cuda_candidate" + return 0 + fi + done + + if [ -d "${nvcc_prefix}/include" ]; then + printf '%s\n' "$nvcc_prefix" + return 0 + fi + + return 1 +} + +# Export CUDA include/library flags in the forms GENESIS configure currently +# checks. The sbsa-linux paths are needed by GH200-style NVHPC installations. +configure_cuda_environment() { + local cuda_prefix="$1" + local cuda_arch="$2" + local incflags="" + local ldflags="" + local inc_dir="" + local lib_dir="" + local cudart_lib="" + + [ -n "$cuda_prefix" ] || return 0 + + export CUDA_HOME="$cuda_prefix" + export CUDA_PATH="$cuda_prefix" + + for inc_dir in \ + "${cuda_prefix}/include" \ + "${cuda_prefix}/targets/sbsa-linux/include" \ + "${cuda_prefix}/targets/sbsa-linux/include/nvtx3"; do + if [ -d "$inc_dir" ]; then + incflags="${incflags:+${incflags} }-I${inc_dir}" + fi + done + + for lib_dir in \ + "${cuda_prefix}/targets/sbsa-linux/lib" \ + "${cuda_prefix}/lib64"; do + if [ -d "$lib_dir" ]; then + ldflags="${ldflags:+${ldflags} }-L${lib_dir}" + fi + done + + for cudart_lib in \ + "${cuda_prefix}/targets/sbsa-linux/lib/libcudart.so" \ + "${cuda_prefix}/lib64/libcudart.so"; do + if [ -f "$cudart_lib" ]; then + export GENESIS_CUDART_LIB="$cudart_lib" + break + fi + done + + append_env_flags CPPFLAGS "$incflags" + append_env_flags NVCCFLAG "$incflags" + append_env_flags LDFLAGS "$ldflags" + + if [ "$cuda_arch" = "90" ] || [ "$cuda_arch" = "sm_90" ]; then + append_env_flags NVCCFLAG '--generate-code=arch=compute_90,code="sm_90,compute_90"' + fi +} + +# GENESIS still includes the legacy NVTX header. 
Newer CUDA/NVHPC stacks install +# the compatibility header below nvtx3, so patch the checked-out source locally. +apply_genesis_nvtx_include_patch() { + local target="src/spdyn/gpu_sp_energy.cu" + + if [ -f "$target" ] && grep -q 'nvToolsExt.h' "$target" && ! grep -q 'nvtx3/nvToolsExt.h' "$target"; then + sed -i -e 's|nvToolsExt.h|nvtx3/nvToolsExt.h|g' "$target" + fi +} + +# The upstream configure.ac recognizes pgfortran but not nvfortran. Treating +# nvfortran as the same compiler family keeps the rest of GENESIS' PGI/NVHPC +# configuration path active without carrying a fork of the source tree. +apply_genesis_nvfortran_configure_patch() { + if [ ! -f configure.ac ] || grep -q 'x"${vtok}" = x"nvfortran"' configure.ac; then + return 0 + fi + + perl -0pi -e 's/(elif test x"\$\{vtok\}" = x"pgfortran"; then\s+FC_ACT="pgf90"\s+break)/elif test x"\${vtok}" = x"nvfortran"; then\nFC_ACT="pgf90"\nbreak\n\1/' configure.ac + if ! grep -q 'x"${vtok}" = x"nvfortran"' configure.ac; then + echo "Failed to patch configure.ac for nvfortran detection" >&2 + exit 1 + fi +} + +# Replace obsolete PGI CUDA flags with NVHPC flags and remove options that fail +# on aarch64 Grace-Hopper nodes. This keeps the patch local to the CI checkout. +apply_genesis_nvhpc_configure_flags_patch() { + if [ ! 
-f configure.ac ]; then + return 0 + fi + + GENESIS_NVHPC_GPU_FLAGS="${GENESIS_NVHPC_GPU_FLAGS:--cuda -gpu=cc90}" \ + perl -0pi -e ' + my $cudart_lib = $ENV{"GENESIS_CUDART_LIB"}; + my $gpu_flags = $ENV{"GENESIS_NVHPC_GPU_FLAGS"}; + if ($cudart_lib) { + s/-L\$\{cuda_lib_path\} -lcudart/$cudart_lib/g; + } + s/-Mcuda/$gpu_flags/g; + s/[[:space:]]+-Msmartalloc=huge//g; + s/[[:space:]]+-Mipa=fast,inline//g; + s/[[:space:]]+-fastsse//g; + s/[[:space:]]+-pc 64//g; + s/[[:space:]]+-mcmodel=medium//g; + s/\n[[:space:]]*AC_DEFINE\(PGICUDA, 1, \[defined if pgi and cuda are used\.\]\)//g; + s/\n[[:space:]]*DEFINED_VARIABLES\+=" -DPGICUDA"//g; + ' configure.ac + if grep -q 'PGICUDA' configure.ac; then + echo "Failed to patch configure.ac for NVHPC PGICUDA handling" >&2 + exit 1 + fi +} + +# GENESIS releases vary between shipping bootstrap and relying on autoreconf. +bootstrap_genesis() { + if [ -x ./bootstrap ]; then + bash ./bootstrap + else + autoreconf -i + fi +} + +# Shared configuration for GH200-class systems. env_prefix lets each site +# override modules, compilers, CUDA path, GPU arch, and configure args without +# duplicating the whole build block for MiyabiG/RC_GH200. 
+configure_genesis_gh200_gpu() { + local system_name="$1" + local env_prefix="$2" + local default_module="$3" + local module_var="${env_prefix}_MODULE" + local fc_var="${env_prefix}_FC" + local cc_var="${env_prefix}_CC" + local cxx_var="${env_prefix}_CXX" + local f77_var="${env_prefix}_F77" + local config_args_var="${env_prefix}_CONFIG_ARGS" + local gpu_arch_var="${env_prefix}_GPU_ARCH" + local cuda_path_var="${env_prefix}_CUDA_PATH" + local lapack_libs_var="${env_prefix}_LAPACK_LIBS" + local ppflags_var="${env_prefix}_PPFLAGS" + local default_ppflags="-traditional-cpp -traditional -D_SINGLE -DHAVE_MPI_GENESIS -DOMP -DFFTE -DUSE_GPU" + local gpu_arch_value="${!gpu_arch_var:-sm_90}" + local cuda_arch_number="${gpu_arch_value#sm_}" + local gpu_arch="sm_${cuda_arch_number}" + local cuda_prefix="" + + local module_name="${!module_var:-$default_module}" + if [ "$module_name" != "none" ] && command -v module >/dev/null 2>&1; then + read -r -a module_names <<< "$module_name" + module load "${module_names[@]}" + fi + + # GENESIS configure probes compiler versions through this variable. + version="--version" + FC="${!fc_var:-mpif90}" + CC="${!cc_var:-mpicc}" + CXX="${!cxx_var:-mpicxx}" + F77="${!f77_var:-mpif77}" + + cuda_prefix="${!cuda_path_var:-}" + if [ -z "$cuda_prefix" ]; then + cuda_prefix=$(detect_cuda_path || true) + fi + configure_cuda_environment "$cuda_prefix" "$cuda_arch_number" + export GENESIS_NVHPC_GPU_FLAGS="${GENESIS_NVHPC_GPU_FLAGS:--cuda -gpu=cc${cuda_arch_number}}" + + # Site-specific CONFIG_ARGS is a full replacement. Otherwise use a portable + # single-precision MPI/OpenMP GPU configuration and add CUDA/LAPACK only when + # the corresponding site paths are available. 
+ if [ -n "${!config_args_var:-}" ]; then + read -r -a CONFIG_ARGS <<< "${!config_args_var}" + else + CONFIG_ARGS=(--enable-single --with-simd=auto --enable-mpi --without-lapack --enable-gpu --enable-openmp "--with-gpuarch=${gpu_arch}") + if [ -n "$cuda_prefix" ]; then + CONFIG_ARGS+=("--with-cuda=${cuda_prefix}") + fi + if [ -n "${!lapack_libs_var:-}" ]; then + export LAPACK_LIBS="${!lapack_libs_var}" + CONFIG_ARGS=("${CONFIG_ARGS[@]/--without-lapack/--with-lapack}") + CONFIG_ARGS+=("LAPACK_LIBS=${!lapack_libs_var}") + fi + fi + + append_env_flags PPFLAGS "${!ppflags_var:-$default_ppflags}" + + apply_genesis_nvtx_include_patch + apply_genesis_nvfortran_configure_patch + apply_genesis_nvhpc_configure_flags_patch + echo "Configured ${system_name} as Grace-Hopper GPU build" +} + case "$system" in Fugaku) comp=frtpx @@ -38,15 +274,44 @@ case "$system" in LAPACK_LIBS="-L/vol0004/apps/oss/spack-v0.21/opt/spack/linux-rhel8-cascadelake/gcc-13.2.0/openblas-0.3.24-on6q3arf3iucukiz4tfai26noq3kz4a7/lib/ -lopenblas" CONFIG_ARGS=(--enable-mixed "LAPACK_LIBS=$LAPACK_LIBS") ;; + + MiyabiG) + configure_genesis_gh200_gpu "$system" GENESIS_MIYABIG none + ;; + + RC_GH200) + configure_genesis_gh200_gpu "$system" GENESIS_GH200 "system/qc-gh200 nvhpc/25.9" + ;; + + *) + echo "Unknown system: $system" + exit 1 + ;; esac echo "FC=$FC" echo "CC=$CC" +echo "CXX=${CXX:-}" +echo "F77=${F77:-}" echo "configure args: ${CONFIG_ARGS[@]}" -autoreconf -i -./configure CC="$CC" FC="$FC" "${CONFIG_ARGS[@]}" -make -j > make.log 2>&1 +bootstrap_genesis +configure_env=(CC="$CC" FC="$FC") +if [ -n "${CXX:-}" ]; then + configure_env+=(CXX="$CXX") +fi +if [ -n "${F77:-}" ]; then + configure_env+=(F77="$F77") +fi +./configure "${configure_env[@]}" "${CONFIG_ARGS[@]}" +apply_genesis_nvtx_include_patch +if ! make -j > make.log 2>&1; then + echo "make failed. Error-like lines from make.log:" >&2 + grep -n -i -E 'error|fatal|undefined reference|no such file|cannot|failed|unknown switch|unsupported|stop\.' 
make.log | tail -n 200 >&2 || true + echo "make failed. Last 1000 lines of make.log:" >&2 + tail -n 1000 make.log >&2 || true + exit 1 +fi make install cp "bin/spdyn" "../artifacts/" echo "done." diff --git a/programs/genesis/list.csv b/programs/genesis/list.csv index e7f19bf..d25424d 100644 --- a/programs/genesis/list.csv +++ b/programs/genesis/list.csv @@ -2,3 +2,5 @@ system,enable,nodes,numproc_node,nthreads,elapse Fugaku,yes,2,4,12,0:10:00 FugakuLN,no,1,8,2,0:10:00 FugakuCN,no,1,8,2,0:10:00 +MiyabiG,yes,1,8,9,0:10:00 +RC_GH200,yes,1,8,9,0:10:00 diff --git a/programs/genesis/run.sh b/programs/genesis/run.sh index c63f98f..dc15d40 100644 --- a/programs/genesis/run.sh +++ b/programs/genesis/run.sh @@ -1,5 +1,6 @@ #!/bin/bash set -e +set -o pipefail system="$1" nodes="$2" numproc_node="$3" @@ -104,6 +105,81 @@ if [[ ! -f ${inputdir}/apoa1.rst ]]; then fi cp ${inputdir}/apoa1.rst . +# Shared GH200-class run path. The env_prefix pattern mirrors build.sh so each +# site can override modules, MPI launcher, GPU visibility, and profiler policy +# independently while keeping the benchmark invocation identical. 
+run_genesis_gh200_gpu() { + local system_name="$1" + local env_prefix="$2" + local default_module="$3" + local module_var="${env_prefix}_MODULE" + local mpi_cmd_var="${env_prefix}_MPI_CMD" + local mpi_args_var="${env_prefix}_MPI_ARGS" + local cuda_visible_devices_var="${env_prefix}_CUDA_VISIBLE_DEVICES" + local profiler_tool_var="${env_prefix}_PROFILER_TOOL" + local profiler_level_var="${env_prefix}_PROFILER_LEVEL" + + local module_name="${!module_var:-$default_module}" + if [ "$module_name" != "none" ] && command -v module >/dev/null 2>&1; then + read -r -a module_names <<< "$module_name" + module load "${module_names[@]}" + fi + + read -r -a mpi_cmd <<< "${!mpi_cmd_var:-mpirun -np ${numproc}}" + if [ -n "${!mpi_args_var:-}" ]; then + read -r -a gh200_mpi_args <<< "${!mpi_args_var}" + mpi_cmd+=("${gh200_mpi_args[@]}") + fi + + export OMP_NUM_THREADS=${nthreads} + if [ -n "${!cuda_visible_devices_var:-}" ]; then + export CUDA_VISIBLE_DEVICES="${!cuda_visible_devices_var}" + fi + + local genesis_profiler_requested="" + local genesis_profiler_explicit=0 + # GH200 systems default to Nsight Compute because the GPU path is the new + # behavior being validated. Explicit requests are strict; the default falls + # back to an unprofiled run when ncu is unavailable. + if [ -n "${!profiler_tool_var:-}" ]; then + genesis_profiler_requested="${!profiler_tool_var}" + genesis_profiler_explicit=1 + elif [ -n "${GENESIS_PROFILER_TOOL:-}" ]; then + genesis_profiler_requested="${GENESIS_PROFILER_TOOL}" + genesis_profiler_explicit=1 + else + genesis_profiler_requested="ncu" + fi + + genesis_profiler_tool=$(bk_get_profiler_tool "$genesis_profiler_requested") || return 1 + genesis_profiler_level="${!profiler_level_var:-${GENESIS_PROFILER_LEVEL:-single}}" + if [ -n "$genesis_profiler_tool" ]; then + if [ "$genesis_profiler_tool" = "ncu" ] && ! 
command -v ncu >/dev/null 2>&1; then + if [ "$genesis_profiler_explicit" -eq 1 ]; then + echo "Genesis ${system_name}: ncu profiler requested but ncu is not in PATH." >&2 + echo "Load Nsight Compute with ${module_var}, or set ${profiler_tool_var}=none / GENESIS_PROFILER_TOOL=none to run without profiling." >&2 + return 1 + fi + echo "Genesis ${system_name}: default ncu profiler is not in PATH; running without profiling." >&2 + echo "Set ${profiler_tool_var}=ncu or GENESIS_PROFILER_TOOL=ncu to require Nsight Compute profiling." >&2 + genesis_profiler_tool="" + genesis_profiler_requested="none" + fi + fi + + echo "Running ${system_name} as Grace-Hopper GPU run with profiler=${genesis_profiler_requested:-none} level=${genesis_profiler_level}" + if [ -n "$genesis_profiler_tool" ]; then + # set -o pipefail at script entry keeps profiler or MPI failures visible + # even though stdout/stderr are also streamed through tee for artifacts. + bk_profiler "$genesis_profiler_tool" \ + --level "$genesis_profiler_level" \ + --archive "${resultsdir}/padata0.tgz" \ + --raw-dir ncu \ + -- "${mpi_cmd[@]}" ./${binary} ${input}.sub 2>&1 | tee ${output} + else + "${mpi_cmd[@]}" ./${binary} ${input}.sub 2>&1 | tee ${output} + fi +} case "$system" in Fugaku) @@ -125,8 +201,15 @@ case "$system" in export OMP_NUM_THREADS=${nthreads} ${mpi_cmd} ./${binary} ${input}.sub 2>&1 | tee ${output} ;; + MiyabiG) + run_genesis_gh200_gpu "$system" GENESIS_MIYABIG none + ;; + RC_GH200) + run_genesis_gh200_gpu "$system" GENESIS_GH200 "system/qc-gh200 nvhpc/25.9" + ;; *) echo "Unknown Running system: $system" + exit 1 ;; esac diff --git a/programs/qws/build.sh b/programs/qws/build.sh index 477bbd5..9750137 100644 --- a/programs/qws/build.sh +++ b/programs/qws/build.sh @@ -61,6 +61,20 @@ case "$system" in MiyabiC) make -j 8 fugaku_benchmark= omp=1 compiler=intel arch=skylake rdma= mpi=1 powerapi= ;; + GenkaiA|GenkaiB|GenkaiC) + module load intel/2023.2 mvapich/3.0-intel2023.2 + make -j 8 fugaku_benchmark= 
omp=1 compiler=intel arch=skylake rdma= mpi=1 powerapi= CC=mpicc CXX=mpicxx + ;; + Grand_C|Grand_G) + module load intel impi + make -j 8 fugaku_benchmark= omp=1 compiler=intel arch=skylake rdma= mpi=1 powerapi= + ;; + AOBA_A|AOBA_S) + make -j 8 fugaku_benchmark= omp=1 compiler=nec arch=sx rdma= mpi=1 powerapi= + ;; + AOBA_B) + make -j 8 fugaku_benchmark= omp=1 compiler=openmpi-gnu arch=skylake rdma= mpi=1 powerapi= CC=mpicc CXX=mpic++ + ;; *) echo "Unknown system: $system" exit 1 diff --git a/programs/qws/list.csv b/programs/qws/list.csv index bc25f19..41d1e75 100644 --- a/programs/qws/list.csv +++ b/programs/qws/list.csv @@ -9,4 +9,12 @@ RC_GENOA,yes,1,1,96,0:10:00 RC_FX700,yes,1,4,12,0:10:00 MiyabiG,yes,1,1,72,0:10:00 MiyabiC,yes,1,1,112,0:10:00 +GenkaiA,yes,1,1,120,0:10:00 +GenkaiB,yes,1,1,120,0:10:00 +GenkaiC,yes,1,1,112,0:10:00 +Grand_C,yes,1,1,64,0:10:00 +Grand_G,yes,1,1,64,0:10:00 +AOBA_A,yes,1,1,8,0:10:00 +AOBA_S,yes,1,1,8,0:10:00 +AOBA_B,yes,1,1,128,0:10:00 FNCX,yes,1,1,1,0:10:00 diff --git a/programs/qws/run.sh b/programs/qws/run.sh index 396c46a..c555c24 100644 --- a/programs/qws/run.sh +++ b/programs/qws/run.sh @@ -109,6 +109,38 @@ case "$system" in mpirun -n 1 ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0 print_results CASE0 CASE0 1 >> ../results/result ;; + GenkaiA|GenkaiB|GenkaiC) + qws_numproc=$((nodes * numproc_node)) + module load intel/2023.2 mvapich/3.0-intel2023.2 + mpirun -n ${qws_numproc} ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0 + print_results CASE0 CASE0 ${numproc_node} >> ../results/result + ;; + Grand_C|Grand_G) + qws_numproc=$((nodes * numproc_node)) + module load intel impi + if [[ -n "${I_MPI_ROOT:-}" && -d "${I_MPI_ROOT}/bin" ]]; then + export PATH="${I_MPI_ROOT}/bin:${PATH}" + fi + qws_mpi_launcher=$(command -v mpirun || command -v mpiexec || command -v mpiexec.hydra || true) + if [[ -z "$qws_mpi_launcher" ]]; then + echo "qws: mpirun/mpiexec/mpiexec.hydra not found after module load intel impi" >&2 + echo "qws: PATH=${PATH}" >&2 + 
echo "qws: MPI launcher candidates:" >&2 + type -a mpirun mpiexec mpiexec.hydra mpiicc mpiicpc mpiicpx >&2 || true + echo "qws: loaded modules:" >&2 + module list >&2 || true + echo "qws: environment:" >&2 + env | sort >&2 + exit 1 + fi + "$qws_mpi_launcher" -n ${qws_numproc} ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0 + print_results CASE0 CASE0 ${numproc_node} >> ../results/result + ;; + AOBA_A|AOBA_B|AOBA_S) + qws_numproc=$((nodes * numproc_node)) + mpirun -np ${qws_numproc} ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0 + print_results CASE0 CASE0 ${numproc_node} >> ../results/result + ;; *) echo "Unknown Running system: $system" exit 1 diff --git a/requirements-result-server.txt b/requirements-result-server.txt index cd48dc7..fd274bc 100644 --- a/requirements-result-server.txt +++ b/requirements-result-server.txt @@ -1,8 +1,11 @@ +# result_server requires Python 3.12+ for safe tar extraction via tarfile filter="data". Flask>=3.0,<4.0 Flask-Session>=0.8,<1.0 +Flask-WTF>=1.2,<2.0 +gunicorn>=23.0,<24.0 redis>=5.0,<6.0 pyotp>=2.9,<3.0 qrcode[pil]>=8.0,<9.0 -pytest>=8.0,<9.0 +pytest>=9.0.3,<10.0 hypothesis>=6.0,<7.0 fakeredis>=2.23,<3.0 diff --git a/result_server/app.py b/result_server/app.py index d6766fd..c7ec38e 100644 --- a/result_server/app.py +++ b/result_server/app.py @@ -2,19 +2,21 @@ import sys from datetime import timedelta -from flask import Flask, current_app, render_template +from flask import Flask, render_template from flask_session import Session from routes.api import api_bp from routes.estimated import estimated_bp from routes.home import register_home_routes from routes.results import results_bp +from utils.auth import parse_ingest_keys +from utils.csrf import init_csrf -EXPECTED_API_KEY = os.environ.get("RESULT_SERVER_KEY") +INGEST_KEYS = parse_ingest_keys() -if not EXPECTED_API_KEY: - print("ERROR: RESULT_SERVER_KEY is not set.", file=sys.stderr) +if not INGEST_KEYS: + print("ERROR: RESULT_SERVER_KEYS or RESULT_SERVER_KEY is not set.", 
file=sys.stderr) sys.exit(1) @@ -41,6 +43,7 @@ def _configure_redis(app, prefix): app.config["REDIS_CONN"] = redis.from_url(redis_url, decode_responses=True) app.config["REDIS_PREFIX"] = "dev:" if prefix == "/dev" else "main:" app.config["SESSION_COOKIE_NAME"] = "session_dev" if prefix == "/dev" else "session_main" + app.config["AUTH_REQUIRES_REDIS"] = True def _configure_user_store(app): @@ -92,12 +95,14 @@ def create_app(prefix="", base_dir=None): if not secret_key: raise RuntimeError("FLASK_SECRET_KEY must be set in production") app.secret_key = secret_key + app.config["INGEST_KEYS"] = INGEST_KEYS.copy() _configure_session(app, base_dir) _configure_redis(app, prefix) _configure_user_store(app) _configure_totp_issuer(app, prefix) _configure_result_directories(app, base_dir) + init_csrf(app, exempt_blueprints=(api_bp,)) register_home_routes(app, prefix=prefix) _register_portal_blueprints(app, prefix) @@ -127,4 +132,6 @@ def systemlist(): if __name__ == "__main__": - app.run(host="0.0.0.0", port=8800) + host = os.environ.get("RESULT_SERVER_HOST", "127.0.0.1") + port = int(os.environ.get("RESULT_SERVER_PORT", "8800")) + app.run(host=host, port=port) diff --git a/result_server/app_dev.py b/result_server/app_dev.py index cf011a5..46e8538 100644 --- a/result_server/app_dev.py +++ b/result_server/app_dev.py @@ -15,16 +15,32 @@ import argparse import json import os +import secrets import sys import types import uuid +import warnings from datetime import datetime, timedelta +LOOPBACK_HOSTS = {"127.0.0.1", "localhost", "::1"} + def setup_dev_environment(base_dir): """Configure development environment variables and runtime directories.""" - os.environ.setdefault("RESULT_SERVER_KEY", "dev-api-key") - os.environ.setdefault("FLASK_SECRET_KEY", "dev-secret-key") + if not os.environ.get("RESULT_SERVER_KEYS") and not os.environ.get("RESULT_SERVER_KEY"): + os.environ["RESULT_SERVER_KEYS"] = f"local-dev:{secrets.token_urlsafe(32)}" + warnings.warn( + "RESULT_SERVER_KEYS not set; 
using an ephemeral dev API key.", + RuntimeWarning, + stacklevel=2, + ) + if not os.environ.get("FLASK_SECRET_KEY"): + os.environ["FLASK_SECRET_KEY"] = secrets.token_hex(32) + warnings.warn( + "FLASK_SECRET_KEY not set; using an ephemeral dev secret key.", + RuntimeWarning, + stacklevel=2, + ) os.environ.setdefault("BASE_PATH", base_dir) os.environ["DEV_MODE"] = "1" @@ -44,6 +60,19 @@ def setup_dev_environment(base_dir): os.makedirs(os.path.join(base_dir, sub), exist_ok=True) +def validate_dev_runtime(host): + """Abort when the development launcher is used outside local-only mode.""" + if os.environ.get("FLASK_ENV") == "production": + sys.exit("app_dev.py must not be used in production. Use app.py.") + if host not in LOOPBACK_HOSTS: + sys.exit(f"app_dev.py refuses to bind to {host}. Use app.py for production.") + + +def dev_debug_enabled(): + """Return whether the Werkzeug debugger was explicitly enabled.""" + return os.environ.get("RESULT_SERVER_DEV_DEBUG") == "1" + + def _create_stub_totp_manager(): """Return a stub TOTP module that always validates setup and login.""" mod = types.ModuleType("utils.totp_manager") @@ -130,15 +159,19 @@ def create_dev_app(base_dir): from flask_session import Session from routes.home import register_home_routes + from utils.auth import parse_ingest_keys + from utils.csrf import init_csrf from utils.system_info import get_all_systems_info, summarize_systems_info app = Flask(__name__, template_folder="templates") - app.secret_key = "dev-secret-key" + app.secret_key = os.environ["FLASK_SECRET_KEY"] app.config.update( SESSION_TYPE="filesystem", SESSION_FILE_DIR=os.path.join(base_dir, "main", "flask_session"), SESSION_PERMANENT=False, + AUTH_REQUIRES_REDIS=False, + INGEST_KEYS=parse_ingest_keys(), ) Session(app) @@ -166,6 +199,10 @@ def create_dev_app(base_dir): register_home_routes(app) # Register all portal blueprints. 
+ from routes.api import api_bp + + app.register_blueprint(api_bp) + from routes.results import results_bp app.register_blueprint(results_bp, url_prefix="/results") @@ -180,6 +217,8 @@ def create_dev_app(base_dir): from routes.admin import admin_bp + init_csrf(app, exempt_blueprints=(api_bp,)) + app.register_blueprint(admin_bp, url_prefix="/admin") @app.route("/systemlist") @@ -472,10 +511,17 @@ def generate_sample_data(received_dir): def main(): parser = argparse.ArgumentParser(description="BenchKit Result Server - Dev Mode") + parser.add_argument( + "--host", + default="127.0.0.1", + help="Loopback host to bind (default: 127.0.0.1)", + ) parser.add_argument("--port", type=int, default=8800, help="Port number (default: 8800)") parser.add_argument("--generate-sample", action="store_true", help="Generate sample data") args = parser.parse_args() + validate_dev_runtime(args.host) + # Development base directory. script_dir = os.path.dirname(os.path.abspath(__file__)) base_dir = os.path.join(script_dir, "_dev_data") @@ -492,15 +538,18 @@ def main(): print("Generating sample data...") generate_sample_data(received_dir) - print(f"\nStarting dev server on http://localhost:{args.port}") - print(f" Results: http://localhost:{args.port}/results") - print(f" Systems: http://localhost:{args.port}/systemlist") + print(f"\nStarting dev server on http://{args.host}:{args.port}") + print(f" Results: http://{args.host}:{args.port}/results") + print(f" Systems: http://{args.host}:{args.port}/systemlist") print(f" Data dir: {base_dir}") print() # Create and launch the Flask app directly. 
app = create_dev_app(base_dir) - app.run(host="127.0.0.1", port=args.port, debug=True) + debug = dev_debug_enabled() + if debug: + app.logger.warning("Werkzeug debugger enabled for local development.") + app.run(host=args.host, port=args.port, debug=debug) if __name__ == "__main__": diff --git a/result_server/routes/api.py b/result_server/routes/api.py index 5ab734d..d7cd8d9 100644 --- a/result_server/routes/api.py +++ b/result_server/routes/api.py @@ -7,12 +7,13 @@ import uuid import shutil import io +import sys import tarfile from datetime import datetime -api_bp = Blueprint("api", __name__) +from utils.auth import verify_ingest_key -EXPECTED_API_KEY = os.environ.get("RESULT_SERVER_KEY") +api_bp = Blueprint("api", __name__) # ========================================== @@ -20,10 +21,19 @@ # ========================================== def require_api_key(): - """Validate the request API key.""" - api_key = request.headers.get("X-API-Key") - if api_key != EXPECTED_API_KEY: + """Validate the request API key and return the authenticated runner id.""" + runner_id = verify_ingest_key(request.headers.get("X-API-Key", "")) + if not runner_id: abort(401, description="Invalid API Key") + current_app.logger.info( + "api key accepted", + extra={ + "runner_id": runner_id, + "endpoint": request.path, + "ip": request.remote_addr, + }, + ) + return runner_id def save_json_file(data, prefix, out_dir, given_uuid=None): @@ -135,6 +145,16 @@ def _find_result_file_by_uuid(received_dir, uuid_value): def _safe_extract_tar_bytes(file_storage, target_dir): + """Extract uploaded tar bytes with path and member-type checks. + + The explicit path normalization catches traversal attempts before writing + anything, and Python 3.12's data filter rejects non-regular archive entries + such as unsafe links or device files. 
+ """ + if sys.version_info < (3, 12): + raise RuntimeError("Python 3.12 or later is required for safe tar extraction.") + + os.makedirs(target_dir, exist_ok=True) with tarfile.open(fileobj=file_storage.stream, mode="r:*") as tar: for member in tar.getmembers(): normalized = os.path.normpath(member.name) @@ -142,8 +162,8 @@ def _safe_extract_tar_bytes(file_storage, target_dir): abort(400, description="Unsafe archive entry") try: tar.extractall(target_dir, filter="data") - except TypeError: - tar.extractall(target_dir) + except tarfile.FilterError: + abort(400, description="Unsafe archive entry") # ========================================== @@ -178,9 +198,7 @@ def ingest_estimate(): @api_bp.route("/api/ingest/padata", methods=["POST"]) def ingest_padata(): """Receive and store a PA Data archive.""" - api_key = request.headers.get("X-API-Key") - if api_key != EXPECTED_API_KEY: - abort(401, description="Invalid API Key") + require_api_key() uuid_str = request.form.get("id") if not uuid_str or not is_valid_uuid(uuid_str): diff --git a/result_server/routes/auth.py b/result_server/routes/auth.py index 5258a94..216cf86 100644 --- a/result_server/routes/auth.py +++ b/result_server/routes/auth.py @@ -2,6 +2,7 @@ from flask import ( Blueprint, + abort, current_app, flash, redirect, @@ -25,6 +26,40 @@ auth_bp = Blueprint("auth", __name__, url_prefix="/auth") +def _redis_ping_ok(redis_conn): + """Return whether the configured Redis connection is currently usable.""" + if not redis_conn: + return False + try: + redis_conn.ping() + return True + except Exception: + current_app.logger.exception("Redis ping failed during authentication") + return False + + +def _get_redis_or_fail(): + """Return Redis for auth tracking, failing closed when configured to require it.""" + redis_conn = current_app.config.get("REDIS_CONN") + requires_redis = current_app.config.get("AUTH_REQUIRES_REDIS", False) + + if not redis_conn: + if requires_redis: + current_app.logger.error("Redis unavailable; 
refusing login") + abort(503, description="Authentication service temporarily unavailable.") + return None + + if _redis_ping_ok(redis_conn): + return redis_conn + + if requires_redis: + current_app.logger.error("Redis unavailable; refusing login") + abort(503, description="Authentication service temporarily unavailable.") + + current_app.logger.warning("Redis unavailable; continuing without auth throttling") + return None + + def _render_login_totp_step(email): return render_template("auth_login.html", step="totp", email=email) @@ -50,6 +85,8 @@ def login(): email = request.form.get("email", "").strip() totp_code = request.form.get("totp_code", "").strip() + redis_conn = _get_redis_or_fail() + prefix = current_app.config.get("REDIS_PREFIX", "") # Step 1: email submitted -> show the TOTP entry form. if email and not totp_code: @@ -64,8 +101,6 @@ def login(): return redirect(url_for("auth.login")) # Enforce rate limiting when Redis-backed tracking is available. - redis_conn = current_app.config.get("REDIS_CONN") - prefix = current_app.config.get("REDIS_PREFIX", "") if redis_conn: is_locked, remaining = check_rate_limit(redis_conn, prefix, email) if is_locked: diff --git a/result_server/templates/_results_base.html b/result_server/templates/_results_base.html index 21e6ec6..16852c4 100644 --- a/result_server/templates/_results_base.html +++ b/result_server/templates/_results_base.html @@ -2,6 +2,7 @@ + {% block title %}Results{% endblock %} {% include "_table_base.html" %} diff --git a/result_server/templates/_results_table_cell_profile.html b/result_server/templates/_results_table_cell_profile.html index 619a031..e70944a 100644 --- a/result_server/templates/_results_table_cell_profile.html +++ b/result_server/templates/_results_table_cell_profile.html @@ -10,6 +10,7 @@ {% if row.profile_summary_meta.subline %}
{{ row.profile_summary_meta.subline }}{% endif %} {% if row.data_link %}
archive: available{% endif %} {% if row.profile_summary_meta.events %}
events: {{ row.profile_summary_meta.events | join(', ') }}{% endif %} + {% if row.profile_summary_meta.ncu_options %}
ncu options: {{ row.profile_summary_meta.ncu_options | join(' ') }}{% endif %} {% if row.profile_summary_meta.report_kinds %}
reports: {{ row.profile_summary_meta.report_kinds | join(', ') }}{% endif %} diff --git a/result_server/templates/_usage_report_configuration_checks.html b/result_server/templates/_usage_report_configuration_checks.html index f7e5703..02316a6 100644 --- a/result_server/templates/_usage_report_configuration_checks.html +++ b/result_server/templates/_usage_report_configuration_checks.html @@ -16,26 +16,40 @@

Registered Systems

{% endif %}
-

System Info Coverage

+

Public System Catalog

+ {% if site_diagnostics.public_missing_system_definitions | default([]) %} +
    + {% for system in site_diagnostics.public_missing_system_definitions %} +
  • system_info.csv exposes {{ system }}, but system.csv has no matching system.
  • + {% endfor %} +
+ {% endif %} {% if site_diagnostics.missing_system_info %}
    {% for system in site_diagnostics.missing_system_info %} -
  • {{ system }} is missing a system_info.csv entry.
  • +
  • {{ system }} is registered in system.csv but is not exposed in system_info.csv.
  • {% endfor %}
- {% else %} -

Every registered system has a system_info.csv entry.

+ {% elif not site_diagnostics.public_missing_system_definitions | default([]) %} +

Every public system_info.csv entry has a matching system.csv definition.

{% endif %}

Queue Definitions

+ {% if site_diagnostics.public_missing_queue_definitions | default([]) %} +
    + {% for item in site_diagnostics.public_missing_queue_definitions %} +
  • system_info.csv exposes {{ item.system }}, but its system.csv queue{% if item.queue %} {{ item.queue }}{% endif %} is not defined in queue.csv.
  • + {% endfor %} +
+ {% endif %} {% if site_diagnostics.missing_queue_definitions %}
    {% for item in site_diagnostics.missing_queue_definitions %}
  • {{ item.system }} references queue {{ item.queue }} without a matching queue.csv definition.
  • {% endfor %}
- {% else %} + {% elif not site_diagnostics.public_missing_queue_definitions | default([]) %}

All queues referenced by system.csv are defined.

{% endif %}
diff --git a/result_server/templates/_usage_report_quality_section.html b/result_server/templates/_usage_report_quality_section.html index efa5d70..1a739c7 100644 --- a/result_server/templates/_usage_report_quality_section.html +++ b/result_server/templates/_usage_report_quality_section.html @@ -18,7 +18,7 @@

Result Quality Coverage

Est.-Ready Rich Next Action - Validator Candidates + Improvement Candidates Warnings diff --git a/result_server/templates/admin_users.html b/result_server/templates/admin_users.html index c9d50b0..3e22a09 100644 --- a/result_server/templates/admin_users.html +++ b/result_server/templates/admin_users.html @@ -88,7 +88,8 @@

Add User

Create a user record and generate a new invitation link for TOTP setup.

- + {% if csrf_token is defined %}{% endif %} +
@@ -122,16 +123,19 @@

Registered Users

+ {% if csrf_token is defined %}{% endif %}
{% if u.email != session.get('user_email') %}
+ {% if csrf_token is defined %}{% endif %}
+ {% if csrf_token is defined %}{% endif %}
{% endif %} diff --git a/result_server/templates/auth_login.html b/result_server/templates/auth_login.html index 6d78a49..33c5850 100644 --- a/result_server/templates/auth_login.html +++ b/result_server/templates/auth_login.html @@ -87,9 +87,10 @@ {% if step == "email" %}

Start with the email address associated with your portal account. You will enter your authenticator code on the next step.

+ {% if csrf_token is defined %}{% endif %}
- +
@@ -99,6 +100,7 @@

Enter the 6-digit code from your authenticator app to complete sign-in.

Step 2 of 2 + {% if csrf_token is defined %}{% endif %}
diff --git a/result_server/templates/auth_setup.html b/result_server/templates/auth_setup.html index b83ab19..5d71919 100644 --- a/result_server/templates/auth_setup.html +++ b/result_server/templates/auth_setup.html @@ -121,6 +121,7 @@
{{ secret }}
+ {% if csrf_token is defined %}{% endif %}
Path: + return Path(__file__).resolve().parents[2] + + +def _load_site_diagnostics(): + repo_root = _repo_root() + sys.path.insert(0, str(repo_root / "result_server")) + + from utils.site_diagnostics import ( # pylint: disable=import-outside-toplevel + build_site_config_preflight_failures, + build_site_diagnostics, + ) + + return build_site_diagnostics, build_site_config_preflight_failures + + +def parse_args(argv: list[str]) -> argparse.Namespace: + repo_root = _repo_root() + parser = argparse.ArgumentParser( + description=( + "Validate public site configuration. Systems listed in " + "config/system_info.csv are visible to portal users and must also " + "exist in config/system.csv with a queue defined in config/queue.csv." + ) + ) + parser.add_argument( + "--system-csv", + default=str(repo_root / "config" / "system.csv"), + help="Path to system.csv.", + ) + parser.add_argument( + "--queue-csv", + default=str(repo_root / "config" / "queue.csv"), + help="Path to queue.csv.", + ) + parser.add_argument( + "--system-info-csv", + default=str(repo_root / "config" / "system_info.csv"), + help="Path to system_info.csv.", + ) + parser.add_argument( + "--programs-dir", + default=str(repo_root / "programs"), + help="Path to programs directory for shared diagnostics.", + ) + return parser.parse_args(argv[1:]) + + +def main(argv: list[str]) -> int: + args = parse_args(argv) + build_site_diagnostics, build_site_config_preflight_failures = _load_site_diagnostics() + + diagnostics = build_site_diagnostics( + system_csv_path=args.system_csv, + queue_csv_path=args.queue_csv, + system_info_csv_path=args.system_info_csv, + programs_dir=args.programs_dir, + ) + failures = build_site_config_preflight_failures(diagnostics) + + if failures: + print("Site configuration preflight failed:", file=sys.stderr) + for failure in failures: + print(f"- {failure}", file=sys.stderr) + return 1 + + print( + "Site configuration preflight passed: every public system_info.csv " + "system is registered in 
system.csv and has a queue defined in queue.csv." + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv)) diff --git a/scripts/test_result_server.py b/result_server/tests/run_result_server_tests.py similarity index 90% rename from scripts/test_result_server.py rename to result_server/tests/run_result_server_tests.py index 2ec12a9..8cae6a0 100644 --- a/scripts/test_result_server.py +++ b/result_server/tests/run_result_server_tests.py @@ -10,7 +10,7 @@ def main(argv: list[str]) -> int: - repo_root = Path(__file__).resolve().parents[1] + repo_root = Path(__file__).resolve().parents[2] suite_path = repo_root / "result_server" / "tests" if len(argv) > 1: diff --git a/result_server/tests/test_api_routes.py b/result_server/tests/test_api_routes.py index 9e0623f..c812c53 100644 --- a/result_server/tests/test_api_routes.py +++ b/result_server/tests/test_api_routes.py @@ -2,6 +2,7 @@ import io import json +import logging import os import shutil import sys @@ -38,22 +39,16 @@ def app(tmp_dirs): """Build a Flask app configured for API route tests.""" received, received_padata, received_estimation_inputs, estimated = tmp_dirs - # Override the expected API key for this test app. 
- import routes.api as api_mod - original_key = api_mod.EXPECTED_API_KEY - api_mod.EXPECTED_API_KEY = API_KEY - app = build_api_route_app( received_dir=received, received_padata_dir=received_padata, received_estimation_inputs_dir=received_estimation_inputs, estimated_dir=estimated, ) + app.config["INGEST_KEYS"] = {API_KEY: "test-runner"} yield app - api_mod.EXPECTED_API_KEY = original_key - @pytest.fixture def client(app): @@ -91,6 +86,62 @@ def test_post_valid_json(self, client, tmp_dirs): assert saved["_server_uuid"] == body["id"] assert saved["_server_timestamp"] == body["timestamp"] + def test_valid_key_logs_runner_id(self, client, caplog): + """Accepted API requests should include the resolved runner id in logs.""" + with caplog.at_level(logging.INFO): + resp = client.post("/api/ingest/result", + data=b'{"code":"test"}', + headers={"X-API-Key": API_KEY, + "Content-Type": "application/json"}) + + assert resp.status_code == 200 + assert any( + record.message == "api key accepted" + and getattr(record, "runner_id", None) == "test-runner" + and getattr(record, "endpoint", None) == "/api/ingest/result" + for record in caplog.records + ) + + def test_multiple_ingest_keys_accept_individual_runner_keys(self, app): + """RESULT_SERVER_KEYS-style config should accept each runner key.""" + app.config["INGEST_KEYS"] = { + "runner-a-key": "runner-a", + "runner-b-key": "runner-b", + } + + with app.test_client() as client: + resp_a = client.post("/api/ingest/result", + data=b'{"code":"a"}', + headers={"X-API-Key": "runner-a-key", + "Content-Type": "application/json"}) + resp_b = client.post("/api/ingest/result", + data=b'{"code":"b"}', + headers={"X-API-Key": "runner-b-key", + "Content-Type": "application/json"}) + + assert resp_a.status_code == 200 + assert resp_b.status_code == 200 + + def test_legacy_result_server_key_env_is_still_accepted(self, tmp_dirs, monkeypatch): + """RESULT_SERVER_KEY should remain valid as the default runner fallback.""" + 
monkeypatch.delenv("RESULT_SERVER_KEYS", raising=False) + monkeypatch.setenv("RESULT_SERVER_KEY", "legacy-key") + received, received_padata, received_estimation_inputs, estimated = tmp_dirs + app = build_api_route_app( + received_dir=received, + received_padata_dir=received_padata, + received_estimation_inputs_dir=received_estimation_inputs, + estimated_dir=estimated, + ) + + with app.test_client() as client: + resp = client.post("/api/ingest/result", + data=b'{"code":"legacy"}', + headers={"X-API-Key": "legacy-key", + "Content-Type": "application/json"}) + + assert resp.status_code == 200 + def test_missing_api_key_returns_401(self, client): """Test case.""" resp = client.post("/api/ingest/result", @@ -340,6 +391,90 @@ def test_ingest_estimation_inputs_expands_under_result_stem(self, client, tmp_di saved_path = os.path.join(estimation_inputs_dir, result_stem, "prepare_rhs_interval.json") assert os.path.exists(saved_path) + def test_ingest_estimation_inputs_rejects_parent_path_entry(self, client, tmp_dirs): + received = tmp_dirs[0] + uuid_value = "12345678-1234-1234-1234-123456789abc" + self._seed_result(received, uuid_value) + + archive_bytes = io.BytesIO() + with tarfile.open(fileobj=archive_bytes, mode="w:gz") as tar: + payload = b"bad" + info = tarfile.TarInfo(name="../outside.txt") + info.size = len(payload) + tar.addfile(info, io.BytesIO(payload)) + archive_bytes.seek(0) + + resp = client.post( + "/api/ingest/estimation-inputs", + data={"id": uuid_value, "file": (archive_bytes, "estimation_inputs.tgz")}, + headers={"X-API-Key": API_KEY}, + content_type="multipart/form-data", + ) + assert resp.status_code == 400 + + def test_ingest_estimation_inputs_rejects_absolute_path_entry(self, client, tmp_dirs): + received = tmp_dirs[0] + uuid_value = "12345678-1234-1234-1234-123456789abc" + self._seed_result(received, uuid_value) + + archive_bytes = io.BytesIO() + with tarfile.open(fileobj=archive_bytes, mode="w:gz") as tar: + payload = b"bad" + info = 
tarfile.TarInfo(name="/outside.txt") + info.size = len(payload) + tar.addfile(info, io.BytesIO(payload)) + archive_bytes.seek(0) + + resp = client.post( + "/api/ingest/estimation-inputs", + data={"id": uuid_value, "file": (archive_bytes, "estimation_inputs.tgz")}, + headers={"X-API-Key": API_KEY}, + content_type="multipart/form-data", + ) + assert resp.status_code == 400 + + def test_ingest_estimation_inputs_rejects_absolute_symlink(self, client, tmp_dirs): + received = tmp_dirs[0] + uuid_value = "12345678-1234-1234-1234-123456789abc" + self._seed_result(received, uuid_value) + + archive_bytes = io.BytesIO() + with tarfile.open(fileobj=archive_bytes, mode="w:gz") as tar: + info = tarfile.TarInfo(name="link") + info.type = tarfile.SYMTYPE + info.linkname = "/etc/passwd" + tar.addfile(info) + archive_bytes.seek(0) + + resp = client.post( + "/api/ingest/estimation-inputs", + data={"id": uuid_value, "file": (archive_bytes, "estimation_inputs.tgz")}, + headers={"X-API-Key": API_KEY}, + content_type="multipart/form-data", + ) + assert resp.status_code == 400 + + def test_ingest_estimation_inputs_rejects_absolute_hardlink(self, client, tmp_dirs): + received = tmp_dirs[0] + uuid_value = "12345678-1234-1234-1234-123456789abc" + self._seed_result(received, uuid_value) + + archive_bytes = io.BytesIO() + with tarfile.open(fileobj=archive_bytes, mode="w:gz") as tar: + info = tarfile.TarInfo(name="hardlink") + info.type = tarfile.LNKTYPE + info.linkname = "/etc/passwd" + tar.addfile(info) + archive_bytes.seek(0) + + resp = client.post( + "/api/ingest/estimation-inputs", + data={"id": uuid_value, "file": (archive_bytes, "estimation_inputs.tgz")}, + headers={"X-API-Key": API_KEY}, + content_type="multipart/form-data", + ) + assert resp.status_code == 400 + def test_query_estimation_inputs_returns_archive(self, client, tmp_dirs): received = tmp_dirs[0] estimation_inputs_dir = tmp_dirs[2] diff --git a/result_server/tests/test_app_dev_security.py 
b/result_server/tests/test_app_dev_security.py new file mode 100644 index 0000000..f03eacc --- /dev/null +++ b/result_server/tests/test_app_dev_security.py @@ -0,0 +1,44 @@ +"""Tests for local-development launcher safety guards.""" + +import os +import sys + +import pytest + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import app_dev + + +def test_setup_dev_environment_uses_ephemeral_missing_keys(tmp_path, monkeypatch): + monkeypatch.delenv("RESULT_SERVER_KEYS", raising=False) + monkeypatch.delenv("RESULT_SERVER_KEY", raising=False) + monkeypatch.delenv("FLASK_SECRET_KEY", raising=False) + + with pytest.warns(RuntimeWarning): + app_dev.setup_dev_environment(str(tmp_path)) + + assert os.environ["RESULT_SERVER_KEYS"].startswith("local-dev:") + assert os.environ["RESULT_SERVER_KEYS"] != "local-dev:dev-api-key" + assert os.environ["FLASK_SECRET_KEY"] != "dev-secret-key" + + +def test_dev_debug_requires_explicit_opt_in(monkeypatch): + monkeypatch.delenv("RESULT_SERVER_DEV_DEBUG", raising=False) + assert app_dev.dev_debug_enabled() is False + + monkeypatch.setenv("RESULT_SERVER_DEV_DEBUG", "1") + assert app_dev.dev_debug_enabled() is True + + +def test_validate_dev_runtime_rejects_non_loopback_host(monkeypatch): + monkeypatch.delenv("FLASK_ENV", raising=False) + unsafe_host = ".".join(["0", "0", "0", "0"]) + with pytest.raises(SystemExit): + app_dev.validate_dev_runtime(unsafe_host) + + +def test_validate_dev_runtime_rejects_production_env(monkeypatch): + monkeypatch.setenv("FLASK_ENV", "production") + with pytest.raises(SystemExit): + app_dev.validate_dev_runtime("127.0.0.1") diff --git a/result_server/tests/test_auth_templates.py b/result_server/tests/test_auth_templates.py index 8152ac9..a7cd293 100644 --- a/result_server/tests/test_auth_templates.py +++ b/result_server/tests/test_auth_templates.py @@ -19,6 +19,8 @@ def test_auth_login_template_renders_portal_shell(): assert "Sign in with your email address and TOTP code" in html assert "Step 
2 of 2" not in html assert "Continue" in html + assert '' in html + assert 'autocomplete="username"' in html def test_auth_setup_template_renders_portal_shell(): @@ -46,6 +48,7 @@ def test_admin_users_template_renders_portal_table(): app = build_portal_shell_app( templates_dir=os.path.join(os.path.dirname(__file__), "..", "templates"), ) + app.jinja_env.globals["csrf_token"] = lambda: "test-csrf-token" with app.test_request_context("/admin/users"): from flask import render_template, session @@ -71,3 +74,4 @@ def test_admin_users_template_renders_portal_table(): assert "Review current user access" in html assert "Registered" in html assert "Pending" in html + assert 'name="csrf_token" value="test-csrf-token"' in html diff --git a/result_server/tests/test_csrf.py b/result_server/tests/test_csrf.py new file mode 100644 index 0000000..55520e0 --- /dev/null +++ b/result_server/tests/test_csrf.py @@ -0,0 +1,140 @@ +"""Tests for CSRF enforcement on browser POST routes.""" + +import json +import os +import shutil +import sys +import tempfile + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from test_support import build_api_route_app, build_portal_route_app, install_portal_test_stubs +from utils.csrf import init_csrf + +install_portal_test_stubs() + +API_KEY = "test-api-key-12345" + + +class _Store: + def __init__(self): + self._users = { + "admin@test.com": { + "email": "admin@test.com", + "totp_secret": "SECRET", + "affiliations": ["admin"], + }, + "user@test.com": { + "email": "user@test.com", + "totp_secret": "SECRET2", + "affiliations": ["dev"], + }, + } + + def get_affiliations(self, email): + user = self._users.get(email) + return user["affiliations"] if user else [] + + def list_users(self): + return list(self._users.values()) + + def has_totp_secret(self, email): + user = self._users.get(email) + return bool(user and user.get("totp_secret")) + + def delete_user(self, email): + return self._users.pop(email, None) is not None + + def 
user_exists(self, email): + return email in self._users + + def update_affiliations(self, email, affiliations): + self._users[email]["affiliations"] = affiliations + return True + + def clear_totp_secret(self, email): + self._users[email]["totp_secret"] = "" + return True + + def create_invitation(self, email, affiliations): + return "token-1" + + +def _portal_app(): + received = tempfile.mkdtemp() + estimated = tempfile.mkdtemp() + app = build_portal_route_app( + templates_dir=os.path.join(os.path.dirname(__file__), "..", "templates"), + received_dir=received, + estimated_dir=estimated, + user_store=_Store(), + ) + init_csrf(app) + return app, (received, estimated) + + +def test_admin_post_without_csrf_token_is_rejected(): + app, temp_dirs = _portal_app() + try: + with app.test_client() as client: + with client.session_transaction() as sess: + sess["authenticated"] = True + sess["user_email"] = "admin@test.com" + sess["user_affiliations"] = ["admin"] + resp = client.post("/admin/users/user@test.com/delete") + + assert resp.status_code == 400 + finally: + for path in temp_dirs: + shutil.rmtree(path) + + +def test_admin_post_with_invalid_csrf_token_is_rejected(): + app, temp_dirs = _portal_app() + try: + with app.test_client() as client: + with client.session_transaction() as sess: + sess["authenticated"] = True + sess["user_email"] = "admin@test.com" + sess["user_affiliations"] = ["admin"] + resp = client.post( + "/admin/users/user@test.com/delete", + data={"csrf_token": "not-a-valid-token"}, + ) + + assert resp.status_code == 400 + finally: + for path in temp_dirs: + shutil.rmtree(path) + + +def test_api_ingest_is_exempt_from_csrf(): + received = tempfile.mkdtemp() + received_padata = tempfile.mkdtemp() + received_estimation_inputs = tempfile.mkdtemp() + estimated = tempfile.mkdtemp() + try: + app = build_api_route_app( + received_dir=received, + received_padata_dir=received_padata, + received_estimation_inputs_dir=received_estimation_inputs, + 
estimated_dir=estimated, + ) + app.secret_key = "test-secret" + app.config["INGEST_KEYS"] = {API_KEY: "test-runner"} + + from routes.api import api_bp + + init_csrf(app, exempt_blueprints=(api_bp,)) + + with app.test_client() as client: + resp = client.post( + "/api/ingest/result", + data=json.dumps({"code": "test"}), + headers={"X-API-Key": API_KEY, "Content-Type": "application/json"}, + ) + + assert resp.status_code == 200 + finally: + for path in (received, received_padata, received_estimation_inputs, estimated): + shutil.rmtree(path) diff --git a/result_server/tests/test_portal_list_templates.py b/result_server/tests/test_portal_list_templates.py index 91011a2..bb4a878 100644 --- a/result_server/tests/test_portal_list_templates.py +++ b/result_server/tests/test_portal_list_templates.py @@ -89,6 +89,65 @@ def test_results_template_renders_table_note(): assert "#10" in html +def test_results_template_renders_ncu_options_tooltip(): + app = build_portal_shell_app( + templates_dir=os.path.join(os.path.dirname(__file__), "..", "templates"), + ) + with app.test_request_context("/results"): + from flask import render_template + + html = render_template( + "results.html", + columns=[ + {"label": "Timestamp", "key": "timestamp"}, + {"label": "Profiler / PA", "key": "profile_summary"}, + {"label": "JSON", "key": "json_link"}, + ], + rows=[ + { + "timestamp": "2026-04-13 12:00:00", + "profile_summary": "ncu / single", + "profile_summary_meta": { + "has_profile_data": True, + "headline": "ncu / single", + "subline": "text, 1 run", + "events": [], + "ncu_options": ["--target-processes", "all", "--set", "basic", "--launch-count", "1"], + "report_kinds": ["ncu_report", "summary_text"], + }, + "data_link": "/results/padata0.tgz", + "json_link": "/results/result0.json", + "detail_link": "/results/detail/result0.json", + "filename": "result0.json", + "source_info": None, + "quality": {"level": "ready", "label": "Ready", "summary": "Breakdown is present."}, + "system": "RC_GH200", 
+ "code": "genesis", + "fom": 1.0, + "exp": "CASE0", + "fom_version": "test", + "nodes": "1", + "numproc_node": "8", + "nthreads": "9", + "ci_trigger": "push", + "pipeline_id": "10", + "source_hash": "-", + } + ], + pagination={"total": 1, "page": 1, "total_pages": 1}, + current_per_page=50, + current_system="", + current_code="", + current_exp="", + filter_options={"systems": ["RC_GH200"], "codes": ["genesis"], "exps": ["CASE0"]}, + systems_info={}, + ) + + assert "ncu / single" in html + assert "ncu options: --target-processes all --set basic --launch-count 1" in html + assert "ncu_report" in html + + def test_estimated_results_template_renders_table_note(): app = build_portal_shell_app( templates_dir=os.path.join(os.path.dirname(__file__), "..", "templates"), @@ -267,7 +326,7 @@ def test_result_compare_template_renders_headline(): assert "compareConfigData" in html -def test_usage_report_quality_section_renders_actions_and_validator_candidates(): +def test_usage_report_quality_section_renders_actions_and_improvement_candidates(): app = build_portal_shell_app( templates_dir=os.path.join(os.path.dirname(__file__), "..", "templates"), ) @@ -328,7 +387,7 @@ def test_usage_report_quality_section_renders_actions_and_validator_candidates() ) assert "Next Action" in html - assert "Validator Candidates" in html + assert "Improvement Candidates" in html assert "populate top-level source_info for provenance tracking" in html assert "source_info present, fom_breakdown present" in html assert "2: source_info is missing, fom_breakdown is missing" in html diff --git a/result_server/tests/test_result_detail_template.py b/result_server/tests/test_result_detail_template.py index 38ddbfe..f66b276 100644 --- a/result_server/tests/test_result_detail_template.py +++ b/result_server/tests/test_result_detail_template.py @@ -120,11 +120,33 @@ def test_pa_data_summary_section(self, app): assert "PA Data Summary" in html assert "fapp" in html assert "single" in html - assert 
"Tool-Specific Events" in html + assert "Tool-Specific Detail" in html assert "fapp event set: pa1" in html assert "summary_text" in html assert "pa1" in html + def test_ncu_pa_data_summary_shows_ncu_options_without_generic_events(self, app): + result = { + **FULL_RESULT, + "profile_data": { + "tool": "ncu", + "level": "single", + "report_format": "text", + "run_count": 1, + "events": [], + "ncu_options": ["--target-processes", "all", "--set", "basic", "--launch-count", "1"], + "report_kinds": ["ncu_report", "summary_text"], + }, + } + with app.test_request_context(): + html = _render_result_detail(result, FULL_QUALITY) + + assert "Tool-Specific Detail" in html + assert "ncu options: --target-processes all --set basic --launch-count 1" in html + assert "NCU Options" in html + assert "ncu_report" in html + assert ">Events<" not in html + def test_vector_data_table(self, app): with app.test_request_context(): html = _render_result_detail(FULL_RESULT, FULL_QUALITY) @@ -209,4 +231,4 @@ def test_quality_section(self, app): assert "Estimation Inputs" in html assert "top-level source tracked" in html assert "Suggested Actions" in html - assert "Validator Candidates" in html + assert "Improvement Candidates" in html diff --git a/result_server/tests/test_result_padata_route.py b/result_server/tests/test_result_padata_route.py index 1b0d6cd..1c3a950 100644 --- a/result_server/tests/test_result_padata_route.py +++ b/result_server/tests/test_result_padata_route.py @@ -50,3 +50,27 @@ def test_results_route_serves_padata_from_received_padata_dir(client, tmp_dirs): resp = client.get(f"/results/{tgz_name}") assert resp.status_code == 200 assert resp.data == b"fake tgz content" + + +def test_results_route_blocks_confidential_padata_matched_by_server_uuid(client, tmp_dirs): + received, received_padata = tmp_dirs + uid = "12345678-1234-1234-1234-123456789abc" + tgz_name = f"padata_20250101_120000_{uid}.tgz" + + with open(os.path.join(received, "result0.json"), "w", encoding="utf-8") 
as f: + json.dump( + { + "code": "qws", + "system": "Fugaku", + "FOM": 1.0, + "_server_uuid": uid, + "confidential": ["dev"], + }, + f, + ) + + with open(os.path.join(received_padata, tgz_name), "wb") as f: + f.write(b"fake tgz content") + + resp = client.get(f"/results/{tgz_name}") + assert resp.status_code == 403 diff --git a/result_server/tests/test_result_quality_validator.py b/result_server/tests/test_result_quality_validator.py deleted file mode 100644 index e5c4b47..0000000 --- a/result_server/tests/test_result_quality_validator.py +++ /dev/null @@ -1,164 +0,0 @@ -import json -import os -import subprocess -import sys -import importlib.util -from pathlib import Path - - -SCRIPT_PATH = os.path.join( - os.path.dirname(__file__), - "..", - "..", - "scripts", - "validate_result_quality.py", -) - - -def _load_validator_module(): - spec = importlib.util.spec_from_file_location( - "validate_result_quality_module", - Path(SCRIPT_PATH), - ) - module = importlib.util.module_from_spec(spec) - assert spec.loader is not None - spec.loader.exec_module(module) - return module - - -def _write_json(path, data): - with open(path, "w", encoding="utf-8") as handle: - json.dump(data, handle, ensure_ascii=False) - - -def test_validator_reports_quality_and_actions(tmp_path): - _write_json( - tmp_path / "result_basic.json", - { - "code": "qws", - "system": "Fugaku", - "FOM": 1.0, - }, - ) - - completed = subprocess.run( - [sys.executable, SCRIPT_PATH, str(tmp_path)], - capture_output=True, - text=True, - check=False, - ) - - assert completed.returncode == 0 - assert "Scanned files: 1" in completed.stdout - assert "[Basic]" in completed.stdout - assert "policy-tier: strict" in completed.stdout - assert "populate top-level source_info for provenance tracking" in completed.stdout - assert "validator-candidate: source_info present" in completed.stdout - assert "policy-candidate: source_info present" in completed.stdout - - -def 
test_validator_can_emit_json_and_skip_non_result_files(tmp_path): - _write_json( - tmp_path / "result_ready.json", - { - "code": "qws", - "system": "Fugaku", - "FOM": 1.0, - "fom_breakdown": { - "sections": [{"name": "solver", "time": 1.0, "estimation_package": "identity"}], - "overlaps": [], - }, - }, - ) - _write_json(tmp_path / "note.json", {"hello": "world"}) - - completed = subprocess.run( - [sys.executable, SCRIPT_PATH, "--format", "json", str(tmp_path)], - capture_output=True, - text=True, - check=False, - ) - - assert completed.returncode == 0 - payload = json.loads(completed.stdout) - assert payload["scanned_files"] == 2 - assert payload["validated_results"] == 1 - assert payload["skipped_files"] == 1 - assert payload["default_tier"] == "relaxed" - ready_row = next(row for row in payload["rows"] if row["status"] == "ok") - assert ready_row["quality_level"] == "ready" - assert ready_row["policy_tier"] == "strict" - skipped_row = next(row for row in payload["rows"] if row["status"] == "skipped") - assert "missing FOM or system" in skipped_row["reason"] - - -def test_validator_fail_on_candidate_returns_nonzero(tmp_path): - _write_json( - tmp_path / "result_basic.json", - { - "code": "qws", - "system": "Fugaku", - "FOM": 1.0, - }, - ) - - completed = subprocess.run( - [sys.executable, SCRIPT_PATH, "--fail-on", "candidate", str(tmp_path)], - capture_output=True, - text=True, - check=False, - ) - - assert completed.returncode == 1 - - -def test_validator_fail_on_policy_uses_internal_tier(tmp_path): - _write_json( - tmp_path / "result_genesis.json", - { - "code": "genesis", - "system": "RC_GENOA", - "FOM": 1.0, - "source_info": {"source_type": "mystery"}, - }, - ) - - completed = subprocess.run( - [sys.executable, SCRIPT_PATH, "--fail-on", "policy", str(tmp_path)], - capture_output=True, - text=True, - check=False, - ) - - assert completed.returncode == 1 - assert "policy-tier: standard" in completed.stdout - assert "policy-candidate: recognized 
def test_profile_summary_keeps_ncu_options_separate_from_events(self, flask_app, tmp_dir):
    """An ncu result keeps ``events`` empty and exposes ``ncu_options``.

    Writes a single result JSON whose profile_data has ncu options but no
    events, loads the public results table, and checks the summary metadata.
    """
    result_name = f"result_20250101_120000_{uuid.uuid4()}.json"
    profile_data = {
        "tool": "ncu",
        "level": "single",
        "report_format": "text",
        "run_count": 1,
        "events": [],
        "ncu_options": ["--target-processes", "all", "--set", "basic", "--launch-count", "1"],
        "report_kinds": ["ncu_report", "summary_text"],
    }
    payload = {
        "code": "genesis",
        "system": "RC_GH200",
        "Exp": "CASE0",
        "FOM": 1.0,
        "profile_data": profile_data,
    }
    _write_json(tmp_dir, result_name, payload)

    with flask_app.test_request_context():
        loaded = load_results_table(tmp_dir, public_only=True)
    rows = loaded[0]

    assert len(rows) == 1
    only_row = rows[0]
    summary_meta = only_row["profile_summary_meta"]
    assert only_row["profile_summary"] == "ncu / single"
    assert summary_meta["events"] == []
    assert summary_meta["ncu_options"][:2] == ["--target-processes", "all"]
    assert "ncu_report" in summary_meta["report_kinds"]
class _BrokenRedis:
    """Test double for a Redis connection whose ``ping`` always fails."""

    def ping(self):
        """Always raise ``ConnectionError("redis down")``."""
        failure = ConnectionError("redis down")
        raise failure
@pytest.fixture
def auth_app():
    """Yield a Flask app wired for the auth Redis availability tests.

    The app gets a stub user store, the home routes, the admin/auth/results/
    estimated blueprints and a ``/systemlist`` route. RECEIVED_DIR and
    ESTIMATED_DIR both point at one temporary directory, which is removed
    after the test finishes.
    """
    templates = os.path.join(os.path.dirname(__file__), "..", "templates")
    app = Flask(__name__, template_folder=templates)
    app.secret_key = "test-secret"
    app.config.update(TESTING=True, USER_STORE=_StubUserStore())

    from routes.admin import admin_bp
    from routes.auth import auth_bp
    from routes.estimated import estimated_bp
    from routes.home import register_home_routes
    from routes.results import results_bp

    register_home_routes(app)
    mounted_blueprints = (
        (admin_bp, "/admin"),
        (auth_bp, "/auth"),
        (results_bp, "/results"),
        (estimated_bp, "/estimated"),
    )
    for blueprint, prefix in mounted_blueprints:
        app.register_blueprint(blueprint, url_prefix=prefix)

    scratch_dir = tempfile.mkdtemp()
    for config_key in ("RECEIVED_DIR", "ESTIMATED_DIR"):
        app.config[config_key] = scratch_dir

    @app.route("/systemlist")
    def systemlist():
        return "systems"

    yield app
    shutil.rmtree(scratch_dir)
def parse_ingest_keys(env: Mapping[str, str] | None = None) -> dict[str, str]:
    """Parse RESULT_SERVER_KEYS/RESULT_SERVER_KEY into {api_key: runner_id}.

    ``RESULT_SERVER_KEYS`` is a comma-separated list of ``runner_id:key``
    entries. Blank entries are skipped; entries without a colon or with an
    empty runner_id/key are ignored with a ``RuntimeWarning``. When the same
    key appears more than once, the last runner_id wins.

    ``RESULT_SERVER_KEY`` (deprecated, emits ``DeprecationWarning``) is mapped
    to the runner_id ``"default"`` unless that key is already defined by
    ``RESULT_SERVER_KEYS``.

    Args:
        env: Mapping to read the variables from. Only ``None`` falls back to
            ``os.environ``; an explicitly supplied mapping is always honoured,
            even when it is empty.

    Returns:
        A dict mapping each accepted API key to its runner_id.
    """
    # Compare against None rather than truthiness: an empty mapping passed by
    # a caller (or a test) must not silently pick up keys from the real
    # process environment.
    if env is None:
        env = os.environ
    keys: dict[str, str] = {}

    multi_key_spec = env.get("RESULT_SERVER_KEYS", "").strip()
    if multi_key_spec:
        for entry in multi_key_spec.split(","):
            if not entry.strip():
                continue
            if ":" not in entry:
                warnings.warn(
                    "Ignoring RESULT_SERVER_KEYS entry without runner_id:key format.",
                    RuntimeWarning,
                    stacklevel=2,
                )
                continue
            # Split on the first colon only so the key itself may contain ':'.
            runner_id, key = (part.strip() for part in entry.split(":", 1))
            if not runner_id or not key:
                warnings.warn(
                    "Ignoring RESULT_SERVER_KEYS entry with empty runner_id or key.",
                    RuntimeWarning,
                    stacklevel=2,
                )
                continue
            keys[key] = runner_id

    legacy_key = env.get("RESULT_SERVER_KEY", "").strip()
    if legacy_key:
        warnings.warn(
            "RESULT_SERVER_KEY is deprecated; use RESULT_SERVER_KEYS=runner-id:key.",
            DeprecationWarning,
            stacklevel=2,
        )
        # Never override a runner_id explicitly assigned via RESULT_SERVER_KEYS.
        keys.setdefault(legacy_key, "default")

    return keys
def init_csrf(app, *, exempt_blueprints=()):
    """Bind the shared CSRF extension to *app*.

    Each blueprint in *exempt_blueprints* is marked exempt first, then the
    module-level ``csrf`` object is initialised on the app and returned.
    """
    protection = csrf
    for exempt_bp in exempt_blueprints:
        protection.exempt(exempt_bp)
    protection.init_app(app)
    return protection
rows.append({"label": "NCU Options", "value": " ".join(ncu_options)}) + if report_kinds: + rows.append({"label": "Report Kinds", "value": ", ".join(report_kinds)}) + return rows + +def _build_tool_specific_detail(profile_data): + if profile_data.get("tool") == "ncu": + ncu_options = profile_data.get("ncu_options") or [] + if ncu_options: + return f"ncu options: {' '.join(ncu_options)}" + return "ncu options recorded in archive metadata when available" -def _build_tool_specific_events_description(profile_data): if profile_data.get("tool") != "fapp": - return "tool-specific event set" + return "tool-specific metadata" level = profile_data.get("level") mapping = { @@ -103,7 +117,7 @@ def _build_quality_rows(quality): }, {"label": "Estimation Inputs", "value": f"{stats.get('artifact_count', 0)} artifact reference(s)"}, {"label": "Suggested Actions", "list": quality.get("suggested_actions") or ["none"]}, - {"label": "Validator Candidates", "list": quality.get("validator_candidates") or ["none"]}, + {"label": "Improvement Candidates", "list": quality.get("validator_candidates") or ["none"]}, {"label": "Warnings", "list": warnings or ["none"]}, ] diff --git a/result_server/utils/result_file.py b/result_server/utils/result_file.py index 3f8dfc7..95825ac 100644 --- a/result_server/utils/result_file.py +++ b/result_server/utils/result_file.py @@ -44,11 +44,22 @@ def get_file_confidential_tags(filename: str, save_dir: str): if not uuid_match: return [] - uuid = uuid_match.group(0) + uuid = uuid_match.group(0).lower() + tags = [] for json_filename in os.listdir(save_dir): - if json_filename.endswith(".json") and uuid in json_filename: - return _read_confidential_from_json(json_filename, save_dir) - return [] + if not json_filename.endswith(".json"): + continue + if uuid in json_filename.lower(): + tags.extend(_read_confidential_from_json(json_filename, save_dir)) + continue + + data = _read_json(json_filename, save_dir) + if not isinstance(data, dict): + continue + server_uuid 
def _extract_confidential_tags(data):
    """Return the normalised confidential tags declared in a result document.

    ``data["confidential"]`` may be a single string or a list of values.
    Every candidate is converted to ``str`` and stripped; blank values and
    the literal ``"null"`` (case-insensitive) are dropped. Any other type,
    or a missing key, yields an empty list.

    Both shapes are normalised the same way: whitespace is stripped *before*
    the blank/"null" check, so a list entry such as ``" null "`` or ``" "``
    no longer leaks through as the tag ``"null"`` or ``""``.

    Args:
        data: Parsed result JSON object (a dict).

    Returns:
        A list of tag strings in their original order.
    """
    confidential_value = data.get("confidential", None)

    if isinstance(confidential_value, str):
        candidates = [confidential_value]
    elif isinstance(confidential_value, list):
        # Falsy entries (None, "", 0, ...) were never treated as tags.
        candidates = [item for item in confidential_value if item]
    else:
        return []

    tags = []
    for item in candidates:
        tag = str(item).strip()
        if tag and tag.lower() != "null":
            tags.append(tag)
    return tags
a/result_server/utils/result_table_rows.py +++ b/result_server/utils/result_table_rows.py @@ -105,6 +105,8 @@ def _format_profile_summary(profile_data): return " / ".join(headline_parts) if headline_parts else "profile data" +# The template expects a stable, display-ready shape even when older result JSON +# files do not have profile_data or when only one profiler family is present. def _build_profile_summary_meta(profile_data): if not isinstance(profile_data, dict) or not profile_data: return { @@ -112,6 +114,7 @@ def _build_profile_summary_meta(profile_data): "headline": "", "subline": "", "events": [], + "ncu_options": [], "report_kinds": [], } @@ -128,5 +131,6 @@ def _build_profile_summary_meta(profile_data): "headline": _format_profile_summary(profile_data), "subline": ", ".join(subline_parts), "events": profile_data.get("events") if isinstance(profile_data.get("events"), list) else [], + "ncu_options": profile_data.get("ncu_options") if isinstance(profile_data.get("ncu_options"), list) else [], "report_kinds": profile_data.get("report_kinds") if isinstance(profile_data.get("report_kinds"), list) else [], } diff --git a/result_server/utils/site_diagnostics.py b/result_server/utils/site_diagnostics.py index ec4456a..7ec4125 100644 --- a/result_server/utils/site_diagnostics.py +++ b/result_server/utils/site_diagnostics.py @@ -128,15 +128,24 @@ def build_site_diagnostics( for row in system_info_rows if (row.get("system") or "").strip() } + public_systems = [ + (row.get("system") or "").strip() + for row in system_info_rows + if (row.get("system") or "").strip() + ] missing_queue_definitions = [] missing_system_info = [] + public_missing_system_definitions = [] + public_missing_queue_definitions = [] registered_systems = [] + system_rows_by_name = {} for row in systems: system = (row.get("system") or "").strip() queue = (row.get("queue") or "").strip() registered_systems.append(system) + system_rows_by_name[system] = row if queue and queue not in queue_names: 
def build_site_config_preflight_failures(diagnostics):
    """Turn public-system diagnostics into human-readable failure messages.

    Reads ``public_missing_system_definitions`` (system names) and
    ``public_missing_queue_definitions`` (dicts with ``system``/``queue``)
    from *diagnostics*. Returns one message per entry, system problems first;
    an empty list means nothing was reported.
    """
    undefined_systems = diagnostics.get("public_missing_system_definitions", [])
    messages = [
        f"system_info.csv exposes {name}, but config/system.csv has no matching system."
        for name in undefined_systems
    ]

    for problem in diagnostics.get("public_missing_queue_definitions", []):
        name = problem.get("system", "")
        queue_name = problem.get("queue", "")
        prefix = f"system_info.csv exposes {name}, but config/system.csv "
        if not queue_name:
            # The system row exists but names no queue at all.
            messages.append(prefix + "does not set a queue.")
            continue
        messages.append(
            prefix
            + f"references queue {queue_name} "
            + "without a matching config/queue.csv definition."
        )

    return messages
# Map a BenchKit profiler level to its preset Nsight Compute option string.
# Prints the preset on stdout; an unknown level prints an error on stderr and
# returns 1. Extra flags are supplied separately through BK_PROFILER_ARGS.
bk_profiler_ncu_level_args() {
  local _bk_ncu_preset
  case "$1" in
    single)   _bk_ncu_preset="--set basic --launch-count 1" ;;
    simple)   _bk_ncu_preset="--set basic --launch-count 5" ;;
    standard) _bk_ncu_preset="--set full --launch-count 1" ;;
    detailed) _bk_ncu_preset="--set full --nvtx" ;;
    *)
      echo "bk_profiler_ncu_level_args: unsupported level '$1'" >&2
      return 1
      ;;
  esac
  printf '%s\n' "$_bk_ncu_preset"
}
# bk_json_escape - Print $1 escaped for use inside a JSON string literal.
#
# Backslash and double quote are escaped first, then the control characters
# that have short JSON escapes (\b \f \t \r \n). Every remaining C0 control
# character (U+0001-U+001F) is written as \u00XX, because JSON forbids raw
# control characters inside strings and profiler arguments come from CI
# variables. NUL cannot occur in a bash variable, so it needs no handling.
# The surrounding quotes are not printed.
bk_json_escape() {
  _bk_json_value="$1"
  # Backslash must be handled first so later escapes are not doubled.
  _bk_json_value=${_bk_json_value//\\/\\\\}
  _bk_json_value=${_bk_json_value//\"/\\\"}
  _bk_json_value=${_bk_json_value//$'\b'/\\b}
  _bk_json_value=${_bk_json_value//$'\f'/\\f}
  _bk_json_value=${_bk_json_value//$'\t'/\\t}
  _bk_json_value=${_bk_json_value//$'\r'/\\r}
  _bk_json_value=${_bk_json_value//$'\n'/\\n}
  # Remaining control characters have no short form; emit \u00XX.
  for _bk_json_code in 01 02 03 04 05 06 07 0b 0e 0f \
                       10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f; do
    printf -v _bk_json_char "\\x${_bk_json_code}"
    _bk_json_value=${_bk_json_value//"$_bk_json_char"/\\u00${_bk_json_code}}
  done
  printf '%s' "$_bk_json_value"
}
bk_profiler_write_meta() { _bk_meta_stage_dir="$1" _bk_meta_tool="$2" @@ -759,6 +845,8 @@ bk_profiler_write_meta() { _bk_meta_report_format="$4" _bk_meta_run_names="$5" _bk_meta_run_events="$6" + _bk_meta_profiler_args="$7" + _bk_meta_report_args="$8" _bk_meta_file="${_bk_meta_stage_dir}/meta.json" IFS=',' read -r -a _bk_meta_names <<< "$_bk_meta_run_names" IFS=',' read -r -a _bk_meta_events <<< "$_bk_meta_run_events" @@ -769,19 +857,94 @@ bk_profiler_write_meta() { printf ' "level": "%s",\n' "$_bk_meta_level" printf ' "report_format": "%s",\n' "$_bk_meta_report_format" printf ' "raw_dir": "raw",\n' + printf ' "measurement": {\n' + printf ' "run_count": %s,\n' "${#_bk_meta_names[@]}" + printf ' "profiler_args": ' + bk_json_string "$_bk_meta_profiler_args" + printf ',\n' + printf ' "report_args": ' + bk_json_string "$_bk_meta_report_args" + case "$_bk_meta_tool" in + fapp) + printf ',\n' + printf ' "fapp_events": ' + bk_json_string_array "${_bk_meta_events[@]}" + printf '\n' + ;; + ncu) + _bk_meta_ncu_level_args=$(bk_profiler_ncu_level_args "$_bk_meta_level") + read -r -a _bk_meta_ncu_level_arg_array <<< "$_bk_meta_ncu_level_args" + printf ',\n' + printf ' "ncu_options": ' + bk_json_string_array "--target-processes" "all" "${_bk_meta_ncu_level_arg_array[@]}" + printf '\n' + ;; + *) + printf '\n' + ;; + esac + printf ' },\n' printf ' "runs": [\n' for _bk_meta_idx in "${!_bk_meta_names[@]}"; do _bk_meta_name="${_bk_meta_names[$_bk_meta_idx]}" _bk_meta_event="${_bk_meta_events[$_bk_meta_idx]:-}" - _bk_meta_text_path="reports/fapp_A_${_bk_meta_name}.txt" - _bk_meta_csv_path="reports/cpu_pa_${_bk_meta_name}.csv" - _bk_meta_text_abs="${_bk_meta_stage_dir}/${_bk_meta_text_path}" - _bk_meta_csv_abs="${_bk_meta_stage_dir}/${_bk_meta_csv_path}" + case "$_bk_meta_tool" in + fapp) + _bk_meta_text_path="reports/fapp_A_${_bk_meta_name}.txt" + _bk_meta_csv_path="reports/cpu_pa_${_bk_meta_name}.csv" + _bk_meta_text_abs="${_bk_meta_stage_dir}/${_bk_meta_text_path}" + 
_bk_meta_csv_abs="${_bk_meta_stage_dir}/${_bk_meta_csv_path}" + _bk_meta_ncu_report_path="" + _bk_meta_ncu_report_abs="" + ;; + ncu) + _bk_meta_text_path="reports/ncu_import_${_bk_meta_name}.txt" + _bk_meta_csv_path="" + _bk_meta_text_abs="${_bk_meta_stage_dir}/${_bk_meta_text_path}" + _bk_meta_csv_abs="" + _bk_meta_ncu_report_abs=$(bk_profiler_find_ncu_report "${_bk_meta_stage_dir}/raw/${_bk_meta_name}" || true) + if [ -n "$_bk_meta_ncu_report_abs" ]; then + _bk_meta_ncu_report_path="${_bk_meta_ncu_report_abs#${_bk_meta_stage_dir}/}" + else + _bk_meta_ncu_report_path="" + fi + ;; + *) + _bk_meta_text_path="" + _bk_meta_csv_path="" + _bk_meta_text_abs="" + _bk_meta_csv_abs="" + _bk_meta_ncu_report_path="" + _bk_meta_ncu_report_abs="" + ;; + esac printf ' {\n' printf ' "name": "%s",\n' "$_bk_meta_name" printf ' "event": "%s",\n' "$_bk_meta_event" printf ' "raw_path": "raw/%s",\n' "$_bk_meta_name" + printf ' "measurement": {\n' + case "$_bk_meta_tool" in + fapp) + printf ' "counter": ' + bk_json_string "$_bk_meta_event" + printf ',\n' + printf ' "options": ' + bk_json_string_array "-C" "-d" "raw/${_bk_meta_name}" "-Hevent=${_bk_meta_event}" + printf '\n' + ;; + ncu) + _bk_meta_ncu_level_args=$(bk_profiler_ncu_level_args "$_bk_meta_level") + read -r -a _bk_meta_ncu_level_arg_array <<< "$_bk_meta_ncu_level_args" + printf ' "options": ' + bk_json_string_array "-o" "raw/${_bk_meta_name}/profile" "--target-processes" "all" "${_bk_meta_ncu_level_arg_array[@]}" + printf '\n' + ;; + *) + printf ' "options": []\n' + ;; + esac + printf ' },\n' printf ' "reports": [\n' _bk_meta_has_report=0 if [ -f "$_bk_meta_text_abs" ]; then @@ -795,6 +958,13 @@ bk_profiler_write_meta() { printf ' {"kind": "cpu_pa_csv", "path": "%s"}' "$_bk_meta_csv_path" _bk_meta_has_report=1 fi + if [ -n "$_bk_meta_ncu_report_path" ] && [ -f "$_bk_meta_ncu_report_abs" ]; then + if [ "$_bk_meta_has_report" -eq 1 ]; then + printf ',\n' + fi + printf ' {"kind": "ncu_report", "path": "%s"}' 
"$_bk_meta_ncu_report_path" + _bk_meta_has_report=1 + fi if [ "$_bk_meta_has_report" -eq 1 ]; then printf '\n' fi @@ -810,6 +980,9 @@ bk_profiler_write_meta() { } > "$_bk_meta_file" } +# Optional hooks let site scripts wrap individual profiler runs, for example to +# load per-run modules or print scheduler diagnostics. Hook arguments are: +# tool, level, run name, fapp event or ncu level, then the profiled command. bk_profiler_call_optional_hook() { _bk_hook_name="$1" shift || true @@ -900,6 +1073,9 @@ bk_profiler() { mkdir -p "$_bk_stage_dir/reports" _bk_profiler_run_names="" _bk_profiler_run_events="" + _bk_profiler_status=0 + _bk_profiler_extra_args="${BK_PROFILER_ARGS:-}" + _bk_profiler_report_extra_args="${BK_PROFILER_REPORT_ARGS:-}" case "$_bk_profiler_tool" in fapp) @@ -912,10 +1088,20 @@ bk_profiler() { mkdir -p "$_bk_fapp_rep_dir" echo "bk_profiler[fapp]: starting ${_bk_fapp_rep_name} event=${_bk_fapp_event}" >&2 bk_profiler_call_optional_hook bk_profiler_before_run "$_bk_profiler_tool" "$_bk_profiler_level" "$_bk_fapp_rep_name" "$_bk_fapp_event" "$@" || return 1 + # BK_PROFILER_ARGS is intentionally word-split into fapp options. # shellcheck disable=SC2086 - fapp -C -d "$_bk_fapp_rep_dir" ${BK_PROFILER_ARGS:-} -Hevent="${_bk_fapp_event}" "$@" + if fapp -C -d "$_bk_fapp_rep_dir" ${_bk_profiler_extra_args} -Hevent="${_bk_fapp_event}" "$@"; then + _bk_fapp_status=0 + else + _bk_fapp_status=$? 
+ fi bk_profiler_call_optional_hook bk_profiler_after_run "$_bk_profiler_tool" "$_bk_profiler_level" "$_bk_fapp_rep_name" "$_bk_fapp_event" "$@" || return 1 - echo "bk_profiler[fapp]: completed ${_bk_fapp_rep_name} event=${_bk_fapp_event}" >&2 + if [ "$_bk_fapp_status" -eq 0 ]; then + echo "bk_profiler[fapp]: completed ${_bk_fapp_rep_name} event=${_bk_fapp_event}" >&2 + else + echo "bk_profiler[fapp]: failed ${_bk_fapp_rep_name} event=${_bk_fapp_event} status=${_bk_fapp_status}" >&2 + _bk_profiler_status="$_bk_fapp_status" + fi cp -R "$_bk_fapp_rep_dir" "$_bk_stage_dir/raw/${_bk_fapp_rep_name}" if [ -n "$_bk_profiler_run_names" ]; then _bk_profiler_run_names="${_bk_profiler_run_names},${_bk_fapp_rep_name}" @@ -925,10 +1111,44 @@ bk_profiler() { _bk_profiler_run_events="${_bk_fapp_event}" fi _bk_fapp_run_index=$((_bk_fapp_run_index + 1)) + if [ "$_bk_fapp_status" -ne 0 ]; then + break + fi done ;; + ncu) + if ! command -v ncu >/dev/null 2>&1; then + echo "bk_profiler[ncu]: ncu not found in PATH" >&2 + return 1 + fi + _bk_ncu_rep_name="rep1" + _bk_ncu_rep_dir="${_bk_profiler_dir}/${_bk_ncu_rep_name}" + _bk_ncu_profile_base="${_bk_ncu_rep_dir}/profile" + mkdir -p "$_bk_ncu_rep_dir" + _bk_ncu_level_args=$(bk_profiler_ncu_level_args "$_bk_profiler_level") || return 1 + echo "bk_profiler[ncu]: starting ${_bk_ncu_rep_name} level=${_bk_profiler_level}" >&2 + bk_profiler_call_optional_hook bk_profiler_before_run "$_bk_profiler_tool" "$_bk_profiler_level" "$_bk_ncu_rep_name" "$_bk_profiler_level" "$@" || return 1 + # BK_PROFILER_ARGS is intentionally word-split into ncu options. + # shellcheck disable=SC2086 + if ncu -o "$_bk_ncu_profile_base" --target-processes all ${_bk_ncu_level_args} ${_bk_profiler_extra_args} "$@"; then + _bk_profiler_status=0 + else + _bk_profiler_status=$? 
+ fi + bk_profiler_call_optional_hook bk_profiler_after_run "$_bk_profiler_tool" "$_bk_profiler_level" "$_bk_ncu_rep_name" "$_bk_profiler_level" "$@" || return 1 + if [ "$_bk_profiler_status" -eq 0 ]; then + echo "bk_profiler[ncu]: completed ${_bk_ncu_rep_name} level=${_bk_profiler_level}" >&2 + else + echo "bk_profiler[ncu]: failed ${_bk_ncu_rep_name} level=${_bk_profiler_level} status=${_bk_profiler_status}" >&2 + fi + cp -R "$_bk_ncu_rep_dir" "$_bk_stage_dir/raw/${_bk_ncu_rep_name}" + _bk_profiler_run_names="${_bk_ncu_rep_name}" + _bk_profiler_run_events="${_bk_profiler_level}" + ;; esac + # Report import/postprocess is best-effort: keep the raw archive even when a + # text/CSV summary cannot be produced on the run node. case "$_bk_profiler_tool" in fapp) if _bk_fapp_post_cmd=$(bk_profiler_fapp_postprocess_command); then @@ -937,23 +1157,46 @@ bk_profiler() { for _bk_fapp_rep_name in "${_bk_fapp_run_name_list[@]}"; do _bk_fapp_rep_dir="${_bk_profiler_dir}/${_bk_fapp_rep_name}" if [ "$_bk_profiler_report_format" = "text" ] || [ "$_bk_profiler_report_format" = "both" ]; then + # BK_PROFILER_REPORT_ARGS is intentionally word-split into fapp/fapppx options. # shellcheck disable=SC2086 - "$_bk_fapp_post_cmd" -A -d "$_bk_fapp_rep_dir" ${BK_PROFILER_REPORT_ARGS:-} > "$_bk_stage_dir/reports/fapp_A_${_bk_fapp_rep_name}.txt" 2>&1 || true + "$_bk_fapp_post_cmd" -A -d "$_bk_fapp_rep_dir" ${_bk_profiler_report_extra_args} > "$_bk_stage_dir/reports/fapp_A_${_bk_fapp_rep_name}.txt" 2>&1 || true fi if [ "$_bk_profiler_report_format" = "csv" ] || [ "$_bk_profiler_report_format" = "both" ]; then + # BK_PROFILER_REPORT_ARGS is intentionally word-split into fapp/fapppx options. 
# shellcheck disable=SC2086 - "$_bk_fapp_post_cmd" -A -d "$_bk_fapp_rep_dir" ${BK_PROFILER_REPORT_ARGS:-} -Icpupa -tcsv -o "$_bk_stage_dir/reports/cpu_pa_${_bk_fapp_rep_name}.csv" >/dev/null 2>&1 || true + "$_bk_fapp_post_cmd" -A -d "$_bk_fapp_rep_dir" ${_bk_profiler_report_extra_args} -Icpupa -tcsv -o "$_bk_stage_dir/reports/cpu_pa_${_bk_fapp_rep_name}.csv" >/dev/null 2>&1 || true fi done else echo "fapp/fapppx not found in PATH" > "$_bk_stage_dir/reports/fapp_A_missing.txt" fi ;; + ncu) + IFS=',' read -r -a _bk_ncu_run_name_list <<< "$_bk_profiler_run_names" + for _bk_ncu_rep_name in "${_bk_ncu_run_name_list[@]}"; do + _bk_ncu_report_file=$(bk_profiler_find_ncu_report "$_bk_profiler_dir/${_bk_ncu_rep_name}" || true) + if [ -n "$_bk_ncu_report_file" ] && { [ "$_bk_profiler_report_format" = "text" ] || [ "$_bk_profiler_report_format" = "both" ]; }; then + # BK_PROFILER_REPORT_ARGS is intentionally word-split into ncu --import options. + # shellcheck disable=SC2086 + ncu --import "$_bk_ncu_report_file" --page details ${_bk_profiler_report_extra_args} > "$_bk_stage_dir/reports/ncu_import_${_bk_ncu_rep_name}.txt" 2>&1 || true + fi + done + ;; esac - bk_profiler_write_meta "$_bk_stage_dir" "$_bk_profiler_tool" "$_bk_profiler_level" "$_bk_profiler_report_format" "$_bk_profiler_run_names" "$_bk_profiler_run_events" - tar -czf "$_bk_profiler_archive" "$_bk_stage_dir" + # Preserve the profiler command status after metadata/archive creation. If the + # archive itself cannot be written, that failure is more actionable to CI. + bk_profiler_write_meta "$_bk_stage_dir" "$_bk_profiler_tool" "$_bk_profiler_level" "$_bk_profiler_report_format" "$_bk_profiler_run_names" "$_bk_profiler_run_events" "$_bk_profiler_extra_args" "$_bk_profiler_report_extra_args" + if tar -czf "$_bk_profiler_archive" "$_bk_stage_dir"; then + _bk_profiler_archive_status=0 + else + _bk_profiler_archive_status=$? 
+ fi rm -rf "$_bk_stage_dir" + if [ "$_bk_profiler_archive_status" -ne 0 ]; then + return "$_bk_profiler_archive_status" + fi + return "$_bk_profiler_status" } # bk_emit_overlap - Backward-compatible wrapper for overlap-like section timing. diff --git a/scripts/job_functions.sh b/scripts/job_functions.sh index 7da16a6..51cad84 100644 --- a/scripts/job_functions.sh +++ b/scripts/job_functions.sh @@ -64,6 +64,24 @@ get_system_queue_group() { return 0 } +# Queue templates can request aggregate CPU sockets or GPU cards. Pull the +# per-node values from system_info.csv instead of duplicating them in queue.csv. +get_system_cpu_per_node() { + local system="$1" + local info_file="${SYSTEM_INFO_FILE:-config/system_info.csv}" + awk -F, -v s="$system" '$1==s {print $4}' "$info_file" + return 0 +} + +# A dash in system_info.csv means "no GPU"; matrix_generate.sh normalizes +# non-numeric values to zero before doing scheduler arithmetic. +get_system_gpu_per_node() { + local system="$1" + local info_file="${SYSTEM_INFO_FILE:-config/system_info.csv}" + awk -F, -v s="$system" '$1==s {print $7}' "$info_file" + return 0 +} + # System_CSVからtag_buildを取得する # $1: システム名 # mode=nativeの場合は空文字を返す(tag_buildカラム自体が空) diff --git a/scripts/matrix_generate.sh b/scripts/matrix_generate.sh index 396aa66..3f922b7 100644 --- a/scripts/matrix_generate.sh +++ b/scripts/matrix_generate.sh @@ -10,6 +10,7 @@ set -euo pipefail SYSTEM_FILE="config/system.csv" QUEUE_FILE="config/queue.csv" +SYSTEM_INFO_FILE="config/system_info.csv" OUTPUT_FILE=".gitlab-ci.generated.yml" source ./scripts/job_functions.sh @@ -74,7 +75,16 @@ for listfile in programs/*/list.csv; do job_prefix="${program}_${system}_N${nodes}_P${numproc_node}_T${nthreads}" program_path="$program_dir" - export elapse nodes queue_group numproc_node nthreads + # queue.csv templates can use both direct list.csv values and derived + # scheduler quantities such as total ranks, CPU sockets, and GPU cards. 
+ proc=$((nodes * numproc_node)) + cpu_per_node=$(get_system_cpu_per_node "$system") + gpu_per_node=$(get_system_gpu_per_node "$system") + [[ "$cpu_per_node" =~ ^[0-9]+$ ]] || cpu_per_node=0 + [[ "$gpu_per_node" =~ ^[0-9]+$ ]] || gpu_per_node=0 + cpu_sockets=$((nodes * cpu_per_node)) + gpu_cards=$((nodes * gpu_per_node)) + export elapse nodes queue_group numproc_node nthreads proc cpu_per_node gpu_per_node cpu_sockets gpu_cards read -r submit_cmd template <<< "$(get_queue_template "$system")" if [[ -z "$submit_cmd" || -z "$template" ]]; then @@ -211,4 +221,3 @@ ${job_prefix}_build_run: done < "$listfile" done - diff --git a/scripts/result.sh b/scripts/result.sh index 3f11e3e..b82a274 100644 --- a/scripts/result.sh +++ b/scripts/result.sh @@ -24,6 +24,56 @@ node_count='how_many' numproc_node="" nthreads="" +# Read the lightweight profiler manifest from a padata archive and turn it into +# the small profile_data block stored in result*.json. Missing or unreadable +# archives are ignored so FOM result generation is not blocked by profiler +# postprocessing problems. +build_profile_data_summary() { + local tgz_file="$1" + + if [[ ! -f "$tgz_file" ]]; then + printf '%s' "" + return 0 + fi + + local meta_member + meta_member=$(tar -tzf "$tgz_file" 2>/dev/null | grep 'meta\.json$' | head -n 1 || true) + if [[ -z "$meta_member" ]]; then + printf '%s' "" + return 0 + fi + + local meta_json + meta_json=$(tar -xOf "$tgz_file" "$meta_member" 2>/dev/null || true) + if [[ -z "$meta_json" ]]; then + printf '%s' "" + return 0 + fi + + echo "$meta_json" | jq -c ' + { + tool: .tool, + level: .level, + report_format: .report_format, + raw_dir: .raw_dir, + run_count: ((.runs // []) | length), + events: ( + if .tool == "fapp" + then ((.runs // []) | map(.event) | map(select(. != null and . 
!= ""))) + else [] + end + ), + ncu_options: ( + if .tool == "ncu" and ((.measurement.ncu_options // null) | type) == "array" + then .measurement.ncu_options + else [] + end + ), + report_kinds: ((.runs // []) | map(.reports // []) | add | map(.kind) | unique) + } + ' 2>/dev/null || true +} + # Read source_info.env if it exists (written by bk_fetch_source in build stage) source_info_block="null" if [ -f results/source_info.env ]; then @@ -99,6 +149,16 @@ write_result_json() { \"pipeline_id\": $pipeline_id" fi + # Attach the profiler summary that matches this FOM index. fapp exposes + # counter events, while ncu exposes the Nsight Compute option preset. + local profile_data_block="" + local profile_data_summary="" + profile_data_summary=$(build_profile_data_summary "results/padata${idx}.tgz") + if [ -n "$profile_data_summary" ]; then + profile_data_block=", + \"profile_data\": ${profile_data_summary}" + fi + # Build fom_breakdown if sections exist if [ -n "$sections_json" ]; then # Validate overlap section names @@ -139,7 +199,7 @@ write_result_json() { "nthreads": "$nthreads", "description": "$description", "confidential": "$confidential", - "source_info": $source_info_block${fom_breakdown_block}${timing_block}${mode_block}${trigger_block}${build_job_block}${run_job_block}${pipeline_id_block} + "source_info": $source_info_block${profile_data_block}${fom_breakdown_block}${timing_block}${mode_block}${trigger_block}${build_job_block}${run_job_block}${pipeline_id_block} } EOF diff --git a/scripts/result_server/send_results.sh b/scripts/result_server/send_results.sh index a5bb2f5..dc7d597 100644 --- a/scripts/result_server/send_results.sh +++ b/scripts/result_server/send_results.sh @@ -5,34 +5,13 @@ echo "Sending results to server" ls results/ -run_result_quality_validator() { - local fail_on="${BK_RESULT_QUALITY_FAIL_ON:-none}" - local python_cmd="" - - if command -v python3 >/dev/null 2>&1; then - python_cmd="python3" - elif command -v python >/dev/null 2>&1; then - 
python_cmd="python" - fi - - if [[ -z "$python_cmd" ]]; then - echo "Result-quality validator skipped: Python runtime not found" - return 0 - fi - - echo "Running result-quality validator (fail-on=${fail_on})" - "$python_cmd" scripts/validate_result_quality.py results --fail-on "$fail_on" -} - -if [[ "${BK_RESULT_QUALITY_VALIDATE:-false}" == "true" ]]; then - run_result_quality_validator -else - echo "Result-quality validator disabled (set BK_RESULT_QUALITY_VALIDATE=true to enable)" -fi - meta_file="results/server_result_meta.json" echo "{}" > "$meta_file" +# Backfill profile_data for older result JSONs that were produced before +# result.sh learned to embed profiler summaries. The summary comes from +# bk_profiler_artifact/meta.json inside the matching padata archive; raw +# profiler files stay in the archive and are uploaded separately below. build_profile_data_summary() { local tgz_file="$1" @@ -62,7 +41,18 @@ build_profile_data_summary() { report_format: .report_format, raw_dir: .raw_dir, run_count: ((.runs // []) | length), - events: ((.runs // []) | map(.event) | map(select(. != null and . != ""))), + events: ( + if .tool == "fapp" + then ((.runs // []) | map(.event) | map(select(. != null and . != ""))) + else [] + end + ), + ncu_options: ( + if .tool == "ncu" and ((.measurement.ncu_options // null) | type) == "array" + then .measurement.ncu_options + else [] + end + ), report_kinds: ((.runs // []) | map(.reports // []) | add | map(.kind) | unique) } ' 2>/dev/null || true @@ -72,7 +62,7 @@ build_profile_data_summary() { for json_file in results/result*.json; do [[ ! -f "$json_file" ]] && continue - # Determine corresponding TGZ name + # Match result12.json with padata12.tgz, and result.json with padata.tgz. 
tgz_base="padata" if [[ "$json_file" =~ result([0-9]+)\.json$ ]]; then diff --git a/scripts/setup_site_runner.sh b/scripts/setup_site_runner.sh new file mode 100755 index 0000000..1f9c99e --- /dev/null +++ b/scripts/setup_site_runner.sh @@ -0,0 +1,504 @@ +#!/usr/bin/env bash +set -euo pipefail + +runner_version="v18.5.0" +go_version="1.25.0" +arch="" +site="" +gitlab_url="" +login_token="" +jacamar_token="" +login_tag="" +jacamar_tag="" +scheduler="pbs" +jacamar_repo="" +base_dir="" +service_host="" +allow_user="${USER:-}" +command_delay="30s" +install_systemd=1 +start_service=1 +libseccomp_mode="auto" +jacamar_pbs_tools="" +unrestricted_cmd_line=false +runner_proxy="" +runner_no_proxy="" + +usage() { + cat <<'EOF' +Usage: + setup_site_runner.sh --site SITE --gitlab-url URL --login-token TOKEN --jacamar-token TOKEN [options] + +Required: + --site SITE Site prefix used for tags if tags are omitted. + --gitlab-url URL GitLab URL shared by both runners. + --login-token TOKEN Runner token for the login/frontend runner. + --jacamar-token TOKEN Runner token for the Jacamar/batch runner. + +Options: + --arch amd64|arm64 Target architecture. Default: auto-detect. + --login-tag TAG Expected login runner tag for display only. + With runner authentication tokens, tags are set on GitLab. + --jacamar-tag TAG Expected Jacamar runner tag for display only. + With runner authentication tokens, tags are set on GitLab. + --scheduler pbs|slurm|pjm + --jacamar-repo URL Jacamar-CI repository. Default: PJM fork for + --scheduler pjm, upstream otherwise. + --base-dir DIR Default: $HOME/gitlab-runner_jacamar-ci_{amd,arm} + --service-host HOST Default: hostname -s. + --allow-user USER Jacamar user_allowlist entry. Default: $USER. + --runner-version VER Default: v18.5.0. + --go-version VER Default: 1.25.0. + --command-delay VALUE Jacamar batch command_delay. Default: 30s. + --jacamar-pbs-tools PATH Copy PATH to jacamar-ci/internal/executors/pbs/tools.go before build. 
+ --unrestricted-cmd-line Allow Jacamar to keep runner generated Git/token commands + on the command line. Useful when GIT_ASKPASS fails. + --proxy URL Set http_proxy/https_proxy for the runner systemd service. + If URL has no scheme, http:// is prepended. + --no-proxy LIST Set no_proxy/NO_PROXY for the runner systemd service. + --libseccomp auto|system|local|none + Default: auto. Use system libseccomp if available, + build local gperf/libseccomp if missing. + --with-libseccomp Alias for --libseccomp local. + --without-libseccomp Alias for --libseccomp none. + --no-systemd Do not create a systemd user service. + --no-start Create and enable service, but do not start it. + -h, --help Show this help. + +Example: + curl -fsSL https://raw.githubusercontent.com/RIKEN-RCCS/benchkit/main/scripts/setup_site_runner.sh \ + | bash -s -- --arch amd64 --site genkai \ + --gitlab-url https://gitlab.example.jp \ + --login-token "$LOGIN_TOKEN" --jacamar-token "$JACAMAR_TOKEN" \ + --scheduler pjm --service-host genkai0001 +EOF +} + +die() { + echo "ERROR: $*" >&2 + exit 1 +} + +info() { + echo "[setup-site-runner] $*" +} + +systemd_env_escape() { + local value="$1" + value="${value//\\/\\\\}" + value="${value//\"/\\\"}" + printf '%s' "$value" +} + +write_systemd_env() { + local unit_path="$1" + local name="$2" + local value="$3" + [[ -n "$value" ]] || return 0 + printf 'Environment="%s=%s"\n' "$name" "$(systemd_env_escape "$value")" >> "$unit_path" +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --arch) arch="${2:-}"; shift 2 ;; + --site) site="${2:-}"; shift 2 ;; + --gitlab-url) gitlab_url="${2:-}"; shift 2 ;; + --login-token) login_token="${2:-}"; shift 2 ;; + --jacamar-token) jacamar_token="${2:-}"; shift 2 ;; + --login-tag) login_tag="${2:-}"; shift 2 ;; + --jacamar-tag) jacamar_tag="${2:-}"; shift 2 ;; + --scheduler) scheduler="${2:-}"; shift 2 ;; + --jacamar-repo) jacamar_repo="${2:-}"; shift 2 ;; + --base-dir) base_dir="${2:-}"; shift 2 ;; + --service-host) 
service_host="${2:-}"; shift 2 ;; + --allow-user) allow_user="${2:-}"; shift 2 ;; + --runner-version) runner_version="${2:-}"; shift 2 ;; + --go-version) go_version="${2:-}"; shift 2 ;; + --command-delay) command_delay="${2:-}"; shift 2 ;; + --jacamar-pbs-tools) jacamar_pbs_tools="${2:-}"; shift 2 ;; + --unrestricted-cmd-line) unrestricted_cmd_line=true; shift ;; + --proxy) runner_proxy="${2:-}"; shift 2 ;; + --no-proxy) runner_no_proxy="${2:-}"; shift 2 ;; + --libseccomp) libseccomp_mode="${2:-}"; shift 2 ;; + --with-libseccomp) libseccomp_mode="local"; shift ;; + --without-libseccomp) libseccomp_mode="none"; shift ;; + --no-systemd) install_systemd=0; shift ;; + --no-start) start_service=0; shift ;; + -h|--help) usage; exit 0 ;; + *) die "Unknown option: $1" ;; + esac +done + +[[ -n "$site" ]] || die "--site is required" +[[ -n "$gitlab_url" ]] || die "--gitlab-url is required" +[[ -n "$login_token" ]] || die "--login-token is required" +[[ -n "$jacamar_token" ]] || die "--jacamar-token is required" +[[ -n "$allow_user" ]] || die "--allow-user is required when USER is empty" + +if [[ -z "$arch" ]]; then + case "$(uname -m)" in + x86_64|amd64) arch="amd64" ;; + aarch64|arm64) arch="arm64" ;; + *) die "Cannot auto-detect arch from uname -m=$(uname -m); pass --arch" ;; + esac +fi + +case "$arch" in + amd64) arch_suffix="amd"; runner_arch="amd64"; go_arch="amd64" ;; + arm64) arch_suffix="arm"; runner_arch="arm64"; go_arch="arm64" ;; + *) die "--arch must be amd64 or arm64" ;; +esac + +case "$scheduler" in + pbs|slurm|pjm) ;; + *) die "--scheduler must be pbs, slurm, or pjm" ;; +esac + +case "$libseccomp_mode" in + auto|system|local|none) ;; + *) die "--libseccomp must be auto, system, local, or none" ;; +esac + +if [[ -n "$runner_proxy" ]]; then + case "$runner_proxy" in + http://*|https://*) ;; + *) runner_proxy="http://${runner_proxy}" ;; + esac +fi + +if [[ -z "$jacamar_repo" ]]; then + if [[ "$scheduler" == "pjm" ]]; then + 
jacamar_repo="https://gitlab.com/yoshifuminakamura/jacamar-ci.git" + else + jacamar_repo="https://gitlab.com/ecp-ci/jacamar-ci.git" + fi +fi + +if [[ -z "$base_dir" ]]; then + base_dir="${HOME}/gitlab-runner_jacamar-ci_${arch_suffix}" +fi +base_dir="$(mkdir -p "$base_dir" && cd "$base_dir" && pwd)" + +if [[ -z "$service_host" ]]; then + service_host="$(hostname -s)" +fi + +login_tag="${login_tag:-${site}_login}" +jacamar_tag="${jacamar_tag:-${site}_jacamar}" +login_desc="${site}-login" +jacamar_desc="${site}-jacamar" + +for cmd in curl git tar make gcc g++; do + command -v "$cmd" >/dev/null 2>&1 || die "Required command not found: $cmd" +done + +mkdir -p "$base_dir/bin" "$base_dir/builds" "$base_dir/cache" + +runner_bin="${base_dir}/bin/gitlab-runner" +jacamar_bin="${base_dir}/bin/jacamar" +runner_url="https://gitlab-runner-downloads.s3.amazonaws.com/${runner_version}/binaries/gitlab-runner-linux-${runner_arch}" + +if [[ ! -x "$runner_bin" ]]; then + info "Downloading GitLab Runner ${runner_version} (${runner_arch})" + curl -fsSL "$runner_url" -o "$runner_bin" + chmod +x "$runner_bin" +else + info "GitLab Runner already exists: $runner_bin" +fi + +work_dir="${base_dir}/_bootstrap" +rm -rf "$work_dir" +mkdir -p "$work_dir" + +install_go() { + local go_pkg="go${go_version}.linux-${go_arch}.tar.gz" + info "Installing Go ${go_version} (${go_arch})" + curl -fsSL "https://go.dev/dl/${go_pkg}" -o "${work_dir}/${go_pkg}" + tar -C "$work_dir" -xzf "${work_dir}/${go_pkg}" + export GOROOT="${work_dir}/go" + export GOBIN="${GOROOT}/bin" + export PATH="${GOBIN}:${PATH}" +} + +build_local_libseccomp() { + local gperf_ver="3.1" + local sec_ver="2.5.5" + local local_prefix="${work_dir}/local" + local gperf_prefix="${local_prefix}/gperf" + local sec_prefix="${local_prefix}/libseccomp" + + info "Building local gperf/libseccomp" + curl -fsSL "https://ftp.gnu.org/gnu/gperf/gperf-${gperf_ver}.tar.gz" -o "${work_dir}/gperf.tar.gz" + tar -C "$work_dir" -xzf "${work_dir}/gperf.tar.gz" + 
(cd "${work_dir}/gperf-${gperf_ver}" && ./configure --prefix="$gperf_prefix" && make -j"$(nproc)" && make install) + export PATH="${gperf_prefix}/bin:${PATH}" + + curl -fsSL "https://github.com/seccomp/libseccomp/releases/download/v${sec_ver}/libseccomp-${sec_ver}.tar.gz" -o "${work_dir}/libseccomp.tar.gz" + tar -C "$work_dir" -xzf "${work_dir}/libseccomp.tar.gz" + (cd "${work_dir}/libseccomp-${sec_ver}" && ./configure --prefix="$sec_prefix" --disable-shared && make -j"$(nproc)" && make install) + export PKG_CONFIG_PATH="${sec_prefix}/lib/pkgconfig:${PKG_CONFIG_PATH:-}" + export LD_LIBRARY_PATH="${sec_prefix}/lib:${LD_LIBRARY_PATH:-}" + export LIBRARY_PATH="${sec_prefix}/lib:${LIBRARY_PATH:-}" + export CPATH="${sec_prefix}/include:${CPATH:-}" +} + +have_system_libseccomp() { + if command -v pkg-config >/dev/null 2>&1 && pkg-config --exists libseccomp; then + return 0 + fi + + local test_c="${work_dir}/check-libseccomp.c" + local test_bin="${work_dir}/check-libseccomp" + cat > "$test_c" <<'EOF' +#include <seccomp.h> +int main(void) { + return seccomp_api_get() < 0; +} +EOF + gcc "$test_c" -lseccomp -o "$test_bin" >/dev/null 2>&1 +} + +configure_libseccomp() { + case "$libseccomp_mode" in + none) + info "Skipping libseccomp detection/build (--libseccomp none)" + ;; + system) + if have_system_libseccomp; then + info "Using system libseccomp" + else + die "System libseccomp was requested but not found" + fi + ;; + local) + build_local_libseccomp + ;; + auto) + if have_system_libseccomp; then + info "Using system libseccomp" + else + info "System libseccomp not found; building local copy" + build_local_libseccomp + fi + ;; + esac +} + +if [[ ! 
-x "$jacamar_bin" ]]; then + install_go + configure_libseccomp + + info "Building Jacamar-CI from ${jacamar_repo}" + git clone "$jacamar_repo" "${work_dir}/jacamar-ci" + if [[ -n "$jacamar_pbs_tools" ]]; then + [[ -f "$jacamar_pbs_tools" ]] || die "--jacamar-pbs-tools file not found: $jacamar_pbs_tools" + cp "$jacamar_pbs_tools" "${work_dir}/jacamar-ci/internal/executors/pbs/tools.go" + fi + ( + cd "${work_dir}/jacamar-ci" + export CC=gcc CXX=g++ CGO_ENABLED=1 + make build + make install PREFIX="$base_dir" + ) +else + info "Jacamar already exists: $jacamar_bin" +fi + +rm -rf "$work_dir" + +info "Writing custom executor helper scripts" +cat > "${base_dir}/config.sh" < "${base_dir}/prepare.sh" <<'EOF' +#!/usr/bin/env bash +set -euo pipefail +exit 0 +EOF + +cat > "${base_dir}/run.sh" <<'EOF' +#!/usr/bin/env bash +source ~/.bashrc +set -eo pipefail +exec "$@" +EOF + +cat > "${base_dir}/cleanup.sh" <> "\$LOGFILE" + +BUILD_DIR="\${CUSTOM_UNIQUE_BUILD_DIR:-}" +CACHE_DIR="\${CUSTOM_UNIQUE_CACHE_DIR:-}" + +case "\$BUILD_DIR" in + "\${BASE_DIR}/builds/"*) [[ -d "\$BUILD_DIR" ]] && rm -rf -- "\$BUILD_DIR" ;; +esac + +case "\$CACHE_DIR" in + "\${BASE_DIR}/cache/"*) [[ -d "\$CACHE_DIR" ]] && rm -rf -- "\$CACHE_DIR" ;; +esac + +echo "CLEANUP DONE at \$(date)" >> "\$LOGFILE" +EOF + +chmod +x "${base_dir}/config.sh" "${base_dir}/prepare.sh" "${base_dir}/run.sh" "${base_dir}/cleanup.sh" + +info "Writing Jacamar config" +cat > "${base_dir}/custom-config.toml" < "$login_template" < "$jacamar_template" < "$unit_path" <> "$unit_path" </dev/null 2>&1; then + loginctl enable-linger "$allow_user" || true + fi + if [[ "$start_service" -eq 1 ]]; then + systemctl --user restart "$service_name" + systemctl --user --no-pager status "$service_name" || true + fi +fi + +info "Done" +info "Base dir: ${base_dir}" +info "Login tag: ${login_tag}" +info "Jacamar tag: ${jacamar_tag}" +info "Jacamar unrestricted_cmd_line: ${unrestricted_cmd_line}" +if [[ -n "$runner_proxy" ]]; then + info "Runner proxy: 
${runner_proxy}" +fi diff --git a/scripts/test_submit.sh b/scripts/test_submit.sh index 7b206c0..ea12bb8 100644 --- a/scripts/test_submit.sh +++ b/scripts/test_submit.sh @@ -28,6 +28,7 @@ fi source ./scripts/job_functions.sh SYSTEM_FILE="config/system.csv" +SYSTEM_INFO_FILE="config/system_info.csv" # --- checking dir and list --- if [ ! -d "programs/$code" ]; then @@ -114,6 +115,45 @@ case "$system" in -S -x PJM_LLIO_GFSCACHE=/vol0002:/vol0003:/vol0004:/vol0005 \ script.sh ;; + GenkaiA|GenkaiB|GenkaiC) + proc=$((nodes * numproc_node)) + echo pjsub -L rscgrp=$queue_group,node=$nodes,elapse=$elapse \ + --mpi proc=$proc \ + script.sh + pjsub -L rscgrp=$queue_group,node=$nodes,elapse=$elapse \ + --mpi proc=$proc \ + script.sh + ;; + Grand_C) + cpu_per_node=$(get_system_cpu_per_node "$system") + echo qsub -q $queue_group \ + -l select=${nodes}:nsockets=${cpu_per_node},walltime=${elapse} \ + -W group_list=d30992 script.sh + qsub -q $queue_group \ + -l select=${nodes}:nsockets=${cpu_per_node},walltime=${elapse} \ + -W group_list=d30992 script.sh + ;; + Grand_G) + echo qsub -q $queue_group \ + -l select=${nodes}:ngpus=1,walltime=${elapse} \ + -W group_list=d30992 script.sh + qsub -q $queue_group \ + -l select=${nodes}:ngpus=1,walltime=${elapse} \ + -W group_list=d30992 script.sh + ;; + AOBA_A|AOBA_S) + proc=$((nodes * numproc_node)) + echo qsub -Z -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY -q $queue_group -T necmpi --venode $proc \ + -l elapstim_req=$elapse script.sh + qsub -Z -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY -q $queue_group -T necmpi --venode $proc \ + -l elapstim_req=$elapse script.sh + ;; + AOBA_B) + echo qsub -Z -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY -q $queue_group -T intmpi -b $nodes \ + -l elapstim_req=$elapse script.sh + qsub -Z -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY -q $queue_group -T intmpi -b $nodes \ + -l elapstim_req=$elapse script.sh + ;; RC_GH200) echo sbatch -p qc-gh200 -N $nodes -t $elapse 
--ntasks-per-node=${numproc_node} --cpus-per-task=$nthreads \ --wrap="bash programs/$code/run.sh $system $nodes $numproc_node $nthreads" @@ -121,21 +161,20 @@ case "$system" in --wrap="bash programs/${code}/run.sh $system $nodes $numproc_node $nthreads" ;; MiyabiC) - echo qsub -q debug-c -l select=${nodes}:ompthreads=$nthreads -l walltime=${elapse} -W group_list=$(groups |awk '{print $2}') \ + echo qsub -q debug-c -l select=${nodes}:mpiprocs=${numproc_node}:ompthreads=$nthreads -l walltime=${elapse} -W group_list=$(groups |awk '{print $2}') \ script.sh - qsub -q debug-c -l select=${nodes}:ompthreads=$nthreads -l walltime=${elapse} -W group_list=$(groups |awk '{print $2}') \ + qsub -q debug-c -l select=${nodes}:mpiprocs=${numproc_node}:ompthreads=$nthreads -l walltime=${elapse} -W group_list=$(groups |awk '{print $2}') \ script.sh ;; MiyabiG) - echo qsub -q debug-g -l select=${nodes}:ompthreads=$nthreads -l walltime=${elapse} -W group_list=$(groups |awk '{print $2}') \ + echo qsub -q debug-g -l select=${nodes}:mpiprocs=${numproc_node}:ompthreads=$nthreads -l walltime=${elapse} -W group_list=$(groups |awk '{print $2}') \ script.sh - qsub -q debug-g -l select=${nodes}:ompthreads=$nthreads -l walltime=${elapse} -W group_list=$(groups |awk '{print $2}') \ + qsub -q debug-g -l select=${nodes}:mpiprocs=${numproc_node}:ompthreads=$nthreads -l walltime=${elapse} -W group_list=$(groups |awk '{print $2}') \ script.sh ;; *) echo "Error: Unknown system '$system'" - echo "Supported systems: Fugaku, FugakuCN, FugakuLN, RC_GH200, MiyabiC, MiyabiG" + echo "Supported systems: Fugaku, FugakuCN, FugakuLN, GenkaiA, GenkaiB, GenkaiC, Grand_C, Grand_G, AOBA_A, AOBA_B, AOBA_S, RC_GH200, MiyabiC, MiyabiG" exit 1 ;; esac - diff --git a/scripts/tests/test_bk_profiler.sh b/scripts/tests/test_bk_profiler.sh index c0835d7..85150b7 100644 --- a/scripts/tests/test_bk_profiler.sh +++ b/scripts/tests/test_bk_profiler.sh @@ -37,6 +37,9 @@ done if [ "$mode" = "-C" ]; then mkdir -p "$dir" printf 
'%s\n' "$event" > "${dir}/event.txt" + if [ "${FAKE_FAPP_FAIL:-0}" = "1" ]; then + exit 23 + fi exit 0 fi @@ -77,6 +80,54 @@ EOF chmod +x "${FAKE_BIN}/fapp" "${FAKE_BIN}/fapppx" export PATH="${FAKE_BIN}:${PATH}" +cat > "${FAKE_BIN}/ncu" <<'EOF' +#!/bin/bash +set -euo pipefail +outfile="" +import_file="" +import_mode=0 +while [ $# -gt 0 ]; do + case "$1" in + -o|--output) + shift + outfile="$1" + ;; + --import) + shift + import_file="$1" + import_mode=1 + ;; + --page|--target-processes|--launch-count|--set) + shift + ;; + --nvtx) + ;; + --*) + ;; + *) + if [ "$import_mode" -eq 0 ]; then + break + fi + ;; + esac + shift || true +done + +if [ "$import_mode" -eq 1 ]; then + printf 'ncu import:%s\n' "$import_file" + exit 0 +fi + +if [ -n "$outfile" ]; then + mkdir -p "$(dirname "$outfile")" + printf 'ncu report\n' > "${outfile}.ncu-rep" +fi + +"$@" +EOF + +chmod +x "${FAKE_BIN}/ncu" + run_and_check_level() { local level="$1" local expected_last_rep="$2" @@ -112,4 +163,58 @@ run_and_check_level simple 5 pa5 both yes run_and_check_level standard 11 pa11 both yes run_and_check_level detailed 17 pa17 both yes +ncu_archive="${TMP_DIR}/ncu.tgz" +ncu_extract="${TMP_DIR}/ncu_extract" +ncu_raw="${TMP_DIR}/ncu_pa" +bk_profiler ncu --level single --archive "$ncu_archive" --raw-dir "$ncu_raw" -- bash -c 'printf "ncu target\n"' +mkdir -p "$ncu_extract" +tar -xzf "$ncu_archive" -C "$ncu_extract" +test -f "${ncu_extract}/bk_profiler_artifact/meta.json" +test -f "${ncu_extract}/bk_profiler_artifact/raw/rep1/profile.ncu-rep" +test -f "${ncu_extract}/bk_profiler_artifact/reports/ncu_import_rep1.txt" +grep -q '"tool": "ncu"' "${ncu_extract}/bk_profiler_artifact/meta.json" +grep -q '"kind": "ncu_report"' "${ncu_extract}/bk_profiler_artifact/meta.json" +grep -q '"ncu_options": \["--target-processes", "all", "--set", "basic", "--launch-count", "1"\]' "${ncu_extract}/bk_profiler_artifact/meta.json" + +ncu_detailed_archive="${TMP_DIR}/ncu_detailed.tgz" 
+ncu_detailed_extract="${TMP_DIR}/ncu_detailed_extract" +ncu_detailed_raw="${TMP_DIR}/ncu_detailed_pa" +bk_profiler ncu --level detailed --archive "$ncu_detailed_archive" --raw-dir "$ncu_detailed_raw" -- bash -c 'printf "ncu detailed target\n"' +mkdir -p "$ncu_detailed_extract" +tar -xzf "$ncu_detailed_archive" -C "$ncu_detailed_extract" +grep -q '"ncu_options": \["--target-processes", "all", "--set", "full", "--nvtx"\]' "${ncu_detailed_extract}/bk_profiler_artifact/meta.json" + +fapp_fail_archive="${TMP_DIR}/fapp_fail.tgz" +fapp_fail_extract="${TMP_DIR}/fapp_fail_extract" +fapp_fail_raw="${TMP_DIR}/fapp_fail_pa" +export FAKE_FAPP_FAIL=1 +if bk_profiler fapp --level single --archive "$fapp_fail_archive" --raw-dir "$fapp_fail_raw" -- true; then + echo "expected failing fapp target to propagate non-zero status" >&2 + exit 1 +else + fapp_fail_status=$? +fi +unset FAKE_FAPP_FAIL +test "$fapp_fail_status" -eq 23 +mkdir -p "$fapp_fail_extract" +tar -xzf "$fapp_fail_archive" -C "$fapp_fail_extract" +test -f "${fapp_fail_extract}/bk_profiler_artifact/meta.json" +test -f "${fapp_fail_extract}/bk_profiler_artifact/raw/rep1/event.txt" +grep -q '"fapp_events": \["pa1"\]' "${fapp_fail_extract}/bk_profiler_artifact/meta.json" + +ncu_fail_archive="${TMP_DIR}/ncu_fail.tgz" +ncu_fail_extract="${TMP_DIR}/ncu_fail_extract" +ncu_fail_raw="${TMP_DIR}/ncu_fail_pa" +if bk_profiler ncu --level single --archive "$ncu_fail_archive" --raw-dir "$ncu_fail_raw" -- bash -c 'exit 42'; then + echo "expected failing ncu target to propagate non-zero status" >&2 + exit 1 +else + ncu_fail_status=$? 
+fi +test "$ncu_fail_status" -eq 42 +mkdir -p "$ncu_fail_extract" +tar -xzf "$ncu_fail_archive" -C "$ncu_fail_extract" +test -f "${ncu_fail_extract}/bk_profiler_artifact/meta.json" +test -f "${ncu_fail_extract}/bk_profiler_artifact/raw/rep1/profile.ncu-rep" + echo "bk_profiler tests passed" diff --git a/scripts/tests/test_result_profile_data.sh b/scripts/tests/test_result_profile_data.sh index 1bcd4df..d491c48 100644 --- a/scripts/tests/test_result_profile_data.sh +++ b/scripts/tests/test_result_profile_data.sh @@ -7,7 +7,7 @@ REPO_DIR=$(cd "${SCRIPT_DIR}/../.." && pwd) TMP_DIR=$(mktemp -d) trap 'rm -rf "${TMP_DIR}"' EXIT -mkdir -p "${TMP_DIR}/results" "${TMP_DIR}/bk_profiler_artifact" +mkdir -p "${TMP_DIR}/results" "${TMP_DIR}/bk_profiler_artifact" "${TMP_DIR}/ncu/results" "${TMP_DIR}/ncu/bk_profiler_artifact" if ! command -v jq >/dev/null 2>&1; then echo "jq not found; skipping result profile_data test" @@ -39,18 +39,64 @@ EOF tar -czf "${TMP_DIR}/results/padata0.tgz" -C "${TMP_DIR}" bk_profiler_artifact +cat > "${TMP_DIR}/ncu/results/result" <<'EOF' +FOM:2.345 FOM_version:test Exp:CASE0 node_count:1 numproc_node:8 nthreads:9 +EOF + +cat > "${TMP_DIR}/ncu/bk_profiler_artifact/meta.json" <<'EOF' +{ + "tool": "ncu", + "level": "single", + "report_format": "text", + "raw_dir": "raw", + "measurement": { + "ncu_options": ["--target-processes", "all", "--set", "basic", "--launch-count", "1"] + }, + "runs": [ + { + "name": "rep1", + "event": "single", + "raw_path": "raw/rep1", + "reports": [ + {"kind": "ncu_report", "path": "raw/rep1/profile.ncu-rep"}, + {"kind": "summary_text", "path": "reports/ncu_import_rep1.txt"} + ] + } + ] +} +EOF + +tar -czf "${TMP_DIR}/ncu/results/padata0.tgz" -C "${TMP_DIR}/ncu" bk_profiler_artifact + pushd "${TMP_DIR}" >/dev/null bash "${REPO_DIR}/scripts/result.sh" qws Fugaku cross build run 999 >/dev/null popd >/dev/null +pushd "${TMP_DIR}/ncu" >/dev/null +bash "${REPO_DIR}/scripts/result.sh" genesis RC_GH200 cross build run 999 >/dev/null 
+popd >/dev/null + RESULT_JSON="${TMP_DIR}/results/result0.json" test -f "${RESULT_JSON}" -grep -q '"profile_data"' "${RESULT_JSON}" -grep -q '"tool": "fapp"' "${RESULT_JSON}" -grep -q '"level": "single"' "${RESULT_JSON}" -grep -q '"report_format": "text"' "${RESULT_JSON}" -grep -q '"run_count": 1' "${RESULT_JSON}" -grep -q '"pa1"' "${RESULT_JSON}" -grep -q '"summary_text"' "${RESULT_JSON}" +jq -e ' + .profile_data.tool == "fapp" and + .profile_data.level == "single" and + .profile_data.report_format == "text" and + .profile_data.run_count == 1 and + (.profile_data.events | index("pa1") != null) and + (.profile_data.report_kinds | index("summary_text") != null) +' "${RESULT_JSON}" >/dev/null + +NCU_RESULT_JSON="${TMP_DIR}/ncu/results/result0.json" +test -f "${NCU_RESULT_JSON}" +jq -e ' + .profile_data.tool == "ncu" and + .profile_data.level == "single" and + .profile_data.report_format == "text" and + .profile_data.run_count == 1 and + .profile_data.events == [] and + (.profile_data.ncu_options | index("--target-processes") != null) and + (.profile_data.report_kinds | index("ncu_report") != null) +' "${NCU_RESULT_JSON}" >/dev/null echo "result profile_data test passed" diff --git a/scripts/tests/test_send_results_profile_data.sh b/scripts/tests/test_send_results_profile_data.sh index eb7a180..58c157d 100644 --- a/scripts/tests/test_send_results_profile_data.sh +++ b/scripts/tests/test_send_results_profile_data.sh @@ -28,17 +28,21 @@ EOF cat > "${TMP_DIR}/bk_profiler_artifact/meta.json" <<'EOF' { - "tool": "fapp", + "tool": "ncu", "level": "single", "report_format": "text", "raw_dir": "raw", + "measurement": { + "ncu_options": ["--target-processes", "all", "--set", "basic", "--launch-count", "1"] + }, "runs": [ { "name": "rep1", - "event": "pa1", + "event": "single", "raw_path": "raw/rep1", "reports": [ - {"kind": "summary_text", "path": "reports/fapp_A_rep1.txt"} + {"kind": "ncu_report", "path": "raw/rep1/profile.ncu-rep"}, + {"kind": "summary_text", "path": 
"reports/ncu_import_rep1.txt"} ] } ] @@ -64,10 +68,6 @@ EOF cat > "${TMP_DIR}/bin/python" <<'EOF' #!/bin/bash set -euo pipefail -if [ "${1:-}" = "scripts/validate_result_quality.py" ]; then - printf '%s\n' "$*" > "${TMP_DIR}/validator_invocation.txt" - exit 0 -fi echo "fake python: unsupported invocation: $*" >&2 exit 1 EOF @@ -78,10 +78,23 @@ set -euo pipefail exec "${TMP_DIR}/bin/python" "$@" EOF +PYTHON_FOR_FAKE_JQ="${PYTHON_FOR_FAKE_JQ:-}" +if [ -z "$PYTHON_FOR_FAKE_JQ" ]; then + if command -v python3 >/dev/null 2>&1; then + PYTHON_FOR_FAKE_JQ="$(command -v python3)" + elif command -v python >/dev/null 2>&1; then + PYTHON_FOR_FAKE_JQ="$(command -v python)" + else + echo "python3/python not found; skipping send_results profile_data test" + exit 0 + fi +fi +export PYTHON_FOR_FAKE_JQ + cat > "${TMP_DIR}/bin/jq" <<'EOF' #!/bin/bash set -euo pipefail -python_exe="/c/Users/yoshi/AppData/Local/Programs/Python/Python312/python.exe" +python_exe="${PYTHON_FOR_FAKE_JQ:?PYTHON_FOR_FAKE_JQ is required}" if [ "$1" = "-c" ]; then shift @@ -101,7 +114,8 @@ if "tool: .tool" in expr and "report_kinds" in expr: "report_format": data.get("report_format"), "raw_dir": data.get("raw_dir"), "run_count": len(data.get("runs", [])), - "events": [run.get("event") for run in data.get("runs", []) if run.get("event")], + "events": [run.get("event") for run in data.get("runs", []) if data.get("tool") == "fapp" and run.get("event")], + "ncu_options": data.get("measurement", {}).get("ncu_options", []) if data.get("tool") == "ncu" else [], "report_kinds": sorted({rep.get("kind") for run in data.get("runs", []) for rep in run.get("reports", []) if rep.get("kind")}), } print(json.dumps(summary)) @@ -184,18 +198,18 @@ chmod +x "${TMP_DIR}/bin/curl" "${TMP_DIR}/bin/jq" "${TMP_DIR}/bin/python" "${TM export PATH="${TMP_DIR}/bin:${PATH}" export RESULT_SERVER="https://example.invalid" export RESULT_SERVER_KEY="dummy" -export BK_RESULT_QUALITY_VALIDATE="true" -export BK_RESULT_QUALITY_FAIL_ON="none" 
pushd "${TMP_DIR}" >/dev/null bash "${REPO_DIR}/scripts/result_server/send_results.sh" >/dev/null popd >/dev/null -grep -q 'scripts/validate_result_quality.py results --fail-on none' "${TMP_DIR}/validator_invocation.txt" grep -q '"profile_data"' "${TMP_DIR}/results/result0.json" -grep -q '"tool": "fapp"' "${TMP_DIR}/results/result0.json" -grep -q '"level": "single"' "${TMP_DIR}/results/result0.json" -grep -q '"run_count": 1' "${TMP_DIR}/results/result0.json" +grep -Eq '"tool":[[:space:]]*"ncu"' "${TMP_DIR}/results/result0.json" +grep -Eq '"level":[[:space:]]*"single"' "${TMP_DIR}/results/result0.json" +grep -Eq '"run_count":[[:space:]]*1' "${TMP_DIR}/results/result0.json" +grep -Eq '"events":[[:space:]]*\[[[:space:]]*\]' "${TMP_DIR}/results/result0.json" +grep -Eq '"ncu_options":[[:space:]]*\[' "${TMP_DIR}/results/result0.json" +grep -Eq '"ncu_report"' "${TMP_DIR}/results/result0.json" grep -q '"_server_uuid": "11111111-2222-3333-4444-555555555555"' "${TMP_DIR}/results/result0.json" grep -q '"result0.json"' "${TMP_DIR}/results/server_result_meta.json" diff --git a/scripts/validate_result_quality.py b/scripts/validate_result_quality.py deleted file mode 100644 index 726341d..0000000 --- a/scripts/validate_result_quality.py +++ /dev/null @@ -1,315 +0,0 @@ -#!/usr/bin/env python3 -"""Lightweight validator for BenchKit result JSON quality. - -By default this script is visibility-first: it reports warnings and validator -candidate gaps without failing the process. Future CI can opt into stricter -behavior with --fail-on. 
-""" - -from __future__ import annotations - -import argparse -import json -import os -import sys -from pathlib import Path -from typing import Optional - - -REPO_ROOT = Path(__file__).resolve().parents[1] -RESULT_SERVER_ROOT = REPO_ROOT / "result_server" -if str(RESULT_SERVER_ROOT) not in sys.path: - sys.path.insert(0, str(RESULT_SERVER_ROOT)) - -from utils.result_records import summarize_result_quality # noqa: E402 - - -DEFAULT_POLICY_PATH = REPO_ROOT / "config" / "result_quality_policy.json" -DEFAULT_REDIS_KEY = "benchkit:result_quality:app_tiers" - - -def _iter_result_files(paths: list[str]) -> list[Path]: - collected: list[Path] = [] - seen: set[Path] = set() - - for raw_path in paths: - path = Path(raw_path) - if path.is_dir(): - for item in sorted(path.glob("*.json")): - resolved = item.resolve() - if resolved not in seen: - seen.add(resolved) - collected.append(item) - continue - - if path.is_file() and path.suffix.lower() == ".json": - resolved = path.resolve() - if resolved not in seen: - seen.add(resolved) - collected.append(path) - - return collected - - -def _load_result(path: Path): - try: - with path.open("r", encoding="utf-8") as handle: - data = json.load(handle) - except Exception as exc: # pragma: no cover - surfaced in report - return None, f"failed to parse JSON: {exc}" - - if "FOM" not in data or "system" not in data: - return None, "not a benchmark result JSON (missing FOM or system)" - - return data, None - - -def _load_policy(path: Optional[str]) -> dict: - policy_path = Path(path) if path else DEFAULT_POLICY_PATH - if not policy_path.exists(): - return { - "version": 1, - "default_tier": "relaxed", - "tiers": {"relaxed": {"fail_candidates": [], "fail_warnings": []}}, - "apps": {}, - "_path": str(policy_path), - } - - with policy_path.open("r", encoding="utf-8") as handle: - data = json.load(handle) - - if "tiers" not in data or not isinstance(data["tiers"], dict): - raise ValueError(f"invalid quality policy: missing tiers in 
{policy_path}") - - data["_path"] = str(policy_path) - return data - - -def _load_redis_app_tier_overrides(redis_url: Optional[str], redis_key: str) -> dict: - if not redis_url: - return {} - - try: - import redis # type: ignore - except Exception: - return {} - - try: - redis_conn = redis.from_url(redis_url, decode_responses=True) - overrides = redis_conn.hgetall(redis_key) - except Exception: - return {} - - if not isinstance(overrides, dict): - return {} - - return { - str(app).strip(): str(tier).strip() - for app, tier in overrides.items() - if str(app).strip() and str(tier).strip() - } - - -def _resolve_policy_for_app(policy: dict, app: str) -> dict: - tier_name = policy.get("apps", {}).get(app, policy.get("default_tier", "relaxed")) - tier = policy.get("tiers", {}).get(tier_name) - if not tier: - tier_name = policy.get("default_tier", "relaxed") - tier = policy.get("tiers", {}).get(tier_name, {"fail_candidates": [], "fail_warnings": []}) - - return { - "tier_name": tier_name, - "fail_candidates": tier.get("fail_candidates", []), - "fail_warnings": tier.get("fail_warnings", []), - } - - -def build_quality_report( - paths: list[str], - policy_path: Optional[str] = None, - redis_url: Optional[str] = None, - redis_key: str = DEFAULT_REDIS_KEY, -) -> dict: - files = _iter_result_files(paths) - policy = _load_policy(policy_path) - redis_overrides = _load_redis_app_tier_overrides(redis_url, redis_key) - if redis_overrides: - merged_apps = dict(policy.get("apps", {})) - merged_apps.update(redis_overrides) - policy["apps"] = merged_apps - rows = [] - - for path in files: - data, load_error = _load_result(path) - if load_error: - rows.append({ - "path": str(path), - "status": "skipped", - "reason": load_error, - }) - continue - - quality = summarize_result_quality(data) - app = data.get("code", "unknown") - policy_info = _resolve_policy_for_app(policy, app) - enforced_candidates = [ - item for item in quality.get("validator_candidates", []) - if item in 
policy_info["fail_candidates"] - ] - enforced_warnings = [ - item for item in quality.get("warnings", []) - if item in policy_info["fail_warnings"] - ] - rows.append({ - "path": str(path), - "status": "ok", - "code": app, - "system": data.get("system", "unknown"), - "quality_level": quality["level"], - "quality_label": quality["label"], - "warning_count": len(quality["warnings"]), - "warnings": quality["warnings"], - "suggested_actions": quality.get("suggested_actions", []), - "validator_candidates": quality.get("validator_candidates", []), - "policy_tier": policy_info["tier_name"], - "policy_fail_candidates": policy_info["fail_candidates"], - "policy_fail_warnings": policy_info["fail_warnings"], - "enforced_candidates": enforced_candidates, - "enforced_warnings": enforced_warnings, - }) - - summary = { - "scanned_files": len(files), - "validated_results": sum(1 for row in rows if row["status"] == "ok"), - "skipped_files": sum(1 for row in rows if row["status"] == "skipped"), - "policy_path": policy["_path"], - "default_tier": policy.get("default_tier", "relaxed"), - "redis_override_key": redis_key if redis_url else "", - "redis_override_count": len(redis_overrides), - "rows": rows, - } - return summary - - -def _format_text_report(report: dict) -> str: - lines = [ - f"Scanned files: {report['scanned_files']}", - f"Validated results: {report['validated_results']}", - f"Skipped files: {report['skipped_files']}", - ] - if report.get("redis_override_key"): - lines.append( - f"Redis overrides: {report.get('redis_override_count', 0)} from {report['redis_override_key']}" - ) - - for row in report["rows"]: - if row["status"] == "skipped": - lines.append(f"[SKIP] {row['path']}") - lines.append(f" reason: {row['reason']}") - continue - - lines.append( - f"[{row['quality_label']}] {row['path']} ({row['code']} / {row['system']})" - ) - lines.append(f" policy-tier: {row['policy_tier']}") - lines.append(f" warnings: {row['warning_count']}") - if row["warnings"]: - for item in 
row["warnings"]: - lines.append(f" - warning: {item}") - else: - lines.append(" - warning: none") - - if row["suggested_actions"]: - for item in row["suggested_actions"]: - lines.append(f" - action: {item}") - else: - lines.append(" - action: none") - - if row["validator_candidates"]: - for item in row["validator_candidates"]: - lines.append(f" - validator-candidate: {item}") - else: - lines.append(" - validator-candidate: none") - - if row["enforced_candidates"]: - for item in row["enforced_candidates"]: - lines.append(f" - policy-candidate: {item}") - if row["enforced_warnings"]: - for item in row["enforced_warnings"]: - lines.append(f" - policy-warning: {item}") - - return "\n".join(lines) - - -def _should_fail(report: dict, fail_on: str) -> bool: - if fail_on == "none": - return False - - for row in report["rows"]: - if row["status"] != "ok": - continue - if fail_on == "warning" and row["warning_count"] > 0: - return True - if fail_on == "candidate" and row["validator_candidates"]: - return True - if fail_on == "policy" and (row["enforced_candidates"] or row["enforced_warnings"]): - return True - return False - - -def main(argv: Optional[list[str]] = None) -> int: - parser = argparse.ArgumentParser(description="Validate BenchKit result JSON quality.") - parser.add_argument( - "paths", - nargs="*", - default=["results"], - help="Result JSON files or directories to scan. Defaults to ./results.", - ) - parser.add_argument( - "--format", - choices=("text", "json"), - default="text", - help="Output format.", - ) - parser.add_argument( - "--fail-on", - choices=("none", "warning", "candidate", "policy"), - default="none", - help=( - "Exit non-zero when warnings or validator candidates are found. " - "Default is report-only." - ), - ) - parser.add_argument( - "--policy-file", - default=str(DEFAULT_POLICY_PATH), - help="Internal quality policy file. 
Defaults to config/result_quality_policy.json.", - ) - parser.add_argument( - "--redis-url", - default=os.environ.get("BK_RESULT_QUALITY_REDIS_URL", ""), - help="Optional Redis URL for app-tier overrides.", - ) - parser.add_argument( - "--redis-key", - default=os.environ.get("BK_RESULT_QUALITY_REDIS_KEY", DEFAULT_REDIS_KEY), - help=f"Redis hash key used for app-tier overrides. Defaults to {DEFAULT_REDIS_KEY}.", - ) - args = parser.parse_args(argv) - - report = build_quality_report( - args.paths, - policy_path=args.policy_file, - redis_url=args.redis_url or None, - redis_key=args.redis_key, - ) - if args.format == "json": - print(json.dumps(report, ensure_ascii=False, indent=2)) - else: - print(_format_text_report(report)) - - return 1 if _should_fail(report, args.fail_on) else 0 - - -if __name__ == "__main__": - raise SystemExit(main())