RIKEN-RCCS · yoshifuminakamura · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/.github/workflows/result-server-tests.yml b/.github/workflows/result-server-tests.yml
@@ -9,6 +9,7 @@ on:
       - "scripts/result_server/**"
       - "scripts/estimation/**"
       - "scripts/tests/test_bk_profiler.sh"
+      - "scripts/tests/test_estimation_gpu_kernel_lightgbm_v10.sh"
       - "scripts/tests/test_estimation_gpu_kernel_mlp_v15.sh"
       - "scripts/tests/test_genesis_gpu_mlp_estimation.sh"
       - "scripts/tests/test_qws_gpu_mlp_smoke_estimation.sh"
@@ -33,6 +34,7 @@ on:
       - "scripts/result_server/**"
       - "scripts/estimation/**"
       - "scripts/tests/test_bk_profiler.sh"
+      - "scripts/tests/test_estimation_gpu_kernel_lightgbm_v10.sh"
       - "scripts/tests/test_estimation_gpu_kernel_mlp_v15.sh"
       - "scripts/tests/test_genesis_gpu_mlp_estimation.sh"
       - "scripts/tests/test_qws_gpu_mlp_smoke_estimation.sh"
@@ -102,6 +104,7 @@ jobs:
           bash scripts/tests/test_result_profile_data.sh
           bash scripts/tests/test_send_results_profile_data.sh
           bash scripts/tests/test_send_estimate_artifacts.sh
+          bash scripts/tests/test_estimation_gpu_kernel_lightgbm_v10.sh
           bash scripts/tests/test_estimation_gpu_kernel_mlp_v15.sh
           bash scripts/tests/test_genesis_gpu_mlp_estimation.sh
           bash scripts/tests/test_qws_gpu_mlp_smoke_estimation.sh
diff --git a/docs/guides/add-app.md b/docs/guides/add-app.md
@@ -402,7 +402,8 @@ bk_profiler ncu --level single --archive ../results/padata0.tgz --raw-dir ncu --
 ```
 
 `ncu` の既定 level は `single` です。最初は採取時間を抑えるため、`single` または `simple` から始めてください。
-raw report は `padata*.tgz` 内の `bk_profiler_artifact/raw/rep1/` に保存され、可能な場合は `bk_profiler_artifact/reports/ncu_import_rep1.txt` に text report が保存されます。
+`padata*.tgz` には、可能な場合は `bk_profiler_artifact/reports/ncu_import_rep1.txt` に text report、`BK_PROFILER_NCU_RAW_CSV=true` の場合は `bk_profiler_artifact/raw/rep1/profile_raw.csv` に raw CSV が保存されます。
+Nsight Compute の binary report (`*.ncu-rep` など) はファイルサイズが大きいため、既定では `padata*.tgz` から除外されます。デバッグ目的で保存したい場合だけ `BK_PROFILER_ARCHIVE_NCU_REPORT=true` を明示してください。
 site の既定 module に `ncu` が含まれない場合は、アプリ側で module を load するか、system 固有の module 変数を用意してください。
 Genesis GH200 参照実装では `GENESIS_MIYABIG_MODULE` / `GENESIS_GH200_MODULE` で module を上書きできます。
 既定の `ncu` が PATH にない場合は warning を出して profiler なしで benchmark 本体を実行しますが、`GENESIS_MIYABIG_PROFILER_TOOL=ncu`、`GENESIS_GH200_PROFILER_TOOL=ncu`、または `GENESIS_PROFILER_TOOL=ncu` を明示した場合は採取不能として失敗します。

diff --git a/docs/guides/add-estimation-package.md b/docs/guides/add-estimation-package.md
@@ -43,6 +43,7 @@
   - `counter_papi_detailed.sh`
   - `trace_mpi_basic.sh`
   - `overlap_max_basic.sh`
+  - `gpu_kernel_lightgbm_v10.sh`
   - `gpu_kernel_mlp_v15.sh`
 
 ## 3. top-level package の責務
@@ -69,19 +70,42 @@ section package はもっと小さくてかまいません。
 ここでは「1 区間の変換規則」に集中し、Estimate JSON 全体の組み立てや current / future の side 管理は BenchKit 共通層や top-level package 側へ寄せる方が自然です。
 
 GPU kernel 単位の外部推定ツールは、通常は section package として扱います。
-たとえば `gpu_kernel_mlp_v15` は、PerfTools の `MLP_NN/v1.5` を「GPU 区間だけを変換する package」として接続します。
-top-level package は `instrumented_app_sections_dummy` などのままにして、GPU 区間にだけ `gpu_kernel_mlp_v15` を割り当てます。
+たとえば次の package は、PerfTools の各モデルを「GPU 区間だけを変換する package」として接続します。
+
+- `gpu_kernel_mlp_v15`
+  - PerfTools `MLP_NN/v1.5`
+  - 主な依存: numpy/pandas/torch
+- `gpu_kernel_lightgbm_v10`
+  - PerfTools `LightGBM_model/1.0`
+  - 主な依存: numpy/pandas/lightgbm/pyyaml と `libgomp`
+
+top-level package は `instrumented_app_sections_dummy` などのままにして、GPU 区間にだけ GPU kernel section package を割り当てます。
+どの section package を既定で使うかは app 側の bring-up 状況や CI runner/container に依存するため、このガイドでは特定の package を正解として固定しません。
+新しい package を追加した場合は、この一覧と app 側の切り替え変数から選択肢として見えるようにしてください。
 
 ```bash
-bk_declare_section --side future gpu_kernel_region gpu_kernel_mlp_v15
+gpu_section_package="${BK_GENESIS_GPU_SECTION_PACKAGE:-gpu_kernel_lightgbm_v10}"
+bk_declare_section --side future gpu_kernel_region "$gpu_section_package"
 bk_emit_declared_section --side future gpu_kernel_region "$measured_gpu_time" results/estimation_artifacts/gpu_kernel_region_input.csv
 ```
 
+GENESIS の GPU kernel section package は `BK_GENESIS_GPU_SECTION_PACKAGE` で切り替えられます。
+未指定時の既定値は接続確認中の実装に合わせて変わることがあるため、検証や再現性が必要な場合は明示してください。
+
+```bash
+export BK_GENESIS_GPU_SECTION_PACKAGE=gpu_kernel_mlp_v15
+# or
+export BK_GENESIS_GPU_SECTION_PACKAGE=gpu_kernel_lightgbm_v10
+```
+
 PerfTools 本体は BenchKit に vendoring せず、実行時に次の環境変数で渡します。
 
 ```bash
 export BK_GPU_MLP_PERFTOOLS_ROOT=/path/to/PerfTools
 export BK_GPU_MLP_PYTHON=python3
+# LightGBM package だけを明示したい場合
+export BK_GPU_LIGHTGBM_PERFTOOLS_ROOT=/path/to/PerfTools
+export BK_GPU_LIGHTGBM_PYTHON=python3
 ```
 
 section artifact は PerfTools 側の static GPU spec sheet から作られた prepared CSV を想定します。
@@ -94,12 +118,14 @@ export BK_GPU_MLP_ARTIFACT_MODE=prediction
 export BK_GPU_MLP_PREDICTION_CSV_GPU_KERNEL_REGION=/path/to/pred.csv
 ```
 
-section package は prediction CSV の `Execution Time [ns]` を合算し、その section の future-side `time` にします。
+section package は prediction CSV の推定実行時間を合算し、その section の future-side `time` にします。
+MLP package は `Execution Time [ns]`、LightGBM package は `O-Execution Time` を主な入力列として扱います。
 
 qws を使って CI 配管だけを確認する場合は、実際の qws が GPU 化されていなくても GPU MLP smoke test を有効にできます。
 `BK_QWS_GPU_MLP_SMOKE_MODE=prediction` では、同梱のサンプル prediction CSV を使い、run job が `gpu_kernel_region` section と prediction CSV artifact を結果に埋め込みます。
 `BK_QWS_GPU_MLP_SMOKE_MODE=perftools` では、estimate job が PerfTools repo を checkout し、`MLP_NN/examples/example_input_mixed-src_20kernels.csv` を `predict_v15.py` に渡して prediction CSV を生成します。
 どちらのモードでも、estimate job が `gpu_kernel_mlp_v15` section package を通して Estimate JSON へ変換できることを確認します。
+LightGBM など別の GPU kernel section package は、GENESIS の `BK_GENESIS_GPU_SECTION_PACKAGE` のように app 側の切り替え変数や専用テストで確認します。
 qws の推定スクリプト単体では既定無効ですが、GPU estimator integration の立ち上げ期間中は GitLab CI 側の既定を一時的に有効化しています。
 
 ```bash
@@ -114,7 +140,9 @@ export BK_GPU_MLP_PERFTOOLS_REF=main
 `BK_QWS_GPU_MLP_SMOKE` は qws を使った配管確認用、`BK_QWS_GPU_MLP_SMOKE_MODE` は prediction fixture 取り込みと PerfTools 実行の切り替え用、`BK_ESTIMATE_RUNNER_TAG` は推定用 runner/container を手動で逃がすためのものです。
 実際の GPU profiling input と推定 runner の運用が固まったら、専用の package/runner 設定へ置き換え、これらの暫定変数は削除対象として見直してください。
 
-`perftools` smoke mode は GitHub から PerfTools を取得するため、推定 runner/container には `git` と外部接続、Python 3.12 以上、numpy/pandas/torch が必要です。
+`perftools` smoke mode は GitHub から PerfTools を取得するため、推定 runner/container には `git` と外部接続が必要です。
+Python とライブラリは、選択した PerfTools モデル側の要件に合わせます。
+MLP package には Python 3.11 以上と numpy/pandas/torch、LightGBM package には Python 3.11 以上と numpy/pandas/lightgbm/pyyaml、さらに LightGBM 実行用の `libgomp` が必要です。
 実運用では smoke mode ではなく、推定 runner/container に PerfTools checkout を用意し、section artifact として実アプリ由来の prepared input CSV を渡してください。
 
 ## 5. metadata に持たせるもの

diff --git a/docs/guides/profiler-support.md b/docs/guides/profiler-support.md
@@ -49,6 +49,7 @@ bk_profiler <tool> [options] -- <command ...>
 - `BK_PROFILER_REPORT_ARGS`
 - `BK_PROFILER_DIR`
 - `BK_PROFILER_STAGE_DIR`
+- `BK_PROFILER_ARCHIVE_NCU_REPORT`
 
 ## 3. 共通語彙としての level
 
@@ -86,7 +87,10 @@ BenchKit は「CSV があること」を共通必須にはしない。
 - `detailed` → `--set full --nvtx`
 
 既定の report format は `text` とする。
-raw report は archive 内の `bk_profiler_artifact/raw/rep1/profile*.ncu-rep` または Nsight Compute の出力形式に従う report file として保存し、可能な場合は `ncu --import ... --page details` の出力を `bk_profiler_artifact/reports/ncu_import_rep1.txt` に保存する。
+`padata*.tgz` の肥大化を避けるため、Nsight Compute の binary report (`*.ncu-rep` など) は既定では archive から除外する。
+可能な場合は `ncu --import ... --page details` の出力を `bk_profiler_artifact/reports/ncu_import_rep1.txt` に保存する。
+`BK_PROFILER_NCU_RAW_CSV=true` の場合は、推定 package が使う raw CSV を `bk_profiler_artifact/raw/rep1/profile_raw.csv` に保存する。
+binary report も保存したいデバッグ用途では、`BK_PROFILER_ARCHIVE_NCU_REPORT=true` を明示する。
 
 MPI launcher 経由の GPU application では、既定で `--target-processes all` を付けて child process も採取対象にする。
 追加の kernel filter、section set、NVTX filter などは `BK_PROFILER_ARGS` で `ncu` に渡す。

diff --git a/programs/genesis/estimate.sh b/programs/genesis/estimate.sh
@@ -2,6 +2,8 @@
 # estimate.sh — GENESIS estimation entrypoint and run-time section metadata.
 
 genesis_declare_estimation_layout() {
+  local gpu_section_package="${BK_GENESIS_GPU_SECTION_PACKAGE:-gpu_kernel_lightgbm_v10}"
+
   bk_clear_estimation_defaults
   bk_clear_estimation_declarations
   bk_define_current_estimation_package weakscaling
@@ -11,7 +13,7 @@ genesis_declare_estimation_layout() {
   bk_define_future_system "${BK_ESTIMATION_FUTURE_SYSTEM:-GPU_MLP_TARGET}"
   bk_define_current_target_nodes "${BK_ESTIMATION_CURRENT_TARGET_NODES:-1}"
   bk_define_future_target_nodes "${BK_ESTIMATION_FUTURE_TARGET_NODES:-1}"
-  bk_declare_section --side future gpu_kernel_region gpu_kernel_mlp_v15
+  bk_declare_section --side future gpu_kernel_region "$gpu_section_package"
 }
 
 genesis_emit_estimation_data_from_fom() {
@@ -42,9 +44,13 @@ BK_ESTIMATION_SECTION_DEFAULT_FACTOR="${BK_ESTIMATION_SECTION_DEFAULT_FACTOR:-1.
 BK_GPU_MLP_ARTIFACT_MODE="${BK_GPU_MLP_ARTIFACT_MODE:-ncu}"
 BK_GPU_MLP_SOURCE_GPU="${BK_GPU_MLP_SOURCE_GPU:-H100}"
 BK_GPU_MLP_KERNEL_COUNT="${BK_GPU_MLP_KERNEL_COUNT:-20}"
+BK_GPU_LIGHTGBM_ARTIFACT_MODE="${BK_GPU_LIGHTGBM_ARTIFACT_MODE:-ncu}"
+BK_GPU_LIGHTGBM_SOURCE_GPU="${BK_GPU_LIGHTGBM_SOURCE_GPU:-${BK_GPU_MLP_SOURCE_GPU}}"
 export BK_GPU_MLP_ARTIFACT_MODE
 export BK_GPU_MLP_SOURCE_GPU
 export BK_GPU_MLP_KERNEL_COUNT
+export BK_GPU_LIGHTGBM_ARTIFACT_MODE
+export BK_GPU_LIGHTGBM_SOURCE_GPU
 
 genesis_declare_estimation_layout
 bk_estimation_apply_declared_defaults

diff --git a/scripts/bk_functions.sh b/scripts/bk_functions.sh
@@ -802,7 +802,12 @@ bk_profiler_find_ncu_report() {
     -name '*.ncu-rep' -o \
     -name '*.nsight-cuprof' -o \
     -name 'profile*' \
-  \) | head -n 1
+  \) \
+    ! -name 'profile_raw.csv' \
+    ! -name 'profile_raw.csv.log' \
+    ! -name '*.csv' \
+    ! -name '*.log' \
+    | head -n 1
 }
 
 bk_json_escape() {
@@ -1164,6 +1169,15 @@ bk_profiler() {
           ;;
       esac
       cp -R "$_bk_ncu_rep_dir" "$_bk_stage_dir/raw/${_bk_ncu_rep_name}"
+      case "${BK_PROFILER_ARCHIVE_NCU_REPORT:-false}" in
+        1|true|TRUE|yes|YES|on|ON) ;;
+        *)
+          find "$_bk_stage_dir/raw/${_bk_ncu_rep_name}" -maxdepth 1 -type f \( \
+            -name '*.ncu-rep' -o \
+            -name '*.nsight-cuprof' \
+          \) -delete
+          ;;
+      esac
       _bk_profiler_run_names="${_bk_ncu_rep_name}"
       _bk_profiler_run_events="${_bk_profiler_level}"
       ;;

diff --git a/scripts/estimation/common.sh b/scripts/estimation/common.sh
@@ -291,6 +291,9 @@ bk_estimation_run_recorded_current_with_weakscaling() {
   if [[ -z "$baseline_breakdown" || "$baseline_breakdown" == "null" ]]; then
     baseline_breakdown="$est_input_fom_breakdown"
   fi
+  if declare -F bk_estimation_package_normalize_recorded_current_breakdown >/dev/null 2>&1; then
+    baseline_breakdown=$(bk_estimation_package_normalize_recorded_current_breakdown "$baseline_breakdown")
+  fi
 
   est_current_system="$baseline_system"
   est_current_target_nodes="$current_target_nodes"

diff --git a/scripts/estimation/packages/instrumented_app_sections_dummy.sh b/scripts/estimation/packages/instrumented_app_sections_dummy.sh
@@ -31,6 +31,7 @@ bk_estimation_package_metadata() {
     "quarter",
     "counter_papi_detailed",
     "trace_mpi_basic",
+    "gpu_kernel_lightgbm_v10",
     "gpu_kernel_mlp_v15",
     "logp"
   ],

diff --git a/scripts/estimation/packages/top_level_package_common.sh b/scripts/estimation/packages/top_level_package_common.sh
@@ -100,6 +100,7 @@ bk_top_level_dispatch_bound_item() {
   local fallback_target
   local check_result
   local missing_inputs_json
+  local transformed_item
 
   package_name=$(echo "$item_json" | jq -r '.estimation_package // empty')
   if [[ -z "$package_name" ]]; then
@@ -122,7 +123,11 @@ bk_top_level_dispatch_bound_item() {
       check_result=$(bk_top_level_unsupported_bound_package_result "$package_name" "$item_kind")
     fi
     if declare -F "$fn_name" >/dev/null 2>&1 && [[ "$(echo "$check_result" | jq -r '.status // "not_applicable"')" == "applicable" ]]; then
-      "$fn_name" "$item_json" "$target_nodes" "$bench_nodes" "$default_factor" "$item_kind"
+      if ! transformed_item=$("$fn_name" "$item_json" "$target_nodes" "$bench_nodes" "$default_factor" "$item_kind"); then
+        echo "ERROR: section package ${package_name} failed for ${item_kind}" >&2
+        return 1
+      fi
+      printf '%s\n' "$transformed_item"
       return 0
     fi
 
@@ -163,6 +168,7 @@ bk_top_level_transform_breakdown() {
   local sections_out=()
   local overlaps_out=()
   local item_json
+  local transformed_item
 
   if [[ -z "$breakdown_json" || "$breakdown_json" == "null" ]]; then
     echo ""
@@ -171,16 +177,18 @@ bk_top_level_transform_breakdown() {
 
   while IFS= read -r item_json; do
     [[ -z "$item_json" ]] && continue
-    sections_out+=("$(
-      bk_top_level_dispatch_bound_item "$item_json" "$target_nodes" "$bench_nodes" "$default_factor" "section" "$default_section_package"
-    )")
+    if ! transformed_item=$(bk_top_level_dispatch_bound_item "$item_json" "$target_nodes" "$bench_nodes" "$default_factor" "section" "$default_section_package"); then
+      return 1
+    fi
+    sections_out+=("$transformed_item")
   done < <(echo "$breakdown_json" | jq -c '.sections // [] | .[]')
 
   while IFS= read -r item_json; do
     [[ -z "$item_json" ]] && continue
-    overlaps_out+=("$(
-      bk_top_level_dispatch_bound_item "$item_json" "$target_nodes" "$bench_nodes" "$default_factor" "overlap" "$default_overlap_package"
-    )")
+    if ! transformed_item=$(bk_top_level_dispatch_bound_item "$item_json" "$target_nodes" "$bench_nodes" "$default_factor" "overlap" "$default_overlap_package"); then
+      return 1
+    fi
+    overlaps_out+=("$transformed_item")
   done < <(echo "$breakdown_json" | jq -c '.overlaps // [] | .[]')
 
   jq -cn \

diff --git a/scripts/estimation/packages/weakscaling.sh b/scripts/estimation/packages/weakscaling.sh
@@ -126,6 +126,30 @@ bk_estimation_package_check_applicability() {
   return 0
 }
 
+# Bind every overlap, and every section whose package is not "logp", to the
+# "identity" package and delete its requested_estimation_package, fallback_used,
+# package_applicability, scaling_method, model and metrics fields.
+# $1: breakdown JSON. Prints compact JSON; a JSON null is passed through as null.
+_bk_weakscaling_normalize_breakdown_packages() {
+  local breakdown_json="$1"
+
+  printf '%s\n' "$breakdown_json" | jq -c '
+    def to_identity:
+      . + {estimation_package: "identity"}
+      | del(.requested_estimation_package, .fallback_used, .package_applicability,
+            .scaling_method, .model, .metrics);
+    if . == null then . else
+      .sections = ((.sections // [])
+        | map(if (.estimation_package // "") == "logp" then . else to_identity end))
+      | .overlaps = ((.overlaps // []) | map(to_identity))
+    end
+  '
+}
+
+bk_estimation_package_normalize_recorded_current_breakdown() { # optional hook; scripts/estimation/common.sh calls it only when `declare -F` finds it
+  _bk_weakscaling_normalize_breakdown_packages "$1" # $1: recorded breakdown JSON; the helper's stdout and exit status are passed through
+}
+
 bk_estimation_package_run() {
   local current_system="${BK_ESTIMATION_CURRENT_SYSTEM:-$est_system}"
   local future_system="${BK_ESTIMATION_FUTURE_SYSTEM:-$est_system}"
@@ -146,7 +170,7 @@ bk_estimation_package_run() {
   est_current_bench_numproc_node="$est_numproc_node"
   est_current_bench_timestamp="$est_timestamp"
   est_current_bench_uuid="$est_uuid"
-  est_current_fom_breakdown=$(bk_top_level_transform_breakdown "$est_input_fom_breakdown" "$current_target_nodes" "$est_node_count" "1" "identity" "identity")
+  est_current_fom_breakdown=$(bk_top_level_transform_breakdown "$(_bk_weakscaling_normalize_breakdown_packages "$est_input_fom_breakdown")" "$current_target_nodes" "$est_node_count" "1" "identity" "identity")
   est_current_fom=$(bk_top_level_breakdown_total_time "$est_current_fom_breakdown")
 
   est_future_system="$future_system"
@@ -158,7 +182,7 @@ bk_estimation_package_run() {
   est_future_bench_numproc_node="$est_numproc_node"
   est_future_bench_timestamp="$est_timestamp"
   est_future_bench_uuid="$est_uuid"
-  est_future_fom_breakdown=$(bk_top_level_transform_breakdown "$est_input_fom_breakdown" "$future_target_nodes" "$est_node_count" "1" "identity" "identity")
+  est_future_fom_breakdown=$(bk_top_level_transform_breakdown "$(_bk_weakscaling_normalize_breakdown_packages "$est_input_fom_breakdown")" "$future_target_nodes" "$est_node_count" "1" "identity" "identity")
   est_future_fom=$(bk_top_level_breakdown_total_time "$est_future_fom_breakdown")
 
   applicability_issues_json=$(jq -cn \