BelixRogner · BelixRogner · May 18, 2026 · May 9, 2026 · May 9, 2026 · May 10, 2026
diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp
@@ -583,14 +583,14 @@ __device__ VAL_T PercentileDevice(const VAL_T* values,
   }
   if (!USE_WEIGHT) {
     BitonicArgSortDevice<VAL_T, INDEX_T, ASCENDING, BITONIC_SORT_NUM_ELEMENTS / 2, 10>(values, indices, len);
-    const double float_pos = (1.0f - alpha) * len;
-    const INDEX_T pos = static_cast<INDEX_T>(float_pos);
+    const double float_pos = (1.0 - alpha) * static_cast<double>(len - 1);
+    const INDEX_T pos = static_cast<INDEX_T>(float_pos) + 1;
     if (pos < 1) {
       return values[indices[0]];
     } else if (pos >= len) {
       return values[indices[len - 1]];
     } else {
-      const double bias = float_pos - pos;
+      const double bias = float_pos - static_cast<double>(pos - 1);
       const VAL_T v1 = values[indices[pos - 1]];
       const VAL_T v2 = values[indices[pos]];
       return static_cast<VAL_T>(v1 - (v1 - v2) * bias);

diff --git a/tests/python_package_test/test_dual.py b/tests/python_package_test/test_dual.py
@@ -4,6 +4,7 @@
 import os
 import platform
 
+import numpy as np
 import pytest
 from sklearn.metrics import log_loss
 
@@ -35,3 +36,122 @@ def test_cpu_and_gpu_work():
     rel = 1e-6 if params_gpu["gpu_use_dp"] else 1e-4
     assert cpu_score == pytest.approx(gpu_score, rel=rel)
     assert gpu_score < 0.242
+
+
+_REQUIRES_CUDA = pytest.mark.skipif(
+    os.environ.get("TASK", "") != "cuda",
+    reason="requires CUDA-enabled LightGBM build (set TASK=cuda)",
+)
+
+# Loose enough to absorb label_t float32 quantization in the renewal kernel,
+# tight enough to flag the ~0.3 bias the old PercentileDevice formula produced.
+_PERCENTILE_TOL = 1e-6
+
+
+def _train_one_tree_for_renewal(device_type, objective, alpha, X, y):
+    # learning_rate=1.0 makes raw_score equal to the renewed leaf value directly,
+    # which lets the assertion compare against numpy.quantile without unwinding shrinkage.
+    params = {
+        "objective": objective,
+        "alpha": alpha,
+        "num_leaves": 7,
+        "min_data_in_leaf": 1,
+        "learning_rate": 1.0,
+        "verbose": -1,
+        "deterministic": True,
+        "num_threads": 1,
+        "seed": 0,
+        "feature_pre_filter": False,
+        "device_type": device_type,
+        "gpu_use_dp": True,
+        "force_col_wise": True,
+    }
+    ds = lgb.Dataset(X, label=y, params={"verbose": -1, "feature_pre_filter": False})
+    return lgb.train(params, ds, num_boost_round=1)
+
+
+@_REQUIRES_CUDA
+@pytest.mark.parametrize("seed", [0, 1, 2])
+def test_cuda_l1_leaf_renewal_matches_numpy_median(seed):
+    """L1 leaf renewal must produce numpy.median(y_in_leaf) on both CPU and CUDA.
+
+    Regression test for the unweighted PercentileDevice formula that previously
+    used `len * (1 - alpha)` instead of `(len - 1) * (1 - alpha)`, biasing
+    leaf values upward in the descending-sort convention used for L1 / quantile
+    renewal.
+    """
+    rng = np.random.default_rng(seed)
+    n = 200
+    X = rng.standard_normal((n, 5)).astype(np.float64)
+    w = rng.standard_normal(5)
+    y = (X @ w + 0.1 * rng.standard_normal(n)).astype(np.float64)
+
+    for device_type in ("cpu", "cuda"):
+        bst = _train_one_tree_for_renewal(device_type, "regression_l1", 0.5, X, y)
+        leaf_idx = bst.predict(X, pred_leaf=True).astype(int).reshape(-1)
+        raw = bst.predict(X, raw_score=True)
+        for li in np.unique(leaf_idx):
+            mask = leaf_idx == li
+            expected = float(np.median(y[mask]))
+            actual = float(raw[mask][0])
+            assert actual == pytest.approx(expected, abs=_PERCENTILE_TOL), (
+                f"{device_type} leaf {li} (n={int(mask.sum())}): expected np.median={expected:.10f}, got {actual:.10f}"
+            )
+
+
+@_REQUIRES_CUDA
+@pytest.mark.parametrize("alpha", [0.1, 0.25, 0.5, 0.7, 0.9])
+def test_cuda_quantile_leaf_renewal_matches_numpy_quantile(alpha):
+    """Quantile leaf renewal must produce numpy.quantile(y_in_leaf, alpha)
+    on both CPU and CUDA. Same regression coverage as the L1 test, but
+    sweeping alpha so the bias of the wrong formula would show on every
+    even/odd leaf size combination.
+    """
+    rng = np.random.default_rng(123)
+    n = 250
+    X = rng.standard_normal((n, 6)).astype(np.float64)
+    w = rng.standard_normal(6)
+    y = (X @ w + 0.1 * rng.standard_normal(n)).astype(np.float64)
+
+    for device_type in ("cpu", "cuda"):
+        bst = _train_one_tree_for_renewal(device_type, "quantile", alpha, X, y)
+        leaf_idx = bst.predict(X, pred_leaf=True).astype(int).reshape(-1)
+        raw = bst.predict(X, raw_score=True)
+        for li in np.unique(leaf_idx):
+            mask = leaf_idx == li
+            expected = float(np.quantile(y[mask], alpha))
+            actual = float(raw[mask][0])
+            assert actual == pytest.approx(expected, abs=_PERCENTILE_TOL), (
+                f"{device_type} alpha={alpha} leaf {li} (n={int(mask.sum())}): "
+                f"expected np.quantile={expected:.10f}, got {actual:.10f}"
+            )
+
+
+@_REQUIRES_CUDA
+@pytest.mark.parametrize("n", [2, 3, 4, 5, 8, 9])
+def test_cuda_l1_median_handles_small_even_and_odd_leaves(n):
+    """Targets the specific failure mode of the old PercentileDevice formula:
+    even-length leaves returning sorted[1] instead of avg(sorted[1], sorted[2]),
+    and odd-length leaves returning avg(sorted[1], sorted[2]) instead of
+    sorted[2]. We force every datapoint into its own leaf, then split a couple
+    in half and check the leaf medians.
+    """
+    rng = np.random.default_rng(7)
+    # one feature so we deterministically split on it; values are well-separated
+    X = np.arange(n, dtype=np.float64).reshape(-1, 1)
+    # values designed so that splitting on the only feature produces leaves of
+    # exactly the requested cardinalities at depth 1 and 2.
+    y = rng.standard_normal(n).astype(np.float64)
+
+    for device_type in ("cpu", "cuda"):
+        bst = _train_one_tree_for_renewal(device_type, "regression_l1", 0.5, X, y)
+        leaf_idx = bst.predict(X, pred_leaf=True).astype(int).reshape(-1)
+        raw = bst.predict(X, raw_score=True)
+        for li in np.unique(leaf_idx):
+            mask = leaf_idx == li
+            expected = float(np.median(y[mask]))
+            actual = float(raw[mask][0])
+            assert actual == pytest.approx(expected, abs=_PERCENTILE_TOL), (
+                f"{device_type} n={n} leaf {li} (size {int(mask.sum())}): "
+                f"expected np.median={expected:.10f}, got {actual:.10f}"
+            )