Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions include/LightGBM/cuda/cuda_algorithms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -583,14 +583,14 @@ __device__ VAL_T PercentileDevice(const VAL_T* values,
}
if (!USE_WEIGHT) {
BitonicArgSortDevice<VAL_T, INDEX_T, ASCENDING, BITONIC_SORT_NUM_ELEMENTS / 2, 10>(values, indices, len);
const double float_pos = (1.0f - alpha) * len;
const INDEX_T pos = static_cast<INDEX_T>(float_pos);
const double float_pos = (1.0 - alpha) * static_cast<double>(len - 1);
const INDEX_T pos = static_cast<INDEX_T>(float_pos) + 1;
if (pos < 1) {
return values[indices[0]];
} else if (pos >= len) {
return values[indices[len - 1]];
} else {
const double bias = float_pos - pos;
const double bias = float_pos - static_cast<double>(pos - 1);
const VAL_T v1 = values[indices[pos - 1]];
const VAL_T v2 = values[indices[pos]];
return static_cast<VAL_T>(v1 - (v1 - v2) * bias);
Expand Down
120 changes: 120 additions & 0 deletions tests/python_package_test/test_dual.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
import platform

import numpy as np
import pytest
from sklearn.metrics import log_loss

Expand Down Expand Up @@ -35,3 +36,122 @@ def test_cpu_and_gpu_work():
rel = 1e-6 if params_gpu["gpu_use_dp"] else 1e-4
assert cpu_score == pytest.approx(gpu_score, rel=rel)
assert gpu_score < 0.242


_REQUIRES_CUDA = pytest.mark.skipif(
os.environ.get("TASK", "") != "cuda",
reason="requires CUDA-enabled LightGBM build (set TASK=cuda)",
)

# Loose enough to absorb label_t float32 quantization in the renewal kernel,
# tight enough to flag the ~0.3 bias the old PercentileDevice formula produced.
_PERCENTILE_TOL = 1e-6


def _train_one_tree_for_renewal(device_type, objective, alpha, X, y):
# learning_rate=1.0 makes raw_score equal to the renewed leaf value directly,
# which lets the assertion compare against numpy.quantile without unwinding shrinkage.
params = {
"objective": objective,
"alpha": alpha,
"num_leaves": 7,
"min_data_in_leaf": 1,
"learning_rate": 1.0,
"verbose": -1,
"deterministic": True,
"num_threads": 1,
"seed": 0,
"feature_pre_filter": False,
"device_type": device_type,
"gpu_use_dp": True,
"force_col_wise": True,
}
ds = lgb.Dataset(X, label=y, params={"verbose": -1, "feature_pre_filter": False})
return lgb.train(params, ds, num_boost_round=1)


@_REQUIRES_CUDA
@pytest.mark.parametrize("seed", [0, 1, 2])
def test_cuda_l1_leaf_renewal_matches_numpy_median(seed):
"""L1 leaf renewal must produce numpy.median(y_in_leaf) on both CPU and CUDA.

Regression test for the unweighted PercentileDevice formula that previously
used `len * (1 - alpha)` instead of `(len - 1) * (1 - alpha)`, biasing
leaf values upward in the descending-sort convention used for L1 / quantile
renewal.
"""
rng = np.random.default_rng(seed)
n = 200
X = rng.standard_normal((n, 5)).astype(np.float64)
w = rng.standard_normal(5)
y = (X @ w + 0.1 * rng.standard_normal(n)).astype(np.float64)

for device_type in ("cpu", "cuda"):
bst = _train_one_tree_for_renewal(device_type, "regression_l1", 0.5, X, y)
leaf_idx = bst.predict(X, pred_leaf=True).astype(int).reshape(-1)
raw = bst.predict(X, raw_score=True)
for li in np.unique(leaf_idx):
mask = leaf_idx == li
expected = float(np.median(y[mask]))
actual = float(raw[mask][0])
assert actual == pytest.approx(expected, abs=_PERCENTILE_TOL), (
f"{device_type} leaf {li} (n={int(mask.sum())}): expected np.median={expected:.10f}, got {actual:.10f}"
)


@_REQUIRES_CUDA
@pytest.mark.parametrize("alpha", [0.1, 0.25, 0.5, 0.7, 0.9])
def test_cuda_quantile_leaf_renewal_matches_numpy_quantile(alpha):
"""Quantile leaf renewal must produce numpy.quantile(y_in_leaf, alpha)
on both CPU and CUDA. Same regression coverage as the L1 test, but
sweeping alpha so the bias of the wrong formula would show on every
even/odd leaf size combination.
"""
rng = np.random.default_rng(123)
n = 250
X = rng.standard_normal((n, 6)).astype(np.float64)
w = rng.standard_normal(6)
y = (X @ w + 0.1 * rng.standard_normal(n)).astype(np.float64)

for device_type in ("cpu", "cuda"):
bst = _train_one_tree_for_renewal(device_type, "quantile", alpha, X, y)
leaf_idx = bst.predict(X, pred_leaf=True).astype(int).reshape(-1)
raw = bst.predict(X, raw_score=True)
for li in np.unique(leaf_idx):
mask = leaf_idx == li
expected = float(np.quantile(y[mask], alpha))
actual = float(raw[mask][0])
assert actual == pytest.approx(expected, abs=_PERCENTILE_TOL), (
f"{device_type} alpha={alpha} leaf {li} (n={int(mask.sum())}): "
f"expected np.quantile={expected:.10f}, got {actual:.10f}"
)


@_REQUIRES_CUDA
@pytest.mark.parametrize("n", [2, 3, 4, 5, 8, 9])
def test_cuda_l1_median_handles_small_even_and_odd_leaves(n):
"""Targets the specific failure mode of the old PercentileDevice formula:
even-length leaves returning sorted[1] instead of avg(sorted[1], sorted[2]),
and odd-length leaves returning avg(sorted[1], sorted[2]) instead of
sorted[2]. We force every datapoint into its own leaf, then split a couple
in half and check the leaf medians.
"""
rng = np.random.default_rng(7)
# one feature so we deterministically split on it; values are well-separated
X = np.arange(n, dtype=np.float64).reshape(-1, 1)
# values designed so that splitting on the only feature produces leaves of
# exactly the requested cardinalities at depth 1 and 2.
y = rng.standard_normal(n).astype(np.float64)

for device_type in ("cpu", "cuda"):
bst = _train_one_tree_for_renewal(device_type, "regression_l1", 0.5, X, y)
leaf_idx = bst.predict(X, pred_leaf=True).astype(int).reshape(-1)
raw = bst.predict(X, raw_score=True)
for li in np.unique(leaf_idx):
mask = leaf_idx == li
expected = float(np.median(y[mask]))
actual = float(raw[mask][0])
assert actual == pytest.approx(expected, abs=_PERCENTILE_TOL), (
f"{device_type} n={n} leaf {li} (size {int(mask.sum())}): "
f"expected np.median={expected:.10f}, got {actual:.10f}"
)
Loading