Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions include/LightGBM/cuda/cuda_algorithms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -498,9 +498,14 @@ __device__ void ShuffleSortedPrefixSumDevice(const VAL_T* in_values,
}
__syncthreads();
thread_sum = ShufflePrefixSumExclusive<REDUCE_VAL_T>(thread_sum, shared_buffer);
const REDUCE_VAL_T thread_base = shared_buffer[threadIdx.x];
// Use the per-thread exclusive prefix sum returned above. The previous
// shared_buffer[threadIdx.x] read was OOB whenever blockDim.x > WARPSIZE
// (the shared_buffer is sized WARPSIZE) and produced an illegal-memory
// access on weighted L1/quantile renewal kernels with blockDim.x = 256.
REDUCE_VAL_T running = thread_sum;
for (INDEX_T index = start; index < end; ++index) {
out_values[index] = thread_base + static_cast<REDUCE_VAL_T>(in_values[sorted_indices[index]]);
running += static_cast<REDUCE_VAL_T>(in_values[sorted_indices[index]]);
out_values[index] = running;
}
__syncthreads();
}
Expand Down
35 changes: 35 additions & 0 deletions tests/python_package_test/test_dual.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,41 @@ def test_cuda_init_score_matches_cpu(objective, alpha, n):
assert cuda == pytest.approx(cpu, abs=1e-6), f"{objective} alpha={alpha} n={n}: cpu={cpu} cuda={cuda}"


# Skip marker for tests that need a CUDA-enabled LightGBM build.
# The test is skipped unless the TASK environment variable is exactly "cuda".
_REQUIRES_CUDA = pytest.mark.skipif(
    os.getenv("TASK", "") != "cuda",
    reason="requires CUDA-enabled LightGBM build (set TASK=cuda)",
)


@_REQUIRES_CUDA
@pytest.mark.parametrize("objective", ["regression_l1", "quantile"])
@pytest.mark.parametrize("n", [100, 200, 500, 1000])
def test_cuda_weighted_percentile_renewal_does_not_crash(objective, n):
    """Train a weighted L1 / quantile model on CUDA and check predictions are finite.

    Regression test for the out-of-bounds shared-memory access in
    ShuffleSortedPrefixSumDevice, which crashed weighted L1 / weighted
    quantile training with "illegal memory access" for n >= ~100.
    """
    generator = np.random.default_rng(0)
    # Draw features, then labels, then weights: the order fixes the random stream.
    features = generator.standard_normal((n, 3)).astype(np.float64)
    labels = generator.standard_normal(n).astype(np.float64)
    sample_weights = generator.random(n)

    dataset_params = {"verbose": -1, "feature_pre_filter": False}
    train_set = lgb.Dataset(
        features,
        label=labels,
        weight=sample_weights,
        params=dataset_params,
    )

    train_params = dict(
        objective=objective,
        alpha=0.5,
        device_type="cuda",
        verbose=-1,
        num_leaves=4,
        min_data_in_leaf=1,
        deterministic=True,
        gpu_use_dp=True,
    )

    # A regression of the OOB access surfaces here as a CUDA
    # "illegal memory access" error raised during training.
    booster = lgb.train(train_params, train_set, num_boost_round=2)

    raw_scores = booster.predict(features, raw_score=True)
    assert np.isfinite(raw_scores).all(), "weighted percentile renewal produced non-finite predictions"


@pytest.mark.skipif(
os.environ.get("LIGHTGBM_TEST_DUAL_CPU_GPU", "0") != "1",
reason="Set LIGHTGBM_TEST_DUAL_CPU_GPU=1 to test using CPU and GPU training from the same package.",
Expand Down
Loading