From b1b1429a9072d1be2dcbe80c7f4f00583ef4d45f Mon Sep 17 00:00:00 2001 From: zjli2013 Date: Thu, 28 May 2026 18:49:42 +0800 Subject: [PATCH] Add ROCm-compatible source build fixes Adjust the existing CUDA sources and setup flags so PyTorch ROCm hipify can build the extension without adding generated HIP source files. --- cuda_rasterizer/auxiliary.h | 4 ++-- cuda_rasterizer/backward.cu | 7 +++---- cuda_rasterizer/backward.h | 3 +-- cuda_rasterizer/forward.cu | 7 +++---- cuda_rasterizer/forward.h | 3 +-- cuda_rasterizer/rasterizer_impl.cu | 12 +++++------- cuda_rasterizer/rasterizer_impl.h | 2 +- cuda_rasterizer/utils.h | 1 - setup.py | 20 ++++++++++++++++---- 9 files changed, 32 insertions(+), 27 deletions(-) diff --git a/cuda_rasterizer/auxiliary.h b/cuda_rasterizer/auxiliary.h index 30fa3b2..ce6a7c8 100644 --- a/cuda_rasterizer/auxiliary.h +++ b/cuda_rasterizer/auxiliary.h @@ -220,7 +220,7 @@ __forceinline__ __device__ bool in_frustum(int idx, if (prefiltered) { printf("Point is filtered although prefiltered is set. This shouldn't happen!"); - __trap(); + __builtin_trap(); } return false; } @@ -255,7 +255,7 @@ __forceinline__ __device__ bool in_frustum_triangle(int idx, if (prefiltered) { printf("Point is filtered although prefiltered is set. This shouldn't happen!"); - __trap(); + __builtin_trap(); } return false; } diff --git a/cuda_rasterizer/backward.cu b/cuda_rasterizer/backward.cu index c70c159..d8e5601 100644 --- a/cuda_rasterizer/backward.cu +++ b/cuda_rasterizer/backward.cu @@ -23,8 +23,7 @@ #include "backward.h" #include "auxiliary.h" #include - #include - namespace cg = cooperative_groups; + namespace cg = cooperative_groups; @@ -762,7 +761,7 @@ // Propagate gradients for remaining steps: finish 3D mean gradients, // propagate color gradients to SH (if desireD), propagate 3D covariance // matrix gradients to scale and rotation. - preprocessCUDA << < (P + 255) / 256, 256 >> > ( + preprocessCUDA <<< (P + 255) / 256, 256 >>> ( P, D, M, triangles_points, W, H, @@ -820,7 +819,7 @@ float* dL_dcolors, float* dL_dsigma_factor) { - renderCUDA << > >( + renderCUDA <<>>( ranges, point_list, W, H, diff --git a/cuda_rasterizer/backward.h b/cuda_rasterizer/backward.h index e90e6f3..60748f9 100644 --- a/cuda_rasterizer/backward.h +++ b/cuda_rasterizer/backward.h @@ -25,8 +25,7 @@ #include #include "cuda_runtime.h" - #include "device_launch_parameters.h" - #define GLM_FORCE_CUDA + #define GLM_FORCE_CUDA #include namespace BACKWARD diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu index d68a454..f693b25 100644 --- a/cuda_rasterizer/forward.cu +++ b/cuda_rasterizer/forward.cu @@ -23,8 +23,7 @@ #include "forward.h" #include "auxiliary.h" #include - #include - namespace cg = cooperative_groups; + namespace cg = cooperative_groups; @@ -646,7 +645,7 @@ float* out_others, float* max_blending) { - renderCUDA << > > ( + renderCUDA <<>> ( ranges, point_list, W, H, @@ -704,7 +703,7 @@ uint32_t* tiles_touched, bool prefiltered) { - preprocessCUDA << <(P + 255) / 256, 256 >> > ( + preprocessCUDA <<<(P + 255) / 256, 256 >>> ( P, D, M, triangles_points, sigma, diff --git a/cuda_rasterizer/forward.h b/cuda_rasterizer/forward.h index 7222b2d..82f3244 100644 --- a/cuda_rasterizer/forward.h +++ b/cuda_rasterizer/forward.h @@ -25,8 +25,7 @@ #include #include "cuda_runtime.h" - #include "device_launch_parameters.h" - #define GLM_FORCE_CUDA + #define GLM_FORCE_CUDA #include namespace FORWARD diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu index 58439a3..eebcc44 100644 --- a/cuda_rasterizer/rasterizer_impl.cu +++ b/cuda_rasterizer/rasterizer_impl.cu @@ -27,15 +27,13 @@ #include #include #include "cuda_runtime.h" - #include "device_launch_parameters.h" - #include + #include #include #define GLM_FORCE_CUDA #include #include - #include - namespace cg = cooperative_groups; + namespace cg = cooperative_groups; #include "auxiliary.h" #include "forward.h" @@ -156,7 +154,7 @@ float* projmatrix, bool* present) { - checkFrustum << <(P + 255) / 256, 256 >> > ( + checkFrustum <<<(P + 255) / 256, 256 >>> ( P, means3D, viewmatrix, projmatrix, @@ -320,7 +318,7 @@ // For each instance to be rendered, produce adequate [ tile | depth ] key // and corresponding dublicated Triangle indices to be sorted - duplicateWithKeys << <(P + 255) / 256, 256 >> > ( + duplicateWithKeys <<<(P + 255) / 256, 256 >>> ( P, geomState.means2D, geomState.depths, @@ -349,7 +347,7 @@ // Identify start and end of per-tile workloads in sorted list if (num_rendered > 0) - identifyTileRanges << <(num_rendered + 255) / 256, 256 >> > ( + identifyTileRanges <<<(num_rendered + 255) / 256, 256 >>> ( num_rendered, binningState.point_list_keys, imgState.ranges); diff --git a/cuda_rasterizer/rasterizer_impl.h b/cuda_rasterizer/rasterizer_impl.h index 852e018..b7eca01 100644 --- a/cuda_rasterizer/rasterizer_impl.h +++ b/cuda_rasterizer/rasterizer_impl.h @@ -32,7 +32,7 @@ template static void obtain(char*& chunk, T*& ptr, std::size_t count, std::size_t alignment) { - std::size_t offset = (reinterpret_cast(chunk) + alignment - 1) & ~(alignment - 1); + std::size_t offset = (reinterpret_cast(chunk) + alignment - 1) & ~(alignment - 1); ptr = reinterpret_cast(offset); chunk = reinterpret_cast(ptr + count); } diff --git a/cuda_rasterizer/utils.h b/cuda_rasterizer/utils.h index e49534b..aacf826 100644 --- a/cuda_rasterizer/utils.h +++ b/cuda_rasterizer/utils.h @@ -3,7 +3,6 @@ #include #include "cuda_runtime.h" -#include "device_launch_parameters.h" namespace UTILS { diff --git a/setup.py b/setup.py index 94b274e..5f9f97a 100644 --- a/setup.py +++ b/setup.py @@ -20,10 +20,22 @@ # For inquiries contact jan.held@uliege.be # +from pathlib import Path + +import torch from setuptools import setup -from torch.utils.cpp_extension import CUDAExtension, BuildExtension -import os -os.path.dirname(os.path.abspath(__file__)) +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +ROOT = Path(__file__).resolve().parent +GLM_INCLUDE = ROOT / "third_party" / "glm" +IS_ROCM = bool(getattr(torch.version, "hip", None)) + + +def _nvcc_flags(): + flags = [f"-I{GLM_INCLUDE}"] + if not IS_ROCM: + flags.append("--use_fast_math") + return flags setup( name="diff_triangle_rasterization", @@ -38,7 +50,7 @@ "cuda_rasterizer/utils.cu", "rasterize_points.cu", "ext.cpp"], - extra_compile_args={"nvcc": ["-I" + os.path.join(os.path.dirname(os.path.abspath(__file__)), "third_party/glm/"), "--use_fast_math"]}) + extra_compile_args={"nvcc": _nvcc_flags()}) ], cmdclass={ 'build_ext': BuildExtension