trianglesplatting · jammm · Jun 4, 2025
diff --git a/cuda_rasterizer/auxiliary.h b/cuda_rasterizer/auxiliary.h
@@ -220,7 +220,11 @@ __forceinline__ __device__ bool in_frustum(int idx,
 		if (prefiltered)
 		{
 			printf("Point is filtered although prefiltered is set. This shouldn't happen!");
-			__trap();
+			#if defined(__HIPCC__)
+				__builtin_trap();
+			#else
+				__trap();
+			#endif
 		}
 		return false;
 	}
@@ -255,7 +259,11 @@ __forceinline__ __device__ bool in_frustum_triangle(int idx,
 		if (prefiltered)
 		{
 			printf("Point is filtered although prefiltered is set. This shouldn't happen!");
-			__trap();
+			#if defined(__HIPCC__)
+				__builtin_trap();
+			#else
+				__trap();
+			#endif
 		}
 		return false;
 	}

diff --git a/cuda_rasterizer/backward.cu b/cuda_rasterizer/backward.cu
@@ -23,7 +23,9 @@
  #include "backward.h"
  #include "auxiliary.h"
  #include <cooperative_groups.h>
+ #ifndef __HIPCC__
  #include <cooperative_groups/reduce.h>
+ #endif
  namespace cg = cooperative_groups;
 
 
@@ -762,7 +764,7 @@
 	 // Propagate gradients for remaining steps: finish 3D mean gradients,
 	 // propagate color gradients to SH (if desireD), propagate 3D covariance
 	 // matrix gradients to scale and rotation.
-	 preprocessCUDA<NUM_CHANNELS> << < (P + 255) / 256, 256 >> > (
+	 preprocessCUDA<NUM_CHANNELS> <<<(P + 255) / 256, 256>>> (
 		 P, D, M,
 		 triangles_points,
 		 W, H,
@@ -820,7 +822,7 @@
 	 float* dL_dcolors,
 	 float* dL_dsigma_factor)
  {
-	 renderCUDA<NUM_CHANNELS> << <grid, block >> >(
+	 renderCUDA<NUM_CHANNELS><<<grid, block>>>(
 		 ranges,
 		 point_list,
 		 W, H,

diff --git a/cuda_rasterizer/backward.h b/cuda_rasterizer/backward.h
@@ -25,7 +25,9 @@
 
  #include <cuda.h>
  #include "cuda_runtime.h"
+ #ifndef __HIPCC__
  #include "device_launch_parameters.h"
+ #endif
  #define GLM_FORCE_CUDA
  #include <glm/glm.hpp>
 

diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu
@@ -23,7 +23,9 @@
  #include "forward.h"
  #include "auxiliary.h"
  #include <cooperative_groups.h>
+ #ifndef __HIPCC__
  #include <cooperative_groups/reduce.h>
+ #endif
  namespace cg = cooperative_groups;
 
 
@@ -646,7 +648,7 @@
 	 float* out_others,
 	float* max_blending)
  {
-	 renderCUDA<NUM_CHANNELS> << <grid, block >> > (
+	 renderCUDA<NUM_CHANNELS> <<<grid, block>>> (
 		 ranges,
 		 point_list,
 		 W, H,
@@ -704,7 +706,7 @@
 	 uint32_t* tiles_touched,
 	 bool prefiltered)
  {
-	 preprocessCUDA<NUM_CHANNELS> << <(P + 255) / 256, 256 >> > (
+	 preprocessCUDA<NUM_CHANNELS> <<<(P + 255) / 256, 256>>> (
 		 P, D, M,
 		 triangles_points,
 		 sigma,

diff --git a/cuda_rasterizer/forward.h b/cuda_rasterizer/forward.h
@@ -25,7 +25,9 @@
 
  #include <cuda.h>
  #include "cuda_runtime.h"
+ #ifndef __HIPCC__
  #include "device_launch_parameters.h"
+ #endif
  #define GLM_FORCE_CUDA
  #include <glm/glm.hpp>
 

diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu
@@ -27,14 +27,18 @@
  #include <numeric>
  #include <cuda.h>
  #include "cuda_runtime.h"
+ #ifndef __HIPCC__
  #include "device_launch_parameters.h"
+ #endif
  #include <cub/cub.cuh>
  #include <cub/device/device_radix_sort.cuh>
  #define GLM_FORCE_CUDA
  #include <glm/glm.hpp>
 
  #include <cooperative_groups.h>
+ #ifndef __HIPCC__
  #include <cooperative_groups/reduce.h>
+ #endif
  namespace cg = cooperative_groups;
 
  #include "auxiliary.h"
@@ -156,7 +160,7 @@
 	 float* projmatrix,
 	 bool* present)
  {
-	 checkFrustum << <(P + 255) / 256, 256 >> > (
+	 checkFrustum <<<(P + 255) / 256, 256>>> (
 		 P,
 		 means3D,
 		 viewmatrix, projmatrix,
@@ -320,7 +324,7 @@
 
 	 // For each instance to be rendered, produce adequate [ tile | depth ] key 
 	 // and corresponding dublicated Triangle indices to be sorted
-	 duplicateWithKeys << <(P + 255) / 256, 256 >> > (
+	 duplicateWithKeys<<<(P + 255) / 256, 256>>> (
 		 P,
 		 geomState.means2D,
 		 geomState.depths,
@@ -349,7 +353,7 @@
 
 	 // Identify start and end of per-tile workloads in sorted list
 	 if (num_rendered > 0)
-		 identifyTileRanges << <(num_rendered + 255) / 256, 256 >> > (
+		 identifyTileRanges <<<(num_rendered + 255) / 256, 256>>> (
 			 num_rendered,
 			 binningState.point_list_keys,
 			 imgState.ranges);

diff --git a/cuda_rasterizer/utils.h b/cuda_rasterizer/utils.h
@@ -3,7 +3,9 @@
 
 #include <cuda.h>
 #include "cuda_runtime.h"
+#ifndef __HIPCC__
 #include "device_launch_parameters.h"
+#endif
 
 namespace UTILS
 {

diff --git a/setup.py b/setup.py
@@ -23,8 +23,14 @@
 from setuptools import setup
 from torch.utils.cpp_extension import CUDAExtension, BuildExtension
 import os
+import torch
 os.path.dirname(os.path.abspath(__file__))
 
+use_fast_math_flag = "--use_fast_math"
+if torch.version.hip is not None:
+    # hipcc (clang-based) does not recognize nvcc's --use_fast_math; pass clang's -ffast-math instead
+    use_fast_math_flag = "-ffast-math"
+
 setup(
     name="diff_triangle_rasterization",
     packages=['diff_triangle_rasterization'],
@@ -38,7 +44,7 @@
             "cuda_rasterizer/utils.cu",
             "rasterize_points.cu",
             "ext.cpp"],
-            extra_compile_args={"nvcc": ["-I" + os.path.join(os.path.dirname(os.path.abspath(__file__)), "third_party/glm/"), "--use_fast_math"]})
+            extra_compile_args={"nvcc": ["-I" + os.path.join(os.path.dirname(os.path.abspath(__file__)), "third_party/glm/"), use_fast_math_flag]})
         ],
     cmdclass={
         'build_ext': BuildExtension