diff --git a/cuda_rasterizer/auxiliary.h b/cuda_rasterizer/auxiliary.h index 30fa3b2..283cd85 100644 --- a/cuda_rasterizer/auxiliary.h +++ b/cuda_rasterizer/auxiliary.h @@ -220,7 +220,11 @@ __forceinline__ __device__ bool in_frustum(int idx, if (prefiltered) { printf("Point is filtered although prefiltered is set. This shouldn't happen!"); - __trap(); + #if defined(__HIPCC__) + __builtin_trap(); + #else + __trap(); + #endif } return false; } @@ -255,7 +259,11 @@ __forceinline__ __device__ bool in_frustum_triangle(int idx, if (prefiltered) { printf("Point is filtered although prefiltered is set. This shouldn't happen!"); - __trap(); + #if defined(__HIPCC__) + __builtin_trap(); + #else + __trap(); + #endif } return false; } diff --git a/cuda_rasterizer/backward.cu b/cuda_rasterizer/backward.cu index c70c159..1fff02a 100644 --- a/cuda_rasterizer/backward.cu +++ b/cuda_rasterizer/backward.cu @@ -23,7 +23,9 @@ #include "backward.h" #include "auxiliary.h" #include + #ifndef __HIPCC__ #include + #endif namespace cg = cooperative_groups; @@ -762,7 +764,7 @@ // Propagate gradients for remaining steps: finish 3D mean gradients, // propagate color gradients to SH (if desireD), propagate 3D covariance // matrix gradients to scale and rotation. - preprocessCUDA << < (P + 255) / 256, 256 >> > ( + preprocessCUDA <<<(P + 255) / 256, 256>>> ( P, D, M, triangles_points, W, H, @@ -820,7 +822,7 @@ float* dL_dcolors, float* dL_dsigma_factor) { - renderCUDA << > >( + renderCUDA<<>>( ranges, point_list, W, H, diff --git a/cuda_rasterizer/backward.h b/cuda_rasterizer/backward.h index e90e6f3..86ba851 100644 --- a/cuda_rasterizer/backward.h +++ b/cuda_rasterizer/backward.h @@ -25,7 +25,9 @@ #include #include "cuda_runtime.h" + #ifndef __HIPCC__ #include "device_launch_parameters.h" + #endif #define GLM_FORCE_CUDA #include diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu index d68a454..0845188 100644 --- a/cuda_rasterizer/forward.cu +++ b/cuda_rasterizer/forward.cu @@ -23,7 +23,9 @@ #include "forward.h" #include "auxiliary.h" #include + #ifndef __HIPCC__ #include + #endif namespace cg = cooperative_groups; @@ -646,7 +648,7 @@ float* out_others, float* max_blending) { - renderCUDA << > > ( + renderCUDA <<>> ( ranges, point_list, W, H, @@ -704,7 +706,7 @@ uint32_t* tiles_touched, bool prefiltered) { - preprocessCUDA << <(P + 255) / 256, 256 >> > ( + preprocessCUDA <<<(P + 255) / 256, 256>>> ( P, D, M, triangles_points, sigma, diff --git a/cuda_rasterizer/forward.h b/cuda_rasterizer/forward.h index 7222b2d..753ba34 100644 --- a/cuda_rasterizer/forward.h +++ b/cuda_rasterizer/forward.h @@ -25,7 +25,9 @@ #include #include "cuda_runtime.h" + #ifndef __HIPCC__ #include "device_launch_parameters.h" + #endif #define GLM_FORCE_CUDA #include diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu index 58439a3..2204626 100644 --- a/cuda_rasterizer/rasterizer_impl.cu +++ b/cuda_rasterizer/rasterizer_impl.cu @@ -27,14 +27,18 @@ #include #include #include "cuda_runtime.h" + #ifndef __HIPCC__ #include "device_launch_parameters.h" + #endif #include #include #define GLM_FORCE_CUDA #include #include + #ifndef __HIPCC__ #include + #endif namespace cg = cooperative_groups; #include "auxiliary.h" @@ -156,7 +160,7 @@ float* projmatrix, bool* present) { - checkFrustum << <(P + 255) / 256, 256 >> > ( + checkFrustum <<<(P + 255) / 256, 256>>> ( P, means3D, viewmatrix, projmatrix, @@ -320,7 +324,7 @@ // For each instance to be rendered, produce adequate [ tile | depth ] key // and corresponding dublicated Triangle indices to be sorted - duplicateWithKeys << <(P + 255) / 256, 256 >> > ( + duplicateWithKeys<<<(P + 255) / 256, 256>>> ( P, geomState.means2D, geomState.depths, @@ -349,7 +353,7 @@ // Identify start and end of per-tile workloads in sorted list if (num_rendered > 0) - identifyTileRanges << <(num_rendered + 255) / 256, 256 >> > ( + identifyTileRanges <<<(num_rendered + 255) / 256, 256>>> ( num_rendered, binningState.point_list_keys, imgState.ranges); diff --git a/cuda_rasterizer/utils.h b/cuda_rasterizer/utils.h index e49534b..f94e7a8 100644 --- a/cuda_rasterizer/utils.h +++ b/cuda_rasterizer/utils.h @@ -3,7 +3,9 @@ #include #include "cuda_runtime.h" +#ifndef __HIPCC__ #include "device_launch_parameters.h" +#endif namespace UTILS { diff --git a/setup.py b/setup.py index 94b274e..a1c1c81 100644 --- a/setup.py +++ b/setup.py @@ -23,8 +23,14 @@ from setuptools import setup from torch.utils.cpp_extension import CUDAExtension, BuildExtension import os +import torch os.path.dirname(os.path.abspath(__file__)) +use_fast_math_flag = "--use_fast_math" +if torch.version.hip is not None: + # hipcc (clang) does not recognize --use_fast_math; use clang's flag + use_fast_math_flag = "-ffast-math" + setup( name="diff_triangle_rasterization", packages=['diff_triangle_rasterization'], @@ -38,7 +44,7 @@ "cuda_rasterizer/utils.cu", "rasterize_points.cu", "ext.cpp"], - extra_compile_args={"nvcc": ["-I" + os.path.join(os.path.dirname(os.path.abspath(__file__)), "third_party/glm/"), "--use_fast_math"]}) + extra_compile_args={"nvcc": ["-I" + os.path.join(os.path.dirname(os.path.abspath(__file__)), "third_party/glm/"), use_fast_math_flag]}) ], cmdclass={ 'build_ext': BuildExtension