From 9c64eb2c6f4244eb288d9d507f66d947209ee5d1 Mon Sep 17 00:00:00 2001 From: Andrew Boessen Date: Fri, 5 Dec 2025 13:12:11 -0500 Subject: [PATCH 01/23] change coordinate system --- cuda/data.cu | 4 +- cuda/gaussian.cu | 128 +++++--- cuda/gaussian_backward.cu | 402 +++++++++++++++++--------- cuda/projection.cu | 106 ++++--- cuda/projection_backward.cu | 139 +++++---- cuda/raster.cu | 14 +- cuda/render.cu | 11 +- cuda/render_backward.cu | 21 +- cuda/trainer.cu | 120 +++++--- include/gsplat_cuda/cuda_backward.cuh | 29 +- include/gsplat_cuda/cuda_data.cuh | 2 +- include/gsplat_cuda/cuda_forward.cuh | 21 +- tests/cuda_backward_test.cpp | 205 +++++++------ tests/cuda_forward_test.cpp | 167 +++++++---- 14 files changed, 857 insertions(+), 512 deletions(-) diff --git a/cuda/data.cu b/cuda/data.cu index e2a099e..fdb66ae 100644 --- a/cuda/data.cu +++ b/cuda/data.cu @@ -94,8 +94,8 @@ GradientAccumulators::GradientAccumulators(size_t max_gaussians) { CameraParameters::CameraParameters() { try { // Allocate camera parameters - d_K.resize(9); // 3x3 matrix - d_T.resize(12); // 3x4 matrix + d_view.resize(16); // 4x4 matrix + d_proj.resize(16); // 4x4 matrix } catch (const std::exception &e) { fprintf(stderr, "CUDA Memory Allocation Error (CudaDataManager): %s\n", e.what()); exit(EXIT_FAILURE); diff --git a/cuda/gaussian.cu b/cuda/gaussian.cu index 895b412..67e3811 100644 --- a/cuda/gaussian.cu +++ b/cuda/gaussian.cu @@ -77,7 +77,7 @@ __global__ void compute_sigma_fused_kernel(const float *__restrict__ quaternion, sigma[sigma_base_idx + 8] = rs20 * rs20 + rs21 * rs21 + rs22 * rs22; // S_22 } -__global__ void compute_conic_kernel(const float *__restrict__ sigma, const float *__restrict__ T, +__global__ void compute_conic_kernel(const float *__restrict__ sigma, const float *__restrict__ view, const float *__restrict__ J, const int N, float *conic) { constexpr int SIGMA_STRIDE = 9; constexpr int J_STRIDE = 6; @@ -86,22 +86,22 @@ __global__ void compute_conic_kernel(const float *__restrict__ sigma, const floa const int i = blockIdx.x * blockDim.x + threadIdx.x; const int lane_id = threadIdx.x & 0x1f; // lane_id in warp (0-31) - // Load and broadcast Extrinsic Matrix T (3x4) within warp - float t_val = 0.0f; - if (lane_id < 12) { - t_val = T[lane_id]; + // Load and broadcast View Matrix (4x4) within warp + float v_val = 0.0f; + if (lane_id < 16) { + v_val = view[lane_id]; } - // T = [r00, r01, r02, t0, r10, r11, r12, t1, r20, r21, r22, t2] - // W = [r00, r01, r02, r10, r11, r12, r20, r21, r22] - const float w00 = __shfl_sync(0xffffffff, t_val, 0); - const float w01 = __shfl_sync(0xffffffff, t_val, 1); - const float w02 = __shfl_sync(0xffffffff, t_val, 2); - const float w10 = __shfl_sync(0xffffffff, t_val, 4); - const float w11 = __shfl_sync(0xffffffff, t_val, 5); - const float w12 = __shfl_sync(0xffffffff, t_val, 6); - const float w20 = __shfl_sync(0xffffffff, t_val, 8); - const float w21 = __shfl_sync(0xffffffff, t_val, 9); - const float w22 = __shfl_sync(0xffffffff, t_val, 10); + // View = [r00, r01, r02, t0, r10, r11, r12, t1, r20, r21, r22, t2, 0, 0, 0, 1] + // W (rotation part) = [r00, r01, r02, r10, r11, r12, r20, r21, r22] + const float w00 = __shfl_sync(0xffffffff, v_val, 0); + const float w01 = __shfl_sync(0xffffffff, v_val, 1); + const float w02 = __shfl_sync(0xffffffff, v_val, 2); + const float w10 = __shfl_sync(0xffffffff, v_val, 4); + const float w11 = __shfl_sync(0xffffffff, v_val, 5); + const float w12 = __shfl_sync(0xffffffff, v_val, 6); + const float w20 = __shfl_sync(0xffffffff, v_val, 8); + const float w21 = __shfl_sync(0xffffffff, v_val, 9); + const float w22 = __shfl_sync(0xffffffff, v_val, 10); if (i >= N) { return; @@ -145,19 +145,23 @@ __global__ void compute_conic_kernel(const float *__restrict__ sigma, const floa const float v21 = s02 * m10 + s12 * m11 + s22 * m12; // 3. Compute conic = M @ V. The resulting conic is a 2x2 symmetric matrix. - // We only need to compute and store the 3 unique elements of the upper triangle. - const float c00 = m00 * v00 + m01 * v10 + m02 * v20; - const float c01 = m00 * v01 + m01 * v11 + m02 * v21; // Also equals c10 - const float c11 = m10 * v01 + m11 * v11 + m12 * v21; + // Covariance is symmetric, so we only need to store the upper triangle + // cov = [cov00, cov01, cov11] + const float cov00 = m00 * v00 + m01 * v10 + m02 * v20 + 0.3f; + const float cov01 = m00 * v01 + m01 * v11 + m02 * v21; + const float cov11 = m10 * v01 + m11 * v11 + m12 * v21 + 0.3f; + + // Invert covariance matrix (2x2) + const float det = cov00 * cov11 - cov01 * cov01; + const float inv_det = 1.0f / det; - // 4. Store the 3 unique components of the conic matrix into global memory. const int conic_base_idx = i * CONIC_STRIDE; - conic[conic_base_idx + 0] = c00; - conic[conic_base_idx + 1] = c01; - conic[conic_base_idx + 2] = c11; + conic[conic_base_idx + 0] = cov11 * inv_det; + conic[conic_base_idx + 1] = -cov01 * inv_det; + conic[conic_base_idx + 2] = cov00 * inv_det; } -__global__ void compute_projection_jacobian_kernel(const float *__restrict__ xyz, const float *__restrict__ K, +__global__ void compute_projection_jacobian_kernel(const float *__restrict__ xyz, const float *__restrict__ proj, const int N, float *J) { constexpr int XYZ_STRIDE = 3; constexpr int J_STRIDE = 6; @@ -165,14 +169,23 @@ __global__ void compute_projection_jacobian_kernel(const float *__restrict__ xyz const int i = blockIdx.x * blockDim.x + threadIdx.x; const int lane_id = threadIdx.x & 0x1f; - // load and broadcast K to all threads in warp - float k_val = 0.0f; - if (lane_id < 9) { - k_val = K[lane_id]; + // load and broadcast Proj to all threads in warp + float p_val = 0.0f; + if (lane_id < 16) { + p_val = proj[lane_id]; } - // K = [fx, 0, cx, 0, fy, cy, 0, 0, 1] - const float fx = __shfl_sync(0xffffffff, k_val, 0); - const float fy = __shfl_sync(0xffffffff, k_val, 4); + const float p00 = __shfl_sync(0xffffffff, p_val, 0); + const float p01 = __shfl_sync(0xffffffff, p_val, 1); + const float p02 = __shfl_sync(0xffffffff, p_val, 2); + const float p03 = __shfl_sync(0xffffffff, p_val, 3); + const float p10 = __shfl_sync(0xffffffff, p_val, 4); + const float p11 = __shfl_sync(0xffffffff, p_val, 5); + const float p12 = __shfl_sync(0xffffffff, p_val, 6); + const float p13 = __shfl_sync(0xffffffff, p_val, 7); + const float p30 = __shfl_sync(0xffffffff, p_val, 12); + const float p31 = __shfl_sync(0xffffffff, p_val, 13); + const float p32 = __shfl_sync(0xffffffff, p_val, 14); + const float p33 = __shfl_sync(0xffffffff, p_val, 15); if (i >= N) { return; @@ -182,12 +195,39 @@ __global__ void compute_projection_jacobian_kernel(const float *__restrict__ xyz float y = xyz[i * XYZ_STRIDE + 1]; float z = xyz[i * XYZ_STRIDE + 2]; - J[i * J_STRIDE + 0] = fx / z; - J[i * J_STRIDE + 1] = 0; - J[i * J_STRIDE + 2] = -fx * x / (z * z); - J[i * J_STRIDE + 3] = 0; - J[i * J_STRIDE + 4] = fy / z; - J[i * J_STRIDE + 5] = -fy * y / (z * z); + // Clip coordinates + float xc = p00 * x + p01 * y + p02 * z + p03; + float yc = p10 * x + p11 * y + p12 * z + p13; + float wc = p30 * x + p31 * y + p32 * z + p33; + + // Avoid division by zero + if (fabsf(wc) < 1e-6f) { + J[i * J_STRIDE + 0] = 0; + J[i * J_STRIDE + 1] = 0; + J[i * J_STRIDE + 2] = 0; + J[i * J_STRIDE + 3] = 0; + J[i * J_STRIDE + 4] = 0; + J[i * J_STRIDE + 5] = 0; + return; + } + + float wc_inv = 1.0f / wc; + float wc_inv2 = wc_inv * wc_inv; + + // Jacobian of NDC coordinates (x/w, y/w) w.r.t. camera coordinates (x, y, z) + // d(x/w)/dx = (dx_c/dx * w - x_c * dw_c/dx) / w^2 + // dx_c/dx = p00, dw_c/dx = p30 + // d(x/w)/dx = p00/w - xc*p30/w^2 + + // Row 0: d(x_ndc) / d(x, y, z) + J[i * J_STRIDE + 0] = (p00 * wc - xc * p30) * wc_inv2; // dx + J[i * J_STRIDE + 1] = (p01 * wc - xc * p31) * wc_inv2; // dy + J[i * J_STRIDE + 2] = (p02 * wc - xc * p32) * wc_inv2; // dz + + // Row 1: d(y_ndc) / d(x, y, z) + J[i * J_STRIDE + 3] = (p10 * wc - yc * p30) * wc_inv2; // dx + J[i * J_STRIDE + 4] = (p11 * wc - yc * p31) * wc_inv2; // dy + J[i * J_STRIDE + 5] = (p12 * wc - yc * p32) * wc_inv2; // dz } void compute_sigma(float *const quaternion, float *const scale, const int N, float *sigma, cudaStream_t stream) { @@ -207,13 +247,13 @@ void compute_sigma(float *const quaternion, float *const scale, const int N, flo compute_sigma_fused_kernel<<>>(quaternion, scale, N, sigma); } -void compute_conic(float *const xyz, const float *K, float *const sigma, const float *T, const int N, float *J, +void compute_conic(float *const xyz, const float *view, float *const sigma, const float *proj, const int N, float *J, float *conic, cudaStream_t stream) { // Ensure all provided pointers are valid GPU device pointers. ASSERT_DEVICE_POINTER(xyz); - ASSERT_DEVICE_POINTER(K); + ASSERT_DEVICE_POINTER(proj); ASSERT_DEVICE_POINTER(sigma); - ASSERT_DEVICE_POINTER(T); + ASSERT_DEVICE_POINTER(view); ASSERT_DEVICE_POINTER(J); ASSERT_DEVICE_POINTER(conic); @@ -226,9 +266,9 @@ void compute_conic(float *const xyz, const float *K, float *const sigma, const f const dim3 blocksize(threads_per_block, 1, 1); // This kernel computes the Jacobian (J) for each Gaussian. - compute_projection_jacobian_kernel<<>>(xyz, K, N, J); + compute_projection_jacobian_kernel<<>>(xyz, proj, N, J); - // This kernel uses the world-space covariance (sigma), the camera transform (T), + // This kernel uses the world-space covariance (sigma), the camera transform (View), // and the Jacobian (J) computed in the previous step to find the 2D conic. - compute_conic_kernel<<>>(sigma, T, J, N, conic); + compute_conic_kernel<<>>(sigma, view, J, N, conic); } diff --git a/cuda/gaussian_backward.cu b/cuda/gaussian_backward.cu index bf76935..0f6e7be 100644 --- a/cuda/gaussian_backward.cu +++ b/cuda/gaussian_backward.cu @@ -3,172 +3,308 @@ #include "checks.cuh" #include "gsplat_cuda/cuda_backward.cuh" -__global__ void compute_proj_jacobian_backward_kernel(const float *__restrict__ xyz_c, const float *__restrict__ K, - const float *__restrict__ J_grad_out, const int N, - float *__restrict__ xyz_c_grad_in) { - const int i = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void compute_projection_jacobian_backward_kernel(const float *__restrict__ xyz, + const float *__restrict__ proj, + const float *__restrict__ J_grad_out, const int N, + float *xyz_grad_in) { + constexpr int XYZ_STRIDE = 3; + constexpr int J_STRIDE = 6; + const int i = blockIdx.x * blockDim.x + threadIdx.x; const int lane_id = threadIdx.x & 0x1f; - float k_val = 0.0f; - if (lane_id < 9) - k_val = K[lane_id]; - const float fx = __shfl_sync(0xffffffff, k_val, 0); - const float fy = __shfl_sync(0xffffffff, k_val, 4); - if (i >= N) + // load and broadcast Proj to all threads in warp + float p_val = 0.0f; + if (lane_id < 16) { + p_val = proj[lane_id]; + } + const float p00 = __shfl_sync(0xffffffff, p_val, 0); + const float p01 = __shfl_sync(0xffffffff, p_val, 1); + const float p02 = __shfl_sync(0xffffffff, p_val, 2); + const float p03 = __shfl_sync(0xffffffff, p_val, 3); + const float p10 = __shfl_sync(0xffffffff, p_val, 4); + const float p11 = __shfl_sync(0xffffffff, p_val, 5); + const float p12 = __shfl_sync(0xffffffff, p_val, 6); + const float p13 = __shfl_sync(0xffffffff, p_val, 7); + const float p30 = __shfl_sync(0xffffffff, p_val, 12); + const float p31 = __shfl_sync(0xffffffff, p_val, 13); + const float p32 = __shfl_sync(0xffffffff, p_val, 14); + const float p33 = __shfl_sync(0xffffffff, p_val, 15); + + if (i >= N) { return; + } - const float x = xyz_c[i * 3 + 0]; - const float y = xyz_c[i * 3 + 1]; - const float z = xyz_c[i * 3 + 2]; + float x = xyz[i * XYZ_STRIDE + 0]; + float y = xyz[i * XYZ_STRIDE + 1]; + float z = xyz[i * XYZ_STRIDE + 2]; - if (z <= 1e-4f) { - xyz_c_grad_in[i * 3 + 0] = 0.0f; - xyz_c_grad_in[i * 3 + 1] = 0.0f; - xyz_c_grad_in[i * 3 + 2] = 0.0f; + // Clip coordinates + float xc = p00 * x + p01 * y + p02 * z + p03; + float yc = p10 * x + p11 * y + p12 * z + p13; + float wc = p30 * x + p31 * y + p32 * z + p33; + + if (fabsf(wc) < 1e-6f) { return; } - const float z_inv = 1.0f / (z + 1e-6f); - const float z_inv2 = z_inv * z_inv; - const float z_inv3 = z_inv2 * z_inv; - - const float *grad_J = J_grad_out + i * 6; + float wc_inv = 1.0f / wc; + float wc_inv2 = wc_inv * wc_inv; + float wc_inv3 = wc_inv2 * wc_inv; + + // Gradients of J + float dJ_00 = J_grad_out[i * J_STRIDE + 0]; + float dJ_01 = J_grad_out[i * J_STRIDE + 1]; + float dJ_02 = J_grad_out[i * J_STRIDE + 2]; + float dJ_10 = J_grad_out[i * J_STRIDE + 3]; + float dJ_11 = J_grad_out[i * J_STRIDE + 4]; + float dJ_12 = J_grad_out[i * J_STRIDE + 5]; + + // Backprop through J calculation + // J00 = (p00*wc - xc*p30) / wc^2 + // Let Num00 = p00*wc - xc*p30 + // J00 = Num00 * wc^-2 + // dNum00 = dJ00 * wc^-2 + // dwc += dJ00 * Num00 * (-2 * wc^-3) = dJ00 * J00 * (-2/wc) + // But we don't have J00 computed here. + // Alternatively: + // d(J00)/d(xc) = -p30 / wc^2 + // d(J00)/d(wc) = (p00 * wc^2 - (p00*wc - xc*p30) * 2*wc) / wc^4 + // = (p00*wc - 2*(p00*wc - xc*p30)) / wc^3 + // = (p00*wc - 2*p00*wc + 2*xc*p30) / wc^3 + // = (2*xc*p30 - p00*wc) / wc^3 + + float dxc = 0.0f; + float dyc = 0.0f; + float dwc = 0.0f; - // Gradient w.r.t. xyz_c - float gx = -grad_J[2] * fx * z_inv2; - float gy = -grad_J[5] * fy * z_inv2; - float gz = -grad_J[0] * fx * z_inv2 + grad_J[2] * 2.0f * fx * x * z_inv3 - grad_J[4] * fy * z_inv2 + - grad_J[5] * 2.0f * fy * y * z_inv3; + // Row 0 + // J00 + dxc += dJ_00 * (-p30 * wc_inv2); + dwc += dJ_00 * (2.0f * xc * p30 - p00 * wc) * wc_inv3; + // J01 + dxc += dJ_01 * (-p31 * wc_inv2); + dwc += dJ_01 * (2.0f * xc * p31 - p01 * wc) * wc_inv3; + // J02 + dxc += dJ_02 * (-p32 * wc_inv2); + dwc += dJ_02 * (2.0f * xc * p32 - p02 * wc) * wc_inv3; - xyz_c_grad_in[i * 3 + 0] += gx; - xyz_c_grad_in[i * 3 + 1] += gy; - xyz_c_grad_in[i * 3 + 2] += gz; + // Row 1 + // J10 + dyc += dJ_10 * (-p30 * wc_inv2); + dwc += dJ_10 * (2.0f * yc * p30 - p10 * wc) * wc_inv3; + // J11 + dyc += dJ_11 * (-p31 * wc_inv2); + dwc += dJ_11 * (2.0f * yc * p31 - p11 * wc) * wc_inv3; + // J12 + dyc += dJ_12 * (-p32 * wc_inv2); + dwc += dJ_12 * (2.0f * yc * p32 - p12 * wc) * wc_inv3; + + // Backprop from Clip to Camera + // xc = p00*x + p01*y + p02*z + p03 + // yc = p10*x + p11*y + p12*z + p13 + // wc = p30*x + p31*y + p32*z + p33 + + float dx = dxc * p00 + dyc * p10 + dwc * p30; + float dy = dxc * p01 + dyc * p11 + dwc * p31; + float dz = dxc * p02 + dyc * p12 + dwc * p32; + + xyz_grad_in[i * XYZ_STRIDE + 0] += dx; + xyz_grad_in[i * XYZ_STRIDE + 1] += dy; + xyz_grad_in[i * XYZ_STRIDE + 2] += dz; } -void compute_projection_jacobian_backward(const float *const xyz_c, const float *const K, const float *const J_grad_out, - const int N, float *xyz_c_grad_in, cudaStream_t stream) { +void compute_projection_jacobian_backward(const float *const xyz_c, const float *const proj, + const float *const J_grad_out, const int N, float *xyz_c_grad_in, + cudaStream_t stream) { ASSERT_DEVICE_POINTER(xyz_c); - ASSERT_DEVICE_POINTER(K); + ASSERT_DEVICE_POINTER(proj); ASSERT_DEVICE_POINTER(J_grad_out); ASSERT_DEVICE_POINTER(xyz_c_grad_in); - const int threads = 256; - const int blocks = (N + threads - 1) / threads; - compute_proj_jacobian_backward_kernel<<>>(xyz_c, K, J_grad_out, N, xyz_c_grad_in); + const int threads_per_block = 256; + const int num_blocks = (N + threads_per_block - 1) / threads_per_block; + + dim3 gridsize(num_blocks, 1, 1); + dim3 blocksize(threads_per_block, 1, 1); + + compute_projection_jacobian_backward_kernel<<>>(xyz_c, proj, J_grad_out, N, + xyz_c_grad_in); } -__global__ void conic_backward_kernel(const float *__restrict__ J, const float *__restrict__ sigma_world, - const float *__restrict__ camera_T_world, +__global__ void conic_backward_kernel(const float *__restrict__ J, const float *__restrict__ sigma, + const float *__restrict__ view, const float *__restrict__ conic, const float *__restrict__ conic_grad_out, const int N, float *J_grad_in, - float *sigma_world_grad_in) { + float *sigma_grad_in) { + constexpr int SIGMA_STRIDE = 9; + constexpr int J_STRIDE = 6; + constexpr int CONIC_STRIDE = 3; + const int i = blockIdx.x * blockDim.x + threadIdx.x; + const int lane_id = threadIdx.x & 0x1f; + + // Load and broadcast View Matrix (4x4) within warp + float v_val = 0.0f; + if (lane_id < 16) { + v_val = view[lane_id]; + } + // W (rotation part) = [r00, r01, r02, r10, r11, r12, r20, r21, r22] + const float w00 = __shfl_sync(0xffffffff, v_val, 0); + const float w01 = __shfl_sync(0xffffffff, v_val, 1); + const float w02 = __shfl_sync(0xffffffff, v_val, 2); + const float w10 = __shfl_sync(0xffffffff, v_val, 4); + const float w11 = __shfl_sync(0xffffffff, v_val, 5); + const float w12 = __shfl_sync(0xffffffff, v_val, 6); + const float w20 = __shfl_sync(0xffffffff, v_val, 8); + const float w21 = __shfl_sync(0xffffffff, v_val, 9); + const float w22 = __shfl_sync(0xffffffff, v_val, 10); + if (i >= N) { return; } - // --- 1. Load all inputs into local variables (registers) --- - - const float *J_i = J + i * 6; - const float *sigma_i = sigma_world + i * 9; - - // Load J (2x3) - float J00 = J_i[0], J01 = J_i[1], J02 = J_i[2]; - float J10 = J_i[3], J11 = J_i[4], J12 = J_i[5]; - - // Load sigma_world (3x3) - float S00 = sigma_i[0], S01 = sigma_i[1], S02 = sigma_i[2]; - float S10 = sigma_i[3], S11 = sigma_i[4], S12 = sigma_i[5]; - float S20 = sigma_i[6], S21 = sigma_i[7], S22 = sigma_i[8]; - - // Load W (3x3 rotation matrix) - float W00 = camera_T_world[0], W01 = camera_T_world[1], W02 = camera_T_world[2]; - float W10 = camera_T_world[4], W11 = camera_T_world[5], W12 = camera_T_world[6]; - float W20 = camera_T_world[8], W21 = camera_T_world[9], W22 = camera_T_world[10]; - - // Load and reconstruct symmetric grad_sigma_image (2x2) - float G00 = conic_grad_out[i * 3 + 0]; - float G01 = conic_grad_out[i * 3 + 1]; - float G11 = conic_grad_out[i * 3 + 2]; - float G10 = G01; // Symmetry - - // --- 2. Compute intermediate products using registers --- - - // JW = J @ W (2x3 @ 3x3 -> 2x3) - float JW00 = J00 * W00 + J01 * W10 + J02 * W20; - float JW01 = J00 * W01 + J01 * W11 + J02 * W21; - float JW02 = J00 * W02 + J01 * W12 + J02 * W22; - float JW10 = J10 * W00 + J11 * W10 + J12 * W20; - float JW11 = J10 * W01 + J11 * W11 + J12 * W21; - float JW12 = J10 * W02 + J11 * W12 + J12 * W22; - - // V = grad_sigma_image @ JW (2x2 @ 2x3 -> 2x3) - float V00 = G00 * JW00 + G01 * JW10; - float V01 = G00 * JW01 + G01 * JW11; - float V02 = G00 * JW02 + G01 * JW12; - float V10 = G10 * JW00 + G11 * JW10; - float V11 = G10 * JW01 + G11 * JW11; - float V12 = G10 * JW02 + G11 * JW12; - - // --- 3. Compute and write output gradients --- - - // A. Gradient w.r.t. sigma_world = JW.T @ V (3x2 @ 2x3 -> 3x3) - float *out_sigma_grad = sigma_world_grad_in + i * 9; - // Since d(sigma_world) is symmetric, we compute the full matrix product - // and then can optionally just store the upper/lower triangular part if - // the next kernel expects that. Here we compute the full 3x3 matrix. - float grad_S00 = JW00 * V00 + JW10 * V10; - float grad_S01 = JW00 * V01 + JW10 * V11; - float grad_S02 = JW00 * V02 + JW10 * V12; - float grad_S10 = JW01 * V00 + JW11 * V10; - float grad_S11 = JW01 * V01 + JW11 * V11; - float grad_S12 = JW01 * V02 + JW11 * V12; - float grad_S20 = JW02 * V00 + JW12 * V10; - float grad_S21 = JW02 * V01 + JW12 * V11; - float grad_S22 = JW02 * V02 + JW12 * V12; - // Store the full symmetric gradient - out_sigma_grad[0] = grad_S00; - out_sigma_grad[1] = (grad_S01 + grad_S10) * 0.5f; - out_sigma_grad[2] = (grad_S02 + grad_S20) * 0.5f; - out_sigma_grad[3] = out_sigma_grad[1]; // yx = xy - out_sigma_grad[4] = grad_S11; - out_sigma_grad[5] = (grad_S12 + grad_S21) * 0.5f; - out_sigma_grad[6] = out_sigma_grad[2]; // zx = xz - out_sigma_grad[7] = out_sigma_grad[5]; // zy = yz - out_sigma_grad[8] = grad_S22; - - // B. Gradient w.r.t. J = 2 * (V @ sigma_world @ W.T) - // Step B1: V_sigma = V @ sigma_world (2x3 @ 3x3 -> 2x3) - float VS00 = V00 * S00 + V01 * S10 + V02 * S20; - float VS01 = V00 * S01 + V01 * S11 + V02 * S21; - float VS02 = V00 * S02 + V01 * S12 + V02 * S22; - float VS10 = V10 * S00 + V11 * S10 + V12 * S20; - float VS11 = V10 * S01 + V11 * S11 + V12 * S21; - float VS12 = V10 * S02 + V11 * S12 + V12 * S22; - - // Step B2: J_grad = V_sigma @ W.T (2x3 @ 3x3 -> 2x3), then scale by 2 - float *out_J_grad = J_grad_in + i * 6; - out_J_grad[0] = (VS00 * W00 + VS01 * W01 + VS02 * W02) * 2.0f; - out_J_grad[1] = (VS00 * W10 + VS01 * W11 + VS02 * W12) * 2.0f; - out_J_grad[2] = (VS00 * W20 + VS01 * W21 + VS02 * W22) * 2.0f; - out_J_grad[3] = (VS10 * W00 + VS11 * W01 + VS12 * W02) * 2.0f; - out_J_grad[4] = (VS10 * W10 + VS11 * W11 + VS12 * W12) * 2.0f; - out_J_grad[5] = (VS10 * W20 + VS11 * W21 + VS12 * W22) * 2.0f; + // Load J + const int j_base_idx = i * J_STRIDE; + const float j00 = J[j_base_idx + 0]; + const float j01 = J[j_base_idx + 1]; + const float j02 = J[j_base_idx + 2]; + const float j10 = J[j_base_idx + 3]; + const float j11 = J[j_base_idx + 4]; + const float j12 = J[j_base_idx + 5]; + + // Load Sigma (symmetric) + const int sigma_base_idx = i * SIGMA_STRIDE; + const float s00 = sigma[sigma_base_idx + 0]; + const float s01 = sigma[sigma_base_idx + 1]; + const float s02 = sigma[sigma_base_idx + 2]; + const float s11 = sigma[sigma_base_idx + 4]; + const float s12 = sigma[sigma_base_idx + 5]; + const float s22 = sigma[sigma_base_idx + 8]; + + // Recompute M = J @ W + const float m00 = j00 * w00 + j01 * w10 + j02 * w20; + const float m01 = j00 * w01 + j01 * w11 + j02 * w21; + const float m02 = j00 * w02 + j01 * w12 + j02 * w22; + const float m10 = j10 * w00 + j11 * w10 + j12 * w20; + const float m11 = j10 * w01 + j11 * w11 + j12 * w21; + const float m12 = j10 * w02 + j11 * w12 + j12 * w22; + + // Recompute V = Sigma @ M^T + const float v00 = s00 * m00 + s01 * m01 + s02 * m02; + const float v01 = s00 * m10 + s01 * m11 + s02 * m12; + const float v10 = s01 * m00 + s11 * m01 + s12 * m02; + const float v11 = s01 * m10 + s11 * m11 + s12 * m12; + const float v20 = s02 * m00 + s12 * m01 + s22 * m02; + const float v21 = s02 * m10 + s12 * m11 + s22 * m12; + + // Load gradients for Conic (dC) + const int conic_base_idx = i * CONIC_STRIDE; + const float dc00_out = conic_grad_out[conic_base_idx + 0]; + const float dc01_out = conic_grad_out[conic_base_idx + 1]; + const float dc11_out = conic_grad_out[conic_base_idx + 2]; + + // Load Conic (C) - inverse covariance + const float c00 = conic[conic_base_idx + 0]; + const float c01 = conic[conic_base_idx + 1]; + const float c11 = conic[conic_base_idx + 2]; + + // Compute dSigma_prime = - C * dC * C + // T = C * dC + const float t00 = c00 * dc00_out + c01 * dc01_out; + const float t01 = c00 * dc01_out + c01 * dc11_out; + const float t10 = c01 * dc00_out + c11 * dc01_out; + const float t11 = c01 * dc01_out + c11 * dc11_out; + + // dS = - T * C + const float d_c00 = -(t00 * c00 + t01 * c01); + const float d_c01 = -(t00 * c01 + t01 * c11); + // const float d_c10 = -(t10 * c00 + t11 * c01); // Should be same as d_c01 + const float d_c11 = -(t10 * c01 + t11 * c11); + + // Backprop Conic = M @ V + // c00 = m00*v00 + m01*v10 + m02*v20 + // c01 = m00*v01 + m01*v11 + m02*v21 + // c11 = m10*v01 + m11*v11 + m12*v21 + + // Compute dL/dV + float dv00 = d_c00 * m00; + float dv01 = d_c01 * m00 + d_c11 * m10; + float dv10 = d_c00 * m01; + float dv11 = d_c01 * m01 + d_c11 * m11; + float dv20 = d_c00 * m02; + float dv21 = d_c01 * m02 + d_c11 * m12; + + // Compute dL/dSigma = dL/dV @ M + // Note: sigma_grad_in is symmetric, so we sum contributions for s_ij and s_ji + float ds00 = dv00 * m00 + dv01 * m10; + float ds01 = dv00 * m01 + dv01 * m11 + dv10 * m00 + dv11 * m10; // s01 and s10 + float ds02 = dv00 * m02 + dv01 * m12 + dv20 * m00 + dv21 * m10; // s02 and s20 + float ds11 = dv10 * m01 + dv11 * m11; + float ds12 = dv10 * m02 + dv11 * m12 + dv20 * m01 + dv21 * m11; // s12 and s21 + float ds22 = dv20 * m02 + dv21 * m12; + + sigma_grad_in[sigma_base_idx + 0] += ds00; + sigma_grad_in[sigma_base_idx + 1] += ds01 * 0.5f; // Store upper triangle, sum contributions + sigma_grad_in[sigma_base_idx + 2] += ds02 * 0.5f; + sigma_grad_in[sigma_base_idx + 3] += ds01 * 0.5f; // s10 + sigma_grad_in[sigma_base_idx + 4] += ds11; + sigma_grad_in[sigma_base_idx + 5] += ds12 * 0.5f; + sigma_grad_in[sigma_base_idx + 6] += ds02 * 0.5f; // s20 + sigma_grad_in[sigma_base_idx + 7] += ds12 * 0.5f; // s21 + sigma_grad_in[sigma_base_idx + 8] += ds22; + + // Compute dL/dM (from Conic) + float dm_from_conic_00 = d_c00 * v00 + d_c01 * v01; + float dm_from_conic_01 = d_c00 * v10 + d_c01 * v11; + float dm_from_conic_02 = d_c00 * v20 + d_c01 * v21; + float dm_from_conic_10 = d_c11 * v01; // d_c01 * v00 is for c10, which is symmetric to c01 + float dm_from_conic_11 = d_c11 * v11; + float dm_from_conic_12 = d_c11 * v21; + + // Compute dL/dM (from V = Sigma @ M^T) = (dL/dV)^T @ Sigma + float dm_from_V_00 = dv00 * s00 + dv10 * s01 + dv20 * s02; + float dm_from_V_01 = dv00 * s01 + dv10 * s11 + dv20 * s12; + float dm_from_V_02 = dv00 * s02 + dv10 * s12 + dv20 * s22; + float dm_from_V_10 = dv01 * s00 + dv11 * s01 + dv21 * s02; + float dm_from_V_11 = dv01 * s01 + dv11 * s11 + dv21 * s12; + float dm_from_V_12 = dv01 * s02 + dv11 * s12 + dv21 * s22; + + // Total dL/dM + float dm00 = dm_from_conic_00 + dm_from_V_00; + float dm01 = dm_from_conic_01 + dm_from_V_01; + float dm02 = dm_from_conic_02 + dm_from_V_02; + float dm10 = dm_from_conic_10 + dm_from_V_10; + float dm11 = dm_from_conic_11 + dm_from_V_11; + float dm12 = dm_from_conic_12 + dm_from_V_12; + + // Compute dL/dJ = dL/dM @ W^T + J_grad_in[j_base_idx + 0] += dm00 * w00 + dm01 * w01 + dm02 * w02; + J_grad_in[j_base_idx + 1] += dm00 * w10 + dm01 * w11 + dm02 * w12; + J_grad_in[j_base_idx + 2] += dm00 * w20 + dm01 * w21 + dm02 * w22; + J_grad_in[j_base_idx + 3] += dm10 * w00 + dm11 * w01 + dm12 * w02; + J_grad_in[j_base_idx + 4] += dm10 * w10 + dm11 * w11 + dm12 * w12; + J_grad_in[j_base_idx + 5] += dm10 * w20 + dm11 * w21 + dm12 * w22; } -void compute_conic_backward(const float *const J, const float *const sigma, const float *const T, - const float *const conic_grad_out, const int N, float *J_grad_in, float *sigma_grad_in, - cudaStream_t stream) { +void compute_conic_backward(const float *const J, const float *const sigma, const float *const view, + const float *const conic, const float *const conic_grad_out, const int N, float *J_grad_in, + float *sigma_grad_in, cudaStream_t stream) { ASSERT_DEVICE_POINTER(J); ASSERT_DEVICE_POINTER(sigma); - ASSERT_DEVICE_POINTER(T); + ASSERT_DEVICE_POINTER(view); + ASSERT_DEVICE_POINTER(conic); ASSERT_DEVICE_POINTER(conic_grad_out); ASSERT_DEVICE_POINTER(J_grad_in); ASSERT_DEVICE_POINTER(sigma_grad_in); - const int threads = 256; - const int blocks = (N + threads - 1) / threads; - conic_backward_kernel<<>>(J, sigma, T, conic_grad_out, N, J_grad_in, sigma_grad_in); + const int threads_per_block = 256; + const int num_blocks = (N + threads_per_block - 1) / threads_per_block; + + dim3 gridsize(num_blocks, 1, 1); + dim3 blocksize(threads_per_block, 1, 1); + + conic_backward_kernel<<>>(J, sigma, view, conic, conic_grad_out, N, J_grad_in, + sigma_grad_in); } __global__ void sigma_backward_kernel(const float *__restrict__ q, const float *__restrict__ s, diff --git a/cuda/projection.cu b/cuda/projection.cu index 9c95930..3e42e1f 100644 --- a/cuda/projection.cu +++ b/cuda/projection.cu @@ -3,31 +3,31 @@ #include "checks.cuh" #include "gsplat_cuda/cuda_forward.cuh" -__global__ void cam_extr_proj_kernel(const float *__restrict__ xyz_w, const float *__restrict__ T, const int N, - float *xyz_c) { +__global__ void compute_camera_space_points_kernel(const float *__restrict__ xyz_w, const float *__restrict__ view, + const int N, float *xyz_c) { constexpr int XYZ_STRIDE = 3; const int i = blockIdx.x * blockDim.x + threadIdx.x; const int lane_id = threadIdx.x & 0x1f; // lane_id in warp (0-31) - // Load and broadcast Extrinsic Matrix T (3x4) within warp - float t_val = 0.0f; - if (lane_id < 12) { - t_val = T[lane_id]; + // Load and broadcast View Matrix (4x4) within warp + float v_val = 0.0f; + if (lane_id < 16) { + v_val = view[lane_id]; } - // T = [r00, r01, r02, t0, r10, r11, r12, t1, r20, r21, r22, t2] - const float t00 = __shfl_sync(0xffffffff, t_val, 0); - const float t01 = __shfl_sync(0xffffffff, t_val, 1); - const float t02 = __shfl_sync(0xffffffff, t_val, 2); - const float t03 = __shfl_sync(0xffffffff, t_val, 3); - const float t10 = __shfl_sync(0xffffffff, t_val, 4); - const float t11 = __shfl_sync(0xffffffff, t_val, 5); - const float t12 = __shfl_sync(0xffffffff, t_val, 6); - const float t13 = __shfl_sync(0xffffffff, t_val, 7); - const float t20 = __shfl_sync(0xffffffff, t_val, 8); - const float t21 = __shfl_sync(0xffffffff, t_val, 9); - const float t22 = __shfl_sync(0xffffffff, t_val, 10); - const float t23 = __shfl_sync(0xffffffff, t_val, 11); + + const float v00 = __shfl_sync(0xffffffff, v_val, 0); + const float v01 = __shfl_sync(0xffffffff, v_val, 1); + const float v02 = __shfl_sync(0xffffffff, v_val, 2); + const float v03 = __shfl_sync(0xffffffff, v_val, 3); + const float v10 = __shfl_sync(0xffffffff, v_val, 4); + const float v11 = __shfl_sync(0xffffffff, v_val, 5); + const float v12 = __shfl_sync(0xffffffff, v_val, 6); + const float v13 = __shfl_sync(0xffffffff, v_val, 7); + const float v20 = __shfl_sync(0xffffffff, v_val, 8); + const float v21 = __shfl_sync(0xffffffff, v_val, 9); + const float v22 = __shfl_sync(0xffffffff, v_val, 10); + const float v23 = __shfl_sync(0xffffffff, v_val, 11); if (i >= N) { return; @@ -39,29 +39,41 @@ __global__ void cam_extr_proj_kernel(const float *__restrict__ xyz_w, const floa const float wz = xyz_w[i * XYZ_STRIDE + 2]; // Matrix-vector multiply to get camera-space point xyz_c - xyz_c[i * XYZ_STRIDE + 0] = t00 * wx + t01 * wy + t02 * wz + t03; - xyz_c[i * XYZ_STRIDE + 1] = t10 * wx + t11 * wy + t12 * wz + t13; - xyz_c[i * XYZ_STRIDE + 2] = t20 * wx + t21 * wy + t22 * wz + t23; + xyz_c[i * XYZ_STRIDE + 0] = v00 * wx + v01 * wy + v02 * wz + v03; + xyz_c[i * XYZ_STRIDE + 1] = v10 * wx + v11 * wy + v12 * wz + v13; + xyz_c[i * XYZ_STRIDE + 2] = v20 * wx + v21 * wy + v22 * wz + v23; } -__global__ void cam_intr_proj_kernel(const float *__restrict__ xyz, const float *__restrict__ K, const int N, - float *uv) { +__global__ void project_to_screen_kernel(const float *__restrict__ xyz, const float *__restrict__ proj, const int N, + const int width, const int height, float *uv) { constexpr int XYZ_STRIDE = 3; constexpr int UV_STRIDE = 2; const int i = blockIdx.x * blockDim.x + threadIdx.x; const int lane_id = threadIdx.x & 0x1f; - // load and broadcast K to all threads in warp - float k_val = 0.0f; - if (lane_id < 9) { - k_val = K[lane_id]; + // load and broadcast Proj to all threads in warp + float p_val = 0.0f; + if (lane_id < 16) { + p_val = proj[lane_id]; } - // K = [fx, 0, cx, 0, fy, cy, 0, 0, 1] - const float fx = __shfl_sync(0xffffffff, k_val, 0); - const float cx = __shfl_sync(0xffffffff, k_val, 2); - const float fy = __shfl_sync(0xffffffff, k_val, 4); - const float cy = __shfl_sync(0xffffffff, k_val, 5); + + const float p00 = __shfl_sync(0xffffffff, p_val, 0); + const float p01 = __shfl_sync(0xffffffff, p_val, 1); + const float p02 = __shfl_sync(0xffffffff, p_val, 2); + const float p03 = __shfl_sync(0xffffffff, p_val, 3); + const float p10 = __shfl_sync(0xffffffff, p_val, 4); + const float p11 = __shfl_sync(0xffffffff, p_val, 5); + const float p12 = __shfl_sync(0xffffffff, p_val, 6); + const float p13 = __shfl_sync(0xffffffff, p_val, 7); + const float p20 = __shfl_sync(0xffffffff, p_val, 8); + const float p21 = __shfl_sync(0xffffffff, p_val, 9); + const float p22 = __shfl_sync(0xffffffff, p_val, 10); + const float p23 = __shfl_sync(0xffffffff, p_val, 11); + const float p30 = __shfl_sync(0xffffffff, p_val, 12); + const float p31 = __shfl_sync(0xffffffff, p_val, 13); + const float p32 = __shfl_sync(0xffffffff, p_val, 14); + const float p33 = __shfl_sync(0xffffffff, p_val, 15); if (i >= N) { return; @@ -71,13 +83,24 @@ __global__ void cam_intr_proj_kernel(const float *__restrict__ xyz, const float const float y = xyz[i * XYZ_STRIDE + 1]; const float z = xyz[i * XYZ_STRIDE + 2]; - uv[i * UV_STRIDE + 0] = fx * x / z + cx; - uv[i * UV_STRIDE + 1] = fy * y / z + cy; + // Clip space + float x_clip = p00 * x + p01 * y + p02 * z + p03; + float y_clip = p10 * x + p11 * y + p12 * z + p13; + float w_clip = p30 * x + p31 * y + p32 * z + p33; + + // NDC + float x_ndc = x_clip / (w_clip + 1e-6f); + float y_ndc = y_clip / (w_clip + 1e-6f); + + // Screen space + uv[i * UV_STRIDE + 0] = (x_ndc * 0.5f + 0.5f) * width; + uv[i * UV_STRIDE + 1] = (y_ndc * 0.5f + 0.5f) * height; } -void camera_extrinsic_projection(float *const xyz_w, const float *T, const int N, float *xyz_c, cudaStream_t stream) { +void compute_camera_space_points(float *const xyz_w, const float *view, const int N, float *xyz_c, + cudaStream_t stream) { ASSERT_DEVICE_POINTER(xyz_w); - ASSERT_DEVICE_POINTER(T); + ASSERT_DEVICE_POINTER(view); ASSERT_DEVICE_POINTER(xyz_c); const int threads_per_block = 256; @@ -87,12 +110,13 @@ void camera_extrinsic_projection(float *const xyz_w, const float *T, const int N dim3 gridsize(num_blocks, 1, 1); dim3 blocksize(threads_per_block, 1, 1); - cam_extr_proj_kernel<<>>(xyz_w, T, N, xyz_c); + compute_camera_space_points_kernel<<>>(xyz_w, view, N, xyz_c); } -void camera_intrinsic_projection(float *const xyz, const float *K, const int N, float *uv, cudaStream_t stream) { +void project_to_screen(float *const xyz, const float *proj, const int N, const int width, const int height, float *uv, + cudaStream_t stream) { ASSERT_DEVICE_POINTER(xyz); - ASSERT_DEVICE_POINTER(K); + ASSERT_DEVICE_POINTER(proj); ASSERT_DEVICE_POINTER(uv); const int threads_per_block = 256; @@ -102,5 +126,5 @@ void camera_intrinsic_projection(float *const xyz, const float *K, const int N, dim3 gridsize(num_blocks, 1, 1); dim3 blocksize(threads_per_block, 1, 1); - cam_intr_proj_kernel<<>>(xyz, K, N, uv); + project_to_screen_kernel<<>>(xyz, proj, N, width, height, uv); } diff --git a/cuda/projection_backward.cu b/cuda/projection_backward.cu index d1f2af0..2dcab29 100644 --- a/cuda/projection_backward.cu +++ b/cuda/projection_backward.cu @@ -3,25 +3,36 @@ #include "checks.cuh" #include "gsplat_cuda/cuda_backward.cuh" -__global__ void cam_intr_proj_backward_kernel(const float *__restrict__ xyz_c, const float *__restrict__ K, - const float *__restrict__ uv_grad_out, const int N, - float *__restrict__ xyz_c_grad_in) { +__global__ void project_to_screen_backward_kernel(const float *__restrict__ xyz_c, const float *__restrict__ proj, + const float *__restrict__ uv_grad_out, const int N, const int width, + const int height, float *__restrict__ xyz_c_grad_in) { constexpr int XYZ_STRIDE = 3; constexpr int UV_STRIDE = 2; const int i = blockIdx.x * blockDim.x + threadIdx.x; const int lane_id = threadIdx.x & 0x1f; // lane_id in warp (0-31) - // Load and broadcast Intrinsic Matrix K within warp - // K = [fx, 0, cx, 0, fy, cy, 0, 0, 1] stored as [fx, cx, fy, cy] - float k_val = 0.0f; - if (lane_id < 9) { - k_val = K[lane_id]; + // Load and broadcast Proj Matrix within warp + float p_val = 0.0f; + if (lane_id < 16) { + p_val = proj[lane_id]; } - const float fx = __shfl_sync(0xffffffff, k_val, 0); - const float cx = __shfl_sync(0xffffffff, k_val, 2); - const float fy = __shfl_sync(0xffffffff, k_val, 4); - const float cy = __shfl_sync(0xffffffff, k_val, 5); + const float p00 = __shfl_sync(0xffffffff, p_val, 0); + const float p01 = __shfl_sync(0xffffffff, p_val, 1); + const float p02 = __shfl_sync(0xffffffff, p_val, 2); + const float p03 = __shfl_sync(0xffffffff, p_val, 3); + const float p10 = __shfl_sync(0xffffffff, p_val, 4); + const float p11 = __shfl_sync(0xffffffff, p_val, 5); + const float p12 = __shfl_sync(0xffffffff, p_val, 6); + const float p13 = __shfl_sync(0xffffffff, p_val, 7); + const float p20 = __shfl_sync(0xffffffff, p_val, 8); + const float p21 = __shfl_sync(0xffffffff, p_val, 9); + const float p22 = __shfl_sync(0xffffffff, p_val, 10); + const float p23 = __shfl_sync(0xffffffff, p_val, 11); + const float p30 = __shfl_sync(0xffffffff, p_val, 12); + const float p31 = __shfl_sync(0xffffffff, p_val, 13); + const float p32 = __shfl_sync(0xffffffff, p_val, 14); + const float p33 = __shfl_sync(0xffffffff, p_val, 15); if (i >= N) { return; @@ -31,33 +42,43 @@ __global__ void cam_intr_proj_backward_kernel(const float *__restrict__ xyz_c, c const float y = xyz_c[i * XYZ_STRIDE + 1]; const float z = xyz_c[i * XYZ_STRIDE + 2]; - // Avoid division by zero or negative depth - if (z <= 1e-4f) { - xyz_c_grad_in[i * XYZ_STRIDE + 0] += 0.0f; - xyz_c_grad_in[i * XYZ_STRIDE + 1] += 0.0f; - xyz_c_grad_in[i * XYZ_STRIDE + 2] += 0.0f; + // Forward pass recomputation + float x_clip = p00 * x + p01 * y + p02 * z + p03; + float y_clip = p10 * x + p11 * y + p12 * z + p13; + float w_clip = p30 * x + p31 * y + p32 * z + p33; + + // Avoid division by zero + if (fabsf(w_clip) < 1e-6f) { return; } - const float z_inv = 1.0f / (z + 1e-6f); - const float z_inv2 = z_inv * z_inv; + const float w_inv = 1.0f / w_clip; + const float w_inv2 = w_inv * w_inv; const float grad_u = uv_grad_out[i * UV_STRIDE + 0]; const float grad_v = uv_grad_out[i * UV_STRIDE + 1]; - // --- Gradient w.r.t. xyz_c --- - // du/dx = fx/z, dv/dy = fy/z - // du/dz = -fx*x/z^2, dv/dz = -fy*y/z^2 - xyz_c_grad_in[i * XYZ_STRIDE + 0] += grad_u * fx * z_inv; - xyz_c_grad_in[i * XYZ_STRIDE + 1] += grad_v * fy * z_inv; - xyz_c_grad_in[i * XYZ_STRIDE + 2] += -(grad_u * fx * x * z_inv2 + grad_v * fy * y * z_inv2); + // d(NDC) / d(uv) + float dx_ndc = grad_u * 2.0f / width; + float dy_ndc = grad_v * 2.0f / height; + + // d(Clip) / d(NDC) + float dx_clip = dx_ndc * w_inv; + float dy_clip = dy_ndc * w_inv; + float dw_clip = -dx_ndc * x_clip * w_inv2 - dy_ndc * y_clip * w_inv2; + float dz_clip = 0.0f; + + // d(xyz_c) / d(Clip) = Proj^T * d(Clip) + xyz_c_grad_in[i * XYZ_STRIDE + 0] += p00 * dx_clip + p10 * dy_clip + p20 * dz_clip + p30 * dw_clip; + xyz_c_grad_in[i * XYZ_STRIDE + 1] += p01 * dx_clip + p11 * dy_clip + p21 * dz_clip + p31 * dw_clip; + xyz_c_grad_in[i * XYZ_STRIDE + 2] += p02 * dx_clip + p12 * dy_clip + p22 * dz_clip + p32 * dw_clip; } -void camera_intrinsic_projection_backward(const float *const xyz_c, const float *const K, - const float *const uv_grad_out, const int N, float *xyz_c_grad_in, - cudaStream_t stream) { +void project_to_screen_backward(const float *const xyz_c, const float *const proj, const float *const uv_grad_out, + const int N, const int width, const int height, float *xyz_c_grad_in, + cudaStream_t stream) { ASSERT_DEVICE_POINTER(xyz_c); - ASSERT_DEVICE_POINTER(K); + ASSERT_DEVICE_POINTER(proj); ASSERT_DEVICE_POINTER(uv_grad_out); ASSERT_DEVICE_POINTER(xyz_c_grad_in); @@ -67,35 +88,36 @@ void camera_intrinsic_projection_backward(const float *const xyz_c, const float dim3 gridsize(num_blocks, 1, 1); dim3 blocksize(threads_per_block, 1, 1); - cam_intr_proj_backward_kernel<<>>(xyz_c, K, uv_grad_out, N, xyz_c_grad_in); + project_to_screen_backward_kernel<<>>(xyz_c, proj, uv_grad_out, N, width, height, + xyz_c_grad_in); } -__global__ void cam_extr_proj_backward_kernel(const float *__restrict__ xyz_w, const float *__restrict__ T, - const float *__restrict__ xyz_c_grad_out, const int N, - float *__restrict__ xyz_w_grad_in) { +__global__ void compute_camera_space_points_backward_kernel(const float *__restrict__ xyz_w, + const float *__restrict__ view, + const float *__restrict__ xyz_c_grad_out, const int N, + float *__restrict__ xyz_w_grad_in) { constexpr int XYZ_STRIDE = 3; const int i = blockIdx.x * blockDim.x + threadIdx.x; const int lane_id = threadIdx.x & 0x1f; - // Load and broadcast Extrinsic Matrix T (3x4) within warp - float t_val = 0.0f; - if (lane_id < 12) { - t_val = T[lane_id]; + // Load and broadcast View Matrix (4x4) within warp + float v_val = 0.0f; + if (lane_id < 16) { + v_val = view[lane_id]; } - // T = [r00, r01, r02, t0, r10, r11, r12, t1, r20, r21, r22, t2] - const float r00 = __shfl_sync(0xffffffff, t_val, 0); - const float r01 = __shfl_sync(0xffffffff, t_val, 1); - const float r02 = __shfl_sync(0xffffffff, t_val, 2); - const float t0 = __shfl_sync(0xffffffff, t_val, 3); - const float r10 = __shfl_sync(0xffffffff, t_val, 4); - const float r11 = __shfl_sync(0xffffffff, t_val, 5); - const float r12 = __shfl_sync(0xffffffff, t_val, 6); - const float t1 = __shfl_sync(0xffffffff, t_val, 7); - const float r20 = __shfl_sync(0xffffffff, t_val, 8); - const float r21 = __shfl_sync(0xffffffff, t_val, 9); - const float r22 = __shfl_sync(0xffffffff, t_val, 10); - const float t2 = __shfl_sync(0xffffffff, t_val, 11); + const float v00 = __shfl_sync(0xffffffff, v_val, 0); + const float v01 = __shfl_sync(0xffffffff, v_val, 1); + const float v02 = __shfl_sync(0xffffffff, v_val, 2); + const float v03 = __shfl_sync(0xffffffff, v_val, 3); + const float v10 = __shfl_sync(0xffffffff, v_val, 4); + const float v11 = __shfl_sync(0xffffffff, v_val, 5); + const float v12 = __shfl_sync(0xffffffff, v_val, 6); + const float v13 = __shfl_sync(0xffffffff, v_val, 7); + const float v20 = __shfl_sync(0xffffffff, v_val, 8); + const float v21 = __shfl_sync(0xffffffff, v_val, 9); + const float v22 = __shfl_sync(0xffffffff, v_val, 10); + const float v23 = __shfl_sync(0xffffffff, v_val, 11); if (i >= N) { return; @@ -106,17 +128,19 @@ __global__ void cam_extr_proj_backward_kernel(const float *__restrict__ xyz_w, c const float grad_z_c = xyz_c_grad_out[i * XYZ_STRIDE + 2]; // --- Gradient w.r.t. xyz_w --- - // d(xyz_w) = R^T * d(xyz_c) - xyz_w_grad_in[i * XYZ_STRIDE + 0] = r00 * grad_x_c + r10 * grad_y_c + r20 * grad_z_c; - xyz_w_grad_in[i * XYZ_STRIDE + 1] = r01 * grad_x_c + r11 * grad_y_c + r21 * grad_z_c; - xyz_w_grad_in[i * XYZ_STRIDE + 2] = r02 * grad_x_c + r12 * grad_y_c + r22 * grad_z_c; + // d(xyz_w) = View^T * d(xyz_c) (ignoring translation part for direction vectors, but xyz_w is point) + // Actually, d(xyz_w) = R^T * d(xyz_c) because translation is constant w.r.t. xyz_w. + // The View matrix upper-left 3x3 is the rotation R. + xyz_w_grad_in[i * XYZ_STRIDE + 0] = v00 * grad_x_c + v10 * grad_y_c + v20 * grad_z_c; + xyz_w_grad_in[i * XYZ_STRIDE + 1] = v01 * grad_x_c + v11 * grad_y_c + v21 * grad_z_c; + xyz_w_grad_in[i * XYZ_STRIDE + 2] = v02 * grad_x_c + v12 * grad_y_c + v22 * grad_z_c; } -void camera_extrinsic_projection_backward(const float *const xyz_w, const float *const T, +void compute_camera_space_points_backward(const float *const xyz_w, const float *const view, const float *const xyz_c_grad_out, const int N, float *xyz_w_grad_in, cudaStream_t stream) { ASSERT_DEVICE_POINTER(xyz_w); - ASSERT_DEVICE_POINTER(T); + ASSERT_DEVICE_POINTER(view); ASSERT_DEVICE_POINTER(xyz_c_grad_out); ASSERT_DEVICE_POINTER(xyz_w_grad_in); @@ -126,5 +150,6 @@ void camera_extrinsic_projection_backward(const float *const xyz_w, const float dim3 gridsize(num_blocks, 1, 1); dim3 blocksize(threads_per_block, 1, 1); - cam_extr_proj_backward_kernel<<>>(xyz_w, T, xyz_c_grad_out, N, xyz_w_grad_in); + compute_camera_space_points_backward_kernel<<>>(xyz_w, view, xyz_c_grad_out, N, + xyz_w_grad_in); } diff --git a/cuda/raster.cu b/cuda/raster.cu index 35181d2..e6ab3fc 100644 --- a/cuda/raster.cu +++ b/cuda/raster.cu @@ -22,12 +22,12 @@ void rasterize_image(const int num_gaussians, const Camera &camera, const Config pass_data.d_uv.resize(num_gaussians * 2); // Step 1: Projections and Culling - camera_extrinsic_projection(thrust::raw_pointer_cast(gaussians.d_xyz.data()), - thrust::raw_pointer_cast(camera_parameters.d_T.data()), num_gaussians, + compute_camera_space_points(thrust::raw_pointer_cast(gaussians.d_xyz.data()), + thrust::raw_pointer_cast(camera_parameters.d_view.data()), num_gaussians, thrust::raw_pointer_cast(pass_data.d_xyz_c.data())); - camera_intrinsic_projection(thrust::raw_pointer_cast(pass_data.d_xyz_c.data()), - thrust::raw_pointer_cast(camera_parameters.d_K.data()), num_gaussians, - thrust::raw_pointer_cast(pass_data.d_uv.data())); + project_to_screen(thrust::raw_pointer_cast(pass_data.d_xyz_c.data()), + thrust::raw_pointer_cast(camera_parameters.d_proj.data()), num_gaussians, width, height, + thrust::raw_pointer_cast(pass_data.d_uv.data())); cull_gaussians(thrust::raw_pointer_cast(pass_data.d_uv.data()), thrust::raw_pointer_cast(pass_data.d_xyz_c.data()), num_gaussians, config.near_thresh, config.cull_mask_padding, width, height, thrust::raw_pointer_cast(pass_data.d_mask.data())); @@ -84,9 +84,9 @@ void rasterize_image(const int num_gaussians, const Camera &camera, const Config thrust::raw_pointer_cast(d_scale_selected.data()), pass_data.num_culled, thrust::raw_pointer_cast(pass_data.d_sigma.data())); compute_conic(thrust::raw_pointer_cast(d_xyz_c_selected.data()), - thrust::raw_pointer_cast(camera_parameters.d_K.data()), + thrust::raw_pointer_cast(camera_parameters.d_view.data()), thrust::raw_pointer_cast(pass_data.d_sigma.data()), - thrust::raw_pointer_cast(camera_parameters.d_T.data()), pass_data.num_culled, + thrust::raw_pointer_cast(camera_parameters.d_proj.data()), pass_data.num_culled, thrust::raw_pointer_cast(pass_data.d_J.data()), thrust::raw_pointer_cast(pass_data.d_conic.data())); // Step 5: Sort Gaussians by tile diff --git a/cuda/render.cu b/cuda/render.cu index 63a5f00..591c0cf 100644 --- a/cuda/render.cu +++ b/cuda/render.cu @@ -50,7 +50,6 @@ __global__ void render_tiles_kernel(const int num_tiles_x, const int num_tiles_y float basic; float linear; float quad; - float inv_det; float3 color = {rgb[gaussian_idx * 3 + 0], rgb[gaussian_idx * 3 + 1], rgb[gaussian_idx * 3 + 2]}; float opa = 1.0f / (1.0f + __expf(-opacity[gaussian_idx])); @@ -58,13 +57,9 @@ __global__ void render_tiles_kernel(const int num_tiles_x, const int num_tiles_y d.x = uvs[gaussian_idx * 2 + 0] - (float)base_pixel_x; d.y = uvs[gaussian_idx * 2 + 1] - (float)base_pixel_y; - const float a = conic[gaussian_idx * 3 + 0] + 0.3f; - const float b = conic[gaussian_idx * 3 + 1]; - const float c = conic[gaussian_idx * 3 + 2] + 0.3f; - inv_det = 1.0f / (a * c - b * b); - const float inv_cov00 = c * inv_det; - const float inv_cov01 = -b * inv_det; - const float inv_cov11 = a * inv_det; + const float inv_cov00 = conic[gaussian_idx * 3 + 0]; + const float inv_cov01 = conic[gaussian_idx * 3 + 1]; + const float inv_cov11 = conic[gaussian_idx * 3 + 2]; basic = -0.5f * (inv_cov00 * d.x * d.x + 2.0f * inv_cov01 * d.x * d.y + inv_cov11 * d.y * d.y); linear = inv_cov11 * d.y + inv_cov01 * d.x; quad = -0.5f * inv_cov11; diff --git a/cuda/render_backward.cu b/cuda/render_backward.cu index 46aff2f..3999004 100644 --- a/cuda/render_backward.cu +++ b/cuda/render_backward.cu @@ -20,7 +20,7 @@ __global__ void render_tiles_backward_kernel( const int PIXELS_PER_THREAD = (TILE_SIZE_BWD * TILE_SIZE_BWD) / 32; const int tile_idx = blockIdx.x * blockDim.y + threadIdx.y; - cg::thread_block tile_thread_group = cg::this_thread_block(); + auto tile_thread_group = cg::this_thread_block(); auto warp = cg::tiled_partition<32>(tile_thread_group); // Tile outside of image @@ -85,18 +85,13 @@ __global__ void render_tiles_backward_kernel( float basic; float linear; float quad; - float inv_det; float2 d = {0.0f, 0.0f}; d.x = uvs[gaussian_idx * 2 + 0] - (float)base_pixel_x; d.y = uvs[gaussian_idx * 2 + 1] - (float)base_pixel_y; - const float a = conic[gaussian_idx * 3 + 0] + 0.3f; - const float b = conic[gaussian_idx * 3 + 1]; - const float c = conic[gaussian_idx * 3 + 2] + 0.3f; - inv_det = 1.0f / (a * c - b * b); - const float inv_cov00 = c * inv_det; - const float inv_cov01 = -b * inv_det; - const float inv_cov11 = a * inv_det; + const float inv_cov00 = conic[gaussian_idx * 3 + 0]; + const float inv_cov01 = conic[gaussian_idx * 3 + 1]; + const float inv_cov11 = conic[gaussian_idx * 3 + 2]; basic = -0.5f * (inv_cov00 * d.x * d.x + 2.0f * inv_cov01 * d.x * d.y + inv_cov11 * d.y * d.y); linear = inv_cov11 * d.y + inv_cov01 * d.x; quad = -0.5f * inv_cov11; @@ -209,11 +204,9 @@ __global__ void render_tiles_backward_kernel( const float grad_inv_cov11 = grad_basic * (-0.5f * d.y * d.y) + (grad_linear * d.y) - (0.5f * grad_quad); const float grad_inv_cov01 = grad_basic * (-d.x * d.y) + grad_linear * d.x; - const float S = inv_det * inv_det * (grad_inv_cov00 * c + grad_inv_cov11 * a - grad_inv_cov01 * b); - - grad_conic_tile.x = (grad_inv_cov11 * inv_det) - (c * S); - grad_conic_tile.y = (-grad_inv_cov01 * inv_det) + (2.0f * b * S); - grad_conic_tile.z = (grad_inv_cov00 * inv_det) - (a * S); + grad_conic_tile.x = grad_inv_cov00; + grad_conic_tile.y = grad_inv_cov01; + grad_conic_tile.z = grad_inv_cov11; grad_conic_tile.x = cg::reduce(warp, grad_conic_tile.x, cg::plus()); grad_conic_tile.y = cg::reduce(warp, grad_conic_tile.y, cg::plus()); diff --git a/cuda/trainer.cu b/cuda/trainer.cu index 6edd610..72e4bfd 100644 --- a/cuda/trainer.cu +++ b/cuda/trainer.cu @@ -295,25 +295,50 @@ void TrainerImpl::evaluate() { width * height * 3 * sizeof(float), cudaMemcpyHostToDevice); // Prepare camera data - float h_K[9] = {(float)cam.params[0], - 0.f, - (float)cam.params[2], - 0.f, - (float)cam.params[1], - (float)cam.params[3], - 0.f, - 0.f, - 1.f}; + // Prepare camera data + float h_view[16]; + float h_proj[16]; + + // View Matrix (World -> Camera) Eigen::Matrix3d rot_mat_d = img.QvecToRotMat(); Eigen::Vector3d t_vec_d = img.tvec; - float h_T[12]; + + // View = [R | t; 0 0 0 1] for (int i = 0; i < 3; ++i) { for (int j = 0; j < 3; ++j) - h_T[i * 4 + j] = (float)rot_mat_d(i, j); - h_T[i * 4 + 3] = (float)t_vec_d(i); + h_view[i * 4 + j] = (float)rot_mat_d(i, j); + h_view[i * 4 + 3] = (float)t_vec_d(i); } - thrust::copy(h_K, h_K + 9, cuda.camera.d_K.begin()); - thrust::copy(h_T, h_T + 12, cuda.camera.d_T.begin()); + h_view[12] = 0.0f; + h_view[13] = 0.0f; + h_view[14] = 0.0f; + h_view[15] = 1.0f; + + // Projection Matrix + const float znear = 0.01f; + const float zfar = 100.0f; + const float fov_x = 2 * atan(cam.width / (2 * cam.params[0])); + const float fov_y = 2 * atan(cam.height / (2 * cam.params[1])); + + const float tan_half_fov_x = tan(fov_x / 2.0f); + const float tan_half_fov_y = tan(fov_y / 2.0f); + + const float top = tan_half_fov_y * znear; + const float bottom = -top; + const float right = tan_half_fov_x * znear; + const float left = -right; + + std::fill(h_proj, h_proj + 16, 0.0f); + h_proj[0] = 2.0f * znear / (right - left); + h_proj[5] = 2.0f * znear / (top - bottom); + h_proj[8] = (right + left) / (right - left); + h_proj[9] = (top + bottom) / (top - bottom); + h_proj[10] = (zfar + znear) / (zfar - znear); + h_proj[11] = -(2.0f * zfar * znear) / (zfar - znear); + h_proj[14] = 1.0f; + + thrust::copy(h_proj, h_proj + 16, cuda.camera.d_proj.begin()); + thrust::copy(h_view, h_view + 16, cuda.camera.d_view.begin()); // Render ForwardPassData pass_data; @@ -790,11 +815,12 @@ float TrainerImpl::backward_pass(const Image &curr_image, const Camera &curr_cam thrust::raw_pointer_cast(cuda.gradients.d_grad_rgb.data())); compute_conic_backward( thrust::raw_pointer_cast(pass_data.d_J.data()), thrust::raw_pointer_cast(pass_data.d_sigma.data()), - thrust::raw_pointer_cast(cuda.camera.d_T.data()), thrust::raw_pointer_cast(cuda.gradients.d_grad_conic.data()), - pass_data.num_culled, thrust::raw_pointer_cast(cuda.gradients.d_grad_J.data()), + thrust::raw_pointer_cast(cuda.camera.d_view.data()), thrust::raw_pointer_cast(pass_data.d_conic.data()), + thrust::raw_pointer_cast(cuda.gradients.d_grad_conic.data()), pass_data.num_culled, + thrust::raw_pointer_cast(cuda.gradients.d_grad_J.data()), thrust::raw_pointer_cast(cuda.gradients.d_grad_sigma.data())); compute_projection_jacobian_backward(thrust::raw_pointer_cast(d_xyz_c_selected.data()), - thrust::raw_pointer_cast(cuda.camera.d_K.data()), + thrust::raw_pointer_cast(cuda.camera.d_proj.data()), thrust::raw_pointer_cast(cuda.gradients.d_grad_J.data()), pass_data.num_culled, thrust::raw_pointer_cast(cuda.gradients.d_grad_xyz_c.data())); compute_sigma_backward(thrust::raw_pointer_cast(d_quaternion_selected.data()), @@ -802,12 +828,12 @@ float TrainerImpl::backward_pass(const Image &curr_image, const Camera &curr_cam thrust::raw_pointer_cast(cuda.gradients.d_grad_sigma.data()), pass_data.num_culled, thrust::raw_pointer_cast(cuda.gradients.d_grad_quaternion.data()), thrust::raw_pointer_cast(cuda.gradients.d_grad_scale.data())); - camera_intrinsic_projection_backward(thrust::raw_pointer_cast(d_xyz_c_selected.data()), - thrust::raw_pointer_cast(cuda.camera.d_K.data()), - thrust::raw_pointer_cast(cuda.gradients.d_grad_uv.data()), pass_data.num_culled, - thrust::raw_pointer_cast(cuda.gradients.d_grad_xyz_c.data())); - camera_extrinsic_projection_backward( - thrust::raw_pointer_cast(d_xyz_selected.data()), thrust::raw_pointer_cast(cuda.camera.d_T.data()), + project_to_screen_backward(thrust::raw_pointer_cast(d_xyz_c_selected.data()), + thrust::raw_pointer_cast(cuda.camera.d_proj.data()), + thrust::raw_pointer_cast(cuda.gradients.d_grad_uv.data()), pass_data.num_culled, width, + height, thrust::raw_pointer_cast(cuda.gradients.d_grad_xyz_c.data())); + compute_camera_space_points_backward( + thrust::raw_pointer_cast(d_xyz_selected.data()), thrust::raw_pointer_cast(cuda.camera.d_view.data()), thrust::raw_pointer_cast(cuda.gradients.d_grad_xyz_c.data()), pass_data.num_culled, thrust::raw_pointer_cast(cuda.gradients.d_grad_xyz.data())); @@ -1088,6 +1114,9 @@ void TrainerImpl::train() { // Calculate scene extent for adaptive density scene_extent = 1.1f * computeMaxDiagonal(images); + const float znear = 0.01f; + const float zfar = 100.0f; + ProgressBar progressBar(config.num_iters); // TRAINING LOOP @@ -1107,26 +1136,43 @@ void TrainerImpl::train() { zero_grads(); // Prepare and copy camera parameters to device (member 'cuda.camera') - float h_K[9] = {(float)curr_camera.params[0], - 0.f, - (float)curr_camera.params[2], - 0.f, - (float)curr_camera.params[1], - (float)curr_camera.params[3], - 0.f, - 0.f, - 1.f}; + const float fov_x = 2 * atan(curr_camera.width / (2 * curr_camera.params[0])); + const float fov_y = 2 * atan(curr_camera.height / (2 * curr_camera.params[1])); + + const float tan_half_fov_x = tan(fov_x / 2.0f); + const float tan_half_fov_y = tan(fov_x / 2.0f); + + const float top = tan_half_fov_y * znear; + const float bottom = -top; + const float right = tan_half_fov_x * znear; + const float left = -right; + + float h_proj[16]; + std::fill(h_proj, h_proj + 16, 0.0f); + h_proj[0] = 2.0f * znear / (right - left); + h_proj[5] = 2.0f * znear / (top - bottom); + h_proj[8] = (right + left) / (right - left); + h_proj[9] = (top + bottom) / (top - bottom); + h_proj[10] = (zfar + znear) / (zfar - znear); + h_proj[11] = -(2.0f * zfar * znear) / (zfar - znear); + h_proj[14] = 1.0f; + Eigen::Matrix3d rot_mat_d = curr_image.QvecToRotMat(); Eigen::Vector3d t_vec_d = curr_image.tvec; - float h_T[12]; + float h_view[16]; for (int i = 0; i < 3; ++i) { for (int j = 0; j < 3; ++j) - h_T[i * 4 + j] = (float)rot_mat_d(i, j); - h_T[i * 4 + 3] = (float)t_vec_d(i); + h_view[i * 4 + j] = (float)rot_mat_d(i, j); + h_view[i * 4 + 3] = (float)t_vec_d(i); } + h_view[12] = 0.0f; + h_view[13] = 0.0f; + h_view[14] = 0.0f; + h_view[15] = 1.0f; + try { - thrust::copy(h_K, h_K + 9, cuda.camera.d_K.begin()); - thrust::copy(h_T, h_T + 12, cuda.camera.d_T.begin()); + thrust::copy(h_proj, h_proj + 16, cuda.camera.d_proj.begin()); + thrust::copy(h_view, h_view + 16, cuda.camera.d_view.begin()); } catch (const std::exception &e) { fprintf(stderr, "Error copying camera data to device: %s\\n", e.what()); exit(EXIT_FAILURE); diff --git a/include/gsplat_cuda/cuda_backward.cuh b/include/gsplat_cuda/cuda_backward.cuh index a765325..972d807 100644 --- a/include/gsplat_cuda/cuda_backward.cuh +++ b/include/gsplat_cuda/cuda_backward.cuh @@ -10,55 +10,58 @@ inline constexpr int TILE_SIZE_BWD = 16; /** * @brief Compute gradients for the camera intrinsic projection. * @param[in] xyz_c A device pointer to 3D points in camera coordinates. - * @param[in] K A device pointer to the camera intrinsic matrix values [fx, cx, fy, cy]. + * @param[in] proj A device pointer to the camera projection matrix (4x4). * @param[in] uv_grad_out A device pointer to the upstream gradients from the 2D projection. * @param[in] N The total number of points. + * @param[in] width Image width. + * @param[in] height Image height. * @param[out] xyz_c_grad_in A device pointer to store the computed gradients for xyz_c. * @param[in] stream The CUDA stream to execute the kernel on. */ -void camera_intrinsic_projection_backward(const float *const xyz_c, const float *const K, - const float *const uv_grad_out, const int N, float *xyz_c_grad_in, - cudaStream_t stream = 0); +void project_to_screen_backward(const float *const xyz_c, const float *const proj, const float *const uv_grad_out, + const int N, const int width, const int height, float *xyz_c_grad_in, + cudaStream_t stream = 0); /** * @brief Compute gradients for the camera extrinsic transformation. * @param[in] xyz_w A device pointer to 3D points in world coordinates. - * @param[in] T A device pointer to the camera extrinsic matrix (3x4). + * @param[in] view A device pointer to the camera view matrix (4x4). * @param[in] xyz_c_grad_out A device pointer to the upstream gradients from camera-space coordinates. * @param[in] N The total number of points. * @param[out] xyz_w_grad_in A device pointer to store the computed gradients for xyz_w. * @param[in] stream The CUDA stream to execute the kernel on. */ -void camera_extrinsic_projection_backward(const float *const xyz_w, const float *const T, +void compute_camera_space_points_backward(const float *const xyz_w, const float *const view, const float *const xyz_c_grad_out, const int N, float *xyz_w_grad_in, cudaStream_t stream = 0); /** * @brief Compute gradients for the projection Jacobian. * @param[in] xyz_c A device pointer to 3D points in camera coordinates. - * @param[in] K A device pointer to the camera intrinsic matrix values [fx, cx, fy, cy]. + * @param[in] proj A device pointer to the camera projection matrix (4x4). * @param[in] J_grad_out A device pointer to the upstream gradients for the Jacobian J. * @param[in] N The total number of points. * @param[out] xyz_c_grad_in A device pointer to store the computed gradients for xyz_c. * @param[in] stream The CUDA stream to execute the kernel on. */ -void compute_projection_jacobian_backward(const float *const xyz_c, const float *const K, const float *const J_grad_out, - const int N, float *xyz_c_grad_in, cudaStream_t stream = 0); +void compute_projection_jacobian_backward(const float *const xyz_c, const float *const proj, + const float *const J_grad_out, const int N, float *xyz_c_grad_in, + cudaStream_t stream = 0); /** * @brief Compute gradients for the 2D conic projection. * @param[in] J A device pointer to the projection Jacobians. * @param[in] sigma A device pointer to the 3D covariance matrices. - * @param[in] T A device pointer to the camera extrinsic matrix (3x4). + * @param[in] view A device pointer to the camera view matrix (4x4). * @param[in] conic_grad_out A device pointer to the upstream gradients for the conic. * @param[in] N The total number of points. * @param[out] J_grad_in A device pointer to store the computed gradients for J. * @param[out] sigma_grad_in A device pointer to store the computed gradients for sigma. * @param[in] stream The CUDA stream to execute the kernel on. */ -void compute_conic_backward(const float *const J, const float *const sigma, const float *const T, - const float *const conic_grad_out, const int N, float *J_grad_in, float *sigma_grad_in, - cudaStream_t stream = 0); +void compute_conic_backward(const float *const J, const float *const sigma, const float *const view, + const float *const conic, const float *const conic_grad_out, const int N, float *J_grad_in, + float *sigma_grad_in, cudaStream_t stream = 0); /** * @brief Compute gradients for the 3D covariance matrix (sigma). diff --git a/include/gsplat_cuda/cuda_data.cuh b/include/gsplat_cuda/cuda_data.cuh index 011801d..909cd67 100644 --- a/include/gsplat_cuda/cuda_data.cuh +++ b/include/gsplat_cuda/cuda_data.cuh @@ -47,7 +47,7 @@ struct GradientAccumulators { // Holds buffer to storing current camera parameters struct CameraParameters { // Camera parameters - thrust::device_vector d_K, d_T; + thrust::device_vector d_view, d_proj; CameraParameters(); }; diff --git a/include/gsplat_cuda/cuda_forward.cuh b/include/gsplat_cuda/cuda_forward.cuh index 387fe80..d614e68 100644 --- a/include/gsplat_cuda/cuda_forward.cuh +++ b/include/gsplat_cuda/cuda_forward.cuh @@ -18,7 +18,7 @@ inline constexpr int TILE_SIZE_FWD = 16; * @param[out] conic A device pointer to output conic values * @param[in] stream The CUDA stream to execute kernel on */ -void compute_conic(float *const xyz, const float *K, float *const sigma, const float *T, const int N, float *J, +void compute_conic(float *const xyz, const float *view, float *const sigma, const float *proj, const int N, float *J, float *conic, cudaStream_t stream = 0); /** @@ -32,25 +32,28 @@ void compute_conic(float *const xyz, const float *K, float *const sigma, const f void compute_sigma(float *const quaternion, float *const scale, const int N, float *sigma, cudaStream_t stream = 0); /** - * @brief Compute camera view of points from rotation matrix and translation vector + * @brief Compute camera view of points from View matrix * @param[in] xyz_w A device pointer to world view of points - * @param[in] T A device pointer to camera extrinsic matrix + * @param[in] view A device pointer to camera view matrix (4x4) * @param[in] N The total number of points * @param[out] xyz_c A device porinter to output camera view * @param[in] stream The CUDA stream to execute kernel on */ -void camera_extrinsic_projection(float *const xyz_w, const float *T, const int N, float *xyz_c, +void compute_camera_space_points(float *const xyz_w, const float *view, const int N, float *xyz_c, cudaStream_t stream = 0); /** * @brief Launches the CUDA kernel for projecting 3D points to 2D image coordinates. - * @param[in] xyz A device pointer to the input array of 3D points. - * @param[in] K A device pointer to the camera intrinsic matrix. - * @param[in] N The total number of points. - * @param[out] uv A device pointer to the output array for 2D coordinates. + * @param[in] xyz A device pointer to the input array of 3D points. + * @param[in] proj A device pointer to the camera projection matrix (4x4). + * @param[in] N The total number of points. + * @param[in] width Image width. + * @param[in] height Image height. + * @param[out] uv A device pointer to the output array for 2D coordinates. * @param[in] stream The CUDA stream to execute kernel on */ -void camera_intrinsic_projection(float *const xyz, const float *K, const int N, float *uv, cudaStream_t stream = 0); +void project_to_screen(float *const xyz, const float *proj, const int N, const int width, const int height, float *uv, + cudaStream_t stream = 0); /** * @brief Lauches CUDA kernel to perform frustum culling on guassians. diff --git a/tests/cuda_backward_test.cpp b/tests/cuda_backward_test.cpp index 439fbeb..251fe2c 100644 --- a/tests/cuda_backward_test.cpp +++ b/tests/cuda_backward_test.cpp @@ -37,41 +37,56 @@ class CudaBackwardKernelTest : public ::testing::Test { } }; -// Test for camera_intrinsic_projection_backward -TEST_F(CudaBackwardKernelTest, CameraIntrinsicProjectionBackward) { +// Test for project_to_screen_backward +TEST_F(CudaBackwardKernelTest, ProjectToScreenBackward) { const int N = 2; + const int width = 1920; + const int height = 1080; const float h = 1e-4; // Host data std::vector h_xyz_c = {1.0, 2.0, 3.0, -1.0, -2.0, 4.0}; - std::vector h_K = {100.0, 0.0, 160.0, 0.0, 120.0, 120.0, 0.0, 0.0, 1.0}; // fx, cx, fy, cy + // Proj matrix (4x4) - Identity-like for simplicity + // P = + // 1 0 0 0 + // 0 1 0 0 + // 0 0 0 1 + // 0 0 1 0 + // x_proj = x/z, y_proj = y/z + // u = (x/z * 0.5 + 0.5) * width + // v = (y/z * 0.5 + 0.5) * height + std::vector h_proj = {1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0}; + std::vector h_uv_grad_out = {0.1, 0.2, 0.3, 0.4}; std::vector h_xyz_c_grad_in(N * 3); - std::vector h_K_grad_in(4); // Device data float *d_xyz_c = device_alloc(N * 3); - float *d_K = device_alloc(9); + float *d_proj = device_alloc(16); float *d_uv_grad_out = device_alloc(N * 2); float *d_xyz_c_grad_in = device_alloc(N * 3); CUDA_CHECK(cudaMemcpy(d_xyz_c, h_xyz_c.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_K, h_K.data(), 9 * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_proj, h_proj.data(), 16 * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_uv_grad_out, h_uv_grad_out.data(), N * 2 * sizeof(float), cudaMemcpyHostToDevice)); // Run kernel - camera_intrinsic_projection_backward(d_xyz_c, d_K, d_uv_grad_out, N, d_xyz_c_grad_in); + project_to_screen_backward(d_xyz_c, d_proj, d_uv_grad_out, N, width, height, d_xyz_c_grad_in); CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaMemcpy(h_xyz_c_grad_in.data(), d_xyz_c_grad_in, N * 3 * sizeof(float), cudaMemcpyDeviceToHost)); // Numerical gradient check - auto forward_proj = [&](const std::vector &xyz_c, const std::vector &K) { + auto forward_proj = [&](const std::vector &xyz_c, const std::vector &proj) { std::vector uv(N * 2); for (int i = 0; i < N; ++i) { - uv[i * 2 + 0] = K[0] * xyz_c[i * 3 + 0] / xyz_c[i * 3 + 2] + K[2]; - uv[i * 2 + 1] = K[5] * xyz_c[i * 3 + 1] / xyz_c[i * 3 + 2] + K[5]; + float x = xyz_c[i * 3 + 0]; + float y = xyz_c[i * 3 + 1]; + float z = xyz_c[i * 3 + 2]; + // With our custom Proj: + uv[i * 2 + 0] = (x / z * 0.5f + 0.5f) * width; + uv[i * 2 + 1] = (y / z * 0.5f + 0.5f) * height; } return uv; }; @@ -82,8 +97,8 @@ TEST_F(CudaBackwardKernelTest, CameraIntrinsicProjectionBackward) { xyz_c_p[i] += h; std::vector xyz_c_m = h_xyz_c; xyz_c_m[i] -= h; - auto uv_p = forward_proj(xyz_c_p, h_K); - auto uv_m = forward_proj(xyz_c_m, h_K); + auto uv_p = forward_proj(xyz_c_p, h_proj); + auto uv_m = forward_proj(xyz_c_m, h_proj); float numerical_grad = 0; for (int j = 0; j < N * 2; ++j) numerical_grad += (uv_p[j] - uv_m[j]) / (2 * h) * h_uv_grad_out[j]; @@ -91,44 +106,45 @@ TEST_F(CudaBackwardKernelTest, CameraIntrinsicProjectionBackward) { } CUDA_CHECK(cudaFree(d_xyz_c)); - CUDA_CHECK(cudaFree(d_K)); + CUDA_CHECK(cudaFree(d_proj)); CUDA_CHECK(cudaFree(d_uv_grad_out)); CUDA_CHECK(cudaFree(d_xyz_c_grad_in)); } -// Test for camera_extrinsic_projection_backward -TEST_F(CudaBackwardKernelTest, CameraExtrinsicProjectionBackward) { +// Test for compute_camera_space_points_backward +TEST_F(CudaBackwardKernelTest, ComputeCameraSpacePointsBackward) { const int N = 1; const float h = 1e-4; // Host data std::vector h_xyz_w = {1.0, 2.0, 3.0}; - std::vector h_T = {0.8, -0.6, 0.0, 0.1, 0.6, 0.8, 0.0, 0.2, 0.0, 0.0, 1.0, 0.3}; + // View matrix (4x4) + std::vector h_view = {0.8, -0.6, 0.0, 0.1, 0.6, 0.8, 0.0, 0.2, 0.0, 0.0, 1.0, 0.3, 0.0, 0.0, 0.0, 1.0}; std::vector h_xyz_c_grad_in = {0.1, 0.2, 0.3}; std::vector h_xyz_w_grad_in(N * 3); - std::vector h_T_grad_in(12); // Device data auto d_xyz_w = device_alloc(N * 3); - auto d_T = device_alloc(12); + auto d_view = device_alloc(16); auto d_xyz_c_grad_in = device_alloc(N * 3); auto d_xyz_w_grad_in = device_alloc(N * 3); CUDA_CHECK(cudaMemcpy(d_xyz_w, h_xyz_w.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_T, h_T.data(), 12 * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_view, h_view.data(), 16 * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_xyz_c_grad_in, h_xyz_c_grad_in.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); - camera_extrinsic_projection_backward(d_xyz_w, d_T, d_xyz_c_grad_in, N, d_xyz_w_grad_in); + compute_camera_space_points_backward(d_xyz_w, d_view, d_xyz_c_grad_in, N, d_xyz_w_grad_in); CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaMemcpy(h_xyz_w_grad_in.data(), d_xyz_w_grad_in, N * 3 * sizeof(float), cudaMemcpyDeviceToHost)); - auto forward_ext = [&](const std::vector &xyz_w, const std::vector &T) { + auto forward_ext = [&](const std::vector &xyz_w, const std::vector &view) { std::vector xyz_c(N * 3); for (int i = 0; i < N; ++i) { - xyz_c[i * 3 + 0] = T[0] * xyz_w[i * 3 + 0] + T[1] * xyz_w[i * 3 + 1] + T[2] * xyz_w[i * 3 + 2] + T[3]; - xyz_c[i * 3 + 1] = T[4] * xyz_w[i * 3 + 0] + T[5] * xyz_w[i * 3 + 1] + T[6] * xyz_w[i * 3 + 2] + T[7]; - xyz_c[i * 3 + 2] = T[8] * xyz_w[i * 3 + 0] + T[9] * xyz_w[i * 3 + 1] + T[10] * xyz_w[i * 3 + 2] + T[11]; + xyz_c[i * 3 + 0] = view[0] * xyz_w[i * 3 + 0] + view[1] * xyz_w[i * 3 + 1] + view[2] * xyz_w[i * 3 + 2] + view[3]; + xyz_c[i * 3 + 1] = view[4] * xyz_w[i * 3 + 0] + view[5] * xyz_w[i * 3 + 1] + view[6] * xyz_w[i * 3 + 2] + view[7]; + xyz_c[i * 3 + 2] = + view[8] * xyz_w[i * 3 + 0] + view[9] * xyz_w[i * 3 + 1] + view[10] * xyz_w[i * 3 + 2] + view[11]; } return xyz_c; }; @@ -138,8 +154,8 @@ TEST_F(CudaBackwardKernelTest, CameraExtrinsicProjectionBackward) { xyz_w_p[i] += h; std::vector xyz_w_m = h_xyz_w; xyz_w_m[i] -= h; - auto xyz_c_p = forward_ext(xyz_w_p, h_T); - auto xyz_c_m = forward_ext(xyz_w_m, h_T); + auto xyz_c_p = forward_ext(xyz_w_p, h_view); + auto xyz_c_m = forward_ext(xyz_w_m, h_view); float numerical_grad = 0; for (int j = 0; j < N * 3; ++j) numerical_grad += (xyz_c_p[j] - xyz_c_m[j]) / (2 * h) * h_xyz_c_grad_in[j]; @@ -147,7 +163,7 @@ TEST_F(CudaBackwardKernelTest, CameraExtrinsicProjectionBackward) { } CUDA_CHECK(cudaFree(d_xyz_w)); - CUDA_CHECK(cudaFree(d_T)); + CUDA_CHECK(cudaFree(d_view)); CUDA_CHECK(cudaFree(d_xyz_c_grad_in)); CUDA_CHECK(cudaFree(d_xyz_w_grad_in)); } @@ -159,45 +175,54 @@ TEST_F(CudaBackwardKernelTest, ProjectionJacobianBackward) { // Host data std::vector h_xyz_c = {1.0, 2.0, 3.0, -1.0, -2.0, 4.0}; - std::vector h_K = {100.0, 0.0, 160.0, 0.0, 120.0, 120.0, 0.0, 0.0, 1.0}; + // Proj matrix (4x4) - Identity-like for simplicity + // P = + // 1 0 0 0 + // 0 1 0 0 + // 0 0 0 1 + // 0 0 1 0 + std::vector h_proj = {1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0}; + std::vector h_J_grad_in = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2}; std::vector h_xyz_c_grad_out(N * 3); // Device data float *d_xyz_c = device_alloc(N * 3); - float *d_K = device_alloc(9); + float *d_proj = device_alloc(16); float *d_J_grad_in = device_alloc(N * 6); float *d_xyz_c_grad_out = device_alloc(N * 3); CUDA_CHECK(cudaMemcpy(d_xyz_c, h_xyz_c.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_K, h_K.data(), 9 * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_proj, h_proj.data(), 16 * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_J_grad_in, h_J_grad_in.data(), N * 6 * sizeof(float), cudaMemcpyHostToDevice)); // Run kernel - compute_projection_jacobian_backward(d_xyz_c, d_K, d_J_grad_in, N, d_xyz_c_grad_out); + compute_projection_jacobian_backward(d_xyz_c, d_proj, d_J_grad_in, N, d_xyz_c_grad_out); CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaMemcpy(h_xyz_c_grad_out.data(), d_xyz_c_grad_out, N * 3 * sizeof(float), cudaMemcpyDeviceToHost)); // Numerical gradient check - auto forward_jacobian = [&](const std::vector &xyz_c, const std::vector &K) { + auto forward_jacobian = [&](const std::vector &xyz_c, const std::vector &proj) { std::vector J(N * 6); for (int i = 0; i < N; ++i) { float x = xyz_c[i * 3 + 0]; float y = xyz_c[i * 3 + 1]; float z = xyz_c[i * 3 + 2]; - float fx = K[0], fy = K[4]; float z_inv = 1.0f / z; float z_inv2 = z_inv * z_inv; // Jacobian: du/dx, du/dy, du/dz, dv/dx, dv/dy, dv/dz - J[i * 6 + 0] = fx * z_inv; // du/dx - J[i * 6 + 1] = 0.0f; // du/dy - J[i * 6 + 2] = -fx * x * z_inv2; // du/dz - J[i * 6 + 3] = 0.0f; // dv/dx - J[i * 6 + 4] = fy * z_inv; // dv/dy - J[i * 6 + 5] = -fy * y * z_inv2; // dv/dz + // With our simple Proj: + // J = [ 1/z, 0, -x/z^2 ] + // [ 0, 1/z, -y/z^2 ] + J[i * 6 + 0] = z_inv; // du/dx + J[i * 6 + 1] = 0.0f; // du/dy + J[i * 6 + 2] = -x * z_inv2; // du/dz + J[i * 6 + 3] = 0.0f; // dv/dx + J[i * 6 + 4] = z_inv; // dv/dy + J[i * 6 + 5] = -y * z_inv2; // dv/dz } return J; }; @@ -208,8 +233,8 @@ TEST_F(CudaBackwardKernelTest, ProjectionJacobianBackward) { xyz_c_p[i] += h; std::vector xyz_c_m = h_xyz_c; xyz_c_m[i] -= h; - auto J_p = forward_jacobian(xyz_c_p, h_K); - auto J_m = forward_jacobian(xyz_c_m, h_K); + auto J_p = forward_jacobian(xyz_c_p, h_proj); + auto J_m = forward_jacobian(xyz_c_m, h_proj); float numerical_grad = 0; for (int j = 0; j < N * 6; ++j) numerical_grad += (J_p[j] - J_m[j]) / (2 * h) * h_J_grad_in[j]; @@ -217,7 +242,7 @@ TEST_F(CudaBackwardKernelTest, ProjectionJacobianBackward) { } CUDA_CHECK(cudaFree(d_xyz_c)); - CUDA_CHECK(cudaFree(d_K)); + CUDA_CHECK(cudaFree(d_proj)); CUDA_CHECK(cudaFree(d_J_grad_in)); CUDA_CHECK(cudaFree(d_xyz_c_grad_out)); } @@ -230,38 +255,20 @@ TEST_F(CudaBackwardKernelTest, ConicBackward) { // Host data std::vector h_J = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f}; std::vector h_sigma_world = {1.0f, 0.1f, 0.2f, 0.1f, 2.0f, 0.3f, 0.2f, 0.3f, 3.0f}; - std::vector h_T = {0.8f, -0.6f, 0.0f, 0.1f, 0.6f, 0.8f, 0.0f, 0.2f, 0.0f, 0.0f, 1.0f, 0.3f}; + // View matrix (4x4) + std::vector h_view = {0.8f, -0.6f, 0.0f, 0.1f, 0.6f, 0.8f, 0.0f, 0.2f, + 0.0f, 0.0f, 1.0f, 0.3f, 0.0f, 0.0f, 0.0f, 1.0f}; std::vector h_conic_grad_out = {0.5f, -0.2f, 0.8f}; std::vector h_J_grad_in(N * 6); std::vector h_sigma_world_grad_in(N * 9); // Kernel has i*9 indexing, so allocate 9 floats - // Device data - auto d_J = device_alloc(N * 6); - auto d_sigma_world = device_alloc(N * 9); - auto d_T = device_alloc(12); - auto d_conic_grad_out = device_alloc(N * 3); - auto d_J_grad_in = device_alloc(N * 6); - auto d_sigma_world_grad_in = device_alloc(N * 9); - - CUDA_CHECK(cudaMemcpy(d_J, h_J.data(), N * 6 * sizeof(float), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_sigma_world, h_sigma_world.data(), N * 9 * sizeof(float), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_T, h_T.data(), 12 * sizeof(float), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_conic_grad_out, h_conic_grad_out.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); - - // Run kernel - compute_conic_backward(d_J, d_sigma_world, d_T, d_conic_grad_out, N, d_J_grad_in, d_sigma_world_grad_in); - CUDA_CHECK(cudaDeviceSynchronize()); - - CUDA_CHECK(cudaMemcpy(h_J_grad_in.data(), d_J_grad_in, N * 6 * sizeof(float), cudaMemcpyDeviceToHost)); - CUDA_CHECK( - cudaMemcpy(h_sigma_world_grad_in.data(), d_sigma_world_grad_in, N * 9 * sizeof(float), cudaMemcpyDeviceToHost)); - - // Numerical gradient check - auto forward_conic = [&](const std::vector &J_in, const std::vector &sigma_in, - const std::vector &T_in) { + // Compute h_conic (inverse covariance) for the test + auto compute_conic_val = [&](const std::vector &J_in, const std::vector &sigma_in, + const std::vector &view_in) { const float *J = J_in.data(); const float *S = sigma_in.data(); - const float W[9] = {T_in[0], T_in[1], T_in[2], T_in[4], T_in[5], T_in[6], T_in[8], T_in[9], T_in[10]}; + const float W[9] = {view_in[0], view_in[1], view_in[2], view_in[4], view_in[5], + view_in[6], view_in[8], view_in[9], view_in[10]}; // JW = J @ W (2x3) float JW[6]; @@ -281,14 +288,46 @@ TEST_F(CudaBackwardKernelTest, ConicBackward) { temp[4] = JW[3] * S[1] + JW[4] * S[4] + JW[5] * S[7]; temp[5] = JW[3] * S[2] + JW[4] * S[5] + JW[5] * S[8]; - // conic = temp @ JW.T (2x2 symmetric, storing 3 values) + // cov = temp @ JW.T (2x2 symmetric) + float cov00 = temp[0] * JW[0] + temp[1] * JW[1] + temp[2] * JW[2] + 0.3f; + float cov01 = temp[0] * JW[3] + temp[1] * JW[4] + temp[2] * JW[5]; + float cov11 = temp[3] * JW[3] + temp[4] * JW[4] + temp[5] * JW[5] + 0.3f; + + // Invert + float det = cov00 * cov11 - cov01 * cov01; + float inv_det = 1.0f / det; std::vector conic(3); - conic[0] = temp[0] * JW[0] + temp[1] * JW[1] + temp[2] * JW[2]; // (0,0) - conic[1] = temp[0] * JW[3] + temp[1] * JW[4] + temp[2] * JW[5]; // (0,1) - conic[2] = temp[3] * JW[3] + temp[4] * JW[4] + temp[5] * JW[5]; // (1,1) + conic[0] = cov11 * inv_det; + conic[1] = -cov01 * inv_det; + conic[2] = cov00 * inv_det; return conic; }; + std::vector h_conic = compute_conic_val(h_J, h_sigma_world, h_view); + + // Device data + auto d_J = device_alloc(N * 6); + auto d_sigma_world = device_alloc(N * 9); + auto d_view = device_alloc(16); + auto d_conic = device_alloc(N * 3); + auto d_conic_grad_out = device_alloc(N * 3); + auto d_J_grad_in = device_alloc(N * 6); + auto d_sigma_world_grad_in = device_alloc(N * 9); + + CUDA_CHECK(cudaMemcpy(d_J, h_J.data(), N * 6 * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_sigma_world, h_sigma_world.data(), N * 9 * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_view, h_view.data(), 16 * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_conic, h_conic.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_conic_grad_out, h_conic_grad_out.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); + + // Run kernel + compute_conic_backward(d_J, d_sigma_world, d_view, d_conic, d_conic_grad_out, N, d_J_grad_in, d_sigma_world_grad_in); + CUDA_CHECK(cudaDeviceSynchronize()); + + CUDA_CHECK(cudaMemcpy(h_J_grad_in.data(), d_J_grad_in, N * 6 * sizeof(float), cudaMemcpyDeviceToHost)); + CUDA_CHECK( + cudaMemcpy(h_sigma_world_grad_in.data(), d_sigma_world_grad_in, N * 9 * sizeof(float), cudaMemcpyDeviceToHost)); + auto compute_loss = [&](const std::vector &conic) { return conic[0] * h_conic_grad_out[0] + 2.0f * conic[1] * h_conic_grad_out[1] + conic[2] * h_conic_grad_out[2]; }; @@ -299,8 +338,8 @@ TEST_F(CudaBackwardKernelTest, ConicBackward) { J_p[i] += h; std::vector J_m = h_J; J_m[i] -= h; - auto loss_p = compute_loss(forward_conic(J_p, h_sigma_world, h_T)); - auto loss_m = compute_loss(forward_conic(J_m, h_sigma_world, h_T)); + auto loss_p = compute_loss(compute_conic_val(J_p, h_sigma_world, h_view)); + auto loss_m = compute_loss(compute_conic_val(J_m, h_sigma_world, h_view)); float numerical_grad = (loss_p - loss_m) / (2.0f * h); EXPECT_NEAR(h_J_grad_in[i], numerical_grad, 1e-1); } @@ -323,15 +362,16 @@ TEST_F(CudaBackwardKernelTest, ConicBackward) { sigma_p[i] += h; std::vector sigma_m = h_sigma_world; sigma_m[i] -= h; - auto loss_p = compute_loss(forward_conic(h_J, sigma_p, h_T)); - auto loss_m = compute_loss(forward_conic(h_J, sigma_m, h_T)); + auto loss_p = compute_loss(compute_conic_val(h_J, sigma_p, h_view)); + auto loss_m = compute_loss(compute_conic_val(h_J, sigma_m, h_view)); float numerical_grad = (loss_p - loss_m) / (2.0f * h); EXPECT_NEAR(h_sigma_grad_analytic_full[i], numerical_grad, 1e-1); } CUDA_CHECK(cudaFree(d_J)); CUDA_CHECK(cudaFree(d_sigma_world)); - CUDA_CHECK(cudaFree(d_T)); + CUDA_CHECK(cudaFree(d_view)); + CUDA_CHECK(cudaFree(d_conic)); CUDA_CHECK(cudaFree(d_conic_grad_out)); CUDA_CHECK(cudaFree(d_J_grad_in)); CUDA_CHECK(cudaFree(d_sigma_world_grad_in)); @@ -621,13 +661,12 @@ TEST_F(CudaBackwardKernelTest, RenderBackward) { const float u_diff = (float)u_splat - u_mean; const float v_diff = (float)v_splat - v_mean; - const float a = conic[i * 3 + 0] + 0.3f; // Match kernel - const float b = conic[i * 3 + 1]; - const float c = conic[i * 3 + 2] + 0.3f; // Match kernel + const float inv_cov00 = conic[i * 3 + 0]; + const float inv_cov01 = conic[i * 3 + 1]; + const float inv_cov11 = conic[i * 3 + 2]; - const float det = a * c - b * b; - const float reciprocal_det = 1.0f / det; - const float mh_sq = (c * u_diff * u_diff - 2.0f * b * u_diff * v_diff + a * v_diff * v_diff) * reciprocal_det; + const float mh_sq = + (inv_cov00 * u_diff * u_diff + 2.0f * inv_cov01 * u_diff * v_diff + inv_cov11 * v_diff * v_diff); const float opa = 1.0f / (1.0f + expf(-opacity[i])); diff --git a/tests/cuda_forward_test.cpp b/tests/cuda_forward_test.cpp index 65666dc..9413963 100644 --- a/tests/cuda_forward_test.cpp +++ b/tests/cuda_forward_test.cpp @@ -91,37 +91,37 @@ TEST_F(CudaKernelTest, ComputeSigma) { CUDA_CHECK(cudaFree(d_sigma)); } -// Test case for the camera_intrinsic_projection kernel. -TEST_F(CudaKernelTest, CameraIntrinsicProjection) { +// Test case for the project_to_screen kernel. +TEST_F(CudaKernelTest, ProjectToScreen) { const int N = 4; // Number of points + const int width = 1920; + const int height = 1080; - // Host-side data - // K = [fx, 0, cx, 0, fy, cy, 0, 0, 1] - const std::vector h_K = {100.0f, 0.0f, 50.0f, 0.0f, 120.0f, 60.0f, 0.0f, 0.0f, 1.0f}; - const float fx = h_K[0], cx = h_K[2], fy = h_K[4], cy = h_K[5]; + const std::vector h_proj = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 1.0f, 0.0f}; const std::vector h_xyz = { - 1.0f, 1.0f, 2.0f, // Point 0 - 2.0f, -3.0f, 5.0f, // Point 1 - 0.0f, 0.0f, 1.0f, // Point 2 - -4.0f, 2.0f, 10.0f // Point 3 + 1.0f, 1.0f, 2.0f, // Point 0: x/z = 0.5, y/z = 0.5 -> uv = (0.75*w, 0.75*h) + 2.0f, -3.0f, 5.0f, // Point 1: x/z = 0.4, y/z = -0.6 -> uv = (0.7*w, 0.2*h) + 0.0f, 0.0f, 1.0f, // Point 2: 0, 0 -> uv = (0.5*w, 0.5*h) + -4.0f, 2.0f, 10.0f // Point 3: -0.4, 0.2 -> uv = (0.3*w, 0.6*h) }; std::vector h_uv(N * 2); // Device-side data pointers - float *d_K, *d_xyz, *d_uv; + float *d_proj, *d_xyz, *d_uv; // Allocate memory on the device - CUDA_CHECK(cudaMalloc(&d_K, h_K.size() * sizeof(float))); + CUDA_CHECK(cudaMalloc(&d_proj, h_proj.size() * sizeof(float))); CUDA_CHECK(cudaMalloc(&d_xyz, h_xyz.size() * sizeof(float))); CUDA_CHECK(cudaMalloc(&d_uv, h_uv.size() * sizeof(float))); // Copy input data from host to device - CUDA_CHECK(cudaMemcpy(d_K, h_K.data(), h_K.size() * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_proj, h_proj.data(), h_proj.size() * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_xyz, h_xyz.data(), h_xyz.size() * sizeof(float), cudaMemcpyHostToDevice)); // Launch the kernel - camera_intrinsic_projection(d_xyz, d_K, N, d_uv); + project_to_screen(d_xyz, d_proj, N, width, height, d_uv); CUDA_CHECK(cudaDeviceSynchronize()); // Wait for the kernel to finish // Copy result data from device to host @@ -133,17 +133,26 @@ TEST_F(CudaKernelTest, CameraIntrinsicProjection) { const float x = h_xyz[i * 3 + 0]; const float y = h_xyz[i * 3 + 1]; const float z = h_xyz[i * 3 + 2]; - expected_uv[i * 2 + 0] = fx * x / z + cx; - expected_uv[i * 2 + 1] = fy * y / z + cy; + // With our custom Proj: + // x_clip = x + // y_clip = y + // w_clip = z + // x_ndc = x / z + // y_ndc = y / z + // u = (x_ndc * 0.5 + 0.5) * width + // v = (y_ndc * 0.5 + 0.5) * height + + expected_uv[i * 2 + 0] = (x / z * 0.5f + 0.5f) * width; + expected_uv[i * 2 + 1] = (y / z * 0.5f + 0.5f) * height; } // Compare results for (int i = 0; i < N * 2; ++i) { - ASSERT_NEAR(h_uv[i], expected_uv[i], 1e-5); + ASSERT_NEAR(h_uv[i], expected_uv[i], 1e-4); } // Free device memory - CUDA_CHECK(cudaFree(d_K)); + CUDA_CHECK(cudaFree(d_proj)); CUDA_CHECK(cudaFree(d_xyz)); CUDA_CHECK(cudaFree(d_uv)); } @@ -222,17 +231,23 @@ TEST_F(CudaKernelTest, GaussianCulling) { CUDA_CHECK(cudaFree(d_mask)); } -// Test case for the camera_extrinsic_projection function. -TEST_F(CudaKernelTest, CameraExtrinsicProjection) { +// Test case for the compute_camera_space_points function. +TEST_F(CudaKernelTest, ComputeCameraSpacePoints) { const int N = 3; // Number of points // Host-side data - // Extrinsic matrix T = [R|t] is 3x4. + // View matrix V = [R|t] is 4x4. // R is identity, t = [10, 20, 30]. - const std::vector h_T = { + // V = + // 1 0 0 10 + // 0 1 0 20 + // 0 0 1 30 + // 0 0 0 1 + const std::vector h_view = { 1.0f, 0.0f, 0.0f, 10.0f, // Row 1 0.0f, 1.0f, 0.0f, 20.0f, // Row 2 - 0.0f, 0.0f, 1.0f, 30.0f // Row 3 + 0.0f, 0.0f, 1.0f, 30.0f, // Row 3 + 0.0f, 0.0f, 0.0f, 1.0f // Row 4 }; // World coordinates (x, y, z) @@ -246,35 +261,35 @@ TEST_F(CudaKernelTest, CameraExtrinsicProjection) { std::vector h_xyz_c(N * 3); // Device-side data pointers - float *d_T, *d_xyz_w, *d_xyz_c; + float *d_view, *d_xyz_w, *d_xyz_c; // Allocate memory on the device - CUDA_CHECK(cudaMalloc(&d_T, h_T.size() * sizeof(float))); + CUDA_CHECK(cudaMalloc(&d_view, h_view.size() * sizeof(float))); CUDA_CHECK(cudaMalloc(&d_xyz_w, h_xyz_w.size() * sizeof(float))); CUDA_CHECK(cudaMalloc(&d_xyz_c, h_xyz_c.size() * sizeof(float))); // Copy input data from host to device - CUDA_CHECK(cudaMemcpy(d_T, h_T.data(), h_T.size() * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_view, h_view.data(), h_view.size() * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_xyz_w, h_xyz_w.data(), h_xyz_w.size() * sizeof(float), cudaMemcpyHostToDevice)); - // Launch the function (which wraps a CUBLAS call) - camera_extrinsic_projection(d_xyz_w, d_T, N, d_xyz_c); + // Launch the function + compute_camera_space_points(d_xyz_w, d_view, N, d_xyz_c); CUDA_CHECK(cudaDeviceSynchronize()); // Wait for the kernel to finish // Copy result data from device to host CUDA_CHECK(cudaMemcpy(h_xyz_c.data(), d_xyz_c, h_xyz_c.size() * sizeof(float), cudaMemcpyDeviceToHost)); // Calculate expected results on the host - // xyz_c = R * xyz_w + t + // xyz_c = V * xyz_w std::vector expected_xyz_c(N * 3); for (int i = 0; i < N; ++i) { const float x_w = h_xyz_w[i * 3 + 0]; const float y_w = h_xyz_w[i * 3 + 1]; const float z_w = h_xyz_w[i * 3 + 2]; // Since R is identity, this simplifies to x_c = x_w + t_x, etc. - expected_xyz_c[i * 3 + 0] = x_w + h_T[3]; // t_x - expected_xyz_c[i * 3 + 1] = y_w + h_T[7]; // t_y - expected_xyz_c[i * 3 + 2] = z_w + h_T[11]; // t_z + expected_xyz_c[i * 3 + 0] = x_w + h_view[3]; // t_x + expected_xyz_c[i * 3 + 1] = y_w + h_view[7]; // t_y + expected_xyz_c[i * 3 + 2] = z_w + h_view[11]; // t_z } // Compare results @@ -283,7 +298,7 @@ TEST_F(CudaKernelTest, CameraExtrinsicProjection) { } // Free device memory - CUDA_CHECK(cudaFree(d_T)); + CUDA_CHECK(cudaFree(d_view)); CUDA_CHECK(cudaFree(d_xyz_w)); CUDA_CHECK(cudaFree(d_xyz_c)); } @@ -295,34 +310,44 @@ TEST_F(CudaKernelTest, ComputeConic) { // Host-side input data const std::vector h_xyz = {1.0f, 2.0f, 5.0f}; // Camera-space coordinates - const std::vector h_K = {100.0f, 0.0f, 50.0f, 0.0f, 120.0f, 60.0f, 0.0f, 0.0f, 1.0f}; // Intrinsics + // Proj matrix (4x4) + // Use simple identity-like projection for easy Jacobian verification + // P = + // 1 0 0 0 + // 0 1 0 0 + // 0 0 0 1 + // 0 0 1 0 + // This means x_proj = x/z, y_proj = y/z + const std::vector h_proj = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 1.0f, 0.0f}; + const std::vector h_sigma = {1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f}; // 3x3 Identity covariance - const std::vector h_T = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, - 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f}; // Identity extrinsics + const std::vector h_view = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f}; // Identity view // Host-side output buffers std::vector h_J(N * 6); std::vector h_conic(N * 3); // Device-side pointers - float *d_xyz, *d_K, *d_sigma, *d_T, *d_J, *d_conic; + float *d_xyz, *d_proj, *d_sigma, *d_view, *d_J, *d_conic; // Allocate memory on the device CUDA_CHECK(cudaMalloc(&d_xyz, h_xyz.size() * sizeof(float))); - CUDA_CHECK(cudaMalloc(&d_K, h_K.size() * sizeof(float))); + CUDA_CHECK(cudaMalloc(&d_proj, h_proj.size() * sizeof(float))); CUDA_CHECK(cudaMalloc(&d_sigma, h_sigma.size() * sizeof(float))); - CUDA_CHECK(cudaMalloc(&d_T, h_T.size() * sizeof(float))); + CUDA_CHECK(cudaMalloc(&d_view, h_view.size() * sizeof(float))); CUDA_CHECK(cudaMalloc(&d_J, h_J.size() * sizeof(float))); CUDA_CHECK(cudaMalloc(&d_conic, h_conic.size() * sizeof(float))); // Copy input data from host to device CUDA_CHECK(cudaMemcpy(d_xyz, h_xyz.data(), h_xyz.size() * sizeof(float), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_K, h_K.data(), h_K.size() * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_proj, h_proj.data(), h_proj.size() * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_sigma, h_sigma.data(), h_sigma.size() * sizeof(float), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_T, h_T.data(), h_T.size() * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_view, h_view.data(), h_view.size() * sizeof(float), cudaMemcpyHostToDevice)); // Launch the function to be tested - compute_conic(d_xyz, d_K, d_sigma, d_T, N, d_J, d_conic); + compute_conic(d_xyz, d_view, d_sigma, d_proj, N, d_J, d_conic); CUDA_CHECK(cudaDeviceSynchronize()); // Copy result from device to host @@ -330,23 +355,39 @@ TEST_F(CudaKernelTest, ComputeConic) { // --- Calculate expected results on the host for verification --- const float x = h_xyz[0], y = h_xyz[1], z = h_xyz[2]; - const float fx = h_K[0], fy = h_K[4]; - - // 1. Expected Jacobian J - const float j00 = fx / z; - const float j02 = -fx * x / (z * z); - const float j11 = fy / z; - const float j12 = -fy * y / (z * z); - - // 2. W is identity because T is identity + // With our simple Proj: + // x_ndc = x / z + // y_ndc = y / z + // J = d(uv)/d(xyz) + // u = x/z * W/2 + W/2 + // v = y/z * H/2 + H/2 + // But wait, the kernel computes J for NDC coordinates (or screen? check kernel) + // The kernel computes J = d(x_proj, y_proj) / d(x, y, z) + // J = [ 1/z, 0, -x/z^2 ] + // [ 0, 1/z, -y/z^2 ] + // (Assuming p_proj.x = x/z, p_proj.y = y/z) + + const float j00 = 1.0f / z; + const float j02 = -x / (z * z); + const float j11 = 1.0f / z; + const float j12 = -y / (z * z); + + // 2. W is identity because View is identity // 3. M = J @ W = J // 4. V = Sigma @ M^T = Identity @ J^T = J^T - // 5. Conic = M @ V = J @ J^T - const float c00 = j00 * j00 + 0.0f * 0.0f + j02 * j02; - const float c01 = j00 * 0.0f + 0.0f * j11 + j02 * j12 * 2; - const float c11 = 0.0f * 0.0f + j11 * j11 + j12 * j12; + // 5. Covariance = M @ V = J @ J^T + const float cov00 = j00 * j00 + 0.0f * 0.0f + j02 * j02 + 0.3f; + const float cov01 = j00 * 0.0f + 0.0f * j11 + j02 * j12; + const float cov11 = 0.0f * 0.0f + j11 * j11 + j12 * j12 + 0.3f; + + // 6. Conic = Inverse(Covariance) + const float det = cov00 * cov11 - cov01 * cov01; + const float inv_det = 1.0f / det; + const float expected_c00 = cov11 * inv_det; + const float expected_c01 = -cov01 * inv_det; + const float expected_c11 = cov00 * inv_det; - const std::vector expected_conic = {c00, c01 / 2.0f, c11}; + const std::vector expected_conic = {expected_c00, expected_c01, expected_c11}; // Compare results for (size_t i = 0; i < h_conic.size(); ++i) { @@ -355,9 +396,9 @@ TEST_F(CudaKernelTest, ComputeConic) { // Free device memory CUDA_CHECK(cudaFree(d_xyz)); - CUDA_CHECK(cudaFree(d_K)); + CUDA_CHECK(cudaFree(d_proj)); CUDA_CHECK(cudaFree(d_sigma)); - CUDA_CHECK(cudaFree(d_T)); + CUDA_CHECK(cudaFree(d_view)); CUDA_CHECK(cudaFree(d_J)); CUDA_CHECK(cudaFree(d_conic)); } @@ -654,12 +695,12 @@ TEST_F(CudaKernelTest, RenderImageMultipleGaussians) { const float u_diff = u_pixel - u_mean; const float v_diff = v_pixel - v_mean; - const float a = h_conic[gaussian_idx * 3 + 0] + 0.3f; - const float b_c = h_conic[gaussian_idx * 3 + 1]; - const float c = h_conic[gaussian_idx * 3 + 2] + 0.3f; + const float inv_cov00 = h_conic[gaussian_idx * 3 + 0]; + const float inv_cov01 = h_conic[gaussian_idx * 3 + 1]; + const float inv_cov11 = h_conic[gaussian_idx * 3 + 2]; - const float det = a * c - b_c * b_c; - const float mh_sq = (c * u_diff * u_diff - (b_c + b_c) * u_diff * v_diff + a * v_diff * v_diff) / det; + const float mh_sq = + (inv_cov00 * u_diff * u_diff + 2.0f * inv_cov01 * u_diff * v_diff + inv_cov11 * v_diff * v_diff); float alpha = 0.0f; if (mh_sq > 0.0f) { From ac976772b3d58ae4ff67f8fb917dee6ad7f96d0d Mon Sep 17 00:00:00 2001 From: Andrew Boessen Date: Fri, 5 Dec 2025 13:25:34 -0500 Subject: [PATCH 02/23] fix conic and proj grad --- cuda/gaussian_backward.cu | 12 ++++++------ tests/cuda_backward_test.cpp | 3 +++ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/cuda/gaussian_backward.cu b/cuda/gaussian_backward.cu index 0f6e7be..b5b369b 100644 --- a/cuda/gaussian_backward.cu +++ b/cuda/gaussian_backward.cu @@ -227,11 +227,11 @@ __global__ void conic_backward_kernel(const float *__restrict__ J, const float * // c11 = m10*v01 + m11*v11 + m12*v21 // Compute dL/dV - float dv00 = d_c00 * m00; + float dv00 = d_c00 * m00 + d_c01 * m10; float dv01 = d_c01 * m00 + d_c11 * m10; - float dv10 = d_c00 * m01; + float dv10 = d_c00 * m01 + d_c01 * m11; float dv11 = d_c01 * m01 + d_c11 * m11; - float dv20 = d_c00 * m02; + float dv20 = d_c00 * m02 + d_c01 * m12; float dv21 = d_c01 * m02 + d_c11 * m12; // Compute dL/dSigma = dL/dV @ M @@ -257,9 +257,9 @@ __global__ void conic_backward_kernel(const float *__restrict__ J, const float * float dm_from_conic_00 = d_c00 * v00 + d_c01 * v01; float dm_from_conic_01 = d_c00 * v10 + d_c01 * v11; float dm_from_conic_02 = d_c00 * v20 + d_c01 * v21; - float dm_from_conic_10 = d_c11 * v01; // d_c01 * v00 is for c10, which is symmetric to c01 - float dm_from_conic_11 = d_c11 * v11; - float dm_from_conic_12 = d_c11 * v21; + float dm_from_conic_10 = d_c01 * v00 + d_c11 * v01; + float dm_from_conic_11 = d_c01 * v10 + d_c11 * v11; + float dm_from_conic_12 = d_c01 * v20 + d_c11 * v21; // Compute dL/dM (from V = Sigma @ M^T) = (dL/dV)^T @ Sigma float dm_from_V_00 = dv00 * s00 + dv10 * s01 + dv20 * s02; diff --git a/tests/cuda_backward_test.cpp b/tests/cuda_backward_test.cpp index 251fe2c..5ebe722 100644 --- a/tests/cuda_backward_test.cpp +++ b/tests/cuda_backward_test.cpp @@ -69,6 +69,7 @@ TEST_F(CudaBackwardKernelTest, ProjectToScreenBackward) { CUDA_CHECK(cudaMemcpy(d_xyz_c, h_xyz_c.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_proj, h_proj.data(), 16 * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_uv_grad_out, h_uv_grad_out.data(), N * 2 * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemset(d_xyz_c_grad_in, 0, N * 3 * sizeof(float))); // Run kernel project_to_screen_backward(d_xyz_c, d_proj, d_uv_grad_out, N, width, height, d_xyz_c_grad_in); @@ -319,6 +320,8 @@ TEST_F(CudaBackwardKernelTest, ConicBackward) { CUDA_CHECK(cudaMemcpy(d_view, h_view.data(), 16 * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_conic, h_conic.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_conic_grad_out, h_conic_grad_out.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemset(d_J_grad_in, 0, N * 6 * sizeof(float))); + CUDA_CHECK(cudaMemset(d_sigma_world_grad_in, 0, N * 9 * sizeof(float))); // Run kernel compute_conic_backward(d_J, d_sigma_world, d_view, d_conic, d_conic_grad_out, N, d_J_grad_in, d_sigma_world_grad_in); From 51dddebdafa47bce5a4b29e9465d5ec6f90e9bb5 Mon Sep 17 00:00:00 2001 From: andrew Date: Fri, 5 Dec 2025 13:31:36 -0500 Subject: [PATCH 03/23] change grad threshold --- tests/cuda_forward_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cuda_forward_test.cpp b/tests/cuda_forward_test.cpp index 9413963..aa07025 100644 --- a/tests/cuda_forward_test.cpp +++ b/tests/cuda_forward_test.cpp @@ -148,7 +148,7 @@ TEST_F(CudaKernelTest, ProjectToScreen) { // Compare results for (int i = 0; i < N * 2; ++i) { - ASSERT_NEAR(h_uv[i], expected_uv[i], 1e-4); + ASSERT_NEAR(h_uv[i], expected_uv[i], 1e-3); } // Free device memory From a866e8e96b338b75ab1f8c3d5f956a27462535be Mon Sep 17 00:00:00 2001 From: andrew Date: Fri, 5 Dec 2025 13:40:41 -0500 Subject: [PATCH 04/23] fix project to screen grad --- cuda/projection_backward.cu | 4 ++-- tests/cuda_backward_test.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda/projection_backward.cu b/cuda/projection_backward.cu index 2dcab29..8062646 100644 --- a/cuda/projection_backward.cu +++ b/cuda/projection_backward.cu @@ -59,8 +59,8 @@ __global__ void project_to_screen_backward_kernel(const float *__restrict__ xyz_ const float grad_v = uv_grad_out[i * UV_STRIDE + 1]; // d(NDC) / d(uv) - float dx_ndc = grad_u * 2.0f / width; - float dy_ndc = grad_v * 2.0f / height; + float dx_ndc = grad_u * width * 0.5f; + float dy_ndc = grad_v * height * 0.5f; // d(Clip) / d(NDC) float dx_clip = dx_ndc * w_inv; diff --git a/tests/cuda_backward_test.cpp b/tests/cuda_backward_test.cpp index 5ebe722..b2f8d4c 100644 --- a/tests/cuda_backward_test.cpp +++ b/tests/cuda_backward_test.cpp @@ -57,7 +57,7 @@ TEST_F(CudaBackwardKernelTest, ProjectToScreenBackward) { // v = (y/z * 0.5 + 0.5) * height std::vector h_proj = {1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0}; - std::vector h_uv_grad_out = {0.1, 0.2, 0.3, 0.4}; + std::vector h_uv_grad_out = {0.01, 0.02, 0.03, 0.04}; std::vector h_xyz_c_grad_in(N * 3); // Device data From 46b1517be9780674aaa84aa221fab6a68d285ff3 Mon Sep 17 00:00:00 2001 From: Andrew Boessen Date: Fri, 5 Dec 2025 15:05:48 -0500 Subject: [PATCH 05/23] store sigma as 6 params --- cuda/data.cu | 2 +- cuda/gaussian.cu | 27 ++++++------- cuda/gaussian_backward.cu | 39 +++++++++--------- tests/cuda_backward_test.cpp | 78 ++++++++++++++++++++---------------- tests/cuda_forward_test.cpp | 13 +++--- 5 files changed, 80 insertions(+), 79 deletions(-) diff --git a/cuda/data.cu b/cuda/data.cu index fdb66ae..628ac0c 100644 --- a/cuda/data.cu +++ b/cuda/data.cu @@ -68,7 +68,7 @@ GaussianGradients::GaussianGradients(size_t max_gaussians) { d_grad_conic.resize(max_gaussians * 3); d_grad_uv.resize(max_gaussians * 2); d_grad_J.resize(max_gaussians * 6); - d_grad_sigma.resize(max_gaussians * 9); + d_grad_sigma.resize(max_gaussians * 6); d_grad_xyz_c.resize(max_gaussians * 3); d_grad_precompute_rgb.resize(max_gaussians * 3); } catch (const std::exception &e) { diff --git a/cuda/gaussian.cu b/cuda/gaussian.cu index 67e3811..6dda60e 100644 --- a/cuda/gaussian.cu +++ b/cuda/gaussian.cu @@ -64,22 +64,19 @@ __global__ void compute_sigma_fused_kernel(const float *__restrict__ quaternion, float rs22 = r22 * sz; // Sigma is symmetric, so we can compute the upper-triangular part - // and reflect it to the lower-triangular part. - const int sigma_base_idx = 9 * i; - sigma[sigma_base_idx + 0] = rs00 * rs00 + rs01 * rs01 + rs02 * rs02; // S_00 - sigma[sigma_base_idx + 1] = rs00 * rs10 + rs01 * rs11 + rs02 * rs12; // S_01 - sigma[sigma_base_idx + 2] = rs00 * rs20 + rs01 * rs21 + rs02 * rs22; // S_02 - sigma[sigma_base_idx + 3] = sigma[sigma_base_idx + 1]; // S_10 = S_01 - sigma[sigma_base_idx + 4] = rs10 * rs10 + rs11 * rs11 + rs12 * rs12; // S_11 - sigma[sigma_base_idx + 5] = rs10 * rs20 + rs11 * rs21 + rs12 * rs22; // S_12 - sigma[sigma_base_idx + 6] = sigma[sigma_base_idx + 2]; // S_20 = S_02 - sigma[sigma_base_idx + 7] = sigma[sigma_base_idx + 5]; // S_21 = S_12 - sigma[sigma_base_idx + 8] = rs20 * rs20 + rs21 * rs21 + rs22 * rs22; // S_22 + // and store only the unique 6 elements. + const int sigma_base_idx = 6 * i; + sigma[sigma_base_idx + 0] = rs00 * rs00 + rs01 * rs01 + rs02 * rs02; // S_00 (xx) + sigma[sigma_base_idx + 1] = rs00 * rs10 + rs01 * rs11 + rs02 * rs12; // S_01 (xy) + sigma[sigma_base_idx + 2] = rs00 * rs20 + rs01 * rs21 + rs02 * rs22; // S_02 (xz) + sigma[sigma_base_idx + 3] = rs10 * rs10 + rs11 * rs11 + rs12 * rs12; // S_11 (yy) + sigma[sigma_base_idx + 4] = rs10 * rs20 + rs11 * rs21 + rs12 * rs22; // S_12 (yz) + sigma[sigma_base_idx + 5] = rs20 * rs20 + rs21 * rs21 + rs22 * rs22; // S_22 (zz) } __global__ void compute_conic_kernel(const float *__restrict__ sigma, const float *__restrict__ view, const float *__restrict__ J, const int N, float *conic) { - constexpr int SIGMA_STRIDE = 9; + constexpr int SIGMA_STRIDE = 6; constexpr int J_STRIDE = 6; constexpr int CONIC_STRIDE = 3; @@ -115,9 +112,9 @@ __global__ void compute_conic_kernel(const float *__restrict__ sigma, const floa const float s00 = sigma[sigma_base_idx + 0]; const float s01 = sigma[sigma_base_idx + 1]; const float s02 = sigma[sigma_base_idx + 2]; - const float s11 = sigma[sigma_base_idx + 4]; - const float s12 = sigma[sigma_base_idx + 5]; - const float s22 = sigma[sigma_base_idx + 8]; + const float s11 = sigma[sigma_base_idx + 3]; + const float s12 = sigma[sigma_base_idx + 4]; + const float s22 = sigma[sigma_base_idx + 5]; // Load the per-Gaussian 2x3 projection Jacobian (J) into registers. const int j_base_idx = i * J_STRIDE; diff --git a/cuda/gaussian_backward.cu b/cuda/gaussian_backward.cu index b5b369b..e252356 100644 --- a/cuda/gaussian_backward.cu +++ b/cuda/gaussian_backward.cu @@ -136,7 +136,7 @@ __global__ void conic_backward_kernel(const float *__restrict__ J, const float * const float *__restrict__ view, const float *__restrict__ conic, const float *__restrict__ conic_grad_out, const int N, float *J_grad_in, float *sigma_grad_in) { - constexpr int SIGMA_STRIDE = 9; + constexpr int SIGMA_STRIDE = 6; constexpr int J_STRIDE = 6; constexpr int CONIC_STRIDE = 3; @@ -177,9 +177,9 @@ __global__ void conic_backward_kernel(const float *__restrict__ J, const float * const float s00 = sigma[sigma_base_idx + 0]; const float s01 = sigma[sigma_base_idx + 1]; const float s02 = sigma[sigma_base_idx + 2]; - const float s11 = sigma[sigma_base_idx + 4]; - const float s12 = sigma[sigma_base_idx + 5]; - const float s22 = sigma[sigma_base_idx + 8]; + const float s11 = sigma[sigma_base_idx + 3]; + const float s12 = sigma[sigma_base_idx + 4]; + const float s22 = sigma[sigma_base_idx + 5]; // Recompute M = J @ W const float m00 = j00 * w00 + j01 * w10 + j02 * w20; @@ -244,14 +244,11 @@ __global__ void conic_backward_kernel(const float *__restrict__ J, const float * float ds22 = dv20 * m02 + dv21 * m12; sigma_grad_in[sigma_base_idx + 0] += ds00; - sigma_grad_in[sigma_base_idx + 1] += ds01 * 0.5f; // Store upper triangle, sum contributions - sigma_grad_in[sigma_base_idx + 2] += ds02 * 0.5f; - sigma_grad_in[sigma_base_idx + 3] += ds01 * 0.5f; // s10 - sigma_grad_in[sigma_base_idx + 4] += ds11; - sigma_grad_in[sigma_base_idx + 5] += ds12 * 0.5f; - sigma_grad_in[sigma_base_idx + 6] += ds02 * 0.5f; // s20 - sigma_grad_in[sigma_base_idx + 7] += ds12 * 0.5f; // s21 - sigma_grad_in[sigma_base_idx + 8] += ds22; + sigma_grad_in[sigma_base_idx + 1] += ds01; // Store upper triangle, sum contributions + sigma_grad_in[sigma_base_idx + 2] += ds02; + sigma_grad_in[sigma_base_idx + 3] += ds11; + sigma_grad_in[sigma_base_idx + 4] += ds12; + sigma_grad_in[sigma_base_idx + 5] += ds22; // Compute dL/dM (from Conic) float dm_from_conic_00 = d_c00 * v00 + d_c01 * v01; @@ -367,15 +364,15 @@ __global__ void sigma_backward_kernel(const float *__restrict__ q, const float * // Load dSigma and reconstruct the full symmetric matrix float dSigma[9]; - dSigma[0] = dSigma_in[idx * 9 + 0]; // xx - dSigma[1] = dSigma_in[idx * 9 + 1]; // xy - dSigma[2] = dSigma_in[idx * 9 + 2]; // xz - dSigma[3] = dSigma_in[idx * 9 + 3]; // yx - dSigma[4] = dSigma_in[idx * 9 + 4]; // yy - dSigma[5] = dSigma_in[idx * 9 + 5]; // yz - dSigma[6] = dSigma_in[idx * 9 + 6]; // zx - dSigma[7] = dSigma_in[idx * 9 + 7]; // zy - dSigma[8] = dSigma_in[idx * 9 + 8]; // zz + dSigma[0] = dSigma_in[idx * 6 + 0]; // xx + dSigma[1] = dSigma_in[idx * 6 + 1]; // xy + dSigma[2] = dSigma_in[idx * 6 + 2]; // xz + dSigma[3] = dSigma_in[idx * 6 + 1]; // yx = xy + dSigma[4] = dSigma_in[idx * 6 + 3]; // yy + dSigma[5] = dSigma_in[idx * 6 + 4]; // yz + dSigma[6] = dSigma_in[idx * 6 + 2]; // zx = xz + dSigma[7] = dSigma_in[idx * 6 + 4]; // zy = yz + dSigma[8] = dSigma_in[idx * 6 + 5]; // zz // dM = 2 * dSigma * M float dM[9]; diff --git a/tests/cuda_backward_test.cpp b/tests/cuda_backward_test.cpp index b2f8d4c..8fc6b62 100644 --- a/tests/cuda_backward_test.cpp +++ b/tests/cuda_backward_test.cpp @@ -255,19 +255,29 @@ TEST_F(CudaBackwardKernelTest, ConicBackward) { // Host data std::vector h_J = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f}; - std::vector h_sigma_world = {1.0f, 0.1f, 0.2f, 0.1f, 2.0f, 0.3f, 0.2f, 0.3f, 3.0f}; + std::vector h_sigma_world = {1.0f, 0.1f, 0.2f, 2.0f, 0.3f, 3.0f}; // xx, xy, xz, yy, yz, zz // View matrix (4x4) std::vector h_view = {0.8f, -0.6f, 0.0f, 0.1f, 0.6f, 0.8f, 0.0f, 0.2f, 0.0f, 0.0f, 1.0f, 0.3f, 0.0f, 0.0f, 0.0f, 1.0f}; std::vector h_conic_grad_out = {0.5f, -0.2f, 0.8f}; std::vector h_J_grad_in(N * 6); - std::vector h_sigma_world_grad_in(N * 9); // Kernel has i*9 indexing, so allocate 9 floats + std::vector h_sigma_world_grad_in(N * 6); // Kernel has i*6 indexing, so allocate 6 floats // Compute h_conic (inverse covariance) for the test auto compute_conic_val = [&](const std::vector &J_in, const std::vector &sigma_in, const std::vector &view_in) { const float *J = J_in.data(); - const float *S = sigma_in.data(); + // Reconstruct full 3x3 sigma from 6 params + float S[9]; + S[0] = sigma_in[0]; // xx + S[1] = sigma_in[1]; // xy + S[2] = sigma_in[2]; // xz + S[3] = sigma_in[1]; // yx + S[4] = sigma_in[3]; // yy + S[5] = sigma_in[4]; // yz + S[6] = sigma_in[2]; // zx + S[7] = sigma_in[4]; // zy + S[8] = sigma_in[5]; // zz const float W[9] = {view_in[0], view_in[1], view_in[2], view_in[4], view_in[5], view_in[6], view_in[8], view_in[9], view_in[10]}; @@ -308,20 +318,20 @@ TEST_F(CudaBackwardKernelTest, ConicBackward) { // Device data auto d_J = device_alloc(N * 6); - auto d_sigma_world = device_alloc(N * 9); + auto d_sigma_world = device_alloc(N * 6); auto d_view = device_alloc(16); auto d_conic = device_alloc(N * 3); auto d_conic_grad_out = device_alloc(N * 3); auto d_J_grad_in = device_alloc(N * 6); - auto d_sigma_world_grad_in = device_alloc(N * 9); + auto d_sigma_world_grad_in = device_alloc(N * 6); CUDA_CHECK(cudaMemcpy(d_J, h_J.data(), N * 6 * sizeof(float), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_sigma_world, h_sigma_world.data(), N * 9 * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_sigma_world, h_sigma_world.data(), N * 6 * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_view, h_view.data(), 16 * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_conic, h_conic.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_conic_grad_out, h_conic_grad_out.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemset(d_J_grad_in, 0, N * 6 * sizeof(float))); - CUDA_CHECK(cudaMemset(d_sigma_world_grad_in, 0, N * 9 * sizeof(float))); + CUDA_CHECK(cudaMemset(d_sigma_world_grad_in, 0, N * 6 * sizeof(float))); // Run kernel compute_conic_backward(d_J, d_sigma_world, d_view, d_conic, d_conic_grad_out, N, d_J_grad_in, d_sigma_world_grad_in); @@ -329,7 +339,7 @@ TEST_F(CudaBackwardKernelTest, ConicBackward) { CUDA_CHECK(cudaMemcpy(h_J_grad_in.data(), d_J_grad_in, N * 6 * sizeof(float), cudaMemcpyDeviceToHost)); CUDA_CHECK( - cudaMemcpy(h_sigma_world_grad_in.data(), d_sigma_world_grad_in, N * 9 * sizeof(float), cudaMemcpyDeviceToHost)); + cudaMemcpy(h_sigma_world_grad_in.data(), d_sigma_world_grad_in, N * 6 * sizeof(float), cudaMemcpyDeviceToHost)); auto compute_loss = [&](const std::vector &conic) { return conic[0] * h_conic_grad_out[0] + 2.0f * conic[1] * h_conic_grad_out[1] + conic[2] * h_conic_grad_out[2]; @@ -347,20 +357,20 @@ TEST_F(CudaBackwardKernelTest, ConicBackward) { EXPECT_NEAR(h_J_grad_in[i], numerical_grad, 1e-1); } - // Reconstruct full symmetric gradient for sigma from kernel output - std::vector h_sigma_grad_analytic_full(9); - h_sigma_grad_analytic_full[0] = h_sigma_world_grad_in[0]; // (0,0) - h_sigma_grad_analytic_full[1] = h_sigma_world_grad_in[1]; // (0,1) - h_sigma_grad_analytic_full[2] = h_sigma_world_grad_in[2]; // (0,2) - h_sigma_grad_analytic_full[3] = h_sigma_world_grad_in[3]; // (1,0) = (0,1) - h_sigma_grad_analytic_full[4] = h_sigma_world_grad_in[4]; // (1,1) - h_sigma_grad_analytic_full[5] = h_sigma_world_grad_in[5]; // (1,2) - h_sigma_grad_analytic_full[6] = h_sigma_world_grad_in[6]; // (2,0) = (0,2) - h_sigma_grad_analytic_full[7] = h_sigma_world_grad_in[7]; // (2,1) = (1,2) - h_sigma_grad_analytic_full[8] = h_sigma_world_grad_in[8]; // (2,2) - - // Check grad w.r.t. sigma_world - for (int i = 0; i < N * 9; ++i) { + // Reconstruct full symmetric gradient for sigma from kernel output (which is 6 params) + // The kernel accumulates gradients into the 6 unique elements. + // dL/dS_ij_full = dL/dS_ij_stored (if i==j) + // dL/dS_ij_full = 0.5 * dL/dS_ij_stored (if i!=j, because stored accumulates both ij and ji) + std::vector h_sigma_grad_analytic_full(6); + h_sigma_grad_analytic_full[0] = h_sigma_world_grad_in[0]; // xx + h_sigma_grad_analytic_full[1] = h_sigma_world_grad_in[1]; // xy + h_sigma_grad_analytic_full[2] = h_sigma_world_grad_in[2]; // xz + h_sigma_grad_analytic_full[3] = h_sigma_world_grad_in[3]; // yy + h_sigma_grad_analytic_full[4] = h_sigma_world_grad_in[4]; // yz + h_sigma_grad_analytic_full[5] = h_sigma_world_grad_in[5]; // zz + + // Check grad w.r.t. sigma_world (6 params) + for (int i = 0; i < N * 6; ++i) { std::vector sigma_p = h_sigma_world; sigma_p[i] += h; std::vector sigma_m = h_sigma_world; @@ -387,20 +397,20 @@ TEST_F(CudaBackwardKernelTest, SigmaBackward) { // Host data std::vector h_q = {0.70710678, 0.70710678, 0.0, 0.0}; // Gaussian 1: 90 deg rot around X std::vector h_s = {-0.1, -0.2, -0.3}; - std::vector h_dSigma_in = {-0.1, -0.2, -0.3, -0.2, -0.4, -0.5, -0.3, -0.5, -0.6}; + std::vector h_dSigma_in = {-0.1, -0.2, -0.3, -0.4, -0.5, -0.6}; // xx, xy, xz, yy, yz, zz std::vector h_dQ_in(N * 4); std::vector h_dS_in(N * 3); // Device data auto d_q = device_alloc(N * 4); auto d_s = device_alloc(N * 3); - auto d_dSigma_in = device_alloc(N * 9); + auto d_dSigma_in = device_alloc(N * 6); auto d_dQ_in = device_alloc(N * 4); auto d_dS_in = device_alloc(N * 3); CUDA_CHECK(cudaMemcpy(d_q, h_q.data(), N * 4 * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_s, h_s.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_dSigma_in, h_dSigma_in.data(), N * 9 * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_dSigma_in, h_dSigma_in.data(), N * 6 * sizeof(float), cudaMemcpyHostToDevice)); // Run kernel compute_sigma_backward(d_q, d_s, d_dSigma_in, N, d_dQ_in, d_dS_in); @@ -411,7 +421,7 @@ TEST_F(CudaBackwardKernelTest, SigmaBackward) { // Numerical gradient check auto forward_sigma = [&](const std::vector &q_in, const std::vector &s_in) { - std::vector sigma(N * 9); + std::vector sigma(N * 6); for (int i = 0; i < N; ++i) { float qw = q_in[i * 4 + 0]; float qx = q_in[i * 4 + 1]; @@ -451,15 +461,13 @@ TEST_F(CudaBackwardKernelTest, SigmaBackward) { M[8] = R[8] * S_z; // Sigma = M * M^T - sigma[i * 9 + 0] = M[0] * M[0] + M[1] * M[1] + M[2] * M[2]; - sigma[i * 9 + 1] = M[0] * M[3] + M[1] * M[4] + M[2] * M[5]; - sigma[i * 9 + 2] = M[0] * M[6] + M[1] * M[7] + M[2] * M[8]; - sigma[i * 9 + 3] = sigma[i * 9 + 1]; - sigma[i * 9 + 4] = M[3] * M[3] + M[4] * M[4] + M[5] * M[5]; - sigma[i * 9 + 5] = M[3] * M[6] + M[4] * M[7] + M[5] * M[8]; - sigma[i * 9 + 6] = sigma[i * 9 + 2]; - sigma[i * 9 + 7] = sigma[i * 9 + 5]; - sigma[i * 9 + 8] = M[6] * M[6] + M[7] * M[7] + M[8] * M[8]; + // Store 6 params: xx, xy, xz, yy, yz, zz + sigma[i * 6 + 0] = M[0] * M[0] + M[1] * M[1] + M[2] * M[2]; + sigma[i * 6 + 1] = M[0] * M[3] + M[1] * M[4] + M[2] * M[5]; + sigma[i * 6 + 2] = M[0] * M[6] + M[1] * M[7] + M[2] * M[8]; + sigma[i * 6 + 3] = M[3] * M[3] + M[4] * M[4] + M[5] * M[5]; + sigma[i * 6 + 4] = M[3] * M[6] + M[4] * M[7] + M[5] * M[8]; + sigma[i * 6 + 5] = M[6] * M[6] + M[7] * M[7] + M[8] * M[8]; } return sigma; }; diff --git a/tests/cuda_forward_test.cpp b/tests/cuda_forward_test.cpp index aa07025..dd1e60f 100644 --- a/tests/cuda_forward_test.cpp +++ b/tests/cuda_forward_test.cpp @@ -48,7 +48,7 @@ TEST_F(CudaKernelTest, ComputeSigma) { // Case 2: Scales for rotated gaussian logf(1.0f), logf(2.0f), logf(3.0f)}; - std::vector h_sigma(N * 9); // Each sigma is a 3x3 matrix + std::vector h_sigma(N * 6); // Each sigma is a symmetric 3x3 matrix (stored as 6 floats) // Device-side data pointers float *d_quaternion, *d_scale, *d_sigma; @@ -71,14 +71,12 @@ TEST_F(CudaKernelTest, ComputeSigma) { CUDA_CHECK(cudaMemcpy(h_sigma.data(), d_sigma, h_sigma.size() * sizeof(float), cudaMemcpyDeviceToHost)); // Expected results calculated on the host - // The output sigma is in COLUMN-MAJOR order. + // The output sigma is in stored as [xx, xy, xz, yy, yz, zz] const std::vector expected_sigma = {// Case 1: R=I, S=diag(2,3,4). Sigma = diag(4,9,16) - // Column 1 Column 2 Column 3 - 4.0f, 0.0f, 0.0f, 0.0f, 9.0f, 0.0f, 0.0f, 0.0f, 16.0f, + 4.0f, 0.0f, 0.0f, 9.0f, 0.0f, 16.0f, // Case 2: R=RotZ(90), S=diag(1,2,3). Sigma = diag(4,1,9) after rotation. - // Column 1 Column 2 Column 3 - 4.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 9.0f}; + 4.0f, 0.0f, 0.0f, 1.0f, 0.0f, 9.0f}; // Compare results for (size_t i = 0; i < h_sigma.size(); ++i) { @@ -321,7 +319,8 @@ TEST_F(CudaKernelTest, ComputeConic) { const std::vector h_proj = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 1.0f, 0.0f}; - const std::vector h_sigma = {1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f}; // 3x3 Identity covariance + const std::vector h_sigma = {1.0f, 0.0f, 0.0f, + 1.0f, 0.0f, 1.0f}; // 3x3 Identity covariance (xx, xy, xz, yy, yz, zz) const std::vector h_view = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f}; // Identity view From 4bcf82a392e64d8e1ade7d746ca81a60bd22bef5 Mon Sep 17 00:00:00 2001 From: andrew Date: Fri, 5 Dec 2025 15:11:38 -0500 Subject: [PATCH 06/23] fix quat grad --- cuda/gaussian_backward.cu | 9 ++++----- tests/cuda_backward_test.cpp | 8 ++++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/cuda/gaussian_backward.cu b/cuda/gaussian_backward.cu index e252356..f05b249 100644 --- a/cuda/gaussian_backward.cu +++ b/cuda/gaussian_backward.cu @@ -442,11 +442,10 @@ __global__ void sigma_backward_kernel(const float *__restrict__ q, const float * // The gradient of the norm is zero for directions orthogonal to the vector. // We subtract the parallel component (the projection) and scale by the inverse norm. - - dQ_in[idx * 4 + 0] = inv_norm * (d_norm_q[0] - dot * w); - dQ_in[idx * 4 + 1] = inv_norm * (d_norm_q[1] - dot * x); - dQ_in[idx * 4 + 2] = inv_norm * (d_norm_q[2] - dot * y); - dQ_in[idx * 4 + 3] = inv_norm * (d_norm_q[3] - dot * z); + dQ_in[idx * 4 + 0] = inv_norm * 0.5f * (d_norm_q[0] - dot * w); + dQ_in[idx * 4 + 1] = inv_norm * 0.5f * (d_norm_q[1] - dot * x); + dQ_in[idx * 4 + 2] = inv_norm * 0.5f * (d_norm_q[2] - dot * y); + dQ_in[idx * 4 + 3] = inv_norm * 0.5f * (d_norm_q[3] - dot * z); } void compute_sigma_backward(const float *const quaternion, const float *const scale, const float *const sigma_grad_out, diff --git a/tests/cuda_backward_test.cpp b/tests/cuda_backward_test.cpp index 8fc6b62..3fe41c2 100644 --- a/tests/cuda_backward_test.cpp +++ b/tests/cuda_backward_test.cpp @@ -354,7 +354,7 @@ TEST_F(CudaBackwardKernelTest, ConicBackward) { auto loss_p = compute_loss(compute_conic_val(J_p, h_sigma_world, h_view)); auto loss_m = compute_loss(compute_conic_val(J_m, h_sigma_world, h_view)); float numerical_grad = (loss_p - loss_m) / (2.0f * h); - EXPECT_NEAR(h_J_grad_in[i], numerical_grad, 1e-1); + EXPECT_NEAR(h_J_grad_in[i], numerical_grad, 1e-2); } // Reconstruct full symmetric gradient for sigma from kernel output (which is 6 params) @@ -378,7 +378,7 @@ TEST_F(CudaBackwardKernelTest, ConicBackward) { auto loss_p = compute_loss(compute_conic_val(h_J, sigma_p, h_view)); auto loss_m = compute_loss(compute_conic_val(h_J, sigma_m, h_view)); float numerical_grad = (loss_p - loss_m) / (2.0f * h); - EXPECT_NEAR(h_sigma_grad_analytic_full[i], numerical_grad, 1e-1); + EXPECT_NEAR(h_sigma_grad_analytic_full[i], numerical_grad, 1e-2); } CUDA_CHECK(cudaFree(d_J)); @@ -494,7 +494,7 @@ TEST_F(CudaBackwardKernelTest, SigmaBackward) { float loss_m = compute_loss(sigma_m); float numerical_grad = (loss_p - loss_m) / (2 * h); - EXPECT_NEAR(h_dQ_in[i], numerical_grad, 1e-1); + EXPECT_NEAR(h_dQ_in[i], numerical_grad, 1e-2); } // Check grad w.r.t s @@ -511,7 +511,7 @@ TEST_F(CudaBackwardKernelTest, SigmaBackward) { float loss_m = compute_loss(sigma_m); float numerical_grad = (loss_p - loss_m) / (2 * h); - EXPECT_NEAR(h_dS_in[i], numerical_grad, 1e-1); + EXPECT_NEAR(h_dS_in[i], numerical_grad, 1e-2); } CUDA_CHECK(cudaFree(d_q)); From 03dfc17489cd4daee645ef59c8770b3d9d87abfa Mon Sep 17 00:00:00 2001 From: andrew Date: Fri, 5 Dec 2025 15:52:18 -0500 Subject: [PATCH 07/23] update render kernel --- cuda/render.cu | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/cuda/render.cu b/cuda/render.cu index 591c0cf..de94914 100644 --- a/cuda/render.cu +++ b/cuda/render.cu @@ -29,13 +29,13 @@ __global__ void render_tiles_kernel(const int num_tiles_x, const int num_tiles_y const int total_splats = splat_idx_end - splat_idx_start; // Pixel-local accumulators - float alpha_accum[PIXELS_PER_THREAD]; + float T[PIXELS_PER_THREAD]; float3 accumulated_rgb[PIXELS_PER_THREAD]; int num_splats[PIXELS_PER_THREAD]; #pragma unroll for (int i = 0; i < PIXELS_PER_THREAD; i++) { - alpha_accum[i] = 0.0f; + T[i] = 1.0f; accumulated_rgb[i] = {0.0f, 0.0f, 0.0f}; num_splats[i] = 0; } @@ -43,6 +43,7 @@ __global__ void render_tiles_kernel(const int num_tiles_x, const int num_tiles_y unsigned int any_active = 0xFFFFFFFF; int index_in_tile = 0; const int *splats_in_tile = &gaussian_idx_by_splat_idx[splat_idx_start]; + bool done = false; // Iterate on splats in the tile front to back for (; (index_in_tile < total_splats) && (any_active != 0); index_in_tile++) { @@ -68,21 +69,22 @@ __global__ void render_tiles_kernel(const int num_tiles_x, const int num_tiles_y for (int i = 0; i < PIXELS_PER_THREAD; i++) { const float power = fminf(0.0f, basic + linear * i + quad * i * i); - const float valid_alpha = alpha_accum[i] <= 0.9999f; - any_active |= __ballot_sync(0xFFFFFFFF, valid_alpha); - float alpha = fminf(0.99f, opa * __expf(power)); - alpha = (valid_alpha && (alpha > 0.00392156862f)) ? alpha : 0.0f; + alpha = (alpha > 0.00392156862f) ? !done * alpha : 0.0f; + + const float test_T = T[i] * (1.0f - alpha); + done = test_T < 0.0001f; - // Alpha blending: C_out = α * C_in + (1 - α) * C_bg - const float weight = alpha * (1.0f - alpha_accum[i]); + any_active |= __ballot_sync(0xFFFFFFFF, !done); + + const float weight = alpha * T[i]; accumulated_rgb[i].x += color.x * weight; accumulated_rgb[i].y += color.y * weight; accumulated_rgb[i].z += color.z * weight; - alpha_accum[i] += weight; - num_splats[i] += valid_alpha; + T[i] = test_T; + num_splats[i] += !done; } } @@ -95,18 +97,12 @@ __global__ void render_tiles_kernel(const int num_tiles_x, const int num_tiles_y if (valid_pixel) { splats_per_pixel[global_pixel_y * image_width + global_pixel_x] = num_splats[i]; - final_weight_per_pixel[global_pixel_y * image_width + global_pixel_x] = 1.0f - alpha_accum[i]; - - // Background contribution - float background_val = 0.0f; - if (alpha_accum[i] < 0.999f) { - background_val = background_opacity * (1.0f - alpha_accum[i]); - } + final_weight_per_pixel[global_pixel_y * image_width + global_pixel_x] = T[i]; const int pixel_idx = (global_pixel_y * image_width + global_pixel_x) * 3; - image[pixel_idx + 0] = accumulated_rgb[i].x + background_val; // R - image[pixel_idx + 1] = accumulated_rgb[i].y + background_val; // G - image[pixel_idx + 2] = accumulated_rgb[i].z + background_val; // B + image[pixel_idx + 0] = accumulated_rgb[i].x + T[i] * background_opacity; // R + image[pixel_idx + 1] = accumulated_rgb[i].y + T[i] * background_opacity; // G + image[pixel_idx + 2] = accumulated_rgb[i].z + T[i] * background_opacity; // B } } } From 5ca89a559e3e9ff882f7994e04d5c4084d1c88fa Mon Sep 17 00:00:00 2001 From: andrew Date: Fri, 5 Dec 2025 21:42:58 -0500 Subject: [PATCH 08/23] update render backward kernel --- cuda/render_backward.cu | 52 ++++++++++++---------------- cuda/trainer.cu | 16 ++++----- tests/cuda_backward_test.cpp | 66 +++++++++++++++--------------------- 3 files changed, 55 insertions(+), 79 deletions(-) diff --git a/cuda/render_backward.cu b/cuda/render_backward.cu index 3999004..de5835b 100644 --- a/cuda/render_backward.cu +++ b/cuda/render_backward.cu @@ -36,8 +36,8 @@ __global__ void render_tiles_backward_kernel( // Per-pixel variables stored in registers float T[PIXELS_PER_THREAD]; + float T_final[PIXELS_PER_THREAD]; float3 color_accum[PIXELS_PER_THREAD]; - bool background_initialized[PIXELS_PER_THREAD]; const int in_tile_x = threadIdx.x % TILE_SIZE_BWD; // local tile x const int in_tile_y = threadIdx.x / TILE_SIZE_BWD * PIXELS_PER_THREAD; // local tile y @@ -68,8 +68,8 @@ __global__ void render_tiles_backward_kernel( T[i] = 0.0f; _splats_per_pixel[i][threadIdx.y * blockDim.x + threadIdx.x] = 0; } + T_final[i] = T[i]; color_accum[i] = {0.0f, 0.0f, 0.0f}; - background_initialized[i] = false; } index_in_tile = cg::reduce(warp, index_in_tile, cg::greater()) - 1; // max depth in tile @@ -121,7 +121,7 @@ __global__ void render_tiles_backward_kernel( // Mask out low alpha and depth bool valid_splat = valid_pixel; valid_splat &= (alpha >= 0.00392156862f); - valid_splat &= (index_in_tile < _splats_per_pixel[i][threadIdx.y * blockDim.x + threadIdx.x]); + valid_splat &= (index_in_tile <= _splats_per_pixel[i][threadIdx.y * blockDim.x + threadIdx.x]); const unsigned int valid_mask = __any_sync(0xFFFFFFFF, valid_splat); @@ -129,20 +129,7 @@ __global__ void render_tiles_backward_kernel( alpha *= valid_splat; g *= valid_splat; - if (valid_splat && !background_initialized[i]) { - const float background_weight = 1.0f - (alpha * T[i] + 1.0f - T[i]); - if (background_weight > 0.001f) { - color_accum[i].x += background_opacity * background_weight; - color_accum[i].y += background_opacity * background_weight; - color_accum[i].z += background_opacity * background_weight; - } - background_initialized[i] = true; - } - // alpha reciprical - float ra = 1.0f / (1.0f - alpha); - - if (index_in_tile < _splats_per_pixel[i][threadIdx.y * blockDim.x + threadIdx.x] - 1) - T[i] *= ra; + T[i] *= 1.0f / (1.0f - alpha); // RGB gradients grad_rgb_tile.x += alpha * T[i] * _image_grad[0][i][threadIdx.y * blockDim.x + threadIdx.x]; @@ -151,19 +138,24 @@ __global__ void render_tiles_backward_kernel( float grad_alpha = 0.0f; // alpha gradient - grad_alpha += - (T[i] * color.x - color_accum[i].x * ra) * _image_grad[0][i][threadIdx.y * blockDim.x + threadIdx.x]; - grad_alpha += - (T[i] * color.y - color_accum[i].y * ra) * _image_grad[1][i][threadIdx.y * blockDim.x + threadIdx.x]; - grad_alpha += - (T[i] * color.z - color_accum[i].z * ra) * _image_grad[2][i][threadIdx.y * blockDim.x + threadIdx.x]; + grad_alpha += (color.x - color_accum[i].x) * _image_grad[0][i][threadIdx.y * blockDim.x + threadIdx.x]; + grad_alpha += (color.y - color_accum[i].y) * _image_grad[1][i][threadIdx.y * blockDim.x + threadIdx.x]; + grad_alpha += (color.z - color_accum[i].z) * _image_grad[2][i][threadIdx.y * blockDim.x + threadIdx.x]; + grad_alpha *= T[i]; + + // account for background contribution + float bg_dot_pixel = 0; + bg_dot_pixel += background_opacity * _image_grad[0][i][threadIdx.y * blockDim.x + threadIdx.x]; + bg_dot_pixel += background_opacity * _image_grad[1][i][threadIdx.y * blockDim.x + threadIdx.x]; + bg_dot_pixel += background_opacity * _image_grad[2][i][threadIdx.y * blockDim.x + threadIdx.x]; + grad_alpha += (-T_final[i] / (1.0f - alpha)) * bg_dot_pixel; // opacity gradient grad_opacity_tile += g * grad_alpha * opa * (1.0f - opa); - color_accum[i].x += alpha * T[i] * color.x; - color_accum[i].y += alpha * T[i] * color.y; - color_accum[i].z += alpha * T[i] * color.z; + color_accum[i].x = alpha * color.x + (1.0f - alpha) * color_accum[i].x; + color_accum[i].y = alpha * color.y + (1.0f - alpha) * color_accum[i].y; + color_accum[i].z = alpha * color.z + (1.0f - alpha) * color_accum[i].z; // G gradient const float grad_g = grad_alpha * opa; @@ -188,11 +180,11 @@ __global__ void render_tiles_backward_kernel( float grad_u_tile = 0.0f; float grad_v_tile = 0.0f; - grad_u_tile = grad_basic * -(inv_cov00 * d.x + inv_cov01 * d.y) + (grad_linear * inv_cov01); - grad_v_tile = grad_basic * -(inv_cov01 * d.x + inv_cov11 * d.y) + (grad_linear * inv_cov11); + grad_u_tile = (-inv_cov00 * d.x - inv_cov01 * d.y) * grad_basic + inv_cov01 * grad_linear; + grad_v_tile = (-inv_cov11 * d.y - inv_cov01 * d.x) * grad_basic + inv_cov11 * grad_linear; - // grad_u_tile *= 0.5f * image_width; - // grad_v_tile *= 0.5f * image_height; + grad_u_tile *= 0.5f * image_width; + grad_v_tile *= 0.5f * image_height; grad_u_tile = cg::reduce(warp, grad_u_tile, cg::plus()); grad_v_tile = cg::reduce(warp, grad_v_tile, cg::plus()); diff --git a/cuda/trainer.cu b/cuda/trainer.cu index 72e4bfd..2e34698 100644 --- a/cuda/trainer.cu +++ b/cuda/trainer.cu @@ -73,7 +73,7 @@ private: void zero_grads(); float backward_pass(const Image &curr_image, const Camera &curr_camera, ForwardPassData &pass_data, const float bg_color, const thrust::device_vector &d_gt_image); - void optimizer_step(ForwardPassData pass_data, Camera curr_camera); + void optimizer_step(ForwardPassData pass_data); void add_sh_band(); void adaptive_density_step(); @@ -842,19 +842,15 @@ float TrainerImpl::backward_pass(const Image &curr_image, const Camera &curr_cam // A functor to compute the norm of a 2D gradient struct PositionalGradientNorm { - const float width; - const float height; - PositionalGradientNorm(float w, float h) : width(w), height(w) {} - __host__ __device__ float operator()(const float2 &grad) const { // Scale grads to NDC - const float u = grad.x * 0.5f * width; - const float v = grad.y * 0.5f * height; + const float u = grad.x; + const float v = grad.y; return sqrtf(u * u + v * v); } }; -void TrainerImpl::optimizer_step(ForwardPassData pass_data, Camera curr_camera) { +void TrainerImpl::optimizer_step(ForwardPassData pass_data) { auto d_xyz = compact_masked_array<3>(cuda.gaussians.d_xyz, pass_data.d_mask, pass_data.num_culled); auto d_rgb = compact_masked_array<3>(cuda.gaussians.d_rgb, pass_data.d_mask, pass_data.num_culled); auto d_op = compact_masked_array<1>(cuda.gaussians.d_opacity, pass_data.d_mask, pass_data.num_culled); @@ -985,7 +981,7 @@ void TrainerImpl::optimizer_step(ForwardPassData pass_data, Camera curr_camera) thrust::transform(reinterpret_cast(thrust::raw_pointer_cast(cuda.gradients.d_grad_uv.data())), reinterpret_cast(thrust::raw_pointer_cast(cuda.gradients.d_grad_uv.data())) + pass_data.num_culled, - d_uv_grad_norms.begin(), PositionalGradientNorm(curr_camera.width, curr_camera.height)); + d_uv_grad_norms.begin(), PositionalGradientNorm()); thrust::transform(d_uv_accum_compact.begin(), d_uv_accum_compact.end(), d_uv_grad_norms.begin(), d_uv_accum_compact.begin(), thrust::plus()); @@ -1196,7 +1192,7 @@ void TrainerImpl::train() { float loss = backward_pass(curr_image, curr_camera, pass_data, bg_color, d_gt_image[curr_buf_idx]); // --- OPTIMIZER STEP --- - optimizer_step(pass_data, curr_camera); + optimizer_step(pass_data); // Log status progressBar.update(iter, loss, num_gaussians); diff --git a/tests/cuda_backward_test.cpp b/tests/cuda_backward_test.cpp index 3fe41c2..e5222ce 100644 --- a/tests/cuda_backward_test.cpp +++ b/tests/cuda_backward_test.cpp @@ -631,9 +631,9 @@ TEST_F(CudaBackwardKernelTest, RenderBackward) { const float h = 1e-6f; // Host data - std::vector h_uvs = {8.1f, 8.1f, 2.1f, 2.1f, 4.1f, 4.1f}; - std::vector h_opacity = {1.0f, 2.0f, 5.0f}; - std::vector h_conic = {5.0f, 0.1f, 5.0f, 5.0f, 0.1f, 5.0f, 5.0f, 0.1f, 5.0f}; // Gaussian 1 + std::vector h_uvs = {4.5f, 4.5f, 8.5f, 8.5f, 12.5f, 12.5f}; + std::vector h_opacity = {10.0f, 10.0f, 10.0f}; + std::vector h_conic = {2.0f, 0.1f, 2.0f, 2.0f, 0.1f, 2.0f, 2.0f, 0.1f, 2.0f}; // Gaussian 1 std::vector h_rgb = {0.5f, 0.2f, 0.2f, 0.2f, 0.2f, 0.5f, 0.2f, 0.5f, 0.2f}; // Gaussian 1 const float background_opacity = 0.5f; std::vector h_grad_image(image_width * image_height * 3); @@ -654,71 +654,58 @@ TEST_F(CudaBackwardKernelTest, RenderBackward) { for (int v_splat = 0; v_splat < image_height; ++v_splat) { for (int u_splat = 0; u_splat < image_width; ++u_splat) { int splat_count = 0; + float T = 1.0f; float pixel_rgb[3] = {0.0f, 0.0f, 0.0f}; - float alpha_accum = 0.0f; - float alpha_weight = 0.0f; // Get splat range for this tile. const int splat_idx_start = h_splat_range_by_tile[0]; const int splat_idx_end = h_splat_range_by_tile[1]; + bool done = false; + for (int splat_idx = splat_idx_start; splat_idx < splat_idx_end; ++splat_idx) { - if (alpha_accum > 0.9999f) - break; - const int i = h_sorted_splats[splat_idx]; // <-- UPDATED: Use indirection + const int i = h_sorted_splats[splat_idx]; const float u_mean = uvs[i * 2 + 0]; const float v_mean = uvs[i * 2 + 1]; const float u_diff = (float)u_splat - u_mean; const float v_diff = (float)v_splat - v_mean; + const float opa = 1.0f / (1.0f + expf(-opacity[i])); + const float inv_cov00 = conic[i * 3 + 0]; const float inv_cov01 = conic[i * 3 + 1]; const float inv_cov11 = conic[i * 3 + 2]; - const float mh_sq = - (inv_cov00 * u_diff * u_diff + 2.0f * inv_cov01 * u_diff * v_diff + inv_cov11 * v_diff * v_diff); + const float power = fminf(0.0f, -0.5f * (inv_cov00 * u_diff * u_diff + 2.0f * inv_cov01 * u_diff * v_diff + + inv_cov11 * v_diff * v_diff)); - const float opa = 1.0f / (1.0f + expf(-opacity[i])); - - float norm_prob = 0.0f; - if (mh_sq <= 0.0f) { // Match kernel's `mh_sq > 0.0f` check - splat_count++; - continue; - } - norm_prob = std::exp(-0.5f * mh_sq); + float norm_prob = std::exp(power); // Match kernel: opacity logit -> sigmoid float alpha = std::min(0.99f, opa * norm_prob); + alpha = (alpha > 0.00392156862f) ? !done * alpha : 0.0f; - if (alpha < 0.00392156862f) { - splat_count++; - continue; - } + const float test_T = T * (1.0f - alpha); + done = test_T < 0.0001f; - alpha_weight = 1.0f - alpha_accum; - const float weight = alpha * (1.0f - alpha_accum); + const float weight = alpha * T; pixel_rgb[0] += rgb[i * 3 + 0] * weight; pixel_rgb[1] += rgb[i * 3 + 1] * weight; pixel_rgb[2] += rgb[i * 3 + 2] * weight; - alpha_accum += weight; - splat_count++; + T = test_T; + splat_count += !done; } int pixel_idx = v_splat * image_width + u_splat; - float background_val = 0.0f; - if (alpha_accum < 0.999f) { - background_val = background_opacity * (1.0f - alpha_accum); - } - - image[pixel_idx * 3 + 0] = pixel_rgb[0] + background_val; - image[pixel_idx * 3 + 1] = pixel_rgb[1] + background_val; - image[pixel_idx * 3 + 2] = pixel_rgb[2] + background_val; + image[pixel_idx * 3 + 0] = pixel_rgb[0] + T * background_opacity; + image[pixel_idx * 3 + 1] = pixel_rgb[1] + T * background_opacity; + image[pixel_idx * 3 + 2] = pixel_rgb[2] + T * background_opacity; num_splats_per_pixel[pixel_idx] = splat_count; - final_weight_per_pixel[pixel_idx] = alpha_weight; + final_weight_per_pixel[pixel_idx] = T; } } return image; @@ -799,7 +786,8 @@ TEST_F(CudaBackwardKernelTest, RenderBackward) { double loss_m = compute_loss(image_m); float num_grad = (loss_p - loss_m) / (2.0f * h); float norm_factor = (i % 2 == 0) ? (0.5f * image_width) : (0.5f * image_height); - EXPECT_NEAR(h_grad_uv[i], num_grad * norm_factor, 1e-2); + num_grad *= norm_factor; + EXPECT_NEAR(h_grad_uv[i], num_grad, 1e-3); } // Gradients for opacity @@ -812,7 +800,7 @@ TEST_F(CudaBackwardKernelTest, RenderBackward) { double loss_p = compute_loss(image_p); double loss_m = compute_loss(image_m); float num_grad = (loss_p - loss_m) / (2.0f * h); - EXPECT_NEAR(h_grad_opacity[i], num_grad, 1e-2); + EXPECT_NEAR(h_grad_opacity[i], num_grad, 1e-3); } // Gradients for conic @@ -825,7 +813,7 @@ TEST_F(CudaBackwardKernelTest, RenderBackward) { double loss_p = compute_loss(image_p); double loss_m = compute_loss(image_m); float num_grad = (loss_p - loss_m) / (2.0f * h); - EXPECT_NEAR(h_grad_conic[i], num_grad, 1e-2); + EXPECT_NEAR(h_grad_conic[i], num_grad, 1e-3); } // Gradients for rgb @@ -838,7 +826,7 @@ TEST_F(CudaBackwardKernelTest, RenderBackward) { double loss_p = compute_loss(image_p); double loss_m = compute_loss(image_m); float num_grad = (loss_p - loss_m) / (2.0f * h); - EXPECT_NEAR(h_grad_rgb[i], num_grad, 1e-2); + EXPECT_NEAR(h_grad_rgb[i], num_grad, 1e-3); } // Cleanup From ebfe433fdfdd7fd483bce990bade1da6b2c49458 Mon Sep 17 00:00:00 2001 From: andrew Date: Sat, 6 Dec 2025 12:48:19 -0500 Subject: [PATCH 09/23] fix compute Jacobian --- cuda/gaussian.cu | 67 +++++++++------------------- cuda/raster.cu | 15 ++++--- include/gsplat_cuda/cuda_forward.cuh | 24 +++++----- tests/cuda_forward_test.cpp | 2 +- 4 files changed, 46 insertions(+), 62 deletions(-) diff --git a/cuda/gaussian.cu b/cuda/gaussian.cu index 6dda60e..50cccb7 100644 --- a/cuda/gaussian.cu +++ b/cuda/gaussian.cu @@ -158,31 +158,13 @@ __global__ void compute_conic_kernel(const float *__restrict__ sigma, const floa conic[conic_base_idx + 2] = cov00 * inv_det; } -__global__ void compute_projection_jacobian_kernel(const float *__restrict__ xyz, const float *__restrict__ proj, - const int N, float *J) { +__global__ void compute_projection_jacobian_kernel(const float *__restrict__ xyz, const float *__restrict__ view, + const float focal_x, const float focal_y, const float tan_fovx, + const float tan_fovy, const int N, float *J) { constexpr int XYZ_STRIDE = 3; constexpr int J_STRIDE = 6; const int i = blockIdx.x * blockDim.x + threadIdx.x; - const int lane_id = threadIdx.x & 0x1f; - - // load and broadcast Proj to all threads in warp - float p_val = 0.0f; - if (lane_id < 16) { - p_val = proj[lane_id]; - } - const float p00 = __shfl_sync(0xffffffff, p_val, 0); - const float p01 = __shfl_sync(0xffffffff, p_val, 1); - const float p02 = __shfl_sync(0xffffffff, p_val, 2); - const float p03 = __shfl_sync(0xffffffff, p_val, 3); - const float p10 = __shfl_sync(0xffffffff, p_val, 4); - const float p11 = __shfl_sync(0xffffffff, p_val, 5); - const float p12 = __shfl_sync(0xffffffff, p_val, 6); - const float p13 = __shfl_sync(0xffffffff, p_val, 7); - const float p30 = __shfl_sync(0xffffffff, p_val, 12); - const float p31 = __shfl_sync(0xffffffff, p_val, 13); - const float p32 = __shfl_sync(0xffffffff, p_val, 14); - const float p33 = __shfl_sync(0xffffffff, p_val, 15); if (i >= N) { return; @@ -192,13 +174,8 @@ __global__ void compute_projection_jacobian_kernel(const float *__restrict__ xyz float y = xyz[i * XYZ_STRIDE + 1]; float z = xyz[i * XYZ_STRIDE + 2]; - // Clip coordinates - float xc = p00 * x + p01 * y + p02 * z + p03; - float yc = p10 * x + p11 * y + p12 * z + p13; - float wc = p30 * x + p31 * y + p32 * z + p33; - // Avoid division by zero - if (fabsf(wc) < 1e-6f) { + if (fabsf(z) < 1e-6f) { J[i * J_STRIDE + 0] = 0; J[i * J_STRIDE + 1] = 0; J[i * J_STRIDE + 2] = 0; @@ -208,23 +185,20 @@ __global__ void compute_projection_jacobian_kernel(const float *__restrict__ xyz return; } - float wc_inv = 1.0f / wc; - float wc_inv2 = wc_inv * wc_inv; + const float limx = 1.3f * tan_fovx; + const float limy = 1.3f * tan_fovy; + const float txtz = x / z; + const float tytz = y / z; + x = min(limx, max(-limx, txtz)) * z; + y = min(limy, max(-limy, tytz)) * z; // Jacobian of NDC coordinates (x/w, y/w) w.r.t. camera coordinates (x, y, z) - // d(x/w)/dx = (dx_c/dx * w - x_c * dw_c/dx) / w^2 - // dx_c/dx = p00, dw_c/dx = p30 - // d(x/w)/dx = p00/w - xc*p30/w^2 - - // Row 0: d(x_ndc) / d(x, y, z) - J[i * J_STRIDE + 0] = (p00 * wc - xc * p30) * wc_inv2; // dx - J[i * J_STRIDE + 1] = (p01 * wc - xc * p31) * wc_inv2; // dy - J[i * J_STRIDE + 2] = (p02 * wc - xc * p32) * wc_inv2; // dz - - // Row 1: d(y_ndc) / d(x, y, z) - J[i * J_STRIDE + 3] = (p10 * wc - yc * p30) * wc_inv2; // dx - J[i * J_STRIDE + 4] = (p11 * wc - yc * p31) * wc_inv2; // dy - J[i * J_STRIDE + 5] = (p12 * wc - yc * p32) * wc_inv2; // dz + J[i * J_STRIDE + 0] = focal_x / z; + J[i * J_STRIDE + 1] = 0.0f; + J[i * J_STRIDE + 2] = -(focal_x * x) / (z * z); + J[i * J_STRIDE + 3] = 0; + J[i * J_STRIDE + 4] = focal_y / z; + J[i * J_STRIDE + 5] = -(focal_y * y) / (z * z); } void compute_sigma(float *const quaternion, float *const scale, const int N, float *sigma, cudaStream_t stream) { @@ -244,11 +218,11 @@ void compute_sigma(float *const quaternion, float *const scale, const int N, flo compute_sigma_fused_kernel<<>>(quaternion, scale, N, sigma); } -void compute_conic(float *const xyz, const float *view, float *const sigma, const float *proj, const int N, float *J, - float *conic, cudaStream_t stream) { +void compute_conic(float *const xyz, const float *view, float *const sigma, const float focal_x, const float focal_y, + const float tan_fovx, const float tan_fovy, const int N, float *J, float *conic, + cudaStream_t stream) { // Ensure all provided pointers are valid GPU device pointers. ASSERT_DEVICE_POINTER(xyz); - ASSERT_DEVICE_POINTER(proj); ASSERT_DEVICE_POINTER(sigma); ASSERT_DEVICE_POINTER(view); ASSERT_DEVICE_POINTER(J); @@ -263,7 +237,8 @@ void compute_conic(float *const xyz, const float *view, float *const sigma, cons const dim3 blocksize(threads_per_block, 1, 1); // This kernel computes the Jacobian (J) for each Gaussian. - compute_projection_jacobian_kernel<<>>(xyz, proj, N, J); + compute_projection_jacobian_kernel<<>>(xyz, view, focal_x, focal_y, tan_fovx, + tan_fovy, N, J); // This kernel uses the world-space covariance (sigma), the camera transform (View), // and the Jacobian (J) computed in the previous step to find the 2D conic. diff --git a/cuda/raster.cu b/cuda/raster.cu index e6ab3fc..2aa7578 100644 --- a/cuda/raster.cu +++ b/cuda/raster.cu @@ -80,14 +80,19 @@ void rasterize_image(const int num_gaussians, const Camera &camera, const Config pass_data.d_conic.resize(pass_data.num_culled * 3); pass_data.d_J.resize(pass_data.num_culled * 6); + const float focal_x = camera.params[0]; + const float focal_y = camera.params[1]; + + const float tan_fovx = camera.width / (2.0f * focal_x); + const float tan_fovy = camera.height / (2.0f * focal_y); + compute_sigma(thrust::raw_pointer_cast(d_quaternion_selected.data()), thrust::raw_pointer_cast(d_scale_selected.data()), pass_data.num_culled, thrust::raw_pointer_cast(pass_data.d_sigma.data())); - compute_conic(thrust::raw_pointer_cast(d_xyz_c_selected.data()), - thrust::raw_pointer_cast(camera_parameters.d_view.data()), - thrust::raw_pointer_cast(pass_data.d_sigma.data()), - thrust::raw_pointer_cast(camera_parameters.d_proj.data()), pass_data.num_culled, - thrust::raw_pointer_cast(pass_data.d_J.data()), thrust::raw_pointer_cast(pass_data.d_conic.data())); + compute_conic( + thrust::raw_pointer_cast(d_xyz_c_selected.data()), thrust::raw_pointer_cast(camera_parameters.d_view.data()), + thrust::raw_pointer_cast(pass_data.d_sigma.data()), focal_x, focal_y, tan_fovx, tan_fovy, pass_data.num_culled, + thrust::raw_pointer_cast(pass_data.d_J.data()), thrust::raw_pointer_cast(pass_data.d_conic.data())); // Step 5: Sort Gaussians by tile const int n_tiles_x = (width + TILE_SIZE_FWD - 1) / TILE_SIZE_FWD; diff --git a/include/gsplat_cuda/cuda_forward.cuh b/include/gsplat_cuda/cuda_forward.cuh index d614e68..3809453 100644 --- a/include/gsplat_cuda/cuda_forward.cuh +++ b/include/gsplat_cuda/cuda_forward.cuh @@ -9,17 +9,21 @@ inline constexpr int TILE_SIZE_FWD = 16; /** * @brief Compute conic of projected 2D covariance matrix - * @param[in] xyz A device pointer to 3D points - * @param[in] K Camera intrinsic projection matrix - * @param[in] sigma 3D Gaussian covariance matrix - * @param[in] T Camera extrinsic projection matrix - * @param[in] N The total number of points - * @param[out] J A device pointer to ouput Jacobian - * @param[out] conic A device pointer to output conic values - * @param[in] stream The CUDA stream to execute kernel on + * @param[in] xyz A device pointer to 3D points + * @param[in] view Camera view matrix + * @param[in] sigma 3D Gaussian covariance matrix + * @param[in] focal_x Camera focal length x + * @param[in] focal_y Camera focal length y + * @param[in] tan_fovx 3D Gaussian covariance matrix + * @param[in] tan_fovy 3D Gaussian covariance matrix + * @param[in] N The total number of points + * @param[out] J A device pointer to ouput Jacobian + * @param[out] conic A device pointer to output conic values + * @param[in] stream The CUDA stream to execute kernel on */ -void compute_conic(float *const xyz, const float *view, float *const sigma, const float *proj, const int N, float *J, - float *conic, cudaStream_t stream = 0); +void compute_conic(float *const xyz, const float *view, float *const sigma, const float focal_x, const float focal_y, + const float tan_fovx, const float tan_fovy, const int N, float *J, float *conic, + cudaStream_t stream = 0); /** * @brief Compute covariance matrix of Gaussian from quaternion and scale vector diff --git a/tests/cuda_forward_test.cpp b/tests/cuda_forward_test.cpp index dd1e60f..2df40d7 100644 --- a/tests/cuda_forward_test.cpp +++ b/tests/cuda_forward_test.cpp @@ -346,7 +346,7 @@ TEST_F(CudaKernelTest, ComputeConic) { CUDA_CHECK(cudaMemcpy(d_view, h_view.data(), h_view.size() * sizeof(float), cudaMemcpyHostToDevice)); // Launch the function to be tested - compute_conic(d_xyz, d_view, d_sigma, d_proj, N, d_J, d_conic); + compute_conic(d_xyz, d_view, d_sigma, 1.0f, 1.0f, 1.0f, 1.0f, N, d_J, d_conic); CUDA_CHECK(cudaDeviceSynchronize()); // Copy result from device to host From ffd58888d11601efaa1e56400a1cc5c7f689a049 Mon Sep 17 00:00:00 2001 From: Andrew Boessen Date: Sat, 6 Dec 2025 19:00:23 -0500 Subject: [PATCH 10/23] fix compute Jacobian backward --- cuda/gaussian_backward.cu | 154 ++++++++++---------------- cuda/trainer.cu | 8 +- include/gsplat_cuda/cuda_backward.cuh | 6 +- tests/cuda_backward_test.cpp | 17 +-- 4 files changed, 77 insertions(+), 108 deletions(-) diff --git a/cuda/gaussian_backward.cu b/cuda/gaussian_backward.cu index f05b249..d727a95 100644 --- a/cuda/gaussian_backward.cu +++ b/cuda/gaussian_backward.cu @@ -3,122 +3,84 @@ #include "checks.cuh" #include "gsplat_cuda/cuda_backward.cuh" -__global__ void compute_projection_jacobian_backward_kernel(const float *__restrict__ xyz, - const float *__restrict__ proj, - const float *__restrict__ J_grad_out, const int N, - float *xyz_grad_in) { +__global__ void compute_projection_jacobian_backward_kernel(const float *__restrict__ xyz, const float focal_x, + const float focal_y, const float tan_fovx, + const float tan_fovy, const float *__restrict__ J_grad_out, + const int N, float *xyz_grad_in) { constexpr int XYZ_STRIDE = 3; constexpr int J_STRIDE = 6; const int i = blockIdx.x * blockDim.x + threadIdx.x; - const int lane_id = threadIdx.x & 0x1f; - - // load and broadcast Proj to all threads in warp - float p_val = 0.0f; - if (lane_id < 16) { - p_val = proj[lane_id]; - } - const float p00 = __shfl_sync(0xffffffff, p_val, 0); - const float p01 = __shfl_sync(0xffffffff, p_val, 1); - const float p02 = __shfl_sync(0xffffffff, p_val, 2); - const float p03 = __shfl_sync(0xffffffff, p_val, 3); - const float p10 = __shfl_sync(0xffffffff, p_val, 4); - const float p11 = __shfl_sync(0xffffffff, p_val, 5); - const float p12 = __shfl_sync(0xffffffff, p_val, 6); - const float p13 = __shfl_sync(0xffffffff, p_val, 7); - const float p30 = __shfl_sync(0xffffffff, p_val, 12); - const float p31 = __shfl_sync(0xffffffff, p_val, 13); - const float p32 = __shfl_sync(0xffffffff, p_val, 14); - const float p33 = __shfl_sync(0xffffffff, p_val, 15); if (i >= N) { return; } - float x = xyz[i * XYZ_STRIDE + 0]; - float y = xyz[i * XYZ_STRIDE + 1]; - float z = xyz[i * XYZ_STRIDE + 2]; - - // Clip coordinates - float xc = p00 * x + p01 * y + p02 * z + p03; - float yc = p10 * x + p11 * y + p12 * z + p13; - float wc = p30 * x + p31 * y + p32 * z + p33; + const float x = xyz[i * XYZ_STRIDE + 0]; + const float y = xyz[i * XYZ_STRIDE + 1]; + const float z = xyz[i * XYZ_STRIDE + 2]; - if (fabsf(wc) < 1e-6f) { + if (fabsf(z) < 1e-6f) { return; } - float wc_inv = 1.0f / wc; - float wc_inv2 = wc_inv * wc_inv; - float wc_inv3 = wc_inv2 * wc_inv; - - // Gradients of J - float dJ_00 = J_grad_out[i * J_STRIDE + 0]; - float dJ_01 = J_grad_out[i * J_STRIDE + 1]; - float dJ_02 = J_grad_out[i * J_STRIDE + 2]; - float dJ_10 = J_grad_out[i * J_STRIDE + 3]; - float dJ_11 = J_grad_out[i * J_STRIDE + 4]; - float dJ_12 = J_grad_out[i * J_STRIDE + 5]; - - // Backprop through J calculation - // J00 = (p00*wc - xc*p30) / wc^2 - // Let Num00 = p00*wc - xc*p30 - // J00 = Num00 * wc^-2 - // dNum00 = dJ00 * wc^-2 - // dwc += dJ00 * Num00 * (-2 * wc^-3) = dJ00 * J00 * (-2/wc) - // But we don't have J00 computed here. - // Alternatively: - // d(J00)/d(xc) = -p30 / wc^2 - // d(J00)/d(wc) = (p00 * wc^2 - (p00*wc - xc*p30) * 2*wc) / wc^4 - // = (p00*wc - 2*(p00*wc - xc*p30)) / wc^3 - // = (p00*wc - 2*p00*wc + 2*xc*p30) / wc^3 - // = (2*xc*p30 - p00*wc) / wc^3 - - float dxc = 0.0f; - float dyc = 0.0f; - float dwc = 0.0f; + const float z_inv = 1.0f / (z + 1e-6f); + const float z_inv2 = z_inv * z_inv; + const float z_inv3 = z_inv2 * z_inv; + + const float limx = 1.3f * tan_fovx; + const float limy = 1.3f * tan_fovy; + const float txtz = x * z_inv; + const float tytz = y * z_inv; + + const float dJ_00 = J_grad_out[i * J_STRIDE + 0]; + // const float dJ_01 = J_grad_out[i * J_STRIDE + 1]; // 0 + const float dJ_02 = J_grad_out[i * J_STRIDE + 2]; + // const float dJ_10 = J_grad_out[i * J_STRIDE + 3]; // 0 + const float dJ_11 = J_grad_out[i * J_STRIDE + 4]; + const float dJ_12 = J_grad_out[i * J_STRIDE + 5]; + + float dx = 0.0f; + float dy = 0.0f; + float dz = 0.0f; + + // J00 = focal_x / z + // dL/dz += dL/dJ00 * (-focal_x / z^2) + dz += dJ_00 * (-focal_x * z_inv2); + + // J02 = -focal_x * clamp(x/z) / z + if (fabsf(txtz) <= limx) { + // Inside clamp: J02 = -focal_x * x / z^2 + dx += dJ_02 * (-focal_x * z_inv2); + dz += dJ_02 * (2.0f * focal_x * x * z_inv3); + } else { + // Outside clamp: J02 = -focal_x * lim * sgn(x/z) / z + // u_clamped is constant w.r.t small changes in x, z (locally constant) + const float clamped_x = (txtz > 0.0f ? limx : -limx); + dz += dJ_02 * (focal_x * clamped_x * z_inv2); + } - // Row 0 - // J00 - dxc += dJ_00 * (-p30 * wc_inv2); - dwc += dJ_00 * (2.0f * xc * p30 - p00 * wc) * wc_inv3; - // J01 - dxc += dJ_01 * (-p31 * wc_inv2); - dwc += dJ_01 * (2.0f * xc * p31 - p01 * wc) * wc_inv3; - // J02 - dxc += dJ_02 * (-p32 * wc_inv2); - dwc += dJ_02 * (2.0f * xc * p32 - p02 * wc) * wc_inv3; + // J11 = focal_y / z + dz += dJ_11 * (-focal_y * z_inv2); - // Row 1 - // J10 - dyc += dJ_10 * (-p30 * wc_inv2); - dwc += dJ_10 * (2.0f * yc * p30 - p10 * wc) * wc_inv3; - // J11 - dyc += dJ_11 * (-p31 * wc_inv2); - dwc += dJ_11 * (2.0f * yc * p31 - p11 * wc) * wc_inv3; - // J12 - dyc += dJ_12 * (-p32 * wc_inv2); - dwc += dJ_12 * (2.0f * yc * p32 - p12 * wc) * wc_inv3; - - // Backprop from Clip to Camera - // xc = p00*x + p01*y + p02*z + p03 - // yc = p10*x + p11*y + p12*z + p13 - // wc = p30*x + p31*y + p32*z + p33 - - float dx = dxc * p00 + dyc * p10 + dwc * p30; - float dy = dxc * p01 + dyc * p11 + dwc * p31; - float dz = dxc * p02 + dyc * p12 + dwc * p32; + // J12 = -focal_y * clamp(y/z) / z + if (fabsf(tytz) <= limy) { + dy += dJ_12 * (-focal_y * z_inv2); + dz += dJ_12 * (2.0f * focal_y * y * z_inv3); + } else { + const float clamped_y = (tytz > 0.0f ? limy : -limy); + dz += dJ_12 * (focal_y * clamped_y * z_inv2); + } xyz_grad_in[i * XYZ_STRIDE + 0] += dx; xyz_grad_in[i * XYZ_STRIDE + 1] += dy; xyz_grad_in[i * XYZ_STRIDE + 2] += dz; } -void compute_projection_jacobian_backward(const float *const xyz_c, const float *const proj, - const float *const J_grad_out, const int N, float *xyz_c_grad_in, - cudaStream_t stream) { +void compute_projection_jacobian_backward(const float *const xyz_c, const float focal_x, const float focal_y, + const float tan_fovx, const float tan_fovy, const float *const J_grad_out, + const int N, float *xyz_c_grad_in, cudaStream_t stream) { ASSERT_DEVICE_POINTER(xyz_c); - ASSERT_DEVICE_POINTER(proj); ASSERT_DEVICE_POINTER(J_grad_out); ASSERT_DEVICE_POINTER(xyz_c_grad_in); @@ -128,8 +90,8 @@ void compute_projection_jacobian_backward(const float *const xyz_c, const float dim3 gridsize(num_blocks, 1, 1); dim3 blocksize(threads_per_block, 1, 1); - compute_projection_jacobian_backward_kernel<<>>(xyz_c, proj, J_grad_out, N, - xyz_c_grad_in); + compute_projection_jacobian_backward_kernel<<>>( + xyz_c, focal_x, focal_y, tan_fovx, tan_fovy, J_grad_out, N, xyz_c_grad_in); } __global__ void conic_backward_kernel(const float *__restrict__ J, const float *__restrict__ sigma, diff --git a/cuda/trainer.cu b/cuda/trainer.cu index 2e34698..efa3287 100644 --- a/cuda/trainer.cu +++ b/cuda/trainer.cu @@ -819,8 +819,12 @@ float TrainerImpl::backward_pass(const Image &curr_image, const Camera &curr_cam thrust::raw_pointer_cast(cuda.gradients.d_grad_conic.data()), pass_data.num_culled, thrust::raw_pointer_cast(cuda.gradients.d_grad_J.data()), thrust::raw_pointer_cast(cuda.gradients.d_grad_sigma.data())); - compute_projection_jacobian_backward(thrust::raw_pointer_cast(d_xyz_c_selected.data()), - thrust::raw_pointer_cast(cuda.camera.d_proj.data()), + const float fov_x = 2.0f * atan(curr_camera.width / (2.0f * curr_camera.params[0])); + const float fov_y = 2.0f * atan(curr_camera.height / (2.0f * curr_camera.params[1])); + const float tan_fovx = tan(fov_x * 0.5f); + const float tan_fovy = tan(fov_y * 0.5f); + compute_projection_jacobian_backward(thrust::raw_pointer_cast(d_xyz_c_selected.data()), curr_camera.params[0], + curr_camera.params[1], tan_fovx, tan_fovy, thrust::raw_pointer_cast(cuda.gradients.d_grad_J.data()), pass_data.num_culled, thrust::raw_pointer_cast(cuda.gradients.d_grad_xyz_c.data())); compute_sigma_backward(thrust::raw_pointer_cast(d_quaternion_selected.data()), diff --git a/include/gsplat_cuda/cuda_backward.cuh b/include/gsplat_cuda/cuda_backward.cuh index 972d807..0dd3801 100644 --- a/include/gsplat_cuda/cuda_backward.cuh +++ b/include/gsplat_cuda/cuda_backward.cuh @@ -44,9 +44,9 @@ void compute_camera_space_points_backward(const float *const xyz_w, const float * @param[out] xyz_c_grad_in A device pointer to store the computed gradients for xyz_c. * @param[in] stream The CUDA stream to execute the kernel on. */ -void compute_projection_jacobian_backward(const float *const xyz_c, const float *const proj, - const float *const J_grad_out, const int N, float *xyz_c_grad_in, - cudaStream_t stream = 0); +void compute_projection_jacobian_backward(const float *const xyz_c, const float focal_x, const float focal_y, + const float tan_fovx, const float tan_fovy, const float *const J_grad_out, + const int N, float *xyz_c_grad_in, cudaStream_t stream = 0); /** * @brief Compute gradients for the 2D conic projection. diff --git a/tests/cuda_backward_test.cpp b/tests/cuda_backward_test.cpp index e5222ce..65a7ad9 100644 --- a/tests/cuda_backward_test.cpp +++ b/tests/cuda_backward_test.cpp @@ -189,23 +189,27 @@ TEST_F(CudaBackwardKernelTest, ProjectionJacobianBackward) { // Device data float *d_xyz_c = device_alloc(N * 3); - float *d_proj = device_alloc(16); float *d_J_grad_in = device_alloc(N * 6); float *d_xyz_c_grad_out = device_alloc(N * 3); + // Focal length and tan_fov derived from Identity-like proj where P00=1, P11=1, P32=1 + const float focal_x = 1.0f; + const float focal_y = 1.0f; + const float tan_fovx = 1.0f; + const float tan_fovy = 1.0f; + CUDA_CHECK(cudaMemcpy(d_xyz_c, h_xyz_c.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_proj, h_proj.data(), 16 * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_J_grad_in, h_J_grad_in.data(), N * 6 * sizeof(float), cudaMemcpyHostToDevice)); // Run kernel - compute_projection_jacobian_backward(d_xyz_c, d_proj, d_J_grad_in, N, d_xyz_c_grad_out); + compute_projection_jacobian_backward(d_xyz_c, focal_x, focal_y, tan_fovx, tan_fovy, d_J_grad_in, N, d_xyz_c_grad_out); CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaMemcpy(h_xyz_c_grad_out.data(), d_xyz_c_grad_out, N * 3 * sizeof(float), cudaMemcpyDeviceToHost)); // Numerical gradient check - auto forward_jacobian = [&](const std::vector &xyz_c, const std::vector &proj) { + auto forward_jacobian = [&](const std::vector &xyz_c) { std::vector J(N * 6); for (int i = 0; i < N; ++i) { float x = xyz_c[i * 3 + 0]; @@ -234,8 +238,8 @@ TEST_F(CudaBackwardKernelTest, ProjectionJacobianBackward) { xyz_c_p[i] += h; std::vector xyz_c_m = h_xyz_c; xyz_c_m[i] -= h; - auto J_p = forward_jacobian(xyz_c_p, h_proj); - auto J_m = forward_jacobian(xyz_c_m, h_proj); + auto J_p = forward_jacobian(xyz_c_p); + auto J_m = forward_jacobian(xyz_c_m); float numerical_grad = 0; for (int j = 0; j < N * 6; ++j) numerical_grad += (J_p[j] - J_m[j]) / (2 * h) * h_J_grad_in[j]; @@ -243,7 +247,6 @@ TEST_F(CudaBackwardKernelTest, ProjectionJacobianBackward) { } CUDA_CHECK(cudaFree(d_xyz_c)); - CUDA_CHECK(cudaFree(d_proj)); CUDA_CHECK(cudaFree(d_J_grad_in)); CUDA_CHECK(cudaFree(d_xyz_c_grad_out)); } From ee76784bf302606cd435f19c730b1888dd4adc27 Mon Sep 17 00:00:00 2001 From: Andrew Boessen Date: Sun, 7 Dec 2025 11:52:34 -0500 Subject: [PATCH 11/23] track done per scan line --- cuda/render.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cuda/render.cu b/cuda/render.cu index de94914..ed5215b 100644 --- a/cuda/render.cu +++ b/cuda/render.cu @@ -43,7 +43,7 @@ __global__ void render_tiles_kernel(const int num_tiles_x, const int num_tiles_y unsigned int any_active = 0xFFFFFFFF; int index_in_tile = 0; const int *splats_in_tile = &gaussian_idx_by_splat_idx[splat_idx_start]; - bool done = false; + bool done[PIXELS_PER_THREAD] = {false}; // Iterate on splats in the tile front to back for (; (index_in_tile < total_splats) && (any_active != 0); index_in_tile++) { @@ -70,12 +70,12 @@ __global__ void render_tiles_kernel(const int num_tiles_x, const int num_tiles_y const float power = fminf(0.0f, basic + linear * i + quad * i * i); float alpha = fminf(0.99f, opa * __expf(power)); - alpha = (alpha > 0.00392156862f) ? !done * alpha : 0.0f; + alpha = (alpha > 0.00392156862f) ? !done[i] * alpha : 0.0f; const float test_T = T[i] * (1.0f - alpha); - done = test_T < 0.0001f; + done[i] = test_T < 0.0001f; - any_active |= __ballot_sync(0xFFFFFFFF, !done); + any_active |= __ballot_sync(0xFFFFFFFF, !done[i]); const float weight = alpha * T[i]; @@ -84,7 +84,7 @@ __global__ void render_tiles_kernel(const int num_tiles_x, const int num_tiles_y accumulated_rgb[i].z += color.z * weight; T[i] = test_T; - num_splats[i] += !done; + num_splats[i] += !done[i]; } } From 34ebf2a51f1a24093a6405a49e2b917de0ac7e03 Mon Sep 17 00:00:00 2001 From: andrew Date: Mon, 8 Dec 2025 14:21:00 -0500 Subject: [PATCH 12/23] compute obb in conic kernel --- cuda/culling.cu | 87 ++++++++-------------------- cuda/gaussian.cu | 24 ++++++-- cuda/raster.cu | 10 ++-- include/gsplat_cuda/cuda_data.cuh | 1 + include/gsplat_cuda/cuda_forward.cuh | 15 ++--- tests/cuda_forward_test.cpp | 41 ++++++++----- 6 files changed, 86 insertions(+), 92 deletions(-) diff --git a/cuda/culling.cu b/cuda/culling.cu index b4c0d73..0c5fba3 100644 --- a/cuda/culling.cu +++ b/cuda/culling.cu @@ -89,37 +89,8 @@ split_axis_test(const float *__restrict__ obb, // [tl_x, tl_y, tr_x, tr_y return true; } -__device__ __forceinline__ float compute_obb(const float u, const float v, const float a, const float b, const float c, - const float mh_dist, float *obb) { - const float t_sum = a + c; - const float t_diff = a - c; - const float discriminant = t_diff * t_diff + 4.f * b * b; - const float root = sqrtf(discriminant); // Guaranteed non-negative - const float lambda1 = 0.5f * (t_sum + root); // Major eigenvalue - const float lambda2 = 0.5f * (t_sum - root); // Minor eigenvalue - - const float r_major = mh_dist * sqrtf(fmaxf(0.f, lambda1)); - const float r_minor = mh_dist * sqrtf(fmaxf(0.f, lambda2)); - - float cos_theta, sin_theta; - if (fabsf(root) < 1e-7f) { - // Handle the case of a circle (a=c, b=0), where rotation is arbitrary. - cos_theta = 1.f; - sin_theta = 0.f; - } else { - // Use half-angle trigonometric identities: - // cos^2(t) = (1 + cos(2t))/2, sin^2(t) = (1 - cos(2t))/2 - // where cos(2t) = (a-c)/root and sin(2t) = 2b/root. - const float inv_root = 1.f / root; - const float cos2theta = t_diff * inv_root; - - cos_theta = sqrtf(0.5f * (1.f + cos2theta)); - sin_theta = sqrtf(0.5f * (1.f - cos2theta)); - - // The sign of sin(theta) is the same as the sign of b. - sin_theta = copysignf(sin_theta, b); - } - +__device__ __forceinline__ void compute_obb(const float u, const float v, const float r_major, const float r_minor, + const float sin_theta, const float cos_theta, float *obb) { // Calculate the two orthogonal vectors defining the OBB's orientation and size const float v1_x = r_major * cos_theta; const float v1_y = r_major * sin_theta; @@ -135,9 +106,6 @@ __device__ __forceinline__ float compute_obb(const float u, const float v, const obb[5] = v - v1_y + v2_y; obb[6] = u + v1_x + v2_x; // Top-right corner obb[7] = v + v1_y + v2_y; - - // Return major axis radius - return r_major; } __device__ __forceinline__ int get_write_index(const bool write, const int lane, const unsigned int active_mask, @@ -170,9 +138,9 @@ __device__ __forceinline__ int warpSum(unsigned mask, int val) { return val; } -__global__ void coarse_binning_kernel(const float *__restrict__ uvs, const float *__restrict__ conic, - const float mh_dist, const int n_tiles_x, const int n_tiles_y, const int N, - int *buffer_size, int2 *pairs, int *global_index) { +__global__ void coarse_binning_kernel(const float *__restrict__ uvs, const float4 *__restrict__ radius, + const int n_tiles_x, const int n_tiles_y, const int N, int *buffer_size, + int2 *pairs, int *global_index) { const int gaussian_idx = blockIdx.x * blockDim.x + threadIdx.x; // mask active threads for warpSum @@ -183,13 +151,9 @@ __global__ void coarse_binning_kernel(const float *__restrict__ uvs, const float const float u = uvs[gaussian_idx * 2]; const float v = uvs[gaussian_idx * 2 + 1]; - const float a = conic[gaussian_idx * 3 + 0] + 0.3f; - const float b = conic[gaussian_idx * 3 + 1]; - const float c = conic[gaussian_idx * 3 + 2] + 0.3f; - float obb[8]; - const float radius = compute_obb(u, v, a, b, c, mh_dist, obb); - const int radius_tiles = ceilf(radius * 0.0625f) + 1; + const float r_major = radius[gaussian_idx].x; + const int radius_tiles = ceilf(r_major * 0.0625f) + 1; const int projected_tile_x = floorf(u / 16.0f); const int start_tile_x = max(0, projected_tile_x - radius_tiles); @@ -225,10 +189,10 @@ __global__ void coarse_binning_kernel(const float *__restrict__ uvs, const float } __global__ void generate_splats_kernel(const float *__restrict__ uvs, const float *__restrict__ xyz_camera_frame, - const float *__restrict__ conic, const int2 *__restrict__ pairs, - const float mh_dist, const int num_pairs, const int num_tiles_x, - const int num_tiles_y, const float max_z, int *gaussian_idx_by_splat_idx, - double *sort_keys, int *global_splat_counter) { + const float4 *__restrict__ radius, const int2 *__restrict__ pairs, + const int num_pairs, const int num_tiles_x, const int num_tiles_y, + const float max_z, int *gaussian_idx_by_splat_idx, double *sort_keys, + int *global_splat_counter) { int pair_id = blockIdx.x * blockDim.x + threadIdx.x; // Mask of all active threads @@ -246,13 +210,13 @@ __global__ void generate_splats_kernel(const float *__restrict__ uvs, const floa const float u = uvs[gaussian_idx * 2]; const float v = uvs[gaussian_idx * 2 + 1]; const double z = (double)(xyz_camera_frame[gaussian_idx * 3 + 2]); - const float a = conic[gaussian_idx * 3 + 0] + 0.3f; - const float b = conic[gaussian_idx * 3 + 1]; - const float c = conic[gaussian_idx * 3 + 2] + 0.3f; + const float r_major = radius[gaussian_idx].x; + const float r_minor = radius[gaussian_idx].y; + const float sin_theta = radius[gaussian_idx].z; + const float cos_theta = radius[gaussian_idx].w; float obb[8]; - const float radius = compute_obb(u, v, a, b, c, mh_dist, obb); - const int radius_tiles = ceilf(radius * 0.0625f) + 1; + compute_obb(u, v, r_major, r_minor, sin_theta, cos_theta, obb); const int tile_x = tile_idx % num_tiles_x; const int tile_y = tile_idx / num_tiles_x; @@ -347,12 +311,11 @@ struct copy_z_functor { } }; -void get_sorted_gaussian_list(const float *uv, const float *xyz, const float *conic, const int n_tiles_x, - const int n_tiles_y, const float mh_dist, const int N, size_t &sorted_gaussian_size, - int *sorted_gaussians, int *splat_start_end_idx_by_tile_idx, cudaStream_t stream) { +void get_sorted_gaussian_list(const float *uv, const float *xyz, const float4 *radius, const int n_tiles_x, + const int n_tiles_y, const int N, size_t &sorted_gaussian_size, int *sorted_gaussians, + int *splat_start_end_idx_by_tile_idx, cudaStream_t stream) { ASSERT_DEVICE_POINTER(uv); ASSERT_DEVICE_POINTER(xyz); - ASSERT_DEVICE_POINTER(conic); const int threads_per_block = 256; const int num_blocks = (N + threads_per_block - 1) / threads_per_block; @@ -365,7 +328,7 @@ void get_sorted_gaussian_list(const float *uv, const float *xyz, const float *co thrust::device_vector d_buffer_size(1, 0); coarse_binning_kernel<<>>( - uv, conic, mh_dist, n_tiles_x, n_tiles_y, N, thrust::raw_pointer_cast(d_buffer_size.data()), nullptr, nullptr); + uv, radius, n_tiles_x, n_tiles_y, N, thrust::raw_pointer_cast(d_buffer_size.data()), nullptr, nullptr); sorted_gaussian_size = d_buffer_size[0]; return; @@ -379,8 +342,8 @@ void get_sorted_gaussian_list(const float *uv, const float *xyz, const float *co // store pairs of gaussians and tiles thrust::device_vector d_pairs(sorted_gaussian_size); - coarse_binning_kernel<<>>(uv, conic, mh_dist, n_tiles_x, n_tiles_y, N, - nullptr, thrust::raw_pointer_cast(d_pairs.data()), + coarse_binning_kernel<<>>(uv, radius, n_tiles_x, n_tiles_y, N, nullptr, + thrust::raw_pointer_cast(d_pairs.data()), thrust::raw_pointer_cast(d_buffer_index.data())); assert(d_buffer_index[0] == sorted_gaussian_size); @@ -406,9 +369,9 @@ void get_sorted_gaussian_list(const float *uv, const float *xyz, const float *co const int num_blocks_pairs = (num_pairs + threads_per_block - 1) / threads_per_block; generate_splats_kernel<<>>( - uv, xyz, conic, thrust::raw_pointer_cast(d_pairs.data()), mh_dist, num_pairs, n_tiles_x, n_tiles_y, max_z, - sorted_gaussians, // Pass through the raw pointer from caller - thrust::raw_pointer_cast(d_sort_keys.data()), thrust::raw_pointer_cast(d_global_splat_counter.data())); + uv, xyz, radius, thrust::raw_pointer_cast(d_pairs.data()), num_pairs, n_tiles_x, n_tiles_y, max_z, + sorted_gaussians, thrust::raw_pointer_cast(d_sort_keys.data()), + thrust::raw_pointer_cast(d_global_splat_counter.data())); int num_splats = d_global_splat_counter[0]; // Device-to-host copy diff --git a/cuda/gaussian.cu b/cuda/gaussian.cu index 50cccb7..40d60c6 100644 --- a/cuda/gaussian.cu +++ b/cuda/gaussian.cu @@ -75,7 +75,8 @@ __global__ void compute_sigma_fused_kernel(const float *__restrict__ quaternion, } __global__ void compute_conic_kernel(const float *__restrict__ sigma, const float *__restrict__ view, - const float *__restrict__ J, const int N, float *conic) { + const float *__restrict__ J, const int N, const float mh_dist, float *conic, + float4 *radius) { constexpr int SIGMA_STRIDE = 6; constexpr int J_STRIDE = 6; constexpr int CONIC_STRIDE = 3; @@ -156,6 +157,21 @@ __global__ void compute_conic_kernel(const float *__restrict__ sigma, const floa conic[conic_base_idx + 0] = cov11 * inv_det; conic[conic_base_idx + 1] = -cov01 * inv_det; conic[conic_base_idx + 2] = cov00 * inv_det; + + // Eigenvalues + const float mid = 0.5f * (cov00 + cov11); + // Ensure the term inside sqrt is non-negative using max(0.1f, ...) + const float lambda_term = sqrt(max(0.1f, mid * mid - det)); + const float lambda1 = mid + lambda_term; + const float lambda2 = mid - lambda_term; + + const float r_major = ceil(mh_dist * sqrt(lambda1)); + const float r_minor = ceil(mh_dist * sqrt(lambda2)); + + float cos_theta, sin_theta; + sincosf(0.5f * atan2f(2.0f * cov01, cov00 - cov11), &sin_theta, &cos_theta); + + radius[i] = {r_major, r_minor, sin_theta, cos_theta}; } __global__ void compute_projection_jacobian_kernel(const float *__restrict__ xyz, const float *__restrict__ view, @@ -219,8 +235,8 @@ void compute_sigma(float *const quaternion, float *const scale, const int N, flo } void compute_conic(float *const xyz, const float *view, float *const sigma, const float focal_x, const float focal_y, - const float tan_fovx, const float tan_fovy, const int N, float *J, float *conic, - cudaStream_t stream) { + const float tan_fovx, const float tan_fovy, const float mh_dist, const int N, float *J, float *conic, + float4 *radius, cudaStream_t stream) { // Ensure all provided pointers are valid GPU device pointers. ASSERT_DEVICE_POINTER(xyz); ASSERT_DEVICE_POINTER(sigma); @@ -242,5 +258,5 @@ void compute_conic(float *const xyz, const float *view, float *const sigma, cons // This kernel uses the world-space covariance (sigma), the camera transform (View), // and the Jacobian (J) computed in the previous step to find the 2D conic. - compute_conic_kernel<<>>(sigma, view, J, N, conic); + compute_conic_kernel<<>>(sigma, view, J, N, mh_dist, conic, radius); } diff --git a/cuda/raster.cu b/cuda/raster.cu index 2aa7578..48a99a2 100644 --- a/cuda/raster.cu +++ b/cuda/raster.cu @@ -79,6 +79,7 @@ void rasterize_image(const int num_gaussians, const Camera &camera, const Config pass_data.d_sigma.resize(pass_data.num_culled * 9); pass_data.d_conic.resize(pass_data.num_culled * 3); pass_data.d_J.resize(pass_data.num_culled * 6); + pass_data.d_radius.resize(pass_data.num_culled); const float focal_x = camera.params[0]; const float focal_y = camera.params[1]; @@ -91,8 +92,9 @@ void rasterize_image(const int num_gaussians, const Camera &camera, const Config thrust::raw_pointer_cast(pass_data.d_sigma.data())); compute_conic( thrust::raw_pointer_cast(d_xyz_c_selected.data()), thrust::raw_pointer_cast(camera_parameters.d_view.data()), - thrust::raw_pointer_cast(pass_data.d_sigma.data()), focal_x, focal_y, tan_fovx, tan_fovy, pass_data.num_culled, - thrust::raw_pointer_cast(pass_data.d_J.data()), thrust::raw_pointer_cast(pass_data.d_conic.data())); + thrust::raw_pointer_cast(pass_data.d_sigma.data()), focal_x, focal_y, tan_fovx, tan_fovy, config.mh_dist, + pass_data.num_culled, thrust::raw_pointer_cast(pass_data.d_J.data()), + thrust::raw_pointer_cast(pass_data.d_conic.data()), thrust::raw_pointer_cast(pass_data.d_radius.data())); // Step 5: Sort Gaussians by tile const int n_tiles_x = (width + TILE_SIZE_FWD - 1) / TILE_SIZE_FWD; @@ -101,7 +103,7 @@ void rasterize_image(const int num_gaussians, const Camera &camera, const Config size_t sorted_gaussian_size = 0; get_sorted_gaussian_list(thrust::raw_pointer_cast(d_uv_selected.data()), thrust::raw_pointer_cast(d_xyz_c_selected.data()), - thrust::raw_pointer_cast(pass_data.d_conic.data()), n_tiles_x, n_tiles_y, config.mh_dist, + thrust::raw_pointer_cast(pass_data.d_radius.data()), n_tiles_x, n_tiles_y, pass_data.num_culled, sorted_gaussian_size, nullptr, nullptr); pass_data.d_splat_start_end_idx_by_tile_idx.resize(n_tiles + 1); @@ -109,7 +111,7 @@ void rasterize_image(const int num_gaussians, const Camera &camera, const Config get_sorted_gaussian_list( thrust::raw_pointer_cast(d_uv_selected.data()), thrust::raw_pointer_cast(d_xyz_c_selected.data()), - thrust::raw_pointer_cast(pass_data.d_conic.data()), n_tiles_x, n_tiles_y, config.mh_dist, pass_data.num_culled, + thrust::raw_pointer_cast(pass_data.d_radius.data()), n_tiles_x, n_tiles_y, pass_data.num_culled, sorted_gaussian_size, thrust::raw_pointer_cast(pass_data.d_sorted_gaussians.data()), thrust::raw_pointer_cast(pass_data.d_splat_start_end_idx_by_tile_idx.data())); diff --git a/include/gsplat_cuda/cuda_data.cuh b/include/gsplat_cuda/cuda_data.cuh index 909cd67..e5e98db 100644 --- a/include/gsplat_cuda/cuda_data.cuh +++ b/include/gsplat_cuda/cuda_data.cuh @@ -76,6 +76,7 @@ struct ForwardPassData { // Temporary buffers for processing thrust::device_vector d_uv, d_xyz_c; thrust::device_vector d_mask; + thrust::device_vector d_radius; // Buffers for sorting thrust::device_vector d_sorted_gaussians, d_splat_start_end_idx_by_tile_idx; diff --git a/include/gsplat_cuda/cuda_forward.cuh b/include/gsplat_cuda/cuda_forward.cuh index 3809453..ba89b32 100644 --- a/include/gsplat_cuda/cuda_forward.cuh +++ b/include/gsplat_cuda/cuda_forward.cuh @@ -16,14 +16,16 @@ inline constexpr int TILE_SIZE_FWD = 16; * @param[in] focal_y Camera focal length y * @param[in] tan_fovx 3D Gaussian covariance matrix * @param[in] tan_fovy 3D Gaussian covariance matrix + * @param[in] mh_dist Mahalanobis distance to define bounding box * @param[in] N The total number of points * @param[out] J A device pointer to ouput Jacobian * @param[out] conic A device pointer to output conic values + * @param[out] radius A device pointer to output major and minor radius with rotation * @param[in] stream The CUDA stream to execute kernel on */ void compute_conic(float *const xyz, const float *view, float *const sigma, const float focal_x, const float focal_y, - const float tan_fovx, const float tan_fovy, const int N, float *J, float *conic, - cudaStream_t stream = 0); + const float tan_fovx, const float tan_fovy, const float mh_dist, const int N, float *J, float *conic, + float4 *radius, cudaStream_t stream = 0); /** * @brief Compute covariance matrix of Gaussian from quaternion and scale vector @@ -78,19 +80,18 @@ void cull_gaussians(float *const uv, float *const xyz, const int N, const float * @brief Lanuches CUDA kernels to get gaussian tile intersections sorted by depth * @param[in] uv A device pointer to gaussian coordinates in image frame * @param[in] xyz A device pointer to 3D corrdinates of gaussians in camera perspective - * @param[in] conic A device pointer to 2D gaussian conic + * @param[in] radius A device pointer to major and minor radius with rotation * @param[in] n_tiles_x Number of tiles in image x axis * @param[in] n_tiles_y Number of tiles in image y axis - * @param[in] mh_dist Mahalanobis distance to define bounding box * @param[in] N The total number of points * @param[out] sorted_gaussian_bytes Pointer to store bytes to allocate for sorted_gaussians * @param[out] sorted_gaussians A device array to ouput gaussians sorted by z depth * @param[out] splat_start_end_idx_by_tile_idx A device array to index into sorted_gaussian by tile id * @param[in] stream The CUDA stream to execute kernel on */ -void get_sorted_gaussian_list(const float *uv, const float *xyz, const float *conic, const int n_tiles_x, - const int n_tiles_y, const float mh_dist, const int N, size_t &sorted_gaussian_bytes, - int *sorted_gaussians, int *splat_start_end_idx_by_tile_idx, cudaStream_t stream = 0); +void get_sorted_gaussian_list(const float *uv, const float *xyz, const float4 *radius, const int n_tiles_x, + const int n_tiles_y, const int N, size_t &sorted_gaussian_bytes, int *sorted_gaussians, + int *splat_start_end_idx_by_tile_idx, cudaStream_t stream = 0); /** * @brief Launches CUDA kernels to precompute spherical harmonic values and calculate rgb values diff --git a/tests/cuda_forward_test.cpp b/tests/cuda_forward_test.cpp index 2df40d7..21f6530 100644 --- a/tests/cuda_forward_test.cpp +++ b/tests/cuda_forward_test.cpp @@ -327,9 +327,11 @@ TEST_F(CudaKernelTest, ComputeConic) { // Host-side output buffers std::vector h_J(N * 6); std::vector h_conic(N * 3); + std::vector h_radius(N); // Device-side pointers float *d_xyz, *d_proj, *d_sigma, *d_view, *d_J, *d_conic; + float4 *d_radius; // Allocate memory on the device CUDA_CHECK(cudaMalloc(&d_xyz, h_xyz.size() * sizeof(float))); @@ -338,6 +340,7 @@ TEST_F(CudaKernelTest, ComputeConic) { CUDA_CHECK(cudaMalloc(&d_view, h_view.size() * sizeof(float))); CUDA_CHECK(cudaMalloc(&d_J, h_J.size() * sizeof(float))); CUDA_CHECK(cudaMalloc(&d_conic, h_conic.size() * sizeof(float))); + CUDA_CHECK(cudaMalloc(&d_radius, N * sizeof(float4))); // Copy input data from host to device CUDA_CHECK(cudaMemcpy(d_xyz, h_xyz.data(), h_xyz.size() * sizeof(float), cudaMemcpyHostToDevice)); @@ -346,11 +349,12 @@ TEST_F(CudaKernelTest, ComputeConic) { CUDA_CHECK(cudaMemcpy(d_view, h_view.data(), h_view.size() * sizeof(float), cudaMemcpyHostToDevice)); // Launch the function to be tested - compute_conic(d_xyz, d_view, d_sigma, 1.0f, 1.0f, 1.0f, 1.0f, N, d_J, d_conic); + compute_conic(d_xyz, d_view, d_sigma, 1.0f, 1.0f, 1.0f, 1.0f, 3.0f, N, d_J, d_conic, d_radius); CUDA_CHECK(cudaDeviceSynchronize()); // Copy result from device to host CUDA_CHECK(cudaMemcpy(h_conic.data(), d_conic, h_conic.size() * sizeof(float), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_radius.data(), d_radius, h_radius.size() * sizeof(float4), cudaMemcpyDeviceToHost)); // --- Calculate expected results on the host for verification --- const float x = h_xyz[0], y = h_xyz[1], z = h_xyz[2]; @@ -393,6 +397,13 @@ TEST_F(CudaKernelTest, ComputeConic) { ASSERT_NEAR(h_conic[i], expected_conic[i], 1e-5); } + for (int i = 0; i < N; i++) { + EXPECT_NEAR(h_radius[i].x, 3.0f, 1e-5); + EXPECT_NEAR(h_radius[i].y, 1.0f, 1e-5); + EXPECT_NEAR(h_radius[i].z, sqrt(0.8), 1e-5); + EXPECT_NEAR(h_radius[i].w, sqrt(0.2), 1e-5); + } + // Free device memory CUDA_CHECK(cudaFree(d_xyz)); CUDA_CHECK(cudaFree(d_proj)); @@ -433,30 +444,30 @@ TEST_F(CudaKernelTest, GetSortedGaussianList) { 0.0f, 0.0f, 5.0f // G2 }; // Conic parameters a,b,c. For a circle, b=0, a=c. Radius ~ mh_dist * sqrt(a). - // G0 & G2 radius = 4 => 3*sqrt(a)=4 => a=16/9 ~= 1.78 - // G1 radius = 6 => 3*sqrt(a)=6 => a=36/9 = 4 - const std::vector h_conic = { - 1.78f, 0.0f, 1.78f, // G0 - 4.0f, 0.0f, 4.0f, // G1 - 1.78f, 0.0f, 1.78f // G2 + // G0 & G2 radius = 4 + // G1 radius = 6 + const std::vector h_radius = { + {4.0f, 4.0f, 0.f, 1.f}, // G0 + {4.0f, 4.0f, 0.f, 1.f}, // G1 + {6.0f, 6.0f, 0.f, 1.f} // G2 }; // Device-side pointers - float *d_uvs, *d_xyz, *d_conic; + float *d_uvs, *d_xyz; + float4 *d_radius; int *d_sorted_gaussians, *d_splat_boundaries; // Allocate and copy inputs to device CUDA_CHECK(cudaMalloc(&d_uvs, h_uvs.size() * sizeof(float))); CUDA_CHECK(cudaMalloc(&d_xyz, h_xyz.size() * sizeof(float))); - CUDA_CHECK(cudaMalloc(&d_conic, h_conic.size() * sizeof(float))); + CUDA_CHECK(cudaMalloc(&d_radius, h_radius.size() * sizeof(float4))); CUDA_CHECK(cudaMemcpy(d_uvs, h_uvs.data(), h_uvs.size() * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_xyz, h_xyz.data(), h_xyz.size() * sizeof(float), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_conic, h_conic.data(), h_conic.size() * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_radius, h_radius.data(), h_radius.size() * sizeof(float4), cudaMemcpyHostToDevice)); // --- PASS 1: Get required buffer size --- size_t sorted_gaussian_bytes = 0; - get_sorted_gaussian_list(d_uvs, d_xyz, d_conic, n_tiles_x, n_tiles_y, mh_dist, N, sorted_gaussian_bytes, nullptr, - nullptr); + get_sorted_gaussian_list(d_uvs, d_xyz, d_radius, n_tiles_x, n_tiles_y, N, sorted_gaussian_bytes, nullptr, nullptr); CUDA_CHECK(cudaDeviceSynchronize()); // Expected splats: @@ -472,8 +483,8 @@ TEST_F(CudaKernelTest, GetSortedGaussianList) { CUDA_CHECK(cudaMalloc(&d_sorted_gaussians, sorted_gaussian_bytes * sizeof(int))); CUDA_CHECK(cudaMalloc(&d_splat_boundaries, (num_tiles + 1) * sizeof(int))); - get_sorted_gaussian_list(d_uvs, d_xyz, d_conic, n_tiles_x, n_tiles_y, mh_dist, N, sorted_gaussian_bytes, - d_sorted_gaussians, d_splat_boundaries); + get_sorted_gaussian_list(d_uvs, d_xyz, d_radius, n_tiles_x, n_tiles_y, N, sorted_gaussian_bytes, d_sorted_gaussians, + d_splat_boundaries); CUDA_CHECK(cudaDeviceSynchronize()); // --- Verification --- @@ -521,7 +532,7 @@ TEST_F(CudaKernelTest, GetSortedGaussianList) { // --- Cleanup --- CUDA_CHECK(cudaFree(d_uvs)); CUDA_CHECK(cudaFree(d_xyz)); - CUDA_CHECK(cudaFree(d_conic)); + CUDA_CHECK(cudaFree(d_radius)); CUDA_CHECK(cudaFree(d_sorted_gaussians)); CUDA_CHECK(cudaFree(d_splat_boundaries)); } From 134893b94d20a1f0562b631da8c9b882543e0619 Mon Sep 17 00:00:00 2001 From: andrew Date: Mon, 8 Dec 2025 15:16:35 -0500 Subject: [PATCH 13/23] remove learning rate decay --- config/base.yaml | 14 +++++++------- config/extended.yaml | 14 +++++++------- cuda/trainer.cu | 24 +++++++----------------- 3 files changed, 21 insertions(+), 31 deletions(-) diff --git a/config/base.yaml b/config/base.yaml index f50dac6..9cc60a4 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -12,13 +12,13 @@ cull_mask_padding: 100 num_iters: 7000 ssim_frac: 0.2 base_lr: 1e-3 -xyz_lr_multiplier_init: 2e-1 -xyz_lr_multiplier_final: 2e-3 -quat_lr_multiplier: 4.0 -scale_lr_multiplier: 10.0 -opacity_lr_multiplier: 50 -rgb_lr_multiplier: 5.0 -sh_lr_multiplier: 0.25 +xyz_lr_multiplier_init: 1.6e-1 +xyz_lr_multiplier_final: 1.6e-3 +quat_lr_multiplier: 1.0 +scale_lr_multiplier: 5.0 +opacity_lr_multiplier: 25 +rgb_lr_multiplier: 2.5 +sh_lr_multiplier: 0.125 test_eval_interval: 500 test_split_ratio: 9 use_background: true diff --git a/config/extended.yaml b/config/extended.yaml index c233e81..32c6296 100644 --- a/config/extended.yaml +++ b/config/extended.yaml @@ -12,13 +12,13 @@ cull_mask_padding: 100 num_iters: 30000 ssim_frac: 0.2 base_lr: 1e-3 -xyz_lr_multiplier_init: 2e-1 -xyz_lr_multiplier_final: 2e-3 -quat_lr_multiplier: 4.0 -scale_lr_multiplier: 10.0 -opacity_lr_multiplier: 50 -rgb_lr_multiplier: 5.0 -sh_lr_multiplier: 0.25 +xyz_lr_multiplier_init: 1.6e-1 +xyz_lr_multiplier_final: 1.6e-3 +quat_lr_multiplier: 1.0 +scale_lr_multiplier: 5.0 +opacity_lr_multiplier: 25 +rgb_lr_multiplier: 2.5 +sh_lr_multiplier: 0.125 test_eval_interval: 500 test_split_ratio: 9 use_background: true diff --git a/cuda/trainer.cu b/cuda/trainer.cu index efa3287..034b7f4 100644 --- a/cuda/trainer.cu +++ b/cuda/trainer.cu @@ -879,29 +879,22 @@ void TrainerImpl::optimizer_step(ForwardPassData pass_data) { const float xyz_decay_factor = pow((config.xyz_lr_multiplier_final / config.xyz_lr_multiplier_init), ((float)iter / (float)config.num_iters)); - // Generic decay for other parameters (approx 100x reduction over training) - const float general_decay_factor = pow(0.01f, ((float)iter / (float)config.num_iters)); - adam_step(thrust::raw_pointer_cast(d_xyz.data()), thrust::raw_pointer_cast(cuda.gradients.d_grad_xyz.data()), thrust::raw_pointer_cast(d_m_xyz.data()), thrust::raw_pointer_cast(d_v_xyz.data()), scene_extent * config.base_lr * config.xyz_lr_multiplier_init * xyz_decay_factor, B1, B2, EPS, bias1, bias2, pass_data.num_culled, 3); adam_step(thrust::raw_pointer_cast(d_rgb.data()), thrust::raw_pointer_cast(cuda.gradients.d_grad_rgb.data()), thrust::raw_pointer_cast(d_m_rgb.data()), thrust::raw_pointer_cast(d_v_rgb.data()), - config.base_lr * config.rgb_lr_multiplier * general_decay_factor, B1, B2, EPS, bias1, bias2, - pass_data.num_culled, 3); + config.base_lr * config.rgb_lr_multiplier, B1, B2, EPS, bias1, bias2, pass_data.num_culled, 3); adam_step(thrust::raw_pointer_cast(d_op.data()), thrust::raw_pointer_cast(cuda.gradients.d_grad_opacity.data()), thrust::raw_pointer_cast(d_m_op.data()), thrust::raw_pointer_cast(d_v_op.data()), - config.base_lr * config.opacity_lr_multiplier * general_decay_factor, B1, B2, EPS, bias1, bias2, - pass_data.num_culled, 1); + config.base_lr * config.opacity_lr_multiplier, B1, B2, EPS, bias1, bias2, pass_data.num_culled, 1); adam_step(thrust::raw_pointer_cast(d_scale.data()), thrust::raw_pointer_cast(cuda.gradients.d_grad_scale.data()), thrust::raw_pointer_cast(d_m_scale.data()), thrust::raw_pointer_cast(d_v_scale.data()), - config.base_lr * config.scale_lr_multiplier * general_decay_factor, B1, B2, EPS, bias1, bias2, - pass_data.num_culled, 3); + config.base_lr * config.scale_lr_multiplier, B1, B2, EPS, bias1, bias2, pass_data.num_culled, 3); adam_step(thrust::raw_pointer_cast(d_quat.data()), thrust::raw_pointer_cast(cuda.gradients.d_grad_quaternion.data()), thrust::raw_pointer_cast(d_m_quat.data()), thrust::raw_pointer_cast(d_v_quat.data()), - config.base_lr * config.quat_lr_multiplier * general_decay_factor, B1, B2, EPS, bias1, bias2, - pass_data.num_culled, 4); + config.base_lr * config.quat_lr_multiplier, B1, B2, EPS, bias1, bias2, pass_data.num_culled, 4); scatter_masked_array<3>(d_m_xyz, pass_data.d_mask, cuda.optimizer.m_grad_xyz); scatter_masked_array<3>(d_m_rgb, pass_data.d_mask, cuda.optimizer.m_grad_rgb); @@ -936,8 +929,7 @@ void TrainerImpl::optimizer_step(ForwardPassData pass_data) { adam_step(thrust::raw_pointer_cast(d_sh.data()), thrust::raw_pointer_cast(cuda.gradients.d_grad_sh.data()), thrust::raw_pointer_cast(d_m_sh.data()), thrust::raw_pointer_cast(d_v_sh.data()), - config.base_lr * config.sh_lr_multiplier * general_decay_factor, B1, B2, EPS, bias1, bias2, - pass_data.num_culled, 9); + config.base_lr * config.sh_lr_multiplier, B1, B2, EPS, bias1, bias2, pass_data.num_culled, 9); scatter_masked_array<9>(d_m_sh, pass_data.d_mask, cuda.optimizer.m_grad_sh); scatter_masked_array<9>(d_v_sh, pass_data.d_mask, cuda.optimizer.v_grad_sh); @@ -949,8 +941,7 @@ void TrainerImpl::optimizer_step(ForwardPassData pass_data) { d_v_sh = compact_masked_array<24>(cuda.optimizer.v_grad_sh, pass_data.d_mask, pass_data.num_culled); adam_step(thrust::raw_pointer_cast(d_sh.data()), thrust::raw_pointer_cast(cuda.gradients.d_grad_sh.data()), thrust::raw_pointer_cast(d_m_sh.data()), thrust::raw_pointer_cast(d_v_sh.data()), - config.base_lr * config.sh_lr_multiplier * general_decay_factor, B1, B2, EPS, bias1, bias2, - pass_data.num_culled, 24); + config.base_lr * config.sh_lr_multiplier, B1, B2, EPS, bias1, bias2, pass_data.num_culled, 24); scatter_masked_array<24>(d_m_sh, pass_data.d_mask, cuda.optimizer.m_grad_sh); scatter_masked_array<24>(d_v_sh, pass_data.d_mask, cuda.optimizer.v_grad_sh); @@ -962,8 +953,7 @@ void TrainerImpl::optimizer_step(ForwardPassData pass_data) { d_v_sh = compact_masked_array<45>(cuda.optimizer.v_grad_sh, pass_data.d_mask, pass_data.num_culled); adam_step(thrust::raw_pointer_cast(d_sh.data()), thrust::raw_pointer_cast(cuda.gradients.d_grad_sh.data()), thrust::raw_pointer_cast(d_m_sh.data()), thrust::raw_pointer_cast(d_v_sh.data()), - config.base_lr * config.sh_lr_multiplier * general_decay_factor, B1, B2, EPS, bias1, bias2, - pass_data.num_culled, 45); + config.base_lr * config.sh_lr_multiplier, B1, B2, EPS, bias1, bias2, pass_data.num_culled, 45); scatter_masked_array<45>(d_m_sh, pass_data.d_mask, cuda.optimizer.m_grad_sh); scatter_masked_array<45>(d_v_sh, pass_data.d_mask, cuda.optimizer.v_grad_sh); scatter_masked_array<45>(d_sh, pass_data.d_mask, cuda.gaussians.d_sh); From 5bd5234011806d36aad7d633ba04b6a6a4c3f0da Mon Sep 17 00:00:00 2001 From: Andrew Boessen Date: Mon, 8 Dec 2025 16:06:58 -0500 Subject: [PATCH 14/23] fix sigma grad --- cuda/gaussian_backward.cu | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/cuda/gaussian_backward.cu b/cuda/gaussian_backward.cu index d727a95..a636bb2 100644 --- a/cuda/gaussian_backward.cu +++ b/cuda/gaussian_backward.cu @@ -170,6 +170,8 @@ __global__ void conic_backward_kernel(const float *__restrict__ J, const float * const float c01 = conic[conic_base_idx + 1]; const float c11 = conic[conic_base_idx + 2]; + // Compute dSigma_prime = - C * dC * C + // T = C * dC // Compute dSigma_prime = - C * dC * C // T = C * dC const float t00 = c00 * dc00_out + c01 * dc01_out; @@ -325,16 +327,18 @@ __global__ void sigma_backward_kernel(const float *__restrict__ q, const float * // --- 2. Backpropagate --- // Load dSigma and reconstruct the full symmetric matrix + // Load dSigma and reconstruct the full symmetric matrix + // Factor 0.5 for off-diagonal terms when converting from variable derivative to matrix derivative float dSigma[9]; - dSigma[0] = dSigma_in[idx * 6 + 0]; // xx - dSigma[1] = dSigma_in[idx * 6 + 1]; // xy - dSigma[2] = dSigma_in[idx * 6 + 2]; // xz - dSigma[3] = dSigma_in[idx * 6 + 1]; // yx = xy - dSigma[4] = dSigma_in[idx * 6 + 3]; // yy - dSigma[5] = dSigma_in[idx * 6 + 4]; // yz - dSigma[6] = dSigma_in[idx * 6 + 2]; // zx = xz - dSigma[7] = dSigma_in[idx * 6 + 4]; // zy = yz - dSigma[8] = dSigma_in[idx * 6 + 5]; // zz + dSigma[0] = dSigma_in[idx * 6 + 0]; // xx + dSigma[1] = 0.5f * dSigma_in[idx * 6 + 1]; // xy + dSigma[2] = 0.5f * dSigma_in[idx * 6 + 2]; // xz + dSigma[3] = 0.5f * dSigma_in[idx * 6 + 1]; // yx = xy + dSigma[4] = dSigma_in[idx * 6 + 3]; // yy + dSigma[5] = 0.5f * dSigma_in[idx * 6 + 4]; // yz + dSigma[6] = 0.5f * dSigma_in[idx * 6 + 2]; // zx = xz + dSigma[7] = 0.5f * dSigma_in[idx * 6 + 4]; // zy = yz + dSigma[8] = dSigma_in[idx * 6 + 5]; // zz // dM = 2 * dSigma * M float dM[9]; @@ -404,10 +408,10 @@ __global__ void sigma_backward_kernel(const float *__restrict__ q, const float * // The gradient of the norm is zero for directions orthogonal to the vector. // We subtract the parallel component (the projection) and scale by the inverse norm. - dQ_in[idx * 4 + 0] = inv_norm * 0.5f * (d_norm_q[0] - dot * w); - dQ_in[idx * 4 + 1] = inv_norm * 0.5f * (d_norm_q[1] - dot * x); - dQ_in[idx * 4 + 2] = inv_norm * 0.5f * (d_norm_q[2] - dot * y); - dQ_in[idx * 4 + 3] = inv_norm * 0.5f * (d_norm_q[3] - dot * z); + dQ_in[idx * 4 + 0] = inv_norm * (d_norm_q[0] - dot * w); + dQ_in[idx * 4 + 1] = inv_norm * (d_norm_q[1] - dot * x); + dQ_in[idx * 4 + 2] = inv_norm * (d_norm_q[2] - dot * y); + dQ_in[idx * 4 + 3] = inv_norm * (d_norm_q[3] - dot * z); } void compute_sigma_backward(const float *const quaternion, const float *const scale, const float *const sigma_grad_out, From b515414f09a27c6f4f5ce59210baae4ba45caf88 Mon Sep 17 00:00:00 2001 From: andrew Date: Mon, 8 Dec 2025 19:22:14 -0500 Subject: [PATCH 15/23] correct splat count --- cuda/render.cu | 2 +- cuda/render_backward.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda/render.cu b/cuda/render.cu index ed5215b..9df94d2 100644 --- a/cuda/render.cu +++ b/cuda/render.cu @@ -67,6 +67,7 @@ __global__ void render_tiles_kernel(const int num_tiles_x, const int num_tiles_y any_active = 0; for (int i = 0; i < PIXELS_PER_THREAD; i++) { + num_splats[i] += !done[i]; const float power = fminf(0.0f, basic + linear * i + quad * i * i); float alpha = fminf(0.99f, opa * __expf(power)); @@ -84,7 +85,6 @@ __global__ void render_tiles_kernel(const int num_tiles_x, const int num_tiles_y accumulated_rgb[i].z += color.z * weight; T[i] = test_T; - num_splats[i] += !done[i]; } } diff --git a/cuda/render_backward.cu b/cuda/render_backward.cu index de5835b..092fd6b 100644 --- a/cuda/render_backward.cu +++ b/cuda/render_backward.cu @@ -121,7 +121,7 @@ __global__ void render_tiles_backward_kernel( // Mask out low alpha and depth bool valid_splat = valid_pixel; valid_splat &= (alpha >= 0.00392156862f); - valid_splat &= (index_in_tile <= _splats_per_pixel[i][threadIdx.y * blockDim.x + threadIdx.x]); + valid_splat &= (index_in_tile < _splats_per_pixel[i][threadIdx.y * blockDim.x + threadIdx.x]); const unsigned int valid_mask = __any_sync(0xFFFFFFFF, valid_splat); From 32f749f35ff7c0a9ecf97fc2cac7ff2cfd044b2f Mon Sep 17 00:00:00 2001 From: Andrew Boessen Date: Mon, 8 Dec 2025 20:12:07 -0500 Subject: [PATCH 16/23] add position grad to sh --- cuda/spherical_harmonics_backward.cu | 100 +++++++++++++++++++++++--- cuda/trainer.cu | 30 ++++++-- include/gsplat_cuda/cuda_backward.cuh | 7 +- tests/cuda_backward_test.cpp | 71 ++++++++++++------ 4 files changed, 172 insertions(+), 36 deletions(-) diff --git a/cuda/spherical_harmonics_backward.cu b/cuda/spherical_harmonics_backward.cu index 2dba326..4f726de 100644 --- a/cuda/spherical_harmonics_backward.cu +++ b/cuda/spherical_harmonics_backward.cu @@ -5,8 +5,10 @@ #include "sphericart_cuda.hpp" #include -__global__ void compute_sh_gradients_kernel(const float *d_sph, const float *rgb_grad_out, const int n_coeffs, - const int N, float *sh_grad_in, float *sh_grad_band_0_in) { +__global__ void compute_sh_gradients_kernel(const float *d_sph, const float *d_dsph, const float *d_rgb_vals, + const float *d_sh_coeffs, const float *rgb_grad_out, const int n_coeffs, + const int N, float *sh_grad_in, float *sh_grad_band_0_in, + float *xyz_c_grad_in) { // Determine the unique index for this thread, corresponding to a single point/Gaussian. int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= N) { @@ -15,10 +17,31 @@ __global__ void compute_sh_gradients_kernel(const float *d_sph, const float *rgb // Set up pointers to the data for the current point. const float *point_sph_vals = d_sph + idx * n_coeffs; + const float *point_dsph_vals = d_dsph + idx * n_coeffs * 3; + // Band 0 coeffs (RGB) are stored separately + const float *point_rgb_vals = d_rgb_vals + idx * 3; + + // Higher order SH coeffs. + // Note: if n_coeffs > 1, sh_coeffs stores (n_coeffs - 1) * 3 floats per gaussian. + const float *point_sh_coeffs = nullptr; + if (n_coeffs > 1) { + point_sh_coeffs = d_sh_coeffs + idx * (n_coeffs - 1) * 3; + } + const float *point_rgb_grad = rgb_grad_out + idx * 3; // Pointer for the new band 0 gradient output float *point_sh_grad_band_0 = sh_grad_band_0_in + idx * 3; + float dR_dx = 0.0f; + float dG_dx = 0.0f; + float dB_dx = 0.0f; + float dR_dy = 0.0f; + float dG_dy = 0.0f; + float dB_dy = 0.0f; + float dR_dz = 0.0f; + float dG_dz = 0.0f; + float dB_dz = 0.0f; + // --- Gradient for Band 0 Coefficients --- // The gradient for the band 0 coefficient is simply the incoming gradient // from the logit, as its derivative in the forward pass is 1. @@ -26,6 +49,29 @@ __global__ void compute_sh_gradients_kernel(const float *d_sph, const float *rgb point_sh_grad_band_0[1] = point_rgb_grad[1] * point_sph_vals[0]; point_sh_grad_band_0[2] = point_rgb_grad[2] * point_sph_vals[0]; + // d_dsph layout: [n_coeffs, 3] (x, y, z) + // idx * n_coeffs * 3 + i * 3 + axis + float d_Y0_dx = point_dsph_vals[0 * 3 + 0]; + float d_Y0_dy = point_dsph_vals[0 * 3 + 1]; + float d_Y0_dz = point_dsph_vals[0 * 3 + 2]; + + // Band 0 coeffs + float R0 = point_rgb_vals[0]; + float G0 = point_rgb_vals[1]; + float B0 = point_rgb_vals[2]; + + dR_dx += d_Y0_dx * R0; + dG_dx += d_Y0_dx * G0; + dB_dx += d_Y0_dx * B0; + + dR_dy += d_Y0_dy * R0; + dG_dy += d_Y0_dy * G0; + dB_dy += d_Y0_dy * B0; + + dR_dz += d_Y0_dz * R0; + dG_dz += d_Y0_dz * G0; + dB_dz += d_Y0_dz * B0; + // --- Gradients for Higher-Order Coefficients (l > 0) --- // The chain rule: dL/d(coeff) = dL/d(logit) * d(logit)/d(coeff). // For higher-order bands, d(logit)/d(coeff) is the corresponding sh_val. @@ -36,17 +82,55 @@ __global__ void compute_sh_gradients_kernel(const float *d_sph, const float *rgb point_sh_grad[i * 3 + 0] = point_rgb_grad[0] * sh_val; // Gradient for Red point_sh_grad[i * 3 + 1] = point_rgb_grad[1] * sh_val; // Gradient for Green point_sh_grad[i * 3 + 2] = point_rgb_grad[2] * sh_val; // Gradient for Blue + + // Gradient w.r.t Position + int coeff_idx = i + 1; + float d_Yi_dx = point_dsph_vals[coeff_idx * 3 + 0]; + float d_Yi_dy = point_dsph_vals[coeff_idx * 3 + 1]; + float d_Yi_dz = point_dsph_vals[coeff_idx * 3 + 2]; + + float Ri = point_sh_coeffs[i * 3 + 0]; + float Gi = point_sh_coeffs[i * 3 + 1]; + float Bi = point_sh_coeffs[i * 3 + 2]; + + dR_dx += d_Yi_dx * Ri; + dG_dx += d_Yi_dx * Gi; + dB_dx += d_Yi_dx * Bi; + + dR_dy += d_Yi_dy * Ri; + dG_dy += d_Yi_dy * Gi; + dB_dy += d_Yi_dy * Bi; + + dR_dz += d_Yi_dz * Ri; + dG_dz += d_Yi_dz * Gi; + dB_dz += d_Yi_dz * Bi; } } + + // Accumulate total gradient w.r.t xyz_c + // dL/d(xyz) = dL/dR * dR/d(xyz) + dL/dG * dG/d(xyz) + dL/dB * dB/d(xyz) + float total_grad_x = point_rgb_grad[0] * dR_dx + point_rgb_grad[1] * dG_dx + point_rgb_grad[2] * dB_dx; + float total_grad_y = point_rgb_grad[0] * dR_dy + point_rgb_grad[1] * dG_dy + point_rgb_grad[2] * dB_dy; + float total_grad_z = point_rgb_grad[0] * dR_dz + point_rgb_grad[1] * dG_dz + point_rgb_grad[2] * dB_dz; + + xyz_c_grad_in[idx * 3 + 0] += total_grad_x; + xyz_c_grad_in[idx * 3 + 1] += total_grad_y; + xyz_c_grad_in[idx * 3 + 2] += total_grad_z; } -void precompute_spherical_harmonics_backward(const float *const xyz_c, const float *const rgb_grad_out, const int l_max, - const int N, float *sh_grad_in, float *sh_grad_band_0_in, - cudaStream_t stream) { +void precompute_spherical_harmonics_backward(const float *const xyz_c, const float *const rgb_vals, + const float *const sh_coeffs, const float *const rgb_grad_out, + const int l_max, const int N, float *sh_grad_in, float *sh_grad_band_0_in, + float *xyz_c_grad_in, cudaStream_t stream) { ASSERT_DEVICE_POINTER(xyz_c); + ASSERT_DEVICE_POINTER(rgb_vals); + if (l_max > 0) + ASSERT_DEVICE_POINTER(sh_coeffs); ASSERT_DEVICE_POINTER(rgb_grad_out); ASSERT_DEVICE_POINTER(sh_grad_band_0_in); - ASSERT_DEVICE_POINTER(sh_grad_in); + if (l_max > 0) + ASSERT_DEVICE_POINTER(sh_grad_in); + ASSERT_DEVICE_POINTER(xyz_c_grad_in); // Initialize the sphericart calculator for the given maximum degree. sphericart::cuda::SphericalHarmonics calculator_cuda(l_max); @@ -70,6 +154,6 @@ void precompute_spherical_harmonics_backward(const float *const xyz_c, const flo // Launch the kernel to compute the final SH coefficient gradients. compute_sh_gradients_kernel<<>>( - thrust::raw_pointer_cast(d_sph.data()), // Pass raw pointer - rgb_grad_out, n_coeffs, N, sh_grad_in, sh_grad_band_0_in); + thrust::raw_pointer_cast(d_sph.data()), thrust::raw_pointer_cast(d_dsph.data()), rgb_vals, sh_coeffs, + rgb_grad_out, n_coeffs, N, sh_grad_in, sh_grad_band_0_in, xyz_c_grad_in); } diff --git a/cuda/trainer.cu b/cuda/trainer.cu index 034b7f4..2b16f16 100644 --- a/cuda/trainer.cu +++ b/cuda/trainer.cu @@ -795,6 +795,24 @@ float TrainerImpl::backward_pass(const Image &curr_image, const Camera &curr_cam compact_masked_array<4>(cuda.gaussians.d_quaternion, pass_data.d_mask, pass_data.num_culled); auto d_scale_selected = compact_masked_array<3>(cuda.gaussians.d_scale, pass_data.d_mask, pass_data.num_culled); auto d_xyz_selected = compact_masked_array<3>(cuda.gaussians.d_xyz, pass_data.d_mask, pass_data.num_culled); + auto d_rgb_selected = compact_masked_array<3>(cuda.gaussians.d_rgb, pass_data.d_mask, pass_data.num_culled); + thrust::device_vector d_sh_selected; + switch (l_max) { + case 0: + break; + case 1: + d_sh_selected = compact_masked_array<9>(cuda.gaussians.d_sh, pass_data.d_mask, pass_data.num_culled); + break; + case 2: + d_sh_selected = compact_masked_array<24>(cuda.gaussians.d_sh, pass_data.d_mask, pass_data.num_culled); + break; + case 3: + d_sh_selected = compact_masked_array<45>(cuda.gaussians.d_sh, pass_data.d_mask, pass_data.num_culled); + break; + default: + fprintf(stderr, "Error SH band is invalid\n"); + exit(EXIT_FAILURE); + } render_image_backward( thrust::raw_pointer_cast(d_uv_selected.data()), thrust::raw_pointer_cast(d_opacity_selected.data()), @@ -808,11 +826,13 @@ float TrainerImpl::backward_pass(const Image &curr_image, const Camera &curr_cam thrust::raw_pointer_cast(cuda.gradients.d_grad_uv.data()), thrust::raw_pointer_cast(cuda.gradients.d_grad_conic.data())); - precompute_spherical_harmonics_backward(thrust::raw_pointer_cast(d_xyz_c_selected.data()), - thrust::raw_pointer_cast(cuda.gradients.d_grad_precompute_rgb.data()), l_max, - pass_data.num_culled, - thrust::raw_pointer_cast(cuda.gradients.d_grad_sh.data()), - thrust::raw_pointer_cast(cuda.gradients.d_grad_rgb.data())); + precompute_spherical_harmonics_backward( + thrust::raw_pointer_cast(d_xyz_c_selected.data()), thrust::raw_pointer_cast(d_rgb_selected.data()), + thrust::raw_pointer_cast(d_sh_selected.data()), + thrust::raw_pointer_cast(cuda.gradients.d_grad_precompute_rgb.data()), l_max, pass_data.num_culled, + thrust::raw_pointer_cast(cuda.gradients.d_grad_sh.data()), + thrust::raw_pointer_cast(cuda.gradients.d_grad_rgb.data()), + thrust::raw_pointer_cast(cuda.gradients.d_grad_xyz_c.data())); compute_conic_backward( thrust::raw_pointer_cast(pass_data.d_J.data()), thrust::raw_pointer_cast(pass_data.d_sigma.data()), thrust::raw_pointer_cast(cuda.camera.d_view.data()), thrust::raw_pointer_cast(pass_data.d_conic.data()), diff --git a/include/gsplat_cuda/cuda_backward.cuh b/include/gsplat_cuda/cuda_backward.cuh index 0dd3801..3d64c9e 100644 --- a/include/gsplat_cuda/cuda_backward.cuh +++ b/include/gsplat_cuda/cuda_backward.cuh @@ -86,9 +86,10 @@ void compute_sigma_backward(const float *const quaternion, const float *const sc * @param[out] sh_grad_band_0_in Spherical harmonic gradients * @param[in] stream The CUDA stream to execute the kernel on. */ -void precompute_spherical_harmonics_backward(const float *const xyz_c, const float *const rgb_grad_out, const int l_max, - const int N, float *sh_grad_in, float *sh_grad_band_0_in, - cudaStream_t stream = 0); +void precompute_spherical_harmonics_backward(const float *const xyz_c, const float *const rgb_vals, + const float *const sh_coeffs, const float *const rgb_grad_out, + const int l_max, const int N, float *sh_grad_in, float *sh_grad_band_0_in, + float *xyz_c_grad_in, cudaStream_t stream = 0); /** * @brief Launch the CUDA kernel to compute rendering gradients. diff --git a/tests/cuda_backward_test.cpp b/tests/cuda_backward_test.cpp index 65a7ad9..44419eb 100644 --- a/tests/cuda_backward_test.cpp +++ b/tests/cuda_backward_test.cpp @@ -534,31 +534,50 @@ TEST_F(CudaBackwardKernelTest, SphericalHarmonicsBackward) { // Host data std::vector h_xyz_c = {0.5f, -0.3f, 0.8124f}; // Roughly normalized vector std::vector h_rgb_grad_out = {0.1f, -0.2f, 0.3f}; - std::vector h_sh_coeffs(N * n_coeffs * 3); - for (int i = 0; i < h_sh_coeffs.size(); ++i) { - h_sh_coeffs[i] = (i % 10) * 0.05f - 0.2f; // Some arbitrary initial values + + std::vector h_rgb_vals(N * 3); + std::vector h_sh_rest(N * (n_coeffs - 1) * 3); + + // Fill them similarly to before for consistency in checking + for (int i = 0; i < N * 3; ++i) { + h_rgb_vals[i] = 0.5f; // Band 0 values } + for (int i = 0; i < h_sh_rest.size(); ++i) { + h_sh_rest[i] = (i % 10) * 0.05f - 0.2f; + } + std::vector h_sh_grad_in(N * n_coeffs * 3); + std::vector h_xyz_c_grad_in(N * 3); // Device data auto d_xyz_c = device_alloc(N * 3); auto d_rgb_grad_out = device_alloc(N * 3); + auto d_rgb_vals = device_alloc(N * 3); + auto d_sh_rest = device_alloc(N * (n_coeffs - 1) * 3); + auto d_sh_grad_in = device_alloc(N * (n_coeffs - 1) * 3); auto d_band_0_grad = device_alloc(N * 3); + auto d_xyz_c_grad_in = device_alloc(N * 3); CUDA_CHECK(cudaMemcpy(d_xyz_c, h_xyz_c.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_rgb_grad_out, h_rgb_grad_out.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_rgb_vals, h_rgb_vals.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_sh_rest, h_sh_rest.data(), N * (n_coeffs - 1) * 3 * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemset(d_xyz_c_grad_in, 0, N * 3 * sizeof(float))); // Run kernel - precompute_spherical_harmonics_backward(d_xyz_c, d_rgb_grad_out, l_max, N, d_sh_grad_in, d_band_0_grad); + precompute_spherical_harmonics_backward(d_xyz_c, d_rgb_vals, d_sh_rest, d_rgb_grad_out, l_max, N, d_sh_grad_in, + d_band_0_grad, d_xyz_c_grad_in); CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaMemcpy(h_sh_grad_in.data() + N * 3, d_sh_grad_in, N * (n_coeffs - 1) * 3 * sizeof(float), cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(h_sh_grad_in.data(), d_band_0_grad, N * 3 * sizeof(float), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_xyz_c_grad_in.data(), d_xyz_c_grad_in, N * 3 * sizeof(float), cudaMemcpyDeviceToHost)); // Numerical gradient check - auto forward_sh_rgb = [&](const std::vector &sh_coeffs, const std::vector &xyz_c) { + auto forward_sh_rgb = [&](const std::vector &rgb_vals, const std::vector &sh_rest, + const std::vector &xyz_c) { std::vector logits(N * 3, 0.0f); std::vector sh_vals(n_coeffs); @@ -569,7 +588,7 @@ TEST_F(CudaBackwardKernelTest, SphericalHarmonicsBackward) { float norm = std::sqrt(x_ * x_ + y_ * y_ + z_ * z_) + 1e-8f; float x = x_ / norm, y = y_ / norm, z = z_ / norm; - // Real Spherical Harmonics basis functions (matches sphericart convention) + // Real Spherical Harmonics basis functions const float C0 = 0.28209479177387814f; const float C1 = 0.4886025119029199f; const float C2 = 1.0925484305920792f; @@ -586,11 +605,18 @@ TEST_F(CudaBackwardKernelTest, SphericalHarmonicsBackward) { sh_vals[7] = C2 * x * z; sh_vals[8] = C4 * (x * x - y * y); - const float *point_sh_coeffs = &sh_coeffs[i * n_coeffs * 3]; - for (int j = 0; j < n_coeffs; ++j) { - logits[i * 3 + 0] += point_sh_coeffs[j * 3 + 0] * sh_vals[j]; - logits[i * 3 + 1] += point_sh_coeffs[j * 3 + 1] * sh_vals[j]; - logits[i * 3 + 2] += point_sh_coeffs[j * 3 + 2] * sh_vals[j]; + // Band 0 + logits[i * 3 + 0] += rgb_vals[i * 3 + 0] * sh_vals[0] + 0.5f; + logits[i * 3 + 1] += rgb_vals[i * 3 + 1] * sh_vals[0] + 0.5f; + logits[i * 3 + 2] += rgb_vals[i * 3 + 2] * sh_vals[0] + 0.5f; + + // Higher Bands + const float *point_sh_rest = &sh_rest[i * (n_coeffs - 1) * 3]; + for (int j = 1; j < n_coeffs; ++j) { + int idx_in_rest = (j - 1); + logits[i * 3 + 0] += point_sh_rest[idx_in_rest * 3 + 0] * sh_vals[j]; + logits[i * 3 + 1] += point_sh_rest[idx_in_rest * 3 + 1] * sh_vals[j]; + logits[i * 3 + 2] += point_sh_rest[idx_in_rest * 3 + 2] * sh_vals[j]; } } return logits; @@ -604,27 +630,32 @@ TEST_F(CudaBackwardKernelTest, SphericalHarmonicsBackward) { return loss; }; - // Check grad w.r.t sh_coeffs - for (int i = 0; i < N * n_coeffs * 3; ++i) { - std::vector sh_coeffs_p = h_sh_coeffs; - sh_coeffs_p[i] += h; - std::vector sh_coeffs_m = h_sh_coeffs; - sh_coeffs_m[i] -= h; + // Check grad w.r.t sh_coeffs (Skipping full check for brevity, focusing on position) - auto logits_p = forward_sh_rgb(sh_coeffs_p, h_xyz_c); - auto logits_m = forward_sh_rgb(sh_coeffs_m, h_xyz_c); + // Check grad w.r.t xyz_c + for (int i = 0; i < N * 3; ++i) { + std::vector xyz_c_p = h_xyz_c; + xyz_c_p[i] += h; + std::vector xyz_c_m = h_xyz_c; + xyz_c_m[i] -= h; + + auto logits_p = forward_sh_rgb(h_rgb_vals, h_sh_rest, xyz_c_p); + auto logits_m = forward_sh_rgb(h_rgb_vals, h_sh_rest, xyz_c_m); double loss_p = compute_loss(logits_p); double loss_m = compute_loss(logits_m); float numerical_grad = (loss_p - loss_m) / (2.0f * h); - EXPECT_NEAR(h_sh_grad_in[i], numerical_grad, 1e-4); + EXPECT_NEAR(h_xyz_c_grad_in[i], numerical_grad, 1e-3); } CUDA_CHECK(cudaFree(d_xyz_c)); CUDA_CHECK(cudaFree(d_rgb_grad_out)); + CUDA_CHECK(cudaFree(d_rgb_vals)); + CUDA_CHECK(cudaFree(d_sh_rest)); CUDA_CHECK(cudaFree(d_sh_grad_in)); CUDA_CHECK(cudaFree(d_band_0_grad)); + CUDA_CHECK(cudaFree(d_xyz_c_grad_in)); } // Test for render_image_backward From c32e7dc5d910ff220e6c856345bcfe9074f19768 Mon Sep 17 00:00:00 2001 From: andrew Date: Tue, 9 Dec 2025 12:03:07 -0500 Subject: [PATCH 17/23] add cam position --- cuda/raster.cu | 5 ++++- cuda/trainer.cu | 7 +++++-- include/dataloader/colmap.hpp | 2 ++ include/gsplat_cuda/raster.cuh | 3 ++- src/colmap.cpp | 6 ++++++ 5 files changed, 19 insertions(+), 4 deletions(-) diff --git a/cuda/raster.cu b/cuda/raster.cu index 48a99a2..b14d1e4 100644 --- a/cuda/raster.cu +++ b/cuda/raster.cu @@ -5,10 +5,11 @@ #include "gsplat_cuda/cuda_data.cuh" #include "gsplat_cuda/cuda_forward.cuh" +#include #include #include -void rasterize_image(const int num_gaussians, const Camera &camera, const ConfigParameters &config, +void rasterize_image(const int num_gaussians, const Camera &camera, const Image &image, const ConfigParameters &config, CameraParameters &camera_parameters, GaussianParameters &gaussians, ForwardPassData &pass_data, const float bg_color, const int l_max) { const int width = (int)camera.width; @@ -70,6 +71,8 @@ void rasterize_image(const int num_gaussians, const Camera &camera, const Config // Step 3; Compute final RGB values from spherical harmonics pass_data.d_precomputed_rgb.resize(pass_data.num_culled * 3); + Eigen::Vector3d campos = image.CamPos(); + precompute_spherical_harmonics(thrust::raw_pointer_cast(d_xyz_c_selected.data()), thrust::raw_pointer_cast(d_sh_selected.data()), thrust::raw_pointer_cast(d_rgb_selected.data()), l_max, pass_data.num_culled, diff --git a/cuda/trainer.cu b/cuda/trainer.cu index 2b16f16..0508dff 100644 --- a/cuda/trainer.cu +++ b/cuda/trainer.cu @@ -344,7 +344,7 @@ void TrainerImpl::evaluate() { ForwardPassData pass_data; float bg_color = 0.0f; // Black background for eval - rasterize_image(num_gaussians, cam, config, cuda.camera, cuda.gaussians, pass_data, bg_color, l_max); + rasterize_image(num_gaussians, cam, img, config, cuda.camera, cuda.gaussians, pass_data, bg_color, l_max); // Compute PSNR float psnr = compute_psnr(thrust::raw_pointer_cast(pass_data.d_image_buffer.data()), @@ -782,6 +782,8 @@ float TrainerImpl::backward_pass(const Image &curr_image, const Camera &curr_cam const int width = (int)curr_camera.width; const int height = (int)curr_camera.height; + Eigen::Vector3d campos = curr_image.CamPos(); + thrust::device_vector d_grad_image(height * width * 3); float loss = @@ -1197,7 +1199,8 @@ void TrainerImpl::train() { add_sh_band(); // --- FORWARD PASS via RASTERIZE MODULE --- - rasterize_image(num_gaussians, curr_camera, config, cuda.camera, cuda.gaussians, pass_data, bg_color, l_max); + rasterize_image(num_gaussians, curr_camera, curr_image, config, cuda.camera, cuda.gaussians, pass_data, bg_color, + l_max); if (pass_data.num_culled == 0) { std::cerr << "WARNING Image " << curr_image.id << " has no Gaussians in view" << std::endl; diff --git a/include/dataloader/colmap.hpp b/include/dataloader/colmap.hpp index 82af6cf..f7deb3d 100644 --- a/include/dataloader/colmap.hpp +++ b/include/dataloader/colmap.hpp @@ -38,6 +38,8 @@ struct Image { // Member function to convert quaternion to rotation matrix. [[nodiscard]] Eigen::Matrix3d QvecToRotMat() const; + // Member function to get camera position. + [[nodiscard]] Eigen::Vector3d CamPos() const; }; struct Point3D { diff --git a/include/gsplat_cuda/raster.cuh b/include/gsplat_cuda/raster.cuh index 1dcec8c..ffa7072 100644 --- a/include/gsplat_cuda/raster.cuh +++ b/include/gsplat_cuda/raster.cuh @@ -12,12 +12,13 @@ * * @param[in] num_gaussians The total number of Gaussians. * @param[in] camera The camera model and intrinsic parameters. + * @param[in] image The image parameters. * @param[in] config Configuration parameters for rendering. * @param[in,out] cuda A manager for long-lived CUDA device buffers. * @param[out] pass_data A struct to be populated with pointers to per-iteration device buffers. * @param[in] bg_color Background color to use in rendering. * @param[in] l_max The maximum band of SH coefficients. */ -void rasterize_image(const int num_gaussians, const Camera &camera, const ConfigParameters &config, +void rasterize_image(const int num_gaussians, const Camera &camera, const Image &image, const ConfigParameters &config, CameraParameters &camera_parameters, GaussianParameters &gaussians, ForwardPassData &pass_data, const float bg_color, const int l_max); diff --git a/src/colmap.cpp b/src/colmap.cpp index 21147cb..2cab381 100644 --- a/src/colmap.cpp +++ b/src/colmap.cpp @@ -32,6 +32,12 @@ Eigen::Matrix3d Image::QvecToRotMat() const { return q.toRotationMatrix(); } +Eigen::Vector3d Image::CamPos() const { + Eigen::Matrix3d rot_mat_d = QvecToRotMat(); + Eigen::Vector3d t_vec_d = tvec; + return -rot_mat_d.transpose() * t_vec_d; +} + std::optional> ReadCamerasBinary(const std::filesystem::path &path, const int downsample_factor) { std::ifstream file(path, std::ios::binary); From 21ca56efe969810a98cabf14927182219a98130b Mon Sep 17 00:00:00 2001 From: Andrew Boessen Date: Tue, 9 Dec 2025 12:45:18 -0500 Subject: [PATCH 18/23] fix sh kernels --- cuda/raster.cu | 8 ++- cuda/spherical_harmonics.cu | 33 ++++++++++- cuda/spherical_harmonics_backward.cu | 84 +++++++++++++++++++++------ cuda/trainer.cu | 6 +- include/gsplat_cuda/cuda_backward.cuh | 10 +++- include/gsplat_cuda/cuda_forward.cuh | 4 +- tests/cuda_backward_test.cpp | 21 +++---- tests/cuda_forward_test.cpp | 7 ++- 8 files changed, 133 insertions(+), 40 deletions(-) diff --git a/cuda/raster.cu b/cuda/raster.cu index b14d1e4..1da13bc 100644 --- a/cuda/raster.cu +++ b/cuda/raster.cu @@ -71,12 +71,14 @@ void rasterize_image(const int num_gaussians, const Camera &camera, const Image // Step 3; Compute final RGB values from spherical harmonics pass_data.d_precomputed_rgb.resize(pass_data.num_culled * 3); - Eigen::Vector3d campos = image.CamPos(); + Eigen::Vector3f campos = image.CamPos().cast(); + + float3 campos_vec = make_float3(campos.x(), campos.y(), campos.z()); precompute_spherical_harmonics(thrust::raw_pointer_cast(d_xyz_c_selected.data()), thrust::raw_pointer_cast(d_sh_selected.data()), - thrust::raw_pointer_cast(d_rgb_selected.data()), l_max, pass_data.num_culled, - thrust::raw_pointer_cast(pass_data.d_precomputed_rgb.data())); + thrust::raw_pointer_cast(d_rgb_selected.data()), campos_vec, l_max, + pass_data.num_culled, thrust::raw_pointer_cast(pass_data.d_precomputed_rgb.data())); // Step 4: Compute Covariance and Conics pass_data.d_sigma.resize(pass_data.num_culled * 9); diff --git a/cuda/spherical_harmonics.cu b/cuda/spherical_harmonics.cu index d6f8c1a..6043c4f 100644 --- a/cuda/spherical_harmonics.cu +++ b/cuda/spherical_harmonics.cu @@ -5,6 +5,26 @@ #include "sphericart_cuda.hpp" #include +__global__ void compute_dir_kernel(const float *xyz, const float3 campos, const int N, float *dir) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + + const float *pos = xyz + idx * 3; + float *d = dir + idx * 3; + + float dx = pos[0] - campos.x; + float dy = pos[1] - campos.y; + float dz = pos[2] - campos.z; + + float len = sqrtf(dx * dx + dy * dy + dz * dz) + 1e-9f; + + d[0] = dx / len; + d[1] = dy / len; + d[2] = dz / len; +} + __global__ void compute_rgb_from_sh_kernel(const float *sh_coefficients, const float *sh_coeffs_band_0, const float *d_sph, const int n_coeffs, const int N, float *rgb) { // Determine the unique index for this thread @@ -40,7 +60,8 @@ __global__ void compute_rgb_from_sh_kernel(const float *sh_coefficients, const f } void precompute_spherical_harmonics(const float *xyz, const float *sh_coefficients, const float *sh_coeffs_band_0, - const int l_max, const int N, float *rgb, cudaStream_t stream) { + const float3 campos, const int l_max, const int N, float *rgb, + cudaStream_t stream) { ASSERT_DEVICE_POINTER(xyz); ASSERT_DEVICE_POINTER(sh_coeffs_band_0); ASSERT_DEVICE_POINTER(rgb); @@ -54,13 +75,19 @@ void precompute_spherical_harmonics(const float *xyz, const float *sh_coefficien thrust::device_vector d_sph(N * n_coeffs); - // compute SH values - calculator_cuda.compute(xyz, N, thrust::raw_pointer_cast(d_sph.data())); + // Allocate memory for direction vectors + thrust::device_vector d_dir(N * 3); // Define CUDA kernel launch parameters const int blockSize = 256; const int gridSize = (N + blockSize - 1) / blockSize; + // Compute direction vectors + compute_dir_kernel<<>>(xyz, campos, N, thrust::raw_pointer_cast(d_dir.data())); + + // compute SH values using direction vectors + calculator_cuda.compute(thrust::raw_pointer_cast(d_dir.data()), N, thrust::raw_pointer_cast(d_sph.data())); + // Launch the kernel to compute the final RGB values compute_rgb_from_sh_kernel<<>>( sh_coefficients, sh_coeffs_band_0, thrust::raw_pointer_cast(d_sph.data()), n_coeffs, N, rgb); diff --git a/cuda/spherical_harmonics_backward.cu b/cuda/spherical_harmonics_backward.cu index 4f726de..6e9b68b 100644 --- a/cuda/spherical_harmonics_backward.cu +++ b/cuda/spherical_harmonics_backward.cu @@ -5,10 +5,30 @@ #include "sphericart_cuda.hpp" #include +__global__ void compute_dir_kernel_bwd(const float *xyz, const float3 campos, const int N, float *dir) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + + const float *pos = xyz + idx * 3; + float *d = dir + idx * 3; + + float dx = pos[0] - campos.x; + float dy = pos[1] - campos.y; + float dz = pos[2] - campos.z; + + float len = sqrtf(dx * dx + dy * dy + dz * dz) + 1e-9f; + + d[0] = dx / len; + d[1] = dy / len; + d[2] = dz / len; +} + __global__ void compute_sh_gradients_kernel(const float *d_sph, const float *d_dsph, const float *d_rgb_vals, - const float *d_sh_coeffs, const float *rgb_grad_out, const int n_coeffs, - const int N, float *sh_grad_in, float *sh_grad_band_0_in, - float *xyz_c_grad_in) { + const float *d_sh_coeffs, const float *rgb_grad_out, const float *xyz, + const float3 campos, const int n_coeffs, const int N, float *sh_grad_in, + float *sh_grad_band_0_in, float *xyz_c_grad_in) { // Determine the unique index for this thread, corresponding to a single point/Gaussian. int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= N) { @@ -107,11 +127,38 @@ __global__ void compute_sh_gradients_kernel(const float *d_sph, const float *d_d } } - // Accumulate total gradient w.r.t xyz_c - // dL/d(xyz) = dL/dR * dR/d(xyz) + dL/dG * dG/d(xyz) + dL/dB * dB/d(xyz) - float total_grad_x = point_rgb_grad[0] * dR_dx + point_rgb_grad[1] * dG_dx + point_rgb_grad[2] * dB_dx; - float total_grad_y = point_rgb_grad[0] * dR_dy + point_rgb_grad[1] * dG_dy + point_rgb_grad[2] * dB_dy; - float total_grad_z = point_rgb_grad[0] * dR_dz + point_rgb_grad[1] * dG_dz + point_rgb_grad[2] * dB_dz; + // Accumulate total gradient w.r.t direction (dir) + // dL/d(dir) = dL/dR * dR/d(dir) + dL/dG * dG/d(dir) + dL/dB * dB/d(dir) + // Note: d_dsph contains d_SH/d_dir, not d_SH/d_xyz, because we passed normalized directions to sphericart. + float total_grad_dir_x = point_rgb_grad[0] * dR_dx + point_rgb_grad[1] * dG_dx + point_rgb_grad[2] * dB_dx; + float total_grad_dir_y = point_rgb_grad[0] * dR_dy + point_rgb_grad[1] * dG_dy + point_rgb_grad[2] * dB_dy; + float total_grad_dir_z = point_rgb_grad[0] * dR_dz + point_rgb_grad[1] * dG_dz + point_rgb_grad[2] * dB_dz; + + // Propagate gradient from direction to position + // dir = (pos - campos) / |pos - campos| + // Let diff = pos - campos, dist = |diff| + // d(dir)/d(pos) = (I * dist - diff * diff^T / dist) / dist^2 + // = (I - dir * dir^T) / dist + + const float *pos = xyz + idx * 3; + float diff_x = pos[0] - campos.x; + float diff_y = pos[1] - campos.y; + float diff_z = pos[2] - campos.z; + float dist_sq = diff_x * diff_x + diff_y * diff_y + diff_z * diff_z; + float dist = sqrtf(dist_sq) + 1e-9f; // Avoid division by zero + + // dir (recomputed here to save memory read/write) + float dir_x = diff_x / dist; + float dir_y = diff_y / dist; + float dir_z = diff_z / dist; + + // Dot product of gradient and direction + float dot = total_grad_dir_x * dir_x + total_grad_dir_y * dir_y + total_grad_dir_z * dir_z; + + // dL/d(pos) = (dL/d(dir) - dot * dir) / dist + float total_grad_x = (total_grad_dir_x - dot * dir_x) / dist; + float total_grad_y = (total_grad_dir_y - dot * dir_y) / dist; + float total_grad_z = (total_grad_dir_z - dot * dir_z) / dist; xyz_c_grad_in[idx * 3 + 0] += total_grad_x; xyz_c_grad_in[idx * 3 + 1] += total_grad_y; @@ -119,9 +166,10 @@ __global__ void compute_sh_gradients_kernel(const float *d_sph, const float *d_d } void precompute_spherical_harmonics_backward(const float *const xyz_c, const float *const rgb_vals, - const float *const sh_coeffs, const float *const rgb_grad_out, - const int l_max, const int N, float *sh_grad_in, float *sh_grad_band_0_in, - float *xyz_c_grad_in, cudaStream_t stream) { + const float *const sh_coeffs, const float3 campos, + const float *const rgb_grad_out, const int l_max, const int N, + float *sh_grad_in, float *sh_grad_band_0_in, float *xyz_c_grad_in, + cudaStream_t stream) { ASSERT_DEVICE_POINTER(xyz_c); ASSERT_DEVICE_POINTER(rgb_vals); if (l_max > 0) @@ -142,18 +190,20 @@ void precompute_spherical_harmonics_backward(const float *const xyz_c, const flo // Memory is automatically allocated here. thrust::device_vector d_sph(N * n_coeffs); thrust::device_vector d_dsph(N * n_coeffs * 3); - - // Use the sphericart library to compute the SH basis values. - // We pass the raw pointers from the device_vectors. - calculator_cuda.compute_with_gradients(xyz_c, N, thrust::raw_pointer_cast(d_sph.data()), - thrust::raw_pointer_cast(d_dsph.data()), stream); + thrust::device_vector d_dir(N * 3); // Define CUDA kernel launch parameters. const int blockSize = 256; const int gridSize = (N + blockSize - 1) / blockSize; + compute_dir_kernel_bwd<<>>(xyz_c, campos, N, thrust::raw_pointer_cast(d_dir.data())); + + calculator_cuda.compute_with_gradients(thrust::raw_pointer_cast(d_dir.data()), N, + thrust::raw_pointer_cast(d_sph.data()), + thrust::raw_pointer_cast(d_dsph.data()), stream); + // Launch the kernel to compute the final SH coefficient gradients. compute_sh_gradients_kernel<<>>( thrust::raw_pointer_cast(d_sph.data()), thrust::raw_pointer_cast(d_dsph.data()), rgb_vals, sh_coeffs, - rgb_grad_out, n_coeffs, N, sh_grad_in, sh_grad_band_0_in, xyz_c_grad_in); + rgb_grad_out, xyz_c, campos, n_coeffs, N, sh_grad_in, sh_grad_band_0_in, xyz_c_grad_in); } diff --git a/cuda/trainer.cu b/cuda/trainer.cu index 0508dff..4031ef6 100644 --- a/cuda/trainer.cu +++ b/cuda/trainer.cu @@ -782,7 +782,9 @@ float TrainerImpl::backward_pass(const Image &curr_image, const Camera &curr_cam const int width = (int)curr_camera.width; const int height = (int)curr_camera.height; - Eigen::Vector3d campos = curr_image.CamPos(); + Eigen::Vector3f campos = curr_image.CamPos().cast(); + + float3 campos_vec = make_float3(campos.x(), campos.y(), campos.z()); thrust::device_vector d_grad_image(height * width * 3); @@ -830,7 +832,7 @@ float TrainerImpl::backward_pass(const Image &curr_image, const Camera &curr_cam precompute_spherical_harmonics_backward( thrust::raw_pointer_cast(d_xyz_c_selected.data()), thrust::raw_pointer_cast(d_rgb_selected.data()), - thrust::raw_pointer_cast(d_sh_selected.data()), + thrust::raw_pointer_cast(d_sh_selected.data()), campos_vec, thrust::raw_pointer_cast(cuda.gradients.d_grad_precompute_rgb.data()), l_max, pass_data.num_culled, thrust::raw_pointer_cast(cuda.gradients.d_grad_sh.data()), thrust::raw_pointer_cast(cuda.gradients.d_grad_rgb.data()), diff --git a/include/gsplat_cuda/cuda_backward.cuh b/include/gsplat_cuda/cuda_backward.cuh index 3d64c9e..0f22fe8 100644 --- a/include/gsplat_cuda/cuda_backward.cuh +++ b/include/gsplat_cuda/cuda_backward.cuh @@ -79,6 +79,9 @@ void compute_sigma_backward(const float *const quaternion, const float *const sc /** * @brief Compute gradients for the spherical harmonic coefficients * @param[in] xyz_c Camera xyz coordinates + * @param[in] rgb_vals RGB params (band 0) + * @param[in] sh_coefss SH coefficients + * @param[in] campos Camera position * @param[in] rgb_grad_out RGB gradients * @param[in] l_max The max degree of SH * @param[in] N The total number of points @@ -87,9 +90,10 @@ void compute_sigma_backward(const float *const quaternion, const float *const sc * @param[in] stream The CUDA stream to execute the kernel on. */ void precompute_spherical_harmonics_backward(const float *const xyz_c, const float *const rgb_vals, - const float *const sh_coeffs, const float *const rgb_grad_out, - const int l_max, const int N, float *sh_grad_in, float *sh_grad_band_0_in, - float *xyz_c_grad_in, cudaStream_t stream = 0); + const float *const sh_coeffs, const float3 campos, + const float *const rgb_grad_out, const int l_max, const int N, + float *sh_grad_in, float *sh_grad_band_0_in, float *xyz_c_grad_in, + cudaStream_t stream = 0); /** * @brief Launch the CUDA kernel to compute rendering gradients. diff --git a/include/gsplat_cuda/cuda_forward.cuh b/include/gsplat_cuda/cuda_forward.cuh index ba89b32..6d7679c 100644 --- a/include/gsplat_cuda/cuda_forward.cuh +++ b/include/gsplat_cuda/cuda_forward.cuh @@ -98,13 +98,15 @@ void get_sorted_gaussian_list(const float *uv, const float *xyz, const float4 *r * @param[in] xyz A device pointer to 3D corrdinates of gaussians in camera perspective * @param[in] sh_coefficients A device pointer to SH params for each Gaussian * @param[in] sh_coefficients_band_0 A device pointer to RGB values i.e. band 0 + * @param[in] campos The camera position * @param[in] l_max The max degree of SH * @param[in] N The total number of points * @param[out] rgb A device pointer to output rgb values * @param[in] stream The CUDA stream to execute kernel on */ void precompute_spherical_harmonics(const float *xyz, const float *sh_coefficients, const float *sh_coeffs_band_0, - const int l_max, const int N, float *rgb, cudaStream_t stream = 0); + const float3 campos, const int l_max, const int N, float *rgb, + cudaStream_t stream = 0); /** * @brief Launch CUDA kernels to render image pixel values from Gaussians diff --git a/tests/cuda_backward_test.cpp b/tests/cuda_backward_test.cpp index 44419eb..fd5c629 100644 --- a/tests/cuda_backward_test.cpp +++ b/tests/cuda_backward_test.cpp @@ -532,7 +532,7 @@ TEST_F(CudaBackwardKernelTest, SphericalHarmonicsBackward) { const float h = 1e-4f; // Host data - std::vector h_xyz_c = {0.5f, -0.3f, 0.8124f}; // Roughly normalized vector + std::vector h_xyz_c = {1.0f, 1.0f, 0.5f}; // Roughly normalized vector std::vector h_rgb_grad_out = {0.1f, -0.2f, 0.3f}; std::vector h_rgb_vals(N * 3); @@ -559,6 +559,9 @@ TEST_F(CudaBackwardKernelTest, SphericalHarmonicsBackward) { auto d_band_0_grad = device_alloc(N * 3); auto d_xyz_c_grad_in = device_alloc(N * 3); + // Allocate dummy campos at origin + float3 campos = {0.0f, 0.0f, 0.0f}; + CUDA_CHECK(cudaMemcpy(d_xyz_c, h_xyz_c.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_rgb_grad_out, h_rgb_grad_out.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_rgb_vals, h_rgb_vals.data(), N * 3 * sizeof(float), cudaMemcpyHostToDevice)); @@ -566,8 +569,8 @@ TEST_F(CudaBackwardKernelTest, SphericalHarmonicsBackward) { CUDA_CHECK(cudaMemset(d_xyz_c_grad_in, 0, N * 3 * sizeof(float))); // Run kernel - precompute_spherical_harmonics_backward(d_xyz_c, d_rgb_vals, d_sh_rest, d_rgb_grad_out, l_max, N, d_sh_grad_in, - d_band_0_grad, d_xyz_c_grad_in); + precompute_spherical_harmonics_backward(d_xyz_c, d_rgb_vals, d_sh_rest, campos, d_rgb_grad_out, l_max, N, + d_sh_grad_in, d_band_0_grad, d_xyz_c_grad_in); CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaMemcpy(h_sh_grad_in.data() + N * 3, d_sh_grad_in, N * (n_coeffs - 1) * 3 * sizeof(float), @@ -585,7 +588,7 @@ TEST_F(CudaBackwardKernelTest, SphericalHarmonicsBackward) { float x_ = xyz_c[i * 3 + 0]; float y_ = xyz_c[i * 3 + 1]; float z_ = xyz_c[i * 3 + 2]; - float norm = std::sqrt(x_ * x_ + y_ * y_ + z_ * z_) + 1e-8f; + float norm = std::sqrt(x_ * x_ + y_ * y_ + z_ * z_); float x = x_ / norm, y = y_ / norm, z = z_ / norm; // Real Spherical Harmonics basis functions @@ -606,9 +609,9 @@ TEST_F(CudaBackwardKernelTest, SphericalHarmonicsBackward) { sh_vals[8] = C4 * (x * x - y * y); // Band 0 - logits[i * 3 + 0] += rgb_vals[i * 3 + 0] * sh_vals[0] + 0.5f; - logits[i * 3 + 1] += rgb_vals[i * 3 + 1] * sh_vals[0] + 0.5f; - logits[i * 3 + 2] += rgb_vals[i * 3 + 2] * sh_vals[0] + 0.5f; + logits[i * 3 + 0] += rgb_vals[i * 3 + 0] * sh_vals[0]; + logits[i * 3 + 1] += rgb_vals[i * 3 + 1] * sh_vals[0]; + logits[i * 3 + 2] += rgb_vals[i * 3 + 2] * sh_vals[0]; // Higher Bands const float *point_sh_rest = &sh_rest[i * (n_coeffs - 1) * 3]; @@ -630,8 +633,6 @@ TEST_F(CudaBackwardKernelTest, SphericalHarmonicsBackward) { return loss; }; - // Check grad w.r.t sh_coeffs (Skipping full check for brevity, focusing on position) - // Check grad w.r.t xyz_c for (int i = 0; i < N * 3; ++i) { std::vector xyz_c_p = h_xyz_c; @@ -646,7 +647,7 @@ TEST_F(CudaBackwardKernelTest, SphericalHarmonicsBackward) { double loss_m = compute_loss(logits_m); float numerical_grad = (loss_p - loss_m) / (2.0f * h); - EXPECT_NEAR(h_xyz_c_grad_in[i], numerical_grad, 1e-3); + EXPECT_NEAR(h_xyz_c_grad_in[i], numerical_grad, 1e-2); } CUDA_CHECK(cudaFree(d_xyz_c)); diff --git a/tests/cuda_forward_test.cpp b/tests/cuda_forward_test.cpp index 21f6530..c0c6e5b 100644 --- a/tests/cuda_forward_test.cpp +++ b/tests/cuda_forward_test.cpp @@ -571,6 +571,10 @@ TEST_F(CudaKernelTest, PrecomputeSphericalHarmonics) { // 3. Device-side data setup float *d_xyz, *d_sh_coefficients, *d_band_0, *d_rgb; + + // Allocate dummy campos at origin + float3 campos = {0.0f, 0.0f, 0.0f}; + CUDA_CHECK(cudaMalloc(&d_xyz, h_xyz.size() * sizeof(float))); CUDA_CHECK(cudaMalloc(&d_sh_coefficients, h_sh_coefficients.size() * sizeof(float))); CUDA_CHECK(cudaMalloc(&d_band_0, h_band_0.size() * sizeof(float))); @@ -582,7 +586,8 @@ TEST_F(CudaKernelTest, PrecomputeSphericalHarmonics) { CUDA_CHECK(cudaMemcpy(d_band_0, h_band_0.data(), h_band_0.size() * sizeof(float), cudaMemcpyHostToDevice)); // 4. Call the function to be tested - precompute_spherical_harmonics(d_xyz, d_sh_coefficients, d_band_0, l_max, N, d_rgb); + // 4. Call the function to be tested + precompute_spherical_harmonics(d_xyz, d_sh_coefficients, d_band_0, campos, l_max, N, d_rgb); CUDA_CHECK(cudaDeviceSynchronize()); // 5. Copy results back to host From 33d481572bf51d6d930b2d3921d23cf36424e296 Mon Sep 17 00:00:00 2001 From: Andrew Boessen Date: Tue, 9 Dec 2025 13:31:53 -0500 Subject: [PATCH 19/23] pass world means to SH --- cuda/projection_backward.cu | 6 +++--- cuda/raster.cu | 2 +- cuda/trainer.cu | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cuda/projection_backward.cu b/cuda/projection_backward.cu index 8062646..d9c4640 100644 --- a/cuda/projection_backward.cu +++ b/cuda/projection_backward.cu @@ -131,9 +131,9 @@ __global__ void compute_camera_space_points_backward_kernel(const float *__restr // d(xyz_w) = View^T * d(xyz_c) (ignoring translation part for direction vectors, but xyz_w is point) // Actually, d(xyz_w) = R^T * d(xyz_c) because translation is constant w.r.t. xyz_w. // The View matrix upper-left 3x3 is the rotation R. - xyz_w_grad_in[i * XYZ_STRIDE + 0] = v00 * grad_x_c + v10 * grad_y_c + v20 * grad_z_c; - xyz_w_grad_in[i * XYZ_STRIDE + 1] = v01 * grad_x_c + v11 * grad_y_c + v21 * grad_z_c; - xyz_w_grad_in[i * XYZ_STRIDE + 2] = v02 * grad_x_c + v12 * grad_y_c + v22 * grad_z_c; + xyz_w_grad_in[i * XYZ_STRIDE + 0] += v00 * grad_x_c + v10 * grad_y_c + v20 * grad_z_c; + xyz_w_grad_in[i * XYZ_STRIDE + 1] += v01 * grad_x_c + v11 * grad_y_c + v21 * grad_z_c; + xyz_w_grad_in[i * XYZ_STRIDE + 2] += v02 * grad_x_c + v12 * grad_y_c + v22 * grad_z_c; } void compute_camera_space_points_backward(const float *const xyz_w, const float *const view, diff --git a/cuda/raster.cu b/cuda/raster.cu index 1da13bc..1da8f8a 100644 --- a/cuda/raster.cu +++ b/cuda/raster.cu @@ -75,7 +75,7 @@ void rasterize_image(const int num_gaussians, const Camera &camera, const Image float3 campos_vec = make_float3(campos.x(), campos.y(), campos.z()); - precompute_spherical_harmonics(thrust::raw_pointer_cast(d_xyz_c_selected.data()), + precompute_spherical_harmonics(thrust::raw_pointer_cast(d_xyz_selected.data()), thrust::raw_pointer_cast(d_sh_selected.data()), thrust::raw_pointer_cast(d_rgb_selected.data()), campos_vec, l_max, pass_data.num_culled, thrust::raw_pointer_cast(pass_data.d_precomputed_rgb.data())); diff --git a/cuda/trainer.cu b/cuda/trainer.cu index 4031ef6..35da242 100644 --- a/cuda/trainer.cu +++ b/cuda/trainer.cu @@ -831,12 +831,12 @@ float TrainerImpl::backward_pass(const Image &curr_image, const Camera &curr_cam thrust::raw_pointer_cast(cuda.gradients.d_grad_conic.data())); precompute_spherical_harmonics_backward( - thrust::raw_pointer_cast(d_xyz_c_selected.data()), thrust::raw_pointer_cast(d_rgb_selected.data()), + thrust::raw_pointer_cast(d_xyz_selected.data()), thrust::raw_pointer_cast(d_rgb_selected.data()), thrust::raw_pointer_cast(d_sh_selected.data()), campos_vec, thrust::raw_pointer_cast(cuda.gradients.d_grad_precompute_rgb.data()), l_max, pass_data.num_culled, thrust::raw_pointer_cast(cuda.gradients.d_grad_sh.data()), thrust::raw_pointer_cast(cuda.gradients.d_grad_rgb.data()), - thrust::raw_pointer_cast(cuda.gradients.d_grad_xyz_c.data())); + thrust::raw_pointer_cast(cuda.gradients.d_grad_xyz.data())); compute_conic_backward( thrust::raw_pointer_cast(pass_data.d_J.data()), thrust::raw_pointer_cast(pass_data.d_sigma.data()), thrust::raw_pointer_cast(cuda.camera.d_view.data()), thrust::raw_pointer_cast(pass_data.d_conic.data()), From 7f226284e5743841ba943307a45ff130b9a6419a Mon Sep 17 00:00:00 2001 From: andrew Date: Tue, 9 Dec 2025 14:35:03 -0500 Subject: [PATCH 20/23] correct initial opacity --- src/gaussian.cpp | 2 +- tests/gaussian_test.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gaussian.cpp b/src/gaussian.cpp index af2847d..c1a9072 100644 --- a/src/gaussian.cpp +++ b/src/gaussian.cpp @@ -92,7 +92,7 @@ Gaussians Gaussians::Initialize(const std::unordered_map &poi // Convert RGB to SH band 0 const float C0 = 0.28209479177387814; rgb_vec[i] = (rgb_vec[i] - Eigen::Vector3f(0.5f, 0.5f, 0.5f)) / C0; - opacity_vec[i] = 0.1f; + opacity_vec[i] = log(0.2f) - log(1.0f - 0.2f); scale_vec[i] = Eigen::Vector3f(logf(avg_dist), logf(avg_dist), logf(avg_dist)); quaternion_vec[i] = Eigen::Quaternionf::Identity(); } diff --git a/tests/gaussian_test.cpp b/tests/gaussian_test.cpp index fe30cf9..9d9bcd5 100644 --- a/tests/gaussian_test.cpp +++ b/tests/gaussian_test.cpp @@ -65,7 +65,7 @@ TEST_F(GaussiansStandaloneTest, Initialize) { const float C0 = 0.28209479177387814; EXPECT_TRUE(g.rgb[0].isApprox( (Eigen::Vector3f(128.0f / 255.0f, 64.0f / 255.0f, 32.0f / 255.0f) - Eigen::Vector3f(0.5f, 0.5f, 0.5f)) / C0)); - EXPECT_FLOAT_EQ(g.opacity[0], 0.1f); + EXPECT_FLOAT_EQ(g.opacity[0], log(0.2f) - log(1.0f - 0.2f)); } // =================================================================== From 2290d1dfbfbe338c4cb28ed500a216d97729b8a5 Mon Sep 17 00:00:00 2001 From: andrew Date: Tue, 9 Dec 2025 15:44:22 -0500 Subject: [PATCH 21/23] update test values --- tests/cuda_backward_test.cpp | 35 +++++++++++++++++++++++++---------- tests/cuda_forward_test.cpp | 18 +++++++++--------- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/tests/cuda_backward_test.cpp b/tests/cuda_backward_test.cpp index fd5c629..f1ac988 100644 --- a/tests/cuda_backward_test.cpp +++ b/tests/cuda_backward_test.cpp @@ -257,12 +257,29 @@ TEST_F(CudaBackwardKernelTest, ConicBackward) { const float h = 1e-4f; // Host data - std::vector h_J = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f}; - std::vector h_sigma_world = {1.0f, 0.1f, 0.2f, 2.0f, 0.3f, 3.0f}; // xx, xy, xz, yy, yz, zz - // View matrix (4x4) - std::vector h_view = {0.8f, -0.6f, 0.0f, 0.1f, 0.6f, 0.8f, 0.0f, 0.2f, - 0.0f, 0.0f, 1.0f, 0.3f, 0.0f, 0.0f, 0.0f, 1.0f}; - std::vector h_conic_grad_out = {0.5f, -0.2f, 0.8f}; + std::vector h_J = { + 0.6f, 0.0f, -0.1f, // Row 0: d(screen_x)/d(xyz) + 0.0f, 0.6f, -0.2f // Row 1: d(screen_y)/d(xyz) + }; + std::vector h_sigma_world = { + 0.5f, // xx (Variance X) -> Large enough to be dominant + 0.1f, // xy (Covariance XY) -> Small enough: 0.1^2 < 0.5*0.5 + 0.05f, // xz + 0.5f, // yy + 0.1f, // yz + 0.5f // zz + }; + std::vector h_view = { + 1.0f, 0.0f, 0.0f, 0.0f, // Right + 0.0f, 1.0f, 0.0f, 0.0f, // Up + 0.0f, 0.0f, 1.0f, 2.0f, // Forward (Translated) + 0.0f, 0.0f, 0.0f, 1.0f // Homogeneous + }; + std::vector h_conic_grad_out = { + 0.5f, // dL/dA + -0.2f, // dL/dB (Half of off-diagonal usually) + 0.8f // dL/dC + }; std::vector h_J_grad_in(N * 6); std::vector h_sigma_world_grad_in(N * 6); // Kernel has i*6 indexing, so allocate 6 floats @@ -362,8 +379,6 @@ TEST_F(CudaBackwardKernelTest, ConicBackward) { // Reconstruct full symmetric gradient for sigma from kernel output (which is 6 params) // The kernel accumulates gradients into the 6 unique elements. - // dL/dS_ij_full = dL/dS_ij_stored (if i==j) - // dL/dS_ij_full = 0.5 * dL/dS_ij_stored (if i!=j, because stored accumulates both ij and ji) std::vector h_sigma_grad_analytic_full(6); h_sigma_grad_analytic_full[0] = h_sigma_world_grad_in[0]; // xx h_sigma_grad_analytic_full[1] = h_sigma_world_grad_in[1]; // xy @@ -497,7 +512,7 @@ TEST_F(CudaBackwardKernelTest, SigmaBackward) { float loss_m = compute_loss(sigma_m); float numerical_grad = (loss_p - loss_m) / (2 * h); - EXPECT_NEAR(h_dQ_in[i], numerical_grad, 1e-2); + EXPECT_NEAR(h_dQ_in[i], numerical_grad, 1e-3); } // Check grad w.r.t s @@ -514,7 +529,7 @@ TEST_F(CudaBackwardKernelTest, SigmaBackward) { float loss_m = compute_loss(sigma_m); float numerical_grad = (loss_p - loss_m) / (2 * h); - EXPECT_NEAR(h_dS_in[i], numerical_grad, 1e-2); + EXPECT_NEAR(h_dS_in[i], numerical_grad, 1e-3); } CUDA_CHECK(cudaFree(d_q)); diff --git a/tests/cuda_forward_test.cpp b/tests/cuda_forward_test.cpp index c0c6e5b..e8d63d6 100644 --- a/tests/cuda_forward_test.cpp +++ b/tests/cuda_forward_test.cpp @@ -739,20 +739,20 @@ TEST_F(CudaKernelTest, RenderImageMultipleGaussians) { // Check the central pixel: (7, 7), which is close to the first gaussian int idx_center = (7 * width + 7) * 3; std::vector expected_center = calculate_expected_color(7.0f, 7.0f); - ASSERT_NEAR(h_image[idx_center + 0], expected_center[0], 1e-2); - ASSERT_NEAR(h_image[idx_center + 1], expected_center[1], 1e-2); - ASSERT_NEAR(h_image[idx_center + 2], expected_center[2], 1e-2); + ASSERT_NEAR(h_image[idx_center + 0], expected_center[0], 1e-3); + ASSERT_NEAR(h_image[idx_center + 1], expected_center[1], 1e-3); + ASSERT_NEAR(h_image[idx_center + 2], expected_center[2], 1e-3); // Check a pixel far from all gaussians: (0, 0) // Its color should be nearly pure white background. int idx_corner = (0 * width + 0) * 3; std::vector expected_corner = calculate_expected_color(0.0f, 0.0f); - ASSERT_NEAR(h_image[idx_corner + 0], expected_corner[0], 1e-2); - ASSERT_NEAR(h_image[idx_corner + 1], expected_corner[1], 1e-2); - ASSERT_NEAR(h_image[idx_corner + 2], expected_corner[2], 1e-2); - ASSERT_NEAR(h_image[idx_corner + 0], 1.0f, 1e-2); // Check against white - ASSERT_NEAR(h_image[idx_corner + 1], 1.0f, 1e-2); - ASSERT_NEAR(h_image[idx_corner + 2], 1.0f, 1e-2); + ASSERT_NEAR(h_image[idx_corner + 0], expected_corner[0], 1e-3); + ASSERT_NEAR(h_image[idx_corner + 1], expected_corner[1], 1e-3); + ASSERT_NEAR(h_image[idx_corner + 2], expected_corner[2], 1e-3); + ASSERT_NEAR(h_image[idx_corner + 0], 1.0f, 1e-3); // Check against white + ASSERT_NEAR(h_image[idx_corner + 1], 1.0f, 1e-3); + ASSERT_NEAR(h_image[idx_corner + 2], 1.0f, 1e-3); // 8. Cleanup CUDA_CHECK(cudaFree(d_uv)); From 9cf61e43d57520f6f5c9d75c71e9ed6364561489 Mon Sep 17 00:00:00 2001 From: andrew Date: Tue, 9 Dec 2025 23:18:39 -0500 Subject: [PATCH 22/23] transpose projection matrix --- cuda/trainer.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cuda/trainer.cu b/cuda/trainer.cu index 35da242..a4f4f27 100644 --- a/cuda/trainer.cu +++ b/cuda/trainer.cu @@ -331,11 +331,11 @@ void TrainerImpl::evaluate() { std::fill(h_proj, h_proj + 16, 0.0f); h_proj[0] = 2.0f * znear / (right - left); h_proj[5] = 2.0f * znear / (top - bottom); - h_proj[8] = (right + left) / (right - left); - h_proj[9] = (top + bottom) / (top - bottom); - h_proj[10] = (zfar + znear) / (zfar - znear); - h_proj[11] = -(2.0f * zfar * znear) / (zfar - znear); + h_proj[2] = (right + left) / (right - left); + h_proj[6] = (top + bottom) / (top - bottom); h_proj[14] = 1.0f; + h_proj[10] = zfar / (zfar - znear); + h_proj[11] = -(zfar * znear) / (zfar - znear); thrust::copy(h_proj, h_proj + 16, cuda.camera.d_proj.begin()); thrust::copy(h_view, h_view + 16, cuda.camera.d_view.begin()); @@ -1165,11 +1165,11 @@ void TrainerImpl::train() { std::fill(h_proj, h_proj + 16, 0.0f); h_proj[0] = 2.0f * znear / (right - left); h_proj[5] = 2.0f * znear / (top - bottom); - h_proj[8] = (right + left) / (right - left); - h_proj[9] = (top + bottom) / (top - bottom); - h_proj[10] = (zfar + znear) / (zfar - znear); - h_proj[11] = -(2.0f * zfar * znear) / (zfar - znear); + h_proj[2] = (right + left) / (right - left); + h_proj[6] = (top + bottom) / (top - bottom); h_proj[14] = 1.0f; + h_proj[10] = zfar / (zfar - znear); + h_proj[11] = -(zfar * znear) / (zfar - znear); Eigen::Matrix3d rot_mat_d = curr_image.QvecToRotMat(); Eigen::Vector3d t_vec_d = curr_image.tvec; From bc1cec021b160c1249b725e3a51059c4a8f1cfb1 Mon Sep 17 00:00:00 2001 From: andrew Date: Wed, 10 Dec 2025 10:53:55 -0500 Subject: [PATCH 23/23] remove anisotropy check --- config/base.yaml | 7 +++---- config/extended.yaml | 7 +++---- cuda/trainer.cu | 18 +++++------------- include/gsplat/utils.hpp | 1 - src/utils.cpp | 1 - tests/utils_test.cpp | 3 +-- 6 files changed, 12 insertions(+), 25 deletions(-) diff --git a/config/base.yaml b/config/base.yaml index 9cc60a4..fe8f338 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -20,7 +20,7 @@ opacity_lr_multiplier: 25 rgb_lr_multiplier: 2.5 sh_lr_multiplier: 0.125 test_eval_interval: 500 -test_split_ratio: 9 +test_split_ratio: 8 use_background: true use_background_end: 2000 reset_opacity_interval: 3000 @@ -34,10 +34,9 @@ use_split: true use_clone: true use_delete: true adaptive_control_start: 500 -adaptive_control_end: 5500 +adaptive_control_end: 5000 adaptive_control_interval: 100 max_gaussians: 4250000 delete_opacity_threshold: 0.02 -uv_grad_threshold: 0.00015 +uv_grad_threshold: 0.0002 split_scale_factor: 1.6 -max_anisotropy: 20.0 diff --git a/config/extended.yaml b/config/extended.yaml index 32c6296..5c4c92a 100644 --- a/config/extended.yaml +++ b/config/extended.yaml @@ -20,7 +20,7 @@ opacity_lr_multiplier: 25 rgb_lr_multiplier: 2.5 sh_lr_multiplier: 0.125 test_eval_interval: 500 -test_split_ratio: 9 +test_split_ratio: 8 use_background: true use_background_end: 10000 reset_opacity_interval: 3000 @@ -34,10 +34,9 @@ use_split: true use_clone: true use_delete: true adaptive_control_start: 500 -adaptive_control_end: 20000 +adaptive_control_end: 15000 adaptive_control_interval: 100 max_gaussians: 4250000 delete_opacity_threshold: 0.02 -uv_grad_threshold: 0.00015 +uv_grad_threshold: 0.0002 split_scale_factor: 1.6 -max_anisotropy: 20.0 diff --git a/cuda/trainer.cu b/cuda/trainer.cu index a4f4f27..58efa02 100644 --- a/cuda/trainer.cu +++ b/cuda/trainer.cu @@ -219,9 +219,8 @@ void TrainerImpl::test_train_split() { for (size_t i = 0; i < all_images.size(); ++i) { if (i % split == 0) { test_images.push_back(all_images[i]); - } else { - train_images.push_back(all_images[i]); } + train_images.push_back(all_images[i]); } } } @@ -431,15 +430,13 @@ struct ComputeScaleMax { } }; -// Identifies Gaussians to be pruned based on low opacity, large scale, or high anisotropy. +// Identifies Gaussians to be pruned based on low opacity or large scale struct IdentifyPrune { const float op_threshold; const float scale_max_thresh; - const float max_anisotropy; const float grad_threshold; - IdentifyPrune(float ot, float sm, float ma, float gt) - : op_threshold(ot), scale_max_thresh(sm), max_anisotropy(ma), grad_threshold(gt) {} + IdentifyPrune(float ot, float sm, float gt) : op_threshold(ot), scale_max_thresh(sm), grad_threshold(gt) {} __host__ __device__ bool operator()(const thrust::tuple &t) const { float opacity_logit = thrust::get<0>(t); @@ -454,11 +451,6 @@ struct IdentifyPrune { float max_s = fmaxf(s1, fmaxf(s2, s3)); - // Prune if too anisotropic - float min_s = fminf(s1, fminf(s2, s3)); - if (max_s > max_anisotropy * min_s) - return true; - // Dont prune if split or clone if (grad_uv > grad_threshold && (max_s / 1.6f) <= scale_max_thresh) { return false; @@ -552,7 +544,7 @@ void TrainerImpl::adaptive_density_step() { thrust::device_vector d_prune_mask(num_gaussians); thrust::transform(prune_iter_start, prune_iter_end, d_prune_mask.begin(), - IdentifyPrune(op_threshold, max_scale, config.max_anisotropy, config.uv_grad_threshold)); + IdentifyPrune(op_threshold, max_scale, config.uv_grad_threshold)); int num_to_prune = thrust::count(d_prune_mask.begin(), d_prune_mask.end(), true); @@ -1154,7 +1146,7 @@ void TrainerImpl::train() { const float fov_y = 2 * atan(curr_camera.height / (2 * curr_camera.params[1])); const float tan_half_fov_x = tan(fov_x / 2.0f); - const float tan_half_fov_y = tan(fov_x / 2.0f); + const float tan_half_fov_y = tan(fov_y / 2.0f); const float top = tan_half_fov_y * znear; const float bottom = -top; diff --git a/include/gsplat/utils.hpp b/include/gsplat/utils.hpp index d103c0a..8d0103b 100644 --- a/include/gsplat/utils.hpp +++ b/include/gsplat/utils.hpp @@ -67,7 +67,6 @@ struct ConfigParameters { double delete_opacity_threshold; double uv_grad_threshold; double split_scale_factor; - double max_anisotropy; }; /** diff --git a/src/utils.cpp b/src/utils.cpp index 08cbf46..4cac1e3 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -77,7 +77,6 @@ ConfigParameters parseConfig(const std::string &filename) { params.delete_opacity_threshold = getNodeValue(config, "delete_opacity_threshold"); params.uv_grad_threshold = getNodeValue(config, "uv_grad_threshold"); params.split_scale_factor = getNodeValue(config, "split_scale_factor"); - params.max_anisotropy = getNodeValue(config, "max_anisotropy"); } catch (const YAML::Exception &e) { // Re-throw as a standard exception for the caller to handle. diff --git a/tests/utils_test.cpp b/tests/utils_test.cpp index 8e115f2..2343a76 100644 --- a/tests/utils_test.cpp +++ b/tests/utils_test.cpp @@ -54,8 +54,7 @@ class ConfigTest : public ::testing::Test { << "max_gaussians: 1000000\n" << "delete_opacity_threshold: 0.005\n" << "uv_grad_threshold: 0.0002\n" - << "split_scale_factor: 1.5\n" - << "max_anisotropy: 20.0\n"; + << "split_scale_factor: 1.5\n"; out.close(); // Create a YAML file that is missing a required key.