frc971 · yasen5 · Apr 12, 2026
diff --git a/third_party/971apriltag/apriltag.cc b/third_party/971apriltag/apriltag.cc
@@ -127,7 +127,6 @@ GpuDetector::GpuDetector(size_t width, size_t height,
       gray_image_host_(width * height),
       color_image_device_(width * height * 2),
       gray_image_device_(width * height),
-      decimated_image_device_(width / 2 * height / 2),
       thresholded_image_device_(width / 2 * height / 2),
       union_markers_device_(width / 2 * height / 2),
       union_markers_size_device_(width / 2 * height / 2),
@@ -588,14 +587,12 @@ struct TransformLineFitPoint {
 
     int32_t W = 1;
 
-    if (ix > 0 && ix + 1 < decimated_width && iy > 0 &&
-        iy + 1 < decimated_height) {
-      int32_t grad_x = decimated_image_device_[iy * decimated_width + ix + 1] -
-                       decimated_image_device_[iy * decimated_width + ix - 1];
+    if (ix > 0 && ix + 1 < width && iy > 0 && iy + 1 < height) {
+      int32_t grad_x = image_device_[iy * width + ix + 1] -
+                       image_device_[iy * width + ix - 1];
 
-      int32_t grad_y =
-          decimated_image_device_[(iy + 1) * decimated_width + ix] -
-          decimated_image_device_[(iy - 1) * decimated_width + ix];
+      int32_t grad_y = image_device_[(iy + 1) * width + ix] -
+                       image_device_[(iy - 1) * width + ix];
 
       // XXX Tunable. How to shape the gradient magnitude?
       W = hypotf(grad_x, grad_y) + 1;
@@ -610,14 +607,12 @@ struct TransformLineFitPoint {
     result.blob_index = p.blob_index();
     return result;
   }
-  TransformLineFitPoint(const uint8_t* decimated_image_device,
-                        int decimated_width_param, int decimated_height_param)
-      : decimated_image_device_(decimated_image_device),
-        decimated_width(decimated_width_param),
-        decimated_height(decimated_height_param) {}
-  const uint8_t* decimated_image_device_;
-  int decimated_width;
-  int decimated_height;
+  TransformLineFitPoint(const uint8_t* image_device, int width_param,
+                        int height_param)
+      : image_device_(image_device), width(width_param), height(height_param) {}
+  const uint8_t* image_device_;
+  int width;
+  int height;
 };
 
 struct SumLineFitPoints {
@@ -690,10 +685,9 @@ absl::Status GpuDetector::Detect(const uint8_t* image,
   after_image_memcpy_to_device_.Record(&stream_);
 
   // Now, threshold on the GPU fully.
-  threshold_->ThresholdAndDecimate(image_device, decimated_image_device_.get(),
-                                   thresholded_image_device_.get(),
-                                   tag_detector_->qtp.min_white_black_diff,
-                                   &stream_);
+  threshold_->ThresholdNoDecimate(
+      image_device, gray_image_device_.get(), thresholded_image_device_.get(),
+      tag_detector_->qtp.min_white_black_diff, &stream_);
 
   after_threshold_.Record(&stream_);
 
@@ -722,9 +716,6 @@ absl::Status GpuDetector::Detect(const uint8_t* image,
         " and height: " + std::to_string(height_) + " are unusable");
   }
 
-  size_t decimated_width = width_ / 2;
-  size_t decimated_height = height_ / 2;
-
   // TODO(austin): Tune for the global shutter camera.
   // 1280 -> 2 * 128 * 5
   // 720 -> 2 * 8 * 5 * 9
@@ -735,8 +726,8 @@ absl::Status GpuDetector::Detect(const uint8_t* image,
     constexpr size_t kBlockHeight = 16;
     dim3 threads(kBlockWidth, kBlockHeight, 1);
     // Overlap 1 on each side in x, and 1 in y.
-    dim3 blocks((decimated_width + threads.x - 3) / (threads.x - 2),
-                (decimated_height + threads.y - 2) / (threads.y - 1), 1);
+    dim3 blocks((width_ + threads.x - 3) / (threads.x - 2),
+                (height_ + threads.y - 2) / (threads.y - 1), 1);
 
     //  Make sure we fit in our mask.
     if (width_ * height_ >= static_cast<size_t>(1 << 22)) {
@@ -748,7 +739,7 @@ absl::Status GpuDetector::Detect(const uint8_t* image,
     BlobDiff<kBlockWidth, kBlockHeight><<<blocks, threads, 0, stream_.get()>>>(
         thresholded_image_device_.get(), union_markers_device_.get(),
         union_markers_size_device_.get(), union_marker_pair_device_.get(),
-        decimated_width, decimated_height);
+        width_, height_);
     MaybeCheckAndSynchronize("BlobDiff");
   }
 
@@ -963,8 +954,7 @@ absl::Status GpuDetector::Detect(const uint8_t* image,
     //
     // Clear the size of non-passing extents and the starting offset of all
     // extents.
-    TransformLineFitPoint rewrite(decimated_image_device_.get(), width_ / 2,
-                                  height_ / 2);
+    TransformLineFitPoint rewrite(gray_image_device_.get(), width_, height_);
     cub::TransformInputIterator<LineFitPoint, TransformLineFitPoint,
                                 IndexPoint*>
         input_iterator(sorted_selected_blobs_device_.get(), rewrite);

diff --git a/third_party/971apriltag/apriltag.h b/third_party/971apriltag/apriltag.h
@@ -108,9 +108,6 @@ class GpuDetector {
   void CopyGrayTo(uint8_t* output) const {
     gray_image_device_.MemcpyTo(output);
   }
-  void CopyDecimatedTo(uint8_t* output) const {
-    decimated_image_device_.MemcpyTo(output);
-  }
   void CopyThresholdedTo(uint8_t* output) const {
     thresholded_image_device_.MemcpyTo(output);
   }
@@ -310,8 +307,6 @@ class GpuDetector {
   GpuMemory<uint8_t> color_image_device_;
   // Full size gray scale image.
   GpuMemory<uint8_t> gray_image_device_;
-  // Half resolution, gray, decimated image.
-  UnifiedMemory<uint8_t> decimated_image_device_;
   // Intermediates for thresholding.
   UnifiedMemory<uint8_t> thresholded_image_device_;
 

diff --git a/third_party/971apriltag/threshold.cc b/third_party/971apriltag/threshold.cc
@@ -52,40 +52,6 @@ __global__ void InternalCudaToGreyscale(const uint8_t* color_image,
   }
 }
 
-// Writes out the grayscale image and decimated image.
-template <vision::ImageFormat IMAGE_FORMAT>
-__global__ void InternalCudaToGreyscaleAndDecimate(
-    const uint8_t* color_image, uint8_t* decimated_image,
-    const apriltag_size_t in_width, const apriltag_size_t in_height) {
-  constexpr apriltag_size_t kBytesPerPixel = BytesPerPixel(IMAGE_FORMAT);
-  const apriltag_size_t out_height = in_height / 2;
-  const apriltag_size_t out_width = in_width / 2;
-  apriltag_size_t out_i = blockIdx.x * blockDim.x + threadIdx.x;
-
-  while (out_i < out_width * out_height) {
-    const apriltag_size_t out_row = out_i / out_width;
-    const apriltag_size_t out_col = out_i - out_width * out_row;
-
-    const u_int32_t in_row = out_row * 2;
-    const u_int32_t in_col = out_col * 2;
-
-    const apriltag_size_t in_i = in_row * in_width + in_col;
-
-    decimated_image[out_row * out_width + out_col] =
-        ToGray<IMAGE_FORMAT>(color_image + in_i * kBytesPerPixel);
-    out_i += blockDim.x * gridDim.x;
-  }
-  // TODO(austin): Figure out how to load contiguous memory reasonably
-  // efficiently and max/min over it.
-
-  // TODO(austin): Can we do the threshold here too?  That would be less memory
-  // bandwidth consumed...
-  //
-  //   could do it by merging this code with InternalBlockMinMax, altering
-  //   the input indexing so it grabs from the undecimated input image.  Add
-  //   the grayscale converion code in there as well?
-}
-
 // Returns the min and max for a row of 4 pixels.
 __forceinline__ __device__ uchar2 minmax(uchar4 row) {
   uint8_t min_val = std::min(std::min(row.x, row.y), std::min(row.z, row.w));
@@ -106,7 +72,7 @@ __forceinline__ __device__ apriltag_size_t XYToIndex(apriltag_size_t width,
 }
 
 // Computes the min and max pixel value for each block of 4 pixels.
-__global__ void InternalBlockMinMax(const uint8_t* decimated_image,
+__global__ void InternalBlockMinMax(const uint8_t* image,
                                     uchar2* unfiltered_minmax_image,
                                     const apriltag_size_t width,
                                     const apriltag_size_t height) {
@@ -119,10 +85,10 @@ __global__ void InternalBlockMinMax(const uint8_t* decimated_image,
   }
 
   for (int i = 0; i < 4; ++i) {
-    const uchar4 decimated_block = *reinterpret_cast<const uchar4*>(
-        decimated_image + XYToIndex(width * 4, x * 4, y * 4 + i));
+    const uchar4 block = *reinterpret_cast<const uchar4*>(
+        image + XYToIndex(width * 4, x * 4, y * 4 + i));
 
-    vals[i] = minmax(decimated_block);
+    vals[i] = minmax(block);
   }
 
   unfiltered_minmax_image[XYToIndex(width, x, y)] =
@@ -169,7 +135,7 @@ __global__ void InternalBlockFilter(const uchar2* unfiltered_minmax_image,
 }
 
 // Thresholds the image based on the filtered thresholds.
-__global__ void InternalThreshold(const uint8_t* decimated_image,
+__global__ void InternalThreshold(const uint8_t* image,
                                   const uchar2* minmax_image,
                                   uint8_t* thresholded_image,
                                   const apriltag_size_t width,
@@ -187,7 +153,7 @@ __global__ void InternalThreshold(const uint8_t* decimated_image,
       result = 127;
     } else {
       uint8_t thresh = minmax_val.x + (minmax_val.y - minmax_val.x) / 2;
-      if (decimated_image[i] > thresh) {
+      if (image[i] > thresh) {
         result = 255;
       } else {
         result = 0;
@@ -214,11 +180,10 @@ class TypedThreshold : public Threshold {
 
   // Converts to grayscale, decimates, and thresholds an image on the provided
   // stream.
-  void ThresholdAndDecimate(const uint8_t* color_image,
-                            uint8_t* decimated_image,
-                            uint8_t* thresholded_image,
-                            apriltag_size_t min_white_black_diff,
-                            CudaStream* stream) override;
+  void ThresholdNoDecimate(const uint8_t* color_image, uint8_t* image,
+                           uint8_t* thresholded_image,
+                           apriltag_size_t min_white_black_diff,
+                           CudaStream* stream) override;
 
   virtual ~TypedThreshold() = default;
 
@@ -246,46 +211,42 @@ void TypedThreshold<IMAGE_FORMAT>::ToGreyscale(const uint8_t* color_image,
 }
 
 template <vision::ImageFormat IMAGE_FORMAT>
-void TypedThreshold<IMAGE_FORMAT>::ThresholdAndDecimate(
-    const uint8_t* color_image, uint8_t* decimated_image,
-    uint8_t* thresholded_image, apriltag_size_t min_white_black_diff,
-    CudaStream* stream) {
+void TypedThreshold<IMAGE_FORMAT>::ThresholdNoDecimate(
+    const uint8_t* color_image, uint8_t* gray_image, uint8_t* thresholded_image,
+    apriltag_size_t min_white_black_diff, CudaStream* stream) {
   uint8_t* unfiltered_minmax_image = unfiltered_minmax_image_device_.get();
   uint8_t* minmax_image = minmax_image_device_.get();
 
   CHECK((width_ % 8) == 0);
   CHECK((height_ % 8) == 0);
   constexpr size_t kThreads = 256;
-  const apriltag_size_t decimated_width = width_ / 2;
-  const apriltag_size_t decimated_height = height_ / 2;
+  const apriltag_size_t width = width_;
+  const apriltag_size_t height = height_;
 
   {
     // Step one, convert to gray and decimate.
-    const size_t kBlocks =
-        (decimated_width * decimated_height + kThreads - 1) / kThreads / 4;
-    InternalCudaToGreyscaleAndDecimate<IMAGE_FORMAT>
-        <<<kBlocks, kThreads, 0, stream->get()>>>(color_image, decimated_image,
+    const size_t kBlocks = (width * height + kThreads - 1) / kThreads / 4;
+    InternalCudaToGreyscale<IMAGE_FORMAT>
+        <<<kBlocks, kThreads, 0, stream->get()>>>(color_image, gray_image,
                                                   width_, height_);
     MaybeCheckAndSynchronize();
   }
 
   {
     // Step 2, compute a min/max for each block of 4x4 (16) pixels.
     const dim3 threads(16, 16, 1);
-    const dim3 blocks((decimated_width / 4 + 15) / 16,
-                      (decimated_height / 4 + 15) / 16, 1);
+    const dim3 blocks((width / 4 + 15) / 16, (height / 4 + 15) / 16, 1);
 
     InternalBlockMinMax<<<blocks, threads, 0, stream->get()>>>(
-        decimated_image, reinterpret_cast<uchar2*>(unfiltered_minmax_image),
-        decimated_width / 4, decimated_height / 4);
+        gray_image, reinterpret_cast<uchar2*>(unfiltered_minmax_image),
+        width / 4, height / 4);
     MaybeCheckAndSynchronize();
 
     // Step 3, Blur those min/max's a further +- 1 block in each direction using
     // min/max.
     InternalBlockFilter<<<blocks, threads, 0, stream->get()>>>(
         reinterpret_cast<uchar2*>(unfiltered_minmax_image),
-        reinterpret_cast<uchar2*>(minmax_image), decimated_width / 4,
-        decimated_height / 4);
+        reinterpret_cast<uchar2*>(minmax_image), width / 4, height / 4);
     MaybeCheckAndSynchronize();
   }
 
@@ -295,9 +256,8 @@ void TypedThreshold<IMAGE_FORMAT>::ThresholdAndDecimate(
     const apriltag_size_t kBlocks =
         (width_ * height_ / 4 + kThreads - 1) / kThreads / 4;
     InternalThreshold<<<kBlocks, kThreads, 0, stream->get()>>>(
-        decimated_image, reinterpret_cast<uchar2*>(minmax_image),
-        thresholded_image, decimated_width, decimated_height,
-        min_white_black_diff);
+        gray_image, reinterpret_cast<uchar2*>(minmax_image), thresholded_image,
+        width, height, min_white_black_diff);
     MaybeCheckAndSynchronize();
   }
 

diff --git a/third_party/971apriltag/threshold.h b/third_party/971apriltag/threshold.h
@@ -52,11 +52,10 @@ class Threshold {
 
   // Converts to grayscale, decimates, and thresholds an image on the provided
   // stream.
-  virtual void ThresholdAndDecimate(const uint8_t* color_image,
-                                    uint8_t* decimated_image,
-                                    uint8_t* thresholded_image,
-                                    apriltag_size_t min_white_black_diff,
-                                    CudaStream* stream) = 0;
+  virtual void ThresholdNoDecimate(const uint8_t* color_image, uint8_t* image,
+                                   uint8_t* thresholded_image,
+                                   apriltag_size_t min_white_black_diff,
+                                   CudaStream* stream) = 0;
 
   virtual ~Threshold() = default;
 };