From 4f42c1fcd662c660c776b5074ef37c9288ca020f Mon Sep 17 00:00:00 2001 From: "carrotmercinary@gmail.com" <167659798+yasen5@users.noreply.github.com> Date: Sun, 12 Apr 2026 21:52:12 +0000 Subject: [PATCH] Builds, should prob make it more configurable tho --- third_party/971apriltag/apriltag.cc | 46 ++++++--------- third_party/971apriltag/apriltag.h | 5 -- third_party/971apriltag/threshold.cc | 88 ++++++++-------------------- third_party/971apriltag/threshold.h | 9 ++- 4 files changed, 46 insertions(+), 102 deletions(-) diff --git a/third_party/971apriltag/apriltag.cc b/third_party/971apriltag/apriltag.cc index 2ea3efef..bc21c103 100644 --- a/third_party/971apriltag/apriltag.cc +++ b/third_party/971apriltag/apriltag.cc @@ -127,7 +127,6 @@ GpuDetector::GpuDetector(size_t width, size_t height, gray_image_host_(width * height), color_image_device_(width * height * 2), gray_image_device_(width * height), - decimated_image_device_(width / 2 * height / 2), thresholded_image_device_(width / 2 * height / 2), union_markers_device_(width / 2 * height / 2), union_markers_size_device_(width / 2 * height / 2), @@ -588,14 +587,12 @@ struct TransformLineFitPoint { int32_t W = 1; - if (ix > 0 && ix + 1 < decimated_width && iy > 0 && - iy + 1 < decimated_height) { - int32_t grad_x = decimated_image_device_[iy * decimated_width + ix + 1] - - decimated_image_device_[iy * decimated_width + ix - 1]; + if (ix > 0 && ix + 1 < width && iy > 0 && iy + 1 < height) { + int32_t grad_x = image_device_[iy * width + ix + 1] - + image_device_[iy * width + ix - 1]; - int32_t grad_y = - decimated_image_device_[(iy + 1) * decimated_width + ix] - - decimated_image_device_[(iy - 1) * decimated_width + ix]; + int32_t grad_y = image_device_[(iy + 1) * width + ix] - + image_device_[(iy - 1) * width + ix]; // XXX Tunable. How to shape the gradient magnitude? W = hypotf(grad_x, grad_y) + 1; @@ -610,14 +607,12 @@ struct TransformLineFitPoint { result.blob_index = p.blob_index(); return result; } - TransformLineFitPoint(const uint8_t* decimated_image_device, - int decimated_width_param, int decimated_height_param) - : decimated_image_device_(decimated_image_device), - decimated_width(decimated_width_param), - decimated_height(decimated_height_param) {} - const uint8_t* decimated_image_device_; - int decimated_width; - int decimated_height; + TransformLineFitPoint(const uint8_t* image_device, int width_param, + int height_param) + : image_device_(image_device), width(width_param), height(height_param) {} + const uint8_t* image_device_; + int width; + int height; }; struct SumLineFitPoints { @@ -690,10 +685,9 @@ absl::Status GpuDetector::Detect(const uint8_t* image, after_image_memcpy_to_device_.Record(&stream_); // Now, threshold on the GPU fully. - threshold_->ThresholdAndDecimate(image_device, decimated_image_device_.get(), - thresholded_image_device_.get(), - tag_detector_->qtp.min_white_black_diff, - &stream_); + threshold_->ThresholdNoDecimate( + image_device, gray_image_device_.get(), thresholded_image_device_.get(), + tag_detector_->qtp.min_white_black_diff, &stream_); after_threshold_.Record(&stream_); @@ -722,9 +716,6 @@ absl::Status GpuDetector::Detect(const uint8_t* image, " and height: " + std::to_string(height_) + " are unusable"); } - size_t decimated_width = width_ / 2; - size_t decimated_height = height_ / 2; - // TODO(austin): Tune for the global shutter camera. // 1280 -> 2 * 128 * 5 // 720 -> 2 * 8 * 5 * 9 @@ -735,8 +726,8 @@ absl::Status GpuDetector::Detect(const uint8_t* image, constexpr size_t kBlockHeight = 16; dim3 threads(kBlockWidth, kBlockHeight, 1); // Overlap 1 on each side in x, and 1 in y. - dim3 blocks((decimated_width + threads.x - 3) / (threads.x - 2), - (decimated_height + threads.y - 2) / (threads.y - 1), 1); + dim3 blocks((width_ + threads.x - 3) / (threads.x - 2), + (height_ + threads.y - 2) / (threads.y - 1), 1); // Make sure we fit in our mask. if (width_ * height_ >= static_cast(1 << 22)) { @@ -748,7 +739,7 @@ absl::Status GpuDetector::Detect(const uint8_t* image, BlobDiff<<>>( thresholded_image_device_.get(), union_markers_device_.get(), union_markers_size_device_.get(), union_marker_pair_device_.get(), - decimated_width, decimated_height); + width_, height_); MaybeCheckAndSynchronize("BlobDiff"); } @@ -963,8 +954,7 @@ absl::Status GpuDetector::Detect(const uint8_t* image, // // Clear the size of non-passing extents and the starting offset of all // extents. - TransformLineFitPoint rewrite(decimated_image_device_.get(), width_ / 2, - height_ / 2); + TransformLineFitPoint rewrite(gray_image_device_.get(), width_, height_); cub::TransformInputIterator input_iterator(sorted_selected_blobs_device_.get(), rewrite); diff --git a/third_party/971apriltag/apriltag.h b/third_party/971apriltag/apriltag.h index 921bef5e..926cf880 100644 --- a/third_party/971apriltag/apriltag.h +++ b/third_party/971apriltag/apriltag.h @@ -108,9 +108,6 @@ class GpuDetector { void CopyGrayTo(uint8_t* output) const { gray_image_device_.MemcpyTo(output); } - void CopyDecimatedTo(uint8_t* output) const { - decimated_image_device_.MemcpyTo(output); - } void CopyThresholdedTo(uint8_t* output) const { thresholded_image_device_.MemcpyTo(output); } @@ -310,8 +307,6 @@ class GpuDetector { GpuMemory color_image_device_; // Full size gray scale image. GpuMemory gray_image_device_; - // Half resolution, gray, decimated image. - UnifiedMemory decimated_image_device_; // Intermediates for thresholding. UnifiedMemory thresholded_image_device_; diff --git a/third_party/971apriltag/threshold.cc b/third_party/971apriltag/threshold.cc index 4a5057d7..4c060526 100644 --- a/third_party/971apriltag/threshold.cc +++ b/third_party/971apriltag/threshold.cc @@ -52,40 +52,6 @@ __global__ void InternalCudaToGreyscale(const uint8_t* color_image, } } -// Writes out the grayscale image and decimated image. -template -__global__ void InternalCudaToGreyscaleAndDecimate( - const uint8_t* color_image, uint8_t* decimated_image, - const apriltag_size_t in_width, const apriltag_size_t in_height) { - constexpr apriltag_size_t kBytesPerPixel = BytesPerPixel(IMAGE_FORMAT); - const apriltag_size_t out_height = in_height / 2; - const apriltag_size_t out_width = in_width / 2; - apriltag_size_t out_i = blockIdx.x * blockDim.x + threadIdx.x; - - while (out_i < out_width * out_height) { - const apriltag_size_t out_row = out_i / out_width; - const apriltag_size_t out_col = out_i - out_width * out_row; - - const u_int32_t in_row = out_row * 2; - const u_int32_t in_col = out_col * 2; - - const apriltag_size_t in_i = in_row * in_width + in_col; - - decimated_image[out_row * out_width + out_col] = - ToGray(color_image + in_i * kBytesPerPixel); - out_i += blockDim.x * gridDim.x; - } - // TODO(austin): Figure out how to load contiguous memory reasonably - // efficiently and max/min over it. - - // TODO(austin): Can we do the threshold here too? That would be less memory - // bandwidth consumed... - // - // could do it by merging this code with InternalBlockMinMax, altering - // the input indexing so it grabs from the undecimated input image. Add - // the grayscale converion code in there as well? -} - // Returns the min and max for a row of 4 pixels. __forceinline__ __device__ uchar2 minmax(uchar4 row) { uint8_t min_val = std::min(std::min(row.x, row.y), std::min(row.z, row.w)); @@ -106,7 +72,7 @@ __forceinline__ __device__ apriltag_size_t XYToIndex(apriltag_size_t width, } // Computes the min and max pixel value for each block of 4 pixels. -__global__ void InternalBlockMinMax(const uint8_t* decimated_image, +__global__ void InternalBlockMinMax(const uint8_t* image, uchar2* unfiltered_minmax_image, const apriltag_size_t width, const apriltag_size_t height) { @@ -119,10 +85,10 @@ __global__ void InternalBlockMinMax(const uint8_t* decimated_image, } for (int i = 0; i < 4; ++i) { - const uchar4 decimated_block = *reinterpret_cast( - decimated_image + XYToIndex(width * 4, x * 4, y * 4 + i)); + const uchar4 block = *reinterpret_cast( + image + XYToIndex(width * 4, x * 4, y * 4 + i)); - vals[i] = minmax(decimated_block); + vals[i] = minmax(block); } unfiltered_minmax_image[XYToIndex(width, x, y)] = @@ -169,7 +135,7 @@ __global__ void InternalBlockFilter(const uchar2* unfiltered_minmax_image, } // Thresholds the image based on the filtered thresholds. -__global__ void InternalThreshold(const uint8_t* decimated_image, +__global__ void InternalThreshold(const uint8_t* image, const uchar2* minmax_image, uint8_t* thresholded_image, const apriltag_size_t width, @@ -187,7 +153,7 @@ __global__ void InternalThreshold(const uint8_t* decimated_image, result = 127; } else { uint8_t thresh = minmax_val.x + (minmax_val.y - minmax_val.x) / 2; - if (decimated_image[i] > thresh) { + if (image[i] > thresh) { result = 255; } else { result = 0; @@ -214,11 +180,10 @@ class TypedThreshold : public Threshold { // Converts to grayscale, decimates, and thresholds an image on the provided // stream. - void ThresholdAndDecimate(const uint8_t* color_image, - uint8_t* decimated_image, - uint8_t* thresholded_image, - apriltag_size_t min_white_black_diff, - CudaStream* stream) override; + void ThresholdNoDecimate(const uint8_t* color_image, uint8_t* image, + uint8_t* thresholded_image, + apriltag_size_t min_white_black_diff, + CudaStream* stream) override; virtual ~TypedThreshold() = default; @@ -246,25 +211,23 @@ void TypedThreshold::ToGreyscale(const uint8_t* color_image, } template -void TypedThreshold::ThresholdAndDecimate( - const uint8_t* color_image, uint8_t* decimated_image, - uint8_t* thresholded_image, apriltag_size_t min_white_black_diff, - CudaStream* stream) { +void TypedThreshold::ThresholdNoDecimate( + const uint8_t* color_image, uint8_t* gray_image, uint8_t* thresholded_image, + apriltag_size_t min_white_black_diff, CudaStream* stream) { uint8_t* unfiltered_minmax_image = unfiltered_minmax_image_device_.get(); uint8_t* minmax_image = minmax_image_device_.get(); CHECK((width_ % 8) == 0); CHECK((height_ % 8) == 0); constexpr size_t kThreads = 256; - const apriltag_size_t decimated_width = width_ / 2; - const apriltag_size_t decimated_height = height_ / 2; + const apriltag_size_t width = width_; + const apriltag_size_t height = height_; { // Step one, convert to gray and decimate. - const size_t kBlocks = - (decimated_width * decimated_height + kThreads - 1) / kThreads / 4; - InternalCudaToGreyscaleAndDecimate - <<get()>>>(color_image, decimated_image, + const size_t kBlocks = (width * height + kThreads - 1) / kThreads / 4; + InternalCudaToGreyscale + <<get()>>>(color_image, gray_image, width_, height_); MaybeCheckAndSynchronize(); } @@ -272,20 +235,18 @@ void TypedThreshold::ThresholdAndDecimate( { // Step 2, compute a min/max for each block of 4x4 (16) pixels. const dim3 threads(16, 16, 1); - const dim3 blocks((decimated_width / 4 + 15) / 16, - (decimated_height / 4 + 15) / 16, 1); + const dim3 blocks((width / 4 + 15) / 16, (height / 4 + 15) / 16, 1); InternalBlockMinMax<<get()>>>( - decimated_image, reinterpret_cast(unfiltered_minmax_image), - decimated_width / 4, decimated_height / 4); + gray_image, reinterpret_cast(unfiltered_minmax_image), + width / 4, height / 4); MaybeCheckAndSynchronize(); // Step 3, Blur those min/max's a further +- 1 block in each direction using // min/max. InternalBlockFilter<<get()>>>( reinterpret_cast(unfiltered_minmax_image), - reinterpret_cast(minmax_image), decimated_width / 4, - decimated_height / 4); + reinterpret_cast(minmax_image), width / 4, height / 4); MaybeCheckAndSynchronize(); } @@ -295,9 +256,8 @@ void TypedThreshold::ThresholdAndDecimate( const apriltag_size_t kBlocks = (width_ * height_ / 4 + kThreads - 1) / kThreads / 4; InternalThreshold<<get()>>>( - decimated_image, reinterpret_cast(minmax_image), - thresholded_image, decimated_width, decimated_height, - min_white_black_diff); + gray_image, reinterpret_cast(minmax_image), thresholded_image, + width, height, min_white_black_diff); MaybeCheckAndSynchronize(); } diff --git a/third_party/971apriltag/threshold.h b/third_party/971apriltag/threshold.h index cf167abe..c0115f5b 100644 --- a/third_party/971apriltag/threshold.h +++ b/third_party/971apriltag/threshold.h @@ -52,11 +52,10 @@ class Threshold { // Converts to grayscale, decimates, and thresholds an image on the provided // stream. - virtual void ThresholdAndDecimate(const uint8_t* color_image, - uint8_t* decimated_image, - uint8_t* thresholded_image, - apriltag_size_t min_white_black_diff, - CudaStream* stream) = 0; + virtual void ThresholdNoDecimate(const uint8_t* color_image, uint8_t* image, + uint8_t* thresholded_image, + apriltag_size_t min_white_black_diff, + CudaStream* stream) = 0; virtual ~Threshold() = default; };