Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 18 additions & 28 deletions third_party/971apriltag/apriltag.cc
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ GpuDetector::GpuDetector(size_t width, size_t height,
gray_image_host_(width * height),
color_image_device_(width * height * 2),
gray_image_device_(width * height),
decimated_image_device_(width / 2 * height / 2),
thresholded_image_device_(width / 2 * height / 2),
union_markers_device_(width / 2 * height / 2),
union_markers_size_device_(width / 2 * height / 2),
Expand Down Expand Up @@ -588,14 +587,12 @@ struct TransformLineFitPoint {

int32_t W = 1;

if (ix > 0 && ix + 1 < decimated_width && iy > 0 &&
iy + 1 < decimated_height) {
int32_t grad_x = decimated_image_device_[iy * decimated_width + ix + 1] -
decimated_image_device_[iy * decimated_width + ix - 1];
if (ix > 0 && ix + 1 < width && iy > 0 && iy + 1 < height) {
int32_t grad_x = image_device_[iy * width + ix + 1] -
image_device_[iy * width + ix - 1];

int32_t grad_y =
decimated_image_device_[(iy + 1) * decimated_width + ix] -
decimated_image_device_[(iy - 1) * decimated_width + ix];
int32_t grad_y = image_device_[(iy + 1) * width + ix] -
image_device_[(iy - 1) * width + ix];

// XXX Tunable. How to shape the gradient magnitude?
W = hypotf(grad_x, grad_y) + 1;
Expand All @@ -610,14 +607,12 @@ struct TransformLineFitPoint {
result.blob_index = p.blob_index();
return result;
}
TransformLineFitPoint(const uint8_t* decimated_image_device,
int decimated_width_param, int decimated_height_param)
: decimated_image_device_(decimated_image_device),
decimated_width(decimated_width_param),
decimated_height(decimated_height_param) {}
const uint8_t* decimated_image_device_;
int decimated_width;
int decimated_height;
TransformLineFitPoint(const uint8_t* image_device, int width_param,
int height_param)
: image_device_(image_device), width(width_param), height(height_param) {}
const uint8_t* image_device_;
int width;
int height;
};

struct SumLineFitPoints {
Expand Down Expand Up @@ -690,10 +685,9 @@ absl::Status GpuDetector::Detect(const uint8_t* image,
after_image_memcpy_to_device_.Record(&stream_);

// Now, threshold on the GPU fully.
threshold_->ThresholdAndDecimate(image_device, decimated_image_device_.get(),
thresholded_image_device_.get(),
tag_detector_->qtp.min_white_black_diff,
&stream_);
threshold_->ThresholdNoDecimate(
image_device, gray_image_device_.get(), thresholded_image_device_.get(),
tag_detector_->qtp.min_white_black_diff, &stream_);

after_threshold_.Record(&stream_);

Expand Down Expand Up @@ -722,9 +716,6 @@ absl::Status GpuDetector::Detect(const uint8_t* image,
" and height: " + std::to_string(height_) + " are unusable");
}

size_t decimated_width = width_ / 2;
size_t decimated_height = height_ / 2;

// TODO(austin): Tune for the global shutter camera.
// 1280 -> 2 * 128 * 5
// 720 -> 2 * 8 * 5 * 9
Expand All @@ -735,8 +726,8 @@ absl::Status GpuDetector::Detect(const uint8_t* image,
constexpr size_t kBlockHeight = 16;
dim3 threads(kBlockWidth, kBlockHeight, 1);
// Overlap 1 on each side in x, and 1 in y.
dim3 blocks((decimated_width + threads.x - 3) / (threads.x - 2),
(decimated_height + threads.y - 2) / (threads.y - 1), 1);
dim3 blocks((width_ + threads.x - 3) / (threads.x - 2),
(height_ + threads.y - 2) / (threads.y - 1), 1);

// Make sure we fit in our mask.
if (width_ * height_ >= static_cast<size_t>(1 << 22)) {
Expand All @@ -748,7 +739,7 @@ absl::Status GpuDetector::Detect(const uint8_t* image,
BlobDiff<kBlockWidth, kBlockHeight><<<blocks, threads, 0, stream_.get()>>>(
thresholded_image_device_.get(), union_markers_device_.get(),
union_markers_size_device_.get(), union_marker_pair_device_.get(),
decimated_width, decimated_height);
width_, height_);
MaybeCheckAndSynchronize("BlobDiff");
}

Expand Down Expand Up @@ -963,8 +954,7 @@ absl::Status GpuDetector::Detect(const uint8_t* image,
//
// Clear the size of non-passing extents and the starting offset of all
// extents.
TransformLineFitPoint rewrite(decimated_image_device_.get(), width_ / 2,
height_ / 2);
TransformLineFitPoint rewrite(gray_image_device_.get(), width_, height_);
cub::TransformInputIterator<LineFitPoint, TransformLineFitPoint,
IndexPoint*>
input_iterator(sorted_selected_blobs_device_.get(), rewrite);
Expand Down
5 changes: 0 additions & 5 deletions third_party/971apriltag/apriltag.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,6 @@ class GpuDetector {
void CopyGrayTo(uint8_t* output) const {
gray_image_device_.MemcpyTo(output);
}
void CopyDecimatedTo(uint8_t* output) const {
decimated_image_device_.MemcpyTo(output);
}
void CopyThresholdedTo(uint8_t* output) const {
thresholded_image_device_.MemcpyTo(output);
}
Expand Down Expand Up @@ -310,8 +307,6 @@ class GpuDetector {
GpuMemory<uint8_t> color_image_device_;
// Full size gray scale image.
GpuMemory<uint8_t> gray_image_device_;
// Half resolution, gray, decimated image.
UnifiedMemory<uint8_t> decimated_image_device_;
// Intermediates for thresholding.
UnifiedMemory<uint8_t> thresholded_image_device_;

Expand Down
88 changes: 24 additions & 64 deletions third_party/971apriltag/threshold.cc
Original file line number Diff line number Diff line change
Expand Up @@ -52,40 +52,6 @@ __global__ void InternalCudaToGreyscale(const uint8_t* color_image,
}
}

// Writes out the grayscale image and decimated image.
template <vision::ImageFormat IMAGE_FORMAT>
__global__ void InternalCudaToGreyscaleAndDecimate(
const uint8_t* color_image, uint8_t* decimated_image,
const apriltag_size_t in_width, const apriltag_size_t in_height) {
constexpr apriltag_size_t kBytesPerPixel = BytesPerPixel(IMAGE_FORMAT);
const apriltag_size_t out_height = in_height / 2;
const apriltag_size_t out_width = in_width / 2;
apriltag_size_t out_i = blockIdx.x * blockDim.x + threadIdx.x;

while (out_i < out_width * out_height) {
const apriltag_size_t out_row = out_i / out_width;
const apriltag_size_t out_col = out_i - out_width * out_row;

const u_int32_t in_row = out_row * 2;
const u_int32_t in_col = out_col * 2;

const apriltag_size_t in_i = in_row * in_width + in_col;

decimated_image[out_row * out_width + out_col] =
ToGray<IMAGE_FORMAT>(color_image + in_i * kBytesPerPixel);
out_i += blockDim.x * gridDim.x;
}
// TODO(austin): Figure out how to load contiguous memory reasonably
// efficiently and max/min over it.

// TODO(austin): Can we do the threshold here too? That would be less memory
// bandwidth consumed...
//
// could do it by merging this code with InternalBlockMinMax, altering
// the input indexing so it grabs from the undecimated input image. Add
// the grayscale converion code in there as well?
}

// Returns the min and max for a row of 4 pixels.
__forceinline__ __device__ uchar2 minmax(uchar4 row) {
uint8_t min_val = std::min(std::min(row.x, row.y), std::min(row.z, row.w));
Expand All @@ -106,7 +72,7 @@ __forceinline__ __device__ apriltag_size_t XYToIndex(apriltag_size_t width,
}

// Computes the min and max pixel value for each block of 4 pixels.
__global__ void InternalBlockMinMax(const uint8_t* decimated_image,
__global__ void InternalBlockMinMax(const uint8_t* image,
uchar2* unfiltered_minmax_image,
const apriltag_size_t width,
const apriltag_size_t height) {
Expand All @@ -119,10 +85,10 @@ __global__ void InternalBlockMinMax(const uint8_t* decimated_image,
}

for (int i = 0; i < 4; ++i) {
const uchar4 decimated_block = *reinterpret_cast<const uchar4*>(
decimated_image + XYToIndex(width * 4, x * 4, y * 4 + i));
const uchar4 block = *reinterpret_cast<const uchar4*>(
image + XYToIndex(width * 4, x * 4, y * 4 + i));

vals[i] = minmax(decimated_block);
vals[i] = minmax(block);
}

unfiltered_minmax_image[XYToIndex(width, x, y)] =
Expand Down Expand Up @@ -169,7 +135,7 @@ __global__ void InternalBlockFilter(const uchar2* unfiltered_minmax_image,
}

// Thresholds the image based on the filtered thresholds.
__global__ void InternalThreshold(const uint8_t* decimated_image,
__global__ void InternalThreshold(const uint8_t* image,
const uchar2* minmax_image,
uint8_t* thresholded_image,
const apriltag_size_t width,
Expand All @@ -187,7 +153,7 @@ __global__ void InternalThreshold(const uint8_t* decimated_image,
result = 127;
} else {
uint8_t thresh = minmax_val.x + (minmax_val.y - minmax_val.x) / 2;
if (decimated_image[i] > thresh) {
if (image[i] > thresh) {
result = 255;
} else {
result = 0;
Expand All @@ -214,11 +180,10 @@ class TypedThreshold : public Threshold {

// Converts to grayscale, decimates, and thresholds an image on the provided
// stream.
void ThresholdAndDecimate(const uint8_t* color_image,
uint8_t* decimated_image,
uint8_t* thresholded_image,
apriltag_size_t min_white_black_diff,
CudaStream* stream) override;
void ThresholdNoDecimate(const uint8_t* color_image, uint8_t* image,
uint8_t* thresholded_image,
apriltag_size_t min_white_black_diff,
CudaStream* stream) override;

virtual ~TypedThreshold() = default;

Expand Down Expand Up @@ -246,46 +211,42 @@ void TypedThreshold<IMAGE_FORMAT>::ToGreyscale(const uint8_t* color_image,
}

template <vision::ImageFormat IMAGE_FORMAT>
void TypedThreshold<IMAGE_FORMAT>::ThresholdAndDecimate(
const uint8_t* color_image, uint8_t* decimated_image,
uint8_t* thresholded_image, apriltag_size_t min_white_black_diff,
CudaStream* stream) {
void TypedThreshold<IMAGE_FORMAT>::ThresholdNoDecimate(
const uint8_t* color_image, uint8_t* gray_image, uint8_t* thresholded_image,
apriltag_size_t min_white_black_diff, CudaStream* stream) {
uint8_t* unfiltered_minmax_image = unfiltered_minmax_image_device_.get();
uint8_t* minmax_image = minmax_image_device_.get();

CHECK((width_ % 8) == 0);
CHECK((height_ % 8) == 0);
constexpr size_t kThreads = 256;
const apriltag_size_t decimated_width = width_ / 2;
const apriltag_size_t decimated_height = height_ / 2;
const apriltag_size_t width = width_;
const apriltag_size_t height = height_;

{
// Step one, convert to gray and decimate.
const size_t kBlocks =
(decimated_width * decimated_height + kThreads - 1) / kThreads / 4;
InternalCudaToGreyscaleAndDecimate<IMAGE_FORMAT>
<<<kBlocks, kThreads, 0, stream->get()>>>(color_image, decimated_image,
const size_t kBlocks = (width * height + kThreads - 1) / kThreads / 4;
InternalCudaToGreyscale<IMAGE_FORMAT>
<<<kBlocks, kThreads, 0, stream->get()>>>(color_image, gray_image,
width_, height_);
MaybeCheckAndSynchronize();
}

{
// Step 2, compute a min/max for each block of 4x4 (16) pixels.
const dim3 threads(16, 16, 1);
const dim3 blocks((decimated_width / 4 + 15) / 16,
(decimated_height / 4 + 15) / 16, 1);
const dim3 blocks((width / 4 + 15) / 16, (height / 4 + 15) / 16, 1);

InternalBlockMinMax<<<blocks, threads, 0, stream->get()>>>(
decimated_image, reinterpret_cast<uchar2*>(unfiltered_minmax_image),
decimated_width / 4, decimated_height / 4);
gray_image, reinterpret_cast<uchar2*>(unfiltered_minmax_image),
width / 4, height / 4);
MaybeCheckAndSynchronize();

// Step 3, Blur those min/max's a further +- 1 block in each direction using
// min/max.
InternalBlockFilter<<<blocks, threads, 0, stream->get()>>>(
reinterpret_cast<uchar2*>(unfiltered_minmax_image),
reinterpret_cast<uchar2*>(minmax_image), decimated_width / 4,
decimated_height / 4);
reinterpret_cast<uchar2*>(minmax_image), width / 4, height / 4);
MaybeCheckAndSynchronize();
}

Expand All @@ -295,9 +256,8 @@ void TypedThreshold<IMAGE_FORMAT>::ThresholdAndDecimate(
const apriltag_size_t kBlocks =
(width_ * height_ / 4 + kThreads - 1) / kThreads / 4;
InternalThreshold<<<kBlocks, kThreads, 0, stream->get()>>>(
decimated_image, reinterpret_cast<uchar2*>(minmax_image),
thresholded_image, decimated_width, decimated_height,
min_white_black_diff);
gray_image, reinterpret_cast<uchar2*>(minmax_image), thresholded_image,
width, height, min_white_black_diff);
MaybeCheckAndSynchronize();
}

Expand Down
9 changes: 4 additions & 5 deletions third_party/971apriltag/threshold.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,10 @@ class Threshold {

// Converts to grayscale, decimates, and thresholds an image on the provided
// stream.
virtual void ThresholdAndDecimate(const uint8_t* color_image,
uint8_t* decimated_image,
uint8_t* thresholded_image,
apriltag_size_t min_white_black_diff,
CudaStream* stream) = 0;
virtual void ThresholdNoDecimate(const uint8_t* color_image, uint8_t* image,
uint8_t* thresholded_image,
apriltag_size_t min_white_black_diff,
CudaStream* stream) = 0;

virtual ~Threshold() = default;
};
Expand Down
Loading