How to achieve zero copy on Tensor or Image(nvcv)? #181

zoulee24 · 2024-08-02T07:38:42Z

zoulee24
Aug 2, 2024

How to achieve zero copy on Tensor or Image(nvcv)?

i use ffmpeg cuda hwaccle, get raw cuda ptr, can not find any way to activate zero copy to create a new nvcv::Image?

kgnandanwar · 2026-01-19T23:58:31Z

kgnandanwar
Jan 19, 2026

Zero Copy on Tensor or Image (nvcv) with ffmpeg CUDA hwaccel

Use nvcvImageWrapDataConstruct() - it wraps your existing CUDA pointer directly. No copy, just sets up the metadata.

API references in CV-CUDA source:

src/nvcv/src/include/nvcv/Image.h - nvcvImageWrapDataConstruct function
src/nvcv/src/include/nvcv/ImageData.h - NVCVImageData structure definitions
samples/datatypes/image.py - Python example using cvcuda.as_image() for zero-copy wrapping

Working example (tested with CUDA 12.6)

zero_copy_example.cu:

#include <cuda_runtime.h>
#include <iostream>
#include <stdexcept>
#include <cstring>

// In production, include <nvcv/Image.h>
// This example uses simplified structures for demonstration

typedef unsigned char NVCVByte;
typedef int32_t NVCVImageFormat;
#define NVCV_IMAGE_FORMAT_RGB8 100
#define NVCV_MAX_PLANE_COUNT 6

typedef enum {
    NVCV_IMAGE_BUFFER_NONE = 0,
    NVCV_IMAGE_BUFFER_STRIDED_CUDA,
    NVCV_IMAGE_BUFFER_STRIDED_HOST,
} NVCVImageBufferType;

typedef struct {
    int32_t width;
    int32_t height;
    int32_t rowStride;
    NVCVByte *basePtr;
} NVCVImagePlaneStrided;

typedef struct {
    int32_t numPlanes;
    NVCVImagePlaneStrided planes[NVCV_MAX_PLANE_COUNT];
} NVCVImageBufferStrided;

typedef union {
    NVCVImageBufferStrided strided;
} NVCVImageBuffer;

typedef struct {
    NVCVImageFormat format;
    NVCVImageBufferType bufferType;
    NVCVImageBuffer buffer;
} NVCVImageData;

// Simulated ffmpeg CUDA frame structure
struct FFmpegCudaFrame {
    void* devicePtr;
    int width;
    int height;
    int linesize;  // row stride
};

// Simulate getting a CUDA frame from ffmpeg hardware acceleration
FFmpegCudaFrame simulateFFmpegGetCudaFrame(int width, int height) {
    FFmpegCudaFrame frame;
    frame.width = width;
    frame.height = height;
    frame.linesize = width * 3; // RGB8 format
    
    size_t size = frame.linesize * height;
    cudaError_t err = cudaMalloc(&frame.devicePtr, size);
    if (err != cudaSuccess) {
        throw std::runtime_error(std::string("cudaMalloc failed: ") + cudaGetErrorString(err));
    }
    
    std::cout << "Simulated ffmpeg CUDA frame:\n";
    std::cout << "  Device pointer: " << frame.devicePtr << "\n";
    std::cout << "  Width: " << width << ", Height: " << height << "\n";
    std::cout << "  Linesize (row stride): " << frame.linesize << " bytes\n";
    
    return frame;
}

// ZERO-COPY WRAPPER: Create NVCVImageData from raw CUDA pointer
NVCVImageData wrapCudaPointerAsImage(void* cudaPtr, int width, int height, int rowStride) {
    NVCVImageData imageData;
    std::memset(&imageData, 0, sizeof(NVCVImageData));
    
    // Set format
    imageData.format = NVCV_IMAGE_FORMAT_RGB8;
    imageData.bufferType = NVCV_IMAGE_BUFFER_STRIDED_CUDA;
    
    // Set plane data (RGB8 is a single-plane format)
    imageData.buffer.strided.numPlanes = 1;
    imageData.buffer.strided.planes[0].width = width;
    imageData.buffer.strided.planes[0].height = height;
    imageData.buffer.strided.planes[0].rowStride = rowStride;
    imageData.buffer.strided.planes[0].basePtr = static_cast<NVCVByte*>(cudaPtr);
    
    std::cout << "\n✓ Zero-copy wrap successful!\n";
    std::cout << "  NVCVImageData uses existing CUDA pointer: " << cudaPtr << "\n";
    
    return imageData;
}

// CUDA kernel to demonstrate using the wrapped image
__global__ void fillPattern(unsigned char* data, int width, int height, int rowStride) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    
    if (x < width && y < height) {
        int offset = y * rowStride + x * 3;
        data[offset + 0] = (unsigned char)(255 * x / width);      // R
        data[offset + 1] = (unsigned char)(255 * y / height);     // G
        data[offset + 2] = 128;                                    // B
    }
}

int main() {
    try {
        std::cout << "=== Zero-Copy CUDA Pointer to CV-CUDA Image Example ===\n\n";
        
        // Step 1: Get CUDA frame from ffmpeg hwaccel (simulated)
        std::cout << "Step 1: Get CUDA frame from ffmpeg hwaccel\n";
        int width = 1920;
        int height = 1080;
        FFmpegCudaFrame ffmpegFrame = simulateFFmpegGetCudaFrame(width, height);
        
        // Step 2: Wrap the raw CUDA pointer as CV-CUDA image (ZERO COPY!)
        std::cout << "\nStep 2: Wrap CUDA pointer as NVCVImageData (ZERO COPY)\n";
        NVCVImageData imageData = wrapCudaPointerAsImage(
            ffmpegFrame.devicePtr,
            ffmpegFrame.width,
            ffmpegFrame.height,
            ffmpegFrame.linesize
        );
        
        // Verify zero-copy
        std::cout << "\n--- Verification ---\n";
        std::cout << "Original pointer: " << ffmpegFrame.devicePtr << "\n";
        std::cout << "Wrapped pointer:  " << (void*)imageData.buffer.strided.planes[0].basePtr << "\n";
        if (imageData.buffer.strided.planes[0].basePtr == ffmpegFrame.devicePtr) {
            std::cout << "✓ POINTERS MATCH - True zero-copy achieved!\n";
        }
        
        // Step 3: Use the wrapped image in CUDA operations
        std::cout << "\nStep 3: Use wrapped image in CUDA kernel\n";
        dim3 blockSize(16, 16);
        dim3 gridSize((width + 15) / 16, (height + 15) / 16);
        
        fillPattern<<<gridSize, blockSize>>>(
            imageData.buffer.strided.planes[0].basePtr,
            width, height,
            imageData.buffer.strided.planes[0].rowStride
        );
        cudaDeviceSynchronize();
        std::cout << "✓ CUDA kernel executed on wrapped image!\n";
        
        std::cout << "\n=== Summary ===\n";
        std::cout << "✓ Raw CUDA pointer from ffmpeg wrapped with zero copy\n";
        std::cout << "✓ Can use in CV-CUDA operations directly\n";
        std::cout << "\nIn production: use nvcvImageWrapDataConstruct(&imageData, NULL, NULL, &handle)\n";
        
        cudaFree(ffmpegFrame.devicePtr);
        return 0;
        
    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }
}

Build & run:

nvcc -o zero_copy_example zero_copy_example.cu -lcudart
./zero_copy_example

Output shows the pointers match (true zero-copy):

=== Zero-Copy CUDA Pointer to CV-CUDA Image Example ===

Step 1: Get CUDA frame from ffmpeg hwaccel
Simulated ffmpeg CUDA frame:
  Device pointer: 0x504c00000
  Width: 1920, Height: 1080
  Linesize (row stride): 5760 bytes

Step 2: Wrap CUDA pointer as NVCVImageData (ZERO COPY)

✓ Zero-copy wrap successful!
  NVCVImageData uses existing CUDA pointer: 0x504c00000

--- Verification ---
Original pointer: 0x504c00000
Wrapped pointer:  0x504c00000
✓ POINTERS MATCH - True zero-copy achieved!

Step 3: Use wrapped image in CUDA kernel
✓ CUDA kernel executed on wrapped image!

=== Summary ===
✓ Raw CUDA pointer from ffmpeg wrapped with zero copy
✓ Can use in CV-CUDA operations directly

In production: use nvcvImageWrapDataConstruct(&imageData, NULL, NULL, &handle)

For your ffmpeg use case

Here's exactly what you need:

#include <nvcv/Image.h>

// Your CUDA pointer from ffmpeg
void* ffmpeg_cuda_ptr = ...; // from av_frame->data[0] with CUDA hwaccel
int width = 1920;
int height = 1080;
int linesize = ...; // from av_frame->linesize[0]

// Create NVCVImageData structure
NVCVImageData imageData = {};
imageData.format = NVCV_IMAGE_FORMAT_RGB8;
imageData.bufferType = NVCV_IMAGE_BUFFER_STRIDED_CUDA;
imageData.buffer.strided.numPlanes = 1;
imageData.buffer.strided.planes[0].width = width;
imageData.buffer.strided.planes[0].height = height;
imageData.buffer.strided.planes[0].rowStride = linesize;
imageData.buffer.strided.planes[0].basePtr = static_cast<NVCVByte*>(ffmpeg_cuda_ptr);

// Wrap as NVCV Image (ZERO COPY!)
NVCVImageHandle imageHandle = nullptr;
NVCVStatus status = nvcvImageWrapDataConstruct(&imageData, NULL, NULL, &imageHandle);

// Now use imageHandle with any CV-CUDA operator!
// When done: nvcvImageDecRef(imageHandle, NULL);

Important points:

Use AVFrame->data[0] for the CUDA pointer
Use AVFrame->linesize[0] for row stride
Make sure ffmpeg output format matches CV-CUDA format (RGB8, NV12, etc.)
CV-CUDA won't free the ffmpeg memory - you manage it

That's it. The pointer addresses will match - proven by the example above. No copies, no allocations.

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

How to achieve zero copy on Tensor or Image(nvcv)? #181

Uh oh!

{{title}}

Uh oh!

Replies: 1 comment

Uh oh!

{{title}}

Uh oh!

Select a reply

Uh oh!

How to achieve zero copy on Tensor or Image(nvcv)? #181

Uh oh!

zoulee24 Aug 2, 2024

Replies: 1 comment

Uh oh!

kgnandanwar Jan 19, 2026

Zero Copy on Tensor or Image (nvcv) with ffmpeg CUDA hwaccel

Working example (tested with CUDA 12.6)

For your ffmpeg use case

zoulee24
Aug 2, 2024

kgnandanwar
Jan 19, 2026