VisionWorks Toolkit Reference

December 18, 2015 | 1.2 Release

 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
Process Image with CUDA Kernel

This tutorial demonstrates VisionWorks image processing with user CUDA code.

This tutorial demonstrates how to process VisionWorks image objects with existing CUDA code by implementing a custom CUDA kernel that makes a negative image from an input image in RGB format.

VisionWorks image objects provide CUDA pointers with 2D pitched memory layout, similar to memory allocated by the cudaMallocPitch function. The memory layout for an image object is contained in the vx_imagepatch_addressing_t structure. Use vx_imagepatch_addressing_t::dim_x and vx_imagepatch_addressing_t::dim_y fields to get width and height in pixels of the image, and vx_imagepatch_addressing_t::stride_y to get row pitch in bytes.

  1. Implement the CUDA kernel that performs the desired functionality and small host wrapper that calls the CUDA kernel using CUDA runtime syntax.

    // CUDA kernel: writes the per-channel negative (255 - value) of an RGB
    // image. One thread per pixel, launched over a 2D grid. `src_step` and
    // `dst_step` are row pitches in BYTES (2D pitched layout); each RGB
    // pixel is read/written as a 3-byte uchar3.
    __global__ void negative_image_kernel(const vx_uint8 *src_ptr, int src_step,
    vx_uint8 *dst_ptr, int dst_step,
    int width, int height)
    {
    // Pixel coordinates of this thread within the image.
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    // The grid may overshoot the image on the right/bottom edges; skip
    // out-of-range threads.
    if (x >= width || y >= height)
    return;
    // Step whole rows in bytes first, then index 3-byte pixels in the row.
    const uchar3 *src_row = (const uchar3 *)(src_ptr + y * src_step);
    uchar3 *dst_row = (uchar3 *)(dst_ptr + y * dst_step);
    uchar3 src_val = src_row[x];
    dst_row[x] = make_uchar3(255 - src_val.x,
    255 - src_val.y,
    255 - src_val.z);
    }
    // Host wrapper: configures a 2D launch (16x16 threads per block) and
    // launches negative_image_kernel asynchronously on `stream`. The caller
    // is responsible for synchronizing the stream before reusing/unmapping
    // the buffers.
    void negative_image(const vx_uint8 *src_ptr, int src_step,
    vx_uint8 *dst_ptr, int dst_step,
    int width, int height,
    cudaStream_t stream)
    {
    dim3 block(16, 16);
    // Ceiling division so images whose size is not a multiple of 16 are
    // fully covered (the kernel bounds-checks the overshoot).
    dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
    negative_image_kernel<<<grid, block, 0, stream>>>(src_ptr, src_step, dst_ptr, dst_step, width, height);
    }
  2. Determine the ROI of input image that will be processed.

    Now you can map image objects into the CUDA address space and get vx_imagepatch_addressing_t structures with information about memory layout. For the map operation, usage information is provided as the last parameter and describes how the mapped memory will be used (as read only access, write only access, or both read and write).

    Note
    Output pointers must be equal to NULL before calling the vxAccessImagePatch function; otherwise, the function will work in COPY mode, assuming that the pointer refers to a pre-allocated buffer.
    vx_uint8* src_ptr = NULL; // should be NULL to work in MAP mode
    vxAccessImagePatch(src, &rect, 0, &src_addr, (void **)&src_ptr, NVX_READ_ONLY_CUDA);
    vx_uint8* dst_ptr = NULL; // should be NULL to work in MAP mode
    vxAccessImagePatch(dst, &rect, 0, &dst_addr, (void **)&dst_ptr, NVX_WRITE_ONLY_CUDA);

    After you get the mapped pointers and their memory layout, use them in CUDA kernels and CUDA libraries in the same way as plain CUDA pointers allocated by the cudaMallocPitch function.

    Note
    All CUDA processing must be completed before the image memory is unmapped (i.e., you must explicitly call a synchronization function, such as cudaStreamSynchronize or cudaDeviceSynchronize, to be sure that all custom CUDA kernels have finished processing).
    cudaStream_t stream = NULL;
    negative_image(src_ptr, src_addr.stride_y,
    dst_ptr, dst_addr.stride_y,
    src_addr.dim_x, src_addr.dim_y,
    stream);
    cudaStreamSynchronize(stream);
  3. Unmap image objects.

    Note
    If the image was mapped in READ_ONLY mode, the ROI parameter for the vxCommitImagePatch function can be NULL, since committing a read-only patch does not change the valid region of the image.
    vxCommitImagePatch(src, NULL, 0, &src_addr, src_ptr);
    vxCommitImagePatch(dst, &rect, 0, &dst_addr, dst_ptr);

The Full Code for This Tutorial

// Per-pixel RGB negative: dst = 255 - src for each channel.
// Expects a 2D launch with one thread per pixel; `src_step`/`dst_step`
// are row pitches in BYTES (2D pitched layout), and each RGB pixel is
// accessed as a 3-byte uchar3.
__global__ void negative_image_kernel(const vx_uint8 *src_ptr, int src_step,
                                      vx_uint8 *dst_ptr, int dst_step,
                                      int width, int height)
{
    // Pixel coordinates derived from the 2D launch configuration.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard: the grid is rounded up, so threads past the right/bottom
    // edge must do nothing.
    if (col < width && row < height)
    {
        // Rows are `*_step` bytes apart; index 3-byte pixels within a row.
        const uchar3 *src_line = (const uchar3 *)(src_ptr + row * src_step);
        uchar3       *dst_line = (uchar3 *)(dst_ptr + row * dst_step);

        const uchar3 rgb = src_line[col];
        dst_line[col] = make_uchar3(255 - rgb.x,
                                    255 - rgb.y,
                                    255 - rgb.z);
    }
}
// Host-side launcher for negative_image_kernel.
// Uses 16x16 thread blocks and a ceiling-divided grid so the whole image
// is covered even when its dimensions are not multiples of 16. The launch
// is asynchronous on `stream`; the caller must synchronize before the
// mapped buffers are released.
void negative_image(const vx_uint8 *src_ptr, int src_step,
                    vx_uint8 *dst_ptr, int dst_step,
                    int width, int height,
                    cudaStream_t stream)
{
    const dim3 threadsPerBlock(16, 16);
    const dim3 numBlocks((width  + threadsPerBlock.x - 1) / threadsPerBlock.x,
                         (height + threadsPerBlock.y - 1) / threadsPerBlock.y);

    negative_image_kernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
        src_ptr, src_step, dst_ptr, dst_step, width, height);
}
// Computes the negative of `src` into `dst` by mapping both VisionWorks
// image objects into the CUDA address space and running the custom kernel.
// Precondition: `src` is VX_DF_IMAGE_RGB; `dst` is assumed compatible in
// format and size (presumably created that way by the caller — confirm).
void processImageWithCUDA(vx_image src, vx_image dst)
{
    vx_df_image format = 0;
    vxQueryImage(src, VX_IMAGE_ATTRIBUTE_FORMAT, &format, sizeof(format));
    assert( format == VX_DF_IMAGE_RGB );

    // Fix: `rect`, `src_addr`, and `dst_addr` were used below without ever
    // being declared, so this listing did not compile.
    vx_rectangle_t rect;
    vx_imagepatch_addressing_t src_addr;
    vx_imagepatch_addressing_t dst_addr;

    // Get valid region
    vxGetValidRegionImage(src, &rect);

    // Map VisionWorks data objects into CUDA device memory.
    // The output pointers must be NULL so the access works in MAP mode
    // (a non-NULL pointer switches vxAccessImagePatch to COPY mode).
    vx_uint8* src_ptr = NULL; // should be NULL to work in MAP mode
    vxAccessImagePatch(src, &rect, 0, &src_addr, (void **)&src_ptr, NVX_READ_ONLY_CUDA);
    vx_uint8* dst_ptr = NULL; // should be NULL to work in MAP mode
    vxAccessImagePatch(dst, &rect, 0, &dst_addr, (void **)&dst_ptr, NVX_WRITE_ONLY_CUDA);

    // Call CUDA function on the default stream.
    cudaStream_t stream = NULL;
    negative_image(src_ptr, src_addr.stride_y,
                   dst_ptr, dst_addr.stride_y,
                   src_addr.dim_x, src_addr.dim_y,
                   stream);
    // All CUDA work must be complete before the memory is unmapped below.
    cudaStreamSynchronize(stream);

    // Unmap VisionWorks data objects from CUDA device memory.
    // A NULL rect is allowed for the read-only patch since committing it
    // does not change the image's valid region.
    vxCommitImagePatch(src, NULL, 0, &src_addr, src_ptr);
    vxCommitImagePatch(dst, &rect, 0, &dst_addr, dst_ptr);
}