NVIDIA NPP LIBRARY SDK

Using the CUDA NPP library

NPP is NVIDIA's library for GPU-accelerated image processing; see the link in the references for the detailed SDK documentation. It is made up of the following libraries:

// core image-processing library, similar to OpenCV's core module
nppc NPP core library which MUST be included when linking any application (see the linking example below); functions are listed in nppCore.h
// arithmetic and logical operations
nppial arithmetic and logical operation functions in nppi_arithmetic_and_logical_operations.h
// color conversion operations
nppicc color conversion and sampling functions in nppi_color_conversion.h
// image compression and decompression
nppicom JPEG compression and decompression functions in nppi_compression_functions.h
// data exchange and initialization
nppidei data exchange and initialization functions in nppi_data_exchange_and_initialization.h
// filtering operations
nppif filtering and computer vision functions in nppi_filter_functions.h
// geometric transforms
nppig geometry transformation functions found in nppi_geometry_transforms.h
// morphological operations
nppim morphological operation functions found in nppi_morphological_operations.h
// statistics and linear transforms
nppist statistics and linear transform functions in nppi_statistics_functions.h and nppi_linear_transforms.h
// memory support functions
nppisu memory support functions in nppi_support_functions.h
// threshold and compare operations
nppitc threshold and compare operation functions in nppi_threshold_and_compare_operations.h
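
For example, an application that uses the resize routines (nppig) and the color-conversion routines (nppicc) might be linked roughly as follows. This is only a sketch; the exact flags and library paths depend on your CUDA version and platform:

nvcc main.cu -o demo -lnppig -lnppicc -lnppisu -lnppc

In recent CUDA versions the NPP image functions are split across these per-domain libraries, and nppc must always be linked.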

Driven by project needs, this post focuses on a few commonly used operations, mainly the basic image-processing operations found in OpenCV, such as color-space conversion and image resizing.

RESIZE

The resize operation supports 1-, 3-, and 4-channel images with 8u, 16u, 16s, and 32f data types. The interface is generally nppiResizeSqrPixel_<type>_<channels>, and it can resize just a region of interest. Be careful with the interpolation modes: they differ from OpenCV's, and the official documentation does not describe them in detail, so there are some pitfalls. For example, when using NPPI_INTER_SUPER I found that it fails whenever the scale factor is greater than 1; it turns out NPPI_INTER_SUPER only supports downsampling (see the reference link). Here is an example that resizes a 3-channel BGR image:

bool imageResize_8u_C3R(void *src, int srcWidth, int srcHeight, void *dst, int dstWidth, int dstHeight)
{
    NppiSize oSrcSize;
    oSrcSize.width = srcWidth;
    oSrcSize.height = srcHeight;
    int nSrcStep = srcWidth * 3;

    NppiRect oSrcROI;
    oSrcROI.x = 0;
    oSrcROI.y = 0;
    oSrcROI.width = srcWidth;
    oSrcROI.height = srcHeight;

    int nDstStep = dstWidth * 3;
    NppiRect oDstROI;
    oDstROI.x = 0;
    oDstROI.y = 0;
    oDstROI.width = dstWidth;
    oDstROI.height = dstHeight;

    // Scale factor
    double nXFactor = double(dstWidth) / (oSrcROI.width);
    double nYFactor = double(dstHeight) / (oSrcROI.height);

    // Scaled X/Y shift
    double nXShift = -oSrcROI.x * nXFactor;
    double nYShift = -oSrcROI.y * nYFactor;

    // NPPI_INTER_SUPER only supports downsampling; fall back to Lanczos when upscaling.
    int eInterpolation = NPPI_INTER_SUPER;
    if (nXFactor >= 1.0 || nYFactor >= 1.0)
        eInterpolation = NPPI_INTER_LANCZOS;

    NppStatus ret = nppiResizeSqrPixel_8u_C3R((const Npp8u *)src, oSrcSize, nSrcStep, oSrcROI, (Npp8u *)dst,
                                              nDstStep, oDstROI, nXFactor, nYFactor, nXShift, nYShift, eInterpolation);
    if (ret != NPP_SUCCESS) {
        printf("imageResize_8u_C3R failed %d.\n", ret);
        return false;
    }

    return true;
}
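
A minimal usage sketch of the wrapper above. The image dimensions are made up for illustration, and the host-to-device copy and error handling are omitted:

#include <cuda_runtime.h>
#include <npp.h>

// Assumes imageResize_8u_C3R from above is declared in a visible header.
void resizeExample()
{
    const int srcW = 1920, srcH = 1080;   // hypothetical source size
    const int dstW = 960,  dstH = 540;    // hypothetical destination size

    Npp8u *dSrc = nullptr, *dDst = nullptr;
    cudaMalloc(&dSrc, srcW * srcH * 3);   // packed BGR, 8-bit, 3 channels
    cudaMalloc(&dDst, dstW * dstH * 3);

    // ... upload the source image into dSrc with cudaMemcpy ...

    imageResize_8u_C3R(dSrc, srcW, srcH, dDst, dstW, dstH);

    cudaFree(dSrc);
    cudaFree(dDst);
}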

The resize routines live in the nppig library, which also provides various other operations such as mirror, remap, rotate, and warp. These are rarely needed in everyday use; consult the documentation when you do need them.
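
As a rough illustration of these, a horizontal flip of an 8-bit, 3-channel image could look like the sketch below. This is not from the original code; double-check the signature in nppi_geometry_transforms.h for your CUDA version:

#include <npp.h>

// Sketch: mirror a packed 8u C3 device image around its horizontal axis.
bool imageMirror_8u_C3R(const Npp8u *pSrc, Npp8u *pDst, int width, int height)
{
    NppiSize oROI = { width, height };
    int nStep = width * 3;                // row pitch in bytes for packed 3-channel 8u
    NppStatus ret = nppiMirror_8u_C3R(pSrc, nStep, pDst, nStep, oROI, NPP_HORIZONTAL_AXIS);
    return ret == NPP_SUCCESS;
}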

Color conversion
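
The common conversions (RGB/BGR, YUV, gray, etc.) are provided by the nppicc library; for my own pipeline I ended up writing small CUDA kernels by hand (next section). As a rough sketch, an NPP-based packed-RGB-to-gray conversion could look like this (function name taken from nppi_color_conversion.h; verify it against your CUDA version):

#include <npp.h>

// Sketch: packed 8u RGB -> single-channel gray. pSrc/pDst are device pointers.
bool imageRGBToGray_8u(const Npp8u *pSrc, Npp8u *pDst, int width, int height)
{
    NppiSize oSizeROI = { width, height };
    NppStatus ret = nppiRGBToGray_8u_C3C1R(pSrc, width * 3, pDst, width, oSizeROI);
    return ret == NPP_SUCCESS;
}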

Implementing some operations myself

padding

__global__ void imagePaddingKernel(float3 *ptr, float3 *dst, int width, int height, int top,
                                   int bottom, int left, int right)
{
    // width/height here are the padded (destination) dimensions.
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x < left || x >= (width - right) || y < top || y >= (height - bottom)) {
        return;
    }

    // Source row stride is the unpadded width: width - left - right.
    float3 color = ptr[(y - top) * (width - left - right) + (x - left)];

    dst[y * width + x] = color;
}

void imagePadding(const void *src, void *dst, int width, int height, int top,
                  int bottom, int left, int right)
{
    int dstW = width + left + right;
    int dstH = height + top + bottom;

    // Border pixels keep the memset value (zero).
    cudaMemset(dst, 0, dstW * dstH * sizeof(float3));

    dim3 grids((dstW + 31) / 32, (dstH + 31) / 32);
    dim3 blocks(32, 32);
    imagePaddingKernel<<<grids, blocks>>>((float3 *)src, (float3 *)dst, dstW, dstH,
                                          top, bottom, left, right);
}

split

__global__ void imageSplitKernel(float3 *ptr, float *dst, int width, int height)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= width || y >= height) {
        return;
    }

    float3 color = ptr[y * width + x];

    // Packed HWC -> planar CHW: one plane per channel.
    dst[y * width + x] = color.x;
    dst[y * width + x + width * height] = color.y;
    dst[y * width + x + width * height * 2] = color.z;
}

void imageSplit(const void *src, float *dst, int width, int height, cudaStream_t stream)
{
    dim3 grids((width + 31) / 32, (height + 31) / 32);
    dim3 blocks(32, 32);
    imageSplitKernel<<<grids, blocks, 0, stream>>>((float3 *)src, (float *)dst, width, height);
}

normalization

__global__ void imageNormalizationKernel(float3 *ptr, int width, int height)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= width || y >= height) {
        return;
    }

    // Map [0, 255] to roughly [-1, 1]: (v - 127.5) / 128
    float3 color = ptr[y * width + x];
    color.x = (color.x - 127.5f) * 0.0078125f;
    color.y = (color.y - 127.5f) * 0.0078125f;
    color.z = (color.z - 127.5f) * 0.0078125f;

    ptr[y * width + x] = color;
}

void imageNormalization(void *ptr, int width, int height, cudaStream_t stream)
{
    dim3 grids((width + 31) / 32, (height + 31) / 32);
    dim3 blocks(32, 32);
    imageNormalizationKernel<<<grids, blocks, 0, stream>>>((float3 *)ptr, width, height);
}

BGR2RGBfloat

__global__ void convertBGR2RGBfloatKernel(uchar3 *src, float3 *dst, int width, int height)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= width || y >= height) {
        return;
    }

    // Swap B and R while widening to float.
    uchar3 color = src[y * width + x];
    dst[y * width + x] = make_float3(color.z, color.y, color.x);
}

void convertBGR2RGBfloat(void *src, void *dst, int width, int height, cudaStream_t stream)
{
    dim3 grids((width + 31) / 32, (height + 31) / 32);
    dim3 blocks(32, 32);
    convertBGR2RGBfloatKernel<<<grids, blocks, 0, stream>>>((uchar3 *)src, (float3 *)dst, width, height);
}
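
A sketch of how the helpers above can be chained as a typical DNN preprocessing pass (device pointers and the call order are illustrative; assumes the helper declarations above are visible):

#include <cuda_runtime.h>

void preprocessExample(void *dBGR, float3 *dRGB, float *dPlanar,
                       int width, int height, cudaStream_t stream)
{
    convertBGR2RGBfloat(dBGR, dRGB, width, height, stream);   // uchar3 BGR -> float3 RGB
    imageNormalization(dRGB, width, height, stream);          // (v - 127.5) / 128
    imageSplit(dRGB, dPlanar, width, height, stream);         // packed HWC -> planar CHW
    cudaStreamSynchronize(stream);
}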

RGBA2Gray

__global__ void convertRGBA2GrayKernel(uchar4 *src, uchar1 *dst, int width, int height)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= width || y >= height) {
        return;
    }

    uchar4 color = src[y * width + x];

    // Simple average alternative:
    // dst[y * width + x] = make_uchar1((color.x + color.y + color.z) * .333333f);
    // Rec.601 luma weights for RGBA input (x = R, y = G, z = B).
    dst[y * width + x] = make_uchar1(0.299f * color.x + 0.587f * color.y + 0.114f * color.z);
}

void convertRGBA2Gray(void *src, void *dst, int width, int height, cudaStream_t stream)
{
    dim3 grids((width + 31) / 32, (height + 31) / 32);
    dim3 blocks(32, 32);
    convertRGBA2GrayKernel<<<grids, blocks, 0, stream>>>((uchar4 *)src, (uchar1 *)dst, width, height);
    // cudaDeviceSynchronize();
    cudaStreamSynchronize(stream);
}

RGBA2BGR

__global__ void convertRGBA2BGRKernel(uchar4 *src, uchar3 *dst, int width, int height)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= width || y >= height) {
        return;
    }

    uchar4 color = src[y * width + x];
    dst[y * width + x] = make_uchar3(color.z, color.y, color.x);
}

void convertRGBA2BGR(void *src, void *dst, int width, int height, cudaStream_t stream)
{
    dim3 grids((width + 31) / 32, (height + 31) / 32);
    dim3 blocks(32, 32);
    convertRGBA2BGRKernel<<<grids, blocks, 0, stream>>>((uchar4 *)src, (uchar3 *)dst, width, height);
}

RGBA2YUVI420 with nvx on the TX2

void convertRGBA2YUVI420(void *src, void *dst, int width, int height)
{
    static bool inited = false;
    static nvxcu_stream_exec_target_t exec_target;

    if (!inited) {
        int deviceID;
        /*HANDLE_CUDA_ERROR*/(cudaGetDevice(&deviceID));
        exec_target.base.exec_target_type = NVXCU_STREAM_EXEC_TARGET;
        exec_target.stream = NULL;
        /*HANDLE_CUDA_ERROR*/(cudaGetDeviceProperties(&exec_target.dev_prop, deviceID));
        inited = true;
    }

    nvxcu_pitch_linear_image_t input, output;
    input.base.format = NVXCU_DF_IMAGE_RGBX;
    input.base.width = width;
    input.base.height = height;
    input.base.image_type = NVXCU_PITCH_LINEAR_IMAGE;
    input.planes[0].dev_ptr = src;
    input.planes[0].pitch_in_bytes = width * 4;

    // I420: full-size Y plane followed by quarter-size U and V planes.
    output.base.format = NVXCU_DF_IMAGE_IYUV;
    output.base.width = width;
    output.base.height = height;
    output.base.image_type = NVXCU_PITCH_LINEAR_IMAGE;
    output.planes[0].dev_ptr = dst;
    output.planes[0].pitch_in_bytes = width;
    output.planes[1].dev_ptr = (char *)dst + width * height;
    output.planes[1].pitch_in_bytes = width / 2;
    output.planes[2].dev_ptr = (char *)dst + width * height * 5 / 4;
    output.planes[2].pitch_in_bytes = width / 2;

    nvxcu_error_status_e stat;
    stat = nvxcuColorConvert(&input.base, &output.base, NVXCU_COLOR_SPACE_DEFAULT,
                             NVXCU_CHANNEL_RANGE_FULL, &exec_target.base);
    if (stat != NVXCU_SUCCESS) {
        dbgInfo("Convert RGBA to YUV I420 failed: %d.\n", stat);
    }
}
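
A minimal calling sketch (the buffer size follows from the plane offsets above: an I420 frame occupies width * height * 3 / 2 bytes; dRGBA is assumed to be a device RGBA buffer):

#include <cuda_runtime.h>

// Assumes convertRGBA2YUVI420 from above is declared.
void rgbaToI420Example(void *dRGBA, int width, int height)
{
    void *dYUV = nullptr;
    cudaMalloc(&dYUV, width * height * 3 / 2);
    convertRGBA2YUVI420(dRGBA, dYUV, width, height);
    cudaFree(dYUV);
}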

Overlaying an image (logo)

__global__ void cudaPutLogoToImageKernel(uchar4 *devImg, int imgWidth, int imgHeight,
                                         uchar3 *devLogo, int width, int height, int offsetX, int offsetY)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= width || y >= height) {
        return;
    }

    // Copy the BGR logo pixel into the RGBA target at the given offset.
    uchar3 devLogoColor = devLogo[y * width + x];
    int offset = (y + offsetY) * imgWidth + offsetX + x;
    devImg[offset] = make_uchar4(devLogoColor.z, devLogoColor.y, devLogoColor.x, 0);
}

void cudaPutLogoToImage(void *devImg, int imgWidth, int imgHeight, void *devLogo, int width,
                        int height, int offsetX, int offsetY, cudaStream_t stream)
{
    dim3 grids((width + 31) / 32, (height + 31) / 32);
    dim3 blocks(32, 32);
    // NOTE: launching this on a non-default stream sometimes produced wrong results; needs further testing.
    cudaPutLogoToImageKernel<<<grids, blocks, 0, stream>>>((uchar4 *)devImg, imgWidth, imgHeight,
                                                           (uchar3 *)devLogo, width, height, offsetX, offsetY);
}

References

Official NPP documentation