CUDA入門 > OpenCVとCUDA > 拡大縮小:Nearest Neighbor

拡大縮小:Nearest Neighbor


拡大縮小:Nearest Neighbor
拡大縮小の手法の一つ、Nearest Neighbor(ニアレストネイバー)のCUDAでの実装例です。

サンプルプログラム
#include <stdio.h>
#include "cv.h"
#include "cxcore.h"
#include "highgui.h"
#pragma comment(lib,"cv.lib")
#pragma comment(lib,"cxcore.lib")
#pragma comment(lib,"highgui.lib")
#include <cutil.h>

// Scale factor applied to both the width and the height of the input image.
#define RATE 2

/*
 * Shared body of the bitNcpykernel family: nearest-neighbor resampling of one
 * output pixel per thread.
 *
 * Launch layout: 2D grid of 2D blocks; thread (x, y) handles output pixel
 * (x, y). Threads that fall outside the output image do nothing, so the grid
 * may overshoot the image size.
 *
 * Both buffers are indexed as tightly packed, channel-interleaved rows
 * (row stride = width * channels bytes, no padding).
 *
 * N        number of bytes (channels) copied per pixel (compile-time)
 * in       source pixels, width_in x height_in
 * out      destination pixels, width_out x height_out
 * channels byte stride between consecutive pixels in both buffers
 */
template <int N>
__device__ void copyNearestPixel(const unsigned char *in, unsigned char *out,
                                 int width_in, int height_in, int channels,
                                 int width_out, int height_out)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;

    // Only touch memory when (x, y) lies inside the output image.
    if (x >= width_out || y >= height_out) {
        return;
    }

    // Map the output coordinate back to the input image according to the
    // scale ratio (fractional part is truncated).
    int x_in = (int)(x * ((float)width_in / width_out));
    int y_in = (int)(y * ((float)height_in / height_out));

    // Defensive clamp: keeps the read in bounds even if float rounding were
    // to land the source coordinate on width_in / height_in.
    x_in = min(x_in, width_in - 1);
    y_in = min(y_in, height_in - 1);

    const unsigned char *src = in + (y_in * width_in + x_in) * channels;
    unsigned char *dst = out + (y * width_out + x) * channels;
    for (int c = 0; c < N; ++c) {
        dst[c] = src[c];
    }
}

// Nearest-neighbor resize copying 1 byte per pixel. See copyNearestPixel for
// the launch layout and buffer layout.
__global__ void bit1cpykernel(unsigned char *in, unsigned char *out, int width_in, int height_in, int channels, int width_out, int height_out){
    copyNearestPixel<1>(in, out, width_in, height_in, channels, width_out, height_out);
}

// Nearest-neighbor resize copying 2 bytes per pixel.
__global__ void bit2cpykernel(unsigned char *in, unsigned char *out, int width_in, int height_in, int channels, int width_out, int height_out){
    copyNearestPixel<2>(in, out, width_in, height_in, channels, width_out, height_out);
}

// Nearest-neighbor resize copying 3 bytes per pixel.
__global__ void bit3cpykernel(unsigned char *in, unsigned char *out, int width_in, int height_in, int channels, int width_out, int height_out){
    copyNearestPixel<3>(in, out, width_in, height_in, channels, width_out, height_out);
}

// Nearest-neighbor resize copying 4 bytes per pixel.
__global__ void bit4cpykernel(unsigned char *in, unsigned char *out, int width_in, int height_in, int channels, int width_out, int height_out){
    copyNearestPixel<4>(in, out, width_in, height_in, channels, width_out, height_out);
}

/*
 * Loads an image, resizes it by RATE on the GPU with nearest-neighbor
 * sampling, and shows the input and the result in two windows until a key is
 * pressed. Returns 0 on success, 1 if the image cannot be loaded or the
 * output size would be empty.
 */
int main(int argc, char **argv){
    CUT_DEVICE_INIT( argc, argv );

    IplImage *imgin, *imgout;
    unsigned char *d_in, *d_out;
    int memsize_in, memsize_out;
    int width_out, height_out;

    imgin = cvLoadImage("ファイル名", 1);
    if (imgin == NULL) {
        fprintf(stderr, "failed to load the input image\n");
        return 1;
    }

    // Compute the width and height of the output image.
    width_out = (int)(imgin->width * RATE);
    height_out = (int)(imgin->height * RATE);
    if (width_out <= 0 || height_out <= 0) {
        fprintf(stderr, "output image would be empty (%d x %d)\n", width_out, height_out);
        cvReleaseImage(&imgin);
        return 1;
    }

    imgout = cvCreateImage(cvSize(width_out, height_out), imgin->depth, imgin->nChannels);

    printf("width = %d, height = %d, nChannels = %d\n", imgout->width, imgout->height, imgin->nChannels);

    // Device buffers are tightly packed. The kernels copy one byte per
    // channel, so this assumes an 8-bit image depth.
    int rowbytes_in = imgin->width * imgin->nChannels;
    int rowbytes_out = imgout->width * imgout->nChannels;
    memsize_in = rowbytes_in * imgin->height;
    memsize_out = rowbytes_out * imgout->height;

    CUDA_SAFE_CALL(cudaMalloc((void**)&d_in, memsize_in));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_out, memsize_out));

    // IplImage rows are widthStep bytes apart, which may include padding
    // beyond width * nChannels; a 2D copy strips that padding on upload.
    CUDA_SAFE_CALL(cudaMemcpy2D(d_in, rowbytes_in,
                                imgin->imageData, imgin->widthStep,
                                rowbytes_in, imgin->height,
                                cudaMemcpyHostToDevice));

    dim3 threads(32,8);
    // Round the block count up so that every output pixel is covered
    // (out-of-range threads are masked inside the kernel).
    dim3 blocks((imgout->width + threads.x - 1) / threads.x,
                (imgout->height + threads.y - 1) / threads.y);

    // Clear the output buffer first (otherwise pixels that were not processed
    // would show stale data). For debugging; not normally required.
    CUDA_SAFE_CALL(cudaMemset(d_out, 0, memsize_out));
    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    switch(imgin->nChannels){
    case 1:
        bit1cpykernel<<< blocks, threads >>> (d_in, d_out, imgin->width, imgin->height, imgin->nChannels, imgout->width, imgout->height);
        break;
    case 2:
        bit2cpykernel<<< blocks, threads >>> (d_in, d_out, imgin->width, imgin->height, imgin->nChannels, imgout->width, imgout->height);
        break;
    case 3:
        bit3cpykernel<<< blocks, threads >>> (d_in, d_out, imgin->width, imgin->height, imgin->nChannels, imgout->width, imgout->height);
        break;
    case 4:
        bit4cpykernel<<< blocks, threads >>> (d_in, d_out, imgin->width, imgin->height, imgin->nChannels, imgout->width, imgout->height);
        break;
    default:
        // No kernel exists for this channel count; the output stays cleared.
        fprintf(stderr, "unsupported channel count: %d\n", imgin->nChannels);
        break;
    }
    // Kernel launches do not return a status; pick up launch failures here.
    CUDA_SAFE_CALL(cudaGetLastError());

    // Download, re-inserting the row padding expected by the IplImage.
    CUDA_SAFE_CALL(cudaMemcpy2D(imgout->imageData, imgout->widthStep,
                                d_out, rowbytes_out,
                                rowbytes_out, imgout->height,
                                cudaMemcpyDeviceToHost));

    cvNamedWindow("in", 1);
    cvNamedWindow("out", 1);
    cvShowImage("in", imgin);
    cvShowImage("out", imgout);

    // Exit when any key is pressed.
    cvWaitKey(-1);

    cvDestroyWindow("in");
    cvDestroyWindow("out");

    cvReleaseImage(&imgin);
    cvReleaseImage(&imgout);
    CUDA_SAFE_CALL(cudaFree(d_in));
    CUDA_SAFE_CALL(cudaFree(d_out));

    return 0;
}