CUDA入門 >
OpenCVとCUDA >
拡大縮小:Bilinear
拡大縮小:Bilinear
拡大縮小:Bilinear
拡大縮小の手法の一つ、Bilinear(バイリニア補間)のCUDAでの実装例です。
サンプルプログラム
// NOTE(review): the header names on the two angle-bracket includes were lost
// when this page was extracted; they are reconstructed from the symbols the
// file uses (printf/fprintf -> stdio.h, CUT_DEVICE_INIT/CUDA_SAFE_CALL ->
// cutil.h from the legacy CUDA SDK). Confirm against the original sample.
#include <stdio.h>
#include "cv.h"
#include "cxcore.h"
#include "highgui.h"
#pragma comment(lib,"cv.lib")
#pragma comment(lib,"cxcore.lib")
#pragma comment(lib,"highgui.lib")
#include <cutil.h>
// Scale factor: the output width and height are the input's multiplied by RATE
#define RATE 2
// Resizes an 8-bit single-channel image with bilinear interpolation.
//
// Each thread produces one output pixel at (x, y), where x and y come from the
// 2D block/thread indices, so the launch grid must cover
// width_out x height_out threads. Threads outside the output are ignored.
//
//   in         input pixels, rows tightly packed (width_in bytes per row)
//   out        output pixels, rows tightly packed (width_out bytes per row)
//   width_in   input width in pixels
//   height_in  input height in pixels
//   channels   unused by this kernel (kept for a uniform signature)
//   width_out  output width in pixels
//   height_out output height in pixels
__global__ void Bilinear8Kernel(unsigned char *in, unsigned char *out, int width_in, int height_in, int channels, int width_out, int height_out){
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;

    // Nothing to write for threads past the output edge, and nothing to read
    // from an empty input.
    if(x >= width_out || y >= height_out || width_in <= 0 || height_in <= 0){
        return;
    }

    // Map the output pixel to a fractional position in the input image. The
    // position must stay floating point: truncating it to int here makes every
    // interpolation weight 0 or 1 and degrades the result to nearest-neighbour.
    float fx = x * ((float)width_in / width_out);
    float fy = y * ((float)height_in / height_out);

    // Top-left sample, clamped to the last valid column/row.
    int x0 = (int)fx;
    int y0 = (int)fy;
    if(x0 >= width_in){
        x0 = width_in - 1;
    }
    if(y0 >= height_in){
        y0 = height_in - 1;
    }

    // Right/bottom neighbours, clamped so the border pixels never read past
    // the end of the input buffer. When a neighbour is clamped onto the same
    // sample the weight no longer matters, because both samples are equal.
    int x1 = (x0 + 1 < width_in) ? x0 + 1 : x0;
    int y1 = (y0 + 1 < height_in) ? y0 + 1 : y0;

    // Fractional distance from the top-left sample = weight of the far sample.
    float wx = fx - x0;
    float wy = fy - y0;

    float top    = (1.0f - wx) * in[y0 * width_in + x0] + wx * in[y0 * width_in + x1];
    float bottom = (1.0f - wx) * in[y1 * width_in + x0] + wx * in[y1 * width_in + x1];

    // +0.5f rounds to nearest; a plain cast would truncate and could turn a
    // value such as 199.99998 into 199.
    out[y * width_out + x] = (unsigned char)((1.0f - wy) * top + wy * bottom + 0.5f);
}
// Resizes an 8-bit, 3-channel interleaved image with bilinear interpolation.
//
// Each thread produces one output pixel (all three channels) at (x, y), where
// x and y come from the 2D block/thread indices, so the launch grid must cover
// width_out x height_out threads. Threads outside the output are ignored.
//
//   in         input pixels, rows tightly packed (width_in * 3 bytes per row)
//   out        output pixels, rows tightly packed (width_out * 3 bytes per row)
//   width_in   input width in pixels
//   height_in  input height in pixels
//   channels   unused by this kernel; the pixel size is fixed at 3 bytes
//   width_out  output width in pixels
//   height_out output height in pixels
__global__ void Bilinear24Kernel(unsigned char *in, unsigned char *out, int width_in, int height_in, int channels, int width_out, int height_out){
    const int kBytesPerPixel = 3;

    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;

    // Nothing to write for threads past the output edge, and nothing to read
    // from an empty input.
    if(x >= width_out || y >= height_out || width_in <= 0 || height_in <= 0){
        return;
    }

    // Map the output pixel to a fractional position in the input image. The
    // position must stay floating point: truncating it to int here makes every
    // interpolation weight 0 or 1 and degrades the result to nearest-neighbour.
    float fx = x * ((float)width_in / width_out);
    float fy = y * ((float)height_in / height_out);

    // Top-left sample, clamped to the last valid column/row.
    int x0 = (int)fx;
    int y0 = (int)fy;
    if(x0 >= width_in){
        x0 = width_in - 1;
    }
    if(y0 >= height_in){
        y0 = height_in - 1;
    }

    // Right/bottom neighbours, clamped so the border pixels never read past
    // the end of the input buffer. When a neighbour is clamped onto the same
    // sample the weight no longer matters, because both samples are equal.
    int x1 = (x0 + 1 < width_in) ? x0 + 1 : x0;
    int y1 = (y0 + 1 < height_in) ? y0 + 1 : y0;

    // Fractional distance from the top-left sample = weight of the far sample.
    float wx = fx - x0;
    float wy = fy - y0;

    // Pointers to the first channel of the four surrounding input pixels and
    // of the output pixel.
    const unsigned char *p00 = in + (y0 * width_in + x0) * kBytesPerPixel;
    const unsigned char *p01 = in + (y0 * width_in + x1) * kBytesPerPixel;
    const unsigned char *p10 = in + (y1 * width_in + x0) * kBytesPerPixel;
    const unsigned char *p11 = in + (y1 * width_in + x1) * kBytesPerPixel;
    unsigned char *dst = out + (y * width_out + x) * kBytesPerPixel;

    for(int c = 0; c < kBytesPerPixel; c++){
        float top    = (1.0f - wx) * p00[c] + wx * p01[c];
        float bottom = (1.0f - wx) * p10[c] + wx * p11[c];
        // +0.5f rounds to nearest; a plain cast would truncate and could turn
        // a value such as 199.99998 into 199.
        dst[c] = (unsigned char)((1.0f - wy) * top + wy * bottom + 0.5f);
    }
}
// Resizes an 8-bit, 4-channel interleaved image with bilinear interpolation.
//
// Each thread produces one output pixel (all four channels) at (x, y), where
// x and y come from the 2D block/thread indices, so the launch grid must cover
// width_out x height_out threads. Threads outside the output are ignored.
//
//   in         input pixels, rows tightly packed (width_in * 4 bytes per row)
//   out        output pixels, rows tightly packed (width_out * 4 bytes per row)
//   width_in   input width in pixels
//   height_in  input height in pixels
//   channels   unused by this kernel; the pixel size is fixed at 4 bytes
//   width_out  output width in pixels
//   height_out output height in pixels
__global__ void Bilinear32Kernel(unsigned char *in, unsigned char *out, int width_in, int height_in, int channels, int width_out, int height_out){
    const int kBytesPerPixel = 4;

    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;

    // Nothing to write for threads past the output edge, and nothing to read
    // from an empty input.
    if(x >= width_out || y >= height_out || width_in <= 0 || height_in <= 0){
        return;
    }

    // Map the output pixel to a fractional position in the input image. The
    // position must stay floating point: truncating it to int here makes every
    // interpolation weight 0 or 1 and degrades the result to nearest-neighbour.
    float fx = x * ((float)width_in / width_out);
    float fy = y * ((float)height_in / height_out);

    // Top-left sample, clamped to the last valid column/row.
    int x0 = (int)fx;
    int y0 = (int)fy;
    if(x0 >= width_in){
        x0 = width_in - 1;
    }
    if(y0 >= height_in){
        y0 = height_in - 1;
    }

    // Right/bottom neighbours, clamped so the border pixels never read past
    // the end of the input buffer. When a neighbour is clamped onto the same
    // sample the weight no longer matters, because both samples are equal.
    int x1 = (x0 + 1 < width_in) ? x0 + 1 : x0;
    int y1 = (y0 + 1 < height_in) ? y0 + 1 : y0;

    // Fractional distance from the top-left sample = weight of the far sample.
    float wx = fx - x0;
    float wy = fy - y0;

    // Pointers to the first channel of the four surrounding input pixels and
    // of the output pixel.
    const unsigned char *p00 = in + (y0 * width_in + x0) * kBytesPerPixel;
    const unsigned char *p01 = in + (y0 * width_in + x1) * kBytesPerPixel;
    const unsigned char *p10 = in + (y1 * width_in + x0) * kBytesPerPixel;
    const unsigned char *p11 = in + (y1 * width_in + x1) * kBytesPerPixel;
    unsigned char *dst = out + (y * width_out + x) * kBytesPerPixel;

    for(int c = 0; c < kBytesPerPixel; c++){
        float top    = (1.0f - wx) * p00[c] + wx * p01[c];
        float bottom = (1.0f - wx) * p10[c] + wx * p11[c];
        // +0.5f rounds to nearest; a plain cast would truncate and could turn
        // a value such as 199.99998 into 199.
        dst[c] = (unsigned char)((1.0f - wy) * top + wy * bottom + 0.5f);
    }
}
// Loads an image, resizes it by RATE on the GPU with the bilinear kernel that
// matches its channel count, and shows the input and the result in two
// windows until a key is pressed.
//
// Returns 0 on success and 1 when the image cannot be loaded, the output size
// is empty, or the channel count has no matching kernel.
int main(int argc, char **argv){
    CUT_DEVICE_INIT( argc, argv );

    IplImage *imgin, *imgout;
    unsigned char *d_in, *d_out;
    int memsize_in, memsize_out;
    int width_out, height_out;

    imgin = cvLoadImage("ファイル名", 1);
    if(imgin == NULL){
        fprintf(stderr, "failed to load the input image\n");
        return 1;
    }

    // Only 1-, 3- and 4-channel kernels exist; reject anything else up front
    // instead of silently displaying an all-zero image.
    if(imgin->nChannels != 1 && imgin->nChannels != 3 && imgin->nChannels != 4){
        fprintf(stderr, "unsupported channel count: %d\n", imgin->nChannels);
        cvReleaseImage(&imgin);
        return 1;
    }

    // Output width and height.
    width_out = (int)(imgin->width * RATE);
    height_out = (int)(imgin->height * RATE);
    if(width_out < 1 || height_out < 1){
        fprintf(stderr, "output image would be empty (%d x %d)\n", width_out, height_out);
        cvReleaseImage(&imgin);
        return 1;
    }

    imgout = cvCreateImage(cvSize(width_out, height_out), imgin->depth, imgin->nChannels);
    if(imgout == NULL){
        fprintf(stderr, "failed to allocate the output image\n");
        cvReleaseImage(&imgin);
        return 1;
    }
    printf("width = %d, height = %d, nChannels = %d\n", imgout->width, imgout->height, imgin->nChannels);

    // The device buffers hold tightly packed rows, which is the layout the
    // kernels index with.
    size_t rowbytes_in = (size_t)imgin->width * imgin->nChannels;
    size_t rowbytes_out = (size_t)imgout->width * imgout->nChannels;
    memsize_in = imgin->width * imgin->height * imgin->nChannels;
    memsize_out = imgout->width * imgout->height * imgout->nChannels;

    CUDA_SAFE_CALL(cudaMalloc((void**)&d_in, memsize_in));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_out, memsize_out));

    // IplImage rows may be padded (widthStep >= width * nChannels), so copy
    // row by row with the host pitch rather than as one flat block.
    CUDA_SAFE_CALL(cudaMemcpy2D(d_in, rowbytes_in, imgin->imageData, imgin->widthStep, rowbytes_in, imgin->height, cudaMemcpyHostToDevice));

    dim3 threads(32,8);
    // Round the block count up so that every output pixel is covered; the
    // kernels discard the surplus threads.
    dim3 blocks((imgout->width + threads.x - 1) / threads.x, (imgout->height + threads.y - 1) / threads.y);

    // Clear the output first so that any pixel a kernel failed to write shows
    // up as zero instead of stale data. Debug aid; not needed otherwise.
    CUDA_SAFE_CALL(cudaMemset(d_out, 0, memsize_out));
    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    switch(imgin->nChannels){
    case 1:
        Bilinear8Kernel<<< blocks, threads >>> (d_in, d_out, imgin->width, imgin->height, imgin->nChannels, imgout->width, imgout->height);
        break;
    case 3:
        Bilinear24Kernel<<< blocks, threads >>> (d_in, d_out, imgin->width, imgin->height, imgin->nChannels, imgout->width, imgout->height);
        break;
    case 4:
        Bilinear32Kernel<<< blocks, threads >>> (d_in, d_out, imgin->width, imgin->height, imgin->nChannels, imgout->width, imgout->height);
        break;
    }
    // A kernel launch returns no status itself; a bad launch configuration is
    // only visible through cudaGetLastError().
    CUDA_SAFE_CALL(cudaGetLastError());

    // Blocking copy: also waits for the kernel to finish. Uses the host pitch
    // for the same row-padding reason as the upload.
    CUDA_SAFE_CALL(cudaMemcpy2D(imgout->imageData, imgout->widthStep, d_out, rowbytes_out, rowbytes_out, imgout->height, cudaMemcpyDeviceToHost));

    cvNamedWindow("in", 1);
    cvNamedWindow("out", 1);
    cvShowImage("in", imgin);
    cvShowImage("out", imgout);

    // Exit on any key press.
    cvWaitKey(-1);

    cvDestroyWindow("in");
    cvDestroyWindow("out");
    cvReleaseImage(&imgin);
    cvReleaseImage(&imgout);
    CUDA_SAFE_CALL(cudaFree(d_in));
    CUDA_SAFE_CALL(cudaFree(d_out));
    return 0;
}