globalメモリとtextureメモリ

OpenCVを利用し、カーネルではコピーのみ行う処理を、
globalメモリを使用した場合と、textureメモリを使用した場合の両方の比較です。
textureメモリ使用の参考にして下さい。
（画像はペイントなどでbmpファイルを作成してください）

1次元で処理させています。
2次元の方が速いという情報もあります・・・情報提供お願いします。

#include <stdio.h> #include "cv.h" #include "cxcore.h" #include "highgui.h" #pragma comment(lib,"cv.lib") #pragma comment(lib,"cxcore.lib") #pragma comment(lib,"highgui.lib") #include <cutil.h> __global__ void bit1cpykernel(unsigned char *in, unsigned char *out, int width, int channels){ int x = blockDim.x * blockIdx.x + threadIdx.x; int y = blockDim.y * blockIdx.y + threadIdx.y; out[y * width * channels + x * channels + 0] = in[y * width * channels + x * channels + 0]; } __global__ void bit2cpykernel(unsigned char *in, unsigned char *out, int width, int channels){ int x = blockDim.x * blockIdx.x + threadIdx.x; int y = blockDim.y * blockIdx.y + threadIdx.y; out[y * width * channels + x * channels + 0] = in[y * width * channels + x * channels + 0]; out[y * width * channels + x * channels + 1] = in[y * width * channels + x * channels + 1]; } __global__ void bit3cpykernel(unsigned char *in, unsigned char *out, int width, int channels){ int x = blockDim.x * blockIdx.x + threadIdx.x; int y = blockDim.y * blockIdx.y + threadIdx.y; out[y * width * channels + x * channels + 0] = in[y * width * channels + x * channels + 0]; out[y * width * channels + x * channels + 1] = in[y * width * channels + x * channels + 1]; out[y * width * channels + x * channels + 2] = in[y * width * channels + x * channels + 2]; } __global__ void bit4cpykernel(unsigned char *in, unsigned char *out, int width, int channels){ int x = blockDim.x * blockIdx.x + threadIdx.x; int y = blockDim.y * blockIdx.y + threadIdx.y; out[y * width * channels + x * channels + 0] = in[y * width * channels + x * channels + 0]; out[y * width * channels + x * channels + 1] = in[y * width * channels + x * channels + 1]; out[y * width * channels + x * channels + 2] = in[y * width * channels + x * channels + 2]; out[y * width * channels + x * channels + 3] = in[y * width * channels + x * channels + 3]; } int main(int argc, char **argv){ CUT_DEVICE_INIT( argc, argv ); IplImage *imgin, *imgout; unsigned char *d_in, *d_out; int memsize; imgin = cvLoadImage("ファイルパス"); imgout = cvCreateImage(cvSize(imgin->width, imgin->height), imgin->depth, imgin->nChannels); printf("width = %d, height = %d, nChannels = %d\n", imgin->width, imgin->height, imgin->nChannels); memsize = imgin->width * imgin->height * imgin->nChannels; CUDA_SAFE_CALL(cudaMalloc((void**)&d_in, memsize)); CUDA_SAFE_CALL(cudaMalloc((void**)&d_out, memsize)); CUDA_SAFE_CALL(cudaMemcpy(d_in, imgin->imageData, memsize, cudaMemcpyHostToDevice)); dim3 threads(32,8); dim3 blocks(imgin->width / threads.x, imgin->height / threads.y); // 前の情報を一旦初期化（処理されなかったメモリは、前のデータが表示されるため） // デバッグ用。通常は必要なし cudaMemset(d_out, 0, memsize); CUDA_SAFE_CALL( cudaThreadSynchronize() ); unsigned int timer = 0; CUT_SAFE_CALL( cutCreateTimer( &timer)); CUT_SAFE_CALL( cutStartTimer( timer)); switch(imgin->nChannels){ case 1: bit1cpykernel<<< blocks, threads >>> (d_in, d_out, imgin->width, imgin->nChannels); break; case 2: bit2cpykernel<<< blocks, threads >>> (d_in, d_out, imgin->width, imgin->nChannels); break; case 3: bit3cpykernel<<< blocks, threads >>> (d_in, d_out, imgin->width, imgin->nChannels); break; case 4: bit4cpykernel<<< blocks, threads >>> (d_in, d_out, imgin->width, imgin->nChannels); break; } // 同期をとる。 CUDA_SAFE_CALL( cudaThreadSynchronize() ); CUT_SAFE_CALL( cutStopTimer( timer)); printf("処理時間： %f (ms)\n", cutGetTimerValue( timer)); CUT_SAFE_CALL( cutDeleteTimer( timer)); CUDA_SAFE_CALL(cudaMemcpy(imgout->imageData, d_out, memsize, cudaMemcpyDeviceToHost)); cvNamedWindow("in", 1); cvNamedWindow("out", 1); cvShowImage("in", imgin); cvShowImage("out", imgout); // キーを何か押したら終了 cvWaitKey(-1); cvDestroyWindow("in"); cvDestroyWindow("out"); cvReleaseImage(&imgin); cvReleaseImage(&imgout); cudaFree(d_in); cudaFree(d_out); return 0; }

#include <stdio.h> #include "cv.h" #include "cxcore.h" #include "highgui.h" #pragma comment(lib,"cv.lib") #pragma comment(lib,"cxcore.lib") #pragma comment(lib,"highgui.lib") #include <cutil.h> texture<unsigned char, 1, cudaReadModeElementType> texRef; __global__ void bit1cpykernel(unsigned char *out, int width, int channels){ int x = blockDim.x * blockIdx.x + threadIdx.x; int y = blockDim.y * blockIdx.y + threadIdx.y; out[y * width * channels + x * channels + 0] = tex1Dfetch(texRef, y * width * channels + x * channels + 0); } __global__ void bit2cpykernel(unsigned char *out, int width, int channels){ int x = blockDim.x * blockIdx.x + threadIdx.x; int y = blockDim.y * blockIdx.y + threadIdx.y; out[y * width * channels + x * channels + 0] = tex1Dfetch(texRef, y * width * channels + x * channels + 0); out[y * width * channels + x * channels + 1] = tex1Dfetch(texRef, y * width * channels + x * channels + 1); } __global__ void bit3cpykernel(unsigned char *out, int width, int channels){ int x = blockDim.x * blockIdx.x + threadIdx.x; int y = blockDim.y * blockIdx.y + threadIdx.y; out[y * width * channels + x * channels + 0] = tex1Dfetch(texRef, y * width * channels + x * channels + 0); out[y * width * channels + x * channels + 1] = tex1Dfetch(texRef, y * width * channels + x * channels + 1); out[y * width * channels + x * channels + 2] = tex1Dfetch(texRef, y * width * channels + x * channels + 2); } __global__ void bit4cpykernel(unsigned char *out, int width, int channels){ int x = blockDim.x * blockIdx.x + threadIdx.x; int y = blockDim.y * blockIdx.y + threadIdx.y; out[y * width * channels + x * channels + 0] = tex1Dfetch(texRef, y * width * channels + x * channels + 0); out[y * width * channels + x * channels + 1] = tex1Dfetch(texRef, y * width * channels + x * channels + 1); out[y * width * channels + x * channels + 2] = tex1Dfetch(texRef, y * width * channels + x * channels + 2); out[y * width * channels + x * channels + 3] = tex1Dfetch(texRef, y * width * channels + x * channels + 3); } int main(int argc, char **argv){ CUT_DEVICE_INIT( argc, argv ); IplImage *imgin, *imgout; unsigned char *d_in, *d_out; int memsize; imgin = cvLoadImage("ファイルパス"); imgout = cvCreateImage(cvSize(imgin->width, imgin->height), imgin->depth, imgin->nChannels); printf("width = %d, height = %d, nChannels = %d\n", imgin->width, imgin->height, imgin->nChannels); memsize = imgin->width * imgin->height * imgin->nChannels; CUDA_SAFE_CALL(cudaMalloc((void**)&d_in, memsize)); CUDA_SAFE_CALL(cudaMalloc((void**)&d_out, memsize)); CUDA_SAFE_CALL(cudaMemcpy(d_in, imgin->imageData, memsize, cudaMemcpyHostToDevice)); dim3 threads(32,8); dim3 blocks(imgin->width / threads.x, imgin->height / threads.y); // 前の情報を一旦初期化（処理されなかったメモリは、前のデータが表示されるため） // デバッグ用。通常は必要なし cudaMemset(d_out, 0, memsize); CUDA_SAFE_CALL( cudaThreadSynchronize() ); cudaBindTexture(NULL, texRef, d_in); unsigned int timer = 0; CUT_SAFE_CALL( cutCreateTimer( &timer)); CUT_SAFE_CALL( cutStartTimer( timer)); switch(imgin->nChannels){ case 1: bit1cpykernel<<< blocks, threads >>> (d_out, imgin->width, imgin->nChannels); break; case 2: bit2cpykernel<<< blocks, threads >>> (d_out, imgin->width, imgin->nChannels); break; case 3: bit3cpykernel<<< blocks, threads >>> (d_out, imgin->width, imgin->nChannels); break; case 4: bit4cpykernel<<< blocks, threads >>> (d_out, imgin->width, imgin->nChannels); break; } // 同期をとる。 CUDA_SAFE_CALL( cudaThreadSynchronize() ); CUT_SAFE_CALL( cutStopTimer( timer)); printf("処理時間： %f (ms)\n", cutGetTimerValue( timer)); CUT_SAFE_CALL( cutDeleteTimer( timer)); CUDA_SAFE_CALL(cudaMemcpy(imgout->imageData, d_out, memsize, cudaMemcpyDeviceToHost)); cvNamedWindow("in", 1); cvNamedWindow("out", 1); cvShowImage("in", imgin); cvShowImage("out", imgout); // キーを何か押したら終了 cvWaitKey(-1); cvDestroyWindow("in"); cvDestroyWindow("out"); cvReleaseImage(&imgin); cvReleaseImage(&imgout); cudaFree(d_in); cudaFree(d_out); return 0; }

フルHD(1920*1080)、24ビット(RGB)画像で処理させた場合、
以下のような時間となりました。

■GeForce8400GS
textureメモリ：58.540138 (ms)
globalメモリ：121.632118 (ms)

■GeForce9500GT
textureメモリ：11.615941 (ms)
globalメモリ：18.154190 (ms)

■GTX260
textureメモリ：0.443769 (ms)
globalメモリ：0.637897 (ms)

■GTX560
textureメモリ：0.257563 (ms)
globalメモリ：0.268670 (ms)

新しくなるにつれ、あまり差は出なくなっています。