CUDA入門 >
メモリ >
globalメモリとtextureメモリ
globalメモリとtextureメモリ
globalメモリとtextureメモリ
OpenCVを利用し、カーネルではコピーのみ行う処理を、
globalメモリを使用した場合と、textureメモリを使用した場合の両方の比較です。
textureメモリ使用の参考にして下さい。
(画像はペイントなどでbmpファイルを作成してください)
1次元で処理させています。
2次元の方が速いという情報もあります・・・情報提供お願いします。
globalメモリ
#include
#include "cv.h"
#include "cxcore.h"
#include "highgui.h"
#pragma comment(lib,"cv.lib")
#pragma comment(lib,"cxcore.lib")
#pragma comment(lib,"highgui.lib")
#include
__global__ void bit1cpykernel(unsigned char *in, unsigned char *out, int width, int channels){
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
out[y * width * channels + x * channels + 0] = in[y * width * channels + x * channels + 0];
}
__global__ void bit2cpykernel(unsigned char *in, unsigned char *out, int width, int channels){
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
out[y * width * channels + x * channels + 0] = in[y * width * channels + x * channels + 0];
out[y * width * channels + x * channels + 1] = in[y * width * channels + x * channels + 1];
}
__global__ void bit3cpykernel(unsigned char *in, unsigned char *out, int width, int channels){
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
out[y * width * channels + x * channels + 0] = in[y * width * channels + x * channels + 0];
out[y * width * channels + x * channels + 1] = in[y * width * channels + x * channels + 1];
out[y * width * channels + x * channels + 2] = in[y * width * channels + x * channels + 2];
}
__global__ void bit4cpykernel(unsigned char *in, unsigned char *out, int width, int channels){
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
out[y * width * channels + x * channels + 0] = in[y * width * channels + x * channels + 0];
out[y * width * channels + x * channels + 1] = in[y * width * channels + x * channels + 1];
out[y * width * channels + x * channels + 2] = in[y * width * channels + x * channels + 2];
out[y * width * channels + x * channels + 3] = in[y * width * channels + x * channels + 3];
}
int main(int argc, char **argv){
CUT_DEVICE_INIT( argc, argv );
IplImage *imgin, *imgout;
unsigned char *d_in, *d_out;
int memsize;
imgin = cvLoadImage("ファイルパス");
imgout = cvCreateImage(cvSize(imgin->width, imgin->height), imgin->depth, imgin->nChannels);
printf("width = %d, height = %d, nChannels = %d\n", imgin->width, imgin->height, imgin->nChannels);
memsize = imgin->width * imgin->height * imgin->nChannels;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_in, memsize));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_out, memsize));
CUDA_SAFE_CALL(cudaMemcpy(d_in, imgin->imageData, memsize, cudaMemcpyHostToDevice));
dim3 threads(32,8);
dim3 blocks(imgin->width / threads.x, imgin->height / threads.y);
// 前の情報を一旦初期化(処理されなかったメモリは、前のデータが表示されるため)
// デバッグ用。通常は必要なし
cudaMemset(d_out, 0, memsize);
CUDA_SAFE_CALL( cudaThreadSynchronize() );
unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));
switch(imgin->nChannels){
case 1:
bit1cpykernel<<< blocks, threads >>> (d_in, d_out, imgin->width, imgin->nChannels);
break;
case 2:
bit2cpykernel<<< blocks, threads >>> (d_in, d_out, imgin->width, imgin->nChannels);
break;
case 3:
bit3cpykernel<<< blocks, threads >>> (d_in, d_out, imgin->width, imgin->nChannels);
break;
case 4:
bit4cpykernel<<< blocks, threads >>> (d_in, d_out, imgin->width, imgin->nChannels);
break;
}
// 同期をとる。
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL( cutStopTimer( timer));
printf("処理時間: %f (ms)\n", cutGetTimerValue( timer));
CUT_SAFE_CALL( cutDeleteTimer( timer));
CUDA_SAFE_CALL(cudaMemcpy(imgout->imageData, d_out, memsize, cudaMemcpyDeviceToHost));
cvNamedWindow("in", 1);
cvNamedWindow("out", 1);
cvShowImage("in", imgin);
cvShowImage("out", imgout);
// キーを何か押したら終了
cvWaitKey(-1);
cvDestroyWindow("in");
cvDestroyWindow("out");
cvReleaseImage(&imgin);
cvReleaseImage(&imgout);
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
textureメモリ
#include
#include "cv.h"
#include "cxcore.h"
#include "highgui.h"
#pragma comment(lib,"cv.lib")
#pragma comment(lib,"cxcore.lib")
#pragma comment(lib,"highgui.lib")
#include
texture texRef;
__global__ void bit1cpykernel(unsigned char *out, int width, int channels){
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
out[y * width * channels + x * channels + 0] = tex1Dfetch(texRef, y * width * channels + x * channels + 0);
}
__global__ void bit2cpykernel(unsigned char *out, int width, int channels){
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
out[y * width * channels + x * channels + 0] = tex1Dfetch(texRef, y * width * channels + x * channels + 0);
out[y * width * channels + x * channels + 1] = tex1Dfetch(texRef, y * width * channels + x * channels + 1);
}
__global__ void bit3cpykernel(unsigned char *out, int width, int channels){
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
out[y * width * channels + x * channels + 0] = tex1Dfetch(texRef, y * width * channels + x * channels + 0);
out[y * width * channels + x * channels + 1] = tex1Dfetch(texRef, y * width * channels + x * channels + 1);
out[y * width * channels + x * channels + 2] = tex1Dfetch(texRef, y * width * channels + x * channels + 2);
}
__global__ void bit4cpykernel(unsigned char *out, int width, int channels){
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
out[y * width * channels + x * channels + 0] = tex1Dfetch(texRef, y * width * channels + x * channels + 0);
out[y * width * channels + x * channels + 1] = tex1Dfetch(texRef, y * width * channels + x * channels + 1);
out[y * width * channels + x * channels + 2] = tex1Dfetch(texRef, y * width * channels + x * channels + 2);
out[y * width * channels + x * channels + 3] = tex1Dfetch(texRef, y * width * channels + x * channels + 3);
}
int main(int argc, char **argv){
CUT_DEVICE_INIT( argc, argv );
IplImage *imgin, *imgout;
unsigned char *d_in, *d_out;
int memsize;
imgin = cvLoadImage("ファイルパス");
imgout = cvCreateImage(cvSize(imgin->width, imgin->height), imgin->depth, imgin->nChannels);
printf("width = %d, height = %d, nChannels = %d\n", imgin->width, imgin->height, imgin->nChannels);
memsize = imgin->width * imgin->height * imgin->nChannels;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_in, memsize));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_out, memsize));
CUDA_SAFE_CALL(cudaMemcpy(d_in, imgin->imageData, memsize, cudaMemcpyHostToDevice));
dim3 threads(32,8);
dim3 blocks(imgin->width / threads.x, imgin->height / threads.y);
// 前の情報を一旦初期化(処理されなかったメモリは、前のデータが表示されるため)
// デバッグ用。通常は必要なし
cudaMemset(d_out, 0, memsize);
CUDA_SAFE_CALL( cudaThreadSynchronize() );
cudaBindTexture(NULL, texRef, d_in);
unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));
switch(imgin->nChannels){
case 1:
bit1cpykernel<<< blocks, threads >>> (d_out, imgin->width, imgin->nChannels);
break;
case 2:
bit2cpykernel<<< blocks, threads >>> (d_out, imgin->width, imgin->nChannels);
break;
case 3:
bit3cpykernel<<< blocks, threads >>> (d_out, imgin->width, imgin->nChannels);
break;
case 4:
bit4cpykernel<<< blocks, threads >>> (d_out, imgin->width, imgin->nChannels);
break;
}
// 同期をとる。
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL( cutStopTimer( timer));
printf("処理時間: %f (ms)\n", cutGetTimerValue( timer));
CUT_SAFE_CALL( cutDeleteTimer( timer));
CUDA_SAFE_CALL(cudaMemcpy(imgout->imageData, d_out, memsize, cudaMemcpyDeviceToHost));
cvNamedWindow("in", 1);
cvNamedWindow("out", 1);
cvShowImage("in", imgin);
cvShowImage("out", imgout);
// キーを何か押したら終了
cvWaitKey(-1);
cvDestroyWindow("in");
cvDestroyWindow("out");
cvReleaseImage(&imgin);
cvReleaseImage(&imgout);
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
速度の違い
フルHD(1920*1080)、24ビット(RGB)画像で処理させた場合、
以下のような時間となりました。
■GeForce8400GS
textureメモリ:58.540138 (ms)
globalメモリ:121.632118 (ms)
■GeForce9500GT
textureメモリ:11.615941 (ms)
globalメモリ:18.154190 (ms)
■GTX260
textureメモリ:0.443769 (ms)
globalメモリ:0.637897 (ms)
■GTX560
textureメモリ:0.257563 (ms)
globalメモリ:0.268670 (ms)
新しくなるにつれ、あまり差は出なくなっています。