stream (texture memory version)
This is simply the global-memory version with the reads changed to texture memory.
The point to note is that the starting address within the input image has to be specified.
Here, the variable "start" is used to pass that starting offset to the kernel.
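To see what actually changes, here is a minimal single-channel sketch; the kernel names copy_global and copy_texture are made up for illustration and do not appear in the sample below, and texRef is the 1D texture reference declared in the sample. With global memory the per-stream offset can be applied to the input pointer itself, but a texture reference is bound to the whole buffer, so the offset has to be passed in as an element index.
__global__ void copy_global(unsigned char *out, const unsigned char *in, int n){
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if(idx < n) out[idx] = in[idx]; // 'in' already points at this stream's chunk of the input
}
__global__ void copy_texture(unsigned char *out, int n, int start){
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if(idx < n) out[idx] = tex1Dfetch(texRef, start + idx); // the offset is passed in as an element index instead
}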
Sample program
#include <stdio.h>
#include "cv.h"
#include "cxcore.h"
#include "highgui.h"
#pragma comment(lib,"cv.lib")
#pragma comment(lib,"cxcore.lib")
#pragma comment(lib,"highgui.lib")
#include <cutil.h> // provides CUDA_SAFE_CALL (CUDA SDK utility header)
// 1D texture reference bound to the input image data (unsigned char elements)
texture<unsigned char, 1, cudaReadModeElementType> texRef;
// 1-channel copy kernel: each thread copies one pixel, reading the input through the texture at element offset 'start'
__global__ void bit1cpykernel(unsigned char *out, int width, int channels, int start){
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
out[y * width * channels + x * channels + 0] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 0);
}
// 2-channel variant
__global__ void bit2cpykernel(unsigned char *out, int width, int channels, int start){
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
out[y * width * channels + x * channels + 0] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 0);
out[y * width * channels + x * channels + 1] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 1);
}
// 3-channel variant (standard BGR images)
__global__ void bit3cpykernel(unsigned char *out, int width, int channels, int start){
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
out[y * width * channels + x * channels + 0] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 0);
out[y * width * channels + x * channels + 1] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 1);
out[y * width * channels + x * channels + 2] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 2);
}
// 4-channel variant
__global__ void bit4cpykernel(unsigned char *out, int width, int channels, int start){
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
out[y * width * channels + x * channels + 0] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 0);
out[y * width * channels + x * channels + 1] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 1);
out[y * width * channels + x * channels + 2] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 2);
out[y * width * channels + x * channels + 3] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 3);
}
int main(int argc, char **argv){
IplImage *imgin, *imgout;
unsigned char *d_in, *d_out, *out_img;
int memsize;
imgin = cvLoadImage("filename"); // replace with the path to your input image
imgout = cvCreateImage(cvSize(imgin->width, imgin->height), imgin->depth, imgin->nChannels);
printf("width = %d, height = %d, nChannels = %d\n", imgin->width, imgin->height, imgin->nChannels);
memsize = imgin->width * imgin->height * imgin->nChannels;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_in, memsize));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_out, memsize));
// When using streams, host memory must be allocated with cudaMallocHost (pinned memory)
CUDA_SAFE_CALL(cudaMallocHost((void**)&out_img, memsize));
CUDA_SAFE_CALL(cudaMemcpy(d_in, imgin->imageData, memsize, cudaMemcpyHostToDevice));
dim3 threads(32,2);
// Adjust the number of blocks because the image is split across the streams (4 here)
dim3 blocks(imgin->width / threads.x, imgin->height / (4*threads.y));
// Clear the previous contents first (any memory that is not processed would otherwise still show the old data)
// For debugging only; normally not needed
cudaMemset(d_out, 0, memsize);
CUDA_SAFE_CALL( cudaThreadSynchronize() );
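// Bind the whole input buffer d_in to the 1D texture reference; the kernels read it with tex1Dfetch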
cudaBindTexture(NULL, texRef, d_in);
int nstreams = 4;
cudaStream_t *streams;
streams = (cudaStream_t*) malloc(nstreams * sizeof(cudaStream_t));
for(int i = 0; i < nstreams; i++)
CUDA_SAFE_CALL( cudaStreamCreate(&(streams[i])) );
CUDA_SAFE_CALL( cudaThreadSynchronize() );
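// Launch one kernel per stream. Each launch covers 1/nstreams of the rows:
// d_out is advanced by the stream's byte offset, and the same offset (in elements)
// is passed as 'start' so the kernel reads the matching part of the input through the texture.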
for(int i = 0; i < nstreams; i++){
switch(imgin->nChannels){
case 1:
bit1cpykernel<<< blocks, threads, 0, streams[i] >>> (d_out + (i*memsize)/nstreams, imgin->width, imgin->nChannels, (i * memsize/sizeof(char))/nstreams);
break;
case 2:
bit2cpykernel<<< blocks, threads, 0, streams[i] >>> (d_out + (i*memsize)/nstreams, imgin->width, imgin->nChannels, (i * memsize/sizeof(char))/nstreams);
break;
case 3:
bit3cpykernel<<< blocks, threads, 0, streams[i] >>> (d_out + (i*memsize)/nstreams, imgin->width, imgin->nChannels, (i * memsize/sizeof(char))/nstreams);
break;
case 4:
bit4cpykernel<<< blocks, threads, 0, streams[i] >>> (d_out + (i*memsize)/nstreams, imgin->width, imgin->nChannels, (i * memsize/sizeof(char))/nstreams);
break;
}
}
for(int i = 0; i < nstreams; i++){
CUDA_SAFE_CALL(cudaMemcpyAsync(out_img + i * memsize/sizeof(char)/nstreams, d_out + i * memsize/sizeof(char)/nstreams, memsize/nstreams, cudaMemcpyDeviceToHost, streams[i]));
}
CUDA_SAFE_CALL( cudaThreadSynchronize() );
imgout->imageData = (char*)out_img; // hand the pinned host buffer to the output image
cvNamedWindow("in", 1);
cvNamedWindow("out", 1);
cvShowImage("in", imgin);
cvShowImage("out", imgout);
// Exit when any key is pressed
cvWaitKey(-1);
cvDestroyWindow("in");
cvDestroyWindow("out");
cvReleaseImage(&imgin);
cvReleaseImage(&imgout);
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
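The sample does not explicitly release everything it created. If you want to clean up before returning, something along the following lines can be added at the end of main; this is only a sketch using the standard runtime calls, and since imgout->imageData was pointed at the pinned buffer above, make sure that buffer ends up being freed exactly once.
cudaUnbindTexture(texRef);
for(int i = 0; i < nstreams; i++)
CUDA_SAFE_CALL( cudaStreamDestroy(streams[i]) );
free(streams);
cudaFreeHost(out_img);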