stream (texture memory version)
This is simply the global memory version rewritten to read its input through texture memory.
The point to note is that you have to specify the start address of each stream's portion of the input image yourself.

Here, that start address is passed to each kernel through the variable "start".
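To make the offset handling concrete, here is a minimal sketch of how the i-th of nstreams streams is launched (kernel and variable names are taken from the sample program below): the output pointer d_out can simply be shifted, but the input is read through the texture bound to the whole of d_in, so the same offset is passed in as "start" and added to every tex1Dfetch index inside the kernel.

// Sketch: launching the strip handled by stream i (names as in the sample below)
int start = (i * memsize) / nstreams;              // first byte of this stream's strip
bit3cpykernel<<< blocks, threads, 0, streams[i] >>>(
    d_out + start,                                 // output: the pointer itself is shifted
    imgin->width, imgin->nChannels,
    start);                                        // input: offset is added inside tex1Dfetch
// inside the kernel: out[idx] = tex1Dfetch(texRef, start + idx);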

Sample program
#include <stdio.h>
#include <string.h>
#include "cv.h"
#include "cxcore.h"
#include "highgui.h"
#pragma comment(lib,"cv.lib")
#pragma comment(lib,"cxcore.lib")
#pragma comment(lib,"highgui.lib")
#include <cutil.h>

texture<unsigned char, 1, cudaReadModeElementType> texRef;

__global__ void bit1cpykernel(unsigned char *out, int width, int channels, int start){
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    out[y * width * channels + x * channels + 0] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 0);
}

__global__ void bit2cpykernel(unsigned char *out, int width, int channels, int start){
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    out[y * width * channels + x * channels + 0] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 0);
    out[y * width * channels + x * channels + 1] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 1);
}

__global__ void bit3cpykernel(unsigned char *out, int width, int channels, int start){
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    out[y * width * channels + x * channels + 0] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 0);
    out[y * width * channels + x * channels + 1] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 1);
    out[y * width * channels + x * channels + 2] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 2);
}

__global__ void bit4cpykernel(unsigned char *out, int width, int channels, int start){
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    out[y * width * channels + x * channels + 0] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 0);
    out[y * width * channels + x * channels + 1] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 1);
    out[y * width * channels + x * channels + 2] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 2);
    out[y * width * channels + x * channels + 3] = tex1Dfetch(texRef, start + y * width * channels + x * channels + 3);
}

int main(int argc, char **argv){
    IplImage *imgin, *imgout;
    unsigned char *d_in, *d_out, *out_img;
    int memsize;

    imgin = cvLoadImage("filename");
    imgout = cvCreateImage(cvSize(imgin->width, imgin->height), imgin->depth, imgin->nChannels);
    printf("width = %d, height = %d, nChannels = %d\n", imgin->width, imgin->height, imgin->nChannels);

    memsize = imgin->width * imgin->height * imgin->nChannels;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_in, memsize));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_out, memsize));
    // When using streams, the host buffer must be page-locked, i.e. allocated with cudaMallocHost
    CUDA_SAFE_CALL(cudaMallocHost((void**)&out_img, memsize));

    CUDA_SAFE_CALL(cudaMemcpy(d_in, imgin->imageData, memsize, cudaMemcpyHostToDevice));

    int nstreams = 4;
    dim3 threads(32, 2);
    // The image is split into nstreams horizontal strips, so the grid covers only 1/nstreams of the height
    dim3 blocks(imgin->width / threads.x, imgin->height / (nstreams * threads.y));

    // Clear the output once so that any unprocessed region does not show stale data
    // (debugging aid; normally unnecessary)
    cudaMemset(d_out, 0, memsize);
    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    cudaBindTexture(NULL, texRef, d_in);

    cudaStream_t *streams = (cudaStream_t*) malloc(nstreams * sizeof(cudaStream_t));
    for(int i = 0; i < nstreams; i++)
        CUDA_SAFE_CALL( cudaStreamCreate(&(streams[i])) );
    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    // Launch one kernel per stream; each stream processes its own strip of the image
    for(int i = 0; i < nstreams; i++){
        switch(imgin->nChannels){
        case 1:
            bit1cpykernel<<< blocks, threads, 0, streams[i] >>>
                (d_out + (i * memsize)/nstreams, imgin->width, imgin->nChannels, (i * memsize)/nstreams);
            break;
        case 2:
            bit2cpykernel<<< blocks, threads, 0, streams[i] >>>
                (d_out + (i * memsize)/nstreams, imgin->width, imgin->nChannels, (i * memsize)/nstreams);
            break;
        case 3:
            bit3cpykernel<<< blocks, threads, 0, streams[i] >>>
                (d_out + (i * memsize)/nstreams, imgin->width, imgin->nChannels, (i * memsize)/nstreams);
            break;
        case 4:
            bit4cpykernel<<< blocks, threads, 0, streams[i] >>>
                (d_out + (i * memsize)/nstreams, imgin->width, imgin->nChannels, (i * memsize)/nstreams);
            break;
        }
    }

    // Copy each strip back asynchronously in the same stream that produced it
    for(int i = 0; i < nstreams; i++){
        CUDA_SAFE_CALL(cudaMemcpyAsync(out_img + (i * memsize)/nstreams,
                                       d_out   + (i * memsize)/nstreams,
                                       memsize/nstreams, cudaMemcpyDeviceToHost, streams[i]));
    }
    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    // Copy the result into the output image (assigning imageData directly would make
    // cvReleaseImage try to free the page-locked buffer)
    memcpy(imgout->imageData, out_img, memsize);

    cvNamedWindow("in", 1);
    cvNamedWindow("out", 1);
    cvShowImage("in", imgin);
    cvShowImage("out", imgout);
    // Exit when any key is pressed
    cvWaitKey(-1);
    cvDestroyWindow("in");
    cvDestroyWindow("out");
    cvReleaseImage(&imgin);
    cvReleaseImage(&imgout);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
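The sample exits without destroying the streams or releasing the page-locked buffer and the texture binding; the OS reclaims everything at process exit, but a minimal cleanup sketch (placement just before return 0 is assumed) would be:

// Cleanup omitted from the sample above (assumed to go just before return 0)
for(int i = 0; i < nstreams; i++)
    CUDA_SAFE_CALL( cudaStreamDestroy(streams[i]) );
free(streams);
cudaUnbindTexture(texRef);   // release the texture binding on d_in
cudaFreeHost(out_img);       // memory from cudaMallocHost is freed with cudaFreeHost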