问题描述
- CUDA-GPU加速-中值滤波-黑屏+显卡程序崩溃
-
新手刚接触CUDA C,这段中值滤波的代码一直跑不通,如果读取一张小图片(2790,2560)没有问题,如果读取一张大图片(5580,5120),就会导致黑屏+没有结果。
我的显卡是NVIDIA Geforce 820M#include <iostream> #include "ImMedFilter.h" #define DIM_GRID 128 #define DIM_BLOCK 16 using namespace std; __device__ void mid(unsigned char* data, int size, int midIndex) { unsigned char temp = 0; for (int i = 0; i <= midIndex; i++) { for (int j = i+1; j < size; j++) { if (data[j] > data[i]) { temp = data[i]; data[i] = data[j]; data[j] = temp; } } } } __global__ void global_ZhongSmooth(unsigned char *dev_src, unsigned char *dev_dst, int width , int height, int bytesPerLine, int size, int markSize) { int x = threadIdx.x + blockIdx.x*blockDim.x; int y = threadIdx.y + blockIdx.y*blockDim.y; int step_x = blockDim.x * gridDim.x; int step_y = blockDim.y * gridDim.y; for (int t_y = y; t_y < height - markSize; t_y = t_y + step_y) { if (t_y < markSize || t_y > height-markSize-1) { continue; } for (int t_x = x; t_x < width - markSize; t_x = t_x + step_x) { if (t_x < markSize || t_x > width-markSize-1) { continue; } unsigned int index = t_x + t_y*width; if( index < size ) { unsigned char len = (markSize*2+1)*(markSize*2+1); unsigned char midIndex = len/2; // 创建保存中值区域的数组 // size = (marksize * 2 + 1) ^ 2; // 滤波大小:3、5、7、9、...、29、31 unsigned char*a = NULL; switch (markSize) { case 1: unsigned char c[9]; a = c; break; case 2: unsigned char d[25]; a = d; break; case 3: unsigned char e[49]; a = e; break; case 4: unsigned char f[81]; a = f; break; case 5: unsigned char g[121]; a = g; break; case 6: unsigned char h[169]; a = h; break; case 7: unsigned char v[225]; a = v; break; case 8: unsigned char w[289]; a = w; break; case 9: unsigned char x[361]; a = x; break; case 10: unsigned char z[441]; a = z; break; case 11: unsigned char u[529]; a = u; break; case 12: unsigned char q[625]; a = q; break; case 13: unsigned char i[729]; a = i; break; case 14: unsigned char o[841]; a = o; break; default: return; } unsigned char k = 0; for (int i = -markSize; i <= markSize; i++) { for (int j = -markSize; j <= markSize; j++) { a[k++] = dev_src[t_x+j+(t_y+i)*width]; } } mid(a, len, midIndex); dev_dst[index] = a[midIndex]; } } } } extern "C" void ImMedfilter(const unsigned char *host_src, unsigned char *host_dst, int width, int height, int bytesPerLine, int markSize) { int dataSize = bytesPerLine * height; unsigned char *dev_src = NULL; unsigned char *dev_dst = NULL; cudaMalloc((void**) &dev_src, dataSize); cudaMalloc((void**) &dev_dst, dataSize); cudaMemset(dev_dst, 0, dataSize); cudaMemcpy(dev_src, host_src, dataSize, cudaMemcpyHostToDevice); dim3 blocks(DIM_BLOCK, DIM_BLOCK); dim3 grids(DIM_GRID, DIM_GRID); global_ZhongSmooth<<<grids, blocks>>>(dev_src, dev_dst, width, height, bytesPerLine, dataSize, markSize); cudaMemcpy(host_dst, dev_dst, dataSize, cudaMemcpyDeviceToHost); cudaFree(dev_src); cudaFree(dev_dst); }
解决方案
每次程序黑屏都是ImMedfilter函数运行了:
cudaMemcpy(host_dst, dev_dst, dataSize, cudaMemcpyDeviceToHost);
这一段代码后。
解决方案二:
快1年了,基本没碰CUDA了,最近项目需要用GPU加速,又遇到这种类似情况,没人回答就自己不断补充吧。
cudaMemcpy函数不会等GPU运算结束再执行,所以GPU在操作dev_dst,CPU也在操作,两者冲突,直接死机。
我的暂时解决方法是加一句cudaThreadSynchronize(),等待GPU执行完毕CPU再继续操作。
这种方法会有问题,如果算法计算时间过长的话,你的屏幕就会卡死,要等到GPU计算完屏幕才能正常工作,这就很蛋疼。
应该是方法还没有找对,看后面继续改进吧。。。。。。。
时间: 2024-12-21 18:07:42