result from mex function causes all my indices to be the same value

13 views (last 30 days)
Hi, I am writing a mex function to implement one of my algorithms in CUDA. I know the funcitons are performing correctly but when i am taking the input from C++ into a variable in MATLAB using a mex function, my all values in my matrix result_prod is the same. Is ther something I am not doing correctly?
__device__ double prod_signal(double* result,double* res_final,double* signal, int nrows, int elements, double *img_PAM_val) {
for (int row = 0; row < nrows;row++) {
for (int numEl = 0; numEl < elements; numEl++) {
result[row] += signal[row + numEl * nrows];
}
res_final[row] += result[row] * result[row];
img_PAM_val[0] += res_final[row];
}
return *img_PAM_val;
}
__global__ void calc_shift_signal(double* img_PAM,double *result,double* res2,double* res_final, double *img_PAM_val,double* val_for_cal, double *h_raw, double* tof, int nrows, int ncols, int nFrames,int pix_x,int pix_y,int elements) {
int x_p = blockIdx.x * blockDim.x + threadIdx.x;
int y_p = blockIdx.y * blockDim.y + threadIdx.y;
//int x_p = blockIdx.x * blockDim.x + threadIdx.x;
//int y_p = blockIdx.y * blockDim.y + threadIdx.y;
int pix = 0;
if (y_p < pix_y && x_p < pix_x) {
for (int numEl = 0; numEl < elements; numEl++) {
int shift_idx = tof[x_p + pix_x * (y_p + pix_y * (numEl))];// this works
//int shift_idx = tof[y_p + pix_y * (x_p + pix_x * (numEl))];// this works
img_PAM_val[numEl] = shift_idx;
//int limit_val = nrows - img_PAM_val[numEl];
for (int p = 0; p < nrows - img_PAM_val[numEl]; p++) {
int v2use = img_PAM_val[numEl];
result[p + (numEl*nrows)] = h_raw[(p + v2use) + (numEl * nrows)]; //p+v2use;// nrows - img_PAM_val[numEl];// h_raw[(p + v2use) + (numEl * nrows)];// img_PAM_val[numEl];// img_PAM_val[numEl];// +(nrows * numEl);// check for the last pixel. it does not look correct
}
}
int pix_val_x_cur = x_p;
int pix_val_y_cur = y_p;
pix = pix_val_x_cur + pix_val_y_cur * pix_x;
img_PAM[pix]=prod_signal(res2, res_final, result, nrows, elements, val_for_cal);
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
int mxInitGPU();
double *h_raw = (double *)mxGetData(prhs[0]);
double *tof = (double *)mxGetData(prhs[1]);
//short *shifted_data = (short *)mxGetData(prhs[2]);
double *distance_array = (double *)mxGetData(prhs[2]);
const mwSize *dimensions_rawdat = mxGetDimensions(prhs[0]);
int nrows = dimensions_rawdat[0];
int ncols = dimensions_rawdat[1];
int nframes = 1;// dimensions_rawdat[2];
if (mxGetNumberOfElements(prhs[0]) < nrows * ncols) {
mexErrMsgTxt("Not enough elements");
}
const mwSize *dimensions = mxGetDimensions(prhs[1]);
int pix_x = dimensions[0];
int pix_y = dimensions[1];
int elements = dimensions[2];
int size_res = nrows * ncols;
int ndim = 2;
const mwSize dims_n[] = { nrows,ncols };
const mwSize dims_PAM[] = { pix_x,pix_y };
const mwSize dims_PAM_f[] = { nrows };
const mwSize dims_res[] = {elements };
const mwSize dims_pval[] = { 1 };
plhs[0] = mxCreateNumericArray(ndim, dims_PAM, mxDOUBLE_CLASS, mxREAL);
double *result_prod = (double *)mxGetData(plhs[0]);
cudaSetDevice(0);
cudaDeviceReset();
double *dev_res_final;
cudaError_t rcr = cudaMalloc(&dev_res_final, nrows * sizeof(double));
if (rcr != cudaSuccess) {
mexPrintf("Could not allocate dev_res_final: %d \n", rcr);
};
double *dev_img_PAM_val;
cudaError_t rcrp = cudaMalloc(&dev_img_PAM_val, elements * sizeof(double));
if (rcr != cudaSuccess) {
mexPrintf("Could not allocate dev_res_final: %d \n", rcrp);
};
double *dev_calc_for_val;
cudaError_t dc = cudaMalloc(&dev_calc_for_val, 1 * sizeof(double));
if (dc != cudaSuccess) {
mexPrintf("Could not allocate dev_res_final: %d \n", dc);
};
double *dev_PAM;
cudaError_t rcPAM = cudaMalloc(&dev_PAM, pix_x * pix_y * sizeof(double));
if (rcr != cudaSuccess) {
mexPrintf("Could not allocate dev_res_final: %d \n", rcPAM);
};
double *dev_res2;
cudaError_t rcr2 = cudaMalloc(&dev_res2, nrows * sizeof(double));
if (rcr2 != cudaSuccess) {
mexPrintf("Could not allocate dev_res_final: %d \n", rcr2);
};
double *dev_result;
cudaError_t rc = cudaMalloc(&dev_result, size_res * sizeof(double) );
if (rc != cudaSuccess) {
mexPrintf("Could not allocate device result: %d \n", rc);
};
double *raw_dat;
cudaError_t rc2 = cudaMalloc(&raw_dat, nrows * sizeof(double) * ncols);
if (rc2 != cudaSuccess) {
mexPrintf("Could not allocate device result: %d \n", rc2);
};
double *timeOFF;
cudaError_t rc3 = cudaMalloc(&timeOFF, pix_x *pix_y * elements * sizeof(double));
if (rc2 != cudaSuccess) {
mexPrintf("Could not allocate device result: %d \n", rc3);
};
cudaError_t copyMemory_raw = cudaMemcpy(raw_dat, h_raw, sizeof(double) * nrows * ncols * nframes, cudaMemcpyHostToDevice);
if (copyMemory_raw != cudaSuccess) {
mexPrintf("Could not copy from signal array to device array: %d \n", copyMemory_raw);
cudaFree(raw_dat);
};
cudaError_t copyMemory_tof = cudaMemcpy(timeOFF, tof, sizeof(double) * pix_x * pix_y * elements , cudaMemcpyHostToDevice);
if (copyMemory_raw != cudaSuccess) {
mexPrintf("Could not copy from signal array to device array: %d \n", copyMemory_tof);
cudaFree(timeOFF);
};
dim3 threadsPerBlock(8,8,1); // 64 threads
dim3 numBlocks(4,4,1);// block size should be a multiple of 32
//int threadsPerBlock = 4;
//int numBlocks = 32;
calc_shift_signal << <numBlocks, threadsPerBlock >> > (dev_PAM,dev_result,dev_res2,dev_res_final, dev_img_PAM_val, dev_calc_for_val, raw_dat, timeOFF, nrows, ncols, nframes, pix_x,pix_y,elements);
cudaError_t copyMemory_resultn = cudaMemcpyAsync(result_prod, dev_PAM, pix_x * pix_y * sizeof(double), cudaMemcpyDeviceToHost);
if (copyMemory_resultn != cudaSuccess) {
mexPrintf("Could not copy from dev result to actual result: %d \n", copyMemory_resultn);
cudaFree(dev_PAM);
};
cudaFree(timeOFF);
cudaFree(raw_dat);
cudaFree(dev_result);
cudaFree(dev_PAM);
cudaFree(dev_res2);
cudaFree(dev_res_final);
}
  2 Comments
James Tursa
James Tursa on 10 May 2020
How do you know the cuda functions are operating correctly? Can you print the results to the screen and see that they are different?
Aparna Singh
Aparna Singh on 10 May 2020
I know they are operating correctly because i have this algorithm implemented in matlab and the result of first indices match that of matlab. i want to make it realtime so i am using cuda. what is really happening is that for some reason the latest index is updating the values of all previous indices to be that of the latest one.

Sign in to comment.

Answers (0)

Community Treasure Hunt

Find the treasures in MATLAB Central and discover how the community can help you!

Start Hunting!