Help with my mex function output from cudamemcpy2D

20 views (last 30 days)
Hi I am writing a very basic CUDA code where I am sending an input via matlab, copying it to gpu and then copying it back to the host and calling that output via mex file. But I am getting all 0s. I am still getting to know CUDA but would anyone know why this would happen? Here is the code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <algorithm>
#include <math.h>
#include "mex.h"
#include <vector>
#include <cuda.h>
using namespace std;
//#define CCE checkCudaErrors
#define HtoD cudaMemcpyHostToDevice
#define DtoH cudaMemcpyDeviceToHost
void matData(mxArray **A, short *h_raw, int nrows, int ncols, int nframes, short *tof, short *distance_array, int pix_x, int pix_y, int elements) {
int gpuID;
getFirstAvailableGPU(&gpuID);
short *new_data = h_raw;
cudaSetDevice(gpuID);
short* d_ptr;// = 0;
size_t pitchInBytes;// = sizeof(short)*nrows;
size_t h_pitchInBytes = sizeof(short) * nrows;
cudaMallocPitch((void **)&d_ptr, &pitchInBytes, nrows * sizeof(short),ncols*nframes);
cudaMemcpy2D(d_ptr, pitchInBytes, new_data, h_pitchInBytes,sizeof(short) * nrows, ncols*nframes, HtoD);
int dim_x = nrows;
int dim_y = ncols;
int dim_z = nframes;
int total_val = dim_x * dim_y*dim_z;
int nFrames = nframes;
double *img_PA{ new double[pix_x*pix_y]{} };// initializing it to 0
dim3 B(256, 1, 1);
dim3 G((nrows - 1) / B.x + 1, (ncols - 1) / B.y + 1,
(nframes - 1) / B.z + 1);
int ndim2 = 2;
const mwSize dims_n4[] = {nrows,ncols };
*A = mxCreateNumericArray(ndim2, dims_n4, mxSINGLE_CLASS, mxREAL);
short* res = (short *)mxGetData(*A);
size_t pitchInBytes_PAM = sizeof(short)*nrows;
size_t pitchInByes_PAM_h = sizeof(short) * nrows;
cudaMemcpy2D(res, pitchInBytes_PAM, d_ptr, pitchInByes_PAM_h, sizeof(short) * nrows, ncols * nframes, DtoH);
return;
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
if (nrhs == 0 || !mxIsInt16(prhs[0])) {
mexErrMsgTxt("Input must be int16");
}
const mwSize *dims2;
size_t numberOfElements;
short *h_raw = (short *)mxGetData(prhs[0]);
short *tof = (short *)mxGetData(prhs[1]);
short *shifted_data = (short *)mxGetData(prhs[2]);
short *distance_array = (short *)mxGetData(prhs[3]);
const mwSize *dimensions_rawdat = mxGetDimensions(prhs[0]);
int nrows = dimensions_rawdat[0];
int ncols = dimensions_rawdat[1];
int nframes = dimensions_rawdat[2];
if (mxGetNumberOfElements(prhs[0]) < nrows * ncols) {
mexErrMsgTxt("Not enough elements");
}
const mwSize *dimensions = mxGetDimensions(prhs[1]);
int pix_x = dimensions[0];
int pix_y = dimensions[1];
int elements = dimensions[2];
int ndim1 = 1;
int ndim = 3;
int ndim2 = 2;
int ndim3 = 3;
const mwSize dims[] = { nrows, ncols, nframes };
const mwSize dims_n[] = { nrows, ncols };
const mwSize dims_n3[] = { ncols, nrows };
const mwSize dims_n2[] = { pix_x,pix_y,elements };
const mwSize dims_n4[] = { pix_x,pix_y };
const mwSize dims_sum[] = { nrows };
const mwSize dims_sume[] = { elements,elements };
const mwSize dims_sum_sum[] = { 1 };
matData(&plhs[0], h_raw, nrows, ncols, nframes, tof, distance_array, pix_x, pix_y, elements);
return;
}

Accepted Answer

James Tursa
James Tursa on 19 Apr 2020
Why is this created as a single class, and then a short pointer is used to access it?
*A = mxCreateNumericArray(ndim2, dims_n4, mxSINGLE_CLASS, mxREAL);
short* res = (short *)mxGetData(*A);
That's a mismatch of data type and pointer type.
  3 Comments
Aparna Singh
Aparna Singh on 19 Apr 2020
Edited: Aparna Singh on 20 Apr 2020
Thank you guys! I changed around my code and here is my new code. Basically I am taking Ultrasound signal from Matlab and assigning it to an array in C++. I am then calling that array in MATLAB with imagesc to see if the output signal is exactly as the input signal. I am still getting all zeros. Just for reference the signal is 1024x128. My hunch is that I may not be calling the right grid and block dimension. I am pasting a much more concise code. i am still confused as to where I am going wrong.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <algorithm>
#include <math.h>
#include "mex.h"
#include <vector>
#include <cuda.h>
using namespace std;
//#define CCE checkCudaErrors
#define HtoD cudaMemcpyHostToDevice
#define DtoH cudaMemcpyDeviceToHost
#include "mxGPUArray.h"
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]);
__global__ void copyArray(short* result, short* h_raw, int nrows, int ncols) {
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;// also number of elements
//int total_idx = nrows * ncols;
if (row < nrows && col < ncols) {
result[row + (col*nrows)] = h_raw[row + (col*nrows)];
}
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
int mxInitGPU();
if (nrhs == 0 || !mxIsInt16(prhs[0])) {
mexErrMsgTxt("Input must be int16");
}
const mwSize *dims2;
size_t numberOfElements;
short *h_raw = (short *)mxGetData(prhs[0]);
short *tof = (short *)mxGetData(prhs[1]);
//short *shifted_data = (short *)mxGetData(prhs[2]);
short *distance_array = (short *)mxGetData(prhs[2]);
const mwSize *dimensions_rawdat = mxGetDimensions(prhs[0]);
int nrows = dimensions_rawdat[0];
int ncols = dimensions_rawdat[1];
int nframes = 1;// dimensions_rawdat[2];
if (mxGetNumberOfElements(prhs[0]) < nrows * ncols) {
mexErrMsgTxt("Not enough elements");
}
//mxArray *array_ptr;
//array_ptr = mexGetVariable("global","tof" );
const mwSize *dimensions = mxGetDimensions(prhs[1]);
int pix_x = dimensions[0];
int pix_y = dimensions[1];
int elements = dimensions[2];
//int t_elements = nelems/ (pix_x*pix_y);
//numberOfElements = mxGetNumberOfElements(array_ptr);
//mexPrintf("%d \n", time_index);
//int dim = mxGetNumberOfDimensions(array_ptr);
//const mwSize *mxGetDimensions(const mxArray *);
//
int ndim1 = 1;
int ndim = 3;
int ndim2 = 2;
int ndim3 = 3;
const mwSize dims[] = { nrows, ncols, nframes };
const mwSize dims_n[] = { nrows, ncols };
const mwSize dims_n3[] = { ncols, nrows };
const mwSize dims_n2[] = { pix_x,pix_y,elements };
const mwSize dims_n4[] = { pix_x,pix_y };
const mwSize dims_sum[] = { nrows };
const mwSize dims_sume[] = { elements,elements };
const mwSize dims_sum_sum[] = { 1 };
short *dev_h_raw = 0;
short *dev_result = 0;
size_t pitchInBytes = sizeof(short) * nrows;
plhs[0] = mxCreateNumericArray(ndim2, dims_n, mxINT16_CLASS, mxREAL);
short* res = (short *)mxGetData(plhs[0]);
cudaSetDevice(0);
cudaMallocPitch((void **)&dev_result, &pitchInBytes, nrows * sizeof(short), ncols);
cudaMallocPitch((void **)&dev_h_raw, &pitchInBytes, nrows * sizeof(short), ncols);
cudaMemcpy2D(dev_h_raw, pitchInBytes, h_raw, pitchInBytes, sizeof(short) * nrows, ncols, cudaMemcpyHostToDevice);
dim3 threadsPerBlock(8, 8); // 64 threads
dim3 numBlocks(nrows / threadsPerBlock.x, ncols / threadsPerBlock.y);
copyArray << < numBlocks, threadsPerBlock >> > (dev_result, dev_h_raw, nrows, ncols);// size = nrows
cudaMemcpy2D(res, pitchInBytes, dev_result, pitchInBytes, nrows * sizeof(short), ncols, cudaMemcpyDeviceToHost);
//copyArr(res, h_raw,nrows,ncols,nframes,nrows);
cudaDeviceReset();
//plhs[0] = res;
//matData(res, h_raw, nrows, ncols, nframes, tof, distance_array, pix_x, pix_y, elements);
//mexPrintf("%d \n", );
return;
}
Joss Knight
Joss Knight on 25 Apr 2020
You have specified pitchInBytes for both host and device pitches, but the pitch for the host arrays remains nrows*sizeof(short), whereas for the device arrays it has been modified by cudaMallocPitch.
You then proceed to ignore the pitch of the data in your kernel and so are copying data from and to the wrong addresses.

Sign in to comment.

More Answers (0)

Categories

Find more on Standalone Applications in Help Center and File Exchange

Community Treasure Hunt

Find the treasures in MATLAB Central and discover how the community can help you!

Start Hunting!