Help with my mex function output from cudamemcpy2D

Question

Aparna Singh on 19 Apr 2020

0
Link

Direct link to this question

https://www.mathworks.com/matlabcentral/answers/519001-help-with-my-mex-function-output-from-cudamemcpy2d

Commented: Joss Knight on 25 Apr 2020

Hi I am writing a very basic CUDA code where I am sending an input via matlab, copying it to gpu and then copying it back to the host and calling that output via mex file. But I am getting all 0s. I am still getting to know CUDA but would anyone know why this would happen? Here is the code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <algorithm>
#include <math.h>
#include "mex.h"
#include <vector>
#include <cuda.h>
using namespace std;
//#define CCE checkCudaErrors
#define HtoD cudaMemcpyHostToDevice
#define DtoH cudaMemcpyDeviceToHost
void matData(mxArray **A, short *h_raw, int nrows, int ncols, int nframes, short *tof,  short *distance_array, int pix_x, int pix_y, int elements) {
	int gpuID;
	getFirstAvailableGPU(&gpuID);
	short *new_data = h_raw;
	cudaSetDevice(gpuID);
	short* d_ptr;// = 0;
	size_t pitchInBytes;// = sizeof(short)*nrows;
	size_t h_pitchInBytes = sizeof(short) * nrows;
	cudaMallocPitch((void **)&d_ptr, &pitchInBytes, nrows * sizeof(short),ncols*nframes);
	cudaMemcpy2D(d_ptr, pitchInBytes, new_data, h_pitchInBytes,sizeof(short) * nrows, ncols*nframes, HtoD);
	int dim_x = nrows;
	int dim_y = ncols;
	int dim_z = nframes;
	int total_val = dim_x * dim_y*dim_z;
	int nFrames = nframes;
	
	double *img_PA{ new double[pix_x*pix_y]{} };// initializing it to 0
	
	dim3 B(256, 1, 1);
	dim3 G((nrows - 1) / B.x + 1, (ncols - 1) / B.y + 1,
		(nframes - 1) / B.z + 1);
	int ndim2 = 2;
	const mwSize dims_n4[] = {nrows,ncols };
	*A = mxCreateNumericArray(ndim2, dims_n4, mxSINGLE_CLASS, mxREAL);
	short* res = (short *)mxGetData(*A);
	size_t pitchInBytes_PAM = sizeof(short)*nrows;
	size_t pitchInByes_PAM_h = sizeof(short) * nrows;
	
	
	
	cudaMemcpy2D(res, pitchInBytes_PAM, d_ptr, pitchInByes_PAM_h, sizeof(short) * nrows, ncols * nframes, DtoH);
	
	
	return;
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
	if (nrhs == 0 || !mxIsInt16(prhs[0])) {
		mexErrMsgTxt("Input must be int16");
	}
	const mwSize *dims2;
	size_t numberOfElements;
	short *h_raw = (short *)mxGetData(prhs[0]);
	short *tof = (short *)mxGetData(prhs[1]);
	short *shifted_data = (short *)mxGetData(prhs[2]);
	short *distance_array = (short *)mxGetData(prhs[3]);
	const mwSize *dimensions_rawdat = mxGetDimensions(prhs[0]);
	int nrows = dimensions_rawdat[0];
	int ncols = dimensions_rawdat[1];
	int nframes = dimensions_rawdat[2];
	if (mxGetNumberOfElements(prhs[0]) < nrows * ncols) {
		mexErrMsgTxt("Not enough elements");
	}
	
	const mwSize *dimensions = mxGetDimensions(prhs[1]);
	int pix_x = dimensions[0];
	int pix_y = dimensions[1];
	int elements = dimensions[2];
	
	int ndim1 = 1;
	int ndim = 3;
	int ndim2 = 2;
	int ndim3 = 3;
	const mwSize dims[] = { nrows, ncols, nframes };
	const mwSize dims_n[] = { nrows, ncols };
	const mwSize dims_n3[] = { ncols, nrows };
	const mwSize dims_n2[] = { pix_x,pix_y,elements };
	const mwSize dims_n4[] = { pix_x,pix_y };
	const mwSize dims_sum[] = { nrows };
	const mwSize dims_sume[] = { elements,elements };
	const mwSize dims_sum_sum[] = { 1 };
	matData(&plhs[0], h_raw, nrows, ncols, nframes, tof,  distance_array, pix_x, pix_y, elements);
	
	return;
}

0 Comments
Show -2 older commentsHide -2 older comments

Sign in to comment.

Sign in to answer this question.

Answer 1

James Tursa on 19 Apr 2020

0
Link

Direct link to this answer

https://www.mathworks.com/matlabcentral/answers/519001-help-with-my-mex-function-output-from-cudamemcpy2d#answer_426973

Open in MATLAB Online

Why is this created as a single class, and then a short pointer is used to access it?

	*A = mxCreateNumericArray(ndim2, dims_n4, mxSINGLE_CLASS, mxREAL);
	short* res = (short *)mxGetData(*A);

That's a mismatch of data type and pointer type.

3 Comments
Show 1 older commentHide 1 older comment

Aparna Singh on 19 Apr 2020

Edited: Aparna Singh on 20 Apr 2020

Open in MATLAB Online

Thank you guys! I changed around my code and here is my new code. Basically I am taking Ultrasound signal from Matlab and assigning it to an array in C++. I am then calling that array in MATLAB with imagesc to see if the output signal is exactly as the input signal. I am still getting all zeros. Just for reference the signal is 1024x128. My hunch is that I may not be calling the right grid and block dimension. I am pasting a much more concise code. i am still confused as to where I am going wrong.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <algorithm>
#include <math.h>
#include "mex.h"
#include <vector>
#include <cuda.h>
using namespace std;
//#define CCE checkCudaErrors
#define HtoD cudaMemcpyHostToDevice
#define DtoH cudaMemcpyDeviceToHost
#include "mxGPUArray.h"
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]);
__global__ void copyArray(short* result, short* h_raw, int nrows, int ncols) {
	int row = blockIdx.x * blockDim.x + threadIdx.x;
	int col = blockIdx.y * blockDim.y + threadIdx.y;// also number of elements
	//int total_idx = nrows * ncols;
	if (row < nrows && col < ncols) {
		result[row + (col*nrows)] = h_raw[row + (col*nrows)];
	}
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
	int mxInitGPU();
	if (nrhs == 0 || !mxIsInt16(prhs[0])) {
		mexErrMsgTxt("Input must be int16");
	}
	const mwSize *dims2;
	size_t numberOfElements;
	short *h_raw = (short *)mxGetData(prhs[0]);
	short *tof = (short *)mxGetData(prhs[1]);
	//short *shifted_data = (short *)mxGetData(prhs[2]);
	short *distance_array = (short *)mxGetData(prhs[2]);
	const mwSize *dimensions_rawdat = mxGetDimensions(prhs[0]);
	int nrows = dimensions_rawdat[0];
	int ncols = dimensions_rawdat[1];
	int nframes = 1;// dimensions_rawdat[2];
	if (mxGetNumberOfElements(prhs[0]) < nrows * ncols) {
		mexErrMsgTxt("Not enough elements");
	}
	//mxArray *array_ptr;
	//array_ptr = mexGetVariable("global","tof" );
	const mwSize *dimensions = mxGetDimensions(prhs[1]);
	int pix_x = dimensions[0];
	int pix_y = dimensions[1];
	int elements = dimensions[2];
	//int t_elements = nelems/ (pix_x*pix_y);
	//numberOfElements = mxGetNumberOfElements(array_ptr);
	//mexPrintf("%d \n", time_index);
	//int dim = mxGetNumberOfDimensions(array_ptr);
	//const mwSize *mxGetDimensions(const mxArray *);
	//
	int ndim1 = 1;
	int ndim = 3;
	int ndim2 = 2;
	int ndim3 = 3;
	const mwSize dims[] = { nrows, ncols, nframes };
	const mwSize dims_n[] = { nrows, ncols };
	const mwSize dims_n3[] = { ncols, nrows };
	const mwSize dims_n2[] = { pix_x,pix_y,elements };
	const mwSize dims_n4[] = { pix_x,pix_y };
	const mwSize dims_sum[] = { nrows };
	const mwSize dims_sume[] = { elements,elements };
	const mwSize dims_sum_sum[] = { 1 };
	short *dev_h_raw = 0;
	short *dev_result = 0;
	size_t pitchInBytes = sizeof(short) * nrows;
	plhs[0] = mxCreateNumericArray(ndim2, dims_n, mxINT16_CLASS, mxREAL);
	short* res = (short *)mxGetData(plhs[0]);
	cudaSetDevice(0);
	
	cudaMallocPitch((void **)&dev_result, &pitchInBytes, nrows * sizeof(short), ncols);
	cudaMallocPitch((void **)&dev_h_raw, &pitchInBytes, nrows * sizeof(short), ncols);
	
	cudaMemcpy2D(dev_h_raw, pitchInBytes, h_raw, pitchInBytes, sizeof(short) * nrows, ncols, cudaMemcpyHostToDevice);
	
	dim3 threadsPerBlock(8, 8);  // 64 threads
	dim3 numBlocks(nrows / threadsPerBlock.x, ncols / threadsPerBlock.y);
	copyArray << < numBlocks, threadsPerBlock >> > (dev_result, dev_h_raw, nrows, ncols);// size = nrows
	cudaMemcpy2D(res, pitchInBytes, dev_result, pitchInBytes, nrows * sizeof(short),  ncols, cudaMemcpyDeviceToHost);
	//copyArr(res, h_raw,nrows,ncols,nframes,nrows);
	cudaDeviceReset();
	
	//plhs[0] = res;
	//matData(res, h_raw, nrows, ncols, nframes, tof,  distance_array, pix_x, pix_y, elements);
	//mexPrintf("%d \n", );
	return;
}

Joss Knight on 25 Apr 2020

You have specified pitchInBytes for both host and device pitches, but the pitch for the host arrays remains nrows*sizeof(short), whereas for the device arrays it has been modified by cudaMallocPitch.

You then proceed to ignore the pitch of the data in your kernel and so are copying data from and to the wrong addresses.

Sign in to comment.

Help with my mex function output from cudamemcpy2D

0 Comments
Show -2 older commentsHide -2 older comments

Accepted Answer

3 Comments
Show 1 older commentHide 1 older comment

More Answers (0)

See Also

Categories

Tags

Community Treasure Hunt

Help with my mex function output from cudamemcpy2D

0 Comments Show -2 older commentsHide -2 older comments

Accepted Answer

3 Comments Show 1 older commentHide 1 older comment

More Answers (0)

See Also

Categories

Tags

Community Treasure Hunt

0 Comments
Show -2 older commentsHide -2 older comments

3 Comments
Show 1 older commentHide 1 older comment