|
hi
I've just written my first CUDA [2.3] script, but after putting it in one .m file together with the example FFT [CUDA for MATLAB 1.1] I discovered that they can't work at the same time -> the second script cannot connect to the device [although it sees it]. I also ran my function twice in one otherwise empty .m script and got the same message, although the calculations were done correctly.
#include "mex.h"
#include "cuda.h"
#include "stdio.h"
#include "cuda_runtime.h"
/* Intended operation: element-wise product (.*) of two matrices, multiplied by one scalar factor.
* MATLAB arguments: (alpha, transN, N, transM, M)
*/
// nlhs: number of outputs
// nrhs: number of inputs
// prhs: inputs
/*
 * Element-wise product kernel: nm[k] = n[k] * m[k] for every k in [0, nx*ny).
 *
 * Parameters:
 *   n, m : input buffers, each holding at least nx*ny floats
 *   nm   : output buffer, at least nx*ny floats
 *   nx   : extent of the slow index j (covered by the y launch dimension)
 *   ny   : extent of the fast index i (covered by the x launch dimension);
 *          element (i, j) lives at linear offset i + j*ny
 *
 * Launch layout: any 2D grid/block configuration is valid. Each thread walks
 * the index space with grid-stride loops in both dimensions, so the result is
 * complete even when the grid does not span ny in x or nx in y.
 * No shared memory is used.
 */
__global__ void GPU2D(float *n, float *m, float *nm, int nx, int ny)
{
    // Total number of threads launched along each dimension.
    const int strideI = gridDim.x * blockDim.x;
    const int strideJ = gridDim.y * blockDim.y;

    for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < nx; j += strideJ)
    {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < ny; i += strideI)
        {
            // Widen before multiplying so that j*ny cannot overflow int.
            const size_t idx = (size_t)i + (size_t)j * (size_t)ny;
            nm[idx] = n[idx] * m[idx];
        }
    }
}
// Divides a by b and adds one to the quotient whenever the remainder is
// non-zero (for positive operands this is the rounded-up quotient, e.g. the
// number of blocks of size b needed to cover a elements).
int iDivUp(int a, int b)
{
    int quotient = a / b;
    if (a % b != 0)
    {
        quotient += 1;
    }
    return quotient;
}
/* Frees the three device buffers used by mexFunction (pointers may still be NULL). */
static void releaseDeviceBuffers(float *gn, float *gm, float *gnm)
{
    cudaFree(gn);
    cudaFree(gm);
    cudaFree(gnm);
}

/*
 * Does nothing when status is cudaSuccess. Otherwise frees the device buffers,
 * prints which step failed and raises a MATLAB error carrying the CUDA error text.
 */
static void checkCuda(cudaError_t status, const char *what,
                      float *gn, float *gm, float *gnm)
{
    if (status == cudaSuccess)
        return;
    /* mexErrMsgTxt does not return here, so the buffers must be released first. */
    releaseDeviceBuffers(gn, gm, gnm);
    mexPrintf("CUDA call failed: %s\n", what);
    mexErrMsgTxt(cudaGetErrorString(status));
}

/*
 * MEX entry point: element-wise product of two single-precision matrices on the GPU.
 *
 * MATLAB call: out = f(alpha, transN, N, transM, M)
 *   prhs[0] alpha  : scalar multiplier
 *   prhs[1] transN : flag; selects the shape of the output (see below)
 *   prhs[2] N      : single-precision matrix
 *   prhs[3] transM : flag
 *   prhs[4] M      : single-precision matrix with as many elements as N
 *   plhs[0]        : single-precision result, out(k) = N(k) * M(k) in storage order
 *
 * Output shape: size(N) when transN == 0, otherwise [columns(N), rows(N)].
 * The data itself is never reordered; only the shape changes.
 *
 * NOTE(review): alpha and transM are accepted but have no effect on the result.
 * The GPU2D kernel has no scale parameter, and the product is taken element by
 * element in storage order. Confirm whether scaling/transposition still has to
 * be implemented.
 *
 * Raises a MATLAB error on a wrong argument count, non-single inputs,
 * mismatched element counts, or any failing CUDA call.
 */
void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    int NX, NY;                  /* rows / columns of N as stored by MATLAB */
    int NNX, NNY;                /* kernel extents: NNY = fast index, NNX = slow index */
    int tn;                      /* transN flag */
    size_t count, bytes;         /* element count and byte size of each buffer */
    const float *n, *m;          /* host input data, owned by MATLAB */
    float *nm;                   /* host output data, owned by MATLAB */
    float *gn = NULL, *gm = NULL, *gnm = NULL;   /* device buffers */
    int device = 0;
    cudaDeviceProp deviceProp;

    /* ---- argument validation ---- */
    if (nrhs != 5)
        mexErrMsgTxt("a*m*.n requires 5 args");
    if (nlhs != 1)
        mexErrMsgTxt("a*m*.n requires 1 out");
    if( !mxIsSingle(prhs[2]) || !mxIsSingle(prhs[4]) )
        mexErrMsgTxt("inputs not in single precision mode");
    /* Both inputs are copied with the same byte count, so they must match in size. */
    if (mxGetNumberOfElements(prhs[2]) != mxGetNumberOfElements(prhs[4]))
        mexErrMsgTxt("N and M must have the same number of elements");

    tn = (int) mxGetScalar(prhs[1]);
    NX = (int) mxGetM(prhs[2]);  /* number of rows in N */
    NY = (int) mxGetN(prhs[2]);  /* number of columns in N */
    if (tn != 0)
    {
        NNX = NX;
        NNY = NY;
    }
    else
    {
        NNX = NY;
        NNY = NX;
    }

    n = (const float*) mxGetData(prhs[2]);
    m = (const float*) mxGetData(prhs[4]);

    /* Output array; MATLAB owns (and zero-initialises) this memory. */
    plhs[0] = mxCreateNumericMatrix(NNY, NNX, mxSINGLE_CLASS, mxREAL);
    nm = (float*) mxGetData(plhs[0]);

    /* Compute sizes in size_t so large matrices do not overflow int. */
    count = (size_t) NNX * (size_t) NNY;
    bytes = count * sizeof(float);
    if (count == 0)
        return;                  /* nothing to compute; avoids a zero-sized grid launch */

    /* ---- device information ---- */
    checkCuda(cudaGetDevice(&device), "cudaGetDevice", gn, gm, gnm);
    mexPrintf("device %d \n", device);
    checkCuda(cudaGetDeviceProperties(&deviceProp, device),
              "cudaGetDeviceProperties", gn, gm, gnm);
    mexPrintf("Using device %d: %s \n", device, deviceProp.name);

    /* ---- device allocation and upload ---- */
    checkCuda(cudaMalloc((void**) &gn, bytes), "cudaMalloc(gn)", gn, gm, gnm);
    checkCuda(cudaMalloc((void**) &gm, bytes), "cudaMalloc(gm)", gn, gm, gnm);
    checkCuda(cudaMalloc((void**) &gnm, bytes), "cudaMalloc(gnm)", gn, gm, gnm);
    checkCuda(cudaMemcpy(gn, n, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(N -> device)", gn, gm, gnm);
    checkCuda(cudaMemcpy(gm, m, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(M -> device)", gn, gm, gnm);
    /* The output buffer is fully overwritten by the kernel, so it is not uploaded. */

    /* ---- launch ----
     * The kernel bounds its x index by ny (= NNY) and its y index by nx (= NNX),
     * so the grid must span NNY in x and NNX in y. */
    dim3 dimBlock(8, 8);
    dim3 dimGrid(iDivUp(NNY, (int) dimBlock.x), iDivUp(NNX, (int) dimBlock.y));
    GPU2D<<<dimGrid, dimBlock>>>(gn, gm, gnm, NNX, NNY);
    checkCuda(cudaGetLastError(), "GPU2D launch", gn, gm, gnm);

    /* Blocking copy: waits for the kernel and reports any execution error. */
    checkCuda(cudaMemcpy(nm, gnm, bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(result -> host)", gn, gm, gnm);

    /* n and m belong to MATLAB and must NOT be free()d; only device memory is released. */
    releaseDeviceBuffers(gn, gm, gnm);
}
----------------------------------------------------------------------
I freed all the memory I allocated and also tried without setting the device [I have only one card, so I suppose it's unnecessary], but it's still no go. Or maybe I can't use two different CUDA APIs in one thread?
keneth
|