Code covered by the BSD License

# GPU vs CPU speed test of finite difference equation

22 Jul 2013 (Updated )

Compares the speed of the parallel computing toolbox functions vs CPU for finite difference

test_gpu_array3.m
```% script to test the speed of finite difference calculations on gpu vs cpu
% Mark Ward, University of Birmingham, r.m.ward@bham.ac.uk

LoopVals=500*(1:2); % how many times we want to loop the calculations for timing
NVals=1000*(1:4); % what array edge sizes we want to try
myPerf=zeros(length(LoopVals)+1,length(NVals)+1); % somewhere to store a table of performance numbers
myPerf(2:end,1)=LoopVals; % record the loop sizes
myPerf(1,2:end)=NVals; % and array sizes

%======================
iGPU=true % true to run on GPU, false for CPU
%======================
for iLoopVals=1:length(LoopVals)
for iNVals=1:length(NVals)
N=NVals(iNVals);

% set up arrays for the finite difference calculation
if iGPU,
clear g1 iicentre
pause(0.5)
g=gpuDevice(1);
reset(g);
pause(0.5)
g1=gpuArray.zeros(N,N); %,'single');
iicentre=2:N-1;
iicentre=gpuArray(iicentre); %(int32(iicentre)) isn't noticeably faster on average
iiup=iicentre+1;
iidown=iicentre-1;
else
g1=zeros(N,N);
iicentre=2:N-1; % indices to point to the array away from the edges
iiup=iicentre+1; % and the edges
iidown=iicentre-1;
end

ii=round(0.4*N):round(0.6*N);g1(ii,ii)=1; % make the array hotter in the centre

iLoop=LoopVals(iLoopVals);
fprintf('About to run %g loops of %g^2 2D finite difference calculation\n',iLoop,N)
tic % start timing
for ii=1:iLoop, % simplified finite difference. Should also have temperature-dependent material properties.
if iGPU % run gpu code using arrayfun as it's faster than overloading the normal syntax
g1(iicentre,iicentre)=arrayfun(@heat_eqn_fdiff,g1(iicentre,iicentre), ...
g1(iiup,iicentre),g1(iidown,iicentre), ...
g1(iicentre,iiup),g1(iicentre,iidown));
else % cpu is quicker using matrix operations, not arrayfun
g1(iicentre,iicentre)= 0.2*g1(iicentre,iicentre)+0.2* ...
(g1(iiup,iicentre)+g1(iidown,iicentre)+g1(iicentre,iiup)+g1(iicentre,iidown));
end
end
if iGPU,
g_res=gather(g1); % bring the results back from the gpu
else
g_res=g1;
end
t=toc; % stop timing after gathering the results back to main memory
myPerf(iLoopVals+1,iNVals+1)=N^2*iLoop/t/1e6 % speed in millions of fd operations per second
imagesc(g_res,[0 1]);axis image;drawnow;pause(0.1) % show the "temperature" plot
end
end

clear g1
g=gpuDevice(1);
reset(g);

% GTX580, arrayfun, indices as gpu arrays, single precision
% myPerf =
%             0         1000         2000         3000         4000
%           500        606.6       1332.2       1331.2       1222.8
%          1000       1000.6       1358.1       1342.7       1228.7

% all double precision below here:

% GTX580, arrayfun, indices as gpu arrays
% myPerf =
%             0         1000         2000         3000         4000
%           500       775.89       829.07       783.14       685.37
%          1000       813.41       831.59       803.56       688.19

% GTX580, arrayfun, indices on the cpu
% myPerf =
%             0         1000         2000         3000         4000
%           500       707.77       731.84       716.49       623.03
%          1000       733.92       741.25       720.24       596.46

% GTX 580, normal matlab syntax operating on gpu arrays
% myPerf =
%             0         1000         2000         3000         4000
%           500       551.02       568.05       537.08       500.59
%          1000       564.13       571.14       538.36       502.32

%cpu i5-2500K @ 4.2 GHz
% myPerf =
%             0         1000         2000         3000         4000
%           500       37.401       38.362            0            0
%          1000            0            0            0            0

```