Code covered by the BSD License  

Highlights from
GPU vs CPU speed test of finite difference equation

GPU vs CPU speed test of finite difference equation

by

 

22 Jul 2013 (Updated )

Compares the speed of the parallel computing toolbox functions vs CPU for finite difference

test_gpu_array3.m
% script to test the speed of finite difference calculations on gpu vs cpu
% Mark Ward, University of Birmingham, r.m.ward@bham.ac.uk

LoopVals=500*(1:2); % how many times we want to loop the calculations for timing
NVals=1000*(1:4); % what array edge sizes we want to try
myPerf=zeros(length(LoopVals)+1,length(NVals)+1); % somewhere to store a table of performance numbers
myPerf(2:end,1)=LoopVals; % record the loop sizes
myPerf(1,2:end)=NVals; % and array sizes

%======================
iGPU=true % true to run on GPU, false for CPU
%======================
for iLoopVals=1:length(LoopVals)
	for iNVals=1:length(NVals)
		N=NVals(iNVals);
		
		% set up arrays for the finite difference calculation
		if iGPU,
			clear g1 iicentre
			pause(0.5)
			g=gpuDevice(1);
			reset(g);
			pause(0.5)
			g1=gpuArray.zeros(N,N); %,'single');
			iicentre=2:N-1;
			iicentre=gpuArray(iicentre); %(int32(iicentre)) isn't noticeably faster on average
			iiup=iicentre+1;
			iidown=iicentre-1;
		else
			g1=zeros(N,N);
			iicentre=2:N-1; % indices to point to the array away from the edges
			iiup=iicentre+1; % and the edges
			iidown=iicentre-1;
		end
		
		ii=round(0.4*N):round(0.6*N);g1(ii,ii)=1; % make the array hotter in the centre
		
		iLoop=LoopVals(iLoopVals);
		fprintf('About to run %g loops of %g^2 2D finite difference calculation\n',iLoop,N)
		tic % start timing
		for ii=1:iLoop, % simplified finite difference. Should also have temperature-dependent material properties.
			if iGPU % run gpu code using arrayfun as it's faster than overloading the normal syntax
				g1(iicentre,iicentre)=arrayfun(@heat_eqn_fdiff,g1(iicentre,iicentre), ...
					g1(iiup,iicentre),g1(iidown,iicentre), ...
					g1(iicentre,iiup),g1(iicentre,iidown)); 
			else % cpu is quicker using matrix operations, not arrayfun
				g1(iicentre,iicentre)= 0.2*g1(iicentre,iicentre)+0.2* ...
 					(g1(iiup,iicentre)+g1(iidown,iicentre)+g1(iicentre,iiup)+g1(iicentre,iidown));
			end
		end
		if iGPU, 
			g_res=gather(g1); % bring the results back from the gpu
		else
			g_res=g1;
		end
		t=toc; % stop timing after gathering the results back to main memory
		myPerf(iLoopVals+1,iNVals+1)=N^2*iLoop/t/1e6 % speed in millions of fd operations per second
		imagesc(g_res,[0 1]);axis image;drawnow;pause(0.1) % show the "temperature" plot
	end
end

clear g1
g=gpuDevice(1);
reset(g);

% GTX580, arrayfun, indices as gpu arrays, single precision
% myPerf =
%             0         1000         2000         3000         4000
%           500        606.6       1332.2       1331.2       1222.8
%          1000       1000.6       1358.1       1342.7       1228.7


% all double precision below here:

% GTX580, arrayfun, indices as gpu arrays
% myPerf =
%             0         1000         2000         3000         4000
%           500       775.89       829.07       783.14       685.37
%          1000       813.41       831.59       803.56       688.19

% GTX580, arrayfun, indices on the cpu
% myPerf =
%             0         1000         2000         3000         4000
%           500       707.77       731.84       716.49       623.03
%          1000       733.92       741.25       720.24       596.46

% GTX 580, normal matlab syntax operating on gpu arrays
% myPerf =
%             0         1000         2000         3000         4000
%           500       551.02       568.05       537.08       500.59
%          1000       564.13       571.14       538.36       502.32

%cpu i5-2500K @ 4.2 GHz
% myPerf =
%             0         1000         2000         3000         4000
%           500       37.401       38.362            0            0
%          1000            0            0            0            0

Contact us