function C = gpuconv2(A,B,SHAPE)
%GPUCONV2 Two dimensional convolution on the GPU using Cuda.
%
% C = GPUCONV2(A, B) performs the 2-D convolution of matrices A and B.
% If [ma,na] = size(A), [mb,nb] = size(B), and [mc,nc] = size(C), then
% mc = max([ma+mb-1,ma,mb]) and nc = max([na+nb-1,na,nb]).
%
% C = GPUCONV2(A, B, SHAPE) returns a subsection of the 2-D
% convolution with size specified by SHAPE:
% 'full' - (default) returns the full 2-D convolution,
% 'same' - returns the central part of the convolution
% that is the same size as A.
% 'valid' - returns only those parts of the convolution
% that are computed without the zero-padded edges.
% size(C) = max([ma-max(0,mb-1),na-max(0,nb-1)],0).
%
% Note!
% This function depends on two Cuda kernel files
% conv2_float.cu and conv2_double.cu which needs to be compiled
% to .ptx files using the nvcc compiler.
%
% See also CONV2, NVCC
%
% Example,
% % Load an image
% I=im2double(imread('moon.tif'));
% % Create a Gaussian filtering kernel
% H = fspecial('gaussian',[20 20],3);
% % Perform the convolution on the CPU
% J = conv2(I,H);
% % Perform the convolution on the GPU
% Jcuda = gpuconv2(I,H);
% % Show the results
% figure,
% imshow(J,[]); title('CPU filtering');
% imshow(Jcuda,[]); title('GPU filtering');
% imshow(Jcuda-J,[]); title('Difference');
%
% Function is written by D.Kroon University of Twente (December 2010)
% Compile the kernel
if(~exist('conv2_float.ptx','file'))
if(exist('nvcc.m','file'))
nvcc -ptx conv2_float.cu
else
error('gpuconv2:files','Compile the kernel, nvcc -ptx conv2_float.cu ');
end
end
if(~exist('conv2_double.ptx','file'))
if(exist('nvcc.m','file'))
nvcc -ptx conv2_double.cu
else
error('gpuconv2:files','Compile the kernel, nvcc -ptx conv2_double.cu ');
end
end
% Create the Cuda kernel (.cu file is needed to know all the inputs/outputs)
if(isa(A,'single'))
Conv2Kernel = parallel.gpu.CUDAKernel('conv2_float.ptx', 'conv2_float.cu');
else
Conv2Kernel = parallel.gpu.CUDAKernel('conv2_double.ptx', 'conv2_double.cu');
end
% Matrices A,B to GPU
if(isa(A,'single'))
A_Cuda=gpuArray(single(A));
B_Cuda=gpuArray(single(B));
else
A_Cuda=gpuArray(double(A));
B_Cuda=gpuArray(double(B));
end
if(any((size(A)-size(B))<0))
error('gpuconv2:inputs','Size Matrix A must be larger then Size Matrix B');
end
% Size of Matrices to CUDA
Asize_Cuda=gpuArray(int32(size(A)));
Bsize_Cuda=gpuArray(int32(size(B)));
% Output size C
if(nargin<3), SHAPE='full'; end
switch(lower(SHAPE))
case 'full'
Csize=size(A)+size(B)-1;
case 'same'
Csize=size(A);
case 'valid'
Csize=size(A)-size(B)+1;
otherwise
error('gpuconv2:inputs','Unknow Shape');
end
% Size to GPU memory
Csize_Cuda=gpuArray(int32(Csize));
% Calculate max Block size
s = floor(sqrt(Conv2Kernel.MaxThreadsPerBlock));
% Create Blocks of s x s pixels
Conv2Kernel.ThreadBlockSize=[s s 1];
% Make a Grid to proces the whole output image in blocks
Conv2Kernel.GridSize=ceil(Csize/s);
% Initialize memory for the output image
if(isa(A,'single'))
C_Cuda = parallel.gpu.GPUArray.zeros(Csize(1),Csize(2),'single');
else
C_Cuda = parallel.gpu.GPUArray.zeros(Csize(1),Csize(2),'double');
end
% Perform the Convolution on the GPU
C_Cuda = feval(Conv2Kernel,C_Cuda, Csize_Cuda, A_Cuda, Asize_Cuda,B_Cuda, Bsize_Cuda);
% Get the data back to the main memory
C = gather(C_Cuda);