Code covered by the BSD License  

Highlights from
GPUCONV2

GPUCONV2

by

 

Example, Matlab R2010B Cuda CONV2 on GPU using Cuda kernels

gpuconv2(A,B,SHAPE)
function C = gpuconv2(A,B,SHAPE)
%GPUCONV2 Two dimensional convolution on the GPU using Cuda.
%
%   C = GPUCONV2(A, B) performs the 2-D convolution of matrices A and B.
%   If [ma,na] = size(A), [mb,nb] = size(B), and [mc,nc] = size(C), then
%   mc = max([ma+mb-1,ma,mb]) and nc = max([na+nb-1,na,nb]).
%
%   C = GPUCONV2(A, B, SHAPE) returns a subsection of the 2-D
%   convolution with size specified by SHAPE:
%     'full'  - (default) returns the full 2-D convolution,
%     'same'  - returns the central part of the convolution
%               that is the same size as A.
%     'valid' - returns only those parts of the convolution
%               that are computed without the zero-padded edges.
%               size(C) = max([ma-max(0,mb-1),na-max(0,nb-1)],0).
%
%   Note!
%   This function depends on two Cuda kernel files
%   conv2_float.cu and conv2_double.cu which needs to be compiled
%   to .ptx files using the nvcc compiler. 
%
%   See also CONV2, NVCC
%
%   Example,
%   % Load an image
%   I=im2double(imread('moon.tif'));
%   % Create a Gaussian filtering kernel
%   H = fspecial('gaussian',[20 20],3);
%   % Perform the convolution on the CPU
%   J = conv2(I,H);
%   % Perform the convolution on the GPU
%   Jcuda = gpuconv2(I,H);
%   % Show the results
%   figure,
%   imshow(J,[]); title('CPU filtering');
%   imshow(Jcuda,[]); title('GPU filtering');
%   imshow(Jcuda-J,[]); title('Difference');
%
%  Function is written by D.Kroon University of Twente (December 2010)

% Compile the kernel
if(~exist('conv2_float.ptx','file'))
    if(exist('nvcc.m','file'))
        nvcc -ptx conv2_float.cu
    else
        error('gpuconv2:files','Compile the kernel, nvcc -ptx conv2_float.cu ');
    end
end
if(~exist('conv2_double.ptx','file'))
    if(exist('nvcc.m','file'))
        nvcc -ptx conv2_double.cu
    else
        error('gpuconv2:files','Compile the kernel, nvcc -ptx conv2_double.cu ');
    end
end

% Create the Cuda kernel (.cu file is needed to know all the inputs/outputs)
if(isa(A,'single'))
    Conv2Kernel = parallel.gpu.CUDAKernel('conv2_float.ptx', 'conv2_float.cu');
else
    Conv2Kernel = parallel.gpu.CUDAKernel('conv2_double.ptx', 'conv2_double.cu');
end

% Matrices A,B to GPU
if(isa(A,'single'))
    A_Cuda=gpuArray(single(A));
    B_Cuda=gpuArray(single(B));
else
    A_Cuda=gpuArray(double(A));
    B_Cuda=gpuArray(double(B));
end

if(any((size(A)-size(B))<0))
    error('gpuconv2:inputs','Size Matrix A must be larger then Size Matrix B');
end

% Size of Matrices to CUDA
Asize_Cuda=gpuArray(int32(size(A)));
Bsize_Cuda=gpuArray(int32(size(B)));

% Output size C
if(nargin<3), SHAPE='full'; end
switch(lower(SHAPE))
    case 'full'
        Csize=size(A)+size(B)-1;
    case 'same'
        Csize=size(A);
    case 'valid'
        Csize=size(A)-size(B)+1;
    otherwise
        error('gpuconv2:inputs','Unknow Shape');
end
% Size to GPU memory
Csize_Cuda=gpuArray(int32(Csize));

% Calculate max Block size
s = floor(sqrt(Conv2Kernel.MaxThreadsPerBlock));

% Create Blocks of s x s pixels
Conv2Kernel.ThreadBlockSize=[s s 1];

% Make a Grid to proces the whole output image in blocks
Conv2Kernel.GridSize=ceil(Csize/s);

% Initialize memory for the output image
if(isa(A,'single'))
    C_Cuda = parallel.gpu.GPUArray.zeros(Csize(1),Csize(2),'single');
else
    C_Cuda = parallel.gpu.GPUArray.zeros(Csize(1),Csize(2),'double');
end

% Perform the Convolution on the GPU
C_Cuda = feval(Conv2Kernel,C_Cuda, Csize_Cuda, A_Cuda, Asize_Cuda,B_Cuda, Bsize_Cuda);

% Get the data back to the main memory
C = gather(C_Cuda);

Contact us