Code covered by the BSD License  

Highlights from
Multiple matrix multiplications, with array expansion enabled

image thumbnail
from Multiple matrix multiplications, with array expansion enabled by Paolo de Leva
Multiplying matrices, vectors, or scalars contained in two N-D arrays, with array expansion enabled.

timing_arraylab_engines
function timing_arraylab_engines
% TIMING_ARRAYLAB_ENGINES  Testing for speed different ARRAYLAB engines.
% 
%    Testing different methods to multi-multiply A by B, which can be used
%    as kernels of the MULTIPROD function. The tested engines are specific 
%    for three input formats:
%
%    1)  A  is  N(PQ) (no block expansion is needed)
%        B  is  N(QR)
%
%    2)  A  is  (PQ)  (this array requires block expansion)
%        B  is  N(QR)
%
%    3)  A  is  N(PQ)
%        B  is  (QR)  (this array requires block expansion)
%
%    The results obtained using two different personal computers are
%    reported in the manual of MULTIPROD 2.0 (Appendix A).

% Paolo de Leva  University of Rome, Foro Italico, Rome, Italy
% Jinhui Bai     Georgetown University, Washington, D.C.
% 2009 Gen 22

clear all

% Checking whether needed software exists
message = sysrequirements_for_testing('bsxfun','genop','bsxtimes','timeit');
if message
    disp ' ', error('timing_arraylab_engines:Missing_subfuncs', message)
end

disp ' ', 
disp '---------------------------- Experiment 1 ----------------------------'
N = 10000; P = 3; Q = 3; R = 1;  
[t(1,:), maxerror(1,:)] = timing(N,P,Q,R);

disp ' ', 
disp '---------------------------- Experiment 2 ----------------------------'
N = 1000; P = 3;  Q = 30; R = 1;  
[t(2,:), maxerror(2,:)] = timing(N,P,Q,R);

disp ' ', 
disp '---------------------------- Experiment 3 ----------------------------'
N = 1000; P = 9; Q = 10;  R = 3; 
[t(3,:), maxerror(3,:)] = timing(N,P,Q,R);

disp ' ', 
disp '---------------------------- Experiment 4 ----------------------------'
N = 100; P = 9; Q = 100;  R = 3; 
[t(4,:), maxerror(4,:)] = timing(N,P,Q,R);

disp ' ', 
disp '----------------------------   Summary   -----------------------------'
disp ' ', disp 'Execution time (ms)'
disp '     EXP 1     EXP 2     EXP 3     EXP 4'
disp (t'*1000)
disp 'Engine output error'
disp '     EXP 1     EXP 2     EXP 3     EXP 4'
fprintf('%10.2g%10.2g%10.2g%10.2g\n', maxerror')
disp 'NOTE: Engine output error should be zero or very close to EPS.'
fprintf ('      EPS ='); disp (eps)



function [t, maxerror] = timing(N,P,Q,R)

a0 = rand(1, P, Q); 
b0 = rand(1, Q, R);
a = a0(ones(1,N),:,:); % Cloning along first dimension
b = b0(ones(1,N),:,:); 
[n1 p q1] = size(a); % Reads third dim even if it is 1
[n2 q2 r] = size(b); %
disp ' '
disp        'Array  Size      Size               Number of elements'
fprintf (1, 'A      Nx(PxQ)   %0.0f x (%0.0f x %0.0f) %8.0f\n', [n1 p q1  numel(a)])
fprintf (1, 'B      Nx(QxR)   %0.0f x (%0.0f x %0.0f) %8.0f\n', [n2 q2 r  numel(b)])
disp ' '

% Standard loops (Probably they exploit MATLAB JIT accelerator)
disp 'ARRAYLAB by means of plain LOOPS'
f1  = @() loop1(a,b); 
f1b = @() loop1b(a,b);
f2  = @() loop2(a,b);
t = [timeit(f1), timeit(f1b), timeit(f2)];
disp (t)

% Method used in MULTIPROD 1.3, which uses "cloning", .* and SUM
% ("Cloning" means "singleton expansion"; see also BSXFUN help)
disp 'MULTIPROD VERSION 1 (column-by-column cloning) AND 2 (BSXFUN engine)'
disp '       1.3       1.3      1.31      1.32      1.33       2.1'
f3 = @() multiprod13(a, b, [2 3]);
f4 = @()  arraylab13(a, b, 2, 3); 
f5 = @() arraylab131(a, b, 2, 3); 
f6 = @() arraylab132(a, b, 2, 3); 
f7 = @() arraylab133(a, b, 2, 3); 
f8 = @()   multiprod(a, b, [2 3]);

lenT = length(t);
t = [t, timeit(f3),timeit(f4),timeit(f5),timeit(f6),timeit(f7),timeit(f8)];
disp (t(lenT+1:end))

% (PERMUTE) --> 4D RESHAPE --> SX --> TIMES --> SUM
disp 'ARRAYLAB by means of (PERMUTE) --> 4D-RESHAPE --> SX --> TIMES --> SUM'
disp ' TONY''S T.     GENOP  BSXTIMES    BSXFUN   BSXFUN2   BSXFUN3'
f9  = @() resh4D_clone(a, b);    % SX with Tony's trick
f10 = @() resh4D_genop(a, b);    % SX+TIMES with GENOP  (BSXFUN replacement)
f11 = @() resh4D_bsxtimes(a, b); % SX+TIMES with BSXTIMES (BSXFUN replacement)
f12 = @() resh4D_bsxfun(a, b);   % SX+TIMES with BSXFUN
f13 = @() permA_resh4D_bsxfun(a, b);
f14 = @() permAB_resh4D_bsxfun(a, b);
lenT = length(t);
t = [t, timeit(f9),timeit(f10),timeit(f11),timeit(f12),timeit(f13),timeit(f14)];
disp (t(lenT+1:end))

% Techniques for virtual matrix expansion (MX)
disp 'MATRIX EXPANSION (MX) TECHNIQUES (MULTIPROD 2 USES 2D-RESHAPE)'
disp '                          LOOP   4D-RESHAPE   2D-RESHAPE  MULTIPROD 2'
a2 = shiftdim(a(1,:,:)); % Single matrix
b2 = shiftdim(b(1,:,:)); % Single matrix
f15 = @()         loop2_expA(a2, b);     % PLAIN LOOP2
f16 = @() resh4D_bsxfun_expA(a2, b);     % 4D RESH --> SX --> TIMES --> SUM
f17 = @()     squashB_mtimes(a2, b);     % PERMUTE --> 2D RESH --> MTIMES
f18 = @() multiprod(a2, b, [1 2],[2 3]); % Generalized SQUASH_MTIMES
f19 = @()         loop2_expB(a, b2);
f20 = @() resh4D_bsxfun_expB(a, b2);
f21 = @()     squashA_mtimes(a, b2);
f22 = @() multiprod(a, b2, [2 3],[1 2]);
lenT = length(t);
t = [t, timeit(f15),timeit(f16),timeit(f17),timeit(f18)];
fprintf('MATRIX * 3D ARRAY')
fprintf(1, '%13.4f', t(lenT+1:end))
disp ' '
lenT = length(t);
t = [t, timeit(f19),timeit(f20),timeit(f21),timeit(f22)];
fprintf('3D ARRAY * MATRIX')
fprintf(1, '%13.4f', t(lenT+1:end))
disp ' '

% Comparing function output (differences)
c0 = f1();
c = c0 - f1b(); maxerror(1)  = max(c(:));
c = c0 - f2();  maxerror(2)  = max(c(:));
c = c0 - f3();  maxerror(3)  = max(c(:));
c = c0 - f4();  maxerror(4)  = max(c(:));
c = c0 - f5();  maxerror(5)  = max(c(:));
c = c0 - f6();  maxerror(6)  = max(c(:));
c = c0 - f7();  maxerror(7)  = max(c(:));
c = c0 - f8();  maxerror(8)  = max(c(:));
c = c0 - f9();  maxerror(9)  = max(c(:));
c = c0 - f10(); maxerror(10) = max(c(:));
c = c0 - f11(); maxerror(11) = max(c(:));
c = c0 - f12(); maxerror(12) = max(c(:));
c = c0 - f13(); maxerror(13) = max(c(:));
c = c0 - f14(); maxerror(14) = max(c(:));
c = c0 - f15(); maxerror(15) = max(c(:));
c = c0 - f16(); maxerror(16) = max(c(:));
c = c0 - f17(); maxerror(17) = max(c(:));
c = c0 - f18(); maxerror(18) = max(c(:));
c = c0 - f19(); maxerror(19) = max(c(:));
c = c0 - f20(); maxerror(20) = max(c(:));
c = c0 - f21(); maxerror(21) = max(c(:));
c = c0 - f22(); maxerror(22) = max(c(:));


function c = loop1(a, b)
% WARNING: You can use LOOP2 without PERMUTE if  A  is  (PQ)N  and
%                                                B  is  (QR)N
[n p q] = size(a);
r = size(b, 3);
c = zeros([n p r]);
for i = 1 : n
    c(i,:,:)= reshape(a(i,:,:), p, q)...
              * ...
              reshape(b(i,:,:), q, r);
end

function c = loop1b(a, b)
% Same as LOOP 1 but commands are not nested
% (I have read somewhere that JIT does not work with nested commands)
[n p q] = size(a);
r = size(b, 3);
c = zeros([n p r]);
for i = 1 : n
    a2 = reshape(a(i,:,:), p, q);
    b2 = reshape(b(i,:,:), q, r);
    c(i,:,:)= a2 * b2;
end

function c = loop2(a, b)
% Warning:  PERMUTE is not needed if  A  is  (PQ)N  and
%                                     B  is  (QR)N
[n p q] = size(a);
r = size(b, 3);
a = permute(a, [2 3 1]);
b = permute(b, [2 3 1]);
    c = zeros([p, r, n]);
    for i = 1 : n
        c(:,:,i) = a(:,:,i) * b(:,:,i);
    end
c = permute(c, [3 1 2]);
     
function c = resh4D_clone(a, b)
% By Paolo de Leva 
% Based on RESH4D_BSXFUN. Uses "Tony's trick" instead of BSXFUN. Tony's
% trick is an index vectorization trick used in REPMAT and MULTIPROD 1.3 to
% perform singleton expansion)
% WARNING: Tony's trick is memory expensive.
    [n p q] = size(a);
    [n q r] = size(b);
    b = reshape(b, [n 1 q r]);
    indexP = ones(1, p); % "Cloned" indexes
    indexR = ones(1, r);
    c = sum(a(:,:,:,indexR) .* b(:,indexP,:,:), 3);
    c = reshape(c, [n p r]);
    
function c = resh4D_bsxfun(a, b)
% By Jinhui Bai
    [n p q] = size(a);
    [n q r] = size(b);
    b = reshape(b, [n 1 q r]);
    c = sum(bsxfun(@times,a,b), 3);
    c = reshape(c, [n p r]);
    
function c = permA_resh4D_bsxfun(a, b)
% This version of RESH4D_BSXFUN permutes only A to make it possible a SUM
% along 2nd dimension (supposedly faster than along 3rd dimension). 
% By Jinhui Bai
    [n p q] = size(a);
    [n q r] = size(b);
    a = permute(a, [1 3 2]); % N x Q x P
    b = reshape(b, [n q 1 r]);
    c = sum(bsxfun(@times,a,b), 2);
    c = reshape(c, [n p r]);

function c = permAB_resh4D_bsxfun(a, b)
% This version of RESH4D_BSXFUN permutes both A and B to make it possible
% a SUM along 1st dimension (supposedly faster than along higher dim.). 
% By Jinhui Bai
    [n p q] = size(a);
    [n q r] = size(b);
    a = permute(a, [3 2 1]); % Q x P x N
    b = permute(b, [2 3 1]); % Q x R x N
        a = reshape(a, [q p 1 n]);
        b = reshape(b, [q 1 r n]);
        c = sum(bsxfun(@times,a,b), 1);
        c = reshape(c, [p r n]);
    c = permute(c, [3 1 2]); % N x P x R
    
function c = resh4D_genop(a, b)
% Based on RESH4D_BSXFUN. Uses GENOP by Doug Schwarz (MATLAB Central file
% #10333) as a replacement of BSXFUN.
    [n p q] = size(a);
    [n q r] = size(b);
    b = reshape(b, [n 1 q r]);
    c = sum(genop(@times,a,b), 3);
    c = reshape(c, [n p r]);
    
function c = resh4D_bsxtimes(a, b)
% Based on RESH4D_BSXFUN. Uses a BSXFUN substitute by Douglas Schwarz
% (MATLAB Central file #23005).
    [n p q] = size(a);
    [n q r] = size(b);
    b = reshape(b, [n 1 q r]);
    c = sum(bsx_times(a,b), 3);
    c = reshape(c, [n p r]);

function c = loop2_expA(a, b)
% Virtual single-block expansion
      p = size(a, 1);
[n q r] = size(b);
b = permute(b, [2 3 1]);
    c = zeros([p, r, n]);
    for i = 1 : n
        c(:,:,i) = a * b(:,:,i);
    end
c = permute(c, [3 1 2]);

function c = loop2_expB(a, b)
% Virtual single-block expansion
[n p q] = size(a);
      r = size(b, 2);
a = permute(a, [2 3 1]);
    c = zeros([p, r, n]);
    for i = 1 : n
        c(:,:,i) = a(:,:,i) * b;
    end
c = permute(c, [3 1 2]);

function c = resh4D_bsxfun_expA(a, b)
% Virtual single-block expansion
% Based on RESH4D_BSXFUN (singleton expansion).
% by Paolo de Leva
      [p q] = size(a);
    [n q r] = size(b);
    a = reshape(a, [1 p q 1]);
    b = reshape(b, [n 1 q r]);
    c = sum(bsxfun(@times,a,b), 3);
    c = reshape(c, [n p r]);
    
function c = resh4D_bsxfun_expB(a, b)
% Virtual single-block expansion 
% Based on RESH4D_BSXFUN (singleton expansion).
% by Paolo de Leva
    [n p q] = size(a);
      [q r] = size(b);
    b = reshape(b, [1 1 q r]);
    c = sum(bsxfun(@times,a,b), 3);
    c = reshape(c, [n p r]);   

function c = squashB_mtimes(a, b)
% Virtual single-block expansion
% Instead of expanding both A and B, it just squashes B from 3-D to 2-D. It
% avoids using additional memory for expansion. Based on code suggested by
% Jinhui Bai to deal with this format:
%     A  is  PQ
%     B  is (QR)N
%
% NOTE : B = PERMUTE(B, [2 1 3]), from N(QR) to QNR has been shown to 
%        be faster than B = PERMUTE(B, [2 3 1]), from N(QR) to (QR)N.
%        See TIMING_MATLAB_COMMANDS.M.
% By Paolo de Leva
b = permute(b, [2 1 3]); % QNR
    p = size(a,1);
    [q n r] = size(b);
    b = reshape(b, [q, n*r]); % squashing B (equivalent to B(:,:))
    c = reshape(a*b, [p n r]); 
c = permute(c, [2 1 3]); % N(PR)
    
function c = squashA_mtimes(a, b)
% Virtual single-block expansion
% Instead of expanding both A and B, it just squashes A from 3-D to 2-D. It
% avoids using additional memory for expansion. Based on code suggested by
% Jinhui Bai to deal with this format:
%     A  is (PQ)N
%     B  is  QR
% By Paolo de Leva
    [n p q] = size(a);
    r = size(b, 2);
    a = reshape(a, [n*p, q]); % squashing A
    c = reshape(a*b, [n p r]);

Contact us at files@mathworks.com