Code covered by the BSD License  

Highlights from
Statistical Learning Toolbox

from Statistical Learning Toolbox by Dahua Lin
Functions for statistical learning, pattern recognition and computer vision, covering many topics.

slpartitionpca(data, arrsiz, n, ps, filepath, varargin)
function slpartitionpca(data, arrsiz, n, ps, filepath, varargin)
%SLPARTITIONPCA Performs Partition-based PCA and saves the models
%
% $ Syntax $
%   - slpartitionpca(data, arrsiz, n, ps, filepath, ...)
%   
% $ Arguments $
%   - data:         the super-array of the unit arrays, or the set of
%                   filenames storing the arrays.
%   - arrsiz:       the size of each unit array
%   - n:            the number of samples
%   - ps:           the partition structure for each unit
%   - filepath:     the destination filepath (without extension)
%   
% $ Description $
%   - slpartitionpca(data, arrsiz, n, ps, filepath, ...) applies PCA to 
%     large arrays. To make the computation tractable, it divides the 
%     whole matrix into several partitions according to the structure
%     specified in ps. The trained models be stored in filepath and
%     related files. 
%     It will creates a core file named filepath.mat to give basic
%     information of the PCA model, which contains the following variables:
%       - 'parstruct':      the parition structure
%       - 'blocks':         the cell array of specification of blocks
%                           if the partition structure divides the whole
%                           array unit into m1 x m2 x ... blocks, then
%                           blocks would be a m1 x m2 x ... cell array,
%                           with each cell being a 2 x d matrix in the form
%                           of [s1 s2, ... sd; e1, e2, ..., ed]
%                           then the actual block extracted from an array
%                           unit A would be A(s1:e1, s2:e2, ..., sd:ed)
%       - 'projfiles'       The cell array of projection array files 
%                           corresponding to the blocks.
%       - 'combprojfile'    The filename of the combined projection. If
%                           the combined model is not learned, this
%                           filename is empty.
%       - 'meanarr'         The mean array
%       - 'energy'          The structure representing the energy info
%                           - 'total': the original total energy;
%                           - 'intpreserved': the total preserved energy of
%                             all partitions
%                           - 'combpreserved': the preserved energy of
%                             combination model
%                           - 'par':  the original partition enegies:
%                              an m1 x m2 x ... array.
%                           - 'parpreserved': the preserved partition 
%                              energy. an m1 x m2 x ... array.
%       - 'diminfo'         The information of the space dimension
%                           - 'size':   the size of the whole array unit
%                           - 'oridim': the original total dimension
%                           - 'dims':   the original dimensions of the 
%                             partitions: an m1 x m2 x ... array.
%                           - 'subdims': the subspace dimension of the
%                             partitions: an m1 x m2 x ... array
%                           - 'intdim': the dimension of the intermediate
%                             stacked vector space.
%                           - 'feadim': the dimension of the combined
%                             subspace. (If the combined model is not
%                             learned, subdim = intdim)
%       - 'evalset'         The eigenvalues preserved for all partitions
%       - 'combevals'       The eigenvalues of the combination model
%
%     The projection matrices will be stored in a set of array files 
%     named as filepath.proj.01, ...., the the combined model is learned
%     its projection matrix is given by filepath.proj.comb.
%     In addition, you can specify following properties to control the
%     learning of partition-based PCA.
%     \*
%     \t    Properties of Partition-based PCA Learning
%     \h     name        &      description
%           'combmodel'  &  Whether to learn a combined PCA model, 
%                           default = true;
%           'er0'        &  The level-0 of energy preservation ratio
%                           (default = 0.99)
%           'er1'        &  The level-1 of energy preservation ratio 
%                           (default = 0.95, only takes effect in the 
%                            training of combined model)
%           'mixev'      &  Whether to mix up the eigenvalues of all PCA
%                           models when selecting the principal components
%                           (default = 0)
%           'meanarr'    &  The precomputed mean array (default = [])
%           'weights'    &  The weights of individual samples 
%                           (default = [])
%           'verbose'    &  Whether to show intermediate step information
%                           (default = true)
%     \*
%   
%
% $ Remarks $
%   - The algorithm implements a divide and conquer strategy, it first
%     divides the whole array unit into smaller partitions, then train PCA
%     for each partition. The principal components for all the partitions 
%     are stacked together and then a combined PCA model is trained based 
%     on the stacked space. The combined model is learned when 
%   - On the selection of principal components for individual partitions,
%     there are two strategies. The simpler one is the separate strategy,
%     that is to select principal components merely based on the
%     eigenspectrum of the parition PCA model itself and select the 
%     components corresponding to the largest eigenvalues so that the 
%     energy of this partition is preserved up to the ratio of er0.
%     Another strategy is mix-up strategy, which pools all eigenvalues
%     from all paritions together, and select the eigenvectors 
%     corresponding to the largest eigenvalues according to the overall
%     ranking. It can be proved that the mix-up strategy is more efficient.
%     The property mixev is to balance the two strategies, its value
%     ranges in [0, 1]. When mixev is 0, then purely separate strategy 
%     would be used; when mixev is 1, then purely mix-up strategy would
%     be used. When 0 < mixev < 1, we first use separate strategy to 
%     guarantee that each partition preserve up to (1 - mixev) of energy, 
%     then the other mixev of total energy is preserved by pooling all the
%     rest components and select according to overall ranking.
%
% $ History $
%   - Created by Dahua Lin, on Jul 29th, 2006
%   - Modified by Dahua Lin, on Sep 10th, 2006
%       - replace sladd by sladdvec to increase efficiency
%

%% parse and verify input arguments

if nargin < 5
    raise_lackinput('slpartitionpca', 5);
end
arrsiz = arrsiz(:)';
if ~ischar(filepath)
    error('sltoolbox:invalidarg', ...
        'The filepath should be a char string');
end
arrdim = length(arrsiz);

if length(ps) ~= arrdim
    error('sltoolbox:sizmismatch', ...
        'The dimension of partition structure is not consisitent with the array unit');
end

opts.combmodel = true;
opts.er0 = 0.99;
opts.er1 = 0.95;
opts.mixev = 0;
opts.meanarr = [];
opts.weights = [];
opts.verbose = true;
opts = slparseprops(opts, varargin{:});

check_valuerange(opts.er0, 'er0', 0, 1);
check_valuerange(opts.er1, 'er1', 0, opts.er0);
check_valuerange(opts.mixev, 'mixev', 0, 1);

meanarr = opts.meanarr;
if ~isempty(meanarr)
    if ~isequal(size(meanarr), arrmatdim)
        error('sltoolbox:sizmismatch', ...
            'The mean array offered is not consistent with the array unit size');
    end
end

if ~isempty(opts.weights)
    opts.weights = opts.weights(:);
    if length(opts.weights) ~= n
        error('sltoolbox:sizmismatch', ...
            'The length of given weights is inconsistent with the number of samples');
    end
    hasweights = true;
else
    hasweights = false;
end
    


%% Initialization on Partitions and Filenames

showinfo('Initializing Blocks ...', opts);

% partition structure
parstruct = ps;

blocks =  slparblocks(parstruct);
blkinds = slallsubinds(size(blocks));
NBlks = numel(blocks);

projfiles = cell(size(blocks));
for i = 1 : NBlks
    projfiles{i} = [filepath, '.proj.', generate_indexstring(size(blocks), blkinds(:, i)')];
end

if opts.combmodel
    combprojfile = [filepath, '.proj.comb'];
else
    combprojfile = [];
end

clear curinds currange;


%% Prepare Info Structures

showinfo('Preparing Info Structure ...', opts);

energy.total = 0;
energy.intpreserved = 0;
energy.combpreserved = 0;
energy.par = zeros(size(blocks));
energy.parpreserved = zeros(size(blocks));

diminfo.size = arrsiz;
diminfo.oridim = prod(arrsiz);
diminfo.dims = zeros(size(blocks));
diminfo.subdims = zeros(size(blocks));
diminfo.intdim = 0;
diminfo.feadim = 0;



%% Compute the mean array

if isempty(meanarr)
    
    showinfo('Computing Mean Array ...', opts);
    
    if ~hasweights
        meanarr = slarrmean(data, arrsiz, n);
    else
        meanarr = slarrmean(data, arrsiz, n, 'weights', opts.weights);
    end
end


%% Learn the individual PCA models

showinfo('Learning Individual PCA Models ...', opts);

evalset = cell(size(blocks));
for ib = 1 : NBlks
    
    showinfo(sprintf('   PCA Model %d', ib), opts);   
    
    curblk = blocks{ib};
    rgncell = slrange2indcells(curblk);
    vecd = prod(curblk(2,:) - curblk(1,:) + 1);
    
    % compute covariance
    blkcov = compute_localcov(data, meanarr, rgncell, vecd, n, opts.weights);
    
    % compute eigen spectrum
    [evals, evecs] = slsymeig(blkcov);
    
    % initial truncate
    currk = sum(evals >= eps(evals(1)) / 10);    
    evals = evals(1:currk);
    evecs = evecs(:, 1:currk);
                      
    % initial save
    evalset{ib} = evals; 
    slwritearray(evecs, projfiles{ib});
    
    energy.par(ib) = sum(evals);
    diminfo.dims(ib) = vecd; 
    
end

energy.total = sum(energy.par(:));


%% Principal Components Selection

showinfo('Performing Principal Components Selection ...', opts);

if opts.mixev == 0 % separate strategy
    
    showinfo('   Use Separate Strategy', opts);
    
    for ib = 1 : NBlks
                
        evals = evalset{ib};
        currk = decide_rank(evals, opts.er0);     
        
        evals = evals(1:currk);
        evalset{ib} = evals;
        
        evecs = slreadarray(projfiles{ib});
        evecs = evecs(:, 1:currk);
        slwritearray(evecs, projfiles{ib});
                        
        energy.parpreserved(ib) = sum(evals);
        diminfo.subdims(ib) = currk;
        
        clear evals evecs
    end               
    
else % using mix-up strategy
    
    showinfo('   Use Mix-Up Strategy', opts);
    
    showinfo('   Individual Preservation', opts);
    
    % preserve individual projections
    if opts.mixev < 1
        er0 = opts.er0 * (1 - opts.mixev);              
        for ib = 1 : NBlks
            evals = evalset{ib};
            diminfo.subdims(ib) = decide_rank(evals, er0);            
            energy.parpreserved(ib) = sum(evals(1:diminfo.subdims(ib)));
        end        
    else
        diminfo.subdims(:) = 1;
        for ib = 1 : NBlks
            evals = evalset{ib};
            energy.parpreserved(ib) = evals(1);
        end
    end
        
    indv_preserved = sum(energy.parpreserved(:));
    
    % add additional components from mixed pool    
    if indv_preserved < energy.total * opts.er0
        
        showinfo('   Collecting Information for Mix-up ...', opts);
        
        % analyze target
        rest_target_energy = energy.total * opts.er0 - indv_preserved;
        
        % collect rest eigenvalues
        restevnums = zeros(NBlks, 1);
        for ib = 1 : NBlks
            evals = evalset{ib};
            restevnums(ib) = length(evals) - diminfo.subdims(ib);
        end
        total_restevnum = sum(restevnums(:));
        
        evspool = zeros(total_restevnum, 1);
        evsbid = zeros(total_restevnum, 1);
        cn = 0;
        for ib = 1 : NBlks
            if restevnums(ib) > 0
                evals = evalset{ib};
                evspool(cn+1:cn+restevnums(ib)) = ...
                    evals(diminfo.subdims(ib)+1:end);
                evsbid(cn+1:cn+restevnums(ib)) = ib;
                cn = cn + restevnums(ib);
            end            
        end                
        
        showinfo('   Overall Ranking ...', opts);
        
        % overall ranking
        [evspool, sortord] = sort(evspool, 'descend');
        evsbid = evsbid(sortord);
        
        % threshold
        er0 = rest_target_energy / sum(evspool);
        rk = decide_rank(evspool, er0);
        
        showinfo('   Updating Selection ...', opts);
        
        % update selection
        evsbid = evsbid(1:rk);
        for ib = 1 : NBlks
            knew = sum(evsbid == ib);
            if knew > 0
                evals = evalset{ib};
                diminfo.subdims(ib) = diminfo.subdims(ib) + knew;
                energy.parpreserved(ib) = sum(evals(1:diminfo.subdims(ib)));
                evalset{ib} = evals(1:diminfo.subdims(ib));
            end
        end
        
        % truncate the projections
        
        showinfo('   Truncating Projections ...', opts);
        for ib = 1 : NBlks
            pfn = projfiles{ib};
            curprojmat = slreadarray(pfn);
            curprojmat = curprojmat(:, 1:diminfo.subdims(ib));
            slwritearray(curprojmat, pfn);            
        end
        
        clear restevnums total_restevnum rest_target_energy;
        clear evsbid evspool sortord;
                
    end   
            
end

energy.intpreserved = sum(energy.parpreserved(:));
diminfo.intdim = sum(diminfo.subdims(:));


%% Combined PCA model

if opts.combmodel
    
    showinfo('Learning Combined PCA Model ...', opts);
    
    showinfo('   Generating Intermediate Stacked Vectors ...', opts);
    
    % generate intermediate stacked vectors
    intvecs = zeros(diminfo.intdim, n);
    dc = 0;
    for ib = 1 : NBlks
        
        curblk = blocks{ib};
        rgncell = slrange2indcells(curblk);
        vecd = prod(curblk(2,:) - curblk(1,:) + 1);
        cursubdim = diminfo.subdims(ib);    
        
        projmat = slreadarray(projfiles{ib});
        
        V = generate_pvecs(data, rgncell, vecd, cursubdim, n, meanarr, projmat);
        
        intvecs(dc+1:dc+cursubdim, :) = V;
        dc = dc + cursubdim;
        
        clear projmat V;        
    end
    
    showinfo('   Learning Combined PCA ...', opts);
    
    % learn the combined model
    covcomb = slcov(intvecs, opts.weights, 0);
    [combevals, evecs] = slsymeig(covcomb);
    
    showinfo('   Truncating Combined PCA ...', opts);
    
    % truncate
    rk = decide_rank(combevals, (opts.er1 * energy.total) / energy.intpreserved);
    combevals = combevals(1:rk);
    combproj = evecs(:,1:rk);
    slwritearray(combproj, combprojfile); 
    
    energy.combpreserved = sum(combevals);
    diminfo.feadim = rk;
    
else
    energy.combpreserved = energy.intpreserved;
    diminfo.feadim = diminfo.intdim;
    
    combevals = [];   
    slignorevars(combevals);
end


%% Output

showinfo('Outputing Core file ...', opts);

corefilename = [filepath, '.mat'];

% change the inner file paths to relative path
dstdir = fileparts(filepath);
if isempty(dstdir) 
    pathprelen = 0;
elseif dstdir(end) ~= '\'
    pathprelen = length(dstdir) + 1;
else
    pathprelen = length(dstdir);
end

if pathprelen > 0
    for ib = 1 : NBlks
        fn = projfiles{ib};
        fn = fn(pathprelen+1:end);
        projfiles{ib} = fn;
    end
    if ~isempty(combprojfile)
        combprojfile = combprojfile(pathprelen+1:end);
    end
end    

slignorevars(combprojfile);


save(corefilename, ...
    'parstruct', ...
    'blocks', ...
    'projfiles', ...
    'combprojfile', ...
    'meanarr', ...
    'energy', ...
    'diminfo', ...
    'evalset', ...
    'combevals', ...
    '-v6');




%% Core Computing functions

function C = compute_localcov(data, meanarr, rangecell, d, n, w)

localmean = meanarr(rangecell{:});
localmean = reshape(localmean, [d, 1]);

if isnumeric(data)
    
    localarr = data(rangecell{:}, :);
    localarr = reshape(localarr, [d, n]);
    
    C = slcov(localarr, w, localmean);
    
else
    
    C = zeros(d, d);
    nfiles = length(data);
    
    cf = 0;
    for i = 1 : nfiles
        localarr = slreadarray(data{i});
        curn = size(localarr, length(rangecell)+1);
        
        if isempty(w)
            curw = [];
            tw = curn;
        else
            curw = w(cf+1:cf+curn);
            tw = sum(curw);
        end
        
        curcov = compute_localcov(localarr, meanarr, rangecell, d, curn, curw);        
        C = C + curcov * tw;
        
        cf = cf + curn;
    end
    
    if cf ~= n
        error('sltoolbox:sizmismatch', ...
            'The total number of units in the set of array files is not n');
    end  
    
    if isempty(w)
        C = C / n;
    else
        C = C / sum(w);
    end
    
end


function V = generate_pvecs(data, rangecell, oridim, subdim, n, meanarr, projmat)

if isnumeric(data)

    localarr = data(rangecell{:}, :);
    localarr = reshape(localarr, [oridim, n]);
    localmean = meanarr(rangecell{:});
    localmean = reshape(localmean, [oridim, 1]);
    D = sladdvec(localarr, -localmean, 1);    
    clear localmean localarr;
    V = projmat' * D;
    
    if size(V, 1) ~= subdim
        error('sltoolbox:sizmismatch', ...
            'Inconsistent sub dimension');
    end
        
else

    V = zeros(subdim, n);
    nfiles = length(data);
    cf = 0;
    for i = 1 : nfiles
        curdata = slreadarray(data{i});
        curn = size(curdata, length(rangecell) + 1);
        curV = generate_pvecs(curdata, rangecell, oridim, subdim, curn, meanarr, projmat);
        clear curdata;
        V(:, cf+1:cf+curn) = curV;
        cf = cf + curn;
    end
    
    if cf ~= n
        error('sltoolbox:sizmismatch', ...
            'The total number of units in the set of array files is not n');
    end  
end



%%  Auxiliary Functions

function check_valuerange(var, name, minval, maxval)

if var < minval || var > maxval
    error('sltoolbox:outofrange', ...
        'The variable %s should be between %f and %f', ...
        name, minval, maxval);
end


function str = generate_indexstring(nums, inds)

d = length(nums);
if any(inds <= 0) || any(inds > nums)
    error('The indices are beyond boundary');
end

% generate pattern
pats = cell(1, d);
for i = 1 : d
    curlen = length(int2str(nums(i)));
    pats{i} = sprintf('%%0%dd.', curlen);
end

% generate sub-strings
sstrs = cell(1, d);
for i = 1 : d    
    sstrs{i} = sprintf(pats{i}, inds(i));
end

% concatenate sub-strings
str = strcat(sstrs{:});
str(end) = [];   % delete the trailing point


function rk = decide_rank(evals, er)

cumevs = cumsum(evals);
rk = min(sum(cumevs < sum(evals) * er) + 1, length(evals));


function showinfo(message, opts)

if opts.verbose
    disp(message);
end



Contact us at files@mathworks.com