No BSD License  

Highlights from
Principal Component Analysis for Large Matrix

from Principal Component Analysis for Large Matrix by Richang Hong
This function will occupy large storage space during running time.

LPCA(strFileName,odim,ndim)
%% Handle PCA in the Case of Large-Volume Datasets
% This code shows how to compute principal components in the case of
% large-volume datasets. We load the samples one by one to compute
% the mean and variance. The covariance matrix is obtained from
% vector operations. Then we use a MATLAB function to compute the
% eigenvalues and eigenvectors.

% Copyright Richard Hong.
% National University of Singapore
% $Revision: 1.1.6.7 $  $Date: 2008/12/13 15:53:31 $


function [EigValue,EigVec] = LPCA(strFileName,odim,ndim)
%LPCA  Principal component analysis of a dataset streamed from a text file.
%
%   [EigValue,EigVec] = LPCA(strFileName,odim,ndim) reads the samples in
%   strFileName one at a time, so the whole dataset never has to be held in
%   memory. Only the odim-by-odim covariance matrix is kept.
%
%   Input:
%     strFileName - path of a whitespace-separated text file of numbers;
%                   every consecutive group of odim values is one sample
%                   (rows are samples, columns are feature dimensions).
%     odim        - original dimension of each sample.
%     ndim        - dimension to reduce to (must not exceed odim).
%
%   Output:
%     EigValue    - eigenvalues of the covariance matrix, ascending.
%     EigVec      - matching eigenvectors, one per column.
%
%   Side effect: writes 'pcaReduced.txt' in the current folder, one line
%   per sample, holding the sample projected onto the ndim eigenvectors
%   with the largest eigenvalues (values separated by a space).
%
%   The covariance is normalised by the number of samples (not N-1).
%
%   If ndim > odim a message is displayed and both outputs are empty.

% Report progress every this many samples.
progressStep = 10000;

% check the dim
if odim < ndim
    disp('ndim should lower than odim');
    % Assign the outputs so that callers requesting them do not hit an
    % "output argument not assigned" error on this early exit.
    EigValue = [];
    EigVec = [];
    return;
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Pass 1: mean of the samples
disp('................Start to compute the mean of the samples.....................')
fid = fopen(strFileName, 'r');
if fid == -1
    error('LPCA:fileOpen', 'Cannot open input file: %s', strFileName);
end
closeInput = onCleanup(@() fclose(fid));   % close even if an error occurs

m_sumVec = zeros(odim,1);
num = 0;
vec = fscanf(fid,'%f',odim);
while numel(vec) == odim
    m_sumVec = m_sumVec + vec;
    num = num + 1;
    if mod(num,progressStep) == 1
        disp('Now processed:');disp(num);
    end
    vec = fscanf(fid,'%f',odim);
end
clear closeInput;   % runs fclose(fid)

% A non-empty leftover means the value count is not a multiple of odim.
if ~isempty(vec)
    error('LPCA:badInput', ...
        'Input file ends with an incomplete sample (%d of %d values).', ...
        numel(vec), odim);
end
if num == 0
    error('LPCA:noSamples', 'No samples could be read from %s', strFileName);
end
m_Vec = m_sumVec / num;
disp('....................Compute the means of the samples over!!...................')

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Pass 2: covariance. Each sample is centred in memory right before its
% outer product is accumulated, so no intermediate file is needed and no
% precision is lost in a text round-trip.
disp('................Start to compute the covariance of the samples.....................')
fid = fopen(strFileName, 'r');
if fid == -1
    error('LPCA:fileOpen', 'Cannot open input file: %s', strFileName);
end
closeInput = onCleanup(@() fclose(fid));

coV = zeros(odim,odim);
count = 0;
vec = fscanf(fid,'%f',odim);
while numel(vec) == odim
    centered = vec - m_Vec;
    coV = coV + centered * centered';
    count = count + 1;
    if mod(count,progressStep) == 1
        disp('Now processed:');disp(count);
    end
    vec = fscanf(fid,'%f',odim);
end
clear closeInput;

% Mirror the upper triangle onto the lower one so the matrix is exactly
% symmetric before the eigen-decomposition.
coV = triu(coV) + triu(coV,1)';
coV = coV / num;
disp('................Compute the covariance of the samples over!.....................')

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Eigen-decomposition of the covariance matrix. The order is made explicit
% (ascending) because the basis selection below relies on it.
[V,D] = eig(coV);
[EigValue,order] = sort(diag(D));
EigVec = V(:,order);
% New basis: the ndim eigenvectors with the largest eigenvalues.
v_basis = EigVec(:,odim-ndim+1:odim);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Pass 3: dimension reduction
disp('................Start to compute the projection after eigen decomposition....................')
fid = fopen(strFileName, 'r');
if fid == -1
    error('LPCA:fileOpen', 'Cannot open input file: %s', strFileName);
end
closeInput = onCleanup(@() fclose(fid));

% The new sample set after dimension reduction.
fod = fopen('pcaReduced.txt', 'w');
if fod == -1
    error('LPCA:fileOpen', 'Cannot open output file: pcaReduced.txt');
end
closeOutput = onCleanup(@() fclose(fod));

count = 0;
vec = fscanf(fid,'%f',odim);
while numel(vec) == odim
    % NOTE(review): the raw (un-centred) sample is projected, as in the
    % original code; subtract m_Vec here if centred scores are wanted.
    projected = vec' * v_basis;
    for k = 1:ndim
        % Separate the values: '%7.4f' alone lets any value that needs
        % seven or more characters run into its neighbour.
        if k > 1
            fprintf(fod,' ');
        end
        fprintf(fod,'%7.4f',projected(k));
    end
    fprintf(fod,'\n');
    count = count + 1;
    if mod(count,progressStep) == 1
        disp('Now processed:');disp(count);
    end
    vec = fscanf(fid,'%f',odim);
end
clear closeOutput;
clear closeInput;

        

Contact us at files@mathworks.com