% Copyright (c) 2017, Domenico L. Gatti
% All rights reserved.
% 
% Redistribution and use in source and binary forms, with or without 
% modification, are permitted provided that the following conditions are 
% met:
% 
%     * Redistributions of source code must retain the above copyright 
%       notice, this list of conditions and the following disclaimer.
%     * Redistributions in binary form must reproduce the above copyright 
%       notice, this list of conditions and the following disclaimer in 
%       the documentation and/or other materials provided with the 
%       distribution
%       
% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
% IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
% THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
% PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
% CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
% EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
% PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
% PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
% LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
% NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
% SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%
%% General dependencies
% We always start from the CODE directory and we add to the path
% subdirectories containing various tools described in the book chapters.
addpath(genpath('../GENERAL_SCRIPTS_FUNCTIONS'));
addpath(genpath('../DATABASE'));
% addpath(genpath('../TOOLBOXES'));


%% CHAPTER 10: Principal component analysis

%% PCA of physico-chemical properties of different drugs

%% Problem 1

% We recall that the correlation matrix is the same as the covariance
% matrix calculated on normalized (zscored) data. Therefore PCA can be
% carried out using a correlation matrix instead of a covariance matrix.
% 
% a. With this information in mind, can you reproduce the PCA analysis of
% the 24 physico-chemical properties of 20 commonly used drugs described in
% the article "Pharmacological Classification of Drugs by Principal
% Component Analysis Applying Molecular Modeling Descriptors and HPLC
% Retention Data", J Chromatogr Sci. 2011 Nov-Dec;49(10):758-63?

% Recall here that QSAR (Quantitative Structure Activity Relationship)
% models are regression models relating a set of 'predictor' variables (the
% physico-chemical properties or theoretical molecular descriptors of
% chemicals) to some biological activity of those chemicals, which
% represent the 'response' variables.
% 
% b. Can you explain the physical meaning of the eigenvectors in this case?

% Reset the workspace and close all figures, but keep a pre-loaded 'data'
% matrix: this section reads 'data' and contains no statement that loads
% or defines it, so a plain 'clear' would always make the section fail.
% NOTE(review): confirm where 'data' is expected to come from (e.g. a
% MAT-file under ../DATABASE) and load it here if appropriate.
clearvars -except data
close all

% data = (24 physico chemical parameters) x (20 different drugs) matrix
% derived from the article listed above;
if ~exist('data','var') || ~isnumeric(data) || ~ismatrix(data) || isempty(data)
    error('Chapter10:missingData', ...
        ['Variable ''data'' (variables x observations numeric matrix) ' ...
         'must be present in the workspace before running this section.']);
end

% Rows are variables, columns are observations.
[nvar,nobs] = size(data);

% Standardize every row (variable) across the observations. The flag 1
% selects the population standard deviation (normalization by nobs).
n_data = zscore(data,1,2);

% Correlation matrix between the variables (rows of 'data').
cor_data = corr(data');

% Covariance matrix of the standardized data. Here we divide by nobs, which
% is consistent with the zscore flag above, so that cov_n_data reproduces
% cor_data (this is the same quantity as cov(n_data',1)).
cov_n_data = (n_data*n_data')/nobs;

% PCA on the standardized data with observations in rows. 'pca' replaces
% the legacy 'princomp'; 'Economy',false keeps one component per variable
% even though there are fewer observations than variables, as princomp did.
% Note that LATENT is based on normalization by (nobs-1).
[COEFF,SCORE,LATENT] = pca(n_data','Economy',false);
% 
% PCA from the correlation matrix: W = eigenvectors (columns),
% R = eigenvalues (vector), EXPLAINED = percentage of variance.
[W,R,EXPLAINED] = pcacov(cor_data);
% 
% Same analysis from the covariance matrix of the standardized data.
[W1,R1,EXPLAINED1] = pcacov(cov_n_data);

% [V_corr,D_corr] = eig(cor_data); 
% [V,D] = eig(cov_n_data); 
%  
% k = size(V,2);
%  [l,i] = sort(diag(D),'descend');
%  W=V(:,i);
%  R=D(i,i);
%  
%  for n=1:k
%      W(:,n)=sign(mean(W(:,n)))*W(:,n);
%  end

% By representing the normalized data in the eigenvector basis of the
% covariance matrix the covariance between the different variables becomes
% 0 and only the variables variance is retained. The eigenvalues are
% therefore the variance of the data (the spread of the 20 observations 
% along each axis=variable) represented in the eigenvector basis.
pc_n_data = W' * n_data ;
% Normalize by nobs (flag 1), like the matrices the eigenvalues were
% computed from, so that the diagonal matches R.
cov_pc_n_data = cov(pc_n_data',1)
figure;imagesc(cov_pc_n_data)

% If the variance of the data along an eigenvector is larger than along a
% different eigenvector it means that eigenvector contributes more to the
% discrimination between different molecules. Each eigenvector contains
% different contributions ('loadings') from the different physico-chemical
% properties. If each property was an axis in n-dimensional property space,
% we could define an eigenvector as a new type of property axis: an
% 'eigenproperty'.
%
% We show the contribution of the various eigenvectors
% Eigenvalue spectrum: one point per entry of R (the second output of
% pcacov computed above).
figure
plot(R, '-o')
% figure; plot(diag(R),'-o')

% Cumulative sum of the eigenvalues, normalized by the total. The original
% notes for this exercise report that ~90% of the energy is contained in
% the top 5 modes, and ~68% in the top 2 modes.
energy = cumsum(R);
% energy = cumsum(diag(R));
figure
plot(energy ./ energy(end), '-o')
xlabel('Eigenvalue index   ')
ylabel('E_j/E_t_o_t_a_l   ')

% Reminder: pc_n_data = W' * n_data, so each column of 'pc_n_data' holds
% the coefficients that express the corresponding column of 'n_data' as a
% linear combination of the eigenvectors.

% Loadings plot: coefficients of the 1st eigenvector against those of the
% 2nd, one point per property. Properties that cluster together in this
% plot are correlated. Both axes are plotted with the sign flipped.
figure
plot(-W(:,1), -W(:,2), 'ro')
xlabel('Eigenmode 1 coefficients   ')
ylabel('Eigenmode 2 coefficients   ')
grid on

% Scores plot: projection of every observation (chemical) on the 1st and
% 2nd eigenvectors. Chemicals with similar contributions from the top
% eigenvectors share similar properties, so clusters in this plot indicate
% a commonality of properties.
figure
plot(-pc_n_data(1,:), -pc_n_data(2,:), 'ro')
xlabel('Eigenmode 1 scores   ')
ylabel('Eigenmode 2 scores   ')
grid on

% Same projection in 3D, using the first 3 eigenvectors.
figure
plot3(-pc_n_data(1,:), ...
      -pc_n_data(2,:), ...
      -pc_n_data(3,:), 'ro')
xlabel('Eigenmode 1 scores   ')
ylabel('Eigenmode 2 scores   ')
zlabel('Eigenmode 3 scores   ')

grid on
box on

% Same result using princomp
% figure; plot3(SCORE(:,1),SCORE(:,2),...
%     SCORE(:,3),'ro')
% xlabel('Eigenmode 1 scores   ')
% ylabel('Eigenmode 2 scores   ')
% zlabel('Eigenmode 3 scores   ')
% 
% grid('on')
% box('on')

%% Problem 2: whitening

% Two source signals of 1000 samples each (row vectors):
%   s1 is drawn from a logistic distribution (first parameter 0, second 1),
%   s2 is drawn from a uniform distribution between 0.1 and 5.
nsamples = 1000;
s1 = random('logistic', zeros(1,nsamples), 1.0);
s2 = random('Uniform', 0.1*ones(1,nsamples), 5.0*ones(1,nsamples));

% Display the covariance and the correlation between the two sources.
cov(s1', s2')
corr(s1', s2')

% Mix the sources linearly: the columns of the mixing matrix are the unit
% vectors a1 and a2.
a1 = [1; 0.5];
a2 = [0; 1.0];
a1 = a1/norm(a1);
a2 = a2/norm(a2);
X = [a1 a2] * [s1; s2];

% Scatter plot of the mixed data, with the two mixing directions drawn
% from the origin (a1 in red, a2 in green).
figure
plot(X(1,:), X(2,:), '.', 'Linewidth',0.5, 'MarkerEdgeColor','b', ...
    'MarkerSize',15, 'MarkerFaceColor','g')
line([0 a1(1)], [0 a1(2)], 'LineWidth',2.5, 'Color','red');
line([0 a2(1)], [0 a2(2)], 'LineWidth',2.5, 'Color','green');
xlabel('X1')
ylabel('X2')
grid on
box on
axis equal
hold on

% Covariance matrix of the mixed data (displayed).
cov_X = cov(X')

% PCA of the mixed data (observations in rows): E holds the principal
% directions in its columns and D the corresponding variances.
[E, ~, D] = pca(X');

% Whitening matrix E*D^(-1/2)*E' and whitened data; the covariance of the
% whitened data is displayed as a check.
white = E*diag(D.^-0.5)*E'
X_white = white*X;
cov(X_white')
 
% Scatter plot of the whitened data, drawn with the same marker settings
% as the plot of the mixed data.
figure
plot(X_white(1,:), X_white(2,:), '.', 'Linewidth',0.5, ...
    'MarkerEdgeColor','b', 'MarkerSize',15, 'MarkerFaceColor','g')
xlabel('X1')
ylabel('X2')
grid on
box on
axis equal
% axescenter is not defined in this file; presumably one of the helpers
% added to the path from ../GENERAL_SCRIPTS_FUNCTIONS - verify.
axescenter

% Proof that the covariance matrix of the whitened data is the identity
% matrix. Each expression below is displayed and should evaluate to I.
%
% pca centers the data and returns in D the eigenvalues of the sample
% covariance matrix, i.e. E*diag(D)*E' = Xc*Xc'/(N-1), where Xc is X with
% the mean of every row removed and N is the number of samples. The
% products must therefore use the centered data and the (N-1)
% normalization: the uncentered X*X'/N is not the covariance matrix
% because the rows of X do not have zero mean.
n_samples = size(X,2);
X_centered = bsxfun(@minus, X, mean(X,2));

% 1. Covariance of the whitened data, written as W*C*W' with
%    W = E*D^(-1/2)*E' and C = Xc*Xc'/(N-1).
(E*diag(D.^-0.5)*E')*(X_centered*X_centered')*(E*diag(D.^-0.5)*E')'/(n_samples-1)
% 2. W is symmetric, so the transpose can be dropped.
(E*diag(D.^-0.5)*E')*(X_centered*X_centered')*(E*diag(D.^-0.5)*E')/(n_samples-1)
% 3. Replace C with its eigendecomposition E*D*E'.
(E*diag(D.^-0.5)*E')*(E*diag(D)*E')*(E*diag(D.^-0.5)*E')
% 4. Remove the grouping.
(E*diag(D.^-0.5)*E'*E*diag(D)*E'*E*diag(D.^-0.5)*E')
% 5. E is orthogonal, so E'*E = I.
(E*diag(D.^-0.5)*diag(D)*diag(D.^-0.5)*E')
% 6. D^(-1/2)*D*D^(-1/2) = I, which leaves E*E' = I.
(E*E')

