% Copyright (c) 2017, Domenico L. Gatti
% All rights reserved.
% 
% Redistribution and use in source and binary forms, with or without 
% modification, are permitted provided that the following conditions are 
% met:
% 
%     * Redistributions of source code must retain the above copyright 
%       notice, this list of conditions and the following disclaimer.
%     * Redistributions in binary form must reproduce the above copyright 
%       notice, this list of conditions and the following disclaimer in 
%       the documentation and/or other materials provided with the 
%       distribution
%       
% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
% IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
% THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
% PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
% CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
% EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
% PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
% PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
% LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
% NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
% SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%
%% General dependencies
% We always start from the CODE directory and we add to the path
% subdirectories containing various tools described in the book chapters.
addpath(genpath('../GENERAL_SCRIPTS_FUNCTIONS'));
% addpath(genpath('../DATABASE'));
% addpath(genpath('../TOOLBOXES'));

%% CHAPTER 11-12: singular value decomposition (SVD)

%% Problem 1.

% Start from an empty workspace and no open figures.
clear
close all

% Load the yeast microarray data described in this lecture. The code below
% reads the variables 'yeastvalues' and 'times' from the loaded file.
load('filteredyeastdata')

% Expression matrix: m rows (plotted as gene index) by n columns (plotted
% against 'times').
data = yeastvalues;
[m,n] = size(data);

% Surface view of the raw matrix over the (time, gene index) grid.
[time_grid,gene_grid] = meshgrid(times,1:m);
figure
surf(time_grid,gene_grid,data)
box on
xlabel('Time (hours)')
ylabel('Gene index')
zlabel('Expression level')

% a. Carry out an analysis of the microarray data using traditional PCA.

% Transpose so that rows are the variables (time points) and columns are
% the observations (genes).
data = data';
[nvar,nobs] = size(data);

% Heat map of the transposed matrix. It is drawn in its own figure:
% without 'figure' imagesc would replace the surface plot created above.
figure; imagesc(data)
ylabel('Time (hours)')
xlabel('Gene index')
colorbar

% Center every variable (row) on its mean over the observations.
mean_data = mean(data,2);
centered_data = data - mean_data(:,ones(1,nobs));

% Sample covariance matrix of the variables (nvar x nvar).
cov_data = (centered_data*centered_data')/(nobs-1);

[V,D] = eig(cov_data);

% Reorder eigenvectors/eigenvalues by decreasing eigenvalue.
k = size(V,2);
[~,order] = sort(diag(D),'descend');
W = V(:,order);
R = D(order,order);

% The sign of an eigenvector is arbitrary: flip each one so that the mean
% of its entries is non-negative. sign() returns 0 for a zero mean, which
% would wipe out the vector, so in that case the vector is left unchanged.
for j = 1:k
    vec_sign = sign(mean(W(:,j)));
    if vec_sign == 0
        vec_sign = 1;
    end
    W(:,j) = vec_sign*W(:,j);
end

% Here we calculate the scores
pc_centered_data = W' * centered_data ;

% we show the contribution of the various eigenvectors
figure; plot(diag(R),'-ob')
xlabel('Eigenvalue index   ')
ylabel('Eigenvalue   ')

% and the cumulative energy of the eigenvectors 
energy = cumsum(diag(R));
figure; plot(energy/energy(end))
xlabel('Eigenvalue index   ')
ylabel('E_j/E_t_o_t_a_l   ')

% 2D-plot of projection on the 1st and 2nd eigenvectors
figure; plot(pc_centered_data(1,:),pc_centered_data(2,:),'ro')
xlabel('Eigenmode 1 scores   ')
ylabel('Eigenmode 2 scores   ')

% 3D-plot of projection on the first 3 eigenvectors
figure; plot3(pc_centered_data(1,:),pc_centered_data(2,:),...
    pc_centered_data(3,:),'ro')
xlabel('Eigenmode 1 scores   ')
ylabel('Eigenmode 2 scores   ')
zlabel('Eigenmode 3 scores   ')
xlim([-6 6])
grid('on')
box('on')

% However, it is more effective to use SVD to carry out this analysis:
%
% b. Using SVD identify which genes belong to each cluster of 
%    transcriptional responses (use the tools we used to cluster frames of 
%    a molecular dynamics trajectory).

% Reload the data in its original orientation (rows = genes, columns =
% time points); part a. left 'data' transposed.
clear data
load filteredyeastdata
data = yeastvalues;
[m,n] = size(data);

% Economy-size decomposition: data = U*S*V'.
[U,S,V] = svd(data,'econ');

% The sign of singular vectors is arbitrary. For consistent results we
% change the sign of each vector so that the mean is higher than the
% median. Once the sign is calculated for the U's, it is applied also to
% the V's, so that the product U*S*V' is unchanged.
% The loop runs over the columns actually returned by svd and uses its own
% index, leaving n (the number of columns of data) untouched. When mean and
% median coincide sign() returns 0; the pair is then left as is instead of
% being zeroed.
for j = 1:size(U,2)
    vec_sign = sign(mean(U(:,j))-median(U(:,j)));
    if vec_sign == 0
        vec_sign = 1;
    end
    U(:,j) = vec_sign*U(:,j);
    V(:,j) = vec_sign*V(:,j);
end


% Coordinates of the different 'expression profiles' in 'eigenassay space'
% (U'*data and S*V' are two ways of writing the same quantity).
UX = U'*data;
SV = S*V';

% Coordinates of the different gene 'transcriptional responses' in
% 'eigengene space' (data*V and U*S are two ways of writing the same
% quantity).
XV = data*V;
US = U*S;

% Projection plot
figure
plot3(US(:,1),US(:,2),US(:,3),'ob')
xlabel('Eigenmode 1 scores   ')
ylabel('Eigenmode 2 scores   ')
zlabel('Eigenmode 3 scores   ')
grid('on')
box('on')

% Automatic identification of clusters: Ward linkage on the first three
% score columns, cut into 3 clusters.
Auto_Clusters = clusterdata(US(:,1:3),'linkage','ward','savememory','on',...
    'maxclust',3);
% Row (gene) indices of the members of each cluster.
Auto_Clust1 = find(Auto_Clusters == 1);
Auto_Clust2 = find(Auto_Clusters == 2);
Auto_Clust3 = find(Auto_Clusters == 3);

% Same projection as above, colored by cluster membership.
figure;
scatter3(US(Auto_Clust1,1),US(Auto_Clust1,2),US(Auto_Clust1,3),'ok',...
    'SizeData',30,'MarkerFaceColor','c');
hold on
scatter3(US(Auto_Clust2,1),US(Auto_Clust2,2),US(Auto_Clust2,3),'ok',...
    'SizeData',30,'MarkerFaceColor','y');
scatter3(US(Auto_Clust3,1),US(Auto_Clust3,2),US(Auto_Clust3,3),'ok',...
    'SizeData',30,'MarkerFaceColor','g');
xlabel('Eigenmode 1 scores   ')
ylabel('Eigenmode 2 scores   ')
zlabel('Eigenmode 3 scores   ')
grid('on')
box('on')
hold off


% c. Produce reduced data sets containing the transcriptional responses of 
%    the genes in each cluster.

% All reconstructions below keep only the first two singular modes.
modes = 1:2;

% One rank-2 reconstruction and one surface plot per cluster. The y axis of
% each plot uses the original row (gene) indices of the cluster members and
% spans all m rows, so that the three plots are directly comparable.
cluster_members = {Auto_Clust1,Auto_Clust2,Auto_Clust3};
cluster_data = cell(size(cluster_members));
for ic = 1:numel(cluster_members)
    members = cluster_members{ic};
    cluster_data{ic} = U(members,modes)*S(modes,modes)*V(:,modes)';
    [XI,YI]=meshgrid(times,members);
    figure;surf(XI,YI,cluster_data{ic});
    box('on')
    xlabel('Time (hours  )')
    ylabel('Gene index  ')
    zlabel('Expression level  ')
    xlim([0 21])
    ylim([0 m])
    zlim([-5 5])
end
% Keep the per-cluster results available under their individual names.
[data_red_CL1,data_red_CL2,data_red_CL3] = cluster_data{:};

% Rank-2 reconstruction of the whole data set, rows in the original order.
data_red = U(:,modes)*S(modes,modes)*V(:,modes)';
[XI,YI]=meshgrid(times,1:m);
figure;surf(XI,YI,data_red);
box('on')
xlabel('Time (hours)')
ylabel('Gene index')
zlabel('Expression level')
ylim([0 m]);xlim([0 21])

% Same reconstruction with the rows regrouped by cluster (cluster 1 first,
% then 2, then 3). Here the y axis is the position in the regrouped matrix,
% not the original gene index.
data_red_clusters = [data_red_CL1;data_red_CL2;data_red_CL3];
[XI,YI]=meshgrid(times,1:m);
figure;surf(XI,YI,data_red_clusters);
box('on')
xlabel('Time (hours  )')
ylabel('Gene index  ')
zlabel('Expression level  ')
xlim([0 21])
ylim([0 m])

%% Problem 2. Repeat the analysis, this time using ICA:

% Prior to running the following lines of code install the Toolbox RADICAL
% (http://people.cs.umass.edu/~elm/ICA/) in the TOOLBOXES directory, and
% add it to your path using 'addpath', i.e.:
% addpath(genpath('../TOOLBOXES/RADICAL'))

close all
clear data

addpath(genpath('../TOOLBOXES/DWINNEL_MI'))

% RADICAL is not added to the path by this script (see the comment above):
% stop here with an explicit message rather than failing later on an
% undefined function.
if isempty(which('RADICAL'))
    error(['RADICAL was not found on the MATLAB path. Install it in the ',...
        'TOOLBOXES directory and add it with addpath before running ',...
        'this section.']);
end

load filteredyeastdata
data = yeastvalues;

[m,n] = size(data);

% RADICAL is given the transposed data (one row per time point). Output is
% suppressed because S1 has as many columns as there are genes.
% NOTE(review): W1 is presumably the unmixing matrix (S1 = W1*data'), which
% makes A1 = inv(W1) the mixing matrix -- confirm against the RADICAL
% documentation.
[S1,W1] = RADICAL(data');
A1 = inv(W1);

% Transposed copies, so that S has one column per component and matches
% the orientation of data (S*A is m x n).
S = S1';
A = A1';

% Covariance between the component columns of S (small matrix, displayed
% on purpose).
covS = cov(S)

% Product of components and mixing coefficients, same size as data.
X = S*A;

%%
% The sign of each independent component is arbitrary. As done for the
% singular vectors, each column of S is flipped so that its mean is higher
% than its median, and the same sign is applied to the matching row of A so
% that the product S*A does not change.
% The loop uses its own index, leaving n untouched for the code below. If
% mean and median coincide sign() returns 0; the component is then left
% unchanged instead of being zeroed.
for j = 1:size(S,2)
    vec_sign = sign(mean(S(:,j))-median(S(:,j)));
    if vec_sign == 0
        vec_sign = 1;
    end
    S(:,j) = vec_sign*S(:,j);
    A(j,:) = vec_sign*A(j,:);
end

% Visual comparison of the original data, the product S*A, and the two
% factors.
X = S*A;
figure;imagesc(data)
figure;imagesc(X)
figure;imagesc(S)
figure;imagesc(A)

%%
% Contribution of the first component alone: outer product of its column
% of S with its row of A.
X_1 = S(:,1)*A(1,:);

% Same outer product for every component, stored as the pages of an
% m x n x n array (page c = contribution of component c).
X_all = zeros(m,n,n);
for comp = 1:n
    component_scores = S(:,comp);
    component_mixing = A(comp,:);
    X_all(:,:,comp) = component_scores*component_mixing;
end

% Show the contribution of the third component ...
third_contribution = X_all(:,:,3);
figure
imagesc(third_contribution)

% ... and the sum of the contributions of all the components.
summed_contributions = sum(X_all,3);
figure
imagesc(summed_contributions)

% Reference column of standard normal random numbers, with as many rows as
% S, appended to S as an extra last column.
gauss_vec = random('normal',0, 1,[size(S,1),1]);
S_tag = [S gauss_vec];

% Discretize every column of S_tag: each value is replaced by the index of
% the histogram bin it falls in.
% NOTE(review): internal.stats.histbins is an undocumented internal MATLAB
% function and may change or disappear between releases -- consider
% replacing it with a documented binning function after checking that the
% bin edges are equivalent.
S_binned = S_tag;
for j = 1:size(S_tag,2)
    [bins,edges] = internal.stats.histbins(S_tag(:,j));
    nbins = length(bins);

    % Bins are visited in increasing order, so a value lying exactly on an
    % interior edge ends up in the higher of the two adjacent bins.
    for i = 1:nbins
        ind = S_tag(:,j)>= edges(i) & S_tag(:,j)<= edges(i+1);
        S_binned(ind,j) = i;
    end
end

% NOTE(review): Entropy is not defined in this file; presumably it comes
% from the DWINNEL_MI toolbox added to the path earlier and returns one
% entropy value per column of its input -- confirm.
H_s = Entropy(S_binned)

% Difference between the entropy of the Gaussian reference column (last
% entry) and the entropy of each component.
J_s = H_s(end)-H_s(1:end-1)

% Variance of each column of A1.
J_a = var(A1)
% figure;plot(J_a,J_s,'-or')

% Rank score: weighted sum (weight c) of the two measures, each divided by
% its mean.
mean_J_s = mean(J_s)
mean_J_a = mean(J_a)
c = 0.5;
S_rank = c*J_s/mean_J_s + (1-c)*J_a/mean_J_a

[~,S_rank_ind] = sort(S_rank,'descend')  

% We can make a 'scree' plot of the S_rank to help us decide which IC to retain.

sumJ = cumsum(S_rank(S_rank_ind));
E = sumJ/sumJ(end);
relJ = S_rank(S_rank_ind)/sumJ(end);
figure;bar(relJ);
hold on
% Cumulative curve starting from 0; the x range follows the number of
% components instead of being hard-coded.
plot(0:numel(E),[0  E],'-r')
xlabel('IC index')
ylabel('Relative S\_rank ')
legend('Relative S\_rank ','% Total S\_rank','Location','Best') 
grid on


% a. Identify the genes corresponding to the top 20% expression levels in
% both types of analysis using only the 1st left singular vector and the 
% best IC.
% Note: U is the matrix of left singular vectors computed in Problem 1, so
% Problem 1 must have been run in the same session.

% Number of genes making up the top 20%.
top_expr_no = floor(0.2*m)

% Gene indices ordered by decreasing value of the best-ranked IC ...
best_ic = S_rank_ind(1);
[~,sort_expr_ind_ICA] = sort(S(:,best_ic),'descend');
top_genes_ICA = sort_expr_ind_ICA(1:top_expr_no);

% ... and by decreasing value of the 1st left singular vector.
[~,sort_expr_ind_SVD] = sort(U(:,1),'descend');
top_genes_SVD = sort_expr_ind_SVD(1:top_expr_no);

% b. Find which genes are in both sets of top expression levels (Hint: use
% the 'find' function or work with logical arrays).
common_top_ind = intersect(top_genes_SVD,top_genes_ICA)
genes(common_top_ind)

%% Remove from the path the toolboxes used only in this practice. 
% For example for Radical and Dwinnel_MI: 
% rmpath(genpath('../TOOLBOXES/RADICAL'));
% Only DWINNEL_MI is removed here because it is the only toolbox this
% script adds to the path itself; if RADICAL was added manually, uncomment
% the line above to remove it as well.
rmpath(genpath('../TOOLBOXES/DWINNEL_MI'))
