% Copyright (c) 2017, Domenico L. Gatti
% All rights reserved.
% 
% Redistribution and use in source and binary forms, with or without 
% modification, are permitted provided that the following conditions are 
% met:
% 
%     * Redistributions of source code must retain the above copyright 
%       notice, this list of conditions and the following disclaimer.
%     * Redistributions in binary form must reproduce the above copyright 
%       notice, this list of conditions and the following disclaimer in 
%       the documentation and/or other materials provided with the 
%       distribution
%       
% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
% IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
% THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
% PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
% CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
% EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
% PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
% PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
% LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
% NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
% SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%
%% General dependencies
% We always start from the CODE directory and we add to the path
% subdirectories containing various tools described in the book chapters.
addpath(genpath('../GENERAL_SCRIPTS_FUNCTIONS'));
% addpath(genpath('../DATABASE'));
% addpath(genpath('../TOOLBOXES'));

%% CHAPTER 12: SVD of microarray data

% Reset figures and workspace, then load the filtered yeast data set.
% The script reads 'yeastvalues' and 'times' from the loaded file (the
% commented-out export code below also refers to 'genes').
close('all')
clear
load('filteredyeastdata')
data = yeastvalues;
[m,n] = size(data);

% Optional export of the gene names, expression values and time points to
% plain text files (kept disabled, as in the original script):
% fileID = fopen('Yeast_gene_names.txt','w')
% formatSpec = '%s\n';
% for i = 1:m
% fprintf(fileID,formatSpec,genes{i,:});
% end
% fclose(fileID);
% 
% dlmwrite('Yeast_microarray_data.txt',yeastvalues,'\t')
% dlmwrite('Yeast_microarray_times.txt',times,'\t')

% Surface of the raw data matrix: time points along x, row index of
% 'data' along y, matrix entries along z.
row_index = 1:m;
[T_grid,G_grid] = meshgrid(times,row_index);
figure
surf(T_grid,G_grid,data)
box on
xlabel('Time (hours)')
ylabel('Gene index')
zlabel('Expression level')
xlim([0 21])
ylim([0 614])

% Economy-size SVD of the (uncentered) data matrix: data = U*S*V'.
% Column centering is left disabled, as in the original script:
% mean_data = repmat(mean(data,1),m,1);
% data = data - mean_data;
[U,S,V] = svd(data,'econ');

% The sign of singular vectors is arbitrary. For consistent results we
% change the sign of each vector so that the mean is higher than the
% median. Once the sign is calculated for the V's, it is applied also to
% the U's, so that the product U*S*V' is unchanged.
%
% A dedicated index 'k' is used instead of reusing 'n' (the number of
% columns of 'data'), and the loop bound is taken from V itself so that it
% always matches the number of singular vectors returned by svd.
for k = 1:size(V,2)
    vec_sign = sign(mean(V(:,k)) - median(V(:,k)));
    % sign() returns 0 when mean and median coincide; multiplying by 0
    % would wipe out the vector pair, so keep the current orientation.
    if vec_sign == 0
        vec_sign = 1;
    end
    U(:,k) = vec_sign*U(:,k);
    V(:,k) = vec_sign*V(:,k);
end

% Projection of the data rows on the right singular vectors. XV and US
% are the same matrix (data*V = U*S) computed in two ways.
XV = data*V;
US = U*S;

% 3D scatter of the first three columns of XV.
figure; plot3(XV(:,1),XV(:,2),...
    XV(:,3),'ro')
xlabel('Eigengene 1 scores   ')
ylabel('Eigengene 2 scores   ')
zlabel('Eigengene 3 scores   ')
xlim([-6 6])
grid('on')
box('on')

%%
% Start again from a clean workspace and reload the data.
close all
clear
load filteredyeastdata
data = yeastvalues;
[m,n] = size(data);

% Row centering is left disabled, as in the original script:
% mean_data = repmat(mean(data,2),1,n);
% data = data - mean_data;
[U,S,V] = svd(data,'econ');

% The sign of singular vectors is arbitrary. For consistent results we
% change the sign of each vector so that the mean is higher than the
% median. Once the sign is calculated for the U's, it is applied also to
% the V's, so that the product U*S*V' is unchanged.
%
% A dedicated index 'k' is used instead of reusing 'n' (the number of
% columns of 'data'), and the loop bound is taken from U itself so that it
% always matches the number of singular vectors returned by svd.
for k = 1:size(U,2)
    vec_sign = sign(mean(U(:,k)) - median(U(:,k)));
    % sign() returns 0 when mean and median coincide; multiplying by 0
    % would wipe out the vector pair, so keep the current orientation.
    if vec_sign == 0
        vec_sign = 1;
    end
    U(:,k) = vec_sign*U(:,k);
    V(:,k) = vec_sign*V(:,k);
end

% Projection of the data columns on the left singular vectors. UX and SV
% are the same matrix (U'*data = S*V') computed in two ways.
UX = U'*data;
SV = S*V';

% Three image panels in one tall figure: the data matrix X, the left
% singular vectors U, and the projection UX. The panel position vectors
% are echoed to the command window, as in the original script.
XUUTX = figure;
set(gcf,'Unit','Normalized','Position',[0.1 0.1 0.4 0.9]) 
pos1 = [0.05 0.05 .26 .9]
subplot('Position',pos1)
imagesc(data);
title('X')
pos2 = [0.37 0.05 .26 .9]
subplot('Position',pos2)
imagesc(U);
title('U')
pos3 = [0.69 0.75 .26 .2]
subplot('Position',pos3)
imagesc(UX);
title('UX')

%%
% Here we make a 'scree' plot of the squared singular values:
%   D    - squared singular values
%   relD - each squared singular value as a fraction of their total (bars)
%   E    - cumulative fraction of the total (red line, starting from 0)
D = diag(S).^2;
sumD = cumsum(D);
E = sumD/sumD(end);
relD = D/sumD(end);
figure;bar(relD);
hold on
% The x positions of the cumulative curve are derived from the number of
% singular values instead of being hard-coded to 0:7, so the plot also
% works when the data do not have exactly 7 columns.
plot(0:numel(E),[0 ; E],'-r')
xlabel('Sigma index')
ylabel('Relative Variance ')
legend('Relative Variance ','% Total Variance','Location','Best') 

% Rank reduced data: three equivalent ways of building the rank-2
% approximation of 'data', each displayed as a surface in its own figure.

% (1) Truncated SVD using the first two singular triplets.
data_red = U(:,1:2)*S(1:2,1:2)*V(:,1:2)';
% data_red = U(:,1:2)*S(1:2,1:2)*V(:,1:2)' + mean_data;

% (2) Projection of the data on the subspace spanned by the first two
% left singular vectors.
data_red_2 = U(:,1:2)*U(:,1:2)'*data;

% (3) Notice the same result would be obtained if we want to filter the
% data by selecting only the top two eigengenes, i.e. working on the
% transposed data matrix. The economy-size SVD is sufficient because only
% the first two singular triplets are used; the full SVD would build a
% square matrix with one row/column per gene that is never needed.
[Ut,St,Vt] = svd(data','econ');
data_red_3 = (Ut(:,1:2)*St(1:2,1:2)*Vt(:,1:2)')';
% data_red = U(:,1:2)*S(1:2,1:2)*V(:,1:2)' + mean_data;

% Same plot layout for the three reconstructions, in the order in which
% they were computed (one new figure each).
[XI,YI]=meshgrid(times,1:m);
reduced_sets = {data_red, data_red_2, data_red_3};
for k = 1:numel(reduced_sets)
    figure;surf(XI,YI,reduced_sets{k});
    box('on')
    xlabel('Time (hours  )')
    ylabel('Gene index  ')
    zlabel('Expression level  ')
    xlim([0 25])
    ylim([0 700])
    zlim([-5 5])
end


% However, we can filter for eigenassays or eigengenes by
% centering correctly: the mean is subtracted before the SVD and added
% back to the rank-2 reconstruction.
%
% The matrix dimensions are read directly from 'data' rather than from the
% workspace variables m and n, because n is also used as a loop index
% earlier in this script. The economy-size SVD is used because only the
% first two singular triplets enter the reconstruction; with the full SVD
% the left basis would be a square matrix with one column per gene.

% 1. Filtering by eigenassays (row means removed):
mean_data_2 = repmat(mean(data,2),1,size(data,2));
c_data_2 = data - mean_data_2;
[Uc2,Sc2,Vc2] = svd(c_data_2,'econ');
c_data_red_2 = Uc2(:,1:2)*Sc2(1:2,1:2)*Vc2(:,1:2)' + mean_data_2;
[XI,YI]=meshgrid(times,1:size(data,1));
figure;surf(XI,YI,c_data_red_2);
box('on')
xlabel('Time (hours  )')
ylabel('Gene index  ')
zlabel('Expression level  ')
xlim([0 25])
ylim([0 700])
zlim([-5 5])

% 2. Filtering by eigengenes (column means removed):
mean_data_1 = repmat(mean(data,1),size(data,1),1);
c_data_1 = data - mean_data_1;
[Uc1,Sc1,Vc1] = svd(c_data_1,'econ');
c_data_red_1 = Uc1(:,1:2)*Sc1(1:2,1:2)*Vc1(:,1:2)' + mean_data_1;
figure;surf(XI,YI,c_data_red_1);
box('on')
xlabel('Time (hours  )')
ylabel('Gene index  ')
zlabel('Expression level  ')
xlim([0 25])
ylim([0 700])
zlim([-5 5])

%%
% Plot the first two right singular vectors (eigengenes) against time and
% overlay a fitted curve on each of them.
V1 = V(:,1);
V2 = V(:,2);

figure
plot(times,V1,'ob',times,V2,'sr')
hold('on')

% Fine time axis on which both fitted models are evaluated.
t_fine = 0:0.1:20.5;

% First eigengene: single exponential. The fitted coefficients are echoed
% to the command window as U1.
exp_model = fittype('a*exp(b*x)');
[Expon,GOF] = fit(times',V1,exp_model,'StartPoint',[-0.01 0.2]);
U1 = coeffvalues(Expon)
plot(t_fine,Expon(t_fine),'--b')

% Second eigengene: sum of two sine terms. GOF is overwritten with the
% goodness of fit of this second model; the coefficients are echoed as U2.
sin_model = fittype('a1*sin(x*b1 + c1) + a2*sin(x*b2 + c2)');
[Cyclic,GOF] = fit(times',V2,sin_model,'StartPoint',[0.5 0 0 0.5 0 0]);

U2 = coeffvalues(Cyclic)
plot(t_fine,Cyclic(t_fine),'--r')

% Axis decoration; the legend entries follow the order in which the four
% lines were drawn.
xlabel('Time (hours) ')
ylabel('Eigengene loading')
xlim([0 21])
ylim([-1 1])
legend('V1','V2','V1 fit','V2 fit','Location','Best')
hold('on')

%% Correlation plot
% Correlation between each row of 'data' and each column of V (corr works
% column-wise, hence the transpose of 'data').
corrGV = corr(data',V);

% Open a dedicated figure for the 3D scatter. Without it, plot3 would be
% added to whatever axes are current; when the eigengene-fit section has
% just been run those axes are 2D and still in 'hold on' state, so the
% correlations would be drawn on top of the fit curves with the wrong
% limits and legend.
figure;
plot3(corrGV(:,1),corrGV(:,2),corrGV(:,3),'ob')
xlabel('Eigengene 1 correlation   ')
ylabel('Eigengene 2 correlation   ')
zlabel('Eigengene 3 correlation   ')
grid('on')
box('on')

%% ICA of microarray data
% Start from a clean workspace: close all figures and clear all variables.
close all
clear 

% Prior to running the following lines of code install the Toolboxes:
% FastICA (http://research.ics.aalto.fi/ica/fastica/) and RADICAL
% (http://people.cs.umass.edu/~elm/ICA/) in the TOOLBOXES directory, and
% add them to your path using 'addpath', i.e.:
% addpath(genpath('TOOLBOXES/FastICA'))
% addpath(genpath('TOOLBOXES/RADICAL'))
% NOTE(review): the cleanup at the end of this script removes
% '../TOOLBOXES/RADICAL_1_2' from the path, so the folder name and the
% relative location used above may need to be adapted - confirm against
% your local installation.

% Reload the data; m and n are the number of rows and columns of 'data'.
load filteredyeastdata
data = yeastvalues;
[m,n] = size(data);

% Alternative decomposition with FastICA (disabled):
% [S1, A1, W1] = fastica(data','lastEig', 5)
% [S1, A1, W1] = fastica(data')

% Decomposition of the transposed data with RADICAL. The statements below
% are deliberately left without a trailing semicolon, so each result is
% echoed to the command window.
% NOTE(review): presumably S1 holds the independent components as rows and
% W1 is the unmixing matrix - confirm against the RADICAL documentation.
[S1,W1] = RADICAL(data')
% A1 is the inverse of W1.
A1 = inv(W1)
% Transposes of S1 and A1, so that the product S*A below has the same
% orientation as 'data'.
S = S1'
A = A1'
% Covariance matrix of the columns of S.
covS = cov(S)
% Product of the two factors.
X = S*A;

%
close all

% The sign of each independent component is arbitrary. For consistent
% results the sign of each column of S is chosen so that its mean is
% higher than its median; the same sign is applied to the matching row of
% A, so that the product S*A is unchanged.
%
% A dedicated index 'k' is used instead of reusing 'n' (the number of
% columns of 'data'), and the loop bound is taken from S itself.
for k = 1:size(S,2)
    vec_sign = sign(mean(S(:,k)) - median(S(:,k)));
    % sign() returns 0 when mean and median coincide; multiplying by 0
    % would wipe out the component, so keep the current orientation.
    if vec_sign == 0
        vec_sign = 1;
    end
    % Echo the sign applied to this component, as in the original script.
    vec_sign
    S(:,k) = vec_sign*S(:,k);
    A(k,:) = vec_sign*A(k,:);
end

% Product of the sign-adjusted factors.
X = S*A;

% Three image panels in one tall figure: the data matrix X, the matrix S
% and the matrix A. The panel position vectors are echoed to the command
% window, as in the original script.
XSA = figure;
set(gcf,'Unit','Normalized','Position',[0.1 0.1 0.4 0.9]) 
pos1 = [0.05 0.05 .26 .9]
subplot('Position',pos1)
imagesc(data);
title('X')
pos2 = [0.37 0.05 .26 .9]
subplot('Position',pos2)
imagesc(S);
title('S')
pos3 = [0.69 0.75 .26 .2]
subplot('Position',pos3)
imagesc(A);
title('A')
                  
%%
% Contribution of the first component alone (outer product of the first
% column of S with the first row of A).
X_1 = S(:,1)*A(1,:);

% Stack of the contributions of every component: page 'comp' of X_all is
% the outer product of column 'comp' of S with row 'comp' of A.
X_all = zeros(m,n,n);
for comp = 1:n
    s_col = S(:,comp);
    a_row = A(comp,:);
    X_all(:,:,comp) = s_col*a_row;
end

% Image of the third contribution, then of the sum over all pages.
figure
imagesc(X_all(:,:,3))
figure
imagesc(sum(X_all,3))

%%
% Variance of each row of A, returned as a row vector and echoed to the
% command window.
J_a = var(A,0,2)'

% All S vectors have variance 1
% A standard normal reference column is appended to S. Its length is taken
% from S instead of being hard-coded to 614, so the concatenation below
% also works for data sets with a different number of rows.
gauss_vec = random('normal',0, 1,[size(S,1),1]);
S_tag = [S gauss_vec];
S_binned = S_tag;

% A single set of bin edges, computed from all the values of S_tag, is
% shared by every column. The call does not depend on the column being
% processed, so it is made once here rather than inside the loop; as a
% consequence nbins is echoed once instead of once per column.
% (Per-column binning alternative, disabled:)
% [bins,edges] = internal.stats.histbins(S_tag(:,j));
[bins,edges] = internal.stats.histbins(S_tag(:));
nbins = length(bins)

% Replace every value with the index of the bin it falls in. Both edges of
% each bin are inclusive and bins are visited in increasing order, so a
% value lying exactly on an interior edge ends up in the higher bin.
for j = 1:size(S_tag,2)
    for i = 1:nbins
        ind = S_tag(:,j)>= edges(i) & S_tag(:,j)<= edges(i+1);
        S_binned(ind,j) = i;
    end
end

%%
% 'Entropy' is expected to be found in the DWINNEL_MI toolbox folder added
% to the path here.
addpath(genpath('../TOOLBOXES/DWINNEL_MI'))

% Entropy of the binned columns. The last entry belongs to the appended
% Gaussian reference column, so J_s is the difference between the
% reference entropy and the entropy of each component.
% NOTE(review): assumes Entropy returns one value per column of its input
% as a row vector - confirm against the toolbox documentation.
H_s = Entropy(S_binned)
J_s = H_s(end)-H_s(1:end-1)
% figure;plot(J_a,J_s,'-or')

% Combined score: weighted sum (weight c) of J_s and J_a, each divided by
% its own mean. Intermediate results are echoed to the command window.
mean_J_s = mean(J_s)
mean_J_a = mean(J_a)
c = 0.5;
S_rank = c*J_s/mean_J_s + (1-c)*J_a/mean_J_a

% Component indices sorted by decreasing score.
[~,S_rank_ind] = sort(S_rank,'descend')  

% We can make a 'scree' plot of the S_rank to help us decide which IC to retain.
%   relJ - each sorted score as a fraction of the total (bars)
%   E    - cumulative fraction of the total (red line, starting from 0)
sumJ = cumsum(S_rank(S_rank_ind));
E = sumJ/sumJ(end);
relJ = S_rank(S_rank_ind)/sumJ(end);
figure;bar(relJ);
hold on
% The x positions of the cumulative curve are derived from the number of
% scores instead of being hard-coded to 0:7, so the plot also works when
% the number of components is not 7.
plot(0:numel(E),[0  E],'-r')
xlabel('IC index')
ylabel('Relative S\_rank ')
legend('Relative S\_rank ','% Total S\_rank','Location','Best') 
grid on
grid on

%%
% In addition to selecting only informative IC's, for each independent component in S
% a 'conservative' set of induced and repressed genes can optionally be determined
% by following an iterative procedure in which the gene with the largest absolute
% value is considered an outlier and excluded until all remaining values are
% situated within 4σ from their median.
                  
% S2 = S;
% zero_ind = abs(S)>4
% S2(zero_ind) = 0;

% Iterative outlier removal, one column of S at a time: as long as the
% largest absolute deviation from the column median exceeds 4, the entry
% with that largest deviation is set to 0 and the median is recomputed.
n_components = size(S,2);
for ic = 1:n_components
    while true
        deviation = abs(S(:,ic) - median(S(:,ic)));
        [worst_dev,worst_row] = max(deviation);
        if worst_dev > 4
            S(worst_row,ic) = 0;
        else
            % Every remaining entry is within 4 of the median.
            break
        end
    end
end

%%
% Rebuild the data from the four top-ranked components only: the selected
% columns of S are multiplied by the matching rows of A.
top_ics = S_rank_ind(1:4);
data_recov = S(:,top_ics)*A(top_ics,:);

% figure;imagesc(data)
% figure;imagesc(data_recov)

% Surface plot of the recovered data, with the same axis limits used for
% the rank-reduced surfaces earlier in the script.
[XI,YI] = meshgrid(times,1:m);
figure
surf(XI,YI,data_recov)
box('on')
xlabel('Time (hours  )')
ylabel('Gene index  ')
zlabel('Expression level  ')
xlim([0 25])
ylim([0 700])
zlim([-5 5])

%%
% Remove the toolbox folders (and their subfolders) from the MATLAB path.
toolbox_dirs = {'../TOOLBOXES/RADICAL_1_2', '../TOOLBOXES/DWINNEL_MI'};
for d = 1:numel(toolbox_dirs)
    rmpath(genpath(toolbox_dirs{d}));
end
