% Copyright (c) 2017, Domenico L. Gatti
% All rights reserved.
% 
% Redistribution and use in source and binary forms, with or without 
% modification, are permitted provided that the following conditions are 
% met:
% 
%     * Redistributions of source code must retain the above copyright 
%       notice, this list of conditions and the following disclaimer.
%     * Redistributions in binary form must reproduce the above copyright 
%       notice, this list of conditions and the following disclaimer in 
%       the documentation and/or other materials provided with the 
%       distribution
%       
% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
% IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
% THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
% PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
% CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
% EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
% PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
% PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
% LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
% NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
% SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%
%% General dependencies
% We always start from the CODE directory and we add to the path
% subdirectories containing various tools described in the book chapters.
addpath(genpath('../GENERAL_SCRIPTS_FUNCTIONS'));
addpath(genpath('../DATABASE'));
addpath(genpath('../TOOLBOXES/DWINNEL_MI'));
addpath(genpath('../TOOLBOXES/fastMI'));
addpath(genpath('../TOOLBOXES/kernelMI'));

%% CHAPTER 20: Information Theory
% Positional number systems: the same quantity (1013) is written in bases
% 10, 2, 8 and 16. A digit string is worth the inner product of its digits
% with the corresponding powers of the base.

% Base 10: digits 1 0 1 3 weighted by 10^3 ... 10^0
Decimal_1013 = [1 0 1 3]*transpose(10.^(3:-1:0))
% Base 2: ten binary digits weighted by 2^9 ... 2^0
Binary_1013 = [1 1 1 1 1 1 0 1 0 1]*transpose(2.^(9:-1:0))

% The same conversions using built-in functions (decimal <-> binary)
dec2bin(1013)
bin2dec('1111110101')

% Decimal -> octal and hexadecimal
dec2base(1013,8)
dec2base(1013,16)

% Octal, binary and hexadecimal strings -> decimal
base2dec('1765',8)
base2dec('1111110101',2)
base2dec('3F5',16)

%% Information of a single event
% Shannon information h = log2(1/p) of an event, plotted against the
% probability p of the event (p from 0.01 to 1 in steps of 0.01).
p = (0.01:0.01:1)';
h = log2(1./p);
plot(p,h,'-r')
xlabel('p(x)')
ylabel('Shannon information, h')
xlim([0 1.01])
ylim([-0.1 7])
grid on
box on

% Biased coin with p(head) = 0.8 and p(tail) = 0.2: information carried by
% each outcome.
h_head = log2(1/0.8)
h_tail = log2(1/0.2)

% Entropy of the coin: probability-weighted average of the two
% informations.
H = [0.8 0.2]*[log2(1/0.8); log2(1/0.2)]

%% Entropy of a 6-sided die
% We start by defining the 'Alphabet' (the possible symbols s on the faces
% of the die). These could be numbers, letters, or any other symbols:
s = ['A' '3' 'C' 'f' '2' 'w'];
ns = length(s);

% Total number of symbols in the realization of variable X
tobs = 1000;

% Each possible realization of variable X. The face index is drawn from
% 1..ns (not a literal 6) so that the alphabet can be changed freely.
ind = randi(ns,[tobs,1]);
X = s(ind);

% Number of observations of each x_i
nobs = zeros(1,ns);
for i = 1:ns
    nobs(i) = sum(X==s(i));
end

% or simply (the explicit output size [ns 1] guarantees one count per
% symbol even if the last symbols of the alphabet were never drawn):
nobs = accumarray(ind,ones(tobs,1),[ns 1]);
nobs = accumarray(ind,1,[ns 1]);

% Frequency (probability) of each symbol
p = nobs/tobs;

% Information associated with each x_i must decrease with increasing
% probability and increase with decreasing probability of each x_i.
% Therefore the obvious choice is:

% I = 1./p

% Information should go down to 0 when the probability is 1. Therefore, two
% choices:

% I1 = log2(1./p);
% I2 = 1 - 1./p;

% But the information provided by each x_i can't be negative: that rules
% out I2. Thus:
% I = log2(1./p)

% Finally we add up all the information provided by each x_i of X, and
% derive the average information content of X, the Entropy (H(X)).
% px holds, for every observation, the frequency of the symbol observed.
px = p(ind);

H = sum(log2(1./px))/tobs
H_obs = mean(log2(1./px))
H_theor = log2(ns)

% We can alternatively express this result as a sum of the probability of
% each symbol using the vector '1/p' instead of the vector '1/px'. In that
% case, we remove the division by the total number of observations, and we
% multiply by the fraction 'p' of each symbol x in X:
H = sum(p.*log2(1./p))  
H = -sum(p.*log2(p))

% If we use log2, the result is in bits or 'shannons', with the 'shannon'
% being the unit of information.

%% Entropy of biased coins:
% Simulates nflips flips of coins whose probability of heads ranges from
% 0.01 to 0.99, and computes the entropy of each coin in three ways: from
% the individual observations, from the observed frequencies, and from
% the probabilities known a priori.
clear, clc
% number of flips
nflips = 1000;

% bias
bias = [0.01:0.01:0.99];
nbias = length(bias);

% observed outcome (one Bernoulli draw per flip and per coin)
binary_mat = zeros(nflips,nbias);
for j = 1:nbias
    for i = 1:nflips
        binary_mat(i,j) = binornd(1,bias(j));
    end
end

% or simply (one column of nflips draws per coin)
for i = 1:nbias
    binary_mat(:,i) = binornd(1,bias(i),[nflips 1]);
end

% Observed frequencies of heads and tails
fr_1 = sum(binary_mat,1)/nflips;
fr_0 = 1-fr_1;

% Precalculated values of log2(1/frequencies)
log_fr1 = log2(1./fr_1);
log_fr0 = log2(1./fr_0);

% An outcome that was never observed has frequency 0 and log2(1/0) = Inf;
% multiplying by its zero count would give NaN. By the usual convention
% 0*log2(1/0) = 0, so the corresponding log term is set to 0.
log_fr1(fr_1 == 0) = 0;
log_fr0(fr_0 == 0) = 0;

% Entropy: sum of the information of every single observation, divided by
% the number of observations
sum_h0 = zeros(1,nbias);
sum_h1 = zeros(1,nbias);
H = zeros(1,nbias);
for i = 1:nbias
    sum_h0(i) = sum((binary_mat(:,i) == 0))*log_fr0(i);
    sum_h1(i) = sum((binary_mat(:,i) == 1))*log_fr1(i);
    H(i) = (sum_h0(i) + sum_h1(i))/nflips;
end

Coins_Entropy_1 = figure;
plot(fr_1,H,'Marker','.','MarkerSize',20,'MarkerFaceColor','r','MarkerEdgeColor','r');
ylim([0 1.1])
xlabel('coin bias, observed frequency of heads'),ylabel('Entropy, H(bits)')
grid on, box on
title('Entropy of biased coins')

% Alternative entropy calculation using frequencies
h0 = fr_0.*log_fr0;
h1 = fr_1.*log_fr1;
H = h0 + h1;

% or simply:
H_fr = sum([fr_0.*log_fr0;fr_1.*log_fr1]);
Coins_Entropy_2 = figure;
plot(fr_1,H_fr,'Marker','.','MarkerSize',20,'MarkerFaceColor','r','MarkerEdgeColor','r');
ylim([0 1.1])
xlabel('coin bias, observed frequency of heads'),ylabel('Entropy, H(bits)')
grid on, box on
title('Entropy of biased coins')

% or probabilities known 'a priori'
H_pr = -sum([(1-bias).*log2(1-bias);bias.*log2(bias)]);
Coins_Entropy_3 = figure;
plot(bias,H_pr,'Marker','.','MarkerSize',20,'MarkerFaceColor','r','MarkerEdgeColor','r');
ylim([0 1.1])
xlabel('coin bias, probability of heads'),ylabel('Entropy, H(bits)')
grid on, box on
title('Entropy of biased coins')

% Entropy of a uniform distribution
% Many random 10-outcome distributions are generated; the entropy of each
% is stored and later compared with the entropy of the uniform
% distribution over 10 outcomes.
ntrial = 1000000;
H = zeros(ntrial,1);
for i = 1:ntrial
    % random positive integer weights, normalized to sum to 1
    f = randi(10,10,1);
    f = f/sum(f);
    H(i) = -sum(f.*log2(f));
end

% Largest entropy found among the random distributions
H_rnd = max(H)

% Entropy of the uniform distribution
f_unif = ones(1,10)/10;
H_unif = -sum(f_unif.*log2(f_unif))

% Histogram of the random entropies, with the uniform-distribution
% entropy marked by a red vertical line
his = histogram(H)
his.EdgeColor = [0 0 1]
his.FaceColor = [0 0 1]
hold on
grid on
plot([H_unif H_unif],[0 max(his.Values)],'-r', 'LineWidth', 3)
% vline is not a MATLAB built-in; presumably one of the helper functions
% added to the path at the top of the script.
vline(H_unif,{'-r', 'LineWidth', 3})
legend('H random distr.','H uniform distr.','Location','Best') 
xlabel('Entropy, H');
ylabel('No. of X trial vectors')

%% Entropy of a 6 and 12 sided die
% For m equally probable outcomes the entropy is H = log2(m); conversely
% m = 2^H and the probability of each side is 2^(-H).
m = 6
H6 = log2(m)
m = power(2,H6)
p_side = power(2,-H6)

% Doubling the number of sides adds one bit of entropy
m = 12
H12 = log2(m)

%% Entropy of a message
% Consider the case of a string of symbols picked randomly to form a
% 'message' X:
s = ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H']
ns = length(s)
tobs = 80
ind = randi(ns,[tobs,1]);
X = s(ind);
% One count per symbol of the alphabet (explicit size, so that symbols
% that never occur still get a zero count)
nobs = accumarray(ind,1,[ns 1]);

% Frequency (probability) of each symbol
p = nobs/tobs;

% Entropy. px is the frequency of the symbol seen at each position.
px = p(ind)    
H = sum(log2(1./px))/tobs
% or, summing over the symbols that actually occur (a symbol with p = 0
% contributes nothing, but 0*log2(1/0) would evaluate to NaN)
pos = p > 0;
H = sum(p(pos).*log2(1./p(pos)))  

% Now consider a UNIFORM distribution of the same symbols. 'xobs' is now
% the number of times that each symbol must appear in the message:
s1 = ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H']
ns1 = length(s1)
tobs = 80
xobs1 = tobs/ns1

% The message is obtained by stringing together xobs1 random permutations
% of the symbol indices: each symbol then appears exactly xobs1 times.
% (randperm draws each permutation directly, without first enumerating
% all the ns1! possible permutations.)
X1ind = zeros(xobs1,ns1);
for k = 1:xobs1
    X1ind(k,:) = randperm(ns1);
end
X1ind = X1ind(:)
nobs1 = accumarray(X1ind,1,[ns1 1]);

% Frequency (probability) of each symbol
p1 = nobs1/tobs;

% Entropy:
H1 = sum(p1.*log2(1./p1))  

% Which is exactly what we expected as:
H1 = log2(ns1)

% Finally, consider a UNIFORM distribution of the same symbols plus two new
% ones. The total length of the message does not change, and thus 'xobs',
% the number of times that each symbol appears in the message, is smaller
% (8 instead of 10):
s2 = ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J']
ns2 = length(s2)
tobs = 80
xobs2 = tobs/ns2

% Same construction: xobs2 random permutations of the ns2 symbol indices
X2ind = zeros(xobs2,ns2);
for k = 1:xobs2
    X2ind(k,:) = randperm(ns2);
end
X2ind = X2ind(:)
nobs2 = accumarray(X2ind,1,[ns2 1]);

% Frequency (probability) of each symbol
p2 = nobs2/tobs;

% Entropy:
H2 = sum(p2.*log2(1./p2))  

% Which is exactly what we expected as:
H2 = log2(ns2)

% In conclusion two messages, X1 and X2 have the same length of 80
% characters.

X1 = s1(X1ind)
X2 = s2(X2ind)
%
% X1 = DFDGDHHCCAAGCDADHHBBBFDGGDCFCBBHHAFGCBFEEDHCEGGAHAGAECCEFEBEHGBFDEFDEAHCBAFEFBGA
% X2 = HGFAEAGAAGAFDCBDIJDIFGDEJGHFBABJJDDJFJGAFBCFJIEBHDCCCIHFGHBEIHCAIHCDBEEEBICGEJIH
%
% But each time X1 is observed it appears as 1 of 8 equally probable
% symbols, while each time X2 is observed it appears as 1 of 10 equally
% probable symbols. Thus, the entropy of variable X2 is H2 = log2(10) =
% 3.32 bits, larger than the entropy of variable X1, H1 = log2(8) = 3.0
% bits. In other words, each time we see a realization of X2 as 1 of 10
% equally probable letters we learn 3.32 bits of information, while each
% time we see a realization of X1 as 1 of 8 equally probable letters we
% learn only 3.0 bits of information. When receiving a message, the average
% amount of information we would get each time a letter of the message is
% revealed to us would be less if the letters were not equally probable.

%% Information Entropy and Thermodynamic Entropy.

% The maximum value of H(X) is attained when all the x_i are equally
% probable. Each x_i can be regarded as a possible 'microstate' of X, and
% in that case H is the log of the number of equally probable x_i
% ('ns' is the alphabet size set in the previous section):

Hmax = log2(ns)

% Notice the similarity with the thermodynamic Boltzmann entropy:

% S = k log(W)

% where k is the Boltzmann constant and W is the number of equally
% probable microstates of the system. Thus, the entropy of a macrostate (a
% state that from the outside always looks the same) is the log of the
% number of equally probable microstates that can give origin to that
% macrostate.

% For example, consider 3 possible macrostates produced by microstates
% associated with 4 particles, two of which (1 and 2) have low velocity and
% produce a cold environment and two of which (3 and 4) have high velocity
% and produce a hot environment. Each particle occupies one box in a row
% of 4 boxes. The following are all the possible equally probable
% 'microstates':

micro = perms(1:4)

% >
%      4     3     2     1
%      4     3     1     2
%      4     2     3     1
%      4     2     1     3
%      4     1     3     2
%      4     1     2     3
%      3     4     2     1
%      3     4     1     2
%      3     2     4     1
%      3     2     1     4
%      3     1     4     2
%      3     1     2     4
%      2     4     3     1
%      2     4     1     3
%      2     3     4     1
%      2     3     1     4
%      2     1     4     3
%      2     1     3     4
%      1     4     3     2
%      1     4     2     3
%      1     3     4     2
%      1     3     2     4
%      1     2     4     3
%      1     2     3     4
     
% Now we make particles 1 and 2 identical (same low velocity) by
% relabelling 2 as 1, and particles 3 and 4 identical (same high velocity)
% by relabelling 4 as 3. After each step we list the distinct rows left.
micro(micro == 2) = 1
unique(micro,'rows')
micro(micro == 4) = 3
unique(micro,'rows')

% >
%      1     1   |   3     3      Macrostate I
%      ---------------------
%      1     3   |   1     3
%      1     3   |   3     1      Macrostate II
%      3     1   |   1     3
%      3     1   |   3     1
%      ---------------------
%      3     3   |   1     1      Macrostate III
     
% Drawing lines between columns and rows we recognize 3 'macrostates': two
% (I and III) in which one side of the chamber is 'cold' and the other
% side is 'hot', and one (II) in which both sides of the chamber have the
% same temperature. Each row of this 'macrostates' table derives from
% merging 4 'microstates' (24 microstates / 6 distinct rows). Thus,
% 'macrostates I and III' are each produced by 4 different 'microstates',
% while 'macrostate II' is produced by 16 different 'microstates'. Since
% all the microstates are equally probable, 'macrostate II', which can be
% produced by the larger number of microstates (SII = log(16)), is more
% probable than 'macrostates I or III', which can be produced by a smaller
% number of microstates (SI=SIII=log(4)). That is why in a room the
% temperature will tend to be uniform everywhere (maximum entropy), rather
% than being hot in half of the room and cold in the other half (low
% entropy).

%% Coding Efficiency
% A message of 9000 symbols drawn from a 6-symbol alphabet is encoded in
% binary in two ways: one symbol per codeword (3 binary digits each) and
% three symbols per codeword (8 binary digits each). The information
% content of the message is then compared with that of the encodings.
clear, clc
m = 6;
H6 = log2(m);
L6 = ceil(H6);
E6 = H6/L6;

% Optimal encoding
A = {'S' '1' 'g' '23' 'T' '7'}
A_ind = 1:6;
ind = randi(6,[1,9000]);
M = A(ind);
nm = length(M);

% Message entropy and total information content
% (Entropy is not a MATLAB built-in; presumably provided by one of the
% toolboxes added to the path at the top of the script.)
H_M = Entropy(ind')
Info_M = H_M*nm

% Encoding based on 1 symbol/codeword (9000 codewords) and 3 binary
% digits/codeword
Cb_A = A';
n_Cb = size(Cb_A,1);
Cb_b = dec2bin((1:n_Cb)');
Lx = ceil(log2(n_Cb));

% Index of the codebook entry matching each symbol of the message
b_ind_1 = zeros(nm,1);
for i = 1:nm
    b_ind_1(i) = find(strcmp(Cb_A,M{i}));
end
% Encoded message: the binary codewords laid end to end, in message order
M_b_1 = reshape(Cb_b(b_ind_1,:)',1,[]);

% Encoded message entropy and total information content
p_ones = sum(M_b_1 == '1',2)/length(M_b_1);
p_zeros = 1 - p_ones;
H_1 = - [p_zeros p_ones]*[log2(p_zeros) log2(p_ones)]'
Info_1 = H_1*length(M_b_1)
H_1_codewords = Entropy(b_ind_1)

% Encoding based on 3 symbols/codeword (3000 codewords) and 8 binary
% digits/codeword.
% The codebook lists all 6^3 = 216 ordered triples of symbol indices in
% lexicographic order (first column varying slowest). It is enumerated
% directly so that no triple can be missing.
[c3,c2,c1] = ndgrid(1:numel(A));
Cb_ind = [c1(:) c2(:) c3(:)];
n_Cb = size(Cb_ind,1);
Lx = ceil(log2(n_Cb));
Cb_A = A(Cb_ind);
Cb_A_string = cell(1,n_Cb);
for i = 1:n_Cb
    Cb_A_string{i} = [Cb_A{i,:}];
end
Cb_b = dec2bin((1:n_Cb)');

% The message is cut into consecutive, NON-overlapping triples:
% symbols 1-3, 4-6, 7-9, ... Both ends of the window advance by 3.
nwords = floor(nm/3);
b_ind_2 = zeros(nwords,1);
for i = 1:nwords
    s_start = 3*(i-1) + 1;
    s_end = 3*i;
    s = M(s_start:s_end);
    s_string = [s{:}];
    b_ind_2(i) = find(strcmp(Cb_A_string,s_string));
end
M_b_2 = reshape(Cb_b(b_ind_2,:)',1,[]);

% Encoded message entropy and total information content
p_ones = sum(M_b_2 == '1',2)/length(M_b_2);
p_zeros = 1 - p_ones;
H_2 = - [p_zeros p_ones]*[log2(p_zeros) log2(p_ones)]'
Info_2 = H_2*length(M_b_2)
H_2_codewords = Entropy(b_ind_2)

% Alternative determination of entropy
% p = accumarray(b_ind_2,1)/length(b_ind_2);
% H_2_codewords = -p'*log2(p)

% Encoding of a 7-symbol alphabet (m and H7 are used in the next section)
m = 7;
H7 = log2(m);
L7 = ceil(H7);
E7 = H7/L7;

% Source coding theorem: ratio between the information content of the
% message and that of each binary encoding
Info_M/Info_1
Info_M/Info_2

%% Channel Capacity in binary digits/second
% Coding efficiency and transmission rate when i = 1..N source symbols are
% combined into a single input. 'm' and 'H7' are not set here: they are
% the values left in the workspace by the previous section.
C = 1;
N = 40;

m_comb = zeros(1,N);    % number of distinct inputs of i combined symbols
H_comb = zeros(1,N);    % entropy of an input (bits)
L_comb = zeros(1,N);    % binary digits needed to encode an input
E_comb = zeros(1,N);    % coding efficiency (bits/binary digit)
R_comb = zeros(1,N);    % coding efficiency divided by H7
for i = 1:N
    m_comb(i) = m^i;
    H_comb(i) = log2(m_comb(i));
    L_comb(i) = ceil(H_comb(i));
    E_comb(i) = H_comb(i)/L_comb(i);
    R_comb(i) = E_comb(i)/H7;
end

Coding_Efficiency = figure;
set(Coding_Efficiency,'Units','normalized','Position',[0.2 0.2 0.6 0.4],...
    'Name','Coding Efficiency')
clf

% Panel 1: coding efficiency
subplot(1,3,1)
plot(1:N,E_comb,'-',1:N,E_comb,'or')
xlabel('No. of combined symbols in each input')
ylabel('Coding Efficiency, E (bits/binary digit)')
grid on
box on

% Panel 2: number of possible inputs (log scale)
subplot(1,3,2)
semilogy(1:N,m_comb,'-',1:N,m_comb,'or')
xlabel('No. of combined symbols in each input')
ylabel('Total number of inputs, m = 7^N^C^O^M^B')
grid on
box on

% Panel 3: transmission rate, with a horizontal line at C/H7
% (hline is not a MATLAB built-in; presumably a helper on the path)
subplot(1,3,3)
plot(1:N,R_comb,'-',1:N,R_comb,'or')
hline([C/H7 C/H7]);
xlabel('No. of combined symbols in each input')
ylabel('Trasmission rate, E/H(s) (symbols/s)')
grid on
box on

%% Noiseless channel theorem
% Draws many random binary words of n digits, each digit being 1 with
% probability p, and tabulates how often each distinct word and each
% number of 1s per word occurs.
n = 8;              % binary digits per word
p = 0.125;          % probability that a digit is 1
nwords = 10000;     % number of random words

% One Bernoulli draw per digit
s_mat = binornd(1,p,[nwords n]);

% Distinct words: ia = row of first occurrence, ic = index of the distinct
% word corresponding to each row of s_mat
[s_mat_unique,ia,ic] = unique(s_mat,'rows');
nunique = size(s_mat_unique,1);

% Number of occurrences of each distinct word
ia_freq = accumarray(ic(:),1,[nunique 1]);

% Number of 1s in each distinct word
ia_nones = sum(s_mat_unique,2);

ia_table = [ia,ia_nones,ia_freq];

% Table of [number of 1s, frequency], most frequent word first
[~,sort_ind] = sort(ia_table(:,3),'descend');
freq_table = ia_table(sort_ind,2:3);

% Distinct words containing exactly one 1
unique_one1 = find(ia_nones == 1);
sum(unique_one1)

% All the observed words with exactly one 1, grouped by distinct word.
% The loop runs over the single-1 words actually observed (at most n).
A = [];
for i = 1:numel(unique_one1)
    ic_ind = ic == unique_one1(i);
    A = [A; s_mat(ic_ind,:)];
end
% Column sums: how many times the single 1 fell on each digit position
% (dimension given explicitly so a one-row A is not summed to a scalar)
p_1s = sum(A,1)

% Rows of s_mat containing 0, 1, ..., 8 ones
nones_all = sum(s_mat,2);
total_0s = find(nones_all == 0);
total_1s = find(nones_all == 1);
total_2s = find(nones_all == 2);
total_3s = find(nones_all == 3);
total_4s = find(nones_all == 4);
total_5s = find(nones_all == 5);
total_6s = find(nones_all == 6);
total_7s = find(nones_all == 7);
total_8s = find(nones_all == 8);

% Distinct words containing 2, ..., 8 ones
unique_2s = find(ia_nones == 2);
unique_3s = find(ia_nones == 3);
unique_4s = find(ia_nones == 4);
unique_5s = find(ia_nones == 5);
unique_6s = find(ia_nones == 6);
unique_7s = find(ia_nones == 7);
unique_8s = find(ia_nones == 8);



%% Noisy channel theorem
% We have 16 different symbols encoded as 4 bits codewords.
clear, clc

% Number of bits in each codeword.
nbits = 4;

% Symbols: all the values that can be represented with nbits bits.
S = (0:2^nbits-1)';
ns = length(S);

% Codewords: we use the function dec2binvector to obtain a uniform
% representation of all symbols with 4 bits codewords. The unary plus
% converts the output to double (presumably dec2binvector returns a
% logical vector -- confirm against the helper).
codewords = zeros(ns,nbits);
for i = 1:ns
    codewords(i,:) = +dec2binvector(S(i),nbits);        
end

% Here we produce a message of 1000 symbols (out of the 16 possible) and
% encode the message as 1000 codewords: row i of X is the codeword of the
% i-th symbol of the message.
nsymbols = 1000;
message = randi(ns,nsymbols,1);
X = codewords(message,:);

% Here we simulate a noisy channel with an error rate of 10% in flipping
% bits: flip_mat is true where a bit gets flipped.
p_flip = 0.1;
flip_mat = logical(binornd(1,p_flip,[nsymbols nbits]));

% X and Y are the input to and output from the channel. 
Y = X;
Y(flip_mat) = 1 - X(flip_mat);

% Here we see which symbols changed as a consequence of a flip: column 1
% holds the codeword index sent, column 2 the codeword index received.
XY_flips = zeros(nsymbols,2);
[~,XY_flips(:,1)] = ismember(X,codewords,'rows');
[~,XY_flips(:,2)] = ismember(Y,codewords,'rows');

% Here we count for each input x the times that a specific output y was
% transmitted.
Joint_counts = zeros(ns,ns);
for i = 1:ns
    for j = 1:ns
        Joint_counts(i,j) = sum(XY_flips(:,1) == i & XY_flips(:,2) == j);
    end
end

% Alternatively we can use MATLAB hist3 command:
% NOTE(review): with a bin count only, hist3 spreads the bins between the
% smallest and largest observed value, so this matches the loop above only
% when both symbol 1 and symbol ns occur in each column of XY_flips.
Joint_counts = hist3(XY_flips,[ns ns]);

% Or MATLAB accumarray command (the explicit size keeps the matrix
% ns-by-ns even if the highest symbols never occur):
Joint_counts = accumarray(XY_flips,1,[ns ns]);

% Here we calculate the marginal and total sums.
Sum_X = sum(Joint_counts,2);
Sum_Y = sum(Joint_counts,1);
Sum_XY = sum(Joint_counts(:));
% Joint counts bordered by the marginal sums (last column: row sums; last
% row: column sums; bottom-right corner: grand total)
Joint_counts_table = [[Joint_counts Sum_X];[Sum_Y Sum_XY]];

% We represent the joint counts as a histogram.
Joint_counts_hist = figure;
hist3(XY_flips,[ns ns]);
zlim([0.01 60]); xlabel('X');ylabel('Y');zlabel('Counts');box on; grid on

% or as a heat map:
% imagesc(Joint_counts); xlabel('X');ylabel('Y')

% or as scatterhist plot with the marginal counts on the sides
% scatterhist(XY_flips(:,1),XY_flips(:,2),'NBins',[ns,ns])

% Here we make a table of the joint counts. Rows are inputs x, columns are
% outputs y; the extra last column holds the row sums and the extra last
% row holds the column sums.
cnames = {'y1','y2','y3','y4','y5','y6','y7','y8','y9','y10',...
    'y11','y12','y13','y14','y15','y16','SUM X'};
rnames = {'x1','x2','x3','x4','x5','x6','x7','x8','x9','x10',...
    'x11','x12','x13','x14','x15','x16','SUM Y'};
cformat = {'char'};

Joint_xy_counts = figure('Units','normalized','Position',[0.25 0.4 .75 .4]);
t = uitable(Joint_xy_counts,'Data',Joint_counts_table,...
            'ColumnName',cnames,... 
            'RowName',rnames,'FontWeight','bold','FontSize',16,...
            'RowStriping','off','ColumnEditable',true);
 
% Resize the table to fit its content
t.Position(3) = t.Extent(3);
t.Position(4) = t.Extent(4);

% Here we make a table of the joint probability distribution
Joint_prob_table = Joint_counts_table/Sum_XY;
cnames = {'y1','y2','y3','y4','y5','y6','y7','y8','y9','y10',...
    'y11','y12','y13','y14','y15','y16','p(X)'};
rnames = {'x1','x2','x3','x4','x5','x6','x7','x8','x9','x10',...
    'x11','x12','x13','x14','x15','x16','p(Y)'};
cwidth = {90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90};
Joint_xy_prob = figure('Units','normalized','Position',[0.15 0.4 .85 .4]);
t = uitable(Joint_xy_prob,'Data',Joint_prob_table,...
            'ColumnName',cnames,'RowName',rnames,'ColumnWidth',cwidth,... 
            'FontWeight','bold','FontSize',16,...
            'RowStriping','off','ColumnEditable',true);
 
t.Position(3) = t.Extent(3);
t.Position(4) = t.Extent(4);

% Here we create a MATLAB Table object. The names with parentheses used
% above are not valid table variable names, so the margins are renamed:
% the last COLUMN holds the row sums, i.e. p(X), and the last ROW holds
% the column sums, i.e. p(Y).
cnames = {cnames{1:ns} 'pX'};
rnames = {rnames{1:ns} 'pY'};
T = array2table(Joint_prob_table,'VariableNames',cnames,'RowNames',rnames);

% Here we calculate the joint entropy as a dot product. Notice that we use
% only the indices corresponding to non-zero elements of the matrix
% (the margins of the table are dropped first).
Joint_prob = Joint_prob_table(1:ns,1:ns);
ind = find(Joint_prob);
H_xy = -Joint_prob(ind)'*log2(Joint_prob(ind));

% or using DWINNEL_MI Toolbox
H_xy = JointEntropy(XY_flips);

% Marginal probabilities (p_X: column vector over inputs; p_Y: row vector
% over outputs)
p_X = sum(Joint_prob,2);
p_Y = sum(Joint_prob,1);
p_XY = sum(Joint_prob(:));

% Joint probability from outer product of the marginal probabilities.
Joint_prob_from_product = p_X * p_Y;

% We compare the two observed and calculated joint probability
% distributions:
Joint_prob_calc = figure;
set(Joint_prob_calc,'Units','normalized','Position',[0.4 0.2 0.5 0.8],...
    'Name','Observed vs calculated Joint Prob. Distr.')
subplot(2,2,1)
bar(p_X);xlim([0 ns+1]);xlabel('x index');ylabel('p(x_i)');title('p(X)')
subplot(2,2,2)
bar(p_Y);xlim([0 ns+1]);xlabel('y index');ylabel('p(y_j)');title('p(Y)')
% imagesc draws the matrix rows along the vertical axis and the columns
% along the horizontal axis; rows are x and columns are y.
subplot(2,2,3)
imagesc(Joint_prob); xlabel('y index');ylabel('x index');box on;
title('Observed p(X,Y)')
subplot(2,2,4)
imagesc(Joint_prob_from_product); xlabel('y index');ylabel('x index');box on;
title('Calculated p(X,Y)')

% If X and Y are statistically independent H(x,y) = H(x) + H(y)
H_xy_prod = -Joint_prob_from_product(:)'*log2(Joint_prob_from_product(:));
H_x = -p_X'*log2(p_X);
H_y = -p_Y*log2(p_Y');
H_xy_prod = H_x + H_y;

% Mutual Information of observed distribution
I_xy = H_x + H_y - H_xy;

% Mutual Information of product distribution
ind = find(Joint_prob_from_product);
H_xy_prod = -Joint_prob_from_product(ind)'*log2(Joint_prob_from_product(ind));
I_xy_prod = H_x + H_y - H_xy_prod;

% Conditional probability p(x_i | y_j) = p(x_i,y_j)/p(y_j), computed from
% the counts and, equivalently, from the probabilities
i = 3;j = 4;
p_x3_given_y4 = Joint_counts(i,j)/Sum_Y(j);
p_x3_given_y4 = Joint_prob(i,j)/p_Y(j);

% Conditional entropy
H_xgy = H_x - I_xy
H_ygx = H_y - I_xy

% Relationship between different entropies in a noisy channel
% Each row of bar_y is one stacked bar; each column is one coloured
% segment. Bar 1: H(X,Y). Bar 2: H(Y) + H(X|Y). Bar 3: H(Y|X) + H(X).
% Bar 4: H(Y|X) + I(X;Y) + (H(X) - I(X;Y)).
Entropy_relationships = figure;
bar_y = [H_xy 0   0     0     0   0     0    0; ...
         0    H_y H_xgy 0     0   0     0    0; ...
         0    0   0     H_ygx H_x 0     0    0; ...
         0    0   0     0     0   H_ygx I_xy H_x-I_xy ]
barplot = bar(bar_y,'stacked');

% One face colour per segment, in column order
seg_colors = {'m','g','y','y','c','w','r','w'};
for k = 1:numel(seg_colors)
    barplot(k).FaceColor = seg_colors{k};
end

set(gca,'FontSize',14,'LineWidth',1,'XTick',[1 2 3 4],'XTickLabel',...
    {'','','',''});
ylabel('bits');
legend('H(X,Y)','H(Y)','H(X|Y)','H(Y|X)','H(X)','','I(X;Y)','','Location','northeastoutside')

% Store the whole workspace in the file Noiy_channel.mat
save('Noiy_channel')
%% Capacity of a noisy channel
% Uses X, Y, XY_flips and p_flip from the previous section. Entropy and
% MutualInformation are not MATLAB built-ins (presumably from the
% DWINNEL_MI toolbox added to the path at the top of the script).

% Direct calculation of the channel noise: all the transmitted and
% received bits are treated as one long binary sequence
X_linear = reshape(X,[],1);
Y_linear = reshape(Y,[],1);
H_x_linear = Entropy(X_linear);
H_y_linear = Entropy(Y_linear);
I_xy_linear = MutualInformation(X_linear,Y_linear); 
H_noise_bit_X = H_x_linear - I_xy_linear;
H_noise_bit_Y = H_y_linear - I_xy_linear;

% We can calculate the expected noise entropy also from the anticipated
% probability of a flip at each symbol.
H_noise_flip = p_flip*log2(1/p_flip) + (1-p_flip)*log2(1/(1-p_flip))

% but proper encoding decreases the noise entropy. Here the entropies are
% computed on whole codewords and divided by 4, the number of bits in
% each codeword:
H_x = Entropy(XY_flips(:,1))
H_y = Entropy(XY_flips(:,2))
I_xy = MutualInformation(XY_flips(:,1),XY_flips(:,2))
H_Noise_Y = (H_y - I_xy)/4
H_Noise_X = (H_x - I_xy)/4

% Store the whole workspace (overwrites the file saved earlier)
save('Noiy_channel')

%% SPECIAL TOPIC: Continuous variables (differential entropy and MI)
%% Various methods to determine the MI of continuous variables
addpath(genpath('../GENERAL_SCRIPTS_FUNCTIONS'));
addpath(genpath('../TOOLBOXES/fastMI'));
addpath(genpath('../TOOLBOXES/KernelMI'));

%% MI Using standard formula 
% MI Using standard formula on marginal and joint densities derived from
% ksdensity. 
clear, clc, close all

% Number of samples drawn from each source
nsamp = 1000;

% Two independent uniform sources, linearly mixed by the matrix A
pd1 = makedist('uniform','lower',-3,'upper',3);
s1 = random(pd1,1,nsamp);
pd2 = makedist('uniform','lower',-2.5,'upper',2.5);
s2 = random(pd2,1,nsamp);
S = [s1;s2];
A = [2 3;2 1];
X = A*S;

% Centering: subtract from each row its mean
mX = mean(X,2);
cX = X - repmat(mX,1,nsamp);

% Whitening: E holds the principal component coefficients and D the
% variances of the principal components returned by pca.
[E,~,D] = pca(cX');
white = (E*diag(D.^-0.5)*E')
X_white = white*cX;
cov(X_white')

% Joint kernel density evaluated on ksdensity's default grid. The grid is
% assumed to be square (npoints-by-npoints points), as the reshape calls
% below require.
[f,xi,bw] = ksdensity(X_white');
npoints = sqrt(size(xi,1));
u = X_white(1,:);
v = X_white(2,:);

% Kernel density of marginal distributions with same binning as the
% joint distribution
[xi_1,xi_ind_1] = unique(xi(:,1));
[xi_2,xi_ind_2] = unique(xi(:,2));
[mf_1,xi_12] = ksdensity(u',xi_1);
[mf_2,xi_22] = ksdensity(v',xi_2);

figure;plot(mf_1)
figure;plot(mf_2)
% 'plot(mf_1)' and 'plot(mf_2)' display the same pdf as the following -----
pd = fitdist(X_white(1,:)','Kernel')
figure;
x = xi_1;
y = pdf(pd,x);
plot(x,y,'LineWidth',2)
hold on
histogram(X_white(1,:)','Normalization','pdf')

pd = fitdist(X_white(2,:)','Kernel')
figure;
x = xi_2;
y = pdf(pd,x);
plot(x,y,'LineWidth',2)
hold on
histogram(X_white(2,:)','Normalization','pdf')

% Joint density and grid coordinates reshaped to the square grid. In rf
% the row index is used below as the index of the 2nd variable and the
% column index as the index of the 1st variable.
rf = reshape(f,[npoints,npoints]);
rxi_1 = reshape(xi(:,1),[npoints,npoints]);
rxi_2 = reshape(xi(:,2),[npoints,npoints]);
figure;
surf(rf)

% Integrand of the MI formula, p(x1,x2)*log2(p(x1,x2)/(p(x1)*p(x2))), at
% every grid point. (The matrix that is filled in the loop, MI_mat, is
% the one that gets preallocated.)
MI_mat = zeros(npoints,npoints);
for x1 = 1:npoints
    px1 = mf_1(x1); 
    for x2 = 1:npoints
       px2 = mf_2(x2); 
       px12 = rf(x2,x1);
       px1px2 = px1*px2;
       MI_mat(x1,x2) = px12*log2(px12/px1px2);
    end
end
% Double Integration (NaNs, e.g. from 0*log2(0), are replaced by zeros
% first; nantozero is a helper function, not a MATLAB built-in)
MI_mat = nantozero(MI_mat);
MI = trapz(xi_1,trapz(xi_2,MI_mat,2))

% Alternative simpler code 
a_px12 = rf';
a_px1px2 = mf_1*mf_2';
MI_mat = a_px12.*log2(a_px12./a_px1px2);
MI_mat = nantozero(MI_mat);
MI = trapz(xi_1,trapz(xi_2,MI_mat,2))

%% MI using ksdensity 
% Each whitened variable is mapped to a uniform variable through its own
% kernel-estimated cdf, evaluated at the data points themselves.
u = ksdensity(X_white(1,:)',X_white(1,:)','function','cdf');
v = ksdensity(X_white(2,:)',X_white(2,:)','function','cdf');
uv = horzcat(u,v);
scatterhist(u,v)

% The uniform variables are mapped to standard normal variables with the
% inverse normal cdf (norminv defaults: mean 0, standard deviation 1).
nu = norminv(u);
nv = norminv(v);
nuv = horzcat(nu,nv);
scatterhist(nu,nv)

% MI of two jointly Gaussian variables with correlation rho, in bits
rho = corr(nu,nv);
rhosq = rho^2;
MI = -0.5*log2(1-rhosq)

%--------------------------------------------------------------------------
% % the following is the same as above
% pd1 = fitdist(X_white(1,:)','Kernel');
% u1 = cdf(pd1,X_white(1,:)');
% histogram(u1)
% ns1 = norminv(u1,0,1);
% histogram(ns1)
% 
% pd2 = fitdist(X_white(2,:)','Kernel');
% u2 = cdf(pd2,X_white(2,:)');
% histogram(u2)
% ns2 = norminv(u2,0,1);
% histogram(ns2)
% 
% rho = corr(ns1,ns2);
% rhosq = rho^2;
% MI = -0.5*log2(1-rhosq)

%% MI using ctransform and copulafit
% ctransform is not a MATLAB built-in (presumably the copula transform of
% the kernelMI toolbox). Its outputs are paired column-wise in uv.
u = ctransform(X_white(1,:));
v = ctransform(X_white(2,:));
uv = [u' v'];
scatterhist(u,v)

% Rows in which either value is exactly 1 or exactly 0 are dropped before
% fitting the copula.
[ind1,~] = find(uv == 1);
[ind0,~] = find(uv == 0);
drop_row = false(1,1000);
drop_row([ind1;ind0]) = true;
ind = find(~drop_row);
uv1 = uv(ind,:);

% Correlation matrix of the fitted Gaussian copula and corresponding MI
rhomat = copulafit('Gaussian',uv1)
rhosq = rhomat(1,2)^2;
MI = -0.5*log2(1-rhosq)

%% MI using ctransform and kernelmi
% Inline kernel-based estimate of the MI between the two rows of X_white.
% ctransform and ssqd are not MATLAB built-ins; presumably they come from
% the kernelMI toolbox added to the path -- confirm against that toolbox.
% code
x = X_white(1,:);
y = X_white(2,:);
scatterhist(x',y')

% Sizes of the two row vectors; only Mx (number of samples) is used below
[Nx, Mx]=size(x);
[Ny, My]=size(y);

% Kernel bandwidth h, computed from the number of samples Mx and the
% exponent alpha
alpha = 0.25;
h = (Mx + 1) / sqrt(12) / Mx ^ (1 + alpha);

% Indices of the samples over which the final average is taken (all)
ind = 1:Mx;

% Copula-transform variables
x = ctransform(x);
y = ctransform(y);
scatterhist(x',y')

h2 = 2*h^2;

% Pointwise values for kernels
% NOTE(review): ssqd presumably returns pairwise squared distances in the
% condensed vector form that squareform expands to a symmetric matrix with
% zero diagonal; eye(Mx) then sets the diagonal to 1 -- confirm.
Kx = squareform(exp(-ssqd([x;x])/h2))+eye(Mx);
Ky = squareform(exp(-ssqd([y;y])/h2))+eye(Mx);

% Kernel sums for marginal probabilities
Cx = sum(Kx);
Cy = sum(Ky);
scatterhist(Cx',Cy')

% Kernel product for joint probabilities
Kxy = Kx.*Ky;

% Ratio evaluated at each sample; its mean natural logarithm is the
% estimate. Because log (natural logarithm) is used here, I is in nats,
% not in bits.
f = sum(Cx.*Cy)*sum(Kxy)./(Cx*Ky)./(Cy*Kx);
I = mean(log(f(ind)))

%%
% NOTE(review): S1, S2 and S3 are not assigned anywhere in the visible
% part of this script, and the 'clear' at the start of the 'MI Using
% standard formula' section empties the workspace. This section therefore
% needs S1, S2 and S3 (presumably 2-by-N matrices with one signal per
% row) to be loaded or created beforehand -- confirm where they come from.
%
% Notice that if the two independent components of S1, S2, or S3 were
% Gaussian we could easily calculate the differential entropy and thus the
% differential MI from the following formula, from which it is quite clear
% that 0 correlation (covariance) corresponds to 0 MI:
MI = -0.5*log2(1-(corr(S3(1,:)',S3(2,:)'))^2)

% For multivariate normal distributions we have:
corr_S3 = corr(S3');
MI = -0.5*log2(det(corr_S3))

% However this formula is not valid for non-Gaussian distributions, in
% which case the easiest way to determine MI is to calculate the discrete
% form of MI by binning the data:
%
% S_tag is arranged with one observation per row and one channel per
% column: the matrix is transposed when it has fewer rows than columns.
S_tag = S3;
[nrows,ncols] = size(S_tag);
if nrows < ncols
    S_tag = S_tag';
else
end
[ntimes,nchannels] = size(S_tag);

% Common bin edges for all the channels, derived from the pooled values.
% NOTE(review): internal.stats.histbins is an undocumented internal
% function of the Statistics Toolbox; its outputs are assumed here to be
% the bin counts and the bin edges -- confirm for the MATLAB release used.
S_binned = S_tag;
[bins,edges] = internal.stats.histbins(S_tag(:));
nbins = length(bins);

% Replace each value by the index of the bin it falls in. Both bin limits
% are inclusive, so a value lying exactly on an inner edge ends up with
% the index of the higher bin (the later iteration overwrites the earlier).
for j = 1:size(S_tag,2)
    for i = 1:nbins
        ind = S_tag(:,j)>= edges(i) & S_tag(:,j)<= edges(i+1);
        S_binned(ind,j) = i;
    end
end

% MI matrix: MI between every pair of binned channels (fastMI is not a
% MATLAB built-in; presumably from the fastMI toolbox on the path)
MI_mat = zeros(nchannels,nchannels);
for i = 1:nchannels
    for j = 1:nchannels
    MI_mat(i,j) = fastMI(S_binned(:,i),S_binned(:,j));
    end
end
    
MI_mat

% Alternatively we can use Kernel density MI: here we use a function from
% the Mathworks exchange
MI1 = kernelmi(S1(1,:),S1(2,:))
MI2 = kernelmi(S2(1,:),S2(2,:))
MI3 = kernelmi(S3(1,:),S3(2,:))

%% 
% Here we use our own code using the MATLAB function ksdensity to derive 
% the kernel densities and then the standard MI formula.
S0 = X_white;

% If we want to use our own grid and/or bandwidth
% gridx1 = -2.4:.05:2.4;
% gridx2 = gridx1;
% [x1,x2] = meshgrid(gridx1,gridx2);
% x1 = x1(:);
% x2 = x2(:);
% xi = [x1 x2];
% [f,xi,bw] = ksdensity(S0',xi)
% [f,xi,bw] = ksdensity(S0','Bandwidth',0.3)

% Using default grid and bandwidth. The grid is assumed to be square
% (npoints-by-npoints points), as the reshape calls below require.
[f,xi,bw] = ksdensity(S0');
% figure;plot(f)
% figure;plot(xi)
npoints = sqrt(size(xi,1));
% Kernel density of marginal distributions with same binning as the
% joint distribution
[xi_1,xi_ind_1] = unique(xi(:,1));
[xi_2,xi_ind_2] = unique(xi(:,2));
[mf_1,xi_12] = ksdensity(S0(1,:)',xi_1);
[mf_2,xi_22] = ksdensity(S0(2,:)',xi_2);

%--------------------------------------------------------------------------
% plot(mf_1)
% plot(mf_2)
% % Notice that 'plot(mf_1)' displays the same pdf as the following -------
% pd = fitdist(S0(1,:)','Kernel')
% figure;
% x = -2.4:.05:2.4;;
% y = pdf(pd,x);
% plot(x,y,'LineWidth',2)
% hold on
% histogram(S0(1,:)','Normalization','pdf')
%--------------------------------------------------------------------------

% Joint density and grid coordinates reshaped to the square grid. In rf
% the row index is used below as the index of the 2nd variable and the
% column index as the index of the 1st variable.
rf = reshape(f,[npoints,npoints]);
rxi_1 = reshape(xi(:,1),[npoints,npoints]);
rxi_2 = reshape(xi(:,2),[npoints,npoints]);
% surf(rf)
% ind1 = find(xi_1>=min(S0(1,:)) & xi_1<=max(S0(1,:))); 
% ind2 = find(xi_2>=min(S0(2,:)) & xi_2<=max(S0(2,:))); 

% Integrand of the MI formula at every grid point. MI_mat is reset to the
% grid size here, so that nothing left over from an earlier section
% survives in it.
MI_mat = zeros(npoints,npoints);
for x1 = 1:npoints
    px1 = mf_1(x1); 
    for x2 = 1:npoints
       px2 = mf_2(x2); 
       px12 = rf(x2,x1);
       px1px2 = px1*px2;
       MI_mat(x1,x2) = px12*log2(px12/px1px2);
    end
end
% Double Integration. As in the 'MI Using standard formula' section, NaNs
% (e.g. from 0*log2(0) where the joint density is 0) are replaced by
% zeros first, otherwise a single NaN would make the integral NaN.
MI_mat = nantozero(MI_mat);
MI = trapz(xi_1,trapz(xi_2,MI_mat,2))
% MI = trapz(xi_1(ind1),trapz(xi_2(ind2),MI_mat(ind1',ind2'),2))

% Alternative simpler code 
a_px12 = rf';
a_px1px2 = mf_1*mf_2';
MI_mat = a_px12.*log2(a_px12./a_px1px2);
MI_mat = nantozero(MI_mat);
MI = trapz(xi_1,trapz(xi_2,MI_mat,2))

% Alternative code with marginalization by integration: the marginal
% densities are obtained by integrating the joint density over the other
% variable, and each is renormalized to integrate to 1.
a_px12 = rf';
mf_1m = trapz(xi_2,a_px12,2);
mf_2m = trapz(xi_1,a_px12,1);
mf_1m = mf_1m/trapz(xi_1,mf_1m);
mf_2m = mf_2m/trapz(xi_2,mf_2m);
a_px1px2 = mf_1m*mf_2m;
a_px1px2 = a_px1px2/trapz(xi_1,trapz(xi_2,a_px1px2,2));
MI_mat = a_px12.*log2(a_px12./a_px1px2);
MI_mat = nantozero(MI_mat);
MI = trapz(xi_1,trapz(xi_2,MI_mat,2))

%% Kernel MI via fitdist 
% Each row of S0 is fitted with a kernel distribution; the data are mapped
% to uniform variables through the fitted cdf and then to standard normal
% variables through the inverse normal cdf (mean 0, standard deviation 1).

% First variable
pd1 = fitdist(S0(1,:)','Kernel');
u1 = cdf(pd1,S0(1,:)');
histogram(u1)
ns1 = norminv(u1);
histogram(ns1)

% Second variable
pd2 = fitdist(S0(2,:)','Kernel');
u2 = cdf(pd2,S0(2,:)');
histogram(u2)
ns2 = norminv(u2);
histogram(ns2)

% MI (in bits) from the correlation of the two normal-transformed
% variables
rho = corr(ns1,ns2);
rhosq = rho^2;
MI = -0.5*log2(1-rhosq)

%% Kernel MI via copulafit 
% [u,ux] = ecdf(S0(1,:)');
% [v,vx] = ecdf(S0(2,:)');

% Kernel-estimated cdf of each row of S0, evaluated at the data points
u = ksdensity(S0(1,:)',S0(1,:)','function','cdf');
v = ksdensity(S0(2,:)',S0(2,:)','function','cdf');
scatterhist(u,v)

% Correlation matrix of the Gaussian copula fitted to the pair (u,v), and
% the corresponding MI in bits
rhomat = copulafit('Gaussian',horzcat(u,v))
rhosq = rhomat(1,2)^2;
MI = -0.5*log2(1-rhosq)

%% Information theory and covariation analysis of MSAs
% DNA sequence example
bases = 'GATC';
nbases = 1000;
ntypes = length(bases);

% Uniform distribution
bases_id = randi(ntypes,[1,nbases]);
dna_seq = bases(bases_id);
H_dna_seq = log2(ntypes); 
for i = 1:ntypes
p_bases(i,1) = sum(bases_id == i)/nbases;
end
H_dna_seq = -p_bases'*log2(p_bases);

% Kullback-Leibler divergence
p_bases_exp = ones(ntypes,1)/ntypes;
Dkl_dna_seq = p_bases'*log2(p_bases./p_bases_exp);
Dkl_dna_seq = p_bases'*(log2(p_bases)-log2(p_bases_exp));

% Non-uniform distribution 
p_bases_exp = [0.5 0.3 0.15 0.05];
% Pool of 1000 base indices holding 500, 300, 150 and 50 copies of indices
% 1 to 4; the sequence is obtained by sampling the pool with replacement.
bases_id_temp = [ones(1,500), repmat(2,1,300), repmat(3,1,150), repmat(4,1,50)];
bases_id_ind = randi(nbases,[1,nbases]);
bases_id = bases_id_temp(bases_id_ind);
dna_seq = bases(bases_id);

% Observed base frequencies (one row of the comparison per base index)
% and corresponding entropy in bits.
p_bases = sum(bsxfun(@eq,bases_id,(1:ntypes)'),2)/nbases;
H_dna_seq = -p_bases'*log2(p_bases);

% Kullback-Leibler divergence of the observed frequencies from the
% frequencies used to build the pool (two equivalent forms).
Dkl_dna_seq = p_bases'*log2(p_bases./p_bases_exp');
Dkl_dna_seq = p_bases'*(log2(p_bases)-log2(p_bases_exp'));

% Self Information: same expression, with the uniform distribution as the
% reference.
p_bases_unif = ones(ntypes,1)/ntypes;
I_dna_seq = p_bases'*log2(p_bases./p_bases_unif);
I_dna_seq = p_bases'*(log2(p_bases)-log2(p_bases_unif));

% Mutual Information
% We generate a complementary strand
% Lookup table indexed by the base index of the first strand:
% 1 -> 4, 2 -> 3, 3 -> 2, 4 -> 1.
compl_lookup = [4 3 2 1];
bases_id_compl = compl_lookup(bases_id);

dna_seq_compl = bases(bases_id_compl);

% Probability distribution of the complementary strand
p_bases_compl = sum(bsxfun(@eq,bases_id_compl,(1:ntypes)'),2)/nbases;

% Entropy of each strand
H_x = -p_bases'*log2(p_bases);
H_y = -p_bases_compl'*log2(p_bases_compl);

% Paired base indices, one row per position.
XY = [bases_id' bases_id_compl'];

% Observed Joint probability distribution: entry (r,c) is the fraction of
% positions with index r on the first strand and index c on the
% complementary strand.
p_pairs = zeros(ntypes,ntypes);
for r = 1:ntypes
    on_first = bases_id == r;
    for c = 1:ntypes
        p_pairs(r,c) = sum(on_first & bases_id_compl == c)/nbases;
    end
end

% Joint probability distribution expected from the marginal distributions
p_pairs_exp = p_bases*p_bases_compl';

% Only the pairs that were actually observed enter the sums, so that
% log2 is never evaluated at 0.
pos_ind = find(p_pairs(:))
p_pairs_pos = p_pairs(pos_ind);
p_pairs_exp_pos = p_pairs_exp(pos_ind);

% Joint entropy
H_xy = -p_pairs_pos'*log2(p_pairs_pos);

% Mutual Information
I_xy = p_pairs_pos'*log2(p_pairs_pos./p_pairs_exp_pos);

% Alternative MI calculation using the function 'accumarray'
X = bases_id';
Y = bases_id_compl';

% Joint distribution: accumarray counts how many rows of [X Y] fall on
% each (X,Y) index pair.
h = accumarray([X Y], 1)/nbases; 

% Product of the two marginals (row sums times column sums).
marg_rows = sum(h,2);
marg_cols = sum(h,1);
xy_prod = marg_rows*marg_cols;

% Keep only the pairs with non-zero joint probability; from here on h and
% xy_prod are column vectors.
xy_ind = h~=0;
h = h(xy_ind);
xy_prod = xy_prod(xy_ind);

I_xy = h'*log2(h./xy_prod);

% Effect of mutations
% nmut randomly chosen positions of Y are replaced according to the index
% map 1 -> 3, 2 -> 4, 3 -> 1, 4 -> 2, and entropies and mutual information
% are recomputed for the pair (X,Y_mut).
Y_mut = Y;
nmut = 40;
% randperm(n,k) returns k distinct positions, so exactly nmut sites are
% changed. (The previous randi(nbases,nmut) produced an nmut-by-nmut matrix
% that was never used: the first nmut positions were mutated instead.)
flip_base = randperm(nbases,nmut);
mut_map = [3 4 1 2];
Y_mut(flip_base) = mut_map(Y(flip_base));

% Joint distribution of (X,Y_mut) and its marginals.
h = accumarray([X Y_mut], 1)/nbases;
p_X = sum(h,2);
p_Y_mut = sum(h,1);
% Joint distribution expected from the marginals.
xy_prod = p_X*p_Y_mut;
% Only pairs with non-zero joint probability enter the sums.
xy_ind = h~=0;
p_XY_mut = h(xy_ind);
p_XY_prod = xy_prod(xy_ind);
% p_X is a column vector and p_Y_mut a row vector, hence the different
% placement of the transpose in the two entropies.
H_x = -p_X'*log2(p_X);
H_y_mut = -p_Y_mut*log2(p_Y_mut');
H_xy_mut = -p_XY_mut'*log2(p_XY_mut);
I_xy_mut = p_XY_mut'*log2(p_XY_mut./p_XY_prod);

% Value returned by the MutualInformation function for the same pair,
% displayed for comparison with I_xy_mut.
MutualInformation(X,Y_mut)    

%% Protein sequence example
% Two columns (X and Y) of nresi residue indices are generated by sampling
% with replacement from pools built according to the frequencies p_X_exp
% and p_Y_exp; observed frequencies and entropies are then computed.
clear, clc
resi = 'ARNDCQEGHILKMFPSTWYV';
nresi = 1000;
ntypes = length(resi);

% Uniform distribution
% X = randi(ntypes,[1,nresi]);
% X_seq = resi(X);
% H_x = log2(ntypes); 
% for i = 1:ntypes
% p_X(i,1) = sum(X == i)/nresi;
% end
% H_x = -p_X'*log2(p_X);

% Non-uniform distribution
% X column
p_X_exp = [0.01 0.68 0.01 0.005 0 0.01 0 0.005 0.08 0 0 0.17 0 0 0.01 0 0.01 0 0 0.01];
% Pool of residue indices: index i appears p_X_exp(i)*nresi times.
% round() is needed because the product of a decimal fraction and nresi is
% not guaranteed to be an exact integer in floating point, and ones()
% rejects non-integer sizes.
X_vec = [];
for i = 1:ntypes
    X_vec = [X_vec i*ones(1,round(p_X_exp(i)*nresi))];
end
X_vec = X_vec';

% Sample the pool with replacement. The index range is the pool length, so
% it stays valid even if rounding makes the pool differ from nresi.
X_ind = randi(numel(X_vec),[1,nresi]);
X = X_vec(X_ind);
X_seq = resi(X)';
% Observed frequencies of the ntypes residue indices (column vector).
p_X = accumarray(X,1,[ntypes 1])/nresi;
% Residue types that never occur are left out of the entropy sum, so that
% log2 is never evaluated at 0.
pos_ind = find(p_X);
H_x = -p_X(pos_ind)'*log2(p_X(pos_ind));

% Y column 
p_Y_exp = [0.01 0 0 0.68 0 0.08 0.17 0 0.01 0 0.01 0 0.01 0 0.01 0 0.005 0.01 0 0.005];
Y_vec = [];
for i = 1:ntypes
    Y_vec = [Y_vec i*ones(1,round(p_Y_exp(i)*nresi))];
end
Y_vec = Y_vec';

Y_ind = randi(numel(Y_vec),[1,nresi]);
Y = Y_vec(Y_ind);
Y_seq = resi(Y)';
p_Y = accumarray(Y,1,[ntypes 1])/nresi;
pos_ind = find(p_Y);
H_y = -p_Y(pos_ind)'*log2(p_Y(pos_ind));

% The two columns side by side, as indices and as one-letter codes.
XY = [X Y];
XY_seq = [X_seq Y_seq];

% Mutual Information
% Joint probability distribution of the (X,Y) index pairs over all rows.
% The size argument of accumarray makes the result ntypes-by-ntypes even
% when the highest residue indices never occur.
jpd = accumarray([X Y], 1, [ntypes ntypes])/nresi;

% Same distribution estimated from a small subset of the rows. It is
% normalized by the number of rows in the subset (not by nresi), so that
% it sums to 1 like jpd.
sub_rows = 400:600;
nsub = numel(sub_rows);
jpd_small = accumarray([X(sub_rows) Y(sub_rows)], 1, [ntypes ntypes])/nsub;

% Sparsity patterns of the two joint distributions: the non-zero entries
% are the residue pairs that were observed.
JPD_sampling = figure;
set(JPD_sampling,'Units','normalized','Position',[0.3 0.3 0.2 0.7]);
subplot(2,1,1)
spy(jpd); title('Large MSA');
subplot(2,1,2)
spy(jpd_small); title('Small MSA');

% Entropies and mutual information displayed for all the rows ...
Entropy(X)
Entropy(Y)
JointEntropy([X Y])
MutualInformation(X,Y)

% ... and for the subset of rows only.
Entropy(X(sub_rows))
Entropy(Y(sub_rows))
JointEntropy([X(sub_rows) Y(sub_rows)])
MutualInformation(X(sub_rows),Y(sub_rows))

%% Example of MI analysis of KDO8PS MSA
% Here we start the final processing of the merged alignment.
msafile = '../DATABASE/KDSA_comb_trimmed.fasta';
msafile_type = 'faln';

% Read the alignment with the reader matching its file type.
% NOTE(review): judging from their use below, the second output is a
% numeric matrix with one row per sequence and one column per position -
% confirm against faln_to_nmsa/aln_to_nmsa.
switch msafile_type 
    case 'faln'
    [KDSA_comb_smsa,KDSA_comb_nmsa] = faln_to_nmsa(msafile);
    case 'aln'        
    [KDSA_comb_smsa,KDSA_comb_nmsa] = aln_to_nmsa(msafile);
    otherwise
    % Fail here rather than later on an undefined KDSA_comb_nmsa.
    error('Unsupported MSA file type: %s',msafile_type);
end

% Heat map of the numeric alignment.
MSA_heat_map = figure;
imagesc(KDSA_comb_nmsa);xlabel('Res. no.');ylabel('Seq. no.')
title('MSA');colorbar

% Character version of the numeric alignment.
KDSA_comb_cmsa = int2aa(KDSA_comb_nmsa);

% MI matrix and the matrices derived from it in sequence: MI -> MIp ->
% ZPX and ZPX2.
MI_mat = NMSA_to_fastMI(KDSA_comb_nmsa);
MIP_mat = MI_to_MIP(MI_mat);
[ZPX_mat,ZPX2_mat] = MIP_to_ZPX(MIP_mat);

% One panel per matrix, all drawn with the same settings.
MI_matrices = figure;
set(MI_matrices,'Units','normalized','Position',[0.2 0.2 0.56 0.8],'Name','MI_matrices');
coev_mats = {MI_mat,MIP_mat,ZPX_mat,ZPX2_mat};
coev_names = {'MI','MIp','Zpx','Zpx2'};
for k = 1:numel(coev_mats)
    subplot(2,2,k)
    imagesc(coev_mats{k});set(gca,'YDir','Normal');xlabel('Res. no.');ylabel('Res. no.');
    title(coev_names{k});colorbar, colormap jet
end

%% PDB superposition
% The coevolution matrices computed from the MSA are passed, together with
% the pdb file, to coev_distance_matrix_3 (usage notes below).

% Import pdb data
clear KDSA_pdb
% Information about the pdb file:
pdbfile = '../DATABASE/KDSA_simple.pdb';
KDSA_pdb = pdbread(pdbfile);
% Column ranges: START:END is used to measure the reference length,
% PDB_START:PDB_END to select the MSA columns and as first/last residue
% numbers in the call below.
START = 1;
END = 280;
PDB_START = 1;
PDB_END = 280;
nmsa = KDSA_comb_nmsa(:,PDB_START:PDB_END);
REF_length = numel(nmsa(1,START:END));
cmsa = int2aa(nmsa);

%---------------------1st set----------------------------------------------
% Usage: [c_distances,sorted_c_distances,sorted_mat1,...
%           sorted_mat2,sorted_mat3,sorted_mat4,pdbstruct] = ...
%           coev_distance_matrix_3(pdbfile,chain,first_res_no,last_res_no,...
%           mat1,mat2,mat3,mat4,radius,near,npreds,plot_dist)
% chain: 1,2,3,...
% first_res_no: number of the first residue with coordinates in the pdb file
% last_res_no: number of the last residue with coordinates in the pdb file
% mat1, mat2 ...: coevolution matrices
% radius: threshold distance to select entries in the protein distance matrix
% near: minimum separation between residues in sequence to be included in
% the analysis: 1 = consecutive; 2 = separated by 1 intervening residue; 
% 3 = separated by 2 intervening residues; and so on ...
% npreds: this is 3 times the number of top positions that will be plotted
% on top of a distance matrix.
% plot_dist can have different values:
% 0 : no plot
% 1 : sparsity plot of the distance matrix (yellow on white background).
% 2 : heat map of the distance matrix with everything beyond the radius
% zeroed, with blue background and colors representing the number of atomic
% interactions.
% else : like 2 (but cyan on white background), with the covarions identified 
% by different methods overlaid on top.

%--------------------------------------------------------------------------
near = 1;
near1 = near;
% The reference length is passed as npreds in the call below.
ncov = REF_length;
radius = 8;

% Chain 1 is analyzed, with plot_dist = 3 (the 'else' case above).
% NOTE(review): the call passes twelve 3-element vectors after plot_dist
% that are not listed in the usage notes above (presumably colors for the
% overlays - confirm against coev_distance_matrix_3), and it requests six
% of the seven listed outputs.
[c_distances,sorted_c_distances,sorted_MI,sorted_MIP,...
    sorted_ZPX,sorted_ZPX2] = ...
    coev_distance_matrix_3(pdbfile,1,PDB_START,PDB_END,MI_mat,...
    MIP_mat,ZPX_mat,ZPX2_mat,radius,near,ncov,3,...
    [0,1,0],[1 0 0],[1 0 1],[0 0 1],...
    [0.9 0.9 0.9],[0.9 0.9 0.9],[0.9 0.9 0.9],[0.9 0.9 0.9],...
    [1.0 1.0 1.0],[1.0 1.0 1.0],[1.0 1.0 1.0],[1.0 1.0 1.0]...
    );

% saveas(gcf,contact_map_file,'fig');

% save('Coevolution_methods')

