% sysEvolution (Version 3.0): Estimating Number of Clusters via System Evolution
% Please read the help file "Readme.txt" before running this program
% WANG Kaijun: wangkjun@yahoo.com, Dec. 2006, March, Sept. 2007
clear;
type = 2; % 2: Pearson coefficients; 1: Euclidean distances
staz=0; % standardization to [0 1] when mixed metric
Hsep=0; % threshold for Ep(k)-Em(k) > Hsep
N2=10; % searching limit is max(N2,nk+6)
runsave = 0; % saving PAM solutions at every K
newp=1; % opening a new figure window ?
subp=0; % plotting in which sub-window ?
pcolor=1; % color plotting ?
pc=1; % plotting in PC space ?
verbose = 1;
% Part 1: selecting a data set, data file: rows - data points, columns - dimensions
id = 5;
switch id
%(1) class labels are known, stored in 1st column
case 1, sw='4k20_lap.txt'; nk=4;
case 2, sw='4k40_lap.txt'; nk=4;
case 3, sw='6k20_close.txt'; nk=6;
case 4, sw='6k40_far.txt'; nk=6;
case 5, sw='leuk72_3k.txt'; nk=3;
case 6, sw='lym96_4k.txt'; nk=4;
case 7, sw='g205_4k.txt'; nk=4;
case 8, sw='y208_4k.txt'; nk=4;
case 9, sw='m691_11k.txt'; nk=11;
case 10, sw='14k10far.txt'; nk=14;
case 11, sw='22k10far.txt'; nk=22;
%(2) class labels are unknown (1st column is data too)
case 21, sw='colon2000_4k.txt'; nk=4;
end
if type == 2 && (id == 10 || id == 11) || type == 1 && (id == 1 || id == 2)
disp(' '); disp([' ## Please change value of type or select another data set !']);
return
end
% Part 2: Running PAM clustering algorithm
disp(' '); disp(['==> PAM clustering is running on ' sw ', please wait ...']);
[data,N1,N,Ns,truelabels,dc] = data_load(id,sw,nk,N2);
[labels,Dist,dmax] = pam_running(data,N,Ns,type,staz,id,runsave,verbose);
% Part 3: Assessing error rates of a clustering result if true labels are given
if id < 21
fprintf('\nError rates of clustering solution for reference only (might \n');
fprintf(' be inaccurate if big, then use Fowlkes-Mallows index) : ');
valid_errorate(labels(:,nk), truelabels,verbose);
index = valid_external(labels(:,nk), truelabels);
fprintf(' Fowlkes-Mallows index: %f\n', index(4));
end
% Part 4: Energy computations for System Evolution
[inter,inter2,intra,intra2,N2,Ns,nCluster] = SystemEvolution_energy(labels,Dist,dc,N,Ns,verbose);
% Part 5: System evolution gives optimal Number of Clusters
[ters, km,g1,g2,all,all2] = SystemEvolution_findk(inter,inter2,intra,intra2,N2,Ns,Hsep);
fprintf('\n # Number of clusters estimated by sysEvolution: k= %d\n', km(1));
ind = isfinite(ters(1,:));
if type == 2
fprintf('\n == The clustering result is under Pearson correlation');
fprintf('\n # Partition & merge energy at every k:\n');
disp(' [k; partition (inter-cluster); merge (inner-cluster)]');
disp([Ns; ters(:,ind)]);
fprintf('\n # Energy back to Pearson coefficients :\n');
disp(' [k; partition (inter-cluster); merge (inner-cluster)]');
disp([Ns; 1-2*ters(:,ind)]);
else
fprintf('\n == The clustering results is under Euclidean distance');
fprintf('\n # Partition & merge energy at every k:\n');
disp(' [k; partition (inter-cluster); merge (inner-cluster)]');
disp([Ns; ters(:,ind)]);
end
% Part 6: Plotting the evolution route and optimal NC
plot_evolution_route(data,labels,ters,g1,g2,km,nCluster,newp,subp,pcolor);
if 0
% Part 7: Detail of energy between (every two) clusters
% Please see help file for explaination
[ind,ind2,Edist,Esep] = energydiff_allclusters(Dist,labels,km(1),dc);
fprintf('\n # Epartition - Emerge for ind(i,j)>0 (row i denotes cluster i, column j denotes cluster j) : \n');
disp(ind);
fprintf('\n # Energy back to Pearson coefficients for Epartition - Emerge :\n');
disp(-2*ind);
if sum(sum(ind2))
fprintf('\n # Epartition - Emerge if ind(i,j)<=0 (row i denotes cluster i, column j denotes cluster j) : \n');
disp(ind2);
fprintf('\n # Energy back to Pearson coefficients for Epartition - Emerge :\n');
disp(-2*ind2);
end
% disp(' ## Edist : detail of ind'); Edist
% disp(' ## Esep : detail of ind2'); Esep
end
if 0
% Part 8: more plotting of data & class labels for reference only
% notice: Pearson measure can not be shown directly in PC space
figure; plotdata_bylabels(data,labels(:,km(1)),pc,subp,'nb');
title('clustering solution is plotted in Principal Component space')
%clf; plotdata_bylabels(data,truelabels,2,0,'nb');
%clf; showClass_up(data, labels(:,km(1)), 2, 0);
%clf; showClass_up(data, truelabels, 2, 0);
end