Code covered by the BSD License  

Highlights from
Estimating the number of clusters via System Evolution

from Estimating the number of clusters via System Evolution by Kaijun Wang
estimate number of clusters for far clusters, small-larger clusters, slightly overlapping clusters

MainSystemEvolution.m
% sysEvolution (Version 3.0): Estimating Number of Clusters via System Evolution
% Please read the help file "Readme.txt" before running this program
% WANG Kaijun: wangkjun@yahoo.com, Dec. 2006, March, Sept. 2007

clear;
type = 2;                   % 2: Pearson coefficients; 1: Euclidean distances
staz=0;                     % standardization to [0 1] when mixed metric
Hsep=0;                   % threshold for Ep(k)-Em(k) > Hsep
N2=10;                     % searching limit is max(N2,nk+6)
runsave = 0;              % saving PAM solutions at every K
newp=1;                   % opening a new figure window ?
subp=0;                    % plotting in which sub-window ?
pcolor=1;                  % color plotting ?
pc=1;                        % plotting in PC space ?
verbose = 1;

% Part 1: selecting a data set, data file: rows - data points, columns - dimensions
id = 5;

switch id
    %(1)  class labels are known, stored in 1st column
    case 1,  sw='4k20_lap.txt';               nk=4;
    case 2,  sw='4k40_lap.txt';               nk=4;
    case 3,  sw='6k20_close.txt';           nk=6;
    case 4,  sw='6k40_far.txt';                nk=6;
    case 5, sw='leuk72_3k.txt';               nk=3;
    case 6, sw='lym96_4k.txt';                nk=4;
    case 7, sw='g205_4k.txt';                  nk=4;
    case 8, sw='y208_4k.txt';                  nk=4;
    case 9, sw='m691_11k.txt';              nk=11;
    case 10, sw='14k10far.txt';               nk=14;
    case 11, sw='22k10far.txt';               nk=22;
    %(2)  class labels are unknown (1st column is data too)
    case 21, sw='colon2000_4k.txt';      nk=4;
end

if type == 2 && (id == 10 || id == 11) || type == 1 && (id == 1 || id == 2)
  disp(' '); disp([' ## Please change value of type or select another data set !']);
  return
end

% Part 2: Running PAM clustering algorithm
disp(' '); disp(['==> PAM clustering is running on ' sw ', please wait ...']);
[data,N1,N,Ns,truelabels,dc] = data_load(id,sw,nk,N2);

[labels,Dist,dmax] = pam_running(data,N,Ns,type,staz,id,runsave,verbose);


% Part 3:  Assessing error rates of a clustering result if true labels are given
if id < 21
    fprintf('\nError rates of clustering solution  for reference only (might \n');
    fprintf(' be inaccurate if big, then use Fowlkes-Mallows index) : ');
    valid_errorate(labels(:,nk), truelabels,verbose);
    index = valid_external(labels(:,nk), truelabels);
    fprintf(' Fowlkes-Mallows index: %f\n', index(4));
end

% Part 4: Energy computations for System Evolution
[inter,inter2,intra,intra2,N2,Ns,nCluster] = SystemEvolution_energy(labels,Dist,dc,N,Ns,verbose);

% Part 5: System evolution gives optimal Number of Clusters 
[ters, km,g1,g2,all,all2] = SystemEvolution_findk(inter,inter2,intra,intra2,N2,Ns,Hsep);

fprintf('\n # Number of clusters estimated by sysEvolution: k= %d\n', km(1));
ind = isfinite(ters(1,:));
if type == 2
  fprintf('\n ==  The clustering result is under Pearson correlation');
  fprintf('\n # Partition & merge energy at every k:\n');
  disp('    [k; partition (inter-cluster); merge (inner-cluster)]');
  disp([Ns; ters(:,ind)]);
  fprintf('\n # Energy back to Pearson coefficients :\n');
  disp('    [k; partition (inter-cluster); merge (inner-cluster)]');
  disp([Ns; 1-2*ters(:,ind)]);
else
  fprintf('\n ==  The clustering results is under Euclidean distance');
  fprintf('\n # Partition & merge energy at every k:\n');
  disp('    [k; partition (inter-cluster); merge (inner-cluster)]');
  disp([Ns; ters(:,ind)]);
end

% Part 6: Plotting the evolution route and optimal NC
plot_evolution_route(data,labels,ters,g1,g2,km,nCluster,newp,subp,pcolor);

if 0
% Part 7: Detail of energy between (every two) clusters
% Please see help file for explaination
[ind,ind2,Edist,Esep] = energydiff_allclusters(Dist,labels,km(1),dc);
fprintf('\n #  Epartition - Emerge for ind(i,j)>0 (row i denotes cluster i, column j denotes cluster j) : \n');
disp(ind); 
fprintf('\n # Energy back to Pearson coefficients for Epartition - Emerge :\n');
disp(-2*ind);
if sum(sum(ind2))
    fprintf('\n #  Epartition - Emerge if ind(i,j)<=0 (row i denotes cluster i, column j denotes cluster j) : \n');
    disp(ind2);
    fprintf('\n # Energy back to Pearson coefficients for Epartition - Emerge :\n');
    disp(-2*ind2);
end
% disp(' ##  Edist : detail of ind'); Edist
% disp(' ##  Esep : detail of ind2'); Esep
end

if 0
% Part 8: more plotting of data & class labels for reference only
% notice: Pearson measure can not be shown directly in PC space
figure;  plotdata_bylabels(data,labels(:,km(1)),pc,subp,'nb');
title('clustering solution is plotted in Principal Component space')
%clf; plotdata_bylabels(data,truelabels,2,0,'nb');
%clf; showClass_up(data, labels(:,km(1)), 2, 0);
%clf; showClass_up(data, truelabels, 2, 0);
end

Contact us at files@mathworks.com