Code covered by the BSD License  

Highlights from
CVAP: Cluster Validity Analysis Platform (cluster analysis and validation tool)

CVAP: Cluster Validity Analysis Platform (cluster analysis and validation tool)

by

 

12 Apr 2007 (Updated )

supplying over 17 validity indices and 5 clustering algorithms based on GUI

[idx,netsim,dpsim,expref,pref,lowp,highp]=apclusterK(...
% [idx,netsim,dpsim,expref,pref]=apclusterK(s,k)
%
% Finds approximately k clusters using affinity propagation (BJ Frey and
% D Dueck, Science 2007), by searching for an appropriate preference value
% using a bisection method. By default, the method stops refining the
% number of clusters when it is within 10%% of the value k provided by the
% user. To change this percentage, use apclusterK(s,k,prc) -- eg, setting
% prc to 0 causes the method to search for exactly k clusters. In any case
% the method terminates after 20 bisections are attempted.
%

function [idx,netsim,dpsim,expref,pref,lowp,highp]=apclusterK(...
    s,kk,prc,nrun,nconv,lam,splot,lowp,highp,lowk,highk);

if nargin<3 prc=10; end; % Default percentage error in k

% Construct similarity matrix and add a tiny amount of noise
if size(s,2)==3
    N=max(max(s(:,1)),max(s(:,2)));
    S=zeros(N,N);%-Inf*ones(N,N);
    for j=1:size(s,1) S(s(j,1),s(j,2))=s(j,3); end;
else N=size(s,1); S=s;
end;
rns=randn('state'); randn('state',0);
S=S+(eps*S+realmin*100).*rand(N,N);
randn('state',rns);

if kk == 1
% Find limits
[dpsim1 k11]=max(sum(S,1));
if dpsim1==-Inf
    error('Could not find pmin');
elseif N>1000
    for k=1:N S(k,k)=-Inf; end;
    m=max(S,[],2);
    tmp=sum(m);
    [yy ii]=min(m);
    tmp=tmp-yy-min(m([1:ii(1)-1,ii(1)+1:N]));
    pmin=dpsim1-tmp;
else
    dpsim2=-Inf;
    for j21=1:N-1
        for j22=j21+1:N
            tmp=sum(max(S(:,[j21,j22]),[],2));
            if tmp>dpsim2 dpsim2=tmp; k21=j21; k22=j22; end;
        end;
    end;
    pmin=dpsim1-dpsim2;
end;
for k=1:N S(k,k)=-Inf; end;
pmax=max(S(:));
highpref=pmax; highk=N;
lowpref=pmin; lowk=1;
for k=1:N S(k,k)=0; end;

lowp = lowpref;
highp = highpref;
pref = lowpref;
idx=[]; netsim=[]; dpsim=[]; expref=[]; 
return

else
    tmpk = lowk; 
    tmppref = lowp;
    highpref = highp;
    lowpref = lowp;
fprintf('Bounds: lowp=%f  highp=%f      lowk=%d  highk=%d\n', ...
    lowpref,highpref,lowk,highk);
fprintf('Attempting to improve lower bound:\n');

% Run AP several times to find better lower bound
i=-4; dn=0;
while ~dn
    tmppref=highpref-10^i*(highpref-lowpref);
    fprintf('  Trying p=%f\n',tmppref);
    [idx,netsim,dpsim,expref]=apcluster(S,tmppref,'dampfact',lam, ...
        'convits',nconv,'maxits',nrun,'nonoise');
    tmpk=length(unique(idx));
    if tmpk<=kk dn=1;
    elseif i==-1 tmpk=lowk; tmppref=lowpref; dn=1;
    else i=i+1;
    end;
end;
end

% Use bisection method to find k
if abs(tmpk-kk)/kk*100>prc
    fprintf('Applying bisection method:\n');
    lowk=tmpk; lowpref=tmppref; ntries=0;
    while (abs(tmpk-kk)/kk*100>prc)&&(ntries<20)
        fprintf('  lowp=%f  highp=%f      lowk=%d  highk=%d\n', ...
            lowpref,highpref,lowk,highk);
        tmppref=0.5*highpref+0.5*lowpref;
        [idx,netsim,dpsim,expref]=apcluster(S,tmppref,'dampfact',lam, ...
            'convits',nconv,'maxits',nrun,'nonoise',splot);
        tmpk=length(unique(idx));
        if kk>tmpk lowpref=tmppref; lowk=tmpk;
        else highpref=tmppref; highk=tmpk;
        end;
        ntries=ntries+1;
    end;
end;
pref=tmppref;
fprintf('Found %d clusters using a preference of %f\n',tmpk,pref);

Contact us