Code covered by the BSD License  

Highlights from
Estimating the number of clusters via System Evolution

from Estimating the number of clusters via System Evolution by Kaijun Wang
estimate number of clusters for far clusters, small-larger clusters, slightly overlapping clusters

pam(x,kclus,vtype,stdize,metric,silhplot)
function result = pam(x,kclus,vtype,stdize,metric,silhplot)

%PAM returns a list representing a clustering of the data into kclus
%clusters following the pam algorithm which is based on the search
%for kclus representative objects or medoids among the observations of
%the dataset
%
%The algorithm is fully described in:
%   Kaufman, L. and Rousseeuw, P.J. (1990),
%   "Finding groups in data: An introduction to cluster analysis",
%   Wiley-Interscience: New York (Series in Applied Probability and
%   Statistics), ISBN 0-471-87876-6.
%
% Required input arguments:
%   x : Data matrix (rows = observations, columns = variables)
%       Dissimilarity matrix (if number of columns equals 1)
%   kclus : The number of desired clusters
%   vtype : Variable type vector (length equals number of variables)
%       Possible values are 1  Asymmetric binary variable (0/1)
%                           2  Nominal variable (includes symmetric binary)
%                           3  Ordinal variable
%                           4  Interval variable
%       (in case x is Dissimilarity Matrix vtype isn't required.)
%
% Optional input arguments:
%   stdize : standardise the variables given by the x-matrix
%       Possible values are 0 : no standardisation
%                           1 : standardisation by the mean
%                           2 : standardisation by the median
%       (in case x is a dissimilarity matrix, stdize would be ignored)
%       (is case stdize is not given, stdize=0)
%   metric : Metric to be used (default euclidian (eucli) or mixed (mixed))
%       Possible values are 'eucli' Euclidian (all interval variables)
%                           'manha' Manhattan
%                           'mixed' Mixed (not all interval variables)
%   silhplot : draws picture
%       Possible values are 0 : do not create a silhouette plot
%                           1 : create a silhouette plot
%       (in case silhplot is not given, silhplot=0)
%
% I/O:
%   result=pam(x,kclus,vtype,'eucli',silhplot)
%
% Example (subtracted from the referenced book)
%   load ruspini.mat
%   result=pam(ruspini,2,[4 4],1);
%
% The output of PAM is a structure containing:
%   result.dys        : dissimilarities (read row by row from the
%                       lower dissimilarity matrix)
%   result.metric     : used metric
%   result.number     : number of observations
%   result.ttd        : Average silhouette width per cluster
%   result.ttsyl      : Average silhouette width for dataset
%   result.idmed      : Id of medoid observations
%   result.obj        : Objective function at the first two iterations
%   result.ncluv      : Cluster membership for each observation
%   result.clusinf    : Matrix, each row gives numerical information for
%                       one cluster. These are the cardinality of the cluster
%                       (number of observations), the maximal and average
%                       dissimilarity between the observations in the cluster
%                       and the cluster's medoid, the diameter of the cluster
%                       (maximal dissimilarity between two observations of the
%                       cluster), and the separation of the cluster (minimal
%                       dissimilarity between an observation of the cluster
%                       and an observation of another cluster).
%   result.sylinf     : Matrix, with for each observation i the cluster to
%                       which i belongs, as well as the neighbor cluster of i
%                       (the cluster, not containing i, for which the average
%                       dissimilarity between its observations and i is minimal),
%                       and the silhouette width of i.
%   result.nisol      : Vector, with for each cluster specifying whether it is
%                       an isolated cluster (L- or L*-clusters) or not isolated.
%                       A cluster is an L*-cluster iff its diameter is smaller than
%                       its separation.  A cluster is an L-cluster iff for each
%                       observation i the maximal dissimilarity between i and any
%                       other observation of the cluster is smaller than the minimal
%                       dissimilarity between i and any observation of another cluster.
%                       Clearly each L*-cluster is also an L-cluster.
%
% And PAM will create the silhouette plot if silhplot equals 1 (an empty bar indicated by
%                       zero is a sparse between two clusters).
%
% This function is part of LIBRA: the Matlab Library for Robust Analysis,
% available at:
%              http://wis.kuleuven.be/stat/robust.html
%
% Written by Guy Brys (May 2006)

%Checking and filling in the inputs
res1=[];
if (nargin<2)
    error('Two input arguments required')
elseif ((nargin<3) & (size(x,2)~=1) & (size(x,1)~=1))
    error('Three input arguments required')
elseif (nargin<3)
    if (size(x,2)==1)
        x = x';
    end
    res1.metric = 'unknown';
    res1.disv = x;
    lookup=seekN(x);
    res1.number = lookup.numb;   %(1+sqrt(1+8*size(x,1)))/2;
    stdize = 0;
    silhplot = 0;
elseif (nargin<4)
    stdize = 0;
    silhplot = 0;
    if (sum(vtype)~=4*size(x,2))
        metric = 'mixed';
    else
        metric = 'eucli';
    end
elseif (nargin<5)
    silhplot = 0;
    if (sum(vtype)~=4*size(x,2))
        metric = 'mixed';
    else
        metric = 'eucli';
    end
elseif (nargin<6)
    silhplot = 0;
end

%Replacement of missing values
for i=1:size(x,1)
    A=find(isnan(x(i,:)));
    if (~(isempty(A)))
        for j=A
            valmisdat=0;
            for c=1:size(x,2)
                if (c~=j)
                    [a,b] = sort(x(:,c));
                    if (~isempty(b(find(a==x(i,c)))))
                        valmisdat=valmisdat+find(a==x(i,c));
                    end
                end
            end
            x(i,j)=prctile(x(isnan(x(:,j))==0,j),100*valmisdat/(size(x,1)*(size(x,2)-1)));
        end
    end
end

%Standardization
if ((stdize==1) & (strcmp(metric,'eucli')))
    x = ((x - repmat(mean(x),size(x,1),1))./(repmat(std(x),size(x,1),1)));
elseif ((stdize==2) & (strcmp(metric,'eucli')))
    x = ((x - repmat(median(x),size(x,1),1))./(repmat(mad(x),size(x,1),1)));
end

%Calculating the dissimilarities with daisy
if (isempty(res1))
    res1=daisy(x,vtype,metric);
end

%Actual calculations (the second for latter use with CLUSPLOT)
[dys,ttd,ttsyl,idmed,obj,ncluv,clusinf,sylinf,nisol]=pamc(res1.number,kclus,[0 res1.disv]');
dys=res1.disv(lowertouppertrinds(res1.number));

%Create a silhouetteplot
if (silhplot==1)
    Y=sylinf(:,3);
    Y1=flipdim(Y,1);
    whitebg([1 1 1]);
    % we calculate b="a but with a bar with length zero if the objects
    % are from another cluster"
    % and h="objects but with a 0 between 2 clusters"="g with a 0 if
    % it is a sparse between 2 clusters"
    a=flipdim(Y1,1);
    b=[];
    g=sylinf(:,4);
    f=sylinf(:,1)-1;
    for j=1:res1.number
        b(j+f(j))=a(j);
        h(j+f(j))=g(j);
    end
    b1=flipdim(b,2);
    h1=flipdim(h,2);
    % we use this b1 and h1 to plot the barh (instead of a and g)
    barh(b1,1);
    title 'Silhouette Plot of Pam' ;
    xlabel('Silhouette width');
    YT=1:res1.number+(sylinf(res1.number,1)-1);
    set(gca,'YTick',YT);
    set(gca,'YTickLabel',h1);
    axis([min([Y' 0]),max([Y' 0]),0.5,res1.number+0.5+f(res1.number)]);
elseif ((silhplot~=0) & (silhplot~=1) & (nargin==6))
    error('silhplot must equals 0 or 1')
end


%Putting things together
result = struct('dys',dys,'metric',res1.metric,'number',res1.number,...
    'ttd',ttd,'ttsyl',ttsyl,'idmed',idmed,'obj',obj,'ncluv',ncluv,...
    'clusinf',clusinf,'sylinf',sylinf,'nisol',nisol,'x',x);

%------------
%SUBFUNCTIONS

function dv = lowertouppertrinds(n)

dv = [];
for i=0:(n-2)
    dv = [dv cumsum(i:(n-2))+repmat(1+sum(0:i),1,n-i-1)];
end

%---
function outn = seekN(x)

ok=0;
numb=0;
k=size(x,2);
sums=cumsum(1:k);
for i=1:k
    if(sums(i)==k)

        numb=i+1;
        ok=1;
    end
end
outn=struct('numb',numb,'ok',ok);

Contact us at files@mathworks.com