Code covered by the BSD License  

Highlights from
Adaptive Affinity Propagation clustering

from Adaptive Affinity Propagation clustering by Kaijun Wang
advantage of speed & performance appears under large number of clusters & large dataset

Main_adaptAP_demo.m
% programs for adaptive Affinity Propagation clustering; an improvement
% of Affinity Propagation clusteirng (see Frey & Dueck, Science, Feb. 2007)
% Note: Statistics Toolbox of Matlab needs to be installed
% WANG Kaijun: wangkjun@yahoo.com, July, Sept. 2007.

clear;
id = 5;        % selecting a data set, rows - data points, columns - dimensions
algorithm = 1;  % 1 --- adaptive AP, 0 --- original AP
nrun = 50000;   % max iteration times, default 50000
nrun2 = 2000;   % max iteration times for original AP
nconv = 50;     % convergence condition, default 50
pstep = 0.01;   % decreasing step of preferences: pstep*pmedian, default 0.01
lam = 0.5;      % damping factor, default 0.5
cut = 3;        % after clustering, drop an cluster with number of samples < cut
%splot = 'plot'; % observing a clustering process when it is on
splot = 'noplot';

switch id
%(1) general data, Euclidean distances
% true labels in 1st column
  case 1
     sw='3k2lap.txt';               % true number of clusters is 3
  case 2
     sw='5k8close.txt';           % true NC=5
  case 3
     sw='14k10close.txt';         % true NC=14
  case 4
     sw='22k10far.txt';            % true NC=22
  case 5
     sw='ionosphere.txt';        % true NC=2
  case 6
     sw='wine.txt';               % true NC=3
  case 7
     sw='yourdata.txt';
 
 % true labels being unknown (1st column is data too)
  case 11   
     sw='FaceClusteringSimilarities.txt'; nrow = 900; % number of samples
  case 12
     sw='DocumentSummarization.mat'; nrow = 125;
  case 13
     sw='TravelRouting.mat'; nrow = 456; lam = 0.8;
  case 14
     sw='GeneFindingProblem.mat';
     nrow = 75067; cut = 1; nsubset = 3500; % a subset of 75067
  case 15
     sw='yourdata.txt';      
    
%(2) gene data, Pearson distances, true labels in 1st column
  case 21
     sw='yeast.txt';                  % true NC=4
  case 22
     sw='nci60.txt';                 % true NC=8
  case 23
     sw='yourdata.txt'; 
% true labels being unknown (1st column is data too)
  case 31
     sw='yourdata.txt'; 
end


% initialization
type = 1;       % 1: Euclidean distances
if id > 20
   type = 2;    % 2: Pearson correlation coefficients
end
simatrix = 0;   % 0: data as input; 1: similarity matrix as input
if id > 10 && id <15
    simatrix = 1;
end
data_load      % loading a data file or similarity matrix


disp(' '); disp(['==> Clustering is running on ' sw ', please wait ...']);
if algorithm
   tic;
   if simatrix
      [labels,NCs,labelid,iend,Sp,Slam,NCfixs] = adapt_apcluster(M,type,...
        p,pstep,simatrix,'convits',nconv,'maxits',nrun,'dampfact',lam,splot);
   else
      [labels,NCs,labelid,iend,Sp,Slam,NCfixs] = adapt_apcluster(data,type,...
        p,pstep,simatrix,'convits',nconv,'maxits',nrun,'dampfact',lam,splot);
   end
  [NC,Sil,Silmin] = solution_evaluation(data,M,labels,NCs,...
      NCfixs,simatrix,nrow,type,cut);
  trun = toc;
  if id == 12 || id == 13
      NCs = unique(labelid);
  end
  fprintf('\n## Running time = %g seconds \n', trun);
  fprintf('## Running iterations = %g \n', iend);
  
  % finding an optimal clustering solution
  solution_findK
  
else
    tic;
    if ~simatrix
       M = simatrix_make(data,type,nrow);
    end
    if ~length(p)
        dn = find(M(:,3)>-realmax);
        p = median(M(dn,3));         % Set preference to similarity median
    end
    [labels,netsim,iend,unconverged] = apcluster(M,p,'convits',...
        nconv,'maxits',nrun2,'dampfact',lam,splot);
    trun = toc;
    fprintf('\n## Running time = %g seconds \n', trun);
    fprintf('## Running iterations = %g \n', iend);
    
    % finding an clustering solution
    solution_findK
end


truek = unique(truelabels);
truek = length(truek);
if truek > 1
    C = valid_external(labels(:,Sid), truelabels);
    fprintf('Fowlkes-Mallows validity index: %f\n', C(4));
end
if NCopt == truek
   fprintf('\n## Error rate of clustering solution might be inaccurate if large');
   fprintf('\n     (then use FM index instead) and it is for reference only:');
   valid_errorate(labels(:,Sid), truelabels);
end
if id == 12 || id == 13
    for j = 1:length(NCs)
       disp(name{NCs(j)});
    end
end
%clf; plotdata_bylabels(data,truelabels,2,0,'co');
%clf; plotdata_bylabels(data,labels(:,M),2,0,'nb');

Contact us at files@mathworks.com