Hi Walter,
thanks again for the tip. Although I am anything but an ace programmer, I managed to implement what you described for 1D slicing or 2D tiling of large numbers of points. Now I can get back to my research.
For ~500 points, slicing or tiling were comparable or a little slower than just using clusterdata. For ~5000 points, slicing or tiling were ~5x faster than clusterdata. For ~50,000 points, clusterdata failed, and slicing was ~2x faster than tiling for some reason. For ~500,000 points, tiling was quite slow (5 min) but was the only routine that didn't have a memory overload. If clusters were 'too large' relative to the window, some points were lost, although these appear as zeros in the cluster_assignment output so the user can be aware of it.
I included a testing routine and the slicing/tiling cluster functions below in case you or others want checking it out. Due to the lack of generality (e.g. it is hard coded for 2dimensional data and a certain clustering routine) I didn't think it was worthy of the file exchange.
best regards,
jay
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
clear all
%% input parameters
num_seed_pts = 100;
max_num_pts_per_cluster = 200;
cluster_spread = 0.002;
num_slices_per_dim = 20;
clust_dist_thr = cluster_spread;
%% generate test 2D cluster data and view it
seed_pts = [rand(num_seed_pts,1) rand(num_seed_pts,2)];
num_pts_per_cluster = round(max_num_pts_per_cluster*rand(num_seed_pts,1));
X = [];
ctr = 1;
for k = 1:num_seed_pts
x = seed_pts(k,1) + cluster_spread*rand(num_pts_per_cluster(k),1);
y = seed_pts(k,2) + cluster_spread*rand(num_pts_per_cluster(k),1);
X(ctr:ctr+num_pts_per_cluster(k)1,:) = [x y];
ctr = ctr + num_pts_per_cluster(k);
end
figure(1)
plot(X(:,1),X(:,2),'.','markersize',1)
xlim([0.1 1.1])
ylim([0.1 1.1])
clear num_seed_pts seed_pts max_num_pts_per_cluster num_pts_per_clutser ctr k x y num_pts_per_cluster
%% do the cluster analysis
try % conventional (unsliced/untiled) cluster analysis
tic
cluster_assignments1 = clusterdata(X,'distance','euclid','linkage','single','cutoff',...
clust_dist_thr,'criterion','distance');
conv_time = toc;
catch
conv_time = 0;
cluster_assignments1 = zeros(size(X,1),1);
end
try % 1D sliced cluster analysis
tic
cluster_assignments2 = clusterdata_slices(X,num_slices_per_dim,clust_dist_thr);
clust_1Dslices_time = toc;
catch
clust_1Dslices_time = 0;
cluster_assignments2 = zeros(size(X,1),1);
end
try % 2D, tiled cluster analysis
tic
cluster_assignments3 = clusterdata_tiles(X,num_slices_per_dim,clust_dist_thr);
clust_2Dslices_time = toc;
catch
clust_2Dslices_time = 0;
cluster_assignments3 = zeros(size(X,1),1);
end
%% results
disp(['number of input points=' num2str(size(X,1)) '; slices per dim=' num2str(num_slices_per_dim)])
disp(['conventional time=' num2str(conv_time) 's; ' ...
'1D slices time=' num2str(clust_1Dslices_time) 's; ' ...
'2D tiling time=' num2str(clust_2Dslices_time) 's'])
disp(['clusterdata lost points=' num2str(sum(cluster_assignments1==0)) '; ' ...
'1D slices lost points=' num2str(sum(cluster_assignments2==0)) '; ' ...
'2D tiles lost points=' num2str(sum(cluster_assignments3==0))])
disp([' '])
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function cluster_assignments = clusterdata_slices(X,num_slices,clust_dist_thr)
% The function clusterdata_tiles can perform cluster analysis on 2D data. It
% is based on clusterdata from the statistical toolbox but can handle larger
% input matrices. Note that the entire cluster heierarchy is not returned,
% but instead, the bottom level clusters based on a distance threshold provided
% by the user. num_slices should be chosen to be large enough that no one slice
% has so many points that it will overload clusterdata, but not so large that a
% slice is smaller than 4x the size of the largest expected cluster.
%
% The data is divided into num_slices slices. All clusters within a hard
% threshold of clust_dist_thr are identified within the slice but
% excluding clusters whose center positions are within 1/4 the height of
% a slice from the leading edge. The routine then advances by half a slice
% in order to pick up clusters from the edge of the previously analyzed
% region as well as clusters within the new region. The process is repeated.
%
% Unassigned points may occur when the cluster size is larger than 1/4 of a
% slice's height. num_unassigned_points = sum(cluster_assignments==0).
%
% SYNTAX: cluster_assignments = clusterdata_slices(X,num_slices,clust_dist_thr)
% X; % input from user, Nx2 array containing 2D position information
% num_slices = 5; % input from user, number of slices
% clust_dist_thr = 0.05;
% code starts here
num_grid_pieces = num_slices*2;
ygrid = linspace(min(X(:,2)),1.1*max(X(:,2)),num_grid_pieces+1);
grid_inc = ygrid(2)ygrid(1);
remaining_X_inds = 1:size(X,1);
cluster_assignments = zeros(size(X,1),1);
ctr = 1;
% damn all these indices are confusing!
for m = 1:num_grid_pieces1
[n,bins_remaining_points] = histc(X(remaining_X_inds,2),ygrid);
remaining_points_inds_in_curr_bin = find(bins_remaining_points==m  bins_remaining_points==m+1);
if length(remaining_points_inds_in_curr_bin)>1
Xsubset = X(remaining_X_inds(remaining_points_inds_in_curr_bin),:);
T2 = clusterdata(Xsubset,'distance','euclid','linkage','single','cutoff',...
clust_dist_thr,'criterion','distance');
inds_to_dump = [];
num_clusters = max(T2);
for k = 1:num_clusters
Xsubset_cluster_inds = find(T2(:)==k);
% if the clutster is not at the edge of the bin, keep the
% cluster and remove the points from the master list
if or(mean(Xsubset(Xsubset_cluster_inds,2))<ygrid(m+2)0.5*grid_inc,m==num_grid_pieces1)
clust_inds = remaining_X_inds(remaining_points_inds_in_curr_bin(Xsubset_cluster_inds));
inds_to_dump = union(inds_to_dump,clust_inds);
cluster_assignments(clust_inds) = ctr;
ctr = ctr + 1;
end
end
remaining_X_inds = setdiff(remaining_X_inds,inds_to_dump);
end
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function cluster_assignments = clusterdata_tiles(X,num_slices_per_dim,clust_dist_thr)
% The function clusterdata_tiles can perform cluster analysis on 2D data. It
% is based on clusterdata from the statistical toolbox and clusterdata_slices
% but can handle larger input matrices. Note that the entire cluster heierarchy
% is not returned, but instead, the bottom level clusters based on a distance
% threshold provided by the user. num_slices_per_dim should be chosen to be
% large enough that no one slice has so many points that it will overload
% clusterdata, but not so large that half a slice is smaller than 4x the size
% of the largest expected cluster.
%
% The data is divided into num_slices_per_dim^2 tiles. All clusters within a
% hard threshold of clust_dist_thr are identified within the tile but
% excluding clusters whose center positions are within 1/4 the of a tile from
% the leading edges. The routine then advances by half a tile in order to pick
% up clusters from the edge of the previously analyzed region as well as clusters
% within the new region. The process is repeated.
%
% Unassigned points may occur when the cluster size is larger than 1/4 of a
% tile's width/height. num_unassigned_points = sum(cluster_assignments==0).
%
% SYNTAX: cluster_assignments = clusterdata_slices(X,num_slices_per_dim,clust_dist_thr)
% X; % input from user, Nx2 array containing 2D position information
% num_slices_per_dim = 5; % input from user, number of slices
% clust_dist_thr = 0.05;
% code starts here
num_grid_pieces = num_slices_per_dim*2;
xgrid = linspace(min(X(:,1)),1.1*max(X(:,1)),num_grid_pieces+1);
grid_inc = xgrid(2)xgrid(1);
remaining_X_inds = 1:size(X,1);
cluster_assignments = zeros(size(X,1),1);
ctr = 1;
h=waitbar(0,'tile clustering in progress');
% damn all these indices are confusing!
try
for m = 1:num_grid_pieces1
[n,bins_remaining_points] = histc(X(remaining_X_inds,1),xgrid);
remaining_points_inds_in_curr_bin = find(bins_remaining_points==m  bins_remaining_points==m+1);
if length(remaining_points_inds_in_curr_bin)>1
Xsubset = X(remaining_X_inds(remaining_points_inds_in_curr_bin),:);
T2 = clusterdata_slices(Xsubset,num_slices_per_dim,clust_dist_thr);
inds_to_dump = [];
num_clusters = max(T2);
for k = 1:num_clusters
Xsubset_cluster_inds = find(T2(:)==k);
% if the clutster is not at the edge of the bin, keep the
% cluster and remove the points from the master list
if or(mean(Xsubset(Xsubset_cluster_inds,1))<xgrid(m+2)0.5*grid_inc,m==num_grid_pieces1)
clust_inds = remaining_X_inds(remaining_points_inds_in_curr_bin(Xsubset_cluster_inds));
inds_to_dump = union(inds_to_dump,clust_inds);
cluster_assignments(clust_inds) = ctr;
ctr = ctr + 1;
end
end
remaining_X_inds = setdiff(remaining_X_inds,inds_to_dump);
end
waitbar(m/(num_grid_pieces1),h,['tile clustering in progress']);
end
catch
end
close(h)
