Code covered by the BSD License  

Highlights from
DM Utils (data mining utils)

image thumbnail

DM Utils (data mining utils)

by

 

15 Jan 2012 (Updated )

The tools for dealing with distance matrix, improving data mining capabilities

[Phdl]=pair_dist_spmd(X,fun,varargin)
function [Phdl]=pair_dist_spmd(X,fun,varargin)
% parallel version of paiwise distance similar to PDIST function
% calculates out_hdl=pair_dist_par(X,fun,parameters)
% input data X - rows, fun - distance function (single row to row !!!),
% parameters - optional parameter passed to function.
%
% X correspond to observations, columns correspond to variables.
% fun is: or one of predefined distance functions 'manhattan', 'euclidean'
% 'cosine','minp_manhattan', 'minkowski' with parameter p (order)
% or a handler to a distance function of a form d=fun(x,y,params)
% paraneters is a variable list of parameters, x and y are single row
% vectors    DIFFERENT to the pdist.
% if no fun is given the 'euclidean' is default
%
% It is intended for rather large distance matrices.
% The calculation is done using SPMD model with shared memory (SM)
% as IPC therefore it requires to have in path  J. Dillon's sharedmatrix (or win SharedMemory)
% The function returns a handler to sharedmemory segment containing
% results. The cache hit-ratio is improved by the interleaving.
% To use it You need to attach to the shared variable
%
% out = sharedmatrix('attach', out_hdl);
% or
% out = SharedMemory('attach', out_hdl);
%
% remember also to remove unused variable (there is no grabage collector)
% sharedmatrix('detach', out_hdl, out);
% sharedmatrix('free', out_hdl);
%
%   Copyright 2013 - P. Skurowski
%   v. 0.2 - Minkowski distance added, modified minperm (faster)
%   v. 0.2a - reorganized per CPU parfor (additionally interleaved)
%   v. 0.3 - shared memory employed

if nargin < 2
    fun='euclidean';
end
if ~isa(fun, 'function_handle')
    if isstr(fun)
        fun=name2fun(fun);
    else
        error('Provide a handler to distfun(x,y,params) or one of builtin names')
    end
end

sys=computer();
if strcmp(sys,'PCWIN')
    win=1;
    if exist('SharedMemory','file')~=3
        error('Win version require compiled shared memory SharedMemory mex module')
    end
else
    win=0;
    if exist('sharedmatrix','file')~=3
        error('UX version require compiled shared memory sharedmatrix mex module')
    end
end



%%
try
    if matlabpool('size') == 0
        matlabpool('local')
    end
catch exception%if no parallel accessible start sequential
    if strcmp(exception.identifier,'MATLAB:UndefinedFunction')
        warning('Parallel Computing Toolbox not found invoking sequential version') %#ok<WNTAG>
        P=pair_dist_seq(X,fun,varargin{:});
        if win==1 % generate random handlers
            Phdl=char(uint8(rand(1,3)*255));
            SharedMemory('clone', Phdl, P);
        else
            Phdl=uint16(rand(1)*65535);
            sharedmatrix('clone', Phdl, P);
        end
        clear X;
        clear P; %free
        return
    else
        rethrow(exception)
    end
end
%%
N=matlabpool('size');
m=size(X,1); % M=(m.^2-m)/2;
I=interleaver(m,N);
P=zeros((m^2-m)/2,1); %preallocate to copy

if win==1 % generate random handlers
    % short text to be fast in windows
    s=char(uint8(rand(1,3)*255));
    o=char(uint8(rand(1,3)*255));
    SharedMemory('clone', s, X);
    SharedMemory('clone', o, P);
else
    s=uint16(rand(1)*65535);
    o=uint16(rand(1)*65535);
    sharedmatrix('clone',s,X);
    sharedmatrix('clone', o, P);
end
clear X;
clear P; %free

%%
parfor n=1:N %parfor - iterate for the CPUs works slightly faster than spmd command
    if win==1
        X = SharedMemory('attach', s);
        wyn = SharedMemory('attach',o);
    else
        X=sharedmatrix('attach',s);
        wyn=sharedmatrix('attach',o);
    end
    for ind=n:N:m-1
        i=I(ind);
        tmp=double(nan(1,m-i));
        idx=0;
        xi=X(i,:);
        for j=(i+1):m
            
            idx=idx+1;
            xj=X(j,:);
            tmp(idx)=feval(fun,xi,xj,varargin{:}); %#ok<PFBNS> %{:} expands the list
        end
        kp=i-1;
        curr=m-i;
        prevs=kp*m-kp.*(kp+1)/2;
        wyn(prevs+1:prevs+curr)=tmp';
        
    end
    if win==1 % detach the shallow copy making Data_Shared safe for Matlab to clear
        SharedMemory('detach', s, X);
        SharedMemory('detach', o, wyn);
        SharedMemory('free', s);
    else
        sharedmatrix('detach', s, X);
        sharedmatrix('detach', o, wyn);
        
    end
end
%%
if win==1
    wyn = SharedMemory('attach', o);
    Phdl = o;
    SharedMemory('detach', o, wyn);
else
    sharedmatrix('free', s);
    wyn=sharedmatrix('attach',o);
    Phdl = o;
    sharedmatrix('detach', o, wyn);
end

end
%%
function handler=name2fun(fun)

switch fun
    case 'manhattan'
        handler = @(x,y)sum( abs(x-y) );  % manhattan (minkowski 1)
    case 'euclidean'
        handler = @(x,y)sqrt( sum( (x-y).^2 )); % euclid (minkowski 2)
    case 'cosine'
        handler = @(x,y)1-x*y'/sqrt(x*x')/sqrt(y*y'); % cosine distance
    case 'minp_manhattan' %minimal permutation manhattan dist
        handler = @(x,y)min(sum(abs(bsxfun(@minus,x(perms(1:5)),y)),2));
        %handler = @(x,y)min(sum(abs(y(perms(1:5))- ...
        %    x(ones(factorial(length(x)), 1),:)),2));
    case 'minkowski'
        handler = @(x,y,p)( sum( abs(x-y).^p )).^(1/p); % Minkowski
    otherwise
        error('Unknown method.');
end

end
%%
function I=interleaver(m,N) % 1,2,3,4,4,3,2,1,1,2,3,4...
r=mod(m,N);
padding = zeros(N-r+1,1);
I = [(1:m-1)';padding];
w = [false(m-1,1); true(N-r+1,1)];
I=reshape(I,N,[]);
w=reshape(w,N,[]);

I(:,2:2:end)=flipud(I(:,2:2:end));
w(:,2:2:end)=flipud(w(:,2:2:end));
I=reshape(I,[],1);
w=reshape(w,[],1);
I=I(not(w));
end

Contact us