Normalize Compositional Distance

by

 

28 Nov 2012 (Updated )

Calculate difference and differentiation potential between particle-size distributions.

NCD(varargin)
function[varargout] = NCD(varargin)

% [NCD,DP] = NCD(files) calculates the
% Normalized Compositional Difference (NCD) (aka Normalized Difference)
% between compositions and the Differentiation Potential (DP)
% between groups of compositions.
% 
% 
% INPUTS
% 
% NCD(files) is a 1 by nf cell array of the input file paths.
% If files is not specified they must be selected manually
% using a file open interface dialog.
% 
% Compositions are given in CSV files as row-ordered arrays,
% i.e. 1 composition per row.
% 
% All compositions must be vectors of the same length, i.e. all arrays have
% the same number of columns. The number of compositions (i.e. rows)
% can vary between files and is not limited.
% 
% Groups of compositions are stored in seperate CSV files,
% each with an identical number of columns.
% 
% The contents of CSV files should be numeric and contain no characters.
% 
% 
% OUTPUTS
% 
% Results are written to 'NCD_analysis.csv', containing:
% 
% 1. Mean, median and standard deviation of NCD between compositions in
% each file (group),
% 
% 2. Median NCD between files (groups), DP scores and significance at 95%
% confidence level (1 == pass, 0 == fail). The median NCD between files is
% calculated pairwise, so that each composition in File A is compared to
% each composition in File B.
% 
% 
% SAM ROBERSON -- RIJSWIJK, THE NETHERLANDS -- 17.10.2012
% 
% ROBERSON.SAM@GMAIL.COM

%% Get files
if nargin == 0
    [files,pathname] = uigetfile('*.csv','MultiSelect','On');
    [~,nf] = size(files);
    % check number of files is > 1
    if ischar(files) || isempty(files)
        error('Select two or more files to compare');
    end
    % add pathname
    for i = 1:length(files)
        files{i} = [pathname files{i}];
    end
elseif nargin == 1
    files = varargin{1};
    if isempty(files)
        error('Select two or more files to compare');
    end
    pathname = fileparts(files{1});
    [~,nf] = size(files);
end
%% check dimensions - number of size classes should be
% identical in each array and each file
nr = zeros(nf,1);
nc = zeros(nf,1);
for i = 1:nf
    test = csvread(files{i});
    [nr(i),nc(i)] = size(test);
end
if sum(diff(nc)) ~=0
    error('The number of columns in each file should be identical: Check the input files and try again');
end
%% load data from each file
mc = max(nc);
mr = max(nr);
psd = zeros(mr,mc,nf);
for i = 1:nf
    data = csvread(files{i});
    if isempty(data)
        error('Data files should be formatted as commas separated values: only numeric values are allowed');
    end
    psd(1:nr(i),1:nc(i),i) = data;
end
%% log ratio tranformation
q = logratio(zm(psd));
%% calculate NCD matrix for each file
meanncd = zeros(nf,1);
medianncd = zeros(nf,1);
stdncd = zeros(nf,1);
d2 = zeros(nf,1);
for i = 1:nf
    diffmat = ones(nr(i))*NaN;
    for j = 1:nr(i)
        for k = 1:nr(i)
            if j > k
                diffmat(j,k) = ed(q(j,:,i),q(k,:,i))/(nc(i)-1);
            end
        end
    end
    sd = diffmat(:);
    sd = sd(~isnan(sd));
    meanncd(i) = mean(sd);
    medianncd(i) = median(sd);
    stdncd(i) = std(sd);
    d2(i) = length(sd)*2;
end
%% calculate NCD between populations (files) 
% Differentiation potential
% Test at 95% confidence level (a = 0.05);
ncdmatrix = ones(sum(1:nf-1),1)*NaN;
dp = ones(sum(1:nf-1),1)*NaN;
cl95 = false(sum(1:nf-1),1);
a = 0.05;
count = 0;
z = 0:0.01:10;
for i = 1:nf
    for j = 1:nf
         if j > i
            count = count + 1;
            diffmat = ones(nr(i),nr(j))*NaN;
            for x = 1:nr(i)
                for y = 1:nr(j)
%                     if x > y
                        diffmat(x,y) = ed(q(x,:,i),q(y,:,j))/(nc(i)-1);
%                     end
                end
            end
            sd = diffmat(:);
            sd = sd(~isnan(sd));
            ncdmatrix(count) = median(sd);
            dp(count) = median(sd) / sqrt((((medianncd(i)^2)*(d2(i)-1)) + ((medianncd(j)^2)*(d2(j)-1)))/(d2(i)+d2(j)-1));
            if dp(count) > 0
                p = cdf('t',z,d2(i)+d2(j)-2);
                l = find(p>=1-a,1);
                cl95(count) = dp(count) >= z(l);
            end
         end
    end
end
d = dp<0;
dp(d) = 0;
%% write results to output files
outfile = [pathname ' (NCD_analysis).csv'];
fid1 = fopen(outfile,'w');
% write header
fprintf(fid1,'%s,%s,%s,%s\n','Variability within groups',...
    'median NCD','mean NCD','std NCD');
% write data with row labels
for i = 1:nf
    [~,name] = fileparts(files{i});
    fprintf(fid1,'%s,%4.3f,%4.3f,%4.3f\n',name,medianncd(i),meanncd(i),stdncd(i));
end
fprintf(fid1,'Median,%4.3f,%4.3f,%4.3f\n,\n,\n',median(medianncd),median(meanncd),median(stdncd));

fprintf(fid1,'%s, %s , %s,%s\n','Distance between groups',...
    'median NCD','DP','Pass at 95% confidence level [1 = pass / 0 = fail]');
format = '%s : %s, %4.3f , %4.3f , %i\n';
count = 0;
for i = 1:nf
    for j = 1:nf
        if j > i
            count = count + 1;
            [~,name1] = fileparts(files{i});
            [~,name2] = fileparts(files{j});
            fprintf(fid1,format,name1,name2,ncdmatrix(count),dp(count),cl95(count));
        end
    end
end
%% output variabels
if nargout > 0
    varargout{1} = [medianncd meanncd stdncd];
end
if nargout > 1
    varargout{2} =  [ncdmatrix dp cl95];
end

end
%% EUCLIDEAN DISTANCE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function[z] = ed(x,y)

% ed(x,y) CALCULATES THE EUCLIDIEAN DISTANCE BETWEEN
% VECTORS x AND y
%
% SAM ROBERSON - TU DELFT, THE NETHERLANDS - 26.11.2010

dif = x-y;
dif = dif.^2;
z = sqrt(sum(dif,2));
end

%% ZERO-REPLACEMENT %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function[r] = zm(x,varargin)
% r = zm(x,accuracy) IS MULPLICATATIVE ZERO REPLACEMENT FOR COMPOSITIONAL
% ARRAY x. x IS AN m BY n ARRAY OF m COMPOSITIONS CONSITING OF n PARTS
% EACH. accuracy IS AN OPTIONAL INPUT THAT DEFINES ANALYTICAL ACCURACY
%
% USE THIS PRIOR TO LOG-RATIO TRANFORMATION IF ZEROS ARE PRESENT IN THE
% DATA.
%
% THIS FUNCTION IS BASED ON 'Dealing with zeros and missing values in
% compositional data sets using nonparametric imputation' in  Math. Geol.
% by Martin-Fernandez et al. (2003).
%
% SAM ROBERSON - TU DELFT, THE NETHERLANDS - 25.08.2010

% Measurement threshold
if nargin ==1
    threshold = 1e-3;
else
    threshold = varargin{1};
end
delta = threshold * 0.65;
% closure
c = 100./(sum(x,2));
x = bsxfun(@times,x,c);
e = x <= 0;
g = sum(sum(e));
% CALCULATE
f = g.*delta./100;
f = 1-f;
r = bsxfun(@times,x,f);
r(e) = delta;
end

%% LOG-RATIO TRANSFORM %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function[xout] = logratio(x)

% [xout] = logratio(x) COMPUTES THE CENTERED LOG-RATIO TRANSFORM
% OF ROWS IN MATRIX x
%
% After Gerald van den Boogaart http://www.stat.boogaart.de/
%
%
% SAM ROBERSON - TU DELFT, THE NETHERLANDS - 10.11.2010

if min(min(x)) <= 0
    errordlg('Remove zero and negative values from composition before clr transformation');
end
n = size(x,2);
gm = exp(sum(log(x),2)./n);
xout = log(bsxfun(@rdivide,x,gm));
end

Contact us