Code covered by the BSD License

# Fit all valid parametric probability distributions to data

### Mike Sheppard (view profile)

06 Feb 2012 (Updated )

ALLFITDIST Fit all valid parametric probability distributions to data.

### Editor's Notes:

This file was selected as MATLAB Central Pick of the Week

allfitdist(data,sortby,varargin)
```function [D PD] = allfitdist(data,sortby,varargin)
%ALLFITDIST Fit all valid parametric probability distributions to data.
%   [D PD] = ALLFITDIST(DATA) fits all valid parametric probability
%   distributions to the data in vector DATA, and returns a struct D of
%   fitted distributions and parameters and a struct of objects PD
%   representing the fitted distributions. PD is an object in a class
%   derived from the ProbDist class.
%
%   [...] = ALLFITDIST(DATA,SORTBY) returns the struct of valid distributions
%   sorted by the parameter SORTBY
%        NLogL - Negative of the log likelihood
%        BIC - Bayesian information criterion (default)
%        AIC - Akaike information criterion
%        AICc - AIC with a correction for finite sample sizes
%
%   [...] = ALLFITDIST(...,'DISCRETE') specifies it is a discrete
%   distribution and does not attempt to fit a continuous distribution
%   to the data
%
%   [...] = ALLFITDIST(...,'PDF') or (...,'CDF') plots either the PDF or CDF
%   of a subset of the fitted distribution. The distributions are plotted in
%   order of fit, according to SORTBY.
%
%   List of distributions it will try to fit
%     Continuous (default)
%       Beta
%       Birnbaum-Saunders
%       Exponential
%       Extreme value
%       Gamma
%       Generalized extreme value
%       Generalized Pareto
%       Inverse Gaussian
%       Logistic
%       Log-logistic
%       Lognormal
%       Nakagami
%       Normal
%       Rayleigh
%       Rician
%       t location-scale
%       Weibull
%
%     Discrete ('DISCRETE')
%       Binomial
%       Negative binomial
%       Poisson
%
%   Optional inputs:
%   [...] = ALLFITDIST(...,'n',N,...)
%   For the 'binomial' distribution only:
%      'n'            A positive integer specifying the N parameter (number
%                     of trials).  Not allowed for other distributions. If
%                     'n' is not given it is estimate by Method of Moments.
%                     If the estimated 'n' is negative then the maximum
%                     value of data will be used as the estimated value.
%   [...] = ALLFITDIST(...,'theta',THETA,...)
%   For the 'generalized pareto' distribution only:
%      'theta'        The value of the THETA (threshold) parameter for
%                     the generalized Pareto distribution. Not allowed for
%                     other distributions. If 'theta' is not given it is
%                     estimated by the minimum value of the data.
%
%   Note: ALLFITDIST does not handle nonparametric kernel-smoothing,
%
%
%   EXAMPLE 1
%     Given random data from an unknown continuous distribution, find the
%     best distribution which fits that data, and plot the PDFs to compare
%     graphically.
%        data = normrnd(5,3,1e4,1);         %Assumed from unknown distribution
%        [D PD] = allfitdist(data,'PDF');   %Compute and plot results
%        D(1)                               %Show output from best fit
%
%   EXAMPLE 2
%     Given random data from a discrete unknown distribution, with frequency
%     data, find the best discrete distribution which would fit that data,
%     sorted by 'NLogL', and plot the PDFs to compare graphically.
%        data = nbinrnd(20,.3,1e4,1);
%        values=unique(data); freq=histc(data,values);
%        [D PD] = allfitdist(values,'NLogL','frequency',freq,'PDF','DISCRETE');
%        PD{1}
%
%  EXAMPLE 3
%     Although the Geometric Distribution is not listed, it is a special
%     case of fitting the more general Negative Binomial Distribution. The
%     parameter 'r' should be close to 1. Show by example.
%        data=geornd(.7,1e4,1); %Random from Geometric
%        [D PD]= allfitdist(data,'PDF','DISCRETE');
%        PD{1}
%
%  EXAMPLE 4
%     Compare the resulting distributions under two different assumptions
%     of discrete data. The first, that it is known to be derived from a
%     Binomial Distribution with known 'n'. The second, that it may be
%     Binomial but 'n' is unknown and should be estimated. Note the second
%     scenario may not yield a Binomial Distribution as the best fit, if
%     'n' is estimated incorrectly. (Best to run example a couple times
%     to see effect)
%        data = binornd(10,.3,1e2,1);
%        [D1 PD1] = allfitdist(data,'n',10,'DISCRETE','PDF'); %Force binomial
%        [D2 PD2] = allfitdist(data,'DISCRETE','PDF');       %May be binomial
%        PD1{1}, PD2{1}                             %Compare distributions
%

%    Mike Sheppard

%% Check Inputs
if nargin == 0
data = 10.^((normrnd(2,10,1e4,1))/10);
sortby='BIC';
varargin={'CDF'};
end
if nargin==1
sortby='BIC';
end
sortbyname={'NLogL','BIC','AIC','AICc'};
if ~any(ismember(lower(sortby),lower(sortbyname)))
oldvar=sortby; %May be 'PDF' or 'CDF' or other commands
if isempty(varargin)
varargin={oldvar};
else
varargin=[oldvar varargin];
end
sortby='BIC';
end
if nargin < 2, sortby='BIC'; end
distname={'beta', 'birnbaumsaunders', 'exponential', ...
'extreme value', 'gamma', 'generalized extreme value', ...
'generalized pareto', 'inversegaussian', 'logistic', 'loglogistic', ...
'lognormal', 'nakagami', 'normal', ...
'rayleigh', 'rician', 'tlocationscale', 'weibull'};
if ~any(strcmpi(sortby,sortbyname))
error('allfitdist:SortBy','Sorting must be either NLogL, BIC, AIC, or AICc');
end
%Input may be mixed of numeric and strings, find only strings
vin=varargin;
strs=find(cellfun(@(vs)ischar(vs),vin));
vin(strs)=lower(vin(strs));
%Next check to see if 'PDF' or 'CDF' is listed
numplots=sum(ismember(vin(strs),{'pdf' 'cdf'}));
if numplots>=2
error('ALLFITDIST:PlotType','Either PDF or CDF must be given');
end
if numplots==1
plotind=true; %plot indicator
indxpdf=ismember(vin(strs),'pdf');
plotpdf=any(indxpdf);
indxcdf=ismember(vin(strs),'cdf');
vin(strs(indxpdf|indxcdf))=[]; %Delete 'PDF' and 'CDF' in vin
else
plotind=false;
end
%Check to see if discrete
strs=find(cellfun(@(vs)ischar(vs),vin));
indxdis=ismember(vin(strs),'discrete');
discind=false;
if any(indxdis)
discind=true;
distname={'binomial', 'negative binomial', 'poisson'};
vin(strs(indxdis))=[]; %Delete 'DISCRETE' in vin
end
strs=find(cellfun(@(vs)ischar(vs),vin));
n=numel(data); %Number of data points
data = data(:);
D=[];
%Check for NaN's to delete
deldatanan=isnan(data);
%Check to see if frequency is given
indxf=ismember(vin(strs),'frequency');
if any(indxf)
freq=vin{1+strs((indxf))}; freq=freq(:);
if numel(freq)~=numel(data)
error('ALLFITDIST:PlotType','Matrix dimensions must agree');
end
delfnan=isnan(freq);
data(deldatanan|delfnan)=[]; freq(deldatanan|delfnan)=[];
%Save back into vin
vin{1+strs((indxf))}=freq;
else
data(deldatanan)=[];
end

%% Run through all distributions in FITDIST function
warning('off','all'); %Turn off all future warnings
for indx=1:length(distname)
try
dname=distname{indx};
switch dname
case 'binomial'
PD=fitbinocase(data,vin,strs); %Special case
case 'generalized pareto'
PD=fitgpcase(data,vin,strs); %Special case
otherwise
%Built-in distribution using FITDIST
PD = fitdist(data,dname,vin{:});
end

NLL=PD.NLogL; % -Log(L)
%If NLL is non-finite number, produce error to ignore distribution
if ~isfinite(NLL)
error('non-finite NLL');
end
num=length(D)+1;
PDs(num) = {PD}; %#ok<*AGROW>
k=numel(PD.Params); %Number of parameters
D(num).DistName=PD.DistName;
D(num).NLogL=NLL;
D(num).BIC=-2*(-NLL)+k*log(n);
D(num).AIC=-2*(-NLL)+2*k;
D(num).AICc=(D(num).AIC)+((2*k*(k+1))/(n-k-1));
D(num).ParamNames=PD.ParamNames;
D(num).ParamDescription=PD.ParamDescription;
D(num).Params=PD.Params;
D(num).Paramci=PD.paramci;
D(num).ParamCov=PD.ParamCov;
D(num).Support=PD.Support;
catch err %#ok<NASGU>
%Ignore distribution
end
end
warning('on','all'); %Turn back on warnings
if numel(D)==0
error('ALLFITDIST:NoDist','No distributions were found');
end

%% Sort distributions
indx1=1:length(D); %Identity Map
sortbyindx=find(strcmpi(sortby,sortbyname));
switch sortbyindx
case 1
[~,indx1]=sort([D.NLogL]);
case 2
[~,indx1]=sort([D.BIC]);
case 3
[~,indx1]=sort([D.AIC]);
case 4
[~,indx1]=sort([D.AICc]);
end
%Sort
D=D(indx1); PD = PDs(indx1);

%% Plot if requested
if plotind;
plotfigs(data,D,PD,vin,strs,plotpdf,discind)
end

end

function PD=fitbinocase(data,vin,strs)
%% Special Case for Binomial
% 'n' is estimated if not given
vinbino=vin;
%Check to see if 'n' is given
indxn=any(ismember(vin(strs),'n'));
%Check to see if 'frequency' is given
indxfreq=ismember(vin(strs),'frequency');
if ~indxn
%Use Method of Moment estimator
%E[x]=np, V[x]=np(1-p) -> nhat=E/(1-(V/E));
if isempty(indxfreq)||~any(indxfreq)
%Raw data
mnx=mean(data);
nhat=round(mnx/(1-(var(data)/mnx)));
else
%Frequency data
freq=vin{1+strs(indxfreq)};
m1=dot(data,freq)/sum(freq);
m2=dot(data.^2,freq)/sum(freq);
mnx=m1; vx=m2-(m1^2);
nhat=round(mnx/(1-(vx/mnx)));
end
%If nhat is negative, use maximum value of data
if nhat<=0, nhat=max(data(:)); end
vinbino{end+1}='n'; vinbino{end+1}=nhat;
end
PD = fitdist(data,'binomial',vinbino{:});
end

function PD=fitgpcase(data,vin,strs)
%% Special Case for Generalized Pareto
% 'theta' is estimated if not given
vingp=vin;
%Check to see if 'theta' is given
indxtheta=any(ismember(vin(strs),'theta'));
if ~indxtheta
%Use minimum value for theta, minus small part
thetahat=min(data(:))-10*eps;
vingp{end+1}='theta'; vingp{end+1}=thetahat;
end
PD = fitdist(data,'generalized pareto',vingp{:});
end

function plotfigs(data,D,PD,vin,strs,plotpdf,discind)
%Plot functionality for continuous case due to Jonathan Sullivan
%Modified by author for discrete case

%Maximum number of distributions to include
%max_num_dist=Inf;  %All valid distributions
max_num_dist=4;

%Check to see if frequency is given
indxf=ismember(vin(strs),'frequency');
if any(indxf)
freq=vin{1+strs((indxf))};
end

figure

%% Probability Density / Mass Plot
if plotpdf
if ~discind
%Continuous Data
nbins = max(min(length(data)./10,100),50);
xi = linspace(min(data),max(data),nbins);
dx = mean(diff(xi));
xi2 = linspace(min(data),max(data),nbins*10)';
fi = histc(data,xi-dx);
fi = fi./sum(fi)./dx;
inds = 1:min([max_num_dist,numel(PD)]);
ys = cellfun(@(PD) pdf(PD,xi2),PD(inds),'UniformOutput',0);
ys = cat(2,ys{:});
bar(xi,fi,'FaceColor',[160 188 254]/255,'EdgeColor','k');
hold on;
plot(xi2,ys,'LineWidth',1.5)
legend(['empirical',{D(inds).DistName}],'Location','NE')
xlabel('Value');
ylabel('Probability Density');
title('Probability Density Function');
grid on
else
%Discrete Data
xi2=min(data):max(data);
%xi2=unique(x)'; %If only want observed x-values to be shown
indxf=ismember(vin(strs),'frequency');
if any(indxf)
fi=zeros(size(xi2));
fi((ismember(xi2,data)))=freq; fi=fi'./sum(fi);
else
fi=histc(data,xi2); fi=fi./sum(fi);
end
inds = 1:min([max_num_dist,numel(PD)]);
ys = cellfun(@(PD) pdf(PD,xi2),PD(inds),'UniformOutput',0);
ys=cat(1,ys{:})';
bar(xi2,[fi ys]);
legend(['empirical',{D(inds).DistName}],'Location','NE')
xlabel('Value');
ylabel('Probability Mass');
title('Probability Mass Function');
grid on
end
else

%Cumulative Distribution
if ~discind
%Continuous Data
[fi xi] = ecdf(data);
inds = 1:min([max_num_dist,numel(PD)]);
ys = cellfun(@(PD) cdf(PD,xi),PD(inds),'UniformOutput',0);
ys = cat(2,ys{:});
if max(xi)/min(xi) > 1e4; lgx = true; else lgx = false; end
subplot(2,1,1)
if lgx
semilogx(xi,fi,'k',xi,ys)
else
plot(xi,fi,'k',xi,ys)
end
legend(['empirical',{D(inds).DistName}],'Location','NE')
xlabel('Value');
ylabel('Cumulative Probability');
title('Cumulative Distribution Function');
grid on
subplot(2,1,2)
y = 1.1*bsxfun(@minus,ys,fi);
if lgx
semilogx(xi,bsxfun(@minus,ys,fi))
else
plot(xi,bsxfun(@minus,ys,fi))
end
ybnds = max(abs(y(:)));
ax = axis;
axis([ax(1:2) -ybnds ybnds]);
legend({D(inds).DistName},'Location','NE')
xlabel('Value');
ylabel('Error');
title('CDF Error');
grid on
else
%Discrete Data
indxf=ismember(vin(strs),'frequency');
if any(indxf)
[fi xi] = ecdf(data,'frequency',freq);
else
[fi xi] = ecdf(data);
end
%Check unique xi, combine fi
[xi,ign,indx]=unique(xi); %#ok<ASGLU>
fi=accumarray(indx,fi);
inds = 1:min([max_num_dist,numel(PD)]);
ys = cellfun(@(PD) cdf(PD,xi),PD(inds),'UniformOutput',0);
ys=cat(2,ys{:});
subplot(2,1,1)
stairs(xi,[fi ys]);
legend(['empirical',{D(inds).DistName}],'Location','NE')
xlabel('Value');
ylabel('Cumulative Probability');
title('Cumulative Distribution Function');
grid on
subplot(2,1,2)
y = 1.1*bsxfun(@minus,ys,fi);
stairs(xi,bsxfun(@minus,ys,fi))
ybnds = max(abs(y(:)));
ax = axis;
axis([ax(1:2) -ybnds ybnds]);
legend({D(inds).DistName},'Location','NE')
xlabel('Value');
ylabel('Error');
title('CDF Error');
grid on
end
end

end
```