No BSD License  

Highlights from
medoultierfilt

image thumbnail
from medoultierfilt by Colin Clarke
code to remove outliers from a mulitvariate dataset using the median

medoutlierfilt(x,outlier_cut,plot_state)
function [filtered_data] = medoutlierfilt(x,outlier_cut,plot_state)

%MEDOUTLIERFILT - remove outliers from a multivariate data set using the
%   median of each column
%
%   [stats_data, filtered_data] = medoutlierfilt(x,outlier_cut,plot_state)
%   removes possible outliers froma  data set, X,  by specifiying a cut off. 
%   OUTLIER_CUT is a cut off is a multiple of the inter quartile range above 
%   Q3 and below Q1, default value is the same as  BOXPLOT function.
%   
%   Plot_state = 1 for on, 0 for off, DEFAULT = ON %   
%   
%   EXAMPLE:    
%   load count.dat;
%   [stats, filtered_data] = medoutlierfilt(count,1,1)
%   
%   Inspired by quartile.m by Chris D. Larson
%   See also: BOXPLOT, QUARTILE 
%   Colin Clarke 2006
%   Cranfield Univeristy



if nargin < 3
    plot_state  = 1; 
end

if nargin < 2
    outlier_cut = 1.5; 
end


num_column = (min(size(x)));
%num_outliers = zeros(1,1:num_column);

%create the output matrix
filtered_data = x;


% sort the data to be filtered 
sorted_data = sort(filtered_data);

% remove outliers for each column

for i = 1:num_column
    
    current_column = sorted_data(:,i);
    
    % calculate the 25th percentile
    stats_data(1, i)  = median(current_column(find(current_column<median(current_column))));
    % calculate the 50th percentile
    stats_data(2,i)  = median(current_column);
    % calculate the 75th percentile
    stats_data(3,i)  = median(current_column(find(current_column>median(current_column))));
    % calculate the interquartile range of each column
    stats_data(4, i) =  stats_data(3, i) - stats_data(1, i);
    % calculate the semi interquartile range
    stats_data(5, i) =  stats_data(4, i)/2;
    
    
    
    % return the index of each outlier above and below Q1 and Q3 
    outliers_below = find(current_column<stats_data(1, i)-outlier_cut*stats_data(4, i));
    outliers_above = find(current_column>stats_data(3, i)+outlier_cut*stats_data(4, i));

    % find outlier values in the current column 
    if length(current_column)>0,
        outliersQ1 = current_column(outliers_below);
    else
        outliersQ1 = [];
    end
    
    
    
    
    if length(outliers_above)>0,
        outliersQ3 = current_column(outliers_above);
    else
        outliersQ3 = [];
    end
    
    % determine the total number of outliers in the c
    num_outliers(:, i) =  (length(outliersQ1)+length(outliersQ3));
    
    %Q1
    %find the outlier values and remove from the matrix
   
    if length(outliersQ1 < 0 )
        for j = 1:length(outliersQ1)
            value = outliersQ1(j);
            index = find(filtered_data(:,i) == value);
            filtered_data(index,:) = [];
        end
    end
   
    
    %Q3
    %find the outlier values above 75th quartile and remove from the matrix
    if length(outliersQ3 < 0 )
        for j = 1:length(outliersQ3)
            A = outliersQ3(j);
            index = find(filtered_data(:,i) == A);
            filtered_data(index,:) = [];
        end
    end
end

%display statistics for original data




disp(['Column:                              ' num2str(1:num_column     , '%f\t')]);
disp(['Mean:                                ' num2str(mean(x)          , '%f\t')]);
disp(['SD:                                  ' num2str(std(x)           , '%f\t')]);
disp(['Quartile1 (25th):                    ' num2str(stats_data(1,:)  , '%f\t')]);
disp(['Quartile1 (50th):                    ' num2str(stats_data(2,:)  , '%f\t')]);
disp(['Quartile1 (75th):                    ' num2str(stats_data(3,:)  , '%f\t')]);
disp(['Inter quartile range:                ' num2str(stats_data(4,:)  , '%f\t')]);
disp(['Semi Interquartile Deviation:        ' num2str(stats_data(5,:)  , '%f\t')]);
disp(['Number of outliers :                 ' num2str((num_outliers)   , '%f\t')]);



% boxplots to compare orignal and filtered data sets. 

if plot_state  == 1

subplot(1, 2, 1)
boxplot(x,'notch','on')
title('With outliers')
subplot(1, 2, 2)
boxplot(filtered_data,'notch','on')
title('Minus outliers')

end





Contact us at files@mathworks.com