| [C, y, X, n, p, error1]=LMS_trim(y, X, max_fits, max_points, flag2)
|
function [C, y, X, n, p, error1]=LMS_trim(y, X, max_fits, max_points, flag2)
% % Syntax;
% %
% % [C, y, X, n, p, error1]=LMS_trim(y, X, max_fits, max_points, flag2);
% %
% % This program trims the input data sets and trims the combinations of
% % best fit data pairs. The two trimming operations are performed
% % using a quick random uniform distribution to make all possible
% % combinations more equally probable. The output of this program
% % is used to perform the Least Median Trimmed Squares Robust Regression.
% %
% % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %
% % Input Variable Description
% %
% % y is the column vector of the dependent variable.
% %
% % X is the matrix of the independent variable. If it is one dimensional,
% % then it should be a column vector. If X is an empty matrix, then
% % X is assumed to be a column of integers starting from 0.
% %
% % max_fits is the number of best fit pairs of data. The default value
% % does not exceed 1000. The maximum value is 10000.
% %
% % max_points is the number of data points for curve fitting.
% % The maximum value is 100000. The default value is 100000.
% %
% % flag2 = 1 regular linear regression (unconstrained)
% % flag2 = 0 linear regression through the origin
% %
% % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %
% % Output Variable Description
% %
% % C is the matrix of data pairs for the best fit robust regression.
% %
% % y is the trimmed column vector of the dependent variable.
% %
% % X is the trimmed matrix of the independent variable. If it is one
% % dimensional, then it should be a column vector.
% %
% % n is the number of rows of the trimmed data array.
% %
% % p depends on whether the data is fit through the origin or not.
% % If flag2 is 1 then the data is unconstrained and p is the number
% % of columns of X + 1.
% %
% % If flag2~=1, then p is the number of columns of X.
% %
% % error1 is 1 if there is an error otherwise it is 0.
% %
% % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Example='';
% % Establish an exact solution (xe, ye)
%
% xe=(1:10000)';
% ye=10+10*xe;
%
% % Create a noisy data set with an outlier (X, y)
%
% X=randn(size(xe))+xe;
% y=9.5+rand(size(xe))+100*randn(size(xe))+10*(randn(size(xe))+xe);
%
% % Perform the robust median trimmed squares linear regression
% max_fits=100;
% max_points=500;
% flag2=1;
%
% % Outlier data points form a line with a different slope
% % randomly select 40% of the data points to be outliers
% [ndraw]=rand_int(1, length(xe), 0.4*length(X), 1, 1);
% y(ndraw)=1000+1000*ndraw;
% X(ndraw)=ndraw;
%
% [C, yt, Xt, n, p, error1]=LMS_trim(y, X, max_fits, max_points, flag2);
% [LMSout,blms,Rsq]=LMTSreg(yt, Xt, max_fits, max_points);
% % plot the robust solution
% xr=xe';
% yr=polyval([blms(2) blms(1)], xr);
% % plot the typical regression solution
% xp=xr;
% p=polyfit(X, y, 1);
% yp=polyval(p, xp);
%
% figure(1); plot(X, y, 'linestyle', 'none', 'marker', '.', 'markersize', 3, 'markeredgecolor', 'k');
% hold on; plot(xe, ye, 'g', 'linewidth', 1);
% plot(xr, yr, 'r', 'linewidth', 1);
% plot(xp, yp, 'b', 'linewidth', 1);
% legend({'Scattered Data', 'Exact Solution', 'Robust Solution', 'Regular Regression'});
% xlim([1 10000]);
% ylim([1 100000]);
% title({'51% of the data is near the x-axis', '49% of the data is near the y-axis'}, 'fontsize', 20);
% xlabel('x-axis', 'fontsize', 18);
% ylabel('y-axis', 'fontsize', 18);
% set(gca, 'fontsize', 14);
%
% % ***********************************************************
% %
% % This program was written by Edward L. Zechmann
% %
% % date 1 February 2008 updated comments
% %
% % modified 13 February 2008 updated comments
% % added flag2 to cater to regression
% % through the origin and unconstrained
% % regression
% %
% % modified 14 February 2008 updated comments
% % Improved the error handling and default
% % values.
% %
% %
% % ***********************************************************
% %
% % Feel free to modify this code.
% %
% set the flag to null
% set the error to no error
flag=0;
error1=0;
if nargin < 1 || isempty(y)
warning('Not enough input arguments. Return empty array.');
flag=1;
error1=1;
n=1;
y=1;
else
% y must be a column vector
y=y(:);
% n is the length of the data set
n=length(y);
end
if nargin < 2 || isempty(X)
% if X is omitted give it the values 1:n
X=(1:n)';
else
% X must be a 2-dimensional matrix
[mx, nx]=size(X);
if nx > mx
X=X';
end
if ndims(X) > 2
warning('Invalid data set X. Return empty array.');
flag=1;
error1=1;
end
if n~=size(X,1)
warning('The rows of X and y must have the same length');
flag=1;
error1=1;
end
end
pp=size(X,2);
% If not input, set the maximum number of fits
if nargin < 3 || isempty(max_fits)
% default value of max_fits is 1000
max_fits=min([1000, floor(n/pp)]);
end
% make sure that max_fits does not exceed 10000
max_fits=min( [max_fits, floor(n/pp), 10000]);
% If max_points is not an input, set the maximum number of points
% for the input arrays X and y to a reasonable value.
if nargin < 4
max_points=max([min([length(y), 100000]), max_fits*pp]);
end
if max_points < max_fits
max_points=max_fits;
end
if max_points > length(y) || logical(max_points > max_fits*pp)
max_points=min([length(y), max_fits*pp]);
end
% check the values for flag2
if nargin < 5
flag2=1;
end
if ~isequal(flag2, 1)
flag2=2;
end
n=max_points;
% Trim the arrays of the input data
[pts]=rand_int(1, length(y), [max_points 1], 0, 1);
X=X(pts, :);
y=y(pts, :);
% p is the number of parameters to be estimated
if isequal(flag2, 1)
p=size(X,2)+1;
else
p=size(X,2);
end
if isequal(flag, 1)
C=[];
else
if nchoosek(n,p) < max_fits
% Regime 1
% All the possible combinations of p with m-dimensional points
C=nchoosek(1:n,p);
elseif floor(n/p) < max_fits
% Regime 2
% some random combinations
[C]=rand_int(1, n, [floor(n/p) p], 0, 1);
else
% Regime 3
% sparse random combinations
if n < max_fits
np=n;
else
np=max_fits;
end
[C]=rand_int(1, n, [np p], 0, 1);
end
end
|
|