function s = modelMPG(data, fil1, fil2)
%modelMPG Create a regression model for MPG
% s = modelMPG(data, filter1, filter2)
% filter1 : 'Highway' or 'City'
% filter2 : 'Car' or 'Truck'
% Copyright 2007 The MathWorks, Inc.
if ~ismember(fil1, {'Highway', 'City'}) || ~ismember(fil2, {'Car', 'Truck'})
error('%s\n%s', 'The second argument should be either ''H'' or ''C''', ...
'and the third should be either ''C'' or ''T''');
else
switch fil1
case 'Highway'
f1 = 'H';
case 'City'
f1 = 'C';
end
switch fil2
case 'Car'
f2 = 'C';
case 'Truck'
f2 = 'T';
end
end
dat = data(data.C_H == f1 & data.car_truck == f2, :);
% List of potential predictor variables
predictors = {dat.cid, dat.rhp, dat.etw, dat.cmp, dat.axle, dat.n_v};
varNames = {'cid', 'rhp', 'etw', 'cmp', 'axle', 'n_v'};
p = anovan(dat.mpg, predictors, 'continuous', 1:length(predictors), 'varnames', varNames, 'display', 'off');
%%
% Looking at the linear contributions of the variables, we can see which
% variables have significant effects on the variability of MPG. If we
% specify a significance at p < 0.05, we can say that axle ratio is not
% significant.
%
% I am going to remove |axle| from the predictor variables and rerun ANOVA.
% This time, however, I will look into interaction terms. I've decided to
% examine up to 3-way interactions.
predictors(p > 0.05) = '';
varNames(p > 0.05) = '';
[p, t, stats, terms] = anovan(dat.mpg, predictors, 'continuous', 1:length(predictors), 'varnames', varNames, 'model', 3, 'display', 'off');
%%
% It seems like many of the interaction terms are significant, while a few
% are not. For example, there's little evidence that the effects of
% displacement, compression ratio, and engine-vehicle speed ratio on MPG
% depend on each other.
%
% Finally, we will perform a regression, after removing the non-significant
% terms, to create a model.
terms(p > 0.05, :) = [];
s = regstats(dat.mpg, cell2mat(predictors), [zeros(1, length(predictors)); terms], {'beta', 'yhat', 'r', 'rsquare'});
r_square = s.rsquare
%%
% The R2 for the regression is 0.85. So our model captures around 85% of
% the variation seen in MPG. I would also like to visually confirm the
% goodness of the regression by plotting the model MPG against the actual
% MPG.
myPlot(dat.mpg, s.yhat, s.rsquare, sprintf('%s-%s', fil1, fil2));