%% Load Forecasting using Bagged Regression Trees
% This example demonstrates an alternate model for building relationships
% between historical weather and load data to build and test a short term
% load forecasting. The model used is a set of aggregated Regression Trees.
%% Import Data
% The data set used is a table of historical hourly loads and temperature
% observations from the New England ISO for the years 2004 to 2008. The
% weather information includes the dry bulb temperature and the dew point.
% This data set is imported from an Access database using the
% auto-generated function _fetchDBLoadData_. This function was generated by
% the Visual Query Builder in Database Toolbox and then modified.
% data = fetchDBLoadData('2004-01-01', '2008-12-31');
load ausdata
%%
% Import list of holidays
[num, text] = xlsread('..\Data\Holidays2.xls');
holidays = text(2:end,1);
%% Generate Predictor Matrix
% The function *genPredictors* generates the predictor variables used as
% inputs for the model. For short-term forecasting these include
%
% * Dry bulb temperature
% * Dew point
% * Hour of day
% * Day of the week
% * A flag indicating if it is a holiday/weekend
% * Previous day's average load
% * Load from the same hour the previous day
% * Load from the same hour and same day from the previous week
%
% If the goal is medium-term or long-term load forecasting, only the inputs
% hour of day, day of week, time of year and holidays can be used
% deterministically. The weather/load information would need to be
% specified as an average or a distribution
% Select forecast horizon
term = 'short';
[X, dates, labels] = genPredictors2(data, term, holidays);
%% Split the dataset to create a Training and Test set
% The dataset is divided into two sets, a _training_ set which includes
% data from 2004 to 2007 and a _test_ set with data from 2008. The training
% set is used for building the model (estimating its parameters). The test
% set is used only for forecasting to test the performance of the model on
% out-of-sample data.
% Create training set
trainInd = data.NumDate < datenum('2010-01-01');
trainX = X(trainInd,:);
trainY = data.ElecPrice(trainInd);
% Create test set and save for later
testInd = data.NumDate >= datenum('2010-01-01');
testX = X(testInd,:);
testY = data.ElecPrice(testInd);
testDates = dates(testInd);
save Data\testSet_aus testDates testX testY
clear X data trainInd testInd term holidays dates ans
%% Build the Bootstrap Aggregated Regression Trees
% The function TreeBagger is used to build the model, ie. a set of regression
% trees each with a different set of rules for performing the non-linear regression.
% We initially start by building an aggregate of 20 such trees, with a
% minimum leaf size of 40. The larger the leaf size the smaller the tree.
% This provides a control for overfitting and performance.
model = TreeBagger(20, trainX, trainY, 'method', 'regression', 'minleaf', 40)
simpleTree = prune(model.Trees{1}, 1020);
view(simpleTree, 'names', labels);
%% What is an Appropriate Choice of Model Architecture?
% An important task when building such artificial intelligence models is
% the choice of model architecture, for example, the number of layers in a
% Neural network or the choice of a kernal function in a Support Vector
% Machine. In the case of bagged trees, some of these model architecture
% parameters include,
% * Number of trees
% * Minimum leaf size
% * The choice of predictors/inputs
%
% The choice of model architecture usually requires some fundamental
% knowledge about the problem being solved, for example the predictive
% power of a input. However, the flexibility of MATLAB and built-in
% functions in the Treebagger classl enable us to use a data driven
% approach to help determine these model structure parameters.
%
% In the next two sections we demonstrate a data driven approach that uses
% built-in methods of the TreeBagger object to help determine appropriate
% choice of model architecture.
%% Determine Number of Trees and Appropriate Leaf Size
% The number of trees in the ensemble and their minimum leaf size governs
% the accuracy of the model and can be selected to offer an appropriate tradeoff
% between precision, overfitting and speed. Here we compare the out-of-sample prediction
% error for a leaf size of 10, 20, 40 and 50. We can test this by
% turning on the parameter _oobpred_ which returns out-of-bag prediction
% errors during the training procedure.
oobError = [];
leafSizes = [10 20 40 50];
for i = 1:length(leafSizes)
model = TreeBagger(20, trainX, trainY, 'method', 'regression', ...
'oobpred', 'on', 'minleaf', leafSizes(i));
oobError = [oobError model.oobError];
figure(1), plot(oobError);
xlabel('Number of grown trees'), ylabel('Out-of-bag Regression Error');
title(sprintf('Regression Error versus Number of Trees & Leaf Size'));
legend(num2str(leafSizes(1:i)')), drawnow;
end
%% Determine Feature Importance
% Of each of the predictors, which ones provide the most predictive power?
% Turning on the _oobVarImp_ parameter shows you out-of-bag estimates of this
% relative feature (input) importance.
model = TreeBagger(20, trainX, trainY, 'method', 'regression', ...
'oobvarimp', 'on', 'minleaf', 30);
figure(2);
barh(model.OOBPermutedVarDeltaError);
ylabel('Feature');
xlabel('Out-of-bag feature importance');
title('Feature importance results');
set(gca, 'YTickLabel', labels)
%% Build the Final Model
% Given our analysis of parameters, we may wish to now build the final
% model with 20 trees, a leaf size of 20 and all of the features
model = TreeBagger(20, trainX, trainY, 'method', 'regression', 'minleaf', 20);
%% Save Trained Model
% We can compact the model (to remove any stored training data) and save
% for later reuse
model = compact(model);
save Models\TreeModel_aus model
%% Test Results
% Load in the model and test data and run the treeBagger forecaster and
% compare to actual load.
clear
load Models\TreeModel_aus
load Data\testSet_aus
%% Compute Prediction
% Predict the load for 2008 using the model trained on load data from 2007
% and before.
forecastLoad = predict(model, testX);
%% Compare Forecasted Load and Actual Load
% Create a plot to compare the actual load and the predicted load as well
% as the forecast error.
ax1 = subplot(2,1,1);
plot(testDates, [testY forecastLoad]);
ylabel('Load'); legend({'Actual', 'Forecast'}); legend('boxoff')
ax2 = subplot(2,1,2);
plot(testDates, testY-forecastLoad);
xlabel('Date'); ylabel('Error (MWh)');
linkaxes([ax1 ax2], 'x');
dynamicDateTicks([ax1 ax2], 'linked')
%% Compute Model Forecast Metrics
% In addition to the visualization we can quantify the performance of the
% forecaster using metrics such as mean average error (MAE), mean average
% percent error (MAPE) and daily peak forecast error.
err = testY-forecastLoad;
errpct = abs(err)./testY*100;
% fL = reshape(forecastLoad, 24, length(forecastLoad)/24)';
% tY = reshape(testY, 24, length(testY)/24)';
% peakerrpct = abs(max(tY,[],2) - max(fL,[],2))./max(tY,[],2) * 100;
fL = reshape(forecastLoad(1:end-1), 48, (length(forecastLoad)-1)/48)';
tY = reshape(testY(1:end-1), 48, (length(testY)-1)/48)';
peakerrpct = abs(max(tY,[],2) - max(fL,[],2))./max(tY,[],2) * 100;
fprintf('Mean Average Percent Error (MAPE): %0.2f%% \nMean Average Error (MAE): %0.2f MWh\nDaily Peak MAPE: %0.2f%%\n',...
mean(errpct(~isinf(errpct))), mean(abs(err)), mean(peakerrpct))