Code covered by the BSD License  

Highlights from
Forecasting the FTSE 100 with high-frequency data: A comparison of realized measures

Forecasting the FTSE 100 with high-frequency data: A comparison of realized measures

by

 

16 Sep 2011 (Updated )

My dissertation for the MSc in Finance & Economics from Warwick Business School

fltprice(data,type,scheme,interval)
function [out,spec] = fltprice(data,type,scheme,interval)

% FLTPRICE Filter a single intraday time series with a sampling scheme
%
%   ... = FLTPRICE(DATA,TYPE,SCHEME,INTERVAL)
%
%       DATA    is an m by 2 single/double matrix:
%                   - column 1      increasing serial dates
%                   - column 2      prices
%
%       TYPE    is a string indicating how data is sampled. Unambiguous
%               case-insensitive abbreviations are accepted.
%                   - 'CalendarTime'    sampling on calendar time forms a
%                                     regular intraday grid every INTERVAL
%                                     seconds. The timestamps of the
%                                     observations do not necessarily fall
%                                     on the gridpoints and the datapoints
%                                     are selected according to the SCHEME.
%                   - 'BusinessTime'    sampling on business time forms a
%                                     regular intraday grid every INTERVAL
%                                     ticks/trades. Grid-points fall exactly
%                                     on observations rather than on
%                                     timestamps.
%                   - 'FixedTime'       sample at specific points in time.
%                                     When using 'FixedTime', INTERVAL must
%                                     be a vector of values between [0 1]
%                                     or a vector of serial dates.
%
%       SCHEME  is a string indicating how the datapoints corresponding to
%               the gridpoints are selected. Unambiguous case-insensitive
%               abbreviations are accepted.
%
%               'CalendarTime' and 'FixedTime' schemes:
%                   - 'First'       first observation after the previous
%                                 gridpoint.
%                   - 'Last'        last observation after the previous
%                                 gridpoint.
%                   - 'Min'         observation with minimum price in the
%                                 interval (previous actual] gridpoint.
%                   - 'Max'         observation with maximum price in the
%                                 interval (previous actual] gridpoint.
%                   - 'Previous'    last available observation up to the
%                                 actual gridpoint (aka last price
%                                 interpolation).
%                   - 'Next'        first available observation after the
%                                 actual gridpoint (aka first price
%                                 interpolation).
%                   - 'Linear'      linear interpolation between datapoints
%                                 selected with 'Previous' and 'Next'.
%                   - 'Nearest'     closest datapoint to the gridpoint.
%                   - 'Uniform'     daily first and last observations are
%                                 included by default and the remaining
%                                 INTERVAL-2 points are selected as
%                                 'Nearest' from a uniformly spread grid.
%                                 Only for 'CalendarTime'.
%
%               'BusinessTime' schemes:
%                   - 'Standard'    the gridpoints fall exactly on the
%                                 datapoints.
%                   - 'Uniform'     daily first and last observations are
%                                 included by default and the remaining
%                                 INTERVAL-2 points are interleaved with
%                                 an equal amount of trades.
%
%       INTERVAL indicates the frequency of sampling depending on the TYPE.
%                'CalendarTime': should be a scalar value between [1 86400].
%                                Sampling in seconds.
%                'BusinessTime': should be a positive integer value.
%                                Sampling in ticks.
%                'FixedTime'   : should be a single/double scalar or vector
%                                sorted in ascending order. If all the
%                                values fall in the [0 1] range it is an
%                                intraday  grid which will be replicated
%                                for all days. Otherwise, INTERVAL should
%                                contain serial dates.
%                Note: 1 second = 1/86400. 18:39 = (3600* 18+39 *60)/86400.
%
%   [OUT,SPEC] = ...
%
%       OUT     filtered DATA. For the 'Max' and 'Min' schemes only the
%               prices are returned. With 'FixedTime', OUT is [] (and a
%               warning is issued) when no data fall between INTERVALs.
%       SPEC    a structure with TYPE, SCHEME and INTERVAL chosen (TYPE and
%               SCHEME are the fully expanded names).
%
%   Errors are raised with identifiers 'fltprice:data',
%   'fltprice:data1stColumn', 'fltprice:type', 'fltprice:scheme',
%   'fltprice:interval' and 'fltprice:uniformInterval' on invalid inputs.
%
% Examples:
%
%  See also REALIZED_COMPUTE_MEDIAN, REALIZED_VAR, HISTC, MCOLON

% Based on Sheppard's REALIZED_COMPUTE_MEDIAN - MFE toolbox v.3 (lastupdate 15 Mar 2011)
% Author: Oleg Komarov (oleg.komarov@hotmail.it)
% Tested on R14SP3 (7.1) and on R2011a. In-between compatibility is assumed.
% 25 aug 2011 - Created

% Ninput
error(nargchk(4,4,nargin))

% Data
szData = size(data);
if isempty(data) || ~isfloat(data) || szData(2) ~= 2
    error('fltprice:data','DATA should be a single/double m by 2 matrix.')
end
if ~issorted(data(:,1))
    error('fltprice:data1stColumn','DATA''s 1st column (serial dates) should be sorted in ascending order.')
end

% Type
if ~ischar(type) || ~isvector(type)
    error('fltprice:type','TYPE should be a string.')
else
    % Try to match (case-insensitive prefix match against the known types)
    whichType = {'CalendarTime','BusinessTime','FixedTime'};
    idx  = strncmpi(type,whichType,numel(type));
    if any(idx)
        type = whichType{idx};
    else
        error('fltprice:type','TYPE ''%s'' unrecognized.',type)
    end
end

% Scheme
if ~ischar(scheme) || ~isvector(scheme)
    error('fltprice:scheme','SCHEME should be a string.')
else
    % Try to match
    [err,scheme] = getScheme(scheme,type);
    if ~isempty(err)
        % Pass the message through '%s' so that characters such as '%' or
        % '\' coming from the user-supplied SCHEME are not interpreted as
        % format specifiers.
        error('fltprice:scheme','%s',err)
    end
end

% Interval
switch type
    case 'FixedTime'
        % Any single violation must be rejected, hence the checks are
        % combined with OR (an AND chain would only fire when all of them
        % fail simultaneously, letting invalid grids through).
        if ~isnumeric(interval) || isempty(interval) || ~isvector(interval) || ...
            ~issorted(interval) || any(interval < 0)
            error('fltprice:interval','TYPE ''FixedTime'': INTERVAL should be positive and a scalar or a sorted vector.')
        end
    case 'CalendarTime'
        if ~isnumeric(interval) || isempty(interval) || ~isscalar(interval) || ...
            interval < 1 || interval > 86400
            error('fltprice:interval','TYPE ''CalendarTime'': INTERVAL should be a value between [1 86400].')
        end
    case 'BusinessTime'
        if ~isnumeric(interval) || isempty(interval) || ~isscalar(interval) || ...
            mod(interval,1) > 0 || interval < 1
            error('fltprice:interval','TYPE ''BusinessTime'': INTERVAL should be a positive integer value.')
        end
end

% -------------------------------------------------------------------------
% ENGINE: create grid according to the TYPE
% -------------------------------------------------------------------------

% Slightly less than millisecond tolerance
tol = 1/(86400*1000)-eps;

% Position of last observations for each day (with a leading 0 so that
% last(k)+1 is the first observation of day k)
last  = [0; find(diff(fix(data(:,1)))); szData(1)];

% Create grid
switch type
    case 'BusinessTime'
        idx = false(szData(1),1);

    case 'CalendarTime'
        % The 'Uniform' scheme builds its own grid in the selection step
        if ~strcmp(scheme,'Uniform')
            % Per-day grid from the first to the last observation of each
            % day, stepping INTERVAL seconds (expressed in days)
            tgrid = mcolon(data(last(1:end-1)+1,1),...
                           data(last(2:end)    ,1), interval/86400);
            % 'Max'/'Min' compute their own bin counts (with bin indices)
            % in the selection step
            if ~any(strcmp(scheme,{'Max','Min'}))
                n = histc(data(:,1), tgrid + tol);
            end
        end

    case 'FixedTime'
        % Vector of intraday times [0 1]: replicate grid for every day
        if all(interval <= 1)
            tgrid = bsxfun(@plus,interval(:),fix(data(last(2:end),1)).');
            tgrid = tgrid(:);
        % Grid with serial dates, use as is
        else
            tgrid = interval(:);
        end
        % Add beginning of the day or the very first gridpoint is skipped
        tgrid = [fix(tgrid(1))-tol; tgrid];

        % Trim consecutive days whose datapoints fall before the grid
        % Example: each day data are recorded from 8:00-12:00. If we sample
        %          from 13:00-16:00 then we'll trim everything away.
        [n,bin] = histc(data(:,1), tgrid + tol);
        if bin(1) == 0
            from = find(diff(bin == 0) == -1,1,'first');
            bin  = bin(from+1:end,:);
        else
            from = 0;
        end
        % Trim gridpoints that don't capture any observation
        idx   = [n(1:end-1)~=0; true];
        tgrid = tgrid(idx);
        n     = n(idx);
        % Shrink data to the first observation before the second gridpoint
        data        = data(from + n(1):end,:);
        last(2:end) = last(2:end)-n(1)+1;
        szData(1)   = last(end);
        % Adjust n and tgrid because the schemes always include the first point
        tgrid = tgrid(2:end);
        n     = n(2:end);
        if isempty(data)
            warning('fltprice:data','No data fall between INTERVALs.')
            out = [];
            % SPEC must be assigned on this early exit as well, otherwise a
            % two-output call fails with an unassigned output argument.
            if nargout == 2
                spec = struct('type',type,'scheme',scheme,'interval',interval);
            end
            return
        end
end

% -------------------------------------------------------------------------
% ENGINE: select observations from the grid according to the SCHEME
% -------------------------------------------------------------------------

switch scheme
    case 'First'
        % If no values are found within a grid interval we don't want to
        % interpolate the first value. This is accomplished with consequent
        % indexing of the same position instead of using position themselves.
        idx = false(szData(1),1);
        n   = cumsum([2; n(1:end-1)]);
        % Take care values that fall to the next day
        n(ismember(n,last+1) | n > szData(1)) = [];
        idx(n) = true;

    case 'Last'
        % Same concept for missing values as in First
        idx = false(szData(1),1);
        idx(cumsum([1; n(1:end-1)])) = true;

    case 'Linear'
        % Calculate Previous and Next and interpolate time and price
        prev = cumsum([1; n(1:end-1)]);
        next = prev + 1;
        % Remove first and last values
        prev(ismember(prev,last+1)) = [];
        [idx,loc] = ismember(prev,last);
        prev = prev(~idx);
        next(ismember(next,[last+1;last+2]) | next > szData(1)) = [];
        % Output
        % NOTE(review): this is the plain average of the Previous and Next
        % rows (both time and price), i.e. their midpoint.
        out = (data(prev,:) + data(next,:))/2;
        % Add first/last back and sort back (not efficient but will do for
        % now). Rows are ordered by date with SORTROWS so that each price
        % stays attached to its timestamp; SORT on a matrix would order the
        % date and the price columns independently of each other.
        out = sortrows([out;
                        data(last(1:end-1)+1,:);
                        data(last(setdiff(loc,0)),:)],1);

    case 'Max'
        if ~strcmp(type,'FixedTime')
            [n,bin] = histc(data(:,1), tgrid + tol);
        end
        % Only prices because there can be multiple max prices per grid
        % interval
        out = accumarray([1;bin(2:end-1)],data(1:sum(n)+1,2),[],@max);

    case 'Min'
        if ~strcmp(type,'FixedTime')
            [n,bin] = histc(data(:,1), tgrid + tol);
        end
        % Only prices because there can be multiple min prices per grid
        % interval
        out = accumarray([1;bin(2:end-1)],data(1:sum(n)+1,2),[],@min);
    case 'Nearest'
        % Previous and next
        n = cumsum([1; n(1:end-1)]);
        n = [n, n+1];
        % If next overshoots the day set to last observation
        if n(end,2) > szData(1)
           n(end,2) = szData(1);
        end
        % Find minimum distance from gridpoint (nearest): the row-wise DIFF
        % yields [gridpoint-previous, next-gridpoint] and POS is 1 or 2
        [~,pos] = min(diff([data(n(:,1),1) tgrid(:)...
                            data(n(:,2),1)],[],2),[],2);
        numn = size(n,1);
        % Select previous or next whichever closer (linear indexing into
        % the two-column matrix n)
        idx  = n((1:numn).' + (pos-1)*numn);

    case 'Next'
        % First price interpolation
        % Use positions directly to obtain carry-on interpolation
        idx = cumsum([2; n(1:end-1)]);
        idx(ismembc(idx,last+1) | idx > szData(1)) = [];

    case 'Previous'
        % Last price interpolation
        % Use positions directly to obtain carry-on interpolation
        idx = cumsum([1; n(1:end-1)]);
        idx = idx(~ismembc(idx,last));

    case 'Standard'
        % Every INTERVAL-th observation within each day
        idx = mcolon(last(1:end-1)+1, last(2:end), interval);

    case 'Uniform'
        if interval == 2
            % Only the first and the last observation of each day
            idx = [last(1:end-1)+1, last(2:end)].';
            idx = idx(:);
        elseif interval > 2
            % In BusinessTime
            if strcmp(type,'BusinessTime')
                idx = fix(mcolon(last(1:end-1)+1,last(2:end),diff(last)/(interval-1)));
            % In CalendarTime with Nearest interpolation
            else
                % Uniformly spaced grid in time
                tgrid  = mcolon(data(last(1:end-1)+1,1),data(last(2:end),1),...
                                diff([data(last(1:end-1)+1,1) data(last(2:end),1)],[],2)/(interval-1));
                n      = histc(data(:,1), tgrid + tol);
                % Previous and next
                pn = cumsum([1; n(1:end-1)]);
                pn = [pn, pn+1];
                % If next overshoots the day set to last observation
                if pn(end,2) > szData(1)
                   pn(end,2) = szData(1);
                end
                % Find minimum distance from gridpoint (nearest)
                [~,pos] = min(diff([data(pn(:,1),1) tgrid(:)...
                                    data(pn(:,2),1)],[],2),[],2);
                numn = numel(n);
                % Select previous or next whichever closer
                idx = pn((1:numn).' + (pos-1)*numn);
            end
        else
            error('fltprice:uniformInterval','When SCHEME is ''Uniform'', the INTERVAL should be > 1.')
        end
end

% Return actual prices with dates ('Linear' has already built OUT; for
% 'Max'/'Min' with 'CalendarTime'/'FixedTime' IDX comes from the grid step)
if ~strcmp(scheme,'Linear')
    out = data(idx,:);
end

% Optionally return specifications
if nargout == 2
    spec = struct('type',type,'scheme',scheme,'interval',interval);
end

end

% getScheme ---------------------------------------------------------------
function [err,scheme] = getScheme(scheme,type)
% GETSCHEME Expand an abbreviated SCHEME name and check it against TYPE
%
%   [ERR,SCHEME] = GETSCHEME(SCHEME,TYPE)
%
%       SCHEME  (in)  case-insensitive prefix of one of the known schemes.
%       TYPE          sampling type, compared against 'BusinessTime' and
%                     'FixedTime'.
%
%       ERR     empty string on success, otherwise the error message.
%       SCHEME  (out) fully expanded name when the prefix identifies a
%                     scheme, the input otherwise.

% Known scheme names
known = {'First','Last','Linear','Max','Min','Nearest','Next','Previous','Standard','Uniform'};

% No error until proven otherwise
err = '';

% Positions of the names starting with the supplied text
hits = find(strncmpi(scheme,known,numel(scheme)));

switch numel(hits)
    case 0
        % Nothing starts with the supplied text
        err = sprintf('SCHEME ''%s'' unrecognized.',scheme);
    case 2
        % Two candidates share the supplied prefix
        err = sprintf('SCHEME ''%s'' is ambiguous. Did you mean ''%s'' or ''%s''?',scheme,known{hits});
    otherwise
        % Regular match: expand to the full name
        scheme = known{hits};
end

% Scheme/type compatibility
onBusiness = strcmp(type,'BusinessTime');
switch scheme
    case 'Standard'
        % Only available in business time
        if ~onBusiness
            err = 'SCHEME ''Standard'' is limited to TYPE ''BusinessTime''.';
        end
    case 'Uniform'
        % Available everywhere but in fixed time
        if strcmp(type, 'FixedTime')
            err = sprintf('SCHEME ''%s'' not allowed with TYPE ''FixedTime''.',scheme);
        end
    case {'First','Last','Linear','Max','Min','Nearest','Next','Previous'}
        % Time-grid schemes are not available in business time
        if onBusiness
            err = sprintf('SCHEME ''%s'' not allowed with TYPE ''BusinessTime''.',scheme);
        end
end
end

Contact us