function [rsv,rsvss,dates] = realized_semivar(data,type,scheme,interval,nsamples)
% REALIZED_SEMIVAR Estimate Realized Semi Variance (RSV) and subsampled RSV
%
% ... = REALIZED_SEMIVAR(DATA,TYPE,SCHEME,INTERVAL, [NSAMPLES])
%
% DATA is an m by 2 single/double matrix:
% - column 1 increasing serial dates
% - column 2 prices
%
% TYPE is a string indicating how data is sampled.
% - 'CalendarTime' sampling on calendar time forms a
% regular intraday grid every INTERVAL
% seconds. The timestamps of the
% observations do not necessary fall on
% the gridpoints and the datapoints
% are selected according to the SCHEME.
% - 'BusinessTime' sampling on business time forms a
% regular intraday grid every INTERVAL
% ticks/trades. Grid-points fall exacly
% on observations rather than on
% timestamps.
% - 'FixedTime' sample at specific points in time.
% When using 'FixedTime', INTERVAL must
% be a vector of values between [0 1].
%
% SCHEME is a string indicating how the datapoints corresponding to
% the gridpoints are selected.
%
% 'CalendarTime' and 'FixedTime' schemes:
% - 'First' first observation after the previous
% gridpoint.
% - 'Last' last observation after the previous
% gridpoint.
% - 'Min' observation with minimum price in the
% interval (previous actual] gridpoint.
% - 'Max' observation with maximum price in the
% interval (previous actual] gridpoint.
% - 'Previous' last available observation up to the
% actual gridpoint (aka last price
% interpolation).
% - 'Next' first available observation after the
% actual gridpoint (aka first price
% interpolation).
% - 'Linear' linear interpolation between datapoints
% selected with 'Previous' and 'Next'.
% - 'Nearest' closest datapoint to the gridpoint.
% - 'Uniform' daily first and last observations are
% included by default and the remaining
% INTERVAL-2 points are selected as
% 'Nearest' from a uniformly spread grid.
% Only for 'CalendarTime'.
%
% 'BusinessTime' schemes:
% - 'Standard' the gridpoints fall exactly on the
% datapoints.
% - 'Uniform' daily first and last observations are
% included by default and the remaining
% INTERVAL-2 points intervealed with equal
% equal amount of trades.
%
% INTERVAL is a scalar integer value between [1 86400] indicating the
% frequency of sampling in seconds. If TYPE 'FixedTime'
% is used, INTERVAL should be a single/double vector sorted
% in ascending order. The vector can consist of values
% between [0 1] representing the intraday points in time of
% a fixed grid that will be replicated for all days.
% Otherways it should contain serial dates in which case the
% grid is limited to the values supplied.
% Note: 1 second = 1/86400. 18:39 = (3600* 18+39 *60)/86400.
%
% NSAMPLES [optional, default = empty] is a positive integer value
% indicating how many subsampled RSVs (RSVSS) to average with
% the original RSV. If empty, RSVSS will be empty if assigned.
% The RSVSS differs from the RSV by the starting point of
% the grid. The starting points are uniformly spaced between
% the first and second gridpoint of the RSV.
% Example: NSAMPLES = 4; INTERVAL = 300;
% If the day starts at 8:00 then the RSVs
% gridpoint is [8:00 8:05 ...]. Then the first
% gridpoint for 1st, 2nd, 3rd and 4th RSVSS will
% be respectively at 8:01, 8:02, 8:03 and 8:04.
% Note: using NSAMPLES with one output returns the
% subsampled version of realized measure in place of
% the original one.
%
%
% [RSV,RSVSS,DATES] = REALIZED_SEMIVAR...
%
% RSV m by 2 double vector of daily realized semi variances.
% - column 1 RSV-
% - column 2 RSV+
% RSVSS m by 2 double vector of daily subsampled realized semi
% variances. Same column division as RSV.
% [Empty] if NSAMPLES was not supplied.
% DATES vector of daily serial dates.
%
% Examples:
%
% See also
% Based on K. Sheppard's REALIZED_SEMIVARIANCE - MFE toolbox v.3 (lastupdate 15 Mar 2011)
% Author: Oleg Komarov (oleg.komarov@hotmail.it)
% Tested on R14SP3 (7.1) and on R2011a. In-between compatibility is assumed.
% 31 aug 2011 - Created
% Ninput
error(nargchk(4,5,nargin))
% Nsamples
if nargin == 5
if isempty(nsamples) || ~isnumeric(nsamples) && ~isscalar(nsamples) &&...
mod(nsamples,1) > 0 && nsamples <= 0
error('realized_semivar:nsamples','NSAMPLES should be a positive integer value.')
end
else
nsamples = [];
rsvss = [];
end
% Engine starts here
% -------------------------------------------------------------------------
% Filter prices: checks on the first 4 inputs are issued by fltprice
[fltdata,opt] = fltprice(data,type,scheme,interval);
% Last observation per day
datedf = diff(fix(fltdata(:,1)));
last = find(datedf);
% Which obs belongs to which day
subs = cumsum([1; logical(datedf(2:end))]);
% Returns
fltdata = [fltdata(2:end,1), diff(log(fltdata(:,2)))];
% Eliminate first obs next day with last obs prev day
subs(last) = [];
fltdata(last,:) = [];
% RSV
idxp = fltdata(:,2) > 0;
idxn = fltdata(:,2) < 0;
rsv = [accumarray(subs(idxn),fltdata(idxn,2).^2),...
accumarray(subs(idxp),fltdata(idxp,2).^2)];
% RSVSS
if ~isempty(nsamples)
rsvss = zeros(size(rsv,1),2,nsamples);
switch opt.type
case 'BusinessTime'
% Don't include an increasing part of the beginning of the day
if interval <= nsamples + 1
warning('realized_var:businessSubSampling','Ensure INTERVAL > NSAMPLES +1: one (or more) subsampled RV coincides with the original RV.')
end
% Offset of the initial gridpoint for the subsamples
df = fix(interval/(nsamples+1));
% Last observation of each day
last = [0; find(diff(fix(data(:,1)))); size(data,1)];
for c = 1:nsamples
rsvss(:,:,c) = realized_semivar(data(mcolon(last(1:end-1)+df*c+1,last(2:end),1),:),...
type,scheme,interval);
end
case 'CalendarTime'
% Don't include an increasing part of the beginning of the day
df = interval/(nsamples+1)/86400;
last = [0; find(diff(fix(data(:,1)))); size(data,1)];
ii = [data(last(1:end-1)+1,1)...
data(last(2:end),1) + 1/(86400*1000)-eps];
for c = 1:nsamples
fxdinter = mcolon(ii(:,1)+df*c, ii(:,2), interval/86400);
rsvss(:,:,c) = realized_semivar(data,'FixedTime',scheme,fxdinter);
end
case 'FixedTime'
% Shift the grid
df = diff(interval([1 2]))/(nsamples+1);
for c = 1:nsamples
rsvss(:,:,c) = realized_semivar(data,type,scheme,interval + df*c);
end
end
% Average RSV with RSVSS
rsvss = mean(cat(3,rsv,rsvss),3);
if nargout == 1
rsv = rsvss;
end
end
% Dates
if nargout == 3
dates = fix(fltdata(cumsum(accumarray(subs,1)),1));
end
end