function [corrsperc, corrstd] = corrperc(Y, perc, n_iters, matrix3D)
% CORRPERC performs a bootstrap (of size equal to n_iters) on
% correlation matrices of input variable Y and computes the percentiles
% corrsperc (according to input perc) of each correlation. The function
% also provides the standard deviation corrstd for each correlation.
%
% [corrsperc, corrstd] = corrperc(Y, perc, n_iters) returns a matrix of
% size (N * (N - 1) / 2)-by-length(perc). See below for further details.
%
% [corrsperc, corrstd] = corrperc(Y, perc, n_iters, 1) returns a matrix
% of size N-by-N-by-length(perc)
%
%*************************************************************************%
%************************WHY I NEEDED THIS FUNCTION***********************%
%*************************************************************************%
%
% When the number of columns in variable Y is big and the number of
% iterations n_iters for the bootstrap is high, then in general
% percentiles can't be computed at once. In fact, on most machines, the
% RAM memory won't be enough.
%
% For example, let's assume we have a 1000-by-500 matrix Y and we desire
% to perform a bootstrap based on 5000 iterations. Then we have:
% 500 * (500 - 1) / 2 = 124750 correlations
%
% So, if percentiles had to be computed at once we would need a matrix
% of size 124750-by-5000 and from this matrix we would be able to
% extract the desired percentiles. But my machine can't do that! The RAM
% is not enough. The amount of required memory is far too much.
%
% Then my idea was to compute percentiles for 10000 correlations at a
% time or so (if you desire to change this parameter you can do so from
% within the code, by changing the parameter named corrs_per_step). Then
% I need matrices of size 10000-by-5000 and repeat the computation 13
% times ( ---> ceil(124750 / 10000)).
% It's slow and not elegant at all, but it works.
%
%*************************************************************************%
%
% INPUTS
% Input Y is a matrix m-by-n where
% m is the number of observations and
% n is the number of variables
%
% Input perc is a vector of real numbers in the [0, 100] interval:
% 0 corresponds to the minimum;
% 100 corresponds to the maximum;
% 50 corresponds to the median;
% 25 and 75 are the first and third quartiles;
% 10 is the tenth percentile and so on;
% [1, 99] corresponds to a 98% Centered Confidence Interval.
%
% Input n_iters is the number of correlation matrices which will be
% generated in order to compute the percentiles desired. The higher
% the number of iterations the higher the precision for the
% estimation of the percentiles. A good - and possibly slow - choice
% is -->
% n_iters = 1000;
%
% Input matrix3D is a logical variable: if it is 1, then output
% corrsperc is stored in a 3D matrix and output corrstd is stored in
% a 2D matrix; otherwise corrsperc is stored in a 2D matrix and
% output corrstd is stored in a vector.
%
% OUTPUTS
% Output corrsperc is an N-by-N-by-length(perc) matrix, if matrix3D
% is 1. Otherwise, it is a (n * (n - 1) / 2)-by-length(perc) matrix.
% In the latter case, correlations are selected from the rows of the
% upper triangle of the n-by-n correlation matrix. For example, if
% the correlation matrix is a 9-by-9 matrix:
% a12, a13, a14, a15, a16, a17, a18, a19
% a23, a24, a25, a26, a27, a28, a29
% a34, a35, a36, a37, a38, a39
% a45, a46, a47, a48, a49
% a56, a57, a58, a59
% a67, a68, a69
% a78, a79
% a89
% then elements will be chosen in the following order:
% a12, a13, a14, a15, a16, a17, a18, a19, a23, a24, a25, a26,
% a27, a28, a29, a34, a35, a36, a37, a38, a39, a45, a46, a47,
% a48, a49, a56, a57, a58, a59, a67, a68, a69, a78, a79, a89
% and will be disposed over the columns of corrsperc. Each column
% represents correlation percentiles according to perc.
%
% Output corrstd is an estimate of the standard deviation regarding
% each correlation. The estimate is the more accurate the higher the
% value of n_iters and the lower the value of corrs_per_step. If
% matrix3D is 1, corrstd is a N-by-N symmetric matrix; otherwise
% corrstd is a vector of length (N * (N - 1) / 2).
%
%*************************************************************************%
%
% % Example
%
% T = 1000;
% N = 100;
% Y = cumsum(randn(T, N));
% perc = [0:100];
% n_iters = 250;
% [corrsperc, corrstd] = corrperc(Y, perc, n_iters);
% % Look at this: 96% Centered Confidence Intervals are approximately
% % four times the standard deviations. Cool!
% plot((corrsperc(:, 99) - corrsperc(:, 3)) / 2, 2 * corrstd, '.')
%
%*************************************************************************%
% %
% Author: Francesco Pozzi %
% E-Mail: francesco.pozzi@anu.edu.au %
% Date: 15 January 2008 %
% %
%*************************************************************************%
%
%*************************************************************************%
%*******************************Check inputs******************************%
%*************************************************************************%
ctrl = isnumeric(Y) & isreal(Y) & ~any(isnan(Y)) & ~any(isinf(Y));
if ~ctrl
error('Check Y: it needs be a matrix of real numbers with no infinite or nan values!')
end
% Size of matrix Y
[T, N] = size(Y);
if length([T, N]) ~= 2;
error('Y needs be a 2D matrix');
end
if T < 3
error('C''mon, you can''t perform a correlation bootstrap based on less than 3 observations!');
end
if N < 2
error('You need at least two variables ...');
end
ctrl = isvector(perc) & isreal(perc) & ~any(isnan(perc)) & ~any(isinf(perc)) & all(perc >= 0) & all(perc <= 100);
if ~ctrl
error('Percentiles need be a vector of numbers between 0 and 100, with no infinite or nan values!')
end
ctrl = isscalar(n_iters) & (n_iters > 0);
if ~ctrl
error('Number of iterations need be a positive scalar!')
end
n_iters = ceil(n_iters);
%*************************************************************************%
%*************************************************************************%
% Indexes of upper triangular N-by-N matrix, selected by rows
k = 1;
for i = 1:(N - 1)
i1(k:(k + N - i - 1)) = repmat(i, 1, (N - i));
i2(k:(k + N - i - 1)) = ((i + 1):N);
k = k + N - i;
end
indexes = sub2ind([N, N], i1, i2);
% corrs_mean is the average of correlations for a bootstrap of infinite
% size ... we assume that, for large dataset, this is equal to the
% correlation of Y
corrs_mean = corrcoef(Y);
corrs_mean = corrs_mean(indexes);
% K is the total number of correlations: N * (N - 1) / 2
% This is the number of columns necessary for output corrsperc
K = length(indexes);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%IMPORTANT%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% corrs_per_step is the number of correlations to be considered at each
% step: if n_iters is very high, consider lowering the parameter
% corrs_per_step. On most machines, you can't generate big matrices (for
% example: of size 8000-by-8000): be sure that a matrix of size
% (corrs_per_step)-by-(n_iters) doesn't exceed the limit of your RAM.
corrs_per_step = 10000;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% loops is the ratio between the total number of correlations K and the
% corrs_per_step chosen above. The number of loops should be small.
% So, if n_iters is sufficiently small, consider increasing corrs_per_step.
loops = floor(K / corrs_per_step);
% If the total number of correlations, K, is small enough, make the
% computations at once, by means of a single for loop. Then return the
% results
if loops == 0
corrs_temp = zeros(n_iters, K);
for i = 1:n_iters
% Extract rows randomly, with repetition, then compute correlations
temp = corrcoef(Y(ceil(T * rand(T, 1)), :));
corrs_temp(i, :) = temp(indexes);
end
% Compute percentiles
corrsperc = prctile(corrs_temp, perc)';
% Compute standard deviations by using corrs_mean
corrstd = sqrt(sum((corrs_temp - repmat(corrs_mean, n_iters, 1)) .^ 2) / n_iters);
% If matrix3D is 1, then write corrsperc as a 3D matrix and corrstd as a
% 2D matrix
if nargin == 4
if matrix3D == 1
temp_corrsperc = zeros(N, N, length(perc));
temp = zeros(N, N);
for i = 1:length(perc)
temp(indexes) = corrsperc(:, i);
temp = temp + temp';
temp(1:(N + 1):(N ^ 2)) = 1;
temp_corrsperc(:, :, i) = temp;
temp(:) = 0;
end
corrsperc = temp_corrsperc;
temp(indexes) = corrstd;
temp = temp + temp';
temp(1:(N + 1):(N ^ 2)) = 0;
corrstd = temp;
end
end
return
end
% But if K is big, compute corrs_per_step percentiles at a time ...
corrstd = zeros(K, 1);
for j = 1:loops
corrs_temp = zeros(n_iters, corrs_per_step);
for i = 1:n_iters
% Extract rows randomly, with repetition, then compute correlations
temp = corrcoef(Y(ceil(T * rand(T, 1)), :));
% Compute standard deviations iteratively: first sum squares
corrstd = corrstd + (temp(indexes) - corrs_mean)' .^ 2;
corrs_temp(i, :) = temp(indexes((corrs_per_step * (j - 1) + 1):(corrs_per_step * j)));
end
% Compute percentiles
corrsperc(:, (corrs_per_step * (j - 1) + 1):(corrs_per_step * j)) = prctile(corrs_temp, perc);
end
% ... then consider the remaining cells
if K > (loops * corrs_per_step)
clear corrs_temp
j = j + 1;
for i = 1:n_iters
temp = corrcoef(Y(ceil(T * rand(T, 1)), :));
corrstd = corrstd + (temp(indexes) - corrs_mean)' .^ 2;
corrs_temp(i, :) = temp(indexes((corrs_per_step * (j - 1) + 1):K));
end
corrsperc(:, (corrs_per_step * (j - 1) + 1):K) = prctile(corrs_temp, perc);
end
% Compute Standard Deviation
corrstd = sqrt(corrstd / (j * n_iters));
corrsperc = corrsperc';
if nargin == 4
if matrix3D == 1
temp_corrsperc = zeros(N, N, length(perc));
temp = zeros(N, N);
for i = 1:length(perc)
temp(indexes) = corrsperc(:, i);
temp = temp + temp';
temp(1:(N + 1):(N ^ 2)) = 1;
temp_corrsperc(:, :, i) = temp;
temp(:) = 0;
end
corrsperc = temp_corrsperc;
temp(indexes) = corrstd;
corrstd = temp;
end
end