Code covered by the BSD License  

Highlights from
Contingency periodogram

image thumbnail

Contingency periodogram

by

 

Identifies cyclic patterns in non-metric time-series

[sig_periods]=contperiod(t,states,part)
function [sig_periods]=contperiod(t,states,part)
%This function calculates the contingency periodogram from two input
%vectors of class integer. Based on the work of:
%Legendre, L., Frechette, M., and Legendre, P. 1981. The Contingency
%periodogram: A method of identifying rhythms in series of nonmetric
%ecological data. J. Ecol. 69(3): 965-979.
%
%[sig_periods]=contperiod(t,states,part)
%
%If t is a constant, t is intepreted as being the length of the vector with
%timestep = 1. A vector of length(t) will be produced from 1:1:t.
%Otherwise, t must be a vector representing the time domain of the
%time-series. If size(t) = [M,1], the vector will be transposed
%automatically to size(t) = [1,M]. As a vector, t must be regularly spaced.
%
%states is a vector with the data. The data must be in an integer form
%(i.e. data must not contain decimal values). For strings or other
%categorical data, convert states into integers by assigning a unique ID
%(integer) to each category of interest and use the converted state ID as
%the states vector. If categories should not be directly assigned, assign
%unique IDs based on the rank. If size(states) = [M,1], then states will be
%transposed.
%
%If parts is not assigned, states is considered to already be distributed
%into actual states and the program will compute the periodogram using the
%data in states directly. If parts is assigned a value (even if that value
%is an empty set), states is considered to consist of rank data.
%If parts is assigned an empty set ([]), contperiod uses a maximum entropy
%approach to automatically define where the partitions between states
%should be (up to 5 partitions). The number(s) in part identify the upper
%bound of a partition. The partion is based on the rank of the data.
%If parts is a scalar or vector, the time-series will be partioned at the
%value(s) in parts.
%
%sig_periods is a scalar or vector of periods with possible statistical
%significance, based on the contingency periodogram. Although the
%periodogram is calculated without it, the statistical significance
%comparison requires the function chi2inv from the statistics toolbox.
%
%
%Example input for different types of data:
%data = {'Type_1' 'Type_2' 'Type_2' 'Type_3' 'Type_4' 'Type_5' 'Type_5'}.
%This needs to be converted to:
%data = [1 2 2 3 4 5 5];
%
%[sig_periods]=contperiod(7,[1 2 2 3 4 5 5]) results in a time-series of
%x = 1:7; y = [1 2 2 3 4 5 5] and directly calculates the peridiogram.
%
%[sig_periods]=contperiod(3:9,[1 2 2 3 4 5 5]) results in a time-series of
%x = [3 4 5 6 7 8 9]; y = [1 2 2 3 4 5 5] and directly calculates the
%peridiogram.
%
%[sig_periods]=contperiod(3:9,[1 2 2 3 4 5 5],[]) results in a time-series
%of x = [3 4 5 6 7 8 9]; y = [1 2 2 3 4 5 5] and will partition the states
%based on a maximum entropy approach before calculating the periodiogram.
%
%[sig_periods]=contperiod(3:9,[1 2 2 3 4 5 5],[2 3]) results in a
%time-series of x = [3 4 5 6 7 8 9]; y = [1 2 2 3 4 5 5] and data will be
%organized into three groups (group 1 = 0,1,2,; group 2 = 3; group 3 = 4,5)
%before calculating the peridiogram.

%%
%******************************************
%test data from Legendre et al. (1981)
%t=1:16;
%states=[1 1 2 3 3 2 1 2 3 2 1 1 2 3 3 1];
%ranks=[2 2 4 7 10 5 2 5 8 4 1 2 5 9 6 3];
%******************************************

%%
%Error checking and data formatting

%if t is constant, convert to time domain vector with t timestep
if length(t)==1
    t=(1:t);
end

%error check for vectors (not matrices)
tsize=size(t);
statesize=size(states);

if numel(t)~=length(t)||numel(states)~=length(states)
    error('Data entered as matrix; please enter data as vector')
end

%error check for numeric array
if isnumeric(t)==0||isnumeric(states)==0
    error('time and states must be vectors of integer values.')
end

%error check if constant timestep
D=diff(t);
if sum(D(1)==D(2:end))~=length(D)-1
    error('Vector must be monotonically increasing.')
end

%error check number of partitions < number of states
if nargin==3
    if length(part)>=length(unique(states))
        error('Number of partitions must be less than the number of unique states or ranks')
    end
end

%error check length of time-series = length of data
if length(t)~=length(states)
    error('time-series and data must have the same length')
end

%rotate time vector to size=[1,M]
if tsize(1)>1
    t=t';
end

%rotate value vector to size=[1,M]
if statesize(1)>1
    states=states';
end

%rotate partition vector to size=[1,M]
if nargin==3
    if size(part,1)>1
        part=part';
        if size(part,1)~=1
            error('part needs to be a scalar or a vector')
        end
    end
end

%treat ranks and decimal numbers as ranks
if sum(mod(states,floor(states)))~=0
    error('states should be category or rank data and not contain decimals') 
end
  
clear tsize statesize D I

%%
%Partition data into states

if nargin==3
    if isempty(part)==1
        %maximum entropy to define partitions
        Urank=unique(states); %Urank is sorted small to large
        len_rank=length(Urank);
        NumObs=zeros(1,len_rank);
        for n=1:len_rank
            NumObs(1,n)=sum(states==Urank(n)); %number of observations
        end

        num_permute=len_rank-1;
        rank_prod=prod([Urank;NumObs],1); %product of rank and number of observations
        rank_sum=zeros(1,num_permute);
        cumNumObs=zeros(1,num_permute);
        for n=1:len_rank
            rank_sum(1,n)=sum(rank_prod(Urank(1:n)));
            cumNumObs(1,n)=sum(NumObs(1:n));
        end
    
        E_value=0; %initial entropy set to zero
        comp=0;
        numpart=1;
        while E_value(end)>=comp&&numpart<len_rank; %loop while information increases with partition number
            comp=E_value(end); %previous entropy saved for comparison to new value
    
            P=nchoosek(Urank(1:end),numpart); %possible partition locations
            P=P(P(:,end)<max(Urank),:);
    
            if numpart>5 %prevents loop errors
                disp('Too many partitions (>5) predicted.')
                break        
            end
    
            if numpart<2
                t1(1,:)=rank_sum(P); %sum of rank values at all partitions
                t1(2,:)=rank_sum(end)-rank_sum(P); %leftover ranks at all partitions
                t2(1,:)=cumNumObs(P); %sum of observations at all partitions
                t2(2,:)=cumNumObs(end)-cumNumObs(P); %leftover observations at all partitions
            else
                t1=zeros(numpart+1,size(P,1));
                t2=zeros(numpart+1,size(P,1));
        
                t1(1,:)=rank_sum(P(:,1));
                t2(1,:)=cumNumObs(P(:,1)); %sum of observations at all partitions
                for n=2:numpart
                    t1(n,:)=rank_sum(P(:,n))-rank_sum(P(:,n-1));
                    t2(n,:)=cumNumObs(P(:,n))-cumNumObs(P(:,n-1)); %leftover observations at all partitions
                end
                t1(n+1,:)=rank_sum(end)-rank_sum(P(:,n));
                t2(n+1,:)=cumNumObs(end)-cumNumObs(P(:,n));
            end
    
            tsum=t1.^2./t2; %sum of squared ranks/obs at all partitions
            totalsum=sum(tsum,1);
            [~,part_index]=max(totalsum); %location of maximum entropy
            Pmax=P(part_index,:);
     
            if numpart<2
                E(1,1)=cumNumObs(part_index)/cumNumObs(end);
                E(2,1)=(cumNumObs(end)-cumNumObs(part_index))/cumNumObs(end);
            else
                E=zeros(numpart,1);
                E(1,1)=cumNumObs(Pmax(1))/cumNumObs(end);
                for n=2:numpart
                    E(n,1)=(cumNumObs(Pmax(n))-cumNumObs(Pmax(n-1)))./cumNumObs(end);
                end
                E(n+1,1)=(cumNumObs(end)-cumNumObs(Pmax(n)))./cumNumObs(end);
            end
            E_value(numpart,1)=-1/(numpart+1)*sum(E.*log2(E));
            numpart=numpart+1;
        end

        %clear E_value E cumNumObs part_index totalsum tsum t1 t2 rank_sum Urank
        %clear comp len_rank NumObs num_permute rank_prod

        disp('')
        disp(['Using ',num2str(numpart-1),' partitions at locations [',num2str(Pmax),']'])

    else
        Pmax=part;
    end
    
%distribute ranks into states
    if length(Pmax)>1
        Pmax=[0,Pmax];
        for m=1:length(Pmax)-1
            states(states>Pmax(m)&states<=Pmax(m+1))=m;
        end
        states(states>Pmax(end))=m+1;
    else
        states(states<=Pmax)=1;
        states(states>Pmax)=2;
    end
end

%%
%Create contingency table

%Separate states vector into different categories (one for each unique value)
Ustates=unique(states); %identify uniques states
len_states=length(Ustates); %number of unique states
len_time=length(t); %length of time-series
time_series=zeros(len_states,len_time); %preallocate memory for contingency table

for n=1:len_states %for each state...
    time_series(n,:)=states==Ustates(n);...identify which periods contain state
end
time_series=int8(time_series); %time_series as integers, so row_sum can be done
Row_sum=sum(time_series,2); %sum of rows
Row_mult=log(Row_sum/len_time); %log of row data
Row_mult(Row_mult==-Inf)=0; %if log = -inf, replace with zero
H_S=-sum((Row_sum/len_time).*Row_mult); %entropy of rows of time-series

clear Ustates Row_sum Row_mult
%pre-allocate resources
Nyquist=floor(len_time/2);
periods=2:Nyquist;
Col_sum1=zeros(len_states,Nyquist);
H_X=zeros(1,Nyquist-1);
H_SX=zeros(1,Nyquist-1);

%create loop for all periods
for discreteperiod=periods
    for j=1:discreteperiod
        tempcontable=time_series(:,j:discreteperiod:end);
        Col_sum1(:,j)=sum(tempcontable,2); %build contingency table
    end
    Col_sum2=sum(Col_sum1,1); %sum of columns
    Col_mult=log(Col_sum2/len_time); %log of column data
    Col_mult(Col_mult==-Inf)=0; %if log = -inf, replace with zero 
    H_X(1,discreteperiod-1)=-sum((Col_sum2/len_time).*Col_mult); %entropy of columns in period
    
    Col_mult=log(Col_sum1/len_time); %log of contingency periodogram
    Col_mult(Col_mult==-Inf)=0; %if log = -inf, replace with zero 
    H_SX(1,discreteperiod-1)=-sum(sum((Col_sum1/len_time).*Col_mult)); %entropy in table
end

clear discreteperiod tempcontable Col_sum1 Col_sum2 Col_mult
Entropy=repmat(H_S,1,Nyquist-1)+H_X-H_SX; %total entropy
DOF=(len_states-1)*(periods-1); %degrees of freedom for chi-squared test
sig=chi2inv(0.95,DOF)/(2*len_time); %significance level for periodogram

%%
%Results: tabular
sig_periods=periods(Entropy>sig); %find potentially significant periods
if isempty(sig_periods)~=1
    disp('')
    disp('Periods with possible statistical significance:')
    disp(sig_periods)
else
    disp('')
    disp('No periods are statistically significant')
    sig_periods=NaN;
end

%Results: graphical
figure('color','w')
plot(t,states,'k-',t,states,'k.')
set(gca,'ytick',min(states):1:max(states))
xlabel('Time')
ylabel('States')
title('Time Series')

figure('color','w')
title('Contingency Periodogram Results')
xlabel('Period')
ylabel('Entropy')
hold on
plot(periods,Entropy,'k-',periods,sig,'k--')
legend('States','95% significance','location','NorthWest')

Contact us