Code covered by the BSD License  

Highlights from
Fisher’s Exact Test

from Fisher’s Exact Test by Michael Boedigheimer
Hypergeometric cumulative distribution (left, right and two-tailed) including permutation testing.

fexact( varargin )
function [pval x K P xx fx pperm] = fexact( varargin )
%FEXACT Fisher's exact test
%
% Fisher's exact test is a statistical test used to compare binary outcomes
% between two groups. For example, a laboratory test might be positive or negative
% and we may be interested in knowing whether there is a difference in frequency of positive
% results among people with a certain disease (cases) compared to those in healthy
% (controls). The test is applicable whenever there are independent tests conducted on two
% groups. A crosstable of results (pos/neg) and status (case/control) produces a 2x2 contingency
% table shown below.
%       cases  controls
%  pos   a        c       K
%  neg   b        d       -
% total  N        -       M
%
% a,b,c,d are the numbers of positive and negative results for the cases
% and controls. K, N, M are the row, column and grand totals.
%
% This function returns a pvalue against the hypothesis that the number of
% positive cases (and by extension other cells in the table) observed or a more extreme 
% distribution might have occured by chance given K, N,and M.
%
% usage
%   p = fexact(X,y)
%       y is a vetor of status (1=case/0=control). X is a MxP matrix of
%       binary results (pos=1/neg=0). P can be very large for
%       genotyping assays, each of which can be considered a different way
%       to categorize the cases and controls.
%
%  p = fexact( a,M,K,N, options)
%      M and N are constants described above. a and K are P-vectors. No checks for valid
%      input are made.
%
%  [p a K C] = fexact(X,y);
%       also returns vector a. a(i) is the upper left square of the contingency
%       corresponding to the ith column of X crosstabulated by y. K is a
%       vector containing a+b for the ith column of X.
%       C is a lookup table for CDFs that can be used in subsequent calls
%       to fexact to further improve performance.
%       C( x(i)+1, K(i)+1) is the cdf for the tail specified in
%       options. 
%       NB. this lookup table is only for the given M and N values
%
%  p = fexact( a,M,K,N,C]
%      C is a lookup table that improve performance. Intended to be
%      used with permutation testing where 1000s of calls are made.
%      the option "tail" is ignored if C is provided, as it implies
%      whatever tail was used to generate them. 
%
%  p = fexact(..., 'options', values)
%      options:
%      tail  l(eft)','r(ight)', 'b(oth)'
%            left tail tests a negative association. p( x<=a, M, K, N)
%            right tail tests a positive association. p( x<=b, M, K, M-N)
%            both (default) is either a positive or negative association.
%            this is the sum from the left and right tails of the
%            distribution including all x where p(x|M,K,N) is less than
%            or equal p(a|M,K,N).
%      perm  Q, where Q is an integer. Permutation testing when X has multiple Columns. 
%             to correct for multiple testing. The entire set of tests is repeated Q*repsz
%            times with permuted y variables. The reported p-values are 
%            corrected for multiple tests by interpolating into the emprical 
%            distribution of the minimum p-value obtained from each Q*repsz rounds .
%            To use this option the X and y calling convention must be used
%            (as opposed to (a,M,K,N,...) convention.
%            and Q must be greater than 1.
%      repsz S, where S is an integer. Used with the PERM option to specify 
%            the number of replicates to compute at one time (default=100). 
%            Larger S is faster but requires more memory. 
%
%
% NOTES and LIMITATIONS:
%      This function is extremely fast when doing multiple tests and is
%      acceptable with a small number of tests. However, it uses large
%      tracks of memory. On my 2Gb home Intel Core2 Quad Core Vista machine
%      this function does 250,000 tests with 100 observations each in 0.10
%      seconds. I run out of memory when I do more
%
% example
%  x = unidrnd(2, 200, 1000)-1;
%  y = unidrnd(2,200,1)-1;
%  p = fexact( x, y );                  % generates p-values for 1000 tests in x
%  p1 = fexact( x, y, 'perm', 10); % reports p-value relative to empirial
%                                               % cdf, since these are randomly generated 
%                                               % the values in p1 should be near  1
%   

% Copyright 2009 Mike Boedigheimer
% Amgen Inc.
% Department of Computational Biology
% mboedigh@amgen.com

p = inputParser;
p.addRequired('A');
p.addOptional('y',[]);
p.addOptional('K',[]);
p.addOptional('N',[]);
p.addOptional('C',[]);
p.addOptional('tail', 'b', @(c)ismember( c, {'b','l','r'} ));
p.addOptional('perm',  0, @(x) isnumeric(x)&isscalar(x));
p.addOptional('repsz', 100, @(x) isnumeric(x)&isscalar(x));
p.parse(varargin{:});

P = p.Results.C;
if isempty(p.Results.N)     % using fexact(X,y,options)
    X = p.Results.A;
    y = p.Results.y;
    if (~islogical(y) && ~all( ismember( y, [0 1])) ) 
        error('linstats:fexact:InvalidArgument', 'y must be in (0,1)' );
    end
    if (~islogical(X) && ~all( ismember( X(:), [0 1])) ) 
        error('linstats:fexact:InvalidArgument', 'X must be in (0,1)' );
    end

    y = logical(y);
    N = sum(y);           % number of cases
    M = length(y);
    K = sum(X,1)';
    x = X'*sparse(y);  % in unofficial testing using timeit. This was faster than sum(X(y==1,:))
    % and faster than sum(bsxfun(@eq,X,y))
else % using fexact( a,M,K,N ...)
    x = p.Results.A;
    M = p.Results.y;
    N = p.Results.N;
    K = p.Results.K;
end

switch p.Results.tail
    case 'l'; tail = -1;
    case 'r'; tail = 1;
    case 'b'; tail = 2;
end

if N==0 || N==M
    pval = ones(1,length(x));
    return;
end;

if isscalar(M) && isscalar(N) && isscalar(K) && isscalar(x) && ~(p.Results.perm > 1)
    pval = doTest( x, M, K, N, tail );
    pval(pval>1) = 1;   % fix roundoff errors
    return;
end

if isempty(P)
    P = getLookup( M,N,tail);
end

pval = P( sub2ind(size(P), x+1, K+1));


if p.Results.perm > 1
    pperm = doPerm(X,y,M,K,P,p.Results.perm,p.Results.repsz);
    [fx xx] = mecdf(pperm(:));
    pval   = interp1( xx, fx, pval);
end

pval(pval>1) = 1;   % fix roundoff errors


function pperm = doPerm(X,y,M,K,P,nperms,repsz)
% returns pperm a matrix with each element representing the smallest p
% value from a complete study where the rows of y have been randomly
% permuted
pperm = ones(repsz,nperms);

% sub2ind was previously 85% of the execution time. 
% this method computes the index directly rather than
% computing x and then calliung sub2ind. It is much
% faster because it uses the fast multiple engine
% precompute K*nrows +1, then add x after it is calculated
X = [ K*size(P,1)+1 X' ]; 
B = ones(1,repsz);

for permi = 1:nperms
    % generate new random vector y, by shuffling the rows (do not change
    % the counts of M,K or M)
    [i i] = sort( randn(M,repsz) ); % note randn was faster than rand when I checked
    pperm(:,permi) = min( P( X*[B;y(i)] ),[], 1);
end


function P = doTest( x, M, N, K, tail)

minx = max(0,N+K-M);
maxx = min(N,K);
if x < minx
    error( 'x is can not be smaller than max(0,N+K-M)' );
end

if x > maxx;
    error( 'x cannot be larger than min(N,K)');
end

%log p(x|M,N,K) = Q(K,x)+Q(M-K,N-x)-Q(M,N) where Q(n,k) = lnchoosek(n,k)
if tail == -1
    a = minx:x;
    logP = lnchoosek( K, a) + lnchoosek(M-K,N-a) - lnchoosek(M,N);
elseif tail == 1
    a = x:min(K,N);
    logP = lnchoosek( K, a) + lnchoosek(M-K,N-a) - lnchoosek(M,N);
else
    a = 0:min(K,N);
    logP = lnchoosek( K, a) + lnchoosek(M-K,N-a) - lnchoosek(M,N);
    pcrit = logP(x+1);
    k = (logP - pcrit) <= M*eps(max(logP));
    logP  = logP(k);
end

P = sum(exp(logP));



function P = getLookup( M, N, tail)

F = gammaln( 1:M+1);  % used to compute factorials. For small problems
% It is overkill to generate all factorials up
% to M, but it takes on .001 seconds to compute
% all of them to 1..10,001, so we can afford
% it. Memory-wise may be more of an issue, and
% a reason to

% lookup value for lnchoosek(a,b) in matrix L for all 1<=b<=a<=M
% to lookup log nchoosek(n,k) use L(n+1,k+1);
[k n ind] = tri2sqind(M);
L = zeros(M);
L(ind) = lnchoosek( n, k, F);
L = blkdiag(0,L');      % pad first row and col with 0s for n=0, k=0


%log p(x|M,N,K) = Q(K,x)+Q(M-K,N-x)-Q(M,N) where Q(n,k) = lnchoosek(n,k)

% create a lookup table for Log P
% to lookup log p(x,|M,N,K) use P(x+1,K+1);
% P initially contains 1s for valid values of x
% x is in [min( 0, N+K-M), ..., min(N,K)]
% min(N,K) is on the diagonal. the first
% column that contains a 0 in the first row will be when the minimum legal
% value of x is 1, so N+K-M == 1. K will equal M-N+1, which is the column
% M-N+2
P = spdiags(ones(N+1,M+1),0:M-N,N+1,M+1); 
[Xind Kind] = find(P);
MKind = M-Kind+2;
NXind = N-Xind+2;

q = sub2ind(size(P),Xind,Kind);
P = nan(size(P));
P(q) = exp( L(sub2ind(size(L),Kind,Xind))    + ... Q(K,x)
    L(sub2ind(size(L),MKind,NXind))  + ... Q(M-K, N-x)
    -L(M+1,N+1));                     % ... Q(M,N)
if tail == -1
    P = cumsum(tril(P, M-N));
elseif tail == 1
    P = flipud(triu(P));
    P = cumsum(P);
    R = N-(-1:size(P,1)-2)';
    P = P(R,:);
else
    % This function has been tested against two reference algorithm using all M
    % and N up to 30. The reference algorithms differ in how they find the
    % find the tail on the other side of the distribution.
    [q r] = size(P);
    [P idx] = sort(P,1);     % sort order, called rank, will be transformed to rank below
    ties = [abs(diff(P)) < M*numel(P)*eps(P(1:end-1,:));
        zeros(1,r)];
    R = repmat( (1:q)',1,r);
    R(col2ind(idx)) = R+ties;
    P = cumsum(P,1);          % reuse P to store cumulative P
    P = P(col2ind(R));
end



function q = lnchoosek( n, k, f)
% lnchoosek natural log of nchoosek
%
% usage
%       q = lnchoosek( n, k );
% usage
%       q = lnchoosek( n, k, f); where f is the gammaln(1:max(n,k)+1);
q    = n-k;       % allocate space same size as n
i    = q>0;       % only compute for valid n and k

if nargin < 3
    % q is 0 when n = k because log nchoosek(a,a) = 1 for any a >= 0
    % otherwise it is given by ...
    q(i) = -log(q(i)) - betaln(k(i)+1,q(i));     % otherwise this is it
else
    q(i) =  f(n(i)+1) - f(k(i)+1) - f( q(i)+1 );
end

q(~i) = 0;


function i = col2ind(order,siz)
%col2ind converts column specific 1-based indices to element based indices
%i = col2ind(A, siz)
%   return i, a set of integer indices into a m x n matrix
%   A is a p x n set of column specific indices as you'd get from the second
%   output of sort
%   size is optional.
%        if present, P is set to SIZ(1), otherwise
%        P is set to the size of the first dimension of ORDER,
%
% example
%   [xsort order] = sort(x);
%   wrong = x(order);    % this isn't what you want
%   xsort = x(col2ind(order));   % this is

[m p] = size(order);
if nargin < 2
    q = m;
else
    q = siz(1);
end
oset = 0:q:(q*p-1);
i = order + repmat( oset, m, 1 );

function [i,j,k] = tri2sqind( m, k )
%TRI2SQIND subscript and linear indices for upper tri portion of matrix
%
% get indices into a square matrix for a vector representing a the upper
% triangular portion of a matrix such as those returned by pdist.
%
% [i,j,k] = tri2sqind( m, k )
%  If V is a hypothetical vector representing the upper triangular portion
%  of a matrix (not including the diagonal) and
%  M is the size of a square matrix and
%  K is an optional vector of indices into V then tri2sqind returns
%  (i,j) the subscripted indices into the equivalent square matrix.
%  K is an integer index into the equivalent square matrix
%
% Example
%  X = randn(5, 20);
%  Y = pdist(X, 'euclidean');
%  [i,j,k] = tri2sqind( 5 );
%  S = squareform(Y);
%  isequal( Y(:), S(k) );
%  Z = zeros(5);
%  Z(k) = Y;

max_k = m*(m-1)/2;

if ( nargin < 2 )
    k = (1:max_k)';
end;

if any( k > max_k )
    error('linstats:tri2sqind:InvalidArgument', 'ind2subl:Out of range subscript');
end;


i = floor(m+1/2-sqrt(m^2-m+1/4-2.*(k-1)));
j = k - (i-1).*(m-i/2)+i;
k = sub2ind( [m m], i, j );

function [fx x] = mecdf( y )
% MECDF my ECDF function to avoid dependendencies on stats toolbox
% assumes 0 <= y <= 1
% x always includes 0 and fx(x>1) = 1;
n = length(y);
[x i] = unique(sort(y));
fx = linspace( 0, 1, n+1 )';
fx = fx(i+1);
if x(1)>0
    x = [0;x];
    fx = [0;fx];
end

if x(end) < 1    % set up the maximum x value as Inf, so interp1 never fails 
    x  = [x;Inf];
    fx = [fx;1];
end



Contact us at files@mathworks.com