% This file refers to section 6 (the INK example) of:
% Technical Report RP-SOR-0601, Robust Experimental Design for Multivariate Generalized Linear Models, Hovav A. Dror and David M. Steinberg, January 2006.
% Available at http://www.math.tau.ac.il/~dms/GLM_Design
% It is a work through for finding a robust design using clustering
% to a Poisson model example, 5-dimensional, normal prior, two competing models
% You can run the script and see its stages
% The Parameter Space
BetaV=[];
BetaV1=[];
BetaV2=[];
BetaMeans1=[ -2.3479 -5.5296 -2.9929 -3.9494 -0.8555 0.4139 -2.0666 -1.1263];
BetaSE1=[ 0.6863 0.9408 0.8164 0.5876 0.5389 0.3649 1.3195 0.9848];
BetaMeans2=[ -1.5165 -4.2972 -1.7859 -3.3852 -0.2806 0.2282];
BetaSE2=[ 0.2058 0.2011 0.1581 0.2367 0.3185 0.3026];
m=5; % m - number of variables
model1=[zeros(1,m) ; eye(m) ; 1 1 zeros(1,m-2) ; 1 0 1 zeros(1,m-3)];% for the INK example
model2='linear';
if isnumeric(model1)
p1=length(model1);
else
p1=length(x2fx(ones(1,m),model1));
end;
if isnumeric(model2)
p2=length(model2);
else
p2=length(x2fx(ones(1,m),model2));
end;
family='poisson';
link='log';
plot([0 0 1],[1 0 0],'k'); % empty plot - preparation only
scrsz = get(0,'ScreenSize');
set(gcf,'Position',[2*scrsz(3)/3 2*scrsz(4)/3 scrsz(3)/3 scrsz(4)/3]); % small figure upper right of the screen
% ------------------------------------------------------------------------
% This step is optional.
% Its purpose is to choose the number of support points
% in each Local D-optimal design
fprintf('----------------------------------------------------------------------------------------------------\n');
fprintf('\nThis is an optional step \n');
fprintf('It helps choosing the number of support \n');
fprintf('points for a Local D-optimal Design (richer model) \n');
t1=0; % time for first step
A=input('Choose "1" to use the option, \nor press "Enter" to skip: ');
if ~isempty(A)
t1=clock;
fprintf('The number of support points tested now is:');
BetaMean= BetaMeans1; % The centroid of the beta space for the richer model
NSP=p1:2^m; % Sequence of support points values to check
SPE=[]; % Vector with the Efficiency for each NSP value
i=0;
for SP=NSP
i=i+1;
fprintf(' %g',SP);
[X,IM,SPE(i)]=DGLM(BetaMean,m,SP,model1,family,link,10^-2);
end; % for SP
fprintf(' \n \n');
plot(NSP,SPE/max(SPE),'.-k');
axis([min(NSP) max(NSP) 0 1]);
grid on;
t1=etime(clock,t1); % Time to compute
fprintf('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n');
fprintf('(Finished optional part in %1.0f seconds)\n',t1);
xlabel('Number of Support Points in Beta''s Centroid Local D-Optimal Design');
ylabel('Relative Efficiency');
title('Choose the proper number of support points for a Local Design');
LSP=input('Using this graph, choose the number of \nSupport Points for Local Designs (default=8): ');
if isempty(LSP), LSP=8; end;
else % if skipped
LSP=input('Choose the number of Support Points \nfor Local Designs (default=8): ');
if isempty(LSP), LSP=8; end;
end; % if use the option to choose LSP by graph
% -------------------------------------------------------------------------
% First Step: Create Local Designs all over the Parameter Space
N=1000; % Number of vectors from the parameter sapce
% Use a Niederreiter sequence of proper size
%NI=Nied(N,p); % If function exist: Niederreiter sequence of the proper size
%otherwise:
t2=clock; % timing the Local Database creation
NI2=load('Nied1000.m');
NI1=NI2(1:N,1:p1);
NI2=NI2(1:N,1:p2);
% Now transform it according to distribution
BetaV1 = norminv(NI1,ones(N,1)*BetaMeans1,ones(N,1)*BetaSE1);
BetaV2 = norminv(NI2,ones(N,1)*BetaMeans2,ones(N,1)*BetaSE2);
% Now Search for the local Designs, and put them in the matrix X
X=[]; % Designs
IM=[]; %Information Matrices
IMD=[]; % D-Criteria value of the regression matrices
fprintf('\n----------------------------------------------------------------------------------------------------\n');
fprintf('Next: Finding 100 Local D-Optimal designs for each model \n');
N2=input('press "enter" to continue, \n(or choose a different number of Local Designs): ');
if ~isempty(N2), N=N2; else N=100; end;
for ModelN=1:2
fprintf('\nModel%g, Now Finding Design #:',ModelN);
if ModelN==1
model=model1;
BetaV=BetaV1;
else
model=model2;
BetaV=BetaV2;
end;
for i=1:N
fprintf(' %g',i);
[Xi,IMi, Di]=DGLM(BetaV(i,:),m,LSP,model,family,link);
X=[ X ; Xi ] ;
%IM{i}=IMi;
IMD=[IMD ; Di];
end;
end; %for ModelN
X=X-sign(X).*rand(size(X)).*0.0001; % Jittering
t2=etime(clock,t2);
fprintf('\n- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -');
fprintf('\nDone! Database Creation time %2.0f seconds.\n \n',t2);
% -----------------------------------------------------------------------------------------
% Next Step is choosing the number of Support points for the compromise
% design (What should be the K of the K-means clustering?)
t3=clock; %timing this step
fprintf('----------------------------------------------------------------------------------------------------\n');
fprintf('Time to choose "K" for K-means Clustering. \n');
fprintf('Repeateing the procedure only once for each K \n');
fprintf('(may cause the graph to be a bit jumpy) \n');
input('press "enter" to continue');
fprintf('Now Trying K= ');
KS=[8:16 18:2:30 32:4:64]; % sequence of tested K values
KX=[]; % Design matrix by clustering (for different K's);
Eff=[];
MedEff=[];
MinEff=[];
i=0;
for K=KS
i=i+1;
fprintf(' %g',K);
[IDX,KX,s] =kmeans(X,K,'maxiter',500,'emptyaction','singleton','distance','cityblock');
for j=1:N
Eff(i,j)=( det( InfoMtrxGLM(x2fx(KX,model1),BetaV1(j,:)',family,link) )^(1/p1) ) / K / IMD(j);
Eff(i,N+j)=( det( InfoMtrxGLM(x2fx(KX,model2),BetaV2(j,:)',family,link) )^(1/p2) ) / K / IMD(N+j);
end; % for j
MedEff(i)=median(Eff(i,:));
MinEff(i)=min(Eff(i,:));
end; % for K
t3=etime(clock,t3);
fprintf('\n- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -');
fprintf('\nDone in %2.0f seconds (All phases togeter %2.0f seconds)\n',[t3 t1+t2+t3]);
plot(KS,MedEff,'.-k');
hold on
plot(KS,MinEff,'s:k','markersize',3);
hold off
xlabel('Number of Support Points');
ylabel('Approximate Efficiency');
legend('Median Efficiency','Minimum Efficiency','Location','East')
title('Choose "K" - the number of support points for the robust design');
K=input('Using this graph, choose the number of Support Points, K, \nfor The Robust Design (default=48): ');
if isempty(K), K=48; end;
% -----------------------------------------------------------------------------------------
% Now use K-means clustering to produce the Robust Design
fprintf('\n----------------------------------------------------------------------------------------------------\n');
fprintf('Last step - repeat the clustering with the chosen K (%g), \nand choose the Robust Design\n',K);
%Nrepeat=N; % Number of times the clustering will be performed
Nrepeat=40; % Number of times the clustering will be performed
Nrepeats=input(sprintf('Choose the number of clustering repetitions \n(or press "enter" to use %g repetitions): ',Nrepeat));
if isempty(Nrepeats), Nrepeats=Nrepeat; end;
fprintf('Number of Clustering repetition processed: ');
t4=clock; % timing the step
BDV = -inf ; % Best D-criteria value found
for i=1:Nrepeats
fprintf(' %g',i);
[IDX,KX,s] =kmeans(X,K,'maxiter',500,'emptyaction','singleton','distance','cityblock');
DV=1; % D-criteria representing value for this clustering
for j=1:N
DV=DV + log( det(InfoMtrxGLM(x2fx(KX,model1),BetaV1(j,:)',family,link) ) )/p1;
DV=DV + log( det(InfoMtrxGLM(x2fx(KX,model2),BetaV2(j,:)',family,link) ) )/p2;
end; % for j
if DV>BDV
BX=KX;
BDV=DV;
end; % if improved the design update BX - the best design found so far
end; % for i (Nrpeats)
t4=etime(clock,t4);
fprintf('\n- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n');
fprintf('Done! This step took %2.0f seconds. Overall the process took %2.0f seconds',[t4 t1+t2+t3+t4]);
fprintf('\nThe best design found, the matrix BX, is ready to be used\n');
fprintf('----------------------------------------------------------------------------------------------------\n');
% -------------------------------------------------------------------------
% Last, without computing "many" local designs, we can't produce
% a good histogram of efficiencies.
% The following is a rough approximation, based on the N designs created
% before
input('Press "enter" for a plot representing a rough estimation \nof the efficiency histogram\n');
HEff=[];
for i=1:N
% HEff = [ HEff ( det( InfoMtrxGLM(x2fx(BX,model1),BetaV1(i,:)',family,link) )^(1/p1) ) / K / IMD(j) ];
% HEff = [ HEff ( det( InfoMtrxGLM(x2fx(BX,model2),BetaV2(i,:)',family,link) )^(1/p2) ) / K / IMD(N+j) ];
HEff(i)=( det( InfoMtrxGLM(x2fx(BX,model1),BetaV1(i,:)',family,link) )^(1/p1) ) / K / IMD(i);
HEff(N+i)=( det( InfoMtrxGLM(x2fx(BX,model2),BetaV2(i,:)',family,link) )^(1/p2) ) / K / IMD(N+i);
end;
hist(HEff);
x=axis;
axis([0 1 x(3:4)]);
xlabel('Approximate Efficiency');
title(sprintf('Efficiency Estimation based on the same %g local designs',N));