MMGDX: a maximum-margin training method for neural networks

17 Sep 2010

Maximum-margin training method applicable to MLP in the context of binary classification.
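A minimal usage sketch (not part of the original submission), assuming a synthetic 2-D data set and that both train_MMGDX.m and sim_MMGDX.m are on the MATLAB path; the data and variable names below are illustrative only:

% Hypothetical example: each column of F_train is one input vector and
% t_train holds the corresponding targets (+1 positive, -1 negative).
F_train = [randn(2,200)+1, randn(2,400)-1];
t_train = [ones(1,200), -ones(1,400)];
nneu = 5;                                   % number of hidden neurons
[Nor,W1,W2,b1,b2] = train_MMGDX(F_train,t_train,nneu);
% sim_MMGDX (as called inside train_MMGDX) returns the accuracy and the
% estimated outputs:
[acc,estimated] = sim_MMGDX(Nor,W1,W2,b1,b2,F_train,t_train);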

train_MMGDX.m
%This code is an implementation of MMGDX, a maximum-margin training
%method for MLP neural networks. In case of publication of any application
%of this method, please cite the original work: O. Ludwig and U. Nunes;
%Novel Maximum-Margin Training Algorithms for Supervised Neural Networks;
%IEEE Transactions on Neural Networks, vol. 21, issue 6, pp. 972-984, Jun.
%2010. The algorithm receives the following input data: F_train, which
%must contain all the input data (each column represents one input vector);
%t_train, a row vector in which each element is the target output of
%its respective input vector; and nneu, which determines the number of hidden
%neurons. t_train must receive the value -1 for negative occurrences or 1
%for positive occurrences. The algorithm outputs the MLP parameters W1, W2,
%b1, and b2, which are the synaptic weight matrices and biases of the net.
%These parameters are required by the routine that simulates the trained MLP
%(sim_MMGDX). This training method is ready to treat unbalanced data;
%however, it assumes the negative class is the majority class, as often
%occurs in practice. After the training session the algorithm may suggest
%increasing or decreasing the number of hidden neurons. This is only a
%suggestion, based on the analysis of the bias-variance tradeoff over the
%training data. Therefore, the user should test the result before changing
%the number of hidden neurons.
function [Nor,W1,W2,b1,b2]=train_MMGDX(F_train,t_train,nneu)
if min(t_train)>=0
    disp('Warning: the target-output must be -1 or 1, otherwise the neural network training will fail')
    beep
    beep
    pause(10)
end
L=size(t_train,1);
if L>1
   disp('Warning: the target-output vector will be transposed')
   t_train=t_train';
   beep
   beep
   pause(5)
end
[L,C]=size(F_train);
if L>C
    disp('Warning: the input set has more features than data points')
    beep
    beep
    pause(10)
end
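%Normalizing each feature to unit range; Nor is returned because the same
%scaling must be applied to new data by sim_MMGDX: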
Nor=zeros(L);
for k=1:L
    Nor(k,k)=1/(max(F_train(k,:))-min(F_train(k,:)));
end
F_train=Nor*F_train;
%**************************************************************************
%Parameters:
alpha1=10^-(19); %learning parameter first layer
alpha2=10^-(20); %learning parameter second layer
gain_down=0.8;
gain_up=1.02;
iter_max = 4000;
p=8;
%**************************************************************************
%Generating 10 sub-sets for cross-validation:
X=F_train;
y=t_train;
[Li,C]=size(X);
m=round(C/10-.5);
X_cv=zeros(Li,m,10);
y_cv=zeros(10,m);
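%Each sub-set is filled by drawing columns at random without replacement
%(every drawn column is removed from X and y before the next draw):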
for k=1:10
    col=0;
    for n=1:m
        C=size(X,2);
        col=col+1;
        r=round((C-1)*rand)+1;
        X_cv(:,col,k)=X(:,r);
        X=[X(:,1:r-1),X(:,r+1:end)];
        y_cv(k,col)=y(r);
        y=[y(1:r-1),y(r+1:end)];
    end
end
%**************************************************************************
%Generating training data set:
X=F_train;
y=t_train;
[L,C]=size(X);
N1=0;
N2=0;
for k=1:C
    if y(k)==1
       N1=N1+1;
    else
       N2=N2+1;
    end
end
class1=zeros(L,N1);
class2=zeros(L,N2);
k1=0;
k2=0;
for k=1:C
    if y(k)==1
        k1=k1+1;
       class1(:,k1)=X(:,k);
    else
        k2=k2+1;
       class2(:,k2)=X(:,k);
    end
end
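%Balancing the training set: for each negative sample, count how many other
%negative samples lie closer to it than its nearest positive sample. A small
%count means the sample is close to the class boundary.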
N=zeros(N2,1);
for n=1:N2
    d_min=1e10;
    for m=1:N1
        d=norm(class2(:,n)-class1(:,m));
        if d<d_min
            d_min=d;
        end
    end  
    for m=1:N2
        d=norm(class2(:,n)-class2(:,m));
        if d<d_min
            N(n,1)=N(n,1)+1;
        end
    end
end
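%Keep only the N1 negative samples with the smallest counts (those nearest
%to the positive class), so the balanced set has N1 samples per class: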
class2_sort=sortrows([N,class2'],1);
class2_sort=class2_sort(1:N1,2:end);
class2_sort=class2_sort';
X_bal=[class1,class2_sort];
y_bal=[ones(1,N1),-ones(1,N1)];
F_train=X_bal;
t_train=y_bal;
%**************************************************************************
%Initializing the weights:
W1=zeros(nneu,Li);
for k=1:Li
    W1(:,k)=(rand(nneu,1)-.5)/Li/mean(F_train(k,:));
end
W2=(rand(1,nneu)-.5)*.5/nneu;
b1=(rand(nneu,1)-.5)*.5;
b2=0;
%##########################################################################
%Here starts the gradient-based training:
p_bk=p;
dJdW1_mom=0;
dJdW2_mom=0;
dJdb1_mom=0;
cont=0;
flag=0;
J=[20000,2000];
namost=size(F_train,2);
max_lines=size(W1,1);
V=zeros(nneu,namost);
Yh=zeros(nneu,namost);
ye=zeros(1,namost);
J_max=1e10;
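%Despite its name, J_max tracks the lowest objective value found so far;
%the corresponding best weights are kept in W12, W22, b12, b22.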
iter=2;
rep=0;
while and(iter<iter_max,rep<40)
    if iter>1000
        if abs(J(3)-J(iter))<1e-10
            alpha1=alpha1*10;
            alpha2=alpha2*10;            
        end
        if abs(J(iter)-J(iter-100))<J(iter)/10000
           rep=rep+1;
           disp('A LOCAL MINIMUM OF THE MAXIMUM-MARGIN OBJECTIVE FUNCTION WAS REACHED. THEREFORE, THE USUAL GDX WILL BE APPLIED IN ORDER TO ESCAPE FROM THIS MINIMUM')
           net=newff(F_train,t_train,[nneu,1],{'logsig','purelin'},'traingdx','learngdm', 'mse', {}, {}, 'dividerand');
           %net=newff(F_train,t_train,[nneu,1],{'logsig' 'purelin'});%for Matlab 7.0
           %net=newff([min(F_train')',max(F_train')'],[nneu,1],{'logsig' 'purelin'}); %for Matlab 6.0
           net.trainFcn = 'traingdx';
           net.trainParam.epochs = 100;
           net.IW{1,1}=W1;
           net.b{1,1}=b1;
           net.LW{2,1}=W2;
           net.b{2,1}=b2;
           [net,E] = train(net,F_train,t_train);
           W1=net.IW{1,1};
           b1=net.b{1,1};
           W2=net.LW{2,1};
           b2=0;
        end
    end
    iter=iter+1;
    dJdW1=zeros(size(W1));
    dJdW2=zeros(size(W2));
    dJdb1=zeros(size(b1));
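    %Forward pass over all training samples: V stores the hidden
    %pre-activations, Yh the hidden outputs, and ye the network outputs.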
    for exemplo=1:namost            
        x=F_train(:,exemplo); 
        v=W1*x+b1;           
        yh=logsig(v);           
        V(:,exemplo)=v;            
        Yh(:,exemplo)=yh;
        ye(1,exemplo)=W2*yh+b2;
    end
    gain=(sum((t_train-ye).^p))^((1-p)/p);
    nw=norm(W2);
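    %Accumulate the gradients, with respect to W2, W1, and b1, of the p-norm
    %margin objective J = norm(sqrt(nneu)*t_train - ye/norm(W2), p)/namost
    %that is evaluated as J(iter) below.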
    for exemplo=1:namost
        dJdW2=dJdW2+gain*(((((nneu)^.5)*t_train(exemplo)-ye(exemplo)/nw)^(p-1))*((nw^-3)*W2*Yh(:,exemplo)*W2'-Yh(:,exemplo)/nw+b2*(nw^-3)*W2'))';
        for line=1:max_lines
            dJdW1(line,:)=dJdW1(line,:)-gain*(((((nneu)^.5)*t_train(exemplo)-ye(exemplo)/nw)^(p-1))*W2(line)*(1/(1+exp(-V(line,exemplo)))^2*exp(-V(line,exemplo)))*F_train(:,exemplo)/nw)';
            dJdb1(line)=dJdb1(line)-gain*(((((nneu)^.5)*t_train(exemplo)-ye(exemplo)/nw)^(p-1))*W2(line)*(1/(1+exp(-V(line,exemplo)))^2*exp(-V(line,exemplo)))/nw)';
        end
    end  
     W1_bakup=W1;
     W2_bakup=W2;
     b1_bakup=b1;
     b2_bakup=b2;
     W1=W1-alpha1*dJdW1_mom;
     W2=W2-alpha2*(dJdW2+dJdW2_mom);
     b1=b1-alpha2*(dJdb1+dJdb1_mom);
     dJdW1_mom=dJdW1;
     dJdW2_mom=dJdW2;
     dJdb1_mom=dJdb1;
     J(iter)=norm(((nneu)^.5)*t_train-ye/nw,p)/namost;
     plot(log10(J(3:end)),'.');
     title(['training with MMGDX     J=',num2str(J(iter))])
     ylabel('log(J)')
     xlabel(['epoch=',num2str(iter),'/',num2str(iter_max)])
     pause(0.01)
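     %GDX-style adaptive step: while J decreases, the learning rates grow by
     %gain_up; otherwise the previous weights are restored, the momentum term
     %is amplified, and the rates shrink by gain_down.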
     if iter>1
        if J(iter)<=J(iter-1)
            alpha1=alpha1*gain_up;
            alpha2=alpha2*gain_up;
            cont=cont+1;
            if or(flag==0,cont>2)
            p=p_bk;
            flag=0;
            end       
        else
            dJdW1_mom=2*dJdW1_mom;
            dJdW2_mom=2*dJdW2_mom;
            dJdb1_mom=2*dJdb1_mom;
            W1=W1_bakup;
            W2=W2_bakup;
            b1=b1_bakup;
            b2=b2_bakup;
            alpha1=alpha1*gain_down;
            alpha2=alpha2*gain_down;
            flag=1;
            cont=0;
        end
    end
    if J(iter)<J_max
        W12=W1;
        W22=W2;
        b12=b1;
        b22=b2;
        J_max=J(iter);
     end       
     hold off
end
W1=W12;
W2=W22;
b1=b12;
b2=b22;
%************************************************************************************************    
%Adjusting the threshold:
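%The output bias b2 is chosen by a grid search over [-0.7, 0.7]: for each
%candidate, the error rate is measured on the 10 cross-validation sub-sets
%and the value minimizing sqrt(bias^2 + variance) is kept (E_max tracks the
%lowest value found so far).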
E_max=1e10;
for b2_test=-.7:0.01:.7
    acerto=zeros(1,10);
    for k=1:10
        [acerto(k),estimado]=sim_MMGDX(Nor,W1,W2,b1,b2_test,X_cv(:,:,k),y_cv(k,:));
    end
    err=1-acerto;
    bias2=mean(err)^2;
    variance=var(err);
    E_error=(bias2+variance)^.5;
    if E_error<E_max
        E_max=E_error;
        b2=b2_test;
        bias2_best=bias2;
        variance_best=variance;
    end
end
if and(variance_best>bias2_best/4,nneu>2)
    disp('ATTENTION: the generalization ability may improve by decreasing the number of hidden neurons, because the variance of the accuracy over the sub-sets is too big.')
    pause(5)
end
if variance_best<bias2_best/6.5
    disp('ATTENTION: it may be possible to increase the number of hidden neurons, because the variance of the accuracy over the sub-sets is small.')
    pause(5)
end
if sum(sum(W1))==0
    disp('ERROR: repeat the training. In case of repeated error, normalization of the data is required.')
    pause(5)
end
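
sim_MMGDX.m is not reproduced on this page. Purely to illustrate how the returned parameters fit together, here is a minimal sketch of a simulator consistent with the call made during threshold adjustment above; it is an assumption, not the original routine:

function [acc,estimated]=sim_MMGDX(Nor,W1,W2,b1,b2,F,t)
%Assumed sketch of a compatible simulator (not the original sim_MMGDX):
%applies the stored normalization and the trained MLP, then classifies by
%the sign of the output.
F=Nor*F;                                    %same per-feature scaling used in training
estimated=W2*logsig(W1*F+b1*ones(1,size(F,2)))+b2;
acc=mean(sign(estimated)==t);               %fraction of correct classifications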
