Ancestral polytree

by

 

The code here provides the algorithm of learning ancestral polytree.

ExtractSequenceOut( AADir,InformationType )
function [ OrigSeqPop,SequenceTitle,SequenceID ] = ExtractSequenceOut( AADir,InformationType )
%% read sequences from fasta file
% if InformationType == 0, default, compute all
% if InformationType == 1, only SequenceTitle and SequenceID

if nargin == 1, InformationType = 0; end
if InformationType == 1
   SeqNum = 10000;
   SequenceTitle = cell(1,SeqNum);
   AAfid = fopen( AADir,'r' );
   Line = fgetl( AAfid ); SequenceTitle{1} = Line;
   Index = 1;  
   while feof( AAfid ) == 0
       Line = fgetl( AAfid );
       if isempty(Line) == 0 && Line(1) == '>'
          Index = Index + 1;
          if mod(Index,10000)==0
              Index
          end
          SequenceTitle{ Index } = Line;   
       end
   end
   SequenceTitle = SequenceTitle(1:Index);
   SequenceID = cell( 1,Index );
   parfor p = 1:Index
       Pos = find( SequenceTitle{p}=='/',1 );
       if isempty(Pos)==0
          SequenceID{ p } = SequenceTitle{p}(2:Pos-1);
       else
          Pos = find( SequenceTitle{p}==' ',1 );
          if isempty(Pos)== 1
             TempSeq = SequenceTitle{p};
             SequenceID{ p } = TempSeq( 2:length(TempSeq) );
          else
             SequenceID{ p } = SequenceTitle{p}(2:Pos-1);
          end
       end
   end   
   OrigSeqPop = [];
   return;
end


%% first, estimate the number of sequences.
AAfid = fopen( AADir,'r' ); SeqNum = 0; 
while feof( AAfid ) == 0
    Line = fgetl( AAfid );
    if isempty(Line) == 0 && Line(1) == '>'
        SeqNum = SeqNum + 1;
        if mod(SeqNum,1000)==0
           % SeqNum
        end
    end
end

AAfid = fopen( AADir,'r' ); 
NewSeqLen = 0; MaxLen=0; Index = 0;
while feof( AAfid ) == 0
    Line = fgetl( AAfid );
    if isempty(Line) == 0 && Line(1) == '>'
        if MaxLen < NewSeqLen,  MaxLen = NewSeqLen;  end
        NewSeqLen = 0; Index = Index + 1;
        if Index > 10000,  break; end
    else
        NewSeqLen = NewSeqLen + length(Line);
    end
end
if MaxLen < NewSeqLen, MaxLen = NewSeqLen; end

%% second, store all sequences back into matrix.
OrigSeqPop = repmat('-',SeqNum,MaxLen);
SequenceTitle = cell(1,SeqNum);
AAfid = fopen( AADir,'r' );
Line = fgetl( AAfid ); SequenceTitle{1} = Line;
Index = 1; NewSeq = repmat( '-',1,MaxLen );
NewSeqIndex = 0; MaxLen = 0;
while feof( AAfid ) == 0
    Line = fgetl( AAfid );
    if isempty(Line) == 0
        if Line(1) == '>'
           NewSeqLen = length( NewSeq );
           if NewSeqLen > 0
              OrigSeqPop( Index,1:NewSeqLen ) = NewSeq;
           end
           Index = Index + 1;
           
           %% report the current sequence 
           if mod(Index,1000)==0
             % Index
           end
           SequenceTitle{ Index } = Line;
           NewSeq = repmat( '-',1,MaxLen );
           NewSeqIndex = 0;
        else
           Line = Line( setdiff(1:length(Line), find(Line==' ')) );
           NewSeq( NewSeqIndex+1:NewSeqIndex+length(Line) )  = Line;
           NewSeqIndex = NewSeqIndex+length(Line);
        end
    end
end

%% the last sequence
NewSeqLen = length( NewSeq ) ;
if NewSeqLen > 0
   OrigSeqPop( Index,1:NewSeqLen ) = NewSeq;
end


%% remove gaps at the end of all columns.
if 0
Col = size(OrigSeqPop,2);
SelectCol = zeros(1,Col);
parfor p = 1:Col
    if isempty( find(OrigSeqPop(:,p)~='-',1) ) == 0
       SelectCol(p) = 1;
    end
end

OrigSeqPop = OrigSeqPop( :,find(SelectCol>0) );
end
% L = length( find(SelectCol>0))


%% change it into upper case sequences.
if OrigSeqPop( 1,find( OrigSeqPop(1,:)~='-',1 ) ) > 96
   OrigSeqPop = upper( OrigSeqPop );
end

%% if the id is required from output.
if nargout > 2
   SequenceID = cell( 1,SeqNum );
   for p = 1:SeqNum
       Pos = find( SequenceTitle{p}=='/',1 );
       if isempty(Pos)==0
          SequenceID{ p } = SequenceTitle{p}(2:Pos-1);
       else
          Pos = find( SequenceTitle{p}==' ',1 );
          if isempty(Pos)== 1
             TempSeq = SequenceTitle{p};
             SequenceID{ p } = TempSeq(2:length(TempSeq));
          else
             SequenceID{ p } = SequenceTitle{p}(2:Pos-1);
          end
       end
   end
end
fclose(AAfid);
end

Contact us