function [ OrigSeqPop,SequenceTitle,SequenceID ] = ExtractSequenceOut( AADir,InformationType )
%% read sequences from fasta file
% if InformationType == 0, default, compute all
% if InformationType == 1, only SequenceTitle and SequenceID
if nargin == 1, InformationType = 0; end
if InformationType == 1
SeqNum = 10000;
SequenceTitle = cell(1,SeqNum);
AAfid = fopen( AADir,'r' );
Line = fgetl( AAfid ); SequenceTitle{1} = Line;
Index = 1;
while feof( AAfid ) == 0
Line = fgetl( AAfid );
if isempty(Line) == 0 && Line(1) == '>'
Index = Index + 1;
if mod(Index,10000)==0
Index
end
SequenceTitle{ Index } = Line;
end
end
SequenceTitle = SequenceTitle(1:Index);
SequenceID = cell( 1,Index );
parfor p = 1:Index
Pos = find( SequenceTitle{p}=='/',1 );
if isempty(Pos)==0
SequenceID{ p } = SequenceTitle{p}(2:Pos-1);
else
Pos = find( SequenceTitle{p}==' ',1 );
if isempty(Pos)== 1
TempSeq = SequenceTitle{p};
SequenceID{ p } = TempSeq( 2:length(TempSeq) );
else
SequenceID{ p } = SequenceTitle{p}(2:Pos-1);
end
end
end
OrigSeqPop = [];
return;
end
%% first, estimate the number of sequences.
AAfid = fopen( AADir,'r' ); SeqNum = 0;
while feof( AAfid ) == 0
Line = fgetl( AAfid );
if isempty(Line) == 0 && Line(1) == '>'
SeqNum = SeqNum + 1;
if mod(SeqNum,1000)==0
% SeqNum
end
end
end
AAfid = fopen( AADir,'r' );
NewSeqLen = 0; MaxLen=0; Index = 0;
while feof( AAfid ) == 0
Line = fgetl( AAfid );
if isempty(Line) == 0 && Line(1) == '>'
if MaxLen < NewSeqLen, MaxLen = NewSeqLen; end
NewSeqLen = 0; Index = Index + 1;
if Index > 10000, break; end
else
NewSeqLen = NewSeqLen + length(Line);
end
end
if MaxLen < NewSeqLen, MaxLen = NewSeqLen; end
%% second, store all sequences back into matrix.
OrigSeqPop = repmat('-',SeqNum,MaxLen);
SequenceTitle = cell(1,SeqNum);
AAfid = fopen( AADir,'r' );
Line = fgetl( AAfid ); SequenceTitle{1} = Line;
Index = 1; NewSeq = repmat( '-',1,MaxLen );
NewSeqIndex = 0; MaxLen = 0;
while feof( AAfid ) == 0
Line = fgetl( AAfid );
if isempty(Line) == 0
if Line(1) == '>'
NewSeqLen = length( NewSeq );
if NewSeqLen > 0
OrigSeqPop( Index,1:NewSeqLen ) = NewSeq;
end
Index = Index + 1;
%% report the current sequence
if mod(Index,1000)==0
% Index
end
SequenceTitle{ Index } = Line;
NewSeq = repmat( '-',1,MaxLen );
NewSeqIndex = 0;
else
Line = Line( setdiff(1:length(Line), find(Line==' ')) );
NewSeq( NewSeqIndex+1:NewSeqIndex+length(Line) ) = Line;
NewSeqIndex = NewSeqIndex+length(Line);
end
end
end
%% the last sequence
NewSeqLen = length( NewSeq ) ;
if NewSeqLen > 0
OrigSeqPop( Index,1:NewSeqLen ) = NewSeq;
end
%% remove gaps at the end of all columns.
if 0
Col = size(OrigSeqPop,2);
SelectCol = zeros(1,Col);
parfor p = 1:Col
if isempty( find(OrigSeqPop(:,p)~='-',1) ) == 0
SelectCol(p) = 1;
end
end
OrigSeqPop = OrigSeqPop( :,find(SelectCol>0) );
end
% L = length( find(SelectCol>0))
%% change it into upper case sequences.
if OrigSeqPop( 1,find( OrigSeqPop(1,:)~='-',1 ) ) > 96
OrigSeqPop = upper( OrigSeqPop );
end
%% if the id is required from output.
if nargout > 2
SequenceID = cell( 1,SeqNum );
for p = 1:SeqNum
Pos = find( SequenceTitle{p}=='/',1 );
if isempty(Pos)==0
SequenceID{ p } = SequenceTitle{p}(2:Pos-1);
else
Pos = find( SequenceTitle{p}==' ',1 );
if isempty(Pos)== 1
TempSeq = SequenceTitle{p};
SequenceID{ p } = TempSeq(2:length(TempSeq));
else
SequenceID{ p } = SequenceTitle{p}(2:Pos-1);
end
end
end
end
fclose(AAfid);
end