No BSD License  

EMBLRead

by

 

12 Aug 2007 (Updated )

A modified version of EMBLRead for Reading a EMBL file into a Matlab structure much faster

data=emblread(embltext,varargin)
function data=emblread(embltext,varargin)
%EMBLREAD read in EMBL-EBI format data files.
%Reversed by SONG, Xindong Dec. 18 2006
%
%   DATA = EMBLREADS(FILENAME) imports the EMBL-EBI format data from file
%   FILENAME and creates a structure DATA with fields corresponding to the
%   EMBL-EBI two-character line type code. Each line type code is stored as
%   a separate element of the structure.
%
%   FILENAME can also be a URL or a MATLAB character array that contains
%   the text of an EMBL format file.
%
%    DATA contains the following fields:
%       Identification.EntryName
%       Identification.Version
%       Identification.Topology
%       Identification.Molecule
%       Identification.DataClass
%       Identification.Division
%       Identification.SequenceLength
%       Accession
%       SequenceVersion
%       DateCreated
%       DateUpdated
%       Description
%       Keyword
%       OrganismSpecies
%       OrganismClassification
%       Organelle
%       Reference.Number
%       Reference.Comment
%       Reference.Position
%       Reference.MedLine
%       Reference.PubMed
%       Reference.Authors
%       Reference.Title
%       Reference.Location
%       DbCrossRef
%       Comments
%       Feature
%       BaseCount.BP
%       BaseCount.A
%       BaseCount.C
%       BaseCount.G
%       BaseCount.T
%       BaseCount.Other
%       Sequence
%
%     DATA = EMBLREADS('FILENAME.EXT','SEQUENCEONLY',true) reads only the
%     sequence information.
%
%   Remarks: 
%   [1] Topology information was not included in EMBL flat files before
%   release 87 of the database. When reading a file created before release 
%   87, EMBLREAD returns an empty Identification.Topology field.
%   [2] The entry name is no longer displayed in the ID line of the EMBL
%   flat files in release 87. When reading a file created in release 87, 
%   EMBLREAD returns the accession number in the Identification.EntryName 
%   field. 
%
%   Examples:
%
%       % Get EMBL data and save it to a file.
%       emblout = getembl('X00558','TOFILE','X00558.ebi')
%
%       % In subsequent MATLAB sessions you can use emblread to access the
%       % local copy from disk instead of accessing it from the EMBL site.
%       data = emblreads('X00558.ebi')
%
%   See also FASTAREAD, GENBANKREAD, GETEMBL, SEQTOOL.

% Copyright 2002-2005 The MathWorks, Inc.
% $Revision: 1.14.4.7.12.1 $   $Date: 2006/08/03 00:56:13 $
tic;% $Revision: by SONG, Xindong  $Date: 2006/12/18 21:40:00 $
% set defaults
SeqOnly = false;
getPreambleText = false;
% check optional input arguments
if nargin > 1
    if rem(nargin,2)== 0
        error('Bioinfo:emblread:NumberOfArgumentsIncorrect','Incorrect number of arguments to %s.',mfilename);
    end
    okargs = {'sequenceonly','preambletext'};
    for j=1:2:nargin-2
        pname = varargin{j};
        pval = varargin{j+1};
        k = strmatch(lower(pname), okargs);%#ok
        if isempty(k)
            error('Bioinfo:emblread:UnknownParameterName',...
                'Unknown parameter name: %s.',pname);
        elseif length(k)>1
            error('Bioinfo:emblread:AmbiguousParameterName',...
                'Ambiguous parameter name: %s.',pname);
        else
            switch(k)
                case 1 % sequence
                    SeqOnly = opttf(pval);
                    if isempty(SeqOnly)
                        error('Bioinfo:emblread:InputOptionNotLogical','%s must be a logical value, true or false.',...
                            upper(char(okargs(k))));
                    end
                case 2  % 'preambletext'
                    getPreambleText = opttf(pval);
            end
        end
    end
end

if ~ischar(embltext) && ~iscellstr(embltext)
    error('Bioinfo:emblread:InvalidInput','The input is not an array of characters or a cell array of strings.');
end

% guess what type of input we have -- string, file or URL
if iscellstr(embltext)
    % do not mess with it, just put it to char and try to decipher in the try-catch trap
    embltext=char(embltext);
elseif size(embltext,1)==1 %it is char, lets check if it has an url or a file before try to decipher it
    if  ~isempty(strfind(embltext(1:min(10,end)), '://'))
        % must be a URL
        if (~usejava('jvm'))
            error('Bioinfo:emblread:NoJava','Reading from a URL requires Java.')
        end
        try
            embltext = urlread(embltext);
        catch
            error('Bioinfo:emblread:CannotReadURL','Cannot read URL "%s".',embltext);
        end
        % clean up any &amp s

        embltext = strrep(embltext,'&','&');
        % make each line a separate row in a string array
        embltext = char(strread(embltext,'%s','delimiter','\n','whitespace',''));
    elseif (exist(embltext,'file') || exist(fullfile(cd,embltext),'file'))
        % the file exists
        embltext = char(textread(embltext,'%s','delimiter','\n','whitespace',''));
    else
        embltext = char(strread(embltext,'%s','delimiter','\n','whitespace',''));
    end
end

% If the input is a string of EMBL data then words ID must be present
if size(embltext,1)==1 || isempty(strfind(embltext(1,:),'ID'))
    error('Bioinfo:emblread:NonMinimumRequiredFields','FILE is not a valid EMBL file or url.')
end

if ~ischar(embltext)
    error('Bioinfo:emblread:InvalidInput','The input should be a filename or an EMBL format text string.');
end

%toc;'read and input analysis',tic;

refcount=0;
[embltext,commentsize]=blanklines(embltext);

%toc;'pick up blank lines and count comment lines',tic

%line number
ln=1;
recordcount=1;

while 1
    %CC-free text Comment
    data(recordcount).Comments=[]; %#ok
    j=1;
    while 1
        if j <= commentsize && matchstart(embltext(j,:),'CC')
            data(recordcount).Comments=strvcat(data(recordcount).Comments,deblank(embltext(j,6:end))); %#ok
            embltext(j,:)='';
        else
            j=j+1;
            if j>commentsize || matchstart(embltext(j,:),'//')
                break
            end
        end
    end

%    toc;'pick up CC',tic
    
    %ID-Identification
    idLine = textscan(strtrim(embltext(ln,6:end)),'%s','delimiter',';');
    % check if ID line is formatted as EMBL Release 87
    if numel(idLine{1})==7
        formatIsR87 = true;
        data(recordcount).Identification.EntryName = idLine{1}{1};
        data(recordcount).Identification.Version = strrep(idLine{1}{2},'SV ','');
        data(recordcount).Identification.Topology = idLine{1}{3};
        data(recordcount).Identification.Molecule = idLine{1}{4};
        data(recordcount).Identification.DataClass = idLine{1}{5};
        data(recordcount).Identification.Division = idLine{1}{6};
        data(recordcount).Identification.SequenceLength = strrep(idLine{1}{7},'.','');       
    elseif numel(idLine{1})==4
    % then we try read the flat format before Release 87
        formatIsR87 = false;
        [tmp1,tmp2] = strread(idLine{1}{1},'%s%s');
        data(recordcount).Identification.EntryName = tmp1{1};
        data(recordcount).Identification.Version = ''; % fill it later
        data(recordcount).Identification.Topology = ''; % info not availbale before Release 87
        data(recordcount).Identification.Molecule = idLine{1}{2};
        data(recordcount).Identification.DataClass = tmp2{1};
        data(recordcount).Identification.Division = idLine{1}{3};
        data(recordcount).Identification.SequenceLength = strrep(idLine{1}{4},'.','');       
    else
        error('Bioinfo:emblread:CannotInterpretIDline','Cannot the interpret ID line in the EMBL data.');
    end
    ln=ln+1;

    %AC-Accession number
    data(recordcount).Accession=deblank(embltext(ln,6:end));
    data(recordcount).Accession = strrep(data(recordcount).Accession,';','');
    ln=ln+1;

    %SV-Sequence Version
    if formatIsR87 == true;
        data(recordcount).SequenceVersion = ...
            [ data(recordcount).Identification.EntryName '.' ...
              data(recordcount).Identification.Version ];
    else
        tmp = deblank(embltext(ln,6:end));
        data(recordcount).SequenceVersion = tmp;
        ln=ln+1;
        data(recordcount).Identification.Version = ...
            cell2mat(regexp(tmp,'\w*.(\w*)','tokens','once'));
    end;

    %DT-Date
    data(recordcount).DateCreated=deblank(embltext(ln,6:end));
    ln=ln+1;
    data(recordcount).DateUpdated=deblank(embltext(ln,6:end));
    ln=ln+1;

    %DE-Description
    data(recordcount).Description=[];
    while matchstart(embltext(ln,:),'DE')
        data(recordcount).Description=strvcat(data(recordcount).Description,deblank(embltext(ln,6:end))); %#ok
        ln=ln+1;
    end

    %KW-Keyword
    data(recordcount).Keyword=[];
    while matchstart(embltext(ln,:),'KW')
        data(recordcount).Keyword=strvcat(data(recordcount).Keyword,deblank(embltext(ln,6:end))); %#ok
        ln=ln+1;
    end

    %OS-Organism Species
    data(recordcount).OrganismSpecies=[];
    while matchstart(embltext(ln,:),'OS')
        data(recordcount).OrganismSpecies=strvcat(data(recordcount).OrganismSpecies,deblank(embltext(ln,6:end))); %#ok
        ln=ln+1;
    end

    %OC-Organism Classification
    data(recordcount).OrganismClassification=[];
    while matchstart(embltext(ln,:),'OC')
        data(recordcount).OrganismClassification=strvcat(data(recordcount).OrganismClassification,deblank(embltext(ln,6:end))); %#ok
        ln=ln+1;
    end

    %OG-Organelle
    data(recordcount).Organelle=[];
    while matchstart(embltext(ln,:),'OG')
        data(recordcount).Organelle=strvcat(data(recordcount).Organelle,deblank(embltext(ln,6:end))); %#ok
        ln=ln+1;
    end

    %Reference
    while matchstart(embltext(ln,:),'RN') %ref name
        refcount=refcount+1;

        %RN-Reference name
        data(recordcount).Reference{refcount}.Number=deblank(embltext(ln,6:end));
        ln=ln+1;

        %RC-Reference Comment
        data(recordcount).Reference{refcount}.Comment=[];
        while matchstart(embltext(ln,:),'RC')
            data(recordcount).Reference{refcount}.Comment=strvcat(data(recordcount).Reference{refcount}.Comment,deblank(embltext(ln,6:end))); %#ok
            ln=ln+1;
        end

        %RP-Reference Position
        data(recordcount).Reference{refcount}.Position=[];
        while matchstart(embltext(ln,:),'RP')
            data(recordcount).Reference{refcount}.Position=strvcat(data(recordcount).Reference{refcount}.Position,deblank(embltext(ln,6:end))); %#ok
            ln=ln+1;
        end

        %RX-Reference Database identifier
        data(recordcount).Reference{refcount}.MedLine = '';
        data(recordcount).Reference{refcount}.PubMed = '';
        databaseText = '';
        while matchstart(embltext(ln,:),'RX')
            databaseText=strvcat(databaseText,lower(deblank(embltext(ln,6:end)))); %#ok
            ln=ln+1;
        end
        medlineRow = strmatch('medline',databaseText);%#ok
        if medlineRow
            [junk, medlineText] = strtok(databaseText(medlineRow,:)); %#ok
            medlineText(medlineText == '.') = '';
            medlineText(medlineText == ' ') = '';
            data(recordcount).Reference{refcount}.MedLine = medlineText;
        end

        pubmedRow = strmatch('pubmed',databaseText);%#ok
        if pubmedRow
            [junk, pubmedText] = strtok(databaseText(pubmedRow,:)); %#ok
            pubmedText(pubmedText == '.') = '';
            pubmedText(pubmedText == ' ') = '';
            data(recordcount).Reference{refcount}.PubMed = pubmedText;
        end


%%%%%   %RG-Reference Authors
%%%%%   added by SONG for BC001359 etc.
        data(recordcount).Reference{refcount}.Group=[];
        while matchstart(embltext(ln,:),'RG')
            data(recordcount).Reference{refcount}.Group=strcat(data(recordcount).Reference{refcount}.Group,deblank(embltext(ln,6:end))); %#ok
            ln=ln+1;
        end
        
        %RA-Reference Authors
        data(recordcount).Reference{refcount}.Authors=[];
        while matchstart(embltext(ln,:),'RA')
            data(recordcount).Reference{refcount}.Authors=strvcat(data(recordcount).Reference{refcount}.Authors,deblank(embltext(ln,6:end))); %#ok
            ln=ln+1;
        end

        %RT-Reference Title
        data(recordcount).Reference{refcount}.Title=[];
        while matchstart(embltext(ln,:),'RT')
            data(recordcount).Reference{refcount}.Title=strvcat(data(recordcount).Reference{refcount}.Title,deblank(embltext(ln,6:end))); %#ok
            ln=ln+1;
        end

        %RL-Reference Location
        data(recordcount).Reference{refcount}.Location=[];
        while matchstart(embltext(ln,:),'RL')
            data(recordcount).Reference{refcount}.Location=strvcat(data(recordcount).Reference{refcount}.Location,deblank(embltext(ln,6:end))); %#ok
            ln=ln+1;
        end

    end

    %DR-Database Cross Reference
    data(recordcount).DatabaseCrossReference=[];
    while matchstart(embltext(ln,:),'DR')
        data(recordcount).DatabaseCrossReference=strvcat(data(recordcount).DatabaseCrossReference,deblank(embltext(ln,6:end))); %#ok
        ln=ln+1;
    end
    
%%% %AH-Assembly Header
    data(recordcount).Assembly=[];
    while matchstart(embltext(ln,:),'AH')
        data(recordcount).Assembly=strvcat(data(recordcount).Assembly,deblank(embltext(ln,6:end))); %#ok
        ln=ln+1;
    end

%%% %AS-ASsembly Information
    while matchstart(embltext(ln,:),'AS')
        data(recordcount).Assembly=strvcat(data(recordcount).Assembly,deblank(embltext(ln,6:end))); %#ok
        ln=ln+1;
    end

%%% %CO-Constructed of Contig
    data(recordcount).Constructed=[];
    while matchstart(embltext(ln,:),'CO')
        data(recordcount).Constructed=strvcat(data(recordcount).Constructed,deblank(embltext(ln,6:end))); %#ok
        ln=ln+1;
    end

    lnPreambleText = ln;

    %FH Features header
    data(recordcount).Feature=[];
    while matchstart(embltext(ln,:),'FH')
        data(recordcount).Feature=strvcat(data(recordcount).Feature,deblank(embltext(ln,6:end))); %#ok
        ln=ln+1;
    end

%%% %FT Features text
%%% revised by SONG for efficiency
    lnftbegin = ln;
    while matchstart(embltext(ln,:),'FT')
        ln=ln+1;
    end
    lnftend = ln-1;
    fttext = embltext(lnftbegin:lnftend,6:end);
    data(recordcount).Feature=strvcat(data(recordcount).Feature,fttext);
    oth = false;
%%% revised end
    
    %SQ sequence header
    while matchstart(embltext(ln,:),'SQ')
        temp=embltext(ln,14:end);
        [numbp,temp]=strtok(temp,'BP'); temp= temp(3:end);%#ok
        [numa,temp]=strtok(temp,'A'); temp = temp(3:end);%#ok
        [numc,temp]=strtok(temp,'C'); temp = temp(3:end);%#ok
        [numg,temp]=strtok(temp,'G'); temp = temp(3:end);%#ok
        [numt,temp]=strtok(temp,'T');%#ok
        if strfind(temp,'other'),
            temp = embltext(ln,2:end);
            numo = strtok(temp,'o');
            oth = true;
        end

        data(recordcount).BaseCount.BP= str2double(numbp);
        data(recordcount).BaseCount.A = str2double(numa);
        data(recordcount).BaseCount.C = str2double(numc);
        data(recordcount).BaseCount.G = str2double(numg);
        data(recordcount).BaseCount.T = str2double(numt);
        data(recordcount).BaseCount.Other = [];
        if oth
            data(recordcount).BaseCount.Other = str2double(numo);
        end

        ln=ln+1;
    end
%toc;'before sq',tic;
    %Read in sequence until end of entry, //
    data(recordcount).Sequence=[];    
    lnsqbegin = ln;
    while ~matchstart(embltext(ln,:),'//') && ~matchstart(embltext(ln,:),'ID')
        ln=ln+1;
    end
    lnsqend = ln-1;
    data(recordcount).Sequence = [  embltext(lnsqbegin:lnsqend,6:15),...
                                    embltext(lnsqbegin:lnsqend,17:26),...
                                    embltext(lnsqbegin:lnsqend,28:37),...
                                    embltext(lnsqbegin:lnsqend,39:48),...
                                    embltext(lnsqbegin:lnsqend,50:59),...
                                    embltext(lnsqbegin:lnsqend,61:70)];
    data(recordcount).Sequence = reshape(data(recordcount).Sequence',1,[]);
    data(recordcount).Sequence = strrep(char(data(recordcount).Sequence),' ','');

    if getPreambleText
        data(recordcount).PreambleText = embltext(1:lnPreambleText-1,:);
    end
%toc;'sq',tic
    while  (ln < size(embltext,1)) && ~matchstart(embltext(ln,:),'ID')
        ln=ln+1;
    end

    if matchstart(embltext(ln,:),'ID') %see if line is part of next record
        recordcount = recordcount+1;
        continue
    else
        if SeqOnly == true
            if recordcount==1
                data=data(recordcount).Sequence;
            else
                data={data.Sequence};
            end
        end
        toc;'all'
        return
    end
end

%-------------------------------------------------------------------------%
function [out,commentsize]=blanklines(in)
m = size(in,1);
numblanks=0;
numcomments=0;
%find Number of blank lines
for i=1:m
    if matchstart(in(i,:),'XX')
        numblanks=numblanks+1;
    end
    if matchstart(in(i,:),'CC')
        numcomments=numcomments+1;
    end
end

%remove blank lines
for i=1:m-numblanks
    if matchstart(in(i,:),'XX')
        in(i,:)='';
    end
end
out=in;
commentsize=m-numblanks-numcomments;

%-------------------------------------------------------------------------%
function tf = matchstart(string,pattern)
%MATCHES start of string with pattern

tf = ~isempty(regexp(string,['^',pattern],'once'));

Contact us