No BSD License  

Highlights from
ASNread

from ASNread by Ahmed Abdalla
Reads in genetic sequence data in ASN.1 format.

ASNread(file)
function [DNAdata_gbank, varargout] = ASNread(file)
%ASNread - Reads in ASN.1 format data files
%    DATA = ASNread(FILENAME) reads in the ASN.1 
%    formatted sequence entry file, from FILENAME.
%    DATA is a structure containing these fields:
%       LocusName 
%       LocusSequenceLength
%       LocusMoleculeType
%       LocusGenBankDivision
%       LocusModificationDate
%       Definition
%       Accession
%       Version
%       GI
%       Keywords
%       Segment
%       Source
%       SourceOrganism
%       Reference.Number
%       Reference.Authors
%       Reference.Title
%       Reference.Journal
%       Reference.MedLine
%       Reference.PubMed
%       Reference.Remark
%       Comment
%       Features 
%       BaseCount
%       Sequence
%    To view the raw data acquired from the ASN.1 file,
%    specify an extra output argument:
%       [DATA, RAW] = ASNread(FILENAME)
%
%    Errors are raised when FILENAME is not a character array, when more
%    than two outputs are requested, or when the file cannot be opened
%    (the message returned by fopen is used in that case).


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%ERROR CHECKING
% ischar is the supported replacement for the obsolete isstr.
if ~ischar(file), error('The input argument must be a string.'); end

% varargout makes any number of outputs syntactically legal, so the limit
% has to be enforced by hand.
if nargout > 2, error('Too many output arguments.'); end

% The file is opened only to verify that it is readable; the actual
% reading below is done by textread, which opens the file by name.
[fid, msg] = fopen(file,'r');

if fid < 0, error(msg); end

fclose(fid);


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%IMPORTING DATA FROM FILE
all_word = {};
current_structure = [];

% The file is read twice: once split into rows and once split into
% space-delimited words. The loop below re-associates the flat word list
% with the rows by counting the words expected on each row.
all_row = textread(file,'%s','delimiter','\n');
temp_word = textread(file,'%s','delimiter',' ');

%adding space to end of each word (the parser tests for '{ ' and ', ')
for a = 1:length(temp_word), temp_word{a}(end+1)= ' '; end

%creating cell array of each word in file
% Num_words is allocated up front so that it exists (as an empty vector)
% even when no rows were read; previously that case failed with an
% undefined-variable error at the CreateStructure call.
Num_words = zeros(1, length(all_row));
n = 0;
for i=1:length(all_row)
    t = isspace(all_row{i});           % 1 where the row has whitespace
    t2 = [0,t(1:end-1)];               % the same mask shifted right by one
    t3 = t+t2;                         % 2 where two whitespace chars are adjacent
    double_spaces = length(find(t3 == 2));
    % words = separators + 1, not counting the second of adjacent separators
    Num_words(i) = length(find(t))+1-double_spaces;
    [all_word{i,1:Num_words(i)}]=deal(temp_word{n+1:n+Num_words(i)});
    
    n = n + Num_words(i);
end
clear temp_word n;

[DNAdata_raw,DNAdata_gbank] = CreateStructure(all_row, all_word, Num_words', current_structure);

% The raw structure is returned only when a second output is requested.
if nargout == 2
    varargout{1} = DNAdata_raw;
end


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%CONFIGURING THE IMPORTED DATA
function [data_structure,gbank_struct] = CreateStructure(all_row, all_word, Num_word, data_structure)
%CreateStructure - Walk the tokenized rows and fill the raw and GenBank-style structures.
%   Inputs, as they are used below:
%     all_row        - cell array of row strings; each row is scanned for '{' and '}'
%     all_word       - cell array indexed as all_word{row, word}. The tests for
%                      '{ ' and ', ' below rely on every word carrying the
%                      trailing space that ASNread appends.
%     Num_word       - Num_word(row) is the number of words on that row
%     data_structure - raw structure to extend (ASNread passes [])
%   Outputs:
%     data_structure - raw structure with one field per dotted path (see MakeField)
%     gbank_struct   - GenBank-style structure updated through MapIt
%
%   The nesting position is tracked as a dotted path string (current_field):
%   go_up_field appends a level, go_down_field removes the last one.

%Defining GENBANK structure
% Every field starts empty; MapIt tests several of them with isempty to
% decide whether a value has already been stored.
gbank_struct.LocusName = [];
gbank_struct.LocusSequenceLength = [];
gbank_struct.LocusMoleculeType = [];
gbank_struct.LocusGenBankDivision = [];
gbank_struct.LocusModificationDate = [];
gbank_struct.Definition = [];
gbank_struct.Accession = [];
gbank_struct.Version = [];
gbank_struct.GI = [];
gbank_struct.Keywords = [];
gbank_struct.Segment = [];
gbank_struct.Source = [];
gbank_struct.SourceOrganism = [];
gbank_struct.Reference.Number = [];
gbank_struct.Reference.Authors = [];
gbank_struct.Reference.Title = [];
gbank_struct.Reference.Journal = [];
gbank_struct.Reference.MedLine = [];
gbank_struct.Reference.PubMed = [];
gbank_struct.Reference.Remark = [];
gbank_struct.Comment = [];
gbank_struct.Features  = [];
gbank_struct.BaseCount = [];
gbank_struct.Sequence = [];

L = length(all_row); %how many rows

% Parsing starts at row 2; row 1 is never examined.
% NOTE(review): presumably row 1 is the ASN.1 header line - confirm.
current_row_num = 2;
current_field = '';
% balance_up counts rows whose first word is a bare '{'; balance_down is
% decremented for every single-word row. Both are subtracted from the
% number of '}' found on a closing row to decide how many levels to pop,
% and both are reset to 0 after such a row.
balance_up = 0;
balance_down = 0;
while current_row_num <= L    

    current_row = all_row{current_row_num};
    down_field = findstr(current_row,'{');   % positions of '{' on this row
    up_field = findstr(current_row,'}');     % positions of '}' on this row
    
    if ~isempty(down_field) % if a { is found, update the current field ...
        
        % First word is a name (not '{ '): it becomes a new nesting level.
        if isempty(findstr(all_word{current_row_num, 1}, '{ '))
            current_field = go_up_field(current_field, all_word{current_row_num, 1});
        else %only a { is on the line
            balance_up = balance_up+1;
        end
        
        current_row_num = current_row_num+1; 
                
    else %if both { and } are NOT present or if only } is found, update current field and add data ...    
        
        if ~isempty(findstr(all_word{current_row_num, Num_word(current_row_num)}, ', ')) %if line ends in ',' 
        
            % Word count minus the number of '}' minus one (for the comma).
            if (Num_word(current_row_num) - length(up_field) - 1) ~= 1 %if there is more than one word on the line
            
                if ~isempty(findstr(all_word{current_row_num,1},'"')) %if first word starts with "
                    % The row is pure data: every word except the last is
                    % joined, double quotes are removed, and the result is
                    % stored at the current path without adding a level.
                    data = strrep([all_word{current_row_num,1:Num_word(current_row_num)-1}],'"','');
                    data_structure = MakeField(data_structure, current_field, data);
                    gbank_struct = MapIt(data_structure, current_field, gbank_struct);
                else %normal case with field name and data
                    % Word 1 names the field; words 2..end-1 are the value.
                    % The level is pushed, written, mapped and popped again.
                    current_field = go_up_field(current_field, all_word{current_row_num, 1});
                    data = strrep([all_word{current_row_num,2:Num_word(current_row_num)-1}],'"','');
                    data = strrep(data,'} ','');   % drop closing-brace words from the value
                    data_structure = MakeField(data_structure, current_field, data); 
                    gbank_struct = MapIt(data_structure, current_field, gbank_struct);
                    current_field = go_down_field(current_field);
                end
                
            else %there is only one word not including } or , on the line (ie: only data is on the line)
                
                % A single word is stored only when it contains a double
                % quote; otherwise the row contributes no data.
                if ~isempty(findstr(all_word{current_row_num,1},'"')) %if first word starts with "
                    data = strrep([all_word{current_row_num,1}],'"','');
                    data_structure = MakeField(data_structure, current_field, data);
                    gbank_struct = MapIt(data_structure, current_field, gbank_struct);
                end
                
            end
            
            if ~isempty(up_field) % if a } is found, update the field ...
                % Pop (number of '}') - balance_up - balance_down levels.
                % balance_down is <= 0, so it increases the count; a
                % non-positive total makes the loop run zero times.
                for g = 1:length(up_field)-balance_up-balance_down
                    current_field = go_down_field(current_field);
                end
                balance_up = 0;
                balance_down = 0;
            end
            
            current_row_num = current_row_num+1;
            
        elseif Num_word(current_row_num) == 1 %if there is only a single word on the line, update the field
            current_field = go_up_field(current_field, all_word{current_row_num, 1});
            balance_down = balance_down-1;
            
            current_row_num = current_row_num+1;
            
        else %if the last word is not a ',' then the data wraps to the next line or it is EOF
            
            % A '}' without a trailing ',' is treated as the end of the
            % data and stops the whole parse.
            if ~isempty(up_field), break %if } is present and no ',', then EOF is reached
            else
                % Multi-row value: start with word 2 of this row, then keep
                % appending words (word 3 onward on this row, all words on
                % the following rows) until the accumulated text holds more
                % than one single quote or more than one double quote.
                % NOTE(review): there is no check that data_row stays <= L;
                % an unterminated value would index past the last row.
                data_row = current_row_num;
                n=3; x = 1; data_temp = all_word{data_row,2};

                while x~=0
                    data_temp = [data_temp all_word{data_row,n:Num_word(data_row)}];
                    quotes = findstr(data_temp,'''');
                    quotes2 = findstr(data_temp,'"');
                    
                    if length(quotes) > 1, x=0; end  %checking for 2 quotes
                    if length(quotes2) > 1, x=0; end %checking for 2 double quotes
                    data_row = data_row+1;
                    n = 1;
                end
                
                % The last four characters are discarded before the quotes
                % are stripped.
                % NOTE(review): presumably this removes the closing
                % delimiter, comma and appended spaces - confirm.
                data = strrep(data_temp(1:end-4), '"','');
                data = strrep(data, '''','');               
                % Word 1 of the first row of the value names the field.
                current_field = go_up_field(current_field, all_word{current_row_num, 1});
                data_structure = MakeField(data_structure, current_field, data); 
                gbank_struct = MapIt(data_structure, current_field, gbank_struct);
                current_field = go_down_field(current_field);
                
                % Resume at the row after the last one consumed by the value.
                current_row_num = data_row;
            end
        end 
    end
end


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%CREATE THE FULL FIELD NAME
function new_field = go_up_field(current_field, field)
%go_up_field - Descend one level: append FIELD to the dotted path CURRENT_FIELD.
%   The pieces are joined as CURRENT_FIELD, '.', FIELD and the joined text
%   is then passed to strcat as a single argument. MATLAB documents that
%   strcat removes trailing whitespace from character inputs, which is
%   what drops the space ASNread appends to every word.

dotted_path = [current_field, '.', field];
new_field = strcat(dotted_path);


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%GO BACK ONE FIELD FROM THE FULL FIELD NAME
function new_field = go_down_field(current_field)
%go_down_field - Ascend one level: drop the last component of a dotted path.
%   NEW_FIELD = go_down_field(CURRENT_FIELD) returns CURRENT_FIELD up to,
%   but not including, its last '.'. For example '.seq.id' gives '.seq'
%   and '.seq' gives an empty string.
%
%   When CURRENT_FIELD contains no '.' at all (including the empty string)
%   the result is ''. The earlier version only guarded against an empty
%   path and raised an index error for a non-empty path without a dot.

dot = strfind(current_field, '.');
if isempty(dot)
    new_field = '';
else
    new_field = current_field(1:dot(end)-1);
end


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%USING THE FULL FIELD NAME TO CREATE FIELD IN STRUCTURE AND ADD DATA
function data_structure = MakeField(current_structure, current_field, data)
%MakeField - Store DATA in a structure at the position named by a dotted path.
%   DATA_STRUCTURE = MakeField(CURRENT_STRUCTURE, CURRENT_FIELD, DATA)
%   returns a copy of CURRENT_STRUCTURE in which the nested field named by
%   CURRENT_FIELD (for example '.seq.id.gi') holds DATA.
%     - '-' in the path is replaced by '_' to form valid field names.
%     - Single quotes are removed from DATA before it is stored.
%     - Nothing is stored when DATA is empty or the path names no field.
%
%   The assignment is done with subsasgn instead of building a statement
%   for eval. The earlier eval form executed whatever text the input file
%   supplied as a field name, and it passed DATA through sprintf as a
%   format string, so values containing '%' or '\' were altered.

data_structure = current_structure;

if isempty(data), return; end

field_path = strrep(current_field, '-', '_');

% Paths produced by go_up_field start with '.'; drop it so that splitting
% on '.' does not yield an empty first name.
if ~isempty(field_path)
    if field_path(1) == '.', field_path = field_path(2:end); end
end
if isempty(field_path), return; end

% Split the path at every '.' and build the ('.', name) pairs expected by
% substruct.
cuts = [0, strfind(field_path, '.'), length(field_path)+1];
n_levels = length(cuts) - 1;
subs = cell(1, 2*n_levels);
for k = 1:n_levels
    subs{2*k-1} = '.';
    subs{2*k} = field_path(cuts(k)+1:cuts(k+1)-1);
end

% Quotes are still stripped so stored values match what earlier versions
% produced (e.g. the quote delimiters around sequence data are removed).
value = strrep(data, '''', '');

data_structure = subsasgn(data_structure, substruct(subs{:}), value);


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%MAPPING RAW DATA STRUCTURE TO GENBANK STYLE STRUCTURE
function gbank_struct = MapIt(current_structure, current_field, gbank_struct)
%MapIt - Copy one raw value into the GenBank-style field it corresponds to.
%   GBANK_STRUCT = MapIt(CURRENT_STRUCTURE, CURRENT_FIELD, GBANK_STRUCT)
%   looks at the last components of the dotted path CURRENT_FIELD (four
%   when the path has more than three dots, three when it has exactly
%   three, two when it has exactly two), lower-cased and with '-' replaced
%   by '_'. When that tail matches one of the known ASN.1 locations, the
%   value stored at CURRENT_FIELD in CURRENT_STRUCTURE is copied to the
%   corresponding field of GBANK_STRUCT. Paths with fewer than two dots and
%   paths that match nothing leave GBANK_STRUCT unchanged.
%
%   Changes from the earlier version:
%     - The two date guards used `break` outside of any loop, which current
%       MATLAB rejects when the file is parsed; they now use `return`.
%     - Values are read with subsref instead of eval, so text taken from
%       the input file is never executed.
%   Matching still uses findstr (see path_has) to keep results identical.

dot = findstr(current_field, '.');
if length(dot) > 3
    tail_start = dot(end-3)+1;
elseif length(dot) > 2
    tail_start = dot(end-2)+1;
elseif length(dot) == 2
    tail_start = dot(end-1)+1;
else
    return
end
field = lower(strrep(current_field(tail_start:end),'-','_'));

    %SEQUENCE
if path_has(field,'inst.seq_data.ncbi2na') | path_has(field,'inst.seq_data.iupacna')
    % Only the first sequence encountered is kept; spaces become newlines.
    if isempty(gbank_struct.Sequence)
        gbank_struct.Sequence = strrep(raw_value(current_structure, current_field),' ',sprintf('\n'));
    end
    
    %LOCUS GENBANK DIVISION
elseif path_has(field,'org.orgname.div') | path_has(field,'genbank.div')
    % A genbank.div value always overwrites; an org.orgname.div value is
    % used only while nothing has been stored yet.
    if path_has(field,'genbank.div')
        gbank_struct.LocusGenBankDivision = raw_value(current_structure, current_field);
    elseif isempty(gbank_struct.LocusGenBankDivision)
        gbank_struct.LocusGenBankDivision = raw_value(current_structure, current_field);
    end
    
    %LOCUS NAME
elseif path_has(field,'id.genbank.name')
    gbank_struct.LocusName = raw_value(current_structure, current_field);
    
    %LOCUS MOLECULE TYPE
elseif path_has(field,'molinfo.biomol')
    if isempty(gbank_struct.LocusMoleculeType)
        gbank_struct.LocusMoleculeType = raw_value(current_structure, current_field);
    end
    
    %LOCUS SEQUENCE LENGTH
elseif path_has(field,'inst.length')
    if isempty(gbank_struct.LocusSequenceLength)
        gbank_struct.LocusSequenceLength = raw_value(current_structure, current_field);
    end
    
    %ACCESSION
elseif path_has(field,'id.genbank.accession')
    if isempty(gbank_struct.Accession)
        gbank_struct.Accession = raw_value(current_structure, current_field);
    end
    
    %VERSION
elseif path_has(field,'id.genbank.version')
    gbank_struct.Version = raw_value(current_structure, current_field);
    
    %GI
elseif path_has(field,'seq.id.gi')
    if isempty(gbank_struct.GI)
        gbank_struct.GI = raw_value(current_structure, current_field);
    end
    
    %DEFINITION
elseif path_has(field,'descr.title')
    gbank_struct.Definition = raw_value(current_structure, current_field);
    
    %SOURCE
elseif path_has(field,'source.org.common')
    gbank_struct.Source = raw_value(current_structure, current_field);
    
    %SOURCE ORGANISM
    % Cell 1 holds the taxonomic name, cell 2 the lineage.
elseif path_has(field,'source.org.taxname')
    gbank_struct.SourceOrganism{1} = raw_value(current_structure, current_field);
elseif path_has(field,'org.orgname.lineage')
    gbank_struct.SourceOrganism{2} = raw_value(current_structure, current_field);
    
    %COMMENT
elseif path_has(field,'descr.comment')
    gbank_struct.Comment = raw_value(current_structure, current_field);
    
    %KEYWORDS
elseif path_has(field,'keywords')
    gbank_struct.Keywords = raw_value(current_structure, current_field);
    
    %DATE
    % The year is stored first (only while the date is empty); month and
    % day are each prepended, followed by '- '. Once the text has reached
    % 12 characters nothing more is prepended.
elseif path_has(field,'update_date.std.year')
    if isempty(gbank_struct.LocusModificationDate)
        gbank_struct.LocusModificationDate = raw_value(current_structure, current_field);
    end
elseif path_has(field,'update_date.std.month')
    if length(gbank_struct.LocusModificationDate) >= 12, return, end
    gbank_struct.LocusModificationDate = [raw_value(current_structure, current_field) '- '...
            gbank_struct.LocusModificationDate];
elseif path_has(field,'update_date.std.day')
    if length(gbank_struct.LocusModificationDate) >= 12, return, end
    gbank_struct.LocusModificationDate = [raw_value(current_structure, current_field) '- '...
            gbank_struct.LocusModificationDate];
        
    %REFERENCES
    % The Reference.* fields are not mapped here.

end


%TRUE WHEN THE PATH TAIL AND THE PATTERN OVERLAP
function tf = path_has(field, pattern)
% findstr looks for the shorter argument inside the longer one, so a tail
% that is shorter than PATTERN also matches when it occurs inside PATTERN.
% That symmetric behaviour is kept on purpose so results do not change.
tf = ~isempty(findstr(field, pattern));


%READ THE VALUE STORED AT A DOTTED PATH IN THE RAW STRUCTURE
function value = raw_value(raw_struct, field_path)
% Applies the same '-' to '_' replacement that is used when the raw
% structure is written, then walks the path one field name at a time.
names = strrep(field_path, '-', '_');
if ~isempty(names)
    if names(1) == '.', names = names(2:end); end
end
cuts = [0, findstr(names, '.'), length(names)+1];
n_levels = length(cuts) - 1;
subs = cell(1, 2*n_levels);
for k = 1:n_levels
    subs{2*k-1} = '.';
    subs{2*k} = names(cuts(k)+1:cuts(k+1)-1);
end
value = subsref(raw_struct, substruct(subs{:}));

Contact us at files@mathworks.com