Code covered by the BSD License  

Highlights from
Primer Design GUI

image thumbnail

Primer Design GUI

by

 

16 Jun 2011 (Updated )

A GUI for designing primers

gbout=getncbidata(accessnum,varargin)
function gbout=getncbidata(accessnum,varargin)
% GETNCBIDATA retrieves sequence information from the NCBI databases.
%   GBOUT = GETNCBIDATA(ACCESSNUM)  searches for the Accession number in
%   the GenBank or GenPept database, and returns a structure containing
%   information for the sequence.
%
%   GBOUT = GETNCBIDATA(...,'DATABASE',DB) will search in the specified
%   database, DB, for data.  The accepted values are: 'nucleotide' or 'protein'.
%
%   GBOUT = GETNCBIDATA(...,'TOFILE',FILENAME) saves the data returned from
%   the database in the file FILENAME.
%
%   GBOUT = GETNCBIDATA(...,'FILEFORMAT',FORMAT) retrieves the data in the
%   specified format.  The accepted values are 'GenBank','GenPept',and 'FASTA'.
%
%   GBOUT = GETNCBIDATA(...,'SEQUENCEONLY',TF) returns just the sequence as
%   a character array.
%
%   Example:
%
%       nt = getncbidata('M10051','database','nucleotide')
%
%       aa = getncbidata('AAA59174','database','protein')
%
%   This retrieves the sequence from chromosome 19 that codes for the
%   Human insulin receptor and stores it in structure S.
%
%   Please see: http://www.ncbi.nlm.nih.gov/About/disclaimer.html
%
%   See also GENBANKREAD, GENPEPTREAD, GETGENBANK, GETGENPEPT.

% Copyright 2002-2006 The MathWorks, Inc.
% $Revision: 1.15.4.7.4.11 $   $Date: 2006/06/07 12:30:55 $

if ~usejava('jvm')
    error('Bioinfo:getncbidata:NeedJVM','%s requires Java.',mfilename);
end
% default to 'nucleotide'
db = 'nucleotide';
dbfrm = '';
tofile = false;
seqonly = false;
fileformat = 'GenBank';
passThroughArgs = {};

if nargin > 1
    if rem(nargin,2) == 0
        error('Bioinfo:getncbidata:IncorrectNumberOfArguments',...
            'Incorrect number of arguments to %s.',mfilename);
    end
    okargs = {'database','tofile','sequenceonly','fileformat','preambletext','allfeatures'};
    for j=1:2:nargin-2
        pname = varargin{j};
        pval = varargin{j+1};
        k = strmatch(lower(pname), okargs);%#ok
        if isempty(k)
            error('Bioinfo:getncbidata:UnknownParameterName',...
                'Unknown parameter name: %s.',pname);
        elseif length(k)>1
            error('Bioinfo:getncbidata:AmbiguousParameterName',...
                'Ambiguous parameter name: %s.',pname);
        else
            switch(k)
                case 1  % database
                    okdbs = {'nucleotide','protein'};
                    val = strmatch(lower(pval),okdbs);%#ok
                    if length(val) == 1
                        dbfrs = {'GenBank','GenPept'};
                        db = okdbs{val};
                        dbfrm = dbfrs{val};
                    else
                        if isempty(val)
                            error('Bioinfo:getncbidata:baddb','Invalid database name. Valid databases are ''nucleotide'', ''protein''. ')
                        else
                            error('Bioinfo:getncbidata:ambiguousdb','Ambiguous database name: %s.',pval);
                        end
                    end
                case 2  % tofile
                    if ischar(pval)
                        tofile = true;
                        filename = pval;
                    end
                case 3  % sequenceonly
                    seqonly = opttf(pval);
                    if isempty(seqonly)
                        error('Bioinfo:getncbidata:InputOptionNotLogical','%s must be a logical value, true or false.',...
                            upper(char(okargs(k))));
                    end
                case 4 % fileformat
                    okformats = {'GenBank','GenPept','FASTA'};
                    val = strmatch(lower(pval),lower(okformats));%#ok
                    if length(val) == 1
                        fileformat = okformats{val};
                        dbfrm = '';
                    else
                        if isempty(val)
                            error('Bioinfo:getncbidata:badformat','Invalid file format.  Valid formats are "GenBank","GenPept", and "FASTA"');
                        else
                            error('Bioinfo:getncbidata:ambiguousfileformat','Ambiguous file format: %s',pval);
                        end
                    end
                case {5,6}  % 'preambletext'
                    passThroughArgs{end+1} = pname;
                    passThroughArgs{end+1} = opttf(pval);
            end
        end
    end
end

% if a fileformat was specified as an input, then dbfrm should now be empty.  Use the
% specified format
if isempty(dbfrm)
    dbfrm = fileformat;
end

% convert accessnum to a string if it is a number
if isnumeric(accessnum)
    accessnum = num2str(accessnum);
end

% error if accessnum isn't a string
if ~ischar(accessnum)
    error('Bioinfo:getncbidata:NotString','Access Number is not a string.')
end

% create the url that is used
% for more information see:
%    http://www.ncbi.nlm.nih.gov/entrez/query/static/linking.html

% June 6, 2007
% Changes to Entrez search URL parameter dopt requires a different search
% URL for protein and nucleotide.  Protein needs to use the new doptcmdl
% parameter and Nucleotide needs to use the old dopt parameter.
if strcmpi(db,'protein')
    searchurl = ['http://www.ncbi.nlm.nih.gov/sites/entrez/query.fcgi?cmd=Search&db=' db '&term=' accessnum '&doptcmdl=' dbfrm '&mode=file'];
else
    searchurl = ['http://www.ncbi.nlm.nih.gov/sites/entrez/query.fcgi?cmd=Search&db=' db '&term=' accessnum '&dopt=' dbfrm '&mode=file'];
end

% get the html file that is returned as a string
s=urlread(searchurl);


% replace the html version of &
s=strrep(s,'&','&');

% search for text indicating that the incorrect database is being used
% NCBI devided nucleotide database to three subsections NucCore, NucGSS,
% And NucEST. Search for 'nuc' in foundDB to identify the nucleotide DB.
wrongDB = strfind(s,'The following term(s) refer to a different DB:');
if ~isempty(wrongDB)
    foundDB = char(regexp(s(wrongDB:end),'db=(\w*)&','once','tokens'));
    if ~isempty(foundDB)
        if strncmpi(foundDB,'protein',7)
            tryCommand = 'getgenpept';
%         elseif strncmpi(foundDB,'nucleotide',9)
%             tryCommand = 'getgenbank';
        elseif strncmpi(foundDB,'nuc',3)
            tryCommand = 'getgenbank';
        else
            tryCommand = '';
        end
        if ~isempty(tryCommand)
            error('Bioinfo:getncbidata:IncorrectKnownDatabaseSuggest',...
                ['The key ''%s'' was not found in the %s database, but it is\n',...
                'in the %s databases. Try using:\n%s(''%s'')'],accessnum,db,...
                foundDB,tryCommand,accessnum);
        else
            error('Bioinfo:getncbidata:IncorrectKnownDatabase',...
                ['The key ''%s'' was not found in the %s database, but it is\n',...
                'in the %s databases.'],accessnum,db,...
                foundDB);
        end
    else
        error('Bioinfo:getncbidata:IncorrectUnknownDatabase',...
            'The key ''%s'' was not found in the %s database.',...
            accessnum,db);
    end
end

% search for text indicating that there weren't any files found
if ~isempty(strfind(s,'No items found'))
    error('Bioinfo:getncbidata:NoItemsFound',...
        'The key ''%s'' was not found in the database.',accessnum) ;
end

% find first occcurrence of ['<a href=[''"]/entrez/viewer\.fcgi\?db=' db]
% which identifies links to the hits returned by the search command

% Commented out for work around to G384213
%basequery = ['<a href=[''"]/entrez/viewer\.fcgi\?db=' db];

% June 6, 2007
% NCBI is now strict about which nucleotide database the sequence is in.
% Need to relax regular expression to look for CoreNucleotide, GSS and EST
% databases, then parse out which database the file is in.
if strcmpi(db,'nucleotide')
    queryDB = '(nuccore|nucgss|nucest|nucleotide)';
else
    queryDB = db;
end

% Commented out for work around to G384213
% loc=regexpi(s,basequery);
[loc,DB]=regexpi(s,['<a href=[''"]/entrez/viewer\.fcgi\?db=(?<db>',queryDB,')'],'start','names');

% Set db to correct database (important for Nucleotide) and rebuild
% basequery to be used for retrieval query.
db = DB.db;
basequery = ['<a href=[''"]/entrez/viewer\.fcgi\?db=',db];

if isempty(loc)
    error('Bioinfo:getncbidata:NCBIcorruptedCanNotFindLinks',...
        'Can not interpret NCBI url data.') ;
end

% loc can be unique or have multiple links, look for labels and uid(s) for
% every loc, note: it is better to loop for every loc and ask for only one
% match, this will speed up the search. This may change after G202883
uids = cell(numel(loc),1);
labs = cell(numel(loc),1);
for i=1:numel(loc)
    uids{i} = regexp(s(loc(i):end),'(?<=&val=)\w*?(?=[''"]>)','match','once');
    labs{i} = regexp(s((loc(i)+length(basequery)):end),'(?<=>).*?(?=</a>)','match','once');
end
if numel(uids)~=numel(loc) ||  numel(labs)~=numel(loc)
    error('Bioinfo:getncbidata:NCBIcorruptedCanNotInterpretLinks',...
        'Can not interpret NCBI url data.') ;
end

% find if the record the user asked for has been replaced int he database,
% and if it has then throw a warning
locRecoredReplaced = strfind(s,'The record has been replaced by');
if ~isempty(locRecoredReplaced)
    todel = find(locRecoredReplaced<loc,1,'first');
    warning('Bioinfo:getncbidata:RecordReplaced',...
        ['The record ''%s'' has been replaced by record ''%s''\n',...
        '         in the NCBI database.'],...
        accessnum,labs{todel})
    % delete this link so it's no longer taken into account
    labs(todel)=[];
    uids(todel)=[];
    loc(todel)=[];

end

% find if the record the user asked for has been discontinued in the
% database, and if it has then throw a warning (example AAA00001)
if ~isempty(strfind(s,'This record has been discontinued'))
    warning('Bioinfo:getncbidata:RecordDiscontinued',...
        'The record ''%s'' has been discontinued in the NCBI database.',...
        accessnum)
end



% if loc is not unique then compare labs to select the uids of the entry
% that the user asked for
if numel(loc)>1
    sel=strmatch(lower(accessnum),lower(labs));%#ok
    if numel(sel) == 1  % found a unique label, great continue
        uids=uids(sel);
    elseif isempty(sel) % could not identify which one is, this error will occur most of the times
        error('Bioinfo:getncbidata:MultipleNonRecognizedLinks',...
            'NCBI search returned several entries, but none can be identified with ''%s''.',...
            accessnum)
    else % found multiple labels matching the accessnum, return first one,
        % this is very unlikely to occur, anyways we send a warning
        uids=uids(sel(1));
        warning('Bioinfo:getncbidata:MultipleRecognizedLinks',...
            'NCBI search returned several entries matching with ''%s'', considering first one.',...
            accessnum)
    end
end

% check to see if there is more than one uid listed separated by commas
uids = strread(uids{1},'%s','delimiter',',');
% now uids can have again multiple entries, but all will belong to the same
% accessnum (or label)

% multiple records returned, loop through, create a cell for each uid
genbankdata={};
retrieveurls = cell(size(uids,1),1);
for i=1:size(uids,1),
    % create url to retrieve record as text
    retrieveurls{i} = ['http://www.ncbi.nlm.nih.gov:80/sites/entrez/query.fcgi?cmd=Text&db=' db '&uid=' uids{i} '&dopt=' dbfrm];
    % Commented out for work around to G384213. NCBI is now returning XML
    % temp=urlread(retrieveurls{i}); 
    
    %June 6, 2007
    %NCBI is now returning XML instead of text.  Need to read in XML and
    %convert to char array to be used in genbankread, genpeptread and
    %fastaread
    xmlRecord = xmlread(retrieveurls{i});
    temp = xmlRecord.getElementsByTagName('pre').item(0).getFirstChild.getData.toCharArray';

    % make each line a separate row in a string array
    templine = char(strread(temp,'%s','delimiter','\n','whitespace',''));
    genbankdata{i}=templine;
end

while isempty(deblank(genbankdata(end,:)))
    genbankdata(end,:) = [];
end

% pass data to GENBANKREAD, to create structure
try
    switch dbfrm
        case 'GenBank'
                gbout = genbankread(char(genbankdata),passThroughArgs{:});
        case 'GenPept'
                gbout = genpeptread(char(genbankdata),passThroughArgs{:});
        case 'FASTA'
            fastadata = char(genbankdata);
            % check for spaces in last
            if all(isspace(fastadata(end,:)))
                fastadata(end,:) = '';
            end
            gbout = fastaread(fastadata);
    end
catch
    error('Bioinfo:getncbidata:URLreturnedIncorrectData','URL did not return valid NCBI data')
    % this error is random, depends on the reliability of the Internet connection
end

%  write out file
if tofile == true
    fid = fopen(filename,'wt') ;
    if fid == (-1)
        error('Bioinfo:getncbidata:CouldNotOpenFile',...
            'Unable to write to ''%s''.', filename);
    else
        genbankdata = char(genbankdata);
        rows = size(genbankdata,1);
        for rcount=1:rows-1
            fprintf(fid,'%s\n',genbankdata(rcount,:));
        end
        fprintf(fid,'%s',genbankdata(rows,:));
        fclose(fid);
    end
end

if seqonly == true
    gbout = gbout.Sequence;
else % add URLs
    if ~ strcmpi(fileformat,'FASTA')
        for n = 1:numel(gbout)
            gbout(n).SearchURL = searchurl;
            gbout(n).RetrieveURL = retrieveurls{i};
        end
    end
end

Contact us