from Fast Gene Sequence Search for Very Large Data File by Binlin Wu
Search for a specific sequence and record a neighboring code sequence with an offset position.

seqSearch(f,nHL,targSeq,lc,offset,M,sFlag,fs)
function [seq,seqPos] = seqSearch(f,nHL,targSeq,lc,offset,M,sFlag,fs)
%[seq,seqPos] = seqSearch(f,nHL,targSeq,lc,offset,M,sFlag,fs)
%[seq,seqPos] = seqSearch(f) with all default parameters.
%[seq,seqPos] = seqSearch(f,'','','','',1e7) with all default parameters except M
%** If the dataset is very large, NOT using output is highly recoomended.
%
%  Gene Sequence Search V2.3 - search for a specific sequence and record
%  a neighboring code sequence with an offset position using matrix for
%  large datasets.
%
%Input:
% f: data file path, format: 'filepath', e.g. 'C:\sample.seq'
% nHL: number of headlines you want to remove from the data, by default 0
% targSeq: search target sequence, by default 'TCC'
% lc: length of the codes to record, by default 4
% offset: number of offset positons from the search sequence to record the 
%   code. e.g., offset is 1 for the first one after the last code (to the 
%   right), and negative value means before the first code (to the left).
%   by default 1
% M: length of the chunk of the codes to read into memory each time, 
%   by default 10E6
% sFlag: 1 for save, 0 for not save, by default 0
% fs: filename to save, only valid when sFlag = 1, e.g. 'seq.txt'
% 
%Output:
% seq: the recorded codes
% seqPos: positions of the target search sequence
%
%   Binlin Wu - CCNY
%   bwu@sci.ccny.cuny.edu
%   Jun 10th, 2011


% set default values for the parameters
switch nargin
    case 0
        error('Parameters needed: seqSearch(f,nHL,targSeq,lc,offset,M,sFlag,fs)')
    case 1
        nHL = 0; targSeq = 'TCC'; lc = 4; offset = 1; M = 1e6; sFlag = 0;
    case 2
        targSeq = 'TCC'; lc = 4; offset = 1; M = 1e6; sFlag = 0;
    case 3
        lc = 4; offset = 1; M = 1e6; sFlag = 0;
    case 4
        offset = 1; M = 1e6; sFlag = 0;
    case 5
        M = 1e6; sFlag = 0;
    case 6
        sFlag = 0;
    case 7
        if sFlag == 1
            error('save destination file name missing');
        end
    case 8
        if isequal(sFlag,1) && isempty(fs)
            error('save destination file name missing');
        end
    otherwise
        error('too many parameters: seqSearch(f,nHL,targSeq,lc,offset,M,sFlag,fs)')
end

if isempty(nHL)
    nHL = 0;
end
if isempty(targSeq)
    targSeq = 'TCC';
end
if isempty(lc)
    lc = 4;
end
if isempty(offset)
    offset = 1;
end
if isempty(M)
    M = 1e6;
end
if isempty(sFlag)
    sFlag = 0;
end

% ---- existing save file overwritten confirm ---
confirm = 0;
if sFlag == 1 && exist(fs,'file') == 2
    while(1)
        if confirm == 1
            break
        end
        if sFlag == 1 && exist(fs,'file') == 2
            fprintf('Filename %s already exists!\n',fs);
            reply = upper(input('Do you want to overwrite it? Y/N [N]: ', 's'));
            if isempty(reply)
                reply = 'N';
            end
            if ~isequal(reply,'Y') && ~isequal(reply,'N')
                confirm = 0;
            elseif isequal(reply,'Y')
                confirm = 1;
            else
                fs1 = input('Please enter a new file name: ', 's');
                fprintf('Codes will be recorded in file: %s\n', fs1); 
                reply = upper(input('Confirm? Y/N [Y]: ', 's'));
                if isempty(reply)
                    reply = 'Y';
                end             
                if isequal(reply,'Y')
                    confirm = 1;
                    fs = fs1;
                else
                    confirm = 0;
                end
                if exist(fs1,'file') == 2
                    confirm = 0;
                end
            end
        end
    end
end

% ----- initialization -------
    
lt = length(targSeq); %length of the search target sequence
fclose('all');
% f = '.\sample.seq';
fid = fopen(f,'r');
if sFlag == 1
    sfid = fopen(fs,'w');
end

n = 0;
C0 = '';
seq = '';
seqPos = [];

% ----- start search -------

fprintf('Start searching ...\n')
if offset > 0    
    lMin = lt+offset+lc-1; %the least codes possible to have both a target sequence and the codes to record
    if M < lMin
        errMsg = sprintf('M must be larger than %d',lMin);
        error(errMsg);
    end
    
    C = textscan(fid,'%c',M,'HeaderLines',nHL);
    C = C{1};
    
    if isempty(C) % && n == 1
        fprintf('\n  The data file is empty!\n')
        return
        
    elseif length(C) > 0 && length(C) < lMin  %too short, length < lMin
        if nargout == 2
            seqPos0 = strfind(C',targSeq);
            if ~isempty(seqPos0)
                seq = [seq;repmat(' ',lc*length(seqPos0),1)];
                seqPos = [seqPos seqPos0];
            end
        end
        
    else
        while(1)
            if length(C) == 0
                break
            else
                n = n+1;
                fprintf('\n  Checking the %dth %d codes ... \n',n,M);
                LastChunkLength = length(C);
                C = [C0;C];
                seqPos0 = strfind(C(1:end - (lMin - 1) + (lt-1))',targSeq); 
%                 seqPos0 = strfind(C(1:lengthC + lt-1)',targSeq); %not good for data length <=M
                if ~isempty(seqPos0)
                    seqPos1 = repmat(seqPos0+lt-1+offset,lc,1);
                    seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
                    seqPos1 = seqPos1(:);

                    if sFlag == 1
                        fprintf(sfid,'%s',C(seqPos1));
                    end
                    if nargout >= 1
                        seq = [seq;C(seqPos1)];
                        if nargout >= 2
                            if n==1 %data length <= M
                                seqPos = seqPos0;
                            else %data length > M
%                                 seqPos = [seqPos seqPos0+length(C)-(lMin-1)+(n-1)*M-(lMin-1)];  %not good for data length <=M
                                seqPos = [seqPos seqPos0+(n-1)*M-(lMin-1)];
                            end
                        end            
                    end
                end
                fprintf('  The %dth %d codes done. \n',n,M);
                C0 = C(end - (lMin - 1) +1:end);
%                 C0 = C(lengthC + 1:end)  %not good for data length <=M                  
            end
            C = textscan(fid,'%c',M);
            C = C{1};
        end
        
        if nargout == 2
            seqPos0 = strfind(C0',targSeq);
            if ~isempty(seqPos0)
                seq = [seq;repmat(' ',lc*length(seqPos0),1)];
                seqPos = [seqPos seqPos0+(n-1)*M+LastChunkLength-(lMin-1)];
            end
        end
    end
else
    lMin = lt-offset; %the least codes possible to have both a target sequence and the codes to record
    if M < lMin
        errMsg = sprintf('M must be larger than %d',lMin);
        error(errMsg);
    end
    if -offset < lc
        error(' |offset| can not be smaller than lc, when offset < 0');
    end
    
    C = textscan(fid,'%c',M,'HeaderLines',1);   
    C = C{1};

    if isempty(C) % && n == 1
        fprintf('\n  The data file is empty!\n')
        return
        
    elseif length(C) > 0 && length(C) < lMin  %too short, length < lMin
        if nargout == 2
            seqPos0 = strfind(C',targSeq);
            if ~isempty(seqPos0)
                seq = [seq;repmat(' ',lc*length(seqPos0),1)];
                seqPos = seqPos0;
            end
        end
        
    else
        if nargout == 2
            seqPos0 = strfind(C(1:lMin-1)',targSeq);
            if ~isempty(seqPos0)
                seq = [seq;repmat(' ',lc*length(seqPos0),1)];
                seqPos = seqPos0;
            end
        end
        while(1)
            if length(C) == 0
                break
            else
                n = n+1;
                fprintf('\n  Checking the %dth %d codes ... \n',n,M);                                        
                C = [C0;C];
                seqPos0 = strfind(C(-offset+1:end)',targSeq);
        
                if ~isempty(seqPos0)
                    seqPos1 = repmat(seqPos0,lc,1);
                    seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
                    seqPos1 = seqPos1(:);

                    if sFlag == 1
                        fprintf(sfid,'%s',C(seqPos1));
                    end
                    if nargout >= 1
                        seq = [seq;C(seqPos1)];
                        if nargout >= 2
                            if n == 1
                                seqPos = [seqPos seqPos0-offset];
                            else
                                seqPos = [seqPos seqPos0-offset+M*(n-1)-(lMin-1)];
                            end
                        end
                    end
                end
                fprintf('  The %dth %d codes done. \n',n,M);
                C0 = C(end-(lMin-1)+1:end);
            end
            C = textscan(fid,'%c',M); 
            C=C{1};
        end        
    end
end
    
fprintf('\n  Searching finished! \n',n,M);
fclose('all');

Contact us at files@mathworks.com