| seqSearch(f,nHL,targSeq,lc,offset,M,sFlag,fs)
|
function [seq,seqPos] = seqSearch(f,nHL,targSeq,lc,offset,M,sFlag,fs)
%[seq,seqPos] = seqSearch(f,nHL,targSeq,lc,offset,M,sFlag,fs)
%[seq,seqPos] = seqSearch(f) with all default parameters.
%[seq,seqPos] = seqSearch(f,'','','','',1e7) with all default parameters except M
%** If the dataset is very large, NOT using output is highly recoomended.
%
% Gene Sequence Search V2.3 - search for a specific sequence and record
% a neighboring code sequence with an offset position using matrix for
% large datasets.
%
%Input:
% f: data file path, format: 'filepath', e.g. 'C:\sample.seq'
% nHL: number of headlines you want to remove from the data, by default 0
% targSeq: search target sequence, by default 'TCC'
% lc: length of the codes to record, by default 4
% offset: number of offset positons from the search sequence to record the
% code. e.g., offset is 1 for the first one after the last code (to the
% right), and negative value means before the first code (to the left).
% by default 1
% M: length of the chunk of the codes to read into memory each time,
% by default 10E6
% sFlag: 1 for save, 0 for not save, by default 0
% fs: filename to save, only valid when sFlag = 1, e.g. 'seq.txt'
%
%Output:
% seq: the recorded codes
% seqPos: positions of the target search sequence
%
% Binlin Wu - CCNY
% bwu@sci.ccny.cuny.edu
% Jun 10th, 2011
% set default values for the parameters
switch nargin
case 0
error('Parameters needed: seqSearch(f,nHL,targSeq,lc,offset,M,sFlag,fs)')
case 1
nHL = 0; targSeq = 'TCC'; lc = 4; offset = 1; M = 1e6; sFlag = 0;
case 2
targSeq = 'TCC'; lc = 4; offset = 1; M = 1e6; sFlag = 0;
case 3
lc = 4; offset = 1; M = 1e6; sFlag = 0;
case 4
offset = 1; M = 1e6; sFlag = 0;
case 5
M = 1e6; sFlag = 0;
case 6
sFlag = 0;
case 7
if sFlag == 1
error('save destination file name missing');
end
case 8
if isequal(sFlag,1) && isempty(fs)
error('save destination file name missing');
end
otherwise
error('too many parameters: seqSearch(f,nHL,targSeq,lc,offset,M,sFlag,fs)')
end
if isempty(nHL)
nHL = 0;
end
if isempty(targSeq)
targSeq = 'TCC';
end
if isempty(lc)
lc = 4;
end
if isempty(offset)
offset = 1;
end
if isempty(M)
M = 1e6;
end
if isempty(sFlag)
sFlag = 0;
end
% ---- existing save file overwritten confirm ---
confirm = 0;
if sFlag == 1 && exist(fs,'file') == 2
while(1)
if confirm == 1
break
end
if sFlag == 1 && exist(fs,'file') == 2
fprintf('Filename %s already exists!\n',fs);
reply = upper(input('Do you want to overwrite it? Y/N [N]: ', 's'));
if isempty(reply)
reply = 'N';
end
if ~isequal(reply,'Y') && ~isequal(reply,'N')
confirm = 0;
elseif isequal(reply,'Y')
confirm = 1;
else
fs1 = input('Please enter a new file name: ', 's');
fprintf('Codes will be recorded in file: %s\n', fs1);
reply = upper(input('Confirm? Y/N [Y]: ', 's'));
if isempty(reply)
reply = 'Y';
end
if isequal(reply,'Y')
confirm = 1;
fs = fs1;
else
confirm = 0;
end
if exist(fs1,'file') == 2
confirm = 0;
end
end
end
end
end
% ----- initialization -------
lt = length(targSeq); %length of the search target sequence
fclose('all');
% f = '.\sample.seq';
fid = fopen(f,'r');
if sFlag == 1
sfid = fopen(fs,'w');
end
n = 0;
C0 = '';
seq = '';
seqPos = [];
% ----- start search -------
fprintf('Start searching ...\n')
if offset > 0
lMin = lt+offset+lc-1; %the least codes possible to have both a target sequence and the codes to record
if M < lMin
errMsg = sprintf('M must be larger than %d',lMin);
error(errMsg);
end
C = textscan(fid,'%c',M,'HeaderLines',nHL);
C = C{1};
if isempty(C) % && n == 1
fprintf('\n The data file is empty!\n')
return
elseif length(C) > 0 && length(C) < lMin %too short, length < lMin
if nargout == 2
seqPos0 = strfind(C',targSeq);
if ~isempty(seqPos0)
seq = [seq;repmat(' ',lc*length(seqPos0),1)];
seqPos = [seqPos seqPos0];
end
end
else
while(1)
if length(C) == 0
break
else
n = n+1;
fprintf('\n Checking the %dth %d codes ... \n',n,M);
LastChunkLength = length(C);
C = [C0;C];
seqPos0 = strfind(C(1:end - (lMin - 1) + (lt-1))',targSeq);
% seqPos0 = strfind(C(1:lengthC + lt-1)',targSeq); %not good for data length <=M
if ~isempty(seqPos0)
seqPos1 = repmat(seqPos0+lt-1+offset,lc,1);
seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
seqPos1 = seqPos1(:);
if sFlag == 1
fprintf(sfid,'%s',C(seqPos1));
end
if nargout >= 1
seq = [seq;C(seqPos1)];
if nargout >= 2
if n==1 %data length <= M
seqPos = seqPos0;
else %data length > M
% seqPos = [seqPos seqPos0+length(C)-(lMin-1)+(n-1)*M-(lMin-1)]; %not good for data length <=M
seqPos = [seqPos seqPos0+(n-1)*M-(lMin-1)];
end
end
end
end
fprintf(' The %dth %d codes done. \n',n,M);
C0 = C(end - (lMin - 1) +1:end);
% C0 = C(lengthC + 1:end) %not good for data length <=M
end
C = textscan(fid,'%c',M);
C = C{1};
end
if nargout == 2
seqPos0 = strfind(C0',targSeq);
if ~isempty(seqPos0)
seq = [seq;repmat(' ',lc*length(seqPos0),1)];
seqPos = [seqPos seqPos0+(n-1)*M+LastChunkLength-(lMin-1)];
end
end
end
else
lMin = lt-offset; %the least codes possible to have both a target sequence and the codes to record
if M < lMin
errMsg = sprintf('M must be larger than %d',lMin);
error(errMsg);
end
if -offset < lc
error(' |offset| can not be smaller than lc, when offset < 0');
end
C = textscan(fid,'%c',M,'HeaderLines',1);
C = C{1};
if isempty(C) % && n == 1
fprintf('\n The data file is empty!\n')
return
elseif length(C) > 0 && length(C) < lMin %too short, length < lMin
if nargout == 2
seqPos0 = strfind(C',targSeq);
if ~isempty(seqPos0)
seq = [seq;repmat(' ',lc*length(seqPos0),1)];
seqPos = seqPos0;
end
end
else
if nargout == 2
seqPos0 = strfind(C(1:lMin-1)',targSeq);
if ~isempty(seqPos0)
seq = [seq;repmat(' ',lc*length(seqPos0),1)];
seqPos = seqPos0;
end
end
while(1)
if length(C) == 0
break
else
n = n+1;
fprintf('\n Checking the %dth %d codes ... \n',n,M);
C = [C0;C];
seqPos0 = strfind(C(-offset+1:end)',targSeq);
if ~isempty(seqPos0)
seqPos1 = repmat(seqPos0,lc,1);
seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
seqPos1 = seqPos1(:);
if sFlag == 1
fprintf(sfid,'%s',C(seqPos1));
end
if nargout >= 1
seq = [seq;C(seqPos1)];
if nargout >= 2
if n == 1
seqPos = [seqPos seqPos0-offset];
else
seqPos = [seqPos seqPos0-offset+M*(n-1)-(lMin-1)];
end
end
end
end
fprintf(' The %dth %d codes done. \n',n,M);
C0 = C(end-(lMin-1)+1:end);
end
C = textscan(fid,'%c',M);
C=C{1};
end
end
end
fprintf('\n Searching finished! \n',n,M);
fclose('all');
|
|