function [OUT_ALIGN cur_score] = RefineAlignments(IN_ALIGN, varargin)
% RefineAlignments
% Uses an interative algorithm to improve the multiple alignments of
% an input sequence. It will iteratively remove a fraction of the
% sequences, compress any gaps, and then re-align. This will be done
% for NUM_REPS.
%
% Written by Will Dampier. Contact at wnd22@drexel.edu
% Version 1.0 on 10/23/09
%
% [OUT_ALIGN SCORE] = RefineAlignments(IN_ALIGN)
%
% IN_ALIGN A char-array representing a multiple alignment to be
% refined.
%
% OUT_ALIGN The refined alignment.
% SCORE The new alignment score.
%
%
% Optional Arguements:
%
% 'Alphabet' Either 'AA' or 'NT' to indicate whether this is a
% nucleotide or amino acid alignment.
%
% 'GapPenalty' The penalty for gaps. This is set at -8 by
% default.
%
% 'DistMat' The distance matrix to use for the alignment
% scoring. Default is BLOSUM50 for 'AA' and NUC44
% for 'NT'
%
% 'NumReps' The number of refining iterations to do.
%
% 'NumTry' The number of tries to perform at each iteration.
%
% 'RefineType' Which type of refinement to perform. This can
% be either 'STOCASTIC', 'DETERMINISTIC' or 'MIXED'.
%
% 'Display' A boolean indicating whether to display the output
% at each iteration.
%
%
RM_FRAC = 0.1;
NUM_TRIES = 10;
NUM_REPS = 100;
alpha = 'aa';
dist_mat = [];
gap_penalty = -8;
TYPE = 'mixed';
disp_flag = true;
for i = 1:2:length(varargin)
switch lower(varargin{i})
case 'alphabet'
if strcmpi('nt', varargin{i+1}) || strcmpi('aa', varargin{i+1})
alpha = varargin{i+1};
else
error('RefineAlignments:BADALPHA', ...
'Arguement to "alphabet" must be "NT" or "AA"')
end
case 'gappenalty'
if isnumeric(varargin{i+1}) && varargin{i+1} < 0
gap_penalty = varargin{i+1};
else
error('RefineAlignments:BADGAP', ...
'Arguement to "GapPenalty" must be negative numeric')
end
case 'distmat'
dist_mat = varargin{i+1};
case 'numreps'
if isnumeric(varargin{i+1}) && varargin{i+1} > 1
NUM_REPS = varargin{i+1};
else
error('RefineAlignments:BADREPS', ...
'Arguement to NumReps must be a positive numeric')
end
case 'refinetype'
TYPE = varargin{i+1};
case 'display'
if islogical(varargin{i+1})
disp_flag = varargin{i+1};
else
error('RefineAlignments:DISPLAY', ...
'Arguement to Display must be a boolean.')
end
otherwise
error('RefineAlignments:BADARG', 'Unknown arguement: %s', ...
lower(varargin{i}))
end
end
if isempty(dist_mat)
if strcmpi(alpha, 'nt')
dist_mat = nuc44;
toint = @nt2int;
else
dist_mat = blosum50;
toint = @aa2int;
end
end
dist_mat(end+1,:) = gap_penalty;
dist_mat(:,end+1) = gap_penalty;
dist_mat(end,end) = 0;
switch upper(TYPE)
case 'MIXED'
is_stoch = true;
is_mix = true;
case 'STOCASTIC'
is_stoch = true;
is_mix = false;
case 'DETERMINISTIC'
is_stoch = false;
is_mix = false;
end
OUT_ALIGN = IN_ALIGN;
cur_score = CalculateScore(OUT_ALIGN, dist_mat, toint);
for i = 1:NUM_REPS
%determine whether to switch based on the "mix" factor
if is_mix
is_stoch = ~is_stoch;
end
rm_rows = GetRows(OUT_ALIGN, is_stoch, RM_FRAC, NUM_TRIES);
counter = 1;
while counter < NUM_TRIES
new_mat = SplitAndReAlign(OUT_ALIGN, rm_rows(:,counter), alpha);
new_score = CalculateScore(new_mat, dist_mat, toint);
if new_score > cur_score
if disp_flag
disp_cell = {'On Iter: ', num2str(i), ' Improved by: ', ...
num2str(abs(new_score-cur_score)), ' and by ', ...
num2str(abs(size(new_mat,2)-size(OUT_ALIGN,2))), ...
' columns', ' Using stoch:', num2str(is_stoch)};
display([disp_cell{:}])
end
cur_score = new_score;
OUT_ALIGN = new_mat;
break
end
counter = counter + 1;
end
end
function rows_mat = GetRows(MAT, STOCH, RM_FRAC, NUM_TRIES)
% GetRows
% A helper function which determines the rows to split the alignment
% with.
%
% MAT A char-array of the multiple alignment.
% STOCH A boolean indicating whether to use a stochastic approach.
% RM_FRAC The fraction of rows to remove.
% NUM_TRIES The number of tries to return.
%
% rows_mat A matrix indicating which rows to remove.
%
if STOCH
rows_mat = rand(size(MAT,1),NUM_TRIES) > RM_FRAC;
else
gap_mask = MAT =='-';
runs = FindRuns(gap_mask);
score = mode(runs.*(runs~=0))-min(runs);
[~, order] = sort(score, 'descend');
rm_rows = order(1:NUM_TRIES);
[~, norder] = sort(runs(:,rm_rows),'descend');
rows_mat = false(size(MAT,1), NUM_TRIES);
num_remove = ceil(RM_FRAC*size(MAT,1));
for i = 1:NUM_TRIES
rows_mat(norder(1:num_remove,i),i) = true;
end
end
function NEW_MAT = SplitAndReAlign(MAT, ROWS, alpha)
% SplitAndReAlign
% This will split the alignment matix based on the provided rows and
% then create two profiles and re-align them.
%
% MAT A char-array of the alignment.
% ROWS A boolean-array describing the switch between rows
% alpha Either 'NT' or 'AA' to indicate which sequence profile to
% create.
%
% NEW_MAT The ReAlinged alignment
%
mat1 = MAT(ROWS, :);
mat2 = MAT(~ROWS, :);
mat1(:,all(mat1 == '-',1)) = [];
mat2(:,all(mat2 == '-',1)) = [];
pmat1 = seqprofile(mat1, 'alphabet', alpha);
pmat2 = seqprofile(mat2, 'alphabet', alpha);
[~, ind1, ind2] = profalign(pmat1, pmat2);
NEW_MAT = char(0);
NEW_MAT(ROWS, ind1) = mat1;
NEW_MAT(~ROWS, ind2) = mat2;
NEW_MAT(~isletter(NEW_MAT)) = '-';
NEW_MAT(all(NEW_MAT == '-',1)) = [];
function s = CalculateScore(MAT, DIST, FUN)
% CalculateScore
% A function which calculates the score of the alignment. This score
% is based on the agreement between the consensus and each individual
% alignment.
%
% MAT A char-array of a multiple alignment.
% DIST A distance matrix which indicates the penalty for
% mismatches.
% FUN A function which converts letter to numbers such at nt2int
% or aa2int.
%
% s The score for this alingment.
%
%
cons = seqconsensus(MAT);
num_cons = FUN(cons);
num_align = FUN(MAT);
ncons = repmat(num_cons(:), [size(num_align,1), 1]);
nalign = num_align(:);
lookupinds = sub2ind(size(DIST), ncons, nalign);
s = sum(DIST(lookupinds));
function reverse_looking = FindRuns(input)
% FindRuns
% Finds consecutive runs of 1's along the rows of a boolean array.
%
% INPUT = [1 1 1 0 0 0 1 0 1 0 1 1;
% 0 1 1 1 0 1 1 0 0 1 1 1];
% RL = [1 2 3 0 0 0 1 0 1 0 1 2;
% [0 3 2 1 0 1 2 0 0 1 2 3];
%
[m,n] = size(input);
reverse_looking = [zeros(1,m);input.'];
reverse_looking = reverse_looking(:);
p = find(~reverse_looking);
reverse_looking(p) = [0;1-diff(p)];
reverse_looking = reshape(cumsum(reverse_looking),[],m).';
reverse_looking(:,1) = [];