image thumbnail

Linear time Outlier Scoring via Random Walks

by

 

I use a markov chain rejection sampling method to score outliers in linear time.

scoreOutliers(inputMat)
function outputMat = scoreOutliers(inputMat)
  % Linear Time Outlier Scoring via Random Walks   
  % Compute the outlier score for each p-dimensional data point
  % The highest scores are possible outliers, scores between [0,1]
  % The outputMat last column contains the scores
  % Original scoring algorithm by Michael S Kim (mikeskim [at] gmail.com)
  % Version 1.00 (10/02/2013) for Matlab ported from R
  % not fully tested on Matlab, tested on GNU Octave and R
  % warning('off','all');

  sampleSize = 2000; %increase this value for higher precision, lower for faster computation times
  [rowSize, colSize] = size(inputMat);
  scores = zeros(rowSize,1);
  inputMat = [inputMat, scores(:)];
  lastColIndex = colSize + 1;

  tmpSample = zeros(sampleSize,1);
  tmpIndex = unidrnd(rowSize, (sampleSize+1),1);
  for j =1:sampleSize
   tmpSample(j) = sum(abs(inputMat(tmpIndex(j),:)-inputMat(tmpIndex((j+1),:))));
  end
  sampleMax = median(tmpSample) + 4*1.4826*mad(tmpSample,1);

  for j = 1:rowSize
    mcount = 0;
    origin = inputMat(j,1:colSize);
    tmpSampleIndex = unidrnd(rowSize, sampleSize, 1);
    while mcount < sampleSize
      mcount = mcount + 1;
      tmpDestIndex = tmpSampleIndex(mcount);
      destination =  inputMat(tmpDestIndex,1:colSize);
      tmpDist = sum(abs(origin - destination))/sampleMax;
      if unifrnd(0,1,1) > tmpDist 
        origin = destination;
        inputMat(tmpDestIndex, lastColIndex) = inputMat(tmpDestIndex, lastColIndex) + 1;
      end
    end
  end

  tmpMax =  max(inputMat(:,lastColIndex));
  inputMat(:,lastColIndex) = 1 - inputMat(:,lastColIndex)/tmpMax;
  outputMat = sortrows(inputMat, lastColIndex);
end

%test function via
%tmpOut = scoreOutliers([randn(300,1), randn(300,1)])
%scatter3(tmpOut(:,1),tmpOut(:,2),tmpOut(:,3),10, tmpOut(:,3), "s")

Contact us