No BSD License  

Highlights from
disk, memory caching - store objects by arbitrary key

image thumbnail
from disk, memory caching - store objects by arbitrary key by Mark Histed
Store any matlab objects to disk or keep them in memory, indexed by arbirary objects.

disk_cache(functionStr, varargin)
function [outData, outKey, varargout] = disk_cache(functionStr, varargin)
%DISK_CACHE (posit): temporary disk caches
%
%   * Writing/reading data in cache:
%   DISK_CACHE('set', cacheData, cacheId, outdatedKey, ...
%                     allowDupsStr, dataClass)
%   cachedData = DISK_CACHE('get', cacheId, allowMissingStr)
%   cachedData = DISK_CACHE('get', cacheId, allowMissingStr, dataClass)    
%   cachedData = DISK_CACHE('get', cacheId, allowMissingStr, dataClass, ...
%                                  outdatedKey)
%   [cachedData,cachedOutdatedKey] = DISK_CACHE('get', cacheId, ...)
%
%   * Manipulate cache:    
%   DISK_CACHE('clear')
%   DISK_CACHE('clear', dataClass)  % Clears only data in this class
%   DISK_CACHE('list')
%   DISK_CACHE('list', dataClass)   % lists only data in this class    
%   [cacheIds, cacheOutdatedKeys, cacheClasses] = DISK_CACHE('getcacheindex')
%
%   There are two bits of index data associated with each cache entry: cacheId
%   and outdatedKey.  Both can be arbitrary arrays and comparisons are done
%   after hashing with HASH_RECURSE
%
%   cacheId identifies the type of data; it is used to retrieve the data 
%       and only one cache entry with a given cacheId is allowed.
%   outdatedKey is used to tell whether the data associated with a given
%       cacheId is out of date; use some key here that will change whenever
%       the underlying data changes (like a time stamp on a file?)
%   dataClass: a string which groups data into classes.  This string is used
%       as the data file name if specified.  Data in different classes
%       can collide if the key is the same
%
%   allowDupsStr: if equal to 'allowDups', then ignore the attempt to store
%       data with the same key and Id.  If not equal, raise an error on this.
%   allowMissingStr: if equal to 'allowMissing', then return an empty matrix
%       if the ID is not found.  If not equal raise an error on missing Id.
%
%   For an example: do DBTYPE disk_cache
%   Todo (bug): data in different classes should not collide
%
%   See also: MEMORY_CACHE    
%
%$Id: disk_cache.m 164 2008-04-07 04:32:20Z histed $

%   Example:
%
% $$$ %%% check disk cache %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% $$$ cClass = 'typeofdata'; cacheKey = {inputVar1}; outdatedKey = {}; 
% $$$ cDat = disk_cache('get', cacheKey, 'allowMissing', cClass, outdatedKey);
% $$$ if ~isempty(cDat), [v1, v2] = deal(cDat{:}); return; end
% $$$ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% $$$ 
% $$$ your code here
% $$$ 
% $$$ %%% store data in cache %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% $$$ disk_cache('set', {v1, v2}, cacheKey,outdatedKey,'allowDups',cClass);
% $$$ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% dataClass is used only for user classification purposes.  It is not used
% to find index entries at all.
    
if nargin < 1 || ~ischar(functionStr)
    error('Invalid arguments');
end

% cache dir name
dirs = directories;
cacheDir = dirs.diskcache;

delIxKey = repmat(NaN,1,16);  % hashLen for MD5 is 16bytes 

switch functionStr
    case 'set'
        % process args
        if nargin <= 3, error('Must specify data, cacheId, outdatedKey'); end
        inData = varargin{1};
        inFullId = varargin{2};
        inOutdatedKey = hash_recurse(varargin{3});        

        allowDups = false; % default
        if nargin > 4, allowDups = strcmpi(varargin{4}, 'allowDups'); end

        dataClass = 'unspecifiedClass'; 
        if nargin > 5, dataClass = varargin{5}; end

        if nargin > 6, error('Too many arguments'); end
        assert(isstr(dataClass));

        inId = hash_recurse(inFullId);
        hashLen = length(inId);

        
        [diskIds, diskOutdatedKeys, diskDataClasses] ...
            = subGetIndexData(cacheDir);
        
        no = subWhichCacheIdNo(diskIds, inId);
        
        if length(no) == 1  % one match
            cacheNo = no;
            
            % check if not outdated            
            storedKey = diskOutdatedKeys(cacheNo,hashLen);
            if all(storedKey == inOutdatedKey)
                % dup data found
                if allowDups
                    % data is already there
                    % no check to see if the existing data matches
                    %assert(~isequalwithequalnans(inData, ...
                    %                             diskCacheData{no}), ...
                    %       'Duplicate key/id found but data is different!');
                else
                    error('Trying to store duplicate data in cache');
                end
                isOutdated = true;
            else
                isOutdated = false;
            end
        else
            % no entry, find the first deleted
            firstDelNo = subWhichCacheIdNo(diskIds,delIxKey,true);
            if ~isempty(firstDelNo)
                % found one
                assert((strcmp(diskIds(firstDelNo), delIxKey) ...
                        && strcmp(diskOutdatedKeys(firstDelNo),delIxKey) ...
                        && strcmp(diskDataClasses(firstDelNo),delIxKey)), ...
                       'Internal cache bug');
                cacheNo = firstDelNo;
            else
                % no empties, add to end
                cacheNo = size(diskIds,1) + 1;                
            end

            isOutdated = false;
        end
        
        % store data
        subSaveDataByNum(cacheDir, dataClass, inId, cacheNo, inOutdatedKey, ...
                         inData);

        % update index
        diskIds(cacheNo,1:hashLen) = inId;
        diskOutdatedKeys(cacheNo,1:hashLen) = inOutdatedKey;
        diskDataClasses{cacheNo} = dataClass;
        subSetIndexData(cacheDir, diskIds, diskOutdatedKeys, diskDataClasses);

        % display some status in a warning
        if isOutdated, dupStr = '(dup)'; else dupStr = ''; end
        w = whos('inData');
        warningStr ...
            = sprintf('Cached to disk %sb (before zip)%s, class %s id %s', ...
                      num2str_metric(w.bytes, 3), ...
                      dupStr, ...
                      pp_hash_key(dataClass, 16), ...
                      pp_hash_key(inFullId, 16));

        if warningStr(end) == sprintf('\n'), 
            warningStr = warningStr(1:end-1);  % strip EOL 
        end
        warning('POSIT:diskcache', warningStr);
    
  case 'get'
    % Note: we ignore data class when retrieving
        inId = hash_recurse(varargin{1});
        hashLen = length(inId);
        allowMissing = false; % default
        if nargin > 2, 
            allowMissing = strcmpi(varargin{2}, 'allowMissing');
        end

        dataClass = ''; % default
        if nargin > 3, dataClass = varargin{3}; end

        outdatedKey = []; % default
        if nargin > 4, outdatedKey = hash_recurse(varargin{4}); end

        if nargin > 5, error('Too many inputs'); end
        
        %%%%%%%%%%%%%%%%

        [diskIds, diskOutdatedKeys, diskDataClasses] ...
            = subGetIndexData(cacheDir);
        no = subWhichCacheIdNo(diskIds, inId);

        if isempty(no)
            if ~allowMissing
                error('in ''get'': Id not found (allowMissing not set)');
            end
            outData = [];
            if nargout > 1, outKey = []; end
            return
        else
            assert(length(no) == 1, 'Cache inconsistency');
        end
        
        % got here means ID match; check outdated key if it exists
        if ~isempty(outdatedKey)
            cachedOutdatedKey = diskOutdatedKeys(no,1:hashLen);
            if ~all(outdatedKey == cachedOutdatedKey)
                % mismatch
                if ~allowMissing
                    error('''get'': OutdatedKey mismatch (allowMissing unset');
                else
                    outData = [];
                    assert(nargout == 1, 'outdatedKey both input and output');
                    return
                end
            end
        end
        % got here means OutdatedKey match or not checked (not passed in)
        
        % use class from index
        inClass = diskDataClasses{no};
        
        outData = subLoadDataByNum(cacheDir, inClass, inId, no, ...
                                   diskOutdatedKeys(no,1:hashLen));
        if nargout > 1, outKey = diskOutdatedKeys(no,1:hashLen); end
        
    case 'clear'
        dataClass = ''; % default, means delete all
        if nargin == 2, dataClass = varargin{1}; end
        if nargin > 2, error('Too many arguments to ''clear'''); end

        if isempty(dataClass)
            deleteMask = '_PSCACHE_data-*.mat';            
        else
            deleteMask = sprintf('_PSCACHE_data-class_%s-*.mat', dataClass);
        end

        deleteFileName = fullfile(cacheDir, deleteMask);
        d = dir(deleteFileName);
        nFiles = length(d);
        disp(sprintf('Clearing cache: %d data files (ids) to delete', ...
                     nFiles));

        disp('C-c to abort, RET to continue');
        pause;
        
        % deal with index
        if isempty(dataClass)
            % clear mem index cache
            subSetIndexData(cacheDir, [], [], {});        
            memory_cache('clear');
        else
            % remove this class's entries
            [diskIds, diskOutdatedKeys, diskDataClasses] ...
                = subGetIndexData(cacheDir);
            delIx = strcmp(diskDataClasses, dataClass);

            hashLen = length(hash_recurse(0));

            nToDel = sum(delIx);
            diskIds(delIx,1:hashLen) = repmat(delIxKey(:)', [nToDel,1]);
            diskOutdatedKeys(delIx,1:hashLen) = repmat(delIxKey(:)', [nToDel,1]);
            [diskDataClasses{delIx}] = deal(delIxKey);
            
            % rewrite index
            subSetIndexData(cacheDir, diskIds, ...
                            diskOutdatedKeys, ...
                            diskDataClasses);
        end

        delete(deleteFileName);
        disp('Success.');
    case 'list'
        dataClass = ''; % default: list all
        if nargin == 2, dataClass = varargin{1}; end
        if nargin > 2, error('Too many arguments to ''list'''); end
        
        % get data from index
        [diskIds, diskOutdatedKeys, diskDataClasses] ...
            = subGetIndexData(cacheDir);
        w = whos('diskIds', 'diskOutdatedKeys', 'diskDataClasses');
        indexSize = sum(cat(1,w.bytes));

        % restrict by class if desired
        if ~isempty(dataClass)
            selIx = strcmp(diskDataClasses, dataClass);
        else
            selIx = true(1,length(diskDataClasses));
        end
        filtIds = diskIds(selIx,:);
        filtOutdatedKeys = diskOutdatedKeys(selIx,:);            
        filtDataClasses = diskDataClasses(selIx);
        
        % pretty printing here
        nEntries = size(filtIds,1);
        disp('Class                Size    ID');
        disp('-------------------------------------------------------------'); 
        for i=1:nEntries
            tId = filtIds(i,:);
            if all(isnan(tId))
                % deleted, skip it
                sz(i) = 0;
                continue
                %idStr = 'deleted';
                %tClass = 'deleted';
            else
                idStr = reshape(lower(dec2hex(tId))',1,[]);
                no = subWhichCacheIdNo(diskIds, tId);
                tClass = filtDataClasses{i};
                sz(i) = subGetDataSize(cacheDir, filtDataClasses{i}, no);
            end
            
            tStr{i} = sprintf('%-30.30s %4sb   %s', ...
                              tClass, ...
                              num2str_metric(chop(sz(i),2)), ...
                              idStr);
        end
        if nEntries == 0
            disp('*** No data cached yet'); 
            sz = 0; indexSize = 0;
        else
            % remove blank lines (deleted entries)
            tStr = cellstr(cat(1,tStr{:}));
            disp(char(tStr));
        end
        
        % display summary sizes
        disp(sprintf('Total size of listed data: %sb; index is %sb', ...
                     num2str_metric(chop(sum(sz),3)), ...
                     num2str_metric(chop(indexSize,3))));
        return
  
    case 'getcacheindex'
        % get data from index
        [diskIds, diskOutdatedKeys, diskDataClasses] ...
            = subGetIndexData(cacheDir);
        outData = diskIds;
        outKey = diskOutdatedKeys;
        varargout{1} = diskDataClasses;
        return
    
    otherwise
      error('Unknown functionStr: %s', functionStr);
end


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function no = subWhichCacheIdNo(theCacheIds, idToMatch, allowDups, ...
                                theClasses, restrictToClass)

if nargin < 3, allowDups = false; end
if nargin < 4, theClasses = []; end
if nargin < 5, restrictToClass = []; end

nIds = size(theCacheIds,1);
if nIds == 0
    % no index at all
    no = []; 
else
    if ~isempty(restrictToClass)
        desIx = strcmp(theClasses, restrictToClass);
    else
        desIx = true(nIds,1);
    end
    
    no = find( (all((theCacheIds == repmat(idToMatch,[nIds,1])), 2) ...
                & desIx));
    nFound = length(no);
    if allowDups
        % allow dups only used when finding deleted placeholder key
        if nFound > 1
            % return the first one
            no = no(1); 
        end
    else
        if nFound > 1
            error('Cache inconsistency: Duplicate ids');
        end
        % do nothing, the single match found is returned
    end
end


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [diskCacheIds, diskOutdatedKeys, diskDataClasses] ...
        = subGetIndexData(cacheDir)
% This function and the next contain all the functions for storing the index
% data.  Note that the index data is cached in memory for speed

%%% check mem cache
persistent mFileChangedFlag
memCacheKey = mfilename;
memOutdatedKey = mFileChangedFlag;
cachedDat = memory_cache('get', memCacheKey, 'allowMissing', memOutdatedKey);
if ~isempty(cachedDat)  % cache hit
    [diskCacheIds, diskOutdatedKeys, diskDataClasses, cacheIndexDir] ...
        = deal(cachedDat{:});
    if strcmp(cacheIndexDir, cacheDir)
        % cache hit, return data
        return
    else
        % dir mismatch
        disp('disk_cache: Index in mem is for a different cache dir, clearing');
        memory_cache('clear');
        % just continue
    end
end  
%%% else go on

% if we got here, there is no data in mem, read it
xFileName = fullfile(cacheDir,'_PSCACHE_indexdata.mat');
if exist(xFileName, 'file')
    ds = load(xFileName);

    diskCacheIds = ds.diskCacheIds;
    diskOutdatedKeys = ds.diskOutdatedKeys;
    diskDataClasses = ds.diskDataClasses;

else
    %initialize cache
    diskCacheIds = [];
    diskOutdatedKeys = [];
    diskDataClasses = {};
end


%%% store data in mem cache
wstate = warning('off', 'POSIT:memorycache'); 
indexDat = {diskCacheIds, diskOutdatedKeys, diskDataClasses, cacheDir};
memory_cache('set', indexDat, memCacheKey, now+cputime, 'allowDups');
warning(wstate);
%%%    


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function subSetIndexData(cacheDir, diskCacheIds, diskOutdatedKeys, ...
                         diskDataClasses)

xFileName = fullfile(cacheDir,'_PSCACHE_indexdata');

save(xFileName, '-v6', 'diskCacheIds', 'diskOutdatedKeys', 'diskDataClasses');

%%% store data in mem cache too
memCacheKey = mfilename;
mFileChangedFlag = 'mFileInMemory';
memOutdatedKey = mFileChangedFlag;
wstate = warning('off', 'POSIT:memorycache'); 
indexDat = {diskCacheIds, diskOutdatedKeys, diskDataClasses, cacheDir};
memory_cache('set', indexDat, memCacheKey, now, 'allowDups');
warning(wstate);
%%%    


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% the below functions are used to save/load data from disk.  If I wanted to
% move away from the v6 .MAT format at some point, I just need to replace
% these four

function fName = subCacheFileName(cacheDir, className, idNum)
fName = fullfile(cacheDir, ...
                 sprintf('_PSCACHE_data-class_%s-id_%08d.mat', ...
                         className, idNum));

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function subSaveDataByNum(cacheDir, className, cacheId, idNum, ...
                      cacheOutdatedKey, cacheData);
% Note: each data entry is saved in a separate file

assert(isvarname(className), 'Class name must be a valid var name');
cacheFileName = subCacheFileName(cacheDir, className, idNum);

% always save without compression: slower by a factor of 10 for large files
save(cacheFileName, '-v6', ...
     'className', 'cacheId', 'cacheOutdatedKey', 'cacheData');

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [dat] = subLoadDataByNum(cacheDir, className, cacheId, idNum, ...
                                  cacheOutdatedKey);

assert(isvarname(className), 'Class name must be a valid var name');
cacheFileName = subCacheFileName(cacheDir, className, idNum);

ds = load(cacheFileName);
assert(all(ds.cacheOutdatedKey == cacheOutdatedKey));
assert(all(ds.cacheId == cacheId));

dat = ds.cacheData;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function outSize = subGetDataSize(cacheDir, className, idNum)
cacheFileName = subCacheFileName(cacheDir, className, idNum);
d = dir(cacheFileName);
if ~isempty(d)
    outSize = d.bytes;
else
    error('index inconsistency: file not found?');
end


Contact us at files@mathworks.com