Code covered by the BSD License  

Highlights from
dataview.m

from dataview.m by Peter Cotton
Sparse and possibly normalized representation of a dataset

dataview
classdef dataview

    % A collection of one or more related @datasets, achieving a sparse (and
    % possibly normalized) representation of a larger @dataset
    %
    % Dataviews are compressed by providing a list of "key" variable names
    % (analogous to key fields in a database) and a name for a new integer
    % variable which will substitute for the key fields. This operation can
    % be performed more than once.
    %
    % Any dataset can be converted to a dataview by means of the dataview
    % constructor, retrieved with dataview.full, and added to by means of
    % dataview.insert. The latter operation preserves the existing key lookups.

    %% Motivation note:
    %
    % An ETL tool for relational databases
    % An updateable view of sorts


    %% Implementation note:
    % Key fields using the substituted index head to the "right", whereas
    % the residual data stays on the left. The compressed dataview has the
    % following structure
    %
    %            |
    %            +----- right dataset #1  (key lookup)
    %            |
    %            +----- right dataset #2  (key lookup)
    %            |
    %            +----- right dataset #3  (key lookup)
    %            |
    %            left dataset   (compressed dataset)
    %
    % Each additional compression nests a further dataview inside leftTable,
    % so the lookups form a chain hanging off the left-hand side.

    properties (SetAccess = private, GetAccess = public)
        Properties   % struct with fields Description and VarNames, copied from the source dataset
        leftTable    % residual dataset, or a nested dataview after repeated compression
        rightTable   % key lookup dataset ([] while uncompressed)
        key          % name of the key variable shared by leftTable and rightTable ([] while uncompressed)
    end

    methods

        function dv = dataview(ds)
            % Convert dataset ds to an (uncompressed) dataview.
            %
            % ds   dataset; its Properties.Description and
            %      Properties.VarNames are recorded on the dataview.
            %
            % Called with no argument, an empty dataview is returned so
            % that MATLAB can build default instances of the class.
            if nargin == 0,
                dv.Properties.Description = '';
                dv.Properties.VarNames = {};
                dv.leftTable = [];
            else
                dv.Properties.Description = ds.Properties.Description;
                dv.Properties.VarNames = ds.Properties.VarNames;
                dv.leftTable = ds;
            end
            dv.rightTable = [];
            dv.key = [];
        end

        function dv = compress(dv,cols,name)
            % Replace a set of key variables by a single integer key.
            %
            % dv    dataview
            % cols  listing of variables to eliminate: column numbers, a
            %       cell array of variable names, or a single variable name
            % name  name of new key variable created
            %
            % Errors with 'Cannot find all fields specified' when a
            % requested variable is not present in the left dataset.
            if isa(dv.leftTable,'dataview'),
                % Already compressed more than once: the residual dataset
                % lives further down the chain, so compress there.
                dv.leftTable = compress(dv.leftTable,cols,name);
            else
                if ischar(cols),
                    % A lone name would otherwise be treated as a vector
                    % of character codes, i.e. bogus column numbers.
                    cols = {cols};
                end
                if iscell(cols),
                    [f,cols] = ismember(cols,dv.leftTable.Properties.VarNames);
                end
                if any(cols==0),
                    error('Cannot find all fields specified');
                end

                [left_,right_] = createKey(dv.leftTable,cols,name);
                if isempty(dv.rightTable),
                    % First compression: this dataview holds the lookup.
                    dv.leftTable = left_;
                    dv.rightTable = right_;
                    dv.key = name;
                else
                    % This level already owns a lookup, so wrap the new
                    % left/right pair in a nested dataview on the left.
                    l_ = dataview(dv.leftTable);
                    l_.leftTable = left_;
                    l_.rightTable = right_;
                    l_.key = name;
                    dv.leftTable = l_;
                end

            end
        end

        function dv = insert(dv,ds,name)
            % Insert the rows of dataset ds into dataview dv.
            %
            % dv    dataview
            % ds    dataset holding the rows to add; it must contain the
            %       variables of the table it is inserted into (additional
            %       variables are dropped)
            % name  (optional) Description of the table to insert into.
            %       When omitted, or equal to dv.Properties.Description,
            %       ds is inserted at this level; otherwise the nested
            %       dataviews on the left are searched for a match.
            %
            % Existing key values in the lookup table are preserved; rows
            % of ds whose key fields are new receive fresh key values.

            if isa(ds,'dataview'),
                error('Anticipating dataset');
            end

            if nargin<3 || strcmp(dv.Properties.Description,name),
                % By default we assume dataset ds can be inserted into the
                % dataview at the root level
                if isempty(dv.rightTable),
                    dv.leftTable = concatenateDatasets(dv.leftTable,ds);
                else
                    if isa(dv.rightTable,'dataview'),
                        error('Cannot treat case where right Table is a dataview');
                    end

                    % Ensure dataset ds is suitable for insertion
                    nonKey = setdiff(dv.Properties.VarNames,dv.key);
                    [dummy,loc_] = ismember(nonKey,ds.Properties.VarNames);
                    if any(loc_==0),
                        % loc_ is aligned with nonKey (which setdiff has
                        % sorted), so the missing name must be read from
                        % nonKey rather than from the unsorted VarNames.
                        f = find(loc_==0);
                        error(['Cannot insert. Missing ',nonKey{f(1)}]);
                    else
                        ds = ds(:,loc_);
                    end
                    nonKeyRight = setdiff(dv.rightTable.Properties.VarNames,dv.key);
                    [dummy,nonKeyLoc] = ismember(nonKeyRight,ds.Properties.VarNames);
                    dsNonKeyRight = ds(:,nonKeyLoc);
                    [dummy,nonKeyLocTable] = ismember(nonKeyRight,dv.rightTable.Properties.VarNames);
                    dvNonKeyRight = dv.rightTable(:,nonKeyLocTable);

                    % Create new keys (lookups) where necessary on the right to accommodate the
                    % additional data on the left.
                    bothNonKeyRight = [dvNonKeyRight;dsNonKeyRight];
                    [dsUniqueRight,I,J] = unique(bothNonKeyRight);
                    dvKeyRight = dv.rightTable.(dv.key);
                    nUnique = size(dsUniqueRight,1);
                    uniqueKeys = nan(nUnique,1);
                    [dum,loc] = dataset_ismember(dvNonKeyRight,dsUniqueRight); % Preserve the existing keys
                    uniqueKeys(loc) = dvKeyRight;
                    % Entries still NaN are combinations not seen before;
                    % number them upwards from the largest existing key.
                    nMissingKeys = sum(isnan(uniqueKeys));
                    uniqueKeys(isnan(uniqueKeys)) = nanmax(uniqueKeys)+[1:nMissingKeys]';
                    concatenatedKeys = uniqueKeys(J);
                    % The stacked rows are [existing lookup; ds], so the
                    % keys belonging to ds are the trailing ones.
                    leftKeys = concatenatedKeys(size(dvKeyRight,1)+1:end);

                    % Assign on right
                    uK = dataset(uniqueKeys);
                    uK.Properties.VarNames{1} = dv.key;
                    right_ = [uK,dsUniqueRight];
                    right_.Properties.Description = dsUniqueRight.Properties.Description;
                    dv.rightTable = right_;

                    % Append on left
                    lKeys = dataset(leftKeys);
                    lKeys.Properties.VarNames{1} = dv.key;
                    nonKeyLeft = setdiff(dv.leftTable.Properties.VarNames,dv.key);
                    [dummy,nonKeyLeftLoc] = ismember(nonKeyLeft,ds.Properties.VarNames);
                    dsInsertLeft = [lKeys,ds(:,nonKeyLeftLoc)];
                    dsInsertLeft.Properties.Description = dv.leftTable.Properties.Description;
                    if isa(dv.leftTable,'dataview'),
                        dv.leftTable = insert(dv.leftTable,dsInsertLeft);
                    elseif isa(dv.leftTable,'dataset'),
                        dv.leftTable = concatenateDatasets(dv.leftTable,dsInsertLeft);
                    else
                        error('Anticipated dataview or dataset object');
                    end
                end
            else
                % Search down for a compressed version of the dataview and
                % insert ds there
                if ~isa(dv.leftTable,'dataview'),
                    % Bottom of the chain reached without a match; say so
                    % rather than recursing onto a plain dataset.
                    error('Cannot insert. No table with the requested name was found');
                end
                dv.leftTable = insert(dv.leftTable,ds,name);
            end

        end

        function dv = full(dv)
            % Expand the dataview back into a single dataset.
            %
            % Each level joins its left table to its lookup table on the
            % key variable and then discards the key. Nested dataviews on
            % the left are expanded first.
            if isempty(dv.rightTable)
                % Never compressed: the left table is the whole dataset
                dv = dv.leftTable;
            elseif ~isa(dv.leftTable,'dataview'),
                lc = join(dv.leftTable,dv.rightTable,dv.key);
                lc.(dv.key) = []; % throw away the key!
                dv = lc;
            else
                % Flatten the nested dataview, then retry at this level
                dv.leftTable = full(dv.leftTable);
                dv = full(dv);
            end
        end

    end

    methods (Static)

        function pass = unitTests
            % Run the file's self-tests; true only when all of them pass.
            pass = insertionTest && compressionTest && maximalCompressionTest && ...
                compressionOrderTest && stepByStepCompressionTest && compressedCompressionTest;
        end

        function ds = renameVariablesInDataset(ds,varName,newName,varargin)
            % Rename variables of dataset ds.
            %
            % Supply old and new variable names in key,val,key,val format.
            % A name that is not a variable of ds is reported with disp
            % and otherwise ignored.
            if isempty(varargin),
                col = dsCols(ds,varName);
                % dsCols reports an unknown name as column 0, not as an
                % empty result, so both cases must be tested.
                if isempty(col) || any(col==0),
                    disp(['Cannot rename ',varName]);
                else
                    ds.Properties.VarNames{col} = newName;
                end
            else
                % Handle the first pair, then recurse on the remainder
                ds = dataview.renameVariablesInDataset(ds,varName,newName);
                ds = dataview.renameVariablesInDataset(ds,varargin{:});
            end

        end

    end
end 
   
%% Dataset missing method (Dear Mathworks, please include)
function [tf,loc] = dataset_ismember(A,S)
% Membership test in the spirit of the standard matlab function ismember.
%
% A, S   inputs that can be stacked vertically (datasets in this file)
% tf     tf(i) is true when entry i of A also occurs in S
% loc    position in S of a match for entry i of A, or 0 when there is none
%
% A and S are stacked and passed through unique, whose third output labels
% each stacked entry with an integer code; the comparison is then a plain
% ismember on those codes.
stacked = [A;S];
[unused1,unused2,entryCode] = unique(stacked);
countA = size(A,1);
countS = size(S,1);
% The first countA codes belong to A, the next countS to S
[tf,loc] = ismember(entryCode(1:countA),entryCode(countA+(1:countS)));
end


%% Non-standard, private, uninteresting, quirky dataset manipulation methods
function [colNos,selectDs] = dsCols(originalDs,varNames)
% Retrieve the column numbers corresponding to variable names and,
% on request, the corresponding sub-dataset.
%
% originalDs  dataset (only Properties.VarNames is read unless the second
%             output is requested)
% varNames    a single name, or a cell array of names
% colNos      position of each name in originalDs.Properties.VarNames,
%             0 for a name that is not present
% selectDs    originalDs(:,colNos); only computed when asked for
requested = varNames;
if ischar(requested),
    requested = {requested};   % treat a lone name as a one-element list
end
[wasFound,colNos] = ismember(requested,originalDs.Properties.VarNames);
if nargout<2,
    return
end
selectDs = originalDs(:,colNos);
end

function ds = concatenateDatasets(ds,ds1)
% Fast and loose concatenation of datasets.
%
% The variables named in ds are picked out of ds1 (in ds's order) and the
% resulting rows are appended below ds. ds1 must therefore contain at
% least the variables in ds, but may contain more.
wanted = ds.Properties.VarNames;
[unusedCols,matchingPart] = dsCols(ds1,wanted);
ds = vertcat(ds,matchingPart);
end

function [leftDs,rightDs] = createKey(originalDs,keyCols,keyName)
% Remove some columns from the original dataset, replacing them by
% a key into a newly created lookup table.
%
% originalDs  dataset to split
% keyCols     array of integer column numbers to move to the lookup
% keyName     name given to the key variable in both outputs
% leftDs      key variable followed by the remaining columns of originalDs
% rightDs     key variable (1..number of lookup rows) followed by the
%             distinct combinations found in the key columns

% Distinct key-column combinations, and for every original row the index
% of its combination
[distinctRows,unusedIdx,rowToKey] = unique(originalDs(:,keyCols));

% Lookup table: one row per combination, numbered consecutively
nKeys = size(distinctRows,1);
rightDs = [dataset({(1:nKeys)',keyName}),distinctRows];

% Residual table: the key replaces the removed columns
allCols = 1:size(originalDs,2);
residualCols = setdiff(allCols,keyCols);
leftDs = [dataset({rowToKey,keyName}),originalDs(:,residualCols)];
end


%% Examples and tests
function dv = dvZooExample
% Build the zoo example as a dataview compressed twice: first on
% animals/age (key 'category'), then on color (key 'color_index').
dv = dataview(dsZooExample());
dv = compress(dv,{'animals','age'},'category');
dv = compress(dv,{'color'},'color_index');
end

function ds = dsZooExample
% Small example dataset with variables animals, age, color and legs.
% The local variable names below are significant: the dataset's variable
% names are taken from them, so they must not be renamed.
legs    = [4; 4; 2; 3];
color   = {'blue'; 'red'; 'pink'; 'blue'};
age     = [17; 13; 14; 17];
animals = {'dog'; 'cat'; 'kitten'; 'dog'};
ds = dataset(animals,age,color,legs);
end

function ds = dsFriendsExample
% Second example dataset, to be inserted into the zoo example.
% It carries one variable ('extraneous') that the zoo dataset lacks, and
% its first three variables are created under "new*" names and then
% renamed to animals/age/color. The local names are what the dataset
% initially uses as variable names, so they are kept as they are.
extraneous = [123; 124; 213];
legs       = [3; 3; 12];
newColor   = {'blue'; 'purple'; 'red'};
newAge     = [17; 12; 11];
newAnimals = {'dog'; 'cow'; 'lamb'};
ds = dataset(newAnimals,newAge,newColor,extraneous,legs);
ds = dataview.renameVariablesInDataset(ds, ...
    'newAnimals','animals', ...
    'newAge','age', ...
    'newColor','color');
end

function result = insertionTest
% Inserting into an already compressed dataview and expanding it again
% must give the same rows as plain dataset concatenation.
base = dsZooExample();
extra = dsFriendsExample();
compressed = insert(dvZooExample(),extra);   % insert into compressed dataview
keepCols = [1,2,3,5];                        % every column of extra except 'extraneous'
expected = [base;extra(:,keepCols)];
result = dsEqual(full(compressed),expected);
end

function result = compressionTest
% Insert first, compress twice afterwards; the expanded dataview must
% match plain dataset concatenation.
base = dsZooExample();
extra = dsFriendsExample();
dvw = insert(dataview(base),extra);          % insert into uncompressed dataview
dvw = compress(dvw,{'animals','age'},'category');
dvw = compress(dvw,{'color'},'color_index');
keepCols = [1,2,3,5];                        % every column of extra except 'extraneous'
expected = [base;extra(:,keepCols)];
result = dsEqual(full(dvw),expected);
end

function result = compressedCompressionTest
% Compress on two sets of variables, then compress the two resulting keys
% themselves into a third key; expansion must still recover the data.
base = dsZooExample();
extra = dsFriendsExample();
dvw = insert(dataview(base),extra);          % insert into uncompressed dataview
steps = { {'animals','age'},          'category'; ...
          {'color'},                  'color_index'; ...
          {'category','color_index'}, 'unique_animal' };
for k = 1:size(steps,1)
    dvw = compress(dvw,steps{k,1},steps{k,2});
end
keepCols = [1,2,3,5];                        % every column of extra except 'extraneous'
expected = [base;extra(:,keepCols)];
result = dsEqual(full(dvw),expected);
end

function result = maximalCompressionTest
% Compress on animals, age and color in one step; expansion must recover
% the concatenated data.
base = dsZooExample();
extra = dsFriendsExample();
dvw = insert(dataview(base),extra);          % insert into uncompressed dataview
dvw = compress(dvw,{'animals','age','color'},'category');
keepCols = [1,2,3,5];                        % every column of extra except 'extraneous'
expected = [base;extra(:,keepCols)];
result = dsEqual(full(dvw),expected);
end

function result = stepByStepCompressionTest
% Compress one variable at a time (animals, then age, then color);
% expansion must recover the concatenated data.
base = dsZooExample();
extra = dsFriendsExample();
dvw = insert(dataview(base),extra);          % insert into uncompressed dataview
steps = { {'animals'}, 'category'; ...
          {'age'},     'ancienty'; ...
          {'color'},   'color_index' };
for k = 1:size(steps,1)
    dvw = compress(dvw,steps{k,1},steps{k,2});
end
keepCols = [1,2,3,5];                        % every column of extra except 'extraneous'
expected = [base;extra(:,keepCols)];
result = dsEqual(full(dvw),expected);
end

function result = compressionOrderTest
% Apply the same two compressions in both orders. Passes when the two
% dataviews expand to equal datasets and their innermost left datasets
% are equal as well.
base = dsZooExample();
extra = dsFriendsExample();

% Order 1: animals/age first, then color
categoryFirst = insert(dataview(base),extra);
categoryFirst = compress(categoryFirst,{'animals','age'},'category');
categoryFirst = compress(categoryFirst,{'color'},'color_index');

% Order 2: color first, then animals/age
colorFirst = insert(dataview(base),extra);
colorFirst = compress(colorFirst,{'color'},'color_index');
colorFirst = compress(colorFirst,{'animals','age'},'category');

expandedMatch = dsEqual(full(categoryFirst),full(colorFirst));
% After two compressions leftTable is itself a dataview, so the residual
% dataset sits one level further down.
residualMatch = dsEqual(categoryFirst.leftTable.leftTable,colorFirst.leftTable.leftTable);

result = expandedMatch && residualMatch;
end

function result = retrievalTest
% Insert into the compressed zoo dataview, then look up the table named
% 'category'; passes when something non-empty comes back.
%
% NOTE(review): getTableByName is not defined in the visible part of this
% file and this test is not part of dataview.unitTests - confirm the
% helper exists before relying on this test.
extra = dsFriendsExample();
compressed = insert(dvZooExample(),extra);   % insert into compressed dataview
found = getTableByName(compressed,'category');
result = ~isempty(found);
end

function result = dsEqual(A,B)
% Sloppy check for equality in datasets.
%
% A, B     datasets; A must contain every variable of B (any additional
%          variables of A are ignored)
% result   true when A and B have the same number of rows and row i of A
%          (restricted to B's variables) equals row i of B for every i;
%          false otherwise, including when the comparison itself errors
%          (for example because A lacks one of B's variables)
result = false;
try
    % Without this check the element-wise comparison below would silently
    % scalar-expand when one side has a single row, and datasets of
    % different heights could be reported as equal.
    if size(A,1) ~= size(B,1),
        return
    end
    [dummy,A_] = dsCols(A,B.Properties.VarNames);
    C = [A_;B];
    nA = size(A,1);
    % unique labels every stacked row with an integer code; equal rows
    % share a code, so comparing codes compares rows
    [dummy,dummy,J] = unique(C);
    jA = J(1:nA);
    jB = J(nA+1:end);
    result = all(jA==jB);
catch
    result = false;
end
end


Contact us at files@mathworks.com