classdef dataview
    % A collection of one or more related @datasets, achieving a sparse (and
    % possibly normalized) representation of a larger @dataset
    %
    % Dataviews are compressed by providing a list of "key" variable names
    % (analogous to key fields in a database) and a name for a new integer
    % variable which will substitute for the key fields. This
    % operation can be performed more than once.
    %
    % Any dataset can be converted to a dataview by means of the dataview
    % constructor, retrieved with dataview.full, and added to by means of
    % dataview.insert. The latter operation preserves the existing key lookups.
    %% Motivation note:
    %
    % An ETL tool for relational databases
    % An updateable view of sorts
    %% Implementation note:
    % Key fields using the substituted index head to the "right", whereas
    % the residual data stays on the left. The compressed dataview has the
    % following structure
    %
    % |
    % +----- right dataset #1 (key lookup)
    % |
    % +----- right dataset #2 (key lookup)
    % |
    % +----- right dataset #3 (key lookup)
    % |
    % left dataset (compressed dataset)
    %
    %
    properties (SetAccess = private, GetAccess = public)
        Properties   % Description and VarNames copied from the dataset given to the constructor
        leftTable    % residual dataset, or a nested dataview after repeated compression
        rightTable   % key lookup dataset; [] until the first compression
        key          % name of the key variable shared by left and right; [] until compressed
    end
    methods
        function dv = dataview(ds)
            % Convert dataset to dataview
            %
            % ds    dataset; its Description and VarNames are recorded and
            %       the dataset itself becomes the (uncompressed) left table
            dv.Properties.Description = ds.Properties.Description;
            dv.Properties.VarNames = ds.Properties.VarNames;
            dv.leftTable = ds;
            dv.rightTable = [];
            dv.key = [];
        end
        function dv = compress(dv,cols,name)
            % Replace a set of variables by a single key variable
            %
            % dv    dataview
            % cols  listing of variables to eliminate: column numbers, a
            %       cell array of variable names, or a single variable name
            % name  name of new key variable created
            %
            % Raises an error when a named variable is not found in the
            % innermost left dataset.
            if isa(dv.leftTable,'dataview'),
                % Already compressed more than once: work on the innermost level
                dv.leftTable = compress(dv.leftTable,cols,name);
            else
                if ischar(cols),
                    % A lone variable name is treated as a one-element list
                    cols = {cols};
                end
                if iscell(cols),
                    % Translate names to column numbers; 0 marks "not found"
                    [found,cols] = ismember(cols,dv.leftTable.Properties.VarNames);
                end
                if any(cols==0),
                    error('Cannot find all fields specified');
                end
                [left_,right_] = createKey(dv.leftTable,cols,name);
                if isempty(dv.rightTable),
                    % First compression: store the lookup at this level
                    dv.leftTable = left_;
                    dv.rightTable = right_;
                    dv.key = name;
                else
                    % Later compression: nest a new dataview on the left
                    l_ = dataview(dv.leftTable);
                    l_.leftTable = left_;
                    l_.rightTable = right_;
                    l_.key = name;
                    dv.leftTable = l_;
                end
            end
        end
        function dv = insert(dv,ds,name)
            % Insert dataset ds into dataview dv, optionally specifying a
            % table name
            %
            % dv    dataview receiving the rows
            % ds    dataset holding the new rows; it must contain every
            %       variable recorded in dv.Properties.VarNames
            % name  optional; when given and different from
            %       dv.Properties.Description the insertion is delegated to
            %       dv.leftTable
            %
            % Existing key values in the right table are preserved; new
            % key values are allocated above the current maximum.
            if isa(ds,'dataview'),
                error('Anticipating dataset');
            end
            if nargin<3 || strcmp(dv.Properties.Description,name),
                % By default we assume dataset ds can be inserted into the
                % dataview at the root level
                if isempty(dv.rightTable),
                    % Uncompressed: plain row concatenation
                    dv.leftTable = concatenateDatasets(dv.leftTable,ds);
                else
                    % Ensure dataset ds is suitable for insertion
                    nonKey = setdiff(dv.Properties.VarNames,dv.key);
                    [dummy,loc_] = ismember(nonKey,ds.Properties.VarNames);
                    if any(loc_==0),
                        f = find(loc_==0);
                        % f indexes the (sorted) nonKey list, so the name
                        % must be read from nonKey
                        error(['Cannot insert. Missing ',nonKey{f(1)}]);
                    else
                        ds = ds(:,loc_);
                    end
                    nonKeyRight = setdiff(dv.rightTable.Properties.VarNames,dv.key);
                    [dummy,nonKeyLoc] = ismember(nonKeyRight,ds.Properties.VarNames);
                    dsNonKeyRight = ds(:,nonKeyLoc);
                    [dummy,nonKeyLocTable] = ismember(nonKeyRight,dv.rightTable.Properties.VarNames);
                    if isa(dv.rightTable,'dataview'),
                        error('Cannot treat case where right Table is a dataview');
                    end
                    dvNonKeyRight = dv.rightTable(:,nonKeyLocTable);
                    % Create new keys (lookups) where necessary on the right to accommodate the
                    % additional data on the left.
                    bothNonKeyRight = [dvNonKeyRight;dsNonKeyRight];
                    [dsUniqueRight,I,J] = unique(bothNonKeyRight);
                    dvKeyRight = dv.rightTable.(dv.key);
                    nUnique = size(dsUniqueRight,1);
                    uniqueKeys = nan(nUnique,1);
                    [dum,loc] = dataset_ismember(dvNonKeyRight,dsUniqueRight); % Preserve the existing keys
                    uniqueKeys(loc) = dvKeyRight;
                    % Rows still NaN are new combinations: number them after the largest existing key
                    nMissingKeys = sum(isnan(uniqueKeys));
                    uniqueKeys(isnan(uniqueKeys)) = nanmax(uniqueKeys)+[1:nMissingKeys]';
                    concatenatedKeys = uniqueKeys(J);
                    % The tail of the concatenation corresponds to the rows of ds
                    leftKeys = concatenatedKeys(size(dvKeyRight,1)+1:end);
                    % Assign on right
                    uK = dataset(uniqueKeys);
                    uK.Properties.VarNames{1} = dv.key;
                    right_ = [uK,dsUniqueRight];
                    right_.Properties.Description = dsUniqueRight.Properties.Description;
                    dv.rightTable = right_;
                    % Append on left
                    lKeys = dataset(leftKeys);
                    lKeys.Properties.VarNames{1} = dv.key;
                    nonKeyLeft = setdiff(dv.leftTable.Properties.VarNames,dv.key);
                    [dummy,nonKeyLeftLoc] = ismember(nonKeyLeft,ds.Properties.VarNames);
                    dsInsertLeft = [lKeys,ds(:,nonKeyLeftLoc)];
                    dsInsertLeft.Properties.Description = dv.leftTable.Properties.Description;
                    if isa(dv.leftTable,'dataview'),
                        dv.leftTable = insert(dv.leftTable,dsInsertLeft);
                    elseif isa(dv.leftTable,'dataset'),
                        dv.leftTable = concatenateDatasets(dv.leftTable,dsInsertLeft);
                    else
                        error('Anticipated dataview or dataset object');
                    end
                end
            else
                % Search down for a compressed version of the dataview and
                % insert ds there
                % NOTE(review): this presumes dv.leftTable is a dataview;
                % confirm what should happen when it is a plain dataset.
                dv.leftTable = insert(dv.leftTable,ds,name);
            end
        end
        function dv = full(dv)
            % Expand the dataview back into a single dataset
            %
            % Each level joins its left table to its right table on the
            % key and then removes the key variable; nested dataviews on
            % the left are expanded first.
            if isempty(dv.rightTable)
                dv = dv.leftTable;
            elseif ~isa(dv.leftTable,'dataview'),
                lc = join(dv.leftTable,dv.rightTable,dv.key);
                lc.(dv.key) = []; % throw away the key!
                dv = lc;
            else
                dv.leftTable = full(dv.leftTable);
                dv = full(dv);
            end
        end
    end
    methods (Static)
        function pass = unitTests
            % Run the file-local test functions; true only when all pass
            pass = insertionTest && compressionTest && maximalCompressionTest && ...
                compressionOrderTest && stepByStepCompressionTest && compressedCompressionTest;
        end
        function ds = renameVariablesInDataset(ds,varName,newName,varargin)
            % Rename one or more variables of a dataset
            %
            % Supply old and new variable names in key,val,key,val format.
            % A variable that cannot be found is reported with disp and
            % left untouched.
            if isempty(varargin),
                col = dsCols(ds,varName);
                % ismember (inside dsCols) reports a missing name as 0,
                % not as an empty result
                if isempty(col) || any(col==0)
                    disp(['Cannot rename ',varName]);
                else
                    ds.Properties.VarNames{col} = newName;
                end
            else
                % Handle the first pair, then recurse on the remaining pairs
                ds = dataview.renameVariablesInDataset(ds,varName,newName);
                ds = dataview.renameVariablesInDataset(ds,varargin{:});
            end
        end
    end
end
%% Dataset missing method (Dear Mathworks, please include)
function [tf,loc] = dataset_ismember(A,S)
% Membership test in the spirit of the built-in ismember.
%
% A is stacked on top of S, every row is labelled with the index of its
% unique value, and the labels of A are then looked up among the labels
% of S.
%
% tf    true where a row of A also occurs in S
% loc   position in S of the matching row (0 where there is none)
rowsInA = size(A,1);
rowsInS = size(S,1);
stacked = [A;S];
[unusedVals,unusedIdx,label] = unique(stacked);
labelS = label(rowsInA+1:rowsInA+rowsInS);
labelA = label(1:rowsInA);
[tf,loc] = ismember(labelA,labelS);
end
%% Non-standard, private, uninteresting, quirky dataset manipulation methods
function [colNos,selectDs] = dsCols(originalDs,varNames)
% Look up the column numbers of the named variables.
%
% originalDs   object exposing Properties.VarNames
% varNames     one variable name, or a cell array of names
% colNos       position of each name in Properties.VarNames (0 if absent)
% selectDs     originalDs restricted to those columns; only built when
%              the caller asks for a second output
if ischar(varNames),
    % A single name is handled as a one-element list
    varNames = {varNames};
end
available = originalDs.Properties.VarNames;
[wasFound,colNos] = ismember(varNames,available);
if nargout>=2,
    selectDs = originalDs(:,colNos);
end
end
function ds = concatenateDatasets(ds,ds1)
% Append the rows of ds1 to ds.
%
% Only the variables that ds already has are taken from ds1, in the
% column order of ds; ds1 may carry additional variables, which are
% ignored. No further compatibility checks are made.
wanted = ds.Properties.VarNames;
[unusedCols,matching] = dsCols(ds1,wanted);
ds = vertcat(ds,matching);
end
function [leftDs,rightDs] = createKey(originalDs,keyCols,keyName)
% Split a dataset in two tables linked by a newly created integer key.
%
% originalDs   dataset to split
% keyCols      column numbers to move out into the lookup table
% keyName      variable name given to the new key column
% leftDs       key column followed by the columns not listed in keyCols
% rightDs      key column (1..number of unique rows) followed by the
%              unique rows of the keyCols columns
allCols = 1:size(originalDs,2);
keptCols = setdiff(allCols,keyCols);
% rowToKey(i) is the row of uniqueRows that matches row i of originalDs
[uniqueRows,firstIdx,rowToKey] = unique(originalDs(:,keyCols));
nKeys = size(uniqueRows,1);
rightDs = [dataset({(1:nKeys)',keyName}),uniqueRows];
leftDs = [dataset({rowToKey,keyName}),originalDs(:,keptCols)];
end
%% Examples and tests
function dv = dvZooExample
% Example fixture: the zoo dataset wrapped in a dataview and compressed
% twice, first on animals+age (key 'category'), then on color (key
% 'color_index').
dv = compress(dataview(dsZooExample),{'animals','age'},'category');
dv = compress(dv,{'color'},'color_index');
end
function ds = dsZooExample
% Example fixture: a four-row dataset with variables animals, age,
% color and legs.
% The local variable names are passed straight to dataset(), so they
% are kept exactly as the column names the tests rely on.
legs = [4 4 2 3]';
color = {'blue';'red';'pink';'blue'};
age = [17 13 14 17]';
animals = {'dog';'cat';'kitten';'dog'};
ds = dataset(animals,age,color,legs);
end
function ds = dsFriendsExample
% Example fixture: three additional rows for the zoo dataset, carrying
% one extra variable ('extraneous') in column 4.
% The first three variables are created under temporary names and then
% renamed to animals, age and color.
extraneous = [123 124 213]';
legs = [3 3 12]';
newColor = {'blue';'purple';'red'};
newAge = [17;12;11];
newAnimals = {'dog';'cow';'lamb'};
ds = dataset(newAnimals,newAge,newColor,extraneous,legs);
ds = dataview.renameVariablesInDataset(ds, ...
    'newAnimals','animals', ...
    'newAge','age', ...
    'newColor','color');
end
function result = insertionTest
% Insert rows into an already compressed dataview and check that
% expanding it gives the same rows as plain dataset concatenation.
baseline = dsZooExample;
newcomers = dsFriendsExample;
compressed = insert(dvZooExample,newcomers);   % insert into compressed dataview
expanded = full(compressed);                   % convert back to dataset
% Reference result: stack the rows directly, leaving out column 4
expected = [baseline;newcomers(:,[1,2,3,5])];
result = dsEqual(expanded,expected);
end
function result = compressionTest
% Insert first, then compress twice, and check that expanding the
% dataview gives the same rows as plain dataset concatenation.
baseline = dsZooExample;
newcomers = dsFriendsExample;
compressed = insert(dataview(baseline),newcomers);   % insert into dataview
compressed = compress(compressed,{'animals','age'},'category');
compressed = compress(compressed,{'color'},'color_index');
expanded = full(compressed);                         % convert back to dataset
% Reference result: stack the rows directly, leaving out column 4
expected = [baseline;newcomers(:,[1,2,3,5])];
result = dsEqual(expanded,expected);
end
function result = compressedCompressionTest
% Compress twice, then compress the two resulting key variables into a
% third key, and check that expanding still reproduces the plain
% dataset concatenation.
baseline = dsZooExample;
newcomers = dsFriendsExample;
compressed = insert(dataview(baseline),newcomers);   % insert into dataview
compressed = compress(compressed,{'animals','age'},'category');
compressed = compress(compressed,{'color'},'color_index');
compressed = compress(compressed,{'category','color_index'},'unique_animal');
expanded = full(compressed);                         % convert back to dataset
% Reference result: stack the rows directly, leaving out column 4
expected = [baseline;newcomers(:,[1,2,3,5])];
result = dsEqual(expanded,expected);
end
function result = maximalCompressionTest
% Compress animals, age and color into one key in a single step and
% check that expanding reproduces the plain dataset concatenation.
baseline = dsZooExample;
newcomers = dsFriendsExample;
compressed = insert(dataview(baseline),newcomers);   % insert into dataview
compressed = compress(compressed,{'animals','age','color'},'category');
expanded = full(compressed);                         % convert back to dataset
% Reference result: stack the rows directly, leaving out column 4
expected = [baseline;newcomers(:,[1,2,3,5])];
result = dsEqual(expanded,expected);
end
function result = stepByStepCompressionTest
% Compress one variable at a time (animals, age, color) and check that
% expanding reproduces the plain dataset concatenation.
baseline = dsZooExample;
newcomers = dsFriendsExample;
compressed = insert(dataview(baseline),newcomers);   % insert into dataview
compressed = compress(compressed,{'animals'},'category');
compressed = compress(compressed,{'age'},'ancienty');
compressed = compress(compressed,{'color'},'color_index');
expanded = full(compressed);                         % convert back to dataset
% Reference result: stack the rows directly, leaving out column 4
expected = [baseline;newcomers(:,[1,2,3,5])];
result = dsEqual(expanded,expected);
end
function result = compressionOrderTest
% Apply the same two compressions in opposite orders and check that
% both the expanded datasets and the innermost left tables agree.
baseline = dsZooExample;
newcomers = dsFriendsExample;
% Order 1: animals+age first, color second
categoryFirst = insert(dataview(baseline),newcomers);
categoryFirst = compress(categoryFirst,{'animals','age'},'category');
categoryFirst = compress(categoryFirst,{'color'},'color_index');
% Order 2: color first, animals+age second
colorFirst = insert(dataview(baseline),newcomers);
colorFirst = compress(colorFirst,{'color'},'color_index');
colorFirst = compress(colorFirst,{'animals','age'},'category');
sameWhenExpanded = dsEqual(full(categoryFirst),full(colorFirst));
sameInnerLeft = dsEqual(categoryFirst.leftTable.leftTable, ...
    colorFirst.leftTable.leftTable);
result = sameWhenExpanded && sameInnerLeft;
end
function result = retrievalTest
% Insert rows into the compressed example dataview and check that a
% table can be fetched by the name 'category'.
% NOTE(review): getTableByName is not defined in the part of the file
% visible here -- confirm it exists before relying on this test.
populated = insert(dvZooExample,dsFriendsExample);   % insert into compressed dataview
fetched = getTableByName(populated,'category');
result = ~isempty(fetched);
end
function result = dsEqual(A,B)
% Loose, row-by-row equality check for two datasets.
%
% The columns of A are first put in the variable order of B. Both are
% then stacked, every row is labelled with the index of its unique
% value, and the labels of A are compared position by position with
% those of B. Any error raised along the way counts as "not equal".
result = 0;
try
    [unusedCols,alignedA] = dsCols(A,B.Properties.VarNames);
    stacked = [alignedA;B];
    rowsInA = size(A,1);
    [unusedVals,unusedIdx,label] = unique(stacked);
    labelA = label(1:rowsInA);
    labelB = label(rowsInA+1:end);
    result = all(labelA==labelB);
catch
    result = 0;
end
end