No BSD License  

Highlights from
Merge dataset arrays

image thumbnail
from Merge dataset arrays by Dimitri Shvorob
(in a SQL equijoin)

joindst(x,y,type,varargin)
function[z] = joindst(x,y,type,varargin)
% JOIN     Single-key equality join of two dataset arrays
% Inputs : x,y  - joined datasets
%          type - join type, string 'inner', 'left outer', 'full outer'
%          key  - [optional] key column name, string; if no key is
%                 provided, (assumed single) common column is used
% Outputs: z - dataset array
% Notes  : NaN/Inf values are not matched, and excluded from inner join.
%          Output rows are sorted by join key, with NaN/Inf values last.
%          Within any row of z, columns of x are listed first. Unmatched
%          rows are padded with NaNs
% Example: x = [[unidrnd(5,4,1); nan; inf] [10 20 30 40 50 60]']
%          y = [[unidrnd(5,4,1); nan; inf] [15 25 25 45 55 55]']
%          x = dataset({x,'index','x_value'})
%          y = dataset({y,'index','y_value'})
%          inner      = joindst(x,y,'inner')
%          left_outer = joindst(x,y,'left outer')
%          full_outer = joindst(x,y,'full outer')
% Dimitri Shvorob, dimitri.shvorob@gmail.com, 8/18/08

if ~isa(x,'dataset')
    error('??? Invalid type: ''%s'' not a dataset array.',inputname(1))
end
if ~isa(y,'dataset')
    error('??? Invalid type: ''%s'' not a dataset array.',inputname(2))
end

rows = @(x) size(x,1);

xvars  = get(x,'VarNames');
yvars  = get(y,'VarNames');
xyvars = intersect(xvars,yvars);

assert(~isempty(xyvars),'??? No common columns found in joined datasets.')
assert(isscalar(xyvars),'??? More than one common column found in joined datasets.')

p = inputParser;
p.addOptional('key',[],@ischar);
p.parse(varargin{:});
key = p.Results.key;
if isempty(key)
   key = char(xyvars);
else
   if ~isscalar(key)
       error('??? Multiple join keys not allowed.')
   end
   if ~strmatch(key,xvars)
       error('??? Column ''%s'' not found in dataset ''%s''.',key,inputname(1))
   end
   if ~strmatch(key,yvars)
       error('??? Column ''%s'' not found in dataset ''%s''.',key,inputname(2))
   end
end

[vx,ix] = keys(x.(key));
[vy,iy] = keys(y.(key));

xnan = x(~isfinite(x.(key)),:);
ynan = y(~isfinite(y.(key)),:);

y.(key) = [];
yvars = get(y,'VarNames');

p = size(x,2); xp = nan(1,p);
q = size(y,2); yp = nan(1,q);

z = [x(1,:) y(1,:)]; z(1,:) = [];

switch type
    case 'inner'
        for i = 1:length(vx)
            j = find(vy == vx(i));
            if ~isempty(j)
               m = ix{i};
               n = iy{j};
               xi = tile(x(m,:),rows(n));
               yi = tile(y(n,:),rows(m));
               z  = [z; [xi yi]];                %#ok
            end
        end
    case 'left outer'
        for i = 1:length(vx)
            xi = x(ix{i},:);
            j = find(vy == vx(i));
            if ~isempty(j)
                yi = y(iy{j},:);
            else
                yi = dataset({yp,yvars{:}});
            end
            rx = rows(xi);
            ry = rows(yi);
            xi = tile(xi,ry);
            yi = tile(yi,rx);
            z  = [z; [xi yi]];                   %#ok
        end
        z = [z; ...
             [xnan dataset({tile(yp,rows(xnan)),yvars{:}})]];            %#ok
  case 'full outer'
      v = union(vx,vy);
      for i = 1:length(v)
          j = find(vx == v(i));
          if ~isempty(j)
              xi = x(ix{j},:);
          else
              xi = dataset({xp,xvars{:}});
          end
          j = find(vy == v(i));
          if ~isempty(j)
              yi = y(iy{j},:);
          else
              yi = dataset({yp,yvars{:}});
          end
          xt = tile(xi,rows(yi));
          yt = tile(yi,rows(xi));
          z  = [z; [xt yt]];                     %#ok
      end
      w = dataset({tile(xp,rows(ynan)),xvars{:}});
      w.(key) = [];
      z = [z; ...
           [xnan dataset({tile(yp,rows(xnan)),yvars{:}})]; ...
           [ynan w] ];                           %#ok
  otherwise
      error('??? Join type not recognized.')
end

end

function[key,ind] = keys(x)
    key = unique(x(isfinite(x)));
    ind = cellfun(@(k) find(x == k),num2cell(key),'UniformOutput',false);
end

function[out] = tile(x,n)
    out = x;
    for i = 2:n
        out = [out; x];                          %#ok
    end
end

Contact us at files@mathworks.com