Code covered by the BSD License  

Highlights from
matml

image thumbnail
from matml by Ben Mitch
Convert between matlab variables and MatML (XML) files

matml(varargin)
function [result, ext] = matml(varargin)

% [result, ext] = matml('file', file, ...)
%
% Read a MatML file to a matlab variable, and optionally
% return file information (in ext.info, see below).
% ________________________________
%
% [result, ext] = matml('text', text, ...)
%
% Read a MatML text to a matlab variable.
% ________________________________
%
% [result, ext] = matml('data', data, ...)
%
% Convert a variable into MatML text.
% ________________________________
%
% [result, ext] = matml('data', data, 'file', file, ...)
%
% Convert a variable into MatML and write it to a file.
% ________________________________
%
% Any form can take one additional named argument, 'ext',
% which should be a structure with any of the fields listed
% below. Fields that are not supplied take their default
% values. The ext object is always returned, possibly
% modified, at the output.
%
% 'precision'
%   output precision (significant figures)
%
% 'root'
%   tag name for root element ("MatMLDocument" by
%   default)
%
% 'binaryfile'
%   if present, the document can be written in
%   unencapsulated form (with links out to additional binary
%   files). binary files, if generated, will be named as
%   <binaryfile>.####. these are quicker to read and write.
%
% 'binaryindex'
%   if present, is the first index used in constructing
%   binary file filenames. if absent, numbering starts at
%   one.
%
% 'attr',
%   structure of additional attributes to add to the root
%   node.
%
% When reading from a file, the returned ext additionally
% carries a field 'info' with the sub-fields
%   version       value of the root "Version" attribute (NaN if absent)
%   created       datenum of the root "Created" attribute (NaN if absent)
%   bytes.matlab  size in bytes of the variable in memory
%   bytes.matml   size in bytes of the MatML file on disk
% ________________________________
%
% test = matml
%
% Return a suitably convoluted variable for testing.



version = 7;

% Author: Ben Mitch
% URL: http://tinyurl.com/6m6qjy
% Modified: 20/12/2010
%
% Version: 7
%
% * removed GPL licensed component, so that BSD license now
% applies to all source code in this release
%
% Version: 6
%
% * fixed bug in safe/unsafe implementation
%
% Version: 5
%
% * added root node attributes
% * added encapsulation
%
% Version: 4
%
% * updated usage options to allow read/write from a text
% string rather than a file (breaks old interface)
%
% Version: 3
%
% * corrected help formatting
% * fixed incorrect Version in write
% * changed URL to tinyurl
% * specified encoding in xml header
%
% Version: 2
%
% * reviewed m-lint warnings and fixed some
% * correctly handle trailing singletons in all cases
% * improved code for reconstructing structures
% * changed parser to handle comments correctly
% * deprecated missing sz [0 0] attribute (too weird)
% * added basic XML header
% * added "written by" comment
%
% Version: 1



% usage
if nargout == 0 && nargin == 0
	
	% invoke help
	help matml
	
elseif nargout == 1 && nargin == 0
	
	% return a test variable
	result = [];
	result.a = {1 2 3 'the quick brown fox'};
	result.b = {result result [] {} 'jumped over'};
	result.aLongerName = result;
	result.with_us = result;
	result.nonscalar = [];
	result.nonscalar(1,2,3).f1 = result.b;
	result.simpletext = 'the lazy dog';
	result.cpx = 1 + 9i;
	result.matrix = rand(1,2,3,4);
	result.nonscalarcell = {1 2 3; 4 'foo' {'bar'}};
	
else
	
	% default result is empty
	result = [];
	
	% otherwise, expect key/value pairs
	if mod(nargin, 2)
		error('expects an even number of arguments')
	end
	
	% defaults
	file = [];
	data = [];
	text = [];
	
	% extended object (defaults; caller-supplied fields are merged in below)
	ext = [];
	ext.precision = 60;
	ext.root = 'MatMLDocument';
	ext.binaryfile = [];
	ext.binaryindex = 1;
	ext.attr = struct();
	
	% parse key/value pairs
	for n = 1:2:nargin
		
		key = varargin{n};
		val = varargin{n+1};
		
		switch key
			
			case 'file'
				file = val;
			
			case 'data'
				data = val;
			
			case 'text'
				text = val;
				
			case 'ext'
				% merge rather than replace, so that the caller may supply
				% any subset of the fields and the rest keep their defaults
				if isstruct(val) && numel(val) == 1
					supplied = fieldnames(val);
					for k = 1:length(supplied)
						ext.(supplied{k}) = val.(supplied{k});
					end
				elseif ~isempty(val)
					error('"ext" must be a scalar structure');
				end
				
			otherwise
				error(['unrecognised key "' key '"'])
			
		end
		
	end
	
	% was data supplied?
	if ~isempty(data)
		
		% attributes
		attr = '';
		if ~isempty(ext.attr)
			f = fieldnames(ext.attr);
			for n = 1:length(f)
				key = f{n};
				if length(key) < 2
					error('additional root attributes must have keys of at least two characters (one character keys are reserved for MatML)');
				end
				val = ext.attr.(key);
				if isnumeric(val)
					val = num2str(val);
				end
				if ischar(val)
					% a raw quote or angle bracket would break the root tag
					val = matml_xmlsafe(val);
				end
				attr = [attr ' ' key '="' val '"'];
			end
		end
		attr = [attr ' Version="' int2str(version) '"'];
		attr = [attr ' Created="' datestr(now) '"'];
		if ext.precision ~= 60
			attr = [attr ' Precision="' num2str(ext.precision) '"'];
		end

		% root tag
		[xml, ext] = matml_write(ext.root, data, '', attr, ext);

		% comment
		comment = [ ...
			'<!-- ' ...
			'written by matml.m (http://tinyurl.com/6m6qjy)' ...
			' -->' ...
			];
		
		% write file
		if ~isempty(file)
			
			fid = fopen(file, 'w');
			if fid == -1
				error(['failed open "' file '"']);
			end

			% in fact, we write only US-ASCII characters (in the range
			% 32-127), but this is a subset of UTF-8, and it is not clear
			% how to indicate US-ASCII encoding, so...
			try
				fwrite(fid, '<?xml version="1.0" encoding="UTF-8"?>');
				fwrite(fid, comment);
				fwrite(fid, xml);
			catch err
				% do not leak the handle if a write fails
				fclose(fid);
				rethrow(err);
			end
			fclose(fid);
			
		else
			
			% return result
			result = xml;
			
		end

	else
		
		% read from file
		if ~isempty(file)
			text = matml_file2array(file);
			if isempty(text)
				error('specified XML file not valid (no content)');
			end
		end
		
		% read from text
		if ~isempty(text)
			xml = matml_xmlread(text);
			result = matml_matmlparse(xml);
		end
		
		% info (only available when reading from a file); returned to the
		% caller through ext.info
		if ~isempty(file)
			d = dir(file);
			w = whos('result');
			info = [];
			info.version = NaN;
			info.created = NaN;
			if isfield(xml.attr, 'Version')
				info.version = str2double(xml.attr.Version);
			end
			if isfield(xml.attr, 'Created')
				info.created = datenum(xml.attr.Created);
			end
			info.bytes.matlab = w.bytes;
			info.bytes.matml = d.bytes;
			ext.info = info;
		end

	end
	
end






function [xml, writer] = matml_write(tagName, arr, path, attr, writer)

% Serialise one matlab value as a MatML element, recursing into cell and
% struct contents.
%
%   tagName  name of the element to emit
%   arr      value to serialise
%   path     slash-terminated path of the enclosing nodes ('' at the
%            root); used only to build error messages
%   attr     extra attribute text (with leading space) to place in the
%            opening tag, or ''
%   writer   options structure; the fields read here are 'precision',
%            'binaryfile' and 'binaryindex'
%
% Returns the XML text and the writer structure, whose 'binaryindex' is
% advanced once for every binary side-file written.

sz = size(arr);

% most MatML fields are standardised arrays, but we allow a
% special case for simple strings for readability and brevity,
% that is just a tag with no attributes and the string as content.
if ischar(arr) && length(sz) == 2 && sz(1) == 1 && all(arr >= 32) && all(arr <= 127)
	xml = ['<' tagName attr '>' matml_xmlsafe(arr) '</' tagName '>'];
	return
end

% element count, and size with trailing singleton dimensions removed
numels = prod(sz);
while length(sz) > 1 && sz(end) == 1
	sz = sz(1:end-1);
end

% in MatML, scalar can have missing sz attribute
if numels == 1
	szattr = '';
else
	szattr = [' b="' matml_numtostr(sz, []) '"'];
end

% path handed to children: the whole chain of ancestors, so that the
% error raised for an unsupported class names the full node location
childpath = [path tagName '/'];

% it's worth summarising the protocol for identifying the type of a
% matml node. the c attribute can only be left out in two cases, a
% non-empty string or a structure element, therefore on encountering a
% node we can assess its type as follows:
%
% 1) if explicitly specified using c="type"
% 2) otherwise, if it has text, c="char" and b="1 length(text)"
% 3) otherwise, c="struct" (a struct element can never have text)

cls = class(arr);

switch cls

	case 'cell'
		xml = ['<' tagName attr ' c="y"' szattr '>'];
		for n = 1:numels
			[subxml, writer] = matml_write('m', arr{n}, childpath, '', writer);
			xml = [xml subxml];
		end
		xml = [xml '</' tagName '>'];

	case 'struct'
		% field names go in the a attribute, each terminated by ';'
		fn = fieldnames(arr);
		fns = ' a="';
		for f = 1:length(fn)
			fns = [fns fn{f} ';'];
		end
		fns = [fns '"'];
		xml = ['<' tagName attr ' c="z"' fns szattr '>'];
		% children are emitted element by element, fields in order
		for n = 1:numels
			for f = 1:length(fn)
				[subxml, writer] = matml_write('m', arr(n).(fn{f}), childpath, '', writer);
				xml = [xml subxml];
			end
		end
		xml = [xml '</' tagName '>'];

	case {'char' 'single' 'double' 'int8' 'int16' 'int32' 'int64' 'uint8' 'uint16' 'uint32' 'uint64' 'logical'}
		
		% one-letter MatML class code
		switch cls
			case 'single', cls = 'f';
			case 'double', cls = 'd';
			case 'uint64', cls = 'v';
			case 'uint32', cls = 'u';
			case 'uint16', cls = 't';
			case 'uint8',  cls = 's';
			case 'int64', cls = 'p';
			case 'int32', cls = 'o';
			case 'int16', cls = 'n';
			case 'int8',  cls = 'm';
			case 'logical',  cls = 'l';
			case 'char',  cls = 'c';
			otherwise
				error(['uncoded MatML class "' cls '"']);
		end
		
		% in MatML, no trailing 'x' on the class code means it's real
		if ~isreal(arr)
			cls = [cls 'x'];
		end
		
		if numels <= 1000 || isempty(writer.binaryfile)
			xml = ['<' tagName attr ' c="' cls '"' szattr '>'];
			xml = [xml matml_numtostr(arr, writer.precision)];
			xml = [xml '</' tagName '>'];
		else
			% use binary files only if array is very large
			filename = [writer.binaryfile '.' sprintf('%04i', writer.binaryindex)];
			writer.binaryindex = writer.binaryindex + 1;
			d = size(arr);
			while length(d) && d(end) == 1
				d = d(1:end-1);
			end
			matml_numtofile(filename, arr, length(d));
			filename = strrep(filename, filesep, '/');
			xml = ['<' tagName attr ' c="' cls '"' szattr ' s="b">'];
			xml = [xml matml_xmlsafe(filename)];
			xml = [xml '</' tagName '>'];
		end
		
	case 'function_handle'
		
		% the function text may contain <, > or & (e.g. @(x) x<3), so it
		% must be escaped like any other element text; the reader
		% unescapes all element text before use
		xml = ['<' tagName attr ' c="e">'];
		xml = [xml matml_xmlsafe(func2str(arr))];
		xml = [xml '</' tagName '>'];
		
	otherwise
		error(['MatML cannot handle class "' class(arr) '" in node (' path tagName ')'])

end








function s = matml_numtostr(n, precision)

% Render the elements of n (in column-major order) as a single
% space-separated string.
%
%   n          array to render; empty input gives ''
%   precision  number of significant figures for floating point data,
%              or [] to use 20. Ignored for non-floating data, which
%              is printed with %d.
%
% Complex input is rendered as all real parts followed by all imaginary
% parts.

s = '';
if isempty(n)
	return
end

% complex: real block, a space, then imaginary block
if ~isreal(n)
	s = [matml_numtostr(real(n), precision) ' ' matml_numtostr(imag(n), precision)];
	return
end

% choose the per-element format
if ~isfloat(n)
	% integers are written at maximum precision
	fmt = '%d ';
elseif isempty(precision)
	fmt = '%.20g ';
else
	fmt = ['%.' int2str(precision) 'g '];
end

% print, then drop the trailing separator
s = sprintf(fmt, n);
s(end) = [];






function matml_numtofile(filename, data, ndims)

%
% matml_numtofile(filename, data, ndims)
%
% save the passed data into a binary file named
% as specified. the data must be numeric, but
% may be multi-dimensional and of any numeric type.
% real data is written in one piece, in matlab's
% column-major element order. complex data is written
% with real and imaginary parts interleaved with
% granularity of the last dimension. for instance, a
% 3x2xN complex uint32 matrix will be stored as
%
% real(1,1,1)
% real(2,1,1)
% ...
% real(3,2,1)
% imag(1,1,1)
% imag(2,1,1)
% ...
% imag(3,2,1)
% real(1,1,2)
% ...
%
% where all entries are a 4-byte uint32 element.
%
% by default, trailing scalar dimensions are
% removed. to change this behaviour, set ndims to
% the number of dimensions. this file is then in
% the right format to be read by std/source/numeric.
%
% raises an error if the file cannot be opened. the file is
% closed again even if writing fails.
%


fid = fopen(filename, 'w');
if fid == -1
	error(['could not open "' filename '"'])
end

try

	% size with trailing singleton dimensions removed...
	sz = size(data);
	while length(sz) > 1 && sz(end) == 1
		sz = sz(1:end-1);
	end

	% ...then padded back out to the requested dimension count
	if nargin >= 3
		while length(sz) < ndims
			sz = [sz 1];
		end
	end

	if isreal(data)

		% we can just output the whole lot as it comes
		fwrite(fid, data, class(data));

	else

		% N slices along the last dimension, each of C elements
		N = sz(end);
		C = prod(sz(1:end-1));

		re = real(data);
		im = imag(data);
		cls = class(re);

		for n = 1:N
			ed = C * n;
			st = ed - C + 1;
			fwrite(fid, re(st:ed), cls);
			fwrite(fid, im(st:ed), cls);
		end

	end

catch err

	% do not leak the file handle on failure
	fclose(fid);
	rethrow(err);

end

fclose(fid);





%% taken from systemml 'sml_xml'
%
% WARNING: THIS IS NOT A USER FUNCTION - ITS INTERFACE OR
%   OPERATION MAY CHANGE IN FUTURE RELEASES.
%
% xml = private_sml_xml(filename, xml)
%   read or write a generic XML structure into or from the
%   specified file.  the XML tree's format is that every node is:
%
% xml.name = '<tag-name>'
% xml.attr = struct('key',val,...) [optional]
% xml.value = <any-matlab-variable> [optional]
% xml.children = {<child-node-1>,...} [optional]
%
% where exactly one of "value" and "children" must be
% present in all nodes.

function xml = matml_xmlread(text)

% Parse XML text into a tree of nodes, each a structure with the fields
% name, attr, value and children.
%
%   text  the XML document as a character row vector; a leading
%         "<?xml ... ?>" declaration is removed before parsing
%
% Raises an error if a declaration is opened but never closed.

% peel off first tag as special case. strncmp (rather than indexing
% text(1:5)) is safe when the text is shorter than five characters.
if strncmp(text, '<?xml', 5)
	f = strfind(text, '?>');
	if isempty(f)
		error('malformed XML declaration');
	end
	text = text(f(1)+2:end);
end

xml = matml_xmlparts(text);
xml = matml_xmlparse(xml);



function parts = matml_xmlparts(xml)
% Tokenise XML text into a struct array of parts, in document order.
%
% Each part has the fields
%   type  0 = text between tags, 1 = opening tag, 2 = closing tag,
%         4 = comment (any tag whose first character is '!')
%   data  the text, the tag name, or the comment body
%   attr  structure of attribute name/value strings (opening tags only;
%         an empty structure otherwise)
%
% Every '<' must be paired with a following '>' before the next '<',
% otherwise an error is raised. Attributes must be written as
% key="value" and separated by exactly one character.

ltpos = find(xml == '<');
gtpos = find(xml == '>');
ntags = length(ltpos);
if length(gtpos) ~= ntags
	error('mismatch in tags');
end
% interleaved positions must be strictly increasing
if min(diff(reshape([ltpos; gtpos], ntags*2, 1))) < 1
	error('mismatch in tags');
end

noattr = struct();
part = struct('type', 0, 'data', '', 'attr', noattr);

% output storage, doubled in size whenever it fills up
parts = part;
used = 0;
capacity = 1;

% index of the first character not yet consumed
cursor = 1;

for t = 1:ntags

	lt = ltpos(t);
	gt = gtpos(t);

	% any text lying before this tag becomes a part of its own
	if cursor ~= lt
		part.type = 0;
		part.data = xml(cursor:lt-1);
		part.attr = noattr;

		if used == capacity
			capacity = capacity * 2;
			parts(capacity,1) = part;
		end
		used = used + 1;
		parts(used,1) = part;
	end

	% everything between the angle brackets
	body = xml(lt+1:gt-1);

	switch body(1)

		case '/'
			% closing tag
			part.type = 2;
			part.data = body(2:end);
			part.attr = noattr;

		case '!'
			% comment
			part.type = 4;
			part.data = body(4:end-3);
			part.attr = noattr;

		otherwise
			% opening tag; the name runs up to the first character that
			% is not a letter, a digit or an underscore
			attr = noattr;
			namechar = (body >= 65 & body <= 90) ...
				| (body >= 97 & body <= 122) ...
				| (body >= 48 & body <= 57) ...
				| body == '_';
			stop = find(~namechar, 1);

			part.type = 1;

			if ~isempty(stop)
				if body(stop) ~= 32
					error(['malformed tag (no space after tag name) "' body '"']);
				end
				rest = body(stop+1:end);
				body = body(1:stop-1);

				% consume key="value" pairs one at a time
				while true
					eqpos = find(rest == '=', 1);
					if isempty(eqpos)
						error('invalid XML (no =)');
					end
					if rest(eqpos+1) ~= '"'
						error('invalid XML (not ")');
					end
					key = rest(1:eqpos-1);
					rest = rest(eqpos+2:end);
					quotepos = find(rest == '"', 1);
					if isempty(quotepos)
						error('invalid XML (no ")');
					end
					attr.(key) = rest(1:quotepos-1);
					if quotepos == length(rest)
						break
					end
					% skip the closing quote and the separator after it
					rest = rest(quotepos+2:end);
				end
			end

			part.data = body;
			part.attr = attr;

	end

	% store the tag part
	if used == capacity
		capacity = capacity * 2;
		parts(capacity,1) = part;
	end
	used = used + 1;
	parts(used,1) = part;

	cursor = gt + 1;

end

% text after the final tag
if cursor <= length(xml)

	part = [];
	part.type = 0;
	part.data = xml(cursor:end);
	part.attr = struct();
	used = used + 1;
	parts(used,1) = part;

end

% trim unused storage
parts = parts(1:used);




function [xml, nextpart] = matml_xmlparse(parts, nextpart, parent)

% Build a node tree from the parts array produced by matml_xmlparts.
%
%   parts     struct array of parts (fields type, data, attr)
%   nextpart  index of the first part to consume (default 1)
%   parent    the opening-tag part whose content is being parsed, or []
%             (the default) when parsing at document level
%
% Returns the node for the content consumed (fields name, attr, value,
% children) and the index of the next unconsumed part. At document
% level the single root node is returned.
%
% Raises an error for mixed content (text and child tags in one
% element), mismatched or unmatched closing tags, missing closing tags,
% text outside the root element, or a document without exactly one root
% element.

if nargin<2
	nextpart = 1;
end
if nargin<3
	parent = [];
end

% only simple (text or children, but not both) XML
% tags are supported in this format, so we store them
% separately

xml = [];
xml.name = '';
xml.attr = struct();
xml.value = '';
xml.children = {};


while nextpart <= length(parts)
	part = parts(nextpart);
	nextpart = nextpart + 1;
	
	switch part.type
		
		% 0 = text
		case 0
			if ~isempty(xml.children)
				% ignore whitespace if children are present, but use it otherwise
				f = find(part.data ~= 9 & part.data ~= 32 & part.data ~= 10 & part.data ~= 13);
				if isempty(f)
					continue
				end
				% presence of non-whitespace data when we already have children is a
				% mixed XML tag
				error('mixed type (text and children) XML tags unsupported in this translation');
			end

			% just return string content
			xml.value = matml_xmlunsafe(part.data);
			
		% 1 = opening tag
		case 1
			% presence of children when we already have non-whitespace data is a
			% mixed XML tag
			if ~isempty(xml.value)
				f = find(xml.value ~= 9 & xml.value ~= 32 & xml.value ~= 10 & xml.value ~= 13);
				if ~isempty(f)
					error('mixed type (text and children) XML tags unsupported in this translation');
				end
				% if data is whitespace, we'll just discard it
				xml.value = [];
			end
			
			% parse the element content, then label the resulting node
			[xml_, nextpart] = matml_xmlparse(parts, nextpart, part);
			xml_.name = part.data;
			xml_.attr = part.attr;

			% add to children
			xml.children{end+1} = xml_;
			
		% 2 = closing tag
		case 2
			% at document level there is no open element to close
			if isempty(parent)
				error(['invalid nesting - "' part.data '" has no matching opening tag']);
			end
			if ~strcmp(part.data,parent.data)
				error(['invalid nesting - "' part.data '" ends "' parent.data '"']);
			end
			return;

		% 4 = comment
		case 4
			
			% comments are ignored
			
		otherwise
			
			error('unrecognised part code');
			
	end
end


% running out of parts is only legitimate at document level
if isempty(parent)
	% this is the document level, and we should reach here,
	% but we fix up the representation at this point
	if ~isempty(xml.value)
		error('document cannot have data outside root tag');
	end
	if isempty(xml.children)
		error('document has no root tag');
	end
	if length(xml.children) ~= 1
		error('document cannot have more than one root tag');
	end
	xml = xml.children{1};
		
else
	error('did not find closing tag');
	
end



function data = matml_xmlunsafe(data)

% Replace the five predefined XML entities in data with the characters
% they stand for. The ampersand is handled last, so that text such as
% "&amp;lt;" decodes to "&lt;" rather than to "<".

entities = {'&lt;', '&gt;', '&apos;', '&quot;', '&amp;'};
literals = {'<', '>', '''', '"', '&'};

for k = 1:numel(entities)
	data = strrep(data, entities{k}, literals{k});
end


function data = matml_xmlsafe(data)

% Replace the five characters that are special in XML with their
% predefined entities. The ampersand is handled first, so that the
% ampersands introduced by the later replacements are left alone.

literals = {'&', '"', '''', '>', '<'};
entities = {'&amp;', '&quot;', '&apos;', '&gt;', '&lt;'};

for k = 1:numel(literals)
	data = strrep(data, literals{k}, entities{k});
end





% matml_file2array(<filename>) returns a string containing the contents of
% filename with every carriage return (asc 13) removed, so that CRLF line
% endings become plain LF (asc 10)
%
% matml_file2array(<filename>, true) returns a cell array of lines instead
% of one long text string

function out = matml_file2array(fname, in_lines)

% Read a whole file as text, with every carriage return (asc 13) removed.
%
%   fname     name of the file to read
%   in_lines  optional; if supplied and true, the text is returned as an
%             N-by-1 cell array of lines (split at asc 10, terminators
%             removed) instead of as one character row vector. An empty
%             file gives a 0-by-1 cell array.
%
% Raises an error if the file cannot be opened or closed.

fid=fopen(fname);
if fid==-1
	error('File not found');
end

bin=fread(fid);

% nothing further is read, so release the handle straight away
if fclose(fid)
	error('File not closed');
end

% drop every CR, which turns CRLF line endings into plain LF
if ~isempty(bin)
	notcrs = bin~=13;
	bin = bin(notcrs);
end

out=char(bin');


if nargin > 1 && in_lines

	str = out;
	
	% no content means no lines (and str(end) below would be an error)
	if isempty(str)
		out = cell(0,1);
		return
	end
	
	% make sure the final line is terminated
	if str(end) ~= 10
		str = [str 10];
	end
	
	% each line runs from just after the previous LF to just before its own
	lf = find(str == 10);
	st = [1 lf(1:end-1)+1];
	ed = lf - 1;
	N = length(st);
	out = cell(N,1);
	
	for n=1:N
		out{n} = str(st(n):ed(n));
	end
	
end




%% PARSE MATML


function value = matml_matmlparse(tag)

% Convert a parsed MatML node (fields name, attr, value, children) back
% into a matlab value, recursing into child nodes.
%
% The node type is taken from the c attribute:
%   absent   simple string, returned as is
%   'e'      function handle, rebuilt with str2func
%   'z'      struct array; field names in the a attribute
%   'y'      cell array
%   other    numeric, logical or char array; a one-letter class code,
%            optionally followed by 'x' for complex data
% The b attribute, if present, gives the array size with trailing
% singleton dimensions removed; if absent the value is scalar.
%
% NOTE(review): for 'e' nodes the text from the document is handed to
% str2func; only load documents from sources you trust.

if ~isfield(tag.attr, 'c')
	
	% it's a simple string
	value = tag.value;

elseif strcmp(tag.attr.c, 'e')
	
	value = str2func(tag.value);

elseif strcmp(tag.attr.c, 'z')

	% create scalar structure
	fn = matml_explode(';', tag.attr.a);
	nfields = length(fn);
	value = struct();
	for n = 1:nfields
		value.(fn{n}) = [];
	end
	
	% expand it to required sz if not scalar. the size text is parsed
	% as numbers (never evaluated as code), and a single-element size N
	% means N-by-1, which repmat would otherwise treat as N-by-N.
	if isfield(tag.attr, 'b')
		sz = sscanf(tag.attr.b, '%f')';
		if length(sz) < 2
			sz = [sz 1];
		end
		value = repmat(value, sz);
	end
	
	% children are stored element by element, fields in order
	for e = 0:length(tag.children)-1
		el = floor(e / nfields) + 1;
		n = mod(e, nfields) + 1;
		value(el).(fn{n}) = matml_matmlparse(tag.children{e+1});
	end

elseif strcmp(tag.attr.c, 'y')
	
	if isfield(tag.attr, 'b')
		sz = sscanf(tag.attr.b, '%f')';
	else
		% no sz attr means scalar
		sz = 1;
	end
	
	% a single-element size N means N-by-1; cell(N) would give N-by-N
	if length(sz) < 2
		sz = [sz 1];
	end
	
	value = cell(sz);
	for n = 1:length(tag.children)
		value{n} = matml_matmlparse(tag.children{n});
	end

else

	if isfield(tag.attr, 'b')
		sz = sscanf(tag.attr.b, '%f')';
	else
		% no sz attr means scalar
		sz = 1;
	end
	
	% split the class code from the complex marker
	if length(tag.attr.c) == 2
		if tag.attr.c(2) ~= 'x'
			error('malformed MatML');
		end
		cls = tag.attr.c(1);
		cpx = true;
	else
		cls = tag.attr.c;
		cpx = false;
	end
	
	if isfield(tag.attr, 's')
		
		% data lives in a separate binary file named by the node text
		storage_protocol = tag.attr.s;
		if ~strcmp(storage_protocol, 'b')
			error('cannot recognise storage protocol');
		end
		filename = tag.value;
		d = dir(filename);
		if isempty(d)
			error(['failed open "' filename '"']);
		end
		f = find('fdvutsponmlc' == tag.attr.c(1));
		if isempty(f)
			error('unrecognised numeric type')
		end
		g = [4 8 8 4 2 1 8 4 2 1 1 1];
		c = {'single', 'double', 'uint64', 'uint32', 'uint16', 'uint8', 'int64', 'int32', 'int16', 'int8', 'logical', 'char'};
		bytesperel = g(f);
		nels = prod(sz);
		expbytes = bytesperel * nels;
		if cpx
			expbytes = expbytes * 2;
		end
		if expbytes ~= d.bytes
			error('binary file incorrect size');
		end
		fid = fopen(filename, 'r');
		if fid == -1
			error(['failed open "' filename '"']);
		end
		c = c{f};
		if cpx
			% real and imaginary parts alternate in blocks of C elements,
			% one pair of blocks per index of the last dimension. sz is
			% deliberately still unpadded here, to match the writer.
			N = sz(end);
			C = prod(sz(1:end-1));
			value = zeros(C, N);
			for n = 1:N
				re = fread(fid, C, ['*' c]);
				im = fread(fid, C, ['*' c]);
				value(:, n) = complex(re, im);
			end
		else
			value = fread(fid, inf, ['*' c]);
		end
		fclose(fid);
		
	else
		
		% data is inline text: all real parts, then all imaginary parts
		value = sscanf(tag.value, '%f');
		if cpx
			value = value(1:end/2) + 1i * value(end/2+1:end);
		end
		
	end
	
	% a single-element size N means N-by-1
	if length(sz) < 2
		sz = [sz 1];
	end
	value = reshape(value, sz);

	switch cls
		case 'f', value = single(value);
		case 'd', value = double(value);
		case 'v', value = uint64(value);
		case 'u', value = uint32(value);
		case 't', value = uint16(value);
		case 's',  value = uint8(value);
		case 'p', value = int64(value);
		case 'o', value = int32(value);
		case 'n', value = int16(value);
		case 'm',  value = int8(value);
		case 'l',  value = logical(value);
		case 'c',  value = char(value);
		otherwise
			error(['uncoded MatML class "' cls '"']);
	end

end


function c = matml_explode(t, s)

% Split the string s at every occurrence of the delimiter character t.
%
% Returns a cell row of the pieces that END at a delimiter; any text
% after the last delimiter is not returned. With no delimiter in s the
% result is {}.

cuts = [0 find(s == t)];
count = numel(cuts) - 1;

c = {};
k = 0;
while k < count
	k = k + 1;
	% piece k lies strictly between cut k and cut k+1
	c{k} = s(cuts(k)+1 : cuts(k+1)-1);
end


Contact us at files@mathworks.com