function machinecode = assembler(file)
%ASSEMBLER is an assembler for the extended Brookshear machine
% MACHINECODE = ASSEMBLER(FILE) reads assembly language from the file
% whose name is the argument. If this compiles successfully, a string
% containing machine code instructions is returned.
%
% The language for the input file is specified in ASSEMBHELP.M. The
% format of the output is the same as that of a memory file as specified
% in BMHELP.m.
%
% Each source line is emitted as one output line: an optional
% 'AA: <hex digits>' part followed by the source text as a '//' comment.
% Errors are reported by throwing an MException whose identifier starts
% with 'bmachine:assembler:'.
%
% See also ASSEMBHELP, BMHELP
% Copyright 2008 University of Sussex and David Young
    assembly = readtext(file);
    % split input into lines - retain empty lines
    lines = regexp(assembly, '\n', 'split');
    nl = char(10);
    address = 0;    % address at which the next instruction/data is placed
    machinecode = ['// Brookshear Machine, assembled from ' file nl ...
        '// at ' datestr(now) nl];
    % One flag per memory cell. A 1x256 logical vector is all that
    % checkmem indexes; zeros(256) would build a 256x256 matrix.
    occupied = false(1, 256);
    labels = newlabels;
    for i = 1:length(lines)
        line = lines{i};
        % initial split of line into fields
        [label, op, args] = parseline(line);
        if ~isempty(label)
            % either records a text label or moves the current address
            [address, labels] = notelabel(address, label, labels);
        end
        if isequal(upper(op), 'DATA')
            datastr = parsedata(args);
            nbytes = length(datastr)/2;     % two hex digits per byte
            occupied = checkmem(occupied, address, nbytes);
            machinecode = [machinecode dec2hex(address, 2) ': ' datastr]; %#ok<AGROW>
            address = address + nbytes;
        elseif ~isempty(op)
            occupied = checkmem(occupied, address, 2);
            % get machine code for command and extend output
            [code, target_label] = parsecmd(op, args);
            % The text appended below is 'AA: ' (4 chars) followed by 4
            % code digits, so the last two code digits start 7 characters
            % past the current end of the output.
            tgtpos = length(machinecode) + 7; % possible label position
            machinecode = [machinecode dec2hex(address, 2) ': ' code]; %#ok<AGROW>
            address = address + 2;
            if ~isempty(target_label)
                labels = notetarget(tgtpos, target_label, labels);
            end
        end
        % add source code as comments. Note regexp('','^$') returns []
        if ~isempty(line) && isempty(regexp(line, '^\s*(?://|$)', 'once'))
            line = [' // ' line];
        end
        machinecode = [machinecode line nl]; %#ok<AGROW>
    end
    % patch the recorded label references with the resolved addresses
    machinecode = fixlabels(labels, machinecode);
end
function occupied = checkmem(occupied, address, nbytes)
% CHECKMEM reserves NBYTES memory cells starting at ADDRESS.
%   OCCUPIED = CHECKMEM(OCCUPIED, ADDRESS, NBYTES) marks the cells
%   ADDRESS+1 ... ADDRESS+NBYTES (1-based indices into OCCUPIED) as used
%   and returns the updated flags.
%
%   Throws bmachine:assembler:outofmem if the block would extend past
%   the 256-cell memory, and bmachine:assembler:overwrite if any of the
%   cells has already been marked.
%
%   The source line is not available in this function's scope, so the
%   error messages identify the location by its address instead.
    if address > 256-nbytes
        throw(MException('bmachine:assembler:outofmem', ...
            'Memory limit of 256 exceeded: address %s, %d byte(s)', ...
            dec2hex(address, 2), nbytes));
    end
    cells = address + (1:nbytes);   % addresses are 0-based, indices 1-based
    if any(occupied(cells))
        throw(MException('bmachine:assembler:overwrite', ...
            'Memory overwritten: address %s, %d byte(s)', ...
            dec2hex(address, 2), nbytes));
    end
    occupied(cells) = true;
end
function [label, op, args] = parseline(line)
% PARSELINE splits one source line into its label, operator and arguments.
%   [LABEL, OP, ARGS] = PARSELINE(LINE) strips any '//' comment, then
%   returns the text before a leading colon as LABEL (without the colon),
%   the leading run of letters of the remainder as OP and whatever follows
%   it, trimmed of surrounding white space, as ARGS. Parts that are absent
%   are returned as ''.
%
%   Throws bmachine:assembler:parseline if the remainder of the line does
%   not start with an operator made of letters.
    label = '';
    op = '';
    args = '';
    code = regexprep(line, '\s*//.*$', ''); % decomment
    fields = regexp(code, '^\s*(?<label>\w*:)?\s*(?<expr>\S.*)?$', ...
        'names', 'once');
    if isempty(fields)
        return
    end
    % a bare ':' counts as no label
    if length(fields.label) > 1
        label = fields.label(1:end-1); % omit the colon
    end
    if isempty(fields.expr)
        return
    end
    % the operator must be followed by a non-word character or nothing
    parts = regexp(fields.expr, '^(?<op>[a-zA-Z]*)(?<args>\W.*)?$', ...
        'names', 'once');
    if isempty(parts) || isempty(parts.op)
        throw(MException('bmachine:assembler:parseline', ...
            ['Invalid operator in ' code]));
    end
    op = parts.op;
    args = regexprep(parts.args, {'^\s*' '\s*$'}, '');
end
function labels = newlabels
% NEWLABELS creates an empty label table.
%   The table is a scalar struct with three parallel cell arrays:
%   list (label names), addr (address of each label, [] until defined)
%   and refs (output positions that refer to each label).
    % double braces so that struct() stores an empty cell in each field
    labels = struct('list', {{}}, 'addr', {{}}, 'refs', {{}});
end
function [address, labels] = notelabel(address, label, labels)
% NOTELABEL processes the label found at the start of a source line.
%   [ADDRESS, LABELS] = NOTELABEL(ADDRESS, LABEL, LABELS)
%   - If LABEL is exactly two hex digits it is an explicit address:
%     ADDRESS is set to its value and LABELS is returned unchanged.
%   - If LABEL is a letter followed by at least three word characters it
%     is a text label: the current ADDRESS is recorded for it in LABELS.
%
%   Throws bmachine:assembler:reuselabel if the text label already has an
%   address, and bmachine:assembler:labelformat for any other format.
    if ~isempty(regexp(label, '^[0-9a-fA-F]{2}$', 'once'))
        % have an address rather than a text label
        address = hex2dec(label);
        return
    end
    if isempty(regexp(label, '^[a-zA-Z]\w{3,}$', 'once'))
        throw(MException('bmachine:assembler:labelformat', ...
            ['Invalid label format: ' label]));
    end
    [seen, idx] = ismember(label, labels.list);
    if ~seen
        % first mention of this label: add a new entry
        idx = length(labels.list) + 1;
        labels.list{idx} = label;
        labels.refs{idx} = [];
    elseif ~isempty(labels.addr{idx})
        % entry exists and has already been given an address
        throw(MException('bmachine:assembler:reuselabel', ...
            ['Label used more than once: ' label]));
    end
    labels.addr{idx} = address;
end
function labels = notetarget(tgtpos, label, labels)
% NOTETARGET records a reference to a label.
%   LABELS = NOTETARGET(TGTPOS, LABEL, LABELS) appends the output position
%   TGTPOS to the list of references held for LABEL. A label that is not
%   yet in the table is added with an empty (undefined) address.
    [seen, idx] = ismember(label, labels.list);
    if seen
        labels.refs{idx} = [labels.refs{idx} tgtpos];
    else
        % forward reference: create the entry, address to be filled later
        idx = length(labels.list) + 1;
        labels.list{idx} = label;
        labels.addr{idx} = [];
        labels.refs{idx} = tgtpos;
    end
end
function machinecode = fixlabels(labels, machinecode)
% FIXLABELS writes resolved label addresses into the output text.
%   MACHINECODE = FIXLABELS(LABELS, MACHINECODE) overwrites, for every
%   label, the two characters at each recorded reference position with
%   the label's address as two hex digits.
%
%   Throws bmachine:assembler:undef_label if a label in the table has no
%   address.
    for k = 1:length(labels.list)
        labeladdr = labels.addr{k};
        if isempty(labeladdr)
            throw(MException('bmachine:assembler:undef_label', ...
                ['Undefined label: ' labels.list{k}]));
        end
        hexaddr = dec2hex(labeladdr, 2);
        positions = labels.refs{k};
        for n = 1:length(positions)
            first = positions(n);
            machinecode(first:first+1) = hexaddr;
        end
    end
end
function datastr = parsedata(args)
% PARSEDATA converts the arguments of a DATA directive to hex digits.
%   DATASTR = PARSEDATA(ARGS) returns a character row with two hex digits
%   per byte.
%   - If ARGS is enclosed in double quotes, each enclosed character is
%     encoded by its character code and a terminating '00' byte is added.
%   - Otherwise ARGS is split at commas and each item is passed to
%     parsearg; every item must be an immediate value without a label.
%
%   Throws bmachine:assembler:baddata if an item is not a plain immediate
%   value, or if a string contains a character whose code does not fit in
%   one byte.
    if ~isempty(regexp(args, '^".*"$', 'once'))
        chars = args(2:end-1);
        % A code above 255 would make dec2hex produce more than two digits
        % for every character, breaking the two-digits-per-byte layout.
        if any(chars > 255)
            throw(MException('bmachine:assembler:baddata', ...
                ['Invalid data: ' args]));
        end
        s = (dec2hex(chars, 2))';       % one column of digits per character
        datastr = [(s(:))' '00']; % null-terminated string
    else
        data = regexp(args, '\s*,\s*', 'split');
        datastr = '';
        for i = 1:length(data)
            [a, mode, label] = parsearg(data{i});
            if ~isequal(mode, 'immediate') || ~isempty(label)
                throw(MException('bmachine:assembler:baddata', ...
                    ['Invalid data: ' args]));
            end
            datastr = [datastr a]; %#ok<AGROW>
        end
    end
end
function [code, target_label] = parsecmd(op, args)
% PARSECMD assembles a single instruction.
%   [CODE, TARGET_LABEL] = PARSECMD(OP, ARGS) looks up the operator name
%   OP (case-insensitive) and passes ARGS to the function of the same
%   name, returning that function's results.
%
%   Throws bmachine:assembler:parsecmd if OP is not a known operator.
    % names and handlers are kept in the same order
    names = {'NOP' 'MOV' 'ADDI' 'ADDF' 'OR' 'AND' 'XOR' 'ROT' 'JMP' 'JMPEQ' ...
        'JMPNE' 'JMPGE' 'JMPLE' 'JMPGT' 'JMPLT' 'HALT'};
    handlers = {@NOP @MOV @ADDI @ADDF @OR @AND @XOR @ROT @JMP @JMPEQ ...
        @JMPNE @JMPGE @JMPLE @JMPGT @JMPLT @HALT};
    [found, idx] = ismember(upper(op), names);
    if found
        encode = handlers{idx};
        [code, target_label] = encode(args);
    else
        throw(MException('bmachine:assembler:parsecmd', ...
            ['Unknown operator ' op]));
    end
end
function [code, target_label] = NOP(args)
% NOP encodes the NOP instruction as '0FFF'.
%   ARGS must be empty, otherwise bmachine:assembler:NOP is thrown.
%   TARGET_LABEL is always ''.
    target_label = '';
    code = '0FFF';
    if ~isempty(args)
        throw(MException('bmachine:assembler:NOP', ...
            ['NOP should have no arguments, found: ' args]));
    end
end
function [code, target_label] = MOV(args)
% MOV encodes a MOV instruction of the form  SOURCE -> DESTINATION.
%   Both operands are classified by parsearg. Encodings produced:
%     register  -> register   '40' src dst
%     register  -> direct     '3'  src dst   (dst may be a label)
%     register  -> indirect   'E0' src dst
%     immediate -> register   '2'  dst src   (src may be a label)
%     direct    -> register   '1'  dst src   (src may be a label)
%     indirect  -> register   'D0' dst src
%   TARGET_LABEL is the label named by the memory/immediate operand, or ''.
%
%   Throws bmachine:assembler:MOV for malformed arguments or an operand
%   combination not listed above.
    operands = regexp(args, '^(\S+)\s*(?:->)\s*(\S+)$', 'tokens', 'once');
    if isempty(operands)
        throw(MException('bmachine:assembler:MOV', ...
            ['MOVE arguments not well formed: ' args]));
    end
    [src, srcmode, srclabel] = parsearg(operands{1});
    [dst, dstmode, dstlabel] = parsearg(operands{2});
    target_label = '';

    if isequal(srcmode, 'register')
        % source is a register: destination decides the opcode
        if isequal(dstmode, 'immediate')
            throw(MException('bmachine:assembler:MOV', ...
                ['Invalid destination for move ' args]));
        elseif isequal(dstmode, 'register')
            code = ['40' src dst];
        elseif isequal(dstmode, 'direct')
            code = ['3' src dst];
            target_label = dstlabel;
        elseif isequal(dstmode, 'indirect')
            code = ['E0' src dst];
        end
        return
    end

    % source is a value or memory reference: destination must be a register
    if ~isequal(dstmode, 'register')
        throw(MException('bmachine:assembler:MOV', ...
            ['Value can only be moved to register ' args]));
    end
    if isequal(srcmode, 'immediate')
        code = ['2' dst src];
    elseif isequal(srcmode, 'direct')
        code = ['1' dst src];
    elseif isequal(srcmode, 'indirect')
        code = ['D0' dst src];
    end
    target_label = srclabel;
end
function [code, target_label] = ADDI(args)
% ADDI encodes the ADDI instruction: opcode digit '5' followed by the
% three register digits returned by parseopargs. No label is involved.
    target_label = '';
    registers = parseopargs(args);
    code = ['5' registers];
end
function [code, target_label] = ADDF(args)
% ADDF encodes the ADDF instruction: opcode digit '6' followed by the
% three register digits returned by parseopargs. No label is involved.
    target_label = '';
    registers = parseopargs(args);
    code = ['6' registers];
end
function [code, target_label] = OR(args)
% OR encodes the OR instruction: opcode digit '7' followed by the
% three register digits returned by parseopargs. No label is involved.
    target_label = '';
    registers = parseopargs(args);
    code = ['7' registers];
end
function [code, target_label] = AND(args)
% AND encodes the AND instruction: opcode digit '8' followed by the
% three register digits returned by parseopargs. No label is involved.
    target_label = '';
    registers = parseopargs(args);
    code = ['8' registers];
end
function [code, target_label] = XOR(args)
% XOR encodes the XOR instruction: opcode digit '9' followed by the
% three register digits returned by parseopargs. No label is involved.
    target_label = '';
    registers = parseopargs(args);
    code = ['9' registers];
end
function regstring = parseopargs(args)
% PARSEOPARGS parses the arguments of a three-register instruction.
%   The expected form is
%       R1, R2 -> R3
%   where each register is 'R' followed by one hex digit. The result is
%   the three digits in the order destination, first source, second
%   source, exactly as written in ARGS (case is not changed).
%
%   Throws bmachine:assembler:opargs if ARGS does not have this form.
    digits = regexp(args, '^R([0-9a-fA-F])\s*,\s*R([0-9a-fA-F])\s*(?:->)\s*R([0-9a-fA-F])$', ...
        'tokens', 'once');
    if isempty(digits)
        throw(MException('bmachine:assembler:opargs', ...
            ['Invalid arguments for register operation: ' args]));
    end
    % destination digit comes first in the machine code
    regstring = [digits{[3 1 2]}];
end
function [code, target_label] = ROT(args)
% ROT encodes the ROT instruction.
%   ARGS has the form 'Rn, k' with n one hex digit and k a digit 0-7.
%   The code is 'A', the register digit, '0' and the digit k.
%   TARGET_LABEL is always ''.
%
%   Throws bmachine:assembler:rotargs if ARGS does not have this form.
    target_label = '';
    parts = regexp(args, '^R([0-9a-fA-F])\s*,\s*([0-7])$', 'tokens', 'once');
    if isempty(parts)
        throw(MException('bmachine:assembler:rotargs', ...
            ['Invalid arguments for ROT: ' args]));
    end
    register = parts{1};
    count = parts{2};
    code = ['A' register '0' count];
end
function [code, target_label] = JMP(args)
% JMP encodes an unconditional jump.
%   The single operand is parsed by parsearg in hex-only mode.
%     immediate value or label   'B0' followed by the two address digits
%     register                   'F00' followed by the register digit
%   TARGET_LABEL is the label returned by parsearg ('' if none).
%
%   Throws bmachine:assembler:JMP for any other kind of operand.
    [operand, mode, target_label] = parsearg(args, true); % set hex only flag
    switch mode
        case 'immediate'
            code = ['B0' operand];
        case 'register'
            code = ['F00' operand];
        otherwise
            throw(MException('bmachine:assembler:JMP', ...
                ['Invalid target for JMP: ' args]));
    end
end
function [code, target_label] = JMPEQ(args)
% JMPEQ encodes a jump-if-equal instruction of the form  TARGET, REGISTER.
%   Both operands are parsed by parsearg in hex-only mode. The second
%   operand must be a register. Encodings produced:
%     target immediate or label   'B' testreg address
%     target register             'F' testreg '0' targetreg
%   TARGET_LABEL is the label of the first operand ('' if none).
%
%   Throws bmachine:assembler:JMPEQ if the second operand is not a
%   register and bmachine:assembler:JMP if the first operand is neither
%   an immediate value nor a register.
    [targetarg, testarg] = parsejmpargs(args);
    [target, targetmode, targetlabel] = parsearg(targetarg, true); % set hex only flag
    [testreg, testmode, testlabel] = parsearg(testarg, true); %#ok<*NASGU>
    if ~isequal(testmode, 'register')
        throw(MException('bmachine:assembler:JMPEQ', ...
            ['JMPEQ requires register for test: ', args]));
    end
    switch targetmode
        case 'immediate'
            code = ['B' testreg target];
        case 'register'
            code = ['F' testreg '0' target];
        otherwise
            throw(MException('bmachine:assembler:JMP', ...
                ['Invalid target for JMPEQ: ' args]));
    end
    target_label = targetlabel;
end
function [code, target_label] = JMPNE(args)
% JMPNE encodes the JMPNE instruction as 'F', the register digit of the
% second operand, the test digit '1' and the register digit of the first
% operand. Both operands must be registers (checked by jmptst).
    target_label = '';
    [second, first] = jmptst(args);
    code = ['F', second, '1', first];
end
function [code, target_label] = JMPGE(args)
% JMPGE encodes the JMPGE instruction as 'F', the register digit of the
% second operand, the test digit '2' and the register digit of the first
% operand. Both operands must be registers (checked by jmptst).
    target_label = '';
    [second, first] = jmptst(args);
    code = ['F', second, '2', first];
end
function [code, target_label] = JMPLE(args)
% JMPLE encodes the JMPLE instruction as 'F', the register digit of the
% second operand, the test digit '3' and the register digit of the first
% operand. Both operands must be registers (checked by jmptst).
    target_label = '';
    [second, first] = jmptst(args);
    code = ['F', second, '3', first];
end
function [code, target_label] = JMPGT(args)
% JMPGT encodes the JMPGT instruction as 'F', the register digit of the
% second operand, the test digit '4' and the register digit of the first
% operand. Both operands must be registers (checked by jmptst).
    target_label = '';
    [second, first] = jmptst(args);
    code = ['F', second, '4', first];
end
function [code, target_label] = JMPLT(args)
% JMPLT encodes the JMPLT instruction as 'F', the register digit of the
% second operand, the test digit '5' and the register digit of the first
% operand. Both operands must be registers (checked by jmptst).
    target_label = '';
    [second, first] = jmptst(args);
    code = ['F', second, '5', first];
end
function [a2, a1] = jmptst(args)
% JMPTST parses the operands of a jump-with-test instruction.
%   [A2, A1] = JMPTST(ARGS) splits ARGS at the comma, parses both parts
%   with parsearg in hex-only mode and returns the register digit of the
%   second operand (A2) and of the first operand (A1).
%
%   Throws bmachine:assembler:jmptst unless both operands are registers.
    [firstarg, secondarg] = parsejmpargs(args);
    % the label outputs are requested but not used
    [a1, firstmode, firstlabel] = parsearg(firstarg, true);
    [a2, secondmode, secondlabel] = parsearg(secondarg, true);
    bothregisters = isequal(firstmode, 'register') && ...
        isequal(secondmode, 'register');
    if ~bothregisters
        throw(MException('bmachine:assembler:jmptst', ...
            ['Jump with test requires registers: ', args]));
    end
end
function [a1, a2] = parsejmpargs(args)
% PARSEJMPARGS splits the two comma-separated operands of a jump.
%   [A1, A2] = PARSEJMPARGS(ARGS) returns the text before and after the
%   comma. Neither operand may contain white space.
%
%   Throws bmachine:assembler:jmpargs if ARGS does not have this form.
    pieces = regexp(args, '^(\S+)\s*,\s*(\S+)$', 'tokens', 'once');
    if isempty(pieces)
        throw(MException('bmachine:assembler:jmpargs', ...
            ['Invalid format for jump arguments ' args]));
    end
    [a1, a2] = pieces{:};
end
function [a, mode, label] = parsearg(arg, hexonly)
% Parses a single argument. Result a is 1 or 2 hex digits, mode is
% 'immediate', 'direct', 'register' or 'indirect'. A label may be used in
% immediate or direct mode - it is then returned in the label result and a is '00'.
% If second argument is given and is true, only hex values may be given
% as numeric immediates (binary, character, decimal and float forms are
% then not tried).
%
% The formats are tried in a fixed order and the first match wins, so for
% example two hex digits are always read as hex.
%
% Throws bmachine:assembler:parsearg if no format matches, or if a
% quoted character has a code above 127.
    label = '';
    % hex
    toks = regexp(arg, '^([0-9a-fA-F]{2})h?$', 'tokens', 'once');
    if ~isempty(toks)
        a = upper(toks{1});
        mode = 'immediate';
        return
    end
    if nargin < 2 || ~hexonly
        % binary
        toks = regexp(arg, '^([01]{8})b?$', 'tokens', 'once');
        if ~isempty(toks)
            a = dec2hex(bin2dec(toks{1}), 2);
            mode = 'immediate';
            return
        end
        % ascii
        toks = regexp(arg, '^"(.)"$', 'tokens', 'once');
        if ~isempty(toks)
            c = toks{1};
            if c > 127
                throw(MException('bmachine:assembler:parsearg', ...
                    ['Non-ascii character: ' arg]));
            end
            a = dec2hex(c, 2);
            mode = 'immediate';
            return
        end
        % decimal integer
        toks = regexp(arg, '^([+-][0-9]+)d?$|^([0-9])$', 'tokens', 'once');
        if ~isempty(toks)
            % Only one of the two alternatives matched. The token of the
            % other alternative may be omitted or returned as '', so use
            % the first non-empty token rather than assuming it is toks{1}.
            matched = toks(~cellfun('isempty', toks));
            a = dec2hex(dble2int8(str2double(matched{1})), 2);
            mode = 'immediate';
            return
        end
        % float
        toks = regexp(arg, '^([+-]?[0-9]+\.[0-9]*)$', 'tokens', 'once');
        if ~isempty(toks)
            a = dec2hex(dble2f8(str2double(toks{1})), 2);
            mode = 'immediate';
            return
        end
    end
    % label
    toks = regexp(arg, '^([a-zA-Z]\w{3,})$', 'tokens', 'once');
    if ~isempty(toks)
        label = toks{1};
        a = '00';   % placeholder digits, replaced once the label is resolved
        mode = 'immediate';
        return
    end
    % register
    toks = regexp(arg, '^R([a-fA-F0-9])$', 'tokens', 'once');
    if ~isempty(toks)
        a = upper(toks{1});
        mode = 'register';
        return
    end
    % direct memory reference
    toks = regexp(arg, '^\[([a-fA-F0-9]{2})\]$', 'tokens', 'once');
    if ~isempty(toks)
        a = upper(toks{1});
        mode = 'direct';
        return
    end
    % direct memory reference - label
    toks = regexp(arg, '^\[([a-zA-Z]\w{3,})\]$', 'tokens', 'once');
    if ~isempty(toks)
        label = toks{1};
        a = '00';   % placeholder digits, replaced once the label is resolved
        mode = 'direct';
        return
    end
    % register indirect
    toks = regexp(arg, '^\[R([a-fA-F0-9])\]$', 'tokens', 'once');
    if ~isempty(toks)
        a = upper(toks{1});
        mode = 'indirect';
        return
    end
    throw(MException('bmachine:assembler:parsearg', ...
        ['Unrecognised argument format: ' arg]));
end
function [code, target_label] = HALT(args)
% HALT encodes the HALT instruction as 'C000'.
%   ARGS must be empty, otherwise bmachine:assembler:HALT is thrown.
%   TARGET_LABEL is always ''.
    target_label = '';
    code = 'C000';
    if ~isempty(args)
        throw(MException('bmachine:assembler:HALT', ...
            ['HALT should have no arguments, found: ' args]));
    end
end