image thumbnail
from Growing a Compiler by Bill McKeeman
Bootstrap compilers starting from a tiny compiler-compiler.

gem6()
%GEM6    Machine to execute self-describing grammars
%    G = gem6(); builds a GEM object. 
%    G.self0     is a self-describing grammar
%    G.self      as above, preprocessed for use
%    G.self1     as above, using long phrase names
%    G.postfix0  describes infix to postfix transformation
%    G.postfix   as above, preprocessed for use
%    G.nostar1   transforms  x* to X, <x>* to <x*>
%    G.nostar2   adds rules X=xX;x=;
%    G.nostar3   adds rules <x*>=<x><x*>;<x*>=;
%    G.noplus    replaces x+ and <x>+ with xx* and <x><x>*
%    G.pretty0   a grammar pretty printer
%    G.pretty    as above, preprocessed for use
%    G.invert    grammar inverter
%    G.nowhite   deblanker
%    G.scan      function to call deblanker
%    G.exe       function to run X86 code
%    G.run       function to execute o=GEM(i,g,flags)
% EXAMPLES:     
%   G = gem6();                          % constructor
%   G.scan('x y z')                      % deblank a string
%   G.self1                              % a version of self
%   G.run(G.scan(G.self), G.self, 'dlua')% self on self yields ''
%   G.run(G.self0, G.pretty, 'DLUA')     % pretty self
%   G.run('', 'g="1";')                  % prints '1'
%   G.run('', 'g="1";', 'T')             % example of trace
%   G.exe('5589E5606133C0C9C3')          % execute X86 code

% COPYRIGHT:   1988 W. M. McKeeman.  See license.txt.
% MODS:	       1988.06.02 -- mckeeman -- original
%              2008.06.11 -- mckeeman@dartmouth -- moved to MEX
%              2009.05.04 -- lu.he@dartmouth -- major extension

function obj = gem6()
  EOL = char(10);                         % ascii end-of-line
  
  %    ------- input grammars -----------------
  
  % de-white-ed de-whiter-er
  % removes blank and EOL, may fail on bad input
  nowhite = [        % requires flag A
    '<g>=<s><g>;'         ...
    '<g>=;'               ...
    '<s>='' '';'          ...  discard blank
    '<s>=''' EOL ''';'    ...  discard EOL
    '<s>=<I><I><I>;'      ...  keep '''
    '<s>=<I><i>;'         ...  keep 'input stuff'
    '<s>=<O><O><O>;'      ...  keep """
    '<s>=<O><o>;'         ...  keep "output stuff"
    '<s>=A;'              ...  keep everything else
    '<I>=''''''"''";'     ...  see ', send '
    '<i>=<I>;'            ...  comes after input stuff
    '<i>=A<i>;'           ...  keep an input char
    '<O>=''"''""";'       ...  see ", send "
    '<o>=<O>;'            ...  comes after output stuff
    '<o>=A<o>;'           ...  expand one output char
    ];
  
  % Grammar-grammar for extended IOG;
  % must be preprocessed by nostar1, nostar2, nostar3 and nodup;
  self0 = [  % requires flag luad
    '<g> = <r>*;'                         EOL ...
    '<r> = <n> ''='' <s>* '';'';'         EOL ...
    '<n> = l;'                            EOL ...
    '<n> = ''<'' l n* ''>'';'             EOL ...
    '<n> = ''<'' l ''*>'';'               EOL ...
    'n   = l;'                            EOL ...
    'n   = d;'                            EOL ...
    'n   = ''_'';'                        EOL ...
    '<s> = i i i;'                        EOL ... 
    '<s> = i <i>;'                        EOL ...
    '<s> = o o o;'                        EOL ...
    '<s> = o <o>;'                        EOL ...
    '<s> = l ''*'';'                      EOL ...
    '<s> = l ''+'';'                      EOL ...
    '<s> = ''<'' l ''>'' ''*'';'          EOL ...
    '<s> = ''<'' l ''>'' ''+'';'          EOL ...
    '<s> = <n>;'                          EOL ...
    'i   = '''''';'                       EOL ...
    '<i> = i;'                            EOL ...
    '<i> = a <i>;'                        EOL ...
    'o   = ''"'';'                        EOL ...
    '<o> = o;'                            EOL ...
    '<o> = a <o>;'                        EOL ...
    ];
  
  % Same as self0, except uses multi-character phrase names
  self1 = [   % requires flag luad
    '<self>    = <rules>;'                      EOL ...
    '<rules>   = <rule> <rules>;'               EOL ...
    '<rule>    = ;'                             EOL ...
    '<rule>    = <name> ''='' <symbols> '';'';' EOL ...
    '<name>    = l;'                            EOL ...
    '<name>    = ''<'' l n* ''>'';'             EOL ...
    '<name>    = ''<'' l ''*>'';'               EOL ...
    'n         = l;'                            EOL ...
    'n         = d;'                            EOL ...
    'n         = ''_'';'                        EOL ...
    '<symbols> = <symbols> <symbol>;'           EOL ...
    '<symbols> = ;'                             EOL ...
    '<symbol>  = i i i;'                        EOL ... 
    '<symbol>  = i <any_in>;'                   EOL ...
    '<symbol>  = o o o;'                        EOL ...
    '<symbol>  = o <any_out>;'                  EOL ...
    '<symbol>  = ''<'' l ''>'' ''*'';'          EOL ...
    '<symbol>  = ''<'' l ''>'' ''+'';'          EOL ...
    '<symbol>  = l ''*'';'                      EOL ...
    '<symbol>  = l ''+'';'                      EOL ...
    '<symbol>  = <name>;'                       EOL ...
    '<any_in>  = i;'                            EOL ...
    '<any_in>  = a <any_in>;'                   EOL ...
    'i         = '''''';'                       EOL ...
    '<any_out> = o;'                            EOL ...
    '<any_out> = a <any_out>;'                  EOL ...
    'o         = ''"'';'                        EOL ...
    ];
    
  % an IOG pretty printer
  % Handles extra and missing whitespace, * and +
  % Leaves multicharacter inputs and outputs unchanged
  % Requires preprocessing by nostar1, nostar2 and nostar3
  % <g> is a grammar, <r> is a rule, 
  % <s> is a symbol, <p> is a phrase name,
  % b is whitespace
  pretty0 = [  % needs flag LUDA
    '<g> = b* <r>*;'                      EOL ... rules
    '<r> = <p> b* ''='' " =" <s>* b* '';'' b* ";' EOL '";' EOL ...
    '<p> = L;'                            EOL ... phrase name
    '<p> = ''<'' "<" L n* ''>'' ">";'     EOL ... <stuff>
    '<p> = ''<'' "<" L ''*>'' "*>";'      EOL ... <s*>
    'n   = L;'                            EOL ... letter
    'n   = D;'                            EOL ... digit
    'n   = ''_'' "_";'                    EOL ... _
    '<s> = b* " " <t>;'                   EOL ... symbol
    '<t> = I I I;'                        EOL ... '''
    '<t> = I <i>;'                        EOL ... 'stuff'
    '<t> = O O O;'                        EOL ... """
    '<t> = O <o>;'                        EOL ... "stuff"
    '<t> = ''<'' "<" L ''>'' ">" b* ''*'' "*";' EOL ... <s>*
    '<t> = ''<'' "<" L ''>'' ">" b* ''+'' "+";' EOL ... <s>+
    '<t> = L b* ''*'' "*";'               EOL ... s*
    '<t> = L b* ''+'' "+";'               EOL ... s+
    '<t> = <p>;'                          EOL ... phrase name
    'I   = '''''' "''";'                  EOL ... see ', send '
    '<i> = I;'                            EOL ... end of 'stuff'
    '<i> = A <i>;'                        EOL ... an in char
    'O   = ''"'' """;'                    EOL ... see ", send "
    '<o> = O;'                            EOL ... end of "stuff"
    '<o> = A <o>;'                        EOL ... an out char
    'b   = '' '';'                        EOL ... see blank
    'b   = ''' EOL ''';'                  EOL ... see EOL
    ];

  % invert the input/output of a grammar
  % no need to deblank input
  invert = [   % requires flag A 
    'g   = s g;'                          EOL ...
    'g   = ;'                             EOL ...
    's   = I I I;'                        EOL ...
    's   = I <i>;'                        EOL ...
    's   = O O O;'                        EOL ...
    's   = O <o>;'                        EOL ...
    's   = A;'                            EOL ...
    'I   = '''''' """;'                   EOL ... see ', send "
    '<i> = I;'                            EOL ...
    '<i> = A <i>;'                        EOL ...
    'O   = ''"'' "''";'                   EOL ... see ", send '
    '<o> = O;'                            EOL ... 
    '<o> = A <o>;'                        EOL ...
    ];

  % expr with * to operator-late PFN
  % input must be deblanked
  postfix0  = [      % requires flag LD
    'g = e;'                              EOL ...
    'e = t r*;'                           EOL ...
    'r = ''+'' t "+";'                    EOL ...
    'r = ''-'' t "-";'                    EOL ...
    't = f s*;'                           EOL ...
    's = ''*'' f "*";'                    EOL ...
    's = ''/'' f "/";'                    EOL ...
    'f = L;'                              EOL ...
    'f = D;'                              EOL ...
    'f = ''('' e '')'';'                  EOL ...
    ];

  % replace x* with X in a grammar (only lower) 
  % replace <x>* with <x*> in a grammar
  nostar1 = [        % requires flag ALUD
    '<g> = <r*>;'                         EOL ...
    '<r> = <p> ''='' "=" <s*> '';'' ";' EOL '";' EOL ...
    '<p> = L;'                            EOL ...
    '<p> = ''<'' "<" L N ''>'' ">";'      EOL ...
    '<p> = ''<'' "<" L ''*>'' "*>";'      EOL ...
    'n   = L;'                            EOL ...
    'n   = D;'                            EOL ...
    'n   = ''_'' "_";'                    EOL ...
    '<s> = I I I;'                        EOL ... 
    '<s> = I <i>;'                        EOL ...
    '<s> = O O O;'                        EOL ...
    '<s> = O <o>;'                        EOL ...
    '<s> = <p>;'                          EOL ...
    sprintf('<s>=''%c*''"%c";',               ...
       shuffle('a':'z','A':'Z'))          EOL ...
    sprintf('<s>=''<%c>*''"<%c*>";',          ...
       repeat(['a':'z','A':'Z'], 2))      EOL ...
    'I = '''''' "''";'                    EOL ...
    '<i> = I;'                            EOL ...
    '<i> = A <i>;'                        EOL ...
    'O = ''"'' """;'                      EOL ...
    '<o> = O;'                            EOL ...
    '<o> = A <o>;'                        EOL ...
    'b   = '' '';'                        EOL ...
    'b   = ''' EOL ''';'                  EOL ...
    'B   = b B;'                          EOL ...
    'B   =;'                              EOL ...
    'N   = n N;'                          EOL ...
    'N   =;'                              EOL ...
    '<r*>=<r><r*>;'                       EOL ...
    '<r*>=;'                              EOL ...
    '<s*>=<s><s*>;'                       EOL ...
    '<s*>=;'                              EOL ...
    ];

  % make new rules to define a*    
  % 'x*' goes to "X=xX;X=;";
  % input must be deblanked
  nostar2 = [         % requires flag alud 
    '<g> = <r> <g>;'                      EOL ...
    '<g> =;'                              EOL ...
    '<r> = l ''='' <f> '';'';'            EOL ...
    '<r> = <n> ''='' <f> '';'';'          EOL ...
    '<n> = ''<'' l <m> ''>'';'            EOL ...
    '<n> = ''<'' l ''*>'';'               EOL ...
    '<m> = n <m>;'                        EOL ...
    '<m> =;'                              EOL ...
    'n   = l;'                            EOL ...
    'n   = d;'                            EOL ...
    'n   = ''_'';'                        EOL ...
    '<f> = <p> <f>;'                      EOL ...
    '<f> =;'                              EOL ...
    '<p> = i i i;'                        EOL ... 
    '<p> = i <i>;'                        EOL ...
    '<p> = o o o;'                        EOL ...
    '<p> = o <o>;'                        EOL ...
    '<p> = s;'                            EOL ...
    '<p> = ''<'' l ''>'' ''*'';'          EOL ...
    '<p> = l;'                            EOL ...
    '<p> = <n>;'                          EOL ...
    'i   = '''''';'                       EOL ...
    '<i> = i;'                            EOL ...
    '<i> = a <i>;'                        EOL ...
    'o   = ''"'';'                        EOL ...
    '<o> = o;'                            EOL ...
    '<o> = a <o>;'                        EOL ...
    sprintf(sprintf('s=''%c*''"%%c=%c%%c;%%c=;";', ...
      repeat('a':'z',2)),repeat('A':'Z',3)) EOL ...
    ];

  % make new rules to define <x>*    
  % '<x>*' goes to "<x*>=<x><x*>;<x*>=;"
  % requires deblanked input
  nostar3 = [         % requires flag alud 
    '<g> = <r> <g>;'                      EOL ...
    '<g> =;'                              EOL ...
    '<r> = l ''='' <f> '';'';'            EOL ...
    '<r> = <n> ''='' <f> '';'';'          EOL ...
    '<n> = ''<'' l <m> ''>'';'            EOL ...
    '<n> = ''<'' l ''*>'';'               EOL ...
    '<m> = n <m>;'                        EOL ...
    '<m> =;'                              EOL ...
    'n   = l;'                            EOL ...
    'n   = d;'                            EOL ...
    'n   = ''_'';'                        EOL ...
    '<f> = <p> <f>;'                      EOL ...
    '<f> =;'                              EOL ...
    '<p> = i i i;'                        EOL ... 
    '<p> = i <i>;'                        EOL ...
    '<p> = o o o;'                        EOL ...
    '<p> = o <o>;'                        EOL ...
    '<p> = l ''*'';'                      EOL ...
    '<p> = l;'                            EOL ...
    '<p> = s;'                            EOL ...
    '<p> = <n>;'                          EOL ...
    'i   = '''''';'                       EOL ...
    '<i> = i;'                            EOL ...
    '<i> = a <i>;'                        EOL ...
    'o   = ''"'';'                        EOL ...
    '<o> = o;'                            EOL ...
    '<o> = a <o>;'                        EOL ...
    sprintf('s=''<%c>*''"<%c*>=<%c><%c*>;<%c*>=;";', ...
      repeat(['a':'z' 'A':'Z'],5))        EOL ...
    ];

  % replace x+ in a grammar with xx* (only lower)
  % replaces <x>+ in a grammar with <x><x>* (upper or lower)
  noplus = [         % requires flag ALUD
    '<g> = <r*>;'                           EOL ...
    '<r> = <p> ''='' "=" <s*> '';'' ";' EOL '";' EOL ...
    '<p> = L;'                            EOL ...
    '<p> = ''<'' "<" L <m> ''>'' ">";'    EOL ...
    '<p> = ''<'' "<" L ''*>'' "*>";'      EOL ...
    '<m> = n <m>;'                        EOL ...
    '<m> =;'                              EOL ...
    'n   = L;'                            EOL ...
    'n   = D;'                            EOL ...
    'n   = ''_'' "_";'                    EOL ...
    '<s> = I I I;'                        EOL ... 
    '<s> = I <i>;'                        EOL ...
    '<s> = O O O;'                        EOL ...
    '<s> = O <o>;'                        EOL ...
    '<s> = L ''*'' "*";'                  EOL ...
    '<s> = ''<'' "<" L ''>'' ">" ''*'' "*";'  EOL ...
    sprintf('<s>=''%c+''"%c%c*";',            ...
       repeat('a':'z',3))                 EOL ...
    sprintf('<s>=''<%c>+''"<%c><%c>*";',      ...
       repeat(['a':'z','A':'Z'],3))       EOL ...
    '<s> = <p>;'                          EOL ...
    'I = '''''' "''";'                    EOL ...
    'O = ''"'' """;'                      EOL ...
    '<i> = I;'                            EOL ...
    '<i> = A <i>;'                        EOL ...
    '<o> = O;'                            EOL ...
    '<o> = A <o>;'                        EOL ...
    '<s*>=<s><s*>;'                       EOL ...
    '<s*>=;'                              EOL ...
    '<r*>=<r><r*>;'                       EOL ...
    '<r*>=;'                              EOL ...
    ];
  
  % convert ascii HEX characters into binary
  % Note: fails in MEX interface because of imbedded nulls
  % mxArrayToString truncates the input at the null (MATLAB bug)
  char2hex = ['g=hg;' EOL 'g=;' EOL];
  v = 0;
  for i=['0':'9' 'A':'F']; 
    for j=['0':'9' 'A':'F']
      char2hex = [char2hex 'h=''' i j ''' "' v '";' EOL]; %#ok<AGROW>
      v = v + 1;
    end; 
  end;

% Apply the * and + removing transformations.
  % The following integers are needed for nodup
  letlen = numel('X=xX;x=;');
  bnflen = numel('<x*>=<x><x*>;<x*>=;');
  
  % transform away the *
  s = scan(self0);
  selfgrammar = run(s, nostar1, 'LUA');
  selfrules2  = run(s, nostar2, 'lua');
  selfrules3  = run(s, nostar3, 'lua');
  self        = [selfgrammar selfrules2 selfrules3];
  
  % transform away the *
  s = scan(pretty0);
  prettygrammar = run(s, nostar1, 'LUA');
  prettyrules2  = nodup(run(s, nostar2, 'lua'), letlen);
  prettyrules3  = nodup(run(s, nostar3, 'lua'), bnflen);
  pretty        = [prettygrammar prettyrules2 prettyrules3];
  
  % transform away the *
  s = scan(postfix0);
  post1    = run(s, nostar1, 'LUA');
  post2    = run(s, nostar2, 'lua');
  postfix  = [post1 post2];
    
  obj = public();
  %return;
  
  function otxt = run(itxt, gtxt, flags)
    if nargin == 2, flags = ''; end;
    otxt = iog6(itxt, gtxt, flags);              % call mex
  end

  function otxt = scan(itxt)     
    otxt = iog6(itxt, nowhite, 'A');
  end

  % Remove duplicates of length sz from the input.
  % In a loop, 
  %   1. capture and remove the first n chars of input
  %   2. construct an IOG ignoring the capture and passing the rest
  %   3. apply the IOG, then repeat the process on the result.
  %   4. the concatenation of the "captures" is the answer.
  function final = nodup(dups, sz)
    assert(mod(numel(dups),sz)==0);
    final = '';
    subs = ['p=' char(zeros(1,sz)+'A') ';'];
    while numel(dups) > 0
      next = dups(1:sz);
      dups(1:sz) = [];
      gr = ['g=pg;g=;p=''' next ''';' subs];
      final = [final next];                      %#ok<AGROW>
      dups = run(dups, gr, 'A');                 % gr needs 'A'
    end
  end

  % Pack MATLAB chars into hexadecimal.
  % This is a case of using MATLAB to do something essential, which
  % apparently violates the stated challenge of bootstrapping everything.
  % GEM is actually capable of this transformation (see char2hex)
  % but MATLAB/C interface cannot transmit it.
  % The problem is that machine language has lots of zeros (ascii NULL)
  % which truncate the output.
  % The MEX limitation could be got around, but it is not worth the effort.
  function res = val(hexdigit)
    if '0'<=hexdigit && hexdigit<='9', res = hexdigit-'0';
    else res = hexdigit - 'A' + 10;
    end
  end

  % Run hex string on underlying hardware.
  % Squeeze 2 MATLAB chars into each code byte.
  function rc = exe(hex)
    switch computer
      case {'PCWIN', 'GLNX86', 'MACI'}
      otherwise
        error('Intel x86 hardware required');
    end
    nbytes = numel(hex)/2;
    code = zeros(1,nbytes,'uint8');
    for pc = 1:nbytes                       % squeeze out leading zeros
      b1 = val(hex(2*pc-1));
      b2 = val(hex(2*pc));
      code(pc) = bitor(bitshift(b1,4),b2);
    end
    rc = nonox(code);
    rc = runX86(code);
  end


  function o = public()
    o = struct;
    o.self0    = self0;    % lookable
    o.self1    = self1;    % long names
    o.self     = self;     % usable
    o.postfix0 = postfix0; % lookable
    o.postfix  = postfix;  % usable
    o.nostar1  = nostar1;  % x* to X, <x>* to <x*>
    o.nostar2  = nostar2;  % add rules for X
    o.nostar3  = nostar3;  % add rules for <x*>
    o.noplus   = noplus;   % remove x+ and <x>+
    o.pretty0  = pretty0;  % lookable
    o.pretty   = pretty;   % usable
    o.invert   = invert;   % decompiler builder
    o.nowhite  = nowhite;  % deblanker
    o.scan     = @scan;    % deblank
    o.exe      = @exe;     % run X86 code
    o.run      = @run;
  end

end

Contact us at files@mathworks.com