image thumbnail
from toy compiler by Bill McKeeman
A toy compiler

lexer(src)
% FILE:    lexer.m
% PURPOSE: Simple lexer 
% CALL:    [tcode, tstart, tend] = lexer(src)
% SIG:     [[uint8], [uint16], [uint16]] = lexer([char])
%          Accepts M-like source text and produces three arrays:
%          tcode(i) is the code identifying the kind of the i-th token, and
%          src(tstart(i):tend(i)) is the text of the i-th token.

function [tcode, tstart, tend] = lexer(src)
% LEXER  Split M-like source text into a sequence of tokens.
%   [tcode, tstart, tend] = lexer(src) scans the char row vector src and
%   returns three row vectors of equal length, one entry per token:
%     tcode  (uint8)   code of the token kind, taken from the TOK table
%     tstart (uint16)  index in src of the first character of the token
%     tend   (uint16)  index in src of the last character of the token
%   Whitespace separates tokens and is otherwise ignored.  The final entry
%   is always the end-of-file token; its text is empty (tend = tstart-1).
%
%   Recognized tokens: identifiers (a letter followed by letters, digits
%   or '_'), unsigned integers, and the single characters = + - * / \ ( ) ;
%
%   Errors are raised for
%     - source text too long for uint16 positions (more than 65534 chars),
%     - characters with code 0 or above 128,
%     - any other character that does not start a token.

% Initialize character and token tables.
  TOK = enum(tokens);                   % project helper; assumed to return a
                                        % struct of token codes -- TODO confirm

  EOF = uint8(128);                     % sentinel appended after the source

% Positions are returned as uint16 and uint16 arithmetic saturates, so a
% longer source would leave the character index stuck at intmax and the
% scan would never terminate.  The sentinel needs one extra position.
  maxlen = double(intmax('uint16')) - 1;
  if numel(src) > maxlen
    error('source text too long: %d characters (limit is %d)', ...
          numel(src), maxlen);
  end

  isdigit(1:EOF) = false;               % lookup tables indexed by char code
  isdigit(uint8('0':'9')) = true;

  isalnum = isdigit;
  isalnum(uint8('a':'z')) = true;
  isalnum(uint8('A':'Z')) = true;

  ch2code(EOF) = 0;                    % allocate fast table
  ch2code('=') = TOK.eq;               % for single char tokens
  ch2code('+') = TOK.add;              % never add an entry for ':' this way:
  ch2code('-') = TOK.sub;              % the literal ':' used as a subscript
  ch2code('*') = TOK.mul;              % selects ALL elements of the table
  ch2code('/') = TOK.div;
  ch2code('\') = TOK.vid;
  ch2code('(') = TOK.lp;
  ch2code(')') = TOK.rp;
  ch2code(';') = TOK.semi;

% Allocate storage for the token sequences
  len    = numel(src);                  % usually more than enough
  tcode  = zeros(1,len, 'uint8');       % preallocate
  tstart = zeros(1,len, 'uint16');
  tend   = zeros(1,len, 'uint16');

  ti   = uint16(0);                     % token index
  ci   = uint16(1);                     % character index
  src  = [src EOF];                     % append eof

% Lex the input
  while true
    st = ci;
    ch = src(ci);
    if ch<1 || ch>EOF
      % num2str returns char input unchanged, so convert to the numeric
      % code first; the offending character may not be printable.
      error(['unexpected input character ' num2str(double(ch))]);
    elseif isspace(ch)                  % blank
      ci = ci+1;
      continue;                         % ignore whitespace
    elseif isletter(ch)
      % The range test keeps an out-of-table character from being used as
      % a subscript; it ends the token and is reported on the next pass.
      while ch>=1 && ch<=EOF && (isalnum(ch) || ch == '_')   % identifier
        ci = ci+1;
        ch = src(ci);
      end
      tok = TOK.id;
    elseif isdigit(ch)                  % number
      while ch>=1 && ch<=EOF && isdigit(ch)
        ci = ci+1;
        ch = src(ci);
      end
      tok = TOK.int;
    elseif ch2code(ch) ~= 0             % single char tokens
      ci = ci+1;
      tok = ch2code(ch);
    elseif ch == EOF                    % eof
      tok = TOK.eof;
    else
      error(['unexpected input character ''' ch '''']);
    end

    if ti == numel(tcode)               % need more storage
      % Double the current capacity; start from 1 so that empty source
      % text (capacity 0) still has room for the eof token.
      newcap = max(2*numel(tcode), 1);
      tcode(newcap)  = 0;
      tstart(newcap) = 0;
      tend(newcap)   = 0;
    end
    ti         = ti+1;                  % make room
    tcode(ti)  = tok;                   % record token info
    tstart(ti) = st;
    tend(ti)   = ci-1;                  % ci starts next token
    if tok == TOK.eof, break; end       % quit
  end

% Clean up
  tcode  = tcode(1:ti);                 % trim results
  tstart = tstart(1:ti);
  tend   = tend(1:ti);







Contact us at files@mathworks.com