1.21. Lexing

Felix provides a mechanism for constructing lexers. The reglex construction matches a prefix of the string. Of all possible matches, reglex chooses the longest. As with regmatch, if more than one regexp matches that longest prefix, the one written first is used.

The expression for each regexp has access to three values of type iterator: lexeme_start, lexeme_end and buffer_end.

Start felix section to tut/examples/tut_beg121a.flx[1 /1 ]
     1: #line 964 "./lpsrc/flx_tutorial.pak"
     2: #import <flx.flxh>
     3: open Lexer;
     4: 
     5: regexp lower = ["abcdefghijklmnopqrstuvwxyz"]; // character set: the lowercase letters a-z
     6: regexp upper = ["ABCDEFGHIJKLMNOPQRSTUVWXYZ"]; // character set: the uppercase letters A-Z
     7: regexp digit = ["0123456789"]; // character set: the decimal digits
     8: regexp alpha = lower | upper | "_"; // a letter of either case, or an underscore
     9: regexp space = " "; // a single space character
    10: regexp white = space +; // one or more spaces
    11: 
    12: fun lexit(start:iterator, finish:iterator): // lex one token from the text between start and finish
    13:   iterator * (string * string) // an iterator paired with (token kind, lexeme text); NOTE(review): the iterator is presumably the position just past the lexeme - confirm
    14: =
    15: {
    16:   return
    17:     reglex start to finish with // matches a prefix; the longest match is chosen, and on a tie the rule written first is used
    18:     | digit+ => "Number", // one or more decimal digits
    19:       string_between(lexeme_start,lexeme_end) // string between the lexeme's start and end iterators
    20: 
    21:     | alpha+ =>  "Identifier", // one or more letters or underscores
    22:       string_between(lexeme_start,lexeme_end)
    23: 
    24:     | white =>  "White", // one or more spaces
    25:       string_between(lexeme_start,lexeme_end)
    26:     endmatch // NOTE(review): there is no catch-all rule; what happens when no rule matches is not shown here
    27:   ;
    28: }
    29: 
    30: 
    31: var s = "A string 2 lex"; // sample input: contains only letters, a digit and spaces, so every token matches a lexit rule
    32: val first = start_iterator s;
    33: val finish = end_iterator s;
    34: var current = first; // position from which the next token is lexed
    35: 
    36: while { current != finish } // keep lexing until current reaches finish
    37: {
    38:     match lexit(current, finish) with
    39:     | ?next,(?kind,?lexeme) => // unpack the returned iterator and the (kind, lexeme) pair
    40:     {
    41:       current = next; // continue from the iterator lexit returned
    42:       print kind; print ": "; print lexeme; endl;
    43:     }
    44:     endmatch
    45:   ;
    46: };
    47: print "Done.\n";
    48: 
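End felix section to tut/examples/tut_beg121a.flx[1]

The next example uses the lexer from the flx_lex library (module Flx_lex) to tokenise its own source file, printing one line per token with the token kind followed by the quoted lexeme. Because the program reads tut/examples/tut_beg121b.flx as its input, any change to that file, including a change to its comments, changes the program's output.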
Start data section to tut/examples/tut_beg121b.flx[1 /1 ]
     1: #!/bin/env flx
     2: #import <flx.flxh>
     3: include "flx_lex";
     4: use Lexer::sub;
     5: 
     6: print "Lexer here"; endl;
     7: 
     8: /* some /* commented */ stuffs */
     9: 
    10: val xx = 1214;
    11: 
    12: //val s = "A string is here == != @@ ";
    13: var s = Text_file::load("tut/examples/tut_beg121b.flx");
    14: 
    15: //print s; endl;
    16: 
    17: i2 := Lexer::end_iterator s;
    18: var i1 = Lexer::start_iterator s;
    19: 
    20: proc print_token()
    21: {
    22:   open Flx_lex;
    23:   def var j, var des = pre_flx_lex (i1, i2);
    24:   match des with
    25:   | qQuote =>        { j,des = parse_q_string (j,i2); }
    26:   | qqqQuote =>      { j,des = parse_qqq_string (j,i2); }
    27:   | dQuote =>        { j,des = parse_d_string (j,i2); }
    28:   | dddQuote =>      { j,des = parse_ddd_string (j,i2); }
    29:   | rqQuote =>       { j,des = parse_rq_string (j,i2); }
    30:   | rqqqQuote =>     { j,des = parse_rqqq_string (j,i2); }
    31:   | rdQuote =>       { j,des = parse_rd_string (j,i2); }
    32:   | rdddQuote =>     { j,des = parse_rddd_string (j,i2); }
    33:   | Preprocessor =>  { j = to_eol(j,i2) - 1; }
    34:   | Cpp_comment =>   { j = to_eol(j,i2) - 1; }
    35:   | C_comment =>     { j = to_end_c_comment (j,i2); }
    36:   | _ => {}
    37:   endmatch;
    38:   dess :=
    39:     match des with
    40:     | Eol => "Eol"
    41:     | Ident => "Id"
    42:     | DOLLAR => "DOLLAR"
    43:     | QUEST => "QUEST"
    44:     | EXCLAMATION => "EXCLAMATION"
    45:     | LPAR => "LPAR"
    46:     | RPAR => "RPAR"
    47:     | LSQB => "LSQB"
    48:     | RSQB => "RSQB"
    49:     | LBRACE => "LBRACE"
    50:     | RBRACE => "RBRACE"
    51:     | COLON => "COLON"
    52:     | COMMA => "COMMA"
    53:     | SEMI => "SEMI"
    54:     | PLUS => "PLUS"
    55:     | MINUS => "MINUS"
    56:     | STAR => "STAR"
    57:     | SLASH => "SLASH"
    58:     | VBAR => "VBAR"
    59:     | AMPER => "AMPER"
    60:     | LESS => "LESS"
    61:     | GREATER => "GREATER"
    62:     | EQUAL => "EQUAL"
    63:     | DOT => "DOT"
    64:     | PERCENT => "PERCENT"
    65:     | BACKQUOTE => "BACKQUOTE"
    66:     | TILDE => "TILDE"
    67:     | CIRCUMFLEX => "CIRCUMFLEX"
    68:     | HASH => "HASH"
    69:     | ANDLESS => "&<"
    70:     | ANDGREATER => "&>"
    71:     | EQEQUAL => "=="
    72:     | NOTEQUAL => "!="
    73:     | LESSEQUAL => "<="
    74:     | GREATEREQUAL => ">="
    75:     | LEFTSHIFT => "<<"
    76:     | RIGHTSHIFT => ">>"
    77:     | STARSTAR => "**"
    78:     | LESSCOLON => "<:"
    79:     | COLONGREATER => ":>"
    80:     | DOTDOT => ".."
    81:     | COLONCOLON => "::"
    82:     | PLUSPLUS => "++"
    83:     | MINUSMINUS => "--"
    84:     | PLUSEQUAL => "+="
    85:     | MINUSEQUAL => "-="
    86:     | STAREQUAL => "*="
    87:     | SLASHEQUAL => "/="
    88:     | PERCENTEQUAL => "%="
    89:     | CARETEQUAL => "^="
    90:     | VBAREQUAL => "|="
    91:     | AMPEREQUAL => "&="
    92:     | TILDEEQUAL => "~="
    93:     | COLONEQUAL => ":="
    94:     | RIGHTARROW => "->"
    95:     | EQRIGHTARROW => "=>"
    96:     | LEFTARROW => "<-"
    97:     | LSQANGLE => "[<"
    98:     | RSQANGLE => ">]"
    99:     | LSQBAR => "[|"
   100:     | RSQBAR => "|]"
   101:     | AMPERAMPER => "&&"
   102:     | VBARVBAR => "||"
   103:     | SLOSHAMPER => "\\&"
   104:     | SLOSHVBAR => "\\|"
   105:     | SLOSHCIRCUMFLEX => "\\^"
   106:     | HASHBANG => "#!"
   107:     | LEFTSHIFTEQUAL => "<<="
   108:     | RIGHTSHIFTEQUAL => ">>="
   109:     | LEFTRIGHTARROW => "<->"
   110:     | ANDEQEQUAL => "&=="
   111:     | ANDNOTEQUAL => "&!="
   112:     | ANDLESSEQUAL => "&<="
   113:     | ANDGREATEREQUAL => "&>="
   114:     | DOTDOTDOT => "..."
   115:     | DOTRIGHTARROW => ".->"
   116:     | LONGRIGHTARROW => "-->"
   117:     | PARSE_ACTION => "=>#"
   118:     | HASHBANGSLASH => "#!/"
   119:     | Preprocessor =>  "Pre"
   120:     | Cpp_comment =>   "Cppc"
   121:     | C_comment =>     "Cc"
   122:     | White => "White"
   123:     | Int => "Int"
   124:     | Float => "Float"
   125:     | _ => "Other"
   126:     endmatch
   127:   ;
   128:   print (dess ":       ").[0 to 9];
   129:   print ('"' (Lexer::string_between(i1,j)) '"');
   130:   endl;
   131:   i1 = j;
   132: }
   133: 
   134: use Lexer::ne;
   135: 
   136: while { i1 != i2 } { print_token; };
   137: 
End data section to tut/examples/tut_beg121b.flx[1]