|  | @@ -0,0 +1,2753 @@
 | 
	
		
			
				|  |  | +/*
 | 
	
		
			
				|  |  | +Copyright (c) 2013-2016 whitequark  <whitequark@whitequark.org>
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +Parts of the source are derived from ruby_parser:
 | 
	
		
			
				|  |  | +Copyright (c) Ryan Davis, seattle.rb
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +This lexer is a rewrite of the original in Ragel/C:
 | 
	
		
			
				|  |  | +Copyright (c) Charlie Somerville, GitHub
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +MIT License
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +Permission is hereby granted, free of charge, to any person obtaining
 | 
	
		
			
				|  |  | +a copy of this software and associated documentation files (the
 | 
	
		
			
				|  |  | +"Software"), to deal in the Software without restriction, including
 | 
	
		
			
				|  |  | +without limitation the rights to use, copy, modify, merge, publish,
 | 
	
		
			
				|  |  | +distribute, sublicense, and/or sell copies of the Software, and to
 | 
	
		
			
				|  |  | +permit persons to whom the Software is furnished to do so, subject to
 | 
	
		
			
				|  |  | +the following conditions:
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +The above copyright notice and this permission notice shall be
 | 
	
		
			
				|  |  | +included in all copies or substantial portions of the Software.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
	
		
			
				|  |  | +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
	
		
			
				|  |  | +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 | 
	
		
			
				|  |  | +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 | 
	
		
			
				|  |  | +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 | 
	
		
			
				|  |  | +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 | 
	
		
			
				|  |  | +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 | 
	
		
			
				|  |  | +*/
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +%%machine lex; # % fix highlighting
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +/*
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +# === BEFORE YOU START ===
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +# Read the Ruby Hacking Guide chapter 11, available in English at
 | 
	
		
			
				|  |  | +# http://whitequark.org/blog/2013/04/01/ruby-hacking-guide-ch-11-finite-state-lexer/
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +# Remember two things about Ragel scanners:
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#   1) Longest match wins.
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#   2) If two matches have the same length, the first
 | 
	
		
			
				|  |  | +#      in source code wins.
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +# General rules of making Ragel and Bison happy:
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#  * `p` (position) and `@te` contain the index of the character
 | 
	
		
			
				|  |  | +#    they're pointing to ("current"), plus one. `@ts` contains the index
 | 
	
		
			
				|  |  | +#    of the corresponding character. The code for extracting matched token is:
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#       @source_buffer.slice(@ts...@te)
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#  * If your input is `foooooooobar` and the rule is:
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#       'f' 'o'+
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#    the result will be:
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#       foooooooobar
 | 
	
		
			
				|  |  | +#       ^ ts=0   ^ p=te=9
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#  * A Ragel lexer action should not emit more than one token, unless
 | 
	
		
			
				|  |  | +#    you know what you are doing.
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#  * All Ragel commands (fnext, fgoto, ...) end with a semicolon.
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#  * If an action emits the token and transitions to another state, use
 | 
	
		
			
				|  |  | +#    these Ragel commands:
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#       emit($whatever)
 | 
	
		
			
				|  |  | +#       fnext $next_state; fbreak;
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#    If you perform `fgoto` in an action which neither emits a token nor
 | 
	
		
			
				|  |  | +#    rewinds the stream pointer, the parser's side-effectful,
 | 
	
		
			
				|  |  | +#    context-sensitive lookahead actions will break in a hard to detect
 | 
	
		
			
				|  |  | +#    and debug way.
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#  * If an action does not emit a token:
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#       fgoto $next_state;
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#  * If an action features lookbehind, i.e. matches characters with the
 | 
	
		
			
				|  |  | +#    intent of passing them to another action:
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#       p = @ts - 1
 | 
	
		
			
				|  |  | +#       fgoto $next_state;
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#    or, if the lookbehind consists of a single character:
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#       fhold; fgoto $next_state;
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#  * Ragel merges actions. So, if you have `e_lparen = '(' %act` and
 | 
	
		
			
				|  |  | +#    `c_lparen = '('` and a lexer action `e_lparen | c_lparen`, the result
 | 
	
		
			
				|  |  | +#    _will_ invoke the action `act`.
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#    e_something stands for "something with **e**mbedded action".
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#  * EOF is explicit and is matched by `c_eof`. If you want to introspect
 | 
	
		
			
				|  |  | +#    the state of the lexer, add this rule to the state:
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#       c_eof => do_eof;
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#  * If you proceed past EOF, the lexer will complain:
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +#       NoMethodError: undefined method `ord' for nil:NilClass
 | 
	
		
			
				|  |  | +#
 | 
	
		
			
				|  |  | +*/
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +#include <ruby_parser/driver.hh>
 | 
	
		
			
				|  |  | +#include <cassert>
 | 
	
		
			
				|  |  | +#include "absl/strings/numbers.h"
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +%% write data nofinal;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +using namespace ruby_parser;
 | 
	
		
			
				|  |  | +using namespace std::string_literals;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +%% prepush { check_stack_capacity(); }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +lexer::lexer(diagnostics_t &diag, ruby_version version, const std::string& source_buffer_)
 | 
	
		
			
				|  |  | +  : diagnostics(diag)
 | 
	
		
			
				|  |  | +  , version(version)
 | 
	
		
			
				|  |  | +  , source_buffer(source_buffer_ + std::string("\0\0", 2))
 | 
	
		
			
				|  |  | +  , cs(lex_en_line_begin)
 | 
	
		
			
				|  |  | +  , _p(source_buffer.data())
 | 
	
		
			
				|  |  | +  , _pe(source_buffer.data() + source_buffer.size())
 | 
	
		
			
				|  |  | +  , ts(nullptr)
 | 
	
		
			
				|  |  | +  , te(nullptr)
 | 
	
		
			
				|  |  | +  , act(0)
 | 
	
		
			
				|  |  | +  , top(0)
 | 
	
		
			
				|  |  | +  , eq_begin_s(nullptr)
 | 
	
		
			
				|  |  | +  , sharp_s(nullptr)
 | 
	
		
			
				|  |  | +  , newline_s(nullptr)
 | 
	
		
			
				|  |  | +  , paren_nest(0)
 | 
	
		
			
				|  |  | +  , command_start(true)
 | 
	
		
			
				|  |  | +  , num_base(0)
 | 
	
		
			
				|  |  | +  , num_digits_s(nullptr)
 | 
	
		
			
				|  |  | +  , num_suffix_s(nullptr)
 | 
	
		
			
				|  |  | +  , num_xfrm(num_xfrm_type::NONE)
 | 
	
		
			
				|  |  | +  , escape_s(nullptr)
 | 
	
		
			
				|  |  | +  , herebody_s(nullptr)
 | 
	
		
			
				|  |  | +  , in_kwarg(false)
 | 
	
		
			
				|  |  | +{
 | 
	
		
			
				|  |  | +  // ensure the stack is non-empty so we can just double in
 | 
	
		
			
				|  |  | +  // check_stack_capacity:
 | 
	
		
			
				|  |  | +  stack.resize(16);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  static_env.push(environment());
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  cs_before_block_comment = lex_en_line_begin;
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +void lexer::check_stack_capacity() {
 | 
	
		
			
				|  |  | +    if (stack.size() == (size_t)top) {
 | 
	
		
			
				|  |  | +    stack.resize(stack.size() * 2);
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +int lexer::stack_pop() {
 | 
	
		
			
				|  |  | +  return stack[--top];
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +int lexer::arg_or_cmdarg(int cmd_state) {
 | 
	
		
			
				|  |  | +  if (cmd_state) {
 | 
	
		
			
				|  |  | +    return lex_en_expr_cmdarg;
 | 
	
		
			
				|  |  | +  } else {
 | 
	
		
			
				|  |  | +    return lex_en_expr_arg;
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +void lexer::emit_comment(const char* s, const char* e) {
 | 
	
		
			
				|  |  | +  /* unused for now */
 | 
	
		
			
				|  |  | +  (void)s;
 | 
	
		
			
				|  |  | +  (void)e;
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +std::string lexer::tok() {
 | 
	
		
			
				|  |  | +  return tok(ts);
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +std::string lexer::tok(const char* start) {
 | 
	
		
			
				|  |  | +  return tok(start, te);
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +std::string lexer::tok(const char* start, const char* end) {
 | 
	
		
			
				|  |  | +  assert(start <= end);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  return std::string(start, (size_t)(end - start));
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +char lexer::unescape(uint32_t codepoint) {
 | 
	
		
			
				|  |  | +    switch (codepoint) {
 | 
	
		
			
				|  |  | +    case 'a': return '\a';
 | 
	
		
			
				|  |  | +    case 'b': return '\b';
 | 
	
		
			
				|  |  | +    case 'e': return 0x1b;
 | 
	
		
			
				|  |  | +    case 'f': return '\f';
 | 
	
		
			
				|  |  | +    case 'n': return '\n';
 | 
	
		
			
				|  |  | +    case 'r': return '\r';
 | 
	
		
			
				|  |  | +    case 's': return ' ';
 | 
	
		
			
				|  |  | +    case 't': return '\t';
 | 
	
		
			
				|  |  | +    case 'v': return '\v';
 | 
	
		
			
				|  |  | +    case '\\': return '\\';
 | 
	
		
			
				|  |  | +    default: return '\0';
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +static const lexer::token_table_entry PUNCTUATION[] = {
 | 
	
		
			
				|  |  | +  { "=", token_type::tEQL },
 | 
	
		
			
				|  |  | +  { "&", token_type::tAMPER2 },
 | 
	
		
			
				|  |  | +  { "|", token_type::tPIPE },
 | 
	
		
			
				|  |  | +  { "!", token_type::tBANG },
 | 
	
		
			
				|  |  | +  { "^", token_type::tCARET },
 | 
	
		
			
				|  |  | +  { "+", token_type::tPLUS },
 | 
	
		
			
				|  |  | +  { "-", token_type::tMINUS },
 | 
	
		
			
				|  |  | +  { "*", token_type::tSTAR2 },
 | 
	
		
			
				|  |  | +  { "/", token_type::tDIVIDE },
 | 
	
		
			
				|  |  | +  { "%", token_type::tPERCENT },
 | 
	
		
			
				|  |  | +  { "~", token_type::tTILDE },
 | 
	
		
			
				|  |  | +  { ",", token_type::tCOMMA },
 | 
	
		
			
				|  |  | +  { ";", token_type::tSEMI },
 | 
	
		
			
				|  |  | +  { ".", token_type::tDOT },
 | 
	
		
			
				|  |  | +  { "..", token_type::tDOT2 },
 | 
	
		
			
				|  |  | +  { "...", token_type::tDOT3 },
 | 
	
		
			
				|  |  | +  { "[", token_type::tLBRACK2 },
 | 
	
		
			
				|  |  | +  { "]", token_type::tRBRACK },
 | 
	
		
			
				|  |  | +  { "(", token_type::tLPAREN2 },
 | 
	
		
			
				|  |  | +  { ")", token_type::tRPAREN },
 | 
	
		
			
				|  |  | +  { "?", token_type::tEH },
 | 
	
		
			
				|  |  | +  { ":", token_type::tCOLON },
 | 
	
		
			
				|  |  | +  { "&&", token_type::tANDOP },
 | 
	
		
			
				|  |  | +  { "||", token_type::tOROP },
 | 
	
		
			
				|  |  | +  { "-@", token_type::tUMINUS },
 | 
	
		
			
				|  |  | +  { "+@", token_type::tUPLUS },
 | 
	
		
			
				|  |  | +  { "~@", token_type::tTILDE },
 | 
	
		
			
				|  |  | +  { "**", token_type::tPOW },
 | 
	
		
			
				|  |  | +  { "->", token_type::tLAMBDA },
 | 
	
		
			
				|  |  | +  { "=~", token_type::tMATCH },
 | 
	
		
			
				|  |  | +  { "!~", token_type::tNMATCH },
 | 
	
		
			
				|  |  | +  { "==", token_type::tEQ },
 | 
	
		
			
				|  |  | +  { "!=", token_type::tNEQ },
 | 
	
		
			
				|  |  | +  { ">", token_type::tGT },
 | 
	
		
			
				|  |  | +  { ">>", token_type::tRSHFT },
 | 
	
		
			
				|  |  | +  { ">=", token_type::tGEQ },
 | 
	
		
			
				|  |  | +  { "<", token_type::tLT },
 | 
	
		
			
				|  |  | +  { "<<", token_type::tLSHFT },
 | 
	
		
			
				|  |  | +  { "<=", token_type::tLEQ },
 | 
	
		
			
				|  |  | +  { "=>", token_type::tASSOC },
 | 
	
		
			
				|  |  | +  { "::", token_type::tCOLON2 },
 | 
	
		
			
				|  |  | +  { "===", token_type::tEQQ },
 | 
	
		
			
				|  |  | +  { "<=>", token_type::tCMP },
 | 
	
		
			
				|  |  | +  { "[]", token_type::tAREF },
 | 
	
		
			
				|  |  | +  { "[]=", token_type::tASET },
 | 
	
		
			
				|  |  | +  { "{", token_type::tLCURLY },
 | 
	
		
			
				|  |  | +  { "}", token_type::tRCURLY },
 | 
	
		
			
				|  |  | +  { "`", token_type::tBACK_REF2 },
 | 
	
		
			
				|  |  | +  { "!@", token_type::tBANG },
 | 
	
		
			
				|  |  | +  { "&.", token_type::tANDDOT },
 | 
	
		
			
				|  |  | +  { NULL, token_type::error },
 | 
	
		
			
				|  |  | +};
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +static const lexer::token_table_entry PUNCTUATION_BEGIN[] = {
 | 
	
		
			
				|  |  | +  { "&", token_type::tAMPER },
 | 
	
		
			
				|  |  | +  { "*", token_type::tSTAR },
 | 
	
		
			
				|  |  | +  { "**", token_type::tDSTAR },
 | 
	
		
			
				|  |  | +  { "+", token_type::tUPLUS },
 | 
	
		
			
				|  |  | +  { "-", token_type::tUMINUS },
 | 
	
		
			
				|  |  | +  { "::", token_type::tCOLON3 },
 | 
	
		
			
				|  |  | +  { "(", token_type::tLPAREN },
 | 
	
		
			
				|  |  | +  { "{", token_type::tLBRACE },
 | 
	
		
			
				|  |  | +  { "[", token_type::tLBRACK },
 | 
	
		
			
				|  |  | +  { NULL, token_type::error },
 | 
	
		
			
				|  |  | +};
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +static const lexer::token_table_entry KEYWORDS[] = {
 | 
	
		
			
				|  |  | +  { "if", token_type::kIF_MOD },
 | 
	
		
			
				|  |  | +  { "unless", token_type::kUNLESS_MOD },
 | 
	
		
			
				|  |  | +  { "while", token_type::kWHILE_MOD },
 | 
	
		
			
				|  |  | +  { "until", token_type::kUNTIL_MOD },
 | 
	
		
			
				|  |  | +  { "rescue", token_type::kRESCUE_MOD },
 | 
	
		
			
				|  |  | +  { "defined?", token_type::kDEFINED },
 | 
	
		
			
				|  |  | +  { "BEGIN", token_type::klBEGIN },
 | 
	
		
			
				|  |  | +  { "END", token_type::klEND },
 | 
	
		
			
				|  |  | +  { "class", token_type::kCLASS },
 | 
	
		
			
				|  |  | +  { "module", token_type::kMODULE },
 | 
	
		
			
				|  |  | +  { "def", token_type::kDEF },
 | 
	
		
			
				|  |  | +  { "undef", token_type::kUNDEF },
 | 
	
		
			
				|  |  | +  { "begin", token_type::kBEGIN },
 | 
	
		
			
				|  |  | +  { "end", token_type::kEND },
 | 
	
		
			
				|  |  | +  { "then", token_type::kTHEN },
 | 
	
		
			
				|  |  | +  { "elsif", token_type::kELSIF },
 | 
	
		
			
				|  |  | +  { "else", token_type::kELSE },
 | 
	
		
			
				|  |  | +  { "ensure", token_type::kENSURE },
 | 
	
		
			
				|  |  | +  { "case", token_type::kCASE },
 | 
	
		
			
				|  |  | +  { "when", token_type::kWHEN },
 | 
	
		
			
				|  |  | +  { "for", token_type::kFOR },
 | 
	
		
			
				|  |  | +  { "break", token_type::kBREAK },
 | 
	
		
			
				|  |  | +  { "next", token_type::kNEXT },
 | 
	
		
			
				|  |  | +  { "redo", token_type::kREDO },
 | 
	
		
			
				|  |  | +  { "retry", token_type::kRETRY },
 | 
	
		
			
				|  |  | +  { "in", token_type::kIN },
 | 
	
		
			
				|  |  | +  { "do", token_type::kDO },
 | 
	
		
			
				|  |  | +  { "return", token_type::kRETURN },
 | 
	
		
			
				|  |  | +  { "yield", token_type::kYIELD },
 | 
	
		
			
				|  |  | +  { "super", token_type::kSUPER },
 | 
	
		
			
				|  |  | +  { "self", token_type::kSELF },
 | 
	
		
			
				|  |  | +  { "nil", token_type::kNIL },
 | 
	
		
			
				|  |  | +  { "true", token_type::kTRUE },
 | 
	
		
			
				|  |  | +  { "false", token_type::kFALSE },
 | 
	
		
			
				|  |  | +  { "and", token_type::kAND },
 | 
	
		
			
				|  |  | +  { "or", token_type::kOR },
 | 
	
		
			
				|  |  | +  { "not", token_type::kNOT },
 | 
	
		
			
				|  |  | +  { "alias", token_type::kALIAS },
 | 
	
		
			
				|  |  | +  { "__FILE__", token_type::k__FILE__ },
 | 
	
		
			
				|  |  | +  { "__LINE__", token_type::k__LINE__ },
 | 
	
		
			
				|  |  | +  { "__ENCODING__", token_type::k__ENCODING__ },
 | 
	
		
			
				|  |  | +  { NULL, token_type::error },
 | 
	
		
			
				|  |  | +};
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +static const lexer::token_table_entry KEYWORDS_BEGIN[] = {
 | 
	
		
			
				|  |  | +  { "if", token_type::kIF },
 | 
	
		
			
				|  |  | +  { "unless", token_type::kUNLESS },
 | 
	
		
			
				|  |  | +  { "while", token_type::kWHILE },
 | 
	
		
			
				|  |  | +  { "until", token_type::kUNTIL },
 | 
	
		
			
				|  |  | +  { "rescue", token_type::kRESCUE },
 | 
	
		
			
				|  |  | +  { "defined?", token_type::kDEFINED },
 | 
	
		
			
				|  |  | +  { "BEGIN", token_type::klBEGIN },
 | 
	
		
			
				|  |  | +  { "END", token_type::klEND },
 | 
	
		
			
				|  |  | +  { "class", token_type::kCLASS },
 | 
	
		
			
				|  |  | +  { "module", token_type::kMODULE },
 | 
	
		
			
				|  |  | +  { "def", token_type::kDEF },
 | 
	
		
			
				|  |  | +  { "undef", token_type::kUNDEF },
 | 
	
		
			
				|  |  | +  { "begin", token_type::kBEGIN },
 | 
	
		
			
				|  |  | +  { "end", token_type::kEND },
 | 
	
		
			
				|  |  | +  { "then", token_type::kTHEN },
 | 
	
		
			
				|  |  | +  { "elsif", token_type::kELSIF },
 | 
	
		
			
				|  |  | +  { "else", token_type::kELSE },
 | 
	
		
			
				|  |  | +  { "ensure", token_type::kENSURE },
 | 
	
		
			
				|  |  | +  { "case", token_type::kCASE },
 | 
	
		
			
				|  |  | +  { "when", token_type::kWHEN },
 | 
	
		
			
				|  |  | +  { "for", token_type::kFOR },
 | 
	
		
			
				|  |  | +  { "break", token_type::kBREAK },
 | 
	
		
			
				|  |  | +  { "next", token_type::kNEXT },
 | 
	
		
			
				|  |  | +  { "redo", token_type::kREDO },
 | 
	
		
			
				|  |  | +  { "retry", token_type::kRETRY },
 | 
	
		
			
				|  |  | +  { "in", token_type::kIN },
 | 
	
		
			
				|  |  | +  { "do", token_type::kDO },
 | 
	
		
			
				|  |  | +  { "return", token_type::kRETURN },
 | 
	
		
			
				|  |  | +  { "yield", token_type::kYIELD },
 | 
	
		
			
				|  |  | +  { "super", token_type::kSUPER },
 | 
	
		
			
				|  |  | +  { "self", token_type::kSELF },
 | 
	
		
			
				|  |  | +  { "nil", token_type::kNIL },
 | 
	
		
			
				|  |  | +  { "true", token_type::kTRUE },
 | 
	
		
			
				|  |  | +  { "false", token_type::kFALSE },
 | 
	
		
			
				|  |  | +  { "and", token_type::kAND },
 | 
	
		
			
				|  |  | +  { "or", token_type::kOR },
 | 
	
		
			
				|  |  | +  { "not", token_type::kNOT },
 | 
	
		
			
				|  |  | +  { "alias", token_type::kALIAS },
 | 
	
		
			
				|  |  | +  { "__FILE__", token_type::k__FILE__ },
 | 
	
		
			
				|  |  | +  { "__LINE__", token_type::k__LINE__ },
 | 
	
		
			
				|  |  | +  { "__ENCODING__", token_type::k__ENCODING__ },
 | 
	
		
			
				|  |  | +  { NULL, token_type::error },
 | 
	
		
			
				|  |  | +};
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +static size_t utf8_encode_char(int32_t uc, std::string &dst) {
 | 
	
		
			
				|  |  | +  if (uc < 0x00) {
 | 
	
		
			
				|  |  | +    return 0;
 | 
	
		
			
				|  |  | +  } else if (uc < 0x80) {
 | 
	
		
			
				|  |  | +    dst.push_back(static_cast<uint8_t>(uc));
 | 
	
		
			
				|  |  | +    return 1;
 | 
	
		
			
				|  |  | +  } else if (uc < 0x800) {
 | 
	
		
			
				|  |  | +    dst.push_back(static_cast<uint8_t>(0xC0 + (uc >> 6)));
 | 
	
		
			
				|  |  | +    dst.push_back(static_cast<uint8_t>(0x80 + (uc & 0x3F)));
 | 
	
		
			
				|  |  | +    return 2;
 | 
	
		
			
				|  |  | +  } else if (uc < 0x10000) {
 | 
	
		
			
				|  |  | +    dst.push_back(static_cast<uint8_t>(0xE0 + (uc >> 12)));
 | 
	
		
			
				|  |  | +    dst.push_back(static_cast<uint8_t>(0x80 + ((uc >> 6) & 0x3F)));
 | 
	
		
			
				|  |  | +    dst.push_back(static_cast<uint8_t>(0x80 + (uc & 0x3F)));
 | 
	
		
			
				|  |  | +    return 3;
 | 
	
		
			
				|  |  | +  } else if (uc < 0x110000) {
 | 
	
		
			
				|  |  | +    dst.push_back(static_cast<uint8_t>(0xF0 + (uc >> 18)));
 | 
	
		
			
				|  |  | +    dst.push_back(static_cast<uint8_t>(0x80 + ((uc >> 12) & 0x3F)));
 | 
	
		
			
				|  |  | +    dst.push_back(static_cast<uint8_t>(0x80 + ((uc >> 6) & 0x3F)));
 | 
	
		
			
				|  |  | +    dst.push_back(static_cast<uint8_t>(0x80 + (uc & 0x3F)));
 | 
	
		
			
				|  |  | +    return 4;
 | 
	
		
			
				|  |  | +  } else return 0;
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +static bool split_codepoints(const std::string &str, std::string &output) {
 | 
	
		
			
				|  |  | +  auto isspace = [](char c) { return c == ' ' || c == '\t'; };
 | 
	
		
			
				|  |  | +  const char *ptr = str.c_str();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  while (*ptr) {
 | 
	
		
			
				|  |  | +    while (isspace(*ptr))
 | 
	
		
			
				|  |  | +      ptr++;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    const char *start = ptr;
 | 
	
		
			
				|  |  | +    while (*ptr && !isspace(*ptr))
 | 
	
		
			
				|  |  | +      ptr++;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    std::string cp {start, static_cast<size_t>(ptr - start)};
 | 
	
		
			
				|  |  | +    if (utf8_encode_char(std::stoi(cp, nullptr, 16), output) == 0)
 | 
	
		
			
				|  |  | +      return false;
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +  return true;
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +static std::string gsub(const std::string&& str, const std::string&& search, const std::string&& replace) {
 | 
	
		
			
				|  |  | +  std::string result;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  std::string::size_type from = 0;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  while (true) {
 | 
	
		
			
				|  |  | +    auto index = str.find(search, from);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    if (index == std::string::npos) {
 | 
	
		
			
				|  |  | +      result += str.substr(from);
 | 
	
		
			
				|  |  | +      break;
 | 
	
		
			
				|  |  | +    } else {
 | 
	
		
			
				|  |  | +      result += str.substr(from, index - from);
 | 
	
		
			
				|  |  | +      result += replace;
 | 
	
		
			
				|  |  | +      from = index + search.size();
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  return result;
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +static bool eof_codepoint(char c) {
 | 
	
		
			
				|  |  | +  return c == 0 || c == 0x04 || c == 0x1a;
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +token_t lexer::advance_() {
 | 
	
		
			
				|  |  | +  if (!token_queue.empty()) {
 | 
	
		
			
				|  |  | +    token_t token = token_queue.front();
 | 
	
		
			
				|  |  | +    token_queue.pop();
 | 
	
		
			
				|  |  | +    return token;
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  int cmd_state = command_start;
 | 
	
		
			
				|  |  | +  command_start = false;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  const char* p = _p;
 | 
	
		
			
				|  |  | +  const char* pe = _pe;
 | 
	
		
			
				|  |  | +  const char* eof = _pe;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  const char* tm = NULL;
 | 
	
		
			
				|  |  | +  const char* heredoc_e = NULL;
 | 
	
		
			
				|  |  | +  const char* new_herebody_s = NULL;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  const char* ident_ts = NULL;
 | 
	
		
			
				|  |  | +  const char* ident_te = NULL;
 | 
	
		
			
				|  |  | +  std::string ident_tok;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  %% write exec;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  _p = p;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  if (!token_queue.empty()) {
 | 
	
		
			
				|  |  | +    token_t token = token_queue.front();
 | 
	
		
			
				|  |  | +    token_queue.pop();
 | 
	
		
			
				|  |  | +    return token;
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  if (cs == lex_error) {
 | 
	
		
			
				|  |  | +    size_t start = (size_t)(p - source_buffer.data());
 | 
	
		
			
				|  |  | +    return mempool.alloc(token_type::error, start, start + 1, std::string(p - 1, 1));
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  return mempool.alloc(token_type::eof, source_buffer.size(), source_buffer.size(), "");
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +void lexer::emit(token_type type) {
 | 
	
		
			
				|  |  | +  emit(type, tok());
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +void lexer::emit(token_type type, const std::string& str) {
 | 
	
		
			
				|  |  | +  emit(type, str, ts, te);
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +void lexer::emit(token_type type, const std::string& str, const char* start, const char* end) {
 | 
	
		
			
				|  |  | +  size_t offset_start = (size_t)(start - source_buffer.data());
 | 
	
		
			
				|  |  | +  size_t offset_end = (size_t)(end - source_buffer.data());
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  token_queue.push(mempool.alloc(type, offset_start, offset_end, str));
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +void lexer::emit_do(bool do_block) {
 | 
	
		
			
				|  |  | +  if (cond.active()) {
 | 
	
		
			
				|  |  | +    emit(token_type::kDO_COND, "do");
 | 
	
		
			
				|  |  | +  } else if (cmdarg.active() || do_block) {
 | 
	
		
			
				|  |  | +    emit(token_type::kDO_BLOCK, "do");
 | 
	
		
			
				|  |  | +  } else {
 | 
	
		
			
				|  |  | +    emit(token_type::kDO, "do");
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +void lexer::emit_table(const token_table_entry* table) {
 | 
	
		
			
				|  |  | +  auto value = tok();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  for (; table->token; ++table) {
 | 
	
		
			
				|  |  | +    if (value == table->token) {
 | 
	
		
			
				|  |  | +      emit(table->type, value);
 | 
	
		
			
				|  |  | +      return;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  // whitequark emits a `nil` token here, but if we do `yylex` hits an assert,
 | 
	
		
			
				|  |  | +  // so just drop the token.
 | 
	
		
			
				|  |  | +  return;
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +void lexer::emit_num(const std::string& num) {
 | 
	
		
			
				|  |  | +  switch (num_xfrm) {
 | 
	
		
			
				|  |  | +    case num_xfrm_type::NONE:
 | 
	
		
			
				|  |  | +      emit(token_type::tINTEGER, num);
 | 
	
		
			
				|  |  | +      break;
 | 
	
		
			
				|  |  | +    case num_xfrm_type::RATIONAL:
 | 
	
		
			
				|  |  | +      emit(token_type::tRATIONAL, num);
 | 
	
		
			
				|  |  | +      break;
 | 
	
		
			
				|  |  | +    case num_xfrm_type::IMAGINARY:
 | 
	
		
			
				|  |  | +      emit(token_type::tIMAGINARY, num);
 | 
	
		
			
				|  |  | +      break;
 | 
	
		
			
				|  |  | +    case num_xfrm_type::RATIONAL_IMAGINARY:
 | 
	
		
			
				|  |  | +      emit(token_type::tRATIONAL_IMAGINARY, num);
 | 
	
		
			
				|  |  | +      break;
 | 
	
		
			
				|  |  | +    case num_xfrm_type::FLOAT:
 | 
	
		
			
				|  |  | +      emit(token_type::tFLOAT, num);
 | 
	
		
			
				|  |  | +      break;
 | 
	
		
			
				|  |  | +    case num_xfrm_type::FLOAT_IMAGINARY:
 | 
	
		
			
				|  |  | +      emit(token_type::tFLOAT_IMAGINARY, num);
 | 
	
		
			
				|  |  | +      break;
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +std::string lexer::convert_base(const std::string& num, int num_base) {
 | 
	
		
			
				|  |  | +    long int result;
 | 
	
		
			
				|  |  | +    if (num_base == 10) {
 | 
	
		
			
				|  |  | +        return num;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +    // This doesn't match Ruby's parsing but it is better than not handling it
 | 
	
		
			
				|  |  | +    if (!absl::numbers_internal::safe_strtoi_base(num, &result, num_base)) {
 | 
	
		
			
				|  |  | +        result = 0;
 | 
	
		
			
				|  |  | +        // dmitry: appartently we assume that outer functions reported all the errors!!!
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +    return std::to_string(result);
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +diagnostic::range lexer::range(const char *start, const char *end) {
 | 
	
		
			
				|  |  | +  size_t token_start = (size_t)(start - source_buffer.data());
 | 
	
		
			
				|  |  | +  size_t token_end = (size_t)(end - source_buffer.data());
 | 
	
		
			
				|  |  | +  return diagnostic::range(token_start, token_end);
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +void lexer::diagnostic_(dlevel level, dclass type, const std::string &data) {
 | 
	
		
			
				|  |  | +  diagnostics.emplace_back(level, type, range(ts, te), data);
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +void lexer::diagnostic_(dlevel level, dclass type, diagnostic::range &&range, const std::string &data) {
 | 
	
		
			
				|  |  | +  diagnostics.emplace_back(level, type, range, data);
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +//
 | 
	
		
			
				|  |  | +// === LITERAL STACK ===
 | 
	
		
			
				|  |  | +//
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +template<typename... Args>
 | 
	
		
			
				|  |  | +int lexer::push_literal(Args&&... args) {
 | 
	
		
			
				|  |  | +  literal_stack.emplace(*this, std::forward<Args>(args)...);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  auto& literal = literal_stack.top();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  return next_state_for_literal(literal);
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +int lexer::next_state_for_literal(literal &lit) {
 | 
	
		
			
				|  |  | +  if (lit.words() && lit.backslash_delimited()) {
 | 
	
		
			
				|  |  | +    if (lit.interpolate()) {
 | 
	
		
			
				|  |  | +      return lex_en_interp_backslash_delimited_words;
 | 
	
		
			
				|  |  | +    } else {
 | 
	
		
			
				|  |  | +      return lex_en_plain_backslash_delimited_words;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +  } else if (lit.words() && !lit.backslash_delimited()) {
 | 
	
		
			
				|  |  | +    if (lit.interpolate()) {
 | 
	
		
			
				|  |  | +      return lex_en_interp_words;
 | 
	
		
			
				|  |  | +    } else {
 | 
	
		
			
				|  |  | +      return lex_en_plain_words;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +  } else if (!lit.words() && lit.backslash_delimited()) {
 | 
	
		
			
				|  |  | +    if (lit.interpolate()) {
 | 
	
		
			
				|  |  | +      return lex_en_interp_backslash_delimited;
 | 
	
		
			
				|  |  | +    } else {
 | 
	
		
			
				|  |  | +      return lex_en_plain_backslash_delimited;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +  } else {
 | 
	
		
			
				|  |  | +    if (lit.interpolate()) {
 | 
	
		
			
				|  |  | +      return lex_en_interp_string;
 | 
	
		
			
				|  |  | +    } else {
 | 
	
		
			
				|  |  | +      return lex_en_plain_string;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +literal& lexer::literal_() {
 | 
	
		
			
				|  |  | +  return literal_stack.top();
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +int lexer::pop_literal() {
 | 
	
		
			
				|  |  | +  bool was_regexp;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  {
 | 
	
		
			
				|  |  | +    auto& old_literal = literal_stack.top();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    was_regexp = old_literal.regexp();
 | 
	
		
			
				|  |  | +    dedentLevel_ = old_literal.dedentLevel();
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  literal_stack.pop();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  if (was_regexp) {
 | 
	
		
			
				|  |  | +    return lex_en_regexp_modifiers;
 | 
	
		
			
				|  |  | +  } else {
 | 
	
		
			
				|  |  | +    return lex_en_expr_end;
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +void lexer::set_state_expr_beg() {
 | 
	
		
			
				|  |  | +  cs = lex_en_expr_beg;
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +void lexer::set_state_expr_end() {
 | 
	
		
			
				|  |  | +  cs = lex_en_expr_end;
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +void lexer::set_state_expr_endarg() {
 | 
	
		
			
				|  |  | +  cs = lex_en_expr_endarg;
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +void lexer::set_state_expr_fname() {
 | 
	
		
			
				|  |  | +  cs = lex_en_expr_fname;
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +void lexer::set_state_expr_value() {
 | 
	
		
			
				|  |  | +  cs = lex_en_expr_value;
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +%%{
 | 
	
		
			
				|  |  | +  # access @;
 | 
	
		
			
				|  |  | +  # getkey (@source_pts[p] || 0);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # === CHARACTER CLASSES ===
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # Pay close attention to the differences between c_any and any.
 | 
	
		
			
				|  |  | +  # c_any does not include EOF and so will cause incorrect behavior
 | 
	
		
			
				|  |  | +  # for machine subtraction (any-except rules) and default transitions
 | 
	
		
			
				|  |  | +  # for scanners.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
  action do_nl {
    // Record position of a newline for precise location reporting on tNL
    // tokens.
    //
    // This action is embedded directly into c_nl, as it is idempotent and
    // there are no cases when we need to skip it.
    //
    // newline_s is overwritten each time, so it always refers to the most
    // recently scanned newline.
    newline_s = p;
  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  c_nl       = '\n' $ do_nl;
 | 
	
		
			
				|  |  | +  c_space    = [ \t\r\f\v];
 | 
	
		
			
				|  |  | +  c_space_nl = c_space | c_nl;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  c_eof      = 0x04 | 0x1a | 0 | zlen; # ^D, ^Z, \0, EOF
 | 
	
		
			
				|  |  | +  c_eol      = c_nl | c_eof;
 | 
	
		
			
				|  |  | +  c_any      = any - c_eof;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  c_nl_zlen  = c_nl | zlen;
 | 
	
		
			
				|  |  | +  c_line     = any - c_nl_zlen;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  c_unicode  = c_any - 0x00..0x7f;
 | 
	
		
			
				|  |  | +  c_upper    = [A-Z];
 | 
	
		
			
				|  |  | +  c_lower    = [a-z_]  | c_unicode;
 | 
	
		
			
				|  |  | +  c_alpha    = c_lower | c_upper;
 | 
	
		
			
				|  |  | +  c_alnum    = c_alpha | [0-9];
 | 
	
		
			
				|  |  | +
 | 
	
		
			
  action do_eof {
    // Sit at EOF indefinitely. #advance would return $eof each time.
    // This allows feeding the lexer more data if needed; this is only used
    // in tests.
    //
    // Note that this action is not embedded into e_eof like e_heredoc_nl and e_bs
    // below. This is due to the fact that scanner state at EOF is observed
    // by tests, and encapsulating it in a rule would break the introspection.
    //
    // The hold cancels the advance performed by the break, so the scanner
    // pointer stays on the EOF character when the machine is left.
    fhold; fbreak;
  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # === TOKEN DEFINITIONS ===
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # All operators are punctuation. There is more to punctuation
 | 
	
		
			
				|  |  | +  # than just operators. Operators can be overridden by user;
 | 
	
		
			
				|  |  | +  # punctuation can not.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # A list of operators which are valid in the function name context, but
 | 
	
		
			
				|  |  | +  # have different semantics in others.
 | 
	
		
			
				|  |  | +  operator_fname      = '[]' | '[]=' | '`'  | '-@' | '+@' | '~@'  | '!@' ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # A list of operators which can occur within an assignment shortcut (+ → +=).
 | 
	
		
			
				|  |  | +  operator_arithmetic = '&'  | '|'   | '&&' | '||' | '^'  | '+'   | '-'  |
 | 
	
		
			
				|  |  | +                        '*'  | '/'   | '**' | '~'  | '<<' | '>>'  | '%'  ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # A list of all user-definable operators not covered by groups above.
 | 
	
		
			
				|  |  | +  operator_rest       = '=~' | '!~' | '==' | '!=' | '!'   | '===' |
 | 
	
		
			
				|  |  | +                        '<'  | '<=' | '>'  | '>=' | '<=>' | '=>'  ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Note that `{` and `}` need to be referred to as e_lbrace and e_rbrace,
 | 
	
		
			
				|  |  | +  # as they are ambiguous with interpolation `#{}` and should be counted.
 | 
	
		
			
				|  |  | +  # These braces are not present in punctuation lists.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # A list of punctuation which has different meaning when used at the
 | 
	
		
			
				|  |  | +  # beginning of expression.
 | 
	
		
			
				|  |  | +  punctuation_begin   = '-'  | '+'  | '::' | '('  | '['  |
 | 
	
		
			
				|  |  | +                        '*'  | '**' | '&'  ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # A list of all punctuation except punctuation_begin.
 | 
	
		
			
				|  |  | +  punctuation_end     = ','  | '='  | '->' | '('  | '['  | ']'   |
 | 
	
		
			
				|  |  | +                        '::' | '?'  | ':'  | '.'  | '..' | '...' ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # A list of keywords which have different meaning at the beginning of expression.
 | 
	
		
			
				|  |  | +  keyword_modifier    = 'if'     | 'unless' | 'while'  | 'until' | 'rescue' ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # A list of keywords which accept an argument-like expression, i.e. have the
 | 
	
		
			
				|  |  | +  # same post-processing as method calls or commands. Example: `yield 1`,
 | 
	
		
			
				|  |  | +  # `yield (1)`, `yield(1)`, are interpreted as if `yield` was a function.
 | 
	
		
			
				|  |  | +  keyword_with_arg    = 'yield'  | 'super'  | 'not'    | 'defined?' ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # A list of keywords which accept a literal function name as an argument.
 | 
	
		
			
				|  |  | +  keyword_with_fname  = 'def'    | 'undef'  | 'alias'  ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # A list of keywords which accept an expression after them.
 | 
	
		
			
				|  |  | +  keyword_with_value  = 'else'   | 'case'   | 'ensure' | 'module' | 'elsif' | 'then'  |
 | 
	
		
			
				|  |  | +                        'for'    | 'in'     | 'do'     | 'when'   | 'begin' | 'class' |
 | 
	
		
			
				|  |  | +                        'and'    | 'or'     ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # A list of keywords which accept a value, and treat the keywords from
 | 
	
		
			
				|  |  | +  # `keyword_modifier` list as modifiers.
 | 
	
		
			
				|  |  | +  keyword_with_mid    = 'rescue' | 'return' | 'break'  | 'next'   ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # A list of keywords which do not accept an expression after them.
 | 
	
		
			
				|  |  | +  keyword_with_end    = 'end'    | 'self'   | 'true'   | 'false'  | 'retry'    |
 | 
	
		
			
				|  |  | +                        'redo'   | 'nil'    | 'BEGIN'  | 'END'    | '__FILE__' |
 | 
	
		
			
				|  |  | +                        '__LINE__' | '__ENCODING__';
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # All keywords.
 | 
	
		
			
				|  |  | +  keyword             = keyword_with_value | keyword_with_mid |
 | 
	
		
			
				|  |  | +                        keyword_with_end   | keyword_with_arg |
 | 
	
		
			
				|  |  | +                        keyword_with_fname | keyword_modifier ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  constant       = c_upper c_alnum*;
 | 
	
		
			
				|  |  | +  bareword       = c_alpha c_alnum*;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  call_or_var    = c_lower c_alnum*;
 | 
	
		
			
				|  |  | +  class_var      = '@@' bareword;
 | 
	
		
			
				|  |  | +  instance_var   = '@' bareword;
 | 
	
		
			
				|  |  | +  global_var     = '$'
 | 
	
		
			
				|  |  | +      ( bareword | digit+
 | 
	
		
			
				|  |  | +      | [`'+~*$&?!@/\\;,.=:<>"] # `
 | 
	
		
			
				|  |  | +      | '-' c_alnum
 | 
	
		
			
				|  |  | +      )
 | 
	
		
			
				|  |  | +  ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Ruby accepts (and fails on) variables with leading digit
 | 
	
		
			
				|  |  | +  # in literal context, but not in unquoted symbol body.
 | 
	
		
			
				|  |  | +  class_var_v    = '@@' c_alnum+;
 | 
	
		
			
				|  |  | +  instance_var_v = '@' c_alnum+;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  label          = bareword [?!]? ':';
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # === NUMERIC PARSING ===
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  int_hex  = ( xdigit+ '_' )* xdigit* '_'? ;
 | 
	
		
			
				|  |  | +  int_dec  = ( digit+ '_' )* digit* '_'? ;
 | 
	
		
			
				|  |  | +  int_bin  = ( [01]+ '_' )* [01]* '_'? ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  flo_int  = [1-9] [0-9]* ( '_' digit+ )* | '0';
 | 
	
		
			
				|  |  | +  flo_frac = '.' ( digit+ '_' )* digit+;
 | 
	
		
			
				|  |  | +  flo_pow  = [eE] [+\-]? ( digit+ '_' )* digit+;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  int_suffix =
 | 
	
		
			
				|  |  | +    ''   % { num_xfrm = num_xfrm_type::NONE; }
 | 
	
		
			
				|  |  | +  | 'r'  % { num_xfrm = num_xfrm_type::RATIONAL; }
 | 
	
		
			
				|  |  | +  | 'i'  % { num_xfrm = num_xfrm_type::IMAGINARY; }
 | 
	
		
			
				|  |  | +  | 'ri' % { num_xfrm = num_xfrm_type::RATIONAL_IMAGINARY; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  flo_pow_suffix =
 | 
	
		
			
				|  |  | +    ''   % { num_xfrm = num_xfrm_type::FLOAT; }
 | 
	
		
			
				|  |  | +  | 'i'  % { num_xfrm = num_xfrm_type::FLOAT_IMAGINARY; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  flo_suffix =
 | 
	
		
			
				|  |  | +    flo_pow_suffix
 | 
	
		
			
				|  |  | +  | 'r'  % { num_xfrm = num_xfrm_type::RATIONAL; }
 | 
	
		
			
				|  |  | +  | 'ri' % { num_xfrm = num_xfrm_type::RATIONAL_IMAGINARY; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # === ESCAPE SEQUENCE PARSING ===
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Escape parsing code is a Ragel pattern, not a scanner, and therefore
 | 
	
		
			
				|  |  | +  # it shouldn't directly raise errors or perform other actions with side effects.
 | 
	
		
			
				|  |  | +  # In reality this would probably just mess up error reporting in pathological
 | 
	
		
			
				|  |  | +  # cases, though.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # The amount of code required to parse \M\C stuff correctly is ridiculous.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  escaped_nl = "\\" c_nl;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  action unicode_points {
 | 
	
		
			
				|  |  | +    auto codepoint_str = tok(escape_s + 2, p - 1);
 | 
	
		
			
				|  |  | +    std::string result;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    if (split_codepoints(codepoint_str, result)) {
 | 
	
		
			
				|  |  | +      escape = std::make_unique<std::string>(result);
 | 
	
		
			
				|  |  | +    } else {
 | 
	
		
			
				|  |  | +      auto codepoint_s = escape_s + 2;
 | 
	
		
			
				|  |  | +      diagnostic_(dlevel::ERROR, dclass::UnicodePointTooLarge,
 | 
	
		
			
				|  |  | +        range(codepoint_s, codepoint_s + codepoint_str.size()));
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  action unescape_char {
 | 
	
		
			
				|  |  | +    char esc = unescape(p[-1]);
 | 
	
		
			
				|  |  | +    if (esc) {
 | 
	
		
			
				|  |  | +      escape = std::make_unique<std::string>(&esc, 1);
 | 
	
		
			
				|  |  | +    } else {
 | 
	
		
			
				|  |  | +      escape = std::make_unique<std::string>(p - 1, 1);
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
  action invalid_complex_escape {
    // Reports a malformed control or meta escape. No range is passed, so the
    // diagnostic presumably covers the current token span - confirm against
    // the diagnostic_ declarations.
    diagnostic_(dlevel::FATAL, dclass::InvalidEscape);
  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  action slash_c_char {
 | 
	
		
			
				|  |  | +    // TODO multibyte
 | 
	
		
			
				|  |  | +    char c = escape->at(0) & 0x9f;
 | 
	
		
			
				|  |  | +    escape = std::make_unique<std::string>(&c, 1);
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  action slash_m_char {
 | 
	
		
			
				|  |  | +    // TODO multibyte
 | 
	
		
			
				|  |  | +    char c = escape->at(0) | 0x80;
 | 
	
		
			
				|  |  | +    escape = std::make_unique<std::string>(&c, 1);
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  maybe_escaped_char = (
 | 
	
		
			
				|  |  | +        '\\' c_any      %unescape_char
 | 
	
		
			
				|  |  | +    | ( c_any - [\\] )  % { escape = std::make_unique<std::string>(p - 1, 1); /* TODO multibyte */ }
 | 
	
		
			
				|  |  | +  );
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  maybe_escaped_ctrl_char = ( # why?!
 | 
	
		
			
				|  |  | +        '\\' c_any      %unescape_char %slash_c_char
 | 
	
		
			
				|  |  | +    |   '?'             % { escape = std::make_unique<std::string>("\x7f"); }
 | 
	
		
			
				|  |  | +    | ( c_any - [\\?] ) % { escape = std::make_unique<std::string>(p - 1, 1); /* TODO multibyte */ } %slash_c_char
 | 
	
		
			
				|  |  | +  );
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  escape = (
 | 
	
		
			
				|  |  | +      # \377
 | 
	
		
			
				|  |  | +      [0-7]{1,3}
 | 
	
		
			
				|  |  | +      % {
 | 
	
		
			
				|  |  | +	auto esc = tok(escape_s, p);
 | 
	
		
			
				|  |  | +	char c = std::stoi(esc, nullptr, 8);
 | 
	
		
			
				|  |  | +	escape = std::make_unique<std::string>(&c, 1);
 | 
	
		
			
				|  |  | +      }
 | 
	
		
			
				|  |  | +      # \xff
 | 
	
		
			
				|  |  | +    | 'x' xdigit{1,2}
 | 
	
		
			
				|  |  | +        % {
 | 
	
		
			
				|  |  | +	  auto esc = tok(escape_s + 1, p);
 | 
	
		
			
				|  |  | +	  char c = std::stoi(esc, nullptr, 16);
 | 
	
		
			
				|  |  | +	  escape = std::make_unique<std::string>(&c, 1);
 | 
	
		
			
				|  |  | +      }
 | 
	
		
			
				|  |  | +      # \u263a
 | 
	
		
			
				|  |  | +    | 'u' xdigit{4}
 | 
	
		
			
				|  |  | +      % {
 | 
	
		
			
				|  |  | +	std::string result;
 | 
	
		
			
				|  |  | +	split_codepoints(tok(escape_s + 1, p), result);
 | 
	
		
			
				|  |  | +	escape = std::make_unique<std::string>(result);
 | 
	
		
			
				|  |  | +      }
 | 
	
		
			
				|  |  | +      # %q[\x]
 | 
	
		
			
				|  |  | +    | 'x' ( c_any - xdigit )
 | 
	
		
			
				|  |  | +      % {
 | 
	
		
			
				|  |  | +        diagnostic_(dlevel::FATAL, dclass::InvalidHexEscape, range(escape_s - 1, p + 2));
 | 
	
		
			
				|  |  | +      }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # %q[\u123] %q[\u{12]
 | 
	
		
			
				|  |  | +    | 'u' ( c_any{0,4}  -
 | 
	
		
			
				|  |  | +            xdigit{4}   -            # \u1234 is valid
 | 
	
		
			
				|  |  | +            ( '{' xdigit{1,3}        # \u{1 \u{12 \u{123 are valid
 | 
	
		
			
				|  |  | +            | '{' xdigit [ \t}] any? # \u{1. \u{1} are valid
 | 
	
		
			
				|  |  | +            | '{' xdigit{2} [ \t}]   # \u{12. \u{12} are valid
 | 
	
		
			
				|  |  | +            )
 | 
	
		
			
				|  |  | +          )
 | 
	
		
			
				|  |  | +      % {
 | 
	
		
			
				|  |  | +        diagnostic_(dlevel::FATAL, dclass::InvalidUnicodeEscape, range(escape_s - 1, p));
 | 
	
		
			
				|  |  | +      }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # \u{123 456}
 | 
	
		
			
				|  |  | +    | 'u{' ( xdigit{1,6} [ \t] )*
 | 
	
		
			
				|  |  | +      ( xdigit{1,6} '}'
 | 
	
		
			
				|  |  | +        %unicode_points
 | 
	
		
			
				|  |  | +      | ( xdigit* ( c_any - xdigit - '}' )+ '}'
 | 
	
		
			
				|  |  | +        | ( c_any - '}' )* c_eof
 | 
	
		
			
				|  |  | +        | xdigit{7,}
 | 
	
		
			
				|  |  | +        ) % {
 | 
	
		
			
				|  |  | +          diagnostic_(dlevel::FATAL, dclass::UnterminatedUnicode, range(p - 1, p));
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +      )
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # \C-\a \cx
 | 
	
		
			
				|  |  | +    | ( 'C-' | 'c' ) escaped_nl?
 | 
	
		
			
				|  |  | +      maybe_escaped_ctrl_char
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # \M-a
 | 
	
		
			
				|  |  | +    | 'M-' escaped_nl?
 | 
	
		
			
				|  |  | +      maybe_escaped_char
 | 
	
		
			
				|  |  | +      %slash_m_char
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # \C-\M-f \M-\cf \c\M-f
 | 
	
		
			
				|  |  | +    | ( ( 'C-'   | 'c' ) escaped_nl?   '\\M-'
 | 
	
		
			
				|  |  | +      |   'M-\\'         escaped_nl? ( 'C-'   | 'c' ) ) escaped_nl?
 | 
	
		
			
				|  |  | +      maybe_escaped_ctrl_char
 | 
	
		
			
				|  |  | +      %slash_m_char
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    | 'C' c_any %invalid_complex_escape
 | 
	
		
			
				|  |  | +    | 'M' c_any %invalid_complex_escape
 | 
	
		
			
				|  |  | +    | ( 'M-\\C' | 'C-\\M' ) c_any %invalid_complex_escape
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    | ( c_any - [0-7xuCMc] ) %unescape_char
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    | c_eof % {
 | 
	
		
			
				|  |  | +      diagnostic_(dlevel::FATAL, dclass::EscapeEof, range(p - 1, p));
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +  );
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Use rules in form of `e_bs escape' when you need to parse a sequence.
 | 
	
		
			
				|  |  | +  e_bs = '\\' % {
 | 
	
		
			
				|  |  | +    escape_s = p;
 | 
	
		
			
				|  |  | +    escape   = nullptr;
 | 
	
		
			
				|  |  | +  };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # === STRING AND HEREDOC PARSING ===
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Heredoc parsing is quite a complex topic. First, consider that heredocs
 | 
	
		
			
				|  |  | +  # can be arbitrarily nested. For example:
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  #     puts <<CODE
 | 
	
		
			
				|  |  | +  #     the result is: #{<<RESULT.inspect
 | 
	
		
			
				|  |  | +  #       i am a heredoc
 | 
	
		
			
				|  |  | +  #     RESULT
 | 
	
		
			
				|  |  | +  #     }
 | 
	
		
			
				|  |  | +  #     CODE
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # which, incidentally, evaluates to:
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  #     the result is: "  i am a heredoc\n"
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # To parse them, lexer refers to two kinds (remember, nested heredocs)
 | 
	
		
			
				|  |  | +  # of positions in the input stream, namely heredoc_e
 | 
	
		
			
				|  |  | +  # (HEREDOC declaration End) and @herebody_s (HEREdoc BODY line Start).
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # heredoc_e is simply contained inside the corresponding Literal, and
 | 
	
		
			
				|  |  | +  # when the heredoc is closed, the lexing is restarted from that position.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # @herebody_s is quite more complex. First, @herebody_s changes after each
 | 
	
		
			
				|  |  | +  # heredoc line is lexed. This way, at '\n' tok(@herebody_s, @te) always
 | 
	
		
			
				|  |  | +  # contains the current line, and also when a heredoc is started, @herebody_s
 | 
	
		
			
				|  |  | +  # contains the position from which the heredoc will be lexed.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # Second, as (insanity) there are nested heredocs, we need to maintain a
 | 
	
		
			
				|  |  | +  # stack of these positions. Each time #push_literal is called, it saves current
 | 
	
		
			
				|  |  | +  # @herebody_s to literal.saved_herebody_s, and after an interpolation (possibly
 | 
	
		
			
				|  |  | +  # containing other heredocs) is closed, the previous value is restored.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  e_heredoc_nl = c_nl % {
 | 
	
		
			
				|  |  | +    // After every heredoc was parsed, herebody_s contains the
 | 
	
		
			
				|  |  | +    // position of next token after all heredocs.
 | 
	
		
			
				|  |  | +    if (herebody_s) {
 | 
	
		
			
				|  |  | +      p = herebody_s;
 | 
	
		
			
				|  |  | +      herebody_s = NULL;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +  };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  action extend_string {
 | 
	
		
			
				|  |  | +    auto str = tok();
 | 
	
		
			
				|  |  | +    std::string lookahead;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    // tLABEL_END is only possible in non-cond context on >= 2.2
 | 
	
		
			
				|  |  | +    if (version >= ruby_version::RUBY_22 && !cond.active()) {
 | 
	
		
			
				|  |  | +      const char* lookahead_s = te;
 | 
	
		
			
				|  |  | +      const char* lookahead_e = te + 2;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      if (lookahead_e > eof) {
 | 
	
		
			
				|  |  | +        lookahead_e = eof;
 | 
	
		
			
				|  |  | +      }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      lookahead = std::string(lookahead_s, (size_t)(lookahead_e - lookahead_s));
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    auto& current_literal = literal_();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    if (!current_literal.heredoc() && current_literal.nest_and_try_closing(str, ts, te, lookahead)) {
 | 
	
		
			
				|  |  | +      if (token_queue.back()->type() == token_type::tLABEL_END) {
 | 
	
		
			
				|  |  | +        p += 1;
 | 
	
		
			
				|  |  | +        pop_literal();
 | 
	
		
			
				|  |  | +        fnext expr_labelarg;
 | 
	
		
			
				|  |  | +      } else {
 | 
	
		
			
				|  |  | +        fnext *pop_literal();
 | 
	
		
			
				|  |  | +      }
 | 
	
		
			
				|  |  | +      fbreak;
 | 
	
		
			
				|  |  | +    } else {
 | 
	
		
			
				|  |  | +      current_literal.extend_string(str, ts, te);
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  action extend_string_escaped {
 | 
	
		
			
				|  |  | +    auto& current_literal = literal_();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    // TODO multibyte
 | 
	
		
			
				|  |  | +    auto escaped_char = *escape_s;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    if (current_literal.munge_escape(escaped_char)) {
 | 
	
		
			
				|  |  | +      // If this particular literal uses this character as an opening
 | 
	
		
			
				|  |  | +      // or closing delimiter, it is an escape sequence for that
 | 
	
		
			
				|  |  | +      // particular character. Write it without the backslash.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      if (current_literal.regexp()
 | 
	
		
			
				|  |  | +          && (escaped_char == '\\' ||
 | 
	
		
			
				|  |  | +              escaped_char == '$'  ||
 | 
	
		
			
				|  |  | +              escaped_char == '$'  ||
 | 
	
		
			
				|  |  | +              escaped_char == '('  ||
 | 
	
		
			
				|  |  | +              escaped_char == ')'  ||
 | 
	
		
			
				|  |  | +              escaped_char == '*'  ||
 | 
	
		
			
				|  |  | +              escaped_char == '+'  ||
 | 
	
		
			
				|  |  | +              escaped_char == '.'  ||
 | 
	
		
			
				|  |  | +              escaped_char == '<'  ||
 | 
	
		
			
				|  |  | +              escaped_char == '>'  ||
 | 
	
		
			
				|  |  | +              escaped_char == '?'  ||
 | 
	
		
			
				|  |  | +              escaped_char == '['  ||
 | 
	
		
			
				|  |  | +              escaped_char == ']'  ||
 | 
	
		
			
				|  |  | +              escaped_char == '^'  ||
 | 
	
		
			
				|  |  | +              escaped_char == '{'  ||
 | 
	
		
			
				|  |  | +              escaped_char == '|'  ||
 | 
	
		
			
				|  |  | +              escaped_char == '}')) {
 | 
	
		
			
				|  |  | +        // Regular expressions should include escaped delimiters in their
 | 
	
		
			
				|  |  | +        // escaped form, except when the escaped character is
 | 
	
		
			
				|  |  | +        // a closing delimiter but not a regexp metacharacter.
 | 
	
		
			
				|  |  | +        //
 | 
	
		
			
				|  |  | +        // The backslash itself cannot be used as a closing delimiter
 | 
	
		
			
				|  |  | +        // at the same time as an escape symbol, but it is always munged,
 | 
	
		
			
				|  |  | +        // so this branch also executes for the non-closing-delimiter case
 | 
	
		
			
				|  |  | +        // for the backslash.
 | 
	
		
			
				|  |  | +        auto str = tok();
 | 
	
		
			
				|  |  | +        current_literal.extend_string(str, ts, te);
 | 
	
		
			
				|  |  | +      } else {
 | 
	
		
			
				|  |  | +        auto str = std::string(&escaped_char, 1);
 | 
	
		
			
				|  |  | +        current_literal.extend_string(str, ts, te);
 | 
	
		
			
				|  |  | +      }
 | 
	
		
			
				|  |  | +    } else {
 | 
	
		
			
				|  |  | +      // It does not. So this is an actual escape sequence, yay!
 | 
	
		
			
				|  |  | +      if (current_literal.regexp()) {
 | 
	
		
			
				|  |  | +        // Regular expressions should include escape sequences in their
 | 
	
		
			
				|  |  | +        // escaped form. On the other hand, escaped newlines are removed.
 | 
	
		
			
				|  |  | +        std::string str = gsub(tok(), "\\\n", "");
 | 
	
		
			
				|  |  | +        current_literal.extend_string(str, ts, te);
 | 
	
		
			
				|  |  | +      } else {
 | 
	
		
			
				|  |  | +        auto str = escape ? *escape : tok();
 | 
	
		
			
				|  |  | +        current_literal.extend_string(str, ts, te);
 | 
	
		
			
				|  |  | +      }
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
  # Extend a string with a newline or an EOF character.
  # As heredoc closing line can immediately precede EOF, this action
  # has to handle such case specially.
  action extend_string_eol {
    auto& current_literal = literal_();

    // NOTE(review): reaching the end of input here means the literal was
    // never closed, yet the diagnostic class is EscapeEof - confirm that this
    // is the intended class. diagnostic_ only records the diagnostic, so
    // execution falls through to the code below.
    if (te == pe) {
      diagnostic_(dlevel::FATAL, dclass::EscapeEof, range(current_literal.str_s, current_literal.str_s + 1));
    }

    if (current_literal.heredoc()) {
      auto line = tok(herebody_s, ts);

      // Drop all trailing carriage returns from the line.
      while (!line.empty() && line.back() == '\r') {
        line.pop_back();
      }

      if (version <= ruby_version::RUBY_20) {
        // See ruby:c48b4209c
        // Cuts the line at the last carriage return that is still present.
        // NOTE(review): confirm that cutting at the last rather than the
        // first carriage return matches the referenced Ruby change.
        auto riter = line.rfind('\r');

        if (riter != std::string::npos) {
          line.erase(riter);
        }
      }

      // Try ending the heredoc with the complete most recently
      // scanned line. @herebody_s always refers to the start of such line.
      if (current_literal.nest_and_try_closing(line, herebody_s, ts)) {
        herebody_s = te;

        // Continue regular lexing after the heredoc reference (<<END).
        // NOTE(review): the minus one presumably compensates for the scanner
        // advancing the pointer after this action - confirm.
        p = current_literal.heredoc_e - 1;
        fnext *pop_literal(); fbreak;
      } else {
        // Calculate indentation level for <<~HEREDOCs.
        current_literal.infer_indent_level(line);

        // Ditto.
        herebody_s = te;
      }
    } else {
      // Try ending the literal with a newline.
      auto str = tok();
      if (current_literal.nest_and_try_closing(str, ts, te)) {
        fnext *pop_literal(); fbreak;
      }

      if (herebody_s) {
        // This is a regular literal intertwined with a heredoc. Like:
        //
        //     p <<-foo+"1
        //     bar
        //     foo
        //     2"
        //
        // which, incidentally, evaluates to "bar\n1\n2".
        p = herebody_s - 1;
        herebody_s = nullptr;
      }
    }

    // Reached only when the literal was not closed above. Words literals
    // treat the line end as a separator unless the character at p is an EOF
    // codepoint; everything else gets the text appended and flushed.
    if (current_literal.words() && !eof_codepoint(*p)) {
      current_literal.extend_space(ts, te);
    } else {
      // A literal newline is appended if the heredoc was _not_ closed
      // this time (see f break above). See also Literal#nest_and_try_closing
      // for rationale of calling #flush_string here.
      std::string str = tok();
      current_literal.extend_string(str, ts, te);
      current_literal.flush_string();
    }
  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  action extend_string_space {
 | 
	
		
			
				|  |  | +    literal_().extend_space(ts, te);
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # === INTERPOLATION PARSING ===
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Interpolations with immediate variable names simply call into
 | 
	
		
			
				|  |  | +  # the corresponding machine.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  interp_var = '#' ( global_var | class_var_v | instance_var_v );
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  action extend_interp_var {  // Runs on an interp_var match: a hash followed by a global, class or instance variable.
 | 
	
		
			
				|  |  | +    auto& current_literal = literal_();
 | 
	
		
			
				|  |  | +    current_literal.flush_string();  // NOTE(review): presumably hands over string content buffered so far, before the marker token - confirm
 | 
	
		
			
				|  |  | +    current_literal.extend_content();
 | 
	
		
			
				|  |  | +    // The marker token has an empty value and spans only the hash (ts to ts + 1).
 | 
	
		
			
				|  |  | +    emit(token_type::tSTRING_DVAR, "", ts, ts + 1);
 | 
	
		
			
				|  |  | +    // Rewind to the hash, then let expr_variable lex the variable name itself.
 | 
	
		
			
				|  |  | +    p = ts;
 | 
	
		
			
				|  |  | +    fcall expr_variable;  // expr_variable emits one variable token and resumes via stack_pop()
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Interpolations with code blocks must match nested curly braces, as
 | 
	
		
			
				|  |  | +  # interpolation ending is ambiguous with a block ending. So, every
 | 
	
		
			
				|  |  | +  # opening and closing brace should be matched with e_[lr]brace rules,
 | 
	
		
			
				|  |  | +  # which automatically perform the counting.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # Note that interpolations can themselves be nested, so brace balance
 | 
	
		
			
				|  |  | +  # is tied to the innermost literal.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # Also note that literals themselves should not use e_[lr]brace rules
 | 
	
		
			
				|  |  | +  # when matching their opening and closing delimiters, as the amount of
 | 
	
		
			
				|  |  | +  # braces inside the characters of a string literal is independent.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  interp_code = '#{';
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  e_lbrace = '{' % {  // Leaving action that runs for every opening brace matched through this rule.
 | 
	
		
			
				|  |  | +    cond.push(false); cmdarg.push(false);  // same cond/cmdarg pushes as e_lbrack and e_lparen (kDO disambiguation)
 | 
	
		
			
				|  |  | +    // Inside a literal, register the brace with it so that brace counting for interpolations stays balanced.
 | 
	
		
			
				|  |  | +    if (!literal_stack.empty()) {
 | 
	
		
			
				|  |  | +      literal_().start_interp_brace();
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +  };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  e_rbrace = '}' % {  // Leaving action for a closing brace; the brace itself spans p - 1 to p, as the emitted ranges below show.
 | 
	
		
			
				|  |  | +    if (!literal_stack.empty()) {
 | 
	
		
			
				|  |  | +      auto& current_literal = literal_();
 | 
	
		
			
				|  |  | +      // NOTE(review): a true result presumably means this brace ends the interpolation rather than a nested brace pair - confirm in the literal class.
 | 
	
		
			
				|  |  | +      if (current_literal.end_interp_brace_and_try_closing()) {
 | 
	
		
			
				|  |  | +        if (version == ruby_version::RUBY_18 || version == ruby_version::RUBY_19) {
 | 
	
		
			
				|  |  | +          emit(token_type::tRCURLY, "}", p - 1, p);  // token used for Ruby 1.8 and 1.9
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +          emit(token_type::tSTRING_DEND, "}", p - 1, p);  // token used for every other version
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +        // Restore the heredoc body pointer that extend_interp_code parked in the literal, if one was saved.
 | 
	
		
			
				|  |  | +        if (current_literal.saved_herebody_s) {
 | 
	
		
			
				|  |  | +          herebody_s = current_literal.saved_herebody_s;
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +        // Step back one character and resume in the state that next_state_for_literal picks for this literal.
 | 
	
		
			
				|  |  | +        fhold;
 | 
	
		
			
				|  |  | +        fnext *next_state_for_literal(current_literal);
 | 
	
		
			
				|  |  | +        fbreak;
 | 
	
		
			
				|  |  | +      }
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +  };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  action extend_interp_code {  // Runs on an interp_code match (the two-character interpolation opener).
 | 
	
		
			
				|  |  | +    auto& current_literal = literal_();
 | 
	
		
			
				|  |  | +    current_literal.flush_string();  // NOTE(review): presumably hands over string content buffered so far, before the opener token - confirm
 | 
	
		
			
				|  |  | +    current_literal.extend_content();
 | 
	
		
			
				|  |  | +    // Emit the opener token for the interpolated code block.
 | 
	
		
			
				|  |  | +    emit(token_type::tSTRING_DBEG, "#{");
 | 
	
		
			
				|  |  | +    // For a heredoc literal, park herebody_s in the literal and clear it; e_rbrace copies it back when the interpolation closes.
 | 
	
		
			
				|  |  | +    if (current_literal.heredoc()) {
 | 
	
		
			
				|  |  | +      current_literal.saved_herebody_s = herebody_s;
 | 
	
		
			
				|  |  | +      herebody_s = nullptr;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +    // Register the opening brace of the interpolation with the literal (the same call e_lbrace makes), then lex the embedded code from expr_value.
 | 
	
		
			
				|  |  | +    current_literal.start_interp_brace();
 | 
	
		
			
				|  |  | +    command_start = true;
 | 
	
		
			
				|  |  | +    fnext expr_value;
 | 
	
		
			
				|  |  | +    fbreak;
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Actual string parsers are simply combined from the primitives defined
 | 
	
		
			
				|  |  | +  # above.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  interp_words := |*
 | 
	
		
			
				|  |  | +      interp_code => extend_interp_code;
 | 
	
		
			
				|  |  | +      interp_var  => extend_interp_var;
 | 
	
		
			
				|  |  | +      e_bs escape => extend_string_escaped;
 | 
	
		
			
				|  |  | +      c_space+    => extend_string_space;
 | 
	
		
			
				|  |  | +      c_eol       => extend_string_eol;
 | 
	
		
			
				|  |  | +      c_any       => extend_string;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  interp_string := |*
 | 
	
		
			
				|  |  | +      interp_code => extend_interp_code;
 | 
	
		
			
				|  |  | +      interp_var  => extend_interp_var;
 | 
	
		
			
				|  |  | +      e_bs escape => extend_string_escaped;
 | 
	
		
			
				|  |  | +      c_eol       => extend_string_eol;
 | 
	
		
			
				|  |  | +      c_any       => extend_string;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  plain_words := |*
 | 
	
		
			
				|  |  | +      e_bs c_any  => extend_string_escaped;
 | 
	
		
			
				|  |  | +      c_space+    => extend_string_space;
 | 
	
		
			
				|  |  | +      c_eol       => extend_string_eol;
 | 
	
		
			
				|  |  | +      c_any       => extend_string;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  plain_string := |*
 | 
	
		
			
				|  |  | +      '\\' c_nl   => extend_string_eol;
 | 
	
		
			
				|  |  | +      e_bs c_any  => extend_string_escaped;
 | 
	
		
			
				|  |  | +      c_eol       => extend_string_eol;
 | 
	
		
			
				|  |  | +      c_any       => extend_string;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  interp_backslash_delimited := |*
 | 
	
		
			
				|  |  | +      interp_code => extend_interp_code;
 | 
	
		
			
				|  |  | +      interp_var  => extend_interp_var;
 | 
	
		
			
				|  |  | +      c_eol       => extend_string_eol;
 | 
	
		
			
				|  |  | +      c_any       => extend_string;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  plain_backslash_delimited := |*
 | 
	
		
			
				|  |  | +      c_eol       => extend_string_eol;
 | 
	
		
			
				|  |  | +      c_any       => extend_string;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  interp_backslash_delimited_words := |*
 | 
	
		
			
				|  |  | +      interp_code => extend_interp_code;
 | 
	
		
			
				|  |  | +      interp_var  => extend_interp_var;
 | 
	
		
			
				|  |  | +      c_space+    => extend_string_space;
 | 
	
		
			
				|  |  | +      c_eol       => extend_string_eol;
 | 
	
		
			
				|  |  | +      c_any       => extend_string;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  plain_backslash_delimited_words := |*
 | 
	
		
			
				|  |  | +      c_space+    => extend_string_space;
 | 
	
		
			
				|  |  | +      c_eol       => extend_string_eol;
 | 
	
		
			
				|  |  | +      c_any       => extend_string;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  regexp_modifiers := |*
 | 
	
		
			
				|  |  | +      [A-Za-z]+
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        auto options = tok();  // the matched run of ASCII letters
 | 
	
		
			
				|  |  | +        std::string unknown_options;
 | 
	
		
			
				|  |  | +        // Collect every letter that is not one of i, m, x, o, u, e, s, n.
 | 
	
		
			
				|  |  | +        for (auto i = options.cbegin(); i != options.cend(); ++i) {
 | 
	
		
			
				|  |  | +          switch (char opt = *i) {
 | 
	
		
			
				|  |  | +            case 'i':
 | 
	
		
			
				|  |  | +            case 'm':
 | 
	
		
			
				|  |  | +            case 'x':
 | 
	
		
			
				|  |  | +            case 'o':
 | 
	
		
			
				|  |  | +            case 'u':
 | 
	
		
			
				|  |  | +            case 'e':
 | 
	
		
			
				|  |  | +            case 's':
 | 
	
		
			
				|  |  | +            case 'n':
 | 
	
		
			
				|  |  | +              continue;  // recognized letter: advances the enclosing for loop
 | 
	
		
			
				|  |  | +            default:
 | 
	
		
			
				|  |  | +              unknown_options += opt;
 | 
	
		
			
				|  |  | +              break;
 | 
	
		
			
				|  |  | +          }
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +        // Report all unrecognized letters together in a single error diagnostic.
 | 
	
		
			
				|  |  | +        if (!unknown_options.empty()) {
 | 
	
		
			
				|  |  | +          diagnostic_(dlevel::ERROR, dclass::RegexpOptions, unknown_options);
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +        // The token value is the full letter run, including any unrecognized letters.
 | 
	
		
			
				|  |  | +        emit(token_type::tREGEXP_OPT, options);
 | 
	
		
			
				|  |  | +        fnext expr_end;
 | 
	
		
			
				|  |  | +        fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      any
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        emit(token_type::tREGEXP_OPT, tok(ts, te - 1), ts, te - 1);  // no letters: the token range stops one character before te
 | 
	
		
			
				|  |  | +        fhold;  // give the matched character back so that expr_end lexes it
 | 
	
		
			
				|  |  | +        fgoto expr_end;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # === WHITESPACE HANDLING ===
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Various contexts in Ruby allow various kinds of whitespace
 | 
	
		
			
				|  |  | +  # to be used. They are grouped to clarify the lexing machines
 | 
	
		
			
				|  |  | +  # and ease collection of comments.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # A line of code with inline #comment at end is always equivalent
 | 
	
		
			
				|  |  | +  # to a line of code ending with just a newline, so an inline
 | 
	
		
			
				|  |  | +  # comment is deemed equivalent to non-newline whitespace
 | 
	
		
			
				|  |  | +  # (c_space character class).
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  w_space =
 | 
	
		
			
				|  |  | +      c_space+
 | 
	
		
			
				|  |  | +    | '\\' e_heredoc_nl
 | 
	
		
			
				|  |  | +    ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  w_comment =
 | 
	
		
			
				|  |  | +      '#'     %{ sharp_s = p - 1; }
 | 
	
		
			
				|  |  | +      # The (p == pe) condition compensates for added "\0" and
 | 
	
		
			
				|  |  | +      # the way Ragel handles EOF.
 | 
	
		
			
				|  |  | +      c_line* %{ emit_comment(sharp_s, p == pe ? p - 2 : p); }
 | 
	
		
			
				|  |  | +    ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  w_space_comment =
 | 
	
		
			
				|  |  | +      w_space
 | 
	
		
			
				|  |  | +    | w_comment
 | 
	
		
			
				|  |  | +    ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # A newline in non-literal context always interoperates with
 | 
	
		
			
				|  |  | +  # here document logic and can always be escaped by a backslash,
 | 
	
		
			
				|  |  | +  # still interoperating with here document logic in the same way,
 | 
	
		
			
				|  |  | +  # yet being invisible to anything else.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # To demonstrate:
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  #     foo = <<FOO \
 | 
	
		
			
				|  |  | +  #     bar
 | 
	
		
			
				|  |  | +  #     FOO
 | 
	
		
			
				|  |  | +  #      + 2
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # is equivalent to `foo = "bar\n" + 2`.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  w_newline =
 | 
	
		
			
				|  |  | +      e_heredoc_nl;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  w_any =
 | 
	
		
			
				|  |  | +      w_space
 | 
	
		
			
				|  |  | +    | w_comment
 | 
	
		
			
				|  |  | +    | w_newline
 | 
	
		
			
				|  |  | +    ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # === EXPRESSION PARSING ===
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # These rules implement a form of manually defined lookahead.
 | 
	
		
			
				|  |  | +  # The default longest-match scanning does not work here due
 | 
	
		
			
				|  |  | +  # to sheer ambiguity.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  ambiguous_fid_suffix =         # actual    parsed
 | 
	
		
			
				|  |  | +      [?!]    %{ tm = p; }     | # a?        a?
 | 
	
		
			
				|  |  | +      [?!]'=' %{ tm = p - 2; }   # a!=b      a != b
 | 
	
		
			
				|  |  | +  ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  ambiguous_ident_suffix =       # actual    parsed
 | 
	
		
			
				|  |  | +      ambiguous_fid_suffix     |
 | 
	
		
			
				|  |  | +      '='     %{ tm = p; }     | # a=        a=
 | 
	
		
			
				|  |  | +      '=='    %{ tm = p - 2; } | # a==b      a == b
 | 
	
		
			
				|  |  | +      '=~'    %{ tm = p - 2; } | # a=~b      a =~ b
 | 
	
		
			
				|  |  | +      '=>'    %{ tm = p - 2; } | # a=>b      a => b
 | 
	
		
			
				|  |  | +      '==='   %{ tm = p - 3; }   # a===b     a === b
 | 
	
		
			
				|  |  | +  ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  ambiguous_symbol_suffix =      # actual    parsed
 | 
	
		
			
				|  |  | +      ambiguous_ident_suffix |
 | 
	
		
			
				|  |  | +      '==>'   %{ tm = p - 2; }   # :a==>b    :a= => b
 | 
	
		
			
				|  |  | +  ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Ambiguous with 1.9 hash labels.
 | 
	
		
			
				|  |  | +  ambiguous_const_suffix =       # actual    parsed
 | 
	
		
			
				|  |  | +      '::'    %{ tm = p - 2; }   # A::B      A :: B
 | 
	
		
			
				|  |  | +  ;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Resolving kDO/kDO_COND/kDO_BLOCK ambiguity requires embedding
 | 
	
		
			
				|  |  | +  # @cond/@cmdarg-related code to e_lbrack, e_lparen and e_lbrace.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  e_lbrack = '[' % {
 | 
	
		
			
				|  |  | +    cond.push(false); cmdarg.push(false);
 | 
	
		
			
				|  |  | +  };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Ruby 1.9 lambdas require parentheses counting in order to
 | 
	
		
			
				|  |  | +  # emit correct opening kDO/tLBRACE.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  e_lparen = '(' % {
 | 
	
		
			
				|  |  | +    cond.push(false); cmdarg.push(false);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    paren_nest += 1;
 | 
	
		
			
				|  |  | +  };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  e_rparen = ')' % {
 | 
	
		
			
				|  |  | +    paren_nest -= 1;
 | 
	
		
			
				|  |  | +  };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Ruby is context-sensitive wrt/ local identifiers.
 | 
	
		
			
				|  |  | +  action local_ident {  // Emits the current token text as tIDENTIFIER and picks the follow-up state from is_declared.
 | 
	
		
			
				|  |  | +    auto ident = tok();
 | 
	
		
			
				|  |  | +    // The token type is always tIDENTIFIER; only the next lexer state depends on the name.
 | 
	
		
			
				|  |  | +    emit(token_type::tIDENTIFIER, ident);
 | 
	
		
			
				|  |  | +    // A name that is_declared accepts continues in expr_endfn; any other name continues in the state returned by arg_or_cmdarg.
 | 
	
		
			
				|  |  | +    if (is_declared(ident)) {
 | 
	
		
			
				|  |  | +      fnext expr_endfn; fbreak;
 | 
	
		
			
				|  |  | +    } else {
 | 
	
		
			
				|  |  | +      fnext *arg_or_cmdarg(cmd_state); fbreak;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Variable lexing code is accessed from both expressions and
 | 
	
		
			
				|  |  | +  # string interpolation related code.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  expr_variable := |*
 | 
	
		
			
				|  |  | +      global_var
 | 
	
		
			
				|  |  | +      => {  // Global variables are classified by ts[1], the character right after the leading sigil.
 | 
	
		
			
				|  |  | +        if (ts[1] >= '1' && ts[1] <= '9') {
 | 
	
		
			
				|  |  | +          emit(token_type::tNTH_REF, tok(ts + 1));  // token value starts after the sigil
 | 
	
		
			
				|  |  | +        } else if (ts[1] == '&' || ts[1] == '`' || ts[1] == '\'' || ts[1] == '+') {
 | 
	
		
			
				|  |  | +          emit(token_type::tBACK_REF);  // sigil followed by ampersand, backtick, single quote or plus
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +          emit(token_type::tGVAR);
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +        // Resume in whichever state stack_pop() yields; this machine is entered through calls such as the one in extend_interp_var.
 | 
	
		
			
				|  |  | +        fnext *stack_pop(); fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      class_var_v
 | 
	
		
			
				|  |  | +      => {  // A digit at ts[2] (presumably the first character after the two-character prefix) is reported as an error.
 | 
	
		
			
				|  |  | +        if (ts[2] >= '0' && ts[2] <= '9') {
 | 
	
		
			
				|  |  | +          diagnostic_(dlevel::ERROR, dclass::CvarName, tok(ts, te));
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +        // NOTE(review): the token is still emitted after the diagnostic unless diagnostic_ aborts - confirm.
 | 
	
		
			
				|  |  | +        emit(token_type::tCVAR);
 | 
	
		
			
				|  |  | +        fnext *stack_pop(); fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      instance_var_v
 | 
	
		
			
				|  |  | +      => {  // A digit at ts[1] (presumably the first character after the one-character prefix) is reported as an error.
 | 
	
		
			
				|  |  | +        if (ts[1] >= '0' && ts[1] <= '9') {
 | 
	
		
			
				|  |  | +          diagnostic_(dlevel::ERROR, dclass::IvarName, tok(ts, te));
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +        // NOTE(review): the token is still emitted after the diagnostic unless diagnostic_ aborts - confirm.
 | 
	
		
			
				|  |  | +        emit(token_type::tIVAR);
 | 
	
		
			
				|  |  | +        fnext *stack_pop(); fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Literal function name in definition (e.g. `def class`).
 | 
	
		
			
				|  |  | +  # Keywords are returned as their respective tokens; this is used
 | 
	
		
			
				|  |  | +  # to support singleton def `def self.foo`. Global variables are
 | 
	
		
			
				|  |  | +  # returned as `tGVAR`; this is used in global variable alias
 | 
	
		
			
				|  |  | +  # statements `alias $a $b`. Symbols are returned verbatim; this
 | 
	
		
			
				|  |  | +  # is used in `alias :a :"b#{foo}"` and `undef :a`.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # Transitions to `expr_endfn` afterwards.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  expr_fname := |*
 | 
	
		
			
				|  |  | +      keyword
 | 
	
		
			
				|  |  | +      => { emit_table(KEYWORDS_BEGIN);
 | 
	
		
			
				|  |  | +           fnext expr_endfn; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      constant
 | 
	
		
			
				|  |  | +      => { emit(token_type::tCONSTANT);
 | 
	
		
			
				|  |  | +           fnext expr_endfn; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      bareword [?=!]?
 | 
	
		
			
				|  |  | +      => { emit(token_type::tIDENTIFIER);
 | 
	
		
			
				|  |  | +           fnext expr_endfn; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      global_var
 | 
	
		
			
				|  |  | +      => { p = ts - 1;
 | 
	
		
			
				|  |  | +           fnext expr_end; fcall expr_variable; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # If the handling was to be delegated to expr_end,
 | 
	
		
			
				|  |  | +      # these cases would transition to something else than
 | 
	
		
			
				|  |  | +      # expr_endfn, which is incorrect.
 | 
	
		
			
				|  |  | +      operator_fname      |
 | 
	
		
			
				|  |  | +      operator_arithmetic |
 | 
	
		
			
				|  |  | +      operator_rest
 | 
	
		
			
				|  |  | +      => { emit_table(PUNCTUATION);
 | 
	
		
			
				|  |  | +           fnext expr_endfn; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      '::'
 | 
	
		
			
				|  |  | +      => { fhold; fhold; fgoto expr_end; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      ':'
 | 
	
		
			
				|  |  | +      => { fhold; fgoto expr_beg; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      '%s' c_any
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        if (version == ruby_version::RUBY_23) {
 | 
	
		
			
				|  |  | +          fgoto *push_literal(literal_type::LOWERS_SYMBOL, std::string(ts + 2, 1), ts);
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +          p = ts - 1;
 | 
	
		
			
				|  |  | +          fgoto expr_end;
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      w_any;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      c_any
 | 
	
		
			
				|  |  | +      => { fhold; fgoto expr_end; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      c_eof => do_eof;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # After literal function name in definition. Behaves like `expr_end`,
 | 
	
		
			
				|  |  | +  # but allows a tLABEL.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # Transitions to `expr_end` afterwards.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  expr_endfn := |*
 | 
	
		
			
				|  |  | +      label ( any - ':' )
 | 
	
		
			
				|  |  | +      => { emit(token_type::tLABEL, tok(ts, te - 2), ts, te - 1);
 | 
	
		
			
				|  |  | +           fhold; fnext expr_labelarg; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      w_space_comment;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      c_any
 | 
	
		
			
				|  |  | +      => { fhold; fgoto expr_end; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      c_eof => do_eof;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Literal function name in method call (e.g. `a.class`).
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # Transitions to `expr_arg` afterwards.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # KEEP IN SYNC WITH expr_dot_after_newline!
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  expr_dot := |*
 | 
	
		
			
				|  |  | +      constant
 | 
	
		
			
				|  |  | +      => { emit(token_type::tCONSTANT);
 | 
	
		
			
				|  |  | +           fnext *arg_or_cmdarg(cmd_state); fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      call_or_var
 | 
	
		
			
				|  |  | +      => { emit(token_type::tIDENTIFIER);
 | 
	
		
			
				|  |  | +           fnext *arg_or_cmdarg(cmd_state); fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      bareword ambiguous_fid_suffix
 | 
	
		
			
				|  |  | +      => { emit(token_type::tFID, tok(ts, tm), ts, tm);
 | 
	
		
			
				|  |  | +           fnext *arg_or_cmdarg(cmd_state); p = tm - 1; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # See the comment in `expr_fname`.
 | 
	
		
			
				|  |  | +      operator_fname      |
 | 
	
		
			
				|  |  | +      operator_arithmetic |
 | 
	
		
			
				|  |  | +      operator_rest
 | 
	
		
			
				|  |  | +      => { emit_table(PUNCTUATION);
 | 
	
		
			
				|  |  | +           fnext expr_arg; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # This breaks compatibility with Ruby for better partial parses (useful
 | 
	
		
			
				|  |  | +      # for LSP especially). See comment for expr_dot_after_newline below.
 | 
	
		
			
				|  |  | +      w_newline
 | 
	
		
			
				|  |  | +      => { fhold; fgoto expr_dot_after_newline; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      w_any;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      c_any
 | 
	
		
			
				|  |  | +      => { fhold; fgoto expr_end; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      c_eof => do_eof;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # KEEP IN SYNC WITH expr_dot!
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # This state breaks from valid Ruby syntax, but in a way that enables Sorbet
 | 
	
		
			
				|  |  | +  # to recover better from parse errors. Recovering from parse errors is
 | 
	
		
			
				|  |  | +  # important because it lets us service LSP queries faster.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # Specifically, this state makes it so that any keyword seen after w_newline
 | 
	
		
			
				|  |  | +  # is emitted as a keyword (like kEND) instead of a tIDENTIFIER. Examples:
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  #   # Valid Ruby, valid in Sorbet (no newline between '.' and 'end')
 | 
	
		
			
				|  |  | +  #   def foo
 | 
	
		
			
				|  |  | +  #     x.end
 | 
	
		
			
				|  |  | +  #   end
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  #   # Parse error in Ruby and Sorbet, but Sorbet at least sees the method def
 | 
	
		
			
				|  |  | +  #   # with an empty body (Ruby wouldn't even see an empty method def)
 | 
	
		
			
				|  |  | +  #   def foo
 | 
	
		
			
				|  |  | +  #     x.
 | 
	
		
			
				|  |  | +  #   end
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  #   # Valid Ruby, not valid in Sorbet (newline between '.' and 'end')
 | 
	
		
			
				|  |  | +  #   def foo
 | 
	
		
			
				|  |  | +  #     x.
 | 
	
		
			
				|  |  | +  #       end
 | 
	
		
			
				|  |  | +  #   end
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  expr_dot_after_newline := |*
 | 
	
		
			
				|  |  | +      constant
 | 
	
		
			
				|  |  | +      => { emit(token_type::tCONSTANT);
 | 
	
		
			
				|  |  | +           fnext *arg_or_cmdarg(cmd_state); fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # This is different from expr_dot. Here, keywords are NOT identifiers.
 | 
	
		
			
				|  |  | +      keyword
 | 
	
		
			
				|  |  | +      => { emit_table(KEYWORDS);
 | 
	
		
			
				|  |  | +           fnext expr_end; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      call_or_var
 | 
	
		
			
				|  |  | +      => { emit(token_type::tIDENTIFIER);
 | 
	
		
			
				|  |  | +           fnext *arg_or_cmdarg(cmd_state); fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      bareword ambiguous_fid_suffix
 | 
	
		
			
				|  |  | +      => { emit(token_type::tFID, tok(ts, tm), ts, tm);
 | 
	
		
			
				|  |  | +           fnext *arg_or_cmdarg(cmd_state); p = tm - 1; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # See the comment in `expr_fname`.
 | 
	
		
			
				|  |  | +      operator_fname      |
 | 
	
		
			
				|  |  | +      operator_arithmetic |
 | 
	
		
			
				|  |  | +      operator_rest
 | 
	
		
			
				|  |  | +      => { emit_table(PUNCTUATION);
 | 
	
		
			
				|  |  | +           fnext expr_arg; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      w_any;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      c_any
 | 
	
		
			
				|  |  | +      => { fhold; fgoto expr_end; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      c_eof => do_eof;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # The previous token emitted was a `tIDENTIFIER` or `tFID`; no space
 | 
	
		
			
				|  |  | +  # is consumed; the current expression is a command or method call.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  expr_arg := |*
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # COMMAND MODE SPECIFIC TOKENS
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # cmd (1 + 2)
 | 
	
		
			
				|  |  | +      # See below the rationale about expr_endarg.
 | 
	
		
			
				|  |  | +      w_space+ e_lparen
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        if (version == ruby_version::RUBY_18) {
 | 
	
		
			
				|  |  | +          emit(token_type::tLPAREN2, "(", te - 1, te);
 | 
	
		
			
				|  |  | +          fnext expr_value; fbreak;
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +          emit(token_type::tLPAREN_ARG, "(", te - 1, te);
 | 
	
		
			
				|  |  | +          fnext expr_beg; fbreak;
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # meth(1 + 2)
 | 
	
		
			
				|  |  | +      # Regular method call.
 | 
	
		
			
				|  |  | +      e_lparen
 | 
	
		
			
				|  |  | +      => { emit(token_type::tLPAREN2, "(");
 | 
	
		
			
				|  |  | +           fnext expr_beg; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # meth [...]
 | 
	
		
			
				|  |  | +      # Array argument. Compare with indexing `meth[...]`.
 | 
	
		
			
				|  |  | +      w_space+ e_lbrack
 | 
	
		
			
				|  |  | +      => { emit(token_type::tLBRACK, "[", te - 1, te);
 | 
	
		
			
				|  |  | +           fnext expr_beg; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # cmd {}
 | 
	
		
			
				|  |  | +      # Command: method call without parentheses.
 | 
	
		
			
				|  |  | +      w_space* e_lbrace
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        if (!lambda_stack.empty() && lambda_stack.top() == paren_nest) {
 | 
	
		
			
				|  |  | +          lambda_stack.pop();
 | 
	
		
			
				|  |  | +          emit(token_type::tLAMBEG, "{", te - 1, te);
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +          emit(token_type::tLCURLY, "{", te - 1, te);
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +        command_start = true;
 | 
	
		
			
				|  |  | +        fnext expr_value; fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # AMBIGUOUS TOKENS RESOLVED VIA EXPR_BEG
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # a??
 | 
	
		
			
				|  |  | +      # Ternary operator
 | 
	
		
			
				|  |  | +      '?' c_space_nl
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        // Unlike expr_beg as invoked in the next rule, do not warn
 | 
	
		
			
				|  |  | +        p = ts - 1;
 | 
	
		
			
				|  |  | +        fgoto expr_end;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # a ?b, a? ?
 | 
	
		
			
				|  |  | +      # Character literal or ternary operator
 | 
	
		
			
				|  |  | +      w_space* '?'
 | 
	
		
			
				|  |  | +      => { fhold; fgoto expr_beg; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # a %{1}, a %[1] (but not "a %=1=" or "a % foo")
 | 
	
		
			
				|  |  | +      # a /foo/ (but not "a / foo" or "a /=foo")
 | 
	
		
			
				|  |  | +      # a <<HEREDOC
 | 
	
		
			
				|  |  | +      w_space+ %{ tm = p; }
 | 
	
		
			
				|  |  | +      ( [%/] ( c_any - c_space_nl - '=' ) # /
 | 
	
		
			
				|  |  | +      | '<<'
 | 
	
		
			
				|  |  | +      )
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        if (*tm == '/') {
 | 
	
		
			
				|  |  | +          // Ambiguous regexp literal.
 | 
	
		
			
				|  |  | +          diagnostic_(dlevel::WARNING, dclass::AmbiguousLiteral, range(tm, tm + 1));
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        p = tm - 1;
 | 
	
		
			
				|  |  | +        fgoto expr_beg;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # x *1
 | 
	
		
			
				|  |  | +      # Ambiguous splat, kwsplat or block-pass.
 | 
	
		
			
				|  |  | +      w_space+ %{ tm = p; } ( '+' | '-' | '*' | '&' | '**' )
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        diagnostic_(dlevel::WARNING, dclass::AmbiguousPrefix, range(tm, te), tok(tm, te));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        p = tm - 1;
 | 
	
		
			
				|  |  | +        fgoto expr_beg;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # x ::Foo
 | 
	
		
			
				|  |  | +      # Ambiguous toplevel constant access.
 | 
	
		
			
				|  |  | +      w_space+ '::'
 | 
	
		
			
				|  |  | +      => { fhold; fhold; fgoto expr_beg; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # x:b
 | 
	
		
			
				|  |  | +      # Symbol.
 | 
	
		
			
				|  |  | +      w_space* ':'
 | 
	
		
			
				|  |  | +      => { fhold; fgoto expr_beg; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      w_space+ label
 | 
	
		
			
				|  |  | +      => { p = ts - 1; fgoto expr_beg; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # AMBIGUOUS TOKENS RESOLVED VIA EXPR_END
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # a ? b
 | 
	
		
			
				|  |  | +      # Ternary operator.
 | 
	
		
			
				|  |  | +      w_space+ %{ tm = p; } '?' c_space_nl
 | 
	
		
			
				|  |  | +      => { p = tm - 1; fgoto expr_end; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # x + 1: Binary operator or operator-assignment.
 | 
	
		
			
				|  |  | +      w_space* operator_arithmetic
 | 
	
		
			
				|  |  | +                  ( '=' | c_space_nl )?    |
 | 
	
		
			
				|  |  | +      # x rescue y: Modifier keyword.
 | 
	
		
			
				|  |  | +      w_space* keyword_modifier            |
 | 
	
		
			
				|  |  | +      # a &. b: Safe navigation operator.
 | 
	
		
			
				|  |  | +      w_space* '&.'                        |
 | 
	
		
			
				|  |  | +      # Miscellanea.
 | 
	
		
			
				|  |  | +      w_space* punctuation_end
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        p = ts - 1;
 | 
	
		
			
				|  |  | +        fgoto expr_end;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      w_space;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      w_comment
 | 
	
		
			
				|  |  | +      => { fgoto expr_end; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      w_newline
 | 
	
		
			
				|  |  | +      => { fhold; fgoto expr_end; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      c_any
 | 
	
		
			
				|  |  | +      => { fhold; fgoto expr_beg; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      c_eof => do_eof;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # The previous token was an identifier which was seen while in the
 | 
	
		
			
				|  |  | +  # command mode (that is, the state at the beginning of #advance was
 | 
	
		
			
				|  |  | +  # expr_value). This state is very similar to expr_arg, but disambiguates
 | 
	
		
			
				|  |  | +  # two very rare and specific conditions:
 | 
	
		
			
				|  |  | +  #   * In 1.8 mode, "foo (lambda do end)".
 | 
	
		
			
				|  |  | +  #   * In 1.9+ mode, "f x: -> do foo do end end".
 | 
	
		
			
				|  |  | +  expr_cmdarg := |*
 | 
	
		
			
				|  |  | +      # foo (: an opening parenthesis after whitespace is emitted as tLPAREN_ARG.
 | 
	
		
			
				|  |  | +      w_space+ e_lparen
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        emit(token_type::tLPAREN_ARG, "(", te - 1, te);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        // Every version other than 1.8 continues in expr_beg.
 | 
	
		
			
				|  |  | +        if (version != ruby_version::RUBY_18) {
 | 
	
		
			
				|  |  | +          fnext expr_beg; fbreak;
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        fnext expr_value; fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # foo do: the keyword flavour depends on whether cond is active.
 | 
	
		
			
				|  |  | +      w_space* 'do'
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        emit(cond.active() ? token_type::kDO_COND : token_type::kDO, "do", te - 2, te);
 | 
	
		
			
				|  |  | +        fnext expr_value; fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # Everything else is rewound to the token start and lexed again by expr_arg.
 | 
	
		
			
				|  |  | +      c_any             |
 | 
	
		
			
				|  |  | +      # Disambiguate with the `do' rule above.
 | 
	
		
			
				|  |  | +      w_space* bareword |
 | 
	
		
			
				|  |  | +      w_space* label
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        p = ts - 1;
 | 
	
		
			
				|  |  | +        fgoto expr_arg;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      c_eof => do_eof;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # The rationale for this state is pretty complex. Normally, if an argument
 | 
	
		
			
				|  |  | +  # is passed to a command and then there is a block (tLCURLY...tRCURLY),
 | 
	
		
			
				|  |  | +  # the block is attached to the innermost argument (`f` in `m f {}`), or it
 | 
	
		
			
				|  |  | +  # is a parse error (`m 1 {}`). But there is a special case for passing a single
 | 
	
		
			
				|  |  | +  # primary expression grouped with parentheses: if you write `m (1) {}` or
 | 
	
		
			
				|  |  | +  # (2.0 only) `m () {}`, then the block is attached to `m`.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # Thus, we recognize the opening `(` of a command (remember, a command is
 | 
	
		
			
				|  |  | +  # a method call without parens) as a tLPAREN_ARG; then, in parser, we recognize
 | 
	
		
			
				|  |  | +  # `tLPAREN_ARG expr rparen` as a `primary_expr` and before rparen, set the
 | 
	
		
			
				|  |  | +  # lexer's state to `expr_endarg`, which makes it emit the possibly following
 | 
	
		
			
				|  |  | +  # `{` as `tLBRACE_ARG`.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # The default post-`expr_endarg` state is `expr_end`, so this state also handles
 | 
	
		
			
				|  |  | +  # `do` (as `kDO_BLOCK` in `expr_beg`).
 | 
	
		
			
				|  |  | +  expr_endarg := |*
 | 
	
		
			
				|  |  | +      # An opening brace is either the body of a pending lambda or a tLBRACE_ARG.
 | 
	
		
			
				|  |  | +      e_lbrace
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        // A lambda recorded at the current paren nesting claims this brace.
 | 
	
		
			
				|  |  | +        const bool opens_lambda_body =
 | 
	
		
			
				|  |  | +            !lambda_stack.empty() && lambda_stack.top() == paren_nest;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        if (opens_lambda_body) {
 | 
	
		
			
				|  |  | +          lambda_stack.pop();
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        emit(opens_lambda_body ? token_type::tLAMBEG : token_type::tLBRACE_ARG, "{");
 | 
	
		
			
				|  |  | +        command_start = true;
 | 
	
		
			
				|  |  | +        fnext expr_value; fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      'do'
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        emit_do(true);
 | 
	
		
			
				|  |  | +        fnext expr_value; fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      w_space_comment;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # Any other character is held back and lexed by expr_end.
 | 
	
		
			
				|  |  | +      c_any
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        fhold;
 | 
	
		
			
				|  |  | +        fgoto expr_end;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      c_eof => do_eof;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # The rationale for this state is that several keywords accept value
 | 
	
		
			
				|  |  | +  # (i.e. should transition to `expr_beg`), do not accept it like a command
 | 
	
		
			
				|  |  | +  # (i.e. not an `expr_arg`), and must behave like a statement, that is,
 | 
	
		
			
				|  |  | +  # accept a modifier if/while/etc.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  expr_mid := |*
 | 
	
		
			
				|  |  | +      # A modifier keyword is emitted from the KEYWORDS table; expr_beg follows.
 | 
	
		
			
				|  |  | +      keyword_modifier
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        emit_table(KEYWORDS);
 | 
	
		
			
				|  |  | +        fnext expr_beg; fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # Any other bareword is rewound to its start and lexed again in expr_beg.
 | 
	
		
			
				|  |  | +      bareword
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        p = ts - 1;
 | 
	
		
			
				|  |  | +        fgoto expr_beg;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      w_space_comment;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # The newline is held back and left for expr_end.
 | 
	
		
			
				|  |  | +      w_newline
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        fhold;
 | 
	
		
			
				|  |  | +        fgoto expr_end;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # Everything else is held back and lexed in expr_beg.
 | 
	
		
			
				|  |  | +      c_any
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        fhold;
 | 
	
		
			
				|  |  | +        fgoto expr_beg;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      c_eof => do_eof;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Beginning of an expression.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # Don't fallthrough to this state from `c_any`; make sure to handle
 | 
	
		
			
				|  |  | +  # `c_space* c_nl` and let `expr_end` handle the newline.
 | 
	
		
			
				|  |  | +  # Otherwise code like `f\ndef x` gets glued together and the parser
 | 
	
		
			
				|  |  | +  # explodes.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  expr_beg := |*
 | 
	
		
			
				|  |  | +      # +5, -5, - 5
 | 
	
		
			
				|  |  | +      [+\-] w_any* [0-9]
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        emit(token_type::tUNARY_NUM, tok(ts, ts + 1), ts, ts + 1);
 | 
	
		
			
				|  |  | +        fhold; fnext expr_end; fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # splat *a
 | 
	
		
			
				|  |  | +      '*'
 | 
	
		
			
				|  |  | +      => { emit(token_type::tSTAR, "*");
 | 
	
		
			
				|  |  | +           fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # STRING AND REGEXP LITERALS
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # /regexp/oui
 | 
	
		
			
				|  |  | +      # /=/ (disambiguation with /=)
 | 
	
		
			
				|  |  | +      '/' c_any
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        fhold; fgoto *push_literal(literal_type::SLASH_REGEXP, std::string(ts + 0, 1), ts);
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # %<string>
 | 
	
		
			
				|  |  | +      '%' ( any - [A-Za-z] )
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        fgoto *push_literal(literal_type::PERCENT_STRING, std::string(ts + 1, 1), ts);
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # %w(we are the people)
 | 
	
		
			
				|  |  | +      '%' [A-Za-z]+ c_any
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        // Fallback used when the type letters are not recognised.
 | 
	
		
			
				|  |  | +        literal_type type = literal_type::PERCENT_STRING;
 | 
	
		
			
				|  |  | +        bool recognized = false;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        // Only a three character match (percent sign, one letter, delimiter) can be valid.
 | 
	
		
			
				|  |  | +        if (te - ts == 3) {
 | 
	
		
			
				|  |  | +          recognized = true;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +          switch (ts[1]) {
 | 
	
		
			
				|  |  | +            case 'q': type = literal_type::LOWERQ_STRING;  break;
 | 
	
		
			
				|  |  | +            case 'Q': type = literal_type::UPPERQ_STRING;  break;
 | 
	
		
			
				|  |  | +            case 'w': type = literal_type::LOWERW_WORDS;   break;
 | 
	
		
			
				|  |  | +            case 'W': type = literal_type::UPPERW_WORDS;   break;
 | 
	
		
			
				|  |  | +            case 'i': type = literal_type::LOWERI_SYMBOLS; break;
 | 
	
		
			
				|  |  | +            case 'I': type = literal_type::UPPERI_SYMBOLS; break;
 | 
	
		
			
				|  |  | +            case 's': type = literal_type::LOWERS_SYMBOL;  break;
 | 
	
		
			
				|  |  | +            case 'r': type = literal_type::PERCENT_REGEXP; break;
 | 
	
		
			
				|  |  | +            case 'x': type = literal_type::LOWERX_XSTRING; break;
 | 
	
		
			
				|  |  | +            default:  recognized = false;                  break;
 | 
	
		
			
				|  |  | +          }
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        // Unknown types are reported, then lexed as a plain percent string.
 | 
	
		
			
				|  |  | +        if (!recognized) {
 | 
	
		
			
				|  |  | +          diagnostic_(dlevel::ERROR, dclass::UnexpectedPercentStr, range(ts, te - 1), tok(ts, te-1));
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        // The delimiter is the last character matched by this rule.
 | 
	
		
			
				|  |  | +        fgoto *push_literal(type, std::string(te - 1, 1), ts);
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      '%' c_eof
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        diagnostic_(dlevel::FATAL, dclass::StringEof, range(ts, ts + 1));
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # Heredoc start.
 | 
	
		
			
				|  |  | +      # <<END  | <<'END'  | <<"END"  | <<`END`  |
 | 
	
		
			
				|  |  | +      # <<-END | <<-'END' | <<-"END" | <<-`END` |
 | 
	
		
			
				|  |  | +      # <<~END | <<~'END' | <<~"END" | <<~`END`
 | 
	
		
			
				|  |  | +      '<<' [~\-]?
 | 
	
		
			
				|  |  | +        ( '"' ( c_line - '"' )* '"'
 | 
	
		
			
				|  |  | +        | "'" ( c_line - "'" )* "'"
 | 
	
		
			
				|  |  | +        | "`" ( c_line - "`" )* "`"
 | 
	
		
			
				|  |  | +        | bareword ) % { heredoc_e      = p; }
 | 
	
		
			
				|  |  | +        c_line* c_nl % { new_herebody_s = p; }
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        bool indent;
 | 
	
		
			
				|  |  | +        bool dedent_body;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        const char* delim_s = ts + 2;
 | 
	
		
			
				|  |  | +        const char* delim_e = heredoc_e;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        if (*delim_s == '-') {
 | 
	
		
			
				|  |  | +          indent = true;
 | 
	
		
			
				|  |  | +          dedent_body = false;
 | 
	
		
			
				|  |  | +          delim_s++;
 | 
	
		
			
				|  |  | +        } else if (*delim_s == '~') {
 | 
	
		
			
				|  |  | +          indent = true;
 | 
	
		
			
				|  |  | +          dedent_body = true;
 | 
	
		
			
				|  |  | +          delim_s++;
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +          indent = false;
 | 
	
		
			
				|  |  | +          dedent_body = false;
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        literal_type type;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        if (*delim_s == '"') {
 | 
	
		
			
				|  |  | +          type = literal_type::DQUOTE_HEREDOC;
 | 
	
		
			
				|  |  | +          delim_s++;
 | 
	
		
			
				|  |  | +          delim_e--;
 | 
	
		
			
				|  |  | +        } else if (*delim_s == '\'') {
 | 
	
		
			
				|  |  | +          type = literal_type::SQUOTE_HEREDOC;
 | 
	
		
			
				|  |  | +          delim_s++;
 | 
	
		
			
				|  |  | +          delim_e--;
 | 
	
		
			
				|  |  | +        } else if (*delim_s == '`') {
 | 
	
		
			
				|  |  | +          type = literal_type::BACKTICK_HEREDOC;
 | 
	
		
			
				|  |  | +          delim_s++;
 | 
	
		
			
				|  |  | +          delim_e--;
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +          type = literal_type::DQUOTE_HEREDOC;
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        if (dedent_body && (version == ruby_version::RUBY_18 ||
 | 
	
		
			
				|  |  | +                            version == ruby_version::RUBY_19 ||
 | 
	
		
			
				|  |  | +                            version == ruby_version::RUBY_20 ||
 | 
	
		
			
				|  |  | +                            version == ruby_version::RUBY_21 ||
 | 
	
		
			
				|  |  | +                            version == ruby_version::RUBY_22)) {
 | 
	
		
			
				|  |  | +          emit(token_type::tLSHFT, "<<", ts, ts + 2);
 | 
	
		
			
				|  |  | +          p = ts + 1;
 | 
	
		
			
				|  |  | +          fnext expr_beg; fbreak;
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +          fnext *push_literal(type, std::string(delim_s, (size_t)(delim_e - delim_s)), ts, heredoc_e, indent, dedent_body);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +          if (!herebody_s) {
 | 
	
		
			
				|  |  | +            herebody_s = new_herebody_s;
 | 
	
		
			
				|  |  | +          }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +          p = herebody_s - 1;
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # SYMBOL LITERALS
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # :&&, :||
 | 
	
		
			
				|  |  | +      ':' ('&&' | '||') => {
 | 
	
		
			
				|  |  | +        fhold; fhold;
 | 
	
		
			
				|  |  | +        emit(token_type::tSYMBEG, tok(ts, ts + 1), ts, ts + 1);
 | 
	
		
			
				|  |  | +        fgoto expr_fname;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # :"bar", :'baz'
 | 
	
		
			
				|  |  | +      ':' ['"] # '
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        // The quote character following the colon selects the symbol flavour.
 | 
	
		
			
				|  |  | +        literal_type type = (ts[1] == '\'')
 | 
	
		
			
				|  |  | +            ? literal_type::SQUOTE_SYMBOL
 | 
	
		
			
				|  |  | +            : literal_type::DQUOTE_SYMBOL;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        fgoto *push_literal(type, std::string(ts + 1, 1), ts);
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # :!@ is :!
 | 
	
		
			
				|  |  | +      # :~@ is :~
 | 
	
		
			
				|  |  | +      # The rule consumes the entire symbol, so it is emitted as a complete
 | 
	
		
			
				|  |  | +      # tSYMBOL (value without the trailing @), like the symbol rules below.
 | 
	
		
			
				|  |  | +      ':' [!~] '@'
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        emit(token_type::tSYMBOL, tok(ts + 1, ts + 2), ts, te);
 | 
	
		
			
				|  |  | +        fnext expr_end; fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      ':' bareword ambiguous_symbol_suffix
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        emit(token_type::tSYMBOL, tok(ts + 1, tm), ts, tm);
 | 
	
		
			
				|  |  | +        p = tm - 1;
 | 
	
		
			
				|  |  | +        fnext expr_end; fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      ':' ( bareword | global_var | class_var | instance_var |
 | 
	
		
			
				|  |  | +            operator_fname | operator_arithmetic | operator_rest )
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        emit(token_type::tSYMBOL, tok(ts + 1), ts, te);
 | 
	
		
			
				|  |  | +        fnext expr_end; fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # AMBIGUOUS TERNARY OPERATOR
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # Character constant, like ?a, ?\n, ?\u1000, and so on
 | 
	
		
			
				|  |  | +      # Don't accept \u escape with multiple codepoints, like \u{1 2 3}
 | 
	
		
			
				|  |  | +      '?' ( e_bs ( escape - ( '\u{' (xdigit+ [ \t]+)+ xdigit+ '}' ))
 | 
	
		
			
				|  |  | +          | (c_any - c_space_nl - e_bs) % { escape = nullptr; }
 | 
	
		
			
				|  |  | +          )
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        // Value of the literal: the escape result when one is set (presumably
 | 
	
		
			
				|  |  | +        // the decoded escape sequence), otherwise the raw text after the question mark.
 | 
	
		
			
				|  |  | +        const std::string value = escape ? *escape : tok(ts + 1);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        if (version == ruby_version::RUBY_18) {
 | 
	
		
			
				|  |  | +          // 1.8 emits an integer token. Take the first byte of the value rather
 | 
	
		
			
				|  |  | +          // than ts[1], which is the backslash whenever an escape was written.
 | 
	
		
			
				|  |  | +          emit(token_type::tINTEGER, std::to_string(static_cast<unsigned char>(value[0])));
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +          emit(token_type::tCHARACTER, value);
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        fnext expr_end; fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      '?' c_space_nl
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        static const struct escape_map_ent { char c; const char* s; } escape_map[] {
 | 
	
		
			
				|  |  | +          { ' ',  "\\s" },
 | 
	
		
			
				|  |  | +          { '\r', "\\r" },
 | 
	
		
			
				|  |  | +          { '\n', "\\n" },
 | 
	
		
			
				|  |  | +          { '\t', "\\t" },
 | 
	
		
			
				|  |  | +          { '\v', "\\v" },
 | 
	
		
			
				|  |  | +          { '\f', "\\f" },
 | 
	
		
			
				|  |  | +          { 0, 0 },
 | 
	
		
			
				|  |  | +        };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        for (const struct escape_map_ent* ent = escape_map; ent->c; ++ent) {
 | 
	
		
			
				|  |  | +          if (ts[1] == ent->c) {
 | 
	
		
			
				|  |  | +            diagnostic_(dlevel::WARNING, dclass::InvalidEscapeUse, ent->s);
 | 
	
		
			
				|  |  | +            break;
 | 
	
		
			
				|  |  | +          }
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        p = ts - 1;
 | 
	
		
			
				|  |  | +        fgoto expr_end;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      '?' c_eof
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        diagnostic_(dlevel::FATAL, dclass::IncompleteEscape, range(ts, ts + 1));
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # f ?aa : b: Disambiguate with a character literal.
 | 
	
		
			
				|  |  | +      '?' [A-Za-z_] bareword
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        p = ts - 1;
 | 
	
		
			
				|  |  | +        fgoto expr_end;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # KEYWORDS AND PUNCTUATION
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # a({b=>c})
 | 
	
		
			
				|  |  | +      e_lbrace
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        // A lambda recorded at the current paren nesting claims this brace.
 | 
	
		
			
				|  |  | +        const bool opens_lambda_body =
 | 
	
		
			
				|  |  | +            !lambda_stack.empty() && lambda_stack.top() == paren_nest;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        if (!opens_lambda_body) {
 | 
	
		
			
				|  |  | +          emit(token_type::tLBRACE, "{");
 | 
	
		
			
				|  |  | +          fbreak;
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        lambda_stack.pop();
 | 
	
		
			
				|  |  | +        command_start = true;
 | 
	
		
			
				|  |  | +        emit(token_type::tLAMBEG, "{");
 | 
	
		
			
				|  |  | +        fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # a([1, 2])
 | 
	
		
			
				|  |  | +      e_lbrack
 | 
	
		
			
				|  |  | +      => { emit(token_type::tLBRACK, "[");
 | 
	
		
			
				|  |  | +           fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # a()
 | 
	
		
			
				|  |  | +      e_lparen
 | 
	
		
			
				|  |  | +      => { emit(token_type::tLPAREN, "(");
 | 
	
		
			
				|  |  | +           fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # a(+b)
 | 
	
		
			
				|  |  | +      punctuation_begin
 | 
	
		
			
				|  |  | +      => { emit_table(PUNCTUATION_BEGIN);
 | 
	
		
			
				|  |  | +           fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # rescue Exception => e: Block rescue.
 | 
	
		
			
				|  |  | +      # Special because it should transition to expr_mid.
 | 
	
		
			
				|  |  | +      'rescue' %{ tm = p; } '=>'?
 | 
	
		
			
				|  |  | +      => { emit(token_type::kRESCUE, "rescue", ts, tm);
 | 
	
		
			
				|  |  | +           p = tm - 1;
 | 
	
		
			
				|  |  | +           fnext expr_mid; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # if a: Statement if.
 | 
	
		
			
				|  |  | +      keyword_modifier
 | 
	
		
			
				|  |  | +      => { emit_table(KEYWORDS_BEGIN);
 | 
	
		
			
				|  |  | +           command_start = true;
 | 
	
		
			
				|  |  | +           fnext expr_value; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # RUBY 1.9 HASH LABELS
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      label ( any - ':' )
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        fhold;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        if (version == ruby_version::RUBY_18) {
 | 
	
		
			
				|  |  | +          auto ident = tok(ts, te - 2);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +          if (*ts >= 'A' && *ts <= 'Z') {
 | 
	
		
			
				|  |  | +            emit(token_type::tCONSTANT, ident, ts, te - 2);
 | 
	
		
			
				|  |  | +          } else {
 | 
	
		
			
				|  |  | +            emit(token_type::tIDENTIFIER, ident, ts, te - 2);
 | 
	
		
			
				|  |  | +          }
 | 
	
		
			
				|  |  | +          fhold; // continue as a symbol
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +          if (is_declared(ident)) {
 | 
	
		
			
				|  |  | +            fnext expr_end;
 | 
	
		
			
				|  |  | +          } else {
 | 
	
		
			
				|  |  | +            fnext *arg_or_cmdarg(cmd_state);
 | 
	
		
			
				|  |  | +          }
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +          emit(token_type::tLABEL, tok(ts, te - 2), ts, te - 1);
 | 
	
		
			
				|  |  | +          fnext expr_labelarg;
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # CONTEXT-DEPENDENT VARIABLE LOOKUP OR COMMAND INVOCATION
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # foo= bar:  Disambiguate with bareword rule below.
 | 
	
		
			
				|  |  | +      bareword ambiguous_ident_suffix |
 | 
	
		
			
				|  |  | +      # def foo:   Disambiguate with bareword rule below.
 | 
	
		
			
				|  |  | +      keyword
 | 
	
		
			
				|  |  | +      => { p = ts - 1;
 | 
	
		
			
				|  |  | +           fgoto expr_end; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # a = 42;     a [42]: Indexing.
 | 
	
		
			
				|  |  | +      # def a; end; a [42]: Array argument.
 | 
	
		
			
				|  |  | +      call_or_var
 | 
	
		
			
				|  |  | +      => local_ident;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      (call_or_var - keyword)
 | 
	
		
			
				|  |  | +        % { ident_tok = tok(ts, te); ident_ts = ts; ident_te = te; }
 | 
	
		
			
				|  |  | +      w_space+ '('
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        emit(token_type::tIDENTIFIER, ident_tok, ident_ts, ident_te);
 | 
	
		
			
				|  |  | +        p = ident_te - 1;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        fnext expr_cmdarg;
 | 
	
		
			
				|  |  | +        fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # WHITESPACE
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      w_any;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      e_heredoc_nl '=begin' ( c_space | c_nl_zlen )
 | 
	
		
			
				|  |  | +      => { p = ts - 1;
 | 
	
		
			
				|  |  | +           cs_before_block_comment = cs;
 | 
	
		
			
				|  |  | +           fgoto line_begin; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # DEFAULT TRANSITION
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # The following rules match most binary and all unary operators.
 | 
	
		
			
				|  |  | +      # Rules for binary operators provide better error reporting.
 | 
	
		
			
				|  |  | +      operator_arithmetic '='    |
 | 
	
		
			
				|  |  | +      operator_rest              |
 | 
	
		
			
				|  |  | +      punctuation_end            |
 | 
	
		
			
				|  |  | +      c_any
 | 
	
		
			
				|  |  | +      => { p = ts - 1; fgoto expr_end; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      c_eof => do_eof;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Special newline handling for "def a b:"
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  expr_labelarg := |*
 | 
	
		
			
				|  |  | +    w_space_comment;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    # A newline after a label: outside of kwargs lexing restarts at line_begin,
 | 
	
		
			
				|  |  | +    # inside of kwargs the newline is held back and left for expr_end.
 | 
	
		
			
				|  |  | +    w_newline
 | 
	
		
			
				|  |  | +    => {
 | 
	
		
			
				|  |  | +      if (!in_kwarg) {
 | 
	
		
			
				|  |  | +        fgoto line_begin;
 | 
	
		
			
				|  |  | +      }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      fhold;
 | 
	
		
			
				|  |  | +      fgoto expr_end;
 | 
	
		
			
				|  |  | +    };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    # Any other character is held back and lexed in expr_beg.
 | 
	
		
			
				|  |  | +    c_any
 | 
	
		
			
				|  |  | +    => {
 | 
	
		
			
				|  |  | +      fhold;
 | 
	
		
			
				|  |  | +      fgoto expr_beg;
 | 
	
		
			
				|  |  | +    };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    c_eof => do_eof;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  # Like expr_beg, but no 1.9 label or 2.2 quoted label possible.
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  expr_value := |*
 | 
	
		
			
				|  |  | +      # a:b: a(:b), a::B, A::B
 | 
	
		
			
				|  |  | +      # The whole match is rewound and lexed again by expr_end.
 | 
	
		
			
				|  |  | +      label (any - ':')
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        p = ts - 1;
 | 
	
		
			
				|  |  | +        fgoto expr_end;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # "bar", 'baz'
 | 
	
		
			
				|  |  | +      ['"] # '
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        // The opening quote selects the string flavour.
 | 
	
		
			
				|  |  | +        literal_type type = (ts[0] == '\'')
 | 
	
		
			
				|  |  | +            ? literal_type::SQUOTE_STRING
 | 
	
		
			
				|  |  | +            : literal_type::DQUOTE_STRING;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        fgoto *push_literal(type, tok(), ts);
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      w_space_comment;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # A newline restarts lexing at line_begin.
 | 
	
		
			
				|  |  | +      w_newline
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        fgoto line_begin;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # Any other character is held back and lexed in expr_beg.
 | 
	
		
			
				|  |  | +      c_any
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        fhold;
 | 
	
		
			
				|  |  | +        fgoto expr_beg;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      c_eof => do_eof;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  expr_end := |*  # Rules tried while the lexer is in the expr_end state; actions emit tokens and may switch state via fnext/fgoto/fcall.
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # STABBY LAMBDA
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # '->': emit tLAMBDA and record paren_nest on lambda_stack, then continue in expr_endfn.
 | 
	
		
			
				|  |  | +      '->'
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        emit(token_type::tLAMBDA, "->", ts, ts + 2);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        lambda_stack.push(paren_nest);
 | 
	
		
			
				|  |  | +        fnext expr_endfn; fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +      # '{' / 'do': when the top of lambda_stack equals paren_nest this opens a lambda body (tLAMBEG / kDO_LAMBDA); otherwise tLCURLY or emit_do().
 | 
	
		
			
				|  |  | +      e_lbrace | 'do'
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        if (!lambda_stack.empty() && lambda_stack.top() == paren_nest) {
 | 
	
		
			
				|  |  | +          lambda_stack.pop();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +          if (ts[0] == '{') {
 | 
	
		
			
				|  |  | +            emit(token_type::tLAMBEG, "{");
 | 
	
		
			
				|  |  | +          } else { // 'do'
 | 
	
		
			
				|  |  | +            emit(token_type::kDO_LAMBDA, "do");
 | 
	
		
			
				|  |  | +          }
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +          if (ts[0] == '{') {
 | 
	
		
			
				|  |  | +            emit(token_type::tLCURLY, "{");
 | 
	
		
			
				|  |  | +          } else { // 'do'
 | 
	
		
			
				|  |  | +            emit_do();
 | 
	
		
			
				|  |  | +          }
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +        command_start = true;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        fnext expr_value; fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # KEYWORDS
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # Most keyword rules emit through emit_table(KEYWORDS) and differ only in the state selected afterwards.
 | 
	
		
			
				|  |  | +      keyword_with_fname
 | 
	
		
			
				|  |  | +      => { emit_table(KEYWORDS);
 | 
	
		
			
				|  |  | +           fnext expr_fname; fbreak; };
 | 
	
		
			
				|  |  | +      # 'class <<' is matched as one unit but emitted as two tokens: kCLASS over the first 5 bytes, tLSHFT over the last 2.
 | 
	
		
			
				|  |  | +      'class' w_any* '<<'
 | 
	
		
			
				|  |  | +      => { emit(token_type::kCLASS, "class", ts, ts + 5);
 | 
	
		
			
				|  |  | +           emit(token_type::tLSHFT, "<<",    te - 2, te);
 | 
	
		
			
				|  |  | +           fnext expr_value; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # a if b:c: Syntax error.
 | 
	
		
			
				|  |  | +      keyword_modifier
 | 
	
		
			
				|  |  | +      => { emit_table(KEYWORDS);
 | 
	
		
			
				|  |  | +           fnext expr_beg; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # elsif b:c: elsif b(:c)
 | 
	
		
			
				|  |  | +      keyword_with_value
 | 
	
		
			
				|  |  | +      => { emit_table(KEYWORDS);
 | 
	
		
			
				|  |  | +           command_start = true;
 | 
	
		
			
				|  |  | +           fnext expr_value; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      keyword_with_mid
 | 
	
		
			
				|  |  | +      => { emit_table(KEYWORDS);
 | 
	
		
			
				|  |  | +           fnext expr_mid; fbreak; };
 | 
	
		
			
				|  |  | +      # Under RUBY_18 the three-byte keyword 'not' continues in expr_beg; every other match continues in expr_arg.
 | 
	
		
			
				|  |  | +      keyword_with_arg
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        emit_table(KEYWORDS);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        if (version == ruby_version::RUBY_18 && ts + 3 == te && ts[0] == 'n' && ts[1] == 'o' && ts[2] == 't') {
 | 
	
		
			
				|  |  | +          fnext expr_beg; fbreak;
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +          fnext expr_arg; fbreak;
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +      # Under RUBY_18 '__ENCODING__' is emitted as a plain tIDENTIFIER (next state from arg_or_cmdarg unless is_declared() is true); otherwise as k__ENCODING__.
 | 
	
		
			
				|  |  | +      '__ENCODING__'
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        if (version == ruby_version::RUBY_18) {
 | 
	
		
			
				|  |  | +          auto ident = tok();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +          emit(token_type::tIDENTIFIER, ident);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +          if (!is_declared(ident)) {
 | 
	
		
			
				|  |  | +            fnext *arg_or_cmdarg(cmd_state);
 | 
	
		
			
				|  |  | +          }
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +          emit(token_type::k__ENCODING__, "__ENCODING__");
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +        fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      keyword_with_end
 | 
	
		
			
				|  |  | +      => { emit_table(KEYWORDS);
 | 
	
		
			
				|  |  | +           fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # NUMERIC LITERALS
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # Integer literals. The embedded %{...} actions record num_base, the start of the digits (num_digits_s) and the start of the suffix (num_suffix_s).
 | 
	
		
			
				|  |  | +      ( '0' [Xx] %{ num_base = 16; num_digits_s = p; } int_hex
 | 
	
		
			
				|  |  | +      | '0' [Dd] %{ num_base = 10; num_digits_s = p; } int_dec
 | 
	
		
			
				|  |  | +      | '0' [Oo] %{ num_base = 8;  num_digits_s = p; } int_dec
 | 
	
		
			
				|  |  | +      | '0' [Bb] %{ num_base = 2;  num_digits_s = p; } int_bin
 | 
	
		
			
				|  |  | +      | [1-9] digit* '_'? %{ num_base = 10; num_digits_s = ts; } int_dec
 | 
	
		
			
				|  |  | +      | '0'   digit* '_'? %{ num_base = 8;  num_digits_s = ts; } int_dec
 | 
	
		
			
				|  |  | +      ) %{ num_suffix_s = p; } int_suffix  # for RUBY_18/19/20 the token stops at num_suffix_s and p is reset so scanning resumes there
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        auto digits = tok(num_digits_s, num_suffix_s);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        if (num_suffix_s[-1] == '_') {
 | 
	
		
			
				|  |  | +          diagnostic_(dlevel::ERROR, dclass::TrailingInNumber, range(te - 1, te), "_");
 | 
	
		
			
				|  |  | +        } else if (num_digits_s == num_suffix_s && num_base == 8 && version == ruby_version::RUBY_18) {
 | 
	
		
			
				|  |  | +          // 1.8 did not raise an error on 0o.
 | 
	
		
			
				|  |  | +        } else if (num_digits_s == num_suffix_s) {
 | 
	
		
			
				|  |  | +          diagnostic_(dlevel::ERROR, dclass::EmptyNumeric);
 | 
	
		
			
				|  |  | +        } else if (num_base == 8) {
 | 
	
		
			
				|  |  | +          for (const char* digit_p = num_digits_s; digit_p < num_suffix_s; digit_p++) {
 | 
	
		
			
				|  |  | +            if (*digit_p == '8' || *digit_p == '9') {
 | 
	
		
			
				|  |  | +              diagnostic_(dlevel::ERROR, dclass::InvalidOctal,
 | 
	
		
			
				|  |  | +                range(digit_p, digit_p + 1));
 | 
	
		
			
				|  |  | +            }
 | 
	
		
			
				|  |  | +          }
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        if (version == ruby_version::RUBY_18 || version == ruby_version::RUBY_19 || version == ruby_version::RUBY_20) {
 | 
	
		
			
				|  |  | +          emit(token_type::tINTEGER, convert_base(digits, num_base), ts, num_suffix_s);
 | 
	
		
			
				|  |  | +          p = num_suffix_s - 1;
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +          emit_num(convert_base(digits, num_base));
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +        fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +      # flo_frac with no flo_int in front: report NoDotDigitLiteral; nothing is emitted.
 | 
	
		
			
				|  |  | +      flo_frac flo_pow?
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        diagnostic_(dlevel::ERROR, dclass::NoDotDigitLiteral);
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +      # The next two rules: a number ending in a bare 'e'/'E' is TrailingInNumber for RUBY_18/19/20; otherwise the number is emitted without that byte, which is re-scanned (fhold).
 | 
	
		
			
				|  |  | +      flo_int [eE]
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        if (version == ruby_version::RUBY_18 || version == ruby_version::RUBY_19 || version == ruby_version::RUBY_20) {
 | 
	
		
			
				|  |  | +          diagnostic_(dlevel::ERROR, dclass::TrailingInNumber, range(te - 1, te), tok(te-1, te));
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +          emit(token_type::tINTEGER, tok(ts, te - 1), ts, te - 1);
 | 
	
		
			
				|  |  | +          fhold; fbreak;
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      flo_int flo_frac [eE]
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        if (version == ruby_version::RUBY_18 || version == ruby_version::RUBY_19 || version == ruby_version::RUBY_20) {
 | 
	
		
			
				|  |  | +          diagnostic_(dlevel::ERROR, dclass::TrailingInNumber, range(te - 1, te), tok(te - 1, te));
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +          emit(token_type::tFLOAT, tok(ts, te - 1), ts, te - 1);
 | 
	
		
			
				|  |  | +          fhold; fbreak;
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +      # Floats: num_suffix_s marks where the numeric text ends; RUBY_18/19/20 emit tFLOAT up to it, later versions call emit_num().
 | 
	
		
			
				|  |  | +      flo_int
 | 
	
		
			
				|  |  | +      ( flo_frac? flo_pow %{ num_suffix_s = p; } flo_pow_suffix
 | 
	
		
			
				|  |  | +      | flo_frac          %{ num_suffix_s = p; } flo_suffix
 | 
	
		
			
				|  |  | +      )
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        auto digits = tok(ts, num_suffix_s);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        if (version == ruby_version::RUBY_18 || version == ruby_version::RUBY_19 || version == ruby_version::RUBY_20) {
 | 
	
		
			
				|  |  | +          emit(token_type::tFLOAT, digits, ts, num_suffix_s);
 | 
	
		
			
				|  |  | +          p = num_suffix_s - 1;
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +          emit_num(digits);
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +        fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # STRING AND XSTRING LITERALS
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # The opening quote character selects the literal_type; push_literal() returns the state to continue in.
 | 
	
		
			
				|  |  | +      # `echo foo`, "bar", 'baz'
 | 
	
		
			
				|  |  | +      '`' | ['"] # '
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        literal_type type;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        if (ts[0] == '`') {
 | 
	
		
			
				|  |  | +          type = literal_type::BACKTICK_XSTRING;
 | 
	
		
			
				|  |  | +        } else if (ts[0] == '\'') {
 | 
	
		
			
				|  |  | +          type = literal_type::SQUOTE_STRING;
 | 
	
		
			
				|  |  | +        } else { // '"'
 | 
	
		
			
				|  |  | +          type = literal_type::DQUOTE_STRING;
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        fgoto *push_literal(type, std::string(te - 1, 1), ts, nullptr, false, false, true);
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # CONSTANTS AND VARIABLES
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      constant
 | 
	
		
			
				|  |  | +      => { emit(token_type::tCONSTANT);
 | 
	
		
			
				|  |  | +           fnext *arg_or_cmdarg(cmd_state); fbreak; };
 | 
	
		
			
				|  |  | +      # Only the bytes up to the tm mark are emitted as tCONSTANT; scanning resumes at tm.
 | 
	
		
			
				|  |  | +      constant ambiguous_const_suffix
 | 
	
		
			
				|  |  | +      => { emit(token_type::tCONSTANT, tok(ts, tm), ts, tm);
 | 
	
		
			
				|  |  | +           p = tm - 1; fbreak; };
 | 
	
		
			
				|  |  | +      # Variables are not tokenized here: p is rewound to the token start and the expr_variable scanner is called.
 | 
	
		
			
				|  |  | +      global_var | class_var_v | instance_var_v
 | 
	
		
			
				|  |  | +      => { p = ts - 1; fcall expr_variable; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # METHOD CALLS
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      '.' | '&.' | '::'
 | 
	
		
			
				|  |  | +      => { emit_table(PUNCTUATION);
 | 
	
		
			
				|  |  | +           fnext expr_dot; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      call_or_var
 | 
	
		
			
				|  |  | +      => local_ident;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      bareword ambiguous_fid_suffix
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        if (tm == te) {
 | 
	
		
			
				|  |  | +          // Suffix was consumed, e.g. foo!
 | 
	
		
			
				|  |  | +          emit(token_type::tFID);
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +          // Suffix was not consumed, e.g. foo!=
 | 
	
		
			
				|  |  | +          emit(token_type::tIDENTIFIER, tok(ts, tm), ts, tm);
 | 
	
		
			
				|  |  | +          p = tm - 1;
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +        fnext expr_arg; fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # OPERATORS
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # '*' and '=>' jump straight to expr_value with fgoto (no fbreak), unlike the rules below.
 | 
	
		
			
				|  |  | +      '*' | '=>'
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        emit_table(PUNCTUATION);
 | 
	
		
			
				|  |  | +        fgoto expr_value;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      # When '|', '~', '!', '=>' are used as operators
 | 
	
		
			
				|  |  | +      # they do not accept any symbols (or quoted labels) after.
 | 
	
		
			
				|  |  | +      # Other binary operators accept it.
 | 
	
		
			
				|  |  | +      ( operator_arithmetic | operator_rest ) - ( '|' | '~' | '!' | '*' )
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        emit_table(PUNCTUATION);
 | 
	
		
			
				|  |  | +        fnext expr_value; fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      ( e_lparen | '|' | '~' | '!' )
 | 
	
		
			
				|  |  | +      => { emit_table(PUNCTUATION);
 | 
	
		
			
				|  |  | +           fnext expr_beg; fbreak; };
 | 
	
		
			
				|  |  | +      # Closing brackets pop one level from cond and cmdarg; after '}' or ']' the next state is expr_end, after ')' no state is selected.
 | 
	
		
			
				|  |  | +      e_rbrace | e_rparen | ']'
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        emit_table(PUNCTUATION);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        cond.pop();
 | 
	
		
			
				|  |  | +        cmdarg.pop();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        if (ts[0] == '}' || ts[0] == ']') {
 | 
	
		
			
				|  |  | +          fnext expr_end;
 | 
	
		
			
				|  |  | +        } else { // ')'
 | 
	
		
			
				|  |  | +          // this was commented out in the original lexer.rl:
 | 
	
		
			
				|  |  | +          // fnext expr_endfn; ?
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        fbreak;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +      # Compound assignment: the token value is the operator text without its trailing '='.
 | 
	
		
			
				|  |  | +      operator_arithmetic '='
 | 
	
		
			
				|  |  | +      => { emit(token_type::tOP_ASGN, tok(ts, te - 1));
 | 
	
		
			
				|  |  | +           fnext expr_beg; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      '?'
 | 
	
		
			
				|  |  | +      => { emit(token_type::tEH, "?");
 | 
	
		
			
				|  |  | +           fnext expr_value; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      e_lbrack
 | 
	
		
			
				|  |  | +      => { emit(token_type::tLBRACK2, "[");
 | 
	
		
			
				|  |  | +           fnext expr_beg; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      punctuation_end
 | 
	
		
			
				|  |  | +      => { emit_table(PUNCTUATION);
 | 
	
		
			
				|  |  | +           fnext expr_beg; fbreak; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +      # WHITESPACE
 | 
	
		
			
				|  |  | +      #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      w_space_comment;
 | 
	
		
			
				|  |  | +      # A newline emits nothing here; the leading_dot scanner decides whether it becomes a tNL.
 | 
	
		
			
				|  |  | +      w_newline
 | 
	
		
			
				|  |  | +      => { fgoto leading_dot; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      ';'
 | 
	
		
			
				|  |  | +      => { emit(token_type::tSEMI, ";");
 | 
	
		
			
				|  |  | +           command_start = true;
 | 
	
		
			
				|  |  | +           fnext expr_value; fbreak; };
 | 
	
		
			
				|  |  | +      # A backslash followed by a c_line character is an error; fhold leaves that character to be scanned again.
 | 
	
		
			
				|  |  | +      '\\' c_line {
 | 
	
		
			
				|  |  | +        diagnostic_(dlevel::ERROR, dclass::BareBackslash, range(ts, ts + 1));
 | 
	
		
			
				|  |  | +        fhold;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +      # Any other character is reported as Unexpected.
 | 
	
		
			
				|  |  | +      c_any
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        diagnostic_(dlevel::ERROR, dclass::Unexpected, tok());
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      c_eof => do_eof;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  leading_dot := |*  # Entered from expr_end after a newline (see the w_newline rule there).
 | 
	
		
			
				|  |  | +      # Insane leading dots:
 | 
	
		
			
				|  |  | +      # a #comment
 | 
	
		
			
				|  |  | +      #  .b: a.b
 | 
	
		
			
				|  |  | +      c_space* %{ tm = p; } ('.' | '&.')  # tm marks the dot; scanning resumes there in expr_end, so no tNL is emitted
 | 
	
		
			
				|  |  | +      => { p = tm - 1; fgoto expr_end; };
 | 
	
		
			
				|  |  | +      # Anything else: a tNL token is emitted at newline_s, the character is put back (fhold) and line_begin is next.
 | 
	
		
			
				|  |  | +      any
 | 
	
		
			
				|  |  | +      => { emit(token_type::tNL, std::string(), newline_s, newline_s + 1);
 | 
	
		
			
				|  |  | +           fhold; fnext line_begin; fbreak; };
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +  # === EMBEDDED DOCUMENT (aka BLOCK COMMENT) PARSING ===
 | 
	
		
			
				|  |  | +  #
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  line_comment := |*  # Scans the inside of an '=begin' block; entered from line_begin with eq_begin_s set.
 | 
	
		
			
				|  |  | +      '=end' c_line* c_nl_zlen  # closing line: emit eq_begin_s..te as one comment, then jump to the state held in cs_before_block_comment
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        emit_comment(eq_begin_s, te);
 | 
	
		
			
				|  |  | +        fgoto *cs_before_block_comment;
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +      # Any other newline-terminated line inside the block is consumed without emitting anything.
 | 
	
		
			
				|  |  | +      c_line* c_nl;
 | 
	
		
			
				|  |  | +      # A line ending in zlen instead of a newline (presumably end of input) raises a FATAL EmbeddedDocument diagnostic over the opening '=begin'.
 | 
	
		
			
				|  |  | +      c_line* zlen
 | 
	
		
			
				|  |  | +      => {
 | 
	
		
			
				|  |  | +        diagnostic_(dlevel::FATAL, dclass::EmbeddedDocument,
 | 
	
		
			
				|  |  | +          range(eq_begin_s, eq_begin_s + "=begin"s.size()));
 | 
	
		
			
				|  |  | +      };
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  line_begin := |*  # Scanner other states switch to after a newline (fgoto/fnext line_begin).
 | 
	
		
			
				|  |  | +      w_any;
 | 
	
		
			
				|  |  | +      # '=begin' followed by c_space or c_nl_zlen opens a block comment; its start is remembered in eq_begin_s.
 | 
	
		
			
				|  |  | +      '=begin' ( c_space | c_nl_zlen )
 | 
	
		
			
				|  |  | +      => { eq_begin_s = ts;
 | 
	
		
			
				|  |  | +           fgoto line_comment; };
 | 
	
		
			
				|  |  | +      # '__END__' plus a line end: p jumps to pe - 3, skipping the remaining input. NOTE(review): the offset presumably depends on how the buffer is terminated - confirm.
 | 
	
		
			
				|  |  | +      '__END__' ( c_eol - zlen )
 | 
	
		
			
				|  |  | +      => { p = pe - 3; };
 | 
	
		
			
				|  |  | +      # Any other character: set cmd_state, put the character back (fhold) and continue in expr_value.
 | 
	
		
			
				|  |  | +      c_any
 | 
	
		
			
				|  |  | +      => { cmd_state = true; fhold; fgoto expr_value; };
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      c_eof => do_eof;
 | 
	
		
			
				|  |  | +  *|;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +}%%
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +// Returns the next token produced by advance_(), after recording its
 | 
	
		
			
				|  |  | +// start() and end() in last_token_s / last_token_e.
 | 
	
		
			
				|  |  | +token_t lexer::advance() {
 | 
	
		
			
				|  |  | +  // Named so that it does not hide the tok() helper used by the scanner.
 | 
	
		
			
				|  |  | +  auto next_token = advance_();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  last_token_s = next_token->start();
 | 
	
		
			
				|  |  | +  last_token_e = next_token->end();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  return next_token;
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +// Pushes a new, empty environment onto static_env.  Unlike
 | 
	
		
			
				|  |  | +// extend_dynamic(), nothing is copied from the current top entry.
 | 
	
		
			
				|  |  | +void lexer::extend_static() {
 | 
	
		
			
				|  |  | +  static_env.push(environment());
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +// Pushes a new environment that starts out as a copy of the current top
 | 
	
		
			
				|  |  | +// entry of static_env; when the stack is empty an empty environment is
 | 
	
		
			
				|  |  | +// pushed instead.
 | 
	
		
			
				|  |  | +void lexer::extend_dynamic() {
 | 
	
		
			
				|  |  | +  if (static_env.empty()) {
 | 
	
		
			
				|  |  | +    // Nothing to inherit from.
 | 
	
		
			
				|  |  | +    static_env.emplace();
 | 
	
		
			
				|  |  | +    return;
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  // push() takes the top entry by const reference and stores a copy.
 | 
	
		
			
				|  |  | +  static_env.push(static_env.top());
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +void lexer::unextend() {  // Removes the top entry of static_env.
 | 
	
		
			
				|  |  | +  static_env.pop();  // no emptiness check here - presumably always paired with an earlier extend_*() call; verify against callers
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +// Adds `name` to the environment on top of static_env.  Entries below
 | 
	
		
			
				|  |  | +// the top are left untouched.
 | 
	
		
			
				|  |  | +void lexer::declare(const std::string& name) {
 | 
	
		
			
				|  |  | +  environment& current_scope = static_env.top();
 | 
	
		
			
				|  |  | +  current_scope.insert(name);
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +// Reports whether `identifier` is present in the environment on top of
 | 
	
		
			
				|  |  | +// static_env.  Only the top entry is consulted.
 | 
	
		
			
				|  |  | +//
 | 
	
		
			
				|  |  | +// extend_dynamic() treats an empty static_env as a possible state, so this
 | 
	
		
			
				|  |  | +// query answers false in that case instead of calling top() on an empty
 | 
	
		
			
				|  |  | +// container.
 | 
	
		
			
				|  |  | +bool lexer::is_declared(const std::string& identifier) const {
 | 
	
		
			
				|  |  | +  if (static_env.empty()) {
 | 
	
		
			
				|  |  | +    return false;
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  const environment& env = static_env.top();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  return env.find(identifier) != env.end();
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +// Hands out the stored dedentLevel_ and resets it to std::nullopt in the
 | 
	
		
			
				|  |  | +// same call, so a stale value cannot be picked up by a later call.
 | 
	
		
			
				|  |  | +optional_size lexer::dedentLevel() {
 | 
	
		
			
				|  |  | +  auto taken_level = dedentLevel_;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  dedentLevel_ = std::nullopt;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  return taken_level;
 | 
	
		
			
				|  |  | +}
 |