/* Copyright (c) 2013-2016 whitequark Parts of the source are derived from ruby_parser: Copyright (c) Ryan Davis, seattle.rb This lexer is a rewrite of the original in Ragel/C: Copyright (c) Charlie Somerville, GitHub MIT License Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ %%machine lex; # % fix highlighting /* # # === BEFORE YOU START === # # Read the Ruby Hacking Guide chapter 11, available in English at # http://whitequark.org/blog/2013/04/01/ruby-hacking-guide-ch-11-finite-state-lexer/ # # Remember two things about Ragel scanners: # # 1) Longest match wins. # # 2) If two matches have the same length, the first # in source code wins. # # General rules of making Ragel and Bison happy: # # * `p` (position) and `@te` contain the index of the character # they're pointing to ("current"), plus one. `@ts` contains the index # of the corresponding character. 
The code for extracting matched token is: # # @source_buffer.slice(@ts...@te) # # * If your input is `foooooooobar` and the rule is: # # 'f' 'o'+ # # the result will be: # # foooooooobar # ^ ts=0 ^ p=te=9 # # * A Ragel lexer action should not emit more than one token, unless # you know what you are doing. # # * All Ragel commands (fnext, fgoto, ...) end with a semicolon. # # * If an action emits the token and transitions to another state, use # these Ragel commands: # # emit($whatever) # fnext $next_state; fbreak; # # If you perform `fgoto` in an action which does not emit a token nor # rewinds the stream pointer, the parser's side-effectful, # context-sensitive lookahead actions will break in a hard to detect # and debug way. # # * If an action does not emit a token: # # fgoto $next_state; # # * If an action features lookbehind, i.e. matches characters with the # intent of passing them to another action: # # p = @ts - 1 # fgoto $next_state; # # or, if the lookbehind consists of a single character: # # fhold; fgoto $next_state; # # * Ragel merges actions. So, if you have `e_lparen = '(' %act` and # `c_lparen = '('` and a lexer action `e_lparen | c_lparen`, the result # _will_ invoke the action `act`. # # e_something stands for "something with **e**mbedded action". # # * EOF is explicit and is matched by `c_eof`. 
If you want to introspect # the state of the lexer, add this rule to the state: # # c_eof => do_eof; # # * If you proceed past EOF, the lexer will complain: # # NoMethodError: undefined method `ord' for nil:NilClass # */ #include #include #include "absl/strings/numbers.h" %% write data nofinal; using namespace ruby_parser; using namespace std::string_literals; %% prepush { check_stack_capacity(); } lexer::lexer(diagnostics_t &diag, ruby_version version, const std::string& source_buffer_) : diagnostics(diag) , version(version) , source_buffer(source_buffer_ + std::string("\0\0", 2)) , cs(lex_en_line_begin) , _p(source_buffer.data()) , _pe(source_buffer.data() + source_buffer.size()) , ts(nullptr) , te(nullptr) , act(0) , top(0) , eq_begin_s(nullptr) , sharp_s(nullptr) , newline_s(nullptr) , paren_nest(0) , command_start(true) , num_base(0) , num_digits_s(nullptr) , num_suffix_s(nullptr) , num_xfrm(num_xfrm_type::NONE) , escape_s(nullptr) , herebody_s(nullptr) , in_kwarg(false) { // ensure the stack is non-empty so we can just double in // check_stack_capacity: stack.resize(16); static_env.push(environment()); cs_before_block_comment = lex_en_line_begin; } void lexer::check_stack_capacity() { if (stack.size() == (size_t)top) { stack.resize(stack.size() * 2); } } int lexer::stack_pop() { return stack[--top]; } int lexer::arg_or_cmdarg(int cmd_state) { if (cmd_state) { return lex_en_expr_cmdarg; } else { return lex_en_expr_arg; } } void lexer::emit_comment(const char* s, const char* e) { /* unused for now */ (void)s; (void)e; } std::string lexer::tok() { return tok(ts); } std::string lexer::tok(const char* start) { return tok(start, te); } std::string lexer::tok(const char* start, const char* end) { assert(start <= end); return std::string(start, (size_t)(end - start)); } char lexer::unescape(uint32_t codepoint) { switch (codepoint) { case 'a': return '\a'; case 'b': return '\b'; case 'e': return 0x1b; case 'f': return '\f'; case 'n': return '\n'; case 'r': return 
'\r';
  case 's': return ' ';
  case 't': return '\t';
  case 'v': return '\v';
  case '\\': return '\\';
  default: return '\0';
  }
}

// Operator and punctuation tokens as recognized everywhere EXCEPT at the
// beginning of an expression (see PUNCTUATION_BEGIN below for that case).
// Scanned linearly by emit_table(); the NULL entry is the end sentinel.
static const lexer::token_table_entry PUNCTUATION[] = {
  { "=",   token_type::tEQL },       { "&",   token_type::tAMPER2 },
  { "|",   token_type::tPIPE },      { "!",   token_type::tBANG },
  { "^",   token_type::tCARET },     { "+",   token_type::tPLUS },
  { "-",   token_type::tMINUS },     { "*",   token_type::tSTAR2 },
  { "/",   token_type::tDIVIDE },    { "%",   token_type::tPERCENT },
  { "~",   token_type::tTILDE },     { ",",   token_type::tCOMMA },
  { ";",   token_type::tSEMI },      { ".",   token_type::tDOT },
  { "..",  token_type::tDOT2 },      { "...", token_type::tDOT3 },
  { "[",   token_type::tLBRACK2 },   { "]",   token_type::tRBRACK },
  { "(",   token_type::tLPAREN2 },   { ")",   token_type::tRPAREN },
  { "?",   token_type::tEH },        { ":",   token_type::tCOLON },
  { "&&",  token_type::tANDOP },     { "||",  token_type::tOROP },
  { "-@",  token_type::tUMINUS },    { "+@",  token_type::tUPLUS },
  { "~@",  token_type::tTILDE },     { "**",  token_type::tPOW },
  { "->",  token_type::tLAMBDA },    { "=~",  token_type::tMATCH },
  { "!~",  token_type::tNMATCH },    { "==",  token_type::tEQ },
  { "!=",  token_type::tNEQ },       { ">",   token_type::tGT },
  { ">>",  token_type::tRSHFT },     { ">=",  token_type::tGEQ },
  { "<",   token_type::tLT },        { "<<",  token_type::tLSHFT },
  { "<=",  token_type::tLEQ },       { "=>",  token_type::tASSOC },
  { "::",  token_type::tCOLON2 },    { "===", token_type::tEQQ },
  { "<=>", token_type::tCMP },       { "[]",  token_type::tAREF },
  { "[]=", token_type::tASET },      { "{",   token_type::tLCURLY },
  { "}",   token_type::tRCURLY },    { "`",   token_type::tBACK_REF2 },
  { "!@",  token_type::tBANG },      { "&.",  token_type::tANDDOT },
  { NULL,  token_type::error },
};

// Punctuation that has a different meaning when it starts an expression
// (e.g. `*` is a splat rather than multiplication).
static const lexer::token_table_entry PUNCTUATION_BEGIN[] = {
  { "&",  token_type::tAMPER },  { "*",  token_type::tSTAR },
  { "**", token_type::tDSTAR },  { "+",  token_type::tUPLUS },
  { "-",  token_type::tUMINUS }, { "::", token_type::tCOLON3 },
  { "(",  token_type::tLPAREN }, { "{",  token_type::tLBRACE },
  { "[",  token_type::tLBRACK
},
  { NULL, token_type::error },
};

// Keywords as recognized in positions where `if`/`unless`/`while`/`until`/
// `rescue` act as statement MODIFIERS (i.e. after a complete expression).
// Compare with KEYWORDS_BEGIN below; the NULL entry is the end sentinel.
static const lexer::token_table_entry KEYWORDS[] = {
  { "if",       token_type::kIF_MOD },     { "unless",   token_type::kUNLESS_MOD },
  { "while",    token_type::kWHILE_MOD },  { "until",    token_type::kUNTIL_MOD },
  { "rescue",   token_type::kRESCUE_MOD }, { "defined?", token_type::kDEFINED },
  { "BEGIN",    token_type::klBEGIN },     { "END",      token_type::klEND },
  { "class",    token_type::kCLASS },      { "module",   token_type::kMODULE },
  { "def",      token_type::kDEF },        { "undef",    token_type::kUNDEF },
  { "begin",    token_type::kBEGIN },      { "end",      token_type::kEND },
  { "then",     token_type::kTHEN },       { "elsif",    token_type::kELSIF },
  { "else",     token_type::kELSE },       { "ensure",   token_type::kENSURE },
  { "case",     token_type::kCASE },       { "when",     token_type::kWHEN },
  { "for",      token_type::kFOR },        { "break",    token_type::kBREAK },
  { "next",     token_type::kNEXT },       { "redo",     token_type::kREDO },
  { "retry",    token_type::kRETRY },      { "in",       token_type::kIN },
  { "do",       token_type::kDO },         { "return",   token_type::kRETURN },
  { "yield",    token_type::kYIELD },      { "super",    token_type::kSUPER },
  { "self",     token_type::kSELF },       { "nil",      token_type::kNIL },
  { "true",     token_type::kTRUE },       { "false",    token_type::kFALSE },
  { "and",      token_type::kAND },        { "or",       token_type::kOR },
  { "not",      token_type::kNOT },        { "alias",    token_type::kALIAS },
  { "__FILE__", token_type::k__FILE__ },   { "__LINE__", token_type::k__LINE__ },
  { "__ENCODING__", token_type::k__ENCODING__ },
  { NULL, token_type::error },
};

// Keywords as recognized at the BEGINNING of an expression, where the
// conditional/loop keywords open a full statement instead of modifying one.
static const lexer::token_table_entry KEYWORDS_BEGIN[] = {
  { "if",       token_type::kIF },         { "unless",   token_type::kUNLESS },
  { "while",    token_type::kWHILE },      { "until",    token_type::kUNTIL },
  { "rescue",   token_type::kRESCUE },     { "defined?", token_type::kDEFINED },
  { "BEGIN",    token_type::klBEGIN },     { "END",      token_type::klEND },
  { "class",    token_type::kCLASS },      { "module",   token_type::kMODULE },
  { "def",      token_type::kDEF },        { "undef",    token_type::kUNDEF },
  { "begin",    token_type::kBEGIN },      { "end",      token_type::kEND },
  { "then",
token_type::kTHEN },       { "elsif",    token_type::kELSIF },
  { "else",     token_type::kELSE },       { "ensure",   token_type::kENSURE },
  { "case",     token_type::kCASE },       { "when",     token_type::kWHEN },
  { "for",      token_type::kFOR },        { "break",    token_type::kBREAK },
  { "next",     token_type::kNEXT },       { "redo",     token_type::kREDO },
  { "retry",    token_type::kRETRY },      { "in",       token_type::kIN },
  { "do",       token_type::kDO },         { "return",   token_type::kRETURN },
  { "yield",    token_type::kYIELD },      { "super",    token_type::kSUPER },
  { "self",     token_type::kSELF },       { "nil",      token_type::kNIL },
  { "true",     token_type::kTRUE },       { "false",    token_type::kFALSE },
  { "and",      token_type::kAND },        { "or",       token_type::kOR },
  { "not",      token_type::kNOT },        { "alias",    token_type::kALIAS },
  { "__FILE__", token_type::k__FILE__ },   { "__LINE__", token_type::k__LINE__ },
  { "__ENCODING__", token_type::k__ENCODING__ },
  { NULL, token_type::error },
};

// Encodes the Unicode scalar value `uc` as UTF-8, appending the bytes to
// `dst`.  Returns the number of bytes appended, or 0 if `uc` is negative or
// above U+10FFFF.  (Surrogate code points are not rejected by this routine.)
static size_t utf8_encode_char(int32_t uc, std::string &dst) {
  if (uc < 0x00) {
    return 0;
  } else if (uc < 0x80) {
    dst.push_back(static_cast<char>(uc));
    return 1;
  } else if (uc < 0x800) {
    dst.push_back(static_cast<char>(0xC0 + (uc >> 6)));
    dst.push_back(static_cast<char>(0x80 + (uc & 0x3F)));
    return 2;
  } else if (uc < 0x10000) {
    dst.push_back(static_cast<char>(0xE0 + (uc >> 12)));
    dst.push_back(static_cast<char>(0x80 + ((uc >> 6) & 0x3F)));
    dst.push_back(static_cast<char>(0x80 + (uc & 0x3F)));
    return 3;
  } else if (uc < 0x110000) {
    dst.push_back(static_cast<char>(0xF0 + (uc >> 18)));
    dst.push_back(static_cast<char>(0x80 + ((uc >> 12) & 0x3F)));
    dst.push_back(static_cast<char>(0x80 + ((uc >> 6) & 0x3F)));
    dst.push_back(static_cast<char>(0x80 + (uc & 0x3F)));
    return 4;
  } else {
    return 0;
  }
}

// Splits a space/tab-separated list of hex codepoints (the payload of a
// \u{...} escape) and appends their UTF-8 encodings to `output`.  Returns
// false if any codepoint fails to encode.
static bool split_codepoints(const std::string &str, std::string &output) {
  auto isspace = [](char c) { return c == ' ' || c == '\t'; };
  const char *ptr = str.c_str();

  while (*ptr) {
    while (isspace(*ptr)) ptr++;
    if (!*ptr) {
      // Only trailing whitespace remained: stop rather than feed an empty
      // string to std::stoi below (which would throw std::invalid_argument).
      break;
    }
    const char *start = ptr;
    while (*ptr && !isspace(*ptr)) ptr++;

    std::string cp {start, static_cast<size_t>(ptr - start)};
    if (utf8_encode_char(std::stoi(cp, nullptr,
16), output) == 0) return false; } return true; } static std::string gsub(const std::string&& str, const std::string&& search, const std::string&& replace) { std::string result; std::string::size_type from = 0; while (true) { auto index = str.find(search, from); if (index == std::string::npos) { result += str.substr(from); break; } else { result += str.substr(from, index - from); result += replace; from = index + search.size(); } } return result; } static bool eof_codepoint(char c) { return c == 0 || c == 0x04 || c == 0x1a; } token_t lexer::advance_() { if (!token_queue.empty()) { token_t token = token_queue.front(); token_queue.pop(); return token; } int cmd_state = command_start; command_start = false; const char* p = _p; const char* pe = _pe; const char* eof = _pe; const char* tm = NULL; const char* heredoc_e = NULL; const char* new_herebody_s = NULL; const char* ident_ts = NULL; const char* ident_te = NULL; std::string ident_tok; %% write exec; _p = p; if (!token_queue.empty()) { token_t token = token_queue.front(); token_queue.pop(); return token; } if (cs == lex_error) { size_t start = (size_t)(p - source_buffer.data()); return mempool.alloc(token_type::error, start, start + 1, std::string(p - 1, 1)); } return mempool.alloc(token_type::eof, source_buffer.size(), source_buffer.size(), ""); } void lexer::emit(token_type type) { emit(type, tok()); } void lexer::emit(token_type type, const std::string& str) { emit(type, str, ts, te); } void lexer::emit(token_type type, const std::string& str, const char* start, const char* end) { size_t offset_start = (size_t)(start - source_buffer.data()); size_t offset_end = (size_t)(end - source_buffer.data()); token_queue.push(mempool.alloc(type, offset_start, offset_end, str)); } void lexer::emit_do(bool do_block) { if (cond.active()) { emit(token_type::kDO_COND, "do"); } else if (cmdarg.active() || do_block) { emit(token_type::kDO_BLOCK, "do"); } else { emit(token_type::kDO, "do"); } } void lexer::emit_table(const 
token_table_entry* table) { auto value = tok(); for (; table->token; ++table) { if (value == table->token) { emit(table->type, value); return; } } // whitequark emits a `nil` token here, but if we do `yylex` hits an assert, // so just drop the token. return; } void lexer::emit_num(const std::string& num) { switch (num_xfrm) { case num_xfrm_type::NONE: emit(token_type::tINTEGER, num); break; case num_xfrm_type::RATIONAL: emit(token_type::tRATIONAL, num); break; case num_xfrm_type::IMAGINARY: emit(token_type::tIMAGINARY, num); break; case num_xfrm_type::RATIONAL_IMAGINARY: emit(token_type::tRATIONAL_IMAGINARY, num); break; case num_xfrm_type::FLOAT: emit(token_type::tFLOAT, num); break; case num_xfrm_type::FLOAT_IMAGINARY: emit(token_type::tFLOAT_IMAGINARY, num); break; } } std::string lexer::convert_base(const std::string& num, int num_base) { long int result; if (num_base == 10) { return num; } // This doesn't match Ruby's parsing but it is better than not handling it if (!absl::numbers_internal::safe_strtoi_base(num, &result, num_base)) { result = 0; // dmitry: appartently we assume that outer functions reported all the errors!!! } return std::to_string(result); } diagnostic::range lexer::range(const char *start, const char *end) { size_t token_start = (size_t)(start - source_buffer.data()); size_t token_end = (size_t)(end - source_buffer.data()); return diagnostic::range(token_start, token_end); } void lexer::diagnostic_(dlevel level, dclass type, const std::string &data) { diagnostics.emplace_back(level, type, range(ts, te), data); } void lexer::diagnostic_(dlevel level, dclass type, diagnostic::range &&range, const std::string &data) { diagnostics.emplace_back(level, type, range, data); } // // === LITERAL STACK === // template int lexer::push_literal(Args&&... 
args) { literal_stack.emplace(*this, std::forward(args)...); auto& literal = literal_stack.top(); return next_state_for_literal(literal); } int lexer::next_state_for_literal(literal &lit) { if (lit.words() && lit.backslash_delimited()) { if (lit.interpolate()) { return lex_en_interp_backslash_delimited_words; } else { return lex_en_plain_backslash_delimited_words; } } else if (lit.words() && !lit.backslash_delimited()) { if (lit.interpolate()) { return lex_en_interp_words; } else { return lex_en_plain_words; } } else if (!lit.words() && lit.backslash_delimited()) { if (lit.interpolate()) { return lex_en_interp_backslash_delimited; } else { return lex_en_plain_backslash_delimited; } } else { if (lit.interpolate()) { return lex_en_interp_string; } else { return lex_en_plain_string; } } } literal& lexer::literal_() { return literal_stack.top(); } int lexer::pop_literal() { bool was_regexp; { auto& old_literal = literal_stack.top(); was_regexp = old_literal.regexp(); dedentLevel_ = old_literal.dedentLevel(); } literal_stack.pop(); if (was_regexp) { return lex_en_regexp_modifiers; } else { return lex_en_expr_end; } } void lexer::set_state_expr_beg() { cs = lex_en_expr_beg; } void lexer::set_state_expr_end() { cs = lex_en_expr_end; } void lexer::set_state_expr_endarg() { cs = lex_en_expr_endarg; } void lexer::set_state_expr_fname() { cs = lex_en_expr_fname; } void lexer::set_state_expr_value() { cs = lex_en_expr_value; } %%{ # access @; # getkey (@source_pts[p] || 0); # === CHARACTER CLASSES === # # Pay close attention to the differences between c_any and any. # c_any does not include EOF and so will cause incorrect behavior # for machine subtraction (any-except rules) and default transitions # for scanners. action do_nl { // Record position of a newline for precise location reporting on tNL // tokens. // // This action is embedded directly into c_nl, as it is idempotent and // there are no cases when we need to skip it. 
newline_s = p; } c_nl = '\n' $ do_nl; c_space = [ \t\r\f\v]; c_space_nl = c_space | c_nl; c_eof = 0x04 | 0x1a | 0 | zlen; # ^D, ^Z, \0, EOF c_eol = c_nl | c_eof; c_any = any - c_eof; c_nl_zlen = c_nl | zlen; c_line = any - c_nl_zlen; c_unicode = c_any - 0x00..0x7f; c_upper = [A-Z]; c_lower = [a-z_] | c_unicode; c_alpha = c_lower | c_upper; c_alnum = c_alpha | [0-9]; action do_eof { // Sit at EOF indefinitely. #advance would return $eof each time. // This allows to feed the lexer more data if needed; this is only used // in tests. // // Note that this action is not embedded into e_eof like e_heredoc_nl and e_bs // below. This is due to the fact that scanner state at EOF is observed // by tests, and encapsulating it in a rule would break the introspection. fhold; fbreak; } # # === TOKEN DEFINITIONS === # # All operators are punctuation. There is more to punctuation # than just operators. Operators can be overridden by user; # punctuation can not. # A list of operators which are valid in the function name context, but # have different semantics in others. operator_fname = '[]' | '[]=' | '`' | '-@' | '+@' | '~@' | '!@' ; # A list of operators which can occur within an assignment shortcut (+ → +=). operator_arithmetic = '&' | '|' | '&&' | '||' | '^' | '+' | '-' | '*' | '/' | '**' | '~' | '<<' | '>>' | '%' ; # A list of all user-definable operators not covered by groups above. operator_rest = '=~' | '!~' | '==' | '!=' | '!' | '===' | '<' | '<=' | '>' | '>=' | '<=>' | '=>' ; # Note that `{` and `}` need to be referred to as e_lbrace and e_rbrace, # as they are ambiguous with interpolation `#{}` and should be counted. # These braces are not present in punctuation lists. # A list of punctuation which has different meaning when used at the # beginning of expression. punctuation_begin = '-' | '+' | '::' | '(' | '[' | '*' | '**' | '&' ; # A list of all punctuation except punctuation_begin. punctuation_end = ',' | '=' | '->' | '(' | '[' | ']' | '::' | '?' | ':' | '.' | '..' 
| '...' ; # A list of keywords which have different meaning at the beginning of expression. keyword_modifier = 'if' | 'unless' | 'while' | 'until' | 'rescue' ; # A list of keywords which accept an argument-like expression, i.e. have the # same post-processing as method calls or commands. Example: `yield 1`, # `yield (1)`, `yield(1)`, are interpreted as if `yield` was a function. keyword_with_arg = 'yield' | 'super' | 'not' | 'defined?' ; # A list of keywords which accept a literal function name as an argument. keyword_with_fname = 'def' | 'undef' | 'alias' ; # A list of keywords which accept an expression after them. keyword_with_value = 'else' | 'case' | 'ensure' | 'module' | 'elsif' | 'then' | 'for' | 'in' | 'do' | 'when' | 'begin' | 'class' | 'and' | 'or' ; # A list of keywords which accept a value, and treat the keywords from # `keyword_modifier` list as modifiers. keyword_with_mid = 'rescue' | 'return' | 'break' | 'next' ; # A list of keywords which do not accept an expression after them. keyword_with_end = 'end' | 'self' | 'true' | 'false' | 'retry' | 'redo' | 'nil' | 'BEGIN' | 'END' | '__FILE__' | '__LINE__' | '__ENCODING__'; # All keywords. keyword = keyword_with_value | keyword_with_mid | keyword_with_end | keyword_with_arg | keyword_with_fname | keyword_modifier ; constant = c_upper c_alnum*; bareword = c_alpha c_alnum*; call_or_var = c_lower c_alnum*; class_var = '@@' bareword; instance_var = '@' bareword; global_var = '$' ( bareword | digit+ | [`'+~*$&?!@/\\;,.=:<>"] # ` | '-' c_alnum ) ; # Ruby accepts (and fails on) variables with leading digit # in literal context, but not in unquoted symbol body. class_var_v = '@@' c_alnum+; instance_var_v = '@' c_alnum+; label = bareword [?!]? ':'; # # === NUMERIC PARSING === # int_hex = ( xdigit+ '_' )* xdigit* '_'? ; int_dec = ( digit+ '_' )* digit* '_'? ; int_bin = ( [01]+ '_' )* [01]* '_'? ; flo_int = [1-9] [0-9]* ( '_' digit+ )* | '0'; flo_frac = '.' ( digit+ '_' )* digit+; flo_pow = [eE] [+\-]? 
( digit+ '_' )* digit+; int_suffix = '' % { num_xfrm = num_xfrm_type::NONE; } | 'r' % { num_xfrm = num_xfrm_type::RATIONAL; } | 'i' % { num_xfrm = num_xfrm_type::IMAGINARY; } | 'ri' % { num_xfrm = num_xfrm_type::RATIONAL_IMAGINARY; }; flo_pow_suffix = '' % { num_xfrm = num_xfrm_type::FLOAT; } | 'i' % { num_xfrm = num_xfrm_type::FLOAT_IMAGINARY; }; flo_suffix = flo_pow_suffix | 'r' % { num_xfrm = num_xfrm_type::RATIONAL; } | 'ri' % { num_xfrm = num_xfrm_type::RATIONAL_IMAGINARY; }; # # === ESCAPE SEQUENCE PARSING === # # Escape parsing code is a Ragel pattern, not a scanner, and therefore # it shouldn't directly raise errors or perform other actions with side effects. # In reality this would probably just mess up error reporting in pathological # cases, through. # The amount of code required to parse \M\C stuff correctly is ridiculous. escaped_nl = "\\" c_nl; action unicode_points { auto codepoint_str = tok(escape_s + 2, p - 1); std::string result; if (split_codepoints(codepoint_str, result)) { escape = std::make_unique(result); } else { auto codepoint_s = escape_s + 2; diagnostic_(dlevel::ERROR, dclass::UnicodePointTooLarge, range(codepoint_s, codepoint_s + codepoint_str.size())); } } action unescape_char { char esc = unescape(p[-1]); if (esc) { escape = std::make_unique(&esc, 1); } else { escape = std::make_unique(p - 1, 1); } } action invalid_complex_escape { diagnostic_(dlevel::FATAL, dclass::InvalidEscape); } action slash_c_char { // TODO multibyte char c = escape->at(0) & 0x9f; escape = std::make_unique(&c, 1); } action slash_m_char { // TODO multibyte char c = escape->at(0) | 0x80; escape = std::make_unique(&c, 1); } maybe_escaped_char = ( '\\' c_any %unescape_char | ( c_any - [\\] ) % { escape = std::make_unique(p - 1, 1); /* TODO multibyte */ } ); maybe_escaped_ctrl_char = ( # why?! '\\' c_any %unescape_char %slash_c_char | '?' % { escape = std::make_unique("\x7f"); } | ( c_any - [\\?] 
) % { escape = std::make_unique(p - 1, 1); /* TODO multibyte */ } %slash_c_char ); escape = ( # \377 [0-7]{1,3} % { auto esc = tok(escape_s, p); char c = std::stoi(esc, nullptr, 8); escape = std::make_unique(&c, 1); } # \xff | 'x' xdigit{1,2} % { auto esc = tok(escape_s + 1, p); char c = std::stoi(esc, nullptr, 16); escape = std::make_unique(&c, 1); } # \u263a | 'u' xdigit{4} % { std::string result; split_codepoints(tok(escape_s + 1, p), result); escape = std::make_unique(result); } # %q[\x] | 'x' ( c_any - xdigit ) % { diagnostic_(dlevel::FATAL, dclass::InvalidHexEscape, range(escape_s - 1, p + 2)); } # %q[\u123] %q[\u{12] | 'u' ( c_any{0,4} - xdigit{4} - # \u1234 is valid ( '{' xdigit{1,3} # \u{1 \u{12 \u{123 are valid | '{' xdigit [ \t}] any? # \u{1. \u{1} are valid | '{' xdigit{2} [ \t}] # \u{12. \u{12} are valid ) ) % { diagnostic_(dlevel::FATAL, dclass::InvalidUnicodeEscape, range(escape_s - 1, p)); } # \u{123 456} | 'u{' ( xdigit{1,6} [ \t] )* ( xdigit{1,6} '}' %unicode_points | ( xdigit* ( c_any - xdigit - '}' )+ '}' | ( c_any - '}' )* c_eof | xdigit{7,} ) % { diagnostic_(dlevel::FATAL, dclass::UnterminatedUnicode, range(p - 1, p)); } ) # \C-\a \cx | ( 'C-' | 'c' ) escaped_nl? maybe_escaped_ctrl_char # \M-a | 'M-' escaped_nl? maybe_escaped_char %slash_m_char # \C-\M-f \M-\cf \c\M-f | ( ( 'C-' | 'c' ) escaped_nl? '\\M-' | 'M-\\' escaped_nl? ( 'C-' | 'c' ) ) escaped_nl? maybe_escaped_ctrl_char %slash_m_char | 'C' c_any %invalid_complex_escape | 'M' c_any %invalid_complex_escape | ( 'M-\\C' | 'C-\\M' ) c_any %invalid_complex_escape | ( c_any - [0-7xuCMc] ) %unescape_char | c_eof % { diagnostic_(dlevel::FATAL, dclass::EscapeEof, range(p - 1, p)); } ); # Use rules in form of `e_bs escape' when you need to parse a sequence. e_bs = '\\' % { escape_s = p; escape = nullptr; }; # # === STRING AND HEREDOC PARSING === # # Heredoc parsing is quite a complex topic. First, consider that heredocs # can be arbitrarily nested. 
For example: # # puts <= 2.2 if (version >= ruby_version::RUBY_22 && !cond.active()) { const char* lookahead_s = te; const char* lookahead_e = te + 2; if (lookahead_e > eof) { lookahead_e = eof; } lookahead = std::string(lookahead_s, (size_t)(lookahead_e - lookahead_s)); } auto& current_literal = literal_(); if (!current_literal.heredoc() && current_literal.nest_and_try_closing(str, ts, te, lookahead)) { if (token_queue.back()->type() == token_type::tLABEL_END) { p += 1; pop_literal(); fnext expr_labelarg; } else { fnext *pop_literal(); } fbreak; } else { current_literal.extend_string(str, ts, te); } } action extend_string_escaped { auto& current_literal = literal_(); // TODO multibyte auto escaped_char = *escape_s; if (current_literal.munge_escape(escaped_char)) { // If this particular literal uses this character as an opening // or closing delimiter, it is an escape sequence for that // particular character. Write it without the backslash. if (current_literal.regexp() && (escaped_char == '\\' || escaped_char == '$' || escaped_char == '$' || escaped_char == '(' || escaped_char == ')' || escaped_char == '*' || escaped_char == '+' || escaped_char == '.' || escaped_char == '<' || escaped_char == '>' || escaped_char == '?' || escaped_char == '[' || escaped_char == ']' || escaped_char == '^' || escaped_char == '{' || escaped_char == '|' || escaped_char == '}')) { // Regular expressions should include escaped delimiters in their // escaped form, except when the escaped character is // a closing delimiter but not a regexp metacharacter. // // The backslash itself cannot be used as a closing delimiter // at the same time as an escape symbol, but it is always munged, // so this branch also executes for the non-closing-delimiter case // for the backslash. auto str = tok(); current_literal.extend_string(str, ts, te); } else { auto str = std::string(&escaped_char, 1); current_literal.extend_string(str, ts, te); } } else { // It does not. 
So this is an actual escape sequence, yay! if (current_literal.regexp()) { // Regular expressions should include escape sequences in their // escaped form. On the other hand, escaped newlines are removed. std::string str = gsub(tok(), "\\\n", ""); current_literal.extend_string(str, ts, te); } else { auto str = escape ? *escape : tok(); current_literal.extend_string(str, ts, te); } } } # Extend a string with a newline or a EOF character. # As heredoc closing line can immediately precede EOF, this action # has to handle such case specially. action extend_string_eol { auto& current_literal = literal_(); if (te == pe) { diagnostic_(dlevel::FATAL, dclass::EscapeEof, range(current_literal.str_s, current_literal.str_s + 1)); } if (current_literal.heredoc()) { auto line = tok(herebody_s, ts); while (!line.empty() && line.back() == '\r') { line.pop_back(); } if (version <= ruby_version::RUBY_20) { // See ruby:c48b4209c auto riter = line.rfind('\r'); if (riter != std::string::npos) { line.erase(riter); } } // Try ending the heredoc with the complete most recently // scanned line. @herebody_s always refers to the start of such line. 
if (current_literal.nest_and_try_closing(line, herebody_s, ts)) { herebody_s = te; // Continue regular lexing after the heredoc reference (< extend_interp_code; interp_var => extend_interp_var; e_bs escape => extend_string_escaped; c_space+ => extend_string_space; c_eol => extend_string_eol; c_any => extend_string; *|; interp_string := |* interp_code => extend_interp_code; interp_var => extend_interp_var; e_bs escape => extend_string_escaped; c_eol => extend_string_eol; c_any => extend_string; *|; plain_words := |* e_bs c_any => extend_string_escaped; c_space+ => extend_string_space; c_eol => extend_string_eol; c_any => extend_string; *|; plain_string := |* '\\' c_nl => extend_string_eol; e_bs c_any => extend_string_escaped; c_eol => extend_string_eol; c_any => extend_string; *|; interp_backslash_delimited := |* interp_code => extend_interp_code; interp_var => extend_interp_var; c_eol => extend_string_eol; c_any => extend_string; *|; plain_backslash_delimited := |* c_eol => extend_string_eol; c_any => extend_string; *|; interp_backslash_delimited_words := |* interp_code => extend_interp_code; interp_var => extend_interp_var; c_space+ => extend_string_space; c_eol => extend_string_eol; c_any => extend_string; *|; plain_backslash_delimited_words := |* c_space+ => extend_string_space; c_eol => extend_string_eol; c_any => extend_string; *|; regexp_modifiers := |* [A-Za-z]+ => { auto options = tok(); std::string unknown_options; for (auto i = options.cbegin(); i != options.cend(); ++i) { switch (char opt = *i) { case 'i': case 'm': case 'x': case 'o': case 'u': case 'e': case 's': case 'n': continue; default: unknown_options += opt; break; } } if (!unknown_options.empty()) { diagnostic_(dlevel::ERROR, dclass::RegexpOptions, unknown_options); } emit(token_type::tREGEXP_OPT, options); fnext expr_end; fbreak; }; any => { emit(token_type::tREGEXP_OPT, tok(ts, te - 1), ts, te - 1); fhold; fgoto expr_end; }; *|; # # === WHITESPACE HANDLING === # # Various contexts in Ruby 
allow various kinds of whitespace # to be used. They are grouped to clarify the lexing machines # and ease collection of comments. # A line of code with inline #comment at end is always equivalent # to a line of code ending with just a newline, so an inline # comment is deemed equivalent to non-newline whitespace # (c_space character class). w_space = c_space+ | '\\' e_heredoc_nl ; w_comment = '#' %{ sharp_s = p - 1; } # The (p == pe) condition compensates for added "\0" and # the way Ragel handles EOF. c_line* %{ emit_comment(sharp_s, p == pe ? p - 2 : p); } ; w_space_comment = w_space | w_comment ; # A newline in non-literal context always interoperates with # here document logic and can always be escaped by a backslash, # still interoperating with here document logic in the same way, # yet being invisible to anything else. # # To demonstrate: # # foo = <' %{ tm = p - 2; } | # a=>b a => b '===' %{ tm = p - 3; } # a===b a === b ; ambiguous_symbol_suffix = # actual parsed ambiguous_ident_suffix | '==>' %{ tm = p - 2; } # :a==>b :a= => b ; # Ambiguous with 1.9 hash labels. ambiguous_const_suffix = # actual parsed '::' %{ tm = p - 2; } # A::B A :: B ; # Resolving kDO/kDO_COND/kDO_BLOCK ambiguity requires embedding # @cond/@cmdarg-related code to e_lbrack, e_lparen and e_lbrace. e_lbrack = '[' % { cond.push(false); cmdarg.push(false); }; # Ruby 1.9 lambdas require parentheses counting in order to # emit correct opening kDO/tLBRACE. e_lparen = '(' % { cond.push(false); cmdarg.push(false); paren_nest += 1; }; e_rparen = ')' % { paren_nest -= 1; }; # Ruby is context-sensitive wrt/ local identifiers. action local_ident { auto ident = tok(); emit(token_type::tIDENTIFIER, ident); if (is_declared(ident)) { fnext expr_endfn; fbreak; } else { fnext *arg_or_cmdarg(cmd_state); fbreak; } } # Variable lexing code is accessed from both expressions and # string interpolation related code. 
# expr_variable := |* global_var => { if (ts[1] >= '1' && ts[1] <= '9') { emit(token_type::tNTH_REF, tok(ts + 1)); } else if (ts[1] == '&' || ts[1] == '`' || ts[1] == '\'' || ts[1] == '+') { emit(token_type::tBACK_REF); } else { emit(token_type::tGVAR); } fnext *stack_pop(); fbreak; }; class_var_v => { if (ts[2] >= '0' && ts[2] <= '9') { diagnostic_(dlevel::ERROR, dclass::CvarName, tok(ts, te)); } emit(token_type::tCVAR); fnext *stack_pop(); fbreak; }; instance_var_v => { if (ts[1] >= '0' && ts[1] <= '9') { diagnostic_(dlevel::ERROR, dclass::IvarName, tok(ts, te)); } emit(token_type::tIVAR); fnext *stack_pop(); fbreak; }; *|; # Literal function name in definition (e.g. `def class`). # Keywords are returned as their respective tokens; this is used # to support singleton def `def self.foo`. Global variables are # returned as `tGVAR`; this is used in global variable alias # statements `alias $a $b`. Symbols are returned verbatim; this # is used in `alias :a :"b#{foo}"` and `undef :a`. # # Transitions to `expr_endfn` afterwards. # expr_fname := |* keyword => { emit_table(KEYWORDS_BEGIN); fnext expr_endfn; fbreak; }; constant => { emit(token_type::tCONSTANT); fnext expr_endfn; fbreak; }; bareword [?=!]? => { emit(token_type::tIDENTIFIER); fnext expr_endfn; fbreak; }; global_var => { p = ts - 1; fnext expr_end; fcall expr_variable; }; # If the handling was to be delegated to expr_end, # these cases would transition to something else than # expr_endfn, which is incorrect. operator_fname | operator_arithmetic | operator_rest => { emit_table(PUNCTUATION); fnext expr_endfn; fbreak; }; '::' => { fhold; fhold; fgoto expr_end; }; ':' => { fhold; fgoto expr_beg; }; '%s' c_any => { if (version == ruby_version::RUBY_23) { fgoto *push_literal(literal_type::LOWERS_SYMBOL, std::string(ts + 2, 1), ts); } else { p = ts - 1; fgoto expr_end; } }; w_any; c_any => { fhold; fgoto expr_end; }; c_eof => do_eof; *|; # After literal function name in definition. 
Behaves like `expr_end`, # but allows a tLABEL. # # Transitions to `expr_end` afterwards. # expr_endfn := |* label ( any - ':' ) => { emit(token_type::tLABEL, tok(ts, te - 2), ts, te - 1); fhold; fnext expr_labelarg; fbreak; }; w_space_comment; c_any => { fhold; fgoto expr_end; }; c_eof => do_eof; *|; # Literal function name in method call (e.g. `a.class`). # # Transitions to `expr_arg` afterwards. # # KEEP IN SYNC WITH expr_dot_after_newline! # expr_dot := |* constant => { emit(token_type::tCONSTANT); fnext *arg_or_cmdarg(cmd_state); fbreak; }; call_or_var => { emit(token_type::tIDENTIFIER); fnext *arg_or_cmdarg(cmd_state); fbreak; }; bareword ambiguous_fid_suffix => { emit(token_type::tFID, tok(ts, tm), ts, tm); fnext *arg_or_cmdarg(cmd_state); p = tm - 1; fbreak; }; # See the comment in `expr_fname`. operator_fname | operator_arithmetic | operator_rest => { emit_table(PUNCTUATION); fnext expr_arg; fbreak; }; # This breaks compatibility with Ruby for better partial parses (useful # for LSP especially). See comment for expr_dot_after_newline below. w_newline => { fhold; fgoto expr_dot_after_newline; }; w_any; c_any => { fhold; fgoto expr_end; }; c_eof => do_eof; *|; # KEEP IN SYNC WITH expr_dot! # # This state breaks from valid Ruby syntax, but in a way that enables Sorbet # to recover better from parse errors. Recovering from parse errors is # important because it lets us service LSP queries faster. # # Specifically, this state makes is so that any keyword seen after w_newline # is emitted as a keyword (like kEND) instead of a tIDENTIFIER. Examples: # # # Valid Ruby, valid in Sorbet (no newline between '.' and 'end') # def foo # x.end # end # # # Parse error in Ruby and Sorbet, but Sorbet at least sees the method def # # with an empty body (Ruby wouldn't even see an empty method def) # def foo # x. # end # # # Valid Ruby, not valid in Sorbet (newline between '.' and 'end') # def foo # x. 
# end # end # expr_dot_after_newline := |* constant => { emit(token_type::tCONSTANT); fnext *arg_or_cmdarg(cmd_state); fbreak; }; # This is different from expr_dot. Here, keywords are NOT identifiers. keyword => { emit_table(KEYWORDS); fnext expr_end; fbreak; }; call_or_var => { emit(token_type::tIDENTIFIER); fnext *arg_or_cmdarg(cmd_state); fbreak; }; bareword ambiguous_fid_suffix => { emit(token_type::tFID, tok(ts, tm), ts, tm); fnext *arg_or_cmdarg(cmd_state); p = tm - 1; fbreak; }; # See the comment in `expr_fname`. operator_fname | operator_arithmetic | operator_rest => { emit_table(PUNCTUATION); fnext expr_arg; fbreak; }; w_any; c_any => { fhold; fgoto expr_end; }; c_eof => do_eof; *|; # The previous token emitted was a `tIDENTIFIER` or `tFID`; no space # is consumed; the current expression is a command or method call. # expr_arg := |* # # COMMAND MODE SPECIFIC TOKENS # # cmd (1 + 2) # See below the rationale about expr_endarg. w_space+ e_lparen => { if (version == ruby_version::RUBY_18) { emit(token_type::tLPAREN2, "(", te - 1, te); fnext expr_value; fbreak; } else { emit(token_type::tLPAREN_ARG, "(", te - 1, te); fnext expr_beg; fbreak; } }; # meth(1 + 2) # Regular method call. e_lparen => { emit(token_type::tLPAREN2, "("); fnext expr_beg; fbreak; }; # meth [...] # Array argument. Compare with indexing `meth[...]`. w_space+ e_lbrack => { emit(token_type::tLBRACK, "[", te - 1, te); fnext expr_beg; fbreak; }; # cmd {} # Command: method call without parentheses. w_space* e_lbrace => { if (!lambda_stack.empty() && lambda_stack.top() == paren_nest) { lambda_stack.pop(); emit(token_type::tLAMBEG, "{", te - 1, te); } else { emit(token_type::tLCURLY, "{", te - 1, te); } command_start = true; fnext expr_value; fbreak; }; # # AMBIGUOUS TOKENS RESOLVED VIA EXPR_BEG # # a?? # Ternary operator '?' c_space_nl => { // Unlike expr_beg as invoked in the next rule, do not warn p = ts - 1; fgoto expr_end; }; # a ?b, a? ? # Character literal or ternary operator w_space* '?' 
=> { fhold; fgoto expr_beg; }; # a %{1}, a %[1] (but not "a %=1=" or "a % foo") # a /foo/ (but not "a / foo" or "a /=foo") # a < { if (*tm == '/') { // Ambiguous regexp literal. diagnostic_(dlevel::WARNING, dclass::AmbiguousLiteral, range(tm, tm + 1)); } p = tm - 1; fgoto expr_beg; }; # x *1 # Ambiguous splat, kwsplat or block-pass. w_space+ %{ tm = p; } ( '+' | '-' | '*' | '&' | '**' ) => { diagnostic_(dlevel::WARNING, dclass::AmbiguousPrefix, range(tm, te), tok(tm, te)); p = tm - 1; fgoto expr_beg; }; # x ::Foo # Ambiguous toplevel constant access. w_space+ '::' => { fhold; fhold; fgoto expr_beg; }; # x:b # Symbol. w_space* ':' => { fhold; fgoto expr_beg; }; w_space+ label => { p = ts - 1; fgoto expr_beg; }; # # AMBIGUOUS TOKENS RESOLVED VIA EXPR_END # # a ? b # Ternary operator. w_space+ %{ tm = p; } '?' c_space_nl => { p = tm - 1; fgoto expr_end; }; # x + 1: Binary operator or operator-assignment. w_space* operator_arithmetic ( '=' | c_space_nl )? | # x rescue y: Modifier keyword. w_space* keyword_modifier | # a &. b: Safe navigation operator. w_space* '&.' | # Miscellanea. w_space* punctuation_end => { p = ts - 1; fgoto expr_end; }; w_space; w_comment => { fgoto expr_end; }; w_newline => { fhold; fgoto expr_end; }; c_any => { fhold; fgoto expr_beg; }; c_eof => do_eof; *|; # The previous token was an identifier which was seen while in the # command mode (that is, the state at the beginning of #advance was # expr_value). This state is very similar to expr_arg, but disambiguates # two very rare and specific condition: # * In 1.8 mode, "foo (lambda do end)". # * In 1.9+ mode, "f x: -> do foo do end end". 
expr_cmdarg := |*
    # "foo (1)": emit tLPAREN_ARG so that the parser may attach a trailing
    # block to the command itself (see the expr_endarg rationale below).
    w_space+ e_lparen => {
      emit(token_type::tLPAREN_ARG, "(", te - 1, te);
      if (version == ruby_version::RUBY_18) {
        fnext expr_value; fbreak;
      } else {
        fnext expr_beg; fbreak;
      }
    };

    # "f x do end": `do` here opens a block; kDO_COND when the
    # surrounding construct is a condition (cond stack is active).
    w_space* 'do' => {
      if (cond.active()) {
        emit(token_type::kDO_COND, "do", te - 2, te);
      } else {
        emit(token_type::kDO, "do", te - 2, te);
      }
      fnext expr_value; fbreak;
    };

    # Everything else behaves exactly like expr_arg; rewind and delegate.
    c_any             |
    # Disambiguate with the `do' rule above.
    w_space* bareword |
    w_space* label => {
      p = ts - 1;
      fgoto expr_arg;
    };

    c_eof => do_eof;
*|;

# The rationale for this state is pretty complex. Normally, if an argument
# is passed to a command and then there is a block (tLCURLY...tRCURLY),
# the block is attached to the innermost argument (`f` in `m f {}`), or it
# is a parse error (`m 1 {}`). But there is a special case for passing a single
# primary expression grouped with parentheses: if you write `m (1) {}` or
# (2.0 only) `m () {}`, then the block is attached to `m`.
#
# Thus, we recognize the opening `(` of a command (remember, a command is
# a method call without parens) as a tLPAREN_ARG; then, in parser, we recognize
# `tLPAREN_ARG expr rparen` as a `primary_expr` and before rparen, set the
# lexer's state to `expr_endarg`, which makes it emit the possibly following
# `{` as `tLBRACE_ARG`.
#
# The default post-`expr_endarg` state is `expr_end`, so this state also handles
# `do` (as `kDO_BLOCK` in `expr_beg`).
expr_endarg := |*
    e_lbrace => {
      if (!lambda_stack.empty() && lambda_stack.top() == paren_nest) {
        // `{` closes over a pending `->` lambda rather than opening a
        // block argument.
        lambda_stack.pop();
        emit(token_type::tLAMBEG, "{");
      } else {
        emit(token_type::tLBRACE_ARG, "{");
      }
      command_start = true;
      fnext expr_value; fbreak;
    };

    'do' => {
      emit_do(true);
      fnext expr_value; fbreak;
    };

    w_space_comment;

    c_any => {
      fhold; fgoto expr_end;
    };

    c_eof => do_eof;
*|;

# The rationale for this state is that several keywords accept value
# (i.e. should transition to `expr_beg`), do not accept it like a command
# (i.e.
not an `expr_arg`), and must behave like a statement, that is, # accept a modifier if/while/etc. # expr_mid := |* keyword_modifier => { emit_table(KEYWORDS); fnext expr_beg; fbreak; }; bareword => { p = ts - 1; fgoto expr_beg; }; w_space_comment; w_newline => { fhold; fgoto expr_end; }; c_any => { fhold; fgoto expr_beg; }; c_eof => do_eof; *|; # Beginning of an expression. # # Don't fallthrough to this state from `c_any`; make sure to handle # `c_space* c_nl` and let `expr_end` handle the newline. # Otherwise code like `f\ndef x` gets glued together and the parser # explodes. # expr_beg := |* # +5, -5, - 5 [+\-] w_any* [0-9] => { emit(token_type::tUNARY_NUM, tok(ts, ts + 1), ts, ts + 1); fhold; fnext expr_end; fbreak; }; # splat *a '*' => { emit(token_type::tSTAR, "*"); fbreak; }; # # STRING AND REGEXP LITERALS # # /regexp/oui # /=/ (disambiguation with /=) '/' c_any => { fhold; fgoto *push_literal(literal_type::SLASH_REGEXP, std::string(ts + 0, 1), ts); }; # % '%' ( any - [A-Za-z] ) => { fgoto *push_literal(literal_type::PERCENT_STRING, std::string(ts + 1, 1), ts); }; # %w(we are the people) '%' [A-Za-z]+ c_any => { literal_type type; bool single_char_type = (ts + 3 == te); if (single_char_type && ts[1] == 'q') { type = literal_type::LOWERQ_STRING; } else if (single_char_type && ts[1] == 'Q') { type = literal_type::UPPERQ_STRING; } else if (single_char_type && ts[1] == 'w') { type = literal_type::LOWERW_WORDS; } else if (single_char_type && ts[1] == 'W') { type = literal_type::UPPERW_WORDS; } else if (single_char_type && ts[1] == 'i') { type = literal_type::LOWERI_SYMBOLS; } else if (single_char_type && ts[1] == 'I') { type = literal_type::UPPERI_SYMBOLS; } else if (single_char_type && ts[1] == 's') { type = literal_type::LOWERS_SYMBOL; } else if (single_char_type && ts[1] == 'r') { type = literal_type::PERCENT_REGEXP; } else if (single_char_type && ts[1] == 'x') { type = literal_type::LOWERX_XSTRING; } else { type = literal_type::PERCENT_STRING; 
diagnostic_(dlevel::ERROR, dclass::UnexpectedPercentStr, range(ts, te - 1), tok(ts, te-1)); } fgoto *push_literal(type, std::string(te - 1, 1), ts); }; '%' c_eof => { diagnostic_(dlevel::FATAL, dclass::StringEof, range(ts, ts + 1)); }; # Heredoc start. # < { bool indent; bool dedent_body; const char* delim_s = ts + 2; const char* delim_e = heredoc_e; if (*delim_s == '-') { indent = true; dedent_body = false; delim_s++; } else if (*delim_s == '~') { indent = true; dedent_body = true; delim_s++; } else { indent = false; dedent_body = false; } literal_type type; if (*delim_s == '"') { type = literal_type::DQUOTE_HEREDOC; delim_s++; delim_e--; } else if (*delim_s == '\'') { type = literal_type::SQUOTE_HEREDOC; delim_s++; delim_e--; } else if (*delim_s == '`') { type = literal_type::BACKTICK_HEREDOC; delim_s++; delim_e--; } else { type = literal_type::DQUOTE_HEREDOC; } if (dedent_body && (version == ruby_version::RUBY_18 || version == ruby_version::RUBY_19 || version == ruby_version::RUBY_20 || version == ruby_version::RUBY_21 || version == ruby_version::RUBY_22)) { emit(token_type::tLSHFT, "<<", ts, ts + 2); p = ts + 1; fnext expr_beg; fbreak; } else { fnext *push_literal(type, std::string(delim_s, (size_t)(delim_e - delim_s)), ts, heredoc_e, indent, dedent_body); if (!herebody_s) { herebody_s = new_herebody_s; } p = herebody_s - 1; } }; # # SYMBOL LITERALS # # :&&, :|| ':' ('&&' | '||') => { fhold; fhold; emit(token_type::tSYMBEG, tok(ts, ts + 1), ts, ts + 1); fgoto expr_fname; }; # :"bar", :'baz' ':' ['"] # ' => { literal_type type; if (ts[1] == '\'') { type = literal_type::SQUOTE_SYMBOL; } else { // '"' type = literal_type::DQUOTE_SYMBOL; } fgoto *push_literal(type, std::string(ts + 1, 1), ts); }; # :!@ is :! 
# :~@ is :~ ':' [!~] '@' => { emit(token_type::tSYMBEG, tok(ts + 1, ts + 2), ts, te); fnext expr_end; fbreak; }; ':' bareword ambiguous_symbol_suffix => { emit(token_type::tSYMBOL, tok(ts + 1, tm), ts, tm); p = tm - 1; fnext expr_end; fbreak; }; ':' ( bareword | global_var | class_var | instance_var | operator_fname | operator_arithmetic | operator_rest ) => { emit(token_type::tSYMBOL, tok(ts + 1), ts, te); fnext expr_end; fbreak; }; # # AMBIGUOUS TERNARY OPERATOR # # Character constant, like ?a, ?\n, ?\u1000, and so on # Don't accept \u escape with multiple codepoints, like \u{1 2 3} '?' ( e_bs ( escape - ( '\u{' (xdigit+ [ \t]+)+ xdigit+ '}' )) | (c_any - c_space_nl - e_bs) % { escape = nullptr; } ) => { if (version == ruby_version::RUBY_18) { emit(token_type::tINTEGER, std::to_string(static_cast(ts[1]))); } else { emit(token_type::tCHARACTER, escape ? *escape : tok(ts + 1)); } fnext expr_end; fbreak; }; '?' c_space_nl => { static const struct escape_map_ent { char c; const char* s; } escape_map[] { { ' ', "\\s" }, { '\r', "\\r" }, { '\n', "\\n" }, { '\t', "\\t" }, { '\v', "\\v" }, { '\f', "\\f" }, { 0, 0 }, }; for (const struct escape_map_ent* ent = escape_map; ent->c; ++ent) { if (ts[1] == ent->c) { diagnostic_(dlevel::WARNING, dclass::InvalidEscapeUse, ent->s); break; } } p = ts - 1; fgoto expr_end; }; '?' c_eof => { diagnostic_(dlevel::FATAL, dclass::IncompleteEscape, range(ts, ts + 1)); }; # f ?aa : b: Disambiguate with a character literal. '?' 
[A-Za-z_] bareword => { p = ts - 1; fgoto expr_end; }; # # KEYWORDS AND PUNCTUATION # # a({b=>c}) e_lbrace => { if (!lambda_stack.empty() && lambda_stack.top() == paren_nest) { lambda_stack.pop(); command_start = true; emit(token_type::tLAMBEG, "{"); } else { emit(token_type::tLBRACE, "{"); } fbreak; }; # a([1, 2]) e_lbrack => { emit(token_type::tLBRACK, "["); fbreak; }; # a() e_lparen => { emit(token_type::tLPAREN, "("); fbreak; }; # a(+b) punctuation_begin => { emit_table(PUNCTUATION_BEGIN); fbreak; }; # rescue Exception => e: Block rescue. # Special because it should transition to expr_mid. 'rescue' %{ tm = p; } '=>'? => { emit(token_type::kRESCUE, "rescue", ts, tm); p = tm - 1; fnext expr_mid; fbreak; }; # if a: Statement if. keyword_modifier => { emit_table(KEYWORDS_BEGIN); command_start = true; fnext expr_value; fbreak; }; # # RUBY 1.9 HASH LABELS # label ( any - ':' ) => { fhold; if (version == ruby_version::RUBY_18) { auto ident = tok(ts, te - 2); if (*ts >= 'A' && *ts <= 'Z') { emit(token_type::tCONSTANT, ident, ts, te - 2); } else { emit(token_type::tIDENTIFIER, ident, ts, te - 2); } fhold; // continue as a symbol if (is_declared(ident)) { fnext expr_end; } else { fnext *arg_or_cmdarg(cmd_state); } } else { emit(token_type::tLABEL, tok(ts, te - 2), ts, te - 1); fnext expr_labelarg; } fbreak; }; # # CONTEXT-DEPENDENT VARIABLE LOOKUP OR COMMAND INVOCATION # # foo= bar: Disambiguate with bareword rule below. bareword ambiguous_ident_suffix | # def foo: Disambiguate with bareword rule below. keyword => { p = ts - 1; fgoto expr_end; }; # a = 42; a [42]: Indexing. # def a; end; a [42]: Array argument. 
call_or_var => local_ident; (call_or_var - keyword) % { ident_tok = tok(ts, te); ident_ts = ts; ident_te = te; } w_space+ '(' => { emit(token_type::tIDENTIFIER, ident_tok, ident_ts, ident_te); p = ident_te - 1; fnext expr_cmdarg; fbreak; }; # # WHITESPACE # w_any; e_heredoc_nl '=begin' ( c_space | c_nl_zlen ) => { p = ts - 1; cs_before_block_comment = cs; fgoto line_begin; }; # # DEFAULT TRANSITION # # The following rules match most binary and all unary operators. # Rules for binary operators provide better error reporting. operator_arithmetic '=' | operator_rest | punctuation_end | c_any => { p = ts - 1; fgoto expr_end; }; c_eof => do_eof; *|; # Special newline handling for "def a b:" # expr_labelarg := |* w_space_comment; w_newline => { if (in_kwarg) { fhold; fgoto expr_end; } else { fgoto line_begin; } }; c_any => { fhold; fgoto expr_beg; }; c_eof => do_eof; *|; # Like expr_beg, but no 1.9 label or 2.2 quoted label possible. # expr_value := |* # a:b: a(:b), a::B, A::B label (any - ':') => { p = ts - 1; fgoto expr_end; }; # "bar", 'baz' ['"] # ' => { literal_type type; if (ts[0] == '\'') { type = literal_type::SQUOTE_STRING; } else { // '"' type = literal_type::DQUOTE_STRING; } fgoto *push_literal(type, tok(), ts); }; w_space_comment; w_newline => { fgoto line_begin; }; c_any => { fhold; fgoto expr_beg; }; c_eof => do_eof; *|; expr_end := |* # # STABBY LAMBDA # '->' => { emit(token_type::tLAMBDA, "->", ts, ts + 2); lambda_stack.push(paren_nest); fnext expr_endfn; fbreak; }; e_lbrace | 'do' => { if (!lambda_stack.empty() && lambda_stack.top() == paren_nest) { lambda_stack.pop(); if (ts[0] == '{') { emit(token_type::tLAMBEG, "{"); } else { // 'do' emit(token_type::kDO_LAMBDA, "do"); } } else { if (ts[0] == '{') { emit(token_type::tLCURLY, "{"); } else { // 'do' emit_do(); } } command_start = true; fnext expr_value; fbreak; }; # # KEYWORDS # keyword_with_fname => { emit_table(KEYWORDS); fnext expr_fname; fbreak; }; 'class' w_any* '<<' => { emit(token_type::kCLASS, 
"class", ts, ts + 5); emit(token_type::tLSHFT, "<<", te - 2, te); fnext expr_value; fbreak; }; # a if b:c: Syntax error. keyword_modifier => { emit_table(KEYWORDS); fnext expr_beg; fbreak; }; # elsif b:c: elsif b(:c) keyword_with_value => { emit_table(KEYWORDS); command_start = true; fnext expr_value; fbreak; }; keyword_with_mid => { emit_table(KEYWORDS); fnext expr_mid; fbreak; }; keyword_with_arg => { emit_table(KEYWORDS); if (version == ruby_version::RUBY_18 && ts + 3 == te && ts[0] == 'n' && ts[1] == 'o' && ts[2] == 't') { fnext expr_beg; fbreak; } else { fnext expr_arg; fbreak; } }; '__ENCODING__' => { if (version == ruby_version::RUBY_18) { auto ident = tok(); emit(token_type::tIDENTIFIER, ident); if (!is_declared(ident)) { fnext *arg_or_cmdarg(cmd_state); } } else { emit(token_type::k__ENCODING__, "__ENCODING__"); } fbreak; }; keyword_with_end => { emit_table(KEYWORDS); fbreak; }; # # NUMERIC LITERALS # ( '0' [Xx] %{ num_base = 16; num_digits_s = p; } int_hex | '0' [Dd] %{ num_base = 10; num_digits_s = p; } int_dec | '0' [Oo] %{ num_base = 8; num_digits_s = p; } int_dec | '0' [Bb] %{ num_base = 2; num_digits_s = p; } int_bin | [1-9] digit* '_'? %{ num_base = 10; num_digits_s = ts; } int_dec | '0' digit* '_'? %{ num_base = 8; num_digits_s = ts; } int_dec ) %{ num_suffix_s = p; } int_suffix => { auto digits = tok(num_digits_s, num_suffix_s); if (num_suffix_s[-1] == '_') { diagnostic_(dlevel::ERROR, dclass::TrailingInNumber, range(te - 1, te), "_"); } else if (num_digits_s == num_suffix_s && num_base == 8 && version == ruby_version::RUBY_18) { // 1.8 did not raise an error on 0o. 
} else if (num_digits_s == num_suffix_s) { diagnostic_(dlevel::ERROR, dclass::EmptyNumeric); } else if (num_base == 8) { for (const char* digit_p = num_digits_s; digit_p < num_suffix_s; digit_p++) { if (*digit_p == '8' || *digit_p == '9') { diagnostic_(dlevel::ERROR, dclass::InvalidOctal, range(digit_p, digit_p + 1)); } } } if (version == ruby_version::RUBY_18 || version == ruby_version::RUBY_19 || version == ruby_version::RUBY_20) { emit(token_type::tINTEGER, convert_base(digits, num_base), ts, num_suffix_s); p = num_suffix_s - 1; } else { emit_num(convert_base(digits, num_base)); } fbreak; }; flo_frac flo_pow? => { diagnostic_(dlevel::ERROR, dclass::NoDotDigitLiteral); }; flo_int [eE] => { if (version == ruby_version::RUBY_18 || version == ruby_version::RUBY_19 || version == ruby_version::RUBY_20) { diagnostic_(dlevel::ERROR, dclass::TrailingInNumber, range(te - 1, te), tok(te-1, te)); } else { emit(token_type::tINTEGER, tok(ts, te - 1), ts, te - 1); fhold; fbreak; } }; flo_int flo_frac [eE] => { if (version == ruby_version::RUBY_18 || version == ruby_version::RUBY_19 || version == ruby_version::RUBY_20) { diagnostic_(dlevel::ERROR, dclass::TrailingInNumber, range(te - 1, te), tok(te - 1, te)); } else { emit(token_type::tFLOAT, tok(ts, te - 1), ts, te - 1); fhold; fbreak; } }; flo_int ( flo_frac? 
flo_pow %{ num_suffix_s = p; } flo_pow_suffix | flo_frac %{ num_suffix_s = p; } flo_suffix ) => { auto digits = tok(ts, num_suffix_s); if (version == ruby_version::RUBY_18 || version == ruby_version::RUBY_19 || version == ruby_version::RUBY_20) { emit(token_type::tFLOAT, digits, ts, num_suffix_s); p = num_suffix_s - 1; } else { emit_num(digits); } fbreak; }; # # STRING AND XSTRING LITERALS # # `echo foo`, "bar", 'baz' '`' | ['"] # ' => { literal_type type; if (ts[0] == '`') { type = literal_type::BACKTICK_XSTRING; } else if (ts[0] == '\'') { type = literal_type::SQUOTE_STRING; } else { // '"' type = literal_type::DQUOTE_STRING; } fgoto *push_literal(type, std::string(te - 1, 1), ts, nullptr, false, false, true); }; # # CONSTANTS AND VARIABLES # constant => { emit(token_type::tCONSTANT); fnext *arg_or_cmdarg(cmd_state); fbreak; }; constant ambiguous_const_suffix => { emit(token_type::tCONSTANT, tok(ts, tm), ts, tm); p = tm - 1; fbreak; }; global_var | class_var_v | instance_var_v => { p = ts - 1; fcall expr_variable; }; # # METHOD CALLS # '.' | '&.' | '::' => { emit_table(PUNCTUATION); fnext expr_dot; fbreak; }; call_or_var => local_ident; bareword ambiguous_fid_suffix => { if (tm == te) { // Suffix was consumed, e.g. foo! emit(token_type::tFID); } else { // Suffix was not consumed, e.g. foo!= emit(token_type::tIDENTIFIER, tok(ts, tm), ts, tm); p = tm - 1; } fnext expr_arg; fbreak; }; # # OPERATORS # '*' | '=>' => { emit_table(PUNCTUATION); fgoto expr_value; }; # When '|', '~', '!', '=>' are used as operators # they do not accept any symbols (or quoted labels) after. # Other binary operators accept it. ( operator_arithmetic | operator_rest ) - ( '|' | '~' | '!' | '*' ) => { emit_table(PUNCTUATION); fnext expr_value; fbreak; }; ( e_lparen | '|' | '~' | '!' 
) => { emit_table(PUNCTUATION); fnext expr_beg; fbreak; }; e_rbrace | e_rparen | ']' => { emit_table(PUNCTUATION); cond.pop(); cmdarg.pop(); if (ts[0] == '}' || ts[0] == ']') { fnext expr_end; } else { // ')' // this was commented out in the original lexer.rl: // fnext expr_endfn; ? } fbreak; }; operator_arithmetic '=' => { emit(token_type::tOP_ASGN, tok(ts, te - 1)); fnext expr_beg; fbreak; }; '?' => { emit(token_type::tEH, "?"); fnext expr_value; fbreak; }; e_lbrack => { emit(token_type::tLBRACK2, "["); fnext expr_beg; fbreak; }; punctuation_end => { emit_table(PUNCTUATION); fnext expr_beg; fbreak; }; # # WHITESPACE # w_space_comment; w_newline => { fgoto leading_dot; }; ';' => { emit(token_type::tSEMI, ";"); command_start = true; fnext expr_value; fbreak; }; '\\' c_line { diagnostic_(dlevel::ERROR, dclass::BareBackslash, range(ts, ts + 1)); fhold; }; c_any => { diagnostic_(dlevel::ERROR, dclass::Unexpected, tok()); }; c_eof => do_eof; *|; leading_dot := |* # Insane leading dots: # a #comment # .b: a.b c_space* %{ tm = p; } ('.' 
| '&.') => { p = tm - 1; fgoto expr_end; };

      # Anything else after the newline: emit the deferred tNL for the
      # newline that brought us here, then re-lex the current character
      # from line_begin.
      any => {
        emit(token_type::tNL, std::string(), newline_s, newline_s + 1);
        fhold; fnext line_begin; fbreak;
      };
  *|;

  #
  # === EMBEDDED DOCUMENT (aka BLOCK COMMENT) PARSING ===
  #

  # Body of an `=begin`..`=end` embedded document. Entered from
  # line_begin (and from expr_beg); `eq_begin_s` marks the opening
  # `=begin`, `cs_before_block_comment` the state to resume afterwards.
  line_comment := |*
      # `=end` at the start of a line terminates the document; the whole
      # document, delimiters included, is emitted as a single comment.
      '=end' c_line* c_nl_zlen => {
        emit_comment(eq_begin_s, te);
        fgoto *cs_before_block_comment;
      };

      # Any other full line is part of the comment body.
      c_line* c_nl;

      # EOF before `=end`: unterminated embedded document.
      c_line* zlen => {
        diagnostic_(dlevel::FATAL, dclass::EmbeddedDocument,
                    range(eq_begin_s, eq_begin_s + "=begin"s.size()));
      };
  *|;

  # Start of a logical line: the only place `=begin` and `__END__`
  # may appear.
  line_begin := |*
      w_any;

      '=begin' ( c_space | c_nl_zlen ) => {
        eq_begin_s = ts;
        fgoto line_comment;
      };

      # `__END__` on its own line stops lexing; jump close to the end of
      # input so the EOF rule fires next (pe - 3 presumably accounts for
      # the sentinel padding mentioned in the header -- TODO confirm).
      '__END__' ( c_eol - zlen ) => {
        p = pe - 3;
      };

      c_any => {
        cmd_state = true;
        fhold; fgoto expr_value;
      };

      c_eof => do_eof;
  *|;
}%%

// Fetch the next token and remember its source range so later callers
// can refer to the most recently produced token's location.
token_t lexer::advance() {
  auto tok = advance_();

  last_token_s = tok->start();
  last_token_e = tok->end();

  return tok;
}

// Push a fresh, empty static environment (a new scope with no declared
// local variable names).
void lexer::extend_static() {
  static_env.emplace();
}

// Push a dynamic scope: a copy of the innermost environment, so names
// declared outside remain visible while new declarations stay local.
// Falls back to an empty environment when no scope exists yet.
void lexer::extend_dynamic() {
  if (static_env.empty()) {
    static_env.emplace();
  } else {
    environment& env = static_env.top();
    static_env.push(env);
  }
}

// Pop the innermost environment, restoring the enclosing scope.
void lexer::unextend() {
  static_env.pop();
}

// Record `name` as a declared local variable in the innermost scope.
void lexer::declare(const std::string& name) {
  static_env.top().insert(name);
}

// True iff `identifier` was declared in the innermost scope.
// NOTE(review): assumes static_env is non-empty -- confirm callers
// always extend a scope before querying.
bool lexer::is_declared(const std::string& identifier) const {
  const environment& env = static_env.top();

  return env.find(identifier) != env.end();
}

// Return the pending dedent level (if any) and reset it.
optional_size lexer::dedentLevel() {
  // We erase @dedentLevel as a precaution to avoid accidentally
  // using a stale value.
  auto ret = dedentLevel_;
  dedentLevel_ = std::nullopt;
  return ret;
}