lexer.rl 74 KB


  1. /*
  2. Copyright (c) 2013-2016 whitequark <whitequark@whitequark.org>
  3. Parts of the source are derived from ruby_parser:
  4. Copyright (c) Ryan Davis, seattle.rb
  5. This lexer is a rewrite of the original in Ragel/C:
  6. Copyright (c) Charlie Somerville, GitHub
  7. MIT License
  8. Permission is hereby granted, free of charge, to any person obtaining
  9. a copy of this software and associated documentation files (the
  10. "Software"), to deal in the Software without restriction, including
  11. without limitation the rights to use, copy, modify, merge, publish,
  12. distribute, sublicense, and/or sell copies of the Software, and to
  13. permit persons to whom the Software is furnished to do so, subject to
  14. the following conditions:
  15. The above copyright notice and this permission notice shall be
  16. included in all copies or substantial portions of the Software.
  17. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  18. EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  19. MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  20. NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  21. LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  22. OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  23. WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  24. */
  25. %%machine lex; # % fix highlighting
  26. /*
  27. #
  28. # === BEFORE YOU START ===
  29. #
  30. # Read the Ruby Hacking Guide chapter 11, available in English at
  31. # http://whitequark.org/blog/2013/04/01/ruby-hacking-guide-ch-11-finite-state-lexer/
  32. #
  33. # Remember two things about Ragel scanners:
  34. #
  35. # 1) Longest match wins.
  36. #
  37. # 2) If two matches have the same length, the first
  38. # in source code wins.
  39. #
  40. # General rules of making Ragel and Bison happy:
  41. #
  42. # * `p` (position) and `@te` contain the index of the character
  43. # they're pointing to ("current"), plus one. `@ts` contains the index
  44. # of the corresponding character. The code for extracting matched token is:
  45. #
  46. # @source_buffer.slice(@ts...@te)
  47. #
  48. # * If your input is `foooooooobar` and the rule is:
  49. #
  50. # 'f' 'o'+
  51. #
  52. # the result will be:
  53. #
  54. # foooooooobar
  55. # ^ ts=0 ^ p=te=9
  56. #
  57. # * A Ragel lexer action should not emit more than one token, unless
  58. # you know what you are doing.
  59. #
  60. # * All Ragel commands (fnext, fgoto, ...) end with a semicolon.
  61. #
  62. # * If an action emits the token and transitions to another state, use
  63. # these Ragel commands:
  64. #
  65. # emit($whatever)
  66. # fnext $next_state; fbreak;
  67. #
  68. # If you perform `fgoto` in an action which neither emits a token nor
  69. # rewinds the stream pointer, the parser's side-effectful,
  70. # context-sensitive lookahead actions will break in a hard to detect
  71. # and debug way.
  72. #
  73. # * If an action does not emit a token:
  74. #
  75. # fgoto $next_state;
  76. #
  77. # * If an action features lookbehind, i.e. matches characters with the
  78. # intent of passing them to another action:
  79. #
  80. # p = @ts - 1
  81. # fgoto $next_state;
  82. #
  83. # or, if the lookbehind consists of a single character:
  84. #
  85. # fhold; fgoto $next_state;
  86. #
  87. # * Ragel merges actions. So, if you have `e_lparen = '(' %act` and
  88. # `c_lparen = '('` and a lexer action `e_lparen | c_lparen`, the result
  89. # _will_ invoke the action `act`.
  90. #
  91. # e_something stands for "something with **e**mbedded action".
  92. #
  93. # * EOF is explicit and is matched by `c_eof`. If you want to introspect
  94. # the state of the lexer, add this rule to the state:
  95. #
  96. # c_eof => do_eof;
  97. #
  98. # * If you proceed past EOF, the lexer will complain:
  99. #
  100. # NoMethodError: undefined method `ord' for nil:NilClass
  101. #
  102. */
  103. #include <ruby_parser/driver.hh>
  104. #include <cassert>
  105. #include "absl/strings/numbers.h"
  106. %% write data nofinal;
  107. using namespace ruby_parser;
  108. using namespace std::string_literals;
  109. %% prepush { check_stack_capacity(); }
  110. lexer::lexer(diagnostics_t &diag, ruby_version version, const std::string& source_buffer_)
  111. : diagnostics(diag)
  112. , version(version)
  113. , source_buffer(source_buffer_ + std::string("\0\0", 2))
  114. , cs(lex_en_line_begin)
  115. , _p(source_buffer.data())
  116. , _pe(source_buffer.data() + source_buffer.size())
  117. , ts(nullptr)
  118. , te(nullptr)
  119. , act(0)
  120. , top(0)
  121. , eq_begin_s(nullptr)
  122. , sharp_s(nullptr)
  123. , newline_s(nullptr)
  124. , paren_nest(0)
  125. , command_start(true)
  126. , num_base(0)
  127. , num_digits_s(nullptr)
  128. , num_suffix_s(nullptr)
  129. , num_xfrm(num_xfrm_type::NONE)
  130. , escape_s(nullptr)
  131. , herebody_s(nullptr)
  132. , in_kwarg(false)
  133. {
  134. // ensure the stack is non-empty so we can just double in
  135. // check_stack_capacity:
  136. stack.resize(16);
  137. static_env.push(environment());
  138. cs_before_block_comment = lex_en_line_begin;
  139. }
  140. void lexer::check_stack_capacity() {
  141. if (stack.size() == (size_t)top) {
  142. stack.resize(stack.size() * 2);
  143. }
  144. }
  145. int lexer::stack_pop() {
  146. return stack[--top];
  147. }
  148. int lexer::arg_or_cmdarg(int cmd_state) {
  149. if (cmd_state) {
  150. return lex_en_expr_cmdarg;
  151. } else {
  152. return lex_en_expr_arg;
  153. }
  154. }
  155. void lexer::emit_comment(const char* s, const char* e) {
  156. /* unused for now */
  157. (void)s;
  158. (void)e;
  159. }
  160. std::string lexer::tok() {
  161. return tok(ts);
  162. }
  163. std::string lexer::tok(const char* start) {
  164. return tok(start, te);
  165. }
  166. std::string lexer::tok(const char* start, const char* end) {
  167. assert(start <= end);
  168. return std::string(start, (size_t)(end - start));
  169. }
  170. char lexer::unescape(uint32_t codepoint) {
  171. switch (codepoint) {
  172. case 'a': return '\a';
  173. case 'b': return '\b';
  174. case 'e': return 0x1b;
  175. case 'f': return '\f';
  176. case 'n': return '\n';
  177. case 'r': return '\r';
  178. case 's': return ' ';
  179. case 't': return '\t';
  180. case 'v': return '\v';
  181. case '\\': return '\\';
  182. default: return '\0';
  183. }
  184. }
  185. static const lexer::token_table_entry PUNCTUATION[] = {
  186. { "=", token_type::tEQL },
  187. { "&", token_type::tAMPER2 },
  188. { "|", token_type::tPIPE },
  189. { "!", token_type::tBANG },
  190. { "^", token_type::tCARET },
  191. { "+", token_type::tPLUS },
  192. { "-", token_type::tMINUS },
  193. { "*", token_type::tSTAR2 },
  194. { "/", token_type::tDIVIDE },
  195. { "%", token_type::tPERCENT },
  196. { "~", token_type::tTILDE },
  197. { ",", token_type::tCOMMA },
  198. { ";", token_type::tSEMI },
  199. { ".", token_type::tDOT },
  200. { "..", token_type::tDOT2 },
  201. { "...", token_type::tDOT3 },
  202. { "[", token_type::tLBRACK2 },
  203. { "]", token_type::tRBRACK },
  204. { "(", token_type::tLPAREN2 },
  205. { ")", token_type::tRPAREN },
  206. { "?", token_type::tEH },
  207. { ":", token_type::tCOLON },
  208. { "&&", token_type::tANDOP },
  209. { "||", token_type::tOROP },
  210. { "-@", token_type::tUMINUS },
  211. { "+@", token_type::tUPLUS },
  212. { "~@", token_type::tTILDE },
  213. { "**", token_type::tPOW },
  214. { "->", token_type::tLAMBDA },
  215. { "=~", token_type::tMATCH },
  216. { "!~", token_type::tNMATCH },
  217. { "==", token_type::tEQ },
  218. { "!=", token_type::tNEQ },
  219. { ">", token_type::tGT },
  220. { ">>", token_type::tRSHFT },
  221. { ">=", token_type::tGEQ },
  222. { "<", token_type::tLT },
  223. { "<<", token_type::tLSHFT },
  224. { "<=", token_type::tLEQ },
  225. { "=>", token_type::tASSOC },
  226. { "::", token_type::tCOLON2 },
  227. { "===", token_type::tEQQ },
  228. { "<=>", token_type::tCMP },
  229. { "[]", token_type::tAREF },
  230. { "[]=", token_type::tASET },
  231. { "{", token_type::tLCURLY },
  232. { "}", token_type::tRCURLY },
  233. { "`", token_type::tBACK_REF2 },
  234. { "!@", token_type::tBANG },
  235. { "&.", token_type::tANDDOT },
  236. { NULL, token_type::error },
  237. };
  238. static const lexer::token_table_entry PUNCTUATION_BEGIN[] = {
  239. { "&", token_type::tAMPER },
  240. { "*", token_type::tSTAR },
  241. { "**", token_type::tDSTAR },
  242. { "+", token_type::tUPLUS },
  243. { "-", token_type::tUMINUS },
  244. { "::", token_type::tCOLON3 },
  245. { "(", token_type::tLPAREN },
  246. { "{", token_type::tLBRACE },
  247. { "[", token_type::tLBRACK },
  248. { NULL, token_type::error },
  249. };
  250. static const lexer::token_table_entry KEYWORDS[] = {
  251. { "if", token_type::kIF_MOD },
  252. { "unless", token_type::kUNLESS_MOD },
  253. { "while", token_type::kWHILE_MOD },
  254. { "until", token_type::kUNTIL_MOD },
  255. { "rescue", token_type::kRESCUE_MOD },
  256. { "defined?", token_type::kDEFINED },
  257. { "BEGIN", token_type::klBEGIN },
  258. { "END", token_type::klEND },
  259. { "class", token_type::kCLASS },
  260. { "module", token_type::kMODULE },
  261. { "def", token_type::kDEF },
  262. { "undef", token_type::kUNDEF },
  263. { "begin", token_type::kBEGIN },
  264. { "end", token_type::kEND },
  265. { "then", token_type::kTHEN },
  266. { "elsif", token_type::kELSIF },
  267. { "else", token_type::kELSE },
  268. { "ensure", token_type::kENSURE },
  269. { "case", token_type::kCASE },
  270. { "when", token_type::kWHEN },
  271. { "for", token_type::kFOR },
  272. { "break", token_type::kBREAK },
  273. { "next", token_type::kNEXT },
  274. { "redo", token_type::kREDO },
  275. { "retry", token_type::kRETRY },
  276. { "in", token_type::kIN },
  277. { "do", token_type::kDO },
  278. { "return", token_type::kRETURN },
  279. { "yield", token_type::kYIELD },
  280. { "super", token_type::kSUPER },
  281. { "self", token_type::kSELF },
  282. { "nil", token_type::kNIL },
  283. { "true", token_type::kTRUE },
  284. { "false", token_type::kFALSE },
  285. { "and", token_type::kAND },
  286. { "or", token_type::kOR },
  287. { "not", token_type::kNOT },
  288. { "alias", token_type::kALIAS },
  289. { "__FILE__", token_type::k__FILE__ },
  290. { "__LINE__", token_type::k__LINE__ },
  291. { "__ENCODING__", token_type::k__ENCODING__ },
  292. { NULL, token_type::error },
  293. };
  294. static const lexer::token_table_entry KEYWORDS_BEGIN[] = {
  295. { "if", token_type::kIF },
  296. { "unless", token_type::kUNLESS },
  297. { "while", token_type::kWHILE },
  298. { "until", token_type::kUNTIL },
  299. { "rescue", token_type::kRESCUE },
  300. { "defined?", token_type::kDEFINED },
  301. { "BEGIN", token_type::klBEGIN },
  302. { "END", token_type::klEND },
  303. { "class", token_type::kCLASS },
  304. { "module", token_type::kMODULE },
  305. { "def", token_type::kDEF },
  306. { "undef", token_type::kUNDEF },
  307. { "begin", token_type::kBEGIN },
  308. { "end", token_type::kEND },
  309. { "then", token_type::kTHEN },
  310. { "elsif", token_type::kELSIF },
  311. { "else", token_type::kELSE },
  312. { "ensure", token_type::kENSURE },
  313. { "case", token_type::kCASE },
  314. { "when", token_type::kWHEN },
  315. { "for", token_type::kFOR },
  316. { "break", token_type::kBREAK },
  317. { "next", token_type::kNEXT },
  318. { "redo", token_type::kREDO },
  319. { "retry", token_type::kRETRY },
  320. { "in", token_type::kIN },
  321. { "do", token_type::kDO },
  322. { "return", token_type::kRETURN },
  323. { "yield", token_type::kYIELD },
  324. { "super", token_type::kSUPER },
  325. { "self", token_type::kSELF },
  326. { "nil", token_type::kNIL },
  327. { "true", token_type::kTRUE },
  328. { "false", token_type::kFALSE },
  329. { "and", token_type::kAND },
  330. { "or", token_type::kOR },
  331. { "not", token_type::kNOT },
  332. { "alias", token_type::kALIAS },
  333. { "__FILE__", token_type::k__FILE__ },
  334. { "__LINE__", token_type::k__LINE__ },
  335. { "__ENCODING__", token_type::k__ENCODING__ },
  336. { NULL, token_type::error },
  337. };
  // Appends the UTF-8 encoding of code point `uc` to `dst`.
  //
  // Returns the number of bytes appended (1-4), or 0 without touching `dst`
  // when `uc` is negative or >= 0x110000.
  // NOTE(review): values in the surrogate range 0xD800-0xDFFF are encoded
  // like any other three-byte code point; no check rejects them here.
  static size_t utf8_encode_char(int32_t uc, std::string &dst) {
    if (uc < 0x00) {
      return 0;
    } else if (uc < 0x80) {
      dst.push_back(static_cast<char>(uc));
      return 1;
    } else if (uc < 0x800) {
      dst.push_back(static_cast<char>(0xC0 + (uc >> 6)));
      dst.push_back(static_cast<char>(0x80 + (uc & 0x3F)));
      return 2;
    } else if (uc < 0x10000) {
      dst.push_back(static_cast<char>(0xE0 + (uc >> 12)));
      dst.push_back(static_cast<char>(0x80 + ((uc >> 6) & 0x3F)));
      dst.push_back(static_cast<char>(0x80 + (uc & 0x3F)));
      return 3;
    } else if (uc < 0x110000) {
      dst.push_back(static_cast<char>(0xF0 + (uc >> 18)));
      dst.push_back(static_cast<char>(0x80 + ((uc >> 12) & 0x3F)));
      dst.push_back(static_cast<char>(0x80 + ((uc >> 6) & 0x3F)));
      dst.push_back(static_cast<char>(0x80 + (uc & 0x3F)));
      return 4;
    }
    return 0;
  }

  // Parses `str` as hexadecimal code points separated by spaces and/or tabs
  // and appends the UTF-8 encoding of each one to `output`.
  //
  // Returns true when every code point was encoded. Returns false as soon as
  // a token contains a non-hex character or denotes a value >= 0x110000;
  // code points encoded before the failure remain in `output`.
  //
  // Leading, trailing and repeated separators are skipped, and malformed or
  // oversized tokens are reported through the return value instead of
  // escaping as exceptions.
  static bool split_codepoints(const std::string &str, std::string &output) {
    const auto is_blank = [](char c) { return c == ' ' || c == '\t'; };
    const auto hex_value = [](char c) -> int {
      if (c >= '0' && c <= '9') return c - '0';
      if (c >= 'a' && c <= 'f') return c - 'a' + 10;
      if (c >= 'A' && c <= 'F') return c - 'A' + 10;
      return -1;
    };

    const char *ptr = str.c_str();
    while (true) {
      while (is_blank(*ptr)) {
        ptr++;
      }
      if (*ptr == '\0') {
        // Nothing but separators were left: no further token to parse.
        break;
      }

      int32_t codepoint = 0;
      for (; *ptr && !is_blank(*ptr); ptr++) {
        const int digit = hex_value(*ptr);
        if (digit < 0) {
          return false;
        }
        // Once past the Unicode maximum, more digits can only make the value
        // larger; bail out here so the accumulator can never overflow.
        if (codepoint > 0x10FFFF) {
          return false;
        }
        codepoint = codepoint * 16 + digit;
      }

      if (utf8_encode_char(codepoint, output) == 0) {
        return false;
      }
    }
    return true;
  }
  // Returns a copy of `str` with every non-overlapping occurrence of `search`
  // replaced by `replace`, scanning left to right.
  //
  // An empty `search` matches at every position without consuming input, so
  // it is treated as "nothing to replace" and `str` is returned unchanged;
  // without this guard the scan below would never advance.
  static std::string gsub(const std::string&& str, const std::string&& search, const std::string&& replace) {
    if (search.empty()) {
      return str;
    }

    std::string result;
    std::string::size_type from = 0;
    for (;;) {
      const auto index = str.find(search, from);
      if (index == std::string::npos) {
        // No further match: keep the remaining tail verbatim.
        result.append(str, from, std::string::npos);
        return result;
      }
      result.append(str, from, index - from);
      result += replace;
      from = index + search.size();
    }
  }
  // True for the bytes the lexer treats as end-of-input markers:
  // NUL, ^D (0x04) and ^Z (0x1a).
  static bool eof_codepoint(char c) {
    switch (c) {
      case 0x00:
      case 0x04:
      case 0x1a:
        return true;
      default:
        return false;
    }
  }
  395. token_t lexer::advance_() {
  396. if (!token_queue.empty()) {
  397. token_t token = token_queue.front();
  398. token_queue.pop();
  399. return token;
  400. }
  401. int cmd_state = command_start;
  402. command_start = false;
  403. const char* p = _p;
  404. const char* pe = _pe;
  405. const char* eof = _pe;
  406. const char* tm = NULL;
  407. const char* heredoc_e = NULL;
  408. const char* new_herebody_s = NULL;
  409. const char* ident_ts = NULL;
  410. const char* ident_te = NULL;
  411. std::string ident_tok;
  412. %% write exec;
  413. _p = p;
  414. if (!token_queue.empty()) {
  415. token_t token = token_queue.front();
  416. token_queue.pop();
  417. return token;
  418. }
  419. if (cs == lex_error) {
  420. size_t start = (size_t)(p - source_buffer.data());
  421. return mempool.alloc(token_type::error, start, start + 1, std::string(p - 1, 1));
  422. }
  423. return mempool.alloc(token_type::eof, source_buffer.size(), source_buffer.size(), "");
  424. }
  425. void lexer::emit(token_type type) {
  426. emit(type, tok());
  427. }
  428. void lexer::emit(token_type type, const std::string& str) {
  429. emit(type, str, ts, te);
  430. }
  431. void lexer::emit(token_type type, const std::string& str, const char* start, const char* end) {
  432. size_t offset_start = (size_t)(start - source_buffer.data());
  433. size_t offset_end = (size_t)(end - source_buffer.data());
  434. token_queue.push(mempool.alloc(type, offset_start, offset_end, str));
  435. }
  436. void lexer::emit_do(bool do_block) {
  437. if (cond.active()) {
  438. emit(token_type::kDO_COND, "do");
  439. } else if (cmdarg.active() || do_block) {
  440. emit(token_type::kDO_BLOCK, "do");
  441. } else {
  442. emit(token_type::kDO, "do");
  443. }
  444. }
  445. void lexer::emit_table(const token_table_entry* table) {
  446. auto value = tok();
  447. for (; table->token; ++table) {
  448. if (value == table->token) {
  449. emit(table->type, value);
  450. return;
  451. }
  452. }
  453. // whitequark emits a `nil` token here, but if we do `yylex` hits an assert,
  454. // so just drop the token.
  455. return;
  456. }
  457. void lexer::emit_num(const std::string& num) {
  458. switch (num_xfrm) {
  459. case num_xfrm_type::NONE:
  460. emit(token_type::tINTEGER, num);
  461. break;
  462. case num_xfrm_type::RATIONAL:
  463. emit(token_type::tRATIONAL, num);
  464. break;
  465. case num_xfrm_type::IMAGINARY:
  466. emit(token_type::tIMAGINARY, num);
  467. break;
  468. case num_xfrm_type::RATIONAL_IMAGINARY:
  469. emit(token_type::tRATIONAL_IMAGINARY, num);
  470. break;
  471. case num_xfrm_type::FLOAT:
  472. emit(token_type::tFLOAT, num);
  473. break;
  474. case num_xfrm_type::FLOAT_IMAGINARY:
  475. emit(token_type::tFLOAT_IMAGINARY, num);
  476. break;
  477. }
  478. }
  479. std::string lexer::convert_base(const std::string& num, int num_base) {
  480. long int result;
  481. if (num_base == 10) {
  482. return num;
  483. }
  484. // This doesn't match Ruby's parsing but it is better than not handling it
  485. if (!absl::numbers_internal::safe_strtoi_base(num, &result, num_base)) {
  486. result = 0;
  487. // dmitry: appartently we assume that outer functions reported all the errors!!!
  488. }
  489. return std::to_string(result);
  490. }
  491. diagnostic::range lexer::range(const char *start, const char *end) {
  492. size_t token_start = (size_t)(start - source_buffer.data());
  493. size_t token_end = (size_t)(end - source_buffer.data());
  494. return diagnostic::range(token_start, token_end);
  495. }
  496. void lexer::diagnostic_(dlevel level, dclass type, const std::string &data) {
  497. diagnostics.emplace_back(level, type, range(ts, te), data);
  498. }
  499. void lexer::diagnostic_(dlevel level, dclass type, diagnostic::range &&range, const std::string &data) {
  500. diagnostics.emplace_back(level, type, range, data);
  501. }
  502. //
  503. // === LITERAL STACK ===
  504. //
  505. template<typename... Args>
  506. int lexer::push_literal(Args&&... args) {
  507. literal_stack.emplace(*this, std::forward<Args>(args)...);
  508. auto& literal = literal_stack.top();
  509. return next_state_for_literal(literal);
  510. }
  511. int lexer::next_state_for_literal(literal &lit) {
  512. if (lit.words() && lit.backslash_delimited()) {
  513. if (lit.interpolate()) {
  514. return lex_en_interp_backslash_delimited_words;
  515. } else {
  516. return lex_en_plain_backslash_delimited_words;
  517. }
  518. } else if (lit.words() && !lit.backslash_delimited()) {
  519. if (lit.interpolate()) {
  520. return lex_en_interp_words;
  521. } else {
  522. return lex_en_plain_words;
  523. }
  524. } else if (!lit.words() && lit.backslash_delimited()) {
  525. if (lit.interpolate()) {
  526. return lex_en_interp_backslash_delimited;
  527. } else {
  528. return lex_en_plain_backslash_delimited;
  529. }
  530. } else {
  531. if (lit.interpolate()) {
  532. return lex_en_interp_string;
  533. } else {
  534. return lex_en_plain_string;
  535. }
  536. }
  537. }
  538. literal& lexer::literal_() {
  539. return literal_stack.top();
  540. }
  541. int lexer::pop_literal() {
  542. bool was_regexp;
  543. {
  544. auto& old_literal = literal_stack.top();
  545. was_regexp = old_literal.regexp();
  546. dedentLevel_ = old_literal.dedentLevel();
  547. }
  548. literal_stack.pop();
  549. if (was_regexp) {
  550. return lex_en_regexp_modifiers;
  551. } else {
  552. return lex_en_expr_end;
  553. }
  554. }
  555. void lexer::set_state_expr_beg() {
  556. cs = lex_en_expr_beg;
  557. }
  558. void lexer::set_state_expr_end() {
  559. cs = lex_en_expr_end;
  560. }
  561. void lexer::set_state_expr_endarg() {
  562. cs = lex_en_expr_endarg;
  563. }
  564. void lexer::set_state_expr_fname() {
  565. cs = lex_en_expr_fname;
  566. }
  567. void lexer::set_state_expr_value() {
  568. cs = lex_en_expr_value;
  569. }
  570. %%{
  571. # access @;
  572. # getkey (@source_pts[p] || 0);
  573. # === CHARACTER CLASSES ===
  574. #
  575. # Pay close attention to the differences between c_any and any.
  576. # c_any does not include EOF and so will cause incorrect behavior
  577. # for machine subtraction (any-except rules) and default transitions
  578. # for scanners.
  action do_nl {
    // Remember where the latest newline was seen so that tNL tokens can be
    // given a precise location.
    //
    // The action is attached to c_nl itself: running it more than once is
    // harmless, and no rule ever needs to skip it.
    newline_s = p;
  }
  587. c_nl = '\n' $ do_nl;
  588. c_space = [ \t\r\f\v];
  589. c_space_nl = c_space | c_nl;
  590. c_eof = 0x04 | 0x1a | 0 | zlen; # ^D, ^Z, \0, EOF
  591. c_eol = c_nl | c_eof;
  592. c_any = any - c_eof;
  593. c_nl_zlen = c_nl | zlen;
  594. c_line = any - c_nl_zlen;
  595. c_unicode = c_any - 0x00..0x7f;
  596. c_upper = [A-Z];
  597. c_lower = [a-z_] | c_unicode;
  598. c_alpha = c_lower | c_upper;
  599. c_alnum = c_alpha | [0-9];
  action do_eof {
    // Stay parked on the EOF character: step back onto it and leave the
    // scanner, so every later #advance reports $eof again. This also lets
    // more input be fed to the lexer afterwards, which only tests rely on.
    //
    // Unlike e_heredoc_nl and e_bs below, this action is deliberately not
    // wrapped into an e_eof rule: tests inspect the scanner state at EOF, and
    // hiding it inside a rule would break that introspection.
    fhold; fbreak;
  }
  610. #
  611. # === TOKEN DEFINITIONS ===
  612. #
  613. # All operators are punctuation. There is more to punctuation
  614. # than just operators. Operators can be overridden by user;
  615. # punctuation can not.
  616. # A list of operators which are valid in the function name context, but
  617. # have different semantics in others.
  618. operator_fname = '[]' | '[]=' | '`' | '-@' | '+@' | '~@' | '!@' ;
  619. # A list of operators which can occur within an assignment shortcut (+ → +=).
  620. operator_arithmetic = '&' | '|' | '&&' | '||' | '^' | '+' | '-' |
  621. '*' | '/' | '**' | '~' | '<<' | '>>' | '%' ;
  622. # A list of all user-definable operators not covered by groups above.
  623. operator_rest = '=~' | '!~' | '==' | '!=' | '!' | '===' |
  624. '<' | '<=' | '>' | '>=' | '<=>' | '=>' ;
  625. # Note that `{` and `}` need to be referred to as e_lbrace and e_rbrace,
  626. # as they are ambiguous with interpolation `#{}` and should be counted.
  627. # These braces are not present in punctuation lists.
  628. # A list of punctuation which has different meaning when used at the
  629. # beginning of expression.
  630. punctuation_begin = '-' | '+' | '::' | '(' | '[' |
  631. '*' | '**' | '&' ;
  632. # A list of all punctuation except punctuation_begin.
  633. punctuation_end = ',' | '=' | '->' | '(' | '[' | ']' |
  634. '::' | '?' | ':' | '.' | '..' | '...' ;
  635. # A list of keywords which have different meaning at the beginning of expression.
  636. keyword_modifier = 'if' | 'unless' | 'while' | 'until' | 'rescue' ;
  637. # A list of keywords which accept an argument-like expression, i.e. have the
  638. # same post-processing as method calls or commands. Example: `yield 1`,
  639. # `yield (1)`, `yield(1)`, are interpreted as if `yield` was a function.
  640. keyword_with_arg = 'yield' | 'super' | 'not' | 'defined?' ;
  641. # A list of keywords which accept a literal function name as an argument.
  642. keyword_with_fname = 'def' | 'undef' | 'alias' ;
  643. # A list of keywords which accept an expression after them.
  644. keyword_with_value = 'else' | 'case' | 'ensure' | 'module' | 'elsif' | 'then' |
  645. 'for' | 'in' | 'do' | 'when' | 'begin' | 'class' |
  646. 'and' | 'or' ;
  647. # A list of keywords which accept a value, and treat the keywords from
  648. # `keyword_modifier` list as modifiers.
  649. keyword_with_mid = 'rescue' | 'return' | 'break' | 'next' ;
  650. # A list of keywords which do not accept an expression after them.
  651. keyword_with_end = 'end' | 'self' | 'true' | 'false' | 'retry' |
  652. 'redo' | 'nil' | 'BEGIN' | 'END' | '__FILE__' |
  653. '__LINE__' | '__ENCODING__';
  654. # All keywords.
  655. keyword = keyword_with_value | keyword_with_mid |
  656. keyword_with_end | keyword_with_arg |
  657. keyword_with_fname | keyword_modifier ;
  658. constant = c_upper c_alnum*;
  659. bareword = c_alpha c_alnum*;
  660. call_or_var = c_lower c_alnum*;
  661. class_var = '@@' bareword;
  662. instance_var = '@' bareword;
  663. global_var = '$'
  664. ( bareword | digit+
  665. | [`'+~*$&?!@/\\;,.=:<>"] # `
  666. | '-' c_alnum
  667. )
  668. ;
  669. # Ruby accepts (and fails on) variables with leading digit
  670. # in literal context, but not in unquoted symbol body.
  671. class_var_v = '@@' c_alnum+;
  672. instance_var_v = '@' c_alnum+;
  673. label = bareword [?!]? ':';
  674. #
  675. # === NUMERIC PARSING ===
  676. #
  677. int_hex = ( xdigit+ '_' )* xdigit* '_'? ;
  678. int_dec = ( digit+ '_' )* digit* '_'? ;
  679. int_bin = ( [01]+ '_' )* [01]* '_'? ;
  680. flo_int = [1-9] [0-9]* ( '_' digit+ )* | '0';
  681. flo_frac = '.' ( digit+ '_' )* digit+;
  682. flo_pow = [eE] [+\-]? ( digit+ '_' )* digit+;
  683. int_suffix =
  684. '' % { num_xfrm = num_xfrm_type::NONE; }
  685. | 'r' % { num_xfrm = num_xfrm_type::RATIONAL; }
  686. | 'i' % { num_xfrm = num_xfrm_type::IMAGINARY; }
  687. | 'ri' % { num_xfrm = num_xfrm_type::RATIONAL_IMAGINARY; };
  688. flo_pow_suffix =
  689. '' % { num_xfrm = num_xfrm_type::FLOAT; }
  690. | 'i' % { num_xfrm = num_xfrm_type::FLOAT_IMAGINARY; };
  691. flo_suffix =
  692. flo_pow_suffix
  693. | 'r' % { num_xfrm = num_xfrm_type::RATIONAL; }
  694. | 'ri' % { num_xfrm = num_xfrm_type::RATIONAL_IMAGINARY; };
  695. #
  696. # === ESCAPE SEQUENCE PARSING ===
  697. #
  698. # Escape parsing code is a Ragel pattern, not a scanner, and therefore
  699. # it shouldn't directly raise errors or perform other actions with side effects.
  700. # In reality this would probably just mess up error reporting in pathological
  701. # cases, though.
  702. # The amount of code required to parse \M\C stuff correctly is ridiculous.
  703. escaped_nl = "\\" c_nl;
  action unicode_points {
    // The code point list sits between the `u{` that follows escape_s and the
    // closing `}` that was just consumed (the character at p - 1).
    const char* const codepoints_s = escape_s + 2;
    auto codepoint_str = tok(codepoints_s, p - 1);

    std::string encoded;
    if (!split_codepoints(codepoint_str, encoded)) {
      diagnostic_(dlevel::ERROR, dclass::UnicodePointTooLarge,
                  range(codepoints_s, codepoints_s + codepoint_str.size()));
    } else {
      escape = std::make_unique<std::string>(encoded);
    }
  }
  action unescape_char {
    // Translate the character just consumed (p[-1]) through unescape(); a
    // zero result means "no special meaning", in which case the character
    // stands for itself.
    const char translated = unescape(p[-1]);
    if (translated) {
      escape = std::make_unique<std::string>(&translated, 1);
    } else {
      escape = std::make_unique<std::string>(p - 1, 1);
    }
  }
  action invalid_complex_escape {
    // Report a fatal InvalidEscape diagnostic at the current token.
    diagnostic_(dlevel::FATAL, dclass::InvalidEscape);
  }
  action slash_c_char {
    // Replace the pending escape with its first byte masked by 0x9f.
    // TODO multibyte
    const char masked = escape->at(0) & 0x9f;
    escape = std::make_unique<std::string>(&masked, 1);
  }
  action slash_m_char {
    // Replace the pending escape with its first byte with the high bit
    // (0x80) set.
    // TODO multibyte
    const char with_meta = escape->at(0) | 0x80;
    escape = std::make_unique<std::string>(&with_meta, 1);
  }
  736. maybe_escaped_char = (
  737. '\\' c_any %unescape_char
  738. | ( c_any - [\\] ) % { escape = std::make_unique<std::string>(p - 1, 1); /* TODO multibyte */ }
  739. );
  740. maybe_escaped_ctrl_char = ( # why?!
  741. '\\' c_any %unescape_char %slash_c_char
  742. | '?' % { escape = std::make_unique<std::string>("\x7f"); }
  743. | ( c_any - [\\?] ) % { escape = std::make_unique<std::string>(p - 1, 1); /* TODO multibyte */ } %slash_c_char
  744. );
  745. escape = (
  746. # \377
  747. [0-7]{1,3}
  748. % {
  749. auto esc = tok(escape_s, p);
  750. char c = std::stoi(esc, nullptr, 8);
  751. escape = std::make_unique<std::string>(&c, 1);
  752. }
  753. # \xff
  754. | 'x' xdigit{1,2}
  755. % {
  756. auto esc = tok(escape_s + 1, p);
  757. char c = std::stoi(esc, nullptr, 16);
  758. escape = std::make_unique<std::string>(&c, 1);
  759. }
  760. # \u263a
  761. | 'u' xdigit{4}
  762. % {
  763. std::string result;
  764. split_codepoints(tok(escape_s + 1, p), result);
  765. escape = std::make_unique<std::string>(result);
  766. }
  767. # %q[\x]
  768. | 'x' ( c_any - xdigit )
  769. % {
  770. diagnostic_(dlevel::FATAL, dclass::InvalidHexEscape, range(escape_s - 1, p + 2));
  771. }
  772. # %q[\u123] %q[\u{12]
  773. | 'u' ( c_any{0,4} -
  774. xdigit{4} - # \u1234 is valid
  775. ( '{' xdigit{1,3} # \u{1 \u{12 \u{123 are valid
  776. | '{' xdigit [ \t}] any? # \u{1. \u{1} are valid
  777. | '{' xdigit{2} [ \t}] # \u{12. \u{12} are valid
  778. )
  779. )
  780. % {
  781. diagnostic_(dlevel::FATAL, dclass::InvalidUnicodeEscape, range(escape_s - 1, p));
  782. }
  783. # \u{123 456}
  784. | 'u{' ( xdigit{1,6} [ \t] )*
  785. ( xdigit{1,6} '}'
  786. %unicode_points
  787. | ( xdigit* ( c_any - xdigit - '}' )+ '}'
  788. | ( c_any - '}' )* c_eof
  789. | xdigit{7,}
  790. ) % {
  791. diagnostic_(dlevel::FATAL, dclass::UnterminatedUnicode, range(p - 1, p));
  792. }
  793. )
  794. # \C-\a \cx
  795. | ( 'C-' | 'c' ) escaped_nl?
  796. maybe_escaped_ctrl_char
  797. # \M-a
  798. | 'M-' escaped_nl?
  799. maybe_escaped_char
  800. %slash_m_char
  801. # \C-\M-f \M-\cf \c\M-f
  802. | ( ( 'C-' | 'c' ) escaped_nl? '\\M-'
  803. | 'M-\\' escaped_nl? ( 'C-' | 'c' ) ) escaped_nl?
  804. maybe_escaped_ctrl_char
  805. %slash_m_char
  806. | 'C' c_any %invalid_complex_escape
  807. | 'M' c_any %invalid_complex_escape
  808. | ( 'M-\\C' | 'C-\\M' ) c_any %invalid_complex_escape
  809. | ( c_any - [0-7xuCMc] ) %unescape_char
  810. | c_eof % {
  811. diagnostic_(dlevel::FATAL, dclass::EscapeEof, range(p - 1, p));
  812. }
  813. );
  # To parse an escape sequence, write rules of the form `e_bs escape'.
  # On leaving the backslash, e_bs records where the escape body begins
  # (escape_s) and clears any previously decoded escape value.
  e_bs = '\\' % {
    escape_s = p;
    escape = nullptr;
  };
  819. #
  820. # === STRING AND HEREDOC PARSING ===
  821. #
  822. # Heredoc parsing is quite a complex topic. First, consider that heredocs
  823. # can be arbitrarily nested. For example:
  824. #
  825. # puts <<CODE
  826. # the result is: #{<<RESULT.inspect
  827. # i am a heredoc
  828. # RESULT
  829. # }
  830. # CODE
  831. #
  832. # which, incidentally, evaluates to:
  833. #
  834. # the result is: " i am a heredoc\n"
  835. #
  836. # To parse them, lexer refers to two kinds (remember, nested heredocs)
  837. # of positions in the input stream, namely heredoc_e
  838. # (HEREDOC declaration End) and @herebody_s (HEREdoc BODY line Start).
  839. #
  840. # heredoc_e is simply contained inside the corresponding Literal, and
  841. # when the heredoc is closed, the lexing is restarted from that position.
  842. #
  843. # @herebody_s is quite more complex. First, @herebody_s changes after each
  844. # heredoc line is lexed. This way, at '\n' tok(@herebody_s, @te) always
  845. # contains the current line, and also when a heredoc is started, @herebody_s
  846. # contains the position from which the heredoc will be lexed.
  847. #
  848. # Second, as (insanity) there are nested heredocs, we need to maintain a
  849. # stack of these positions. Each time #push_literal is called, it saves current
  850. # @herebody_s to literal.saved_herebody_s, and after an interpolation (possibly
  851. # containing other heredocs) is closed, the previous value is restored.
  # A newline that is aware of pending heredoc bodies: when herebody_s is
  # set, the scan position jumps there and the marker is cleared.
  e_heredoc_nl = c_nl % {
    // After every heredoc was parsed, herebody_s contains the
    // position of next token after all heredocs.
    if (herebody_s) {
      p = herebody_s;
      // nullptr (not NULL) for consistency with the other resets of
      // herebody_s in this file.
      herebody_s = nullptr;
    }
  };
  # Feed the current token to the innermost literal: it either closes a
  # non-heredoc literal or is appended to the literal's contents.
  action extend_string {
    auto chunk = tok();

    // tLABEL_END is only possible in non-cond context on >= 2.2
    std::string peek;
    if (version >= ruby_version::RUBY_22 && !cond.active()) {
      // Look at up to two bytes following the token, clamped to eof.
      const char* peek_s = te;
      const char* peek_e = (te + 2 > eof) ? eof : te + 2;
      peek.assign(peek_s, (size_t)(peek_e - peek_s));
    }

    auto& lit = literal_();

    // Heredocs are never closed here; the short-circuit keeps
    // nest_and_try_closing from being called for them.
    const bool closed = !lit.heredoc() && lit.nest_and_try_closing(chunk, ts, te, peek);

    if (!closed) {
      lit.extend_string(chunk, ts, te);
    } else {
      if (token_queue.back()->type() == token_type::tLABEL_END) {
        // Advance one extra byte before continuing in expr_labelarg.
        p += 1;
        pop_literal();
        fnext expr_labelarg;
      } else {
        fnext *pop_literal();
      }
      fbreak;
    }
  }
  # Append a backslash escape to the innermost literal. escape_s points at
  # the character following the backslash; `escape`, when non-null, holds
  # the decoded value of the sequence.
  action extend_string_escaped {
    auto& current_literal = literal_();

    // TODO multibyte
    auto escaped_char = *escape_s;

    if (current_literal.munge_escape(escaped_char)) {
      // If this particular literal uses this character as an opening
      // or closing delimiter, it is an escape sequence for that
      // particular character. Write it without the backslash.

      // Each regexp metacharacter is listed exactly once (the list used
      // to test '$' twice).
      if (current_literal.regexp()
          && (escaped_char == '\\' ||
              escaped_char == '$'  ||
              escaped_char == '('  ||
              escaped_char == ')'  ||
              escaped_char == '*'  ||
              escaped_char == '+'  ||
              escaped_char == '.'  ||
              escaped_char == '<'  ||
              escaped_char == '>'  ||
              escaped_char == '?'  ||
              escaped_char == '['  ||
              escaped_char == ']'  ||
              escaped_char == '^'  ||
              escaped_char == '{'  ||
              escaped_char == '|'  ||
              escaped_char == '}')) {
        // Regular expressions should include escaped delimiters in their
        // escaped form, except when the escaped character is
        // a closing delimiter but not a regexp metacharacter.
        //
        // The backslash itself cannot be used as a closing delimiter
        // at the same time as an escape symbol, but it is always munged,
        // so this branch also executes for the non-closing-delimiter case
        // for the backslash.
        auto str = tok();
        current_literal.extend_string(str, ts, te);
      } else {
        // Plain delimiter escape: keep only the escaped character.
        auto str = std::string(&escaped_char, 1);
        current_literal.extend_string(str, ts, te);
      }
    } else {
      // It does not. So this is an actual escape sequence, yay!
      if (current_literal.regexp()) {
        // Regular expressions should include escape sequences in their
        // escaped form. On the other hand, escaped newlines are removed.
        std::string str = gsub(tok(), "\\\n", "");
        current_literal.extend_string(str, ts, te);
      } else {
        // Use the decoded value when one was produced, else the raw text.
        auto str = escape ? *escape : tok();
        current_literal.extend_string(str, ts, te);
      }
    }
  }
  # Extend a string with a newline or a EOF character.
  # As heredoc closing line can immediately precede EOF, this action
  # has to handle such case specially.
  action extend_string_eol {
    auto& current_literal = literal_();

    if (te == pe) {
      // The literal is still open when the input ends. That is an
      // unterminated string, so report StringEof (the class used for
      // `'%' c_eof` as well) rather than EscapeEof, which describes an
      // unfinished escape sequence.
      diagnostic_(dlevel::FATAL, dclass::StringEof,
                  range(current_literal.str_s, current_literal.str_s + 1));
    }

    if (current_literal.heredoc()) {
      auto line = tok(herebody_s, ts);

      // Drop every trailing carriage return.
      while (!line.empty() && line.back() == '\r') {
        line.pop_back();
      }

      if (version <= ruby_version::RUBY_20) {
        // See ruby:c48b4209c
        // NOTE(review): this truncates at the LAST remaining '\r'; the
        // Ruby reference lexer appears to truncate at the first one --
        // confirm before changing.
        auto riter = line.rfind('\r');
        if (riter != std::string::npos) {
          line.erase(riter);
        }
      }

      // Try ending the heredoc with the complete most recently
      // scanned line. @herebody_s always refers to the start of such line.
      if (current_literal.nest_and_try_closing(line, herebody_s, ts)) {
        herebody_s = te;

        // Continue regular lexing after the heredoc reference (<<END).
        p = current_literal.heredoc_e - 1;
        fnext *pop_literal(); fbreak;
      } else {
        // Calculate indentation level for <<~HEREDOCs.
        current_literal.infer_indent_level(line);

        // Ditto.
        herebody_s = te;
      }
    } else {
      // Try ending the literal with a newline.
      auto str = tok();
      if (current_literal.nest_and_try_closing(str, ts, te)) {
        fnext *pop_literal(); fbreak;
      }

      if (herebody_s) {
        // This is a regular literal intertwined with a heredoc. Like:
        //
        //     p <<-foo+"1
        //     bar
        //     foo
        //     2"
        //
        // which, incidentally, evaluates to "bar\n1\n2".
        p = herebody_s - 1;
        herebody_s = nullptr;
      }
    }

    if (current_literal.words() && !eof_codepoint(*p)) {
      current_literal.extend_space(ts, te);
    } else {
      // A literal newline is appended if the heredoc was _not_ closed
      // this time (see fbreak above). See also Literal#nest_and_try_closing
      // for rationale of calling #flush_string here.
      std::string str = tok();
      current_literal.extend_string(str, ts, te);
      current_literal.flush_string();
    }
  }
  # Report the whitespace run [ts, te) to the innermost literal via
  # extend_space (used by the *_words scanners for `c_space+`).
  action extend_string_space {
    literal_().extend_space(ts, te);
  }
  1005. #
  1006. # === INTERPOLATION PARSING ===
  1007. #
  1008. # Interpolations with immediate variable names simply call into
  1009. # the corresponding machine.
  # '#' immediately followed by a global, class or instance variable.
  interp_var = '#' ( global_var | class_var_v | instance_var_v );

  # Start a variable interpolation: flush the text collected so far, emit
  # tSTRING_DVAR for the '#' character and lex the variable itself with
  # the expr_variable machine.
  action extend_interp_var {
    auto& current_literal = literal_();
    current_literal.flush_string();
    current_literal.extend_content();

    // The token covers only the '#' at ts.
    emit(token_type::tSTRING_DVAR, "", ts, ts + 1);

    // Rewind to the '#'; presumably expr_variable then resumes at the
    // variable sigil that follows it -- confirm against Ragel's fcall.
    p = ts;
    fcall expr_variable;
  }
  1019. # Interpolations with code blocks must match nested curly braces, as
  1020. # interpolation ending is ambiguous with a block ending. So, every
  1021. # opening and closing brace should be matched with e_[lr]brace rules,
  1022. # which automatically perform the counting.
  1023. #
  1024. # Note that interpolations can themselves be nested, so brace balance
  1025. # is tied to the innermost literal.
  1026. #
  1027. # Also note that literals themselves should not use e_[lr]brace rules
  1028. # when matching their opening and closing delimiters, as the amount of
  1029. # braces inside the characters of a string literal is independent.
  # Opening sequence of a code interpolation.
  interp_code = '#{';

  # Opening brace in code context: pushes `false` onto both the cond and
  # cmdarg stacks and, when a literal is open, has the innermost literal
  # count the brace (start_interp_brace).
  e_lbrace = '{' % {
    cond.push(false); cmdarg.push(false);

    if (!literal_stack.empty()) {
      literal_().start_interp_brace();
    }
  };
  # Closing brace in code context. When the innermost literal reports that
  # this brace ends its interpolation, emit the closing token and hand
  # control back to that literal's scanner.
  e_rbrace = '}' % {
    if (!literal_stack.empty()) {
      auto& lit = literal_();

      if (lit.end_interp_brace_and_try_closing()) {
        // 1.8 and 1.9 close an interpolation with tRCURLY, later
        // versions with tSTRING_DEND.
        const bool legacy_version = version == ruby_version::RUBY_18
                                 || version == ruby_version::RUBY_19;
        emit(legacy_version ? token_type::tRCURLY : token_type::tSTRING_DEND,
             "}", p - 1, p);

        // Restore the heredoc body position saved when the
        // interpolation was opened, if there is one.
        if (lit.saved_herebody_s) {
          herebody_s = lit.saved_herebody_s;
        }

        fhold;
        fnext *next_state_for_literal(lit);
        fbreak;
      }
    }
  };
  # Start a `#{ ... }` interpolation: flush the text collected so far, emit
  # tSTRING_DBEG and continue lexing the embedded code in expr_value.
  action extend_interp_code {
    auto& current_literal = literal_();
    current_literal.flush_string();
    current_literal.extend_content();

    emit(token_type::tSTRING_DBEG, "#{");

    if (current_literal.heredoc()) {
      // Park the heredoc body position on the literal while the
      // interpolation is lexed; e_rbrace restores it.
      current_literal.saved_herebody_s = herebody_s;
      herebody_s = nullptr;
    }

    // Count the opening brace of the interpolation itself.
    current_literal.start_interp_brace();
    command_start = true;
    fnext expr_value;
    fbreak;
  }
  # Actual string parsers are simply combined from the primitives defined
  # above.
  #
  # Rule order matters in every scanner below: for matches of equal length
  # the rule listed first wins, so `c_any` is always the final fallback.

  # Interpolating literal whose contents are split on whitespace.
  interp_words := |*
      interp_code => extend_interp_code;
      interp_var  => extend_interp_var;
      e_bs escape => extend_string_escaped;
      c_space+    => extend_string_space;
      c_eol       => extend_string_eol;
      c_any       => extend_string;
  *|;

  # Interpolating literal; whitespace is ordinary content.
  interp_string := |*
      interp_code => extend_interp_code;
      interp_var  => extend_interp_var;
      e_bs escape => extend_string_escaped;
      c_eol       => extend_string_eol;
      c_any       => extend_string;
  *|;

  # Non-interpolating, whitespace-split literal. A backslash takes exactly
  # one following character (`e_bs c_any`) instead of a full `escape`.
  plain_words := |*
      e_bs c_any  => extend_string_escaped;
      c_space+    => extend_string_space;
      c_eol       => extend_string_eol;
      c_any       => extend_string;
  *|;

  # Non-interpolating literal. A backslash directly followed by a newline
  # is routed to extend_string_eol rather than treated as an escape.
  plain_string := |*
      '\\' c_nl   => extend_string_eol;
      e_bs c_any  => extend_string_escaped;
      c_eol       => extend_string_eol;
      c_any       => extend_string;
  *|;
  # The *_backslash_delimited scanners mirror the four scanners above but
  # contain no `e_bs` rule at all, so a backslash is matched by `c_any`
  # like any other character.

  # Interpolating variant.
  interp_backslash_delimited := |*
      interp_code => extend_interp_code;
      interp_var  => extend_interp_var;
      c_eol       => extend_string_eol;
      c_any       => extend_string;
  *|;

  # Non-interpolating variant.
  plain_backslash_delimited := |*
      c_eol       => extend_string_eol;
      c_any       => extend_string;
  *|;

  # Interpolating, whitespace-split variant.
  interp_backslash_delimited_words := |*
      interp_code => extend_interp_code;
      interp_var  => extend_interp_var;
      c_space+    => extend_string_space;
      c_eol       => extend_string_eol;
      c_any       => extend_string;
  *|;

  # Non-interpolating, whitespace-split variant.
  plain_backslash_delimited_words := |*
      c_space+    => extend_string_space;
      c_eol       => extend_string_eol;
      c_any       => extend_string;
  *|;
  # Scans the option letters that may follow a regexp literal and always
  # emits a tREGEXP_OPT token (possibly with an empty value).
  regexp_modifiers := |*
      [A-Za-z]+
      => {
        const auto flags = tok();
        std::string rejected;

        // Gather, in input order, every letter that is not a known option.
        for (const char flag : flags) {
          switch (flag) {
            case 'i': case 'm': case 'x': case 'o':
            case 'u': case 'e': case 's': case 'n':
              break;
            default:
              rejected += flag;
              break;
          }
        }

        if (!rejected.empty()) {
          diagnostic_(dlevel::ERROR, dclass::RegexpOptions, rejected);
        }

        // The token carries all scanned letters, known or not.
        emit(token_type::tREGEXP_OPT, flags);
        fnext expr_end;
        fbreak;
      };

      # No option letters follow: emit an empty tREGEXP_OPT and let
      # expr_end rescan the character that was just consumed.
      any
      => {
        emit(token_type::tREGEXP_OPT, tok(ts, te - 1), ts, te - 1);
        fhold;
        fgoto expr_end;
      };
  *|;
  1155. #
  1156. # === WHITESPACE HANDLING ===
  1157. #
  1158. # Various contexts in Ruby allow various kinds of whitespace
  1159. # to be used. They are grouped to clarify the lexing machines
  1160. # and ease collection of comments.
  1161. # A line of code with inline #comment at end is always equivalent
  1162. # to a line of code ending with just a newline, so an inline
  1163. # comment is deemed equivalent to non-newline whitespace
  1164. # (c_space character class).
  # Non-newline whitespace: a run of c_space characters, or a
  # backslash-escaped newline (which still runs the heredoc logic of
  # e_heredoc_nl).
  w_space =
      c_space+
    | '\\' e_heredoc_nl
    ;

  # A '#' comment up to the end of the line. sharp_s remembers where the
  # '#' is so the whole comment can be passed to emit_comment.
  w_comment =
      '#'     %{ sharp_s = p - 1; }
      # The (p == pe) condition compensates for added "\0" and
      # the way Ragel handles EOF.
      c_line* %{ emit_comment(sharp_s, p == pe ? p - 2 : p); }
    ;

  w_space_comment =
      w_space
    | w_comment
    ;

  # A newline in non-literal context always interoperates with
  # here document logic and can always be escaped by a backslash,
  # still interoperating with here document logic in the same way,
  # yet being invisible to anything else.
  #
  # To demonstrate:
  #
  #     foo = <<FOO \
  #     bar
  #     FOO
  #      + 2
  #
  # is equivalent to `foo = "bar\n" + 2`.

  w_newline =
      e_heredoc_nl;

  # Any whitespace at all: spaces, comments and newlines.
  w_any =
      w_space
    | w_comment
    | w_newline
    ;
  1199. #
  1200. # === EXPRESSION PARSING ===
  1201. #
  # These rules implement a form of manually defined lookahead.
  # The default longest-match scanning does not work here due
  # to sheer ambiguity.
  #
  # Each alternative stores in `tm` the position where the name really
  # ends; the rules using these suffixes take tok(ts, tm) as the token and
  # rewind p to tm - 1 so the remaining characters are lexed again.

  ambiguous_fid_suffix =         # actual    parsed
      [?!]    %{ tm = p; }     | # a?        a?
      [?!]'=' %{ tm = p - 2; }   # a!=b      a != b
  ;

  ambiguous_ident_suffix =       # actual    parsed
      ambiguous_fid_suffix     |
      '='     %{ tm = p; }     | # a=        a=
      '=='    %{ tm = p - 2; } | # a==b      a == b
      '=~'    %{ tm = p - 2; } | # a=~b      a =~ b
      '=>'    %{ tm = p - 2; } | # a=>b      a => b
      '==='   %{ tm = p - 3; }   # a===b     a === b
  ;

  ambiguous_symbol_suffix =      # actual    parsed
      ambiguous_ident_suffix |
      '==>'   %{ tm = p - 2; }   # :a==>b    :a= => b
  ;

  # Ambiguous with 1.9 hash labels.
  ambiguous_const_suffix =       # actual    parsed
      '::'    %{ tm = p - 2; }   # A::B      A :: B
  ;
  # Resolving kDO/kDO_COND/kDO_BLOCK ambiguity requires embedding
  # @cond/@cmdarg-related code to e_lbrack, e_lparen and e_lbrace.

  # Opening bracket: pushes `false` onto both the cond and cmdarg stacks.
  e_lbrack = '[' % {
    cond.push(false); cmdarg.push(false);
  };

  # Ruby 1.9 lambdas require parentheses counting in order to
  # emit correct opening kDO/tLBRACE.

  # Opening parenthesis: same pushes as e_lbrack, plus one more level of
  # parenthesis nesting.
  e_lparen = '(' % {
    cond.push(false); cmdarg.push(false);

    paren_nest += 1;
  };

  # Closing parenthesis: only the nesting counter is adjusted here; the
  # cond/cmdarg stacks are not touched by this action.
  e_rparen = ')' % {
    paren_nest -= 1;
  };
  # Ruby is context-sensitive wrt/ local identifiers.
  #
  # Emits tIDENTIFIER for the current token. A declared name continues in
  # expr_endfn; any other name continues in the state chosen by
  # arg_or_cmdarg(cmd_state).
  action local_ident {
    auto name = tok();
    emit(token_type::tIDENTIFIER, name);

    if (!is_declared(name)) {
      fnext *arg_or_cmdarg(cmd_state);
    } else {
      fnext expr_endfn;
    }
    fbreak;
  }
  # Variable lexing code is accessed from both expressions and
  # string interpolation related code.
  #
  # Every rule emits exactly one token and then returns to the state
  # yielded by stack_pop().
  expr_variable := |*
      global_var
      => {
        // The character right after '$' decides the token type.
        const char after_sigil = ts[1];

        if (after_sigil >= '1' && after_sigil <= '9') {
          emit(token_type::tNTH_REF, tok(ts + 1));
        } else {
          switch (after_sigil) {
            case '&':
            case '`':
            case '\'':
            case '+':
              emit(token_type::tBACK_REF);
              break;
            default:
              emit(token_type::tGVAR);
              break;
          }
        }

        fnext *stack_pop(); fbreak;
      };

      class_var_v
      => {
        // A digit right after "@@" is reported, but tCVAR is still emitted.
        const bool digit_after_sigil = ts[2] >= '0' && ts[2] <= '9';
        if (digit_after_sigil) {
          diagnostic_(dlevel::ERROR, dclass::CvarName, tok(ts, te));
        }

        emit(token_type::tCVAR);
        fnext *stack_pop(); fbreak;
      };

      instance_var_v
      => {
        // A digit right after "@" is reported, but tIVAR is still emitted.
        const bool digit_after_sigil = ts[1] >= '0' && ts[1] <= '9';
        if (digit_after_sigil) {
          diagnostic_(dlevel::ERROR, dclass::IvarName, tok(ts, te));
        }

        emit(token_type::tIVAR);
        fnext *stack_pop(); fbreak;
      };
  *|;
  1281. # Literal function name in definition (e.g. `def class`).
  1282. # Keywords are returned as their respective tokens; this is used
  1283. # to support singleton def `def self.foo`. Global variables are
  1284. # returned as `tGVAR`; this is used in global variable alias
  1285. # statements `alias $a $b`. Symbols are returned verbatim; this
  1286. # is used in `alias :a :"b#{foo}"` and `undef :a`.
  1287. #
  1288. # Transitions to `expr_endfn` afterwards.
  1289. #
  expr_fname := |*
      keyword
      => { emit_table(KEYWORDS_BEGIN);
           fnext expr_endfn; fbreak; };

      constant
      => { emit(token_type::tCONSTANT);
           fnext expr_endfn; fbreak; };

      # A method name may carry one trailing `?`, `=` or `!`.
      bareword [?=!]?
      => { emit(token_type::tIDENTIFIER);
           fnext expr_endfn; fbreak; };

      # Rewind to just before the token and lex it with expr_variable;
      # expr_end is set as the current state before the call (presumably
      # the state expr_variable's stack_pop() comes back to).
      global_var
      => { p = ts - 1;
           fnext expr_end; fcall expr_variable; };

      # If the handling was to be delegated to expr_end,
      # these cases would transition to something else than
      # expr_endfn, which is incorrect.
      operator_fname      |
      operator_arithmetic |
      operator_rest
      => { emit_table(PUNCTUATION);
           fnext expr_endfn; fbreak; };

      # Hand both characters of `::` back and let expr_end lex them.
      '::'
      => { fhold; fhold; fgoto expr_end; };

      # Hand the `:` back and let expr_beg lex it.
      ':'
      => { fhold; fgoto expr_beg; };

      # `%s` followed by its delimiter starts a symbol literal only when
      # the version is exactly RUBY_23; otherwise the text is rescanned by
      # expr_end.
      '%s' c_any
      => {
        if (version == ruby_version::RUBY_23) {
          fgoto *push_literal(literal_type::LOWERS_SYMBOL, std::string(ts + 2, 1), ts);
        } else {
          p = ts - 1;
          fgoto expr_end;
        }
      };

      # Whitespace, comments and newlines are skipped.
      w_any;

      c_any
      => { fhold; fgoto expr_end; };

      c_eof => do_eof;
  *|;
  1329. # After literal function name in definition. Behaves like `expr_end`,
  1330. # but allows a tLABEL.
  1331. #
  1332. # Transitions to `expr_end` afterwards.
  1333. #
  expr_endfn := |*
      # The rule consumes one character past the label (anything but ':');
      # the token value drops the last two scanned characters, the token
      # range drops the last one, and fhold hands the extra character back.
      label ( any - ':' )
      => { emit(token_type::tLABEL, tok(ts, te - 2), ts, te - 1);
           fhold; fnext expr_labelarg; fbreak; };

      # Spaces and comments are skipped (newlines are not part of
      # w_space_comment).
      w_space_comment;

      c_any
      => { fhold; fgoto expr_end; };

      c_eof => do_eof;
  *|;
  1343. # Literal function name in method call (e.g. `a.class`).
  1344. #
  1345. # Transitions to `expr_arg` afterwards.
  1346. #
  1347. # KEEP IN SYNC WITH expr_dot_after_newline!
  1348. #
  expr_dot := |*
      constant
      => { emit(token_type::tCONSTANT);
           fnext *arg_or_cmdarg(cmd_state); fbreak; };

      call_or_var
      => { emit(token_type::tIDENTIFIER);
           fnext *arg_or_cmdarg(cmd_state); fbreak; };

      # `tm` is set by ambiguous_fid_suffix; the token ends there and
      # scanning resumes right after it.
      bareword ambiguous_fid_suffix
      => { emit(token_type::tFID, tok(ts, tm), ts, tm);
           fnext *arg_or_cmdarg(cmd_state); p = tm - 1; fbreak; };

      # See the comment in `expr_fname`.
      operator_fname      |
      operator_arithmetic |
      operator_rest
      => { emit_table(PUNCTUATION);
           fnext expr_arg; fbreak; };

      # This breaks compatibility with Ruby for better partial parses (useful
      # for LSP especially). See comment for expr_dot_after_newline below.
      # Listed before w_any so that a bare newline takes this rule.
      w_newline
      => { fhold; fgoto expr_dot_after_newline; };

      w_any;

      c_any
      => { fhold; fgoto expr_end; };

      c_eof => do_eof;
  *|;
  1374. # KEEP IN SYNC WITH expr_dot!
  1375. #
  1376. # This state breaks from valid Ruby syntax, but in a way that enables Sorbet
  1377. # to recover better from parse errors. Recovering from parse errors is
  1378. # important because it lets us service LSP queries faster.
  1379. #
  # Specifically, this state makes it so that any keyword seen after w_newline
  # is emitted as a keyword (like kEND) instead of a tIDENTIFIER. Examples:
  1382. #
  1383. # # Valid Ruby, valid in Sorbet (no newline between '.' and 'end')
  1384. # def foo
  1385. # x.end
  1386. # end
  1387. #
  1388. # # Parse error in Ruby and Sorbet, but Sorbet at least sees the method def
  1389. # # with an empty body (Ruby wouldn't even see an empty method def)
  1390. # def foo
  1391. # x.
  1392. # end
  1393. #
  1394. # # Valid Ruby, not valid in Sorbet (newline between '.' and 'end')
  1395. # def foo
  1396. # x.
  1397. # end
  1398. # end
  1399. #
  expr_dot_after_newline := |*
      constant
      => { emit(token_type::tCONSTANT);
           fnext *arg_or_cmdarg(cmd_state); fbreak; };

      # This is different from expr_dot. Here, keywords are NOT identifiers.
      # Listed before call_or_var so it wins when both match the same text.
      keyword
      => { emit_table(KEYWORDS);
           fnext expr_end; fbreak; };

      call_or_var
      => { emit(token_type::tIDENTIFIER);
           fnext *arg_or_cmdarg(cmd_state); fbreak; };

      # `tm` is set by ambiguous_fid_suffix; the token ends there and
      # scanning resumes right after it.
      bareword ambiguous_fid_suffix
      => { emit(token_type::tFID, tok(ts, tm), ts, tm);
           fnext *arg_or_cmdarg(cmd_state); p = tm - 1; fbreak; };

      # See the comment in `expr_fname`.
      operator_fname      |
      operator_arithmetic |
      operator_rest
      => { emit_table(PUNCTUATION);
           fnext expr_arg; fbreak; };

      # Unlike expr_dot, further newlines are simply skipped here.
      w_any;

      c_any
      => { fhold; fgoto expr_end; };

      c_eof => do_eof;
  *|;
  1425. # The previous token emitted was a `tIDENTIFIER` or `tFID`; no space
  1426. # is consumed; the current expression is a command or method call.
  1427. #
  expr_arg := |*
      #
      # COMMAND MODE SPECIFIC TOKENS
      #

      # cmd (1 + 2)
      # See below the rationale about expr_endarg.
      # In both branches the token spans only the '(' itself (te - 1, te).
      w_space+ e_lparen
      => {
        if (version == ruby_version::RUBY_18) {
          emit(token_type::tLPAREN2, "(", te - 1, te);
          fnext expr_value; fbreak;
        } else {
          emit(token_type::tLPAREN_ARG, "(", te - 1, te);
          fnext expr_beg; fbreak;
        }
      };

      # meth(1 + 2)
      # Regular method call.
      e_lparen
      => { emit(token_type::tLPAREN2, "(");
           fnext expr_beg; fbreak; };

      # meth [...]
      # Array argument. Compare with indexing `meth[...]`.
      w_space+ e_lbrack
      => { emit(token_type::tLBRACK, "[", te - 1, te);
           fnext expr_beg; fbreak; };

      # cmd {}
      # Command: method call without parentheses.
      # The brace opens a lambda body when the top of lambda_stack equals
      # the current parenthesis nesting; that entry is then popped.
      w_space* e_lbrace
      => {
        if (!lambda_stack.empty() && lambda_stack.top() == paren_nest) {
          lambda_stack.pop();
          emit(token_type::tLAMBEG, "{", te - 1, te);
        } else {
          emit(token_type::tLCURLY, "{", te - 1, te);
        }
        command_start = true;
        fnext expr_value; fbreak;
      };

      #
      # AMBIGUOUS TOKENS RESOLVED VIA EXPR_BEG
      #

      # a??
      # Ternary operator
      '?' c_space_nl
      => {
        // Unlike expr_beg as invoked in the next rule, do not warn
        p = ts - 1;
        fgoto expr_end;
      };

      # a ?b, a? ?
      # Character literal or ternary operator
      w_space* '?'
      => { fhold; fgoto expr_beg; };

      # a %{1}, a %[1] (but not "a %=1=" or "a % foo")
      # a /foo/ (but not "a / foo" or "a /=foo")
      # a <<HEREDOC
      # `tm` marks the first character after the leading whitespace.
      w_space+ %{ tm = p; }
      ( [%/] ( c_any - c_space_nl - '=' ) # /
      | '<<'
      )
      => {
        if (*tm == '/') {
          // Ambiguous regexp literal.
          diagnostic_(dlevel::WARNING, dclass::AmbiguousLiteral, range(tm, tm + 1));
        }

        // Rescan from `tm` in expr_beg.
        p = tm - 1;
        fgoto expr_beg;
      };

      # x *1
      # Ambiguous splat, kwsplat or block-pass.
      w_space+ %{ tm = p; } ( '+' | '-' | '*' | '&' | '**' )
      => {
        diagnostic_(dlevel::WARNING, dclass::AmbiguousPrefix, range(tm, te), tok(tm, te));

        // Rescan the operator in expr_beg.
        p = tm - 1;
        fgoto expr_beg;
      };

      # x ::Foo
      # Ambiguous toplevel constant access.
      w_space+ '::'
      => { fhold; fhold; fgoto expr_beg; };

      # x:b
      # Symbol.
      w_space* ':'
      => { fhold; fgoto expr_beg; };

      # Rescan the whole match, whitespace included, in expr_beg.
      w_space+ label
      => { p = ts - 1; fgoto expr_beg; };

      #
      # AMBIGUOUS TOKENS RESOLVED VIA EXPR_END
      #

      # a ? b
      # Ternary operator.
      w_space+ %{ tm = p; } '?' c_space_nl
      => { p = tm - 1; fgoto expr_end; };

      # x + 1: Binary operator or operator-assignment.
      w_space* operator_arithmetic
                  ( '=' | c_space_nl )?    |
      # x rescue y: Modifier keyword.
      w_space* keyword_modifier            |
      # a &. b: Safe navigation operator.
      w_space* '&.'                        |
      # Miscellanea.
      w_space* punctuation_end
      => {
        // Rescan the whole match in expr_end.
        p = ts - 1;
        fgoto expr_end;
      };

      w_space;

      w_comment
      => { fgoto expr_end; };

      w_newline
      => { fhold; fgoto expr_end; };

      c_any
      => { fhold; fgoto expr_beg; };

      c_eof => do_eof;
  *|;
  # The previous token was an identifier which was seen while in the
  # command mode (that is, the state at the beginning of #advance was
  # expr_value). This state is very similar to expr_arg, but disambiguates
  # two very rare and specific conditions:
  #   * In 1.8 mode, "foo (lambda do end)".
  #   * In 1.9+ mode, "f x: -> do foo do end end".
  expr_cmdarg := |*
      # The token spans only the '(' itself; only the follow-up state
      # depends on the version.
      w_space+ e_lparen
      => {
        emit(token_type::tLPAREN_ARG, "(", te - 1, te);

        if (version != ruby_version::RUBY_18) {
          fnext expr_beg;
        } else {
          fnext expr_value;
        }
        fbreak;
      };

      # `do` is kDO_COND while cond is active, plain kDO otherwise.
      w_space* 'do'
      => {
        const auto do_type = cond.active() ? token_type::kDO_COND
                                           : token_type::kDO;
        emit(do_type, "do", te - 2, te);
        fnext expr_value; fbreak;
      };

      # Everything else is rescanned from the token start by expr_arg.
      c_any             |
      # Disambiguate with the `do' rule above.
      w_space* bareword |
      w_space* label
      => { p = ts - 1;
           fgoto expr_arg; };

      c_eof => do_eof;
  *|;
  1577. # The rationale for this state is pretty complex. Normally, if an argument
  1578. # is passed to a command and then there is a block (tLCURLY...tRCURLY),
  1579. # the block is attached to the innermost argument (`f` in `m f {}`), or it
  1580. # is a parse error (`m 1 {}`). But there is a special case for passing a single
  1581. # primary expression grouped with parentheses: if you write `m (1) {}` or
  1582. # (2.0 only) `m () {}`, then the block is attached to `m`.
  1583. #
  1584. # Thus, we recognize the opening `(` of a command (remember, a command is
  1585. # a method call without parens) as a tLPAREN_ARG; then, in parser, we recognize
  1586. # `tLPAREN_ARG expr rparen` as a `primary_expr` and before rparen, set the
  1587. # lexer's state to `expr_endarg`, which makes it emit the possibly following
  1588. # `{` as `tLBRACE_ARG`.
  1589. #
  1590. # The default post-`expr_endarg` state is `expr_end`, so this state also handles
  1591. # `do` (as `kDO_BLOCK` in `expr_beg`).
  expr_endarg := |*
      # `{` opens a lambda body when the top of lambda_stack equals the
      # current parenthesis nesting (that entry is popped); otherwise it is
      # the block brace tLBRACE_ARG.
      e_lbrace
      => {
        const bool opens_lambda = !lambda_stack.empty()
                               && lambda_stack.top() == paren_nest;
        if (opens_lambda) {
          lambda_stack.pop();
        }
        emit(opens_lambda ? token_type::tLAMBEG : token_type::tLBRACE_ARG, "{");

        command_start = true;
        fnext expr_value; fbreak;
      };

      'do'
      => {
        emit_do(true);
        fnext expr_value;
        fbreak;
      };

      # Spaces and comments are skipped.
      w_space_comment;

      # Anything else is handed back and lexed by expr_end.
      c_any
      => {
        fhold;
        fgoto expr_end;
      };

      c_eof => do_eof;
  *|;
  1612. # The rationale for this state is that several keywords accept value
  1613. # (i.e. should transition to `expr_beg`), do not accept it like a command
  1614. # (i.e. not an `expr_arg`), and must behave like a statement, that is,
  1615. # accept a modifier if/while/etc.
  1616. #
  expr_mid := |*
      # A modifier keyword is emitted as a keyword and followed by the
      # beginning of an expression.
      keyword_modifier
      => { emit_table(KEYWORDS);
           fnext expr_beg; fbreak; };

      # Any other bareword is rescanned from its start by expr_beg.
      bareword
      => { p = ts - 1; fgoto expr_beg; };

      w_space_comment;

      # A newline is handed back to expr_end rather than expr_beg.
      w_newline
      => { fhold; fgoto expr_end; };

      c_any
      => { fhold; fgoto expr_beg; };

      c_eof => do_eof;
  *|;
  1630. # Beginning of an expression.
  1631. #
  1632. # Don't fallthrough to this state from `c_any`; make sure to handle
  1633. # `c_space* c_nl` and let `expr_end` handle the newline.
  1634. # Otherwise code like `f\ndef x` gets glued together and the parser
  1635. # explodes.
  1636. #
  1637. expr_beg := |*
  1638. # +5, -5, - 5
  1639. [+\-] w_any* [0-9]
  1640. => {
  1641. emit(token_type::tUNARY_NUM, tok(ts, ts + 1), ts, ts + 1);
  1642. fhold; fnext expr_end; fbreak;
  1643. };
  1644. # splat *a
  1645. '*'
  1646. => { emit(token_type::tSTAR, "*");
  1647. fbreak; };
  1648. #
  1649. # STRING AND REGEXP LITERALS
  1650. #
  1651. # /regexp/oui
  1652. # /=/ (disambiguation with /=)
  1653. '/' c_any
  1654. => {
  1655. fhold; fgoto *push_literal(literal_type::SLASH_REGEXP, std::string(ts + 0, 1), ts);
  1656. };
  1657. # %<string>
  1658. '%' ( any - [A-Za-z] )
  1659. => {
  1660. fgoto *push_literal(literal_type::PERCENT_STRING, std::string(ts + 1, 1), ts);
  1661. };
  1662. # %w(we are the people)
  1663. '%' [A-Za-z]+ c_any
  1664. => {
  1665. literal_type type;
  1666. bool single_char_type = (ts + 3 == te);
  1667. if (single_char_type && ts[1] == 'q') {
  1668. type = literal_type::LOWERQ_STRING;
  1669. } else if (single_char_type && ts[1] == 'Q') {
  1670. type = literal_type::UPPERQ_STRING;
  1671. } else if (single_char_type && ts[1] == 'w') {
  1672. type = literal_type::LOWERW_WORDS;
  1673. } else if (single_char_type && ts[1] == 'W') {
  1674. type = literal_type::UPPERW_WORDS;
  1675. } else if (single_char_type && ts[1] == 'i') {
  1676. type = literal_type::LOWERI_SYMBOLS;
  1677. } else if (single_char_type && ts[1] == 'I') {
  1678. type = literal_type::UPPERI_SYMBOLS;
  1679. } else if (single_char_type && ts[1] == 's') {
  1680. type = literal_type::LOWERS_SYMBOL;
  1681. } else if (single_char_type && ts[1] == 'r') {
  1682. type = literal_type::PERCENT_REGEXP;
  1683. } else if (single_char_type && ts[1] == 'x') {
  1684. type = literal_type::LOWERX_XSTRING;
  1685. } else {
  1686. type = literal_type::PERCENT_STRING;
  1687. diagnostic_(dlevel::ERROR, dclass::UnexpectedPercentStr, range(ts, te - 1), tok(ts, te-1));
  1688. }
  1689. fgoto *push_literal(type, std::string(te - 1, 1), ts);
  1690. };
  1691. '%' c_eof
  1692. => {
  1693. diagnostic_(dlevel::FATAL, dclass::StringEof, range(ts, ts + 1));
  1694. };
  1695. # Heredoc start.
  1696. # <<END | <<'END' | <<"END" | <<`END` |
  1697. # <<-END | <<-'END' | <<-"END" | <<-`END` |
  1698. # <<~END | <<~'END' | <<~"END" | <<~`END`
  1699. '<<' [~\-]?
  1700. ( '"' ( c_line - '"' )* '"'
  1701. | "'" ( c_line - "'" )* "'"
  1702. | "`" ( c_line - "`" )* "`"
  1703. | bareword ) % { heredoc_e = p; }
  1704. c_line* c_nl % { new_herebody_s = p; }
  1705. => {
  1706. bool indent;
  1707. bool dedent_body;
  1708. const char* delim_s = ts + 2;
  1709. const char* delim_e = heredoc_e;
  1710. if (*delim_s == '-') {
  1711. indent = true;
  1712. dedent_body = false;
  1713. delim_s++;
  1714. } else if (*delim_s == '~') {
  1715. indent = true;
  1716. dedent_body = true;
  1717. delim_s++;
  1718. } else {
  1719. indent = false;
  1720. dedent_body = false;
  1721. }
  1722. literal_type type;
  1723. if (*delim_s == '"') {
  1724. type = literal_type::DQUOTE_HEREDOC;
  1725. delim_s++;
  1726. delim_e--;
  1727. } else if (*delim_s == '\'') {
  1728. type = literal_type::SQUOTE_HEREDOC;
  1729. delim_s++;
  1730. delim_e--;
  1731. } else if (*delim_s == '`') {
  1732. type = literal_type::BACKTICK_HEREDOC;
  1733. delim_s++;
  1734. delim_e--;
  1735. } else {
  1736. type = literal_type::DQUOTE_HEREDOC;
  1737. }
  1738. if (dedent_body && (version == ruby_version::RUBY_18 ||
  1739. version == ruby_version::RUBY_19 ||
  1740. version == ruby_version::RUBY_20 ||
  1741. version == ruby_version::RUBY_21 ||
  1742. version == ruby_version::RUBY_22)) {
  1743. emit(token_type::tLSHFT, "<<", ts, ts + 2);
  1744. p = ts + 1;
  1745. fnext expr_beg; fbreak;
  1746. } else {
  1747. fnext *push_literal(type, std::string(delim_s, (size_t)(delim_e - delim_s)), ts, heredoc_e, indent, dedent_body);
  1748. if (!herebody_s) {
  1749. herebody_s = new_herebody_s;
  1750. }
  1751. p = herebody_s - 1;
  1752. }
  1753. };
  1754. #
  1755. # SYMBOL LITERALS
  1756. #
  1757. # :&&, :||
  1758. ':' ('&&' | '||') => {
  1759. fhold; fhold;
  1760. emit(token_type::tSYMBEG, tok(ts, ts + 1), ts, ts + 1);
  1761. fgoto expr_fname;
  1762. };
  1763. # :"bar", :'baz'
  1764. ':' ['"] # '
  1765. => {
  1766. literal_type type;
  1767. if (ts[1] == '\'') {
  1768. type = literal_type::SQUOTE_SYMBOL;
  1769. } else { // '"'
  1770. type = literal_type::DQUOTE_SYMBOL;
  1771. }
  1772. fgoto *push_literal(type, std::string(ts + 1, 1), ts);
  1773. };
  1774. # :!@ is :!
  1775. # :~@ is :~
  1776. ':' [!~] '@'
  1777. => {
  1778. emit(token_type::tSYMBEG, tok(ts + 1, ts + 2), ts, te);
  1779. fnext expr_end; fbreak;
  1780. };
  1781. ':' bareword ambiguous_symbol_suffix
  1782. => {
  1783. emit(token_type::tSYMBOL, tok(ts + 1, tm), ts, tm);
  1784. p = tm - 1;
  1785. fnext expr_end; fbreak;
  1786. };
  1787. ':' ( bareword | global_var | class_var | instance_var |
  1788. operator_fname | operator_arithmetic | operator_rest )
  1789. => {
  1790. emit(token_type::tSYMBOL, tok(ts + 1), ts, te);
  1791. fnext expr_end; fbreak;
  1792. };
  1793. #
  1794. # AMBIGUOUS TERNARY OPERATOR
  1795. #
  1796. # Character constant, like ?a, ?\n, ?\u1000, and so on
  1797. # Don't accept \u escape with multiple codepoints, like \u{1 2 3}
  1798. '?' ( e_bs ( escape - ( '\u{' (xdigit+ [ \t]+)+ xdigit+ '}' ))
  1799. | (c_any - c_space_nl - e_bs) % { escape = nullptr; }
  1800. )
  1801. => {
  1802. if (version == ruby_version::RUBY_18) {
  1803. emit(token_type::tINTEGER, std::to_string(static_cast<unsigned char>(ts[1])));
  1804. } else {
  1805. emit(token_type::tCHARACTER, escape ? *escape : tok(ts + 1));
  1806. }
  1807. fnext expr_end; fbreak;
  1808. };
  1809. '?' c_space_nl
  1810. => {
  1811. static const struct escape_map_ent { char c; const char* s; } escape_map[] {
  1812. { ' ', "\\s" },
  1813. { '\r', "\\r" },
  1814. { '\n', "\\n" },
  1815. { '\t', "\\t" },
  1816. { '\v', "\\v" },
  1817. { '\f', "\\f" },
  1818. { 0, 0 },
  1819. };
  1820. for (const struct escape_map_ent* ent = escape_map; ent->c; ++ent) {
  1821. if (ts[1] == ent->c) {
  1822. diagnostic_(dlevel::WARNING, dclass::InvalidEscapeUse, ent->s);
  1823. break;
  1824. }
  1825. }
  1826. p = ts - 1;
  1827. fgoto expr_end;
  1828. };
  1829. '?' c_eof
  1830. => {
  1831. diagnostic_(dlevel::FATAL, dclass::IncompleteEscape, range(ts, ts + 1));
  1832. };
  1833. # f ?aa : b: Disambiguate with a character literal.
  1834. '?' [A-Za-z_] bareword
  1835. => {
  1836. p = ts - 1;
  1837. fgoto expr_end;
  1838. };
  1839. #
  1840. # KEYWORDS AND PUNCTUATION
  1841. #
  1842. # a({b=>c})
  1843. e_lbrace
  1844. => {
  1845. if (!lambda_stack.empty() && lambda_stack.top() == paren_nest) {
  1846. lambda_stack.pop();
  1847. command_start = true;
  1848. emit(token_type::tLAMBEG, "{");
  1849. } else {
  1850. emit(token_type::tLBRACE, "{");
  1851. }
  1852. fbreak;
  1853. };
  1854. # a([1, 2])
  1855. e_lbrack
  1856. => { emit(token_type::tLBRACK, "[");
  1857. fbreak; };
  1858. # a()
  1859. e_lparen
  1860. => { emit(token_type::tLPAREN, "(");
  1861. fbreak; };
  1862. # a(+b)
  1863. punctuation_begin
  1864. => { emit_table(PUNCTUATION_BEGIN);
  1865. fbreak; };
  1866. # rescue Exception => e: Block rescue.
  1867. # Special because it should transition to expr_mid.
  1868. 'rescue' %{ tm = p; } '=>'?
  1869. => { emit(token_type::kRESCUE, "rescue", ts, tm);
  1870. p = tm - 1;
  1871. fnext expr_mid; fbreak; };
  1872. # if a: Statement if.
  1873. keyword_modifier
  1874. => { emit_table(KEYWORDS_BEGIN);
  1875. command_start = true;
  1876. fnext expr_value; fbreak; };
  1877. #
  1878. # RUBY 1.9 HASH LABELS
  1879. #
  1880. label ( any - ':' )
  1881. => {
  1882. fhold;
  1883. if (version == ruby_version::RUBY_18) {
  1884. auto ident = tok(ts, te - 2);
  1885. if (*ts >= 'A' && *ts <= 'Z') {
  1886. emit(token_type::tCONSTANT, ident, ts, te - 2);
  1887. } else {
  1888. emit(token_type::tIDENTIFIER, ident, ts, te - 2);
  1889. }
  1890. fhold; // continue as a symbol
  1891. if (is_declared(ident)) {
  1892. fnext expr_end;
  1893. } else {
  1894. fnext *arg_or_cmdarg(cmd_state);
  1895. }
  1896. } else {
  1897. emit(token_type::tLABEL, tok(ts, te - 2), ts, te - 1);
  1898. fnext expr_labelarg;
  1899. }
  1900. fbreak;
  1901. };
  1902. #
  1903. # CONTEXT-DEPENDENT VARIABLE LOOKUP OR COMMAND INVOCATION
  1904. #
  1905. # foo= bar: Disambiguate with bareword rule below.
  1906. bareword ambiguous_ident_suffix |
  1907. # def foo: Disambiguate with bareword rule below.
  1908. keyword
  1909. => { p = ts - 1;
  1910. fgoto expr_end; };
  1911. # a = 42; a [42]: Indexing.
  1912. # def a; end; a [42]: Array argument.
  1913. call_or_var
  1914. => local_ident;
  1915. (call_or_var - keyword)
  1916. % { ident_tok = tok(ts, te); ident_ts = ts; ident_te = te; }
  1917. w_space+ '('
  1918. => {
  1919. emit(token_type::tIDENTIFIER, ident_tok, ident_ts, ident_te);
  1920. p = ident_te - 1;
  1921. fnext expr_cmdarg;
  1922. fbreak;
  1923. };
  1924. #
  1925. # WHITESPACE
  1926. #
  1927. w_any;
  1928. e_heredoc_nl '=begin' ( c_space | c_nl_zlen )
  1929. => { p = ts - 1;
  1930. cs_before_block_comment = cs;
  1931. fgoto line_begin; };
  1932. #
  1933. # DEFAULT TRANSITION
  1934. #
  1935. # The following rules match most binary and all unary operators.
  1936. # Rules for binary operators provide better error reporting.
  1937. operator_arithmetic '=' |
  1938. operator_rest |
  1939. punctuation_end |
  1940. c_any
  1941. => { p = ts - 1; fgoto expr_end; };
  1942. c_eof => do_eof;
  1943. *|;
  1944. # Special newline handling for "def a b:"
  1945. #
  # The label rule earlier in this file selects this state with
  # `fnext expr_labelarg` right after emitting tLABEL. The scanner only
  # decides what to do with a newline that follows; any other input is
  # handed on to another state.
  1946. expr_labelarg := |*
  # No action is attached, so nothing is emitted for this match.
  1947. w_space_comment;
  # Newline: when in_kwarg is set, the newline is given back (fhold) and
  # re-scanned by expr_end; otherwise scanning continues in line_begin.
  1948. w_newline
  1949. => {
  1950. if (in_kwarg) {
  1951. fhold; fgoto expr_end;
  1952. } else {
  1953. fgoto line_begin;
  1954. }
  1955. };
  # Anything else: give the character back and re-scan it in expr_beg.
  1956. c_any
  1957. => { fhold; fgoto expr_beg; };
  # End of input is delegated to the named action do_eof (defined elsewhere).
  1958. c_eof => do_eof;
  1959. *|;
  1960. # Like expr_beg, but no 1.9 label or 2.2 quoted label possible.
  1961. #
  # Rules here either handle the input directly (labels, quotes, whitespace,
  # newline) or give it back and forward to expr_beg. Ragel scanner priority
  # applies: the longest match wins and, on a tie, the rule listed first wins.
  1962. expr_value := |*
  1963. # a:b: a(:b), a::B, A::B
  # The whole match is un-read (p = ts - 1) and re-scanned by expr_end, so no
  # label token is produced in this state.
  1964. label (any - ':')
  1965. => { p = ts - 1;
  1966. fgoto expr_end; };
  1967. # "bar", 'baz'
  # The quote character at ts picks SQUOTE_STRING or DQUOTE_STRING; the
  # scanner then jumps to the state returned by push_literal, which receives
  # the matched text (tok()) and the start position ts.
  1968. ['"] # '
  1969. => {
  1970. literal_type type;
  1971. if (ts[0] == '\'') {
  1972. type = literal_type::SQUOTE_STRING;
  1973. } else { // '"'
  1974. type = literal_type::DQUOTE_STRING;
  1975. }
  1976. fgoto *push_literal(type, tok(), ts);
  1977. };
  # No action is attached, so nothing is emitted for this match.
  1978. w_space_comment;
  # A newline switches to line_begin without emitting a token here.
  1979. w_newline
  1980. => { fgoto line_begin; };
  # Anything else: give the character back and re-scan it in expr_beg.
  1981. c_any
  1982. => { fhold; fgoto expr_beg; };
  # End of input is delegated to the named action do_eof (defined elsewhere).
  1983. c_eof => do_eof;
  1984. *|;
  # expr_end scanner. Other scanners hand input back to this state with
  # `fgoto expr_end` (e.g. expr_labelarg, expr_value and leading_dot).
  # Ragel scanner priority applies: the longest match wins and, for matches
  # of equal length, the rule that appears first wins, so the order of the
  # rules below is significant.
  1985. expr_end := |*
  1986. #
  1987. # STABBY LAMBDA
  1988. #
  # Emits tLAMBDA for the two characters at ts and pushes the current
  # paren_nest onto lambda_stack; the `e_lbrace | 'do'` rule below compares
  # against that value. Next state: expr_endfn.
  1989. '->'
  1990. => {
  1991. emit(token_type::tLAMBDA, "->", ts, ts + 2);
  1992. lambda_stack.push(paren_nest);
  1993. fnext expr_endfn; fbreak;
  1994. };
  # '{' or 'do'. When the top of lambda_stack equals paren_nest the entry is
  # popped and tLAMBEG / kDO_LAMBDA is emitted; otherwise '{' becomes tLCURLY
  # and 'do' is delegated to emit_do(). In both cases command_start is set and
  # the next state is expr_value.
  1995. e_lbrace | 'do'
  1996. => {
  1997. if (!lambda_stack.empty() && lambda_stack.top() == paren_nest) {
  1998. lambda_stack.pop();
  1999. if (ts[0] == '{') {
  2000. emit(token_type::tLAMBEG, "{");
  2001. } else { // 'do'
  2002. emit(token_type::kDO_LAMBDA, "do");
  2003. }
  2004. } else {
  2005. if (ts[0] == '{') {
  2006. emit(token_type::tLCURLY, "{");
  2007. } else { // 'do'
  2008. emit_do();
  2009. }
  2010. }
  2011. command_start = true;
  2012. fnext expr_value; fbreak;
  2013. };
  2014. #
  2015. # KEYWORDS
  2016. #
  # Each keyword group is emitted through emit_table(KEYWORDS); the groups
  # differ only in the state selected afterwards.
  2017. keyword_with_fname
  2018. => { emit_table(KEYWORDS);
  2019. fnext expr_fname; fbreak; };
  # One match produces two tokens: kCLASS for the first five characters and
  # tLSHFT for the last two; whatever w_any* matched in between gets no token.
  2020. 'class' w_any* '<<'
  2021. => { emit(token_type::kCLASS, "class", ts, ts + 5);
  2022. emit(token_type::tLSHFT, "<<", te - 2, te);
  2023. fnext expr_value; fbreak; };
  2024. # a if b:c: Syntax error.
  2025. keyword_modifier
  2026. => { emit_table(KEYWORDS);
  2027. fnext expr_beg; fbreak; };
  2028. # elsif b:c: elsif b(:c)
  2029. keyword_with_value
  2030. => { emit_table(KEYWORDS);
  2031. command_start = true;
  2032. fnext expr_value; fbreak; };
  2033. keyword_with_mid
  2034. => { emit_table(KEYWORDS);
  2035. fnext expr_mid; fbreak; };
  # Under RUBY_18 the three-character match `not` continues in expr_beg;
  # every other match of this group continues in expr_arg.
  2036. keyword_with_arg
  2037. => {
  2038. emit_table(KEYWORDS);
  2039. if (version == ruby_version::RUBY_18 && ts + 3 == te && ts[0] == 'n' && ts[1] == 'o' && ts[2] == 't') {
  2040. fnext expr_beg; fbreak;
  2041. } else {
  2042. fnext expr_arg; fbreak;
  2043. }
  2044. };
  # RUBY_18: emitted as a plain tIDENTIFIER, and the state changes (to
  # arg_or_cmdarg(cmd_state)) only when the name is not declared.
  # Other versions: emitted as k__ENCODING__. Where no fnext runs, the
  # current state is kept.
  2045. '__ENCODING__'
  2046. => {
  2047. if (version == ruby_version::RUBY_18) {
  2048. auto ident = tok();
  2049. emit(token_type::tIDENTIFIER, ident);
  2050. if (!is_declared(ident)) {
  2051. fnext *arg_or_cmdarg(cmd_state);
  2052. }
  2053. } else {
  2054. emit(token_type::k__ENCODING__, "__ENCODING__");
  2055. }
  2056. fbreak;
  2057. };
  # No fnext here: the token is emitted and the current state is kept.
  2058. keyword_with_end
  2059. => { emit_table(KEYWORDS);
  2060. fbreak; };
  2061. #
  2062. # NUMERIC LITERALS
  2063. #
  # Integer literals. The leaving actions (%{ ... }) record the radix in
  # num_base and the start of the digits in num_digits_s: the current p for
  # the two-character prefixes (0x, 0d, 0o, 0b) and ts for the unprefixed
  # forms. num_suffix_s is recorded just before int_suffix.
  # Diagnostics issued by the action:
  #   * character before the suffix is '_'          -> TrailingInNumber
  #   * no digits, base 8, RUBY_18                  -> accepted silently
  #   * no digits otherwise                         -> EmptyNumeric
  #   * base 8 and a digit '8' or '9' is present    -> InvalidOctal (per digit)
  # RUBY_18/19/20 emit tINTEGER for ts..num_suffix_s and set p to
  # num_suffix_s - 1, so the suffix is not part of the token; other versions
  # pass the converted digits to emit_num().
  2064. ( '0' [Xx] %{ num_base = 16; num_digits_s = p; } int_hex
  2065. | '0' [Dd] %{ num_base = 10; num_digits_s = p; } int_dec
  2066. | '0' [Oo] %{ num_base = 8; num_digits_s = p; } int_dec
  2067. | '0' [Bb] %{ num_base = 2; num_digits_s = p; } int_bin
  2068. | [1-9] digit* '_'? %{ num_base = 10; num_digits_s = ts; } int_dec
  2069. | '0' digit* '_'? %{ num_base = 8; num_digits_s = ts; } int_dec
  2070. ) %{ num_suffix_s = p; } int_suffix
  2071. => {
  2072. auto digits = tok(num_digits_s, num_suffix_s);
  2073. if (num_suffix_s[-1] == '_') {
  2074. diagnostic_(dlevel::ERROR, dclass::TrailingInNumber, range(te - 1, te), "_");
  2075. } else if (num_digits_s == num_suffix_s && num_base == 8 && version == ruby_version::RUBY_18) {
  2076. // 1.8 did not raise an error on 0o.
  2077. } else if (num_digits_s == num_suffix_s) {
  2078. diagnostic_(dlevel::ERROR, dclass::EmptyNumeric);
  2079. } else if (num_base == 8) {
  2080. for (const char* digit_p = num_digits_s; digit_p < num_suffix_s; digit_p++) {
  2081. if (*digit_p == '8' || *digit_p == '9') {
  2082. diagnostic_(dlevel::ERROR, dclass::InvalidOctal,
  2083. range(digit_p, digit_p + 1));
  2084. }
  2085. }
  2086. }
  2087. if (version == ruby_version::RUBY_18 || version == ruby_version::RUBY_19 || version == ruby_version::RUBY_20) {
  2088. emit(token_type::tINTEGER, convert_base(digits, num_base), ts, num_suffix_s);
  2089. p = num_suffix_s - 1;
  2090. } else {
  2091. emit_num(convert_base(digits, num_base));
  2092. }
  2093. fbreak;
  2094. };
  # Reports NoDotDigitLiteral; the action emits no token.
  2095. flo_frac flo_pow?
  2096. => {
  2097. diagnostic_(dlevel::ERROR, dclass::NoDotDigitLiteral);
  2098. };
  # Integer part followed by a bare e/E. RUBY_18/19/20 report the last
  # character as TrailingInNumber. Other versions emit tINTEGER for everything
  # except the last character and give that character back with fhold.
  2099. flo_int [eE]
  2100. => {
  2101. if (version == ruby_version::RUBY_18 || version == ruby_version::RUBY_19 || version == ruby_version::RUBY_20) {
  2102. diagnostic_(dlevel::ERROR, dclass::TrailingInNumber, range(te - 1, te), tok(te-1, te));
  2103. } else {
  2104. emit(token_type::tINTEGER, tok(ts, te - 1), ts, te - 1);
  2105. fhold; fbreak;
  2106. }
  2107. };
  # Same handling as the rule above, but with a fractional part: the token
  # emitted by the non-error branch is tFLOAT.
  2108. flo_int flo_frac [eE]
  2109. => {
  2110. if (version == ruby_version::RUBY_18 || version == ruby_version::RUBY_19 || version == ruby_version::RUBY_20) {
  2111. diagnostic_(dlevel::ERROR, dclass::TrailingInNumber, range(te - 1, te), tok(te - 1, te));
  2112. } else {
  2113. emit(token_type::tFLOAT, tok(ts, te - 1), ts, te - 1);
  2114. fhold; fbreak;
  2115. }
  2116. };
  # Float literals. num_suffix_s is recorded where the suffix pattern starts.
  # RUBY_18/19/20 emit tFLOAT for ts..num_suffix_s and set p to
  # num_suffix_s - 1, so the suffix is not part of the token; other versions
  # pass the text to emit_num().
  2117. flo_int
  2118. ( flo_frac? flo_pow %{ num_suffix_s = p; } flo_pow_suffix
  2119. | flo_frac %{ num_suffix_s = p; } flo_suffix
  2120. )
  2121. => {
  2122. auto digits = tok(ts, num_suffix_s);
  2123. if (version == ruby_version::RUBY_18 || version == ruby_version::RUBY_19 || version == ruby_version::RUBY_20) {
  2124. emit(token_type::tFLOAT, digits, ts, num_suffix_s);
  2125. p = num_suffix_s - 1;
  2126. } else {
  2127. emit_num(digits);
  2128. }
  2129. fbreak;
  2130. };
  2131. #
  2132. # STRING AND XSTRING LITERALS
  2133. #
  2134. # `echo foo`, "bar", 'baz'
  # The character at ts selects the literal type. push_literal receives the
  # one-character delimiter taken from te - 1 and the start position ts, and
  # the scanner jumps to the state it returns.
  # NOTE(review): the meaning of the trailing `nullptr, false, false, true`
  # arguments is defined by push_literal, which is not visible in this chunk.
  2135. '`' | ['"] # '
  2136. => {
  2137. literal_type type;
  2138. if (ts[0] == '`') {
  2139. type = literal_type::BACKTICK_XSTRING;
  2140. } else if (ts[0] == '\'') {
  2141. type = literal_type::SQUOTE_STRING;
  2142. } else { // '"'
  2143. type = literal_type::DQUOTE_STRING;
  2144. }
  2145. fgoto *push_literal(type, std::string(te - 1, 1), ts, nullptr, false, false, true);
  2146. };
  2147. #
  2148. # CONSTANTS AND VARIABLES
  2149. #
  # Plain constant: tCONSTANT, then the state chosen by
  # arg_or_cmdarg(cmd_state).
  2150. constant
  2151. => { emit(token_type::tCONSTANT);
  2152. fnext *arg_or_cmdarg(cmd_state); fbreak; };
  # Constant followed by an ambiguous suffix: only ts..tm becomes the token
  # and p is set to tm - 1 so the rest is scanned again. tm is presumably
  # set inside ambiguous_const_suffix, which is defined outside this chunk.
  2153. constant ambiguous_const_suffix
  2154. => { emit(token_type::tCONSTANT, tok(ts, tm), ts, tm);
  2155. p = tm - 1; fbreak; };
  # The match is un-read (p = ts - 1) and handed to expr_variable via fcall.
  2156. global_var | class_var_v | instance_var_v
  2157. => { p = ts - 1; fcall expr_variable; };
  2158. #
  2159. # METHOD CALLS
  2160. #
  2161. '.' | '&.' | '::'
  2162. => { emit_table(PUNCTUATION);
  2163. fnext expr_dot; fbreak; };
  # Delegated to the named action local_ident (defined elsewhere).
  2164. call_or_var
  2165. => local_ident;
  # tm == te distinguishes a suffix that belongs to the identifier (tFID)
  # from one that does not (tIDENTIFIER for ts..tm, then p = tm - 1).
  2166. bareword ambiguous_fid_suffix
  2167. => {
  2168. if (tm == te) {
  2169. // Suffix was consumed, e.g. foo!
  2170. emit(token_type::tFID);
  2171. } else {
  2172. // Suffix was not consumed, e.g. foo!=
  2173. emit(token_type::tIDENTIFIER, tok(ts, tm), ts, tm);
  2174. p = tm - 1;
  2175. }
  2176. fnext expr_arg; fbreak;
  2177. };
  2178. #
  2179. # OPERATORS
  2180. #
  # Note: this rule switches state with fgoto and has no fbreak, unlike the
  # operator rules that follow, which use `fnext ...; fbreak;`.
  2181. '*' | '=>'
  2182. => {
  2183. emit_table(PUNCTUATION);
  2184. fgoto expr_value;
  2185. };
  2186. # When '|', '~', '!', '=>' are used as operators
  2187. # they do not accept any symbols (or quoted labels) after.
  2188. # Other binary operators accept it.
  2189. ( operator_arithmetic | operator_rest ) - ( '|' | '~' | '!' | '*' )
  2190. => {
  2191. emit_table(PUNCTUATION);
  2192. fnext expr_value; fbreak;
  2193. };
  2194. ( e_lparen | '|' | '~' | '!' )
  2195. => { emit_table(PUNCTUATION);
  2196. fnext expr_beg; fbreak; };
  # Closing bracket: emit it, then pop one entry from cond and from cmdarg.
  # '}' and ']' explicitly select expr_end; for ')' no state is selected, so
  # the current state is kept.
  2197. e_rbrace | e_rparen | ']'
  2198. => {
  2199. emit_table(PUNCTUATION);
  2200. cond.pop();
  2201. cmdarg.pop();
  2202. if (ts[0] == '}' || ts[0] == ']') {
  2203. fnext expr_end;
  2204. } else { // ')'
  2205. // this was commented out in the original lexer.rl:
  2206. // fnext expr_endfn; ?
  2207. }
  2208. fbreak;
  2209. };
  # Compound assignment: the token text is the match without its final '='.
  2210. operator_arithmetic '='
  2211. => { emit(token_type::tOP_ASGN, tok(ts, te - 1));
  2212. fnext expr_beg; fbreak; };
  2213. '?'
  2214. => { emit(token_type::tEH, "?");
  2215. fnext expr_value; fbreak; };
  2216. e_lbrack
  2217. => { emit(token_type::tLBRACK2, "[");
  2218. fnext expr_beg; fbreak; };
  2219. punctuation_end
  2220. => { emit_table(PUNCTUATION);
  2221. fnext expr_beg; fbreak; };
  2222. #
  2223. # WHITESPACE
  2224. #
  # No action is attached, so nothing is emitted for this match.
  2225. w_space_comment;
  # A newline is not turned into a token here; leading_dot decides whether
  # a tNL is emitted.
  2226. w_newline
  2227. => { fgoto leading_dot; };
  2228. ';'
  2229. => { emit(token_type::tSEMI, ";");
  2230. command_start = true;
  2231. fnext expr_value; fbreak; };
  # Backslash followed by a c_line character: reports BareBackslash for the
  # backslash itself (ts..ts + 1), then gives one character back with fhold.
  2232. '\\' c_line {
  2233. diagnostic_(dlevel::ERROR, dclass::BareBackslash, range(ts, ts + 1));
  2234. fhold;
  2235. };
  # Fallback for any character no earlier rule accepted: reports Unexpected
  # with the matched text.
  2236. c_any
  2237. => {
  2238. diagnostic_(dlevel::ERROR, dclass::Unexpected, tok());
  2239. };
  # End of input is delegated to the named action do_eof (defined elsewhere).
  2240. c_eof => do_eof;
  2241. *|;
  # Entered from expr_end when it sees w_newline. Decides whether that
  # newline ends the statement (emit tNL) or whether the following text
  # begins with '.' / '&.', in which case lexing continues in expr_end.
  2242. leading_dot := |*
  2243. # Insane leading dots:
  2244. # a #comment
  2245. # .b: a.b
  # tm is recorded when c_space* is left, i.e. at the dot. The action sets
  # p = tm - 1 and returns to expr_end; no tNL is emitted on this path.
  2246. c_space* %{ tm = p; } ('.' | '&.')
  2247. => { p = tm - 1; fgoto expr_end; };
  # Anything else: emit a tNL with empty text located at
  # newline_s..newline_s + 1 (newline_s is set outside this chunk), give the
  # character back, and select line_begin as the next state.
  2248. any
  2249. => { emit(token_type::tNL, std::string(), newline_s, newline_s + 1);
  2250. fhold; fnext line_begin; fbreak; };
  2251. *|;
  2252. #
  2253. # === EMBEDDED DOCUMENT (aka BLOCK COMMENT) PARSING ===
  2254. #
  # Scans the inside of an embedded document. Entered from line_begin's
  # '=begin' rule, which stores the position of '=begin' in eq_begin_s.
  2255. line_comment := |*
  # Terminator line: emit one comment covering eq_begin_s..te, then jump to
  # the state stored in cs_before_block_comment. Note that line_begin's
  # '=begin' rule does not assign that variable itself; the value is whatever
  # was stored last (for example by the '=begin' rule that saves cs before
  # redirecting to line_begin).
  2256. '=end' c_line* c_nl_zlen
  2257. => {
  2258. emit_comment(eq_begin_s, te);
  2259. fgoto *cs_before_block_comment;
  2260. };
  # Any other complete line: no action attached, nothing is emitted.
  2261. c_line* c_nl;
  # A line that ends in zlen instead of c_nl (presumably the input ran out
  # before '=end' was seen; confirm against the c_nl / zlen definitions):
  # FATAL EmbeddedDocument located at the six characters of the opening
  # '=begin'.
  2262. c_line* zlen
  2263. => {
  2264. diagnostic_(dlevel::FATAL, dclass::EmbeddedDocument,
  2265. range(eq_begin_s, eq_begin_s + "=begin"s.size()));
  2266. };
  2267. *|;
  # Scanner used at the start of a line; other scanners reach it with
  # `fgoto line_begin` / `fnext line_begin` after a newline.
  2268. line_begin := |*
  # No action is attached, so nothing is emitted for this match.
  2269. w_any;
  # Start of an embedded document: remember where '=begin' starts and
  # continue in line_comment.
  2270. '=begin' ( c_space | c_nl_zlen )
  2271. => { eq_begin_s = ts;
  2272. fgoto line_comment; };
  # '__END__' followed by a non-empty end of line: p is moved to pe - 3, so
  # the remaining input is not tokenised.
  # NOTE(review): the reason for the offset 3 is not visible in this chunk
  # (presumably tied to how the input buffer is terminated) - confirm.
  2273. '__END__' ( c_eol - zlen )
  2274. => { p = pe - 3; };
  # Anything else starts a statement: set cmd_state, give the character back
  # and re-scan it in expr_value.
  2275. c_any
  2276. => { cmd_state = true; fhold; fgoto expr_value; };
  # End of input is delegated to the named action do_eof (defined elsewhere).
  2277. c_eof => do_eof;
  2278. *|;
  2279. }%%
  2280. token_t lexer::advance() {
  2281. auto tok = advance_();
  2282. last_token_s = tok->start();
  2283. last_token_e = tok->end();
  2284. return tok;
  2285. }
  2286. void lexer::extend_static() {
  2287. static_env.emplace();
  2288. }
  2289. void lexer::extend_dynamic() {
  2290. if (static_env.empty()) {
  2291. static_env.emplace();
  2292. } else {
  2293. environment& env = static_env.top();
  2294. static_env.push(env);
  2295. }
  2296. }
  2297. void lexer::unextend() {
  2298. static_env.pop();
  2299. }
  2300. void lexer::declare(const std::string& name) {
  2301. static_env.top().insert(name);
  2302. }
  2303. bool lexer::is_declared(const std::string& identifier) const {
  2304. const environment& env = static_env.top();
  2305. return env.find(identifier) != env.end();
  2306. }
  2307. optional_size lexer::dedentLevel() {
  2308. // We erase @dedentLevel as a precaution to avoid accidentally
  2309. // using a stale value.
  2310. auto ret = dedentLevel_;
  2311. dedentLevel_ = std::nullopt;
  2312. return ret;
  2313. }