12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753 |
- /*
- Copyright (c) 2013-2016 whitequark <whitequark@whitequark.org>
- Parts of the source are derived from ruby_parser:
- Copyright (c) Ryan Davis, seattle.rb
- This lexer is a rewrite of the original in Ragel/C:
- Copyright (c) Charlie Somerville, GitHub
- MIT License
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
- %%machine lex; # % fix highlighting
- /*
- #
- # === BEFORE YOU START ===
- #
- # Read the Ruby Hacking Guide chapter 11, available in English at
- # http://whitequark.org/blog/2013/04/01/ruby-hacking-guide-ch-11-finite-state-lexer/
- #
- # Remember two things about Ragel scanners:
- #
- # 1) Longest match wins.
- #
- # 2) If two matches have the same length, the first
- # in source code wins.
- #
- # General rules of making Ragel and Bison happy:
- #
- # * `p` (position) and `@te` contain the index of the character
- # they're pointing to ("current"), plus one. `@ts` contains the index
- # of the corresponding character. The code for extracting matched token is:
- #
- # @source_buffer.slice(@ts...@te)
- #
- # * If your input is `foooooooobar` and the rule is:
- #
- # 'f' 'o'+
- #
- # the result will be:
- #
- # foooooooobar
- # ^ ts=0 ^ p=te=9
- #
- # * A Ragel lexer action should not emit more than one token, unless
- # you know what you are doing.
- #
- # * All Ragel commands (fnext, fgoto, ...) end with a semicolon.
- #
- # * If an action emits the token and transitions to another state, use
- # these Ragel commands:
- #
- # emit($whatever)
- # fnext $next_state; fbreak;
- #
- # If you perform `fgoto` in an action which does not emit a token nor
- # rewinds the stream pointer, the parser's side-effectful,
- # context-sensitive lookahead actions will break in a hard to detect
- # and debug way.
- #
- # * If an action does not emit a token:
- #
- # fgoto $next_state;
- #
- # * If an action features lookbehind, i.e. matches characters with the
- # intent of passing them to another action:
- #
- # p = @ts - 1
- # fgoto $next_state;
- #
- # or, if the lookbehind consists of a single character:
- #
- # fhold; fgoto $next_state;
- #
- # * Ragel merges actions. So, if you have `e_lparen = '(' %act` and
- # `c_lparen = '('` and a lexer action `e_lparen | c_lparen`, the result
- # _will_ invoke the action `act`.
- #
- # e_something stands for "something with **e**mbedded action".
- #
- # * EOF is explicit and is matched by `c_eof`. If you want to introspect
- # the state of the lexer, add this rule to the state:
- #
- # c_eof => do_eof;
- #
- # * If you proceed past EOF, the lexer will complain:
- #
- # NoMethodError: undefined method `ord' for nil:NilClass
- #
- */
- #include <ruby_parser/driver.hh>
- #include <cassert>
- #include "absl/strings/numbers.h"
- %% write data nofinal;
- using namespace ruby_parser;
- using namespace std::string_literals;
- %% prepush { check_stack_capacity(); }
- lexer::lexer(diagnostics_t &diag, ruby_version version, const std::string& source_buffer_)
- : diagnostics(diag)
- , version(version)
- , source_buffer(source_buffer_ + std::string("\0\0", 2))
- , cs(lex_en_line_begin)
- , _p(source_buffer.data())
- , _pe(source_buffer.data() + source_buffer.size())
- , ts(nullptr)
- , te(nullptr)
- , act(0)
- , top(0)
- , eq_begin_s(nullptr)
- , sharp_s(nullptr)
- , newline_s(nullptr)
- , paren_nest(0)
- , command_start(true)
- , num_base(0)
- , num_digits_s(nullptr)
- , num_suffix_s(nullptr)
- , num_xfrm(num_xfrm_type::NONE)
- , escape_s(nullptr)
- , herebody_s(nullptr)
- , in_kwarg(false)
- {
- // ensure the stack is non-empty so we can just double in
- // check_stack_capacity:
- stack.resize(16);
- static_env.push(environment());
- cs_before_block_comment = lex_en_line_begin;
- }
- void lexer::check_stack_capacity() {
- if (stack.size() == (size_t)top) {
- stack.resize(stack.size() * 2);
- }
- }
- int lexer::stack_pop() {
- return stack[--top];
- }
- int lexer::arg_or_cmdarg(int cmd_state) {
- if (cmd_state) {
- return lex_en_expr_cmdarg;
- } else {
- return lex_en_expr_arg;
- }
- }
- void lexer::emit_comment(const char* s, const char* e) {
- /* unused for now */
- (void)s;
- (void)e;
- }
- std::string lexer::tok() {
- return tok(ts);
- }
- std::string lexer::tok(const char* start) {
- return tok(start, te);
- }
- std::string lexer::tok(const char* start, const char* end) {
- assert(start <= end);
- return std::string(start, (size_t)(end - start));
- }
- char lexer::unescape(uint32_t codepoint) {
- switch (codepoint) {
- case 'a': return '\a';
- case 'b': return '\b';
- case 'e': return 0x1b;
- case 'f': return '\f';
- case 'n': return '\n';
- case 'r': return '\r';
- case 's': return ' ';
- case 't': return '\t';
- case 'v': return '\v';
- case '\\': return '\\';
- default: return '\0';
- }
- }
- static const lexer::token_table_entry PUNCTUATION[] = {
- { "=", token_type::tEQL },
- { "&", token_type::tAMPER2 },
- { "|", token_type::tPIPE },
- { "!", token_type::tBANG },
- { "^", token_type::tCARET },
- { "+", token_type::tPLUS },
- { "-", token_type::tMINUS },
- { "*", token_type::tSTAR2 },
- { "/", token_type::tDIVIDE },
- { "%", token_type::tPERCENT },
- { "~", token_type::tTILDE },
- { ",", token_type::tCOMMA },
- { ";", token_type::tSEMI },
- { ".", token_type::tDOT },
- { "..", token_type::tDOT2 },
- { "...", token_type::tDOT3 },
- { "[", token_type::tLBRACK2 },
- { "]", token_type::tRBRACK },
- { "(", token_type::tLPAREN2 },
- { ")", token_type::tRPAREN },
- { "?", token_type::tEH },
- { ":", token_type::tCOLON },
- { "&&", token_type::tANDOP },
- { "||", token_type::tOROP },
- { "-@", token_type::tUMINUS },
- { "+@", token_type::tUPLUS },
- { "~@", token_type::tTILDE },
- { "**", token_type::tPOW },
- { "->", token_type::tLAMBDA },
- { "=~", token_type::tMATCH },
- { "!~", token_type::tNMATCH },
- { "==", token_type::tEQ },
- { "!=", token_type::tNEQ },
- { ">", token_type::tGT },
- { ">>", token_type::tRSHFT },
- { ">=", token_type::tGEQ },
- { "<", token_type::tLT },
- { "<<", token_type::tLSHFT },
- { "<=", token_type::tLEQ },
- { "=>", token_type::tASSOC },
- { "::", token_type::tCOLON2 },
- { "===", token_type::tEQQ },
- { "<=>", token_type::tCMP },
- { "[]", token_type::tAREF },
- { "[]=", token_type::tASET },
- { "{", token_type::tLCURLY },
- { "}", token_type::tRCURLY },
- { "`", token_type::tBACK_REF2 },
- { "!@", token_type::tBANG },
- { "&.", token_type::tANDDOT },
- { NULL, token_type::error },
- };
- static const lexer::token_table_entry PUNCTUATION_BEGIN[] = {
- { "&", token_type::tAMPER },
- { "*", token_type::tSTAR },
- { "**", token_type::tDSTAR },
- { "+", token_type::tUPLUS },
- { "-", token_type::tUMINUS },
- { "::", token_type::tCOLON3 },
- { "(", token_type::tLPAREN },
- { "{", token_type::tLBRACE },
- { "[", token_type::tLBRACK },
- { NULL, token_type::error },
- };
- static const lexer::token_table_entry KEYWORDS[] = {
- { "if", token_type::kIF_MOD },
- { "unless", token_type::kUNLESS_MOD },
- { "while", token_type::kWHILE_MOD },
- { "until", token_type::kUNTIL_MOD },
- { "rescue", token_type::kRESCUE_MOD },
- { "defined?", token_type::kDEFINED },
- { "BEGIN", token_type::klBEGIN },
- { "END", token_type::klEND },
- { "class", token_type::kCLASS },
- { "module", token_type::kMODULE },
- { "def", token_type::kDEF },
- { "undef", token_type::kUNDEF },
- { "begin", token_type::kBEGIN },
- { "end", token_type::kEND },
- { "then", token_type::kTHEN },
- { "elsif", token_type::kELSIF },
- { "else", token_type::kELSE },
- { "ensure", token_type::kENSURE },
- { "case", token_type::kCASE },
- { "when", token_type::kWHEN },
- { "for", token_type::kFOR },
- { "break", token_type::kBREAK },
- { "next", token_type::kNEXT },
- { "redo", token_type::kREDO },
- { "retry", token_type::kRETRY },
- { "in", token_type::kIN },
- { "do", token_type::kDO },
- { "return", token_type::kRETURN },
- { "yield", token_type::kYIELD },
- { "super", token_type::kSUPER },
- { "self", token_type::kSELF },
- { "nil", token_type::kNIL },
- { "true", token_type::kTRUE },
- { "false", token_type::kFALSE },
- { "and", token_type::kAND },
- { "or", token_type::kOR },
- { "not", token_type::kNOT },
- { "alias", token_type::kALIAS },
- { "__FILE__", token_type::k__FILE__ },
- { "__LINE__", token_type::k__LINE__ },
- { "__ENCODING__", token_type::k__ENCODING__ },
- { NULL, token_type::error },
- };
- static const lexer::token_table_entry KEYWORDS_BEGIN[] = {
- { "if", token_type::kIF },
- { "unless", token_type::kUNLESS },
- { "while", token_type::kWHILE },
- { "until", token_type::kUNTIL },
- { "rescue", token_type::kRESCUE },
- { "defined?", token_type::kDEFINED },
- { "BEGIN", token_type::klBEGIN },
- { "END", token_type::klEND },
- { "class", token_type::kCLASS },
- { "module", token_type::kMODULE },
- { "def", token_type::kDEF },
- { "undef", token_type::kUNDEF },
- { "begin", token_type::kBEGIN },
- { "end", token_type::kEND },
- { "then", token_type::kTHEN },
- { "elsif", token_type::kELSIF },
- { "else", token_type::kELSE },
- { "ensure", token_type::kENSURE },
- { "case", token_type::kCASE },
- { "when", token_type::kWHEN },
- { "for", token_type::kFOR },
- { "break", token_type::kBREAK },
- { "next", token_type::kNEXT },
- { "redo", token_type::kREDO },
- { "retry", token_type::kRETRY },
- { "in", token_type::kIN },
- { "do", token_type::kDO },
- { "return", token_type::kRETURN },
- { "yield", token_type::kYIELD },
- { "super", token_type::kSUPER },
- { "self", token_type::kSELF },
- { "nil", token_type::kNIL },
- { "true", token_type::kTRUE },
- { "false", token_type::kFALSE },
- { "and", token_type::kAND },
- { "or", token_type::kOR },
- { "not", token_type::kNOT },
- { "alias", token_type::kALIAS },
- { "__FILE__", token_type::k__FILE__ },
- { "__LINE__", token_type::k__LINE__ },
- { "__ENCODING__", token_type::k__ENCODING__ },
- { NULL, token_type::error },
- };
/// Appends the UTF-8 encoding of code point `uc` to `dst`.
///
/// @return the number of bytes appended (1 to 4), or 0 when `uc` is negative
///         or above U+10FFFF, in which case `dst` is left untouched.
static size_t utf8_encode_char(int32_t uc, std::string &dst) {
    if (uc < 0x00) {
        return 0;
    } else if (uc < 0x80) {
        dst.push_back(static_cast<char>(uc));
        return 1;
    } else if (uc < 0x800) {
        dst.push_back(static_cast<char>(0xC0 | (uc >> 6)));
        dst.push_back(static_cast<char>(0x80 | (uc & 0x3F)));
        return 2;
    } else if (uc < 0x10000) {
        dst.push_back(static_cast<char>(0xE0 | (uc >> 12)));
        dst.push_back(static_cast<char>(0x80 | ((uc >> 6) & 0x3F)));
        dst.push_back(static_cast<char>(0x80 | (uc & 0x3F)));
        return 3;
    } else if (uc < 0x110000) {
        dst.push_back(static_cast<char>(0xF0 | (uc >> 18)));
        dst.push_back(static_cast<char>(0x80 | ((uc >> 12) & 0x3F)));
        dst.push_back(static_cast<char>(0x80 | ((uc >> 6) & 0x3F)));
        dst.push_back(static_cast<char>(0x80 | (uc & 0x3F)));
        return 4;
    }
    return 0;
}

/// Decodes a list of hexadecimal code points separated by spaces/tabs and
/// appends their UTF-8 encodings to `output`.
///
/// Leading and trailing blanks are ignored; an empty or all-blank `str`
/// succeeds without appending anything.
///
/// @return true when every token was encoded; false at the first token that
///         contains a non-hex character or denotes a value above U+10FFFF.
///         Code points encoded before the failing token stay in `output`.
///
/// This function never throws on malformed input. Tokens are parsed by hand
/// rather than with std::stoi, which throws on an empty token and on values
/// that do not fit in an int.
static bool split_codepoints(const std::string &str, std::string &output) {
    const auto is_blank = [](char c) { return c == ' ' || c == '\t'; };

    const char *ptr = str.c_str();
    while (true) {
        while (is_blank(*ptr)) {
            ++ptr;
        }
        if (*ptr == '\0') {
            // End of input, possibly after trailing blanks: nothing left.
            break;
        }

        int32_t codepoint = 0;
        for (; *ptr != '\0' && !is_blank(*ptr); ++ptr) {
            const char c = *ptr;
            int digit;
            if (c >= '0' && c <= '9') {
                digit = c - '0';
            } else if (c >= 'a' && c <= 'f') {
                digit = c - 'a' + 10;
            } else if (c >= 'A' && c <= 'F') {
                digit = c - 'A' + 10;
            } else {
                return false;
            }
            codepoint = codepoint * 16 + digit;
            // Bail out as soon as the value leaves the Unicode range; this
            // also keeps the accumulator from ever overflowing.
            if (codepoint > 0x10FFFF) {
                return false;
            }
        }

        if (utf8_encode_char(codepoint, output) == 0) {
            return false;
        }
    }
    return true;
}
/// Returns a copy of `str` with every non-overlapping occurrence of `search`
/// replaced by `replace`, scanning left to right.
///
/// An empty `search` matches nothing and yields `str` unchanged (it would
/// otherwise match at every position and never advance).
///
/// Parameters are taken by const reference so that both temporaries and named
/// strings can be passed.
static std::string gsub(const std::string& str, const std::string& search, const std::string& replace) {
    if (search.empty()) {
        return str;
    }

    std::string result;
    result.reserve(str.size());

    std::string::size_type from = 0;
    std::string::size_type index;
    while ((index = str.find(search, from)) != std::string::npos) {
        // Copy the untouched stretch before the match, then the replacement.
        result.append(str, from, index - from);
        result += replace;
        from = index + search.size();
    }
    // Tail after the last match (or the whole string when nothing matched).
    result.append(str, from, std::string::npos);
    return result;
}
/// True for the bytes the scanner's `c_eof` class treats as end of input:
/// NUL, ^D (0x04) and ^Z (0x1a).
static bool eof_codepoint(char c) {
    switch (c) {
        case 0x00:
        case 0x04:
        case 0x1a:
            return true;
        default:
            return false;
    }
}
- token_t lexer::advance_() {
- if (!token_queue.empty()) {
- token_t token = token_queue.front();
- token_queue.pop();
- return token;
- }
- int cmd_state = command_start;
- command_start = false;
- const char* p = _p;
- const char* pe = _pe;
- const char* eof = _pe;
- const char* tm = NULL;
- const char* heredoc_e = NULL;
- const char* new_herebody_s = NULL;
- const char* ident_ts = NULL;
- const char* ident_te = NULL;
- std::string ident_tok;
- %% write exec;
- _p = p;
- if (!token_queue.empty()) {
- token_t token = token_queue.front();
- token_queue.pop();
- return token;
- }
- if (cs == lex_error) {
- size_t start = (size_t)(p - source_buffer.data());
- return mempool.alloc(token_type::error, start, start + 1, std::string(p - 1, 1));
- }
- return mempool.alloc(token_type::eof, source_buffer.size(), source_buffer.size(), "");
- }
- void lexer::emit(token_type type) {
- emit(type, tok());
- }
- void lexer::emit(token_type type, const std::string& str) {
- emit(type, str, ts, te);
- }
- void lexer::emit(token_type type, const std::string& str, const char* start, const char* end) {
- size_t offset_start = (size_t)(start - source_buffer.data());
- size_t offset_end = (size_t)(end - source_buffer.data());
- token_queue.push(mempool.alloc(type, offset_start, offset_end, str));
- }
- void lexer::emit_do(bool do_block) {
- if (cond.active()) {
- emit(token_type::kDO_COND, "do");
- } else if (cmdarg.active() || do_block) {
- emit(token_type::kDO_BLOCK, "do");
- } else {
- emit(token_type::kDO, "do");
- }
- }
- void lexer::emit_table(const token_table_entry* table) {
- auto value = tok();
- for (; table->token; ++table) {
- if (value == table->token) {
- emit(table->type, value);
- return;
- }
- }
- // whitequark emits a `nil` token here, but if we do `yylex` hits an assert,
- // so just drop the token.
- return;
- }
- void lexer::emit_num(const std::string& num) {
- switch (num_xfrm) {
- case num_xfrm_type::NONE:
- emit(token_type::tINTEGER, num);
- break;
- case num_xfrm_type::RATIONAL:
- emit(token_type::tRATIONAL, num);
- break;
- case num_xfrm_type::IMAGINARY:
- emit(token_type::tIMAGINARY, num);
- break;
- case num_xfrm_type::RATIONAL_IMAGINARY:
- emit(token_type::tRATIONAL_IMAGINARY, num);
- break;
- case num_xfrm_type::FLOAT:
- emit(token_type::tFLOAT, num);
- break;
- case num_xfrm_type::FLOAT_IMAGINARY:
- emit(token_type::tFLOAT_IMAGINARY, num);
- break;
- }
- }
- std::string lexer::convert_base(const std::string& num, int num_base) {
- long int result;
- if (num_base == 10) {
- return num;
- }
- // This doesn't match Ruby's parsing but it is better than not handling it
- if (!absl::numbers_internal::safe_strtoi_base(num, &result, num_base)) {
- result = 0;
- // dmitry: appartently we assume that outer functions reported all the errors!!!
- }
- return std::to_string(result);
- }
- diagnostic::range lexer::range(const char *start, const char *end) {
- size_t token_start = (size_t)(start - source_buffer.data());
- size_t token_end = (size_t)(end - source_buffer.data());
- return diagnostic::range(token_start, token_end);
- }
- void lexer::diagnostic_(dlevel level, dclass type, const std::string &data) {
- diagnostics.emplace_back(level, type, range(ts, te), data);
- }
- void lexer::diagnostic_(dlevel level, dclass type, diagnostic::range &&range, const std::string &data) {
- diagnostics.emplace_back(level, type, range, data);
- }
- //
- // === LITERAL STACK ===
- //
- template<typename... Args>
- int lexer::push_literal(Args&&... args) {
- literal_stack.emplace(*this, std::forward<Args>(args)...);
- auto& literal = literal_stack.top();
- return next_state_for_literal(literal);
- }
- int lexer::next_state_for_literal(literal &lit) {
- if (lit.words() && lit.backslash_delimited()) {
- if (lit.interpolate()) {
- return lex_en_interp_backslash_delimited_words;
- } else {
- return lex_en_plain_backslash_delimited_words;
- }
- } else if (lit.words() && !lit.backslash_delimited()) {
- if (lit.interpolate()) {
- return lex_en_interp_words;
- } else {
- return lex_en_plain_words;
- }
- } else if (!lit.words() && lit.backslash_delimited()) {
- if (lit.interpolate()) {
- return lex_en_interp_backslash_delimited;
- } else {
- return lex_en_plain_backslash_delimited;
- }
- } else {
- if (lit.interpolate()) {
- return lex_en_interp_string;
- } else {
- return lex_en_plain_string;
- }
- }
- }
- literal& lexer::literal_() {
- return literal_stack.top();
- }
- int lexer::pop_literal() {
- bool was_regexp;
- {
- auto& old_literal = literal_stack.top();
- was_regexp = old_literal.regexp();
- dedentLevel_ = old_literal.dedentLevel();
- }
- literal_stack.pop();
- if (was_regexp) {
- return lex_en_regexp_modifiers;
- } else {
- return lex_en_expr_end;
- }
- }
- void lexer::set_state_expr_beg() {
- cs = lex_en_expr_beg;
- }
- void lexer::set_state_expr_end() {
- cs = lex_en_expr_end;
- }
- void lexer::set_state_expr_endarg() {
- cs = lex_en_expr_endarg;
- }
- void lexer::set_state_expr_fname() {
- cs = lex_en_expr_fname;
- }
- void lexer::set_state_expr_value() {
- cs = lex_en_expr_value;
- }
- %%{
- # access @;
- # getkey (@source_pts[p] || 0);
- # === CHARACTER CLASSES ===
- #
- # Pay close attention to the differences between c_any and any.
- # c_any does not include EOF and so will cause incorrect behavior
- # for machine subtraction (any-except rules) and default transitions
- # for scanners.
action do_nl {
  // Remember where the newline was seen, so that tNL tokens can be given a
  // precise location.
  //
  // The action is idempotent and never needs to be skipped, which is why it
  // is embedded directly into c_nl.
  newline_s = p;
}
- c_nl = '\n' $ do_nl;
- c_space = [ \t\r\f\v];
- c_space_nl = c_space | c_nl;
- c_eof = 0x04 | 0x1a | 0 | zlen; # ^D, ^Z, \0, EOF
- c_eol = c_nl | c_eof;
- c_any = any - c_eof;
- c_nl_zlen = c_nl | zlen;
- c_line = any - c_nl_zlen;
- c_unicode = c_any - 0x00..0x7f;
- c_upper = [A-Z];
- c_lower = [a-z_] | c_unicode;
- c_alpha = c_lower | c_upper;
- c_alnum = c_alpha | [0-9];
action do_eof {
  // Stay parked on EOF: every subsequent #advance call returns $eof again.
  // This makes it possible to feed the lexer more data later on, which is
  // only used in tests.
  //
  // Unlike e_heredoc_nl and e_bs below, this action is not wrapped into an
  // e_eof rule: tests observe the scanner state at EOF, and encapsulating it
  // in a rule would break that introspection.
  fhold; fbreak;
}
- #
- # === TOKEN DEFINITIONS ===
- #
- # All operators are punctuation. There is more to punctuation
- # than just operators. Operators can be overridden by user;
- # punctuation can not.
- # A list of operators which are valid in the function name context, but
- # have different semantics in others.
- operator_fname = '[]' | '[]=' | '`' | '-@' | '+@' | '~@' | '!@' ;
- # A list of operators which can occur within an assignment shortcut (+ → +=).
- operator_arithmetic = '&' | '|' | '&&' | '||' | '^' | '+' | '-' |
- '*' | '/' | '**' | '~' | '<<' | '>>' | '%' ;
- # A list of all user-definable operators not covered by groups above.
- operator_rest = '=~' | '!~' | '==' | '!=' | '!' | '===' |
- '<' | '<=' | '>' | '>=' | '<=>' | '=>' ;
- # Note that `{` and `}` need to be referred to as e_lbrace and e_rbrace,
- # as they are ambiguous with interpolation `#{}` and should be counted.
- # These braces are not present in punctuation lists.
- # A list of punctuation which has different meaning when used at the
- # beginning of expression.
- punctuation_begin = '-' | '+' | '::' | '(' | '[' |
- '*' | '**' | '&' ;
- # A list of all punctuation except punctuation_begin.
- punctuation_end = ',' | '=' | '->' | '(' | '[' | ']' |
- '::' | '?' | ':' | '.' | '..' | '...' ;
- # A list of keywords which have different meaning at the beginning of expression.
- keyword_modifier = 'if' | 'unless' | 'while' | 'until' | 'rescue' ;
- # A list of keywords which accept an argument-like expression, i.e. have the
- # same post-processing as method calls or commands. Example: `yield 1`,
- # `yield (1)`, `yield(1)`, are interpreted as if `yield` was a function.
- keyword_with_arg = 'yield' | 'super' | 'not' | 'defined?' ;
- # A list of keywords which accept a literal function name as an argument.
- keyword_with_fname = 'def' | 'undef' | 'alias' ;
- # A list of keywords which accept an expression after them.
- keyword_with_value = 'else' | 'case' | 'ensure' | 'module' | 'elsif' | 'then' |
- 'for' | 'in' | 'do' | 'when' | 'begin' | 'class' |
- 'and' | 'or' ;
- # A list of keywords which accept a value, and treat the keywords from
- # `keyword_modifier` list as modifiers.
- keyword_with_mid = 'rescue' | 'return' | 'break' | 'next' ;
- # A list of keywords which do not accept an expression after them.
- keyword_with_end = 'end' | 'self' | 'true' | 'false' | 'retry' |
- 'redo' | 'nil' | 'BEGIN' | 'END' | '__FILE__' |
- '__LINE__' | '__ENCODING__';
- # All keywords.
- keyword = keyword_with_value | keyword_with_mid |
- keyword_with_end | keyword_with_arg |
- keyword_with_fname | keyword_modifier ;
- # Identifier shapes. The c_* character classes are defined elsewhere in
- # this file; each pattern is a leading character followed by c_alnum*.
- constant = c_upper c_alnum*;
- bareword = c_alpha c_alnum*;
- call_or_var = c_lower c_alnum*;
- # Sigil-prefixed variables whose body is a bareword.
- class_var = '@@' bareword;
- instance_var = '@' bareword;
- # A dollar sign followed by a bareword, a run of digits, one of the listed
- # punctuation characters, or a dash and a single c_alnum character.
- global_var = '$'
- ( bareword | digit+
- | [`'+~*$&?!@/\\;,.=:<>"] # `
- | '-' c_alnum
- )
- ;
- # Ruby accepts (and fails on) variables with leading digit
- # in literal context, but not in unquoted symbol body.
- # The _v variants therefore allow any c_alnum character, digits included,
- # directly after the sigil.
- class_var_v = '@@' c_alnum+;
- instance_var_v = '@' c_alnum+;
- # A bareword with an optional trailing ? or !, followed by a colon.
- label = bareword [?!]? ':';
- #
- # === NUMERIC PARSING ===
- #
- # Digit runs separated by single underscores. As written, these three
- # patterns also match the empty string and a trailing underscore.
- # NOTE(review): such input is presumably rejected by the rules that use
- # these patterns -- confirm at the use sites.
- int_hex = ( xdigit+ '_' )* xdigit* '_'? ;
- int_dec = ( digit+ '_' )* digit* '_'? ;
- int_bin = ( [01]+ '_' )* [01]* '_'? ;
- # Integer part of a float: a lone zero, or digits without a leading zero.
- flo_int = [1-9] [0-9]* ( '_' digit+ )* | '0';
- # Fraction and exponent parts; both must end in a digit.
- flo_frac = '.' ( digit+ '_' )* digit+;
- flo_pow = [eE] [+\-]? ( digit+ '_' )* digit+;
- # The suffix patterns record the requested numeric transformation in
- # num_xfrm via a leaving action (%) attached to each alternative.
- int_suffix =
- '' % { num_xfrm = num_xfrm_type::NONE; }
- | 'r' % { num_xfrm = num_xfrm_type::RATIONAL; }
- | 'i' % { num_xfrm = num_xfrm_type::IMAGINARY; }
- | 'ri' % { num_xfrm = num_xfrm_type::RATIONAL_IMAGINARY; };
- # Suffix after an exponent: only the empty suffix and 'i' are listed.
- flo_pow_suffix =
- '' % { num_xfrm = num_xfrm_type::FLOAT; }
- | 'i' % { num_xfrm = num_xfrm_type::FLOAT_IMAGINARY; };
- # Suffix after a float: everything flo_pow_suffix allows, plus 'r' and 'ri'.
- flo_suffix =
- flo_pow_suffix
- | 'r' % { num_xfrm = num_xfrm_type::RATIONAL; }
- | 'ri' % { num_xfrm = num_xfrm_type::RATIONAL_IMAGINARY; };
- #
- # === ESCAPE SEQUENCE PARSING ===
- #
- # Escape parsing code is a Ragel pattern, not a scanner, and therefore
- # it shouldn't directly raise errors or perform other actions with side effects.
- # In reality this would probably just mess up error reporting in pathological
- # cases, though.
- # The amount of code required to parse \M\C stuff correctly is ridiculous.
- # A backslash immediately followed by a newline (c_nl).
- escaped_nl = "\\" c_nl;
- action unicode_points {
- // The codepoint list starts two characters after escape_s and stops one
- // character before p. On success the decoded text becomes the escape
- // value; otherwise the list is reported as containing a too-large codepoint.
- auto list_begin = escape_s + 2;
- auto list_text = tok(list_begin, p - 1);
- std::string decoded;
- if (!split_codepoints(list_text, decoded)) {
- diagnostic_(dlevel::ERROR, dclass::UnicodePointTooLarge,
- range(list_begin, list_begin + list_text.size()));
- } else {
- escape = std::make_unique<std::string>(decoded);
- }
- }
- action unescape_char {
- // Translate the character just before p with unescape(). A zero result
- // is treated as "no translation" and the character is kept as is.
- char translated = unescape(p[-1]);
- const char* source = translated ? &translated : p - 1;
- escape = std::make_unique<std::string>(source, 1);
- }
- action invalid_complex_escape {
- // Reports a malformed \C or \M style escape as a fatal InvalidEscape diagnostic.
- diagnostic_(dlevel::FATAL, dclass::InvalidEscape);
- }
- action slash_c_char {
- // TODO multibyte
- // Replace the escape value by its first byte with bits 5 and 6 cleared
- // (mask 0x9f).
- const char control_byte = static_cast<char>(escape->at(0) & 0x9f);
- escape = std::make_unique<std::string>(&control_byte, 1);
- }
- action slash_m_char {
- // TODO multibyte
- // Replace the escape value by its first byte with the top bit set
- // (mask 0x80).
- const char meta_byte = static_cast<char>(escape->at(0) | 0x80);
- escape = std::make_unique<std::string>(&meta_byte, 1);
- }
- # One character, optionally written as a backslash escape. Either way the
- # resulting single character is stored in `escape`.
- maybe_escaped_char = (
- '\\' c_any %unescape_char
- | ( c_any - [\\] ) % { escape = std::make_unique<std::string>(p - 1, 1); /* TODO multibyte */ }
- );
- # Like maybe_escaped_char, but the result is additionally passed through
- # slash_c_char. A question mark is special-cased and yields byte 0x7f.
- maybe_escaped_ctrl_char = ( # why?!
- '\\' c_any %unescape_char %slash_c_char
- | '?' % { escape = std::make_unique<std::string>("\x7f"); }
- | ( c_any - [\\?] ) % { escape = std::make_unique<std::string>(p - 1, 1); /* TODO multibyte */ } %slash_c_char
- );
- # The body of an escape sequence, meant to follow e_bs (which records
- # escape_s and resets `escape`). Well-formed alternatives store the decoded
- # text in `escape`; malformed ones emit a diagnostic instead.
- escape = (
- # \377
- [0-7]{1,3}
- % {
- auto esc = tok(escape_s, p);
- char c = std::stoi(esc, nullptr, 8);
- escape = std::make_unique<std::string>(&c, 1);
- }
- # \xff
- | 'x' xdigit{1,2}
- % {
- auto esc = tok(escape_s + 1, p);
- char c = std::stoi(esc, nullptr, 16);
- escape = std::make_unique<std::string>(&c, 1);
- }
- # \u263a
- | 'u' xdigit{4}
- % {
- // The return value of split_codepoints is not checked here.
- std::string result;
- split_codepoints(tok(escape_s + 1, p), result);
- escape = std::make_unique<std::string>(result);
- }
- # %q[\x]
- | 'x' ( c_any - xdigit )
- % {
- diagnostic_(dlevel::FATAL, dclass::InvalidHexEscape, range(escape_s - 1, p + 2));
- }
- # %q[\u123] %q[\u{12]
- | 'u' ( c_any{0,4} -
- xdigit{4} - # \u1234 is valid
- ( '{' xdigit{1,3} # \u{1 \u{12 \u{123 are valid
- | '{' xdigit [ \t}] any? # \u{1. \u{1} are valid
- | '{' xdigit{2} [ \t}] # \u{12. \u{12} are valid
- )
- )
- % {
- diagnostic_(dlevel::FATAL, dclass::InvalidUnicodeEscape, range(escape_s - 1, p));
- }
- # \u{123 456}
- | 'u{' ( xdigit{1,6} [ \t] )*
- ( xdigit{1,6} '}'
- %unicode_points
- | ( xdigit* ( c_any - xdigit - '}' )+ '}'
- | ( c_any - '}' )* c_eof
- | xdigit{7,}
- ) % {
- diagnostic_(dlevel::FATAL, dclass::UnterminatedUnicode, range(p - 1, p));
- }
- )
- # \C-\a \cx
- | ( 'C-' | 'c' ) escaped_nl?
- maybe_escaped_ctrl_char
- # \M-a
- | 'M-' escaped_nl?
- maybe_escaped_char
- %slash_m_char
- # \C-\M-f \M-\cf \c\M-f
- | ( ( 'C-' | 'c' ) escaped_nl? '\\M-'
- | 'M-\\' escaped_nl? ( 'C-' | 'c' ) ) escaped_nl?
- maybe_escaped_ctrl_char
- %slash_m_char
- # Malformed control and meta prefixes.
- | 'C' c_any %invalid_complex_escape
- | 'M' c_any %invalid_complex_escape
- | ( 'M-\\C' | 'C-\\M' ) c_any %invalid_complex_escape
- # Any other single character is handed to unescape_char.
- | ( c_any - [0-7xuCMc] ) %unescape_char
- # Input ended right after the backslash.
- | c_eof % {
- diagnostic_(dlevel::FATAL, dclass::EscapeEof, range(p - 1, p));
- }
- );
- # Use rules in form of `e_bs escape' when you need to parse a sequence.
- # On leaving the backslash, escape_s is set to the current position (the
- # first character after the backslash) and any previous escape value is
- # discarded.
- e_bs = '\\' % {
- escape_s = p;
- escape = nullptr;
- };
- #
- # === STRING AND HEREDOC PARSING ===
- #
- # Heredoc parsing is quite a complex topic. First, consider that heredocs
- # can be arbitrarily nested. For example:
- #
- # puts <<CODE
- # the result is: #{<<RESULT.inspect
- # i am a heredoc
- # RESULT
- # }
- # CODE
- #
- # which, incidentally, evaluates to:
- #
- # the result is: " i am a heredoc\n"
- #
- # To parse them, lexer refers to two kinds (remember, nested heredocs)
- # of positions in the input stream, namely heredoc_e
- # (HEREDOC declaration End) and @herebody_s (HEREdoc BODY line Start).
- #
- # heredoc_e is simply contained inside the corresponding Literal, and
- # when the heredoc is closed, the lexing is restarted from that position.
- #
- # @herebody_s is quite more complex. First, @herebody_s changes after each
- # heredoc line is lexed. This way, at '\n' tok(@herebody_s, @te) always
- # contains the current line, and also when a heredoc is started, @herebody_s
- # contains the position from which the heredoc will be lexed.
- #
- # Second, as (insanity) there are nested heredocs, we need to maintain a
- # stack of these positions. Each time #push_literal is called, it saves current
- # @herebody_s to literal.saved_herebody_s, and after an interpolation (possibly
- # containing other heredocs) is closed, the previous value is restored.
- # A newline that cooperates with heredoc handling: when a heredoc body is
- # pending (herebody_s is set), scanning jumps to herebody_s and the marker
- # is cleared.
- e_heredoc_nl = c_nl % {
- // After every heredoc was parsed, herebody_s contains the
- // position of next token after all heredocs.
- if (herebody_s) {
- p = herebody_s;
- herebody_s = nullptr;
- }
- };
- action extend_string {
- // Handles a token of ordinary literal content: it either closes the
- // current literal or is appended to it.
- auto str = tok();
- std::string lookahead;
- // tLABEL_END is only possible in non-cond context on >= 2.2
- if (version >= ruby_version::RUBY_22 && !cond.active()) {
- // Capture up to two characters following the token, clamped to eof.
- const char* lookahead_s = te;
- const char* lookahead_e = te + 2;
- if (lookahead_e > eof) {
- lookahead_e = eof;
- }
- lookahead = std::string(lookahead_s, (size_t)(lookahead_e - lookahead_s));
- }
- auto& current_literal = literal_();
- // Heredocs are never closed here; only other literals may end on this token.
- if (!current_literal.heredoc() && current_literal.nest_and_try_closing(str, ts, te, lookahead)) {
- if (token_queue.back()->type() == token_type::tLABEL_END) {
- // NOTE(review): presumably skips the colon that follows the closing
- // delimiter of a label -- confirm against nest_and_try_closing.
- p += 1;
- pop_literal();
- fnext expr_labelarg;
- } else {
- fnext *pop_literal();
- }
- fbreak;
- } else {
- current_literal.extend_string(str, ts, te);
- }
- }
- action extend_string_escaped {
- // Appends the escape sequence that was just scanned to the current
- // literal. What gets appended depends on whether the literal munges the
- // escaped character (munge_escape) and on whether it is a regexp.
- auto& current_literal = literal_();
- // TODO multibyte
- // escape_s points at the first character after the backslash.
- auto escaped_char = *escape_s;
- if (current_literal.munge_escape(escaped_char)) {
- // If this particular literal uses this character as an opening
- // or closing delimiter, it is an escape sequence for that
- // particular character. Write it without the backslash.
- if (current_literal.regexp()
- && (escaped_char == '\\' ||
- escaped_char == '$' ||
- escaped_char == '(' ||
- escaped_char == ')' ||
- escaped_char == '*' ||
- escaped_char == '+' ||
- escaped_char == '.' ||
- escaped_char == '<' ||
- escaped_char == '>' ||
- escaped_char == '?' ||
- escaped_char == '[' ||
- escaped_char == ']' ||
- escaped_char == '^' ||
- escaped_char == '{' ||
- escaped_char == '|' ||
- escaped_char == '}')) {
- // Regular expressions should include escaped delimiters in their
- // escaped form, except when the escaped character is
- // a closing delimiter but not a regexp metacharacter.
- //
- // The backslash itself cannot be used as a closing delimiter
- // at the same time as an escape symbol, but it is always munged,
- // so this branch also executes for the non-closing-delimiter case
- // for the backslash.
- auto str = tok();
- current_literal.extend_string(str, ts, te);
- } else {
- auto str = std::string(&escaped_char, 1);
- current_literal.extend_string(str, ts, te);
- }
- } else {
- // It does not. So this is an actual escape sequence, yay!
- if (current_literal.regexp()) {
- // Regular expressions should include escape sequences in their
- // escaped form. On the other hand, escaped newlines are removed.
- std::string str = gsub(tok(), "\\\n", "");
- current_literal.extend_string(str, ts, te);
- } else {
- // Use the decoded escape when one was produced, else the raw token text.
- auto str = escape ? *escape : tok();
- current_literal.extend_string(str, ts, te);
- }
- }
- }
- # Extend a string with a newline or a EOF character.
- # As heredoc closing line can immediately precede EOF, this action
- # has to handle such case specially.
- action extend_string_eol {
- // Handles a newline (or end of input) inside a literal. For heredocs the
- // line that just ended is tested against the closing delimiter; other
- // literals are tested against the newline token itself. If the literal
- // stays open, the newline is appended to it.
- auto& current_literal = literal_();
- if (te == pe) {
- // The input ended while the literal was still open: report an
- // unterminated string at the position where the literal started.
- diagnostic_(dlevel::FATAL, dclass::StringEof, range(current_literal.str_s, current_literal.str_s + 1));
- }
- if (current_literal.heredoc()) {
- auto line = tok(herebody_s, ts);
- // Drop any carriage returns at the end of the line.
- while (!line.empty() && line.back() == '\r') {
- line.pop_back();
- }
- if (version <= ruby_version::RUBY_20) {
- // See ruby:c48b4209c
- // NOTE(review): this cuts the line at the LAST remaining carriage
- // return; confirm that cutting at the first one is not intended.
- auto riter = line.rfind('\r');
- if (riter != std::string::npos) {
- line.erase(riter);
- }
- }
- // Try ending the heredoc with the complete most recently
- // scanned line. @herebody_s always refers to the start of such line.
- if (current_literal.nest_and_try_closing(line, herebody_s, ts)) {
- herebody_s = te;
- // Continue regular lexing after the heredoc reference (<<END).
- p = current_literal.heredoc_e - 1;
- fnext *pop_literal(); fbreak;
- } else {
- // Calculate indentation level for <<~HEREDOCs.
- current_literal.infer_indent_level(line);
- // Ditto.
- herebody_s = te;
- }
- } else {
- // Try ending the literal with a newline.
- auto str = tok();
- if (current_literal.nest_and_try_closing(str, ts, te)) {
- fnext *pop_literal(); fbreak;
- }
- if (herebody_s) {
- // This is a regular literal intertwined with a heredoc. Like:
- //
- // p <<-foo+"1
- // bar
- // foo
- // 2"
- //
- // which, incidentally, evaluates to "bar\n1\n2".
- p = herebody_s - 1;
- herebody_s = nullptr;
- }
- }
- if (current_literal.words() && !eof_codepoint(*p)) {
- current_literal.extend_space(ts, te);
- } else {
- // A literal newline is appended if the heredoc was _not_ closed
- // this time (see f break above). See also Literal#nest_and_try_closing
- // for rationale of calling #flush_string here.
- std::string str = tok();
- current_literal.extend_string(str, ts, te);
- current_literal.flush_string();
- }
- }
- action extend_string_space {
- // Forwards the whitespace token range to the current literal.
- literal_().extend_space(ts, te);
- }
- #
- # === INTERPOLATION PARSING ===
- #
- # Interpolations with immediate variable names simply call into
- # the corresponding machine.
- # A sharp sign directly followed by a global, class or instance variable.
- interp_var = '#' ( global_var | class_var_v | instance_var_v );
- action extend_interp_var {
- // Flush the text collected so far, emit tSTRING_DVAR for the sharp sign
- // alone, then rewind to the token start and call into expr_variable.
- auto& current_literal = literal_();
- current_literal.flush_string();
- current_literal.extend_content();
- emit(token_type::tSTRING_DVAR, "", ts, ts + 1);
- p = ts;
- fcall expr_variable;
- }
- # Interpolations with code blocks must match nested curly braces, as
- # interpolation ending is ambiguous with a block ending. So, every
- # opening and closing brace should be matched with e_[lr]brace rules,
- # which automatically perform the counting.
- #
- # Note that interpolations can themselves be nested, so brace balance
- # is tied to the innermost literal.
- #
- # Also note that literals themselves should not use e_[lr]brace rules
- # when matching their opening and closing delimiters, as the amount of
- # braces inside the characters of a string literal is independent.
- # Opening of a code interpolation.
- interp_code = '#{';
- # An opening brace. Pushes false onto both cond and cmdarg and, when a
- # literal is open, records the brace on the innermost literal.
- e_lbrace = '{' % {
- cond.push(false); cmdarg.push(false);
- if (!literal_stack.empty()) {
- literal_().start_interp_brace();
- }
- };
- e_rbrace = '}' % {
- // Outside of any literal a closing brace needs no bookkeeping here.
- if (!literal_stack.empty()) {
- auto& innermost = literal_();
- if (innermost.end_interp_brace_and_try_closing()) {
- // This brace ends an interpolation. Versions 1.8 and 1.9 get a
- // tRCURLY token for it, every other version gets tSTRING_DEND.
- const bool legacy_curly = version == ruby_version::RUBY_18
- || version == ruby_version::RUBY_19;
- const auto closing_type = legacy_curly ? token_type::tRCURLY
- : token_type::tSTRING_DEND;
- emit(closing_type, "}", p - 1, p);
- // Restore the heredoc body position stored on the literal, if any.
- if (innermost.saved_herebody_s) {
- herebody_s = innermost.saved_herebody_s;
- }
- fhold;
- fnext *next_state_for_literal(innermost);
- fbreak;
- }
- }
- };
- action extend_interp_code {
- // Starts a code interpolation: flushes the collected text, emits
- // tSTRING_DBEG and continues lexing in expr_value.
- auto& current_literal = literal_();
- current_literal.flush_string();
- current_literal.extend_content();
- emit(token_type::tSTRING_DBEG, "#{");
- if (current_literal.heredoc()) {
- // Park the heredoc body position on the literal while the
- // interpolated code is being lexed.
- current_literal.saved_herebody_s = herebody_s;
- herebody_s = nullptr;
- }
- current_literal.start_interp_brace();
- command_start = true;
- fnext expr_value;
- fbreak;
- }
- # Actual string parsers are simply combined from the primitives defined
- # above.
- # Scanner with interpolation, escape sequences and whitespace handling.
- interp_words := |*
- interp_code => extend_interp_code;
- interp_var => extend_interp_var;
- e_bs escape => extend_string_escaped;
- c_space+ => extend_string_space;
- c_eol => extend_string_eol;
- c_any => extend_string;
- *|;
- # Scanner with interpolation and escape sequences; whitespace is not
- # treated specially.
- interp_string := |*
- interp_code => extend_interp_code;
- interp_var => extend_interp_var;
- e_bs escape => extend_string_escaped;
- c_eol => extend_string_eol;
- c_any => extend_string;
- *|;
- # Scanner without interpolation: a backslash escapes exactly one following
- # character, and whitespace runs are handled separately.
- plain_words := |*
- e_bs c_any => extend_string_escaped;
- c_space+ => extend_string_space;
- c_eol => extend_string_eol;
- c_any => extend_string;
- *|;
- # Scanner without interpolation. A backslash followed by a newline is
- # handled by the end-of-line action rather than as an escape.
- plain_string := |*
- '\\' c_nl => extend_string_eol;
- e_bs c_any => extend_string_escaped;
- c_eol => extend_string_eol;
- c_any => extend_string;
- *|;
- # Scanner with interpolation but no escape rule: a backslash is ordinary
- # content here.
- interp_backslash_delimited := |*
- interp_code => extend_interp_code;
- interp_var => extend_interp_var;
- c_eol => extend_string_eol;
- c_any => extend_string;
- *|;
- # Scanner with neither interpolation nor an escape rule.
- plain_backslash_delimited := |*
- c_eol => extend_string_eol;
- c_any => extend_string;
- *|;
- # Like interp_backslash_delimited, plus separate handling of whitespace runs.
- interp_backslash_delimited_words := |*
- interp_code => extend_interp_code;
- interp_var => extend_interp_var;
- c_space+ => extend_string_space;
- c_eol => extend_string_eol;
- c_any => extend_string;
- *|;
- # Like plain_backslash_delimited, plus separate handling of whitespace runs.
- plain_backslash_delimited_words := |*
- c_space+ => extend_string_space;
- c_eol => extend_string_eol;
- c_any => extend_string;
- *|;
- regexp_modifiers := |*
- [A-Za-z]+
- => {
- // All letters are emitted as the option token. Letters outside the
- // accepted set are collected in order and reported in one diagnostic.
- auto options = tok();
- const std::string accepted_options("imxouesn");
- std::string unknown_options;
- for (const char opt : options) {
- if (accepted_options.find(opt) == std::string::npos) {
- unknown_options += opt;
- }
- }
- if (!unknown_options.empty()) {
- diagnostic_(dlevel::ERROR, dclass::RegexpOptions, unknown_options);
- }
- emit(token_type::tREGEXP_OPT, options);
- fnext expr_end;
- fbreak;
- };
- any
- => {
- // No option letters: the emitted token covers ts up to te - 1, and
- // the character just read is handed back for expr_end to process.
- emit(token_type::tREGEXP_OPT, tok(ts, te - 1), ts, te - 1);
- fhold;
- fgoto expr_end;
- };
- *|;
- #
- # === WHITESPACE HANDLING ===
- #
- # Various contexts in Ruby allow various kinds of whitespace
- # to be used. They are grouped to clarify the lexing machines
- # and ease collection of comments.
- # A line of code with inline #comment at end is always equivalent
- # to a line of code ending with just a newline, so an inline
- # comment is deemed equivalent to non-newline whitespace
- # (c_space character class).
- # Inline whitespace: a run of c_space, or a backslash followed by a
- # heredoc-aware newline.
- w_space =
- c_space+
- | '\\' e_heredoc_nl
- ;
- # A comment: a sharp sign followed by c_line*. sharp_s remembers where the
- # sharp sign is, and the comment is emitted when the run of c_line ends.
- w_comment =
- '#' %{ sharp_s = p - 1; }
- # The (p == pe) condition compensates for added "\0" and
- # the way Ragel handles EOF.
- c_line* %{ emit_comment(sharp_s, p == pe ? p - 2 : p); }
- ;
- # Either of the two above.
- w_space_comment =
- w_space
- | w_comment
- ;
- # A newline in non-literal context always interoperates with
- # here document logic and can always be escaped by a backslash,
- # still interoperating with here document logic in the same way,
- # yet being invisible to anything else.
- #
- # To demonstrate:
- #
- # foo = <<FOO \
- # bar
- # FOO
- # + 2
- #
- # is equivalent to `foo = "bar\n" + 2`.
- # A newline outside of literals is the heredoc-aware newline.
- w_newline =
- e_heredoc_nl;
- # Any whitespace: inline whitespace, a comment, or a newline.
- w_any =
- w_space
- | w_comment
- | w_newline
- ;
- #
- # === EXPRESSION PARSING ===
- #
- # These rules implement a form of manually defined lookahead.
- # The default longest-match scanning does not work here due
- # to sheer ambiguity.
- # Each alternative below sets tm, the position at which the preceding
- # token is considered to end: at p when the suffix belongs to the token,
- # or before the suffix when it starts a separate operator.
- ambiguous_fid_suffix = # actual parsed
- [?!] %{ tm = p; } | # a? a?
- [?!]'=' %{ tm = p - 2; } # a!=b a != b
- ;
- ambiguous_ident_suffix = # actual parsed
- ambiguous_fid_suffix |
- '=' %{ tm = p; } | # a= a=
- '==' %{ tm = p - 2; } | # a==b a == b
- '=~' %{ tm = p - 2; } | # a=~b a =~ b
- '=>' %{ tm = p - 2; } | # a=>b a => b
- '===' %{ tm = p - 3; } # a===b a === b
- ;
- # For '==>' only the first '=' stays with the symbol (tm = p - 2).
- ambiguous_symbol_suffix = # actual parsed
- ambiguous_ident_suffix |
- '==>' %{ tm = p - 2; } # :a==>b :a= => b
- ;
- # Ambiguous with 1.9 hash labels.
- ambiguous_const_suffix = # actual parsed
- '::' %{ tm = p - 2; } # A::B A :: B
- ;
- # Resolving kDO/kDO_COND/kDO_BLOCK ambiguity requires embedding
- # @cond/@cmdarg-related code to e_lbrack, e_lparen and e_lbrace.
- # An opening bracket starts a fresh cond/cmdarg context.
- e_lbrack = '[' % {
- cond.push(false);
- cmdarg.push(false);
- };
- # Parentheses are counted in paren_nest. Ruby 1.9 lambdas need the count
- # in order to emit the correct opening kDO/tLBRACE.
- e_lparen = '(' % {
- cond.push(false);
- cmdarg.push(false);
- ++paren_nest;
- };
- e_rparen = ')' % {
- --paren_nest;
- };
- # Ruby is context-sensitive wrt/ local identifiers.
- action local_ident {
- // The identifier token is emitted either way; only the follow-up state
- // depends on whether the name is already declared.
- auto name = tok();
- emit(token_type::tIDENTIFIER, name);
- if (!is_declared(name)) {
- fnext *arg_or_cmdarg(cmd_state);
- } else {
- fnext expr_endfn;
- }
- fbreak;
- }
- # Variable lexing code is accessed from both expressions and
- # string interpolation related code.
- #
- # Emits one variable token, then continues in the state returned by
- # stack_pop().
- expr_variable := |*
- global_var
- => {
- // A leading digit 1-9 gives a numbered reference; the four listed
- // punctuation names give a back reference; the rest are plain globals.
- if (ts[1] >= '1' && ts[1] <= '9') {
- emit(token_type::tNTH_REF, tok(ts + 1));
- } else if (ts[1] == '&' || ts[1] == '`' || ts[1] == '\'' || ts[1] == '+') {
- emit(token_type::tBACK_REF);
- } else {
- emit(token_type::tGVAR);
- }
- fnext *stack_pop(); fbreak;
- };
- class_var_v
- => {
- // A digit right after the sigil is reported, but the token is still emitted.
- if (ts[2] >= '0' && ts[2] <= '9') {
- diagnostic_(dlevel::ERROR, dclass::CvarName, tok(ts, te));
- }
- emit(token_type::tCVAR);
- fnext *stack_pop(); fbreak;
- };
- instance_var_v
- => {
- // A digit right after the sigil is reported, but the token is still emitted.
- if (ts[1] >= '0' && ts[1] <= '9') {
- diagnostic_(dlevel::ERROR, dclass::IvarName, tok(ts, te));
- }
- emit(token_type::tIVAR);
- fnext *stack_pop(); fbreak;
- };
- *|;
- # Literal function name in definition (e.g. `def class`).
- # Keywords are returned as their respective tokens; this is used
- # to support singleton def `def self.foo`. Global variables are
- # returned as `tGVAR`; this is used in global variable alias
- # statements `alias $a $b`. Symbols are returned verbatim; this
- # is used in `alias :a :"b#{foo}"` and `undef :a`.
- #
- # Transitions to `expr_endfn` afterwards.
- #
- expr_fname := |*
- keyword
- => { emit_table(KEYWORDS_BEGIN);
- fnext expr_endfn; fbreak; };
- constant
- => { emit(token_type::tCONSTANT);
- fnext expr_endfn; fbreak; };
- bareword [?=!]?
- => { emit(token_type::tIDENTIFIER);
- fnext expr_endfn; fbreak; };
- # Rewind and let expr_variable emit the token; expr_end is set as the
- # next state before the call.
- global_var
- => { p = ts - 1;
- fnext expr_end; fcall expr_variable; };
- # If the handling was to be delegated to expr_end,
- # these cases would transition to something else than
- # expr_endfn, which is incorrect.
- operator_fname |
- operator_arithmetic |
- operator_rest
- => { emit_table(PUNCTUATION);
- fnext expr_endfn; fbreak; };
- # Both colon forms are handed back unconsumed to another state.
- '::'
- => { fhold; fhold; fgoto expr_end; };
- ':'
- => { fhold; fgoto expr_beg; };
- # Only for version 2.3 is this treated as the start of a symbol literal
- # here; otherwise the text is rescanned in expr_end.
- '%s' c_any
- => {
- if (version == ruby_version::RUBY_23) {
- fgoto *push_literal(literal_type::LOWERS_SYMBOL, std::string(ts + 2, 1), ts);
- } else {
- p = ts - 1;
- fgoto expr_end;
- }
- };
- w_any;
- c_any
- => { fhold; fgoto expr_end; };
- c_eof => do_eof;
- *|;
- # After literal function name in definition. Behaves like `expr_end`,
- # but allows a tLABEL.
- #
- # Transitions to `expr_end` afterwards.
- #
- expr_endfn := |*
- # A label followed by one character that is not a colon. The token text
- # leaves out the label colon, and the extra character is held back.
- label ( any - ':' )
- => { emit(token_type::tLABEL, tok(ts, te - 2), ts, te - 1);
- fhold; fnext expr_labelarg; fbreak; };
- w_space_comment;
- c_any
- => { fhold; fgoto expr_end; };
- c_eof => do_eof;
- *|;
- # Literal function name in method call (e.g. `a.class`).
- #
- # Transitions to `expr_arg` afterwards.
- #
- # KEEP IN SYNC WITH expr_dot_after_newline!
- #
- expr_dot := |*
- constant
- => { emit(token_type::tCONSTANT);
- fnext *arg_or_cmdarg(cmd_state); fbreak; };
- call_or_var
- => { emit(token_type::tIDENTIFIER);
- fnext *arg_or_cmdarg(cmd_state); fbreak; };
- # The token ends at tm, as set by ambiguous_fid_suffix; scanning resumes there.
- bareword ambiguous_fid_suffix
- => { emit(token_type::tFID, tok(ts, tm), ts, tm);
- fnext *arg_or_cmdarg(cmd_state); p = tm - 1; fbreak; };
- # See the comment in `expr_fname`.
- operator_fname |
- operator_arithmetic |
- operator_rest
- => { emit_table(PUNCTUATION);
- fnext expr_arg; fbreak; };
- # This breaks compatibility with Ruby for better partial parses (useful
- # for LSP especially). See comment for expr_dot_after_newline below.
- w_newline
- => { fhold; fgoto expr_dot_after_newline; };
- w_any;
- c_any
- => { fhold; fgoto expr_end; };
- c_eof => do_eof;
- *|;
- # KEEP IN SYNC WITH expr_dot!
- #
- # This state breaks from valid Ruby syntax, but in a way that enables Sorbet
- # to recover better from parse errors. Recovering from parse errors is
- # important because it lets us service LSP queries faster.
- #
- # Specifically, this state makes it so that any keyword seen after w_newline
- # is emitted as a keyword (like kEND) instead of a tIDENTIFIER. Examples:
- #
- # # Valid Ruby, valid in Sorbet (no newline between '.' and 'end')
- # def foo
- # x.end
- # end
- #
- # # Parse error in Ruby and Sorbet, but Sorbet at least sees the method def
- # # with an empty body (Ruby wouldn't even see an empty method def)
- # def foo
- # x.
- # end
- #
- # # Valid Ruby, not valid in Sorbet (newline between '.' and 'end')
- # def foo
- # x.
- # end
- # end
- #
- expr_dot_after_newline := |*
- constant
- => { emit(token_type::tCONSTANT);
- fnext *arg_or_cmdarg(cmd_state); fbreak; };
- # This is different from expr_dot. Here, keywords are NOT identifiers.
- keyword
- => { emit_table(KEYWORDS);
- fnext expr_end; fbreak; };
- call_or_var
- => { emit(token_type::tIDENTIFIER);
- fnext *arg_or_cmdarg(cmd_state); fbreak; };
- # The token ends at tm, as set by ambiguous_fid_suffix; scanning resumes there.
- bareword ambiguous_fid_suffix
- => { emit(token_type::tFID, tok(ts, tm), ts, tm);
- fnext *arg_or_cmdarg(cmd_state); p = tm - 1; fbreak; };
- # See the comment in `expr_fname`.
- operator_fname |
- operator_arithmetic |
- operator_rest
- => { emit_table(PUNCTUATION);
- fnext expr_arg; fbreak; };
- # Unlike expr_dot, there is no separate w_newline rule here: newlines
- # are skipped together with the rest of w_any.
- w_any;
- c_any
- => { fhold; fgoto expr_end; };
- c_eof => do_eof;
- *|;
- # The previous token emitted was a `tIDENTIFIER` or `tFID`; no space
- # is consumed; the current expression is a command or method call.
- #
- expr_arg := |*
- #
- # COMMAND MODE SPECIFIC TOKENS
- #
- # cmd (1 + 2)
- # See below the rationale about expr_endarg.
- w_space+ e_lparen
- => {
- if (version == ruby_version::RUBY_18) {
- emit(token_type::tLPAREN2, "(", te - 1, te);
- fnext expr_value; fbreak;
- } else {
- emit(token_type::tLPAREN_ARG, "(", te - 1, te);
- fnext expr_beg; fbreak;
- }
- };
- # meth(1 + 2)
- # Regular method call.
- e_lparen
- => { emit(token_type::tLPAREN2, "(");
- fnext expr_beg; fbreak; };
- # meth [...]
- # Array argument. Compare with indexing `meth[...]`.
- w_space+ e_lbrack
- => { emit(token_type::tLBRACK, "[", te - 1, te);
- fnext expr_beg; fbreak; };
- # cmd {}
- # Command: method call without parentheses.
- w_space* e_lbrace
- => {
- // The brace opens a lambda body when the innermost pending lambda was
- // recorded at the current parenthesis depth.
- if (!lambda_stack.empty() && lambda_stack.top() == paren_nest) {
- lambda_stack.pop();
- emit(token_type::tLAMBEG, "{", te - 1, te);
- } else {
- emit(token_type::tLCURLY, "{", te - 1, te);
- }
- command_start = true;
- fnext expr_value; fbreak;
- };
- #
- # AMBIGUOUS TOKENS RESOLVED VIA EXPR_BEG
- #
- # a??
- # Ternary operator
- '?' c_space_nl
- => {
- // Unlike expr_beg as invoked in the next rule, do not warn
- p = ts - 1;
- fgoto expr_end;
- };
- # a ?b, a? ?
- # Character literal or ternary operator
- w_space* '?'
- => { fhold; fgoto expr_beg; };
- # a %{1}, a %[1] (but not "a %=1=" or "a % foo")
- # a /foo/ (but not "a / foo" or "a /=foo")
- # a <<HEREDOC
- w_space+ %{ tm = p; }
- ( [%/] ( c_any - c_space_nl - '=' ) # /
- | '<<'
- )
- => {
- // tm marks the first character after the leading whitespace.
- if (*tm == '/') {
- // Ambiguous regexp literal.
- diagnostic_(dlevel::WARNING, dclass::AmbiguousLiteral, range(tm, tm + 1));
- }
- p = tm - 1;
- fgoto expr_beg;
- };
- # x *1
- # Ambiguous splat, kwsplat or block-pass.
- w_space+ %{ tm = p; } ( '+' | '-' | '*' | '&' | '**' )
- => {
- diagnostic_(dlevel::WARNING, dclass::AmbiguousPrefix, range(tm, te), tok(tm, te));
- p = tm - 1;
- fgoto expr_beg;
- };
- # x ::Foo
- # Ambiguous toplevel constant access.
- w_space+ '::'
- => { fhold; fhold; fgoto expr_beg; };
- # x:b
- # Symbol.
- w_space* ':'
- => { fhold; fgoto expr_beg; };
- w_space+ label
- => { p = ts - 1; fgoto expr_beg; };
- #
- # AMBIGUOUS TOKENS RESOLVED VIA EXPR_END
- #
- # a ? b
- # Ternary operator.
- w_space+ %{ tm = p; } '?' c_space_nl
- => { p = tm - 1; fgoto expr_end; };
- # x + 1: Binary operator or operator-assignment.
- w_space* operator_arithmetic
- ( '=' | c_space_nl )? |
- # x rescue y: Modifier keyword.
- w_space* keyword_modifier |
- # a &. b: Safe navigation operator.
- w_space* '&.' |
- # Miscellanea.
- w_space* punctuation_end
- => {
- // Rescan the whole match, leading whitespace included, in expr_end.
- p = ts - 1;
- fgoto expr_end;
- };
- w_space;
- w_comment
- => { fgoto expr_end; };
- w_newline
- => { fhold; fgoto expr_end; };
- c_any
- => { fhold; fgoto expr_beg; };
- c_eof => do_eof;
- *|;
- # The previous token was an identifier which was seen while in the
- # command mode (that is, the state at the beginning of #advance was
- # expr_value). This state is very similar to expr_arg, but disambiguates
- # two very rare and specific conditions:
- # * In 1.8 mode, "foo (lambda do end)".
- # * In 1.9+ mode, "f x: -> do foo do end end".
- expr_cmdarg := |*
- # The same token is emitted for every version; only the next state differs.
- w_space+ e_lparen
- => {
- emit(token_type::tLPAREN_ARG, "(", te - 1, te);
- if (version == ruby_version::RUBY_18) {
- fnext expr_value; fbreak;
- } else {
- fnext expr_beg; fbreak;
- }
- };
- # The kind of `do' token depends on whether cond is active.
- w_space* 'do'
- => {
- if (cond.active()) {
- emit(token_type::kDO_COND, "do", te - 2, te);
- } else {
- emit(token_type::kDO, "do", te - 2, te);
- }
- fnext expr_value; fbreak;
- };
- # Everything else is rescanned from the token start in expr_arg.
- c_any |
- # Disambiguate with the `do' rule above.
- w_space* bareword |
- w_space* label
- => { p = ts - 1;
- fgoto expr_arg; };
- c_eof => do_eof;
- *|;
- # The rationale for this state is pretty complex. Normally, if an argument
- # is passed to a command and then there is a block (tLCURLY...tRCURLY),
- # the block is attached to the innermost argument (`f` in `m f {}`), or it
- # is a parse error (`m 1 {}`). But there is a special case for passing a single
- # primary expression grouped with parentheses: if you write `m (1) {}` or
- # (2.0 only) `m () {}`, then the block is attached to `m`.
- #
- # Thus, we recognize the opening `(` of a command (remember, a command is
- # a method call without parens) as a tLPAREN_ARG; then, in parser, we recognize
- # `tLPAREN_ARG expr rparen` as a `primary_expr` and before rparen, set the
- # lexer's state to `expr_endarg`, which makes it emit the possibly following
- # `{` as `tLBRACE_ARG`.
- #
- # The default post-`expr_endarg` state is `expr_end`, so this state also handles
- # `do` (as `kDO_BLOCK` in `expr_beg`).
- expr_endarg := |*
- e_lbrace
- => {
- // The brace opens a lambda body when the innermost pending lambda was
- // recorded at the current parenthesis depth; otherwise it is tLBRACE_ARG.
- if (!lambda_stack.empty() && lambda_stack.top() == paren_nest) {
- lambda_stack.pop();
- emit(token_type::tLAMBEG, "{");
- } else {
- emit(token_type::tLBRACE_ARG, "{");
- }
- command_start = true;
- fnext expr_value; fbreak;
- };
- 'do'
- => { emit_do(true);
- fnext expr_value; fbreak; };
- w_space_comment;
- # Anything else is handed back unconsumed to expr_end.
- c_any
- => { fhold; fgoto expr_end; };
- c_eof => do_eof;
- *|;
- # The rationale for this state is that several keywords accept value
- # (i.e. should transition to `expr_beg`), do not accept it like a command
- # (i.e. not an `expr_arg`), and must behave like a statement, that is,
- # accept a modifier if/while/etc.
- #
- expr_mid := |*
- # A modifier keyword is emitted here and followed by expr_beg.
- keyword_modifier
- => { emit_table(KEYWORDS);
- fnext expr_beg; fbreak; };
- # Any other bareword is rescanned from its start in expr_beg.
- bareword
- => { p = ts - 1; fgoto expr_beg; };
- w_space_comment;
- # A newline is handed back to expr_end.
- w_newline
- => { fhold; fgoto expr_end; };
- c_any
- => { fhold; fgoto expr_beg; };
- c_eof => do_eof;
- *|;
- # Beginning of an expression.
- #
- # Don't fallthrough to this state from `c_any`; make sure to handle
- # `c_space* c_nl` and let `expr_end` handle the newline.
- # Otherwise code like `f\ndef x` gets glued together and the parser
- # explodes.
- #
- expr_beg := |*
- # +5, -5, - 5
- [+\-] w_any* [0-9]
- => {
- emit(token_type::tUNARY_NUM, tok(ts, ts + 1), ts, ts + 1);
- fhold; fnext expr_end; fbreak;
- };
- # splat *a
- '*'
- => { emit(token_type::tSTAR, "*");
- fbreak; };
- #
- # STRING AND REGEXP LITERALS
- #
- # /regexp/oui
- # /=/ (disambiguation with /=)
- '/' c_any
- => {
- fhold; fgoto *push_literal(literal_type::SLASH_REGEXP, std::string(ts + 0, 1), ts);
- };
- # %<string>
- '%' ( any - [A-Za-z] )
- => {
- fgoto *push_literal(literal_type::PERCENT_STRING, std::string(ts + 1, 1), ts);
- };
- # %w(we are the people)
- '%' [A-Za-z]+ c_any
- => {
- literal_type type;
- bool single_char_type = (ts + 3 == te);
- if (single_char_type && ts[1] == 'q') {
- type = literal_type::LOWERQ_STRING;
- } else if (single_char_type && ts[1] == 'Q') {
- type = literal_type::UPPERQ_STRING;
- } else if (single_char_type && ts[1] == 'w') {
- type = literal_type::LOWERW_WORDS;
- } else if (single_char_type && ts[1] == 'W') {
- type = literal_type::UPPERW_WORDS;
- } else if (single_char_type && ts[1] == 'i') {
- type = literal_type::LOWERI_SYMBOLS;
- } else if (single_char_type && ts[1] == 'I') {
- type = literal_type::UPPERI_SYMBOLS;
- } else if (single_char_type && ts[1] == 's') {
- type = literal_type::LOWERS_SYMBOL;
- } else if (single_char_type && ts[1] == 'r') {
- type = literal_type::PERCENT_REGEXP;
- } else if (single_char_type && ts[1] == 'x') {
- type = literal_type::LOWERX_XSTRING;
- } else {
- type = literal_type::PERCENT_STRING;
- diagnostic_(dlevel::ERROR, dclass::UnexpectedPercentStr, range(ts, te - 1), tok(ts, te-1));
- }
- fgoto *push_literal(type, std::string(te - 1, 1), ts);
- };
- '%' c_eof
- => {
- diagnostic_(dlevel::FATAL, dclass::StringEof, range(ts, ts + 1));
- };
- # Heredoc start.
- # <<END | <<'END' | <<"END" | <<`END` |
- # <<-END | <<-'END' | <<-"END" | <<-`END` |
- # <<~END | <<~'END' | <<~"END" | <<~`END`
- '<<' [~\-]?
- ( '"' ( c_line - '"' )* '"'
- | "'" ( c_line - "'" )* "'"
- | "`" ( c_line - "`" )* "`"
- | bareword ) % { heredoc_e = p; }
- c_line* c_nl % { new_herebody_s = p; }
- => {
- bool indent;
- bool dedent_body;
- const char* delim_s = ts + 2;
- const char* delim_e = heredoc_e;
- if (*delim_s == '-') {
- indent = true;
- dedent_body = false;
- delim_s++;
- } else if (*delim_s == '~') {
- indent = true;
- dedent_body = true;
- delim_s++;
- } else {
- indent = false;
- dedent_body = false;
- }
- literal_type type;
- if (*delim_s == '"') {
- type = literal_type::DQUOTE_HEREDOC;
- delim_s++;
- delim_e--;
- } else if (*delim_s == '\'') {
- type = literal_type::SQUOTE_HEREDOC;
- delim_s++;
- delim_e--;
- } else if (*delim_s == '`') {
- type = literal_type::BACKTICK_HEREDOC;
- delim_s++;
- delim_e--;
- } else {
- type = literal_type::DQUOTE_HEREDOC;
- }
- if (dedent_body && (version == ruby_version::RUBY_18 ||
- version == ruby_version::RUBY_19 ||
- version == ruby_version::RUBY_20 ||
- version == ruby_version::RUBY_21 ||
- version == ruby_version::RUBY_22)) {
- emit(token_type::tLSHFT, "<<", ts, ts + 2);
- p = ts + 1;
- fnext expr_beg; fbreak;
- } else {
- fnext *push_literal(type, std::string(delim_s, (size_t)(delim_e - delim_s)), ts, heredoc_e, indent, dedent_body);
- if (!herebody_s) {
- herebody_s = new_herebody_s;
- }
- p = herebody_s - 1;
- }
- };
- #
- # SYMBOL LITERALS
- #
- # :&&, :||
- ':' ('&&' | '||') => {
- fhold; fhold;
- emit(token_type::tSYMBEG, tok(ts, ts + 1), ts, ts + 1);
- fgoto expr_fname;
- };
- # :"bar", :'baz'
- ':' ['"] # '
- => {
- literal_type type;
- if (ts[1] == '\'') {
- type = literal_type::SQUOTE_SYMBOL;
- } else { // '"'
- type = literal_type::DQUOTE_SYMBOL;
- }
- fgoto *push_literal(type, std::string(ts + 1, 1), ts);
- };
- # :!@ is :!
- # :~@ is :~
- ':' [!~] '@'
- => {
- emit(token_type::tSYMBEG, tok(ts + 1, ts + 2), ts, te);
- fnext expr_end; fbreak;
- };
- ':' bareword ambiguous_symbol_suffix
- => {
- emit(token_type::tSYMBOL, tok(ts + 1, tm), ts, tm);
- p = tm - 1;
- fnext expr_end; fbreak;
- };
- ':' ( bareword | global_var | class_var | instance_var |
- operator_fname | operator_arithmetic | operator_rest )
- => {
- emit(token_type::tSYMBOL, tok(ts + 1), ts, te);
- fnext expr_end; fbreak;
- };
- #
- # AMBIGUOUS TERNARY OPERATOR
- #
- # Character constant, like ?a, ?\n, ?\u1000, and so on
- # Don't accept \u escape with multiple codepoints, like \u{1 2 3}
- '?' ( e_bs ( escape - ( '\u{' (xdigit+ [ \t]+)+ xdigit+ '}' ))
- | (c_any - c_space_nl - e_bs) % { escape = nullptr; }
- )
- => {
- if (version == ruby_version::RUBY_18) {
- emit(token_type::tINTEGER, std::to_string(static_cast<unsigned char>(ts[1])));
- } else {
- emit(token_type::tCHARACTER, escape ? *escape : tok(ts + 1));
- }
- fnext expr_end; fbreak;
- };
- '?' c_space_nl
- => {
- static const struct escape_map_ent { char c; const char* s; } escape_map[] {
- { ' ', "\\s" },
- { '\r', "\\r" },
- { '\n', "\\n" },
- { '\t', "\\t" },
- { '\v', "\\v" },
- { '\f', "\\f" },
- { 0, 0 },
- };
- for (const struct escape_map_ent* ent = escape_map; ent->c; ++ent) {
- if (ts[1] == ent->c) {
- diagnostic_(dlevel::WARNING, dclass::InvalidEscapeUse, ent->s);
- break;
- }
- }
- p = ts - 1;
- fgoto expr_end;
- };
- '?' c_eof
- => {
- diagnostic_(dlevel::FATAL, dclass::IncompleteEscape, range(ts, ts + 1));
- };
- # f ?aa : b: Disambiguate with a character literal.
- '?' [A-Za-z_] bareword
- => {
- p = ts - 1;
- fgoto expr_end;
- };
- #
- # KEYWORDS AND PUNCTUATION
- #
- # a({b=>c})
- e_lbrace
- => {
- if (!lambda_stack.empty() && lambda_stack.top() == paren_nest) {
- lambda_stack.pop();
- command_start = true;
- emit(token_type::tLAMBEG, "{");
- } else {
- emit(token_type::tLBRACE, "{");
- }
- fbreak;
- };
- # a([1, 2])
- e_lbrack
- => { emit(token_type::tLBRACK, "[");
- fbreak; };
- # a()
- e_lparen
- => { emit(token_type::tLPAREN, "(");
- fbreak; };
- # a(+b)
- punctuation_begin
- => { emit_table(PUNCTUATION_BEGIN);
- fbreak; };
- # rescue Exception => e: Block rescue.
- # Special because it should transition to expr_mid.
- 'rescue' %{ tm = p; } '=>'?
- => { emit(token_type::kRESCUE, "rescue", ts, tm);
- p = tm - 1;
- fnext expr_mid; fbreak; };
- # if a: Statement if.
- keyword_modifier
- => { emit_table(KEYWORDS_BEGIN);
- command_start = true;
- fnext expr_value; fbreak; };
- #
- # RUBY 1.9 HASH LABELS
- #
- label ( any - ':' )
- => {
- fhold;
- if (version == ruby_version::RUBY_18) {
- auto ident = tok(ts, te - 2);
- if (*ts >= 'A' && *ts <= 'Z') {
- emit(token_type::tCONSTANT, ident, ts, te - 2);
- } else {
- emit(token_type::tIDENTIFIER, ident, ts, te - 2);
- }
- fhold; // continue as a symbol
- if (is_declared(ident)) {
- fnext expr_end;
- } else {
- fnext *arg_or_cmdarg(cmd_state);
- }
- } else {
- emit(token_type::tLABEL, tok(ts, te - 2), ts, te - 1);
- fnext expr_labelarg;
- }
- fbreak;
- };
- #
- # CONTEXT-DEPENDENT VARIABLE LOOKUP OR COMMAND INVOCATION
- #
- # foo= bar: Disambiguate with bareword rule below.
- bareword ambiguous_ident_suffix |
- # def foo: Disambiguate with bareword rule below.
- keyword
- => { p = ts - 1;
- fgoto expr_end; };
- # a = 42; a [42]: Indexing.
- # def a; end; a [42]: Array argument.
- call_or_var
- => local_ident;
- (call_or_var - keyword)
- % { ident_tok = tok(ts, te); ident_ts = ts; ident_te = te; }
- w_space+ '('
- => {
- emit(token_type::tIDENTIFIER, ident_tok, ident_ts, ident_te);
- p = ident_te - 1;
- fnext expr_cmdarg;
- fbreak;
- };
- #
- # WHITESPACE
- #
- w_any;
- e_heredoc_nl '=begin' ( c_space | c_nl_zlen )
- => { p = ts - 1;
- cs_before_block_comment = cs;
- fgoto line_begin; };
- #
- # DEFAULT TRANSITION
- #
- # The following rules match most binary and all unary operators.
- # Rules for binary operators provide better error reporting.
- operator_arithmetic '=' |
- operator_rest |
- punctuation_end |
- c_any
- => { p = ts - 1; fgoto expr_end; };
- c_eof => do_eof;
- *|;
- # expr_labelarg: selected right after a tLABEL is emitted (`fnext expr_labelarg`).
- # It gives newlines special handling, e.g. after the label in "def a b:".
- expr_labelarg := |*
- w_space_comment; # no action attached: the match is simply skipped
- w_newline
- => {
- if (in_kwarg) {
- fhold; fgoto expr_end; // put the newline back and let expr_end lex it
- } else {
- fgoto line_begin; // the newline is consumed; continue in line_begin
- }
- };
- c_any
- => { fhold; fgoto expr_beg; }; # anything else: put it back and lex it in expr_beg
- c_eof => do_eof;
- *|;
- # expr_value: like expr_beg, but no 1.9 label or 2.2 quoted label possible.
- # Only label-like text and string openers get their own rules here.
- expr_value := |*
- # a:b: a(:b), a::B, A::B
- label (any - ':')
- => { p = ts - 1; // rewind to just before the token start so expr_end re-scans the whole match
- fgoto expr_end; };
- # "bar", 'baz'
- ['"] # '
- => {
- literal_type type;
- if (ts[0] == '\'') {
- type = literal_type::SQUOTE_STRING;
- } else { // '"'
- type = literal_type::DQUOTE_STRING;
- }
- fgoto *push_literal(type, tok(), ts); // jump to whatever state push_literal returns
- };
- w_space_comment; # no action attached: the match is simply skipped
- w_newline
- => { fgoto line_begin; }; # the newline is consumed; continue in line_begin
- c_any
- => { fhold; fgoto expr_beg; }; # anything else: put it back and lex it in expr_beg
- c_eof => do_eof;
- *|;
- expr_end := |* # scanner for lambdas, keywords, numeric literals, constants, method calls and operators
- #
- # STABBY LAMBDA
- #
- '->'
- => {
- emit(token_type::tLAMBDA, "->", ts, ts + 2);
- lambda_stack.push(paren_nest); // remember the current paren_nest; it is compared again when '{' or 'do' is seen
- fnext expr_endfn; fbreak;
- };
- e_lbrace | 'do'
- => {
- if (!lambda_stack.empty() && lambda_stack.top() == paren_nest) { // same paren_nest as the pending '->': this opens the lambda body
- lambda_stack.pop();
- if (ts[0] == '{') {
- emit(token_type::tLAMBEG, "{");
- } else { // 'do'
- emit(token_type::kDO_LAMBDA, "do");
- }
- } else {
- if (ts[0] == '{') {
- emit(token_type::tLCURLY, "{");
- } else { // 'do'
- emit_do();
- }
- }
- command_start = true;
- fnext expr_value; fbreak;
- };
- #
- # KEYWORDS
- #
- keyword_with_fname
- => { emit_table(KEYWORDS);
- fnext expr_fname; fbreak; };
- 'class' w_any* '<<'
- => { emit(token_type::kCLASS, "class", ts, ts + 5);
- emit(token_type::tLSHFT, "<<", te - 2, te); // '<<' is the last two characters of the match
- fnext expr_value; fbreak; };
- # a if b:c: Syntax error.
- keyword_modifier
- => { emit_table(KEYWORDS);
- fnext expr_beg; fbreak; };
- # elsif b:c: elsif b(:c)
- keyword_with_value
- => { emit_table(KEYWORDS);
- command_start = true;
- fnext expr_value; fbreak; };
- keyword_with_mid
- => { emit_table(KEYWORDS);
- fnext expr_mid; fbreak; };
- keyword_with_arg
- => {
- emit_table(KEYWORDS);
- if (version == ruby_version::RUBY_18 && ts + 3 == te && ts[0] == 'n' && ts[1] == 'o' && ts[2] == 't') { // the match is exactly "not" on Ruby 1.8
- fnext expr_beg; fbreak;
- } else {
- fnext expr_arg; fbreak;
- }
- };
- '__ENCODING__'
- => {
- if (version == ruby_version::RUBY_18) { // 1.8: emitted as an ordinary tIDENTIFIER instead of a keyword token
- auto ident = tok();
- emit(token_type::tIDENTIFIER, ident);
- if (!is_declared(ident)) {
- fnext *arg_or_cmdarg(cmd_state);
- }
- } else {
- emit(token_type::k__ENCODING__, "__ENCODING__");
- }
- fbreak;
- };
- keyword_with_end
- => { emit_table(KEYWORDS);
- fbreak; };
- #
- # NUMERIC LITERALS
- #
- ( '0' [Xx] %{ num_base = 16; num_digits_s = p; } int_hex
- | '0' [Dd] %{ num_base = 10; num_digits_s = p; } int_dec
- | '0' [Oo] %{ num_base = 8; num_digits_s = p; } int_dec
- | '0' [Bb] %{ num_base = 2; num_digits_s = p; } int_bin
- | [1-9] digit* '_'? %{ num_base = 10; num_digits_s = ts; } int_dec
- | '0' digit* '_'? %{ num_base = 8; num_digits_s = ts; } int_dec
- ) %{ num_suffix_s = p; } int_suffix
- => {
- auto digits = tok(num_digits_s, num_suffix_s); // text from num_digits_s up to the start of the suffix
- if (num_suffix_s[-1] == '_') { // the character right before the suffix is '_'
- diagnostic_(dlevel::ERROR, dclass::TrailingInNumber, range(te - 1, te), "_");
- } else if (num_digits_s == num_suffix_s && num_base == 8 && version == ruby_version::RUBY_18) {
- // 1.8 did not raise an error on 0o.
- } else if (num_digits_s == num_suffix_s) { // a base prefix with no digits after it
- diagnostic_(dlevel::ERROR, dclass::EmptyNumeric);
- } else if (num_base == 8) {
- for (const char* digit_p = num_digits_s; digit_p < num_suffix_s; digit_p++) {
- if (*digit_p == '8' || *digit_p == '9') {
- diagnostic_(dlevel::ERROR, dclass::InvalidOctal,
- range(digit_p, digit_p + 1));
- }
- }
- }
- if (version == ruby_version::RUBY_18 || version == ruby_version::RUBY_19 || version == ruby_version::RUBY_20) { // 1.8-2.0: plain tINTEGER, suffix left unconsumed
- emit(token_type::tINTEGER, convert_base(digits, num_base), ts, num_suffix_s);
- p = num_suffix_s - 1; // resume scanning at the suffix
- } else {
- emit_num(convert_base(digits, num_base));
- }
- fbreak;
- };
- flo_frac flo_pow?
- => {
- diagnostic_(dlevel::ERROR, dclass::NoDotDigitLiteral);
- };
- flo_int [eE]
- => {
- if (version == ruby_version::RUBY_18 || version == ruby_version::RUBY_19 || version == ruby_version::RUBY_20) {
- diagnostic_(dlevel::ERROR, dclass::TrailingInNumber, range(te - 1, te), tok(te-1, te));
- } else {
- emit(token_type::tINTEGER, tok(ts, te - 1), ts, te - 1); // token excludes the trailing e/E
- fhold; fbreak; // give the trailing e/E back to the scanner
- }
- };
- flo_int flo_frac [eE]
- => {
- if (version == ruby_version::RUBY_18 || version == ruby_version::RUBY_19 || version == ruby_version::RUBY_20) {
- diagnostic_(dlevel::ERROR, dclass::TrailingInNumber, range(te - 1, te), tok(te - 1, te));
- } else {
- emit(token_type::tFLOAT, tok(ts, te - 1), ts, te - 1); // token excludes the trailing e/E
- fhold; fbreak; // give the trailing e/E back to the scanner
- }
- };
- flo_int
- ( flo_frac? flo_pow %{ num_suffix_s = p; } flo_pow_suffix
- | flo_frac %{ num_suffix_s = p; } flo_suffix
- )
- => {
- auto digits = tok(ts, num_suffix_s); // literal text without the suffix
- if (version == ruby_version::RUBY_18 || version == ruby_version::RUBY_19 || version == ruby_version::RUBY_20) { // 1.8-2.0: plain tFLOAT, suffix left unconsumed
- emit(token_type::tFLOAT, digits, ts, num_suffix_s);
- p = num_suffix_s - 1; // resume scanning at the suffix
- } else {
- emit_num(digits);
- }
- fbreak;
- };
- #
- # STRING AND XSTRING LITERALS
- #
- # `echo foo`, "bar", 'baz'
- '`' | ['"] # '
- => {
- literal_type type;
- if (ts[0] == '`') {
- type = literal_type::BACKTICK_XSTRING;
- } else if (ts[0] == '\'') {
- type = literal_type::SQUOTE_STRING;
- } else { // '"'
- type = literal_type::DQUOTE_STRING;
- }
- fgoto *push_literal(type, std::string(te - 1, 1), ts, nullptr, false, false, true); // second argument is the one-character quote just matched
- };
- #
- # CONSTANTS AND VARIABLES
- #
- constant
- => { emit(token_type::tCONSTANT);
- fnext *arg_or_cmdarg(cmd_state); fbreak; };
- constant ambiguous_const_suffix
- => { emit(token_type::tCONSTANT, tok(ts, tm), ts, tm);
- p = tm - 1; fbreak; }; # the token ends at tm; scanning resumes there
- global_var | class_var_v | instance_var_v
- => { p = ts - 1; fcall expr_variable; }; # rewind to the token start and lex it in expr_variable
- #
- # METHOD CALLS
- #
- '.' | '&.' | '::'
- => { emit_table(PUNCTUATION);
- fnext expr_dot; fbreak; };
- call_or_var
- => local_ident;
- bareword ambiguous_fid_suffix
- => {
- if (tm == te) {
- // Suffix was consumed, e.g. foo!
- emit(token_type::tFID);
- } else {
- // Suffix was not consumed, e.g. foo!=
- emit(token_type::tIDENTIFIER, tok(ts, tm), ts, tm);
- p = tm - 1; // resume scanning at tm
- }
- fnext expr_arg; fbreak;
- };
- #
- # OPERATORS
- #
- '*' | '=>'
- => {
- emit_table(PUNCTUATION);
- fgoto expr_value;
- };
- # When '|', '~', '!', '=>' are used as operators
- # they do not accept any symbols (or quoted labels) after.
- # Other binary operators accept it.
- ( operator_arithmetic | operator_rest ) - ( '|' | '~' | '!' | '*' )
- => {
- emit_table(PUNCTUATION);
- fnext expr_value; fbreak;
- };
- ( e_lparen | '|' | '~' | '!' )
- => { emit_table(PUNCTUATION);
- fnext expr_beg; fbreak; };
- e_rbrace | e_rparen | ']'
- => {
- emit_table(PUNCTUATION);
- cond.pop();
- cmdarg.pop();
- if (ts[0] == '}' || ts[0] == ']') {
- fnext expr_end;
- } else { // ')'
- // this was commented out in the original lexer.rl:
- // fnext expr_endfn; ?
- }
- fbreak;
- };
- operator_arithmetic '='
- => { emit(token_type::tOP_ASGN, tok(ts, te - 1)); // token text excludes the trailing '='
- fnext expr_beg; fbreak; };
- '?'
- => { emit(token_type::tEH, "?");
- fnext expr_value; fbreak; };
- e_lbrack
- => { emit(token_type::tLBRACK2, "[");
- fnext expr_beg; fbreak; };
- punctuation_end
- => { emit_table(PUNCTUATION);
- fnext expr_beg; fbreak; };
- #
- # WHITESPACE
- #
- w_space_comment;
- w_newline
- => { fgoto leading_dot; }; # leading_dot checks whether the next line starts with '.' or '&.'
- ';'
- => { emit(token_type::tSEMI, ";");
- command_start = true;
- fnext expr_value; fbreak; };
- '\\' c_line {
- diagnostic_(dlevel::ERROR, dclass::BareBackslash, range(ts, ts + 1)); // the reported range covers only the backslash
- fhold;
- };
- c_any
- => {
- diagnostic_(dlevel::ERROR, dclass::Unexpected, tok());
- };
- c_eof => do_eof;
- *|;
- leading_dot := |* # entered from expr_end after a newline; looks for a method-call dot opening the next line
- # Insane leading dots:
- # a #comment
- # .b: a.b
- c_space* %{ tm = p; } ('.' | '&.')
- => { p = tm - 1; fgoto expr_end; }; # tm marks the dot: resume there in expr_end (no tNL is emitted on this path)
- any
- => { emit(token_type::tNL, std::string(), newline_s, newline_s + 1); // newline_s is assigned outside this scanner
- fhold; fnext line_begin; fbreak; }; # put the character back; the next scan starts in line_begin
- *|;
- #
- # === EMBEDDED DOCUMENT (aka BLOCK COMMENT) PARSING ===
- #
- line_comment := |* # body of a =begin ... =end block; entered from line_begin once '=begin' is seen
- '=end' c_line* c_nl_zlen
- => {
- emit_comment(eq_begin_s, te); // the comment spans from eq_begin_s to the end of this match
- fgoto *cs_before_block_comment; // continue in the state stored in cs_before_block_comment
- };
- c_line* c_nl; # any other complete line: no action, it is skipped
- c_line* zlen
- => {
- diagnostic_(dlevel::FATAL, dclass::EmbeddedDocument, // presumably reached when input ends before '=end' - confirm against the zlen definition
- range(eq_begin_s, eq_begin_s + "=begin"s.size()));
- };
- *|;
- line_begin := |* # scanner the other states jump to after a newline
- w_any;
- '=begin' ( c_space | c_nl_zlen )
- => { eq_begin_s = ts; // remember where '=begin' starts; line_comment uses it for the comment and diagnostic ranges
- fgoto line_comment; };
- '__END__' ( c_eol - zlen )
- => { p = pe - 3; }; # jump close to the end of the buffer (pe); NOTE(review): the offset 3 depends on how the buffer is terminated - confirm
- c_any
- => { cmd_state = true; fhold; fgoto expr_value; }; # anything else: set cmd_state, put the character back and lex it in expr_value
- c_eof => do_eof;
- *|;
- }%%
- // Fetches the next token from advance_() and, before returning it, stores the
- // token's start() and end() in last_token_s / last_token_e.
- token_t lexer::advance() {
-   auto next_token = advance_();
-   last_token_s = next_token->start();
-   last_token_e = next_token->end();
-   return next_token;
- }
- // Opens a new scope that starts out empty: a default-constructed environment
- // is pushed on top of static_env.
- void lexer::extend_static() {
-   static_env.push(environment());
- }
- // Opens a new scope that inherits the names of the current one: the top
- // environment is copied and the copy is pushed. With no current scope, an
- // empty environment is pushed instead.
- void lexer::extend_dynamic() {
-   if (!static_env.empty()) {
-     static_env.push(static_env.top());
-     return;
-   }
-   static_env.emplace();
- }
- void lexer::unextend() { // Closes the innermost scope opened by extend_static()/extend_dynamic().
- static_env.pop(); // no emptiness check here: static_env must hold at least one environment
- }
- // Records `name` in the innermost scope (the top entry of static_env).
- // There is no emptiness check: static_env must hold at least one environment.
- void lexer::declare(const std::string& name) {
-   environment& current_scope = static_env.top();
-   current_scope.insert(name);
- }
- // Reports whether `identifier` is present in the innermost scope (the top
- // entry of static_env). Only that top environment is consulted.
- //
- // Calling top() on an empty std::stack is undefined behaviour, and
- // extend_dynamic() explicitly allows for static_env being empty, so an empty
- // stack is answered with "not declared" instead of being dereferenced.
- bool lexer::is_declared(const std::string& identifier) const {
-   if (static_env.empty()) {
-     return false;
-   }
-   const environment& scope = static_env.top();
-   return scope.find(identifier) != scope.end();
- }
- // Returns the value currently held in dedentLevel_ and clears the member.
- optional_size lexer::dedentLevel() {
-   // dedentLevel_ is reset to std::nullopt on every read as a precaution, so
-   // that a stale value cannot be picked up by accident later on.
-   auto previous = dedentLevel_;
-   dedentLevel_ = std::nullopt;
-   return previous;
- }
|