123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309 |
#include <ruby_parser/literal.hh>
#include <cassert>
#include <cstdlib>
- using namespace ruby_parser;
- literal::literal(lexer& lexer, literal_type type, std::string delimiter, const char* str_s, const char* heredoc_e, bool indent, bool dedent_body, bool label_allowed)
- : _lexer(lexer)
- , _nesting(1)
- , _type(type)
- , indent(indent)
- , dedent_body(dedent_body)
- , label_allowed(label_allowed)
- , _interp_braces(0)
- , space_emitted(true)
- , str_s(str_s)
- , saved_herebody_s(nullptr)
- , heredoc_e(heredoc_e)
- {
- if (delimiter == "(") {
- start_delim = "(";
- end_delim = ")";
- } else if (delimiter == "[") {
- start_delim = "[";
- end_delim = "]";
- } else if (delimiter == "{") {
- start_delim = "{";
- end_delim = "}";
- } else if (delimiter == "<") {
- start_delim = "<";
- end_delim = ">";
- } else {
- start_delim = "";
- end_delim = delimiter;
- }
- // Monolithic strings are glued into a single token, e.g.
- // tSTRING_BEG tSTRING_CONTENT tSTRING_END -> tSTRING.
- monolithic = (type == literal_type::SQUOTE_STRING || type == literal_type::DQUOTE_STRING);
- clear_buffer();
- if (!monolithic) {
- emit_start_token();
- }
- }
- bool literal::words() const {
- return _type == literal_type::UPPERW_WORDS
- || _type == literal_type::LOWERW_WORDS
- || _type == literal_type::UPPERI_SYMBOLS
- || _type == literal_type::LOWERI_SYMBOLS
- ;
- }
- bool literal::backslash_delimited() const {
- return end_delim == "\\";
- }
- bool literal::interpolate() const {
- return _type == literal_type::DQUOTE_STRING
- || _type == literal_type::DQUOTE_HEREDOC
- || _type == literal_type::PERCENT_STRING
- || _type == literal_type::UPPERQ_STRING
- || _type == literal_type::UPPERW_WORDS
- || _type == literal_type::UPPERI_SYMBOLS
- || _type == literal_type::DQUOTE_SYMBOL
- || _type == literal_type::SLASH_REGEXP
- || _type == literal_type::PERCENT_REGEXP
- || _type == literal_type::LOWERX_XSTRING
- || _type == literal_type::BACKTICK_XSTRING
- || _type == literal_type::BACKTICK_HEREDOC
- ;
- }
- bool literal::regexp() const {
- return _type == literal_type::SLASH_REGEXP
- || _type == literal_type::PERCENT_REGEXP
- ;
- }
- bool literal::heredoc() const {
- return heredoc_e != nullptr;
- }
- token_type literal::start_token_type() const {
- switch (_type) {
- case literal_type::SQUOTE_STRING:
- case literal_type::SQUOTE_HEREDOC:
- case literal_type::LOWERQ_STRING:
- case literal_type::DQUOTE_STRING:
- case literal_type::DQUOTE_HEREDOC:
- case literal_type::PERCENT_STRING:
- case literal_type::UPPERQ_STRING:
- return token_type::tSTRING_BEG;
- case literal_type::LOWERW_WORDS:
- return token_type::tQWORDS_BEG;
- case literal_type::UPPERW_WORDS:
- return token_type::tWORDS_BEG;
- case literal_type::LOWERI_SYMBOLS:
- return token_type::tQSYMBOLS_BEG;
- case literal_type::UPPERI_SYMBOLS:
- return token_type::tSYMBOLS_BEG;
- case literal_type::SQUOTE_SYMBOL:
- case literal_type::LOWERS_SYMBOL:
- case literal_type::DQUOTE_SYMBOL:
- return token_type::tSYMBEG;
- case literal_type::SLASH_REGEXP:
- case literal_type::PERCENT_REGEXP:
- return token_type::tREGEXP_BEG;
- case literal_type::LOWERX_XSTRING:
- case literal_type::BACKTICK_XSTRING:
- case literal_type::BACKTICK_HEREDOC:
- return token_type::tXSTRING_BEG;
- default:
- assert(false);
- }
- }
- optional_size literal::dedentLevel() const {
- return _dedentLevel;
- }
- bool literal::munge_escape(char c) const {
- if (words() && (c == ' ' || c == '\t' || c == '\v' || c == '\r' || c == '\f' || c == '\n')) {
- return true;
- } else if (c == '\\' || (start_delim.size() == 1 && start_delim.at(0) == c)
- || (end_delim.size() == 1 && end_delim.at(0) == c)) {
- return true;
- } else {
- return false;
- }
- }
- void literal::infer_indent_level(std::string& line) {
- if (!dedent_body) {
- return;
- }
- size_t indent_level = 0;
- for (auto it = line.cbegin(); it != line.cend(); ++it) {
- if (*it == ' ') {
- indent_level++;
- continue;
- }
- if (*it == '\t') {
- indent_level += (8 - indent_level % 8);
- continue;
- }
- if (!_dedentLevel || *_dedentLevel > indent_level) {
- _dedentLevel = indent_level;
- }
- break;
- }
- }
- void literal::start_interp_brace() {
- _interp_braces++;
- }
- bool literal::end_interp_brace_and_try_closing() {
- _interp_braces--;
- return _interp_braces == 0;
- }
// Mirrors MRI's rb_isspace from include/ruby/ruby.h: true for a space and
// for the control characters \t, \n, \v, \f and \r.
static bool rb_isspace(char c) {
  switch (c) {
  case ' ':
  case '\t':
  case '\n':
  case '\v':
  case '\f':
  case '\r':
    return true;
  default:
    return false;
  }
}
- static void lstrip(std::string& str) {
- size_t index = 0;
- while (index < str.size()) {
- if (rb_isspace(str.at(index))) {
- index++;
- } else {
- break;
- }
- }
- str.erase(0, index);
- }
- bool literal::is_delimiter(std::string& delimiter) const {
- if (indent) {
- std::string stripped_delimiter = delimiter;
- lstrip(stripped_delimiter);
- return end_delim == stripped_delimiter;
- } else {
- return end_delim == delimiter;
- }
- }
// Returns true when `lookahead` begins with a single ':' that is not
// immediately followed by another ':'. An empty lookahead yields false; a
// lookahead consisting of just ":" yields true.
static bool lookahead_quoted_label(std::string& lookahead) {
  if (lookahead.empty() || lookahead[0] != ':') {
    return false;
  }
  return lookahead.size() == 1 || lookahead[1] != ':';
}
- bool literal::nest_and_try_closing(std::string& delimiter, const char* ts, const char* te, std::string lookahead) {
- if (start_delim.size() > 0 && start_delim == delimiter) {
- _nesting++;
- } else if (is_delimiter(delimiter)) {
- _nesting--;
- }
- if (_nesting == 0) {
- if (words()) {
- extend_space(ts, ts);
- }
- if (label_allowed && lookahead_quoted_label(lookahead) && start_token_type() == token_type::tSTRING_BEG) {
- // This is a quoted label.
- flush_string();
- emit(token_type::tLABEL_END, end_delim, ts, te + 1);
- return true;
- } else if (monolithic) {
- // Emit the string as a single token.
- emit(token_type::tSTRING, buffer, str_s, te);
- return true;
- } else {
- // If this is a heredoc, @buffer contains the sentinel now.
- // Just throw it out. Lexer flushes the heredoc after each
- // non-heredoc-terminating \n anyway, so no data will be lost.
- if (!heredoc()) {
- flush_string();
- }
- emit(token_type::tSTRING_END, end_delim, ts, te);
- return true;
- }
- }
- return false;
- }
- void literal::extend_space(const char* ts, const char* te) {
- flush_string();
- if (!space_emitted) {
- std::string nothing;
- emit(token_type::tSPACE, nothing, ts, te);
- space_emitted = true;
- }
- }
- void literal::extend_string(std::string& str, const char* ts, const char* te) {
- if (!buffer_s) {
- buffer_s = ts;
- }
- buffer_e = te;
- buffer += str;
- }
- void literal::extend_content() {
- space_emitted = false;
- }
- void literal::flush_string() {
- if (monolithic) {
- emit_start_token();
- monolithic = false;
- }
- if (!buffer.empty()) {
- emit(token_type::tSTRING_CONTENT, buffer, buffer_s, buffer_e);
- clear_buffer();
- extend_content();
- }
- }
- void literal::clear_buffer() {
- buffer = "";
- buffer_s = nullptr;
- buffer_e = nullptr;
- }
- void literal::emit_start_token() {
- auto str_type_length = 1 /* TODO @str_type.length */;
- auto str_e = heredoc_e ? heredoc_e : str_s + str_type_length;
- std::string nothing;
- emit(start_token_type(), nothing, str_s, str_e);
- }
- void literal::emit(token_type tok, std::string& value, const char* s, const char* e) {
- _lexer.emit(tok, value, s, e);
- }
|