literal.cc 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309
#include <ruby_parser/literal.hh>
#include <cassert>
#include <cstdlib>
  3. using namespace ruby_parser;
  4. literal::literal(lexer& lexer, literal_type type, std::string delimiter, const char* str_s, const char* heredoc_e, bool indent, bool dedent_body, bool label_allowed)
  5. : _lexer(lexer)
  6. , _nesting(1)
  7. , _type(type)
  8. , indent(indent)
  9. , dedent_body(dedent_body)
  10. , label_allowed(label_allowed)
  11. , _interp_braces(0)
  12. , space_emitted(true)
  13. , str_s(str_s)
  14. , saved_herebody_s(nullptr)
  15. , heredoc_e(heredoc_e)
  16. {
  17. if (delimiter == "(") {
  18. start_delim = "(";
  19. end_delim = ")";
  20. } else if (delimiter == "[") {
  21. start_delim = "[";
  22. end_delim = "]";
  23. } else if (delimiter == "{") {
  24. start_delim = "{";
  25. end_delim = "}";
  26. } else if (delimiter == "<") {
  27. start_delim = "<";
  28. end_delim = ">";
  29. } else {
  30. start_delim = "";
  31. end_delim = delimiter;
  32. }
  33. // Monolithic strings are glued into a single token, e.g.
  34. // tSTRING_BEG tSTRING_CONTENT tSTRING_END -> tSTRING.
  35. monolithic = (type == literal_type::SQUOTE_STRING || type == literal_type::DQUOTE_STRING);
  36. clear_buffer();
  37. if (!monolithic) {
  38. emit_start_token();
  39. }
  40. }
  41. bool literal::words() const {
  42. return _type == literal_type::UPPERW_WORDS
  43. || _type == literal_type::LOWERW_WORDS
  44. || _type == literal_type::UPPERI_SYMBOLS
  45. || _type == literal_type::LOWERI_SYMBOLS
  46. ;
  47. }
  48. bool literal::backslash_delimited() const {
  49. return end_delim == "\\";
  50. }
  51. bool literal::interpolate() const {
  52. return _type == literal_type::DQUOTE_STRING
  53. || _type == literal_type::DQUOTE_HEREDOC
  54. || _type == literal_type::PERCENT_STRING
  55. || _type == literal_type::UPPERQ_STRING
  56. || _type == literal_type::UPPERW_WORDS
  57. || _type == literal_type::UPPERI_SYMBOLS
  58. || _type == literal_type::DQUOTE_SYMBOL
  59. || _type == literal_type::SLASH_REGEXP
  60. || _type == literal_type::PERCENT_REGEXP
  61. || _type == literal_type::LOWERX_XSTRING
  62. || _type == literal_type::BACKTICK_XSTRING
  63. || _type == literal_type::BACKTICK_HEREDOC
  64. ;
  65. }
  66. bool literal::regexp() const {
  67. return _type == literal_type::SLASH_REGEXP
  68. || _type == literal_type::PERCENT_REGEXP
  69. ;
  70. }
  71. bool literal::heredoc() const {
  72. return heredoc_e != nullptr;
  73. }
  74. token_type literal::start_token_type() const {
  75. switch (_type) {
  76. case literal_type::SQUOTE_STRING:
  77. case literal_type::SQUOTE_HEREDOC:
  78. case literal_type::LOWERQ_STRING:
  79. case literal_type::DQUOTE_STRING:
  80. case literal_type::DQUOTE_HEREDOC:
  81. case literal_type::PERCENT_STRING:
  82. case literal_type::UPPERQ_STRING:
  83. return token_type::tSTRING_BEG;
  84. case literal_type::LOWERW_WORDS:
  85. return token_type::tQWORDS_BEG;
  86. case literal_type::UPPERW_WORDS:
  87. return token_type::tWORDS_BEG;
  88. case literal_type::LOWERI_SYMBOLS:
  89. return token_type::tQSYMBOLS_BEG;
  90. case literal_type::UPPERI_SYMBOLS:
  91. return token_type::tSYMBOLS_BEG;
  92. case literal_type::SQUOTE_SYMBOL:
  93. case literal_type::LOWERS_SYMBOL:
  94. case literal_type::DQUOTE_SYMBOL:
  95. return token_type::tSYMBEG;
  96. case literal_type::SLASH_REGEXP:
  97. case literal_type::PERCENT_REGEXP:
  98. return token_type::tREGEXP_BEG;
  99. case literal_type::LOWERX_XSTRING:
  100. case literal_type::BACKTICK_XSTRING:
  101. case literal_type::BACKTICK_HEREDOC:
  102. return token_type::tXSTRING_BEG;
  103. default:
  104. assert(false);
  105. }
  106. }
  107. optional_size literal::dedentLevel() const {
  108. return _dedentLevel;
  109. }
  110. bool literal::munge_escape(char c) const {
  111. if (words() && (c == ' ' || c == '\t' || c == '\v' || c == '\r' || c == '\f' || c == '\n')) {
  112. return true;
  113. } else if (c == '\\' || (start_delim.size() == 1 && start_delim.at(0) == c)
  114. || (end_delim.size() == 1 && end_delim.at(0) == c)) {
  115. return true;
  116. } else {
  117. return false;
  118. }
  119. }
  120. void literal::infer_indent_level(std::string& line) {
  121. if (!dedent_body) {
  122. return;
  123. }
  124. size_t indent_level = 0;
  125. for (auto it = line.cbegin(); it != line.cend(); ++it) {
  126. if (*it == ' ') {
  127. indent_level++;
  128. continue;
  129. }
  130. if (*it == '\t') {
  131. indent_level += (8 - indent_level % 8);
  132. continue;
  133. }
  134. if (!_dedentLevel || *_dedentLevel > indent_level) {
  135. _dedentLevel = indent_level;
  136. }
  137. break;
  138. }
  139. }
  140. void literal::start_interp_brace() {
  141. _interp_braces++;
  142. }
  143. bool literal::end_interp_brace_and_try_closing() {
  144. _interp_braces--;
  145. return _interp_braces == 0;
  146. }
  147. // copied from MRI's include/ruby/ruby.h:
  148. static bool rb_isspace(char c) {
  149. return c == ' ' || ('\t' <= c && c <= '\r');
  150. }
  151. static void lstrip(std::string& str) {
  152. size_t index = 0;
  153. while (index < str.size()) {
  154. if (rb_isspace(str.at(index))) {
  155. index++;
  156. } else {
  157. break;
  158. }
  159. }
  160. str.erase(0, index);
  161. }
  162. bool literal::is_delimiter(std::string& delimiter) const {
  163. if (indent) {
  164. std::string stripped_delimiter = delimiter;
  165. lstrip(stripped_delimiter);
  166. return end_delim == stripped_delimiter;
  167. } else {
  168. return end_delim == delimiter;
  169. }
  170. }
  171. static bool lookahead_quoted_label(std::string& lookahead) {
  172. switch (lookahead.size()) {
  173. case 0:
  174. return false;
  175. case 1:
  176. return lookahead.at(0) == ':';
  177. default:
  178. return lookahead.at(0) == ':' && lookahead.at(1) != ':';
  179. }
  180. }
  181. bool literal::nest_and_try_closing(std::string& delimiter, const char* ts, const char* te, std::string lookahead) {
  182. if (start_delim.size() > 0 && start_delim == delimiter) {
  183. _nesting++;
  184. } else if (is_delimiter(delimiter)) {
  185. _nesting--;
  186. }
  187. if (_nesting == 0) {
  188. if (words()) {
  189. extend_space(ts, ts);
  190. }
  191. if (label_allowed && lookahead_quoted_label(lookahead) && start_token_type() == token_type::tSTRING_BEG) {
  192. // This is a quoted label.
  193. flush_string();
  194. emit(token_type::tLABEL_END, end_delim, ts, te + 1);
  195. return true;
  196. } else if (monolithic) {
  197. // Emit the string as a single token.
  198. emit(token_type::tSTRING, buffer, str_s, te);
  199. return true;
  200. } else {
  201. // If this is a heredoc, @buffer contains the sentinel now.
  202. // Just throw it out. Lexer flushes the heredoc after each
  203. // non-heredoc-terminating \n anyway, so no data will be lost.
  204. if (!heredoc()) {
  205. flush_string();
  206. }
  207. emit(token_type::tSTRING_END, end_delim, ts, te);
  208. return true;
  209. }
  210. }
  211. return false;
  212. }
  213. void literal::extend_space(const char* ts, const char* te) {
  214. flush_string();
  215. if (!space_emitted) {
  216. std::string nothing;
  217. emit(token_type::tSPACE, nothing, ts, te);
  218. space_emitted = true;
  219. }
  220. }
  221. void literal::extend_string(std::string& str, const char* ts, const char* te) {
  222. if (!buffer_s) {
  223. buffer_s = ts;
  224. }
  225. buffer_e = te;
  226. buffer += str;
  227. }
  228. void literal::extend_content() {
  229. space_emitted = false;
  230. }
  231. void literal::flush_string() {
  232. if (monolithic) {
  233. emit_start_token();
  234. monolithic = false;
  235. }
  236. if (!buffer.empty()) {
  237. emit(token_type::tSTRING_CONTENT, buffer, buffer_s, buffer_e);
  238. clear_buffer();
  239. extend_content();
  240. }
  241. }
  242. void literal::clear_buffer() {
  243. buffer = "";
  244. buffer_s = nullptr;
  245. buffer_e = nullptr;
  246. }
  247. void literal::emit_start_token() {
  248. auto str_type_length = 1 /* TODO @str_type.length */;
  249. auto str_e = heredoc_e ? heredoc_e : str_s + str_type_length;
  250. std::string nothing;
  251. emit(start_token_type(), nothing, str_s, str_e);
  252. }
  253. void literal::emit(token_type tok, std::string& value, const char* s, const char* e) {
  254. _lexer.emit(tok, value, s, e);
  255. }