Line data Source code
1 : // Copyright (C) 2020-2026 Free Software Foundation, Inc.
2 :
3 : // This file is part of GCC.
4 :
5 : // GCC is free software; you can redistribute it and/or modify it under
6 : // the terms of the GNU General Public License as published by the Free
7 : // Software Foundation; either version 3, or (at your option) any later
8 : // version.
9 :
10 : // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11 : // WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 : // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 : // for more details.
14 :
15 : // You should have received a copy of the GNU General Public License
16 : // along with GCC; see the file COPYING3. If not see
17 : // <http://www.gnu.org/licenses/>.
18 :
19 : #include "rust-codepoint.h"
20 : #include "rust-system.h"
21 : #include "rust-lex.h"
22 : #include "rust-diagnostics.h"
23 : #include "rust-linemap.h"
24 : #include "rust-edition.h"
25 : #include "safe-ctype.h"
26 : #include "cpplib.h"
27 : #include "rust-keyword-values.h"
28 :
29 : namespace Rust {
30 : // TODO: move to separate compilation unit?
31 : // overload += for uint32_t to allow 32-bit encoded utf-8 to be added
32 : std::string &
33 3176200 : operator+= (std::string &str, Codepoint char32)
34 : {
35 3176200 : if (char32.value < 0x80)
36 : {
37 3175187 : str += static_cast<char> (char32.value);
38 : }
39 1013 : else if (char32.value < (0x1F + 1) << (1 * 6))
40 : {
41 674 : str += static_cast<char> (0xC0 | ((char32.value >> 6) & 0x1F));
42 674 : str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
43 : }
44 339 : else if (char32.value < (0x0F + 1) << (2 * 6))
45 : {
46 329 : str += static_cast<char> (0xE0 | ((char32.value >> 12) & 0x0F));
47 329 : str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F));
48 329 : str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
49 : }
50 10 : else if (char32.value < (0x07 + 1) << (3 * 6))
51 : {
52 6 : str += static_cast<char> (0xF0 | ((char32.value >> 18) & 0x07));
53 6 : str += static_cast<char> (0x80 | ((char32.value >> 12) & 0x3F));
54 6 : str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F));
55 6 : str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
56 : }
57 : else
58 : {
59 4 : rust_debug ("Invalid unicode codepoint found: '%u' ", char32.value);
60 : }
61 3176200 : return str;
62 : }
63 :
64 : std::string
65 2894707 : Codepoint::as_string ()
66 : {
67 2894707 : std::string str;
68 :
69 : // str += Codepoint (value);
70 2894707 : str += *this;
71 :
72 2894707 : return str;
73 : }
74 :
/* Returns true if NUMBER may appear inside a float literal: an ASCII decimal
   digit or the exponent marker 'e'/'E'.  '_' and '.' are deliberately
   excluded because they need lookahead to be handled.

   The digit test is an explicit range check rather than ISDIGIT: the
   safe-ctype macros index their table with only the low 8 bits of the
   argument, so a codepoint such as U+2030 would be reported as '0'.  */
bool
is_float_digit (uint32_t number)
{
  const bool is_ascii_digit = number >= '0' && number <= '9';
  return is_ascii_digit || number == 'E' || number == 'e';
}
82 :
/* Returns true if NUMBER is an ASCII hexadecimal digit (0-9, a-f, A-F).

   Written as explicit range checks instead of ISXDIGIT: the safe-ctype macros
   only look at the low 8 bits of their argument, so a wide codepoint such as
   U+0141 would otherwise be classified like 'A'.  */
bool
is_x_digit (uint32_t number)
{
  return (number >= '0' && number <= '9') || (number >= 'a' && number <= 'f')
	 || (number >= 'A' && number <= 'F');
}
90 :
/* Returns true if NUMBER is one of the octal digits '0' through '7'.  */
bool
is_octal_digit (uint32_t number)
{
  if (number < '0')
    return false;

  return number <= '7';
}
96 :
/* Returns true if NUMBER is a binary digit, i.e. '0' or '1'.  */
bool
is_bin_digit (uint32_t number)
{
  switch (number)
    {
    case '0':
    case '1':
      return true;
    default:
      return false;
    }
}
102 :
/* Returns true if CHARACTER may follow the '.' that ends a float literal such
   as `1.`.  A following '.', '_' or ASCII letter returns false.

   The letter test is an explicit ASCII range check rather than ISALPHA: the
   safe-ctype macros only look at the low 8 bits of their argument, so an
   unrelated codepoint such as U+2041 would otherwise be classified like 'A'.
   NOTE(review): non-ASCII identifier-start characters are therefore accepted
   here; confirm whether they should be rejected via the XID_Start check.  */
bool
check_valid_float_dot_end (uint32_t character)
{
  const bool is_ascii_alpha = (character >= 'a' && character <= 'z')
			      || (character >= 'A' && character <= 'Z');

  return character != '.' && character != '_' && !is_ascii_alpha;
}
108 :
/* Returns true if CHARACTER is one of the whitespace characters listed at
   https://doc.rust-lang.org/reference/whitespace.html  */
bool
is_whitespace (uint32_t character)
{
  // '\t', '\n', '\v', '\f' and '\r' form the contiguous range U+0009..U+000D.
  if (character >= 0x0009 && character <= 0x000d)
    return true;

  return character == ' '	  // space
	 || character == 0x0085	  // next line
	 || character == 0x200e	  // left-to-right mark
	 || character == 0x200f	  // right-to-left mark
	 || character == 0x2028	  // line separator
	 || character == 0x2029; // paragraph separator
}
131 :
/* Returns true if CHARACTER is the letter that follows the leading '0' of a
   non-decimal integer literal: 'x' (hex), 'o' (octal) or 'b' (binary).  */
bool
is_non_decimal_int_literal_separator (uint32_t character)
{
  switch (character)
    {
    case 'x':
    case 'o':
    case 'b':
      return true;
    default:
      return false;
    }
}
137 :
138 : bool
139 312001 : is_identifier_start (uint32_t codepoint)
140 : {
141 312001 : return (cpp_check_xid_property (codepoint) & CPP_XID_START)
142 312001 : || codepoint == '_';
143 : }
144 :
145 : bool
146 1147267 : is_identifier_continue (uint32_t codepoint)
147 : {
148 1147267 : return cpp_check_xid_property (codepoint) & CPP_XID_CONTINUE;
149 : }
150 :
// Builds a lexer that reads from an in-memory string instead of a file.
// The RAIIFile member is set to RAIIFile::create_error (), no lex dump stream
// is attached, and line/column tracking starts at 1:1. Unlike the file-based
// constructor, start_file is not called on the linemap here.
151 103 : Lexer::Lexer (const std::string &input, Linemap *linemap)
152 103 : : input (RAIIFile::create_error ()), current_line (1), current_column (1),
153 103 : line_map (linemap), dump_lex_out ({}),
// Inside this initializer the name `input` refers to the constructor's string
// parameter, which shadows the member initialised on the first line above.
154 103 : raw_input_source (new BufferInputSource (input, 0)),
155 103 : input_queue{*raw_input_source}, token_queue (TokenSource (this))
156 103 : {}
157 :
// Builds a lexer that reads from FILE_INPUT, which is moved into the `input`
// member. DUMP_LEX_OPT, when it holds a stream, makes skip_token dump every
// token it skips. Line/column tracking starts at 1:1.
158 4690 : Lexer::Lexer (const char *filename, RAIIFile file_input, Linemap *linemap,
159 4690 : tl::optional<std::ofstream &> dump_lex_opt)
160 4690 : : input (std::move (file_input)), current_line (1), current_column (1),
161 4690 : line_map (linemap), dump_lex_out (dump_lex_opt),
// Here `input` is the member (there is no parameter of that name), so this
// presumably relies on `input` being declared before `raw_input_source` in
// the class - TODO confirm against the header.
162 9380 : raw_input_source (new FileInputSource (input.get_raw ())),
163 9380 : input_queue{*raw_input_source}, token_queue (TokenSource (this))
164 : {
165 : // inform line_table that file is being entered and is in line 1
166 4690 : if (linemap)
167 4690 : line_map->start_file (filename, current_line);
168 4690 : }
169 :
// Destructor. The body is intentionally empty: the only statement it ever
// contained, the call to line_map->stop (), is commented out below.
170 4791 : Lexer::~Lexer ()
171 : {
172 : /* ok apparently stop (which is equivalent of original code in destructor) is
173 : * meant to be called after all files have finished parsing, for cleanup. On
174 : * the other hand, actual code that it calls to leave a certain line map is
175 : * mentioned in GCC docs as being useful for "just leaving an included header"
176 : * and stuff like that, so this line mapping functionality may need fixing.
177 : * FIXME: find out whether this occurs. */
178 :
179 : // line_map->stop();
180 4791 : }
181 :
182 : bool
183 4635 : Lexer::input_source_is_valid_utf8 ()
184 : {
185 4635 : return raw_input_source->is_valid ();
186 : }
187 :
188 : location_t
189 1839756 : Lexer::get_current_location ()
190 : {
191 1839756 : if (line_map)
192 1839587 : return linemap_position_for_column (line_table, current_column);
193 : else
194 : // If we have no linemap, we're lexing something without proper locations
195 : return UNDEF_LOCATION;
196 : }
197 :
198 : Codepoint
199 4137209 : Lexer::peek_input (int n)
200 : {
201 4137209 : return input_queue.peek (n);
202 : }
203 :
204 : Codepoint
205 4039740 : Lexer::peek_input ()
206 : {
207 4039740 : return peek_input (0);
208 : }
209 :
210 : void
211 3478657 : Lexer::skip_input (int n)
212 : {
213 3478657 : input_queue.skip (n);
214 3478657 : }
215 :
216 : void
217 3468373 : Lexer::skip_input ()
218 : {
219 3468373 : skip_input (0);
220 3468373 : }
221 :
222 : void
223 725633 : Lexer::skip_token (int n)
224 : {
225 : // dump tokens if dump-lex option is enabled
226 725633 : if (dump_lex_out.has_value ())
227 0 : dump_and_skip (n);
228 : else
229 725633 : token_queue.skip (n);
230 725633 : }
231 :
232 : void
233 0 : Lexer::dump_and_skip (int n)
234 : {
235 0 : std::ofstream &out = dump_lex_out.value ();
236 0 : bool found_eof = false;
237 0 : const_TokenPtr tok;
238 0 : for (int i = 0; i < n + 1; i++)
239 : {
240 0 : if (!found_eof)
241 : {
242 0 : tok = peek_token ();
243 0 : found_eof |= tok->get_id () == Rust::END_OF_FILE;
244 :
245 0 : location_t loc = tok->get_locus ();
246 :
247 0 : out << "<id=";
248 0 : out << tok->token_id_to_str ();
249 0 : out << (tok->should_have_str ()
250 0 : ? (std::string (", text=") + tok->get_str ()
251 0 : + std::string (", typehint=")
252 0 : + std::string (tok->get_type_hint_str ()))
253 0 : : "")
254 0 : << " ";
255 0 : out << Linemap::location_to_string (loc) << '\n';
256 : }
257 :
258 0 : token_queue.skip (0);
259 : }
260 0 : }
261 :
262 : void
263 0 : Lexer::replace_current_token (TokenPtr replacement)
264 : {
265 0 : token_queue.replace_current_value (replacement);
266 :
267 0 : rust_debug ("called 'replace_current_token' - this is deprecated");
268 0 : }
269 :
270 : /* Determines whether the string passed in is a keyword or not. If it is, it
271 : * returns the keyword name. */
272 : TokenId
273 281070 : Lexer::classify_keyword (const std::string &str)
274 : {
275 281070 : auto &keywords = Rust::Values::Keywords::keywords_tokens;
276 281070 : auto keyword = keywords.find (str);
277 :
278 281070 : if (keyword == keywords.end ())
279 : return IDENTIFIER;
280 :
281 94047 : auto id = keyword->second;
282 :
283 : // We now have the expected token ID of the reserved keyword. However, some
284 : // keywords are reserved starting in certain editions. For example, `try` is
285 : // only a reserved keyword in editions >=2018. The language might gain new
286 : // reserved keywords in the future.
287 : //
288 : // https://doc.rust-lang.org/reference/keywords.html#reserved-keywords
289 :
290 : // `try` is not a reserved keyword before 2018
291 94047 : if (get_rust_edition () == Edition::E2015 && id == TRY)
292 : return IDENTIFIER;
293 :
294 : return id;
295 : }
296 :
// Lexes and returns the next token from the input queue.
//
// Each iteration of the outer loop records the current location, consumes one
// codepoint into current_char and then:
//  - on the very first character of the input, checks for a `#!` shebang line
//    and drops that line if it is one;
//  - returns an END_OF_FILE token when the input is exhausted;
//  - updates current_line/current_column for whitespace and ordinary (non-doc)
//    comments and restarts the loop without producing a token;
//  - matches punctuation and doc comments in the switch, using up to a few
//    codepoints of lookahead;
//  - otherwise dispatches to the parse_* helpers for byte/raw literals,
//    identifiers and keywords, numbers, strings, and chars/lifetimes.
// A character that matches none of the above is reported with rust_error_at
// and the loop moves on to the next codepoint.
297 : TokenPtr
298 733690 : Lexer::build_token ()
299 : {
300 : // loop to go through multiple characters to build a single token
301 1827146 : while (true)
302 : {
303 1827146 : location_t loc = get_current_location ();
304 :
305 1827146 : current_char = peek_input ();
306 1827146 : skip_input ();
307 :
308 : // detect shebang
309 : // Must be the first thing on the first line, starting with #!
310 : // But since an attribute can also start with an #! we don't count it as a
311 : // shebang line when after any whitespace or comments there is a [. If it
312 : // is a shebang line we simple drop the line. Otherwise we don't consume
313 : // any characters and fall through to the real tokenizer.
314 31175 : if (current_line == 1 && current_column == 1 && current_char == '#'
315 1858321 : && peek_input () == '!')
316 : {
// n is a lookahead offset: the scan below only peeks, and input is consumed
// solely in the "definitely shebang" branch.
317 : int n = 1;
318 3112 : while (true)
319 : {
320 3112 : Codepoint next_char = peek_input (n);
321 3112 : if (is_whitespace (next_char.value))
322 7 : n++;
323 3105 : else if ((next_char == '/' && peek_input (n + 1) == '/'
324 7 : && peek_input (n + 2) != '!'
325 7 : && peek_input (n + 2) != '/')
326 3126 : || (next_char == '/' && peek_input (n + 1) == '/'
327 0 : && peek_input (n + 2) == '/'
328 0 : && peek_input (n + 3) == '/'))
329 : {
330 : // two // or four ////
331 : // A single line comment
332 : // (but not an inner or outer doc comment)
333 7 : n += 2;
334 7 : next_char = peek_input (n);
335 119 : while (next_char != '\n' && !next_char.is_eof ())
336 : {
337 112 : n++;
338 112 : next_char = peek_input (n);
339 : }
340 7 : if (next_char == '\n')
341 7 : n++;
342 : }
343 3098 : else if (next_char == '/' && peek_input (n + 1) == '*'
344 0 : && peek_input (n + 2) == '*'
345 3098 : && peek_input (n + 3) == '/')
346 : {
347 : /**/
348 0 : n += 4;
349 : }
350 3098 : else if (next_char == '/' && peek_input (n + 1) == '*'
351 0 : && peek_input (n + 2) == '*' && peek_input (n + 3) == '*'
352 3098 : && peek_input (n + 4) == '/')
353 : {
354 : /***/
355 0 : n += 5;
356 : }
357 3098 : else if ((next_char == '/' && peek_input (n + 1) == '*'
358 0 : && peek_input (n + 2) != '*'
359 0 : && peek_input (n + 2) != '!')
360 3119 : || (next_char == '/' && peek_input (n + 1) == '*'
361 0 : && peek_input (n + 2) == '*'
362 0 : && peek_input (n + 3) == '*'))
363 : {
364 : // one /* or three /***
365 : // Start of a block comment
366 : // (but not an inner or outer doc comment)
367 0 : n += 2;
368 0 : int level = 1;
369 0 : while (level > 0)
370 : {
371 0 : if (peek_input (n).is_eof ())
372 : break;
373 0 : else if (peek_input (n) == '/'
374 0 : && peek_input (n + 1) == '*')
375 : {
376 0 : n += 2;
377 0 : level += 1;
378 : }
379 0 : else if (peek_input (n) == '*'
380 0 : && peek_input (n + 1) == '/')
381 : {
382 0 : n += 2;
383 0 : level -= 1;
384 : }
385 : else
386 0 : n++;
387 : }
388 : }
389 3098 : else if (next_char != '[')
390 : {
391 : // definitely shebang, ignore the first line
392 518 : while (current_char != '\n' && !current_char.is_eof ())
393 : {
394 490 : current_char = peek_input ();
395 490 : skip_input ();
396 : }
397 :
398 : // newline
399 28 : current_line++;
400 28 : current_column = 1;
401 : // tell line_table that new line starts
402 28 : start_line (current_line, max_column_hint);
403 28 : break;
404 : }
405 : else
406 : break; /* Definitely not a shebang line. */
407 : }
408 : }
409 :
410 : // return end of file token if end of file
411 1827146 : if (current_char.is_eof ())
412 5089 : return Token::make (END_OF_FILE, loc);
413 :
414 : // if not end of file, start tokenising
415 1822057 : switch (current_char.value)
416 : {
417 : /* ignore whitespace characters for tokens but continue updating
418 : * location */
419 160571 : case '\n': // newline
420 160571 : case 0x0085: // next line
421 160571 : case 0x2028: // line separator
422 160571 : case 0x2029: // paragraph separator
423 160571 : current_line++;
424 160571 : current_column = 1;
425 : // tell line_table that new line starts
426 160571 : start_line (current_line, max_column_hint);
427 160571 : continue;
428 252 : case '\r': // cr
429 : // Ignore, we expect a newline (lf) soon.
430 252 : continue;
431 922883 : case ' ': // space
432 922883 : current_column++;
433 922883 : continue;
434 113 : case '\t': // horizontal tab
435 : // width of a tab is not well-defined, assume 8 spaces
436 113 : current_column += 8;
437 113 : continue;
438 28 : case '\v': // vertical tab
439 28 : case 0x000c: // form feed
440 28 : case 0x200e: // left-to-right mark
441 28 : case 0x200f: // right-to-left mark
442 : // Ignored.
443 28 : continue;
444 :
445 : // punctuation - actual tokens
446 27915 : case '=':
447 27915 : if (peek_input () == '>')
448 : {
449 : // match arm arrow
450 3270 : skip_input ();
451 3270 : current_column += 2;
// NOTE(review): loc was taken before the first character was consumed; this
// increment presumably moves it to the token's last character. The same
// pattern repeats for every multi-character operator below - confirm that is
// what consumers of the locus expect.
452 3270 : loc += 1;
453 :
454 3270 : return Token::make (MATCH_ARROW, loc);
455 : }
456 24645 : else if (peek_input () == '=')
457 : {
458 : // equality operator
459 667 : skip_input ();
460 667 : current_column += 2;
461 667 : loc += 1;
462 :
463 667 : return Token::make (EQUAL_EQUAL, loc);
464 : }
465 : else
466 : {
467 : // assignment operator
468 23978 : current_column++;
469 23978 : return Token::make (EQUAL, loc);
470 : }
471 45137 : case '(':
472 45137 : current_column++;
473 45137 : return Token::make (LEFT_PAREN, loc);
474 11551 : case '-':
475 11551 : if (peek_input () == '>')
476 : {
477 : // return type specifier
478 10220 : skip_input ();
479 10220 : current_column += 2;
480 10220 : loc += 1;
481 :
482 10220 : return Token::make (RETURN_TYPE, loc);
483 : }
484 1331 : else if (peek_input () == '=')
485 : {
486 : // minus-assign
487 105 : skip_input ();
488 105 : current_column += 2;
489 105 : loc += 1;
490 :
491 105 : return Token::make (MINUS_EQ, loc);
492 : }
493 : else
494 : {
495 : // minus
496 1226 : current_column++;
497 1226 : return Token::make (MINUS, loc);
498 : }
499 1743 : case '+':
500 1743 : if (peek_input () == '=')
501 : {
502 : // add-assign
503 152 : skip_input ();
504 152 : current_column += 2;
505 152 : loc += 1;
506 :
507 152 : return Token::make (PLUS_EQ, loc);
508 : }
509 : else
510 : {
511 : // add
512 1591 : current_column++;
513 1591 : return Token::make (PLUS, loc);
514 : }
515 45118 : case ')':
516 45118 : current_column++;
517 45118 : return Token::make (RIGHT_PAREN, loc);
518 29713 : case ';':
519 29713 : current_column++;
520 29713 : return Token::make (SEMICOLON, loc);
521 10832 : case '*':
522 10832 : if (peek_input () == '=')
523 : {
524 : // multiplication-assign
525 7 : skip_input ();
526 7 : current_column += 2;
527 7 : loc += 1;
528 :
529 7 : return Token::make (ASTERISK_EQ, loc);
530 : }
531 : else
532 : {
533 : // multiplication
534 10825 : current_column++;
535 10825 : return Token::make (ASTERISK, loc);
536 : }
537 22688 : case ',':
538 22688 : current_column++;
539 22688 : return Token::make (COMMA, loc);
540 17752 : case '/':
541 17752 : if (peek_input () == '=')
542 : {
543 : // division-assign
544 7 : skip_input ();
545 7 : current_column += 2;
546 7 : loc += 1;
547 :
548 7 : return Token::make (DIV_EQ, loc);
549 : }
550 17745 : else if ((peek_input () == '/' && peek_input (1) != '!'
551 16638 : && peek_input (1) != '/')
552 25745 : || (peek_input () == '/' && peek_input (1) == '/'
553 7900 : && peek_input (2) == '/'))
554 : {
555 : // two // or four ////
556 : // single line comment
557 : // (but not an inner or outer doc comment)
558 8753 : skip_input ();
559 8753 : current_column += 2;
560 8753 : current_char = peek_input ();
561 :
562 : // basically ignore until line finishes
563 429376 : while (current_char != '\n' && !current_char.is_eof ())
564 : {
565 411870 : skip_input ();
566 411870 : current_column++; // not used
567 411870 : current_char = peek_input ();
568 : }
569 8753 : continue;
570 : }
571 8992 : else if (peek_input () == '/'
572 8992 : && (peek_input (1) == '!' || peek_input (1) == '/'))
573 : {
574 : /* single line doc comment, inner or outer. */
575 7985 : bool is_inner = peek_input (1) == '!';
576 7985 : skip_input (1);
577 7985 : current_column += 3;
578 :
579 7985 : std::string str;
580 7985 : str.reserve (32);
581 7985 : current_char = peek_input ();
// Collect the comment text up to, but not including, the line terminator; a
// "\r\n" pair counts as the terminator, a lone '\r' is reported and dropped.
582 192537 : while (current_char != '\n')
583 : {
584 176616 : skip_input ();
585 176616 : if (current_char == '\r')
586 : {
587 51 : Codepoint next_char = peek_input ();
588 51 : if (next_char == '\n')
589 : {
590 49 : current_char = '\n';
591 49 : break;
592 : }
593 2 : rust_error_at (
594 : loc, "Isolated CR %<\\r%> not allowed in doc comment");
595 2 : current_char = next_char;
596 2 : continue;
597 2 : }
598 176565 : if (current_char.is_eof ())
599 : {
600 0 : rust_error_at (
601 : loc, ErrorCode::E0758,
602 : "unexpected EOF while looking for end of comment");
603 0 : break;
604 : }
605 176565 : str += current_char;
606 176565 : current_char = peek_input ();
607 : }
// Consume the terminating newline. NOTE(review): this skip and the line bump
// below also run when the loop above was left because of EOF.
608 7985 : skip_input ();
609 7985 : current_line++;
610 7985 : current_column = 1;
611 : // tell line_table that new line starts
612 7985 : start_line (current_line, max_column_hint);
613 :
614 7985 : str.shrink_to_fit ();
615 :
// NOTE(review): when str is empty (e.g. a bare `///`), str.size () - 1 wraps
// around as an unsigned value - confirm the resulting locus is intended.
616 7985 : loc += str.size () - 1;
617 7985 : if (is_inner)
618 100 : return Token::make_inner_doc_comment (loc, std::move (str));
619 : else
620 7885 : return Token::make_outer_doc_comment (loc, std::move (str));
621 7985 : }
622 1007 : else if (peek_input () == '*' && peek_input (1) == '*'
623 1092 : && peek_input (2) == '/')
624 : {
625 : /**/
626 14 : skip_input (2);
627 14 : current_column += 4;
628 14 : continue;
629 : }
630 993 : else if (peek_input () == '*' && peek_input (1) == '*'
631 1064 : && peek_input (2) == '*' && peek_input (3) == '/')
632 : {
633 : /***/
634 14 : skip_input (3);
635 14 : current_column += 5;
636 14 : continue;
637 : }
638 979 : else if ((peek_input () == '*' && peek_input (1) != '!'
639 870 : && peek_input (1) != '*')
640 1109 : || (peek_input () == '*' && peek_input (1) == '*'
641 57 : && peek_input (2) == '*'))
642 : {
643 : // one /* or three /***
644 : // block comment
645 : // (but not an inner or outer doc comment)
646 827 : skip_input ();
647 827 : current_column += 2;
648 :
// level is the current nesting depth of block comments.
649 827 : int level = 1;
650 36902 : while (level > 0)
651 : {
652 36076 : current_char = peek_input ();
653 :
654 36076 : if (current_char.is_eof ())
655 : {
656 1 : rust_error_at (
657 : loc, ErrorCode::E0758,
658 : "unexpected EOF while looking for end of comment");
659 1 : break;
660 : }
661 :
662 : // if /* found
663 36075 : if (current_char == '/' && peek_input (1) == '*')
664 : {
665 : // skip /* characters
666 49 : skip_input (1);
667 :
668 49 : current_column += 2;
669 :
670 49 : level += 1;
671 49 : continue;
672 : }
673 :
674 : // ignore until */ is found
675 36026 : if (current_char == '*' && peek_input (1) == '/')
676 : {
677 : // skip */ characters
678 875 : skip_input (1);
679 :
680 875 : current_column += 2;
681 :
682 875 : level -= 1;
683 875 : continue;
684 : }
685 :
686 35151 : if (current_char == '\n')
687 : {
688 398 : skip_input ();
689 398 : current_line++;
690 398 : current_column = 1;
691 : // tell line_table that new line starts
692 398 : start_line (current_line, max_column_hint);
693 398 : continue;
694 : }
695 :
696 34753 : skip_input ();
697 34753 : current_column++;
698 : }
699 :
700 : // refresh new token
701 827 : continue;
702 827 : }
703 152 : else if (peek_input () == '*'
704 152 : && (peek_input (1) == '!' || peek_input (1) == '*'))
705 : {
706 : // block doc comment, inner /*! or outer /**
707 116 : bool is_inner = peek_input (1) == '!';
708 116 : skip_input (1);
709 116 : current_column += 3;
710 :
711 116 : std::string str;
712 116 : str.reserve (96);
713 :
// level is the nesting depth. Nested "/*" and "*/" pairs are copied into str;
// the final closing "*/" is not.
714 116 : int level = 1;
715 116 : while (level > 0)
716 : {
717 2685 : current_char = peek_input ();
718 :
719 2685 : if (current_char.is_eof ())
720 : {
721 0 : rust_error_at (
722 : loc, ErrorCode::E0758,
723 : "unexpected EOF while looking for end of comment");
724 0 : break;
725 : }
726 :
727 : // if /* found
728 2685 : if (current_char == '/' && peek_input (1) == '*')
729 : {
730 : // skip /* characters
731 84 : skip_input (1);
732 84 : current_column += 2;
733 :
734 84 : level += 1;
735 84 : str += "/*";
736 84 : continue;
737 : }
738 :
739 : // ignore until */ is found
740 2601 : if (current_char == '*' && peek_input (1) == '/')
741 : {
742 : // skip */ characters
743 200 : skip_input (1);
744 200 : current_column += 2;
745 :
746 200 : level -= 1;
747 200 : if (level > 0)
748 84 : str += "*/";
749 200 : continue;
750 : }
751 :
// An isolated '\r' is reported here but still appended to str further down.
752 2401 : if (current_char == '\r' && peek_input (1) != '\n')
753 2 : rust_error_at (
754 : loc, "Isolated CR %<\\r%> not allowed in doc comment");
755 :
756 2401 : if (current_char == '\n')
757 : {
758 0 : skip_input ();
759 0 : current_line++;
760 0 : current_column = 1;
761 : // tell line_table that new line starts
762 0 : start_line (current_line, max_column_hint);
763 0 : str += '\n';
764 0 : continue;
765 : }
766 :
767 2401 : str += current_char;
768 2401 : skip_input ();
769 2401 : current_column++;
770 : }
771 :
772 116 : str.shrink_to_fit ();
773 :
// NOTE(review): as for line doc comments, str.size () - 1 wraps around when
// str is empty - confirm the resulting locus is intended.
774 116 : loc += str.size () - 1;
775 116 : if (is_inner)
776 73 : return Token::make_inner_doc_comment (loc, std::move (str));
777 : else
778 43 : return Token::make_outer_doc_comment (loc, std::move (str));
779 116 : }
780 : else
781 : {
782 : // division
783 36 : current_column++;
784 36 : return Token::make (DIV, loc);
785 : }
786 43 : case '%':
787 43 : if (peek_input () == '=')
788 : {
789 : // modulo-assign
790 7 : skip_input ();
791 7 : current_column += 2;
792 7 : loc += 1;
793 :
794 7 : return Token::make (PERCENT_EQ, loc);
795 : }
796 : else
797 : {
798 : // modulo
799 36 : current_column++;
800 36 : return Token::make (PERCENT, loc);
801 : }
802 147 : case '^':
803 147 : if (peek_input () == '=')
804 : {
805 : // xor-assign?
806 84 : skip_input ();
807 84 : current_column += 2;
808 84 : loc += 1;
809 :
810 84 : return Token::make (CARET_EQ, loc);
811 : }
812 : else
813 : {
814 : // xor?
815 63 : current_column++;
816 63 : return Token::make (CARET, loc);
817 : }
818 8558 : case '<':
819 8558 : if (peek_input () == '<')
820 : {
821 66 : if (peek_input (1) == '=')
822 : {
823 : // left-shift assign
824 7 : skip_input (1);
825 7 : current_column += 3;
826 7 : loc += 2;
827 :
828 7 : return Token::make (LEFT_SHIFT_EQ, loc);
829 : }
830 : else
831 : {
832 : // left-shift
833 59 : skip_input ();
834 59 : current_column += 2;
835 59 : loc += 1;
836 :
837 59 : return Token::make (LEFT_SHIFT, loc);
838 : }
839 : }
840 8492 : else if (peek_input () == '=')
841 : {
842 : // smaller than or equal to
843 224 : skip_input ();
844 224 : current_column += 2;
845 224 : loc += 1;
846 :
847 224 : return Token::make (LESS_OR_EQUAL, loc);
848 : }
849 : else
850 : {
851 : // smaller than
852 8268 : current_column++;
853 8268 : return Token::make (LEFT_ANGLE, loc);
854 : }
// Not reached: every branch of the if/else chain above returns.
855 8409 : break;
856 8409 : case '>':
857 8409 : if (peek_input () == '>')
858 : {
859 126 : if (peek_input (1) == '=')
860 : {
861 : // right-shift-assign
862 7 : skip_input (1);
863 7 : current_column += 3;
864 7 : loc += 2;
865 :
866 7 : return Token::make (RIGHT_SHIFT_EQ, loc);
867 : }
868 : else
869 : {
870 : // right-shift
871 119 : skip_input ();
872 119 : current_column += 2;
873 119 : loc += 1;
874 :
875 119 : return Token::make (RIGHT_SHIFT, loc);
876 : }
877 : }
878 8283 : else if (peek_input () == '=')
879 : {
880 : // larger than or equal to
881 209 : skip_input ();
882 209 : current_column += 2;
883 209 : loc += 1;
884 :
885 209 : return Token::make (GREATER_OR_EQUAL, loc);
886 : }
887 : else
888 : {
889 : // larger than
890 8074 : current_column++;
891 8074 : return Token::make (RIGHT_ANGLE, loc);
892 : }
893 28773 : case ':':
894 28773 : if (peek_input () == ':')
895 : {
896 : // scope resolution ::
897 9826 : skip_input ();
898 9826 : current_column += 2;
899 9826 : loc += 1;
900 :
901 9826 : return Token::make (SCOPE_RESOLUTION, loc);
902 : }
903 : else
904 : {
905 : // single colon :
906 18947 : current_column++;
907 18947 : return Token::make (COLON, loc);
908 : }
909 14608 : case '!':
910 : // no special handling for macros in lexer?
911 14608 : if (peek_input () == '=')
912 : {
913 : // not equal boolean operator
914 186 : skip_input ();
915 186 : current_column += 2;
916 186 : loc += 1;
917 :
918 186 : return Token::make (NOT_EQUAL, loc);
919 : }
920 : else
921 : {
922 : // not equal unary operator
923 14422 : current_column++;
924 :
925 14422 : return Token::make (EXCLAM, loc);
926 : }
927 368 : case '?':
928 368 : current_column++;
929 368 : return Token::make (QUESTION_MARK, loc);
930 19850 : case '#':
931 19850 : current_column++;
932 19850 : return Token::make (HASH, loc);
933 21864 : case '[':
934 21864 : current_column++;
935 21864 : return Token::make (LEFT_SQUARE, loc);
936 21857 : case ']':
937 21857 : current_column++;
938 21857 : return Token::make (RIGHT_SQUARE, loc);
939 34529 : case '{':
940 34529 : current_column++;
941 34529 : return Token::make (LEFT_CURLY, loc);
942 34482 : case '}':
943 34482 : current_column++;
944 34482 : return Token::make (RIGHT_CURLY, loc);
945 19 : case '@':
946 19 : current_column++;
947 19 : return Token::make (PATTERN_BIND, loc);
948 3589 : case '$':
949 3589 : current_column++;
950 3589 : return Token::make (DOLLAR_SIGN, loc);
951 0 : case '~':
952 0 : current_column++;
953 0 : return Token::make (TILDE, loc);
954 0 : case '\\':
955 0 : current_column++;
956 0 : return Token::make (BACKSLASH, loc);
957 0 : case '`':
958 0 : current_column++;
959 0 : return Token::make (BACKTICK, loc);
960 475 : case '|':
961 475 : if (peek_input () == '=')
962 : {
963 : // bitwise or-assign?
964 28 : skip_input ();
965 28 : current_column += 2;
966 28 : loc += 1;
967 :
968 28 : return Token::make (PIPE_EQ, loc);
969 : }
970 447 : else if (peek_input () == '|')
971 : {
972 : // logical or
973 69 : skip_input ();
974 69 : current_column += 2;
975 69 : loc += 1;
976 :
977 69 : return Token::make (OR, loc);
978 : }
979 : else
980 : {
981 : // bitwise or
982 378 : current_column++;
983 :
984 378 : return Token::make (PIPE, loc);
985 : }
986 9958 : case '&':
987 9958 : if (peek_input () == '=')
988 : {
989 : // bitwise and-assign?
990 21 : skip_input ();
991 21 : current_column += 2;
992 21 : loc += 1;
993 :
994 21 : return Token::make (AMP_EQ, loc);
995 : }
996 9937 : else if (peek_input () == '&')
997 : {
998 : // logical and
999 306 : skip_input ();
1000 306 : current_column += 2;
1001 306 : loc += 1;
1002 :
1003 306 : return Token::make (LOGICAL_AND, loc);
1004 : }
1005 : else
1006 : {
1007 : // bitwise and/reference
1008 9631 : current_column++;
1009 :
1010 9631 : return Token::make (AMP, loc);
1011 : }
1012 6656 : case '.':
1013 6656 : if (peek_input () == '.')
1014 : {
1015 1166 : if (peek_input (1) == '.')
1016 : {
1017 : // ellipsis
1018 838 : skip_input (1);
1019 838 : current_column += 3;
1020 838 : loc += 2;
1021 :
1022 838 : return Token::make (ELLIPSIS, loc);
1023 : }
1024 328 : else if (peek_input (1) == '=')
1025 : {
1026 : // ..=
1027 38 : skip_input (1);
1028 38 : current_column += 3;
1029 38 : loc += 2;
1030 :
1031 38 : return Token::make (DOT_DOT_EQ, loc);
1032 : }
1033 : else
1034 : {
1035 : // ..
1036 290 : skip_input ();
1037 290 : current_column += 2;
1038 290 : loc += 1;
1039 :
1040 290 : return Token::make (DOT_DOT, loc);
1041 : }
1042 : }
1043 : else /*if (!ISDIGIT (peek_input ()))*/
1044 : {
1045 : // single dot .
1046 : // Only if followed by a non-number - otherwise is float
1047 : // nope, float cannot start with '.'.
1048 5490 : current_column++;
1049 5490 : return Token::make (DOT, loc);
1050 : }
1051 1083847 : }
1052 : // TODO: special handling of _ in the lexer? instead of being identifier
1053 :
1054 : // byte character, byte string and raw byte string literals
// If none of the three lookaheads match, 'b' falls through and is lexed as
// the start of an ordinary identifier below.
1055 311576 : if (current_char == 'b')
1056 : {
1057 10662 : if (peek_input () == '\'')
1058 78 : return parse_byte_char (loc);
1059 10584 : else if (peek_input () == '"')
1060 64 : return parse_byte_string (loc);
1061 10520 : else if (peek_input () == 'r'
1062 10520 : && (peek_input (1) == '#' || peek_input (1) == '"'))
1063 32 : return parse_raw_byte_string (loc);
1064 : }
1065 :
1066 : // raw identifiers and raw strings
1067 311402 : if (current_char == 'r')
1068 : {
1069 3995 : Codepoint peek = peek_input ();
1070 3995 : Codepoint peek1 = peek_input (1);
1071 :
1072 : // TODO (tamaron) parse Unicode ident
1073 3995 : if (peek == '#' && is_identifier_start (peek1.value))
1074 : {
1075 81 : TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
1076 81 : if (raw_ident_ptr != nullptr)
1077 80 : return raw_ident_ptr;
1078 : else
1079 1 : continue; /* input got parsed, it just wasn't valid. An error
1080 : was produced. */
1081 81 : }
1082 : else
1083 : {
// A nullptr result means this was not a raw string; 'r' then falls through to
// the identifier check below.
1084 3914 : TokenPtr maybe_raw_string_ptr = maybe_parse_raw_string (loc);
1085 3914 : if (maybe_raw_string_ptr != nullptr)
1086 25 : return maybe_raw_string_ptr;
1087 3914 : }
1088 : }
1089 :
1090 : // find identifiers and keywords.
1091 311296 : if (is_identifier_start (current_char.value))
1092 282362 : return parse_identifier_or_keyword (loc);
1093 :
1094 : // int and float literals
// If either helper below returns nullptr, control falls through to the string
// and char checks and ends at the "unexpected character" error.
1095 28934 : if (ISDIGIT (current_char.value))
1096 : { // _ not allowed as first char
1097 15667 : if (current_char == '0'
1098 15667 : && is_non_decimal_int_literal_separator (peek_input ().value))
1099 : {
1100 : // handle binary, octal, hex literals
1101 216 : TokenPtr non_dec_int_lit_ptr
1102 216 : = parse_non_decimal_int_literals (loc);
1103 216 : if (non_dec_int_lit_ptr != nullptr)
1104 216 : return non_dec_int_lit_ptr;
1105 216 : }
1106 : else
1107 : {
1108 : // handle decimals (integer or float)
1109 15451 : TokenPtr decimal_or_float_ptr = parse_decimal_int_or_float (loc);
1110 15451 : if (decimal_or_float_ptr != nullptr)
1111 15451 : return decimal_or_float_ptr;
1112 15451 : }
1113 : }
1114 :
1115 : // string literals
1116 13267 : if (current_char == '"')
1117 12424 : return parse_string (loc);
1118 :
1119 : // char literals and lifetime names
1120 843 : if (current_char == '\'')
1121 : {
1122 843 : TokenPtr char_or_lifetime_ptr = parse_char_or_lifetime (loc);
1123 843 : if (char_or_lifetime_ptr != nullptr)
1124 843 : return char_or_lifetime_ptr;
1125 843 : }
1126 :
1127 : // DEBUG: check for specific character problems:
// 0x5d is the ASCII code of ']', so the third test below cannot succeed once
// the second one has failed.
1128 0 : if (current_char == '0')
1129 0 : rust_debug ("'0' uncaught before unexpected character");
1130 0 : else if (current_char == ']')
1131 0 : rust_debug ("']' uncaught before unexpected character");
1132 : else if (current_char == 0x5d)
1133 : rust_debug ("whatever 0x5d is (not '0' or ']') uncaught before "
1134 : "unexpected character");
1135 :
1136 : // didn't match anything so error
// Nothing is returned here: the offending codepoint has already been consumed
// and the loop restarts with the next one.
1137 0 : rust_error_at (loc, "unexpected character %<%x%>", current_char.value);
1138 0 : current_column++;
1139 : }
1140 : }
1141 :
1142 : // Parses in a type suffix.
1143 : std::pair<PrimitiveCoreType, int>
1144 15659 : Lexer::parse_in_type_suffix ()
1145 : {
1146 15659 : std::string suffix;
1147 15659 : suffix.reserve (5);
1148 :
1149 15659 : int additional_length_offset = 0;
1150 :
1151 : // get suffix
1152 33589 : while (ISALPHA (current_char.value) || ISDIGIT (current_char.value)
1153 34941 : || current_char == '_')
1154 : {
1155 3623 : if (current_char == '_')
1156 : {
1157 : // don't add _ to suffix
1158 0 : skip_input ();
1159 0 : current_char = peek_input ();
1160 :
1161 0 : additional_length_offset++;
1162 :
1163 0 : continue;
1164 : }
1165 :
1166 3623 : additional_length_offset++;
1167 :
1168 3623 : suffix += current_char;
1169 3623 : skip_input ();
1170 3623 : current_char = peek_input ();
1171 : }
1172 :
1173 15659 : if (suffix.empty ())
1174 : {
1175 : // no type suffix: do nothing but also no error
1176 14471 : return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset);
1177 : }
1178 1188 : else if (suffix == "f32")
1179 : {
1180 501 : return std::make_pair (CORETYPE_F32, additional_length_offset);
1181 : }
1182 687 : else if (suffix == "f64")
1183 : {
1184 221 : return std::make_pair (CORETYPE_F64, additional_length_offset);
1185 : }
1186 466 : else if (suffix == "i8")
1187 : {
1188 23 : return std::make_pair (CORETYPE_I8, additional_length_offset);
1189 : }
1190 443 : else if (suffix == "i16")
1191 : {
1192 15 : return std::make_pair (CORETYPE_I16, additional_length_offset);
1193 : }
1194 428 : else if (suffix == "i32")
1195 : {
1196 177 : return std::make_pair (CORETYPE_I32, additional_length_offset);
1197 : }
1198 251 : else if (suffix == "i64")
1199 : {
1200 15 : return std::make_pair (CORETYPE_I64, additional_length_offset);
1201 : }
1202 236 : else if (suffix == "i128")
1203 : {
1204 15 : return std::make_pair (CORETYPE_I128, additional_length_offset);
1205 : }
1206 221 : else if (suffix == "isize")
1207 : {
1208 4 : return std::make_pair (CORETYPE_ISIZE, additional_length_offset);
1209 : }
1210 217 : else if (suffix == "u8")
1211 : {
1212 30 : return std::make_pair (CORETYPE_U8, additional_length_offset);
1213 : }
1214 187 : else if (suffix == "u16")
1215 : {
1216 25 : return std::make_pair (CORETYPE_U16, additional_length_offset);
1217 : }
1218 162 : else if (suffix == "u32")
1219 : {
1220 85 : return std::make_pair (CORETYPE_U32, additional_length_offset);
1221 : }
1222 77 : else if (suffix == "u64")
1223 : {
1224 25 : return std::make_pair (CORETYPE_U64, additional_length_offset);
1225 : }
1226 52 : else if (suffix == "u128")
1227 : {
1228 15 : return std::make_pair (CORETYPE_U128, additional_length_offset);
1229 : }
1230 37 : else if (suffix == "usize")
1231 : {
1232 37 : return std::make_pair (CORETYPE_USIZE, additional_length_offset);
1233 : }
1234 : else
1235 : {
1236 0 : rust_error_at (get_current_location (), "unknown number suffix %qs",
1237 : suffix.c_str ());
1238 :
1239 0 : return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset);
1240 : }
1241 15659 : }
1242 :
1243 : // Parses in the exponent part (if any) of a float literal.
1244 : std::pair<std::string, int>
1245 345 : Lexer::parse_in_exponent_part ()
1246 : {
1247 345 : int additional_length_offset = 0;
1248 345 : std::string str;
1249 345 : if (current_char == 'E' || current_char == 'e')
1250 : {
1251 : // add exponent to string as strtod works with it
1252 7 : str += current_char;
1253 7 : skip_input ();
1254 7 : current_char = peek_input ();
1255 :
1256 7 : additional_length_offset++;
1257 :
1258 : // special - and + handling
1259 7 : if (current_char == '-')
1260 : {
1261 0 : str += '-';
1262 :
1263 0 : skip_input ();
1264 0 : current_char = peek_input ();
1265 :
1266 0 : additional_length_offset++;
1267 : }
1268 7 : else if (current_char == '+')
1269 : {
1270 : // don't add + but still skip input
1271 7 : skip_input ();
1272 7 : current_char = peek_input ();
1273 :
1274 7 : additional_length_offset++;
1275 : }
1276 :
1277 : // parse another decimal number for exponent
1278 7 : auto str_length = parse_in_decimal ();
1279 7 : str += std::get<0> (str_length);
1280 7 : additional_length_offset += std::get<1> (str_length);
1281 7 : }
1282 690 : return std::make_pair (str, additional_length_offset);
1283 345 : }
1284 :
1285 : // Parses a decimal integer.
1286 : std::tuple<std::string, int, bool>
1287 15803 : Lexer::parse_in_decimal ()
1288 : {
1289 : /* A pure decimal contains only digits. */
1290 15803 : bool pure_decimal = true;
1291 15803 : int additional_length_offset = 0;
1292 15803 : std::string str;
1293 22889 : while (ISDIGIT (current_char.value) || current_char.value == '_')
1294 : {
1295 7086 : if (current_char == '_')
1296 : {
1297 9 : pure_decimal = false;
1298 : // don't add _ to number
1299 9 : skip_input ();
1300 9 : current_char = peek_input ();
1301 :
1302 9 : additional_length_offset++;
1303 :
1304 9 : continue;
1305 : }
1306 :
1307 7077 : additional_length_offset++;
1308 :
1309 7077 : str += current_char;
1310 7077 : skip_input ();
1311 7077 : current_char = peek_input ();
1312 : }
1313 31606 : return std::make_tuple (str, additional_length_offset, pure_decimal);
1314 15803 : }
1315 :
1316 : /* Parses escapes (and string continues) in "byte" strings and characters. Does
1317 : * not support unicode. */
1318 : std::tuple<char, int, bool>
1319 61 : Lexer::parse_escape (char opening_char)
1320 : {
1321 61 : int additional_length_offset = 0;
1322 61 : char output_char = 0;
1323 :
1324 : // skip to actual letter
1325 61 : skip_input ();
1326 61 : current_char = peek_input ();
1327 61 : additional_length_offset++;
1328 :
1329 61 : switch (current_char.value)
1330 : {
1331 17 : case 'x':
1332 17 : {
1333 17 : auto hex_escape_pair = parse_partial_hex_escape ();
1334 17 : long hexLong = hex_escape_pair.first;
1335 17 : additional_length_offset += hex_escape_pair.second;
1336 :
1337 17 : if (hexLong > 255 || hexLong < 0)
1338 0 : rust_error_at (
1339 : get_current_location (),
1340 : "byte \\x escape %<\\x%x%> out of range - allows up to %<\\xFF%>",
1341 : static_cast<unsigned int> (hexLong));
1342 : /* TODO: restore capital for escape output - gcc pretty-printer doesn't
1343 : * support %X directly */
1344 17 : char hexChar = static_cast<char> (hexLong);
1345 :
1346 17 : output_char = hexChar;
1347 : }
1348 17 : break;
1349 : case 'n':
1350 : output_char = '\n';
1351 : break;
1352 0 : case 'r':
1353 0 : output_char = '\r';
1354 0 : break;
1355 1 : case 't':
1356 1 : output_char = '\t';
1357 1 : break;
1358 8 : case '\\':
1359 8 : output_char = '\\';
1360 8 : break;
1361 9 : case '0':
1362 9 : output_char = '\0';
1363 9 : break;
1364 15 : case '\'':
1365 15 : output_char = '\'';
1366 15 : break;
1367 1 : case '"':
1368 1 : output_char = '"';
1369 1 : break;
1370 2 : case 'u':
1371 3 : rust_error_at (get_current_location (),
1372 : "cannot have a unicode escape \\u in a byte %s",
1373 : opening_char == '\'' ? "character" : "string");
1374 : // Try to parse it anyway, just to skip it
1375 2 : parse_partial_unicode_escape ();
1376 2 : return std::make_tuple (output_char, additional_length_offset, false);
1377 0 : case '\r':
1378 0 : case '\n':
1379 : // string continue
1380 0 : return std::make_tuple (0, parse_partial_string_continue (), true);
1381 1 : default:
1382 1 : rust_error_at (get_current_location (),
1383 : "unknown escape sequence %<\\%s%>",
1384 1 : current_char.as_string ().c_str ());
1385 : // returns false if no parsing could be done
1386 : // return false;
1387 1 : return std::make_tuple (output_char, additional_length_offset, false);
1388 58 : break;
1389 : }
1390 : // all non-special cases (string continue) should skip their used char
1391 58 : skip_input ();
1392 58 : current_char = peek_input ();
1393 58 : additional_length_offset++;
1394 :
1395 : // returns true if parsing was successful
1396 : // return true;
1397 58 : return std::make_tuple (output_char, additional_length_offset, false);
1398 : }
1399 :
1400 : /* Parses an escape (or string continue) in a string or character. Supports
1401 : * unicode escapes. */
1402 : std::tuple<Codepoint, int, bool>
1403 2802 : Lexer::parse_utf8_escape ()
1404 : {
1405 2802 : Codepoint output_char;
1406 2802 : int additional_length_offset = 0;
1407 :
1408 : // skip to actual letter
1409 2802 : skip_input ();
1410 2802 : current_char = peek_input ();
1411 2802 : additional_length_offset++;
1412 :
1413 2802 : switch (current_char.value)
1414 : {
1415 17 : case 'x':
1416 17 : {
1417 17 : auto hex_escape_pair = parse_partial_hex_escape ();
1418 17 : long hexLong = hex_escape_pair.first;
1419 17 : additional_length_offset += hex_escape_pair.second;
1420 :
1421 17 : if (hexLong > 127 || hexLong < 0)
1422 4 : rust_error_at (
1423 : get_current_location (),
1424 : "ascii \\x escape %<\\x%x%> out of range - allows up to %<\\x7F%>",
1425 : static_cast<unsigned int> (hexLong));
1426 : /* TODO: restore capital for escape output - gcc pretty-printer doesn't
1427 : * support %X directly */
1428 17 : char hexChar = static_cast<char> (hexLong);
1429 :
1430 17 : output_char = hexChar;
1431 : }
1432 17 : break;
1433 : case 'n':
1434 : output_char = '\n';
1435 : break;
1436 0 : case 'r':
1437 0 : output_char = '\r';
1438 0 : break;
1439 2 : case 't':
1440 2 : output_char = '\t';
1441 2 : break;
1442 1 : case '\\':
1443 1 : output_char = '\\';
1444 1 : break;
1445 1404 : case '0':
1446 1404 : output_char = '\0';
1447 1404 : break;
1448 1 : case '\'':
1449 1 : output_char = '\'';
1450 1 : break;
1451 1 : case '"':
1452 1 : output_char = '"';
1453 1 : break;
1454 46 : case 'u':
1455 46 : {
1456 46 : auto unicode_escape_pair = parse_partial_unicode_escape ();
1457 46 : output_char = unicode_escape_pair.first;
1458 46 : additional_length_offset += unicode_escape_pair.second;
1459 :
1460 46 : return std::make_tuple (output_char, additional_length_offset, false);
1461 : }
1462 28 : break;
1463 28 : case '\r':
1464 28 : case '\n':
1465 : // string continue
1466 28 : return std::make_tuple (0, parse_partial_string_continue (), true);
1467 1 : default:
1468 1 : rust_error_at (get_current_location (),
1469 : "unknown escape sequence %<\\%s%>",
1470 1 : current_char.as_string ().c_str ());
1471 : // returns false if no parsing could be done
1472 : // return false;
1473 1 : return std::make_tuple (output_char, additional_length_offset, false);
1474 2727 : break;
1475 : }
1476 : /* all non-special cases (unicode, string continue) should skip their used
1477 : * char */
1478 2727 : skip_input ();
1479 2727 : current_char = peek_input ();
1480 2727 : additional_length_offset++;
1481 :
1482 : // returns true if parsing was successful
1483 : // return true;
1484 2727 : return std::make_tuple (output_char, additional_length_offset, false);
1485 : }
1486 :
1487 : // Parses the body of a string continue that has been found in an escape.
1488 : int
1489 28 : Lexer::parse_partial_string_continue ()
1490 : {
1491 28 : int additional_length_offset = 1;
1492 :
1493 : // string continue
1494 : // TODO use utf-8 codepoint to skip whitespaces
1495 364 : while (is_whitespace (current_char.value))
1496 : {
1497 336 : if (current_char == '\n')
1498 : {
1499 28 : current_line++;
1500 28 : current_column = 1;
1501 : // tell line_table that new line starts
1502 28 : start_line (current_line, max_column_hint);
1503 :
1504 : // reset "length"
1505 28 : additional_length_offset = 1;
1506 :
1507 : // get next char
1508 28 : skip_input ();
1509 28 : current_char = peek_input ();
1510 :
1511 28 : continue;
1512 : }
1513 :
1514 308 : skip_input ();
1515 308 : current_char = peek_input ();
1516 308 : additional_length_offset++;
1517 : }
1518 :
1519 28 : return additional_length_offset;
1520 : }
1521 :
1522 : /* Parses the body of a '\x' escape. Note that it does not check that the number
1523 : * is valid and smaller than 255. */
1524 : std::pair<long, int>
1525 34 : Lexer::parse_partial_hex_escape ()
1526 : {
1527 : // hex char string (null-terminated)
1528 34 : char hexNum[3] = {0, 0, 0};
1529 :
1530 : // first hex char
1531 34 : current_char = peek_input (1);
1532 34 : int additional_length_offset = 1;
1533 :
1534 34 : if (!is_x_digit (current_char.value))
1535 : {
1536 4 : rust_error_at (get_current_location (),
1537 : "invalid character %<\\x%s%> in \\x sequence",
1538 4 : current_char.as_string ().c_str ());
1539 4 : return std::make_pair (0, 0);
1540 : }
1541 30 : hexNum[0] = current_char.value;
1542 :
1543 : // second hex char
1544 30 : skip_input ();
1545 30 : current_char = peek_input (1);
1546 30 : additional_length_offset++;
1547 :
1548 30 : if (!is_x_digit (current_char.value))
1549 : {
1550 2 : rust_error_at (get_current_location (),
1551 2 : "invalid character %<\\x%c%s%> in \\x sequence", hexNum[0],
1552 2 : current_char.as_string ().c_str ());
1553 2 : return std::make_pair (0, 1);
1554 : }
1555 28 : skip_input ();
1556 28 : hexNum[1] = current_char.value;
1557 :
1558 28 : long hexLong = std::strtol (hexNum, nullptr, 16);
1559 :
1560 28 : return std::make_pair (hexLong, additional_length_offset);
1561 : }
1562 :
1563 : // Parses the body of a unicode escape.
1564 : std::pair<Codepoint, int>
1565 48 : Lexer::parse_partial_unicode_escape ()
1566 : {
1567 48 : skip_input ();
1568 48 : current_char = peek_input ();
1569 48 : int additional_length_offset = 0;
1570 :
1571 48 : if (current_char != '{')
1572 : {
1573 2 : rust_error_at (get_current_location (),
1574 : "unicode escape should start with %<{%>");
1575 : /* Skip what should probaby have been between brackets. */
1576 10 : while (is_x_digit (current_char.value) || current_char == '_')
1577 : {
1578 6 : skip_input ();
1579 6 : current_char = peek_input ();
1580 6 : additional_length_offset++;
1581 : }
1582 2 : return std::make_pair (Codepoint (0), additional_length_offset);
1583 : }
1584 :
1585 46 : skip_input ();
1586 46 : current_char = peek_input ();
1587 46 : additional_length_offset++;
1588 :
1589 46 : if (current_char == '_')
1590 : {
1591 2 : rust_error_at (get_current_location (),
1592 : "unicode escape cannot start with %<_%>");
1593 2 : skip_input ();
1594 2 : current_char = peek_input ();
1595 2 : additional_length_offset++;
1596 : // fallthrough and try to parse the rest anyway
1597 : }
1598 :
1599 : // parse unicode escape - 1-6 hex digits
1600 46 : std::string num_str;
1601 46 : num_str.reserve (6);
1602 :
1603 : // loop through to add entire hex number to string
1604 304 : while (is_x_digit (current_char.value) || current_char.value == '_')
1605 : {
1606 212 : if (current_char == '_')
1607 : {
1608 : // don't add _ to number
1609 24 : skip_input ();
1610 24 : current_char = peek_input ();
1611 :
1612 24 : additional_length_offset++;
1613 :
1614 24 : continue;
1615 : }
1616 :
1617 188 : additional_length_offset++;
1618 :
1619 : // add raw hex numbers
1620 188 : num_str += current_char;
1621 :
1622 188 : skip_input ();
1623 188 : current_char = peek_input ();
1624 : }
1625 :
1626 46 : if (current_char == '}')
1627 : {
1628 44 : skip_input ();
1629 44 : current_char = peek_input ();
1630 44 : additional_length_offset++;
1631 : }
1632 : else
1633 : {
1634 : // actually an error, but allow propagation anyway Assume that
1635 : // wrong bracketm whitespace or single/double quotes are wrong
1636 : // termination, otherwise it is a wrong character, then skip to the actual
1637 : // terminator.
1638 : // TODO use utf-8 codepoint to skip whitespaces
1639 2 : if (current_char == '{' || is_whitespace (current_char.value)
1640 4 : || current_char == '\'' || current_char == '"')
1641 : {
1642 0 : rust_error_at (get_current_location (),
1643 : "expected terminating %<}%> in unicode escape");
1644 0 : return std::make_pair (Codepoint (0), additional_length_offset);
1645 : }
1646 : else
1647 : {
1648 2 : rust_error_at (get_current_location (),
1649 : "invalid character %qs in unicode escape",
1650 2 : current_char.as_string ().c_str ());
1651 : // TODO use utf-8 codepoint to skip whitespaces
1652 8 : while (current_char != '}' && current_char != '{'
1653 6 : && !is_whitespace (current_char.value) && current_char != '\''
1654 14 : && current_char != '"')
1655 : {
1656 6 : skip_input ();
1657 6 : current_char = peek_input ();
1658 6 : additional_length_offset++;
1659 : }
1660 : // Consume the actual closing bracket if found
1661 2 : if (current_char == '}')
1662 : {
1663 2 : skip_input ();
1664 2 : current_char = peek_input ();
1665 2 : additional_length_offset++;
1666 : }
1667 2 : return std::make_pair (Codepoint (0), additional_length_offset);
1668 : }
1669 : }
1670 :
1671 : // ensure 1-6 hex characters
1672 44 : if (num_str.length () > 6 || num_str.length () < 1)
1673 : {
1674 4 : rust_error_at (get_current_location (),
1675 : "unicode escape should be between 1 and 6 hex "
1676 : "characters; it is %lu",
1677 4 : (unsigned long) num_str.length ());
1678 : // return false;
1679 4 : return std::make_pair (Codepoint (0), additional_length_offset);
1680 : }
1681 :
1682 40 : unsigned long hex_num = std::strtoul (num_str.c_str (), nullptr, 16);
1683 :
1684 40 : if (hex_num > 0xd7ff && hex_num < 0xe000)
1685 : {
1686 4 : rust_error_at (
1687 : get_current_location (),
1688 : "unicode escape cannot be a surrogate value (D800 to DFFF)");
1689 4 : return std::make_pair (Codepoint (0), additional_length_offset);
1690 : }
1691 :
1692 36 : if (hex_num > 0x10ffff)
1693 : {
1694 4 : rust_error_at (get_current_location (),
1695 : "unicode escape cannot be larger than 10FFFF");
1696 4 : return std::make_pair (Codepoint (0), additional_length_offset);
1697 : }
1698 :
1699 : // return true;
1700 32 : return std::make_pair (Codepoint (static_cast<uint32_t> (hex_num)),
1701 : additional_length_offset);
1702 46 : }
1703 :
1704 : // Parses a byte character.
1705 : TokenPtr
1706 78 : Lexer::parse_byte_char (location_t loc)
1707 : {
1708 78 : skip_input ();
1709 78 : current_column++;
1710 : // make current char the next character
1711 78 : current_char = peek_input ();
1712 :
1713 78 : int length = 1;
1714 :
1715 : // char to save
1716 78 : Codepoint byte_char = 0;
1717 :
1718 : // detect escapes
1719 78 : if (current_char == '\\')
1720 : {
1721 30 : auto escape_length_pair = parse_escape ('\'');
1722 30 : byte_char = std::get<0> (escape_length_pair);
1723 30 : length += std::get<1> (escape_length_pair);
1724 :
1725 30 : current_char = peek_input ();
1726 :
1727 30 : if (current_char != '\'')
1728 : {
1729 0 : rust_error_at (get_current_location (), "unclosed %<byte char%>");
1730 : }
1731 :
1732 30 : skip_input ();
1733 30 : current_char = peek_input ();
1734 30 : length++; // go to next char
1735 : }
1736 48 : else if (current_char != '\'')
1737 : {
1738 : // otherwise, get character from direct input character
1739 48 : byte_char = current_char;
1740 :
1741 48 : if (!byte_char.is_ascii ())
1742 : {
1743 2 : rust_error_at (get_current_location (),
1744 : "non-ASCII character in %<byte char%>");
1745 : }
1746 :
1747 48 : skip_input ();
1748 48 : current_char = peek_input ();
1749 48 : length++;
1750 :
1751 48 : if (current_char != '\'')
1752 : {
1753 0 : rust_error_at (get_current_location (), "unclosed %<byte char%>");
1754 : }
1755 :
1756 48 : skip_input ();
1757 48 : current_char = peek_input ();
1758 48 : length++; // go to next char
1759 : }
1760 : else
1761 : {
1762 0 : rust_error_at (get_current_location (),
1763 : "no character inside %<%> for %<byte char%>");
1764 : }
1765 :
1766 78 : current_column += length;
1767 :
1768 78 : loc += length - 1;
1769 78 : return Token::make_byte_char (loc, byte_char.value);
1770 : }
1771 :
1772 : // Parses a byte string.
1773 : TokenPtr
1774 64 : Lexer::parse_byte_string (location_t loc)
1775 : {
1776 : // byte string
1777 :
1778 : // skip quote character
1779 64 : skip_input ();
1780 64 : current_column++;
1781 :
1782 64 : std::string str;
1783 64 : str.reserve (16); // some sensible default
1784 :
1785 64 : current_char = peek_input ();
1786 :
1787 64 : const location_t string_begin_locus = get_current_location ();
1788 :
1789 438 : while (current_char != '"' && !current_char.is_eof ())
1790 : {
1791 310 : if (current_char == '\\')
1792 : {
1793 31 : int length = 1;
1794 31 : auto escape_length_pair = parse_escape ('"');
1795 31 : char output_char = std::get<0> (escape_length_pair);
1796 :
1797 31 : if (output_char == 0 && std::get<2> (escape_length_pair))
1798 0 : length = std::get<1> (escape_length_pair) - 1;
1799 : else
1800 31 : length += std::get<1> (escape_length_pair);
1801 :
1802 31 : if (output_char != 0 || !std::get<2> (escape_length_pair))
1803 31 : str += output_char;
1804 :
1805 31 : current_column += length;
1806 :
1807 31 : continue;
1808 31 : }
1809 :
1810 279 : current_column++;
1811 279 : if (current_char.value == '\n')
1812 : {
1813 23 : current_line++;
1814 23 : current_column = 1;
1815 : // tell line_table that new line starts
1816 23 : start_line (current_line, max_column_hint);
1817 : }
1818 :
1819 279 : str += current_char;
1820 279 : skip_input ();
1821 279 : current_char = peek_input ();
1822 : }
1823 :
1824 64 : if (current_char == '"')
1825 : {
1826 57 : current_column++;
1827 :
1828 57 : skip_input ();
1829 57 : current_char = peek_input ();
1830 : }
1831 7 : else if (current_char.is_eof ())
1832 : {
1833 7 : rust_error_at (string_begin_locus, "unended byte string literal");
1834 7 : return Token::make (END_OF_FILE, get_current_location ());
1835 : }
1836 : else
1837 : {
1838 : rust_unreachable ();
1839 : }
1840 :
1841 57 : str.shrink_to_fit ();
1842 57 : loc += str.size () - 1;
1843 :
1844 57 : return Token::make_byte_string (loc, std::move (str));
1845 64 : }
1846 :
1847 : // Parses a raw byte string.
1848 : TokenPtr
1849 32 : Lexer::parse_raw_byte_string (location_t loc)
1850 : {
1851 : // raw byte string literals
1852 32 : std::string str;
1853 32 : str.reserve (16); // some sensible default
1854 :
1855 32 : int length = 1;
1856 32 : int hash_count = 0;
1857 :
1858 32 : const location_t string_begin_locus = get_current_location ();
1859 :
1860 : // get hash count at beginnning
1861 32 : skip_input ();
1862 32 : current_char = peek_input ();
1863 32 : length++;
1864 32 : current_column++;
1865 54 : while (current_char == '#')
1866 : {
1867 22 : hash_count++;
1868 22 : length++;
1869 22 : current_column++;
1870 :
1871 22 : skip_input ();
1872 22 : current_char = peek_input ();
1873 : }
1874 :
1875 32 : if (current_char != '"')
1876 : {
1877 0 : rust_error_at (get_current_location (),
1878 : "raw byte string has no opening %<\"%>");
1879 : }
1880 :
1881 32 : skip_input ();
1882 32 : current_char = peek_input ();
1883 32 : length++;
1884 32 : current_column++;
1885 :
1886 330 : while (true)
1887 : {
1888 181 : if (current_char == '"')
1889 : {
1890 51 : bool enough_hashes = true;
1891 :
1892 51 : for (int i = 0; i < hash_count; i++)
1893 : {
1894 26 : if (peek_input (i + 1) != '#')
1895 : {
1896 : enough_hashes = false;
1897 : break;
1898 : }
1899 : }
1900 :
1901 35 : if (enough_hashes)
1902 : {
1903 : // skip enough input and peek enough input
1904 25 : skip_input (hash_count);
1905 25 : current_char = peek_input ();
1906 25 : length += hash_count + 1;
1907 25 : current_column += hash_count + 1;
1908 25 : break;
1909 : }
1910 : }
1911 146 : else if (current_char.is_eof ())
1912 : {
1913 7 : rust_error_at (string_begin_locus, "unended raw byte string literal");
1914 7 : return Token::make (END_OF_FILE, get_current_location ());
1915 : }
1916 139 : else if (current_char.value > 127)
1917 : {
1918 1 : rust_error_at (get_current_location (),
1919 : "character %qs in raw byte string out of range",
1920 1 : current_char.as_string ().c_str ());
1921 1 : current_char = 0;
1922 : }
1923 :
1924 149 : length++;
1925 149 : current_column++;
1926 149 : if (current_char == '\n')
1927 : {
1928 22 : current_line++;
1929 22 : current_column = 1;
1930 22 : start_line (current_line, max_column_hint);
1931 : }
1932 :
1933 149 : str += current_char;
1934 149 : skip_input ();
1935 149 : current_char = peek_input ();
1936 149 : }
1937 :
1938 25 : loc += length - 1;
1939 :
1940 25 : str.shrink_to_fit ();
1941 :
1942 25 : return Token::make_byte_string (loc, std::move (str));
1943 32 : }
1944 :
1945 : // Parses a raw identifier.
1946 : TokenPtr
1947 81 : Lexer::parse_raw_identifier (location_t loc)
1948 : {
1949 : // raw identifier
1950 81 : std::string str;
1951 81 : str.reserve (16); // default
1952 :
1953 81 : skip_input ();
1954 81 : current_char = peek_input ();
1955 :
1956 81 : current_column += 2;
1957 :
1958 81 : bool first_is_underscore = current_char == '_';
1959 :
1960 81 : int length = 0;
1961 81 : current_char = peek_input ();
1962 : // loop through entire name
1963 475 : while (is_identifier_continue (current_char.value))
1964 : {
1965 313 : length++;
1966 :
1967 313 : str += current_char;
1968 313 : skip_input ();
1969 313 : current_char = peek_input ();
1970 : }
1971 :
1972 81 : current_column += length;
1973 :
1974 81 : rust_debug ("raw ident: %s", str.c_str ());
1975 :
1976 : // if just a single underscore, not an identifier
1977 81 : if (first_is_underscore && length == 1)
1978 1 : rust_error_at (get_current_location (),
1979 : "%<_%> is not a valid raw identifier");
1980 :
1981 81 : using namespace Rust::Values;
1982 81 : std::set<std::string> invalid{
1983 81 : Keywords::CRATE, Keywords::EXTERN_KW, Keywords::SELF,
1984 81 : Keywords::SUPER, Keywords::SELF_ALIAS,
1985 486 : };
1986 :
1987 81 : if (invalid.find (str) != invalid.end ())
1988 : {
1989 1 : rust_error_at (get_current_location (),
1990 : "%qs is a forbidden raw identifier", str.c_str ());
1991 :
1992 1 : return nullptr;
1993 : }
1994 : else
1995 : {
1996 80 : str.shrink_to_fit ();
1997 80 : loc += length - 1;
1998 :
1999 80 : return Token::make_identifier (loc, std::move (str));
2000 : }
2001 81 : }
2002 :
2003 : // skip broken string input (unterminated strings)
2004 : void
2005 0 : Lexer::skip_broken_string_input (Codepoint current_char)
2006 : {
2007 0 : while (current_char != '"' && !current_char.is_eof ())
2008 : {
2009 0 : if (current_char == '\n')
2010 : {
2011 0 : current_line++;
2012 0 : current_column = 1;
2013 : }
2014 : else
2015 : {
2016 0 : current_column++;
2017 : }
2018 0 : skip_input ();
2019 0 : current_char = peek_input ();
2020 : }
2021 0 : if (current_char == '"')
2022 : {
2023 0 : current_column++;
2024 :
2025 0 : skip_input ();
2026 0 : current_char = peek_input ();
2027 : }
2028 0 : rust_debug ("skipped to %d:%d due to bad quotes", current_line,
2029 : current_column);
2030 0 : }
2031 :
2032 : // Parses a string.
2033 : TokenPtr
2034 12424 : Lexer::parse_string (location_t loc)
2035 : {
2036 12424 : std::string str;
2037 12424 : str.reserve (16); // some sensible default
2038 :
2039 12424 : current_char = peek_input ();
2040 :
2041 12424 : const location_t string_begin_locus = get_current_location ();
2042 :
2043 : // FIXME: This fails if the input ends. How do we check for EOF?
2044 100866 : while (current_char.value != '"' && !current_char.is_eof ())
2045 : {
2046 76018 : if (current_char.value == '\\')
2047 : {
2048 2779 : int length = 1;
2049 :
2050 : // parse escape
2051 2779 : auto utf8_escape_pair = parse_utf8_escape ();
2052 2779 : current_char = std::get<0> (utf8_escape_pair);
2053 :
2054 2779 : if (current_char == Codepoint (0) && std::get<2> (utf8_escape_pair))
2055 28 : length = std::get<1> (utf8_escape_pair) - 1;
2056 : else
2057 2751 : length += std::get<1> (utf8_escape_pair);
2058 :
2059 2779 : if (current_char != Codepoint (0) || !std::get<2> (utf8_escape_pair))
2060 5502 : str += current_char.as_string ();
2061 :
2062 2779 : current_column += length;
2063 :
2064 : // FIXME: should remove this but can't.
2065 : // `parse_utf8_escape` does not update `current_char` correctly.
2066 2779 : current_char = peek_input ();
2067 2779 : continue;
2068 2779 : }
2069 :
2070 73239 : current_column++;
2071 73239 : if (current_char.value == '\n')
2072 : {
2073 67 : current_line++;
2074 67 : current_column = 1;
2075 : // tell line_table that new line starts
2076 67 : start_line (current_line, max_column_hint);
2077 : }
2078 :
2079 73239 : str += current_char;
2080 73239 : skip_input ();
2081 73239 : current_char = peek_input ();
2082 : }
2083 :
2084 12424 : if (current_char.value == '"')
2085 : {
2086 12410 : current_column++;
2087 :
2088 12410 : skip_input ();
2089 12410 : current_char = peek_input ();
2090 : }
2091 14 : else if (current_char.is_eof ())
2092 : {
2093 14 : rust_error_at (string_begin_locus, "unended string literal");
2094 14 : return Token::make (END_OF_FILE, get_current_location ());
2095 : }
2096 : else
2097 : {
2098 : rust_unreachable ();
2099 : }
2100 :
2101 12410 : str.shrink_to_fit ();
2102 :
2103 12410 : return Token::make_string (loc, std::move (str));
2104 12424 : }
2105 :
2106 : // Parses an identifier or keyword.
2107 : TokenPtr
2108 282362 : Lexer::parse_identifier_or_keyword (location_t loc)
2109 : {
2110 282362 : std::string str;
2111 282362 : str.reserve (16); // default
2112 564724 : str += current_char.as_string ();
2113 :
2114 282362 : bool first_is_underscore = current_char == '_';
2115 :
2116 282362 : int length = 1;
2117 282362 : current_char = peek_input ();
2118 :
2119 : // loop through entire name
2120 1427911 : while (is_identifier_continue (current_char.value))
2121 : {
2122 863187 : auto s = current_char.as_string ();
2123 863187 : length++;
2124 :
2125 1726374 : str += current_char.as_string ();
2126 863187 : skip_input ();
2127 863187 : current_char = peek_input ();
2128 863187 : }
2129 :
2130 282362 : current_column += length;
2131 :
2132 : // if just a single underscore, not an identifier
2133 282362 : if (first_is_underscore && length == 1)
2134 1292 : return Token::make (UNDERSCORE, loc);
2135 :
2136 281070 : str.shrink_to_fit ();
2137 :
2138 281070 : loc += length - 1;
2139 :
2140 281070 : TokenId keyword = classify_keyword (str);
2141 281070 : if (keyword == IDENTIFIER)
2142 187024 : return Token::make_identifier (loc, std::move (str));
2143 : else
2144 94046 : return Token::make (keyword, loc);
2145 282362 : }
2146 :
2147 : // Possibly returns a raw string token if it exists - otherwise returns null.
2148 : TokenPtr
2149 3914 : Lexer::maybe_parse_raw_string (location_t loc)
2150 : {
2151 3914 : int peek_index = 0;
2152 3923 : while (peek_input (peek_index) == '#')
2153 9 : peek_index++;
2154 :
2155 3914 : if (peek_input (peek_index) == '"')
2156 25 : return parse_raw_string (loc, peek_index);
2157 : else
2158 3889 : return nullptr;
2159 : }
2160 :
2161 : // Returns a raw string token.
2162 : TokenPtr
2163 25 : Lexer::parse_raw_string (location_t loc, int initial_hash_count)
2164 : {
2165 : // raw string literals
2166 25 : std::string str;
2167 25 : str.reserve (16); // some sensible default
2168 :
2169 25 : int length = 1 + initial_hash_count;
2170 25 : current_column += length;
2171 :
2172 25 : const location_t string_begin_locus = get_current_location ();
2173 :
2174 25 : if (initial_hash_count > 0)
2175 7 : skip_input (initial_hash_count - 1);
2176 :
2177 25 : current_char = peek_input ();
2178 :
2179 25 : if (current_char != '"')
2180 0 : rust_error_at (get_current_location (), "raw string has no opening %<\"%>");
2181 :
2182 25 : length++;
2183 25 : current_column++;
2184 25 : skip_input ();
2185 25 : current_char = peek_input ();
2186 :
2187 181 : while (true)
2188 : {
2189 103 : if (current_char.value == '"')
2190 : {
2191 38 : bool enough_hashes = true;
2192 :
2193 38 : for (int i = 0; i < initial_hash_count; i++)
2194 : {
2195 13 : if (peek_input (i + 1) != '#')
2196 : {
2197 : enough_hashes = false;
2198 : break;
2199 : }
2200 : }
2201 :
2202 28 : if (enough_hashes)
2203 : {
2204 : // skip enough input and peek enough input
2205 25 : skip_input (initial_hash_count);
2206 25 : current_char = peek_input ();
2207 25 : length += initial_hash_count + 1;
2208 25 : current_column += initial_hash_count + 1;
2209 25 : break;
2210 : }
2211 : }
2212 75 : else if (current_char.is_eof ())
2213 : {
2214 0 : rust_error_at (string_begin_locus, "unended raw string literal");
2215 0 : return Token::make (END_OF_FILE, get_current_location ());
2216 : }
2217 :
2218 78 : length++;
2219 78 : current_column++;
2220 78 : if (current_char == '\n')
2221 : {
2222 1 : current_line++;
2223 1 : current_column = 1;
2224 1 : start_line (current_line, max_column_hint);
2225 : }
2226 :
2227 156 : str += current_char.as_string ();
2228 78 : skip_input ();
2229 78 : current_char = peek_input ();
2230 78 : }
2231 :
2232 25 : loc += length - 1;
2233 :
2234 25 : str.shrink_to_fit ();
2235 :
2236 25 : return Token::make_raw_string (loc, std::move (str));
2237 25 : }
2238 :
2239 : template <typename IsDigitFunc>
2240 : TokenPtr
2241 216 : Lexer::parse_non_decimal_int_literal (location_t loc, IsDigitFunc is_digit_func,
2242 : std::string existent_str, int base)
2243 : {
2244 216 : int length = 1;
2245 :
2246 216 : skip_input ();
2247 216 : current_char = peek_input ();
2248 :
2249 216 : length++;
2250 :
2251 : // loop through to add entire number to string
2252 1869 : while (is_digit_func (current_char.value) || current_char == '_')
2253 : {
2254 1653 : if (current_char == '_')
2255 : {
2256 : // don't add _ to number
2257 21 : skip_input ();
2258 21 : current_char = peek_input ();
2259 :
2260 21 : length++;
2261 :
2262 21 : continue;
2263 : }
2264 :
2265 1632 : length++;
2266 :
2267 : // add raw numbers
2268 1632 : existent_str += current_char;
2269 1632 : skip_input ();
2270 1632 : current_char = peek_input ();
2271 : }
2272 :
2273 : // convert value to decimal representation
2274 216 : long dec_num = std::strtol (existent_str.c_str (), nullptr, base);
2275 :
2276 216 : existent_str = std::to_string (dec_num);
2277 :
2278 : // parse in type suffix if it exists
2279 216 : auto type_suffix_pair = parse_in_type_suffix ();
2280 216 : PrimitiveCoreType type_hint = type_suffix_pair.first;
2281 216 : length += type_suffix_pair.second;
2282 :
2283 216 : current_column += length;
2284 :
2285 216 : if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64)
2286 : {
2287 0 : rust_error_at (get_current_location (),
2288 : "invalid type suffix %qs for integer (%s) literal",
2289 : get_type_hint_string (type_hint),
2290 : base == 16
2291 : ? "hex"
2292 : : (base == 8 ? "octal"
2293 : : (base == 2 ? "binary"
2294 : : "<insert unknown base>")));
2295 0 : return nullptr;
2296 : }
2297 :
2298 216 : loc += length - 1;
2299 :
2300 216 : return Token::make_int (loc, std::move (existent_str), type_hint);
2301 : }
2302 :
2303 : // Parses a hex, binary or octal int literal.
2304 : TokenPtr
2305 216 : Lexer::parse_non_decimal_int_literals (location_t loc)
2306 : {
2307 216 : std::string str;
2308 216 : str.reserve (16); // some sensible default
2309 216 : str += current_char;
2310 :
2311 216 : current_char = peek_input ();
2312 :
2313 216 : if (current_char == 'x')
2314 : {
2315 : // hex (integer only)
2316 184 : return parse_non_decimal_int_literal (loc, is_x_digit, str + "x", 16);
2317 : }
2318 32 : else if (current_char == 'o')
2319 : {
2320 : // octal (integer only)
2321 32 : return parse_non_decimal_int_literal (loc, is_octal_digit,
2322 16 : std::move (str), 8);
2323 : }
2324 16 : else if (current_char == 'b')
2325 : {
2326 : // binary (integer only)
2327 32 : return parse_non_decimal_int_literal (loc, is_bin_digit, std::move (str),
2328 16 : 2);
2329 : }
2330 : else
2331 : {
2332 0 : return nullptr;
2333 : }
2334 216 : }
2335 :
2336 : // Parses a decimal-based int literal or float literal.
2337 : TokenPtr
2338 15451 : Lexer::parse_decimal_int_or_float (location_t loc)
2339 : {
2340 15451 : std::string str;
2341 15451 : str.reserve (16); // some sensible default
2342 15451 : str += current_char;
2343 :
2344 15451 : int length = 1;
2345 15451 : bool first_zero = current_char == '0';
2346 :
2347 15451 : current_char = peek_input ();
2348 :
2349 : // parse initial decimal integer (or first integer part of float) literal
2350 15451 : auto initial_decimal = parse_in_decimal ();
2351 15451 : str += std::get<0> (initial_decimal);
2352 15451 : length += std::get<1> (initial_decimal);
2353 :
2354 : // detect float literal
2355 : //
2356 : // Note:
2357 : //
2358 : // We should not use is_float_digit () for this verification but instead
2359 : // directly ISDIGIT because rust does not support non digit values right after
2360 : // a dot.
2361 : // The following value is not legal in rust:
2362 : // let a = 3.e1;
2363 : // A `0` should be put between the dot and the exponent to be valid
2364 : // (eg. 3.0e1).
2365 15451 : if (current_char == '.' && ISDIGIT (peek_input (1).value))
2366 : {
2367 : // float with a '.', parse another decimal into it
2368 :
2369 : // add . to str
2370 345 : str += current_char;
2371 345 : skip_input ();
2372 345 : current_char = peek_input ();
2373 345 : length++;
2374 :
2375 : // parse another decimal number for float
2376 345 : auto second_decimal = parse_in_decimal ();
2377 345 : str += std::get<0> (second_decimal);
2378 345 : length += std::get<1> (second_decimal);
2379 :
2380 : // parse in exponent part if it exists
2381 345 : auto exponent_pair = parse_in_exponent_part ();
2382 345 : str += exponent_pair.first;
2383 345 : length += exponent_pair.second;
2384 :
2385 : // parse in type suffix if it exists
2386 345 : auto type_suffix_pair = parse_in_type_suffix ();
2387 345 : PrimitiveCoreType type_hint = type_suffix_pair.first;
2388 345 : length += type_suffix_pair.second;
2389 :
2390 345 : if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
2391 345 : && type_hint != CORETYPE_UNKNOWN)
2392 : {
2393 0 : rust_error_at (get_current_location (),
2394 : "invalid type suffix %qs for floating-point literal",
2395 : get_type_hint_string (type_hint));
2396 : // ignore invalid type suffix as everything else seems fine
2397 0 : type_hint = CORETYPE_UNKNOWN;
2398 : }
2399 :
2400 345 : current_column += length;
2401 :
2402 345 : loc += length - 1;
2403 :
2404 345 : str.shrink_to_fit ();
2405 345 : return Token::make_float (loc, std::move (str), type_hint);
2406 345 : }
2407 15106 : else if (current_char == '.'
2408 15106 : && check_valid_float_dot_end (peek_input (1).value))
2409 : {
2410 : // float that is just an integer with a terminating '.' character
2411 :
2412 : // add . to str
2413 8 : str += current_char;
2414 8 : skip_input ();
2415 8 : current_char = peek_input ();
2416 8 : length++;
2417 :
2418 : // type hint not allowed
2419 :
2420 8 : current_column += length;
2421 :
2422 8 : loc += length - 1;
2423 :
2424 8 : str.shrink_to_fit ();
2425 8 : return Token::make_float (loc, std::move (str), CORETYPE_UNKNOWN);
2426 : }
2427 15098 : else if (current_char == 'E' || current_char == 'e')
2428 : {
2429 : // exponent float with no '.' character
2430 :
2431 : // parse exponent part
2432 0 : auto exponent_pair = parse_in_exponent_part ();
2433 0 : str += exponent_pair.first;
2434 0 : length += exponent_pair.second;
2435 :
2436 : // parse in type suffix if it exists
2437 0 : auto type_suffix_pair = parse_in_type_suffix ();
2438 0 : PrimitiveCoreType type_hint = type_suffix_pair.first;
2439 0 : length += type_suffix_pair.second;
2440 :
2441 0 : if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
2442 0 : && type_hint != CORETYPE_UNKNOWN)
2443 : {
2444 0 : rust_error_at (get_current_location (),
2445 : "invalid type suffix %qs for floating-point literal",
2446 : get_type_hint_string (type_hint));
2447 : // ignore invalid type suffix as everything else seems fine
2448 0 : type_hint = CORETYPE_UNKNOWN;
2449 : }
2450 :
2451 0 : current_column += length;
2452 :
2453 0 : loc += length - 1;
2454 :
2455 0 : str.shrink_to_fit ();
2456 0 : return Token::make_float (loc, std::move (str), type_hint);
2457 0 : }
2458 : else
2459 : {
2460 : // is an integer
2461 :
2462 : // parse in type suffix if it exists
2463 15098 : auto type_suffix_pair = parse_in_type_suffix ();
2464 15098 : PrimitiveCoreType type_hint = type_suffix_pair.first;
2465 : /* A "real" pure decimal doesn't have a suffix and no zero prefix. */
2466 15098 : if (type_hint == CORETYPE_UNKNOWN)
2467 : {
2468 13979 : bool pure_decimal = std::get<2> (initial_decimal);
2469 17151 : if (pure_decimal && (!first_zero || str.size () == 1))
2470 : type_hint = CORETYPE_PURE_DECIMAL;
2471 : }
2472 15098 : length += type_suffix_pair.second;
2473 :
2474 15098 : current_column += length;
2475 :
2476 15098 : loc += length - 1;
2477 :
2478 15098 : str.shrink_to_fit ();
2479 15098 : return Token::make_int (loc, std::move (str), type_hint);
2480 : }
2481 15451 : }
2482 :
2483 : TokenPtr
2484 843 : Lexer::parse_char_or_lifetime (location_t loc)
2485 : {
2486 843 : int length = 1;
2487 :
2488 843 : current_char = peek_input ();
2489 843 : if (current_char.is_eof ())
2490 0 : return nullptr;
2491 :
2492 : // parse escaped char literal
2493 843 : if (current_char.value == '\\')
2494 : {
2495 : // parse escape
2496 23 : auto utf8_escape_pair = parse_utf8_escape ();
2497 23 : Codepoint escaped_char = std::get<0> (utf8_escape_pair);
2498 23 : length += std::get<1> (utf8_escape_pair);
2499 :
2500 23 : if (peek_input ().value != '\'')
2501 : {
2502 0 : rust_error_at (get_current_location (), "unended character literal");
2503 : }
2504 : else
2505 : {
2506 23 : skip_input ();
2507 23 : current_char = peek_input ();
2508 23 : length++;
2509 : }
2510 :
2511 23 : current_column += length;
2512 :
2513 23 : loc += length - 1;
2514 :
2515 23 : return Token::make_char (loc, escaped_char);
2516 : }
2517 : else
2518 : {
2519 820 : skip_input ();
2520 :
2521 820 : if (peek_input ().value == '\'')
2522 : {
2523 : // parse non-escaped char literal
2524 203 : Codepoint non_escaped_char = current_char;
2525 :
2526 : // skip the ' character
2527 203 : skip_input ();
2528 203 : current_char = peek_input ();
2529 :
2530 : // TODO fix due to different widths of utf-8 chars?
2531 203 : current_column += 3;
2532 :
2533 203 : loc += 2;
2534 :
2535 203 : return Token::make_char (loc, non_escaped_char);
2536 : }
2537 617 : else if (is_identifier_start (current_char.value))
2538 : {
2539 : // parse lifetime name
2540 617 : std::string str;
2541 1234 : str += current_char.as_string ();
2542 617 : length++;
2543 :
2544 617 : current_char = peek_input ();
2545 1941 : while (is_identifier_continue (current_char.value))
2546 : {
2547 1414 : str += current_char.as_string ();
2548 707 : skip_input ();
2549 707 : current_char = peek_input ();
2550 707 : length++;
2551 : }
2552 :
2553 617 : current_column += length;
2554 :
2555 617 : loc += length - 1;
2556 :
2557 : // TODO some keywords cannot be used for a lifetime label #2306
2558 : // https://doc.rust-lang.org/reference/tokens.html
2559 :
2560 617 : str.shrink_to_fit ();
2561 617 : return Token::make_lifetime (loc, std::move (str));
2562 617 : }
2563 : else
2564 : {
2565 0 : rust_error_at (
2566 : get_current_location (),
2567 : "expected %' after character constant in character literal");
2568 0 : return nullptr;
2569 : }
2570 : }
2571 : }
2572 :
2573 : void
2574 100 : Lexer::split_current_token (TokenId new_left, TokenId new_right)
2575 : {
2576 : /* TODO: assert that this TokenId is a "simple token" like punctuation and not
2577 : * like "IDENTIFIER"? */
2578 100 : location_t current_loc = peek_token ()->get_locus ();
2579 100 : TokenPtr new_left_tok = Token::make (new_left, current_loc);
2580 100 : TokenPtr new_right_tok = Token::make (new_right, current_loc + 1);
2581 :
2582 100 : token_queue.replace_current_value (std::move (new_left_tok));
2583 100 : token_queue.insert (1, std::move (new_right_tok));
2584 100 : }
2585 :
2586 : void
2587 2 : Lexer::split_current_token (std::vector<TokenPtr> new_tokens)
2588 : {
2589 2 : rust_assert (new_tokens.size () > 0);
2590 4 : token_queue.replace_current_value (new_tokens[0]);
2591 :
2592 5 : for (size_t i = 1; i < new_tokens.size (); i++)
2593 : {
2594 6 : token_queue.insert (i, new_tokens[i]);
2595 : }
2596 2 : }
2597 :
2598 : void
2599 169123 : Lexer::start_line (int current_line, int current_column)
2600 : {
2601 169123 : if (line_map)
2602 169123 : linemap_line_start (line_table, current_line, current_column);
2603 169123 : }
2604 :
2605 : } // namespace Rust
2606 :
2607 : #if CHECKING_P
2608 :
2609 : namespace selftest {
2610 :
2611 : // Checks if `src` has the same contents as the given characters
2612 : static void
2613 6 : assert_source_content (Rust::InputSource &src,
2614 : const std::vector<uint32_t> &expected)
2615 : {
2616 6 : Rust::Codepoint src_char = src.next ();
2617 41 : for (auto expected_char : expected)
2618 : {
2619 : // Make sure that `src` is not shorter than `expected`
2620 35 : ASSERT_FALSE (src_char.is_eof ());
2621 : // Checks skipped character is expeceted one.
2622 35 : ASSERT_EQ (src_char.value, expected_char);
2623 35 : src_char = src.next ();
2624 : }
2625 : // Checks if `src` and `chars` has the same length.
2626 6 : ASSERT_TRUE (src_char.is_eof ());
2627 6 : }
2628 :
2629 : static void
2630 4 : test_buffer_input_source (std::string str,
2631 : const std::vector<uint32_t> &expected)
2632 : {
2633 4 : Rust::BufferInputSource source (str, 0);
2634 4 : assert_source_content (source, expected);
2635 4 : }
2636 :
2637 : static void
2638 2 : test_file_input_source (std::string str, const std::vector<uint32_t> &expected)
2639 : {
2640 2 : FILE *tmpf = tmpfile ();
2641 : // Moves to the first character
2642 2 : fputs (str.c_str (), tmpf);
2643 2 : std::rewind (tmpf);
2644 2 : Rust::FileInputSource source (tmpf);
2645 2 : assert_source_content (source, expected);
2646 2 : }
2647 :
2648 : void
2649 1 : rust_input_source_test ()
2650 : {
2651 : // ASCII
2652 1 : std::string src = (const char *) u8"_abcde\tXYZ\v\f";
2653 1 : std::vector<uint32_t> expected = {u'_', u'a', u'b', u'c', u'd', u'e',
2654 1 : u'\t', u'X', u'Y', u'Z', u'\v', u'\f'};
2655 2 : test_buffer_input_source (src, expected);
2656 :
2657 : // BOM
2658 1 : src = (const char *) u8"\xef\xbb\xbfOK";
2659 1 : expected = {u'O', u'K'};
2660 2 : test_buffer_input_source (src, expected);
2661 :
2662 : // Russian
2663 1 : src = (const char *) u8"приве́т";
2664 1 : expected = {u'п',
2665 : u'р',
2666 : u'и',
2667 : u'в',
2668 : 0x0435 /* CYRILLIC SMALL LETTER IE е */,
2669 : 0x301 /* COMBINING ACUTE ACCENT ́ */,
2670 1 : u'т'};
2671 2 : test_buffer_input_source (src, expected);
2672 :
2673 1 : src = (const char *) u8"❤️🦀";
2674 1 : expected = {0x2764 /* HEAVY BLACK HEART */,
2675 1 : 0xfe0f /* VARIATION SELECTOR-16 */, U'🦀'};
2676 2 : test_buffer_input_source (src, expected);
2677 :
2678 1 : src = (const char *) u8"こんにちは";
2679 1 : expected = {u'こ', u'ん', u'に', u'ち', u'は'};
2680 2 : test_file_input_source (src, expected);
2681 :
2682 1 : src = (const char *) u8"👮♂👩⚕";
2683 1 : expected
2684 : = {0x1f46e /* POLICE OFFICER */, 0x200d /* ZERO WIDTH JOINER */,
2685 : 0x2642 /* MALE SIGN */, 0x1f469 /* WOMAN */,
2686 1 : 0x200d /* ZERO WIDTH JOINER */, 0x2695 /* STAFF OF AESCULAPIUS */};
2687 2 : test_file_input_source (src, expected);
2688 1 : }
2689 :
2690 : } // namespace selftest
2691 :
2692 : #endif // CHECKING_P
|