Line data Source code
1 : // Copyright (C) 2020-2026 Free Software Foundation, Inc.
2 :
3 : // This file is part of GCC.
4 :
5 : // GCC is free software; you can redistribute it and/or modify it under
6 : // the terms of the GNU General Public License as published by the Free
7 : // Software Foundation; either version 3, or (at your option) any later
8 : // version.
9 :
10 : // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11 : // WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 : // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 : // for more details.
14 :
15 : // You should have received a copy of the GNU General Public License
16 : // along with GCC; see the file COPYING3. If not see
17 : // <http://www.gnu.org/licenses/>.
18 :
19 : #include "rust-codepoint.h"
20 : #include "rust-system.h"
21 : #include "rust-lex.h"
22 : #include "rust-diagnostics.h"
23 : #include "rust-linemap.h"
24 : #include "rust-edition.h"
25 : #include "safe-ctype.h"
26 : #include "cpplib.h"
27 : #include "rust-keyword-values.h"
28 :
29 : namespace Rust {
30 : // TODO: move to separate compilation unit?
31 : // overload += for uint32_t to allow 32-bit encoded utf-8 to be added
32 : std::string &
33 3275683 : operator+= (std::string &str, Codepoint char32)
34 : {
35 3275683 : if (char32.value < 0x80)
36 : {
37 3274670 : str += static_cast<char> (char32.value);
38 : }
39 1013 : else if (char32.value < (0x1F + 1) << (1 * 6))
40 : {
41 674 : str += static_cast<char> (0xC0 | ((char32.value >> 6) & 0x1F));
42 674 : str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
43 : }
44 339 : else if (char32.value < (0x0F + 1) << (2 * 6))
45 : {
46 329 : str += static_cast<char> (0xE0 | ((char32.value >> 12) & 0x0F));
47 329 : str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F));
48 329 : str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
49 : }
50 10 : else if (char32.value < (0x07 + 1) << (3 * 6))
51 : {
52 6 : str += static_cast<char> (0xF0 | ((char32.value >> 18) & 0x07));
53 6 : str += static_cast<char> (0x80 | ((char32.value >> 12) & 0x3F));
54 6 : str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F));
55 6 : str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
56 : }
57 : else
58 : {
59 4 : rust_debug ("Invalid unicode codepoint found: '%u' ", char32.value);
60 : }
61 3275683 : return str;
62 : }
63 :
64 : std::string
65 2986365 : Codepoint::as_string ()
66 : {
67 2986365 : std::string str;
68 :
69 : // str += Codepoint (value);
70 2986365 : str += *this;
71 :
72 2986365 : return str;
73 : }
74 :
/* Returns true if NUMBER may appear in the body of a float literal: an ASCII
   decimal digit or the exponent marker 'e' / 'E'.  '_' and '.' are
   deliberately excluded because they need lookahead to be handled.

   The digit test is an explicit range check on the full 32-bit value so that
   non-ASCII codepoints whose low byte happens to be an ASCII digit (for
   example U+2030) are not accepted.  */
bool
is_float_digit (uint32_t number)
{
  const bool is_ascii_digit = number >= '0' && number <= '9';
  return is_ascii_digit || number == 'E' || number == 'e';
}
82 :
/* Returns true if NUMBER is an ASCII hexadecimal digit (0-9, a-f, A-F).

   Written as explicit range checks on the full 32-bit value so that
   non-ASCII codepoints whose low byte happens to be a hex digit (for example
   U+0141) are not accepted.  May need revisiting if Rust ever allows other
   digit characters here.  */
bool
is_x_digit (uint32_t number)
{
  if (number >= '0' && number <= '9')
    return true;
  if (number >= 'a' && number <= 'f')
    return true;
  return number >= 'A' && number <= 'F';
}
90 :
/* Returns true if NUMBER is one of the octal digits '0'..'7'.  */
bool
is_octal_digit (uint32_t number)
{
  switch (number)
    {
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
      return true;
    default:
      return false;
    }
}
96 :
/* Returns true if NUMBER is a binary digit, i.e. '0' or '1'.  */
bool
is_bin_digit (uint32_t number)
{
  const bool is_zero = number == '0';
  const bool is_one = number == '1';
  return is_zero || is_one;
}
102 :
103 : bool
104 142 : check_valid_float_dot_end (uint32_t character)
105 : {
106 142 : return character != '.' && character != '_' && !ISALPHA (character);
107 : }
108 :
/* Returns true if CHARACTER is one of the whitespace characters listed in
   https://doc.rust-lang.org/reference/whitespace.html  */
bool
is_whitespace (uint32_t character)
{
  // '\t', '\n', '\v', '\f' and '\r' are the contiguous range 0x09..0x0d.
  if (character >= '\t' && character <= '\r')
    return true;

  if (character == ' ')
    return true;

  // next line
  if (character == 0x0085)
    return true;

  // left-to-right mark, right-to-left mark
  if (character == 0x200e || character == 0x200f)
    return true;

  // line separator, paragraph separator
  return character == 0x2028 || character == 0x2029;
}
131 :
/* Returns true if CHARACTER is the radix marker that follows the leading '0'
   of a non-decimal integer literal: 'x' (hex), 'o' (octal) or 'b'
   (binary).  */
bool
is_non_decimal_int_literal_separator (uint32_t character)
{
  switch (character)
    {
    case 'x':
    case 'o':
    case 'b':
      return true;
    default:
      return false;
    }
}
137 :
138 : bool
139 321359 : is_identifier_start (uint32_t codepoint)
140 : {
141 321359 : return (cpp_check_xid_property (codepoint) & CPP_XID_START)
142 321359 : || codepoint == '_';
143 : }
144 :
145 : bool
146 1182099 : is_identifier_continue (uint32_t codepoint)
147 : {
148 1182099 : return cpp_check_xid_property (codepoint) & CPP_XID_CONTINUE;
149 : }
150 :
// Constructs a lexer over the in-memory string INPUT instead of a file.
// The `input` member is initialised from RAIIFile::create_error () and the
// characters are read through a BufferInputSource.  Inside the initialiser
// list the name `input` passed to BufferInputSource refers to the constructor
// parameter (the string), which hides the member of the same name.
// Lexing starts at line 1, column 1.  Unlike the file-based constructor, this
// one never calls start_file on LINEMAP.
151 104 : Lexer::Lexer (const std::string &input, Linemap *linemap)
152 104 : : input (RAIIFile::create_error ()), current_line (1), current_column (1),
153 104 : line_map (linemap), dump_lex_out ({}),
154 104 : raw_input_source (new BufferInputSource (input, 0)),
155 104 : input_queue{*raw_input_source}, token_queue (TokenSource (this))
156 104 : {}
157 :
// Constructs a lexer that reads from FILE_INPUT, which is moved into the
// `input` member; the characters are read through a FileInputSource built
// from input.get_raw ().  FILENAME is only used to announce the file to the
// line map.  DUMP_LEX_OPT is stored in dump_lex_out; skip_token dumps tokens
// to it when it holds a value.  Lexing starts at line 1, column 1.
158 4888 : Lexer::Lexer (const char *filename, RAIIFile file_input, Linemap *linemap,
159 4888 : tl::optional<std::ofstream &> dump_lex_opt)
160 4888 : : input (std::move (file_input)), current_line (1), current_column (1),
161 4888 : line_map (linemap), dump_lex_out (dump_lex_opt),
162 9776 : raw_input_source (new FileInputSource (input.get_raw ())),
163 9776 : input_queue{*raw_input_source}, token_queue (TokenSource (this))
164 : {
165 : // inform line_table that file is being entered and is in line 1
// LINEMAP may be null, in which case no file is registered.
166 4888 : if (linemap)
167 4888 : line_map->start_file (filename, current_line);
168 4888 : }
169 :
170 4990 : Lexer::~Lexer ()
171 : {
172 : /* ok apparently stop (which is equivalent of original code in destructor) is
173 : * meant to be called after all files have finished parsing, for cleanup. On
174 : * the other hand, actual code that it calls to leave a certain line map is
175 : * mentioned in GCC docs as being useful for "just leaving an included header"
176 : * and stuff like that, so this line mapping functionality may need fixing.
177 : * FIXME: find out whether this occurs. */
178 :
179 : // line_map->stop();
180 4990 : }
181 :
182 : bool
183 4833 : Lexer::input_source_is_valid_utf8 ()
184 : {
185 4833 : return raw_input_source->is_valid ();
186 : }
187 :
188 : location_t
189 1899038 : Lexer::get_current_location ()
190 : {
191 1899038 : if (line_map)
192 1898869 : return linemap_position_for_column (line_table, current_column);
193 : else
194 : // If we have no linemap, we're lexing something without proper locations
195 : return UNDEF_LOCATION;
196 : }
197 :
198 : Codepoint
199 4257539 : Lexer::peek_input (int n)
200 : {
201 4257539 : return input_queue.peek (n);
202 : }
203 :
204 : Codepoint
205 4159223 : Lexer::peek_input ()
206 : {
207 4159223 : return peek_input (0);
208 : }
209 :
210 : void
211 3584366 : Lexer::skip_input (int n)
212 : {
213 3584366 : input_queue.skip (n);
214 3584366 : }
215 :
216 : void
217 3574072 : Lexer::skip_input ()
218 : {
219 3574072 : skip_input (0);
220 3574072 : }
221 :
222 : void
223 750466 : Lexer::skip_token (int n)
224 : {
225 : // dump tokens if dump-lex option is enabled
226 750466 : if (dump_lex_out.has_value ())
227 55 : dump_and_skip (n);
228 : else
229 750411 : token_queue.skip (n);
230 750466 : }
231 :
232 : void
233 55 : Lexer::dump_and_skip (int n)
234 : {
235 55 : std::ofstream &out = dump_lex_out.value ();
236 55 : bool found_eof = false;
237 55 : const_TokenPtr tok;
238 110 : for (int i = 0; i < n + 1; i++)
239 : {
240 55 : if (!found_eof)
241 : {
242 55 : tok = peek_token ();
243 55 : found_eof |= tok->get_id () == Rust::END_OF_FILE;
244 :
245 55 : location_t loc = tok->get_locus ();
246 :
247 55 : out << "<id=";
248 55 : out << tok->token_id_to_str ();
249 55 : out << (tok->should_have_str ()
250 167 : ? (std::string (", text=") + tok->get_str ()
251 167 : + std::string (", typehint=")
252 93 : + std::string (tok->get_type_hint_str ()))
253 91 : : "")
254 110 : << " ";
255 110 : out << Linemap::location_to_string (loc) << '\n';
256 : }
257 :
258 55 : token_queue.skip (0);
259 : }
260 55 : }
261 :
262 : void
263 0 : Lexer::replace_current_token (TokenPtr replacement)
264 : {
265 0 : token_queue.replace_current_value (replacement);
266 :
267 0 : rust_debug ("called 'replace_current_token' - this is deprecated");
268 0 : }
269 :
270 : /* Determines whether the string passed in is a keyword or not. If it is, it
271 : * returns the keyword name. */
272 : TokenId
273 288288 : Lexer::classify_keyword (const std::string &str)
274 : {
275 288288 : auto &keywords = Rust::Values::Keywords::keywords_tokens;
276 288288 : auto keyword = keywords.find (str);
277 :
278 288288 : if (keyword == keywords.end ())
279 : return IDENTIFIER;
280 :
281 96371 : auto id = keyword->second;
282 :
283 : // We now have the expected token ID of the reserved keyword. However, some
284 : // keywords are reserved starting in certain editions. For example, `try` is
285 : // only a reserved keyword in editions >=2018. The language might gain new
286 : // reserved keywords in the future.
287 : //
288 : // https://doc.rust-lang.org/reference/keywords.html#reserved-keywords
289 :
290 : // `try` is not a reserved keyword before 2018
291 96371 : if (get_rust_edition () == Edition::E2015 && id == TRY)
292 : return IDENTIFIER;
293 :
294 : return id;
295 : }
296 :
// Lexes and returns the next token from the input.
//
// Each iteration of the outer loop consumes one codepoint into current_char
// and dispatches on it.  Whitespace, line terminators and non-doc comments
// only update current_line / current_column and restart the loop; every other
// path returns a token, usually by delegating to one of the parse_* helpers.
// END_OF_FILE is returned once the input is exhausted.  A codepoint that
// matches nothing is reported with rust_error_at and skipped.
//
// For multi-character punctuation, loc is advanced by the token length minus
// one before the token is built.
297 : TokenPtr
298 758739 : Lexer::build_token ()
299 : {
300 : // loop to go through multiple characters to build a single token
301 1886107 : while (true)
302 : {
303 1886107 : location_t loc = get_current_location ();
304 :
305 1886107 : current_char = peek_input ();
306 1886107 : skip_input ();
307 :
308 : // detect shebang
309 : // Must be the first thing on the first line, starting with #!
310 : // But since an attribute can also start with an #! we don't count it as a
311 : // shebang line when after any whitespace or comments there is a [. If it
312 : // is a shebang line we simple drop the line. Otherwise we don't consume
313 : // any characters and fall through to the real tokenizer.
314 32809 : if (current_line == 1 && current_column == 1 && current_char == '#'
315 1918916 : && peek_input () == '!')
316 : {
// n is a pure lookahead offset: nothing is consumed while scanning past
// whitespace and non-doc comments for a possible '['.
317 : int n = 1;
318 3289 : while (true)
319 : {
320 3289 : Codepoint next_char = peek_input (n);
321 3289 : if (is_whitespace (next_char.value))
322 7 : n++;
323 3282 : else if ((next_char == '/' && peek_input (n + 1) == '/'
324 7 : && peek_input (n + 2) != '!'
325 7 : && peek_input (n + 2) != '/')
326 3303 : || (next_char == '/' && peek_input (n + 1) == '/'
327 0 : && peek_input (n + 2) == '/'
328 0 : && peek_input (n + 3) == '/'))
329 : {
330 : // two // or four ////
331 : // A single line comment
332 : // (but not an inner or outer doc comment)
333 7 : n += 2;
334 7 : next_char = peek_input (n);
335 119 : while (next_char != '\n' && !next_char.is_eof ())
336 : {
337 112 : n++;
338 112 : next_char = peek_input (n);
339 : }
340 7 : if (next_char == '\n')
341 7 : n++;
342 : }
343 3275 : else if (next_char == '/' && peek_input (n + 1) == '*'
344 0 : && peek_input (n + 2) == '*'
345 3275 : && peek_input (n + 3) == '/')
346 : {
347 : /**/
348 0 : n += 4;
349 : }
350 3275 : else if (next_char == '/' && peek_input (n + 1) == '*'
351 0 : && peek_input (n + 2) == '*' && peek_input (n + 3) == '*'
352 3275 : && peek_input (n + 4) == '/')
353 : {
354 : /***/
355 0 : n += 5;
356 : }
357 3275 : else if ((next_char == '/' && peek_input (n + 1) == '*'
358 0 : && peek_input (n + 2) != '*'
359 0 : && peek_input (n + 2) != '!')
360 3296 : || (next_char == '/' && peek_input (n + 1) == '*'
361 0 : && peek_input (n + 2) == '*'
362 0 : && peek_input (n + 3) == '*'))
363 : {
364 : // one /* or three /***
365 : // Start of a block comment
366 : // (but not an inner or outer doc comment)
367 0 : n += 2;
// level tracks the nesting depth of block comments; an unterminated
// comment simply ends the lookahead at EOF.
368 0 : int level = 1;
369 0 : while (level > 0)
370 : {
371 0 : if (peek_input (n).is_eof ())
372 : break;
373 0 : else if (peek_input (n) == '/'
374 0 : && peek_input (n + 1) == '*')
375 : {
376 0 : n += 2;
377 0 : level += 1;
378 : }
379 0 : else if (peek_input (n) == '*'
380 0 : && peek_input (n + 1) == '/')
381 : {
382 0 : n += 2;
383 0 : level -= 1;
384 : }
385 : else
386 0 : n++;
387 : }
388 : }
389 3275 : else if (next_char != '[')
390 : {
391 : // definitely shebang, ignore the first line
392 518 : while (current_char != '\n' && !current_char.is_eof ())
393 : {
394 490 : current_char = peek_input ();
395 490 : skip_input ();
396 : }
397 :
// NOTE(review): when the loop above stops on a newline, current_char is the
// already-consumed '\n', so after the break the '\n' case of the switch below
// runs as well and increments current_line a second time.  Confirm that this
// double increment is intended.
398 : // newline
399 28 : current_line++;
400 28 : current_column = 1;
401 : // tell line_table that new line starts
402 28 : start_line (current_line, max_column_hint);
403 28 : break;
404 : }
405 : else
406 : break; /* Definitely not a shebang line. */
407 : }
408 : }
409 :
410 : // return end of file token if end of file
411 1886107 : if (current_char.is_eof ())
412 5288 : return Token::make (END_OF_FILE, loc);
413 :
414 : // if not end of file, start tokenising
415 1880819 : switch (current_char.value)
416 : {
417 : /* ignore whitespace characters for tokens but continue updating
418 : * location */
419 166382 : case '\n': // newline
420 166382 : case 0x0085: // next line
421 166382 : case 0x2028: // line separator
422 166382 : case 0x2029: // paragraph separator
423 166382 : current_line++;
424 166382 : current_column = 1;
425 : // tell line_table that new line starts
426 166382 : start_line (current_line, max_column_hint);
427 166382 : continue;
428 252 : case '\r': // cr
429 : // Ignore, we expect a newline (lf) soon.
430 252 : continue;
431 950746 : case ' ': // space
432 950746 : current_column++;
433 950746 : continue;
434 113 : case '\t': // horizontal tab
435 : // width of a tab is not well-defined, assume 8 spaces
436 113 : current_column += 8;
437 113 : continue;
438 28 : case '\v': // vertical tab
439 28 : case 0x000c: // form feed
440 28 : case 0x200e: // left-to-right mark
441 28 : case 0x200f: // right-to-left mark
442 : // Ignored.
443 28 : continue;
444 :
445 : // punctuation - actual tokens
446 28197 : case '=':
447 28197 : if (peek_input () == '>')
448 : {
449 : // match arm arrow
450 3304 : skip_input ();
451 3304 : current_column += 2;
452 3304 : loc += 1;
453 :
454 3304 : return Token::make (MATCH_ARROW, loc);
455 : }
456 24893 : else if (peek_input () == '=')
457 : {
458 : // equality operator
459 667 : skip_input ();
460 667 : current_column += 2;
461 667 : loc += 1;
462 :
463 667 : return Token::make (EQUAL_EQUAL, loc);
464 : }
465 : else
466 : {
467 : // assignment operator
468 24226 : current_column++;
469 24226 : return Token::make (EQUAL, loc);
470 : }
471 47725 : case '(':
472 47725 : current_column++;
473 47725 : return Token::make (LEFT_PAREN, loc);
474 12102 : case '-':
475 12102 : if (peek_input () == '>')
476 : {
477 : // return type specifier
478 10658 : skip_input ();
479 10658 : current_column += 2;
480 10658 : loc += 1;
481 :
482 10658 : return Token::make (RETURN_TYPE, loc);
483 : }
484 1444 : else if (peek_input () == '=')
485 : {
486 : // minus-assign
487 105 : skip_input ();
488 105 : current_column += 2;
489 105 : loc += 1;
490 :
491 105 : return Token::make (MINUS_EQ, loc);
492 : }
493 : else
494 : {
495 : // minus
496 1339 : current_column++;
497 1339 : return Token::make (MINUS, loc);
498 : }
499 1750 : case '+':
500 1750 : if (peek_input () == '=')
501 : {
502 : // add-assign
503 155 : skip_input ();
504 155 : current_column += 2;
505 155 : loc += 1;
506 :
507 155 : return Token::make (PLUS_EQ, loc);
508 : }
509 : else
510 : {
511 : // add
512 1595 : current_column++;
513 1595 : return Token::make (PLUS, loc);
514 : }
515 47706 : case ')':
516 47706 : current_column++;
517 47706 : return Token::make (RIGHT_PAREN, loc);
518 30907 : case ';':
519 30907 : current_column++;
520 30907 : return Token::make (SEMICOLON, loc);
521 10845 : case '*':
522 10845 : if (peek_input () == '=')
523 : {
524 : // multiplication-assign
525 7 : skip_input ();
526 7 : current_column += 2;
527 7 : loc += 1;
528 :
529 7 : return Token::make (ASTERISK_EQ, loc);
530 : }
531 : else
532 : {
533 : // multiplication
534 10838 : current_column++;
535 10838 : return Token::make (ASTERISK, loc);
536 : }
537 22893 : case ',':
538 22893 : current_column++;
539 22893 : return Token::make (COMMA, loc);
// '/' starts one of: `/=`, a line comment, a line doc comment, an empty
// block comment, a block comment, a block doc comment, or plain division.
// The branches below are tried in exactly that order.
540 17990 : case '/':
541 17990 : if (peek_input () == '=')
542 : {
543 : // division-assign
544 7 : skip_input ();
545 7 : current_column += 2;
546 7 : loc += 1;
547 :
548 7 : return Token::make (DIV_EQ, loc);
549 : }
550 17983 : else if ((peek_input () == '/' && peek_input (1) != '!'
551 16868 : && peek_input (1) != '/')
552 25983 : || (peek_input () == '/' && peek_input (1) == '/'
553 7900 : && peek_input (2) == '/'))
554 : {
555 : // two // or four ////
556 : // single line comment
557 : // (but not an inner or outer doc comment)
558 8983 : skip_input ();
559 8983 : current_column += 2;
560 8983 : current_char = peek_input ();
561 :
562 : // basically ignore until line finishes
563 440128 : while (current_char != '\n' && !current_char.is_eof ())
564 : {
565 422162 : skip_input ();
566 422162 : current_column++; // not used
567 422162 : current_char = peek_input ();
568 : }
569 8983 : continue;
570 : }
571 9000 : else if (peek_input () == '/'
572 9000 : && (peek_input (1) == '!' || peek_input (1) == '/'))
573 : {
574 : /* single line doc comment, inner or outer. */
575 7985 : bool is_inner = peek_input (1) == '!';
576 7985 : skip_input (1);
577 7985 : current_column += 3;
578 :
579 7985 : std::string str;
580 7985 : str.reserve (32);
581 7985 : current_char = peek_input ();
// Collects the comment text up to, but not including, the line end.  A
// "\r\n" pair ends the comment like a bare '\n'; an isolated '\r' is reported
// and dropped.  Reaching EOF before a newline is reported as E0758.
582 192537 : while (current_char != '\n')
583 : {
584 176616 : skip_input ();
585 176616 : if (current_char == '\r')
586 : {
587 51 : Codepoint next_char = peek_input ();
588 51 : if (next_char == '\n')
589 : {
590 49 : current_char = '\n';
591 49 : break;
592 : }
593 2 : rust_error_at (
594 : loc, "Isolated CR %<\\r%> not allowed in doc comment");
595 2 : current_char = next_char;
596 2 : continue;
597 2 : }
598 176565 : if (current_char.is_eof ())
599 : {
600 0 : rust_error_at (
601 : loc, ErrorCode::E0758,
602 : "unexpected EOF while looking for end of comment");
603 0 : break;
604 : }
605 176565 : str += current_char;
606 176565 : current_char = peek_input ();
607 : }
608 7985 : skip_input ();
609 7985 : current_line++;
610 7985 : current_column = 1;
611 : // tell line_table that new line starts
612 7985 : start_line (current_line, max_column_hint);
613 :
614 7985 : str.shrink_to_fit ();
615 :
// NOTE(review): str.size () is unsigned, so for an empty doc comment
// `str.size () - 1` wraps around instead of yielding -1; confirm the resulting
// loc is what is wanted in that case.
616 7985 : loc += str.size () - 1;
617 7985 : if (is_inner)
618 100 : return Token::make_inner_doc_comment (loc, std::move (str));
619 : else
620 7885 : return Token::make_outer_doc_comment (loc, std::move (str));
621 7985 : }
622 1015 : else if (peek_input () == '*' && peek_input (1) == '*'
623 1100 : && peek_input (2) == '/')
624 : {
625 : /**/
626 14 : skip_input (2);
627 14 : current_column += 4;
628 14 : continue;
629 : }
630 1001 : else if (peek_input () == '*' && peek_input (1) == '*'
631 1072 : && peek_input (2) == '*' && peek_input (3) == '/')
632 : {
633 : /***/
634 14 : skip_input (3);
635 14 : current_column += 5;
636 14 : continue;
637 : }
638 987 : else if ((peek_input () == '*' && peek_input (1) != '!'
639 878 : && peek_input (1) != '*')
640 1117 : || (peek_input () == '*' && peek_input (1) == '*'
641 57 : && peek_input (2) == '*'))
642 : {
643 : // one /* or three /***
644 : // block comment
645 : // (but not an inner or outer doc comment)
646 835 : skip_input ();
647 835 : current_column += 2;
648 :
// Block comments nest: level counts the currently open `/*`.
649 835 : int level = 1;
650 37830 : while (level > 0)
651 : {
652 36996 : current_char = peek_input ();
653 :
654 36996 : if (current_char.is_eof ())
655 : {
656 1 : rust_error_at (
657 : loc, ErrorCode::E0758,
658 : "unexpected EOF while looking for end of comment");
659 1 : break;
660 : }
661 :
662 : // if /* found
663 36995 : if (current_char == '/' && peek_input (1) == '*')
664 : {
665 : // skip /* characters
666 49 : skip_input (1);
667 :
668 49 : current_column += 2;
669 :
670 49 : level += 1;
671 49 : continue;
672 : }
673 :
674 : // ignore until */ is found
675 36946 : if (current_char == '*' && peek_input (1) == '/')
676 : {
677 : // skip */ characters
678 883 : skip_input (1);
679 :
680 883 : current_column += 2;
681 :
682 883 : level -= 1;
683 883 : continue;
684 : }
685 :
686 36063 : if (current_char == '\n')
687 : {
688 414 : skip_input ();
689 414 : current_line++;
690 414 : current_column = 1;
691 : // tell line_table that new line starts
692 414 : start_line (current_line, max_column_hint);
693 414 : continue;
694 : }
695 :
696 35649 : skip_input ();
697 35649 : current_column++;
698 : }
699 :
700 : // refresh new token
701 835 : continue;
702 835 : }
703 152 : else if (peek_input () == '*'
704 152 : && (peek_input (1) == '!' || peek_input (1) == '*'))
705 : {
706 : // block doc comment, inner /*! or outer /**
707 116 : bool is_inner = peek_input (1) == '!';
708 116 : skip_input (1);
709 116 : current_column += 3;
710 :
711 116 : std::string str;
712 116 : str.reserve (96);
713 :
// Nested comment delimiters are kept in the collected text; only the final
// closing `*/` (the one that brings level to 0) is left out of str.
714 116 : int level = 1;
715 116 : while (level > 0)
716 : {
717 2685 : current_char = peek_input ();
718 :
719 2685 : if (current_char.is_eof ())
720 : {
721 0 : rust_error_at (
722 : loc, ErrorCode::E0758,
723 : "unexpected EOF while looking for end of comment");
724 0 : break;
725 : }
726 :
727 : // if /* found
728 2685 : if (current_char == '/' && peek_input (1) == '*')
729 : {
730 : // skip /* characters
731 84 : skip_input (1);
732 84 : current_column += 2;
733 :
734 84 : level += 1;
735 84 : str += "/*";
736 84 : continue;
737 : }
738 :
739 : // ignore until */ is found
740 2601 : if (current_char == '*' && peek_input (1) == '/')
741 : {
742 : // skip */ characters
743 200 : skip_input (1);
744 200 : current_column += 2;
745 :
746 200 : level -= 1;
747 200 : if (level > 0)
748 84 : str += "*/";
749 200 : continue;
750 : }
751 :
752 2401 : if (current_char == '\r' && peek_input (1) != '\n')
753 2 : rust_error_at (
754 : loc, "Isolated CR %<\\r%> not allowed in doc comment");
755 :
756 2401 : if (current_char == '\n')
757 : {
758 0 : skip_input ();
759 0 : current_line++;
760 0 : current_column = 1;
761 : // tell line_table that new line starts
762 0 : start_line (current_line, max_column_hint);
763 0 : str += '\n';
764 0 : continue;
765 : }
766 :
767 2401 : str += current_char;
768 2401 : skip_input ();
769 2401 : current_column++;
770 : }
771 :
772 116 : str.shrink_to_fit ();
773 :
// NOTE(review): as for line doc comments, `str.size () - 1` wraps around when
// str is empty.
774 116 : loc += str.size () - 1;
775 116 : if (is_inner)
776 73 : return Token::make_inner_doc_comment (loc, std::move (str));
777 : else
778 43 : return Token::make_outer_doc_comment (loc, std::move (str));
779 116 : }
780 : else
781 : {
782 : // division
783 36 : current_column++;
784 36 : return Token::make (DIV, loc);
785 : }
786 43 : case '%':
787 43 : if (peek_input () == '=')
788 : {
789 : // modulo-assign
790 7 : skip_input ();
791 7 : current_column += 2;
792 7 : loc += 1;
793 :
794 7 : return Token::make (PERCENT_EQ, loc);
795 : }
796 : else
797 : {
798 : // modulo
799 36 : current_column++;
800 36 : return Token::make (PERCENT, loc);
801 : }
802 147 : case '^':
803 147 : if (peek_input () == '=')
804 : {
805 : // xor-assign?
806 84 : skip_input ();
807 84 : current_column += 2;
808 84 : loc += 1;
809 :
810 84 : return Token::make (CARET_EQ, loc);
811 : }
812 : else
813 : {
814 : // xor?
815 63 : current_column++;
816 63 : return Token::make (CARET, loc);
817 : }
818 8717 : case '<':
819 8717 : if (peek_input () == '<')
820 : {
821 66 : if (peek_input (1) == '=')
822 : {
823 : // left-shift assign
824 7 : skip_input (1);
825 7 : current_column += 3;
826 7 : loc += 2;
827 :
828 7 : return Token::make (LEFT_SHIFT_EQ, loc);
829 : }
830 : else
831 : {
832 : // left-shift
833 59 : skip_input ();
834 59 : current_column += 2;
835 59 : loc += 1;
836 :
837 59 : return Token::make (LEFT_SHIFT, loc);
838 : }
839 : }
840 8651 : else if (peek_input () == '=')
841 : {
842 : // smaller than or equal to
843 224 : skip_input ();
844 224 : current_column += 2;
845 224 : loc += 1;
846 :
847 224 : return Token::make (LESS_OR_EQUAL, loc);
848 : }
849 : else
850 : {
851 : // smaller than
852 8427 : current_column++;
853 8427 : return Token::make (LEFT_ANGLE, loc);
854 : }
// NOTE(review): every branch of the '<' case above returns, so this break is
// never reached.
855 8570 : break;
856 8570 : case '>':
857 8570 : if (peek_input () == '>')
858 : {
859 129 : if (peek_input (1) == '=')
860 : {
861 : // right-shift-assign
862 7 : skip_input (1);
863 7 : current_column += 3;
864 7 : loc += 2;
865 :
866 7 : return Token::make (RIGHT_SHIFT_EQ, loc);
867 : }
868 : else
869 : {
870 : // right-shift
871 122 : skip_input ();
872 122 : current_column += 2;
873 122 : loc += 1;
874 :
875 122 : return Token::make (RIGHT_SHIFT, loc);
876 : }
877 : }
878 8441 : else if (peek_input () == '=')
879 : {
880 : // larger than or equal to
881 209 : skip_input ();
882 209 : current_column += 2;
883 209 : loc += 1;
884 :
885 209 : return Token::make (GREATER_OR_EQUAL, loc);
886 : }
887 : else
888 : {
889 : // larger than
890 8232 : current_column++;
891 8232 : return Token::make (RIGHT_ANGLE, loc);
892 : }
893 29008 : case ':':
894 29008 : if (peek_input () == ':')
895 : {
896 : // scope resolution ::
897 9846 : skip_input ();
898 9846 : current_column += 2;
899 9846 : loc += 1;
900 :
901 9846 : return Token::make (SCOPE_RESOLUTION, loc);
902 : }
903 : else
904 : {
905 : // single colon :
906 19162 : current_column++;
907 19162 : return Token::make (COLON, loc);
908 : }
909 16292 : case '!':
910 : // no special handling for macros in lexer?
911 16292 : if (peek_input () == '=')
912 : {
913 : // not equal boolean operator
914 941 : skip_input ();
915 941 : current_column += 2;
916 941 : loc += 1;
917 :
918 941 : return Token::make (NOT_EQUAL, loc);
919 : }
920 : else
921 : {
922 : // not equal unary operator
923 15351 : current_column++;
924 :
925 15351 : return Token::make (EXCLAM, loc);
926 : }
927 371 : case '?':
928 371 : current_column++;
929 371 : return Token::make (QUESTION_MARK, loc);
930 20749 : case '#':
931 20749 : current_column++;
932 20749 : return Token::make (HASH, loc);
933 22826 : case '[':
934 22826 : current_column++;
935 22826 : return Token::make (LEFT_SQUARE, loc);
936 22819 : case ']':
937 22819 : current_column++;
938 22819 : return Token::make (RIGHT_SQUARE, loc);
939 35986 : case '{':
940 35986 : current_column++;
941 35986 : return Token::make (LEFT_CURLY, loc);
942 35936 : case '}':
943 35936 : current_column++;
944 35936 : return Token::make (RIGHT_CURLY, loc);
945 19 : case '@':
946 19 : current_column++;
947 19 : return Token::make (PATTERN_BIND, loc);
948 3619 : case '$':
949 3619 : current_column++;
950 3619 : return Token::make (DOLLAR_SIGN, loc);
951 0 : case '~':
952 0 : current_column++;
953 0 : return Token::make (TILDE, loc);
954 0 : case '\\':
955 0 : current_column++;
956 0 : return Token::make (BACKSLASH, loc);
957 0 : case '`':
958 0 : current_column++;
959 0 : return Token::make (BACKTICK, loc);
960 475 : case '|':
961 475 : if (peek_input () == '=')
962 : {
963 : // bitwise or-assign?
964 28 : skip_input ();
965 28 : current_column += 2;
966 28 : loc += 1;
967 :
968 28 : return Token::make (PIPE_EQ, loc);
969 : }
970 447 : else if (peek_input () == '|')
971 : {
972 : // logical or
973 69 : skip_input ();
974 69 : current_column += 2;
975 69 : loc += 1;
976 :
977 69 : return Token::make (OR, loc);
978 : }
979 : else
980 : {
981 : // bitwise or
982 378 : current_column++;
983 :
984 378 : return Token::make (PIPE, loc);
985 : }
986 9962 : case '&':
987 9962 : if (peek_input () == '=')
988 : {
989 : // bitwise and-assign?
990 21 : skip_input ();
991 21 : current_column += 2;
992 21 : loc += 1;
993 :
994 21 : return Token::make (AMP_EQ, loc);
995 : }
996 9941 : else if (peek_input () == '&')
997 : {
998 : // logical and
999 306 : skip_input ();
1000 306 : current_column += 2;
1001 306 : loc += 1;
1002 :
1003 306 : return Token::make (LOGICAL_AND, loc);
1004 : }
1005 : else
1006 : {
1007 : // bitwise and/reference
1008 9635 : current_column++;
1009 :
1010 9635 : return Token::make (AMP, loc);
1011 : }
1012 6715 : case '.':
1013 6715 : if (peek_input () == '.')
1014 : {
1015 1170 : if (peek_input (1) == '.')
1016 : {
1017 : // ellipsis
1018 840 : skip_input (1);
1019 840 : current_column += 3;
1020 840 : loc += 2;
1021 :
1022 840 : return Token::make (ELLIPSIS, loc);
1023 : }
1024 330 : else if (peek_input (1) == '=')
1025 : {
1026 : // ..=
1027 38 : skip_input (1);
1028 38 : current_column += 3;
1029 38 : loc += 2;
1030 :
1031 38 : return Token::make (DOT_DOT_EQ, loc);
1032 : }
1033 : else
1034 : {
1035 : // ..
1036 292 : skip_input ();
1037 292 : current_column += 2;
1038 292 : loc += 1;
1039 :
1040 292 : return Token::make (DOT_DOT, loc);
1041 : }
1042 : }
1043 : else /*if (!ISDIGIT (peek_input ()))*/
1044 : {
1045 : // single dot .
1046 : // Only if followed by a non-number - otherwise is float
1047 : // nope, float cannot start with '.'.
1048 5545 : current_column++;
1049 5545 : return Token::make (DOT, loc);
1050 : }
1051 1117521 : }
1052 : // TODO: special handling of _ in the lexer? instead of being identifier
1053 :
// Everything below handles codepoints the switch did not claim: literal
// prefixes, identifiers/keywords, numbers, strings, chars and lifetimes.
1054 : // byte character, byte string and raw byte string literals
1055 320929 : if (current_char == 'b')
1056 : {
1057 10743 : if (peek_input () == '\'')
1058 78 : return parse_byte_char (loc);
1059 10665 : else if (peek_input () == '"')
1060 64 : return parse_byte_string (loc);
1061 10601 : else if (peek_input () == 'r'
1062 10601 : && (peek_input (1) == '#' || peek_input (1) == '"'))
1063 32 : return parse_raw_byte_string (loc);
1064 : }
1065 :
1066 : // raw identifiers and raw strings
1067 320755 : if (current_char == 'r')
1068 : {
1069 4042 : Codepoint peek = peek_input ();
1070 4042 : Codepoint peek1 = peek_input (1);
1071 :
1072 : // TODO (tamaron) parse Unicode ident
1073 4042 : if (peek == '#' && is_identifier_start (peek1.value))
1074 : {
1075 81 : TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
1076 81 : if (raw_ident_ptr != nullptr)
1077 80 : return raw_ident_ptr;
1078 : else
1079 1 : continue; /* input got parsed, it just wasn't valid. An error
1080 : was produced. */
1081 81 : }
1082 : else
1083 : {
// A null result falls through to the identifier handling below, so a plain
// `r...` identifier is still lexed.
1084 3961 : TokenPtr maybe_raw_string_ptr = maybe_parse_raw_string (loc);
1085 3961 : if (maybe_raw_string_ptr != nullptr)
1086 25 : return maybe_raw_string_ptr;
1087 3961 : }
1088 : }
1089 :
1090 : // find identifiers and keywords.
1091 320649 : if (is_identifier_start (current_char.value))
1092 289604 : return parse_identifier_or_keyword (loc);
1093 :
// NOTE(review): ISDIGIT comes from safe-ctype.h, which (as far as I recall)
// indexes its table with `c & 0xff`.  If so, a non-ASCII codepoint whose low
// byte is an ASCII digit, e.g. U+2030, would pass this test - TODO confirm
// and guard with an ASCII range check if needed.
1094 : // int and float literals
1095 31045 : if (ISDIGIT (current_char.value))
1096 : { // _ not allowed as first char
1097 17452 : if (current_char == '0'
1098 17452 : && is_non_decimal_int_literal_separator (peek_input ().value))
1099 : {
1100 : // handle binary, octal, hex literals
1101 335 : TokenPtr non_dec_int_lit_ptr
1102 335 : = parse_non_decimal_int_literals (loc);
1103 335 : if (non_dec_int_lit_ptr != nullptr)
1104 335 : return non_dec_int_lit_ptr;
1105 335 : }
1106 : else
1107 : {
1108 : // handle decimals (integer or float)
1109 17117 : TokenPtr decimal_or_float_ptr = parse_decimal_int_or_float (loc);
1110 17117 : if (decimal_or_float_ptr != nullptr)
1111 17117 : return decimal_or_float_ptr;
1112 17117 : }
1113 : }
1114 :
1115 : // string literals
1116 13593 : if (current_char == '"')
1117 12745 : return parse_string (loc);
1118 :
1119 : // char literals and lifetime names
1120 848 : if (current_char == '\'')
1121 : {
1122 848 : TokenPtr char_or_lifetime_ptr = parse_char_or_lifetime (loc);
1123 848 : if (char_or_lifetime_ptr != nullptr)
1124 848 : return char_or_lifetime_ptr;
1125 848 : }
1126 :
// NOTE(review): 0x5d is the ASCII code of ']', so the last branch of this
// debug chain repeats the previous test and can never be taken.
1127 : // DEBUG: check for specific character problems:
1128 0 : if (current_char == '0')
1129 0 : rust_debug ("'0' uncaught before unexpected character");
1130 0 : else if (current_char == ']')
1131 0 : rust_debug ("']' uncaught before unexpected character");
1132 : else if (current_char == 0x5d)
1133 : rust_debug ("whatever 0x5d is (not '0' or ']') uncaught before "
1134 : "unexpected character");
1135 :
// The offending codepoint has already been consumed at the top of the loop,
// so after reporting it the loop simply moves on to the next one.
1136 : // didn't match anything so error
1137 0 : rust_error_at (loc, "unexpected character %<%x%>", current_char.value);
1138 0 : current_column++;
1139 : }
1140 : }
1141 :
1142 : // Parses in a suffix
1143 : std::pair<std::string, int>
1144 17444 : Lexer::parse_in_suffix ()
1145 : {
1146 17444 : std::string suffix;
1147 :
1148 17444 : int additional_length_offset = 0;
1149 :
1150 : // get suffix
1151 38430 : while (ISALPHA (current_char.value) || ISDIGIT (current_char.value)
1152 40525 : || current_char == '_')
1153 : {
1154 5636 : additional_length_offset++;
1155 :
1156 5636 : suffix += current_char;
1157 5636 : skip_input ();
1158 5636 : current_char = peek_input ();
1159 : }
1160 :
1161 17444 : return std::make_pair (std::move (suffix), additional_length_offset);
1162 17444 : }
1163 :
1164 : // Parses in the exponent part (if any) of a float literal.
1165 : std::pair<std::string, int>
1166 349 : Lexer::parse_in_exponent_part ()
1167 : {
1168 349 : int additional_length_offset = 0;
1169 349 : std::string str;
1170 349 : if (current_char == 'E' || current_char == 'e')
1171 : {
1172 : // add exponent to string as strtod works with it
1173 8 : str += current_char;
1174 8 : skip_input ();
1175 8 : current_char = peek_input ();
1176 :
1177 8 : additional_length_offset++;
1178 :
1179 : // special - and + handling
1180 8 : if (current_char == '-' || current_char == '+')
1181 : {
1182 8 : str += current_char;
1183 :
1184 8 : skip_input ();
1185 8 : current_char = peek_input ();
1186 :
1187 8 : additional_length_offset++;
1188 : }
1189 :
1190 : // parse another decimal number for exponent
1191 8 : auto str_length = parse_in_decimal ();
1192 8 : str += std::get<0> (str_length);
1193 8 : additional_length_offset += std::get<1> (str_length);
1194 8 : }
1195 698 : return std::make_pair (str, additional_length_offset);
1196 349 : }
1197 :
1198 : // Parses a decimal integer.
1199 : std::tuple<std::string, int, bool>
1200 17474 : Lexer::parse_in_decimal ()
1201 : {
1202 : /* A pure decimal contains only digits. */
1203 17474 : bool pure_decimal = true;
1204 17474 : int additional_length_offset = 0;
1205 17474 : std::string str;
1206 24932 : while (ISDIGIT (current_char.value) || current_char.value == '_')
1207 : {
1208 7458 : if (current_char == '_')
1209 : {
1210 14 : pure_decimal = false;
1211 : }
1212 7458 : additional_length_offset++;
1213 :
1214 7458 : str += current_char;
1215 7458 : skip_input ();
1216 7458 : current_char = peek_input ();
1217 : }
1218 34948 : return std::make_tuple (str, additional_length_offset, pure_decimal);
1219 17474 : }
1220 :
1221 : /* Parses escapes (and string continues) in "byte" strings and characters. Does
1222 : * not support unicode. */
1223 : std::tuple<char, int, bool>
1224 61 : Lexer::parse_escape (char opening_char)
1225 : {
1226 61 : int additional_length_offset = 0;
1227 61 : char output_char = 0;
1228 :
1229 : // skip to actual letter
1230 61 : skip_input ();
1231 61 : current_char = peek_input ();
1232 61 : additional_length_offset++;
1233 :
1234 61 : switch (current_char.value)
1235 : {
1236 17 : case 'x':
1237 17 : {
1238 17 : auto hex_escape_pair = parse_partial_hex_escape ();
1239 17 : long hexLong = hex_escape_pair.first;
1240 17 : additional_length_offset += hex_escape_pair.second;
1241 :
1242 17 : if (hexLong > 255 || hexLong < 0)
1243 0 : rust_error_at (
1244 : get_current_location (),
1245 : "byte \\x escape %<\\x%x%> out of range - allows up to %<\\xFF%>",
1246 : static_cast<unsigned int> (hexLong));
1247 : /* TODO: restore capital for escape output - gcc pretty-printer doesn't
1248 : * support %X directly */
1249 17 : char hexChar = static_cast<char> (hexLong);
1250 :
1251 17 : output_char = hexChar;
1252 : }
1253 17 : break;
1254 : case 'n':
1255 : output_char = '\n';
1256 : break;
1257 0 : case 'r':
1258 0 : output_char = '\r';
1259 0 : break;
1260 1 : case 't':
1261 1 : output_char = '\t';
1262 1 : break;
1263 8 : case '\\':
1264 8 : output_char = '\\';
1265 8 : break;
1266 9 : case '0':
1267 9 : output_char = '\0';
1268 9 : break;
1269 15 : case '\'':
1270 15 : output_char = '\'';
1271 15 : break;
1272 1 : case '"':
1273 1 : output_char = '"';
1274 1 : break;
1275 2 : case 'u':
1276 3 : rust_error_at (get_current_location (),
1277 : "cannot have a unicode escape \\u in a byte %s",
1278 : opening_char == '\'' ? "character" : "string");
1279 : // Try to parse it anyway, just to skip it
1280 2 : parse_partial_unicode_escape ();
1281 2 : return std::make_tuple (output_char, additional_length_offset, false);
1282 0 : case '\r':
1283 0 : case '\n':
1284 : // string continue
1285 0 : return std::make_tuple (0, parse_partial_string_continue (), true);
1286 1 : default:
1287 1 : rust_error_at (get_current_location (),
1288 : "unknown escape sequence %<\\%s%>",
1289 1 : current_char.as_string ().c_str ());
1290 : // returns false if no parsing could be done
1291 : // return false;
1292 1 : return std::make_tuple (output_char, additional_length_offset, false);
1293 58 : break;
1294 : }
1295 : // all non-special cases (string continue) should skip their used char
1296 58 : skip_input ();
1297 58 : current_char = peek_input ();
1298 58 : additional_length_offset++;
1299 :
1300 : // returns true if parsing was successful
1301 : // return true;
1302 58 : return std::make_tuple (output_char, additional_length_offset, false);
1303 : }
1304 :
1305 : /* Parses an escape (or string continue) in a string or character. Supports
1306 : * unicode escapes. */
1307 : std::tuple<Codepoint, int, bool>
1308 2806 : Lexer::parse_utf8_escape ()
1309 : {
1310 2806 : Codepoint output_char;
1311 2806 : int additional_length_offset = 0;
1312 :
1313 : // skip to actual letter
1314 2806 : skip_input ();
1315 2806 : current_char = peek_input ();
1316 2806 : additional_length_offset++;
1317 :
1318 2806 : switch (current_char.value)
1319 : {
1320 17 : case 'x':
1321 17 : {
1322 17 : auto hex_escape_pair = parse_partial_hex_escape ();
1323 17 : long hexLong = hex_escape_pair.first;
1324 17 : additional_length_offset += hex_escape_pair.second;
1325 :
1326 17 : if (hexLong > 127 || hexLong < 0)
1327 4 : rust_error_at (
1328 : get_current_location (),
1329 : "ascii \\x escape %<\\x%x%> out of range - allows up to %<\\x7F%>",
1330 : static_cast<unsigned int> (hexLong));
1331 : /* TODO: restore capital for escape output - gcc pretty-printer doesn't
1332 : * support %X directly */
1333 17 : char hexChar = static_cast<char> (hexLong);
1334 :
1335 17 : output_char = hexChar;
1336 : }
1337 17 : break;
1338 : case 'n':
1339 : output_char = '\n';
1340 : break;
1341 0 : case 'r':
1342 0 : output_char = '\r';
1343 0 : break;
1344 2 : case 't':
1345 2 : output_char = '\t';
1346 2 : break;
1347 1 : case '\\':
1348 1 : output_char = '\\';
1349 1 : break;
1350 1406 : case '0':
1351 1406 : output_char = '\0';
1352 1406 : break;
1353 1 : case '\'':
1354 1 : output_char = '\'';
1355 1 : break;
1356 1 : case '"':
1357 1 : output_char = '"';
1358 1 : break;
1359 46 : case 'u':
1360 46 : {
1361 46 : auto unicode_escape_pair = parse_partial_unicode_escape ();
1362 46 : output_char = unicode_escape_pair.first;
1363 46 : additional_length_offset += unicode_escape_pair.second;
1364 :
1365 46 : return std::make_tuple (output_char, additional_length_offset, false);
1366 : }
1367 28 : break;
1368 28 : case '\r':
1369 28 : case '\n':
1370 : // string continue
1371 28 : return std::make_tuple (0, parse_partial_string_continue (), true);
1372 1 : default:
1373 1 : rust_error_at (get_current_location (),
1374 : "unknown escape sequence %<\\%s%>",
1375 1 : current_char.as_string ().c_str ());
1376 : // returns false if no parsing could be done
1377 : // return false;
1378 1 : return std::make_tuple (output_char, additional_length_offset, false);
1379 2731 : break;
1380 : }
1381 : /* all non-special cases (unicode, string continue) should skip their used
1382 : * char */
1383 2731 : skip_input ();
1384 2731 : current_char = peek_input ();
1385 2731 : additional_length_offset++;
1386 :
1387 : // returns true if parsing was successful
1388 : // return true;
1389 2731 : return std::make_tuple (output_char, additional_length_offset, false);
1390 : }
1391 :
1392 : // Parses the body of a string continue that has been found in an escape.
1393 : int
1394 28 : Lexer::parse_partial_string_continue ()
1395 : {
1396 28 : int additional_length_offset = 1;
1397 :
1398 : // string continue
1399 : // TODO use utf-8 codepoint to skip whitespaces
1400 364 : while (is_whitespace (current_char.value))
1401 : {
1402 336 : if (current_char == '\n')
1403 : {
1404 28 : current_line++;
1405 28 : current_column = 1;
1406 : // tell line_table that new line starts
1407 28 : start_line (current_line, max_column_hint);
1408 :
1409 : // reset "length"
1410 28 : additional_length_offset = 1;
1411 :
1412 : // get next char
1413 28 : skip_input ();
1414 28 : current_char = peek_input ();
1415 :
1416 28 : continue;
1417 : }
1418 :
1419 308 : skip_input ();
1420 308 : current_char = peek_input ();
1421 308 : additional_length_offset++;
1422 : }
1423 :
1424 28 : return additional_length_offset;
1425 : }
1426 :
1427 : /* Parses the body of a '\x' escape. Note that it does not check that the number
1428 : * is valid and smaller than 255. */
1429 : std::pair<long, int>
1430 34 : Lexer::parse_partial_hex_escape ()
1431 : {
1432 : // hex char string (null-terminated)
1433 34 : char hexNum[3] = {0, 0, 0};
1434 :
1435 : // first hex char
1436 34 : current_char = peek_input (1);
1437 34 : int additional_length_offset = 1;
1438 :
1439 34 : if (!is_x_digit (current_char.value))
1440 : {
1441 4 : rust_error_at (get_current_location (),
1442 : "invalid character %<\\x%s%> in \\x sequence",
1443 4 : current_char.as_string ().c_str ());
1444 4 : return std::make_pair (0, 0);
1445 : }
1446 30 : hexNum[0] = current_char.value;
1447 :
1448 : // second hex char
1449 30 : skip_input ();
1450 30 : current_char = peek_input (1);
1451 30 : additional_length_offset++;
1452 :
1453 30 : if (!is_x_digit (current_char.value))
1454 : {
1455 2 : rust_error_at (get_current_location (),
1456 2 : "invalid character %<\\x%c%s%> in \\x sequence", hexNum[0],
1457 2 : current_char.as_string ().c_str ());
1458 2 : return std::make_pair (0, 1);
1459 : }
1460 28 : skip_input ();
1461 28 : hexNum[1] = current_char.value;
1462 :
1463 28 : long hexLong = std::strtol (hexNum, nullptr, 16);
1464 :
1465 28 : return std::make_pair (hexLong, additional_length_offset);
1466 : }
1467 :
1468 : // Parses the body of a unicode escape.
1469 : std::pair<Codepoint, int>
1470 48 : Lexer::parse_partial_unicode_escape ()
1471 : {
1472 48 : skip_input ();
1473 48 : current_char = peek_input ();
1474 48 : int additional_length_offset = 0;
1475 :
1476 48 : if (current_char != '{')
1477 : {
1478 2 : rust_error_at (get_current_location (),
1479 : "unicode escape should start with %<{%>");
1480 : /* Skip what should probaby have been between brackets. */
1481 10 : while (is_x_digit (current_char.value) || current_char == '_')
1482 : {
1483 6 : skip_input ();
1484 6 : current_char = peek_input ();
1485 6 : additional_length_offset++;
1486 : }
1487 2 : return std::make_pair (Codepoint (0), additional_length_offset);
1488 : }
1489 :
1490 46 : skip_input ();
1491 46 : current_char = peek_input ();
1492 46 : additional_length_offset++;
1493 :
1494 46 : if (current_char == '_')
1495 : {
1496 2 : rust_error_at (get_current_location (),
1497 : "unicode escape cannot start with %<_%>");
1498 2 : skip_input ();
1499 2 : current_char = peek_input ();
1500 2 : additional_length_offset++;
1501 : // fallthrough and try to parse the rest anyway
1502 : }
1503 :
1504 : // parse unicode escape - 1-6 hex digits
1505 46 : std::string num_str;
1506 46 : num_str.reserve (6);
1507 :
1508 : // loop through to add entire hex number to string
1509 304 : while (is_x_digit (current_char.value) || current_char.value == '_')
1510 : {
1511 212 : if (current_char == '_')
1512 : {
1513 : // don't add _ to number
1514 24 : skip_input ();
1515 24 : current_char = peek_input ();
1516 :
1517 24 : additional_length_offset++;
1518 :
1519 24 : continue;
1520 : }
1521 :
1522 188 : additional_length_offset++;
1523 :
1524 : // add raw hex numbers
1525 188 : num_str += current_char;
1526 :
1527 188 : skip_input ();
1528 188 : current_char = peek_input ();
1529 : }
1530 :
1531 46 : if (current_char == '}')
1532 : {
1533 44 : skip_input ();
1534 44 : current_char = peek_input ();
1535 44 : additional_length_offset++;
1536 : }
1537 : else
1538 : {
1539 : // actually an error, but allow propagation anyway Assume that
1540 : // wrong bracketm whitespace or single/double quotes are wrong
1541 : // termination, otherwise it is a wrong character, then skip to the actual
1542 : // terminator.
1543 : // TODO use utf-8 codepoint to skip whitespaces
1544 2 : if (current_char == '{' || is_whitespace (current_char.value)
1545 4 : || current_char == '\'' || current_char == '"')
1546 : {
1547 0 : rust_error_at (get_current_location (),
1548 : "expected terminating %<}%> in unicode escape");
1549 0 : return std::make_pair (Codepoint (0), additional_length_offset);
1550 : }
1551 : else
1552 : {
1553 2 : rust_error_at (get_current_location (),
1554 : "invalid character %qs in unicode escape",
1555 2 : current_char.as_string ().c_str ());
1556 : // TODO use utf-8 codepoint to skip whitespaces
1557 8 : while (current_char != '}' && current_char != '{'
1558 6 : && !is_whitespace (current_char.value) && current_char != '\''
1559 14 : && current_char != '"')
1560 : {
1561 6 : skip_input ();
1562 6 : current_char = peek_input ();
1563 6 : additional_length_offset++;
1564 : }
1565 : // Consume the actual closing bracket if found
1566 2 : if (current_char == '}')
1567 : {
1568 2 : skip_input ();
1569 2 : current_char = peek_input ();
1570 2 : additional_length_offset++;
1571 : }
1572 2 : return std::make_pair (Codepoint (0), additional_length_offset);
1573 : }
1574 : }
1575 :
1576 : // ensure 1-6 hex characters
1577 44 : if (num_str.length () > 6 || num_str.length () < 1)
1578 : {
1579 4 : rust_error_at (get_current_location (),
1580 : "unicode escape should be between 1 and 6 hex "
1581 : "characters; it is %lu",
1582 4 : (unsigned long) num_str.length ());
1583 : // return false;
1584 4 : return std::make_pair (Codepoint (0), additional_length_offset);
1585 : }
1586 :
1587 40 : unsigned long hex_num = std::strtoul (num_str.c_str (), nullptr, 16);
1588 :
1589 40 : if (hex_num > 0xd7ff && hex_num < 0xe000)
1590 : {
1591 4 : rust_error_at (
1592 : get_current_location (),
1593 : "unicode escape cannot be a surrogate value (D800 to DFFF)");
1594 4 : return std::make_pair (Codepoint (0), additional_length_offset);
1595 : }
1596 :
1597 36 : if (hex_num > 0x10ffff)
1598 : {
1599 4 : rust_error_at (get_current_location (),
1600 : "unicode escape cannot be larger than 10FFFF");
1601 4 : return std::make_pair (Codepoint (0), additional_length_offset);
1602 : }
1603 :
1604 : // return true;
1605 32 : return std::make_pair (Codepoint (static_cast<uint32_t> (hex_num)),
1606 : additional_length_offset);
1607 46 : }
1608 :
1609 : // Parses a byte character.
1610 : TokenPtr
1611 78 : Lexer::parse_byte_char (location_t loc)
1612 : {
1613 78 : skip_input ();
1614 78 : current_column++;
1615 : // make current char the next character
1616 78 : current_char = peek_input ();
1617 :
1618 78 : int length = 1;
1619 :
1620 : // char to save
1621 78 : Codepoint byte_char = 0;
1622 :
1623 : // detect escapes
1624 78 : if (current_char == '\\')
1625 : {
1626 30 : auto escape_length_pair = parse_escape ('\'');
1627 30 : byte_char = std::get<0> (escape_length_pair);
1628 30 : length += std::get<1> (escape_length_pair);
1629 :
1630 30 : current_char = peek_input ();
1631 :
1632 30 : if (current_char != '\'')
1633 : {
1634 0 : rust_error_at (get_current_location (), "unclosed %<byte char%>");
1635 : }
1636 :
1637 30 : skip_input ();
1638 30 : current_char = peek_input ();
1639 30 : length++; // go to next char
1640 : }
1641 48 : else if (current_char != '\'')
1642 : {
1643 : // otherwise, get character from direct input character
1644 48 : byte_char = current_char;
1645 :
1646 48 : if (!byte_char.is_ascii ())
1647 : {
1648 2 : rust_error_at (get_current_location (),
1649 : "non-ASCII character in %<byte char%>");
1650 : }
1651 :
1652 48 : skip_input ();
1653 48 : current_char = peek_input ();
1654 48 : length++;
1655 :
1656 48 : if (current_char != '\'')
1657 : {
1658 0 : rust_error_at (get_current_location (), "unclosed %<byte char%>");
1659 : }
1660 :
1661 48 : skip_input ();
1662 48 : current_char = peek_input ();
1663 48 : length++; // go to next char
1664 : }
1665 : else
1666 : {
1667 0 : rust_error_at (get_current_location (),
1668 : "no character inside %<%> for %<byte char%>");
1669 : }
1670 :
1671 78 : current_column += length;
1672 :
1673 78 : loc += length - 1;
1674 78 : return Token::make_byte_char (loc, byte_char.value);
1675 : }
1676 :
1677 : // Parses a byte string.
1678 : TokenPtr
1679 64 : Lexer::parse_byte_string (location_t loc)
1680 : {
1681 : // byte string
1682 :
1683 : // skip quote character
1684 64 : skip_input ();
1685 64 : current_column++;
1686 :
1687 64 : std::string str;
1688 64 : str.reserve (16); // some sensible default
1689 :
1690 64 : current_char = peek_input ();
1691 :
1692 64 : const location_t string_begin_locus = get_current_location ();
1693 :
1694 438 : while (current_char != '"' && !current_char.is_eof ())
1695 : {
1696 310 : if (current_char == '\\')
1697 : {
1698 31 : int length = 1;
1699 31 : auto escape_length_pair = parse_escape ('"');
1700 31 : char output_char = std::get<0> (escape_length_pair);
1701 :
1702 31 : if (output_char == 0 && std::get<2> (escape_length_pair))
1703 0 : length = std::get<1> (escape_length_pair) - 1;
1704 : else
1705 31 : length += std::get<1> (escape_length_pair);
1706 :
1707 31 : if (output_char != 0 || !std::get<2> (escape_length_pair))
1708 31 : str += output_char;
1709 :
1710 31 : current_column += length;
1711 :
1712 31 : continue;
1713 31 : }
1714 :
1715 279 : current_column++;
1716 279 : if (current_char.value == '\n')
1717 : {
1718 23 : current_line++;
1719 23 : current_column = 1;
1720 : // tell line_table that new line starts
1721 23 : start_line (current_line, max_column_hint);
1722 : }
1723 :
1724 279 : str += current_char;
1725 279 : skip_input ();
1726 279 : current_char = peek_input ();
1727 : }
1728 :
1729 64 : if (current_char == '"')
1730 : {
1731 57 : current_column++;
1732 :
1733 57 : skip_input ();
1734 57 : current_char = peek_input ();
1735 : }
1736 7 : else if (current_char.is_eof ())
1737 : {
1738 7 : rust_error_at (string_begin_locus, "unended byte string literal");
1739 7 : return Token::make (END_OF_FILE, get_current_location ());
1740 : }
1741 : else
1742 : {
1743 : rust_unreachable ();
1744 : }
1745 :
1746 57 : str.shrink_to_fit ();
1747 57 : loc += str.size () - 1;
1748 :
1749 57 : return Token::make_byte_string (loc, std::move (str));
1750 64 : }
1751 :
1752 : // Parses a raw byte string.
1753 : TokenPtr
1754 32 : Lexer::parse_raw_byte_string (location_t loc)
1755 : {
1756 : // raw byte string literals
1757 32 : std::string str;
1758 32 : str.reserve (16); // some sensible default
1759 :
1760 32 : int length = 1;
1761 32 : int hash_count = 0;
1762 :
1763 32 : const location_t string_begin_locus = get_current_location ();
1764 :
1765 : // get hash count at beginnning
1766 32 : skip_input ();
1767 32 : current_char = peek_input ();
1768 32 : length++;
1769 32 : current_column++;
1770 54 : while (current_char == '#')
1771 : {
1772 22 : hash_count++;
1773 22 : length++;
1774 22 : current_column++;
1775 :
1776 22 : skip_input ();
1777 22 : current_char = peek_input ();
1778 : }
1779 :
1780 32 : if (current_char != '"')
1781 : {
1782 0 : rust_error_at (get_current_location (),
1783 : "raw byte string has no opening %<\"%>");
1784 : }
1785 :
1786 32 : skip_input ();
1787 32 : current_char = peek_input ();
1788 32 : length++;
1789 32 : current_column++;
1790 :
1791 330 : while (true)
1792 : {
1793 181 : if (current_char == '"')
1794 : {
1795 51 : bool enough_hashes = true;
1796 :
1797 51 : for (int i = 0; i < hash_count; i++)
1798 : {
1799 26 : if (peek_input (i + 1) != '#')
1800 : {
1801 : enough_hashes = false;
1802 : break;
1803 : }
1804 : }
1805 :
1806 35 : if (enough_hashes)
1807 : {
1808 : // skip enough input and peek enough input
1809 25 : skip_input (hash_count);
1810 25 : current_char = peek_input ();
1811 25 : length += hash_count + 1;
1812 25 : current_column += hash_count + 1;
1813 25 : break;
1814 : }
1815 : }
1816 146 : else if (current_char.is_eof ())
1817 : {
1818 7 : rust_error_at (string_begin_locus, "unended raw byte string literal");
1819 7 : return Token::make (END_OF_FILE, get_current_location ());
1820 : }
1821 139 : else if (current_char.value > 127)
1822 : {
1823 1 : rust_error_at (get_current_location (),
1824 : "character %qs in raw byte string out of range",
1825 1 : current_char.as_string ().c_str ());
1826 1 : current_char = 0;
1827 : }
1828 :
1829 149 : length++;
1830 149 : current_column++;
1831 149 : if (current_char == '\n')
1832 : {
1833 22 : current_line++;
1834 22 : current_column = 1;
1835 22 : start_line (current_line, max_column_hint);
1836 : }
1837 :
1838 149 : str += current_char;
1839 149 : skip_input ();
1840 149 : current_char = peek_input ();
1841 149 : }
1842 :
1843 25 : loc += length - 1;
1844 :
1845 25 : str.shrink_to_fit ();
1846 :
1847 25 : return Token::make_byte_string (loc, std::move (str));
1848 32 : }
1849 :
1850 : // Parses a raw identifier.
1851 : TokenPtr
1852 81 : Lexer::parse_raw_identifier (location_t loc)
1853 : {
1854 : // raw identifier
1855 81 : std::string str;
1856 81 : str.reserve (16); // default
1857 :
1858 81 : skip_input ();
1859 81 : current_char = peek_input ();
1860 :
1861 81 : current_column += 2;
1862 :
1863 81 : bool first_is_underscore = current_char == '_';
1864 :
1865 81 : int length = 0;
1866 81 : current_char = peek_input ();
1867 : // loop through entire name
1868 475 : while (is_identifier_continue (current_char.value))
1869 : {
1870 313 : length++;
1871 :
1872 313 : str += current_char;
1873 313 : skip_input ();
1874 313 : current_char = peek_input ();
1875 : }
1876 :
1877 81 : current_column += length;
1878 :
1879 81 : rust_debug ("raw ident: %s", str.c_str ());
1880 :
1881 : // if just a single underscore, not an identifier
1882 81 : if (first_is_underscore && length == 1)
1883 1 : rust_error_at (get_current_location (),
1884 : "%<_%> is not a valid raw identifier");
1885 :
1886 81 : using namespace Rust::Values;
1887 81 : std::set<std::string> invalid{
1888 81 : Keywords::CRATE, Keywords::EXTERN_KW, Keywords::SELF,
1889 81 : Keywords::SUPER, Keywords::SELF_ALIAS,
1890 486 : };
1891 :
1892 81 : if (invalid.find (str) != invalid.end ())
1893 : {
1894 1 : rust_error_at (get_current_location (),
1895 : "%qs is a forbidden raw identifier", str.c_str ());
1896 :
1897 1 : return nullptr;
1898 : }
1899 : else
1900 : {
1901 80 : str.shrink_to_fit ();
1902 80 : loc += length - 1;
1903 :
1904 80 : return Token::make_identifier (loc, std::move (str));
1905 : }
1906 81 : }
1907 :
1908 : // skip broken string input (unterminated strings)
1909 : void
1910 0 : Lexer::skip_broken_string_input (Codepoint current_char)
1911 : {
1912 0 : while (current_char != '"' && !current_char.is_eof ())
1913 : {
1914 0 : if (current_char == '\n')
1915 : {
1916 0 : current_line++;
1917 0 : current_column = 1;
1918 : }
1919 : else
1920 : {
1921 0 : current_column++;
1922 : }
1923 0 : skip_input ();
1924 0 : current_char = peek_input ();
1925 : }
1926 0 : if (current_char == '"')
1927 : {
1928 0 : current_column++;
1929 :
1930 0 : skip_input ();
1931 0 : current_char = peek_input ();
1932 : }
1933 0 : rust_debug ("skipped to %d:%d due to bad quotes", current_line,
1934 : current_column);
1935 0 : }
1936 :
1937 : // Parses a string.
1938 : TokenPtr
1939 12745 : Lexer::parse_string (location_t loc)
1940 : {
1941 12745 : std::string str;
1942 12745 : str.reserve (16); // some sensible default
1943 :
1944 12745 : current_char = peek_input ();
1945 :
1946 12745 : const location_t string_begin_locus = get_current_location ();
1947 :
1948 : // FIXME: This fails if the input ends. How do we check for EOF?
1949 104398 : while (current_char.value != '"' && !current_char.is_eof ())
1950 : {
1951 78908 : if (current_char.value == '\\')
1952 : {
1953 2783 : int length = 1;
1954 :
1955 : // parse escape
1956 2783 : auto utf8_escape_pair = parse_utf8_escape ();
1957 2783 : current_char = std::get<0> (utf8_escape_pair);
1958 :
1959 2783 : if (current_char == Codepoint (0) && std::get<2> (utf8_escape_pair))
1960 28 : length = std::get<1> (utf8_escape_pair) - 1;
1961 : else
1962 2755 : length += std::get<1> (utf8_escape_pair);
1963 :
1964 2783 : if (current_char != Codepoint (0) || !std::get<2> (utf8_escape_pair))
1965 5510 : str += current_char.as_string ();
1966 :
1967 2783 : current_column += length;
1968 :
1969 : // FIXME: should remove this but can't.
1970 : // `parse_utf8_escape` does not update `current_char` correctly.
1971 2783 : current_char = peek_input ();
1972 2783 : continue;
1973 2783 : }
1974 :
1975 76125 : current_column++;
1976 76125 : if (current_char.value == '\n')
1977 : {
1978 67 : current_line++;
1979 67 : current_column = 1;
1980 : // tell line_table that new line starts
1981 67 : start_line (current_line, max_column_hint);
1982 : }
1983 :
1984 76125 : str += current_char;
1985 76125 : skip_input ();
1986 76125 : current_char = peek_input ();
1987 : }
1988 :
1989 12745 : if (current_char.value == '"')
1990 : {
1991 12731 : current_column++;
1992 :
1993 12731 : skip_input ();
1994 12731 : current_char = peek_input ();
1995 : }
1996 14 : else if (current_char.is_eof ())
1997 : {
1998 14 : rust_error_at (string_begin_locus, "unended string literal");
1999 14 : return Token::make (END_OF_FILE, get_current_location ());
2000 : }
2001 : else
2002 : {
2003 : rust_unreachable ();
2004 : }
2005 :
2006 12731 : str.shrink_to_fit ();
2007 :
2008 12731 : return Token::make_string (loc, std::move (str));
2009 12745 : }
2010 :
2011 : // Parses an identifier or keyword.
2012 : TokenPtr
2013 289604 : Lexer::parse_identifier_or_keyword (location_t loc)
2014 : {
2015 289604 : std::string str;
2016 289604 : str.reserve (16); // default
2017 579208 : str += current_char.as_string ();
2018 :
2019 289604 : bool first_is_underscore = current_char == '_';
2020 :
2021 289604 : int length = 1;
2022 289604 : current_char = peek_input ();
2023 :
2024 : // loop through entire name
2025 1469952 : while (is_identifier_continue (current_char.value))
2026 : {
2027 890744 : auto s = current_char.as_string ();
2028 890744 : length++;
2029 :
2030 1781488 : str += current_char.as_string ();
2031 890744 : skip_input ();
2032 890744 : current_char = peek_input ();
2033 890744 : }
2034 :
2035 289604 : current_column += length;
2036 :
2037 : // if just a single underscore, not an identifier
2038 289604 : if (first_is_underscore && length == 1)
2039 1316 : return Token::make (UNDERSCORE, loc);
2040 :
2041 288288 : str.shrink_to_fit ();
2042 :
2043 288288 : loc += length - 1;
2044 :
2045 288288 : TokenId keyword = classify_keyword (str);
2046 288288 : if (keyword == IDENTIFIER)
2047 191918 : return Token::make_identifier (loc, std::move (str));
2048 : else
2049 96370 : return Token::make (keyword, loc);
2050 289604 : }
2051 :
2052 : // Possibly returns a raw string token if it exists - otherwise returns null.
2053 : TokenPtr
2054 3961 : Lexer::maybe_parse_raw_string (location_t loc)
2055 : {
2056 3961 : int peek_index = 0;
2057 3970 : while (peek_input (peek_index) == '#')
2058 9 : peek_index++;
2059 :
2060 3961 : if (peek_input (peek_index) == '"')
2061 25 : return parse_raw_string (loc, peek_index);
2062 : else
2063 3936 : return nullptr;
2064 : }
2065 :
2066 : // Returns a raw string token.
2067 : TokenPtr
2068 25 : Lexer::parse_raw_string (location_t loc, int initial_hash_count)
2069 : {
2070 : // raw string literals
2071 25 : std::string str;
2072 25 : str.reserve (16); // some sensible default
2073 :
2074 25 : int length = 1 + initial_hash_count;
2075 25 : current_column += length;
2076 :
2077 25 : const location_t string_begin_locus = get_current_location ();
2078 :
2079 25 : if (initial_hash_count > 0)
2080 7 : skip_input (initial_hash_count - 1);
2081 :
2082 25 : current_char = peek_input ();
2083 :
2084 25 : if (current_char != '"')
2085 0 : rust_error_at (get_current_location (), "raw string has no opening %<\"%>");
2086 :
2087 25 : length++;
2088 25 : current_column++;
2089 25 : skip_input ();
2090 25 : current_char = peek_input ();
2091 :
2092 181 : while (true)
2093 : {
2094 103 : if (current_char.value == '"')
2095 : {
2096 38 : bool enough_hashes = true;
2097 :
2098 38 : for (int i = 0; i < initial_hash_count; i++)
2099 : {
2100 13 : if (peek_input (i + 1) != '#')
2101 : {
2102 : enough_hashes = false;
2103 : break;
2104 : }
2105 : }
2106 :
2107 28 : if (enough_hashes)
2108 : {
2109 : // skip enough input and peek enough input
2110 25 : skip_input (initial_hash_count);
2111 25 : current_char = peek_input ();
2112 25 : length += initial_hash_count + 1;
2113 25 : current_column += initial_hash_count + 1;
2114 25 : break;
2115 : }
2116 : }
2117 75 : else if (current_char.is_eof ())
2118 : {
2119 0 : rust_error_at (string_begin_locus, "unended raw string literal");
2120 0 : return Token::make (END_OF_FILE, get_current_location ());
2121 : }
2122 :
2123 78 : length++;
2124 78 : current_column++;
2125 78 : if (current_char == '\n')
2126 : {
2127 1 : current_line++;
2128 1 : current_column = 1;
2129 1 : start_line (current_line, max_column_hint);
2130 : }
2131 :
2132 156 : str += current_char.as_string ();
2133 78 : skip_input ();
2134 78 : current_char = peek_input ();
2135 78 : }
2136 :
2137 25 : loc += length - 1;
2138 :
2139 25 : str.shrink_to_fit ();
2140 :
2141 25 : return Token::make_raw_string (loc, std::move (str));
2142 25 : }
2143 :
2144 : template <typename IsDigitFunc>
2145 : TokenPtr
2146 335 : Lexer::parse_non_decimal_int_literal (location_t loc, IsDigitFunc is_digit_func,
2147 : IntegerLiteralBase base)
2148 : {
2149 335 : std::string raw_str = "0";
2150 335 : raw_str += current_char; // x, o, b
2151 335 : skip_input ();
2152 :
2153 335 : int length = 2;
2154 335 : bool has_valid_digit = false;
2155 :
2156 335 : current_char = peek_input ();
2157 :
2158 : // loop through to add entire number to string
2159 2379 : while (true)
2160 : {
2161 2714 : if (is_digit_func (current_char.value))
2162 : {
2163 : has_valid_digit = true;
2164 : }
2165 389 : else if (current_char != '_')
2166 : {
2167 : break;
2168 : }
2169 2379 : length++;
2170 :
2171 2379 : raw_str += current_char;
2172 2379 : skip_input ();
2173 2379 : current_char = peek_input ();
2174 : }
2175 :
2176 335 : int suffix_start = raw_str.length ();
2177 :
2178 : // parse in suffix if it exists
2179 335 : auto suffix_pair = parse_in_suffix ();
2180 335 : PrimitiveCoreType type_hint = CORETYPE_UNKNOWN;
2181 335 : raw_str += suffix_pair.first;
2182 335 : length += suffix_pair.second;
2183 :
2184 335 : current_column += length;
2185 :
2186 335 : if (!has_valid_digit)
2187 : {
2188 5 : rust_error_at (loc, ErrorCode::E0768, "no valid digits found for number");
2189 : }
2190 :
2191 335 : loc += length - 1;
2192 :
2193 335 : return Token::make_int (loc, std::move (raw_str), suffix_start, base,
2194 335 : type_hint);
2195 335 : }
2196 :
2197 : // Parses a hex, binary or octal int literal.
2198 : TokenPtr
2199 335 : Lexer::parse_non_decimal_int_literals (location_t loc)
2200 : {
2201 335 : current_char = peek_input ();
2202 :
2203 335 : if (current_char == 'x')
2204 : {
2205 : // hex (integer only)
2206 296 : return parse_non_decimal_int_literal (loc, is_x_digit,
2207 296 : IntegerLiteralBase::Hex);
2208 : }
2209 39 : else if (current_char == 'o')
2210 : {
2211 : // octal (integer only)
2212 19 : return parse_non_decimal_int_literal (loc, is_octal_digit,
2213 19 : IntegerLiteralBase::Octal);
2214 : }
2215 20 : else if (current_char == 'b')
2216 : {
2217 : // binary (integer only)
2218 20 : return parse_non_decimal_int_literal (loc, is_bin_digit,
2219 20 : IntegerLiteralBase::Binary);
2220 : }
2221 : else
2222 : {
2223 0 : return nullptr;
2224 : }
2225 : }
2226 :
2227 : // Parses a decimal-based int literal or float literal.
2228 : TokenPtr
2229 17117 : Lexer::parse_decimal_int_or_float (location_t loc)
2230 : {
2231 17117 : std::string str;
2232 17117 : str.reserve (16); // some sensible default
2233 17117 : str += current_char;
2234 :
2235 17117 : int length = 1;
2236 17117 : bool first_zero = current_char == '0';
2237 :
2238 17117 : current_char = peek_input ();
2239 :
2240 : // parse initial decimal integer (or first integer part of float) literal
2241 17117 : auto initial_decimal = parse_in_decimal ();
2242 17117 : str += std::get<0> (initial_decimal);
2243 17117 : length += std::get<1> (initial_decimal);
2244 :
2245 : // detect float literal
2246 : //
2247 : // Note:
2248 : //
2249 : // We should not use is_float_digit () for this verification but instead
2250 : // directly ISDIGIT because rust does not support non digit values right after
2251 : // a dot.
2252 : // The following value is not legal in rust:
2253 : // let a = 3.e1;
2254 : // A `0` should be put between the dot and the exponent to be valid
2255 : // (eg. 3.0e1).
2256 17117 : if (current_char == '.' && ISDIGIT (peek_input (1).value))
2257 : {
2258 : // float with a '.', parse another decimal into it
2259 :
2260 : // add . to str
2261 349 : str += current_char;
2262 349 : skip_input ();
2263 349 : current_char = peek_input ();
2264 349 : length++;
2265 :
2266 : // parse another decimal number for float
2267 349 : auto second_decimal = parse_in_decimal ();
2268 349 : str += std::get<0> (second_decimal);
2269 349 : length += std::get<1> (second_decimal);
2270 :
2271 : // parse in exponent part if it exists
2272 349 : auto exponent_pair = parse_in_exponent_part ();
2273 349 : str += exponent_pair.first;
2274 349 : length += exponent_pair.second;
2275 :
2276 349 : int suffix_start = str.length ();
2277 :
2278 : // parse in type suffix if it exists
2279 349 : auto suffix_pair = parse_in_suffix ();
2280 349 : PrimitiveCoreType type_hint = CORETYPE_UNKNOWN;
2281 349 : str += suffix_pair.first;
2282 349 : length += suffix_pair.second;
2283 :
2284 349 : current_column += length;
2285 :
2286 349 : loc += length - 1;
2287 :
2288 349 : str.shrink_to_fit ();
2289 349 : return Token::make_float (loc, std::move (str), suffix_start, type_hint);
2290 349 : }
2291 16768 : else if (current_char == '.'
2292 16768 : && check_valid_float_dot_end (peek_input (1).value))
2293 : {
2294 : // float that is just an integer with a terminating '.' character
2295 :
2296 : // add . to str
2297 8 : str += current_char;
2298 8 : skip_input ();
2299 8 : current_char = peek_input ();
2300 8 : length++;
2301 :
2302 : // type hint not allowed
2303 :
2304 8 : current_column += length;
2305 :
2306 8 : loc += length - 1;
2307 :
2308 8 : str.shrink_to_fit ();
2309 16 : return Token::make_float (loc, std::move (str), str.length (),
2310 8 : CORETYPE_UNKNOWN);
2311 : }
2312 16760 : else if (current_char == 'E' || current_char == 'e')
2313 : {
2314 : // exponent float with no '.' character
2315 :
2316 : // parse exponent part
2317 0 : auto exponent_pair = parse_in_exponent_part ();
2318 0 : str += exponent_pair.first;
2319 0 : length += exponent_pair.second;
2320 :
2321 0 : int suffix_start = str.length ();
2322 :
2323 : // parse in type suffix if it exists
2324 0 : auto suffix_pair = parse_in_suffix ();
2325 0 : PrimitiveCoreType type_hint = CORETYPE_UNKNOWN;
2326 0 : str += suffix_pair.first;
2327 0 : length += suffix_pair.second;
2328 :
2329 0 : current_column += length;
2330 :
2331 0 : loc += length - 1;
2332 :
2333 0 : str.shrink_to_fit ();
2334 0 : return Token::make_float (loc, std::move (str), suffix_start, type_hint);
2335 0 : }
2336 : else
2337 : {
2338 : // is an integer
2339 :
2340 16760 : int suffix_start = str.length ();
2341 :
2342 : // parse in type suffix if it exists
2343 16760 : auto suffix_pair = parse_in_suffix ();
2344 16760 : str += suffix_pair.first;
2345 :
2346 16760 : PrimitiveCoreType type_hint = CORETYPE_UNKNOWN;
2347 :
2348 : /* A "real" pure decimal doesn't have a suffix and no zero prefix. */
2349 16760 : bool pure_decimal = std::get<2> (initial_decimal);
2350 16750 : if (pure_decimal && (!first_zero || suffix_start == 1)
2351 33508 : && suffix_pair.first.empty ())
2352 : type_hint = CORETYPE_PURE_DECIMAL;
2353 :
2354 16760 : length += suffix_pair.second;
2355 :
2356 16760 : current_column += length;
2357 :
2358 16760 : loc += length - 1;
2359 :
2360 16760 : str.shrink_to_fit ();
2361 16760 : return Token::make_int (loc, std::move (str), suffix_start,
2362 16760 : IntegerLiteralBase::Decimal, type_hint);
2363 16760 : }
2364 17117 : }
2365 :
2366 : TokenPtr
2367 848 : Lexer::parse_char_or_lifetime (location_t loc)
2368 : {
2369 848 : int length = 1;
2370 :
2371 848 : current_char = peek_input ();
2372 848 : if (current_char.is_eof ())
2373 0 : return nullptr;
2374 :
2375 : // parse escaped char literal
2376 848 : if (current_char.value == '\\')
2377 : {
2378 : // parse escape
2379 23 : auto utf8_escape_pair = parse_utf8_escape ();
2380 23 : Codepoint escaped_char = std::get<0> (utf8_escape_pair);
2381 23 : length += std::get<1> (utf8_escape_pair);
2382 :
2383 23 : if (peek_input ().value != '\'')
2384 : {
2385 0 : rust_error_at (get_current_location (), "unended character literal");
2386 : }
2387 : else
2388 : {
2389 23 : skip_input ();
2390 23 : current_char = peek_input ();
2391 23 : length++;
2392 : }
2393 :
2394 23 : current_column += length;
2395 :
2396 23 : loc += length - 1;
2397 :
2398 23 : return Token::make_char (loc, escaped_char);
2399 : }
2400 : else
2401 : {
2402 825 : skip_input ();
2403 :
2404 825 : if (peek_input ().value == '\'')
2405 : {
2406 : // parse non-escaped char literal
2407 203 : Codepoint non_escaped_char = current_char;
2408 :
2409 : // skip the ' character
2410 203 : skip_input ();
2411 203 : current_char = peek_input ();
2412 :
2413 : // TODO fix due to different widths of utf-8 chars?
2414 203 : current_column += 3;
2415 :
2416 203 : loc += 2;
2417 :
2418 203 : return Token::make_char (loc, non_escaped_char);
2419 : }
2420 622 : else if (is_identifier_start (current_char.value))
2421 : {
2422 : // parse lifetime name
2423 622 : std::string str;
2424 1244 : str += current_char.as_string ();
2425 622 : length++;
2426 :
2427 622 : current_char = peek_input ();
2428 1979 : while (is_identifier_continue (current_char.value))
2429 : {
2430 1470 : str += current_char.as_string ();
2431 735 : skip_input ();
2432 735 : current_char = peek_input ();
2433 735 : length++;
2434 : }
2435 :
2436 622 : current_column += length;
2437 :
2438 622 : loc += length - 1;
2439 :
2440 : // TODO some keywords cannot be used for a lifetime label #2306
2441 : // https://doc.rust-lang.org/reference/tokens.html
2442 :
2443 622 : str.shrink_to_fit ();
2444 622 : return Token::make_lifetime (loc, std::move (str));
2445 622 : }
2446 : else
2447 : {
2448 0 : rust_error_at (
2449 : get_current_location (),
2450 : "expected %' after character constant in character literal");
2451 0 : return nullptr;
2452 : }
2453 : }
2454 : }
2455 :
2456 : void
2457 100 : Lexer::split_current_token (TokenId new_left, TokenId new_right)
2458 : {
2459 : /* TODO: assert that this TokenId is a "simple token" like punctuation and not
2460 : * like "IDENTIFIER"? */
2461 100 : location_t current_loc = peek_token ()->get_locus ();
2462 100 : TokenPtr new_left_tok = Token::make (new_left, current_loc);
2463 100 : TokenPtr new_right_tok = Token::make (new_right, current_loc + 1);
2464 :
2465 100 : token_queue.replace_current_value (std::move (new_left_tok));
2466 100 : token_queue.insert (1, std::move (new_right_tok));
2467 100 : }
2468 :
2469 : void
2470 2 : Lexer::split_current_token (std::vector<TokenPtr> new_tokens)
2471 : {
2472 2 : rust_assert (new_tokens.size () > 0);
2473 4 : token_queue.replace_current_value (new_tokens[0]);
2474 :
2475 5 : for (size_t i = 1; i < new_tokens.size (); i++)
2476 : {
2477 6 : token_queue.insert (i, new_tokens[i]);
2478 : }
2479 2 : }
2480 :
2481 : void
2482 174950 : Lexer::start_line (int current_line, int current_column)
2483 : {
2484 174950 : if (line_map)
2485 174950 : linemap_line_start (line_table, current_line, current_column);
2486 174950 : }
2487 :
2488 : } // namespace Rust
2489 :
2490 : #if CHECKING_P
2491 :
2492 : namespace selftest {
2493 :
2494 : // Checks if `src` has the same contents as the given characters
2495 : static void
2496 6 : assert_source_content (Rust::InputSource &src,
2497 : const std::vector<uint32_t> &expected)
2498 : {
2499 6 : Rust::Codepoint src_char = src.next ();
2500 41 : for (auto expected_char : expected)
2501 : {
2502 : // Make sure that `src` is not shorter than `expected`
2503 35 : ASSERT_FALSE (src_char.is_eof ());
2504 : // Checks skipped character is expeceted one.
2505 35 : ASSERT_EQ (src_char.value, expected_char);
2506 35 : src_char = src.next ();
2507 : }
2508 : // Checks if `src` and `chars` has the same length.
2509 6 : ASSERT_TRUE (src_char.is_eof ());
2510 6 : }
2511 :
2512 : static void
2513 4 : test_buffer_input_source (std::string str,
2514 : const std::vector<uint32_t> &expected)
2515 : {
2516 4 : Rust::BufferInputSource source (str, 0);
2517 4 : assert_source_content (source, expected);
2518 4 : }
2519 :
2520 : static void
2521 2 : test_file_input_source (std::string str, const std::vector<uint32_t> &expected)
2522 : {
2523 2 : FILE *tmpf = tmpfile ();
2524 : // Moves to the first character
2525 2 : fputs (str.c_str (), tmpf);
2526 2 : std::rewind (tmpf);
2527 2 : Rust::FileInputSource source (tmpf);
2528 2 : assert_source_content (source, expected);
2529 2 : }
2530 :
2531 : void
2532 1 : rust_input_source_test ()
2533 : {
2534 : // ASCII
2535 1 : std::string src = (const char *) u8"_abcde\tXYZ\v\f";
2536 1 : std::vector<uint32_t> expected = {u'_', u'a', u'b', u'c', u'd', u'e',
2537 1 : u'\t', u'X', u'Y', u'Z', u'\v', u'\f'};
2538 2 : test_buffer_input_source (src, expected);
2539 :
2540 : // BOM
2541 1 : src = (const char *) u8"\xef\xbb\xbfOK";
2542 1 : expected = {u'O', u'K'};
2543 2 : test_buffer_input_source (src, expected);
2544 :
2545 : // Russian
2546 1 : src = (const char *) u8"приве́т";
2547 1 : expected = {u'п',
2548 : u'р',
2549 : u'и',
2550 : u'в',
2551 : 0x0435 /* CYRILLIC SMALL LETTER IE е */,
2552 : 0x301 /* COMBINING ACUTE ACCENT ́ */,
2553 1 : u'т'};
2554 2 : test_buffer_input_source (src, expected);
2555 :
2556 1 : src = (const char *) u8"❤️🦀";
2557 1 : expected = {0x2764 /* HEAVY BLACK HEART */,
2558 1 : 0xfe0f /* VARIATION SELECTOR-16 */, U'🦀'};
2559 2 : test_buffer_input_source (src, expected);
2560 :
2561 1 : src = (const char *) u8"こんにちは";
2562 1 : expected = {u'こ', u'ん', u'に', u'ち', u'は'};
2563 2 : test_file_input_source (src, expected);
2564 :
2565 1 : src = (const char *) u8"👮♂👩⚕";
2566 1 : expected
2567 : = {0x1f46e /* POLICE OFFICER */, 0x200d /* ZERO WIDTH JOINER */,
2568 : 0x2642 /* MALE SIGN */, 0x1f469 /* WOMAN */,
2569 1 : 0x200d /* ZERO WIDTH JOINER */, 0x2695 /* STAFF OF AESCULAPIUS */};
2570 2 : test_file_input_source (src, expected);
2571 1 : }
2572 :
2573 : } // namespace selftest
2574 :
2575 : #endif // CHECKING_P
|