Line data Source code
1 : // lex.h -- Go frontend lexer. -*- C++ -*-
2 :
3 : // Copyright 2009 The Go Authors. All rights reserved.
4 : // Use of this source code is governed by a BSD-style
5 : // license that can be found in the LICENSE file.
6 :
7 : #ifndef GO_LEX_H
8 : #define GO_LEX_H
9 :
10 : #include <mpfr.h>
11 :
12 : #include "operator.h"
13 : #include "go-linemap.h"
14 :
15 : #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
16 : # define GO_ATTRIBUTE_UNUSED __attribute__ ((__unused__))
17 : #else
18 : # define GO_ATTRIBUTE_UNUSED
19 : #endif
20 :
21 : struct Unicode_range;
22 :
23 : // The keywords. These must be in sorted order, other than
24 : // KEYWORD_INVALID. They must match the Keywords::mapping_ array in
25 : // lex.cc.
26 :
27 : enum Keyword
28 : {
29 : KEYWORD_INVALID, // Not a keyword.
30 : KEYWORD_ASM,
31 : KEYWORD_BREAK,
32 : KEYWORD_CASE,
33 : KEYWORD_CHAN,
34 : KEYWORD_CONST,
35 : KEYWORD_CONTINUE,
36 : KEYWORD_DEFAULT,
37 : KEYWORD_DEFER,
38 : KEYWORD_ELSE,
39 : KEYWORD_FALLTHROUGH,
40 : KEYWORD_FOR,
41 : KEYWORD_FUNC,
42 : KEYWORD_GO,
43 : KEYWORD_GOTO,
44 : KEYWORD_IF,
45 : KEYWORD_IMPORT,
46 : KEYWORD_INTERFACE,
47 : KEYWORD_MAP,
48 : KEYWORD_PACKAGE,
49 : KEYWORD_RANGE,
50 : KEYWORD_RETURN,
51 : KEYWORD_SELECT,
52 : KEYWORD_STRUCT,
53 : KEYWORD_SWITCH,
54 : KEYWORD_TYPE,
55 : KEYWORD_VAR
56 : };
57 :
58 : // Pragmas built from magic comments and recorded for functions.
59 : // Each value is a single distinct bit so that pragmas can be combined
60 : // in a bitmask. The set of values is intended to match the gc compiler.
61 :
62 : enum GoPragma
63 : {
64 : GOPRAGMA_NOINTERFACE = 1 << 0, // Method not in type descriptor.
65 : GOPRAGMA_NOESCAPE = 1 << 1, // Args do not escape.
66 : GOPRAGMA_NORACE = 1 << 2, // No race detector.
67 : GOPRAGMA_NOSPLIT = 1 << 3, // Do not split stack.
68 : GOPRAGMA_NOINLINE = 1 << 4, // Do not inline.
69 : GOPRAGMA_SYSTEMSTACK = 1 << 5, // Must run on system stack.
70 : GOPRAGMA_NOWRITEBARRIER = 1 << 6, // No write barriers.
71 : GOPRAGMA_NOWRITEBARRIERREC = 1 << 7, // No write barriers here or callees.
72 : GOPRAGMA_YESWRITEBARRIERREC = 1 << 8, // Stops nowritebarrierrec.
73 : GOPRAGMA_MARK = 1 << 9, // Marker for nowritebarrierrec.
74 : GOPRAGMA_CGOUNSAFEARGS = 1 << 10, // Pointer to arg is pointer to all.
75 : GOPRAGMA_UINTPTRESCAPES = 1 << 11, // uintptr(p) escapes.
76 : GOPRAGMA_NOTINHEAP = 1 << 12 // Type is not in heap.
77 : };
78 :
79 : // A token returned from the lexer.
80 :
81 : class Token
82 : {
83 : public:
84 : // Token classification.
85 : enum Classification
86 : {
87 : // Token is invalid.
88 : TOKEN_INVALID,
89 : // Token indicates end of input.
90 : TOKEN_EOF,
91 : // Token is a keyword.
92 : TOKEN_KEYWORD,
93 : // Token is an identifier.
94 : TOKEN_IDENTIFIER,
95 : // Token is a string of characters.
96 : TOKEN_STRING,
97 : // Token is an operator.
98 : TOKEN_OPERATOR,
99 : // Token is a character constant.
100 : TOKEN_CHARACTER,
101 : // Token is an integer.
102 : TOKEN_INTEGER,
103 : // Token is a floating point number.
104 : TOKEN_FLOAT,
105 : // Token is an imaginary number.
106 : TOKEN_IMAGINARY
107 : };
108 :
109 : ~Token();
110 : Token(const Token&);
111 : Token& operator=(const Token&);
112 :
113 : // Get token classification.
114 : Classification
115 15074471 : classification() const
116 15074471 : { return this->classification_; }
117 :
118 : // Make a token for an invalid value.
119 : static Token
120 25414 : make_invalid_token(Location location)
121 25414 : { return Token(TOKEN_INVALID, location); }
122 :
123 : // Make a token representing end of file.
124 : static Token
125 12709 : make_eof_token(Location location)
126 12709 : { return Token(TOKEN_EOF, location); }
127 :
128 : // Make a keyword token.
129 : static Token
130 1109846 : make_keyword_token(Keyword keyword, Location location)
131 : {
132 1109846 : Token tok(TOKEN_KEYWORD, location);
133 1109846 : tok.u_.keyword = keyword;
134 1109846 : return tok;
135 : }
136 :
137 : // Make an identifier token.
138 : static Token
139 7643127 : make_identifier_token(const std::string& value, bool is_exported,
140 : Location location)
141 : {
142 7643127 : Token tok(TOKEN_IDENTIFIER, location);
143 7643127 : tok.u_.identifier_value.name = new std::string(value);
144 7643127 : tok.u_.identifier_value.is_exported = is_exported;
145 7643127 : return tok;
146 : }
147 :
148 : // Make a quoted string token.
149 : static Token
150 460495 : make_string_token(const std::string& value, Location location)
151 : {
152 460495 : Token tok(TOKEN_STRING, location);
153 460495 : tok.u_.string_value = new std::string(value);
154 460495 : return tok;
155 : }
156 :
157 : // Make an operator token.
158 : static Token
159 11184438 : make_operator_token(Operator op, Location location)
160 : {
161 11184438 : Token tok(TOKEN_OPERATOR, location);
162 11184438 : tok.u_.op = op;
163 11184438 : return tok;
164 : }
165 :
166 : // Make a character constant token.
167 : static Token
168 49036 : make_character_token(mpz_t val, Location location)
169 : {
170 49036 : Token tok(TOKEN_CHARACTER, location);
171 49036 : mpz_init(tok.u_.integer_value);
172 49036 : mpz_swap(tok.u_.integer_value, val);
173 49036 : return tok;
174 : }
175 :
176 : // Make an integer token.
177 : static Token
178 1261667 : make_integer_token(mpz_t val, Location location)
179 : {
180 1261667 : Token tok(TOKEN_INTEGER, location);
181 1261667 : mpz_init(tok.u_.integer_value);
182 1261667 : mpz_swap(tok.u_.integer_value, val);
183 1261667 : return tok;
184 : }
185 :
186 : // Make a float token.
187 : static Token
188 18714 : make_float_token(mpfr_t val, Location location)
189 : {
190 18714 : Token tok(TOKEN_FLOAT, location);
191 18714 : mpfr_init(tok.u_.float_value);
192 18714 : mpfr_swap(tok.u_.float_value, val);
193 18714 : return tok;
194 : }
195 :
196 : // Make a token for an imaginary number.
197 : static Token
198 993 : make_imaginary_token(mpfr_t val, Location location)
199 : {
200 993 : Token tok(TOKEN_IMAGINARY, location);
201 993 : mpfr_init(tok.u_.float_value);
202 993 : mpfr_swap(tok.u_.float_value, val);
203 993 : return tok;
204 : }
205 :
206 : // Get the location of the token.
207 : Location
208 19573459 : location() const
209 10245594 : { return this->location_; }
210 :
211 : // Return whether this is an invalid token.
212 : bool
213 57866163 : is_invalid() const
214 57866163 : { return this->classification_ == TOKEN_INVALID; }
215 :
216 : // Return whether this is the EOF token.
217 : bool
218 480763 : is_eof() const
219 480763 : { return this->classification_ == TOKEN_EOF; }
220 :
221 : // Return the keyword value for a keyword token.
222 : Keyword
223 1322537 : keyword() const
224 : {
225 1322537 : go_assert(this->classification_ == TOKEN_KEYWORD);
226 1322537 : return this->u_.keyword;
227 : }
228 :
229 : // Return whether this is an identifier.
230 : bool
231 6099770 : is_identifier() const
232 6099770 : { return this->classification_ == TOKEN_IDENTIFIER; }
233 :
234 : // Return the identifier.
235 : const std::string&
236 8343249 : identifier() const
237 : {
238 8343249 : go_assert(this->classification_ == TOKEN_IDENTIFIER);
239 8343249 : return *this->u_.identifier_value.name;
240 : }
241 :
242 : // Return whether the identifier is exported.
243 : bool
244 7544917 : is_identifier_exported() const
245 : {
246 7544917 : go_assert(this->classification_ == TOKEN_IDENTIFIER);
247 7544917 : return this->u_.identifier_value.is_exported;
248 : }
249 :
250 : // Return whether this is a string.
251 : bool
252 128106 : is_string() const
253 : {
254 128106 : return this->classification_ == TOKEN_STRING;
255 : }
256 :
257 : // Return the value of a string. The returned value is a string of
258 : // UTF-8 characters.
259 : std::string
260 460489 : string_value() const
261 : {
262 460489 : go_assert(this->classification_ == TOKEN_STRING);
263 460489 : return *this->u_.string_value;
264 : }
265 :
266 : // Return the value of a character constant.
267 : const mpz_t*
268 49035 : character_value() const
269 : {
270 49035 : go_assert(this->classification_ == TOKEN_CHARACTER);
271 49035 : return &this->u_.integer_value;
272 : }
273 :
274 : // Return the value of an integer.
275 : const mpz_t*
276 1261659 : integer_value() const
277 : {
278 1261659 : go_assert(this->classification_ == TOKEN_INTEGER);
279 1261659 : return &this->u_.integer_value;
280 : }
281 :
282 : // Return the value of a float.
283 : const mpfr_t*
284 18712 : float_value() const
285 : {
286 18712 : go_assert(this->classification_ == TOKEN_FLOAT);
287 18712 : return &this->u_.float_value;
288 : }
289 :
290 : // Return the value of an imaginary number.
291 : const mpfr_t*
292 992 : imaginary_value() const
293 : {
294 992 : go_assert(this->classification_ == TOKEN_IMAGINARY);
295 992 : return &this->u_.float_value;
296 : }
297 :
298 : // Return the operator value for an operator token.
299 : Operator
300 7244881 : op() const
301 : {
302 7244881 : go_assert(this->classification_ == TOKEN_OPERATOR);
303 7244881 : return this->u_.op;
304 : }
305 :
306 : // Return whether this token is KEYWORD.
307 : bool
308 5581106 : is_keyword(Keyword keyword) const
309 : {
310 5581106 : return (this->classification_ == TOKEN_KEYWORD
311 3167120 : && this->u_.keyword == keyword);
312 : }
313 :
314 : // Return whether this token is OP.
315 : bool
316 97505805 : is_op(Operator op) const
317 38831364 : { return this->classification_ == TOKEN_OPERATOR && this->u_.op == op; }
318 :
319 : // Print the token for debugging.
320 : void
321 : print(FILE*) const;
322 :
323 : private:
324 : // Private constructor used by make_..._token functions above.
325 : Token(Classification, Location);
326 :
327 : // Clear the token.
328 : void
329 : clear();
330 :
331 : // The token classification.
332 : Classification classification_;
333 : union
334 : {
335 : // The keyword value for TOKEN_KEYWORD.
336 : Keyword keyword;
337 : // The token value for TOKEN_IDENTIFIER.
338 : struct
339 : {
340 : // The name of the identifier. This has been mangled to only
341 : // include ASCII characters.
342 : std::string* name;
343 : // Whether this name should be exported. This is true if the
344 : // first letter in the name is upper case.
345 : bool is_exported;
346 : } identifier_value;
347 : // The string value for TOKEN_STRING.
348 : std::string* string_value;
349 : // The token value for TOKEN_CHARACTER or TOKEN_INTEGER.
350 : mpz_t integer_value;
351 : // The token value for TOKEN_FLOAT or TOKEN_IMAGINARY.
352 : mpfr_t float_value;
353 : // The operator value for TOKEN_OPERATOR.
354 : Operator op;
355 : } u_;
356 : // The source location.
357 : Location location_;
358 : };
359 :
360 : // The lexer itself.
361 :
362 : class Lex
363 : {
364 : public:
365 : Lex(const char* input_file_name, FILE* input_file, Linemap *linemap);
366 :
367 : ~Lex();
368 :
369 : // Return the next token.
370 : Token
371 : next_token();
372 :
373 : // Return the contents of any current //extern comment.
374 : const std::string&
375 : extern_name() const
376 128964 : { return this->extern_; }
377 :
378 : // Return the current set of pragmas, and clear them.
379 : unsigned int
380 619615 : get_and_clear_pragmas()
381 : {
382 619615 : unsigned int ret = this->pragmas_;
383 619615 : this->pragmas_ = 0;
384 619615 : return ret;
385 : }
386 :
387 7833 : struct Linkname
388 : {
389 : std::string ext_name; // External name; empty to just export.
390 : bool is_exported; // Whether the internal name is exported.
391 : Location loc; // Location of go:linkname directive.
392 :
393 3913 : Linkname()
394 3913 : : ext_name(), is_exported(false), loc()
395 : { }
396 :
397 3920 : Linkname(const std::string& ext_name_a, bool is_exported_a, Location loc_a)
398 7840 : : ext_name(ext_name_a), is_exported(is_exported_a), loc(loc_a)
399 : { }
400 : };
401 :
402 : typedef std::map<std::string, Linkname> Linknames;
403 :
404 : // Return the linknames seen so far, or NULL if none, and clear the
405 : // set. These are from go:linkname compiler directives.
406 : Linknames*
407 12707 : get_and_clear_linknames()
408 : {
409 12707 : Linknames* ret = this->linknames_;
410 12707 : this->linknames_ = NULL;
411 12707 : return ret;
412 : }
413 :
414 : // Return whether there are any current go:embed patterns.
415 : bool
416 539145 : has_embeds() const
417 539145 : { return !this->embeds_.empty(); }
418 :
419 : // If there are any go:embed patterns seen so far, store them in
420 : // *EMBEDS and clear the saved set. *EMBEDS must be an empty
421 : // vector.
422 : void
423 22 : get_and_clear_embeds(std::vector<std::string>* embeds)
424 : {
425 22 : go_assert(embeds->empty());
426 22 : std::swap(*embeds, this->embeds_);
427 22 : }
428 :
429 : // Clear any go:embed patterns seen so far. This is used for
430 : // erroneous cases.
431 : void
432 0 : clear_embeds()
433 0 : { this->embeds_.clear(); }
434 :
435 : // Return whether the identifier NAME should be exported. NAME is a
436 : // mangled name which includes only ASCII characters.
437 : static bool
438 : is_exported_mangled_name(const std::string& name);
439 :
440 : // Return whether the identifier NAME should be exported. NAME is
441 : // an unmangled utf-8 string and may contain non-ASCII characters.
442 : static bool
443 : is_exported_name(const std::string& name);
444 :
445 : // Return whether the identifier NAME is invalid. When we see an
446 : // invalid character we still build an identifier, but we use a
447 : // magic string to indicate that the identifier is invalid. We then
448 : // use this to avoid knock-on errors.
449 : static bool
450 : is_invalid_identifier(const std::string& name);
451 :
452 : // A helper function. Append V to STR. IS_CHARACTER is true if V
453 : // is a Unicode character which should be converted into UTF-8,
454 : // false if it is a byte value to be appended directly. The
455 : // location is used to warn about an out of range character.
456 : static void
457 : append_char(unsigned int v, bool is_character, std::string* str,
458 : Location);
459 :
460 : // A helper function. Fetch a UTF-8 character from STR and store it
461 : // in *VALUE. Return the number of bytes read from STR. Return 0
462 : // if STR does not point to a valid UTF-8 character.
463 : static int
464 : fetch_char(const char* str, unsigned int *value);
465 :
466 : // Return whether C is a Unicode or "C" locale space character.
467 : static bool
468 : is_unicode_space(unsigned int c);
469 :
470 : // Convert the specified hex char into an unsigned integer value.
471 : static unsigned
472 : hex_val(char c);
473 :
474 : private:
475 : ssize_t
476 : get_line();
477 :
478 : bool
479 : require_line();
480 :
481 : // The current location.
482 : Location
483 : location() const;
484 :
485 : // A position CHARS column positions before the current location.
486 : Location
487 : earlier_location(int chars) const;
488 :
489 : static bool
490 : is_hex_digit(char);
491 :
492 : static bool
493 : is_base_digit(int base, char);
494 :
495 : static unsigned char
496 122 : octal_value(char c)
497 122 : { return c - '0'; }
498 :
499 : Token
500 0 : make_invalid_token()
501 0 : { return Token::make_invalid_token(this->location()); }
502 :
503 : Token
504 12709 : make_eof_token()
505 12709 : { return Token::make_eof_token(this->location()); }
506 :
507 : Token
508 10941373 : make_operator(Operator op, int chars)
509 10941373 : { return Token::make_operator_token(op, this->earlier_location(chars)); }
510 :
511 : Token
512 : gather_identifier();
513 :
514 : static bool
515 : could_be_exponent(int base, const char*, const char*);
516 :
517 : Token
518 : gather_number();
519 :
520 : void
521 : skip_exponent();
522 :
523 : Token
524 : gather_character();
525 :
526 : Token
527 : gather_string();
528 :
529 : Token
530 : gather_raw_string();
531 :
532 : const char*
533 : advance_one_utf8_char(const char*, unsigned int*, bool*);
534 :
535 : const char*
536 : advance_one_char(const char*, bool, unsigned int*, bool*);
537 :
538 : static bool
539 : is_unicode_digit(unsigned int c);
540 :
541 : static bool
542 : is_unicode_letter(unsigned int c);
543 :
544 : static bool
545 : is_unicode_uppercase(unsigned int c);
546 :
547 : static bool
548 : is_in_unicode_range(unsigned int C, const Unicode_range* ranges,
549 : size_t range_size);
550 :
551 : Operator
552 : three_character_operator(char, char, char);
553 :
554 : Operator
555 : two_character_operator(char, char);
556 :
557 : Operator
558 : one_character_operator(char);
559 :
560 : bool
561 : skip_c_comment(bool* found_newline);
562 :
563 : void
564 : skip_cpp_comment();
565 :
566 : void
567 : gather_embed(const char*, const char*);
568 :
569 : // The input file name.
570 : const char* input_file_name_ GO_ATTRIBUTE_UNUSED;
571 : // The input file.
572 : FILE* input_file_;
573 : // The object used to keep track of file names and line numbers.
574 : Linemap* linemap_;
575 : // The line buffer. This holds the current line.
576 : char* linebuf_;
577 : // The size of the line buffer.
578 : size_t linebufsize_;
579 : // The number of characters in the current line.
580 : size_t linesize_;
581 : // The current offset in linebuf_.
582 : size_t lineoff_;
583 : // The current line number.
584 : size_t lineno_;
585 : // Whether to add a semicolon if we see a newline now.
586 : bool add_semi_at_eol_;
587 : // Pragmas for the next function, from magic comments.
588 : unsigned int pragmas_;
589 : // The external name to use for a function declaration, from a magic
590 : // //extern comment.
591 : std::string extern_;
592 : // The list of //go:linkname comments, if any.
593 : Linknames* linknames_;
594 : // The list of //go:embed patterns, if any.
595 : std::vector<std::string> embeds_;
596 : };
597 :
598 : #endif // !defined(GO_LEX_H)
|