Line data Source code
1 : // go-encode-id.cc -- Go identifier and packagepath encoding/decoding hooks
2 :
3 : // Copyright 2016 The Go Authors. All rights reserved.
4 : // Use of this source code is governed by a BSD-style
5 : // license that can be found in the LICENSE file.
6 :
7 : #include "go-system.h"
8 :
9 : #include "gogo.h"
10 : #include "go-location.h"
11 : #include "go-linemap.h"
12 : #include "go-encode-id.h"
13 : #include "lex.h"
14 :
// Return whether the character C has to be escaped when it appears in
// a name that we are encoding. Only the ASCII letters and the ASCII
// digits are emitted unchanged; every other character value, including
// '_' and any byte with the high bit set, needs encoding.

static bool
char_needs_encoding(char c)
{
  const bool is_upper = c >= 'A' && c <= 'Z';
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_digit = c >= '0' && c <= '9';
  return !(is_upper || is_lower || is_digit);
}
40 :
41 : // Return whether the identifier needs to be translated because it
42 : // contains non-ASCII characters.
43 :
44 : bool
45 890572 : go_id_needs_encoding(const std::string& str)
46 : {
47 890572 : for (std::string::const_iterator p = str.begin();
48 5997645 : p != str.end();
49 5107073 : ++p)
50 5402751 : if (char_needs_encoding(*p))
51 890572 : return true;
52 : return false;
53 : }
54 :
// Map from characters to the underscore encoding for them.

class Special_char_code
{
 public:
  Special_char_code();

  // Return the character that follows '_' in the short encoding of C,
  // or 0 if C has no short encoding. Values above 127 never have one.
  char
  code_for(unsigned int c) const
  { return c < 128 ? this->codes_[c] : 0; }

 private:
  // Short encodings indexed by ASCII value; 0 means "no short encoding".
  char codes_[128];
};

// Construct the underscore encoding map.

Special_char_code::Special_char_code()
{
  // Each entry is a special character followed by its code character.
  static const char pairs[][2] =
    {
      { '_', '_' }, { '.', '0' }, { '/', '1' }, { '*', '2' },
      { ',', '3' }, { '{', '4' }, { '}', '5' }, { '[', '6' },
      { ']', '7' }, { '(', '8' }, { ')', '9' }, { '"', 'a' },
      { ' ', 'b' }, { ';', 'c' }
    };

  // Start with every character unmapped, then fill in the table.
  for (size_t i = 0; i < sizeof this->codes_; ++i)
    this->codes_[i] = 0;
  for (size_t i = 0; i < sizeof pairs / sizeof pairs[0]; ++i)
    this->codes_[static_cast<unsigned char>(pairs[i][0])] = pairs[i][1];
}

// The singleton Special_char_code.

static const Special_char_code special_char_code;
100 :
// Pull the next UTF-8 character out of P and store it in *PC. Return
// the number of bytes read, which is always at least 1.
//
// P must point into a NUL-terminated buffer (the callers in this file
// pass positions inside std::string::c_str()). A well-formed sequence
// of one to four bytes is decoded to its code point. Anything else is
// not decoded: the single byte at P is stored in *PC unchanged and 1
// is returned. That covers a stray continuation byte, a lead byte that
// announces more than four bytes, and a sequence whose continuation
// bytes are missing or malformed. Handling these cases explicitly
// avoids an out-of-range shift for lead bytes 0xf8..0xff, keeps the
// value of a stray byte intact, and guarantees that we never read past
// the terminating NUL of a truncated sequence.

static size_t
fetch_utf8_char(const char* p, unsigned int* pc)
{
  const unsigned char lead = static_cast<unsigned char>(*p);
  if ((lead & 0x80) == 0)
    {
      *pc = lead;
      return 1;
    }

  // The number of leading one bits in the first byte is the length of
  // the sequence.
  size_t len = 0;
  for (unsigned char bits = lead; (bits & 0x80) != 0; bits <<= 1)
    ++len;

  // One leading bit is a continuation byte, more than four is not
  // valid UTF-8; neither can start a character.
  if (len < 2 || len > 4)
    {
      *pc = lead;
      return 1;
    }

  unsigned int rc = lead & ((1u << (7 - len)) - 1);
  for (size_t i = 1; i < len; i++)
    {
      const unsigned char u = static_cast<unsigned char>(p[i]);
      // Every following byte must look like 10xxxxxx. This test also
      // fails on the NUL terminator, so P[I + 1] is only examined when
      // P[I] was inside the string.
      if ((u & 0xc0) != 0x80)
	{
	  *pc = lead;
	  return 1;
	}
      rc = (rc << 6) | (u & 0x3f);
    }
  *pc = rc;
  return len;
}
129 :
130 : // Encode an identifier using assembler-friendly characters. The
131 : // encoding is described in detail near the end of the long comment at
132 : // the start of names.cc.
133 :
134 : std::string
135 3881843 : go_encode_id(const std::string &id)
136 : {
137 3881843 : if (Lex::is_invalid_identifier(id))
138 : {
139 1 : go_assert(saw_errors());
140 1 : return id;
141 : }
142 :
143 3881842 : std::string ret;
144 3881842 : const char* p = id.c_str();
145 3881842 : const char* pend = p + id.length();
146 :
147 : // We encode a leading digit, to ensure that no identifier starts
148 : // with a digit.
149 3881842 : if (pend > p && p[0] >= '0' && p[0] <= '9')
150 : {
151 0 : char buf[8];
152 0 : snprintf(buf, sizeof buf, "_x%02x", p[0]);
153 0 : ret.append(buf);
154 0 : ++p;
155 : }
156 :
157 161790847 : while (p < pend)
158 : {
159 157909005 : unsigned int c;
160 157909005 : size_t len = fetch_utf8_char(p, &c);
161 157909005 : if (len == 1)
162 : {
163 157908061 : if (!char_needs_encoding(c))
164 128937786 : ret.push_back(c);
165 : else
166 : {
167 28970275 : char code = special_char_code.code_for(c);
168 28970275 : if (code != 0)
169 : {
170 28896350 : ret.push_back('_');
171 28896350 : ret.push_back(code);
172 : }
173 : else
174 : {
175 73925 : char buf[16];
176 73925 : snprintf(buf, sizeof buf, "_x%02x", c);
177 73925 : ret.append(buf);
178 : }
179 : }
180 : }
181 : else
182 : {
183 944 : char buf[16];
184 944 : if (c < 0x10000)
185 944 : snprintf(buf, sizeof buf, "_u%04x", c);
186 : else
187 0 : snprintf(buf, sizeof buf, "_U%08x", c);
188 944 : ret.append(buf);
189 : }
190 :
191 157909005 : p += len;
192 : }
193 :
194 3881842 : return ret;
195 3881842 : }
196 :
197 : // Convert a hex digit string to a unicode codepoint. No checking
198 : // to insure that the hex digit is meaningful.
199 :
200 : static unsigned
201 0 : hex_digits_to_unicode_codepoint(const char *digits, unsigned ndig)
202 : {
203 0 : unsigned result = 0;
204 0 : for (unsigned i = 0; i < ndig; ++i) {
205 0 : result <<= 4;
206 0 : result |= Lex::hex_val(digits[i]);
207 : }
208 0 : return result;
209 : }
210 :
211 : // Decode/demangle a mangled string produced by go_encode_id(). Returns
212 : // empty string if demangling process fails in some way. At the moment
213 : // this routine is unused; there is an equivalent routine in the runtime
214 : // used for demangling symbols appearing in stack traces.
215 :
216 : std::string
217 0 : go_decode_id(const std::string &encoded)
218 : {
219 0 : std::string ret;
220 0 : const char* p = encoded.c_str();
221 0 : const char* pend = p + encoded.length();
222 0 : const Location loc = Linemap::predeclared_location();
223 :
224 0 : while (p < pend)
225 : {
226 0 : if (*p != '_' || p + 1 == pend)
227 : {
228 0 : ret.push_back(*p);
229 0 : p++;
230 0 : continue;
231 : }
232 :
233 0 : switch (p[1])
234 : {
235 0 : case '_':
236 0 : ret.push_back('_');
237 0 : p += 2;
238 0 : break;
239 0 : case '0':
240 0 : ret.push_back('.');
241 0 : p += 2;
242 0 : break;
243 0 : case '1':
244 0 : ret.push_back('/');
245 0 : p += 2;
246 0 : break;
247 0 : case '2':
248 0 : ret.push_back('*');
249 0 : p += 2;
250 0 : break;
251 0 : case '3':
252 0 : ret.push_back(',');
253 0 : p += 2;
254 0 : break;
255 0 : case '4':
256 0 : ret.push_back('{');
257 0 : p += 2;
258 0 : break;
259 0 : case '5':
260 0 : ret.push_back('}');
261 0 : p += 2;
262 0 : break;
263 0 : case '6':
264 0 : ret.push_back('[');
265 0 : p += 2;
266 0 : break;
267 0 : case '7':
268 0 : ret.push_back(']');
269 0 : p += 2;
270 0 : break;
271 0 : case '8':
272 0 : ret.push_back('(');
273 0 : p += 2;
274 0 : break;
275 0 : case '9':
276 0 : ret.push_back(')');
277 0 : p += 2;
278 0 : break;
279 0 : case 'a':
280 0 : ret.push_back('"');
281 0 : p += 2;
282 0 : break;
283 0 : case 'b':
284 0 : ret.push_back(' ');
285 0 : p += 2;
286 0 : break;
287 0 : case 'c':
288 0 : ret.push_back(';');
289 0 : p += 2;
290 0 : break;
291 0 : case 'x':
292 0 : {
293 0 : const char* digits = p + 2;
294 0 : if (strlen(digits) < 2)
295 0 : return "";
296 0 : unsigned int rune = hex_digits_to_unicode_codepoint(digits, 2);
297 0 : Lex::append_char(rune, true, &ret, loc);
298 0 : p += 4;
299 : }
300 0 : break;
301 0 : case 'u':
302 0 : {
303 0 : const char* digits = p + 2;
304 0 : if (strlen(digits) < 4)
305 0 : return "";
306 0 : unsigned int rune = hex_digits_to_unicode_codepoint(digits, 4);
307 0 : Lex::append_char(rune, true, &ret, loc);
308 0 : p += 6;
309 : }
310 0 : break;
311 0 : case 'U':
312 0 : {
313 0 : const char* digits = p + 2;
314 0 : if (strlen(digits) < 8)
315 0 : return "";
316 0 : unsigned int rune = hex_digits_to_unicode_codepoint(digits, 8);
317 0 : Lex::append_char(rune, true, &ret, loc);
318 0 : p += 10;
319 : }
320 0 : break;
321 0 : default:
322 0 : return "";
323 : }
324 : }
325 :
326 0 : return ret;
327 0 : }
328 :
329 : // Encode a struct field tag. This is only used when we need to
330 : // create a type descriptor for an anonymous struct type with field
331 : // tags. Underscore encoding will be applied to the returned string.
332 : // The tag will appear between curly braces, so that is all we have to
333 : // avoid.
334 :
335 : std::string
336 47283 : go_mangle_struct_tag(const std::string& tag)
337 : {
338 47283 : std::string ret;
339 47283 : const char* p = tag.c_str();
340 47283 : const char* pend = p + tag.length();
341 848787 : while (p < pend)
342 : {
343 801504 : unsigned int c;
344 801504 : size_t len = fetch_utf8_char(p, &c);
345 801504 : if (len > 1)
346 90 : ret.append(p, len);
347 801414 : else if (c != '{' && c != '}' && c != '\\')
348 801270 : ret.push_back(c);
349 : else
350 : {
351 144 : ret.push_back('\\');
352 144 : ret.push_back(c);
353 : }
354 801504 : p += len;
355 : }
356 47283 : return ret;
357 : }
|