Line data Source code
1 : // Copyright (C) 2020-2026 Free Software Foundation, Inc.
2 :
3 : // This file is part of GCC.
4 :
5 : // GCC is free software; you can redistribute it and/or modify it under
6 : // the terms of the GNU General Public License as published by the Free
7 : // Software Foundation; either version 3, or (at your option) any later
8 : // version.
9 :
10 : // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11 : // WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 : // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 : // for more details.
14 :
15 : // You should have received a copy of the GNU General Public License
16 : // along with GCC; see the file COPYING3. If not see
17 : // <http://www.gnu.org/licenses/>.
18 :
19 : #include "rust-input-source.h"
20 : #include "rust-system.h"
21 : #include "optional.h"
22 : #include "selftest.h"
23 : #include "rust-lex.h"
24 : #include "rust-unicode.h"
25 :
26 : #include "rust-unicode-data.h"
27 :
28 : namespace Rust {
29 :
30 : typedef Codepoint codepoint_t;
31 : typedef std::vector<codepoint_t> string_t;
32 :
// Constants used to compose and decompose Hangul syllables arithmetically.
// See `Sample Code for Hangul Algorithms` in section 3.12 of
// unicode.org/versions/Unicode15.0.0/ch03.pdf
//
// *_BASE is the first codepoint of the precomposed syllable block (S) and of
// the leading consonant (L), vowel (V) and trailing consonant (T) jamo.
// T_BASE sits one below the first trailing consonant, so a trailing index
// of zero means "no trailing consonant".
const uint32_t S_BASE = 0xAC00;
const uint32_t L_BASE = 0x1100;
const uint32_t V_BASE = 0x1161;
const uint32_t T_BASE = 0x11A7;
const uint32_t L_COUNT = 19;
const uint32_t V_COUNT = 21;
const uint32_t T_COUNT = 28;
// Number of syllables sharing one leading consonant.
const uint32_t N_COUNT = V_COUNT * T_COUNT;
// Total number of precomposed syllables.
const uint32_t S_COUNT = L_COUNT * N_COUNT;
41 :
// Return true when TARGET_CP lies inside one of RANGES.  Every entry is a
// half-open interval [first, second); the binary search relies on the
// entries being sorted and non-overlapping.
template <std::size_t SIZE>
bool
binary_search_ranges (
  const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
  uint32_t target_cp)
{
  // First interval whose exclusive upper end lies beyond the target.  It is
  // the only interval that can contain it.
  const auto candidate = std::upper_bound (
    ranges.begin (), ranges.end (), target_cp,
    [] (uint32_t cp, const std::pair<uint32_t, uint32_t> &range) {
      return cp < range.second;
    });

  if (candidate == ranges.end ())
    return false;

  return candidate->first <= target_cp && target_cp < candidate->second;
}
57 :
58 : int
59 881211 : lookup_cc (codepoint_t c)
60 : {
61 881211 : auto it = CCC_TABLE.find (c.value);
62 881211 : if (it != CCC_TABLE.end ())
63 38 : return it->second;
64 : else
65 : // Starter. Returns zero.
66 : return 0;
67 : }
68 :
69 : tl::optional<codepoint_t>
70 14 : lookup_recomp (codepoint_t starter, codepoint_t c)
71 : {
72 14 : auto it = Rust::RECOMPOSITION_MAP.find ({starter.value, c.value});
73 14 : if (it != Rust::RECOMPOSITION_MAP.end ())
74 4 : return {it->second};
75 :
76 10 : it = Rust::RECOMPOSITION_MAP.find ({starter.value, 0});
77 10 : if (it != Rust::RECOMPOSITION_MAP.end ())
78 0 : return {it->second};
79 :
80 10 : return tl::nullopt;
81 : }
82 :
83 : void
84 23 : recursive_decomp_cano (codepoint_t c, string_t &buf)
85 : {
86 23 : auto it = Rust::DECOMPOSITION_MAP.find (c.value);
87 23 : if (it != Rust::DECOMPOSITION_MAP.end ())
88 : {
89 2 : std::vector<uint32_t> decomped = it->second;
90 6 : for (uint32_t cp : decomped)
91 4 : recursive_decomp_cano (cp, buf);
92 2 : }
93 : else
94 21 : buf.push_back (c);
95 23 : }
96 :
97 : string_t
98 8 : decomp_cano (string_t s)
99 : {
100 8 : string_t buf;
101 30 : for (codepoint_t c : s)
102 : {
103 22 : int64_t s_index = c.value - S_BASE;
104 22 : if (0 <= s_index && s_index < S_COUNT)
105 : {
106 : // decompose Hangul argorithmically
107 3 : uint32_t l = L_BASE + s_index / N_COUNT;
108 3 : uint32_t v = V_BASE + (s_index % N_COUNT) / T_COUNT;
109 3 : uint32_t t = T_BASE + s_index % T_COUNT;
110 3 : buf.push_back (l);
111 3 : buf.push_back (v);
112 3 : if (t != T_BASE)
113 0 : buf.push_back (t);
114 3 : continue;
115 3 : }
116 :
117 : // Current character is not hangul
118 19 : recursive_decomp_cano (c, buf);
119 : }
120 8 : return buf;
121 : }
122 :
123 : void
124 8 : sort_cano (string_t &s)
125 : {
126 8 : int cc_here, cc_prev;
127 8 : if (s.size () == 1)
128 : return;
129 31 : for (unsigned int i = 1; i < s.size (); i++)
130 : {
131 23 : cc_here = lookup_cc (s[i]);
132 23 : cc_prev = lookup_cc (s[i - 1]);
133 23 : if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
134 : {
135 : // swap
136 2 : codepoint_t tmp = s[i];
137 2 : s[i] = s[i - 1];
138 2 : s[i - 1] = tmp;
139 2 : if (i > 1)
140 2 : i -= 2;
141 : }
142 : }
143 : }
144 :
145 : string_t
146 8 : compose_hangul (string_t s)
147 : {
148 8 : string_t buf;
149 8 : if (s.size () < 2)
150 0 : return s;
151 :
152 8 : codepoint_t last = s[0];
153 8 : buf.push_back (last);
154 27 : for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
155 : {
156 19 : codepoint_t ch = s[src_pos];
157 :
158 : // L V => LV
159 19 : int64_t l_index = last.value - L_BASE;
160 19 : if (0 <= l_index && l_index < L_COUNT)
161 : {
162 6 : int64_t v_index = ch.value - V_BASE;
163 6 : if (0 <= v_index && v_index < V_COUNT)
164 : {
165 3 : last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
166 : // pop L
167 3 : buf.pop_back ();
168 3 : buf.push_back (last);
169 19 : continue;
170 : }
171 : }
172 :
173 : // LV T => LVT
174 16 : int64_t s_index = last.value - S_BASE;
175 16 : if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0)
176 : {
177 3 : int64_t t_index = ch.value - T_BASE;
178 3 : if (0 < t_index && t_index < T_COUNT)
179 : {
180 2 : last.value += t_index;
181 : // pop LV
182 2 : buf.pop_back ();
183 2 : buf.push_back (last);
184 2 : continue;
185 : }
186 : }
187 14 : last = ch;
188 14 : buf.push_back (last);
189 : }
190 8 : return buf;
191 8 : }
192 :
193 : string_t
194 8 : recomp (string_t s)
195 : {
196 : // compose hangul first
197 8 : s = compose_hangul (s);
198 :
199 8 : string_t buf;
200 8 : if (s.size () < 2)
201 0 : return s;
202 :
203 8 : int last_class = -1;
204 : // int starter_pos = 0; // Assume the first character is Starter. Correct?
205 : // int target_pos = 1;
206 8 : codepoint_t starter_ch = s[0];
207 :
208 22 : for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
209 : {
210 : // get current character
211 14 : codepoint_t ch = s[src_pos];
212 :
213 14 : int ch_class = lookup_cc (ch);
214 14 : tl::optional<codepoint_t> composite = lookup_recomp (starter_ch, ch);
215 14 : if (composite.has_value () && last_class < ch_class)
216 : {
217 : // ch can be composed
218 4 : buf.push_back (composite.value ());
219 4 : starter_ch = composite.value ();
220 : }
221 10 : else if (ch_class == 0)
222 : {
223 : // ch is Starter and cannot be composed.
224 5 : if (src_pos == 1)
225 : // FIXME: buggy?
226 2 : buf.push_back (starter_ch);
227 5 : starter_ch = ch;
228 5 : last_class = -1;
229 5 : buf.push_back (ch);
230 : }
231 : else
232 : {
233 5 : if (src_pos == 1)
234 : // FIXME: buggy?
235 2 : buf.push_back (starter_ch);
236 : // ch is not Starter.
237 5 : last_class = ch_class;
238 5 : buf.push_back (ch);
239 : }
240 : }
241 8 : return buf;
242 8 : }
243 :
244 : // see https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
245 : QuickCheckResult
246 206100 : nfc_quick_check (const string_t &s)
247 : {
248 206100 : int last_canonical_class = 0;
249 206100 : QuickCheckResult res = QuickCheckResult::YES;
250 :
251 1087249 : for (unsigned long i = 0; i < s.size (); i++)
252 : {
253 881151 : codepoint_t c = s[i];
254 :
255 881151 : if (c.is_supplementary_character ())
256 0 : i++;
257 :
258 881151 : int canonical_class = lookup_cc (c);
259 881151 : if (last_canonical_class > canonical_class && canonical_class != 0)
260 206100 : return QuickCheckResult::NO;
261 :
262 881150 : if (is_nfc_qc_no (c.value))
263 : return QuickCheckResult::NO;
264 :
265 881149 : if (is_nfc_qc_maybe (c.value))
266 10 : res = QuickCheckResult::MAYBE;
267 :
268 881149 : last_canonical_class = canonical_class;
269 : }
270 : return res;
271 : }
272 :
273 : string_t
274 206097 : nfc_normalize (const string_t &s)
275 : {
276 206097 : if (nfc_quick_check (s) == QuickCheckResult::YES)
277 206089 : return s;
278 :
279 : // TODO: optimize normalization.
280 : // i.e. only normalize a limited area around MAYBE character, instead of
281 : // performing complete normlization of the entire string
282 :
283 : // decompose
284 8 : string_t d = decomp_cano (s);
285 8 : sort_cano (d);
286 :
287 : // recompose
288 8 : string_t r = recomp (d);
289 8 : return r;
290 8 : }
291 :
292 : Utf8String
293 206084 : Utf8String::nfc_normalize () const
294 : {
295 206084 : return Utf8String (Rust::nfc_normalize (chars));
296 : }
297 :
298 : bool
299 56104 : is_alphabetic (uint32_t codepoint)
300 : {
301 56104 : return binary_search_ranges (ALPHABETIC_RANGES, codepoint);
302 : }
303 :
304 : bool
305 10823 : is_numeric (uint32_t codepoint)
306 : {
307 10823 : return std::binary_search (NUMERIC_CODEPOINTS.begin (),
308 10823 : NUMERIC_CODEPOINTS.end (), codepoint);
309 : }
310 :
311 : bool
312 881149 : is_nfc_qc_maybe (uint32_t codepoint)
313 : {
314 881149 : return binary_search_ranges (NFC_QC_MAYBE_RANGES, codepoint);
315 : }
316 :
317 : bool
318 881150 : is_nfc_qc_no (uint32_t codepoint)
319 : {
320 881150 : return binary_search_ranges (NFC_QC_NO_RANGES, codepoint);
321 : }
322 :
323 : bool
324 57 : is_ascii_only (const std::string &str)
325 : {
326 454 : for (char c : str)
327 403 : if (static_cast<uint32_t> (c) > MAX_ASCII_CODEPOINT)
328 57 : return false;
329 : return true;
330 : }
331 :
332 : } // namespace Rust
333 :
334 : #if CHECKING_P
335 :
336 : namespace selftest {
337 :
338 : void
339 1 : rust_nfc_qc_test ()
340 : {
341 1 : ASSERT_EQ (Rust::nfc_quick_check ({0x1e0a /* NFC_QC_YES */}),
342 : Rust::QuickCheckResult::YES);
343 1 : ASSERT_EQ (Rust::nfc_quick_check (
344 : {0x1e0a /* NFC_QC_YES */, 0x0323 /* NFC_QC_MAYBE */}),
345 : Rust::QuickCheckResult::MAYBE);
346 1 : ASSERT_EQ (Rust::nfc_quick_check ({0x0340 /* NFC_QC_NO */}),
347 : Rust::QuickCheckResult::NO);
348 1 : }
349 :
350 : void
351 13 : assert_normalize (const std::vector<Rust::Codepoint> origin,
352 : const std::vector<Rust::Codepoint> expected)
353 : {
354 13 : std::vector<Rust::Codepoint> actual = Rust::nfc_normalize (origin);
355 :
356 13 : ASSERT_EQ (actual.size (), expected.size ());
357 44 : for (unsigned int i = 0; i < actual.size (); i++)
358 : {
359 31 : ASSERT_EQ (actual[i], expected[i]);
360 : }
361 13 : }
362 :
363 : void
364 1 : rust_utf8_normalize_test ()
365 : {
366 : // ASCII
367 1 : assert_normalize ({'h', 'e', 'l', 'l', 'o'}, {'h', 'e', 'l', 'l', 'o'});
368 : // ASCII
369 1 : assert_normalize ({'/', '\\', '.', ':', '*'}, {'/', '\\', '.', ':', '*'});
370 :
371 : // testcases retrieved from Part0 of
372 : // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
373 1 : assert_normalize ({0x1e0a}, {0x1e0a});
374 1 : assert_normalize ({0x1e0c}, {0x1e0c});
375 1 : assert_normalize ({0x1e0a, 0x0323}, {0x1e0c, 0x0307});
376 1 : assert_normalize ({0x1e0c, 0x0307}, {0x1e0c, 0x0307});
377 1 : assert_normalize ({0x0044, 0x0307, 0x0323}, {0x1e0c, 0x0307});
378 :
379 : // testcases for Hangul from Part0
380 1 : assert_normalize ({0x1100, 0xac00, 0x11a8}, {0x1100, 0xac01});
381 1 : assert_normalize ({0x1100, 0xac00, 0x11a8, 0x11a8}, {0x1100, 0xac01, 0x11a8});
382 : // testcases for Hangul from Part1
383 1 : assert_normalize ({0x3131}, {0x3131});
384 1 : assert_normalize ({0x3132}, {0x3132});
385 : // testcases for Hangul from Part3
386 1 : assert_normalize ({0x1100, 0x0334, 0x1161}, {0x1100, 0x0334, 0x1161});
387 1 : assert_normalize ({0xac54, 0x0334, 0x11ae}, {0xac54, 0x0334, 0x11ae});
388 :
389 : // TODO: add more testcases in
390 : // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
391 1 : }
392 :
393 : void
394 0 : rust_utf8_property_test ()
395 : {
396 0 : ASSERT_TRUE (Rust::is_alphabetic ('A'));
397 0 : ASSERT_TRUE (Rust::is_alphabetic ('B'));
398 0 : ASSERT_TRUE (Rust::is_alphabetic ('x'));
399 0 : ASSERT_TRUE (Rust::is_alphabetic ('z'));
400 0 : ASSERT_TRUE (Rust::is_alphabetic (0x00b5)); // µ
401 0 : ASSERT_TRUE (Rust::is_alphabetic (0x3093)); // ん
402 0 : ASSERT_TRUE (Rust::is_alphabetic (0xa8f2)); // ꣲ
403 0 : ASSERT_TRUE (Rust::is_alphabetic (0x2b743)); // 𫝃
404 :
405 0 : ASSERT_FALSE (Rust::is_alphabetic ('\v'));
406 0 : ASSERT_FALSE (Rust::is_alphabetic ('-'));
407 0 : ASSERT_FALSE (Rust::is_alphabetic ('_'));
408 0 : ASSERT_FALSE (Rust::is_alphabetic ('+'));
409 0 : ASSERT_FALSE (Rust::is_alphabetic ('0'));
410 0 : ASSERT_FALSE (Rust::is_alphabetic ('1'));
411 0 : ASSERT_FALSE (Rust::is_alphabetic ('2'));
412 0 : ASSERT_FALSE (Rust::is_alphabetic ('9'));
413 0 : ASSERT_FALSE (Rust::is_alphabetic (0xa720)); // ◌
414 0 : ASSERT_FALSE (Rust::is_alphabetic (0xaac1)); // ◌꫁
415 :
416 : // `Nd`s
417 0 : ASSERT_TRUE (Rust::is_numeric ('0'));
418 0 : ASSERT_TRUE (Rust::is_numeric ('1'));
419 0 : ASSERT_TRUE (Rust::is_numeric ('7'));
420 0 : ASSERT_TRUE (Rust::is_numeric ('9'));
421 0 : ASSERT_TRUE (Rust::is_numeric (0x07c2)); // ߂
422 0 : ASSERT_TRUE (Rust::is_numeric (0x096d)); // ७
423 : // `Nl`s
424 0 : ASSERT_TRUE (Rust::is_numeric (0x16e6)); // ᛮ
425 0 : ASSERT_TRUE (Rust::is_numeric (0xa6e6)); // ꛦ
426 0 : ASSERT_TRUE (Rust::is_numeric (0x12400)); // 𒐀
427 0 : ASSERT_TRUE (Rust::is_numeric (0x1243a)); // 𒐺
428 : // `No`s
429 0 : ASSERT_TRUE (Rust::is_numeric (0x00b2)); // ²
430 0 : ASSERT_TRUE (Rust::is_numeric (0x32b1)); // ㊱
431 :
432 0 : ASSERT_FALSE (Rust::is_numeric ('\n'));
433 0 : ASSERT_FALSE (Rust::is_numeric ('-'));
434 0 : ASSERT_FALSE (Rust::is_numeric ('_'));
435 0 : ASSERT_FALSE (Rust::is_numeric ('('));
436 0 : ASSERT_FALSE (Rust::is_numeric ('z'));
437 0 : ASSERT_FALSE (Rust::is_numeric (';'));
438 0 : ASSERT_FALSE (Rust::is_numeric (0x03f4)); // ϴ
439 0 : ASSERT_FALSE (Rust::is_numeric (0x0628)); // ب
440 0 : ASSERT_FALSE (Rust::is_numeric (0x0975)); // ॵ
441 0 : ASSERT_FALSE (Rust::is_numeric (0x18f0)); // ᣰ
442 0 : ASSERT_FALSE (Rust::is_numeric (0x2f30)); // ⼰
443 0 : }
444 :
445 : } // namespace selftest
446 :
447 : #endif // CHECKING_P
|