LCOV - code coverage report
Current view: top level - gcc/rust/util - rust-unicode.cc (source / functions) Coverage Total Hit
Test: gcc.info Lines: 77.7 % 215 167
Test Date: 2026-02-28 14:20:25 Functions: 95.5 % 22 21
Legend: Lines:     hit not hit

            Line data    Source code
       1              : // Copyright (C) 2020-2026 Free Software Foundation, Inc.
       2              : 
       3              : // This file is part of GCC.
       4              : 
       5              : // GCC is free software; you can redistribute it and/or modify it under
       6              : // the terms of the GNU General Public License as published by the Free
       7              : // Software Foundation; either version 3, or (at your option) any later
       8              : // version.
       9              : 
      10              : // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
      11              : // WARRANTY; without even the implied warranty of MERCHANTABILITY or
      12              : // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      13              : // for more details.
      14              : 
      15              : // You should have received a copy of the GNU General Public License
      16              : // along with GCC; see the file COPYING3.  If not see
      17              : // <http://www.gnu.org/licenses/>.
      18              : 
      19              : #include "rust-input-source.h"
      20              : #include "rust-system.h"
      21              : #include "optional.h"
      22              : #include "selftest.h"
      23              : #include "rust-lex.h"
      24              : #include "rust-unicode.h"
      25              : 
      26              : #include "rust-unicode-data.h"
      27              : 
      28              : namespace Rust {
      29              : 
      30              : typedef Codepoint codepoint_t;
      31              : typedef std::vector<codepoint_t> string_t;
      32              : 
      33              : // These constants are used to compose and decompose Hangul syllables.
      34              : // See `Sample Code for Hangul Algorithms` in 3.1.2
      35              : // unicode.org/versions/Unicode15.0.0/ch03.pdf
                        //
                        // S_BASE is the start of the syllable range tested by decomp_cano and
                        // compose_hangul; L_BASE, V_BASE and T_BASE are the bases that the L, V
                        // and T indices are added to or subtracted from.  A T value equal to
                        // T_BASE itself is treated as "no T part" by those functions.
      36              : const uint32_t S_BASE = 0xAC00;
      37              : const uint32_t L_BASE = 0x1100, V_BASE = 0x1161, T_BASE = 0x11A7;
      38              : const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28;
                        // Derived sizes: N_COUNT syllable indices per L value, S_COUNT in total.
      39              : const uint32_t N_COUNT = V_COUNT * T_COUNT;
      40              : const uint32_t S_COUNT = L_COUNT * N_COUNT;
      41              : 
      42              : // Check if the codepoint is in any of the ranges (half-open intervals [a,b)).
                        //
                        // std::lower_bound is called with a predicate on each range's end, so IT
                        // is the first range whose end is greater than TARGET_CP; only that one
                        // range is then tested for containment.  Returns false when no range
                        // ends after TARGET_CP.
                        // NOTE(review): this presumably relies on RANGES being sorted and
                        // non-overlapping -- confirm against the tables it is called with.
      43              : template <std::size_t SIZE>
      44              : bool
      45      1818403 : binary_search_ranges (
      46              :   const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
      47              :   uint32_t target_cp)
      48              : {
      49      1818403 :   auto it = std::lower_bound (ranges.begin (), ranges.end (), target_cp,
      50     12071852 :                               [] (const std::pair<uint32_t, uint32_t> &a,
      51     12071852 :                                   uint32_t b) { return a.second <= b; });
      52      1818403 :   if (it == ranges.end ())
      53              :     return false;
      54              :   else
      55      3591514 :     return it->first <= target_cp && target_cp < it->second;
      56              : }
      57              : 
                        // Return the value CCC_TABLE maps C to, or 0 when C has no entry.
                        // Callers in this file use the result as a canonical combining class,
                        // where 0 denotes a starter.
      58              : int
      59       881211 : lookup_cc (codepoint_t c)
      60              : {
      61       881211 :   auto it = CCC_TABLE.find (c.value);
      62       881211 :   if (it != CCC_TABLE.end ())
      63           38 :     return it->second;
      64              :   else
      65              :     // Starter. Returns zero.
      66              :     return 0;
      67              : }
      68              : 
                        // Find the codepoint that STARTER followed by C composes into.
                        // RECOMPOSITION_MAP is searched first for the pair (STARTER, C) and then
                        // for (STARTER, 0); tl::nullopt is returned when neither key is present.
                        // NOTE(review): the meaning of the (STARTER, 0) key is not visible from
                        // this file, and its hit branch shows no executions in this report --
                        // confirm that the fallback is intended.
      69              : tl::optional<codepoint_t>
      70           14 : lookup_recomp (codepoint_t starter, codepoint_t c)
      71              : {
      72           14 :   auto it = Rust::RECOMPOSITION_MAP.find ({starter.value, c.value});
      73           14 :   if (it != Rust::RECOMPOSITION_MAP.end ())
      74            4 :     return {it->second};
      75              : 
      76           10 :   it = Rust::RECOMPOSITION_MAP.find ({starter.value, 0});
      77           10 :   if (it != Rust::RECOMPOSITION_MAP.end ())
      78            0 :     return {it->second};
      79              : 
      80           10 :   return tl::nullopt;
      81              : }
      82              : 
                        // Append to BUF the expansion of C obtained from DECOMPOSITION_MAP.
                        // When C has an entry, every codepoint of the mapped sequence is
                        // expanded recursively, in order; when it has none, C itself is
                        // appended unchanged.
      83              : void
      84           23 : recursive_decomp_cano (codepoint_t c, string_t &buf)
      85              : {
      86           23 :   auto it = Rust::DECOMPOSITION_MAP.find (c.value);
      87           23 :   if (it != Rust::DECOMPOSITION_MAP.end ())
      88              :     {
                              // NOTE(review): this initialises a new local vector from the
                              // mapped sequence; if the map already stores a
                              // std::vector<uint32_t>, that is a copy a const reference
                              // would avoid -- confirm the map's value type.
      89            2 :       std::vector<uint32_t> decomped = it->second;
      90            6 :       for (uint32_t cp : decomped)
      91            4 :         recursive_decomp_cano (cp, buf);
      92            2 :     }
      93              :   else
      94           21 :     buf.push_back (c);
      95           23 : }
      96              : 
                        // Return the decomposition of S as a new string.
                        // Codepoints whose offset from S_BASE is below S_COUNT are split
                        // arithmetically into their L, V and (when present) T parts; every
                        // other codepoint goes through recursive_decomp_cano.
                        // NOTE(review): the push of the T part shows no executions in this
                        // report, so that path is not exercised by this run.
      97              : string_t
      98            8 : decomp_cano (string_t s)
      99              : {
     100            8 :   string_t buf;
     101           30 :   for (codepoint_t c : s)
     102              :     {
                              // NOTE(review): if c.value is an unsigned 32-bit integer, the
                              // subtraction wraps before it is widened to int64_t, so the
                              // `0 <= s_index' test cannot fail and the upper bound does
                              // all of the filtering -- confirm the type of the value field.
     103           22 :       int64_t s_index = c.value - S_BASE;
     104           22 :       if (0 <= s_index && s_index < S_COUNT)
     105              :         {
     106              :           // decompose Hangul algorithmically
     107            3 :           uint32_t l = L_BASE + s_index / N_COUNT;
     108            3 :           uint32_t v = V_BASE + (s_index % N_COUNT) / T_COUNT;
     109            3 :           uint32_t t = T_BASE + s_index % T_COUNT;
     110            3 :           buf.push_back (l);
     111            3 :           buf.push_back (v);
                                  // t == T_BASE means the syllable has no T part to emit.
     112            3 :           if (t != T_BASE)
     113            0 :             buf.push_back (t);
     114            3 :           continue;
     115            3 :         }
     116              : 
     117              :       // Current character is not hangul
     118           19 :       recursive_decomp_cano (c, buf);
     119              :     }
     120            8 :   return buf;
     121              : }
     122              : 
                        // Reorder S in place by combining class (as given by lookup_cc).
                        // Two adjacent codepoints are swapped when both have a non-zero class
                        // and the earlier one has the greater class; a codepoint whose class is
                        // 0 is never swapped.  After a swap the scan steps back so that the
                        // moved codepoint is compared with its new predecessor.
     123              : void
     124            8 : sort_cano (string_t &s)
     125              : {
     126            8 :   int cc_here, cc_prev;
                          // A one-element string needs no work; an empty one falls through to a
                          // loop that does not execute.
     127            8 :   if (s.size () == 1)
     128              :     return;
     129           31 :   for (unsigned int i = 1; i < s.size (); i++)
     130              :     {
     131           23 :       cc_here = lookup_cc (s[i]);
     132           23 :       cc_prev = lookup_cc (s[i - 1]);
     133           23 :       if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
     134              :         {
     135              :           // swap
     136            2 :           codepoint_t tmp = s[i];
     137            2 :           s[i] = s[i - 1];
     138            2 :           s[i - 1] = tmp;
                                  // Together with the loop's i++ this resumes at i - 1.  The
                                  // guard keeps the unsigned index from wrapping when i == 1.
     139            2 :           if (i > 1)
     140            2 :             i -= 2;
     141              :         }
     142              :     }
     143              : }
     144              : 
                        // Return S with Hangul sequences merged: an L codepoint followed by a V
                        // codepoint becomes one LV syllable, and an LV syllable followed by a T
                        // codepoint becomes one LVT syllable.  Strings with fewer than two
                        // codepoints are returned unchanged.
                        // LAST always equals the codepoint most recently placed at the back of
                        // BUF, so a merge replaces that back element.
     145              : string_t
     146            8 : compose_hangul (string_t s)
     147              : {
     148            8 :   string_t buf;
     149            8 :   if (s.size () < 2)
     150            0 :     return s;
     151              : 
     152            8 :   codepoint_t last = s[0];
     153            8 :   buf.push_back (last);
     154           27 :   for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
     155              :     {
     156           19 :       codepoint_t ch = s[src_pos];
     157              : 
     158              :       // L V => LV
     159           19 :       int64_t l_index = last.value - L_BASE;
     160           19 :       if (0 <= l_index && l_index < L_COUNT)
     161              :         {
     162            6 :           int64_t v_index = ch.value - V_BASE;
     163            6 :           if (0 <= v_index && v_index < V_COUNT)
     164              :             {
     165            3 :               last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
     166              :               // pop L
     167            3 :               buf.pop_back ();
     168            3 :               buf.push_back (last);
     169           19 :               continue;
     170              :             }
     171              :         }
     172              : 
     173              :       // LV T => LVT
                              // The `% T_COUNT == 0' test accepts only syllables that have no
                              // T part yet.
     174           16 :       int64_t s_index = last.value - S_BASE;
     175           16 :       if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0)
     176              :         {
     177            3 :           int64_t t_index = ch.value - T_BASE;
                                  // t_index 0 is T_BASE itself, which is not merged.
     178            3 :           if (0 < t_index && t_index < T_COUNT)
     179              :             {
     180            2 :               last.value += t_index;
     181              :               // pop LV
     182            2 :               buf.pop_back ();
     183            2 :               buf.push_back (last);
     184            2 :               continue;
     185              :             }
     186              :         }
                              // No merge applied: CH is emitted as is and becomes the new LAST.
     187           14 :       last = ch;
     188           14 :       buf.push_back (last);
     189              :     }
     190            8 :   return buf;
     191            8 : }
     192              : 
                        // Return S with codepoints recombined.  compose_hangul is applied
                        // first; the result is then scanned from its second codepoint, and for
                        // each codepoint either the value lookup_recomp gives for
                        // (STARTER_CH, codepoint) or the codepoint itself is appended to the
                        // output.  Strings with fewer than two codepoints (after
                        // compose_hangul) are returned unchanged.
                        // NOTE(review): a composite is appended to BUF without removing
                        // anything; for src_pos > 1 the starter has already been appended by an
                        // earlier iteration, so it looks like both the starter and its
                        // composite would end up in the output -- confirm (see also the FIXMEs
                        // below).
                        // NOTE(review): LAST_CLASS is not updated on the composing branch --
                        // confirm that this is intended.
     193              : string_t
     194            8 : recomp (string_t s)
     195              : {
     196              :   // compose hangul first
     197            8 :   s = compose_hangul (s);
     198              : 
     199            8 :   string_t buf;
     200            8 :   if (s.size () < 2)
     201            0 :     return s;
     202              : 
                          // -1 is lower than any value lookup_cc returns in this file, so the
                          // `last_class < ch_class' test below passes until a non-starter has
                          // been seen.
     203            8 :   int last_class = -1;
     204              :   // int starter_pos = 0; // Assume the first character is Starter. Correct?
     205              :   // int target_pos = 1;
     206            8 :   codepoint_t starter_ch = s[0];
     207              : 
     208           22 :   for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
     209              :     {
     210              :       // get current character
     211           14 :       codepoint_t ch = s[src_pos];
     212              : 
     213           14 :       int ch_class = lookup_cc (ch);
     214           14 :       tl::optional<codepoint_t> composite = lookup_recomp (starter_ch, ch);
     215           14 :       if (composite.has_value () && last_class < ch_class)
     216              :         {
     217              :           // ch can be composed
     218            4 :           buf.push_back (composite.value ());
     219            4 :           starter_ch = composite.value ();
     220              :         }
     221           10 :       else if (ch_class == 0)
     222              :         {
     223              :           // ch is Starter and cannot be composed.
                                  // s[0] is emitted here only when the very first pair did not
                                  // compose; it is never pushed before the loop.
     224            5 :           if (src_pos == 1)
     225              :             // FIXME: buggy?
     226            2 :             buf.push_back (starter_ch);
     227            5 :           starter_ch = ch;
     228            5 :           last_class = -1;
     229            5 :           buf.push_back (ch);
     230              :         }
     231              :       else
     232              :         {
     233            5 :           if (src_pos == 1)
     234              :             // FIXME: buggy?
     235            2 :             buf.push_back (starter_ch);
     236              :           // ch is not Starter.
     237            5 :           last_class = ch_class;
     238            5 :           buf.push_back (ch);
     239              :         }
     240              :     }
     241            8 :   return buf;
     242            8 : }
     243              : 
     244              : // see https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
                        //
                        // Scan S once and classify it:
                        //  - NO as soon as a codepoint has a non-zero combining class lower than
                        //    that of the preceding codepoint, or satisfies is_nfc_qc_no;
                        //  - otherwise MAYBE if any codepoint satisfies is_nfc_qc_maybe;
                        //  - otherwise YES.
                        // NOTE(review): string_t holds one codepoint_t per element, so the extra
                        // i++ for supplementary characters looks like it would skip the check
                        // of the following element -- confirm.  That line shows no executions
                        // in this report.
     245              : QuickCheckResult
     246       206100 : nfc_quick_check (const string_t &s)
     247              : {
     248       206100 :   int last_canonical_class = 0;
     249       206100 :   QuickCheckResult res = QuickCheckResult::YES;
     250              : 
     251      1087249 :   for (unsigned long i = 0; i < s.size (); i++)
     252              :     {
     253       881151 :       codepoint_t c = s[i];
     254              : 
     255       881151 :       if (c.is_supplementary_character ())
     256            0 :         i++;
     257              : 
     258       881151 :       int canonical_class = lookup_cc (c);
     259       881151 :       if (last_canonical_class > canonical_class && canonical_class != 0)
     260       206100 :         return QuickCheckResult::NO;
     261              : 
     262       881150 :       if (is_nfc_qc_no (c.value))
     263              :         return QuickCheckResult::NO;
     264              : 
     265       881149 :       if (is_nfc_qc_maybe (c.value))
     266           10 :         res = QuickCheckResult::MAYBE;
     267              : 
     268       881149 :       last_canonical_class = canonical_class;
     269              :     }
     270              :   return res;
     271              : }
     272              : 
                        // Return S normalized.  When nfc_quick_check reports YES a copy of S is
                        // returned as is; for both MAYBE and NO the whole string is decomposed
                        // (decomp_cano), reordered (sort_cano) and recomposed (recomp).
     273              : string_t
     274       206097 : nfc_normalize (const string_t &s)
     275              : {
     276       206097 :   if (nfc_quick_check (s) == QuickCheckResult::YES)
     277       206089 :     return s;
     278              : 
     279              :   // TODO: optimize normalization.
     280              :   // i.e. only normalize a limited area around MAYBE character, instead of
     281              :   // performing complete normalization of the entire string
     282              : 
     283              :   // decompose
     284            8 :   string_t d = decomp_cano (s);
     285            8 :   sort_cano (d);
     286              : 
     287              :   // recompose
     288            8 :   string_t r = recomp (d);
     289            8 :   return r;
     290            8 : }
     291              : 
                        // Return a new Utf8String built from the result of Rust::nfc_normalize
                        // applied to CHARS; this object is not modified (const member).
                        // NOTE(review): CHARS is presumably the member holding this string's
                        // codepoints -- its declaration is not visible here.
     292              : Utf8String
     293       206084 : Utf8String::nfc_normalize () const
     294              : {
     295       206084 :   return Utf8String (Rust::nfc_normalize (chars));
     296              : }
     297              : 
                        // Return true if CODEPOINT lies inside one of ALPHABETIC_RANGES.
     298              : bool
     299        56104 : is_alphabetic (uint32_t codepoint)
     300              : {
     301        56104 :   return binary_search_ranges (ALPHABETIC_RANGES, codepoint);
     302              : }
     303              : 
                        // Return true if CODEPOINT is an element of NUMERIC_CODEPOINTS.
                        // NOTE(review): std::binary_search presumably relies on that table
                        // being sorted in ascending order -- confirm against its definition.
     304              : bool
     305        10823 : is_numeric (uint32_t codepoint)
     306              : {
     307        10823 :   return std::binary_search (NUMERIC_CODEPOINTS.begin (),
     308        10823 :                              NUMERIC_CODEPOINTS.end (), codepoint);
     309              : }
     310              : 
                        // Return true if CODEPOINT lies inside one of NFC_QC_MAYBE_RANGES.
                        // nfc_quick_check reports MAYBE for a string containing such a codepoint
                        // unless another check makes it report NO.
     311              : bool
     312       881149 : is_nfc_qc_maybe (uint32_t codepoint)
     313              : {
     314       881149 :   return binary_search_ranges (NFC_QC_MAYBE_RANGES, codepoint);
     315              : }
     316              : 
                        // Return true if CODEPOINT lies inside one of NFC_QC_NO_RANGES.
                        // nfc_quick_check reports NO for any string containing such a codepoint.
     317              : bool
     318       881150 : is_nfc_qc_no (uint32_t codepoint)
     319              : {
     320       881150 :   return binary_search_ranges (NFC_QC_NO_RANGES, codepoint);
     321              : }
     322              : 
                        // Return true if no char of STR, converted to uint32_t, exceeds
                        // MAX_ASCII_CODEPOINT; an empty string yields true.
                        // Where plain char is signed, a negative char converts to a large
                        // unsigned value and is therefore rejected as well.
     323              : bool
     324           57 : is_ascii_only (const std::string &str)
     325              : {
     326          454 :   for (char c : str)
     327          403 :     if (static_cast<uint32_t> (c) > MAX_ASCII_CODEPOINT)
     328           57 :       return false;
     329              :   return true;
     330              : }
     331              : 
     332              : } // namespace Rust
     333              : 
     334              : #if CHECKING_P
     335              : 
     336              : namespace selftest {
     337              : 
                        // Selftest for Rust::nfc_quick_check: one input for each of the three
                        // results YES, MAYBE and NO.
     338              : void
     339            1 : rust_nfc_qc_test ()
     340              : {
     341            1 :   ASSERT_EQ (Rust::nfc_quick_check ({0x1e0a /* NFC_QC_YES */}),
     342              :              Rust::QuickCheckResult::YES);
     343            1 :   ASSERT_EQ (Rust::nfc_quick_check (
     344              :                {0x1e0a /* NFC_QC_YES */, 0x0323 /* NFC_QC_MAYBE */}),
     345              :              Rust::QuickCheckResult::MAYBE);
     346            1 :   ASSERT_EQ (Rust::nfc_quick_check ({0x0340 /* NFC_QC_NO */}),
     347              :              Rust::QuickCheckResult::NO);
     348            1 : }
     349              : 
                        // Test helper: run Rust::nfc_normalize on ORIGIN and assert that the
                        // result has the same length and the same codepoints as EXPECTED.
                        // The length is asserted first, before EXPECTED is indexed in the loop.
     350              : void
     351           13 : assert_normalize (const std::vector<Rust::Codepoint> origin,
     352              :                   const std::vector<Rust::Codepoint> expected)
     353              : {
     354           13 :   std::vector<Rust::Codepoint> actual = Rust::nfc_normalize (origin);
     355              : 
     356           13 :   ASSERT_EQ (actual.size (), expected.size ());
     357           44 :   for (unsigned int i = 0; i < actual.size (); i++)
     358              :     {
     359           31 :       ASSERT_EQ (actual[i], expected[i]);
     360              :     }
     361           13 : }
     362              : 
                        // Selftest for Rust::nfc_normalize.  Each assert_normalize call gives
                        // the input codepoints first and the expected output second.
     363              : void
     364            1 : rust_utf8_normalize_test ()
     365              : {
     366              :   // ASCII
     367            1 :   assert_normalize ({'h', 'e', 'l', 'l', 'o'}, {'h', 'e', 'l', 'l', 'o'});
     368              :   // ASCII
     369            1 :   assert_normalize ({'/', '\\', '.', ':', '*'}, {'/', '\\', '.', ':', '*'});
     370              : 
     371              :   // testcases retrieved from Part0 of
     372              :   // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
     373            1 :   assert_normalize ({0x1e0a}, {0x1e0a});
     374            1 :   assert_normalize ({0x1e0c}, {0x1e0c});
     375            1 :   assert_normalize ({0x1e0a, 0x0323}, {0x1e0c, 0x0307});
     376            1 :   assert_normalize ({0x1e0c, 0x0307}, {0x1e0c, 0x0307});
     377            1 :   assert_normalize ({0x0044, 0x0307, 0x0323}, {0x1e0c, 0x0307});
     378              : 
     379              :   // testcases for Hangul from Part0
     380            1 :   assert_normalize ({0x1100, 0xac00, 0x11a8}, {0x1100, 0xac01});
     381            1 :   assert_normalize ({0x1100, 0xac00, 0x11a8, 0x11a8}, {0x1100, 0xac01, 0x11a8});
     382              :   // testcases for Hangul from Part1
     383            1 :   assert_normalize ({0x3131}, {0x3131});
     384            1 :   assert_normalize ({0x3132}, {0x3132});
     385              :   // testcases for Hangul from Part3
     386            1 :   assert_normalize ({0x1100, 0x0334, 0x1161}, {0x1100, 0x0334, 0x1161});
     387            1 :   assert_normalize ({0xac54, 0x0334, 0x11ae}, {0xac54, 0x0334, 0x11ae});
     388              : 
     389              :   // TODO: add more testcases in
     390              :   // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
     391            1 : }
     392              : 
                        // Selftest for Rust::is_alphabetic and Rust::is_numeric, with positive
                        // and negative samples for each.
                        // NOTE(review): every line of this function shows zero executions in
                        // this report, i.e. it was not called during the measured run --
                        // confirm that it is registered with the selftest driver.
     393              : void
     394            0 : rust_utf8_property_test ()
     395              : {
     396            0 :   ASSERT_TRUE (Rust::is_alphabetic ('A'));
     397            0 :   ASSERT_TRUE (Rust::is_alphabetic ('B'));
     398            0 :   ASSERT_TRUE (Rust::is_alphabetic ('x'));
     399            0 :   ASSERT_TRUE (Rust::is_alphabetic ('z'));
     400            0 :   ASSERT_TRUE (Rust::is_alphabetic (0x00b5));  // µ
     401            0 :   ASSERT_TRUE (Rust::is_alphabetic (0x3093));  // ん
     402            0 :   ASSERT_TRUE (Rust::is_alphabetic (0xa8f2));  // ꣲ
     403            0 :   ASSERT_TRUE (Rust::is_alphabetic (0x2b743)); // 𫝃
     404              : 
     405            0 :   ASSERT_FALSE (Rust::is_alphabetic ('\v'));
     406            0 :   ASSERT_FALSE (Rust::is_alphabetic ('-'));
     407            0 :   ASSERT_FALSE (Rust::is_alphabetic ('_'));
     408            0 :   ASSERT_FALSE (Rust::is_alphabetic ('+'));
     409            0 :   ASSERT_FALSE (Rust::is_alphabetic ('0'));
     410            0 :   ASSERT_FALSE (Rust::is_alphabetic ('1'));
     411            0 :   ASSERT_FALSE (Rust::is_alphabetic ('2'));
     412            0 :   ASSERT_FALSE (Rust::is_alphabetic ('9'));
     413            0 :   ASSERT_FALSE (Rust::is_alphabetic (0xa720)); // ◌
     414            0 :   ASSERT_FALSE (Rust::is_alphabetic (0xaac1)); // ◌꫁
     415              : 
     416              :   // `Nd`s
     417            0 :   ASSERT_TRUE (Rust::is_numeric ('0'));
     418            0 :   ASSERT_TRUE (Rust::is_numeric ('1'));
     419            0 :   ASSERT_TRUE (Rust::is_numeric ('7'));
     420            0 :   ASSERT_TRUE (Rust::is_numeric ('9'));
     421            0 :   ASSERT_TRUE (Rust::is_numeric (0x07c2)); // ߂
     422            0 :   ASSERT_TRUE (Rust::is_numeric (0x096d)); // ७
     423              :   // `Nl`s
     424            0 :   ASSERT_TRUE (Rust::is_numeric (0x16e6));  // ᛮ
     425            0 :   ASSERT_TRUE (Rust::is_numeric (0xa6e6));  // ꛦ
     426            0 :   ASSERT_TRUE (Rust::is_numeric (0x12400)); // 𒐀
     427            0 :   ASSERT_TRUE (Rust::is_numeric (0x1243a)); // 𒐺
     428              :   // `No`s
     429            0 :   ASSERT_TRUE (Rust::is_numeric (0x00b2)); // ²
     430            0 :   ASSERT_TRUE (Rust::is_numeric (0x32b1)); // ㊱
     431              : 
     432            0 :   ASSERT_FALSE (Rust::is_numeric ('\n'));
     433            0 :   ASSERT_FALSE (Rust::is_numeric ('-'));
     434            0 :   ASSERT_FALSE (Rust::is_numeric ('_'));
     435            0 :   ASSERT_FALSE (Rust::is_numeric ('('));
     436            0 :   ASSERT_FALSE (Rust::is_numeric ('z'));
     437            0 :   ASSERT_FALSE (Rust::is_numeric (';'));
     438            0 :   ASSERT_FALSE (Rust::is_numeric (0x03f4)); // ϴ
     439            0 :   ASSERT_FALSE (Rust::is_numeric (0x0628)); // ب
     440            0 :   ASSERT_FALSE (Rust::is_numeric (0x0975)); // ॵ
     441            0 :   ASSERT_FALSE (Rust::is_numeric (0x18f0)); // ᣰ
     442            0 :   ASSERT_FALSE (Rust::is_numeric (0x2f30)); // ⼰
     443            0 : }
     444              : 
     445              : } // namespace selftest
     446              : 
     447              : #endif // CHECKING_P
        

Generated by: LCOV version 2.4-beta

The LCOV profile is generated on an x86_64 machine using the following configure options: configure --disable-bootstrap --enable-coverage=opt --enable-languages=c,c++,fortran,go,jit,lto,rust,m2 --enable-host-shared. The GCC test suite is run with the built compiler.