LCOV - gcc.info - gcc/rust/util/rust-unicode.cc

LCOV - code coverage report

Current view:	top level - gcc/rust/util - rust-unicode.cc (source / functions)		Coverage	Total	Hit
Test:	gcc.info	Lines:	77.7 %	215	167
Test Date:	2024-04-27 14:03:13	Functions:	95.5 %	22	21
Legend:	Lines: hit not hit \| Branches: + taken - not taken # not executed	Branches:	-	0	0

             Branch data     Line data    Source code

       1                 :             : // Copyright (C) 2020-2024 Free Software Foundation, Inc.
       2                 :             : 
       3                 :             : // This file is part of GCC.
       4                 :             : 
       5                 :             : // GCC is free software; you can redistribute it and/or modify it under
       6                 :             : // the terms of the GNU General Public License as published by the Free
       7                 :             : // Software Foundation; either version 3, or (at your option) any later
       8                 :             : // version.
       9                 :             : 
      10                 :             : // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
      11                 :             : // WARRANTY; without even the implied warranty of MERCHANTABILITY or
      12                 :             : // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      13                 :             : // for more details.
      14                 :             : 
      15                 :             : // You should have received a copy of the GNU General Public License
      16                 :             : // along with GCC; see the file COPYING3.  If not see
      17                 :             : // <http://www.gnu.org/licenses/>.
      18                 :             : 
      19                 :             : #include "rust-input-source.h"
      20                 :             : #include "rust-system.h"
      21                 :             : #include "optional.h"
      22                 :             : #include "selftest.h"
      23                 :             : #include "rust-lex.h"
      24                 :             : #include "rust-unicode.h"
      25                 :             : 
      26                 :             : #include "rust-unicode-data.h"
      27                 :             : 
      28                 :             : namespace Rust {
      29                 :             : 
      30                 :             : typedef Codepoint codepoint_t;
      31                 :             : typedef std::vector<codepoint_t> string_t;
      32                 :             : 
      33                 :             : // These constants are used to compose and decompose Hangul syllables.
      34                 :             : // See `Sample Code for Hangul Algorithms` in Section 3.12 of
      35                 :             : // unicode.org/versions/Unicode15.0.0/ch03.pdf
      36                 :             : const uint32_t S_BASE = 0xAC00;
      37                 :             : const uint32_t L_BASE = 0x1100, V_BASE = 0x1161, T_BASE = 0x11A7;
      38                 :             : const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28;
      39                 :             : const uint32_t N_COUNT = V_COUNT * T_COUNT;
      40                 :             : const uint32_t S_COUNT = L_COUNT * N_COUNT;
      41                 :             : 
      42                 :             : // Check if the codepoint is in any of the ranges (half-open intervals [a,b)).
      43                 :             : template <std::size_t SIZE>
      44                 :             : bool
      45                 :      817787 : binary_search_ranges (
      46                 :             :   const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
      47                 :             :   uint32_t target_cp)
      48                 :             : {
      49                 :      817787 :   auto it = std::lower_bound (ranges.begin (), ranges.end (), target_cp,
      50                 :     5507906 :                               [] (const std::pair<uint32_t, uint32_t> &a,
      51                 :     5507906 :                                   uint32_t b) { return a.second <= b; });
      52                 :      817787 :   if (it == ranges.end ())
      53                 :             :     return false;
      54                 :             :   else
      55                 :     1600610 :     return it->first <= target_cp && target_cp < it->second;
      56                 :             : }
      57                 :             : 
      58                 :             : int
      59                 :      387571 : lookup_cc (codepoint_t c)
      60                 :             : {
      61                 :      387571 :   auto it = CCC_TABLE.find (c.value);
      62                 :      387571 :   if (it != CCC_TABLE.end ())
      63                 :          38 :     return it->second;
      64                 :             :   else
      65                 :             :     // Starter. Returns zero.
      66                 :             :     return 0;
      67                 :             : }
      68                 :             : 
      69                 :             : tl::optional<codepoint_t>
      70                 :          14 : lookup_recomp (codepoint_t starter, codepoint_t c)
      71                 :             : {
      72                 :          14 :   auto it = Rust::RECOMPOSITION_MAP.find ({starter.value, c.value});
      73                 :          14 :   if (it != Rust::RECOMPOSITION_MAP.end ())
      74                 :           4 :     return {it->second};
      75                 :             : 
      76                 :          10 :   it = Rust::RECOMPOSITION_MAP.find ({starter.value, 0});
      77                 :          10 :   if (it != Rust::RECOMPOSITION_MAP.end ())
      78                 :           0 :     return {it->second};
      79                 :             : 
      80                 :          10 :   return tl::nullopt;
      81                 :             : }
      82                 :             : 
      83                 :             : void
      84                 :          23 : recursive_decomp_cano (codepoint_t c, string_t &buf)
      85                 :             : {
      86                 :          23 :   auto it = Rust::DECOMPOSITION_MAP.find (c.value);
      87                 :          23 :   if (it != Rust::DECOMPOSITION_MAP.end ())
      88                 :             :     {
      89                 :           2 :       std::vector<uint32_t> decomped = it->second;
      90                 :           6 :       for (uint32_t cp : decomped)
      91                 :           4 :         recursive_decomp_cano (cp, buf);
      92                 :           2 :     }
      93                 :             :   else
      94                 :          21 :     buf.push_back (c);
      95                 :          23 : }
      96                 :             : 
      97                 :             : string_t
      98                 :           8 : decomp_cano (string_t s)
      99                 :             : {
     100                 :           8 :   string_t buf;
     101                 :          30 :   for (codepoint_t c : s)
     102                 :             :     {
     103                 :          22 :       int64_t s_index = c.value - S_BASE;
     104                 :          22 :       if (0 <= s_index && s_index < S_COUNT)
     105                 :             :         {
     106                 :             :           // decompose Hangul algorithmically
     107                 :           3 :           uint32_t l = L_BASE + s_index / N_COUNT;
     108                 :           3 :           uint32_t v = V_BASE + (s_index % N_COUNT) / T_COUNT;
     109                 :           3 :           uint32_t t = T_BASE + s_index % T_COUNT;
     110                 :           3 :           buf.push_back (l);
     111                 :           3 :           buf.push_back (v);
     112                 :           3 :           if (t != T_BASE)
     113                 :           0 :             buf.push_back (t);
     114                 :           3 :           continue;
     115                 :           3 :         }
     116                 :             : 
     117                 :             :       // Current character is not hangul
     118                 :          19 :       recursive_decomp_cano (c, buf);
     119                 :             :     }
     120                 :           8 :   return buf;
     121                 :             : }
     122                 :             : 
     123                 :             : void
     124                 :           8 : sort_cano (string_t &s)
     125                 :             : {
     126                 :           8 :   int cc_here, cc_prev;
     127                 :           8 :   if (s.size () == 1)
     128                 :             :     return;
     129                 :          31 :   for (unsigned int i = 1; i < s.size (); i++)
     130                 :             :     {
     131                 :          23 :       cc_here = lookup_cc (s[i]);
     132                 :          23 :       cc_prev = lookup_cc (s[i - 1]);
     133                 :          23 :       if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
     134                 :             :         {
     135                 :             :           // swap
     136                 :           2 :           codepoint_t tmp = s[i];
     137                 :           2 :           s[i] = s[i - 1];
     138                 :           2 :           s[i - 1] = tmp;
     139                 :           2 :           if (i > 1)
     140                 :           2 :             i -= 2;
     141                 :             :         }
     142                 :             :     }
     143                 :             : }
     144                 :             : 
     145                 :             : string_t
     146                 :           8 : compose_hangul (string_t s)
     147                 :             : {
     148                 :           8 :   string_t buf;
     149                 :           8 :   if (s.size () < 2)
     150                 :           0 :     return s;
     151                 :             : 
     152                 :           8 :   codepoint_t last = s[0];
     153                 :           8 :   buf.push_back (last);
     154                 :          27 :   for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
     155                 :             :     {
     156                 :          19 :       codepoint_t ch = s[src_pos];
     157                 :             : 
     158                 :             :       // L V => LV
     159                 :          19 :       int64_t l_index = last.value - L_BASE;
     160                 :          19 :       if (0 <= l_index && l_index < L_COUNT)
     161                 :             :         {
     162                 :           6 :           int64_t v_index = ch.value - V_BASE;
     163                 :           6 :           if (0 <= v_index && v_index < V_COUNT)
     164                 :             :             {
     165                 :           3 :               last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
     166                 :             :               // pop L
     167                 :           3 :               buf.pop_back ();
     168                 :           3 :               buf.push_back (last);
     169                 :          19 :               continue;
     170                 :             :             }
     171                 :             :         }
     172                 :             : 
     173                 :             :       // LV T => LVT
     174                 :          16 :       int64_t s_index = last.value - S_BASE;
     175                 :          16 :       if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0)
     176                 :             :         {
     177                 :           3 :           int64_t t_index = ch.value - T_BASE;
     178                 :           3 :           if (0 < t_index && t_index < T_COUNT)
     179                 :             :             {
     180                 :           2 :               last.value += t_index;
     181                 :             :               // pop LV
     182                 :           2 :               buf.pop_back ();
     183                 :           2 :               buf.push_back (last);
     184                 :           2 :               continue;
     185                 :             :             }
     186                 :             :         }
     187                 :          14 :       last = ch;
     188                 :          14 :       buf.push_back (last);
     189                 :             :     }
     190                 :           8 :   return buf;
     191                 :           8 : }
     192                 :             : 
     193                 :             : string_t
     194                 :           8 : recomp (string_t s)
     195                 :             : {
     196                 :             :   // compose hangul first
     197                 :           8 :   s = compose_hangul (s);
     198                 :             : 
     199                 :           8 :   string_t buf;
     200                 :           8 :   if (s.size () < 2)
     201                 :           0 :     return s;
     202                 :             : 
     203                 :           8 :   int last_class = -1;
     204                 :             :   // int starter_pos = 0; // Assume the first character is Starter. Correct?
     205                 :             :   // int target_pos = 1;
     206                 :           8 :   codepoint_t starter_ch = s[0];
     207                 :             : 
     208                 :          22 :   for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
     209                 :             :     {
     210                 :             :       // get current character
     211                 :          14 :       codepoint_t ch = s[src_pos];
     212                 :             : 
     213                 :          14 :       int ch_class = lookup_cc (ch);
     214                 :          14 :       tl::optional<codepoint_t> composite = lookup_recomp (starter_ch, ch);
     215                 :          14 :       if (composite.has_value () && last_class < ch_class)
     216                 :             :         {
     217                 :             :           // ch can be composed
     218                 :           4 :           buf.push_back (composite.value ());
     219                 :           4 :           starter_ch = composite.value ();
     220                 :             :         }
     221                 :          10 :       else if (ch_class == 0)
     222                 :             :         {
     223                 :             :           // ch is Starter and cannot be composed.
     224                 :           5 :           if (src_pos == 1)
     225                 :             :             // FIXME: buggy?
     226                 :           2 :             buf.push_back (starter_ch);
     227                 :           5 :           starter_ch = ch;
     228                 :           5 :           last_class = -1;
     229                 :           5 :           buf.push_back (ch);
     230                 :             :         }
     231                 :             :       else
     232                 :             :         {
     233                 :           5 :           if (src_pos == 1)
     234                 :             :             // FIXME: buggy?
     235                 :           2 :             buf.push_back (starter_ch);
     236                 :             :           // ch is not Starter.
     237                 :           5 :           last_class = ch_class;
     238                 :           5 :           buf.push_back (ch);
     239                 :             :         }
     240                 :             :     }
     241                 :           8 :   return buf;
     242                 :           8 : }
     243                 :             : 
     244                 :             : // see https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
     245                 :             : QuickCheckResult
     246                 :      108586 : nfc_quick_check (const string_t &s)
     247                 :             : {
     248                 :      108586 :   int last_canonical_class = 0;
     249                 :      108586 :   QuickCheckResult res = QuickCheckResult::YES;
     250                 :             : 
     251                 :      496095 :   for (unsigned long i = 0; i < s.size (); i++)
     252                 :             :     {
     253                 :      387511 :       codepoint_t c = s[i];
     254                 :             : 
     255                 :      387511 :       if (c.is_supplementary_character ())
     256                 :           0 :         i++;
     257                 :             : 
     258                 :      387511 :       int canonical_class = lookup_cc (c);
     259                 :      387511 :       if (last_canonical_class > canonical_class && canonical_class != 0)
     260                 :      108586 :         return QuickCheckResult::NO;
     261                 :             : 
     262                 :      387510 :       if (is_nfc_qc_no (c.value))
     263                 :             :         return QuickCheckResult::NO;
     264                 :             : 
     265                 :      387509 :       if (is_nfc_qc_maybe (c.value))
     266                 :          10 :         res = QuickCheckResult::MAYBE;
     267                 :             : 
     268                 :      387509 :       last_canonical_class = canonical_class;
     269                 :             :     }
     270                 :             :   return res;
     271                 :             : }
     272                 :             : 
     273                 :             : string_t
     274                 :      108583 : nfc_normalize (const string_t &s)
     275                 :             : {
     276                 :      108583 :   if (nfc_quick_check (s) == QuickCheckResult::YES)
     277                 :      108575 :     return s;
     278                 :             : 
     279                 :             :   // TODO: optimize normalization.
     280                 :             :   // i.e. only normalize a limited area around MAYBE character, instead of
     281                 :             :   // performing complete normalization of the entire string
     282                 :             : 
     283                 :             :   // decompose
     284                 :           8 :   string_t d = decomp_cano (s);
     285                 :           8 :   sort_cano (d);
     286                 :             : 
     287                 :             :   // recompose
     288                 :           8 :   string_t r = recomp (d);
     289                 :           8 :   return r;
     290                 :           8 : }
     291                 :             : 
     292                 :             : Utf8String
     293                 :      108570 : Utf8String::nfc_normalize () const
     294                 :             : {
     295                 :      108570 :   return Utf8String (Rust::nfc_normalize (chars));
     296                 :             : }
     297                 :             : 
     298                 :             : bool
     299                 :       42768 : is_alphabetic (uint32_t codepoint)
     300                 :             : {
     301                 :       42768 :   return binary_search_ranges (ALPHABETIC_RANGES, codepoint);
     302                 :             : }
     303                 :             : 
     304                 :             : bool
     305                 :        7815 : is_numeric (uint32_t codepoint)
     306                 :             : {
     307                 :        7815 :   return std::binary_search (NUMERIC_CODEPOINTS.begin (),
     308                 :        7815 :                              NUMERIC_CODEPOINTS.end (), codepoint);
     309                 :             : }
     310                 :             : 
     311                 :             : bool
     312                 :      387509 : is_nfc_qc_maybe (uint32_t codepoint)
     313                 :             : {
     314                 :      387509 :   return binary_search_ranges (NFC_QC_MAYBE_RANGES, codepoint);
     315                 :             : }
     316                 :             : 
     317                 :             : bool
     318                 :      387510 : is_nfc_qc_no (uint32_t codepoint)
     319                 :             : {
     320                 :      387510 :   return binary_search_ranges (NFC_QC_NO_RANGES, codepoint);
     321                 :             : }
     322                 :             : 
     323                 :             : bool
     324                 :          57 : is_ascii_only (const std::string &str)
     325                 :             : {
     326                 :         454 :   for (char c : str)
     327                 :         403 :     if (static_cast<uint32_t> (c) > MAX_ASCII_CODEPOINT)
     328                 :          57 :       return false;
     329                 :             :   return true;
     330                 :             : }
     331                 :             : 
     332                 :             : } // namespace Rust
     333                 :             : 
     334                 :             : #if CHECKING_P
     335                 :             : 
     336                 :             : namespace selftest {
     337                 :             : 
     338                 :             : void
     339                 :           1 : rust_nfc_qc_test ()
     340                 :             : {
     341                 :           1 :   ASSERT_EQ (Rust::nfc_quick_check ({0x1e0a /* NFC_QC_YES */}),
     342                 :             :              Rust::QuickCheckResult::YES);
     343                 :           1 :   ASSERT_EQ (Rust::nfc_quick_check (
     344                 :             :                {0x1e0a /* NFC_QC_YES */, 0x0323 /* NFC_QC_MAYBE */}),
     345                 :             :              Rust::QuickCheckResult::MAYBE);
     346                 :           1 :   ASSERT_EQ (Rust::nfc_quick_check ({0x0340 /* NFC_QC_NO */}),
     347                 :             :              Rust::QuickCheckResult::NO);
     348                 :           1 : }
     349                 :             : 
     350                 :             : void
     351                 :          13 : assert_normalize (const std::vector<Rust::Codepoint> origin,
     352                 :             :                   const std::vector<Rust::Codepoint> expected)
     353                 :             : {
     354                 :          13 :   std::vector<Rust::Codepoint> actual = Rust::nfc_normalize (origin);
     355                 :             : 
     356                 :          13 :   ASSERT_EQ (actual.size (), expected.size ());
     357                 :          44 :   for (unsigned int i = 0; i < actual.size (); i++)
     358                 :             :     {
     359                 :          31 :       ASSERT_EQ (actual[i], expected[i]);
     360                 :             :     }
     361                 :          13 : }
     362                 :             : 
     363                 :             : void
     364                 :           1 : rust_utf8_normalize_test ()
     365                 :             : {
     366                 :             :   // ASCII
     367                 :           1 :   assert_normalize ({'h', 'e', 'l', 'l', 'o'}, {'h', 'e', 'l', 'l', 'o'});
     368                 :             :   // ASCII
     369                 :           1 :   assert_normalize ({'/', '\\', '.', ':', '*'}, {'/', '\\', '.', ':', '*'});
     370                 :             : 
     371                 :             :   // testcases retrieved from Part0 of
     372                 :             :   // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
     373                 :           1 :   assert_normalize ({0x1e0a}, {0x1e0a});
     374                 :           1 :   assert_normalize ({0x1e0c}, {0x1e0c});
     375                 :           1 :   assert_normalize ({0x1e0a, 0x0323}, {0x1e0c, 0x0307});
     376                 :           1 :   assert_normalize ({0x1e0c, 0x0307}, {0x1e0c, 0x0307});
     377                 :           1 :   assert_normalize ({0x0044, 0x0307, 0x0323}, {0x1e0c, 0x0307});
     378                 :             : 
     379                 :             :   // testcases for Hangul from Part0
     380                 :           1 :   assert_normalize ({0x1100, 0xac00, 0x11a8}, {0x1100, 0xac01});
     381                 :           1 :   assert_normalize ({0x1100, 0xac00, 0x11a8, 0x11a8}, {0x1100, 0xac01, 0x11a8});
     382                 :             :   // testcases for Hangul from Part1
     383                 :           1 :   assert_normalize ({0x3131}, {0x3131});
     384                 :           1 :   assert_normalize ({0x3132}, {0x3132});
     385                 :             :   // testcases for Hangul from Part3
     386                 :           1 :   assert_normalize ({0x1100, 0x0334, 0x1161}, {0x1100, 0x0334, 0x1161});
     387                 :           1 :   assert_normalize ({0xac54, 0x0334, 0x11ae}, {0xac54, 0x0334, 0x11ae});
     388                 :             : 
     389                 :             :   // TODO: add more testcases in
     390                 :             :   // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
     391                 :           1 : }
     392                 :             : 
     393                 :             : void
     394                 :           0 : rust_utf8_property_test ()
     395                 :             : {
     396                 :           0 :   ASSERT_TRUE (Rust::is_alphabetic ('A'));
     397                 :           0 :   ASSERT_TRUE (Rust::is_alphabetic ('B'));
     398                 :           0 :   ASSERT_TRUE (Rust::is_alphabetic ('x'));
     399                 :           0 :   ASSERT_TRUE (Rust::is_alphabetic ('z'));
     400                 :           0 :   ASSERT_TRUE (Rust::is_alphabetic (0x00b5));  // µ
     401                 :           0 :   ASSERT_TRUE (Rust::is_alphabetic (0x3093));  // ん
     402                 :           0 :   ASSERT_TRUE (Rust::is_alphabetic (0xa8f2));  // ꣲ
     403                 :           0 :   ASSERT_TRUE (Rust::is_alphabetic (0x2b743)); // 𫝃
     404                 :             : 
     405                 :           0 :   ASSERT_FALSE (Rust::is_alphabetic ('\v'));
     406                 :           0 :   ASSERT_FALSE (Rust::is_alphabetic ('-'));
     407                 :           0 :   ASSERT_FALSE (Rust::is_alphabetic ('_'));
     408                 :           0 :   ASSERT_FALSE (Rust::is_alphabetic ('+'));
     409                 :           0 :   ASSERT_FALSE (Rust::is_alphabetic ('0'));
     410                 :           0 :   ASSERT_FALSE (Rust::is_alphabetic ('1'));
     411                 :           0 :   ASSERT_FALSE (Rust::is_alphabetic ('2'));
     412                 :           0 :   ASSERT_FALSE (Rust::is_alphabetic ('9'));
     413                 :           0 :   ASSERT_FALSE (Rust::is_alphabetic (0xa720)); // ◌
     414                 :           0 :   ASSERT_FALSE (Rust::is_alphabetic (0xaac1)); // ◌꫁
     415                 :             : 
     416                 :             :   // `Nd`s
     417                 :           0 :   ASSERT_TRUE (Rust::is_numeric ('0'));
     418                 :           0 :   ASSERT_TRUE (Rust::is_numeric ('1'));
     419                 :           0 :   ASSERT_TRUE (Rust::is_numeric ('7'));
     420                 :           0 :   ASSERT_TRUE (Rust::is_numeric ('9'));
     421                 :           0 :   ASSERT_TRUE (Rust::is_numeric (0x07c2)); // ߂
     422                 :           0 :   ASSERT_TRUE (Rust::is_numeric (0x096d)); // ७
     423                 :             :   // `Nl`s
     424                 :           0 :   ASSERT_TRUE (Rust::is_numeric (0x16e6));  // ᛮ
     425                 :           0 :   ASSERT_TRUE (Rust::is_numeric (0xa6e6));  // ꛦ
     426                 :           0 :   ASSERT_TRUE (Rust::is_numeric (0x12400)); // 𒐀
     427                 :           0 :   ASSERT_TRUE (Rust::is_numeric (0x1243a)); // 𒐺
     428                 :             :   // `No`s
     429                 :           0 :   ASSERT_TRUE (Rust::is_numeric (0x00b2)); // ²
     430                 :           0 :   ASSERT_TRUE (Rust::is_numeric (0x32b1)); // ㊱
     431                 :             : 
     432                 :           0 :   ASSERT_FALSE (Rust::is_numeric ('\n'));
     433                 :           0 :   ASSERT_FALSE (Rust::is_numeric ('-'));
     434                 :           0 :   ASSERT_FALSE (Rust::is_numeric ('_'));
     435                 :           0 :   ASSERT_FALSE (Rust::is_numeric ('('));
     436                 :           0 :   ASSERT_FALSE (Rust::is_numeric ('z'));
     437                 :           0 :   ASSERT_FALSE (Rust::is_numeric (';'));
     438                 :           0 :   ASSERT_FALSE (Rust::is_numeric (0x03f4)); // ϴ
     439                 :           0 :   ASSERT_FALSE (Rust::is_numeric (0x0628)); // ب
     440                 :           0 :   ASSERT_FALSE (Rust::is_numeric (0x0975)); // ॵ
     441                 :           0 :   ASSERT_FALSE (Rust::is_numeric (0x18f0)); // ᣰ
     442                 :           0 :   ASSERT_FALSE (Rust::is_numeric (0x2f30)); // ⼰
     443                 :           0 : }
     444                 :             : 
     445                 :             : } // namespace selftest
     446                 :             : 
     447                 :             : #endif // CHECKING_P

Generated by: LCOV version 2.1-beta

LCOV profile is generated on an x86_64 machine using the following configure options: configure --disable-bootstrap --enable-coverage=opt --enable-languages=c,c++,fortran,go,jit,lto,rust,m2 --enable-host-shared. The GCC test suite is run with the built compiler.