LCOV - code coverage report
Current view: top level - gcc/rust/lex - rust-input-source.h (source / functions) Coverage Total Hit
Test: gcc.info Lines: 87.7 % 73 64
Test Date: 2026-02-28 14:20:25 Functions: 100.0 % 7 7
Legend: Lines:     hit not hit

            Line data    Source code
       1              : // Copyright (C) 2020-2026 Free Software Foundation, Inc.
       2              : 
       3              : // This file is part of GCC.
       4              : 
       5              : // GCC is free software; you can redistribute it and/or modify it under
       6              : // the terms of the GNU General Public License as published by the Free
       7              : // Software Foundation; either version 3, or (at your option) any later
       8              : // version.
       9              : 
      10              : // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
      11              : // WARRANTY; without even the implied warranty of MERCHANTABILITY or
      12              : // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      13              : // for more details.
      14              : 
      15              : // You should have received a copy of the GNU General Public License
      16              : // along with GCC; see the file COPYING3.  If not see
      17              : // <http://www.gnu.org/licenses/>.
      18              : 
      19              : #ifndef RUST_INPUT_SOURCE_H
      20              : #define RUST_INPUT_SOURCE_H
      21              : 
      22              : #include "rust-codepoint.h"
      23              : #include "optional.h"
      24              : 
      25              : namespace Rust {
      26              : 
      27              : constexpr uint8_t UTF8_BOM1 = 0xEF; // 1st byte of the UTF-8 byte order mark (EF BB BF)
      28              : constexpr uint8_t UTF8_BOM2 = 0xBB; // 2nd byte of the UTF-8 byte order mark
      29              : constexpr uint8_t UTF8_BOM3 = 0xBF; // 3rd byte of the UTF-8 byte order mark
      30              : 
      31              : // Abstract source of codepoints: decodes the bytes supplied by next_byte () as UTF-8 up front (see init ()).
      32              : class InputSource
      33              : {
      34              : private:
      35              :   // index into `chars` of the codepoint the next call to next () returns
      36              :   unsigned int pos;
      37              :   std::vector<Codepoint> chars; // codepoints decoded by init ()
      38              :   bool is_valid_utf8; // set true by the constructor, cleared by init () on an invalid sequence
      39              : 
      40              :   // Returns the next raw input byte, or EOF once the input is exhausted. Implemented by subclasses.
      41              :   virtual int next_byte () = 0;
      42              :   // Decodes one UTF-8 sequence from next_byte (): yields the codepoint, Codepoint::eof () at end of input, or CODEPOINT_INVALID when malformed.
      43      6250154 :   Codepoint next_codepoint ()
      44              :   {
      45      6250183 :     uint32_t input = next_byte ();
      46              : 
      47      6250183 :     if ((int32_t) input == EOF)
      48       320305 :       return Codepoint::eof ();
      49      5929878 :     else if (input <= MAX_ASCII_CODEPOINT)
      50              :       {
      51              :         // ASCII -- the byte is the codepoint
      52      5928818 :         return {input};
      53              :       }
      54         1060 :     else if ((input & 0xC0) == 0x80)
      55              :       {
      56              :         // invalid: a continuation byte (10xxxxxx) cannot start a sequence
      57            0 :         return {CODEPOINT_INVALID};
      58              :       }
      59         1060 :     else if ((input & 0xE0) == 0xC0)
      60              :       {
      61              :         // 2-byte sequence: 110xxxxx 10xxxxxx
      62          628 :         uint8_t input2 = next_byte ();
      63          628 :         if ((input2 & 0xC0) != 0x80)
      64            0 :           return {CODEPOINT_INVALID};
      65              : 
      66          628 :         uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
      67          628 :         return output;
      68              :       }
      69          432 :     else if ((input & 0xF0) == 0xE0)
      70              :       {
      71              :         // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx) or the UTF-8 BOM
      72          424 :         uint8_t input2 = next_byte ();
      73              :         // EF BB BF is the BOM: it is dropped, wherever it occurs, and decoding
      74              :         // continues with what follows. NOTE(review): EF BB + any other byte is
      75              :         // reported invalid, which also rejects U+FEC0..U+FEFE -- confirm intent.
      76          424 :         if (input == UTF8_BOM1 && input2 == UTF8_BOM2)
      77              :           {
      78           29 :             uint8_t input3 = next_byte ();
      79           29 :             if (input3 == UTF8_BOM3)
      80              :               // found BOM: skip it and return the codepoint after it
      81              :               return next_codepoint ();
      82              :             else
      83            0 :               return {CODEPOINT_INVALID};
      84              :           }
      85              : 
      86          395 :         if ((input2 & 0xC0) != 0x80)
      87            0 :           return {CODEPOINT_INVALID};
      88              : 
      89          395 :         uint8_t input3 = next_byte ();
      90              : 
      91          395 :         if ((input3 & 0xC0) != 0x80)
      92            0 :           return {CODEPOINT_INVALID};
      93              : 
      94          395 :         uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
      95          395 :                           | ((input3 & 0x3F) << 0);
      96          395 :         return {output};
      97              :       }
      98            8 :     else if ((input & 0xF8) == 0xF0)
      99              :       {
     100              :         // 4-byte sequence; NOTE(review): no range check against U+10FFFF is done here
     101            7 :         uint8_t input2 = next_byte ();
     102            7 :         if ((input2 & 0xC0) != 0x80)
     103            0 :           return {CODEPOINT_INVALID};
     104              : 
     105            7 :         uint8_t input3 = next_byte ();
     106            7 :         if ((input3 & 0xC0) != 0x80)
     107            0 :           return {CODEPOINT_INVALID};
     108              : 
     109            7 :         uint8_t input4 = next_byte ();
     110            7 :         if ((input4 & 0xC0) != 0x80)
     111            0 :           return {CODEPOINT_INVALID};
     112              : 
     113            7 :         uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
     114            7 :                           | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
     115            7 :         return {output};
     116              :       }
     117              :     else
     118              :       {
     119            1 :         return {CODEPOINT_INVALID};
     120              :       }
     121              :   }
     122              : 
     123              : protected:
     124              :   // Must be called from each derived class's constructor to fill `chars`.
     125              :   // It cannot run in the InputSource constructor because it reaches the
     126              :   // pure virtual next_byte () through next_codepoint ().
     127       320306 :   void init ()
     128              :   {
     129              :     // Decode until EOF or the first invalid sequence, appending every
     130              :     // codepoint decoded before that point to `chars`.
     131       320306 :     Codepoint char32 = next_codepoint ();
     132      6570460 :     while (!char32.is_eof () && char32 != CODEPOINT_INVALID)
     133              :       {
     134      5929848 :         chars.push_back (char32);
     135      5929848 :         char32 = next_codepoint ();
     136              :       }
     137              : 
     138       320306 :     if (char32 == CODEPOINT_INVALID)
     139              :       {
     140              :         // Decoding stopped on an invalid sequence: input is not valid UTF-8.
     141            1 :         is_valid_utf8 = false;
     142              :       }
     143       320306 :   }
     144              : 
     145              : public:
     146       320306 :   InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {} // empty and presumed valid until init () runs
     147              : 
     148       631026 :   virtual ~InputSource () {}
     149              : 
     150              :   // True unless init () stopped on an invalid UTF-8 sequence.
     151       320142 :   bool is_valid () { return is_valid_utf8; }
     152              : 
     153              :   // Returns the codepoint at `pos` and advances; Codepoint::eof () once past the end of `chars`.
     154      3489012 :   Codepoint next ()
     155              :   {
     156      3489012 :     if (pos >= chars.size ())
     157         5111 :       return Codepoint::eof ();
     158              :     else
     159              :       {
     160      3483901 :         Codepoint c = chars[pos];
     161      3483901 :         pos++;
     162      3483901 :         return c;
     163              :       }
     164              :   }
     165              : 
     166              :   // Returns a copy of all decoded codepoints if the input was valid UTF-8.
     167              :   // Returns nullopt otherwise.
     168       315507 :   tl::optional<std::vector<Codepoint>> get_chars ()
     169              :   {
     170       315507 :     if (is_valid ())
     171       315507 :       return {chars};
     172              :     else
     173            0 :       return tl::nullopt;
     174              :   }
     175              : };
     176              : 
     177            2 : class FileInputSource : public InputSource // InputSource fed byte-by-byte from a C FILE stream
     178              : {
     179              : private:
     180              :   // Stream the bytes are read from; nothing in this class closes it.
     181              :   FILE *input;
     182              :   // Next byte from the stream; fgetc returns EOF at end of file or on a read error.
     183      3493756 :   int next_byte () override { return fgetc (input); }
     184              : 
     185              : public:
     186              :   // Create a new input source reading from `input`; init () decodes it right away.
     187         4692 :   FileInputSource (FILE *input) : InputSource (), input (input) { init (); }
     188              : };
     189              : 
     190       315511 : class BufferInputSource : public InputSource // InputSource fed from an in-memory std::string
     191              : {
     192              : private:
     193              :   const std::string &buffer; // held by reference, so the caller's string must stay alive while next_byte () can run
     194              :   size_t offs; // index in `buffer` of the next byte to hand out
     195              :   // Next byte of `buffer`, or EOF once `offs` reaches its end.
     196      2757924 :   int next_byte () override
     197              :   {
     198      2757924 :     if (offs >= buffer.size ())
     199              :       return EOF;
     200      2442310 :     return static_cast<uint8_t> (buffer.at (offs++)); // through uint8_t so bytes >= 0x80 are not returned as negative values
     201              :   }
     202              : 
     203              : public:
     204              :   // Create new input source from buffer `b`, decoding from byte offset `offset`.
     205       315614 :   BufferInputSource (const std::string &b, size_t offset)
     206       315614 :     : InputSource (), buffer (b), offs (offset)
     207              :   {
     208       315614 :     init ();
     209              :   }
     210              : };
     211              : 
     212              : } // namespace Rust
     213              : 
     214              : #endif
        

Generated by: LCOV version 2.4-beta

LCOV profile is generated on an x86_64 machine using the following configure options: configure --disable-bootstrap --enable-coverage=opt --enable-languages=c,c++,fortran,go,jit,lto,rust,m2 --enable-host-shared. The GCC test suite is run with the built compiler.