Line data Source code
1 : // Copyright (C) 2020-2026 Free Software Foundation, Inc.
2 :
3 : // This file is part of GCC.
4 :
5 : // GCC is free software; you can redistribute it and/or modify it under
6 : // the terms of the GNU General Public License as published by the Free
7 : // Software Foundation; either version 3, or (at your option) any later
8 : // version.
9 :
10 : // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11 : // WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 : // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 : // for more details.
14 :
15 : // You should have received a copy of the GNU General Public License
16 : // along with GCC; see the file COPYING3. If not see
17 : // <http://www.gnu.org/licenses/>.
18 :
19 : #ifndef RUST_INPUT_SOURCE_H
20 : #define RUST_INPUT_SOURCE_H
21 :
22 : #include "rust-codepoint.h"
23 : #include "optional.h"
24 :
25 : namespace Rust {
26 :
// The three bytes of the UTF-8 encoded byte order mark (EF BB BF), in the
// order they appear in the input.
constexpr uint8_t UTF8_BOM1{0xEF};
constexpr uint8_t UTF8_BOM2{0xBB};
constexpr uint8_t UTF8_BOM3{0xBF};
30 :
31 : // Input source wrapper thing.
32 : class InputSource
33 : {
34 : private:
35 : // position of current character
36 : unsigned int pos;
37 : std::vector<Codepoint> chars;
38 : bool is_valid_utf8;
39 :
40 : // Overload operator () to return next char from input stream.
41 : virtual int next_byte () = 0;
42 :
43 6250154 : Codepoint next_codepoint ()
44 : {
45 6250183 : uint32_t input = next_byte ();
46 :
47 6250183 : if ((int32_t) input == EOF)
48 320305 : return Codepoint::eof ();
49 5929878 : else if (input <= MAX_ASCII_CODEPOINT)
50 : {
51 : // ascii -- 1 byte
52 5928818 : return {input};
53 : }
54 1060 : else if ((input & 0xC0) == 0x80)
55 : {
56 : // invalid (continuation; can't be first char)
57 0 : return {CODEPOINT_INVALID};
58 : }
59 1060 : else if ((input & 0xE0) == 0xC0)
60 : {
61 : // 2 bytes
62 628 : uint8_t input2 = next_byte ();
63 628 : if ((input2 & 0xC0) != 0x80)
64 0 : return {CODEPOINT_INVALID};
65 :
66 628 : uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
67 628 : return output;
68 : }
69 432 : else if ((input & 0xF0) == 0xE0)
70 : {
71 : // 3 bytes or UTF-8 BOM
72 424 : uint8_t input2 = next_byte ();
73 : // If the second byte is equal to 0xBB then the input is no longer a
74 : // valid UTF-8 char. Then, we check if the third byte makes up a UTF
75 : // BOM.
76 424 : if (input == UTF8_BOM1 && input2 == UTF8_BOM2)
77 : {
78 29 : uint8_t input3 = next_byte ();
79 29 : if (input3 == UTF8_BOM3)
80 : // found BOM
81 : return next_codepoint ();
82 : else
83 0 : return {CODEPOINT_INVALID};
84 : }
85 :
86 395 : if ((input2 & 0xC0) != 0x80)
87 0 : return {CODEPOINT_INVALID};
88 :
89 395 : uint8_t input3 = next_byte ();
90 :
91 395 : if ((input3 & 0xC0) != 0x80)
92 0 : return {CODEPOINT_INVALID};
93 :
94 395 : uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
95 395 : | ((input3 & 0x3F) << 0);
96 395 : return {output};
97 : }
98 8 : else if ((input & 0xF8) == 0xF0)
99 : {
100 : // 4 bytes
101 7 : uint8_t input2 = next_byte ();
102 7 : if ((input2 & 0xC0) != 0x80)
103 0 : return {CODEPOINT_INVALID};
104 :
105 7 : uint8_t input3 = next_byte ();
106 7 : if ((input3 & 0xC0) != 0x80)
107 0 : return {CODEPOINT_INVALID};
108 :
109 7 : uint8_t input4 = next_byte ();
110 7 : if ((input4 & 0xC0) != 0x80)
111 0 : return {CODEPOINT_INVALID};
112 :
113 7 : uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
114 7 : | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
115 7 : return {output};
116 : }
117 : else
118 : {
119 1 : return {CODEPOINT_INVALID};
120 : }
121 : }
122 :
123 : protected:
124 : // This method must be called by the constructor to initialize the input
125 : // source. We cannot move this to the constructor because it calls a
126 : // virtual method .
127 320306 : void init ()
128 : {
129 : // Check if the input source is valid as utf-8 and copy all characters to
130 : // `chars`.
131 320306 : Codepoint char32 = next_codepoint ();
132 6570460 : while (!char32.is_eof () && char32 != CODEPOINT_INVALID)
133 : {
134 5929848 : chars.push_back (char32);
135 5929848 : char32 = next_codepoint ();
136 : }
137 :
138 320306 : if (char32 == CODEPOINT_INVALID)
139 : {
140 : // Input source is not valid as utf-8.
141 1 : is_valid_utf8 = false;
142 : }
143 320306 : }
144 :
145 : public:
146 320306 : InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {}
147 :
148 631026 : virtual ~InputSource () {}
149 :
150 : // Checks if input source is a valid UTF-8 string
151 320142 : bool is_valid () { return is_valid_utf8; }
152 :
153 : // get the next UTF-8 character
154 3489012 : Codepoint next ()
155 : {
156 3489012 : if (pos >= chars.size ())
157 5111 : return Codepoint::eof ();
158 : else
159 : {
160 3483901 : Codepoint c = chars[pos];
161 3483901 : pos++;
162 3483901 : return c;
163 : }
164 : }
165 :
166 : // Returns codepoint if input source is a valid UTF-8 string. Returns
167 : // nullopt otherwise.
168 315507 : tl::optional<std::vector<Codepoint>> get_chars ()
169 : {
170 315507 : if (is_valid ())
171 315507 : return {chars};
172 : else
173 0 : return tl::nullopt;
174 : }
175 : };
176 :
177 2 : class FileInputSource : public InputSource
178 : {
179 : private:
180 : // Input source file.
181 : FILE *input;
182 :
183 3493756 : int next_byte () override { return fgetc (input); }
184 :
185 : public:
186 : // Create new input source from file.
187 4692 : FileInputSource (FILE *input) : InputSource (), input (input) { init (); }
188 : };
189 :
190 315511 : class BufferInputSource : public InputSource
191 : {
192 : private:
193 : const std::string &buffer;
194 : size_t offs;
195 :
196 2757924 : int next_byte () override
197 : {
198 2757924 : if (offs >= buffer.size ())
199 : return EOF;
200 2442310 : return static_cast<uint8_t> (buffer.at (offs++));
201 : }
202 :
203 : public:
204 : // Create new input source from file.
205 315614 : BufferInputSource (const std::string &b, size_t offset)
206 315614 : : InputSource (), buffer (b), offs (offset)
207 : {
208 315614 : init ();
209 : }
210 : };
211 :
212 : } // namespace Rust
213 :
214 : #endif
|