]> glassweightruler.freedombox.rocks Git - Ventoy.git/blob - Plugson/src/Core/ventoy_utf.c
Update ko_KR.txt (#2025)
[Ventoy.git] / Plugson / src / Core / ventoy_utf.c
1 /******************************************************************************
2 * ventoy_utf.c ---- ventoy utf
3 * Copyright (c) 2022, Davipb https://github.com/Davipb/utf8-utf16-converter
4 * Copyright (c) 2022, longpanda <admin@ventoy.net>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 3 of the
9 * License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, see <http://www.gnu.org/licenses/>.
18 *
19 */
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <stdint.h>
23 #include <stddef.h>
24 #include <ventoy_define.h>
25 #include <ventoy_util.h>
26
// Code-unit types: one UTF-8 byte and one UTF-16 word
typedef uint8_t utf8_t;
typedef uint16_t utf16_t;

// A decoded Unicode codepoint
typedef uint32_t codepoint_t;

// Unicode range limits and the substitution character
enum
{
    // Last codepoint of the Basic Multilingual Plane; everything above it
    // needs a surrogate pair in UTF-16
    BMP_END = 0xFFFF,
    // Highest codepoint that Unicode allows
    UNICODE_MAX = 0x10FFFF,
    // Codepoint returned in place of any invalid encoding
    INVALID_CODEPOINT = 0xFFFD
};

// UTF-16 surrogate handling
enum
{
    // (unit & GENERIC_SURROGATE_MASK) == GENERIC_SURROGATE_VALUE
    // means the unit is a surrogate of either kind
    GENERIC_SURROGATE_VALUE = 0xD800,
    GENERIC_SURROGATE_MASK = 0xF800,

    // (unit & SURROGATE_MASK) tells the two kinds apart:
    // HIGH_SURROGATE_VALUE for a high surrogate, LOW_SURROGATE_VALUE for a low one
    HIGH_SURROGATE_VALUE = 0xD800,
    LOW_SURROGATE_VALUE = 0xDC00,
    SURROGATE_MASK = 0xFC00,

    // Subtracted from a codepoint before it is split over a surrogate pair
    SURROGATE_CODEPOINT_OFFSET = 0x10000,
    // Extracts the payload bits carried by a single surrogate
    SURROGATE_CODEPOINT_MASK = 0x03FF,
    // Number of payload bits in SURROGATE_CODEPOINT_MASK
    SURROGATE_CODEPOINT_BITS = 10
};

// Highest codepoint that fits in an N-byte UTF-8 encoding
enum
{
    UTF8_1_MAX = 0x7F,
    UTF8_2_MAX = 0x7FF,
    UTF8_3_MAX = 0xFFFF,
    UTF8_4_MAX = 0x10FFFF
};

// UTF-8 continuation bytes
enum
{
    // (byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_VALUE
    // means the byte is a continuation byte
    UTF8_CONTINUATION_VALUE = 0x80,
    // The mask to apply to a byte before testing it against UTF8_CONTINUATION_VALUE
    UTF8_CONTINUATION_MASK = 0xC0,
    // Number of codepoint bits carried by one continuation byte
    UTF8_CONTINUATION_CODEPOINT_BITS = 6
};

// A UTF-8 bit pattern that can be either stamped onto a byte or checked against it
typedef struct utf8_pattern
{
    // Bits of the byte that belong to the pattern (the rest carry codepoint data)
    utf8_t mask;
    // Expected value of the masked bits
    utf8_t value;
} utf8_pattern;

// Leading-byte patterns, indexed by (encoded length in bytes - 1)
static const utf8_pattern utf8_leading_bytes[] =
{
    { .mask = 0x80, .value = 0x00 }, // 0xxxxxxx, 1 byte
    { .mask = 0xE0, .value = 0xC0 }, // 110xxxxx, 2 bytes
    { .mask = 0xF0, .value = 0xE0 }, // 1110xxxx, 3 bytes
    { .mask = 0xF8, .value = 0xF0 }  // 11110xxx, 4 bytes
};

// Number of entries in utf8_leading_bytes (kept as an int, like the loop counters that use it)
#define UTF8_LEADING_BYTES_LEN ((int)(sizeof(utf8_leading_bytes) / sizeof(utf8_leading_bytes[0])))
102
103
104 // Gets a codepoint from a UTF-16 string
105 // utf16: The UTF-16 string
106 // len: The length of the UTF-16 string, in UTF-16 characters
107 // index:
108 // A pointer to the current index on the string.
109 // When the function returns, this will be left at the index of the last character
110 // that composes the returned codepoint.
111 // For surrogate pairs, this means the index will be left at the low surrogate.
112 static codepoint_t decode_utf16(utf16_t const* utf16, size_t len, size_t* index)
113 {
114 utf16_t high = utf16[*index];
115
116 // BMP character
117 if ((high & GENERIC_SURROGATE_MASK) != GENERIC_SURROGATE_VALUE)
118 return high;
119
120 // Unmatched low surrogate, invalid
121 if ((high & SURROGATE_MASK) != HIGH_SURROGATE_VALUE)
122 return INVALID_CODEPOINT;
123
124 // String ended with an unmatched high surrogate, invalid
125 if (*index == len - 1)
126 return INVALID_CODEPOINT;
127
128 utf16_t low = utf16[*index + 1];
129
130 // Unmatched high surrogate, invalid
131 if ((low & SURROGATE_MASK) != LOW_SURROGATE_VALUE)
132 return INVALID_CODEPOINT;
133
134 // Two correctly matched surrogates, increase index to indicate we've consumed
135 // two characters
136 (*index)++;
137
138 // The high bits of the codepoint are the value bits of the high surrogate
139 // The low bits of the codepoint are the value bits of the low surrogate
140 codepoint_t result = high & SURROGATE_CODEPOINT_MASK;
141 result <<= SURROGATE_CODEPOINT_BITS;
142 result |= low & SURROGATE_CODEPOINT_MASK;
143 result += SURROGATE_CODEPOINT_OFFSET;
144
145 // And if all else fails, it's valid
146 return result;
147 }
148
149 // Calculates the number of UTF-8 characters it would take to encode a codepoint
150 // The codepoint won't be checked for validity, that should be done beforehand.
151 static int calculate_utf8_len(codepoint_t codepoint)
152 {
153 // An array with the max values would be more elegant, but a bit too heavy
154 // for this common function
155
156 if (codepoint <= UTF8_1_MAX)
157 return 1;
158
159 if (codepoint <= UTF8_2_MAX)
160 return 2;
161
162 if (codepoint <= UTF8_3_MAX)
163 return 3;
164
165 return 4;
166 }
167
168 // Encodes a codepoint in a UTF-8 string.
169 // The codepoint won't be checked for validity, that should be done beforehand.
170 //
171 // codepoint: The codepoint to be encoded.
172 // utf8: The UTF-8 string
173 // len: The length of the UTF-8 string, in UTF-8 characters
174 // index: The first empty index on the string.
175 //
176 // return: The number of characters written to the string.
177 static size_t encode_utf8(codepoint_t codepoint, utf8_t* utf8, size_t len, size_t index)
178 {
179 int size = calculate_utf8_len(codepoint);
180
181 // Not enough space left on the string
182 if (index + size > len)
183 return 0;
184
185 // Write the continuation bytes in reverse order first
186 for (int cont_index = size - 1; cont_index > 0; cont_index--)
187 {
188 utf8_t cont = codepoint & ~UTF8_CONTINUATION_MASK;
189 cont |= UTF8_CONTINUATION_VALUE;
190
191 utf8[index + cont_index] = cont;
192 codepoint >>= UTF8_CONTINUATION_CODEPOINT_BITS;
193 }
194
195 // Write the leading byte
196 utf8_pattern pattern = utf8_leading_bytes[size - 1];
197
198 utf8_t lead = codepoint & ~(pattern.mask);
199 lead |= pattern.value;
200
201 utf8[index] = lead;
202
203 return size;
204 }
205
206 size_t utf16_to_utf8(utf16_t const* utf16, size_t utf16_len, utf8_t* utf8, size_t utf8_len)
207 {
208 // The next codepoint that will be written in the UTF-8 string
209 // or the size of the required buffer if utf8 is NULL
210 size_t utf8_index = 0;
211
212 for (size_t utf16_index = 0; utf16_index < utf16_len; utf16_index++)
213 {
214 codepoint_t codepoint = decode_utf16(utf16, utf16_len, &utf16_index);
215
216 if (utf8 == NULL)
217 utf8_index += calculate_utf8_len(codepoint);
218 else
219 utf8_index += encode_utf8(codepoint, utf8, utf8_len, utf8_index);
220 }
221
222 return utf8_index;
223 }
224
225 // Gets a codepoint from a UTF-8 string
226 // utf8: The UTF-8 string
227 // len: The length of the UTF-8 string, in UTF-8 characters
228 // index:
229 // A pointer to the current index on the string.
230 // When the function returns, this will be left at the index of the last character
231 // that composes the returned codepoint.
232 // For example, for a 3-byte codepoint, the index will be left at the third character.
233 static codepoint_t decode_utf8(utf8_t const* utf8, size_t len, size_t* index)
234 {
235 utf8_t leading = utf8[*index];
236
237 // The number of bytes that are used to encode the codepoint
238 int encoding_len = 0;
239 // The pattern of the leading byte
240 utf8_pattern leading_pattern;
241 // If the leading byte matches the current leading pattern
242 int matches = 0;
243
244 do
245 {
246 encoding_len++;
247 leading_pattern = utf8_leading_bytes[encoding_len - 1];
248
249 matches = ((leading & leading_pattern.mask) == leading_pattern.value);
250
251 } while (!matches && encoding_len < UTF8_LEADING_BYTES_LEN);
252
253 // Leading byte doesn't match any known pattern, consider it invalid
254 if (!matches)
255 return INVALID_CODEPOINT;
256
257 codepoint_t codepoint = leading & ~leading_pattern.mask;
258
259 for (int i = 0; i < encoding_len - 1; i++)
260 {
261 // String ended before all continuation bytes were found
262 // Invalid encoding
263 if (*index + 1 >= len)
264 return INVALID_CODEPOINT;
265
266 utf8_t continuation = utf8[*index + 1];
267
268 // Number of continuation bytes not the same as advertised on the leading byte
269 // Invalid encoding
270 if ((continuation & UTF8_CONTINUATION_MASK) != UTF8_CONTINUATION_VALUE)
271 return INVALID_CODEPOINT;
272
273 codepoint <<= UTF8_CONTINUATION_CODEPOINT_BITS;
274 codepoint |= continuation & ~UTF8_CONTINUATION_MASK;
275
276 (*index)++;
277 }
278
279 int proper_len = calculate_utf8_len(codepoint);
280
281 // Overlong encoding: too many bytes were used to encode a short codepoint
282 // Invalid encoding
283 if (proper_len != encoding_len)
284 return INVALID_CODEPOINT;
285
286 // Surrogates are invalid Unicode codepoints, and should only be used in UTF-16
287 // Invalid encoding
288 if (codepoint < BMP_END && (codepoint & GENERIC_SURROGATE_MASK) == GENERIC_SURROGATE_VALUE)
289 return INVALID_CODEPOINT;
290
291 // UTF-8 can encode codepoints larger than the Unicode standard allows
292 // Invalid encoding
293 if (codepoint > UNICODE_MAX)
294 return INVALID_CODEPOINT;
295
296 return codepoint;
297 }
298
299 // Calculates the number of UTF-16 characters it would take to encode a codepoint
300 // The codepoint won't be checked for validity, that should be done beforehand.
301 static int calculate_utf16_len(codepoint_t codepoint)
302 {
303 if (codepoint <= BMP_END)
304 return 1;
305
306 return 2;
307 }
308
309 // Encodes a codepoint in a UTF-16 string.
310 // The codepoint won't be checked for validity, that should be done beforehand.
311 //
312 // codepoint: The codepoint to be encoded.
313 // utf16: The UTF-16 string
314 // len: The length of the UTF-16 string, in UTF-16 characters
315 // index: The first empty index on the string.
316 //
317 // return: The number of characters written to the string.
318 static size_t encode_utf16(codepoint_t codepoint, utf16_t* utf16, size_t len, size_t index)
319 {
320 // Not enough space on the string
321 if (index >= len)
322 return 0;
323
324 if (codepoint <= BMP_END)
325 {
326 utf16[index] = codepoint;
327 return 1;
328 }
329
330 // Not enough space on the string for two surrogates
331 if (index + 1 >= len)
332 return 0;
333
334 codepoint -= SURROGATE_CODEPOINT_OFFSET;
335
336 utf16_t low = LOW_SURROGATE_VALUE;
337 low |= codepoint & SURROGATE_CODEPOINT_MASK;
338
339 codepoint >>= SURROGATE_CODEPOINT_BITS;
340
341 utf16_t high = HIGH_SURROGATE_VALUE;
342 high |= codepoint & SURROGATE_CODEPOINT_MASK;
343
344 utf16[index] = high;
345 utf16[index + 1] = low;
346
347 return 2;
348 }
349
350 size_t utf8_to_utf16(const unsigned char * utf8, size_t utf8_len, unsigned short* utf16, size_t utf16_len)
351 {
352 // The next codepoint that will be written in the UTF-16 string
353 // or the size of the required buffer if utf16 is NULL
354 size_t utf16_index = 0;
355
356 for (size_t utf8_index = 0; utf8_index < utf8_len; utf8_index++)
357 {
358 codepoint_t codepoint = decode_utf8(utf8, utf8_len, &utf8_index);
359
360 if (utf16 == NULL)
361 utf16_index += calculate_utf16_len(codepoint);
362 else
363 utf16_index += encode_utf16(codepoint, utf16, utf16_len, utf16_index);
364 }
365
366 return utf16_index;
367 }