/******************************************************************************
 * ventoy_utf.c  ---- ventoy utf
 * Copyright (c) 2022, Davipb https://github.com/Davipb/utf8-utf16-converter
 * Copyright (c) 2022, longpanda <admin@ventoy.net>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */
#include <stddef.h>
#include <stdint.h>

#include <ventoy_define.h>
#include <ventoy_util.h>
// The type of a single UTF-8 character (one 8-bit code unit)
typedef uint8_t utf8_t;
// The type of a single UTF-16 character (one 16-bit code unit)
typedef uint16_t utf16_t;

// The type of a single Unicode codepoint
typedef uint32_t codepoint_t;
// The last codepoint of the Basic Multilingual Plane, which is the part of Unicode that
// UTF-16 can encode without surrogates
#define BMP_END 0xFFFF

// The highest valid Unicode codepoint
#define UNICODE_MAX 0x10FFFF

// The codepoint that is used to replace invalid encodings
#define INVALID_CODEPOINT 0xFFFD

// If a character, masked with GENERIC_SURROGATE_MASK, matches this value, it is a surrogate.
#define GENERIC_SURROGATE_VALUE 0xD800
// The mask to apply to a character before testing it against GENERIC_SURROGATE_VALUE
#define GENERIC_SURROGATE_MASK 0xF800

// If a character, masked with SURROGATE_MASK, matches this value, it is a high surrogate.
#define HIGH_SURROGATE_VALUE 0xD800
// If a character, masked with SURROGATE_MASK, matches this value, it is a low surrogate.
#define LOW_SURROGATE_VALUE 0xDC00
// The mask to apply to a character before testing it against HIGH_SURROGATE_VALUE or LOW_SURROGATE_VALUE
#define SURROGATE_MASK 0xFC00

// The value that is subtracted from a codepoint before encoding it in a surrogate pair
#define SURROGATE_CODEPOINT_OFFSET 0x10000
// A mask that can be applied to a surrogate to extract the codepoint value contained in it
#define SURROGATE_CODEPOINT_MASK 0x03FF
// The number of bits of SURROGATE_CODEPOINT_MASK
#define SURROGATE_CODEPOINT_BITS 10

// The highest codepoint that can be encoded with 1 byte in UTF-8
#define UTF8_1_MAX 0x7F
// The highest codepoint that can be encoded with 2 bytes in UTF-8
#define UTF8_2_MAX 0x7FF
// The highest codepoint that can be encoded with 3 bytes in UTF-8
#define UTF8_3_MAX 0xFFFF
// The highest codepoint that can be encoded with 4 bytes in UTF-8
#define UTF8_4_MAX 0x10FFFF

// If a character, masked with UTF8_CONTINUATION_MASK, matches this value, it is a UTF-8 continuation byte
#define UTF8_CONTINUATION_VALUE 0x80
// The mask to apply to a character before testing it against UTF8_CONTINUATION_VALUE
#define UTF8_CONTINUATION_MASK 0xC0
// The number of bits of a codepoint that are contained in a UTF-8 continuation byte
#define UTF8_CONTINUATION_CODEPOINT_BITS 6
80 // Represents a UTF-8 bit pattern that can be set or verified
83 // The mask that should be applied to the character before testing it
85 // The value that the character should be tested against after applying the mask
89 // The patterns for leading bytes of a UTF-8 codepoint encoding
90 // Each pattern represents the leading byte for a character encoded with N UTF-8 bytes,
91 // where N is the index + 1
92 static const utf8_pattern utf8_leading_bytes
[] =
94 { 0x80, 0x00 }, // 0xxxxxxx
95 { 0xE0, 0xC0 }, // 110xxxxx
96 { 0xF0, 0xE0 }, // 1110xxxx
97 { 0xF8, 0xF0 } // 11110xxx
100 // The number of elements in utf8_leading_bytes
101 #define UTF8_LEADING_BYTES_LEN 4
104 // Gets a codepoint from a UTF-16 string
105 // utf16: The UTF-16 string
106 // len: The length of the UTF-16 string, in UTF-16 characters
108 // A pointer to the current index on the string.
109 // When the function returns, this will be left at the index of the last character
110 // that composes the returned codepoint.
111 // For surrogate pairs, this means the index will be left at the low surrogate.
112 static codepoint_t
decode_utf16(utf16_t
const* utf16
, size_t len
, size_t* index
)
114 utf16_t high
= utf16
[*index
];
117 if ((high
& GENERIC_SURROGATE_MASK
) != GENERIC_SURROGATE_VALUE
)
120 // Unmatched low surrogate, invalid
121 if ((high
& SURROGATE_MASK
) != HIGH_SURROGATE_VALUE
)
122 return INVALID_CODEPOINT
;
124 // String ended with an unmatched high surrogate, invalid
125 if (*index
== len
- 1)
126 return INVALID_CODEPOINT
;
128 utf16_t low
= utf16
[*index
+ 1];
130 // Unmatched high surrogate, invalid
131 if ((low
& SURROGATE_MASK
) != LOW_SURROGATE_VALUE
)
132 return INVALID_CODEPOINT
;
134 // Two correctly matched surrogates, increase index to indicate we've consumed
138 // The high bits of the codepoint are the value bits of the high surrogate
139 // The low bits of the codepoint are the value bits of the low surrogate
140 codepoint_t result
= high
& SURROGATE_CODEPOINT_MASK
;
141 result
<<= SURROGATE_CODEPOINT_BITS
;
142 result
|= low
& SURROGATE_CODEPOINT_MASK
;
143 result
+= SURROGATE_CODEPOINT_OFFSET
;
145 // And if all else fails, it's valid
149 // Calculates the number of UTF-8 characters it would take to encode a codepoint
150 // The codepoint won't be checked for validity, that should be done beforehand.
151 static int calculate_utf8_len(codepoint_t codepoint
)
153 // An array with the max values would be more elegant, but a bit too heavy
154 // for this common function
156 if (codepoint
<= UTF8_1_MAX
)
159 if (codepoint
<= UTF8_2_MAX
)
162 if (codepoint
<= UTF8_3_MAX
)
168 // Encodes a codepoint in a UTF-8 string.
169 // The codepoint won't be checked for validity, that should be done beforehand.
171 // codepoint: The codepoint to be encoded.
172 // utf8: The UTF-8 string
173 // len: The length of the UTF-8 string, in UTF-8 characters
174 // index: The first empty index on the string.
176 // return: The number of characters written to the string.
177 static size_t encode_utf8(codepoint_t codepoint
, utf8_t
* utf8
, size_t len
, size_t index
)
179 int size
= calculate_utf8_len(codepoint
);
181 // Not enough space left on the string
182 if (index
+ size
> len
)
185 // Write the continuation bytes in reverse order first
186 for (int cont_index
= size
- 1; cont_index
> 0; cont_index
--)
188 utf8_t cont
= codepoint
& ~UTF8_CONTINUATION_MASK
;
189 cont
|= UTF8_CONTINUATION_VALUE
;
191 utf8
[index
+ cont_index
] = cont
;
192 codepoint
>>= UTF8_CONTINUATION_CODEPOINT_BITS
;
195 // Write the leading byte
196 utf8_pattern pattern
= utf8_leading_bytes
[size
- 1];
198 utf8_t lead
= codepoint
& ~(pattern
.mask
);
199 lead
|= pattern
.value
;
206 size_t utf16_to_utf8(utf16_t
const* utf16
, size_t utf16_len
, utf8_t
* utf8
, size_t utf8_len
)
208 // The next codepoint that will be written in the UTF-8 string
209 // or the size of the required buffer if utf8 is NULL
210 size_t utf8_index
= 0;
212 for (size_t utf16_index
= 0; utf16_index
< utf16_len
; utf16_index
++)
214 codepoint_t codepoint
= decode_utf16(utf16
, utf16_len
, &utf16_index
);
217 utf8_index
+= calculate_utf8_len(codepoint
);
219 utf8_index
+= encode_utf8(codepoint
, utf8
, utf8_len
, utf8_index
);
225 // Gets a codepoint from a UTF-8 string
226 // utf8: The UTF-8 string
227 // len: The length of the UTF-8 string, in UTF-8 characters
229 // A pointer to the current index on the string.
230 // When the function returns, this will be left at the index of the last character
231 // that composes the returned codepoint.
232 // For example, for a 3-byte codepoint, the index will be left at the third character.
233 static codepoint_t
decode_utf8(utf8_t
const* utf8
, size_t len
, size_t* index
)
235 utf8_t leading
= utf8
[*index
];
237 // The number of bytes that are used to encode the codepoint
238 int encoding_len
= 0;
239 // The pattern of the leading byte
240 utf8_pattern leading_pattern
;
241 // If the leading byte matches the current leading pattern
247 leading_pattern
= utf8_leading_bytes
[encoding_len
- 1];
249 matches
= ((leading
& leading_pattern
.mask
) == leading_pattern
.value
);
251 } while (!matches
&& encoding_len
< UTF8_LEADING_BYTES_LEN
);
253 // Leading byte doesn't match any known pattern, consider it invalid
255 return INVALID_CODEPOINT
;
257 codepoint_t codepoint
= leading
& ~leading_pattern
.mask
;
259 for (int i
= 0; i
< encoding_len
- 1; i
++)
261 // String ended before all continuation bytes were found
263 if (*index
+ 1 >= len
)
264 return INVALID_CODEPOINT
;
266 utf8_t continuation
= utf8
[*index
+ 1];
268 // Number of continuation bytes not the same as advertised on the leading byte
270 if ((continuation
& UTF8_CONTINUATION_MASK
) != UTF8_CONTINUATION_VALUE
)
271 return INVALID_CODEPOINT
;
273 codepoint
<<= UTF8_CONTINUATION_CODEPOINT_BITS
;
274 codepoint
|= continuation
& ~UTF8_CONTINUATION_MASK
;
279 int proper_len
= calculate_utf8_len(codepoint
);
281 // Overlong encoding: too many bytes were used to encode a short codepoint
283 if (proper_len
!= encoding_len
)
284 return INVALID_CODEPOINT
;
286 // Surrogates are invalid Unicode codepoints, and should only be used in UTF-16
288 if (codepoint
< BMP_END
&& (codepoint
& GENERIC_SURROGATE_MASK
) == GENERIC_SURROGATE_VALUE
)
289 return INVALID_CODEPOINT
;
291 // UTF-8 can encode codepoints larger than the Unicode standard allows
293 if (codepoint
> UNICODE_MAX
)
294 return INVALID_CODEPOINT
;
299 // Calculates the number of UTF-16 characters it would take to encode a codepoint
300 // The codepoint won't be checked for validity, that should be done beforehand.
301 static int calculate_utf16_len(codepoint_t codepoint
)
303 if (codepoint
<= BMP_END
)
309 // Encodes a codepoint in a UTF-16 string.
310 // The codepoint won't be checked for validity, that should be done beforehand.
312 // codepoint: The codepoint to be encoded.
313 // utf16: The UTF-16 string
314 // len: The length of the UTF-16 string, in UTF-16 characters
315 // index: The first empty index on the string.
317 // return: The number of characters written to the string.
318 static size_t encode_utf16(codepoint_t codepoint
, utf16_t
* utf16
, size_t len
, size_t index
)
320 // Not enough space on the string
324 if (codepoint
<= BMP_END
)
326 utf16
[index
] = codepoint
;
330 // Not enough space on the string for two surrogates
331 if (index
+ 1 >= len
)
334 codepoint
-= SURROGATE_CODEPOINT_OFFSET
;
336 utf16_t low
= LOW_SURROGATE_VALUE
;
337 low
|= codepoint
& SURROGATE_CODEPOINT_MASK
;
339 codepoint
>>= SURROGATE_CODEPOINT_BITS
;
341 utf16_t high
= HIGH_SURROGATE_VALUE
;
342 high
|= codepoint
& SURROGATE_CODEPOINT_MASK
;
345 utf16
[index
+ 1] = low
;
350 size_t utf8_to_utf16(const unsigned char * utf8
, size_t utf8_len
, unsigned short* utf16
, size_t utf16_len
)
352 // The next codepoint that will be written in the UTF-16 string
353 // or the size of the required buffer if utf16 is NULL
354 size_t utf16_index
= 0;
356 for (size_t utf8_index
= 0; utf8_index
< utf8_len
; utf8_index
++)
358 codepoint_t codepoint
= decode_utf8(utf8
, utf8_len
, &utf8_index
);
361 utf16_index
+= calculate_utf16_len(codepoint
);
363 utf16_index
+= encode_utf16(codepoint
, utf16
, utf16_len
, utf16_index
);