Plugson/src/Core/ventoy_utf.c

   1 /******************************************************************************
   2  * ventoy_utf.c  ---- ventoy utf
   3  * Copyright (c) 2022, Davipb https://github.com/Davipb/utf8-utf16-converter
   4  * Copyright (c) 2022, longpanda <admin@ventoy.net>
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License as
   8  * published by the Free Software Foundation; either version 3 of the
   9  * License, or (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, see <http://www.gnu.org/licenses/>.
  18  *
  19  */
  20 #include <stdio.h>
  21 #include <stdlib.h>
  22 #include <stdint.h>
  23 #include <stddef.h>
  24 #include <ventoy_define.h>
  25 #include <ventoy_util.h>
  26
  27 typedef uint8_t utf8_t; // The type of a single UTF-8 character
  28 typedef uint16_t utf16_t; // The type of a single UTF-16 character
  29
  30
  31 // The type of a single Unicode codepoint
  32 typedef uint32_t codepoint_t;
  33
  34 // The last codepoint of the Basic Multilingual Plane, which is the part of Unicode that
  35 // UTF-16 can encode without surrogates
  36 #define BMP_END 0xFFFF
  37
  38 // The highest valid Unicode codepoint
  39 #define UNICODE_MAX 0x10FFFF
  40
  41 // The codepoint that is used to replace invalid encodings
  42 #define INVALID_CODEPOINT 0xFFFD
  43
  44 // If a character, masked with GENERIC_SURROGATE_MASK, matches this value, it is a surrogate.
  45 #define GENERIC_SURROGATE_VALUE 0xD800
  46 // The mask to apply to a character before testing it against GENERIC_SURROGATE_VALUE
  47 #define GENERIC_SURROGATE_MASK 0xF800
  48
  49 // If a character, masked with SURROGATE_MASK, matches this value, it is a high surrogate.
  50 #define HIGH_SURROGATE_VALUE 0xD800
  51 // If a character, masked with SURROGATE_MASK, matches this value, it is a low surrogate.
  52 #define LOW_SURROGATE_VALUE 0xDC00
  53 // The mask to apply to a character before testing it against HIGH_SURROGATE_VALUE or LOW_SURROGATE_VALUE
  54 #define SURROGATE_MASK 0xFC00
  55
  56 // The value that is subtracted from a codepoint before encoding it in a surrogate pair
  57 #define SURROGATE_CODEPOINT_OFFSET 0x10000
  58 // A mask that can be applied to a surrogate to extract the codepoint value contained in it
  59 #define SURROGATE_CODEPOINT_MASK 0x03FF
  60 // The number of bits of SURROGATE_CODEPOINT_MASK
  61 #define SURROGATE_CODEPOINT_BITS 10
  62
  63
  64 // The highest codepoint that can be encoded with 1 byte in UTF-8
  65 #define UTF8_1_MAX 0x7F
  66 // The highest codepoint that can be encoded with 2 bytes in UTF-8
  67 #define UTF8_2_MAX 0x7FF
  68 // The highest codepoint that can be encoded with 3 bytes in UTF-8
  69 #define UTF8_3_MAX 0xFFFF
  70 // The highest codepoint that can be encoded with 4 bytes in UTF-8
  71 #define UTF8_4_MAX 0x10FFFF
  72
  73 // If a character, masked with UTF8_CONTINUATION_MASK, matches this value, it is a UTF-8 continuation byte
  74 #define UTF8_CONTINUATION_VALUE 0x80
  75 // The mask to a apply to a character before testing it against UTF8_CONTINUATION_VALUE
  76 #define UTF8_CONTINUATION_MASK 0xC0
  77 // The number of bits of a codepoint that are contained in a UTF-8 continuation byte
  78 #define UTF8_CONTINUATION_CODEPOINT_BITS 6
  79
  80 // Represents a UTF-8 bit pattern that can be set or verified
  81 typedef struct
  82 {
  83     // The mask that should be applied to the character before testing it
  84     utf8_t mask;
  85     // The value that the character should be tested against after applying the mask
  86     utf8_t value;
  87 } utf8_pattern;
  88
  89 // The patterns for leading bytes of a UTF-8 codepoint encoding
  90 // Each pattern represents the leading byte for a character encoded with N UTF-8 bytes,
  91 // where N is the index + 1
  92 static const utf8_pattern utf8_leading_bytes[] =
  93 {
  94     { 0x80, 0x00 }, // 0xxxxxxx
  95     { 0xE0, 0xC0 }, // 110xxxxx
  96     { 0xF0, 0xE0 }, // 1110xxxx
  97     { 0xF8, 0xF0 }  // 11110xxx
  98 };
  99
 100 // The number of elements in utf8_leading_bytes
 101 #define UTF8_LEADING_BYTES_LEN 4
 102
 103
 104 // Gets a codepoint from a UTF-16 string
 105 // utf16: The UTF-16 string
 106 // len: The length of the UTF-16 string, in UTF-16 characters
 107 // index:
 108 // A pointer to the current index on the string.
 109 // When the function returns, this will be left at the index of the last character
 110 // that composes the returned codepoint.
 111 // For surrogate pairs, this means the index will be left at the low surrogate.
 112 static codepoint_t decode_utf16(utf16_t const* utf16, size_t len, size_t* index)
 113 {
 114     utf16_t high = utf16[*index];
 115
 116     // BMP character
 117     if ((high & GENERIC_SURROGATE_MASK) != GENERIC_SURROGATE_VALUE)
 118         return high;
 119
 120     // Unmatched low surrogate, invalid
 121     if ((high & SURROGATE_MASK) != HIGH_SURROGATE_VALUE)
 122         return INVALID_CODEPOINT;
 123
 124     // String ended with an unmatched high surrogate, invalid
 125     if (*index == len - 1)
 126         return INVALID_CODEPOINT;
 127
 128     utf16_t low = utf16[*index + 1];
 129
 130     // Unmatched high surrogate, invalid
 131     if ((low & SURROGATE_MASK) != LOW_SURROGATE_VALUE)
 132         return INVALID_CODEPOINT;
 133
 134     // Two correctly matched surrogates, increase index to indicate we've consumed
 135     // two characters
 136     (*index)++;
 137
 138     // The high bits of the codepoint are the value bits of the high surrogate
 139     // The low bits of the codepoint are the value bits of the low surrogate
 140     codepoint_t result = high & SURROGATE_CODEPOINT_MASK;
 141     result <<= SURROGATE_CODEPOINT_BITS;
 142     result |= low & SURROGATE_CODEPOINT_MASK;
 143     result += SURROGATE_CODEPOINT_OFFSET;
 144
 145     // And if all else fails, it's valid
 146     return result;
 147 }
 148
 149 // Calculates the number of UTF-8 characters it would take to encode a codepoint
 150 // The codepoint won't be checked for validity, that should be done beforehand.
 151 static int calculate_utf8_len(codepoint_t codepoint)
 152 {
 153     // An array with the max values would be more elegant, but a bit too heavy
 154     // for this common function
 155
 156     if (codepoint <= UTF8_1_MAX)
 157         return 1;
 158
 159     if (codepoint <= UTF8_2_MAX)
 160         return 2;
 161
 162     if (codepoint <= UTF8_3_MAX)
 163         return 3;
 164
 165     return 4;
 166 }
 167
 168 // Encodes a codepoint in a UTF-8 string.
 169 // The codepoint won't be checked for validity, that should be done beforehand.
 170 //
 171 // codepoint: The codepoint to be encoded.
 172 // utf8: The UTF-8 string
 173 // len: The length of the UTF-8 string, in UTF-8 characters
 174 // index: The first empty index on the string.
 175 //
 176 // return: The number of characters written to the string.
 177 static size_t encode_utf8(codepoint_t codepoint, utf8_t* utf8, size_t len, size_t index)
 178 {
 179     int size = calculate_utf8_len(codepoint);
 180
 181     // Not enough space left on the string
 182     if (index + size > len)
 183         return 0;
 184
 185     // Write the continuation bytes in reverse order first
 186     for (int cont_index = size - 1; cont_index > 0; cont_index--)
 187     {
 188         utf8_t cont = codepoint & ~UTF8_CONTINUATION_MASK;
 189         cont |= UTF8_CONTINUATION_VALUE;
 190
 191         utf8[index + cont_index] = cont;
 192         codepoint >>= UTF8_CONTINUATION_CODEPOINT_BITS;
 193     }
 194
 195     // Write the leading byte
 196     utf8_pattern pattern = utf8_leading_bytes[size - 1];
 197
 198     utf8_t lead = codepoint & ~(pattern.mask);
 199     lead |= pattern.value;
 200
 201     utf8[index] = lead;
 202
 203     return size;
 204 }
 205
 206 size_t utf16_to_utf8(utf16_t const* utf16, size_t utf16_len, utf8_t* utf8, size_t utf8_len)
 207 {
 208     // The next codepoint that will be written in the UTF-8 string
 209     // or the size of the required buffer if utf8 is NULL
 210     size_t utf8_index = 0;
 211
 212     for (size_t utf16_index = 0; utf16_index < utf16_len; utf16_index++)
 213     {
 214         codepoint_t codepoint = decode_utf16(utf16, utf16_len, &utf16_index);
 215
 216         if (utf8 == NULL)
 217             utf8_index += calculate_utf8_len(codepoint);
 218         else
 219             utf8_index += encode_utf8(codepoint, utf8, utf8_len, utf8_index);
 220     }
 221
 222     return utf8_index;
 223 }
 224
 225 // Gets a codepoint from a UTF-8 string
 226 // utf8: The UTF-8 string
 227 // len: The length of the UTF-8 string, in UTF-8 characters
 228 // index:
 229 // A pointer to the current index on the string.
 230 // When the function returns, this will be left at the index of the last character
 231 // that composes the returned codepoint.
 232 // For example, for a 3-byte codepoint, the index will be left at the third character.
 233 static codepoint_t decode_utf8(utf8_t const* utf8, size_t len, size_t* index)
 234 {
 235     utf8_t leading = utf8[*index];
 236
 237     // The number of bytes that are used to encode the codepoint
 238     int encoding_len = 0;
 239     // The pattern of the leading byte
 240     utf8_pattern leading_pattern;
 241     // If the leading byte matches the current leading pattern
 242     int matches = 0;
 243
 244     do
 245     {
 246         encoding_len++;
 247         leading_pattern = utf8_leading_bytes[encoding_len - 1];
 248
 249         matches = ((leading & leading_pattern.mask) == leading_pattern.value);
 250
 251     } while (!matches && encoding_len < UTF8_LEADING_BYTES_LEN);
 252
 253     // Leading byte doesn't match any known pattern, consider it invalid
 254     if (!matches)
 255         return INVALID_CODEPOINT;
 256
 257     codepoint_t codepoint = leading & ~leading_pattern.mask;
 258
 259     for (int i = 0; i < encoding_len - 1; i++)
 260     {
 261         // String ended before all continuation bytes were found
 262         // Invalid encoding
 263         if (*index + 1 >= len)
 264             return INVALID_CODEPOINT;
 265
 266         utf8_t continuation = utf8[*index + 1];
 267
 268         // Number of continuation bytes not the same as advertised on the leading byte
 269         // Invalid encoding
 270         if ((continuation & UTF8_CONTINUATION_MASK) != UTF8_CONTINUATION_VALUE)
 271             return INVALID_CODEPOINT;
 272
 273         codepoint <<= UTF8_CONTINUATION_CODEPOINT_BITS;
 274         codepoint |= continuation & ~UTF8_CONTINUATION_MASK;
 275
 276         (*index)++;
 277     }
 278
 279     int proper_len = calculate_utf8_len(codepoint);
 280
 281     // Overlong encoding: too many bytes were used to encode a short codepoint
 282     // Invalid encoding
 283     if (proper_len != encoding_len)
 284         return INVALID_CODEPOINT;
 285
 286     // Surrogates are invalid Unicode codepoints, and should only be used in UTF-16
 287     // Invalid encoding
 288     if (codepoint < BMP_END && (codepoint & GENERIC_SURROGATE_MASK) == GENERIC_SURROGATE_VALUE)
 289         return INVALID_CODEPOINT;
 290
 291     // UTF-8 can encode codepoints larger than the Unicode standard allows
 292     // Invalid encoding
 293     if (codepoint > UNICODE_MAX)
 294         return INVALID_CODEPOINT;
 295
 296     return codepoint;
 297 }
 298
 299 // Calculates the number of UTF-16 characters it would take to encode a codepoint
 300 // The codepoint won't be checked for validity, that should be done beforehand.
 301 static int calculate_utf16_len(codepoint_t codepoint)
 302 {
 303     if (codepoint <= BMP_END)
 304         return 1;
 305
 306     return 2;
 307 }
 308
 309 // Encodes a codepoint in a UTF-16 string.
 310 // The codepoint won't be checked for validity, that should be done beforehand.
 311 //
 312 // codepoint: The codepoint to be encoded.
 313 // utf16: The UTF-16 string
 314 // len: The length of the UTF-16 string, in UTF-16 characters
 315 // index: The first empty index on the string.
 316 //
 317 // return: The number of characters written to the string.
 318 static size_t encode_utf16(codepoint_t codepoint, utf16_t* utf16, size_t len, size_t index)
 319 {
 320     // Not enough space on the string
 321     if (index >= len)
 322         return 0;
 323
 324     if (codepoint <= BMP_END)
 325     {
 326         utf16[index] = codepoint;
 327         return 1;
 328     }
 329
 330     // Not enough space on the string for two surrogates
 331     if (index + 1 >= len)
 332         return 0;
 333
 334     codepoint -= SURROGATE_CODEPOINT_OFFSET;
 335
 336     utf16_t low = LOW_SURROGATE_VALUE;
 337     low |= codepoint & SURROGATE_CODEPOINT_MASK;
 338
 339     codepoint >>= SURROGATE_CODEPOINT_BITS;
 340
 341     utf16_t high = HIGH_SURROGATE_VALUE;
 342     high |= codepoint & SURROGATE_CODEPOINT_MASK;
 343
 344     utf16[index] = high;
 345     utf16[index + 1] = low;
 346
 347     return 2;
 348 }
 349
 350 size_t utf8_to_utf16(const unsigned char * utf8, size_t utf8_len, unsigned short* utf16, size_t utf16_len)
 351 {
 352     // The next codepoint that will be written in the UTF-16 string
 353     // or the size of the required buffer if utf16 is NULL
 354     size_t utf16_index = 0;
 355
 356     for (size_t utf8_index = 0; utf8_index < utf8_len; utf8_index++)
 357     {
 358         codepoint_t codepoint = decode_utf8(utf8, utf8_len, &utf8_index);
 359
 360         if (utf16 == NULL)
 361             utf16_index += calculate_utf16_len(codepoint);
 362         else
 363             utf16_index += encode_utf16(codepoint, utf16, utf16_len, utf16_index);
 364     }
 365
 366     return utf16_index;
 367 }