// File: utfate.c
/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O2 -o ./utfate ./utfate.c
*/

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

// info is the multi-line help message
const char* info = ""
"utfate [options...] [filenames...]\n"
"\n"
"This app turns ASCII/UTF text into UTF-8. ASCII/UTF-8 inputs stay the same,\n"
"leading UTF-8 BOMs (byte-order marks) are ignored, UTF-16 and UTF-32 (both\n"
"in either kind of endianess) are turned into UTF-8.\n"
"\n"
"\n"
"Options\n"
"\n"
" -h, --h show this help message\n"
" -help, --help aliases for option -h\n"
"";

// BUFREADER_CAP is how many bytes a bufreader holds at most at any one time
enum { BUFREADER_CAP = 64 * 1024 };

typedef struct bufreader {
    // buf is the buffer, (re)filled periodically as needed
    unsigned char* buf;

    // len is how many buffer bytes are being used, out of its max capacity
    size_t len;

    // cap is the buffer's capacity, or the most bytes it can hold at once
    size_t cap;

    // pos is the current position, up to the current buffer length
    size_t pos;

    // src is the data source used to fill the buffer
    FILE* src;
} bufreader;

// init_bufreader is the constructor for type bufreader: the buffer itself is
// only allocated the first time data are read, and is owned by the caller,
// who is expected to `free` field buf when done
void init_bufreader(bufreader* r) {
    r->buf = NULL;
    r->len = 0;
    r->cap = 0;
    r->pos = 0;
    r->src = NULL;
}

// fill_bufreader replaces the buffer's content with the next chunk from the
// source: a chunk ends right after a line-feed byte, when the buffer is full,
// or when the data end, whichever comes first. Stopping at line-feeds keeps
// interactive (line-by-line) input responsive, while the capacity limit keeps
// memory use bounded even for inputs with no line-feeds at all. It returns
// false when no bytes could be read, leaving both pos and len at 0.
static bool fill_bufreader(bufreader* r) {
    r->pos = 0;
    r->len = 0;

    if (r->buf == NULL) {
        unsigned char* mem = malloc(BUFREADER_CAP);
        if (mem == NULL) {
            r->cap = 0;
            return false;
        }
        r->buf = mem;
        r->cap = BUFREADER_CAP;
    }

    if (r->src == NULL) {
        return false;
    }

    size_t n = 0;
    while (n < r->cap) {
        const int c = getc(r->src);
        if (c == EOF) {
            break;
        }
        r->buf[n++] = (unsigned char)c;
        if (c == '\n') {
            break;
        }
    }

    r->len = n;
    return n > 0;
}

// restart_bufreader points the reader to a new data source, and loads its
// first chunk right away
void restart_bufreader(bufreader* r, FILE* src) {
    r->src = src;

    // allow peeking at the first few input bytes, which are needed to detect
    // which specific utf input-format is being used
    fill_bufreader(r);
}

// check_bufreader tells whether the reader has a buffer to work with: a
// missing buffer means memory for it couldn't be allocated
bool check_bufreader(const bufreader* r) {
    return r->buf != NULL;
}

// read_byte does as it says: check its return for the value EOF, before
// using it as the next byte; after reaching the end of data, field len is 0
int read_byte(bufreader* r) {
    if (r->pos >= r->len && !fill_bufreader(r)) {
        // reached the end of data
        return EOF;
    }
    return r->buf[r->pos++];
}

// unread_byte gives back the byte from the latest successful call to func
// read_byte, so the next call returns it again; only 1 byte can be given
// back between reads, since older bytes may no longer be in the buffer
static void unread_byte(bufreader* r) {
    if (r->pos > 0) {
        r->pos--;
    }
}

// discard_bytes skips up to the number of bytes given, returning how many
// bytes were actually skipped, which is less than asked when the data end
int64_t discard_bytes(bufreader* r, size_t n) {
    // fast path: all bytes to skip are already in the buffer
    if (r->pos <= r->len && n <= r->len - r->pos) {
        r->pos += n;
        return (int64_t)n;
    }

    int64_t discarded = 0;
    for (; n > 0; n--, discarded++) {
        if (read_byte(r) == EOF) {
            break;
        }
    }
    return discarded;
}

// https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/

// check_2_byte_rune checks if the bytes given are a well-formed 2-byte
// UTF-8 sequence
bool check_2_byte_rune(int a, int b) {
    return (0xc2 <= a && a <= 0xdf) && (0x80 <= b && b <= 0xbf);
}

// check_3_byte_rune checks if the bytes given are a well-formed 3-byte
// UTF-8 sequence: overlong forms and surrogates are rejected
bool check_3_byte_rune(int a, int b, int c) {
    return (
        (a == 0xe0) &&
        (0xa0 <= b && b <= 0xbf) &&
        (0x80 <= c && c <= 0xbf)
    ) || (
        (0xe1 <= a && a <= 0xec) &&
        (0x80 <= b && b <= 0xbf) &&
        (0x80 <= c && c <= 0xbf)
    ) || (
        (a == 0xed) &&
        (0x80 <= b && b <= 0x9f) &&
        (0x80 <= c && c <= 0xbf)
    ) || (
        (a == 0xee || a == 0xef) &&
        (0x80 <= b && b <= 0xbf) &&
        (0x80 <= c && c <= 0xbf)
    );
}

// check_4_byte_rune checks if the bytes given are a well-formed 4-byte
// UTF-8 sequence: overlong forms and values past U+10FFFF are rejected
bool check_4_byte_rune(int a, int b, int c, int d) {
    return (
        (a == 0xf0) &&
        (0x90 <= b && b <= 0xbf) &&
        (0x80 <= c && c <= 0xbf) &&
        (0x80 <= d && d <= 0xbf)
    ) || (
        // lead bytes 0xf1, 0xf2, and 0xf3 allow any continuation bytes
        (0xf1 <= a && a <= 0xf3) &&
        (0x80 <= b && b <= 0xbf) &&
        (0x80 <= c && c <= 0xbf) &&
        (0x80 <= d && d <= 0xbf)
    ) || (
        // it's the 2nd byte which is limited, to stay within U+10FFFF
        (a == 0xf4) &&
        (0x80 <= b && b <= 0x8f) &&
        (0x80 <= c && c <= 0xbf) &&
        (0x80 <= d && d <= 0xbf)
    );
}

// write_replacement_char is the recommended action to handle invalid bytes
void write_replacement_char(FILE* w) {
    putc(0xef, w);
    putc(0xbf, w);
    putc(0xbd, w);
}

// check_rune_start checks a UTF-8 sequence whose total length is given, and
// whose bytes not read yet are set to 0x80, a value valid for all trailing
// bytes past the 2nd one: this allows checking sequences as they're read
static bool check_rune_start(const int* b, size_t size) {
    switch (size) {
    case 2:
        return check_2_byte_rune(b[0], b[1]);
    case 3:
        return check_3_byte_rune(b[0], b[1], b[2]);
    case 4:
        return check_4_byte_rune(b[0], b[1], b[2], b[3]);
    default:
        return false;
    }
}

// copy_utf8_rune copies the next UTF-8 sequence as is, when valid, or emits
// a replacement char instead: invalid sequences end right before the first
// byte which can't be part of them, so no later bytes are lost
void copy_utf8_rune(FILE* w, bufreader* r) {
    const int lead = read_byte(r);
    if (lead == EOF) {
        return;
    }

    // handle 1-byte runes
    if (lead < 0x80) {
        putc(lead, w);
        return;
    }

    // the leading byte determines how long the whole sequence is
    size_t size = 0;
    if (0xc2 <= lead && lead <= 0xdf) {
        size = 2;
    } else if (0xe0 <= lead && lead <= 0xef) {
        size = 3;
    } else if (0xf0 <= lead && lead <= 0xf4) {
        size = 4;
    } else {
        // stray continuation bytes and invalid leading bytes
        write_replacement_char(w);
        return;
    }

    int bytes[4] = { lead, 0x80, 0x80, 0x80 };
    for (size_t i = 1; i < size; i++) {
        const int next = read_byte(r);
        if (next == EOF) {
            write_replacement_char(w);
            return;
        }

        bytes[i] = next;
        if (!check_rune_start(bytes, size)) {
            // the offending byte may start a valid sequence of its own
            unread_byte(r);
            write_replacement_char(w);
            return;
        }
    }

    for (size_t i = 0; i < size; i++) {
        putc(bytes[i], w);
    }
}

// write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8
// and emits a replacement char for values which aren't valid code points
void write_rune(FILE* w, uint32_t rune) {
    if (rune < 0x80) {
        putc((int)rune, w);
        return;
    }

    if (rune < 0x800) {
        const int a = (int)(0xc0 | (rune >> 6));
        const int b = (int)(0x80 | (rune & 0x3f));
        if (check_2_byte_rune(a, b)) {
            putc(a, w);
            putc(b, w);
        } else {
            write_replacement_char(w);
        }
        return;
    }

    if (rune < 0x10000) {
        const int a = (int)(0xe0 | (rune >> 12));
        const int b = (int)(0x80 | ((rune >> 6) & 0x3f));
        const int c = (int)(0x80 | (rune & 0x3f));
        // surrogates fail this check
        if (check_3_byte_rune(a, b, c)) {
            putc(a, w);
            putc(b, w);
            putc(c, w);
        } else {
            write_replacement_char(w);
        }
        return;
    }

    if (rune < 0x200000) {
        const int a = (int)(0xf0 | (rune >> 18));
        const int b = (int)(0x80 | ((rune >> 12) & 0x3f));
        const int c = (int)(0x80 | ((rune >> 6) & 0x3f));
        const int d = (int)(0x80 | (rune & 0x3f));
        // values past U+10FFFF fail this check
        if (check_4_byte_rune(a, b, c, d)) {
            putc(a, w);
            putc(b, w);
            putc(c, w);
            putc(d, w);
        } else {
            write_replacement_char(w);
        }
        return;
    }

    // handle invalid runes with a utf-8 replacement character
    write_replacement_char(w);
}

// show_error ends the current line on the main output given, then shows the
// message given on the standard error, styled in red
void show_error(FILE* w, const char* msg) {
    putc('\n', w);
    fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg);
}

typedef enum detected_bom {
    NO_BOM = 0,
    UTF8_BOM = 1,
    UTF16_BE_BOM = 2,
    UTF16_LE_BOM = 3,
    UTF32_BE_BOM = 4,
    UTF32_LE_BOM = 5,
} detected_bom;

// detect_bom checks the bytes currently in the buffer, starting from its
// first one, for any of the byte-order marks supported; the UTF-32 marks are
// checked first, since the UTF-32 LE mark starts with the UTF-16 LE one
detected_bom detect_bom(const bufreader* r) {
    const unsigned char* p = r->buf;
    const size_t len = r->len;

    if (len >= 4) {
        if (p[0] == 0x00 && p[1] == 0x00 && p[2] == 0xfe && p[3] == 0xff) {
            return UTF32_BE_BOM;
        }
        if (p[0] == 0xff && p[1] == 0xfe && p[2] == 0x00 && p[3] == 0x00) {
            return UTF32_LE_BOM;
        }
    }

    if (len >= 3 && p[0] == 0xef && p[1] == 0xbb && p[2] == 0xbf) {
        return UTF8_BOM;
    }

    if (len >= 2) {
        if (p[0] == 0xfe && p[1] == 0xff) {
            return UTF16_BE_BOM;
        }
        if (p[0] == 0xff && p[1] == 0xfe) {
            return UTF16_LE_BOM;
        }
    }

    return NO_BOM;
}

// desurrogate assumes the utf16 pair given to it is a valid surrogate
uint32_t desurrogate(uint16_t high, uint16_t low) {
    const uint32_t top = (uint32_t)high - 0xd800;
    const uint32_t bottom = (uint32_t)low - 0xdc00;
    return 0x400 * top + bottom + 0x10000;
}

// handle_utf8 copies all remaining data as UTF-8, replacing invalid bytes;
// returns false only when the reader has no buffer to work with
bool handle_utf8(FILE* w, bufreader* r) {
    for (uint64_t i = 0; r->len > 0; i++) {
        copy_utf8_rune(w, r);
        // check every now and then whether the output can still be written
        if ((i % 1024 == 0) && ferror(w)) {
            break;
        }
    }

    return check_bufreader(r);
}

// special results from func read_utf16_unit, which can't be code units
enum {
    // UTF16_END means the data ended cleanly
    UTF16_END = -1,
    // UTF16_CUT means the data ended after half a code unit
    UTF16_CUT = -2,
};

// read_utf16_unit reads the next 2-byte code unit using the byte-order
// given, or returns one of the negative special values
static int32_t read_utf16_unit(bufreader* r, bool big_endian) {
    const int a = read_byte(r);
    if (a == EOF) {
        return UTF16_END;
    }

    const int b = read_byte(r);
    if (b == EOF) {
        return UTF16_CUT;
    }

    return big_endian ? ((a << 8) | b) : ((b << 8) | a);
}

// https://en.wikipedia.org/wiki/UTF-16

// convert_utf16 turns all remaining data from UTF-16 into UTF-8, using the
// byte-order given: unpaired surrogates and stray trailing bytes each become
// a replacement char, without losing any of the code units following them
static bool convert_utf16(FILE* w, bufreader* r, bool big_endian) {
    // pending is a code unit already read, but not handled yet, if any
    int32_t pending = UTF16_END;

    for (uint64_t i = 0; r->len > 0; i++) {
        if ((i % 1024 == 0) && ferror(w)) {
            break;
        }

        int32_t code = pending;
        pending = UTF16_END;
        if (code < 0) {
            code = read_utf16_unit(r, big_endian);
        }

        if (code == UTF16_CUT) {
            write_replacement_char(w);
            break;
        }
        if (code < 0) {
            break;
        }

        // handle non-surrogate runes
        if (code <= 0xd7ff || code >= 0xe000) {
            write_rune(w, (uint32_t)code);
            continue;
        }

        // handle low surrogates with no high surrogates before them
        if (code >= 0xdc00) {
            write_replacement_char(w);
            continue;
        }

        const int32_t low = read_utf16_unit(r, big_endian);
        if (low < 0) {
            // the high surrogate was the last full code unit
            write_replacement_char(w);
            if (low == UTF16_CUT) {
                write_replacement_char(w);
            }
            break;
        }

        // handle valid surrogate runes
        if (0xdc00 <= low && low <= 0xdfff) {
            write_rune(w, desurrogate((uint16_t)code, (uint16_t)low));
            continue;
        }

        // the high surrogate is unpaired: what follows is handled next
        write_replacement_char(w);
        pending = low;
    }

    return check_bufreader(r);
}

// handle_utf16be turns all remaining data from big-endian UTF-16 into UTF-8;
// returns false only when the reader has no buffer to work with
bool handle_utf16be(FILE* w, bufreader* r) {
    return convert_utf16(w, r, true);
}

// handle_utf16le turns all remaining data from little-endian UTF-16 into
// UTF-8; returns false only when the reader has no buffer to work with
bool handle_utf16le(FILE* w, bufreader* r) {
    return convert_utf16(w, r, false);
}

// convert_utf32 turns all remaining data from UTF-32 into UTF-8, using the
// byte-order given: invalid values and incomplete trailing bytes each become
// a replacement char
static bool convert_utf32(FILE* w, bufreader* r, bool big_endian) {
    for (uint64_t i = 0; r->len > 0; i++) {
        if ((i % 1024 == 0) && ferror(w)) {
            break;
        }

        uint32_t bytes[4] = { 0, 0, 0, 0 };
        size_t got = 0;
        for (; got < 4; got++) {
            const int b = read_byte(r);
            if (b == EOF) {
                break;
            }
            bytes[got] = (uint32_t)b;
        }

        if (got == 0) {
            break;
        }
        if (got < 4) {
            write_replacement_char(w);
            break;
        }

        // shifting unsigned values avoids overflowing signed ints
        const uint32_t rune = big_endian
            ? (bytes[0] << 24) | (bytes[1] << 16) | (bytes[2] << 8) | bytes[3]
            : (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | bytes[0];
        write_rune(w, rune);
    }

    return check_bufreader(r);
}

// handle_utf32be turns all remaining data from big-endian UTF-32 into UTF-8;
// returns false only when the reader has no buffer to work with
bool handle_utf32be(FILE* w, bufreader* r) {
    return convert_utf32(w, r, true);
}

// handle_utf32le turns all remaining data from little-endian UTF-32 into
// UTF-8; returns false only when the reader has no buffer to work with
bool handle_utf32le(FILE* w, bufreader* r) {
    return convert_utf32(w, r, false);
}

// dispatch_reader picks how to handle the data from the leading byte-order
// mark, if any, which is skipped: data without such marks are handled as
// UTF-8; parameter src isn't used, as the reader already has the source
bool dispatch_reader(FILE* w, FILE* src, bufreader* r) {
    (void)src;

    switch (detect_bom(r)) {
    case NO_BOM:
        return handle_utf8(w, r);

    case UTF8_BOM:
        discard_bytes(r, 3);
        return handle_utf8(w, r);

    case UTF16_BE_BOM:
        discard_bytes(r, 2);
        return handle_utf16be(w, r);

    case UTF16_LE_BOM:
        discard_bytes(r, 2);
        return handle_utf16le(w, r);

    case UTF32_BE_BOM:
        discard_bytes(r, 4);
        return handle_utf32be(w, r);

    case UTF32_LE_BOM:
        discard_bytes(r, 4);
        return handle_utf32le(w, r);

    default:
        return handle_utf8(w, r);
    }
}

// handle_reader converts all data from the source given, reusing the reader
// given; returns false when the reader has no buffer to work with, which is
// also reported as an error message
bool handle_reader(FILE* w, FILE* src, bufreader* r) {
    restart_bufreader(r, src);

    const bool ok = dispatch_reader(w, src, r);
    if (r->buf == NULL) {
        show_error(w, "can't get memory to read text lines");
    }
    return ok;
}

// handle_file handles data from the filename given; returns false when the
// file can't be opened, or when handling its data fails
bool handle_file(FILE* w, const char* path, bufreader* r) {
    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        putc('\n', w);
        fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", path);
        return false;
    }

    const bool ok = handle_reader(w, f, r);
    fclose(f);
    return ok;
}

// is_help_option simplifies control-flow for func run
bool is_help_option(const char* s) {
    return (s[0] == '-') && (
        strcmp(s, "-h") == 0 ||
        strcmp(s, "-help") == 0 ||
        strcmp(s, "--h") == 0 ||
        strcmp(s, "--help") == 0
    );
}

// run returns the number of errors
631 int run(int argc, char** argv, FILE* w) { 632 if (argc > 1 && is_help_option(argv[1])) { 633 // help option quits the app right away 634 fprintf(stderr, "%s", info); 635 return 0; 636 } 637 638 bufreader r; 639 init_bufreader(&r); 640 size_t errors = 0; 641 642 // handle all filenames/options given 643 for (size_t i = 1; i < argc && !feof(w); i++) { 644 // a `-` filename stands for the standard input 645 if (argv[i][0] == '-' && argv[i][1] == 0) { 646 if (!handle_reader(w, stdin, &r)) { 647 errors++; 648 } 649 continue; 650 } 651 652 if (!handle_file(w, argv[i], &r)) { 653 errors++; 654 } 655 } 656 657 // no filenames means use stdin as the only input 658 if (argc < 2) { 659 if (!handle_reader(w, stdin, &r)) { 660 errors++; 661 } 662 } 663 664 free(r.buf); 665 return errors; 666 } 667 668 int main(int argc, char** argv) { 669 #ifdef _WIN32 670 setmode(fileno(stdin), O_BINARY); 671 // ensure output lines end in LF instead of CRLF on windows 672 setmode(fileno(stdout), O_BINARY); 673 setmode(fileno(stderr), O_BINARY); 674 #endif 675 676 return run(argc, argv, stdout) == 0 ? 0 : 1; 677 }