File: utfate.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O2 -o ./utfate ./utfate.c
  29 */
  30 
  31 #include <stdbool.h>
  32 #include <stdint.h>
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 
  37 #ifdef _WIN32
  38 #include <fcntl.h>
  39 #include <windows.h>
  40 #endif
  41 
  42 // info is the multi-line help message
  43 const char* info = ""
  44 "utfate [options...] [filenames...]\n"
  45 "\n"
  46 "This app turns ASCII/UTF text into UTF-8. ASCII/UTF-8 inputs stay the same,\n"
  47 "leading UTF-8 BOMs (byte-order marks) are ignored, UTF-16 and UTF-32 (both\n"
  48 "in either kind of endianess) are turned into UTF-8.\n"
  49 "\n"
  50 "\n"
  51 "Options\n"
  52 "\n"
  53 "    -h, --h            show this help message\n"
  54 "    -help, --help      aliases for option -h\n"
  55 "";
  56 
  57 typedef struct bufreader {
  58     // buf is the buffer, (re)filled periodically as needed
  59     unsigned char* buf;
  60 
  61     // len is how many buffer bytes are being used, out of its max capacity
  62     size_t len;
  63 
  64     // cap is the buffer's capacity, or the most bytes it can hold at once
  65     size_t cap;
  66 
  67     // pos is the current position, up to the current buffer length
  68     size_t pos;
  69 
  70     // src is the data source used to fill the buffer
  71     FILE* src;
  72 } bufreader;
  73 
  74 // init_bufreader is the constructor for type bufreader
  75 void init_bufreader(bufreader* r) {
  76     r->buf = NULL;
  77     r->len = 0;
  78     r->cap = 0;
  79     r->pos = 0;
  80     r->src = NULL;
  81 }
  82 
  83 void restart_bufreader(bufreader* r, FILE* src) {
  84     r->src = src;
  85 
  86     // allow peeking at the first few input bytes, which are needed to detect
  87     // which specific utf input-format is being used
  88     ssize_t len = getline((char**)&r->buf, &r->cap, r->src);
  89     r->len = (len > 0) ? len : 0;
  90 }
  91 
  92 bool check_bufreader(const bufreader* r) {
  93     return r->buf != NULL;
  94 }
  95 
  96 // read_byte does as it says: check its return for the value EOF, before
  97 // using it as the next byte
  98 int read_byte(bufreader* r) {
  99     if (r->pos < r->len) {
 100         // inside current chunk
 101         const unsigned char b = r->buf[r->pos];
 102         r->pos++;
 103         return b;
 104     }
 105 
 106     // need to read the next block
 107     r->pos = 0;
 108     ssize_t len = getline((char**)&r->buf, &r->cap, r->src);
 109     if (len > 0) {
 110         r->len = len;
 111         return r->buf[r->pos++];
 112     }
 113 
 114     // reached the end of data
 115     r->len = 0;
 116     return EOF;
 117 }
 118 
 119 int64_t discard_bytes(bufreader* r, size_t n) {
 120     if (r->pos + n < r->len) {
 121         r->pos += n;
 122         return n;
 123     }
 124 
 125     int64_t discarded = 0;
 126     for (; n > 0; n--, discarded++) {
 127         if (read_byte(r) == EOF) {
 128             break;
 129         }
 130     }
 131     return discarded;
 132 }
 133 
 134 // https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
 135 
 136 bool check_2_byte_rune(int a, int b) {
 137     return (0xc2 <= a && a <= 0xdf) && (0x80 <= b && b <= 0xbf);
 138 }
 139 
 140 bool check_3_byte_rune(int a, int b, int c) {
 141     return (
 142         (a == 0xe0) &&
 143         (0xa0 <= b && b <= 0xbf) &&
 144         (0x80 <= c && c <= 0xbf)
 145     ) || (
 146         (0xe1 <= a && a <= 0xec) &&
 147         (0x80 <= b && b <= 0xbf) &&
 148         (0x80 <= c && c <= 0xbf)
 149     ) || (
 150         (a == 0xed) &&
 151         (0x80 <= b && b <= 0x9f) &&
 152         (0x80 <= c && c <= 0xbf)
 153     ) || (
 154         (a == 0xee || a == 0xef) &&
 155         (0x80 <= b && b <= 0xbf) &&
 156         (0x80 <= c && c <= 0xbf)
 157     );
 158 }
 159 
 160 bool check_4_byte_rune(int a, int b, int c, int d) {
 161     return (
 162         (a == 0xf0) &&
 163         (0x90 <= b && b <= 0xbf) &&
 164         (0x80 <= c && c <= 0xbf) &&
 165         (0x80 <= d && d <= 0xbf)
 166     ) || (
 167         (a == 0xf1 || a == 0xf3) &&
 168         (0x80 <= b && b <= 0xbf) &&
 169         (0x80 <= c && c <= 0xbf) &&
 170         (0x80 <= d && d <= 0xbf)
 171     ) || (
 172         (a == 0xf4) &&
 173         (0x80 <= b && b <= 0xbf) &&
 174         (0x80 <= c && c <= 0x8f) &&
 175         (0x80 <= d && d <= 0xbf)
 176     );
 177 }
 178 
 179 // write_replacement_char is the recommended action to handle invalid bytes
 180 void write_replacement_char(FILE* w) {
 181     putc(0xef, w);
 182     putc(0xbf, w);
 183     putc(0xbd, w);
 184 }
 185 
 186 void copy_utf8_rune(FILE* w, bufreader* r) {
 187     const int a = read_byte(r);
 188     if (a == EOF) {
 189         return;
 190     }
 191 
 192     // handle 1-byte runes
 193     if (a < 128) {
 194         putc(a, w);
 195         return;
 196     }
 197 
 198     const int b = read_byte(r);
 199     if (b == EOF) {
 200         write_replacement_char(w);
 201         return;
 202     }
 203 
 204     // handle 2-byte runes
 205     if (check_2_byte_rune(a, b)) {
 206         putc(a, w);
 207         putc(b, w);
 208         return;
 209     }
 210 
 211     const int c = read_byte(r);
 212     if (c == EOF) {
 213         write_replacement_char(w);
 214         return;
 215     }
 216 
 217     // handle 3-byte runes
 218     if (check_3_byte_rune(a, b, c)) {
 219         putc(a, w);
 220         putc(b, w);
 221         putc(c, w);
 222         return;
 223     }
 224 
 225     const int d = read_byte(r);
 226     if (d == EOF) {
 227         write_replacement_char(w);
 228         return;
 229     }
 230 
 231     // handle 4-byte runes
 232     if (check_4_byte_rune(a, b, c, d)) {
 233         putc(a, w);
 234         putc(b, w);
 235         putc(c, w);
 236         putc(d, w);
 237         return;
 238     }
 239 
 240     write_replacement_char(w);
 241 }
 242 
 243 // write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8
 244 // void write_rune(FILE* w, uint32_t rune) {
 245 //     if (rune < (1 << 7)) {
 246 //         putc(rune, w);
 247 //         return;
 248 //     }
 249 //
 250 //     if (rune < (1 << (5 + 6))) {
 251 //         putc(0b11000000 | (rune >> 6), w);
 252 //         putc(0b10000000 | (rune & 0b00111111), w);
 253 //         return;
 254 //     }
 255 //
 256 //     if (rune < (1 << (4 + 6 + 6))) {
 257 //         putc(0b11100000 | (rune >> 12), w);
 258 //         putc(0b10000000 | ((rune >> 6) & 0b00111111), w);
 259 //         putc(0b10000000 | (rune & 0b00111111), w);
 260 //         return;
 261 //     }
 262 //
 263 //     if (rune < (1 << (3 + 6 + 6 + 6))) {
 264 //         putc(0b11110000 | (rune >> 18), w);
 265 //         putc(0b10000000 | ((rune >> 12) & 0b00111111), w);
 266 //         putc(0b10000000 | ((rune >> 6) & 0b00111111), w);
 267 //         putc(0b10000000 | (rune & 0b00111111), w);
 268 //         return;
 269 //     }
 270 //
 271 //     // handle invalid runes with a utf-8 replacement character
 272 //     write_replacement_char(w);
 273 // }
 274 
 275 // write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8
 276 void write_rune(FILE* w, uint32_t rune) {
 277     if (rune < (1 << 7)) {
 278         putc(rune, w);
 279         return;
 280     }
 281 
 282     if (rune < (1 << (5 + 6))) {
 283         const int a = 0b11000000 | (rune >> 6);
 284         const int b = 0b10000000 | (rune & 0b00111111);
 285         if (check_2_byte_rune(a, b)) {
 286             putc(a, w);
 287             putc(b, w);
 288         } else {
 289             write_replacement_char(w);
 290         }
 291         return;
 292     }
 293 
 294     if (rune < (1 << (4 + 6 + 6))) {
 295         const int a = 0b11100000 | (rune >> 12);
 296         const int b = 0b10000000 | ((rune >> 6) & 0b00111111);
 297         const int c = 0b10000000 | (rune & 0b00111111);
 298         if (check_3_byte_rune(a, b, c)) {
 299             putc(a, w);
 300             putc(b, w);
 301             putc(c, w);
 302         } else {
 303             write_replacement_char(w);
 304         }
 305         return;
 306     }
 307 
 308     if (rune < (1 << (3 + 6 + 6 + 6))) {
 309         const int a = 0b11110000 | (rune >> 18);
 310         const int b = 0b10000000 | ((rune >> 12) & 0b00111111);
 311         const int c = 0b10000000 | ((rune >> 6) & 0b00111111);
 312         const int d = 0b10000000 | (rune & 0b00111111);
 313         if (check_4_byte_rune(a, b, c, d)) {
 314             putc(a, w);
 315             putc(b, w);
 316             putc(c, w);
 317             putc(d, w);
 318         } else {
 319             write_replacement_char(w);
 320         }
 321         return;
 322     }
 323 
 324     // handle invalid runes with a utf-8 replacement character
 325     write_replacement_char(w);
 326 }
 327 
 328 void show_error(FILE* w, const char* msg) {
 329     putc('\n', w);
 330     fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg);
 331 }
 332 
 333 typedef enum detected_bom {
 334     NO_BOM = 0,
 335     UTF8_BOM = 1,
 336     UTF16_BE_BOM = 2,
 337     UTF16_LE_BOM = 3,
 338     UTF32_BE_BOM = 4,
 339     UTF32_LE_BOM = 5,
 340 } detected_bom;
 341 
 342 detected_bom detect_bom(const bufreader* r) {
 343     const unsigned char* p = r->buf;
 344     const ssize_t len = r->len;
 345 
 346     if (len >= 4) {
 347         if (p[0] == 0x00 && p[1] == 0x00 && p[2] == 0xfe && p[3] == 0xff) {
 348             return UTF32_BE_BOM;
 349         }
 350         if (p[0] == 0xff && p[1] == 0xfe && p[2] == 0x00 && p[3] == 0x00) {
 351             return UTF32_LE_BOM;
 352         }
 353     }
 354 
 355     if (len >= 3 && p[0] == 0xef && p[1] == 0xbb && p[2] == 0xbf) {
 356         return UTF8_BOM;
 357     }
 358 
 359     if (len >= 2) {
 360         if (p[0] == 0xfe && p[1] == 0xff) {
 361             return UTF16_BE_BOM;
 362         }
 363         if (p[0] == 0xff && p[1] == 0xfe) {
 364             return UTF16_LE_BOM;
 365         }
 366     }
 367 
 368     return NO_BOM;
 369 }
 370 
 371 // desurrogate assumes the utf16 pair given to it is a valid surrogate
 372 uint32_t desurrogate(uint16_t high, uint16_t low) {
 373     return 0x400 * (high - 0xd800) + (low - 0xdc00) + 0x10000;
 374 }
 375 
 376 bool handle_utf8(FILE* w, bufreader* r) {
 377     for (uint64_t i = 0; r->len > 0; i++) {
 378         copy_utf8_rune(w, r);
 379         if ((i % 1024 == 0) && feof(w)) {
 380             break;
 381         }
 382     }
 383 
 384     return check_bufreader(r);
 385 }
 386 
 387 bool handle_utf16be(FILE* w, bufreader* r) {
 388     for (uint64_t i = 0; r->len > 0; i++) {
 389         if ((i % 1024 == 0) && feof(w)) {
 390             break;
 391         }
 392 
 393         const int a = read_byte(r);
 394         if (a == EOF) {
 395             break;
 396         }
 397 
 398         const int b = read_byte(r);
 399         if (b == EOF) {
 400             write_replacement_char(w);
 401             break;
 402         }
 403 
 404         const uint32_t code = (a << 8) + b;
 405 
 406         // handle non-surrogate runes
 407         if ((code <= 0xd7ff) || (code >= 0xe000)) {
 408             write_rune(w, code);
 409             continue;
 410         }
 411 
 412         const int c = read_byte(r);
 413         if (c == EOF) {
 414             write_replacement_char(w);
 415             break;
 416         }
 417 
 418         const int d = read_byte(r);
 419         if (d == EOF) {
 420             write_replacement_char(w);
 421             break;
 422         }
 423 
 424         // https://en.wikipedia.org/wiki/UTF-16
 425 
 426         const uint16_t high = code;
 427         const uint16_t low = (c << 8) + d;
 428 
 429         // handle valid surrogate runes
 430         if (0xdc00 <= low && low <= 0xdfff) {
 431             write_rune(w, desurrogate(high, low));
 432             continue;
 433         }
 434 
 435         write_replacement_char(w);
 436     }
 437 
 438     return check_bufreader(r);
 439 }
 440 
 441 bool handle_utf16le(FILE* w, bufreader* r) {
 442     for (uint64_t i = 0; r->len > 0; i++) {
 443         if ((i % 1024 == 0) && feof(w)) {
 444             break;
 445         }
 446 
 447         const int a = read_byte(r);
 448         if (a == EOF) {
 449             break;
 450         }
 451 
 452         const int b = read_byte(r);
 453         if (b == EOF) {
 454             write_replacement_char(w);
 455             break;
 456         }
 457 
 458         const uint32_t code = (b << 8) + a;
 459 
 460         // handle non-surrogate runes
 461         if ((code <= 0xd7ff) || (code >= 0xe000)) {
 462             write_rune(w, code);
 463             continue;
 464         }
 465 
 466         const int c = read_byte(r);
 467         if (c == EOF) {
 468             write_replacement_char(w);
 469             break;
 470         }
 471 
 472         const int d = read_byte(r);
 473         if (d == EOF) {
 474             write_replacement_char(w);
 475             break;
 476         }
 477 
 478         // https://en.wikipedia.org/wiki/UTF-16
 479 
 480         const uint16_t high = code;
 481         const uint16_t low = (d << 8) + c;
 482 
 483         // handle valid surrogate runes
 484         if (0xdc00 <= low && low <= 0xdfff) {
 485             write_rune(w, desurrogate(high, low));
 486             continue;
 487         }
 488 
 489         write_replacement_char(w);
 490     }
 491 
 492     return check_bufreader(r);
 493 }
 494 
 495 bool handle_utf32be(FILE* w, bufreader* r) {
 496     for (uint64_t i = 0; r->len > 0; i++) {
 497         if ((i % 1024 == 0) && feof(w)) {
 498             break;
 499         }
 500 
 501         const int a = read_byte(r);
 502         if (a == EOF) {
 503             break;
 504         }
 505 
 506         const int b = read_byte(r);
 507         if (b == EOF) {
 508             write_replacement_char(w);
 509             break;
 510         }
 511 
 512         const int c = read_byte(r);
 513         if (c == EOF) {
 514             write_replacement_char(w);
 515             break;
 516         }
 517 
 518         const int d = read_byte(r);
 519         if (d == EOF) {
 520             write_replacement_char(w);
 521             break;
 522         }
 523 
 524         write_rune(w, (a << 24) + (b << 16) + (c << 8) + d);
 525     }
 526 
 527     return check_bufreader(r);
 528 }
 529 
 530 bool handle_utf32le(FILE* w, bufreader* r) {
 531     for (uint64_t i = 0; r->len > 0; i++) {
 532         if ((i % 1024 == 0) && feof(w)) {
 533             break;
 534         }
 535 
 536         const int a = read_byte(r);
 537         if (a == EOF) {
 538             break;
 539         }
 540 
 541         const int b = read_byte(r);
 542         if (b == EOF) {
 543             write_replacement_char(w);
 544             break;
 545         }
 546 
 547         const int c = read_byte(r);
 548         if (c == EOF) {
 549             write_replacement_char(w);
 550             break;
 551         }
 552 
 553         const int d = read_byte(r);
 554         if (d == EOF) {
 555             write_replacement_char(w);
 556             break;
 557         }
 558 
 559         write_rune(w, (d << 24) + (c << 16) + (b << 8) + a);
 560     }
 561 
 562     return check_bufreader(r);
 563 }
 564 
 565 bool dispatch_reader(FILE* w, FILE* src, bufreader* r) {
 566     switch (detect_bom(r)) {
 567         case NO_BOM:
 568             return handle_utf8(w, r);
 569 
 570         case UTF8_BOM:
 571             discard_bytes(r, 3);
 572             return handle_utf8(w, r);
 573 
 574         case UTF16_BE_BOM:
 575             discard_bytes(r, 2);
 576             return handle_utf16be(w, r);
 577 
 578         case UTF16_LE_BOM:
 579             discard_bytes(r, 2);
 580             return handle_utf16le(w, r);
 581 
 582         case UTF32_BE_BOM:
 583             discard_bytes(r, 4);
 584             return handle_utf32be(w, r);
 585 
 586         case UTF32_LE_BOM:
 587             discard_bytes(r, 4);
 588             return handle_utf32le(w, r);
 589 
 590         default:
 591             return handle_utf8(w, r);
 592     }
 593 }
 594 
 595 bool handle_reader(FILE* w, FILE* src, bufreader* r) {
 596     restart_bufreader(r, src);
 597 
 598     const bool ok = dispatch_reader(w, src, r);
 599     if (r->buf == NULL) {
 600         show_error(w, "can't get memory to read text lines");
 601     }
 602     return ok;
 603 }
 604 
 605 // handle_file handles data from the filename given; returns false only when
 606 // the file can't be opened
 607 bool handle_file(FILE* w, const char* path, bufreader* r) {
 608     FILE* f = fopen(path, "rb");
 609     if (f == NULL) {
 610         putc('\n', w);
 611         fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", path);
 612         return false;
 613     }
 614 
 615     const bool ok = handle_reader(w, f, r);
 616     fclose(f);
 617     return ok;
 618 }
 619 
 620 // is_help_option simplifies control-flow for func run
 621 bool is_help_option(const char* s) {
 622     return (s[0] == '-') && (
 623         strcmp(s, "-h") == 0 ||
 624         strcmp(s, "-help") == 0 ||
 625         strcmp(s, "--h") == 0 ||
 626         strcmp(s, "--help") == 0
 627     );
 628 }
 629 
 630 // run returns the number of errors
 631 int run(int argc, char** argv, FILE* w) {
 632     if (argc > 1 && is_help_option(argv[1])) {
 633         // help option quits the app right away
 634         fprintf(stderr, "%s", info);
 635         return 0;
 636     }
 637 
 638     bufreader r;
 639     init_bufreader(&r);
 640     size_t errors = 0;
 641 
 642     // handle all filenames/options given
 643     for (size_t i = 1; i < argc && !feof(w); i++) {
 644         // a `-` filename stands for the standard input
 645         if (argv[i][0] == '-' && argv[i][1] == 0) {
 646             if (!handle_reader(w, stdin, &r)) {
 647                 errors++;
 648             }
 649             continue;
 650         }
 651 
 652         if (!handle_file(w, argv[i], &r)) {
 653             errors++;
 654         }
 655     }
 656 
 657     // no filenames means use stdin as the only input
 658     if (argc < 2) {
 659         if (!handle_reader(w, stdin, &r)) {
 660             errors++;
 661         }
 662     }
 663 
 664     free(r.buf);
 665     return errors;
 666 }
 667 
 668 int main(int argc, char** argv) {
 669 #ifdef _WIN32
 670     setmode(fileno(stdin), O_BINARY);
 671     // ensure output lines end in LF instead of CRLF on windows
 672     setmode(fileno(stdout), O_BINARY);
 673     setmode(fileno(stderr), O_BINARY);
 674 #endif
 675 
 676     return run(argc, argv, stdout) == 0 ? 0 : 1;
 677 }