File: utfate.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O3 -march=native -mtune=native -flto -o ./utfate ./utfate.c
  29 */
  30 
  31 #include <stdbool.h>
  32 #include <stdint.h>
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 
  37 #ifdef _WIN32
  38 #include <fcntl.h>
  39 #include <windows.h>
  40 #endif
  41 
  42 #ifdef RED_ERRORS
  43 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  44 #ifdef __APPLE__
  45 #define ERROR_STYLE "\x1b[31m"
  46 #endif
  47 #define RESET_STYLE "\x1b[0m"
  48 #else
  49 #define ERROR_STYLE
  50 #define RESET_STYLE
  51 #endif
  52 
  53 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  54 
  55 #ifndef IBUF_SIZE
  56 #define IBUF_SIZE (32 * 1024)
  57 #endif
  58 
  59 const char* info = ""
  60 "utfate [options...] [filenames...]\n"
  61 "\n"
  62 "This app turns ASCII/UTF text into UTF-8. ASCII/UTF-8 inputs stay the same,\n"
  63 "leading UTF-8 BOMs (byte-order marks) are ignored, UTF-16 and UTF-32 (both\n"
  64 "in either kind of endianess) are turned into UTF-8.\n"
  65 "\n"
  66 "\n"
  67 "Options\n"
  68 "\n"
  69 "    -h, --h            show this help message\n"
  70 "    -help, --help      aliases for option -h\n"
  71 "";
  72 
  73 typedef struct bufreader {
  74     // buf is the buffer, (re)filled periodically as needed
  75     unsigned char* buf;
  76 
  77     // len is how many buffer bytes are being used, out of its max capacity
  78     size_t len;
  79 
  80     // cap is the buffer's capacity, or the most bytes it can hold at once
  81     size_t cap;
  82 
  83     // pos is the current position, up to the current buffer length
  84     size_t pos;
  85 
  86     // src is the data source used to fill the buffer
  87     FILE* src;
  88 } bufreader;
  89 
  90 // init_bufreader is the constructor for type bufreader
  91 void init_bufreader(bufreader* r, FILE* src, unsigned char* buf, size_t cap) {
  92     r->buf = buf;
  93     r->len = 0;
  94     r->cap = cap;
  95     r->pos = 0;
  96     r->src = src;
  97 }
  98 
  99 void restart_bufreader(bufreader* r, FILE* src) {
 100     r->src = src;
 101     ssize_t len = fread(r->buf, sizeof(unsigned char), r->cap, r->src);
 102     r->len = (len > 0) ? len : 0;
 103 }
 104 
 105 // read_byte does as it says: check its return for the value EOF, before
 106 // using it as the next byte
 107 int read_byte(bufreader* r) {
 108     if (r->pos < r->len) {
 109         // inside current chunk
 110         const unsigned char b = r->buf[r->pos];
 111         r->pos++;
 112         return b;
 113     }
 114 
 115     // need to read the next block
 116     r->pos = 0;
 117     ssize_t len = fread(r->buf, sizeof(unsigned char), r->cap, r->src);
 118     if (len > 0) {
 119         r->len = len;
 120         return r->buf[r->pos++];
 121     }
 122 
 123     // reached the end of data
 124     r->len = 0;
 125     return EOF;
 126 }
 127 
 128 int64_t discard_bytes(bufreader* r, size_t n) {
 129     if (r->pos + n < r->len) {
 130         r->pos += n;
 131         return n;
 132     }
 133 
 134     int64_t discarded = 0;
 135     for (; n > 0; n--, discarded++) {
 136         if (read_byte(r) == EOF) {
 137             break;
 138         }
 139     }
 140     return discarded;
 141 }
 142 
 143 // https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
 144 
 145 static inline bool check_2_byte_rune(int a, int b) {
 146     return (0xc2 <= a && a <= 0xdf) && (0x80 <= b && b <= 0xbf);
 147 }
 148 
 149 bool check_3_byte_rune(int a, int b, int c) {
 150     return (
 151         (a == 0xe0) &&
 152         (0xa0 <= b && b <= 0xbf) &&
 153         (0x80 <= c && c <= 0xbf)
 154     ) || (
 155         (0xe1 <= a && a <= 0xec) &&
 156         (0x80 <= b && b <= 0xbf) &&
 157         (0x80 <= c && c <= 0xbf)
 158     ) || (
 159         (a == 0xed) &&
 160         (0x80 <= b && b <= 0x9f) &&
 161         (0x80 <= c && c <= 0xbf)
 162     ) || (
 163         (a == 0xee || a == 0xef) &&
 164         (0x80 <= b && b <= 0xbf) &&
 165         (0x80 <= c && c <= 0xbf)
 166     );
 167 }
 168 
 169 bool check_4_byte_rune(int a, int b, int c, int d) {
 170     return (
 171         (a == 0xf0) &&
 172         (0x90 <= b && b <= 0xbf) &&
 173         (0x80 <= c && c <= 0xbf) &&
 174         (0x80 <= d && d <= 0xbf)
 175     ) || (
 176         (a == 0xf1 || a == 0xf3) &&
 177         (0x80 <= b && b <= 0xbf) &&
 178         (0x80 <= c && c <= 0xbf) &&
 179         (0x80 <= d && d <= 0xbf)
 180     ) || (
 181         (a == 0xf4) &&
 182         (0x80 <= b && b <= 0xbf) &&
 183         (0x80 <= c && c <= 0x8f) &&
 184         (0x80 <= d && d <= 0xbf)
 185     );
 186 }
 187 
 188 // write_replacement_char is the recommended action to handle invalid bytes
 189 void write_replacement_char(FILE* w) {
 190     fputc(0xef, w);
 191     fputc(0xbf, w);
 192     fputc(0xbd, w);
 193 }
 194 
 195 void copy_utf8_rune(FILE* w, bufreader* r) {
 196     const int a = read_byte(r);
 197     if (a == EOF) {
 198         return;
 199     }
 200 
 201     // handle 1-byte runes
 202     if (a < 128) {
 203         fputc(a, w);
 204         return;
 205     }
 206 
 207     const int b = read_byte(r);
 208     if (b == EOF) {
 209         write_replacement_char(w);
 210         return;
 211     }
 212 
 213     // handle 2-byte runes
 214     if (check_2_byte_rune(a, b)) {
 215         fputc(a, w);
 216         fputc(b, w);
 217         return;
 218     }
 219 
 220     const int c = read_byte(r);
 221     if (c == EOF) {
 222         write_replacement_char(w);
 223         return;
 224     }
 225 
 226     // handle 3-byte runes
 227     if (check_3_byte_rune(a, b, c)) {
 228         fputc(a, w);
 229         fputc(b, w);
 230         fputc(c, w);
 231         return;
 232     }
 233 
 234     const int d = read_byte(r);
 235     if (d == EOF) {
 236         write_replacement_char(w);
 237         return;
 238     }
 239 
 240     // handle 4-byte runes
 241     if (check_4_byte_rune(a, b, c, d)) {
 242         fputc(a, w);
 243         fputc(b, w);
 244         fputc(c, w);
 245         fputc(d, w);
 246         return;
 247     }
 248 
 249     write_replacement_char(w);
 250 }
 251 
 252 // write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8
 253 // void write_rune(FILE* w, uint32_t rune) {
 254 //     if (rune < (1 << 7)) {
 255 //         fputc(rune, w);
 256 //         return;
 257 //     }
 258 //
 259 //     if (rune < (1 << (5 + 6))) {
 260 //         fputc(0b11000000 | (rune >> 6), w);
 261 //         fputc(0b10000000 | (rune & 0b00111111), w);
 262 //         return;
 263 //     }
 264 //
 265 //     if (rune < (1 << (4 + 6 + 6))) {
 266 //         fputc(0b11100000 | (rune >> 12), w);
 267 //         fputc(0b10000000 | ((rune >> 6) & 0b00111111), w);
 268 //         fputc(0b10000000 | (rune & 0b00111111), w);
 269 //         return;
 270 //     }
 271 //
 272 //     if (rune < (1 << (3 + 6 + 6 + 6))) {
 273 //         fputc(0b11110000 | (rune >> 18), w);
 274 //         fputc(0b10000000 | ((rune >> 12) & 0b00111111), w);
 275 //         fputc(0b10000000 | ((rune >> 6) & 0b00111111), w);
 276 //         fputc(0b10000000 | (rune & 0b00111111), w);
 277 //         return;
 278 //     }
 279 //
 280 //     // handle invalid runes with a utf-8 replacement character
 281 //     write_replacement_char(w);
 282 // }
 283 
 284 // write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8
 285 void write_rune(FILE* w, uint32_t rune) {
 286     if (rune < (1 << 7)) {
 287         fputc(rune, w);
 288         return;
 289     }
 290 
 291     if (rune < (1 << (5 + 6))) {
 292         const int a = 0b11000000 | (rune >> 6);
 293         const int b = 0b10000000 | (rune & 0b00111111);
 294         if (check_2_byte_rune(a, b)) {
 295             fputc(a, w);
 296             fputc(b, w);
 297         } else {
 298             write_replacement_char(w);
 299         }
 300         return;
 301     }
 302 
 303     if (rune < (1 << (4 + 6 + 6))) {
 304         const int a = 0b11100000 | (rune >> 12);
 305         const int b = 0b10000000 | ((rune >> 6) & 0b00111111);
 306         const int c = 0b10000000 | (rune & 0b00111111);
 307         if (check_3_byte_rune(a, b, c)) {
 308             fputc(a, w);
 309             fputc(b, w);
 310             fputc(c, w);
 311         } else {
 312             write_replacement_char(w);
 313         }
 314         return;
 315     }
 316 
 317     if (rune < (1 << (3 + 6 + 6 + 6))) {
 318         const int a = 0b11110000 | (rune >> 18);
 319         const int b = 0b10000000 | ((rune >> 12) & 0b00111111);
 320         const int c = 0b10000000 | ((rune >> 6) & 0b00111111);
 321         const int d = 0b10000000 | (rune & 0b00111111);
 322         if (check_4_byte_rune(a, b, c, d)) {
 323             fputc(a, w);
 324             fputc(b, w);
 325             fputc(c, w);
 326             fputc(d, w);
 327         } else {
 328             write_replacement_char(w);
 329         }
 330         return;
 331     }
 332 
 333     // handle invalid runes with a utf-8 replacement character
 334     write_replacement_char(w);
 335 }
 336 
 337 void show_error(FILE* w, const char* msg) {
 338     fputc('\n', w);
 339     fprintf(stderr, ERROR_LINE("%s"), msg);
 340 }
 341 
 342 // desurrogate assumes the utf16 pair given to it is a valid surrogate
 343 static inline uint32_t desurrogate(uint16_t high, uint16_t low) {
 344     return 0x400 * (high - 0xd800) + (low - 0xdc00) + 0x10000;
 345 }
 346 
 347 void handle_utf8(FILE* w, bufreader* r) {
 348     for (uint64_t i = 0; r->len > 0; i++) {
 349         copy_utf8_rune(w, r);
 350         if ((i % 1024 == 0) && feof(w)) {
 351             break;
 352         }
 353     }
 354 }
 355 
 356 void handle_utf8_bom(FILE* w, bufreader* r) {
 357     discard_bytes(r, 3);
 358     handle_utf8(w, r);
 359 }
 360 
 361 void handle_utf16be(FILE* w, bufreader* r) {
 362     discard_bytes(r, 2);
 363 
 364     for (uint64_t i = 0; r->len > 0; i++) {
 365         if ((i % 1024 == 0) && feof(w)) {
 366             break;
 367         }
 368 
 369         const int a = read_byte(r);
 370         if (a == EOF) {
 371             break;
 372         }
 373 
 374         const int b = read_byte(r);
 375         if (b == EOF) {
 376             write_replacement_char(w);
 377             break;
 378         }
 379 
 380         const uint32_t code = (a << 8) + b;
 381 
 382         // handle non-surrogate runes
 383         if ((code <= 0xd7ff) || (code >= 0xe000)) {
 384             write_rune(w, code);
 385             continue;
 386         }
 387 
 388         const int c = read_byte(r);
 389         if (c == EOF) {
 390             write_replacement_char(w);
 391             break;
 392         }
 393 
 394         const int d = read_byte(r);
 395         if (d == EOF) {
 396             write_replacement_char(w);
 397             break;
 398         }
 399 
 400         // https://en.wikipedia.org/wiki/UTF-16
 401 
 402         const uint16_t high = code;
 403         const uint16_t low = (c << 8) + d;
 404 
 405         // handle valid surrogate runes
 406         if (0xdc00 <= low && low <= 0xdfff) {
 407             write_rune(w, desurrogate(high, low));
 408             continue;
 409         }
 410 
 411         write_replacement_char(w);
 412     }
 413 }
 414 
 415 void handle_utf16le(FILE* w, bufreader* r) {
 416     discard_bytes(r, 2);
 417 
 418     for (uint64_t i = 0; r->len > 0; i++) {
 419         if ((i % 1024 == 0) && feof(w)) {
 420             break;
 421         }
 422 
 423         const int a = read_byte(r);
 424         if (a == EOF) {
 425             break;
 426         }
 427 
 428         const int b = read_byte(r);
 429         if (b == EOF) {
 430             write_replacement_char(w);
 431             break;
 432         }
 433 
 434         const uint32_t code = (b << 8) + a;
 435 
 436         // handle non-surrogate runes
 437         if ((code <= 0xd7ff) || (code >= 0xe000)) {
 438             write_rune(w, code);
 439             continue;
 440         }
 441 
 442         const int c = read_byte(r);
 443         if (c == EOF) {
 444             write_replacement_char(w);
 445             break;
 446         }
 447 
 448         const int d = read_byte(r);
 449         if (d == EOF) {
 450             write_replacement_char(w);
 451             break;
 452         }
 453 
 454         // https://en.wikipedia.org/wiki/UTF-16
 455 
 456         const uint16_t high = code;
 457         const uint16_t low = (d << 8) + c;
 458 
 459         // handle valid surrogate runes
 460         if (0xdc00 <= low && low <= 0xdfff) {
 461             write_rune(w, desurrogate(high, low));
 462             continue;
 463         }
 464 
 465         write_replacement_char(w);
 466     }
 467 }
 468 
 469 void handle_utf32be(FILE* w, bufreader* r) {
 470     discard_bytes(r, 4);
 471 
 472     for (uint64_t i = 0; r->len > 0; i++) {
 473         if ((i % 1024 == 0) && feof(w)) {
 474             break;
 475         }
 476 
 477         const int a = read_byte(r);
 478         if (a == EOF) {
 479             break;
 480         }
 481 
 482         const int b = read_byte(r);
 483         if (b == EOF) {
 484             write_replacement_char(w);
 485             break;
 486         }
 487 
 488         const int c = read_byte(r);
 489         if (c == EOF) {
 490             write_replacement_char(w);
 491             break;
 492         }
 493 
 494         const int d = read_byte(r);
 495         if (d == EOF) {
 496             write_replacement_char(w);
 497             break;
 498         }
 499 
 500         write_rune(w, (a << 24) + (b << 16) + (c << 8) + d);
 501     }
 502 }
 503 
 504 void handle_utf32le(FILE* w, bufreader* r) {
 505     discard_bytes(r, 4);
 506 
 507     for (uint64_t i = 0; r->len > 0; i++) {
 508         if ((i % 1024 == 0) && feof(w)) {
 509             break;
 510         }
 511 
 512         const int a = read_byte(r);
 513         if (a == EOF) {
 514             break;
 515         }
 516 
 517         const int b = read_byte(r);
 518         if (b == EOF) {
 519             write_replacement_char(w);
 520             break;
 521         }
 522 
 523         const int c = read_byte(r);
 524         if (c == EOF) {
 525             write_replacement_char(w);
 526             break;
 527         }
 528 
 529         const int d = read_byte(r);
 530         if (d == EOF) {
 531             write_replacement_char(w);
 532             break;
 533         }
 534 
 535         write_rune(w, (d << 24) + (c << 16) + (b << 8) + a);
 536     }
 537 }
 538 
 539 void (*detect_bom(const bufreader* r))(FILE*, bufreader*) {
 540     const unsigned char* p = r->buf;
 541     const ssize_t len = r->len;
 542 
 543     if (len >= 4) {
 544         if (p[0] == 0x00 && p[1] == 0x00 && p[2] == 0xfe && p[3] == 0xff) {
 545             return handle_utf32be;
 546         }
 547         if (p[0] == 0xff && p[1] == 0xfe && p[2] == 0x00 && p[3] == 0x00) {
 548             return handle_utf32le;
 549         }
 550     }
 551 
 552     if (len >= 3 && p[0] == 0xef && p[1] == 0xbb && p[2] == 0xbf) {
 553         return handle_utf8_bom;
 554     }
 555 
 556     if (len >= 2) {
 557         if (p[0] == 0xfe && p[1] == 0xff) {
 558             return handle_utf16be;
 559         }
 560         if (p[0] == 0xff && p[1] == 0xfe) {
 561             return handle_utf16le;
 562         }
 563     }
 564 
 565     return handle_utf8;
 566 }
 567 
 568 void handle_reader(FILE* w, FILE* src, bufreader* r) {
 569     restart_bufreader(r, src);
 570     detect_bom(r)(w, r);
 571     fflush(w);
 572 }
 573 
 574 // handle_file handles data from the filename given; returns false only when
 575 // the file can't be opened
 576 bool handle_file(FILE* w, const char* path, bufreader* r) {
 577     FILE* f = fopen(path, "rb");
 578     if (f == NULL) {
 579         fputc('\n', w);
 580         fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
 581         return false;
 582     }
 583 
 584     handle_reader(w, f, r);
 585     fclose(f);
 586     return true;
 587 }
 588 
 589 // is_help_option simplifies control-flow for func run
 590 bool is_help_option(const char* s) {
 591     return (s[0] == '-') && (
 592         strcmp(s, "-h") == 0 ||
 593         strcmp(s, "-help") == 0 ||
 594         strcmp(s, "--h") == 0 ||
 595         strcmp(s, "--help") == 0
 596     );
 597 }
 598 
 599 // run returns the number of errors
 600 int run(int argc, char** argv, FILE* w) {
 601     unsigned char buf[IBUF_SIZE];
 602     bufreader r;
 603     init_bufreader(&r, stdin, buf, sizeof(buf));
 604     size_t errors = 0;
 605 
 606     // handle all filenames/options given
 607     for (size_t i = 1; i < argc && !feof(w); i++) {
 608         // a `-` filename stands for the standard input
 609         if (argv[i][0] == '-' && argv[i][1] == 0) {
 610             handle_reader(w, stdin, &r);
 611             continue;
 612         }
 613 
 614         if (!handle_file(w, argv[i], &r)) {
 615             errors++;
 616         }
 617     }
 618 
 619     // no filenames means use stdin as the only input
 620     if (argc < 2) {
 621         handle_reader(w, stdin, &r);
 622     }
 623 
 624     return errors;
 625 }
 626 
 627 int main(int argc, char** argv) {
 628 #ifdef _WIN32
 629     setmode(fileno(stdin), O_BINARY);
 630     // ensure output lines end in LF instead of CRLF on windows
 631     setmode(fileno(stdout), O_BINARY);
 632     setmode(fileno(stderr), O_BINARY);
 633 #endif
 634 
 635     if (argc > 1 && is_help_option(argv[1])) {
 636         fprintf(stderr, "%s", info);
 637         return 0;
 638     }
 639 
 640     return run(argc, argv, stdout) == 0 ? 0 : 1;
 641 }