File: ut.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O2 -o ./ut ./ut.c
  29 */
  30 
  31 #include <stdbool.h>
  32 #include <stdint.h>
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 
  37 #ifdef _WIN32
  38 #include <fcntl.h>
  39 #include <windows.h>
  40 #endif
  41 
  42 // info is the multi-line help message
  43 const char* info = ""
  44 "ut [options...] [filenames...]\n"
  45 "\n"
  46 "\n"
  47 "Unit Text: a tool to turn ASCII/Unicode text into UTF-8 unix-style lines,\n"
  48 "auto-detecting leading UTF byte-order marks, and decoding accordingly.\n"
  49 "\n"
  50 "This tool concatenates lines ignoring carriage-returns on CRLF pairs at the\n"
  51 "end of lines, and ensures no lines across inputs are accidentally joined,\n"
  52 "since all lines it outputs end with line-feeds, even when the original files\n"
  53 "don't.\n"
  54 "\n"
  55 "\n"
  56 "Options\n"
  57 "\n"
  58 "    -h, --h            show this help message\n"
  59 "    -help, --help      aliases for option -h\n"
  60 "";
  61 
  62 typedef struct bufreader {
  63     // buf is the buffer, (re)filled periodically as needed
  64     unsigned char* buf;
  65 
  66     // len is how many buffer bytes are being used, out of its max capacity
  67     size_t len;
  68 
  69     // cap is the buffer's capacity, or the most bytes it can hold at once
  70     size_t cap;
  71 
  72     // pos is the current position, up to the current buffer length
  73     size_t pos;
  74 
  75     // src is the data source used to fill the buffer
  76     FILE* src;
  77 } bufreader;
  78 
  79 // init_bufreader is the constructor for type bufreader
  80 void init_bufreader(bufreader* r, FILE* src, unsigned char* buf, size_t cap) {
  81     r->buf = buf;
  82     r->len = 0;
  83     r->cap = cap;
  84     r->pos = 0;
  85     r->src = src;
  86 
  87     // allow peeking at the first few input bytes, which are needed to detect
  88     // which specific utf input-format is being used
  89     r->len = fread(r->buf, sizeof(unsigned char), r->cap, r->src);
  90 }
  91 
  92 // read_byte does as it says: check its return for the value EOF, before
  93 // using it as the next byte
  94 int read_byte(bufreader* r) {
  95     if (r->pos < r->len) {
  96         // inside current chunk
  97         const unsigned char b = r->buf[r->pos];
  98         r->pos++;
  99         return b;
 100     }
 101 
 102     // need to read the next block
 103     r->pos = 0;
 104     r->len = fread(r->buf, sizeof(unsigned char), r->cap, r->src);
 105     if (r->len > 0) {
 106         const unsigned char b = r->buf[r->pos];
 107         r->pos++;
 108         return b;
 109     }
 110 
 111     // reached the end of data
 112     return EOF;
 113 }
 114 
 115 int64_t discard_bytes(bufreader* r, size_t n) {
 116     if (r->pos + n < r->len) {
 117         r->pos += n;
 118         return n;
 119     }
 120 
 121     int64_t discarded = 0;
 122     for (; n > 0; n--, discarded++) {
 123         if (read_byte(r) == EOF) {
 124             break;
 125         }
 126     }
 127     return discarded;
 128 }
 129 
 130 // https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
 131 
 132 bool check_2_byte_rune(int a, int b) {
 133     return (0xc2 <= a && a <= 0xdf) && (0x80 <= b && b <= 0xbf);
 134 }
 135 
 136 bool check_3_byte_rune(int a, int b, int c) {
 137     return (
 138         (a == 0xe0) &&
 139         (0xa0 <= b && b <= 0xbf) &&
 140         (0x80 <= c && c <= 0xbf)
 141     ) || (
 142         (0xe1 <= a && a <= 0xec) &&
 143         (0x80 <= b && b <= 0xbf) &&
 144         (0x80 <= c && c <= 0xbf)
 145     ) || (
 146         (a == 0xed) &&
 147         (0x80 <= b && b <= 0x9f) &&
 148         (0x80 <= c && c <= 0xbf)
 149     ) || (
 150         (a == 0xee || a == 0xef) &&
 151         (0x80 <= b && b <= 0xbf) &&
 152         (0x80 <= c && c <= 0xbf)
 153     );
 154 }
 155 
 156 bool check_4_byte_rune(int a, int b, int c, int d) {
 157     return (
 158         (a == 0xf0) &&
 159         (0x90 <= b && b <= 0xbf) &&
 160         (0x80 <= c && c <= 0xbf) &&
 161         (0x80 <= d && d <= 0xbf)
 162     ) || (
 163         (a == 0xf1 || a == 0xf3) &&
 164         (0x80 <= b && b <= 0xbf) &&
 165         (0x80 <= c && c <= 0xbf) &&
 166         (0x80 <= d && d <= 0xbf)
 167     ) || (
 168         (a == 0xf4) &&
 169         (0x80 <= b && b <= 0xbf) &&
 170         (0x80 <= c && c <= 0x8f) &&
 171         (0x80 <= d && d <= 0xbf)
 172     );
 173 }
 174 
 175 // write_replacement_char is the recommended action to handle invalid bytes
 176 void write_replacement_char(FILE* w) {
 177     putc(0xef, w);
 178     putc(0xbf, w);
 179     putc(0xbd, w);
 180 }
 181 
 182 // write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8
 183 // void write_rune(FILE* w, uint32_t rune) {
 184 //     if (rune < (1 << 7)) {
 185 //         putc(rune, w);
 186 //         return;
 187 //     }
 188 //
 189 //     if (rune < (1 << (5 + 6))) {
 190 //         putc(0b11000000 | (rune >> 6), w);
 191 //         putc(0b10000000 | (rune & 0b00111111), w);
 192 //         return;
 193 //     }
 194 //
 195 //     if (rune < (1 << (4 + 6 + 6))) {
 196 //         putc(0b11100000 | (rune >> 12), w);
 197 //         putc(0b10000000 | ((rune >> 6) & 0b00111111), w);
 198 //         putc(0b10000000 | (rune & 0b00111111), w);
 199 //         return;
 200 //     }
 201 //
 202 //     if (rune < (1 << (3 + 6 + 6 + 6))) {
 203 //         putc(0b11110000 | (rune >> 18), w);
 204 //         putc(0b10000000 | ((rune >> 12) & 0b00111111), w);
 205 //         putc(0b10000000 | ((rune >> 6) & 0b00111111), w);
 206 //         putc(0b10000000 | (rune & 0b00111111), w);
 207 //         return;
 208 //     }
 209 //
 210 //     // handle invalid runes with a utf-8 replacement character
 211 //     write_replacement_char(w);
 212 // }
 213 
 214 // write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8
 215 void write_rune(FILE* w, uint32_t rune) {
 216     if (rune < (1 << 7)) {
 217         putc(rune, w);
 218         return;
 219     }
 220 
 221     if (rune < (1 << (5 + 6))) {
 222         const int a = 0b11000000 | (rune >> 6);
 223         const int b = 0b10000000 | (rune & 0b00111111);
 224         if (check_2_byte_rune(a, b)) {
 225             putc(a, w);
 226             putc(b, w);
 227         } else {
 228             write_replacement_char(w);
 229         }
 230         return;
 231     }
 232 
 233     if (rune < (1 << (4 + 6 + 6))) {
 234         const int a = 0b11100000 | (rune >> 12);
 235         const int b = 0b10000000 | ((rune >> 6) & 0b00111111);
 236         const int c = 0b10000000 | (rune & 0b00111111);
 237         if (check_3_byte_rune(a, b, c)) {
 238             putc(a, w);
 239             putc(b, w);
 240             putc(c, w);
 241         } else {
 242             write_replacement_char(w);
 243         }
 244         return;
 245     }
 246 
 247     if (rune < (1 << (3 + 6 + 6 + 6))) {
 248         const int a = 0b11110000 | (rune >> 18);
 249         const int b = 0b10000000 | ((rune >> 12) & 0b00111111);
 250         const int c = 0b10000000 | ((rune >> 6) & 0b00111111);
 251         const int d = 0b10000000 | (rune & 0b00111111);
 252         if (check_4_byte_rune(a, b, c, d)) {
 253             putc(a, w);
 254             putc(b, w);
 255             putc(c, w);
 256             putc(d, w);
 257         } else {
 258             write_replacement_char(w);
 259         }
 260         return;
 261     }
 262 
 263     // handle invalid runes with a utf-8 replacement character
 264     write_replacement_char(w);
 265 }
 266 
 267 void show_error(FILE* w, const char* msg) {
 268     putc('\n', w);
 269     fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg);
 270 }
 271 
 272 typedef enum detected_bom {
 273     NO_BOM = 0,
 274     UTF8_BOM = 1,
 275     UTF16_BE_BOM = 2,
 276     UTF16_LE_BOM = 3,
 277     UTF32_BE_BOM = 4,
 278     UTF32_LE_BOM = 5,
 279 } detected_bom;
 280 
 281 detected_bom detect_bom(const bufreader* r) {
 282     const unsigned char* p = r->buf;
 283     const size_t len = r->len;
 284 
 285     if (len >= 4) {
 286         if (p[0] == 0x00 && p[1] == 0x00 && p[2] == 0xfe && p[3] == 0xff) {
 287             return UTF32_BE_BOM;
 288         }
 289         if (p[0] == 0xff && p[1] == 0xfe && p[2] == 0x00 && p[3] == 0x00) {
 290             return UTF32_LE_BOM;
 291         }
 292     }
 293 
 294     if (len >= 3 && p[0] == 0xef && p[1] == 0xbb && p[2] == 0xbf) {
 295         return UTF8_BOM;
 296     }
 297 
 298     if (len >= 2) {
 299         if (p[0] == 0xfe && p[1] == 0xff) {
 300             return UTF16_BE_BOM;
 301         }
 302         if (p[0] == 0xff && p[1] == 0xfe) {
 303             return UTF16_LE_BOM;
 304         }
 305     }
 306 
 307     return NO_BOM;
 308 }
 309 
 310 // desurrogate assumes the utf16 pair given to it is a valid surrogate
 311 uint32_t desurrogate(uint16_t high, uint16_t low) {
 312     return 0x400 * (high - 0xd800) + (low - 0xdc00) + 0x10000;
 313 }
 314 
 315 bool handle_utf8(FILE* w, bufreader* r) {
 316     bool cr = false;
 317     bool bad_end = false;
 318 
 319     for (uint64_t i = 0; r->len > 0; i++) {
 320         if ((i % 1024 == 0) && feof(w)) {
 321             break;
 322         }
 323 
 324         const int a = read_byte(r);
 325         if (a == EOF) {
 326             break;
 327         }
 328 
 329         if (a != '\n' && cr) {
 330             putc('\r', w);
 331         }
 332 
 333         cr = a == '\r';
 334         if (cr) {
 335             continue;
 336         }
 337 
 338         // handle 1-byte runes
 339         if (a < 128) {
 340             putc(a, w);
 341             continue;
 342         }
 343 
 344         const int b = read_byte(r);
 345         if (b == EOF) {
 346             bad_end = true;
 347             break;
 348         }
 349 
 350         // handle 2-byte runes
 351         if (check_2_byte_rune(a, b)) {
 352             putc(a, w);
 353             putc(b, w);
 354             continue;
 355         }
 356 
 357         const int c = read_byte(r);
 358         if (c == EOF) {
 359             bad_end = true;
 360             break;
 361         }
 362 
 363         // handle 3-byte runes
 364         if (check_3_byte_rune(a, b, c)) {
 365             putc(a, w);
 366             putc(b, w);
 367             putc(c, w);
 368             continue;
 369         }
 370 
 371         const int d = read_byte(r);
 372         if (d == EOF) {
 373             bad_end = true;
 374             break;
 375         }
 376 
 377         // handle 4-byte runes
 378         if (check_4_byte_rune(a, b, c, d)) {
 379             putc(a, w);
 380             putc(b, w);
 381             putc(c, w);
 382             putc(d, w);
 383             continue;
 384         }
 385 
 386         write_replacement_char(w);
 387     }
 388 
 389     if (cr) {
 390         putc('\r', w);
 391     }
 392     if (bad_end) {
 393         write_replacement_char(w);
 394     }
 395     return true;
 396 }
 397 
 398 bool handle_utf16be(FILE* w, bufreader* r) {
 399     bool cr = false;
 400     bool bad_end = false;
 401 
 402     for (uint64_t i = 0; r->len > 0; i++) {
 403         if ((i % 1024 == 0) && feof(w)) {
 404             break;
 405         }
 406 
 407         const int a = read_byte(r);
 408         if (a == EOF) {
 409             break;
 410         }
 411 
 412         const int b = read_byte(r);
 413         if (b == EOF) {
 414             bad_end = true;
 415             break;
 416         }
 417 
 418         const uint32_t code = (a << 8) + b;
 419 
 420         if (code != '\n' && cr) {
 421             putc('\r', w);
 422         }
 423 
 424         cr = code == '\r';
 425         if (cr) {
 426             continue;
 427         }
 428 
 429         // handle non-surrogate runes
 430         if ((code <= 0xd7ff) || (code >= 0xe000)) {
 431             write_rune(w, code);
 432             continue;
 433         }
 434 
 435         const int c = read_byte(r);
 436         if (c == EOF) {
 437             bad_end = true;
 438             break;
 439         }
 440 
 441         const int d = read_byte(r);
 442         if (d == EOF) {
 443             bad_end = true;
 444             break;
 445         }
 446 
 447         // https://en.wikipedia.org/wiki/UTF-16
 448 
 449         const uint16_t high = code;
 450         const uint16_t low = (c << 8) + d;
 451 
 452         // handle valid surrogate runes
 453         if (0xdc00 <= low && low <= 0xdfff) {
 454             write_rune(w, desurrogate(high, low));
 455             continue;
 456         }
 457 
 458         write_replacement_char(w);
 459     }
 460 
 461     if (cr) {
 462         putc('\r', w);
 463     }
 464     if (bad_end) {
 465         write_replacement_char(w);
 466     }
 467     return true;
 468 }
 469 
 470 bool handle_utf16le(FILE* w, bufreader* r) {
 471     bool cr = false;
 472     bool bad_end = false;
 473 
 474     for (uint64_t i = 0; r->len > 0; i++) {
 475         if ((i % 1024 == 0) && feof(w)) {
 476             break;
 477         }
 478 
 479         const int a = read_byte(r);
 480         if (a == EOF) {
 481             break;
 482         }
 483 
 484         const int b = read_byte(r);
 485         if (b == EOF) {
 486             bad_end = true;
 487             break;
 488         }
 489 
 490         const uint32_t code = (b << 8) + a;
 491 
 492         if (code != '\n' && cr) {
 493             putc('\r', w);
 494         }
 495 
 496         cr = code == '\r';
 497         if (cr) {
 498             continue;
 499         }
 500 
 501         // handle non-surrogate runes
 502         if ((code <= 0xd7ff) || (code >= 0xe000)) {
 503             write_rune(w, code);
 504             continue;
 505         }
 506 
 507         const int c = read_byte(r);
 508         if (c == EOF) {
 509             bad_end = true;
 510             break;
 511         }
 512 
 513         const int d = read_byte(r);
 514         if (d == EOF) {
 515             bad_end = true;
 516             break;
 517         }
 518 
 519         // https://en.wikipedia.org/wiki/UTF-16
 520 
 521         const uint16_t high = code;
 522         const uint16_t low = (d << 8) + c;
 523 
 524         // handle valid surrogate runes
 525         if (0xdc00 <= low && low <= 0xdfff) {
 526             write_rune(w, desurrogate(high, low));
 527             continue;
 528         }
 529 
 530         write_replacement_char(w);
 531     }
 532 
 533     if (cr) {
 534         putc('\r', w);
 535     }
 536     if (bad_end) {
 537         write_replacement_char(w);
 538     }
 539     return true;
 540 }
 541 
 542 bool handle_utf32be(FILE* w, bufreader* r) {
 543     bool cr = false;
 544     bool bad_end = false;
 545 
 546     for (uint64_t i = 0; r->len > 0; i++) {
 547         if ((i % 1024 == 0) && feof(w)) {
 548             break;
 549         }
 550 
 551         const int a = read_byte(r);
 552         if (a == EOF) {
 553             break;
 554         }
 555 
 556         const int b = read_byte(r);
 557         if (b == EOF) {
 558             bad_end = true;
 559             break;
 560         }
 561 
 562         const int c = read_byte(r);
 563         if (c == EOF) {
 564             bad_end = true;
 565             break;
 566         }
 567 
 568         const int d = read_byte(r);
 569         if (d == EOF) {
 570             bad_end = true;
 571             break;
 572         }
 573 
 574         const uint32_t code = (a << 24) + (b << 16) + (c << 8) + d;
 575 
 576         if (code != '\n' && cr) {
 577             putc('\r', w);
 578         }
 579 
 580         cr = code == '\r';
 581         if (cr) {
 582             continue;
 583         }
 584 
 585         write_rune(w, code);
 586     }
 587 
 588     if (cr) {
 589         putc('\r', w);
 590     }
 591     if (bad_end) {
 592         write_replacement_char(w);
 593     }
 594     return true;
 595 }
 596 
 597 bool handle_utf32le(FILE* w, bufreader* r) {
 598     bool cr = false;
 599     bool bad_end = false;
 600 
 601     for (uint64_t i = 0; r->len > 0; i++) {
 602         if ((i % 1024 == 0) && feof(w)) {
 603             break;
 604         }
 605 
 606         const int a = read_byte(r);
 607         if (a == EOF) {
 608             return true;
 609         }
 610 
 611         const int b = read_byte(r);
 612         if (b == EOF) {
 613             write_replacement_char(w);
 614             return true;
 615         }
 616 
 617         const int c = read_byte(r);
 618         if (c == EOF) {
 619             write_replacement_char(w);
 620             return true;
 621         }
 622 
 623         const int d = read_byte(r);
 624         if (d == EOF) {
 625             write_replacement_char(w);
 626             return true;
 627         }
 628 
 629         const uint32_t code = (d << 24) + (c << 16) + (b << 8) + a;
 630 
 631         if (code != '\n' && cr) {
 632             putc('\r', w);
 633         }
 634 
 635         cr = code == '\r';
 636         if (cr) {
 637             continue;
 638         }
 639 
 640         write_rune(w, code);
 641     }
 642 
 643     if (cr) {
 644         putc('\r', w);
 645     }
 646     if (bad_end) {
 647         write_replacement_char(w);
 648     }
 649     return true;
 650 }
 651 
 652 bool handle_reader(FILE* w, FILE* src) {
 653     const int bufcap = 32 * 1024;
 654     unsigned char buf[bufcap];
 655 
 656     bufreader r;
 657     init_bufreader(&r, src, buf, bufcap);
 658 
 659     switch (detect_bom(&r)) {
 660         case NO_BOM:
 661             return handle_utf8(w, &r);
 662 
 663         case UTF8_BOM:
 664             discard_bytes(&r, 3);
 665             return handle_utf8(w, &r);
 666 
 667         case UTF16_BE_BOM:
 668             discard_bytes(&r, 2);
 669             return handle_utf16be(w, &r);
 670 
 671         case UTF16_LE_BOM:
 672             discard_bytes(&r, 2);
 673             return handle_utf16le(w, &r);
 674 
 675         case UTF32_BE_BOM:
 676             discard_bytes(&r, 4);
 677             return handle_utf32be(w, &r);
 678 
 679         case UTF32_LE_BOM:
 680             discard_bytes(&r, 4);
 681             return handle_utf32le(w, &r);
 682 
 683         default:
 684             return handle_utf8(w, &r);
 685     }
 686 }
 687 
 688 // handle_file handles data from the filename given; returns false only when
 689 // the file can't be opened
 690 bool handle_file(FILE* w, const char* path) {
 691     FILE* f = fopen(path, "rb");
 692     if (f == NULL) {
 693         // ensure currently-buffered/deferred output shows up right now: not
 694         // doing so may scramble results in the common case where stdout and
 695         // stderr are the same, thus confusing users
 696         putc('\n', w);
 697 
 698         fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", path);
 699         return false;
 700     }
 701 
 702     const bool ok = handle_reader(w, f);
 703     fclose(f);
 704     return ok;
 705 }
 706 
 707 // is_help_option simplifies control-flow for func run
 708 bool is_help_option(const char* s) {
 709     return (s[0] == '-') && (
 710         strcmp(s, "-h") == 0 ||
 711         strcmp(s, "-help") == 0 ||
 712         strcmp(s, "--h") == 0 ||
 713         strcmp(s, "--help") == 0
 714     );
 715 }
 716 
 717 // run returns the number of errors
 718 int run(int argc, char** argv, FILE* w) {
 719     size_t files = 0;
 720     size_t errors = 0;
 721 
 722     // handle all filenames/options given
 723     for (size_t i = 1; i < argc && !feof(w); i++) {
 724         if (i > 1) {
 725             fflush(w);
 726         }
 727 
 728         // a `-` filename stands for the standard input
 729         if (argv[i][0] == '-' && argv[i][1] == 0) {
 730             handle_reader(w, stdin);
 731             continue;
 732         }
 733 
 734         if (is_help_option(argv[i])) {
 735             // help option quits the app right away
 736             fprintf(stderr, "%s", info);
 737             return 0;
 738         }
 739 
 740         if (!handle_file(w, argv[i])) {
 741             errors++;
 742         }
 743         files++;
 744     }
 745 
 746     // no filenames means use stdin as the only input
 747     if (files == 0) {
 748         handle_reader(w, stdin);
 749     }
 750 
 751     return errors;
 752 }
 753 
 754 int main(int argc, char** argv) {
 755 #ifdef _WIN32
 756     setmode(fileno(stdin), O_BINARY);
 757     // ensure output lines end in LF instead of CRLF on windows
 758     setmode(fileno(stdout), O_BINARY);
 759     setmode(fileno(stderr), O_BINARY);
 760 #endif
 761 
 762     return run(argc, argv, stdout) == 0 ? 0 : 1;
 763 }