/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O2 -o ./ut ./ut.c
*/

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

// info is the multi-line help message
const char* info = ""
"ut [options...] [filenames...]\n"
"\n"
"\n"
"Unit Text: a tool to turn ASCII/Unicode text into UTF-8 unix-style lines,\n"
"auto-detecting leading UTF byte-order marks, and decoding accordingly.\n"
"\n"
"This tool concatenates lines ignoring carriage-returns on CRLF pairs at the\n"
"end of lines, and ensures no lines across inputs are accidentally joined,\n"
"since all lines it outputs end with line-feeds, even when the original files\n"
"don't.\n"
"\n"
"\n"
"Options\n"
"\n"
" -h, --h show this help message\n"
" -help, --help aliases for option -h\n"
"";

// bufreader is a byte-by-byte reader over a FILE, using a caller-provided
// buffer which is (re)filled one chunk at a time
typedef struct bufreader {
    // buf is the buffer, (re)filled periodically as needed
    unsigned char* buf;

    // len is how many buffer bytes are being used, out of its max capacity
    size_t len;

    // cap is the buffer's capacity, or the most bytes it can hold at once
    size_t cap;

    // pos is the current position, up to the current buffer length
    size_t pos;

    // src is the data source used to fill the buffer
    FILE* src;
} bufreader;

// init_bufreader is the constructor for type bufreader: it also reads the
// first chunk right away, so the leading input bytes can be peeked at via
// r->buf and r->len, which is needed to detect which specific utf
// input-format is being used
void init_bufreader(bufreader* r, FILE* src, unsigned char* buf, size_t cap) {
    r->buf = buf;
    r->cap = cap;
    r->pos = 0;
    r->src = src;
    r->len = fread(r->buf, sizeof(unsigned char), r->cap, r->src);
}

// read_byte does as it says: check its return for the value EOF, before
// using it as the next byte; valid bytes are always returned as 0..255
int read_byte(bufreader* r) {
    if (r->pos >= r->len) {
        // the current chunk is used up: try to load the next one
        r->pos = 0;
        r->len = fread(r->buf, sizeof(unsigned char), r->cap, r->src);
        if (r->len == 0) {
            // reached the end of data
            return EOF;
        }
    }

    const unsigned char b = r->buf[r->pos];
    r->pos++;
    return b;
}

// discard_bytes skips up to n bytes, returning how many bytes were actually
// skipped, which is less than n only when the input ends early
int64_t discard_bytes(bufreader* r, size_t n) {
    // fast path: all bytes to skip are already in the buffer; the check is
    // written as a subtraction, so it can't wrap around when n is huge
    if (r->pos <= r->len && n <= r->len - r->pos) {
        r->pos += n;
        return (int64_t)n;
    }

    int64_t discarded = 0;
    for (; n > 0; n--, discarded++) {
        if (read_byte(r) == EOF) {
            break;
        }
    }
    return discarded;
}

// https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/

// is_tail checks if a byte is in the general range of utf-8 continuation
// bytes, which is 0x80..0xbf
static bool is_tail(int b) {
    return 0x80 <= b && b <= 0xbf;
}

// check_2_byte_rune checks if the 2 bytes given are a well-formed utf-8
// sequence: lead bytes 0xc0 and 0xc1 are rejected, as those only make
// overlong encodings
bool check_2_byte_rune(int a, int b) {
    return (0xc2 <= a && a <= 0xdf) && is_tail(b);
}

// check_3_byte_rune checks if the 3 bytes given are a well-formed utf-8
// sequence, rejecting overlong encodings (lead byte 0xe0) and utf-16
// surrogates (lead byte 0xed)
bool check_3_byte_rune(int a, int b, int c) {
    if (!is_tail(c)) {
        return false;
    }

    if (a == 0xe0) {
        return 0xa0 <= b && b <= 0xbf;
    }
    if (a == 0xed) {
        return 0x80 <= b && b <= 0x9f;
    }
    if ((0xe1 <= a && a <= 0xec) || a == 0xee || a == 0xef) {
        return is_tail(b);
    }
    return false;
}

// check_4_byte_rune checks if the 4 bytes given are a well-formed utf-8
// sequence, rejecting overlong encodings (lead byte 0xf0) and anything
// beyond U+10FFFF (lead byte 0xf4): in both cases it's the 2nd byte whose
// range is narrower than usual
bool check_4_byte_rune(int a, int b, int c, int d) {
    if (!is_tail(c) || !is_tail(d)) {
        return false;
    }

    if (a == 0xf0) {
        return 0x90 <= b && b <= 0xbf;
    }
    if (0xf1 <= a && a <= 0xf3) {
        return is_tail(b);
    }
    if (a == 0xf4) {
        return 0x80 <= b && b <= 0x8f;
    }
    return false;
}

// write_replacement_char is the recommended action to handle invalid bytes:
// it emits U+FFFD as utf-8
void write_replacement_char(FILE* w) {
    putc(0xef, w);
    putc(0xbf, w);
    putc(0xbd, w);
}

// write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8
// and emits the code point given as utf-8; anything which can't be encoded
// as well-formed utf-8, such as surrogates and values beyond U+10FFFF, is
// emitted as a replacement character instead
void write_rune(FILE* w, uint32_t rune) {
    if (rune < 0x80) {
        putc((int)rune, w);
        return;
    }

    if (rune < 0x800) {
        const int a = 0xc0 | (int)(rune >> 6);
        const int b = 0x80 | (int)(rune & 0x3f);
        if (check_2_byte_rune(a, b)) {
            putc(a, w);
            putc(b, w);
        } else {
            write_replacement_char(w);
        }
        return;
    }

    if (rune < 0x10000) {
        const int a = 0xe0 | (int)(rune >> 12);
        const int b = 0x80 | (int)((rune >> 6) & 0x3f);
        const int c = 0x80 | (int)(rune & 0x3f);
        if (check_3_byte_rune(a, b, c)) {
            putc(a, w);
            putc(b, w);
            putc(c, w);
        } else {
            write_replacement_char(w);
        }
        return;
    }

    if (rune < 0x200000) {
        const int a = 0xf0 | (int)(rune >> 18);
        const int b = 0x80 | (int)((rune >> 12) & 0x3f);
        const int c = 0x80 | (int)((rune >> 6) & 0x3f);
        const int d = 0x80 | (int)(rune & 0x3f);
        if (check_4_byte_rune(a, b, c, d)) {
            putc(a, w);
            putc(b, w);
            putc(c, w);
            putc(d, w);
        } else {
            write_replacement_char(w);
        }
        return;
    }

    // handle invalid runes with a utf-8 replacement character
    write_replacement_char(w);
}

267 void show_error(FILE* w, const char* msg) { 268 putc('\n', w); 269 fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg); 270 } 271 272 typedef enum detected_bom { 273 NO_BOM = 0, 274 UTF8_BOM = 1, 275 UTF16_BE_BOM = 2, 276 UTF16_LE_BOM = 3, 277 UTF32_BE_BOM = 4, 278 UTF32_LE_BOM = 5, 279 } detected_bom; 280 281 detected_bom detect_bom(const bufreader* r) { 282 const unsigned char* p = r->buf; 283 const size_t len = r->len; 284 285 if (len >= 4) { 286 if (p[0] == 0x00 && p[1] == 0x00 && p[2] == 0xfe && p[3] == 0xff) { 287 return UTF32_BE_BOM; 288 } 289 if (p[0] == 0xff && p[1] == 0xfe && p[2] == 0x00 && p[3] == 0x00) { 290 return UTF32_LE_BOM; 291 } 292 } 293 294 if (len >= 3 && p[0] == 0xef && p[1] == 0xbb && p[2] == 0xbf) { 295 return UTF8_BOM; 296 } 297 298 if (len >= 2) { 299 if (p[0] == 0xfe && p[1] == 0xff) { 300 return UTF16_BE_BOM; 301 } 302 if (p[0] == 0xff && p[1] == 0xfe) { 303 return UTF16_LE_BOM; 304 } 305 } 306 307 return NO_BOM; 308 } 309 310 // desurrogate assumes the utf16 pair given to it is a valid surrogate 311 uint32_t desurrogate(uint16_t high, uint16_t low) { 312 return 0x400 * (high - 0xd800) + (low - 0xdc00) + 0x10000; 313 } 314 315 bool handle_utf8(FILE* w, bufreader* r) { 316 bool cr = false; 317 bool bad_end = false; 318 319 for (uint64_t i = 0; r->len > 0; i++) { 320 if ((i % 1024 == 0) && feof(w)) { 321 break; 322 } 323 324 const int a = read_byte(r); 325 if (a == EOF) { 326 break; 327 } 328 329 if (a != '\n' && cr) { 330 putc('\r', w); 331 } 332 333 cr = a == '\r'; 334 if (cr) { 335 continue; 336 } 337 338 // handle 1-byte runes 339 if (a < 128) { 340 putc(a, w); 341 continue; 342 } 343 344 const int b = read_byte(r); 345 if (b == EOF) { 346 bad_end = true; 347 break; 348 } 349 350 // handle 2-byte runes 351 if (check_2_byte_rune(a, b)) { 352 putc(a, w); 353 putc(b, w); 354 continue; 355 } 356 357 const int c = read_byte(r); 358 if (c == EOF) { 359 bad_end = true; 360 break; 361 } 362 363 // handle 3-byte runes 364 if (check_3_byte_rune(a, 
b, c)) { 365 putc(a, w); 366 putc(b, w); 367 putc(c, w); 368 continue; 369 } 370 371 const int d = read_byte(r); 372 if (d == EOF) { 373 bad_end = true; 374 break; 375 } 376 377 // handle 4-byte runes 378 if (check_4_byte_rune(a, b, c, d)) { 379 putc(a, w); 380 putc(b, w); 381 putc(c, w); 382 putc(d, w); 383 continue; 384 } 385 386 write_replacement_char(w); 387 } 388 389 if (cr) { 390 putc('\r', w); 391 } 392 if (bad_end) { 393 write_replacement_char(w); 394 } 395 return true; 396 } 397 398 bool handle_utf16be(FILE* w, bufreader* r) { 399 bool cr = false; 400 bool bad_end = false; 401 402 for (uint64_t i = 0; r->len > 0; i++) { 403 if ((i % 1024 == 0) && feof(w)) { 404 break; 405 } 406 407 const int a = read_byte(r); 408 if (a == EOF) { 409 break; 410 } 411 412 const int b = read_byte(r); 413 if (b == EOF) { 414 bad_end = true; 415 break; 416 } 417 418 const uint32_t code = (a << 8) + b; 419 420 if (code != '\n' && cr) { 421 putc('\r', w); 422 } 423 424 cr = code == '\r'; 425 if (cr) { 426 continue; 427 } 428 429 // handle non-surrogate runes 430 if ((code <= 0xd7ff) || (code >= 0xe000)) { 431 write_rune(w, code); 432 continue; 433 } 434 435 const int c = read_byte(r); 436 if (c == EOF) { 437 bad_end = true; 438 break; 439 } 440 441 const int d = read_byte(r); 442 if (d == EOF) { 443 bad_end = true; 444 break; 445 } 446 447 // https://en.wikipedia.org/wiki/UTF-16 448 449 const uint16_t high = code; 450 const uint16_t low = (c << 8) + d; 451 452 // handle valid surrogate runes 453 if (0xdc00 <= low && low <= 0xdfff) { 454 write_rune(w, desurrogate(high, low)); 455 continue; 456 } 457 458 write_replacement_char(w); 459 } 460 461 if (cr) { 462 putc('\r', w); 463 } 464 if (bad_end) { 465 write_replacement_char(w); 466 } 467 return true; 468 } 469 470 bool handle_utf16le(FILE* w, bufreader* r) { 471 bool cr = false; 472 bool bad_end = false; 473 474 for (uint64_t i = 0; r->len > 0; i++) { 475 if ((i % 1024 == 0) && feof(w)) { 476 break; 477 } 478 479 const int a = 
read_byte(r); 480 if (a == EOF) { 481 break; 482 } 483 484 const int b = read_byte(r); 485 if (b == EOF) { 486 bad_end = true; 487 break; 488 } 489 490 const uint32_t code = (b << 8) + a; 491 492 if (code != '\n' && cr) { 493 putc('\r', w); 494 } 495 496 cr = code == '\r'; 497 if (cr) { 498 continue; 499 } 500 501 // handle non-surrogate runes 502 if ((code <= 0xd7ff) || (code >= 0xe000)) { 503 write_rune(w, code); 504 continue; 505 } 506 507 const int c = read_byte(r); 508 if (c == EOF) { 509 bad_end = true; 510 break; 511 } 512 513 const int d = read_byte(r); 514 if (d == EOF) { 515 bad_end = true; 516 break; 517 } 518 519 // https://en.wikipedia.org/wiki/UTF-16 520 521 const uint16_t high = code; 522 const uint16_t low = (d << 8) + c; 523 524 // handle valid surrogate runes 525 if (0xdc00 <= low && low <= 0xdfff) { 526 write_rune(w, desurrogate(high, low)); 527 continue; 528 } 529 530 write_replacement_char(w); 531 } 532 533 if (cr) { 534 putc('\r', w); 535 } 536 if (bad_end) { 537 write_replacement_char(w); 538 } 539 return true; 540 } 541 542 bool handle_utf32be(FILE* w, bufreader* r) { 543 bool cr = false; 544 bool bad_end = false; 545 546 for (uint64_t i = 0; r->len > 0; i++) { 547 if ((i % 1024 == 0) && feof(w)) { 548 break; 549 } 550 551 const int a = read_byte(r); 552 if (a == EOF) { 553 break; 554 } 555 556 const int b = read_byte(r); 557 if (b == EOF) { 558 bad_end = true; 559 break; 560 } 561 562 const int c = read_byte(r); 563 if (c == EOF) { 564 bad_end = true; 565 break; 566 } 567 568 const int d = read_byte(r); 569 if (d == EOF) { 570 bad_end = true; 571 break; 572 } 573 574 const uint32_t code = (a << 24) + (b << 16) + (c << 8) + d; 575 576 if (code != '\n' && cr) { 577 putc('\r', w); 578 } 579 580 cr = code == '\r'; 581 if (cr) { 582 continue; 583 } 584 585 write_rune(w, code); 586 } 587 588 if (cr) { 589 putc('\r', w); 590 } 591 if (bad_end) { 592 write_replacement_char(w); 593 } 594 return true; 595 } 596 597 bool handle_utf32le(FILE* w, 
bufreader* r) { 598 bool cr = false; 599 bool bad_end = false; 600 601 for (uint64_t i = 0; r->len > 0; i++) { 602 if ((i % 1024 == 0) && feof(w)) { 603 break; 604 } 605 606 const int a = read_byte(r); 607 if (a == EOF) { 608 return true; 609 } 610 611 const int b = read_byte(r); 612 if (b == EOF) { 613 write_replacement_char(w); 614 return true; 615 } 616 617 const int c = read_byte(r); 618 if (c == EOF) { 619 write_replacement_char(w); 620 return true; 621 } 622 623 const int d = read_byte(r); 624 if (d == EOF) { 625 write_replacement_char(w); 626 return true; 627 } 628 629 const uint32_t code = (d << 24) + (c << 16) + (b << 8) + a; 630 631 if (code != '\n' && cr) { 632 putc('\r', w); 633 } 634 635 cr = code == '\r'; 636 if (cr) { 637 continue; 638 } 639 640 write_rune(w, code); 641 } 642 643 if (cr) { 644 putc('\r', w); 645 } 646 if (bad_end) { 647 write_replacement_char(w); 648 } 649 return true; 650 } 651 652 bool handle_reader(FILE* w, FILE* src) { 653 const int bufcap = 32 * 1024; 654 unsigned char buf[bufcap]; 655 656 bufreader r; 657 init_bufreader(&r, src, buf, bufcap); 658 659 switch (detect_bom(&r)) { 660 case NO_BOM: 661 return handle_utf8(w, &r); 662 663 case UTF8_BOM: 664 discard_bytes(&r, 3); 665 return handle_utf8(w, &r); 666 667 case UTF16_BE_BOM: 668 discard_bytes(&r, 2); 669 return handle_utf16be(w, &r); 670 671 case UTF16_LE_BOM: 672 discard_bytes(&r, 2); 673 return handle_utf16le(w, &r); 674 675 case UTF32_BE_BOM: 676 discard_bytes(&r, 4); 677 return handle_utf32be(w, &r); 678 679 case UTF32_LE_BOM: 680 discard_bytes(&r, 4); 681 return handle_utf32le(w, &r); 682 683 default: 684 return handle_utf8(w, &r); 685 } 686 } 687 688 // handle_file handles data from the filename given; returns false only when 689 // the file can't be opened 690 bool handle_file(FILE* w, const char* path) { 691 FILE* f = fopen(path, "rb"); 692 if (f == NULL) { 693 // ensure currently-buffered/deferred output shows up right now: not 694 // doing so may scramble results 
in the common case where stdout and 695 // stderr are the same, thus confusing users 696 putc('\n', w); 697 698 fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", path); 699 return false; 700 } 701 702 const bool ok = handle_reader(w, f); 703 fclose(f); 704 return ok; 705 } 706 707 // is_help_option simplifies control-flow for func run 708 bool is_help_option(const char* s) { 709 return (s[0] == '-') && ( 710 strcmp(s, "-h") == 0 || 711 strcmp(s, "-help") == 0 || 712 strcmp(s, "--h") == 0 || 713 strcmp(s, "--help") == 0 714 ); 715 } 716 717 // run returns the number of errors 718 int run(int argc, char** argv, FILE* w) { 719 size_t files = 0; 720 size_t errors = 0; 721 722 // handle all filenames/options given 723 for (size_t i = 1; i < argc && !feof(w); i++) { 724 if (i > 1) { 725 fflush(w); 726 } 727 728 // a `-` filename stands for the standard input 729 if (argv[i][0] == '-' && argv[i][1] == 0) { 730 handle_reader(w, stdin); 731 continue; 732 } 733 734 if (is_help_option(argv[i])) { 735 // help option quits the app right away 736 fprintf(stderr, "%s", info); 737 return 0; 738 } 739 740 if (!handle_file(w, argv[i])) { 741 errors++; 742 } 743 files++; 744 } 745 746 // no filenames means use stdin as the only input 747 if (files == 0) { 748 handle_reader(w, stdin); 749 } 750 751 return errors; 752 } 753 754 int main(int argc, char** argv) { 755 #ifdef _WIN32 756 setmode(fileno(stdin), O_BINARY); 757 // ensure output lines end in LF instead of CRLF on windows 758 setmode(fileno(stdout), O_BINARY); 759 setmode(fileno(stderr), O_BINARY); 760 #endif 761 762 return run(argc, argv, stdout) == 0 ? 0 : 1; 763 }