/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O3 -march=native -mtune=native -flto -o ./utfate ./utfate.c

Defining UTFATE_NO_MAIN leaves out func main, so the rest of this file can be
compiled as part of a test program.
*/

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

// error messages are optionally styled in red: apple's terminal gets the
// basic ANSI red, everything else gets a 24-bit color
#ifdef RED_ERRORS
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define RESET_STYLE "\x1b[0m"
#else
#define ERROR_STYLE
#define RESET_STYLE
#endif

#define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")

#ifndef IBUF_SIZE
#define IBUF_SIZE (32 * 1024)
#endif

const char* info = ""
"utfate [options...] [filenames...]\n"
"\n"
"This app turns ASCII/UTF text into UTF-8. ASCII/UTF-8 inputs stay the same,\n"
"leading UTF-8 BOMs (byte-order marks) are ignored, UTF-16 and UTF-32 (both\n"
"in either kind of endianess) are turned into UTF-8.\n"
"\n"
"\n"
"Options\n"
"\n"
" -h, --h show this help message\n"
" -help, --help aliases for option -h\n"
"";

typedef struct bufreader {
    // buf is the buffer, (re)filled periodically as needed
    unsigned char* buf;

    // len is how many buffer bytes are being used, out of its max capacity
    size_t len;

    // cap is the buffer's capacity, or the most bytes it can hold at once
    size_t cap;

    // pos is the current position, up to the current buffer length
    size_t pos;

    // src is the data source used to fill the buffer
    FILE* src;
} bufreader;

// stream_handler is the type of all funcs which convert a whole input
// stream, such as the ones returned by func detect_bom
typedef void (*stream_handler)(FILE*, bufreader*);

// init_bufreader is the constructor for type bufreader: the buffer given
// starts empty, and isn't filled until the first read/restart
void init_bufreader(bufreader* r, FILE* src, unsigned char* buf, size_t cap) {
    r->buf = buf;
    r->len = 0;
    r->cap = cap;
    r->pos = 0;
    r->src = src;
}

// restart_bufreader switches the reader to a new source, immediately filling
// the buffer with the first chunk from it, so func detect_bom can inspect
// the leading bytes; a failed/empty read leaves the buffer empty
void restart_bufreader(bufreader* r, FILE* src) {
    r->src = src;
    // leftovers from a previous source must never be read as new data
    r->pos = 0;
    r->len = fread(r->buf, sizeof(unsigned char), r->cap, r->src);
}

// read_byte does as it says: check its return for the value EOF, before
// using it as the next byte; reaching the end of data empties the buffer,
// which is what the main loops check to know when to stop
int read_byte(bufreader* r) {
    if (r->pos >= r->len) {
        // the current chunk is over: try to load the next one
        r->pos = 0;
        r->len = fread(r->buf, sizeof(unsigned char), r->cap, r->src);
        if (r->len == 0) {
            return EOF;
        }
    }

    return r->buf[r->pos++];
}

// unread_byte puts back the byte most-recently returned by func read_byte:
// it's only meant to be called once right after a read which didn't result
// in EOF, since that guarantees the position is past the buffer's start
static inline void unread_byte(bufreader* r) {
    if (r->pos > 0) {
        r->pos--;
    }
}

// discard_bytes skips up to n bytes, returning how many bytes were actually
// skipped, which is less than asked when the data end early
int64_t discard_bytes(bufreader* r, size_t n) {
    // pos never exceeds len, so the subtraction can't wrap around
    if (n <= r->len - r->pos) {
        r->pos += n;
        return (int64_t)n;
    }

    int64_t discarded = 0;
    for (; n > 0; n--, discarded++) {
        if (read_byte(r) == EOF) {
            break;
        }
    }
    return discarded;
}

// https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/

// is_utf8_tail checks if a byte is a generic UTF-8 continuation byte
static inline bool is_utf8_tail(int b) {
    return 0x80 <= b && b <= 0xbf;
}

// utf8_lead_info returns how many bytes make up the UTF-8 sequence started
// by the lead byte given, or 0 when no multi-byte sequence can start with
// it; it also sets the inclusive range valid for the 2nd byte, which is
// narrower than usual for some lead bytes, to exclude overlong encodings,
// surrogates, and values past U+10FFFF
static int utf8_lead_info(int lead, int* lo, int* hi) {
    *lo = 0x80;
    *hi = 0xbf;

    if (0xc2 <= lead && lead <= 0xdf) {
        return 2;
    }

    if (0xe0 <= lead && lead <= 0xef) {
        if (lead == 0xe0) {
            // avoid overlong encodings of runes below U+0800
            *lo = 0xa0;
        } else if (lead == 0xed) {
            // avoid the UTF-16 surrogates U+D800..U+DFFF
            *hi = 0x9f;
        }
        return 3;
    }

    if (0xf0 <= lead && lead <= 0xf4) {
        if (lead == 0xf0) {
            // avoid overlong encodings of runes below U+10000
            *lo = 0x90;
        } else if (lead == 0xf4) {
            // avoid runes past U+10FFFF
            *hi = 0x8f;
        }
        return 4;
    }

    return 0;
}

// check_utf8_start checks if the first 2 bytes given are a valid start for
// a UTF-8 sequence which is exactly n bytes long
static bool check_utf8_start(int a, int b, int n) {
    int lo = 0;
    int hi = 0;
    return utf8_lead_info(a, &lo, &hi) == n && lo <= b && b <= hi;
}

// check_2_byte_rune checks if the bytes given are a valid 2-byte UTF-8 rune
static inline bool check_2_byte_rune(int a, int b) {
    return check_utf8_start(a, b, 2);
}

// check_3_byte_rune checks if the bytes given are a valid 3-byte UTF-8 rune
bool check_3_byte_rune(int a, int b, int c) {
    return check_utf8_start(a, b, 3) && is_utf8_tail(c);
}

// check_4_byte_rune checks if the bytes given are a valid 4-byte UTF-8 rune,
// which means lead bytes 0xf0..0xf4, with the 2nd byte restricted for both
// 0xf0 (0x90..0xbf) and 0xf4 (0x80..0x8f)
bool check_4_byte_rune(int a, int b, int c, int d) {
    return check_utf8_start(a, b, 4) && is_utf8_tail(c) && is_utf8_tail(d);
}

// write_replacement_char is the recommended action to handle invalid bytes:
// it emits U+FFFD encoded as UTF-8
void write_replacement_char(FILE* w) {
    fputc(0xef, w);
    fputc(0xbf, w);
    fputc(0xbd, w);
}

// copy_utf8_rune copies the next UTF-8 rune from the reader, if valid; an
// invalid or truncated sequence is turned into a single replacement char,
// and reading resumes from the first byte which wasn't part of it, so valid
// data right after invalid bytes are never lost
void copy_utf8_rune(FILE* w, bufreader* r) {
    const int lead = read_byte(r);
    if (lead == EOF) {
        return;
    }

    // handle 1-byte runes
    if (lead < 0x80) {
        fputc(lead, w);
        return;
    }

    int lo = 0;
    int hi = 0;
    const int n = utf8_lead_info(lead, &lo, &hi);
    if (n == 0) {
        // stray continuation byte, or a byte which never shows up in UTF-8
        write_replacement_char(w);
        return;
    }

    unsigned char seq[4] = { (unsigned char)lead, 0, 0, 0 };
    for (int i = 1; i < n; i++) {
        const int b = read_byte(r);
        if (b == EOF) {
            write_replacement_char(w);
            return;
        }

        if (b < lo || b > hi) {
            // this byte may start a valid rune of its own
            unread_byte(r);
            write_replacement_char(w);
            return;
        }

        seq[i] = (unsigned char)b;
        // only the 2nd byte can have a narrower range
        lo = 0x80;
        hi = 0xbf;
    }

    fwrite(seq, sizeof(unsigned char), (size_t)n, w);
}

// write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8
// and emits a replacement char for surrogates and runes past U+10FFFF
void write_rune(FILE* w, uint32_t rune) {
    if (rune < 0x80) {
        fputc((int)rune, w);
        return;
    }

    if (rune < 0x800) {
        const int a = (int)(0xc0 | (rune >> 6));
        const int b = (int)(0x80 | (rune & 0x3f));
        if (check_2_byte_rune(a, b)) {
            fputc(a, w);
            fputc(b, w);
        } else {
            write_replacement_char(w);
        }
        return;
    }

    if (rune < 0x10000) {
        const int a = (int)(0xe0 | (rune >> 12));
        const int b = (int)(0x80 | ((rune >> 6) & 0x3f));
        const int c = (int)(0x80 | (rune & 0x3f));
        if (check_3_byte_rune(a, b, c)) {
            fputc(a, w);
            fputc(b, w);
            fputc(c, w);
        } else {
            write_replacement_char(w);
        }
        return;
    }

    if (rune < 0x200000) {
        const int a = (int)(0xf0 | (rune >> 18));
        const int b = (int)(0x80 | ((rune >> 12) & 0x3f));
        const int c = (int)(0x80 | ((rune >> 6) & 0x3f));
        const int d = (int)(0x80 | (rune & 0x3f));
        if (check_4_byte_rune(a, b, c, d)) {
            fputc(a, w);
            fputc(b, w);
            fputc(c, w);
            fputc(d, w);
        } else {
            write_replacement_char(w);
        }
        return;
    }

    // handle invalid runes with a utf-8 replacement character
    write_replacement_char(w);
}

// show_error ends the current output line on the writer given, then shows
// the message given as a line on stderr
void show_error(FILE* w, const char* msg) {
    fputc('\n', w);
    fprintf(stderr, ERROR_LINE("%s"), msg);
}

// desurrogate assumes the utf16 pair given to it is a valid surrogate
static inline uint32_t desurrogate(uint16_t high, uint16_t low) {
    return 0x400 * (uint32_t)(high - 0xd800) + (uint32_t)(low - 0xdc00) + 0x10000;
}

// handle_utf8 copies ASCII/UTF-8 input, replacing invalid sequences; every
// so often it checks for output errors, to quit early when writing fails
void handle_utf8(FILE* w, bufreader* r) {
    for (uint64_t i = 0; r->len > 0; i++) {
        copy_utf8_rune(w, r);
        if ((i % 1024 == 0) && ferror(w)) {
            break;
        }
    }
}

// handle_utf8_bom copies UTF-8 input after skipping its leading 3-byte BOM
void handle_utf8_bom(FILE* w, bufreader* r) {
    discard_bytes(r, 3);
    handle_utf8(w, r);
}

// read_utf16_unit reads the next 2-byte code unit using the byte-order
// given; it returns 1 on success, 0 when the data end cleanly before it,
// and -1 when the data end in the middle of it
static int read_utf16_unit(bufreader* r, bool big_endian, uint32_t* unit) {
    const int a = read_byte(r);
    if (a == EOF) {
        return 0;
    }

    const int b = read_byte(r);
    if (b == EOF) {
        return -1;
    }

    const uint32_t first = (uint32_t)a;
    const uint32_t second = (uint32_t)b;
    *unit = big_endian ? ((first << 8) | second) : ((second << 8) | first);
    return 1;
}

// convert_utf16 is the logic shared by both UTF-16 handlers: it skips the
// 2-byte BOM, then turns code units into UTF-8 runes; unpaired surrogates
// become replacement chars, without losing the code unit after them
static void convert_utf16(FILE* w, bufreader* r, bool big_endian) {
    discard_bytes(r, 2);

    // https://en.wikipedia.org/wiki/UTF-16

    uint32_t unit = 0;
    // carried is set when unit was already read while checking a surrogate
    // pair which turned out to be invalid
    bool carried = false;

    for (uint64_t i = 0; carried || r->len > 0; i++) {
        if ((i % 1024 == 0) && ferror(w)) {
            break;
        }

        if (!carried) {
            const int status = read_utf16_unit(r, big_endian, &unit);
            if (status < 0) {
                write_replacement_char(w);
            }
            if (status <= 0) {
                break;
            }
        }
        carried = false;

        // handle non-surrogate runes
        if ((unit <= 0xd7ff) || (unit >= 0xe000)) {
            write_rune(w, unit);
            continue;
        }

        // handle low surrogates showing up without a high one before them
        if (unit >= 0xdc00) {
            write_replacement_char(w);
            continue;
        }

        uint32_t low = 0;
        if (read_utf16_unit(r, big_endian, &low) <= 0) {
            // the data end right after a high surrogate
            write_replacement_char(w);
            break;
        }

        // handle valid surrogate runes
        if (0xdc00 <= low && low <= 0xdfff) {
            write_rune(w, desurrogate((uint16_t)unit, (uint16_t)low));
            continue;
        }

        // the high surrogate is unpaired: what follows it is handled on
        // its own in the next iteration
        write_replacement_char(w);
        unit = low;
        carried = true;
    }
}

// handle_utf16be converts big-endian UTF-16 input, which starts with a BOM
void handle_utf16be(FILE* w, bufreader* r) {
    convert_utf16(w, r, true);
}

// handle_utf16le converts little-endian UTF-16 input, which starts with a BOM
void handle_utf16le(FILE* w, bufreader* r) {
    convert_utf16(w, r, false);
}

// convert_utf32 is the logic shared by both UTF-32 handlers: it skips the
// 4-byte BOM, then turns each 4-byte group into a UTF-8 rune; data ending
// in the middle of a group result in a final replacement char
static void convert_utf32(FILE* w, bufreader* r, bool big_endian) {
    discard_bytes(r, 4);

    for (uint64_t i = 0; r->len > 0; i++) {
        if ((i % 1024 == 0) && ferror(w)) {
            break;
        }

        // bytes are kept unsigned, so shifting them by 24 bits is safe
        uint32_t b[4] = { 0, 0, 0, 0 };
        int got = 0;
        for (; got < 4; got++) {
            const int v = read_byte(r);
            if (v == EOF) {
                break;
            }
            b[got] = (uint32_t)v;
        }

        if (got < 4) {
            if (got > 0) {
                write_replacement_char(w);
            }
            break;
        }

        if (big_endian) {
            write_rune(w, (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3]);
        } else {
            write_rune(w, (b[3] << 24) | (b[2] << 16) | (b[1] << 8) | b[0]);
        }
    }
}

// handle_utf32be converts big-endian UTF-32 input, which starts with a BOM
void handle_utf32be(FILE* w, bufreader* r) {
    convert_utf32(w, r, true);
}

// handle_utf32le converts little-endian UTF-32 input, which starts with a BOM
void handle_utf32le(FILE* w, bufreader* r) {
    convert_utf32(w, r, false);
}

// detect_bom picks the handler to use by checking the leading bytes already
// in the reader's buffer: anything not starting with a BOM is handled as
// UTF-8; UTF-32 is checked before UTF-16, since the little-endian UTF-32 BOM
// starts with the little-endian UTF-16 one
stream_handler detect_bom(const bufreader* r) {
    const unsigned char* p = r->buf;
    const size_t len = r->len;

    if (len >= 4) {
        if (p[0] == 0x00 && p[1] == 0x00 && p[2] == 0xfe && p[3] == 0xff) {
            return handle_utf32be;
        }
        if (p[0] == 0xff && p[1] == 0xfe && p[2] == 0x00 && p[3] == 0x00) {
            return handle_utf32le;
        }
    }

    if (len >= 3 && p[0] == 0xef && p[1] == 0xbb && p[2] == 0xbf) {
        return handle_utf8_bom;
    }

    if (len >= 2) {
        if (p[0] == 0xfe && p[1] == 0xff) {
            return handle_utf16be;
        }
        if (p[0] == 0xff && p[1] == 0xfe) {
            return handle_utf16le;
        }
    }

    return handle_utf8;
}

// handle_reader converts all data from the source given, flushing the
// output when done
void handle_reader(FILE* w, FILE* src, bufreader* r) {
    restart_bufreader(r, src);
    const stream_handler handle = detect_bom(r);
    handle(w, r);
    fflush(w);
}

// handle_file handles data from the filename given; returns false only when
// the file can't be opened
bool handle_file(FILE* w, const char* path, bufreader* r) {
    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fputc('\n', w);
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
        return false;
    }

    handle_reader(w, f, r);
    fclose(f);
    return true;
}

// is_help_option simplifies control-flow for func run
bool is_help_option(const char* s) {
    return (s[0] == '-') && (
        strcmp(s, "-h") == 0 ||
        strcmp(s, "-help") == 0 ||
        strcmp(s, "--h") == 0 ||
        strcmp(s, "--help") == 0
    );
}

// run returns the number of errors, which is how many files couldn't be
// opened; it stops early when writing the output fails
int run(int argc, char** argv, FILE* w) {
    unsigned char buf[IBUF_SIZE];
    bufreader r;
    init_bufreader(&r, stdin, buf, sizeof(buf));

    // no filenames means use stdin as the only input
    if (argc < 2) {
        handle_reader(w, stdin, &r);
        return 0;
    }

    int errors = 0;

    // handle all filenames/options given
    for (int i = 1; i < argc && !ferror(w); i++) {
        // a `-` filename stands for the standard input
        if (argv[i][0] == '-' && argv[i][1] == 0) {
            handle_reader(w, stdin, &r);
            continue;
        }

        if (!handle_file(w, argv[i], &r)) {
            errors++;
        }
    }

    return errors;
}

#ifndef UTFATE_NO_MAIN
int main(int argc, char** argv) {
#ifdef _WIN32
    setmode(fileno(stdin), O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    setmode(fileno(stdout), O_BINARY);
    setmode(fileno(stderr), O_BINARY);
#endif

    if (argc > 1 && is_help_option(argv[1])) {
        fprintf(stderr, "%s", info);
        return 0;
    }

    return run(argc, argv, stdout) == 0 ? 0 : 1;
}
#endif