/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O3 -march=native -mtune=native -flto -o ./j0 ./j0.c
*/

#include <ctype.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

// ERROR_STYLE/RESET_STYLE wrap error messages in ANSI color codes, but only
// when RED_ERRORS is defined; each macro is defined exactly once per
// configuration, to avoid macro-redefinition diagnostics
#ifdef RED_ERRORS
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define RESET_STYLE "\x1b[0m"
#else
#define ERROR_STYLE
#define RESET_STYLE
#endif

// ERROR_LINE turns a string literal into a styled line ending in a line-feed
#define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")

#ifndef IBUF_SIZE
#define IBUF_SIZE (32 * 1024)
#endif

#ifndef OBUF_SIZE
#define OBUF_SIZE (8 * 1024)
#endif

// info is the help message
const char* info = ""
"j0 [options...] [file...]\n"
"\n"
"\n"
"Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.\n"
"Its output is always a single line, which ends with a line-feed.\n"
"\n"
"Besides minimizing bytes, this tool also adapts almost-JSON input into\n"
"valid JSON, since it\n"
"\n"
" - ignores both rest-of-line and multi-line comments\n"
" - ignores extra/trailing commas in arrays and objects\n"
" - turns single-quoted strings/keys into double-quoted strings\n"
" - double-quotes unquoted object keys\n"
" - changes \\x 2-hex-digit into \\u 4-hex-digit string-escapes\n"
"\n"
"All options available can either start with a single or a double-dash\n"
"\n"
" -h show this help message\n"
" -help show this help message\n"
" -jsonl emit JSON Lines, when top-level value is an array\n"
"";

// j0_maker is the whole state of a conversion: a buffered reader with a
// 1-byte look-ahead (fields current and next), and a buffered writer
typedef struct j0_maker {
    FILE* in;
    FILE* out;

    unsigned char* ibuf;
    size_t ilen; // how many bytes are being used in the input buffer
    size_t icap; // the input buffer's capacity
    size_t ipos; // the current position in the input buffer

    size_t line; // the current line, used to show useful error messages
    size_t pos;  // the position in the current line, for error messages

    unsigned char* obuf;
    size_t ocap; // the output buffer's capacity
    size_t opos; // the current position in the output buffer

    int current; // the byte being handled, or EOF
    int next;    // the byte right after the current one, or EOF
} j0_maker;

// advance_reader_pos helps func read_byte do its job: it moves past the byte
// given, keeping the line/position counters used by error messages updated
static inline void advance_reader_pos(j0_maker* r, unsigned char b) {
    r->ipos++;
    if (b == '\n') {
        r->line++;
        r->pos = 1;
    } else {
        r->pos++;
    }
}

// read_byte does as it says: check its return for the value EOF, before
// using it as the next byte
static inline int read_byte(j0_maker* r) {
    if (r->ipos >= r->ilen) {
        // the current chunk is over: try to load the next one
        r->ipos = 0;
        r->ilen = fread(r->ibuf, sizeof(unsigned char), r->icap, r->in);
        if (r->ilen == 0) {
            // reached the end of data
            return EOF;
        }
    }

    const unsigned char b = r->ibuf[r->ipos];
    advance_reader_pos(r, b);
    return b;
}

// advance is used in most of the code, instead of calling read_byte directly:
// it shifts the look-ahead byte into field current, and reads a new look-ahead
static inline void advance(j0_maker* r) {
    r->current = r->next;
    r->next = read_byte(r);
}

void fail(j0_maker* m, int code, const char* msg);

// skip_line ignores all bytes up to and including the next line-feed, or
// until the input is over
void skip_line(j0_maker* r) {
    while (true) {
        advance(r);
        const int lead = r->current;

        if (lead == EOF) {
            break;
        }

        if (lead == '\n') {
            advance(r);
            break;
        }
    }
}

// skip_multiline_comment ignores all bytes up to and including the next `*/`
// pair, or until the input is over; it expects to start at the `*` in the
// leading `/*` pair
void skip_multiline_comment(j0_maker* r) {
    unsigned char prev = 0;

    while (true) {
        advance(r);
        const int lead = r->current;

        if (lead == EOF) {
            break;
        }

        if (prev == '*' && lead == '/') {
            advance(r);
            break;
        }

        prev = (unsigned char)lead;
    }
}

// skip_comment ignores a comment starting at the current byte: comments can
// start with `#` or `//` (rest-of-line), or with `/*` (multi-line); anything
// else quits the app with an error
void skip_comment(j0_maker* r) {
    int lead = r->current;

    if (lead == '#') {
        skip_line(r);
        return;
    }

    if (lead != '/') {
        fail(r, 1, "expected a slash to start comments");
    }

    advance(r);
    lead = r->current;

    if (lead == '/') {
        skip_line(r);
        return;
    }

    if (lead == '*') {
        skip_multiline_comment(r);
        return;
    }

    fail(r, 1, "expected `//` or `/*` to start comments");
}

// seek_token skips whitespace/control bytes and comments, stopping at the
// first byte which can start a token, or at the end of the input
void seek_token(j0_maker* r) {
    while (true) {
        const int lead = r->current;

        if (lead != EOF && lead <= ' ') {
            advance(r);
            continue;
        }

        if (lead == '/' || lead == '#') {
            skip_comment(r);
            continue;
        }

        break;
    }
}

// starts_with_bom checks if the first n bytes given start with a UTF-8 BOM
bool starts_with_bom(const unsigned char* b, const size_t n) {
    return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}

// restart_state gets the state ready to convert input r into output w: it
// resets all counters, loads the first 2 bytes into fields current and next,
// and skips a leading UTF-8 BOM; the buffers and their capacities must
// already be set
void restart_state(j0_maker* m, FILE* w, FILE* r) {
    m->in = r;
    m->ilen = 0;
    m->ipos = 0;

    m->out = w;
    m->opos = 0;

    m->line = 1;
    m->pos = 1;

    m->current = EOF;
    m->next = EOF;

    m->current = read_byte(m);
    if (m->current == EOF) {
        return;
    }
    m->next = read_byte(m);

    // skip leading UTF-8 BOM (byte-order mark), if present
    if (starts_with_bom(m->ibuf, m->ilen)) {
        // a UTF-8 BOM has 3 bytes
        for (size_t i = 0; i < 3 && m->current != EOF; i++) {
            advance(m);
        }
    }
}

// write_byte appends a byte to the output buffer, emptying the buffer into
// the output first, when it's full
void write_byte(j0_maker* m, unsigned char b) {
    if (m->opos < m->ocap) {
        m->obuf[m->opos++] = b;
        return;
    }

    fwrite(m->obuf, m->ocap, 1, m->out);
    m->obuf[0] = b;
    m->opos = 1;
}

// write_bytes does as it says, minimizing the number of calls to fwrite
void write_bytes(j0_maker* m, const unsigned char* src, size_t len) {
    const size_t rem = m->ocap - m->opos;
    if (len < rem) {
        memcpy(m->obuf + m->opos, src, len);
        m->opos += len;
        return;
    }

    for (size_t i = 0; i < len; i++) {
        write_byte(m, src[i]);
    }
}

// flush sends all bytes pending in the output buffer to the output
void flush(j0_maker* m) {
    if (m->opos > 0) {
        fwrite(m->obuf, m->opos, 1, m->out);
    }
    m->opos = 0;
    fflush(m->out);
}

// https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/

// is_utf8_continuation checks if a byte (or EOF) is a UTF-8 trailing byte
static inline bool is_utf8_continuation(int b) {
    return 0x80 <= b && b <= 0xbf;
}

// check_2_byte_rune checks if the bytes given are a well-formed 2-byte rune
static inline bool check_2_byte_rune(int a, int b) {
    return (0xc2 <= a && a <= 0xdf) && (0x80 <= b && b <= 0xbf);
}

// check_3_byte_rune checks if the bytes given are a well-formed 3-byte rune,
// which excludes overlong encodings and UTF-16 surrogates
bool check_3_byte_rune(int a, int b, int c) {
    return (
        (a == 0xe0) &&
        (0xa0 <= b && b <= 0xbf) &&
        (0x80 <= c && c <= 0xbf)
    ) || (
        (0xe1 <= a && a <= 0xec) &&
        (0x80 <= b && b <= 0xbf) &&
        (0x80 <= c && c <= 0xbf)
    ) || (
        (a == 0xed) &&
        (0x80 <= b && b <= 0x9f) &&
        (0x80 <= c && c <= 0xbf)
    ) || (
        (a == 0xee || a == 0xef) &&
        (0x80 <= b && b <= 0xbf) &&
        (0x80 <= c && c <= 0xbf)
    );
}

// check_4_byte_rune checks if the bytes given are a well-formed 4-byte rune,
// which excludes overlong encodings and values beyond U+10FFFF
bool check_4_byte_rune(int a, int b, int c, int d) {
    return (
        (a == 0xf0) &&
        (0x90 <= b && b <= 0xbf) &&
        (0x80 <= c && c <= 0xbf) &&
        (0x80 <= d && d <= 0xbf)
    ) || (
        // lead bytes f1, f2, and f3 all allow any trailing bytes
        (0xf1 <= a && a <= 0xf3) &&
        (0x80 <= b && b <= 0xbf) &&
        (0x80 <= c && c <= 0xbf) &&
        (0x80 <= d && d <= 0xbf)
    ) || (
        // only the 2nd byte is restricted, to stay at or below U+10FFFF
        (a == 0xf4) &&
        (0x80 <= b && b <= 0x8f) &&
        (0x80 <= c && c <= 0xbf) &&
        (0x80 <= d && d <= 0xbf)
    );
}

// write_replacement_char is the recommended action to handle invalid bytes:
// it emits U+FFFD as UTF-8
void write_replacement_char(j0_maker* m) {
    write_byte(m, 0xef);
    write_byte(m, 0xbf);
    write_byte(m, 0xbd);
}

// handle_invalid_rune emits a replacement char, instead of failing
void handle_invalid_rune(j0_maker* m) {
    write_replacement_char(m);
}

// write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8
// to emit the code-point given as UTF-8 bytes: values which can't be encoded
// as well-formed UTF-8 are emitted as a replacement char instead
void write_rune(j0_maker* m, uint32_t rune) {
    if (rune < (1 << 7)) {
        write_byte(m, rune);
        return;
    }

    if (rune < (1 << (5 + 6))) {
        const int a = 0xc0 | (rune >> 6);
        const int b = 0x80 | (rune & 0x3f);
        if (check_2_byte_rune(a, b)) {
            write_byte(m, a);
            write_byte(m, b);
        } else {
            write_replacement_char(m);
        }
        return;
    }

    if (rune < (1 << (4 + 6 + 6))) {
        const int a = 0xe0 | (rune >> 12);
        const int b = 0x80 | ((rune >> 6) & 0x3f);
        const int c = 0x80 | (rune & 0x3f);
        if (check_3_byte_rune(a, b, c)) {
            write_byte(m, a);
            write_byte(m, b);
            write_byte(m, c);
        } else {
            write_replacement_char(m);
        }
        return;
    }

    if (rune < (1 << (3 + 6 + 6 + 6))) {
        const int a = 0xf0 | (rune >> 18);
        const int b = 0x80 | ((rune >> 12) & 0x3f);
        const int c = 0x80 | ((rune >> 6) & 0x3f);
        const int d = 0x80 | (rune & 0x3f);
        if (check_4_byte_rune(a, b, c, d)) {
            write_byte(m, a);
            write_byte(m, b);
            write_byte(m, c);
            write_byte(m, d);
        } else {
            write_replacement_char(m);
        }
        return;
    }

    write_replacement_char(m);
}

// copy_utf8_rune copies the rune starting at the current byte to the output,
// leaving the reader at the last byte it used, so callers can just advance
// to move on. Ill-formed sequences are emitted as a single replacement char:
// the look-ahead byte is only consumed when it's a UTF-8 trailing byte, so
// bytes after a broken sequence (a closing quote, for example) aren't lost.
void copy_utf8_rune(j0_maker* m) {
    const int a = m->current;

    if (a == EOF) {
        return;
    }

    // handle 1-byte runes
    if (a < 128) {
        write_byte(m, a);
        return;
    }

    // the leading byte tells how many bytes the whole rune needs
    size_t want = 0;
    if (0xc2 <= a && a <= 0xdf) {
        want = 2;
    } else if (0xe0 <= a && a <= 0xef) {
        want = 3;
    } else if (0xf0 <= a && a <= 0xf4) {
        want = 4;
    } else {
        // stray trailing bytes and leading bytes which are never valid
        handle_invalid_rune(m);
        return;
    }

    int seq[4] = { a, 0, 0, 0 };
    for (size_t i = 1; i < want; i++) {
        if (!is_utf8_continuation(m->next)) {
            // truncated rune: what follows isn't part of it
            handle_invalid_rune(m);
            return;
        }
        advance(m);
        seq[i] = m->current;
    }

    bool valid = false;
    switch (want) {
    case 2:
        valid = check_2_byte_rune(seq[0], seq[1]);
        break;
    case 3:
        valid = check_3_byte_rune(seq[0], seq[1], seq[2]);
        break;
    default:
        valid = check_4_byte_rune(seq[0], seq[1], seq[2], seq[3]);
        break;
    }

    if (!valid) {
        handle_invalid_rune(m);
        return;
    }

    for (size_t i = 0; i < want; i++) {
        write_byte(m, seq[i]);
    }
}

// debug is available to diagnose any bug found: it shows the printf-style
// message given, along with the current line/position, and quits the app
// with exit code 10
void debug(j0_maker* m, const char* fmt, ...) {
    if (m->in != stdin) {
        fclose(m->in);
    }

    // don't lose any output still pending, same as func fail
    write_byte(m, '\n');
    flush(m);

    const unsigned long line = m->line;
    const unsigned long pos = m->pos;
    fprintf(stderr, "\x1b[46m\x1b[37mline %lu, pos %lu: ", line, pos);

    // a va_list can only be given to the v-variants of the printf funcs
    va_list args;
    va_start(args, fmt);
    vfprintf(stderr, fmt, args);
    va_end(args);

    fprintf(stderr, "\x1b[0m\n");

    exit(10);
}

// fail quits this app right after showing the error message given: any
// pending output is flushed first, followed by a line-feed
void fail(j0_maker* m, int code, const char* msg) {
    const unsigned long line = m->line;
    const unsigned long pos = m->pos;

    write_byte(m, '\n');
    flush(m);
    fprintf(stderr, ERROR_LINE("line %lu, pos %lu: %s"), line, pos, msg);
    exit(code);
}

// demand_keyword checks if the input, starting from the current byte, matches
// all the bytes in the string given, advancing past all bytes matched
bool demand_keyword(j0_maker* m, const char* rest) {
    for (; rest[0] != 0; rest++) {
        const int lead = m->current;
        if (lead == EOF || lead != rest[0]) {
            return false;
        }
        advance(m);
    }

    return true;
}

// handle_null turns keyword `null` into itself
void handle_null(j0_maker* m) {
    if (!demand_keyword(m, "null")) {
        fail(m, 1, "expected `null` keyword");
    }
    write_bytes(m, (const unsigned char*)"null", 4);
}

// handle_true turns keyword `true` into itself
void handle_true(j0_maker* m) {
    if (!demand_keyword(m, "true")) {
        fail(m, 1, "expected `true` keyword");
    }
    write_bytes(m, (const unsigned char*)"true", 4);
}

// handle_false turns keyword `false` into itself
void handle_false(j0_maker* m) {
    if (!demand_keyword(m, "false")) {
        fail(m, 1, "expected `false` keyword");
    }
    write_bytes(m, (const unsigned char*)"false", 5);
}

// handle_capital_none turns keyword `None` into a JSON null
void handle_capital_none(j0_maker* m) {
    if (!demand_keyword(m, "None")) {
        fail(m, 1, "expected `None` keyword");
    }
    write_bytes(m, (const unsigned char*)"null", 4);
}

// handle_capital_true turns keyword `True` into a JSON true
void handle_capital_true(j0_maker* m) {
    if (!demand_keyword(m, "True")) {
        fail(m, 1, "expected `True` keyword");
    }
    write_bytes(m, (const unsigned char*)"true", 4);
}

// handle_capital_false turns keyword `False` into a JSON false
void handle_capital_false(j0_maker* m) {
    if (!demand_keyword(m, "False")) {
        fail(m, 1, "expected `False` keyword");
    }
    write_bytes(m, (const unsigned char*)"false", 5);
}

// handle_digits copies a run of 1 or more decimal digits, failing when
// there's none
void handle_digits(j0_maker* m) {
    if (!isdigit(m->current)) {
        fail(m, 1, "expected/missing digits");
    }

    while (isdigit(m->current)) {
        write_byte(m, m->current);
        advance(m);
    }
}

// handle_number copies an unsigned number: its integer digits, an optional
// fraction, and an optional exponent, which can also follow a fraction;
// a dot without digits after it gets a 0 added, and a `+` sign in the
// exponent is dropped
void handle_number(j0_maker* m) {
    handle_digits(m);

    if (m->current == '.') {
        write_byte(m, '.');
        advance(m);

        if (isdigit(m->current)) {
            handle_digits(m);
        } else {
            write_byte(m, '0');
        }
    }

    const int lead = m->current;
    if (lead == 'e' || lead == 'E') {
        write_byte(m, lead);
        advance(m);

        if (m->current == '+') {
            advance(m);
        } else if (m->current == '-') {
            write_byte(m, '-');
            advance(m);
        }

        handle_digits(m);
    }
}

// handle_dot handles numbers starting with a decimal dot, by emitting a
// leading 0 before the dot
void handle_dot(j0_maker* m) {
    write_byte(m, '0');
    write_byte(m, '.');
    advance(m);

    if (!isdigit(m->current)) {
        fail(m, 1, "expected/missing digits after decimal dot");
    }
    handle_digits(m);
}

// handle_plus_number handles numbers starting with a `+`, which is dropped
void handle_plus_number(j0_maker* m) {
    advance(m);

    if (m->current == '.') {
        handle_dot(m);
        return;
    }
    handle_number(m);
}

// handle_minus_number handles numbers starting with a `-`
void handle_minus_number(j0_maker* m) {
    write_byte(m, '-');
    advance(m);

    if (m->current == '.') {
        handle_dot(m);
        return;
    }
    handle_number(m);
}

// decode_hex assumes valid hex digits, checked by func is_valid_hex: it
// returns 0xffff for anything else
uint32_t decode_hex(unsigned char hex) {
    if ('0' <= hex && hex <= '9') {
        return hex - '0';
    }
    if ('A' <= hex && hex <= 'F') {
        return hex - 'A' + 10;
    }
    if ('a' <= hex && hex <= 'f') {
        return hex - 'a' + 10;
    }
    return 0xffff;
}

647 static inline bool is_valid_hex(unsigned char b) { 648 return false || 649 ('0' <= b && b <= '9') || 650 ('A' <= b && b <= 'F') || 651 ('a' <= b && b <= 'f'); 652 } 653 654 void handle_hex_quad(j0_maker* m) { 655 unsigned char quad[4]; 656 for (size_t i = 0; i < 4; i++) { 657 advance(m); 658 const int lead = m->current; 659 if (lead == EOF) { 660 fail(m, 1, "end of input before end of string"); 661 } 662 if (is_valid_hex(lead)) { 663 quad[i] = lead; 664 continue; 665 } 666 fail(m, 1, "invalid hexadecimal digit in string"); 667 } 668 669 const uint32_t n = 0 + 670 (decode_hex(quad[0]) << 12) + 671 (decode_hex(quad[1]) << 8) + 672 (decode_hex(quad[2]) << 4) + 673 (decode_hex(quad[3]) << 0); 674 675 if (n >= 32) { 676 write_rune(m, n); 677 return; 678 } 679 680 write_byte(m, '\\'); 681 write_byte(m, 'u'); 682 write_byte(m, quad[0]); 683 write_byte(m, quad[1]); 684 write_byte(m, quad[2]); 685 write_byte(m, quad[3]); 686 } 687 688 void handle_hex_pair(j0_maker* m) { 689 advance(m); 690 const int a = m->current; 691 advance(m); 692 const int b = m->current; 693 if (a == EOF || b == EOF) { 694 fail(m, 1, "end of input before end of string"); 695 } 696 if (!is_valid_hex(a) || !is_valid_hex(b)) { 697 fail(m, 1, "invalid hexadecimal digit in string"); 698 } 699 700 const uint32_t n = 16 * decode_hex(a) + decode_hex(b); 701 if (n >= 32) { 702 write_rune(m, n); 703 } else { 704 write_byte(m, '\\'); 705 write_byte(m, 'u'); 706 write_byte(m, '0'); 707 write_byte(m, '0'); 708 write_byte(m, a); 709 write_byte(m, b); 710 } 711 } 712 713 void handle_string_escape(j0_maker* m, int c) { 714 switch (c) { 715 case '"': 716 case '\\': 717 case 'b': 718 case 'f': 719 case 'n': 720 case 'r': 721 case 't': 722 write_byte(m, '\\'); 723 write_byte(m, c); 724 break; 725 726 case 'u': 727 handle_hex_quad(m); 728 break; 729 730 case 'x': 731 handle_hex_pair(m); 732 break; 733 734 case '\'': 735 write_byte(m, '\''); 736 break; 737 738 default: 739 write_byte(m, m->current); 740 break; 741 } 
742 } 743 744 // hex is only used by function handle_low_char to render hexadecimals 745 const char* hex = "0123456789ABCDEF"; 746 747 // handle_low_char simplifies function handle_string 748 void handle_low_char(j0_maker* m, int c) { 749 switch (c) { 750 case '\t': 751 write_byte(m, '\\'); 752 write_byte(m, 't'); 753 break; 754 755 case '\n': 756 write_byte(m, '\\'); 757 write_byte(m, 'n'); 758 break; 759 760 case '\r': 761 write_byte(m, '\\'); 762 write_byte(m, 'r'); 763 break; 764 765 case '\v': 766 write_byte(m, '\\'); 767 write_byte(m, 'v'); 768 break; 769 770 default: 771 write_byte(m, '\\'); 772 write_byte(m, 'u'); 773 write_byte(m, '0'); 774 write_byte(m, '0'); 775 write_byte(m, hex[c / 16]); 776 write_byte(m, hex[c % 16]); 777 break; 778 } 779 } 780 781 void handle_string(j0_maker* m) { 782 const unsigned char quote = m->current; 783 bool escaped = false; 784 785 write_byte(m, '"'); 786 787 while (true) { 788 advance(m); 789 790 int c = m->current; 791 if (c == EOF) { 792 fail(m, 1, "input ended before string was close-quoted"); 793 } 794 795 if (escaped) { 796 handle_string_escape(m, c); 797 escaped = false; 798 continue; 799 } 800 801 switch (c) { 802 case '\\': 803 escaped = true; 804 break; 805 806 default: 807 if (c == quote) { 808 write_byte(m, '"'); 809 advance(m); 810 return; 811 } 812 813 // write_byte(m, c); 814 if (c < ' ') { 815 handle_low_char(m, c); 816 } else { 817 copy_utf8_rune(m); 818 } 819 break; 820 } 821 } 822 } 823 824 void handle_token(j0_maker* m); 825 826 void handle_array(j0_maker* m) { 827 write_byte(m, '['); 828 advance(m); 829 830 for (size_t i = 0; true; i++) { 831 seek_token(m); 832 const int lead = m->current; 833 834 if (lead == EOF) { 835 fail(m, 1, "unclosed array"); 836 } 837 838 if (lead == ',') { 839 advance(m); 840 continue; 841 } 842 843 if (lead == ']') { 844 write_byte(m, ']'); 845 advance(m); 846 return; 847 } 848 849 if (i > 0) { 850 write_byte(m, ','); 851 } 852 if (feof(m->out)) { 853 return; 854 } 855 
handle_token(m); 856 } 857 } 858 859 // handle_array_jsonl is a slight variation of func handle_array: this one is 860 // used to handle top-level arrays when running in JSON Lines mode, to emit 861 // line-feeds after each item, instead of commas between them 862 void handle_array_jsonl(j0_maker* m) { 863 advance(m); 864 865 for (size_t i = 0; true; i++) { 866 seek_token(m); 867 const int lead = m->current; 868 869 if (lead == EOF) { 870 fail(m, 1, "unclosed array"); 871 } 872 873 if (lead == ',') { 874 advance(m); 875 continue; 876 } 877 878 if (i > 0) { 879 write_byte(m, '\n'); 880 } 881 882 if (lead == ']') { 883 advance(m); 884 return; 885 } 886 887 if (feof(m->out)) { 888 return; 889 } 890 handle_token(m); 891 } 892 } 893 894 void handle_unquoted_key(j0_maker* m) { 895 write_byte(m, '"'); 896 897 while (true) { 898 int c = m->current; 899 if (c == EOF) { 900 fail(m, 1, "input ended with an object key"); 901 } 902 903 write_byte(m, c); 904 advance(m); 905 906 c = m->current; 907 if (!isalpha(c) && !isdigit(c) && c != '_') { 908 break; 909 } 910 } 911 912 write_byte(m, '"'); 913 } 914 915 void handle_object(j0_maker* m) { 916 write_byte(m, '{'); 917 advance(m); 918 919 for (size_t i = 0; true; i++) { 920 seek_token(m); 921 int lead = m->current; 922 923 if (lead == EOF) { 924 fail(m, 1, "unclosed object"); 925 } 926 927 if (lead == ',') { 928 advance(m); 929 continue; 930 } 931 932 if (lead == '}') { 933 write_byte(m, '}'); 934 advance(m); 935 return; 936 } 937 938 if (feof(m->out)) { 939 return; 940 } 941 942 if (lead == '"' || lead == '\'') { 943 if (i > 0) { 944 write_byte(m, ','); 945 } 946 handle_string(m); 947 } else if (isalpha(lead) || lead == '_') { 948 if (i > 0) { 949 write_byte(m, ','); 950 } 951 handle_unquoted_key(m); 952 } else { 953 fail(m, 1, "only strings or identifiers can be object keys"); 954 } 955 956 seek_token(m); 957 lead = m->current; 958 959 if (lead == EOF) { 960 fail(m, 1, "input ended after object-key and before value"); 961 } 962 
963 if (lead != ':') { 964 fail(m, 1, "a `:` must follow all object keys"); 965 } 966 967 write_byte(m, ':'); 968 advance(m); 969 970 seek_token(m); 971 if (m->current == EOF) { 972 fail(m, 1, "input ended after a `:` following an object-key"); 973 } 974 975 handle_token(m); 976 } 977 } 978 979 // dispatch ties leading bytes/chars in tokens to the funcs which handle them 980 void (*dispatch[256])() = { 981 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 982 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 983 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 984 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 985 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 986 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 987 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 988 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 989 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 990 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 991 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 992 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 993 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 994 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 995 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 996 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 997 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 998 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 999 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1000 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1001 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, 1002 NULL, NULL, NULL, NULL, 1003 }; 1004 1005 void handle_token(j0_maker* m) { 1006 dispatch[m->current](m); 1007 } 1008 1009 // handle_invalid_token shows an error message and quits the app right after 1010 void handle_invalid_token(j0_maker* m) { 1011 char msg[64]; 1012 unsigned char c = (unsigned char)m->current; 1013 sprintf(msg, "%c (%d): invalid token", c, c); 1014 fail(m, 1, msg); 1015 } 1016 1017 void handle_array_jsonl(j0_maker* m); 1018 1019 void handle_input(FILE* src, bool jsonl) { 1020 unsigned char ibuf[IBUF_SIZE]; 1021 unsigned char obuf[OBUF_SIZE]; 1022 1023 j0_maker m; 1024 m.ibuf = ibuf; 1025 m.icap = sizeof(ibuf); 1026 m.obuf = obuf; 1027 m.ocap = sizeof(obuf); 1028 restart_state(&m, stdout, src); 1029 1030 // ignore leading whitespace/comment bytes, if present 1031 seek_token(&m); 1032 1033 if (m.current == EOF) { 1034 fail(&m, 1, "empty input isn't valid JSON"); 1035 } 1036 1037 if (jsonl && m.current == '[') { 1038 handle_array_jsonl(&m); 1039 } else { 1040 handle_token(&m); 1041 write_byte(&m, '\n'); 1042 } 1043 flush(&m); 1044 1045 // ignore trailing whitespace/comment bytes, if present 1046 seek_token(&m); 1047 1048 // ignore trailing semicolon, if present 1049 if (m.current == ';') { 1050 advance(&m); 1051 // ignore trailing whitespace/comment bytes, if present 1052 seek_token(&m); 1053 } 1054 1055 if (!feof(src) || m.current != EOF) { 1056 fail(&m, 1, "unexpected trailing JSON data"); 1057 } 1058 } 1059 1060 bool is_help_option(const char* s) { 1061 return (s[0] == '-' && s[1] != 0) && ( 1062 strcmp(s, "-h") == 0 || 1063 strcmp(s, "--h") == 0 || 1064 strcmp(s, "-help") == 0 || 1065 strcmp(s, "--help") == 0 1066 ); 1067 } 1068 1069 bool is_jsonl_option(const char* s) { 1070 return (s[0] == '-' && s[1] != 0) && ( 1071 strcmp(s, "-jl") == 0 || 1072 strcmp(s, "--jl") == 0 || 1073 strcmp(s, "-jsonl") == 0 || 1074 strcmp(s, "--jsonl") == 0 1075 ); 1076 } 1077 1078 // run returns the error code 1079 int run(int nargs, char** args) { 1080 
bool jsonl = false; 1081 if (nargs > 0 && is_jsonl_option(args[0])) { 1082 jsonl = true; 1083 nargs--; 1084 args++; 1085 } 1086 1087 if (nargs > 0 && strcmp(args[0], "--") == 0) { 1088 nargs--; 1089 args++; 1090 } 1091 1092 if (nargs > 1) { 1093 const char* msg = "can't use more than 1 named input"; 1094 fprintf(stderr, ERROR_LINE("%s"), msg); 1095 return 1; 1096 } 1097 1098 // use stdin when not given a filepath 1099 if (nargs == 0 || strcmp(args[0], "") == 0 || strcmp(args[0], "-") == 0) { 1100 handle_input(stdin, jsonl); 1101 return 0; 1102 } 1103 1104 const char* path = args[0]; 1105 FILE* f = fopen(path, "rb"); 1106 if (f == NULL) { 1107 fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path); 1108 return 1; 1109 } 1110 1111 handle_input(f, jsonl); 1112 fclose(f); 1113 1114 return 0; 1115 } 1116 1117 int main(int argc, char** argv) { 1118 #ifdef _WIN32 1119 setmode(fileno(stdin), O_BINARY); 1120 // ensure output lines end in LF instead of CRLF on windows 1121 setmode(fileno(stdout), O_BINARY); 1122 setmode(fileno(stderr), O_BINARY); 1123 #endif 1124 1125 if (argc > 1 && is_help_option(argv[1])) { 1126 printf("%s", info); 1127 return 0; 1128 } 1129 1130 // the dispatch table starts as all null function-pointers 1131 for (size_t i = 0; i < sizeof(dispatch) / sizeof(dispatch[0]); i++) { 1132 dispatch[i] = handle_invalid_token; 1133 } 1134 1135 for (size_t i = '0'; i <= '9'; i++) { 1136 dispatch[i] = handle_number; 1137 } 1138 1139 dispatch['n'] = handle_null; 1140 dispatch['t'] = handle_true; 1141 dispatch['f'] = handle_false; 1142 dispatch['N'] = handle_capital_none; 1143 dispatch['T'] = handle_capital_true; 1144 dispatch['F'] = handle_capital_false; 1145 dispatch['.'] = handle_dot; 1146 dispatch['+'] = handle_plus_number; 1147 dispatch['-'] = handle_minus_number; 1148 dispatch['"'] = handle_string; 1149 dispatch['\''] = handle_string; 1150 dispatch['['] = handle_array; 1151 dispatch['{'] = handle_object; 1152 1153 // enable full/block-buffering for 
standard output 1154 setvbuf(stdout, NULL, _IOFBF, 0); 1155 1156 return run(argc - 1, argv + 1) == 0 ? 0 : 1; 1157 }