/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O3 -march=native -mtune=native -flto -o ./json0 ./json0.c
*/

#include <ctype.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

// error messages are optionally styled in red: the 2 styles are mutually
// exclusive, so the same macro is never defined twice
#ifdef RED_ERRORS
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define RESET_STYLE "\x1b[0m"
#else
#define ERROR_STYLE
#define RESET_STYLE
#endif

#define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")

#ifndef IBUF_SIZE
#define IBUF_SIZE (32 * 1024)
#endif

#ifndef OBUF_SIZE
#define OBUF_SIZE (8 * 1024)
#endif

const char* info = ""
"json0 [options...] [file...]\n"
"\n"
"\n"
"JSON-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.\n"
"Its output is always a single line, which ends with a line-feed.\n"
"\n"
"Besides minimizing bytes, this tool also adapts almost-JSON input into\n"
"valid JSON, since it\n"
"\n"
" - ignores both rest-of-line and multi-line comments\n"
" - ignores extra/trailing commas in arrays and objects\n"
" - turns single-quoted strings/keys into double-quoted strings\n"
" - double-quotes unquoted object keys\n"
" - changes \\x 2-hex-digit into \\u 4-hex-digit string-escapes\n"
"\n"
"All options available can either start with a single or a double-dash\n"
"\n"
" -h show this help message\n"
" -help show this help message\n"
" -jsonl emit JSON Lines, when top-level value is an array\n"
"";

// j0_maker bundles all the state needed to turn 1 input into minimal JSON:
// the buffered reader, the buffered writer, and a 2-byte lookahead window
typedef struct j0_maker {
    FILE* in;
    FILE* out;

    unsigned char* ibuf;
    size_t ilen; // how many bytes are being used in the input buffer
    size_t icap; // the input buffer's capacity
    size_t ipos; // the current position in the input buffer

    size_t line; // the current line, used to show useful error messages
    size_t pos;  // the position in the current line, for error messages

    unsigned char* obuf;
    size_t ocap; // the output buffer's capacity
    size_t opos; // the current position in the output buffer

    int current; // the byte being handled, or EOF
    int next;    // the byte right after the current one, or EOF
} j0_maker;

// advance_reader_pos helps func read_byte do its job
static inline void advance_reader_pos(j0_maker* r, unsigned char b) {
    r->ipos++;
    if (b == '\n') {
        r->line++;
        r->pos = 1;
    } else {
        r->pos++;
    }
}

// read_byte does as it says: check its return for the value EOF, before
// using it as the next byte
static inline int read_byte(j0_maker* r) {
    if (r->ipos < r->ilen) {
        // inside current chunk
        const unsigned char b = r->ibuf[r->ipos];
        advance_reader_pos(r, b);
        return b;
    }

    // need to read the next block
    r->ipos = 0;
    r->ilen = fread(r->ibuf, sizeof(unsigned char), r->icap, r->in);
    if (r->ilen > 0) {
        const unsigned char b = r->ibuf[r->ipos];
        advance_reader_pos(r, b);
        return b;
    }

    // reached the end of data
    return EOF;
}

// advance is used in most of the code, instead of calling read_byte directly
static inline void advance(j0_maker* r) {
    r->current = r->next;
    r->next = read_byte(r);
}

void fail(j0_maker* m, int code, const char* msg);

// skip_line ignores all bytes up to and including the next line-feed
void skip_line(j0_maker* r) {
    while (true) {
        advance(r);
        const int lead = r->current;

        if (lead == EOF) {
            break;
        }

        if (lead == '\n') {
            advance(r);
            break;
        }
    }
}

// skip_multiline_comment ignores all bytes up to and including the next `*/`;
// an unclosed comment silently runs until the end of the input
void skip_multiline_comment(j0_maker* r) {
    unsigned char prev = 0;

    while (true) {
        advance(r);
        const int lead = r->current;

        if (lead == EOF) {
            break;
        }

        if (prev == '*' && lead == '/') {
            advance(r);
            break;
        }

        prev = (unsigned char)lead;
    }
}

// skip_comment handles `#` and `//` rest-of-line comments, as well as
// `/* ... */` comments: anything else starting with a slash is an error
void skip_comment(j0_maker* r) {
    int lead = r->current;

    if (lead == '#') {
        skip_line(r);
        return;
    }

    if (lead != '/') {
        fail(r, 1, "expected a slash to start comments");
    }

    advance(r);
    lead = r->current;

    if (lead == '/') {
        skip_line(r);
        return;
    }

    if (lead == '*') {
        skip_multiline_comment(r);
        return;
    }

    fail(r, 1, "expected `//` or `/*` to start comments");
}

// seek_token skips whitespace/control bytes and comments, stopping either at
// the start of the next token, or at the end of the input
static inline void seek_token(j0_maker* r) {
    while (true) {
        const int lead = r->current;

        if (lead != EOF && lead <= ' ') {
            advance(r);
            continue;
        }

        if (lead == '/' || lead == '#') {
            skip_comment(r);
            continue;
        }

        break;
    }
}

// starts_with_bom checks if the first n bytes given start with a UTF-8 BOM
bool starts_with_bom(const unsigned char* b, const size_t n) {
    return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}

// restart_state (re)binds the reader/writer given, empties both buffers, and
// primes the 2-byte lookahead window, skipping a leading UTF-8 BOM
void restart_state(j0_maker* m, FILE* w, FILE* r) {
    m->in = r;
    m->ilen = 0;
    m->ipos = 0;

    m->out = w;
    m->opos = 0;

    m->line = 1;
    m->pos = 1;

    m->current = EOF;
    m->next = EOF;

    m->current = read_byte(m);
    if (m->current == EOF) {
        return;
    }
    m->next = read_byte(m);

    // skip leading UTF-8 BOM (byte-order mark), if present
    if (starts_with_bom(m->ibuf, m->ilen)) {
        // a UTF-8 BOM has 3 bytes
        for (size_t i = 0; i < 3 && m->current != EOF; i++) {
            advance(m);
        }
    }
}

// write_byte adds a byte to the output buffer, emitting the buffer first
// when it's full
void write_byte(j0_maker* m, unsigned char b) {
    if (m->opos < m->ocap) {
        m->obuf[m->opos++] = b;
        return;
    }

    fwrite(m->obuf, 1, m->ocap, m->out);
    m->obuf[0] = b;
    m->opos = 1;
}

// write_bytes does as it says, minimizing the number of calls to fwrite
void write_bytes(j0_maker* m, const unsigned char* src, size_t len) {
    const size_t rem = m->ocap - m->opos;
    if (len < rem) {
        memcpy(m->obuf + m->opos, src, len);
        m->opos += len;
        return;
    }

    for (size_t i = 0; i < len; i++) {
        write_byte(m, src[i]);
    }
}

// flush emits all pending output-buffer bytes, and flushes the output stream
void flush(j0_maker* m) {
    if (m->opos > 0) {
        fwrite(m->obuf, 1, m->opos, m->out);
    }
    m->opos = 0;
    fflush(m->out);
}

// https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/

// is_continuation_byte checks for UTF-8 trailing bytes: EOF never qualifies
static inline bool is_continuation_byte(int b) {
    return 0x80 <= b && b <= 0xbf;
}

// check_2_byte_rune checks if the bytes given are a well-formed 2-byte rune
static inline bool check_2_byte_rune(int a, int b) {
    return (0xc2 <= a && a <= 0xdf) && is_continuation_byte(b);
}

// check_3_byte_rune checks if the bytes given are a well-formed 3-byte rune,
// which excludes overlong encodings (e0 80..9f) and surrogates (ed a0..bf)
bool check_3_byte_rune(int a, int b, int c) {
    return (
        (a == 0xe0) &&
        (0xa0 <= b && b <= 0xbf) &&
        is_continuation_byte(c)
    ) || (
        (0xe1 <= a && a <= 0xec) &&
        is_continuation_byte(b) &&
        is_continuation_byte(c)
    ) || (
        (a == 0xed) &&
        (0x80 <= b && b <= 0x9f) &&
        is_continuation_byte(c)
    ) || (
        (a == 0xee || a == 0xef) &&
        is_continuation_byte(b) &&
        is_continuation_byte(c)
    );
}

// check_4_byte_rune checks if the bytes given are a well-formed 4-byte rune:
// leading byte f0 excludes overlong encodings, f1..f3 allow any trailing
// bytes, and f4 is capped at 8f on its 2nd byte, since U+10FFFF is the max
bool check_4_byte_rune(int a, int b, int c, int d) {
    return (
        (a == 0xf0) &&
        (0x90 <= b && b <= 0xbf) &&
        is_continuation_byte(c) &&
        is_continuation_byte(d)
    ) || (
        (0xf1 <= a && a <= 0xf3) &&
        is_continuation_byte(b) &&
        is_continuation_byte(c) &&
        is_continuation_byte(d)
    ) || (
        (a == 0xf4) &&
        (0x80 <= b && b <= 0x8f) &&
        is_continuation_byte(c) &&
        is_continuation_byte(d)
    );
}

// write_replacement_char is the recommended action to handle invalid bytes
void write_replacement_char(j0_maker* m) {
    write_byte(m, 0xef);
    write_byte(m, 0xbf);
    write_byte(m, 0xbd);
}

// handle_invalid_rune is lenient on purpose: invalid bytes are replaced,
// instead of failing the whole conversion
void handle_invalid_rune(j0_maker* m) {
    write_replacement_char(m);
}

// write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8;
// code points which can't be encoded emit the replacement char instead
void write_rune(j0_maker* m, uint32_t rune) {
    if (rune < (1 << 7)) {
        write_byte(m, rune);
        return;
    }

    if (rune < (1 << (5 + 6))) {
        const int a = 0xc0 | (rune >> 6);
        const int b = 0x80 | (rune & 0x3f);
        if (check_2_byte_rune(a, b)) {
            write_byte(m, a);
            write_byte(m, b);
        } else {
            write_replacement_char(m);
        }
        return;
    }

    if (rune < (1 << (4 + 6 + 6))) {
        const int a = 0xe0 | (rune >> 12);
        const int b = 0x80 | ((rune >> 6) & 0x3f);
        const int c = 0x80 | (rune & 0x3f);
        if (check_3_byte_rune(a, b, c)) {
            write_byte(m, a);
            write_byte(m, b);
            write_byte(m, c);
        } else {
            write_replacement_char(m);
        }
        return;
    }

    if (rune < (1 << (3 + 6 + 6 + 6))) {
        const int a = 0xf0 | (rune >> 18);
        const int b = 0x80 | ((rune >> 12) & 0x3f);
        const int c = 0x80 | ((rune >> 6) & 0x3f);
        const int d = 0x80 | (rune & 0x3f);
        if (check_4_byte_rune(a, b, c, d)) {
            write_byte(m, a);
            write_byte(m, b);
            write_byte(m, c);
            write_byte(m, d);
        } else {
            write_replacement_char(m);
        }
        return;
    }

    write_replacement_char(m);
}

// copy_utf8_rune copies the rune starting at the current byte, leaving the
// current byte as the last one it used; invalid sequences are turned into
// the replacement char, without using up any bytes which aren't part of them
void copy_utf8_rune(j0_maker* m) {
    const int a = m->current;

    if (a == EOF) {
        return;
    }

    // handle 1-byte runes
    if (a < 128) {
        write_byte(m, a);
        return;
    }

    // the leading byte determines how many trailing bytes must follow
    size_t extra = 0;
    if (0xc2 <= a && a <= 0xdf) {
        extra = 1;
    } else if (0xe0 <= a && a <= 0xef) {
        extra = 2;
    } else if (0xf0 <= a && a <= 0xf4) {
        extra = 3;
    } else {
        // stray trailing byte, or a byte which can never start a rune
        handle_invalid_rune(m);
        return;
    }

    int seq[4] = {a, 0, 0, 0};
    for (size_t i = 1; i <= extra; i++) {
        // peek before advancing, so a cut-short rune never swallows what
        // follows it, such as the quote closing the current string
        if (!is_continuation_byte(m->next)) {
            handle_invalid_rune(m);
            return;
        }
        advance(m);
        seq[i] = m->current;
    }

    bool valid = false;
    switch (extra) {
    case 1:
        valid = check_2_byte_rune(seq[0], seq[1]);
        break;
    case 2:
        valid = check_3_byte_rune(seq[0], seq[1], seq[2]);
        break;
    default:
        valid = check_4_byte_rune(seq[0], seq[1], seq[2], seq[3]);
        break;
    }

    if (!valid) {
        handle_invalid_rune(m);
        return;
    }

    for (size_t i = 0; i <= extra; i++) {
        write_byte(m, seq[i]);
    }
}

// debug is available to diagnose any bug found: it shows the printf-style
// message given along with the current input position, then quits the app
void debug(j0_maker* m, const char* fmt, ...) {
    if (m->in != stdin) {
        fclose(m->in);
    }

    // don't lose output buffered so far, since exit only flushes streams
    write_byte(m, '\n');
    flush(m);

    const unsigned long line = m->line;
    const unsigned long pos = m->pos;
    fprintf(stderr, "\x1b[46m\x1b[37mline %lu, pos %lu: ", line, pos);

    va_list args;
    va_start(args, fmt);
    // a va_list can only be forwarded to the v-variants of printf funcs
    vfprintf(stderr, fmt, args);
    va_end(args);

    fprintf(stderr, "\x1b[0m\n");

    exit(10);
}

// fail quits this app right after showing the error message given
void fail(j0_maker* m, int code, const char* msg) {
    const unsigned long line = m->line;
    const unsigned long pos = m->pos;

    write_byte(m, '\n');
    flush(m);
    fprintf(stderr, ERROR_LINE("line %lu, pos %lu: %s"), line, pos, msg);
    exit(code);
}

// demand_keyword checks if the input continues with all the bytes given,
// advancing past the ones which match
bool demand_keyword(j0_maker* m, const char* rest) {
    for (; rest[0] != 0; rest++) {
        const int lead = m->current;
        if (lead == EOF || lead != rest[0]) {
            return false;
        }
        advance(m);
    }

    return true;
}

// handle_null copies keyword `null`
void handle_null(j0_maker* m) {
    if (!demand_keyword(m, "null")) {
        fail(m, 1, "expected `null` keyword");
    }
    write_bytes(m, (const unsigned char*)"null", 4);
}

// handle_true copies keyword `true`
void handle_true(j0_maker* m) {
    if (!demand_keyword(m, "true")) {
        fail(m, 1, "expected `true` keyword");
    }
    write_bytes(m, (const unsigned char*)"true", 4);
}

// handle_false copies keyword `false`
void handle_false(j0_maker* m) {
    if (!demand_keyword(m, "false")) {
        fail(m, 1, "expected `false` keyword");
    }
    write_bytes(m, (const unsigned char*)"false", 5);
}

// handle_capital_none turns python-style `None` into `null`
void handle_capital_none(j0_maker* m) {
    if (!demand_keyword(m, "None")) {
        fail(m, 1, "expected `None` keyword");
    }
    write_bytes(m, (const unsigned char*)"null", 4);
}

// handle_capital_true turns python-style `True` into `true`
void handle_capital_true(j0_maker* m) {
    if (!demand_keyword(m, "True")) {
        fail(m, 1, "expected `True` keyword");
    }
    write_bytes(m, (const unsigned char*)"true", 4);
}

// handle_capital_false turns python-style `False` into `false`
void handle_capital_false(j0_maker* m) {
    if (!demand_keyword(m, "False")) {
        fail(m, 1, "expected `False` keyword");
    }
    write_bytes(m, (const unsigned char*)"false", 5);
}

// handle_digits copies a run of 1 or more decimal digits
void handle_digits(j0_maker* m) {
    if (!isdigit(m->current)) {
        fail(m, 1, "expected/missing digits");
    }

    while (isdigit(m->current)) {
        write_byte(m, m->current);
        advance(m);
    }
}

// handle_exponent copies the optional exponent which can end numbers,
// dropping its redundant plus sign, if present
static void handle_exponent(j0_maker* m) {
    const int lead = m->current;
    if (lead != 'e' && lead != 'E') {
        return;
    }

    write_byte(m, lead);
    advance(m);

    if (m->current == '+') {
        advance(m);
    } else if (m->current == '-') {
        write_byte(m, '-');
        advance(m);
    }

    handle_digits(m);
}

// handle_number copies unsigned numbers which start with a digit: numbers
// ending with a dot get a trailing 0, and exponents can follow decimals
void handle_number(j0_maker* m) {
    handle_digits(m);

    if (m->current == '.') {
        write_byte(m, '.');
        advance(m);

        if (isdigit(m->current)) {
            handle_digits(m);
        } else {
            write_byte(m, '0');
        }
    }

    handle_exponent(m);
}

// handle_dot copies numbers which start with a decimal dot, adding the
// leading 0 valid JSON requires
void handle_dot(j0_maker* m) {
    write_byte(m, '0');
    write_byte(m, '.');
    advance(m);

    if (!isdigit(m->current)) {
        fail(m, 1, "expected/missing digits after decimal dot");
    }
    handle_digits(m);
    handle_exponent(m);
}

// handle_plus_number copies numbers after ignoring their leading plus sign
void handle_plus_number(j0_maker* m) {
    advance(m);

    if (m->current == '.') {
        handle_dot(m);
        return;
    }
    handle_number(m);
}

// handle_minus_number copies negative numbers
void handle_minus_number(j0_maker* m) {
    write_byte(m, '-');
    advance(m);

    if (m->current == '.') {
        handle_dot(m);
        return;
    }
    handle_number(m);
}

// decode_hex assumes valid hex digits, checked by func is_valid_hex
uint32_t decode_hex(unsigned char hex) {
    if ('0' <= hex && hex <= '9') {
        return hex - '0';
    }
    if ('A' <= hex && hex <= 'F') {
        return hex - 'A' + 10;
    }
    if ('a' <= hex && hex <= 'f') {
        return hex - 'a' + 10;
    }
    return 0xffff;
}

// is_valid_hex checks if the byte given is a base-16 digit
static inline bool is_valid_hex(unsigned char b) {
    return false ||
        ('0' <= b && b <= '9') ||
        ('A' <= b && b <= 'F') ||
        ('a' <= b && b <= 'f');
}

// handle_low_char ensures characters whose ASCII codes are lower than spaces
// are properly escaped for strings: JSON has no `\v` escape, so vertical
// tabs are among the bytes which end up as \u escapes
void handle_low_char(j0_maker* m, int c) {
    const char* hex = "0123456789ABCDEF";

    switch (c) {
    case '\t':
        write_byte(m, '\\');
        write_byte(m, 't');
        break;
    case '\n':
        write_byte(m, '\\');
        write_byte(m, 'n');
        break;
    case '\r':
        write_byte(m, '\\');
        write_byte(m, 'r');
        break;
    case '\b':
        write_byte(m, '\\');
        write_byte(m, 'b');
        break;
    case '\f':
        write_byte(m, '\\');
        write_byte(m, 'f');
        break;
    default:
        write_byte(m, '\\');
        write_byte(m, 'u');
        write_byte(m, '0');
        write_byte(m, '0');
        write_byte(m, hex[c / 16]);
        write_byte(m, hex[c % 16]);
        break;
    }
}

// write_inner_string_hex_quad emits the code point a \u escape stands for,
// using the shortest valid way to do it inside a JSON string
void write_inner_string_hex_quad(j0_maker* m, const unsigned char quad[4]) {
    const uint32_t n = 0 +
        (decode_hex(quad[0]) << 12) +
        (decode_hex(quad[1]) << 8) +
        (decode_hex(quad[2]) << 4) +
        (decode_hex(quad[3]) << 0);

    switch (n) {
    case '"':
        write_byte(m, '\\');
        write_byte(m, '"');
        return;
    case '\\':
        write_byte(m, '\\');
        write_byte(m, '\\');
        return;
    default:
        break;
    }

    // surrogates can't be encoded as UTF-8 on their own: keeping them as
    // escapes preserves the pairs standing for code points above U+FFFF
    if (0xd800 <= n && n <= 0xdfff) {
        write_byte(m, '\\');
        write_byte(m, 'u');
        write_bytes(m, quad, 4);
        return;
    }

    if (n >= ' ') {
        write_rune(m, n);
    } else {
        handle_low_char(m, n);
    }
}

// handle_hex_quad handles the 4 hex digits which must follow a `\u`
void handle_hex_quad(j0_maker* m) {
    unsigned char quad[4];
    for (size_t i = 0; i < 4; i++) {
        advance(m);
        const int lead = m->current;
        if (lead == EOF) {
            fail(m, 1, "end of input before end of string");
        }
        if (is_valid_hex(lead)) {
            quad[i] = lead;
            continue;
        }
        fail(m, 1, "invalid hexadecimal digit in string");
    }

    write_inner_string_hex_quad(m, quad);
}

// handle_hex_pair handles the 2 hex digits which must follow a `\x`
void handle_hex_pair(j0_maker* m) {
    unsigned char quad[4] = {'0', '0', '0', '0'};
    advance(m);
    const int a = m->current;
    advance(m);
    const int b = m->current;
    if (a == EOF || b == EOF) {
        fail(m, 1, "end of input before end of string");
    }
    if (!is_valid_hex(a) || !is_valid_hex(b)) {
        fail(m, 1, "invalid hexadecimal digit in string");
    }

    quad[2] = a;
    quad[3] = b;
    write_inner_string_hex_quad(m, quad);
}

// handle_string_escape handles the byte right after a backslash in strings;
// param c is expected to be the current byte
void handle_string_escape(j0_maker* m, int c) {
    switch (c) {
    case '"':
    case '\\':
    case 'b':
    case 'f':
    case 'n':
    case 'r':
    case 't':
        write_byte(m, '\\');
        write_byte(m, c);
        break;
    case 'u':
        handle_hex_quad(m);
        break;
    case 'x':
        handle_hex_pair(m);
        break;
    case '\'':
        write_byte(m, '\'');
        break;
    default:
        // needlessly-escaped values lose their backslash, while still
        // being emitted the way unescaped string content is
        if (c < ' ') {
            handle_low_char(m, c);
        } else {
            copy_utf8_rune(m);
        }
        break;
    }
}

// handle_string turns single/double-quoted strings into double-quoted ones
void handle_string(j0_maker* m) {
    const unsigned char quote = m->current;
    bool escaped = false;

    write_byte(m, '"');

    while (true) {
        advance(m);

        const int c = m->current;
        if (c == EOF) {
            fail(m, 1, "input ended before string was close-quoted");
        }

        if (escaped) {
            handle_string_escape(m, c);
            escaped = false;
            continue;
        }

        if (c == '\\') {
            escaped = true;
            continue;
        }

        if (c == quote) {
            write_byte(m, '"');
            advance(m);
            return;
        }

        if (c == '"') {
            // double quotes inside single-quoted strings need escaping
            write_byte(m, '\\');
            write_byte(m, '"');
            continue;
        }

        if (c < ' ') {
            handle_low_char(m, c);
        } else {
            copy_utf8_rune(m);
        }
    }
}

void handle_token(j0_maker* m);

// handle_array copies arrays, ignoring any leading/extra/trailing commas
void handle_array(j0_maker* m) {
    write_byte(m, '[');
    advance(m);

    // count items emitted, not loop iterations, since commas are skipped
    size_t items = 0;

    while (true) {
        seek_token(m);
        const int lead = m->current;

        if (lead == EOF) {
            fail(m, 1, "unclosed array");
        }

        if (lead == ',') {
            advance(m);
            continue;
        }

        if (lead == ']') {
            write_byte(m, ']');
            advance(m);
            return;
        }

        // failed writes set the error indicator, not the end-of-file one
        if (ferror(m->out)) {
            fail(m, 1, "failed to write output");
        }

        if (items > 0) {
            write_byte(m, ',');
        }
        handle_token(m);
        items++;
    }
}

// handle_array_jsonl is a slight variation of func handle_array: this one is
// used to handle top-level arrays when running in JSON Lines mode, to emit
// line-feeds after each item, instead of commas between them
void handle_array_jsonl(j0_maker* m) {
    advance(m);

    while (true) {
        seek_token(m);
        const int lead = m->current;

        if (lead == EOF) {
            fail(m, 1, "unclosed array");
        }

        if (lead == ',') {
            advance(m);
            continue;
        }

        if (lead == ']') {
            advance(m);
            return;
        }

        // failed writes set the error indicator, not the end-of-file one
        if (ferror(m->out)) {
            fail(m, 1, "failed to write output");
        }

        handle_token(m);
        write_byte(m, '\n');
    }
}

// handle_unquoted_key double-quotes identifier-like object keys: callers
// are expected to check the first byte is a valid way to start those
void handle_unquoted_key(j0_maker* m) {
    write_byte(m, '"');

    while (true) {
        int c = m->current;
        if (c == EOF) {
            fail(m, 1, "input ended with an object key");
        }

        write_byte(m, c);
        advance(m);

        c = m->current;
        if (!isalpha(c) && !isdigit(c) && c != '_') {
            break;
        }
    }

    write_byte(m, '"');
}

// handle_object copies objects, ignoring any leading/extra/trailing commas,
// and double-quoting all keys which aren't already
void handle_object(j0_maker* m) {
    write_byte(m, '{');
    advance(m);

    // count pairs emitted, not loop iterations, since commas are skipped
    size_t pairs = 0;

    while (true) {
        seek_token(m);
        int lead = m->current;

        if (lead == EOF) {
            fail(m, 1, "unclosed object");
        }

        if (lead == ',') {
            advance(m);
            continue;
        }

        if (lead == '}') {
            write_byte(m, '}');
            advance(m);
            return;
        }

        // failed writes set the error indicator, not the end-of-file one
        if (ferror(m->out)) {
            fail(m, 1, "failed to write output");
        }

        const bool quoted = lead == '"' || lead == '\'';
        if (!quoted && !isalpha(lead) && lead != '_') {
            fail(m, 1, "only strings or identifiers can be object keys");
        }

        if (pairs > 0) {
            write_byte(m, ',');
        }

        if (quoted) {
            handle_string(m);
        } else {
            handle_unquoted_key(m);
        }

        seek_token(m);
        lead = m->current;

        if (lead == EOF) {
            fail(m, 1, "input ended after object-key and before value");
        }

        if (lead != ':') {
            fail(m, 1, "a `:` must follow all object keys");
        }

        write_byte(m, ':');
        advance(m);

        seek_token(m);
        if (m->current == EOF) {
            fail(m, 1, "input ended after a `:` following an object-key");
        }

        handle_token(m);
        pairs++;
    }
}

// dispatch ties leading bytes/chars in tokens to the funcs which handle them:
// all its entries start as null, and are set when func main starts
void (*dispatch[256])(j0_maker*);

// handle_token handles the value starting at the current byte
void handle_token(j0_maker* m) {
    const int lead = m->current;
    if (lead == EOF) {
        // EOF is negative, so it can't be used to index the dispatch table
        fail(m, 1, "unexpected end of input");
    }
    dispatch[(unsigned char)lead](m);
}

// handle_invalid_token shows an error message and quits the app right after
void handle_invalid_token(j0_maker* m) {
    char msg[64];
    unsigned char c = (unsigned char)m->current;
    snprintf(msg, sizeof(msg), "%c (%d): invalid token", c, c);
    fail(m, 1, msg);
}

// handle_input converts the whole input given into a single JSON value on
// the standard output, or into JSON Lines, when asked to and the top-level
// value is an array
void handle_input(FILE* src, bool jsonl) {
    unsigned char ibuf[IBUF_SIZE];
    unsigned char obuf[OBUF_SIZE];

    j0_maker m;
    m.ibuf = ibuf;
    m.icap = sizeof(ibuf);
    m.obuf = obuf;
    m.ocap = sizeof(obuf);
    restart_state(&m, stdout, src);

    // ignore leading whitespace/comment bytes, if present
    seek_token(&m);

    if (m.current == EOF) {
        fail(&m, 1, "empty input isn't valid JSON");
    }

    if (jsonl && m.current == '[') {
        handle_array_jsonl(&m);
    } else {
        handle_token(&m);
        write_byte(&m, '\n');
    }
    flush(&m);

    // ignore trailing whitespace/comment bytes, if present
    seek_token(&m);

    // ignore trailing semicolon, if present
    if (m.current == ';') {
        advance(&m);
        // ignore trailing whitespace/comment bytes, if present
        seek_token(&m);
    }

    if (!feof(src) || m.current != EOF) {
        fail(&m, 1, "unexpected trailing JSON data");
    }
}

// is_help_option checks if the string given is any of the help options
bool is_help_option(const char* s) {
    return (s[0] == '-' && s[1] != 0) && (
        strcmp(s, "-h") == 0 ||
        strcmp(s, "--h") == 0 ||
        strcmp(s, "-help") == 0 ||
        strcmp(s, "--help") == 0
    );
}

// is_jsonl_option checks if the string given is any of the JSONL options
bool is_jsonl_option(const char* s) {
    return (s[0] == '-' && s[1] != 0) && (
        strcmp(s, "-jl") == 0 ||
        strcmp(s, "--jl") == 0 ||
        strcmp(s, "-jsonl") == 0 ||
        strcmp(s, "--jsonl") == 0
    );
}

// run returns the error code; its args are the command-line ones, except
// for the app's name
int run(int nargs, char** args) {
    bool jsonl = false;
    if (nargs > 0 && is_jsonl_option(args[0])) {
        jsonl = true;
        nargs--;
        args++;
    }

    if (nargs > 0 && strcmp(args[0], "--") == 0) {
        nargs--;
        args++;
    }

    if (nargs > 1) {
        const char* msg = "can't use more than 1 named input";
        fprintf(stderr, ERROR_LINE("%s"), msg);
        return 1;
    }

    // use stdin when not given a filepath
    if (nargs == 0 || strcmp(args[0], "") == 0 || strcmp(args[0], "-") == 0) {
        handle_input(stdin, jsonl);
        return 0;
    }

    const char* path = args[0];
    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
        return 1;
    }

    handle_input(f, jsonl);
    fclose(f);

    return 0;
}

int main(int argc, char** argv) {
#ifdef _WIN32
    setmode(fileno(stdin), O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    setmode(fileno(stdout), O_BINARY);
    setmode(fileno(stderr), O_BINARY);
#endif

    if (argc > 1 && is_help_option(argv[1])) {
        printf("%s", info);
        return 0;
    }

    // the dispatch table starts as all null function-pointers
    for (size_t i = 0; i < sizeof(dispatch) / sizeof(dispatch[0]); i++) {
        dispatch[i] = handle_invalid_token;
    }

    for (size_t i = '0'; i <= '9'; i++) {
        dispatch[i] = handle_number;
    }

    dispatch['n'] = handle_null;
    dispatch['t'] = handle_true;
    dispatch['f'] = handle_false;
    dispatch['N'] = handle_capital_none;
    dispatch['T'] = handle_capital_true;
    dispatch['F'] = handle_capital_false;
    dispatch['.'] = handle_dot;
    dispatch['+'] = handle_plus_number;
    dispatch['-'] = handle_minus_number;
    dispatch['"'] = handle_string;
    dispatch['\''] = handle_string;
    dispatch['['] = handle_array;
    dispatch['{'] = handle_object;

    // enable full/block-buffering for standard output
    setvbuf(stdout, NULL, _IOFBF, 0);

    return run(argc - 1, argv + 1) == 0 ? 0 : 1;
}