/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O3 -march=native -mtune=native -flto -o ./json0 ./json0.c

To build a unit-testing app run

cc -Wall -s -D TESTING -o ./json0_test ./json0.c

Defining JSON0_NO_MAIN leaves out func main, so the converter can be embedded
in another program or in an external test harness.
*/

#ifdef TESTING
// fmemopen, which only the unit tests use, is a POSIX function
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif
#endif

#include <ctype.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

#ifdef RED_ERRORS
// pick exactly one style: redefining a macro with a different body is an error
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define RESET_STYLE "\x1b[0m"
#else
#define ERROR_STYLE
#define RESET_STYLE
#endif

#define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")

#ifndef IBUF_SIZE
#define IBUF_SIZE (32 * 1024)
#endif

#ifndef OBUF_SIZE
#define OBUF_SIZE (8 * 1024)
#endif

const char* info = ""
"json0 [options...] [file...]\n"
"\n"
"\n"
"JSON-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.\n"
"Its output is always a single line, which ends with a line-feed.\n"
"\n"
"Besides minimizing bytes, this tool also adapts almost-JSON input into\n"
"valid JSON, since it\n"
"\n"
"    - ignores both rest-of-line and multi-line comments\n"
"    - ignores extra/trailing commas in arrays and objects\n"
"    - turns single-quoted strings/keys into double-quoted strings\n"
"    - double-quotes unquoted object keys\n"
"    - changes \\x 2-hex-digit into \\u 4-hex-digit string-escapes\n"
"\n"
"All options available can either start with a single or a double-dash\n"
"\n"
"    -h        show this help message\n"
"    -help     show this help message\n"
"    -jsonl    emit JSON Lines, when top-level value is an array\n"
"";

// j0_maker holds the whole state of a conversion: the buffered input, the
// buffered output, and a 2-byte lookahead window (current and next)
typedef struct j0_maker {
    FILE* in;
    FILE* out;

    unsigned char* ibuf;
    size_t ilen; // how many bytes are being used in the input buffer
    size_t icap; // the input buffer's capacity
    size_t ipos; // the current position in the input buffer

    size_t line; // the current line, used to show useful error messages
    size_t pos;  // the position in the current line, for error messages

    unsigned char* obuf;
    size_t ocap; // the output buffer's capacity
    size_t opos; // the current position in the output buffer

    int current; // the byte being handled, or EOF
    int next;    // the byte right after the current one, or EOF
} j0_maker;

// token_handler is the type of all funcs in the dispatch table
typedef void (*token_handler)(j0_maker*);

// advance_reader_pos helps func read_byte do its job
static inline void advance_reader_pos(j0_maker* r, unsigned char b) {
    r->ipos++;
    if (b == '\n') {
        r->line++;
        r->pos = 1;
    } else {
        r->pos++;
    }
}

// read_byte does as it says: check its return for the value EOF, before
// using it as the next byte
static inline int read_byte(j0_maker* r) {
    if (r->ipos >= r->ilen) {
        // need to read the next block
        r->ipos = 0;
        r->ilen = fread(r->ibuf, sizeof(unsigned char), r->icap, r->in);
        if (r->ilen == 0) {
            // reached the end of data
            return EOF;
        }
    }

    const unsigned char b = r->ibuf[r->ipos];
    advance_reader_pos(r, b);
    return b;
}

// advance is used in most of the code, instead of calling read_byte directly
static inline void advance(j0_maker* r) {
    r->current = r->next;
    r->next = read_byte(r);
}

void fail(j0_maker* m, int code, const char* msg);

// skip_line ignores all bytes up to and including the next line-feed
void skip_line(j0_maker* r) {
    while (true) {
        advance(r);
        const int lead = r->current;

        if (lead == EOF) {
            return;
        }

        if (lead == '\n') {
            advance(r);
            return;
        }
    }
}

// skip_multiline_comment ignores all bytes up to and including the next `*/`;
// a comment which is still open when the input ends is silently accepted
void skip_multiline_comment(j0_maker* r) {
    unsigned char prev = 0;

    while (true) {
        advance(r);
        const int lead = r->current;

        if (lead == EOF) {
            return;
        }

        if (prev == '*' && lead == '/') {
            advance(r);
            return;
        }

        prev = (unsigned char)lead;
    }
}

// skip_comment handles `#` and `//` rest-of-line comments, as well as
// `/* ... */` comments; it fails when the current byte starts none of those
void skip_comment(j0_maker* r) {
    if (r->current == '#') {
        skip_line(r);
        return;
    }

    if (r->current != '/') {
        fail(r, 1, "expected a slash to start comments");
    }

    advance(r);

    switch (r->current) {
    case '/':
        skip_line(r);
        return;
    case '*':
        skip_multiline_comment(r);
        return;
    default:
        fail(r, 1, "expected `//` or `/*` to start comments");
    }
}

// seek_token skips whitespace (any byte up to the space) and comments, until
// the current byte starts a token, or the input is over
static inline void seek_token(j0_maker* r) {
    while (true) {
        const int lead = r->current;

        if (lead != EOF && lead <= ' ') {
            advance(r);
            continue;
        }

        if (lead == '/' || lead == '#') {
            skip_comment(r);
            continue;
        }

        return;
    }
}

// starts_with_bom checks if the bytes given start with the UTF-8 BOM
bool starts_with_bom(const unsigned char* b, const size_t n) {
    return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}

// restart_state ties the streams given to the state, empties both buffers,
// primes the 2-byte lookahead window, and skips a leading UTF-8 BOM
void restart_state(j0_maker* m, FILE* w, FILE* r) {
    m->in = r;
    m->ilen = 0;
    m->ipos = 0;

    m->out = w;
    m->opos = 0;

    m->line = 1;
    m->pos = 1;

    m->current = EOF;
    m->next = EOF;

    m->current = read_byte(m);
    if (m->current == EOF) {
        return;
    }
    m->next = read_byte(m);

    // skip leading UTF-8 BOM (byte-order mark), if present
    if (starts_with_bom(m->ibuf, m->ilen)) {
        // a UTF-8 BOM has 3 bytes
        for (size_t i = 0; i < 3 && m->current != EOF; i++) {
            advance(m);
        }
    }
}

// write_byte buffers the byte given, emptying a full buffer first
void write_byte(j0_maker* m, unsigned char b) {
    if (m->opos >= m->ocap) {
        fwrite(m->obuf, 1, m->opos, m->out);
        m->opos = 0;
    }
    m->obuf[m->opos++] = b;
}

// write_bytes does as it says, minimizing the number of calls to fwrite
void write_bytes(j0_maker* m, const unsigned char* src, size_t len) {
    const size_t rem = m->ocap - m->opos;
    if (len < rem) {
        memcpy(m->obuf + m->opos, src, len);
        m->opos += len;
        return;
    }

    for (size_t i = 0; i < len; i++) {
        write_byte(m, src[i]);
    }
}

// flush empties the output buffer, and flushes the output stream as well
void flush(j0_maker* m) {
    if (m->opos > 0) {
        fwrite(m->obuf, 1, m->opos, m->out);
    }
    m->opos = 0;
    fflush(m->out);
}

// output_failed tells whether it's pointless to keep emitting output
static inline bool output_failed(const j0_maker* m) {
    return ferror(m->out) || feof(m->out);
}

// https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/

static inline bool is_continuation_byte(int b) {
    return 0x80 <= b && b <= 0xbf;
}

static inline bool check_2_byte_rune(int a, int b) {
    return (0xc2 <= a && a <= 0xdf) && is_continuation_byte(b);
}

bool check_3_byte_rune(int a, int b, int c) {
    if (!is_continuation_byte(c)) {
        return false;
    }

    if (a == 0xe0) {
        // lower values for the 2nd byte would be overlong encodings
        return 0xa0 <= b && b <= 0xbf;
    }
    if (a == 0xed) {
        // higher values for the 2nd byte would encode surrogates
        return 0x80 <= b && b <= 0x9f;
    }
    return (0xe1 <= a && a <= 0xef) && is_continuation_byte(b);
}

// check_4_byte_rune follows table 3-7 of the unicode standard: lead bytes
// f1, f2, and f3 allow any continuation bytes, f0 excludes overlong values,
// and f4 excludes values beyond U+10FFFF by limiting its 2nd byte
bool check_4_byte_rune(int a, int b, int c, int d) {
    if (!is_continuation_byte(c) || !is_continuation_byte(d)) {
        return false;
    }

    if (a == 0xf0) {
        return 0x90 <= b && b <= 0xbf;
    }
    if (a == 0xf4) {
        return 0x80 <= b && b <= 0x8f;
    }
    return (0xf1 <= a && a <= 0xf3) && is_continuation_byte(b);
}

// write_replacement_char is the recommended action to handle invalid bytes
void write_replacement_char(j0_maker* m) {
    write_byte(m, 0xef);
    write_byte(m, 0xbf);
    write_byte(m, 0xbd);
}

// handle_invalid_rune is lenient by design: invalid bytes are replaced,
// instead of failing the whole conversion
void handle_invalid_rune(j0_maker* m) {
    write_replacement_char(m);
}

// write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8
void write_rune(j0_maker* m, uint32_t rune) {
    if (rune < (1 << 7)) {
        write_byte(m, rune);
        return;
    }

    if (rune < (1 << (5 + 6))) {
        const int a = 0xc0 | (rune >> 6);
        const int b = 0x80 | (rune & 0x3f);
        if (check_2_byte_rune(a, b)) {
            write_byte(m, a);
            write_byte(m, b);
        } else {
            write_replacement_char(m);
        }
        return;
    }

    if (rune < (1 << (4 + 6 + 6))) {
        const int a = 0xe0 | (rune >> 12);
        const int b = 0x80 | ((rune >> 6) & 0x3f);
        const int c = 0x80 | (rune & 0x3f);
        if (check_3_byte_rune(a, b, c)) {
            write_byte(m, a);
            write_byte(m, b);
            write_byte(m, c);
        } else {
            write_replacement_char(m);
        }
        return;
    }

    if (rune < (1 << (3 + 6 + 6 + 6))) {
        const int a = 0xf0 | (rune >> 18);
        const int b = 0x80 | ((rune >> 12) & 0x3f);
        const int c = 0x80 | ((rune >> 6) & 0x3f);
        const int d = 0x80 | (rune & 0x3f);
        if (check_4_byte_rune(a, b, c, d)) {
            write_byte(m, a);
            write_byte(m, b);
            write_byte(m, c);
            write_byte(m, d);
        } else {
            write_replacement_char(m);
        }
        return;
    }

    write_replacement_char(m);
}

// copy_utf8_rune copies the rune starting at the current byte, leaving the
// rune's last byte as the current one. Invalid sequences are replaced: only
// real continuation bytes are ever consumed, so a broken sequence can't
// swallow the ASCII bytes after it, such as a string's closing quote.
void copy_utf8_rune(j0_maker* m) {
    const int lead = m->current;

    if (lead == EOF) {
        return;
    }

    // handle 1-byte runes
    if (lead < 0x80) {
        write_byte(m, (unsigned char)lead);
        return;
    }

    // the lead byte announces how many continuation bytes must follow it
    size_t extra = 0;
    if (0xc2 <= lead && lead <= 0xdf) {
        extra = 1;
    } else if (0xe0 <= lead && lead <= 0xef) {
        extra = 2;
    } else if (0xf0 <= lead && lead <= 0xf4) {
        extra = 3;
    } else {
        // stray continuation byte, or a byte which can't start any rune
        handle_invalid_rune(m);
        return;
    }

    int seq[4] = {lead, 0, 0, 0};
    for (size_t i = 1; i <= extra; i++) {
        if (!is_continuation_byte(m->next)) {
            // truncated rune: the next byte is left for the caller to handle
            handle_invalid_rune(m);
            return;
        }
        advance(m);
        seq[i] = m->current;
    }

    bool valid = false;
    switch (extra) {
    case 1:
        valid = check_2_byte_rune(seq[0], seq[1]);
        break;
    case 2:
        valid = check_3_byte_rune(seq[0], seq[1], seq[2]);
        break;
    default:
        valid = check_4_byte_rune(seq[0], seq[1], seq[2], seq[3]);
        break;
    }

    if (!valid) {
        handle_invalid_rune(m);
        return;
    }

    for (size_t i = 0; i <= extra; i++) {
        write_byte(m, (unsigned char)seq[i]);
    }
}

// debug is available to diagnose any bug found: it shows the printf-style
// message given, along with the current input position, and quits the app
void debug(j0_maker* m, const char* fmt, ...) {
    const unsigned long line = m->line;
    const unsigned long pos = m->pos;

    write_byte(m, '\n');
    flush(m);

    fprintf(stderr, "\x1b[46m\x1b[37mline %lu, pos %lu: ", line, pos);

    // a va_list can only be given to the v-variants of the printf funcs
    va_list args;
    va_start(args, fmt);
    vfprintf(stderr, fmt, args);
    va_end(args);

    fprintf(stderr, "\x1b[0m\n");

    if (m->in != stdin) {
        fclose(m->in);
    }

    exit(10);
}

// fail quits this app right after showing the error message given
void fail(j0_maker* m, int code, const char* msg) {
    const unsigned long line = m->line;
    const unsigned long pos = m->pos;

    write_byte(m, '\n');
    flush(m);
    fprintf(stderr, ERROR_LINE("line %lu, pos %lu: %s"), line, pos, msg);
    exit(code);
}

// demand_keyword consumes the bytes of the keyword given, and returns false
// as soon as the input differs from it
bool demand_keyword(j0_maker* m, const char* rest) {
    for (; rest[0] != 0; rest++) {
        // the keyword's bytes are never EOF, so this also checks for it
        if (m->current != rest[0]) {
            return false;
        }
        advance(m);
    }

    return true;
}

// handle_keyword is the logic shared by all keyword-handling funcs
static void handle_keyword(
    j0_maker* m, const char* keyword, const char* json, const char* error
) {
    if (!demand_keyword(m, keyword)) {
        fail(m, 1, error);
    }
    write_bytes(m, (const unsigned char*)json, strlen(json));
}

void handle_null(j0_maker* m) {
    handle_keyword(m, "null", "null", "expected `null` keyword");
}

void handle_true(j0_maker* m) {
    handle_keyword(m, "true", "true", "expected `true` keyword");
}

void handle_false(j0_maker* m) {
    handle_keyword(m, "false", "false", "expected `false` keyword");
}

// the capitalized keywords are the python-style ones

void handle_capital_none(j0_maker* m) {
    handle_keyword(m, "None", "null", "expected `None` keyword");
}

void handle_capital_true(j0_maker* m) {
    handle_keyword(m, "True", "true", "expected `True` keyword");
}

void handle_capital_false(j0_maker* m) {
    handle_keyword(m, "False", "false", "expected `False` keyword");
}

// handle_digits copies a run of 1 or more decimal digits
void handle_digits(j0_maker* m) {
    if (!isdigit(m->current)) {
        fail(m, 1, "expected/missing digits");
    }

    while (isdigit(m->current)) {
        write_byte(m, m->current);
        advance(m);
    }
}

static inline bool starts_exponent(int c) {
    return c == 'e' || c == 'E';
}

// handle_exponent copies the exponent starting at the current `e` or `E`,
// dropping the redundant plus sign, if present
static void handle_exponent(j0_maker* m) {
    write_byte(m, m->current);
    advance(m);

    if (m->current == '+') {
        advance(m);
    } else if (m->current == '-') {
        write_byte(m, '-');
        advance(m);
    }

    handle_digits(m);
}

// handle_number handles unsigned numbers which start with a digit: integer
// digits, then optional decimals, then an optional exponent
void handle_number(j0_maker* m) {
    // JSON numbers can't have redundant leading zeros
    while (m->current == '0' && isdigit(m->next)) {
        advance(m);
    }

    handle_digits(m);

    if (m->current == '.') {
        write_byte(m, '.');
        advance(m);

        if (isdigit(m->current)) {
            handle_digits(m);
        } else {
            // JSON numbers can't end with a decimal dot
            write_byte(m, '0');
        }
    }

    // exponents can follow both integers and decimals
    if (starts_exponent(m->current)) {
        handle_exponent(m);
    }
}

// handle_dot handles numbers which start with a decimal dot, giving them
// the leading zero JSON requires
void handle_dot(j0_maker* m) {
    write_byte(m, '0');
    write_byte(m, '.');
    advance(m);

    if (!isdigit(m->current)) {
        fail(m, 1, "expected/missing digits after decimal dot");
    }
    handle_digits(m);

    if (starts_exponent(m->current)) {
        handle_exponent(m);
    }
}

// handle_plus_number drops the leading plus sign, which JSON doesn't allow
void handle_plus_number(j0_maker* m) {
    advance(m);

    if (m->current == '.') {
        handle_dot(m);
        return;
    }
    handle_number(m);
}

void handle_minus_number(j0_maker* m) {
    write_byte(m, '-');
    advance(m);

    if (m->current == '.') {
        handle_dot(m);
        return;
    }
    handle_number(m);
}

// decode_hex assumes valid hex digits, checked by func is_valid_hex
uint32_t decode_hex(unsigned char hex) {
    if ('0' <= hex && hex <= '9') {
        return hex - '0';
    }
    if ('A' <= hex && hex <= 'F') {
        return hex - 'A' + 10;
    }
    if ('a' <= hex && hex <= 'f') {
        return hex - 'a' + 10;
    }
    return 0xffff;
}

static inline bool is_valid_hex(unsigned char b) {
    return false ||
        ('0' <= b && b <= '9') ||
        ('A' <= b && b <= 'F') ||
        ('a' <= b && b <= 'f');
}

// handle_low_char ensures characters whose ASCII codes are lower than spaces
// are properly escaped for strings: JSON only has short escapes for 5 of
// them, so all others (vertical tabs included) become \u escapes
void handle_low_char(j0_maker* m, int c) {
    const char* hex = "0123456789ABCDEF";
    unsigned char letter = 0;

    switch (c) {
    case '\t':
        letter = 't';
        break;
    case '\n':
        letter = 'n';
        break;
    case '\r':
        letter = 'r';
        break;
    case '\b':
        letter = 'b';
        break;
    case '\f':
        letter = 'f';
        break;
    default:
        break;
    }

    write_byte(m, '\\');
    if (letter != 0) {
        write_byte(m, letter);
        return;
    }

    write_byte(m, 'u');
    write_byte(m, '0');
    write_byte(m, '0');
    write_byte(m, hex[c / 16]);
    write_byte(m, hex[c % 16]);
}

// write_inner_string_hex_quad emits the code point which the 4 valid hex
// digits given stand for, in its shortest valid form for JSON strings
void write_inner_string_hex_quad(j0_maker* m, const unsigned char quad[4]) {
    const uint32_t n = 0 +
        (decode_hex(quad[0]) << 12) +
        (decode_hex(quad[1]) << 8) +
        (decode_hex(quad[2]) << 4) +
        (decode_hex(quad[3]) << 0);

    switch (n) {
    case '"':
        write_byte(m, '\\');
        write_byte(m, '"');
        return;
    case '\\':
        write_byte(m, '\\');
        write_byte(m, '\\');
        return;
    default:
        break;
    }

    if (0xd800 <= n && n <= 0xdfff) {
        // surrogates only mean something as pairs of escapes, and can't be
        // turned into UTF-8 one at a time, so they're kept as escapes
        write_byte(m, '\\');
        write_byte(m, 'u');
        write_bytes(m, quad, 4);
        return;
    }

    if (n >= ' ') {
        write_rune(m, n);
    } else {
        handle_low_char(m, n);
    }
}

// handle_hex_quad handles the 4 hex digits which follow a `\u`
void handle_hex_quad(j0_maker* m) {
    unsigned char quad[4];

    for (size_t i = 0; i < 4; i++) {
        advance(m);
        const int lead = m->current;
        if (lead == EOF) {
            fail(m, 1, "end of input before end of string");
        }
        if (!is_valid_hex(lead)) {
            fail(m, 1, "invalid hexadecimal digit in string");
        }
        quad[i] = lead;
    }

    write_inner_string_hex_quad(m, quad);
}

// handle_hex_pair handles the 2 hex digits which follow a `\x`
void handle_hex_pair(j0_maker* m) {
    unsigned char quad[4] = {'0', '0', '0', '0'};

    for (size_t i = 2; i < 4; i++) {
        advance(m);
        const int lead = m->current;
        if (lead == EOF) {
            fail(m, 1, "end of input before end of string");
        }
        if (!is_valid_hex(lead)) {
            fail(m, 1, "invalid hexadecimal digit in string");
        }
        quad[i] = lead;
    }

    write_inner_string_hex_quad(m, quad);
}

// handle_string_escape handles the byte which follows a backslash in strings
void handle_string_escape(j0_maker* m, int c) {
    switch (c) {
    case '"':
    case '\\':
    case 'b':
    case 'f':
    case 'n':
    case 'r':
    case 't':
        write_byte(m, '\\');
        write_byte(m, c);
        break;
    case 'u':
        handle_hex_quad(m);
        break;
    case 'x':
        handle_hex_pair(m);
        break;
    case 'v':
        // JSON has no short escape for vertical tabs
        handle_low_char(m, '\v');
        break;
    case '\'':
        write_byte(m, '\'');
        break;
    default:
        // needless escapes are dropped, while still ensuring valid output
        if (c < ' ') {
            handle_low_char(m, c);
        } else if (c >= 0x80) {
            copy_utf8_rune(m);
        } else {
            write_byte(m, c);
        }
        break;
    }
}

// handle_string handles both single-quoted and double-quoted strings,
// always emitting double-quoted ones
void handle_string(j0_maker* m) {
    const int quote = m->current;

    write_byte(m, '"');

    while (true) {
        advance(m);
        int c = m->current;

        if (c == '\\') {
            // the byte after a backslash is never a closing quote
            advance(m);
            c = m->current;
            if (c != EOF) {
                handle_string_escape(m, c);
                continue;
            }
        }

        if (c == EOF) {
            fail(m, 1, "input ended before string was close-quoted");
        }

        if (c == quote) {
            write_byte(m, '"');
            advance(m);
            return;
        }

        if (c == '"') {
            // bare double-quotes in single-quoted strings need escaping
            write_byte(m, '\\');
            write_byte(m, '"');
        } else if (c < ' ') {
            handle_low_char(m, c);
        } else {
            copy_utf8_rune(m);
        }
    }
}

void handle_token(j0_maker* m);

// handle_array handles both `[...]` arrays and `(...)` tuples, ignoring
// all extra commas, and separating items with single commas
void handle_array(j0_maker* m) {
    size_t items = 0;
    const unsigned char end = m->current == '[' ? ']' : ')';
    write_byte(m, '[');
    advance(m);

    while (true) {
        seek_token(m);
        const int lead = m->current;

        if (lead == EOF) {
            fail(m, 1, "unclosed array");
        }

        if (lead == ',') {
            advance(m);
            continue;
        }

        if (lead == end) {
            write_byte(m, ']');
            advance(m);
            return;
        }

        if (items > 0) {
            write_byte(m, ',');
        }
        if (output_failed(m)) {
            return;
        }
        handle_token(m);
        items++;
    }
}

// handle_array_jsonl is a slight variation of func handle_array: this one is
// used to handle top-level arrays when running in JSON Lines mode, to emit
// line-feeds after each item, instead of commas between them
void handle_array_jsonl(j0_maker* m) {
    const unsigned char end = m->current == '[' ? ']' : ')';
    advance(m);

    while (true) {
        seek_token(m);
        const int lead = m->current;

        if (lead == EOF) {
            fail(m, 1, "unclosed array");
        }

        if (lead == ',') {
            advance(m);
            continue;
        }

        if (lead == end) {
            advance(m);
            return;
        }

        if (output_failed(m)) {
            return;
        }

        handle_token(m);
        write_byte(m, '\n');
    }
}

// handle_unquoted_key double-quotes identifier-like object keys: its caller
// has already checked the first byte, which is copied as given
void handle_unquoted_key(j0_maker* m) {
    write_byte(m, '"');

    while (true) {
        int c = m->current;
        if (c == EOF) {
            fail(m, 1, "input ended with an object key");
        }

        write_byte(m, c);
        advance(m);

        c = m->current;
        if (!isalpha(c) && !isdigit(c) && c != '_') {
            break;
        }
    }

    write_byte(m, '"');
}

// handle_object handles `{...}` objects, ignoring all extra commas, and
// allowing both single-quoted and unquoted keys
void handle_object(j0_maker* m) {
    size_t items = 0;
    write_byte(m, '{');
    advance(m);

    while (true) {
        seek_token(m);
        int lead = m->current;

        if (lead == EOF) {
            fail(m, 1, "unclosed object");
        }

        if (lead == ',') {
            advance(m);
            continue;
        }

        if (lead == '}') {
            write_byte(m, '}');
            advance(m);
            return;
        }

        if (output_failed(m)) {
            return;
        }

        const bool quoted = lead == '"' || lead == '\'';
        if (!quoted && !isalpha(lead) && lead != '_') {
            fail(m, 1, "only strings or identifiers can be object keys");
        }

        if (items > 0) {
            write_byte(m, ',');
        }

        if (quoted) {
            handle_string(m);
        } else {
            handle_unquoted_key(m);
        }

        seek_token(m);
        lead = m->current;

        if (lead == EOF) {
            fail(m, 1, "input ended after object-key and before value");
        }

        if (lead != ':') {
            fail(m, 1, "a `:` must follow all object keys");
        }

        write_byte(m, ':');
        advance(m);

        seek_token(m);
        if (m->current == EOF) {
            fail(m, 1, "input ended after a `:` following an object-key");
        }

        handle_token(m);
        items++;
    }
}

// dispatch ties leading bytes/chars in tokens to the funcs which handle them:
// bytes which can't start any token are left as null entries
token_handler dispatch[256] = {
    ['0'] = handle_number,
    ['1'] = handle_number,
    ['2'] = handle_number,
    ['3'] = handle_number,
    ['4'] = handle_number,
    ['5'] = handle_number,
    ['6'] = handle_number,
    ['7'] = handle_number,
    ['8'] = handle_number,
    ['9'] = handle_number,
    ['n'] = handle_null,
    ['t'] = handle_true,
    ['f'] = handle_false,
    ['N'] = handle_capital_none,
    ['T'] = handle_capital_true,
    ['F'] = handle_capital_false,
    ['.'] = handle_dot,
    ['+'] = handle_plus_number,
    ['-'] = handle_minus_number,
    ['"'] = handle_string,
    ['\''] = handle_string,
    ['['] = handle_array,
    ['('] = handle_array,
    ['{'] = handle_object,
};

// handle_invalid_token shows an error message and quits the app right after
void handle_invalid_token(j0_maker* m) {
    char msg[64];
    const unsigned char c = (unsigned char)m->current;
    snprintf(msg, sizeof(msg), "%c (%d): invalid token", c, c);
    fail(m, 1, msg);
}

// handle_token handles the value which starts at the current byte
void handle_token(j0_maker* m) {
    const int lead = m->current;
    if (lead == EOF) {
        // EOF is negative, so it can't be used to index the dispatch table
        fail(m, 1, "input ended where a value was expected");
    }

    const token_handler handler = dispatch[(unsigned char)lead];
    if (handler == NULL) {
        handle_invalid_token(m);
        return;
    }
    handler(m);
}

// handle_input converts a whole input stream, which must have exactly 1
// top-level value; it quits the app when the input has errors
void handle_input(FILE* out, FILE* src, bool jsonl) {
    unsigned char ibuf[IBUF_SIZE];
    unsigned char obuf[OBUF_SIZE];

    j0_maker m;
    m.ibuf = ibuf;
    m.icap = sizeof(ibuf);
    m.obuf = obuf;
    m.ocap = sizeof(obuf);
    restart_state(&m, out, src);

    // ignore leading whitespace/comment bytes, if present
    seek_token(&m);

    if (m.current == EOF) {
        fail(&m, 1, "empty input isn't valid JSON");
    }

    if (jsonl && m.current == '[') {
        handle_array_jsonl(&m);
    } else {
        handle_token(&m);
        write_byte(&m, '\n');
    }
    flush(&m);

    // ignore trailing whitespace/comment bytes, if present
    seek_token(&m);

    // ignore trailing semicolon, if present
    if (m.current == ';') {
        advance(&m);
        // ignore trailing whitespace/comment bytes, if present
        seek_token(&m);
    }

    if (!feof(src) || m.current != EOF) {
        fail(&m, 1, "unexpected trailing JSON data");
    }
}

bool is_help_option(const char* s) {
    return (s[0] == '-' && s[1] != 0) && (
        strcmp(s, "-h") == 0 ||
        strcmp(s, "--h") == 0 ||
        strcmp(s, "-help") == 0 ||
        strcmp(s, "--help") == 0
    );
}

bool is_jsonl_option(const char* s) {
    return (s[0] == '-' && s[1] != 0) && (
        strcmp(s, "-jl") == 0 ||
        strcmp(s, "--jl") == 0 ||
        strcmp(s, "-jsonl") == 0 ||
        strcmp(s, "--jsonl") == 0
    );
}

// run returns the error code
int run(int nargs, char** args) {
    bool jsonl = false;
    if (nargs > 0 && is_jsonl_option(args[0])) {
        jsonl = true;
        nargs--;
        args++;
    }

    if (nargs > 0 && strcmp(args[0], "--") == 0) {
        nargs--;
        args++;
    }

    if (nargs > 1) {
        const char* msg = "can't use more than 1 named input";
        fprintf(stderr, ERROR_LINE("%s"), msg);
        return 1;
    }

    // use stdin when not given a filepath
    if (nargs == 0 || strcmp(args[0], "") == 0 || strcmp(args[0], "-") == 0) {
        handle_input(stdout, stdin, jsonl);
        return 0;
    }

    const char* path = args[0];
    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
        return 1;
    }

    handle_input(stdout, f, jsonl);
    fclose(f);

    return 0;
}

#ifdef TESTING
// run_test converts the input given using in-memory streams, and checks the
// result, ignoring its trailing line-feed
bool run_test(const char* name, const char* input, const char* expected) {
    unsigned char result[OBUF_SIZE];

    fprintf(stdout, "running test named \"%s\"\n", name);

    FILE* in = fmemopen((void*)input, strlen(input), "rb");
    if (in == NULL) {
        fprintf(stdout, "fmemopen failed\n");
        return false;
    }

    memset(result, 0, sizeof(result));
    FILE* out = fmemopen((void*)result, sizeof(result), "wb");
    if (out == NULL) {
        fprintf(stdout, "fmemopen failed\n");
        fclose(in);
        return false;
    }

    handle_input(out, in, false);

    fclose(out);
    fclose(in);

    // remove trailing line-feed from the result
    for (size_t i = sizeof(result); i-- > 0;) {
        if (result[i] == '\n') {
            result[i] = 0;
            break;
        }
    }

    const bool ok = strcmp((char*)result, expected) == 0;
    if (!ok) {
        fprintf(stdout, "  input:    %s\n", input);
        fprintf(stdout, "  expected: %s\n", expected);
        fprintf(stdout, "  result:   %s\n", result);
    }
    return ok;
}

int test(void) {
    typedef struct test_case {
        const char* name;
        const char* input;
        const char* expected;
    } test_case;

    test_case cases[] = {
        {"null", "null", "null"},
        {"false", "false", "false"},
        {"true", "true", "true"},
        {"None", "None", "null"},
        {"False", "False", "false"},
        {"True", "True", "true"},
        {"zero", "0", "0"},
        {"zero with decimals", "0.0000", "0.0000"},
        {"negative number", "-1230.324", "-1230.324"},
        {"leading plus", "+1230.324", "1230.324"},
        {"leading dot", ".123", "0.123"},
        {"leading negative dot", "-.123", "-0.123"},
        {"leading positive dot", "+.123", "0.123"},
        {"redundant leading zeros", "007", "7"},
        {"exponent after decimals", "1.5e+3", "1.5e3"},
        {"exponent after leading dot", ".5E-2", "0.5E-2"},
        {"empty string", "\"\"", "\"\""},
        {"single-quoted string", "'abc def'", "\"abc def\""},
        {
            "single-quoted string with double-quotes in it",
            "'say \"hi\"'",
            "\"say \\\"hi\\\"\"",
        },
        {
            "string with double-quotes in it",
            "\"\\\"cats and dogs\\\" goes the saying\"",
            "\"\\\"cats and dogs\\\" goes the saying\"",
        },
        {
            "string with escaped hex-digit values in it",
            "\"\\x00\\u0000\\x09\"",
            "\"\\u0000\\u0000\\t\"",
        },
        {"string with vertical-tab escape", "\"\\v\"", "\"\\u000B\""},
        {
            "string with escaped surrogate pair",
            "\"\\ud83d\\ude00\"",
            "\"\\ud83d\\ude00\"",
        },
        {
            "string with invalid byte before closing quote",
            "\"\xff\"",
            "\"\xef\xbf\xbd\"",
        },
        {"empty array", "[]", "[]"},
        {"empty array, extra comma", "[ , ]", "[]"},
        {"empty object", "{}", "{}"},
        {"empty object, extra commas", "{,, , ,,}", "{}"},
        {"numeric array", "[,,1, 2, 3, ]", "[1,2,3]"},
        {"simple nested array", "[1, 2, 3, []]", "[1,2,3,[]]"},
        {
            "another simple nested array",
            "[1, 2, 3, [false,\"abc\"]]",
            "[1,2,3,[false,\"abc\"]]",
        },
        {
            "fancier nested array",
            "[1, 2, 3, [[ -.233, false,] , , ,,, 'abc']]",
            "[1,2,3,[[-0.233,false],\"abc\"]]",
        },
        {
            "simple object, extra commas",
            "{,'abc' : 123, , ,'def': 987,}",
            "{\"abc\":123,\"def\":987}",
        },
        {
            "simple object, extra commas, unquoted object keys",
            "{,abc : 123, , ,def: 987,}",
            "{\"abc\":123,\"def\":987}",
        },
        {
            "numeric array with trailing single-line comment",
            "[1, 2, 3, ] // comments aren't valid JSON",
            "[1,2,3]",
        },
        {
            "numeric array with comments",
            "/* hi there */ [1, 2, /* 3 better be next */ 3, ]"
            " // I'll have the last word # you wish",
            "[1,2,3]",
        },
        {
            "self-compacting shebang",
            "#!/usr/bin/json0\n[, 1 , , 2 , , , 3]",
            "[1,2,3]",
        },
        {
            "pyon example 1",
            "[True,False,'abc\\x0adef',None,+12.45]",
            "[true,false,\"abc\\ndef\",null,12.45]",
        },
        {
            "pyon example 2",
            "[{'abc':123},'abc\\x0adef',None,+12.45]",
            "[{\"abc\":123},\"abc\\ndef\",null,12.45]",
        },
    };

    size_t errors = 0;
    for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
        const char* s = cases[i].name;
        if (!run_test(s, cases[i].input, cases[i].expected)) {
            fprintf(stdout, "\x1b[31mtest named \"%s\" failed\x1b[0m\n", s);
            errors++;
        }
    }
    return errors == 0 ? 0 : 1;
}
#endif

#ifndef JSON0_NO_MAIN
int main(int argc, char** argv) {
#ifdef _WIN32
    setmode(fileno(stdin), O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    setmode(fileno(stdout), O_BINARY);
    setmode(fileno(stderr), O_BINARY);
#endif

#ifdef TESTING
    (void)argc;
    (void)argv;
    return test();
#else
    if (argc > 1 && is_help_option(argv[1])) {
        printf("%s", info);
        return 0;
    }

    // enable full/block-buffering for standard output
    setvbuf(stdout, NULL, _IOFBF, 0);

    return run(argc - 1, argv + 1) == 0 ? 0 : 1;
#endif
}
#endif