/* File: j0.c */
/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O2 -o ./j0 ./j0.c
*/

#include <ctype.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <io.h>
#include <windows.h>
#endif

const char* info = ""
"j0 [options...] [file...]\n"
"\n"
"\n"
"Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.\n"
"Its output is always a single line, which ends with a line-feed.\n"
"\n"
"Besides minimizing bytes, this tool also adapts almost-JSON input into\n"
"valid JSON, since it\n"
"\n"
" - ignores both rest-of-line and multi-line comments\n"
" - ignores extra/trailing commas in arrays and objects\n"
" - turns single-quoted strings/keys into double-quoted strings\n"
" - double-quotes unquoted object keys\n"
" - changes \\x 2-hex-digit into \\u 4-hex-digit string-escapes\n"
"\n"
"All options available can either start with a single or a double-dash\n"
"\n"
" -h show this help message\n"
" -help show this help message\n"
" -jsonl emit JSON Lines, when top-level value is an array\n"
"";

// j0_maker holds all the state needed to turn 1 input into its JSON output
typedef struct j0_maker {
    FILE* in;
    unsigned char* ibuf;
    size_t ilen; // how many bytes are being used in the input buffer
    size_t icap; // the input buffer's capacity
    size_t ipos; // the current position in the input buffer

    FILE* out;

    size_t line; // the current line, used to show useful error messages
    size_t pos;  // the position in the current line, for error messages

    int current; // the byte being handled, or EOF
    int next;    // the byte right after the current one, or EOF
} j0_maker;

// advance_reader_pos helps func read_byte do its job: it moves past the
// byte given, keeping the line/position counters up to date
void advance_reader_pos(j0_maker* r, unsigned char b) {
    r->ipos++;
    if (b == '\n') {
        r->line++;
        r->pos = 1;
    } else {
        r->pos++;
    }
}

// read_byte does as it says: check its return for the value EOF, before
// using it as the next byte
int read_byte(j0_maker* r) {
    if (r->ipos < r->ilen) {
        // inside current chunk
        const unsigned char b = r->ibuf[r->ipos];
        advance_reader_pos(r, b);
        return b;
    }

    // need to read the next block
    r->ipos = 0;
    r->ilen = fread(r->ibuf, sizeof(unsigned char), r->icap, r->in);
    if (r->ilen > 0) {
        const unsigned char b = r->ibuf[r->ipos];
        advance_reader_pos(r, b);
        return b;
    }

    // reached the end of data
    return EOF;
}

// advance is used in most of the code, instead of calling read_byte directly:
// it shifts the 1-byte lookahead into `current`, and refills the lookahead
void advance(j0_maker* r) {
    r->current = r->next;
    r->next = read_byte(r);
}

void fail(j0_maker* s, int code, const char* msg);

// skip_line skips everything up to and including the next line-feed, or up
// to the end of the input, whichever comes first
void skip_line(j0_maker* r) {
    while (true) {
        advance(r);
        if (r->current == EOF) {
            break;
        }

        if (r->current == '\n') {
            advance(r);
            break;
        }
    }
}

// skip_multiline_comment expects to start on the `*` of a leading `/*`, and
// stops right after the closing `*/`, or at the end of the input
void skip_multiline_comment(j0_maker* r) {
    // starting from 0 keeps the opening `*` from pairing with a slash right
    // after it, so `/*/` doesn't count as a complete comment
    unsigned char prev = 0;

    while (true) {
        advance(r);

        if (r->current == EOF) {
            break;
        }

        if (prev == '*' && r->current == '/') {
            advance(r);
            break;
        }

        prev = (unsigned char)r->current;
    }
}

// skip_comment skips either kind of comment, failing when the current
// bytes don't start one
void skip_comment(j0_maker* r) {
    if (r->current != '/') {
        fail(r, 1, "expected a slash to start comments");
    }
    advance(r);

    if (r->current == '/') {
        skip_line(r);
        return;
    }

    if (r->current == '*') {
        skip_multiline_comment(r);
        return;
    }

    fail(r, 1, "expected `//` or `/*` to start comments");
}

// seek_token skips whitespace/control bytes and comments, stopping at the
// first byte of the next token, or at the end of the input
void seek_token(j0_maker* r) {
    while (true) {
        // EOF is negative, so it must be ruled out before the range check
        if (r->current != EOF && r->current <= ' ') {
            advance(r);
            continue;
        }

        if (r->current == '/') {
            skip_comment(r);
            continue;
        }

        break;
    }
}

// starts_with_bom checks if the bytes given start with a UTF-8 BOM
bool starts_with_bom(const unsigned char* b, const size_t n) {
    return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}

// restart_state (re)initializes the state given to read from `r` and write
// to `w`; the input buffer and its capacity must already be set
void restart_state(j0_maker* s, FILE* w, FILE* r) {
    s->in = r;
    s->ilen = 0;
    s->ipos = 0;

    s->out = w;

    s->line = 1;
    s->pos = 1;

    s->current = EOF;
    s->next = EOF;

    s->current = read_byte(s);
    if (s->current == EOF) {
        return;
    }
    s->next = read_byte(s);

    // skip leading UTF-8 BOM (byte-order mark), if present
    if (starts_with_bom(s->ibuf, s->ilen)) {
        // a UTF-8 BOM has 3 bytes
        for (size_t i = 0; i < 3 && s->current != EOF; i++) {
            advance(s);
        }
    }
}

// write_bytes does as it says, minimizing the number of calls to fwrite
void write_bytes(j0_maker* w, const unsigned char* src, size_t len) {
    if (len > 0 && fwrite(src, len, 1, w->out) < 1) {
        // NOTE(review): presumably meant to quit quietly when the output
        // is closed early; confirm feof is the right check for that
        if (feof(w->out)) {
            exit(0);
        }

        fail(w, 1, "failed to write more output");
    }
}

// write_byte emits a single byte; it's `static inline` because a plain
// `inline` definition provides no external symbol, which breaks linking
// whenever the compiler chooses not to inline a call
static inline void write_byte(j0_maker* w, unsigned char b) {
    putc(b, w->out);
}

// debug is available to diagnose any bug found: it shows the printf-style
// message given, along with the current input line/position, and quits
void debug(j0_maker* s, const char* fmt, ...) {
    va_list args;
    va_start(args, fmt);

    if (s->in != stdin) {
        fclose(s->in);
    }

    write_byte(s, '\n');

    const unsigned long line = s->line;
    const unsigned long pos = s->pos;
    fprintf(stderr, "\x1b[46m\x1b[37mline %lu, pos %lu: ", line, pos);
    // a va_list can only be given to the v-variants of the printf funcs
    vfprintf(stderr, fmt, args);
    fprintf(stderr, "\x1b[0m\n");

    va_end(args);

    exit(10);
}

// fail quits this app with the exit code given, after showing the error
// message given, along with the current input line/position
void fail(j0_maker* s, int code, const char* msg) {
    if (s->in != stdin) {
        fclose(s->in);
    }

    // end the output line, even when the output is incomplete
    write_byte(s, '\n');

    const unsigned long line = s->line;
    const unsigned long pos = s->pos;
    fprintf(stderr, "\x1b[31mline %lu, pos %lu: %s\x1b[0m\n", line, pos, msg);

    exit(code);
}

// demand_keyword checks if the input continues with all the bytes in the
// string given, consuming all the bytes which match
bool demand_keyword(j0_maker* s, const char* rest) {
    for (; rest[0] != 0; rest++) {
        if (s->current == EOF || s->current != rest[0]) {
            return false;
        }
        advance(s);
    }

    return true;
}

void handle_null(j0_maker* s) {
    if (!demand_keyword(s, "null")) {
        fail(s, 1, "expected `null` keyword");
    }
    write_bytes(s, (const unsigned char*)"null", 4);
}

void handle_true(j0_maker* s) {
    if (!demand_keyword(s, "true")) {
        fail(s, 1, "expected `true` keyword");
    }
    write_bytes(s, (const unsigned char*)"true", 4);
}

void handle_false(j0_maker* s) {
    if (!demand_keyword(s, "false")) {
        fail(s, 1, "expected `false` keyword");
    }
    write_bytes(s, (const unsigned char*)"false", 5);
}

// handle_capital_none turns the python-style `None` into `null`
void handle_capital_none(j0_maker* s) {
    if (!demand_keyword(s, "None")) {
        fail(s, 1, "expected `None` keyword");
    }
    write_bytes(s, (const unsigned char*)"null", 4);
}

// handle_capital_true turns the python-style `True` into `true`
void handle_capital_true(j0_maker* s) {
    if (!demand_keyword(s, "True")) {
        fail(s, 1, "expected `True` keyword");
    }
    write_bytes(s, (const unsigned char*)"true", 4);
}

// handle_capital_false turns the python-style `False` into `false`
void handle_capital_false(j0_maker* s) {
    if (!demand_keyword(s, "False")) {
        fail(s, 1, "expected `False` keyword");
    }
    write_bytes(s, (const unsigned char*)"false", 5);
}

// handle_digits copies a run of 1 or more decimal digits, failing when
// there aren't any
void handle_digits(j0_maker* s) {
    if (!isdigit(s->current)) {
        fail(s, 1, "expected/missing digits");
    }

    while (isdigit(s->current)) {
        write_byte(s, s->current);
        advance(s);
    }
}

// handle_exponent copies an optional exponent: a redundant plus sign in
// it is dropped; nothing happens when there's no `e`/`E` to start it
static void handle_exponent(j0_maker* s) {
    if (s->current != 'e' && s->current != 'E') {
        return;
    }

    write_byte(s, s->current);
    advance(s);

    if (s->current == '+') {
        advance(s);
    } else if (s->current == '-') {
        write_byte(s, '-');
        advance(s);
    }

    handle_digits(s);
}

// handle_number handles unsigned numbers which start with a digit: a
// trailing decimal dot with no digits after it gets a `0` after it, and an
// exponent is allowed whether or not a fraction comes before it
void handle_number(j0_maker* s) {
    handle_digits(s);

    if (s->current == '.') {
        write_byte(s, '.');
        advance(s);

        if (isdigit(s->current)) {
            handle_digits(s);
        } else {
            write_byte(s, '0');
        }
    }

    handle_exponent(s);
}

// handle_dot handles numbers which start with a decimal dot, emitting the
// leading `0` JSON requires
void handle_dot(j0_maker* s) {
    write_byte(s, '0');
    write_byte(s, '.');
    advance(s);

    if (!isdigit(s->current)) {
        fail(s, 1, "expected/missing digits after decimal dot");
    }
    handle_digits(s);
    handle_exponent(s);
}

// handle_plus_number drops the leading plus sign, which JSON doesn't allow
void handle_plus_number(j0_maker* s) {
    advance(s);

    if (s->current == '.') {
        handle_dot(s);
        return;
    }
    handle_number(s);
}

void handle_minus_number(j0_maker* s) {
    write_byte(s, '-');
    advance(s);

    if (s->current == '.') {
        handle_dot(s);
        return;
    }
    handle_number(s);
}

// write_control_escape emits the \u00XX escape for the byte given: JSON
// strings can't have raw control bytes in them
static void write_control_escape(j0_maker* s, unsigned char b) {
    static const char hex[] = "0123456789abcdef";
    write_bytes(s, (const unsigned char*)"\\u00", 4);
    write_byte(s, hex[b >> 4]);
    write_byte(s, hex[b & 0x0f]);
}

// copy_hex_digits reads and copies exactly `count` hexadecimal digits,
// failing when the input ends or when a byte isn't a hex digit
static void copy_hex_digits(j0_maker* s, size_t count) {
    for (size_t i = 0; i < count; i++) {
        advance(s);
        if (s->current == EOF) {
            fail(s, 1, "end of input before end of string");
        }
        if (!isxdigit(s->current)) {
            fail(s, 1, "invalid hexadecimal digit in string");
        }
        write_byte(s, s->current);
    }
}

// handle_string_escape handles the byte right after a backslash inside a
// string: `c` is that byte, which is also the current one
void handle_string_escape(j0_maker* s, int c) {
    switch (c) {
    case '"':
    case '\\':
    case 'b':
    case 'f':
    case 'n':
    case 'r':
    case 't':
        write_byte(s, '\\');
        write_byte(s, c);
        break;

    case 'u':
        write_byte(s, '\\');
        write_byte(s, 'u');
        copy_hex_digits(s, 4);
        break;

    case 'x':
        // JSON has no \x escapes: widen these into 4-digit \u escapes
        write_byte(s, '\\');
        write_byte(s, 'u');
        write_byte(s, '0');
        write_byte(s, '0');
        copy_hex_digits(s, 2);
        break;

    case '\'':
        // single quotes need no escaping in double-quoted strings
        write_byte(s, '\'');
        break;

    default:
        // other escapes just lose their backslash
        if (c < ' ') {
            write_control_escape(s, (unsigned char)c);
        } else {
            write_byte(s, c);
        }
        break;
    }
}

// handle_string handles single-quoted and double-quoted strings, always
// emitting double-quoted ones
void handle_string(j0_maker* s) {
    const unsigned char quote = s->current;
    bool escaped = false;

    write_byte(s, '"');

    while (true) {
        advance(s);

        const int c = s->current;
        if (c == EOF) {
            fail(s, 1, "input ended before string was close-quoted");
        }

        if (escaped) {
            handle_string_escape(s, c);
            escaped = false;
            continue;
        }

        if (c == '\\') {
            escaped = true;
            continue;
        }

        if (c == quote) {
            write_byte(s, '"');
            advance(s);
            return;
        }

        if (c == '"') {
            // unescaped double-quotes in single-quoted strings would end
            // the output string early
            write_byte(s, '\\');
            write_byte(s, '"');
        } else if (c < ' ') {
            write_control_escape(s, (unsigned char)c);
        } else {
            write_byte(s, c);
        }
    }
}

void handle_token(j0_maker* s);

// handle_array handles arrays, ignoring any extra/trailing commas in them
void handle_array(j0_maker* s) {
    size_t items_before = 0;
    write_byte(s, '[');
    advance(s);

    while (true) {
        seek_token(s);
        if (s->current == EOF) {
            fail(s, 1, "unclosed array");
        }

        if (s->current == ',') {
            advance(s);
            continue;
        }

        if (s->current == ']') {
            write_byte(s, ']');
            advance(s);
            return;
        }

        if (items_before > 0) {
            write_byte(s, ',');
        }
        handle_token(s);
        items_before++;
    }
}

// handle_array_jsonl is a slight variation of func handle_array: this one is
// used to handle top-level arrays when running in JSON Lines mode, to emit
// line-feeds after each item, instead of commas between them
void handle_array_jsonl(j0_maker* s) {
    size_t items_before = 0;
    advance(s);

    while (true) {
        seek_token(s);
        if (s->current == EOF) {
            fail(s, 1, "unclosed array");
        }

        if (s->current == ',') {
            advance(s);
            continue;
        }

        // this also ends the line for the last item, when the array closes
        if (items_before > 0) {
            write_byte(s, '\n');
        }

        if (s->current == ']') {
            advance(s);
            return;
        }

        handle_token(s);
        items_before++;
    }
}

// handle_unquoted_key double-quotes identifier-like object keys: callers
// must check the current byte can start such keys
void handle_unquoted_key(j0_maker* s) {
    write_byte(s, '"');

    do {
        write_byte(s, s->current);
        advance(s);
    } while (isalnum(s->current) || s->current == '_');

    write_byte(s, '"');
}

// handle_object handles objects, ignoring any extra/trailing commas in them
void handle_object(j0_maker* s) {
    size_t items_before = 0;
    write_byte(s, '{');
    advance(s);

    while (true) {
        seek_token(s);
        if (s->current == EOF) {
            fail(s, 1, "unclosed object");
        }

        if (s->current == ',') {
            advance(s);
            continue;
        }

        if (s->current == '}') {
            write_byte(s, '}');
            advance(s);
            return;
        }

        const bool quoted = s->current == '"' || s->current == '\'';
        const bool bare = isalpha(s->current) || s->current == '_';
        if (!quoted && !bare) {
            fail(s, 1, "only strings or identifiers can be object keys");
        }

        if (items_before > 0) {
            write_byte(s, ',');
        }
        if (quoted) {
            handle_string(s);
        } else {
            handle_unquoted_key(s);
        }
        items_before++;

        seek_token(s);
        if (s->current == EOF) {
            fail(s, 1, "input ended after object-key and before value");
        }

        if (s->current != ':') {
            fail(s, 1, "a `:` must follow all object keys");
        }

        write_byte(s, ':');
        advance(s);

        seek_token(s);
        if (s->current == EOF) {
            fail(s, 1, "input ended after a `:` following an object-key");
        }

        handle_token(s);
    }
}

// dispatch ties leading bytes/chars in tokens to the funcs which handle them:
// all bytes not mentioned are tied to NULL, since they can't start tokens
void (*dispatch[256])(j0_maker*) = {
    ['0'] = handle_number,
    ['1'] = handle_number,
    ['2'] = handle_number,
    ['3'] = handle_number,
    ['4'] = handle_number,
    ['5'] = handle_number,
    ['6'] = handle_number,
    ['7'] = handle_number,
    ['8'] = handle_number,
    ['9'] = handle_number,
    ['n'] = handle_null,
    ['t'] = handle_true,
    ['f'] = handle_false,
    ['N'] = handle_capital_none,
    ['T'] = handle_capital_true,
    ['F'] = handle_capital_false,
    ['.'] = handle_dot,
    ['+'] = handle_plus_number,
    ['-'] = handle_minus_number,
    ['"'] = handle_string,
    ['\''] = handle_string,
    ['['] = handle_array,
    ['{'] = handle_object,
};

// handle_token handles the value starting at the current byte, failing when
// that byte can't start any value
void handle_token(j0_maker* s) {
    // EOF is negative, so it can't be used to index the dispatch table
    if (s->current == EOF) {
        fail(s, 1, "expected a token");
    }

    const unsigned char c = (unsigned char)s->current;
    void (*fn)(j0_maker*) = dispatch[c];
    if (fn == NULL) {
        fprintf(stderr, "%c\n", c);
        fail(s, 1, "invalid token");
    }

    fn(s);
}

// handle_input converts the whole input given, emitting its results to the
// standard output: it quits the app on any error
void handle_input(FILE* src, bool jsonl) {
    unsigned char ibuf[32 * 1024];

    j0_maker state;
    j0_maker* s = &state;
    s->ibuf = ibuf;
    s->icap = sizeof(ibuf);
    restart_state(s, stdout, src);

    // ignore leading whitespace/comment bytes, if present
    seek_token(s);

    if (s->current == EOF) {
        fail(s, 1, "empty input isn't valid JSON");
    }

    if (jsonl && s->current == '[') {
        handle_array_jsonl(s);
    } else {
        handle_token(s);
        write_byte(s, '\n');
    }

    // ignore trailing whitespace/comment bytes, if present
    seek_token(s);

    // ignore trailing semicolon, if present
    if (s->current == ';') {
        advance(s);
        // ignore trailing whitespace/comment bytes, if present
        seek_token(s);
    }

    // checking the stream with feof isn't enough, since short inputs are
    // fully buffered long before all their bytes are handled
    if (s->current != EOF) {
        fail(s, 1, "unexpected trailing JSON data");
    }
}

// is_help_option checks if the string given is any of the help options
bool is_help_option(const char* s) {
    return strcmp(s, "-h") == 0 || strcmp(s, "--h") == 0 ||
        strcmp(s, "-help") == 0 || strcmp(s, "--help") == 0;
}

// is_jsonl_option checks if the string given is the JSON Lines option
bool is_jsonl_option(const char* s) {
    return strcmp(s, "-jsonl") == 0 || strcmp(s, "--jsonl") == 0;
}

// run returns the error code
int run(int argc, char** argv) {
    bool jsonl = false;
    if (argc > 1 && is_jsonl_option(argv[1])) {
        jsonl = true;
        argc--;
        argv++;
    }

    if (argc > 2) {
        const char* msg = "can't use more than 1 named input";
        fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg);
        return 1;
    }

    // use stdin when not given a filepath, or is `-`
    if (argc < 2 || argv[1][0] == 0 || strcmp(argv[1], "-") == 0) {
        handle_input(stdin, jsonl);
        return 0;
    }

    const char* path = argv[1];
    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", path);
        return 1;
    }

    handle_input(f, jsonl);
    fclose(f);

    return 0;
}

int main(int argc, char** argv) {
#ifdef _WIN32
    _setmode(_fileno(stdin), _O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    _setmode(_fileno(stdout), _O_BINARY);
    _setmode(_fileno(stderr), _O_BINARY);
#endif

    if (argc > 1 && is_help_option(argv[1])) {
        puts(info);
        return 0;
    }

    return run(argc, argv) == 0 ? 0 : 1;
}