// File: j0.c
/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O2 -o ./j0 ./j0.c
*/

#include <ctype.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
// io.h declares setmode, which func main uses to force binary-mode streams
#include <io.h>
#include <windows.h>
#endif

// info is the message shown when this app is given any of its help options
const char* info = ""
"j0 [options...] [file...]\n"
"\n"
"\n"
"Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.\n"
"Its output is always a single line, which ends with a line-feed.\n"
"\n"
"Besides minimizing bytes, this tool also adapts almost-JSON input into\n"
"valid JSON, since it\n"
"\n"
" - ignores both rest-of-line and multi-line comments\n"
" - ignores extra/trailing commas in arrays and objects\n"
" - turns single-quoted strings/keys into double-quoted strings\n"
" - double-quotes unquoted object keys\n"
" - changes \\x 2-hex-digit into \\u 4-hex-digit string-escapes\n"
"\n"
"All options available can either start with a single or a double-dash\n"
"\n"
" -h show this help message\n"
" -help show this help message\n"
" -jsonl emit JSON Lines, when top-level value is an array\n"
"";

// j0_maker holds all the state needed to turn 1 input into minimal JSON:
// a chunked input reader with a 1-byte lookahead, the output stream, and
// the line/position counters used for error messages
typedef struct j0_maker {
    FILE* in;
    unsigned char* ibuf;
    size_t ilen; // how many bytes are being used in the input buffer
    size_t icap; // the input buffer's capacity
    size_t ipos; // the current position in the input buffer

    FILE* out;

    size_t line; // the current line, used to show useful error messages
    size_t pos;  // the position in the current line, for error messages

    int current; // the byte being handled, or EOF
    int next;    // the byte right after the current one, or EOF
} j0_maker;

void fail(j0_maker* s, int code, const char* msg);
void handle_token(j0_maker* s);

// advance_reader_pos helps func read_byte do its job: it moves past the
// byte given, updating the line/position counters accordingly
void advance_reader_pos(j0_maker* r, unsigned char b) {
    r->ipos++;
    if (b == '\n') {
        r->line++;
        r->pos = 1;
    } else {
        r->pos++;
    }
}

// read_byte does as it says: check its return for the value EOF, before
// using it as the next byte
int read_byte(j0_maker* r) {
    if (r->ipos >= r->ilen) {
        // the current chunk is all used up: need to read the next block
        r->ipos = 0;
        r->ilen = fread(r->ibuf, sizeof(unsigned char), r->icap, r->in);
        if (r->ilen == 0) {
            // reached the end of data
            return EOF;
        }
    }

    const unsigned char b = r->ibuf[r->ipos];
    advance_reader_pos(r, b);
    return b;
}

// advance is used in most of the code, instead of calling read_byte directly:
// it shifts the lookahead byte into the current one, and reads a new lookahead
void advance(j0_maker* r) {
    r->current = r->next;
    r->next = read_byte(r);
}

// skip_line ignores all bytes up to and including the next line-feed, or
// until the input ends
void skip_line(j0_maker* r) {
    while (true) {
        advance(r);
        if (r->current == EOF) {
            break;
        }

        if (r->current == '\n') {
            advance(r);
            break;
        }
    }
}

// skip_multiline_comment ignores all bytes up to and including the next
// `*/` pair; an unclosed comment simply runs until the end of the input
void skip_multiline_comment(j0_maker* r) {
    unsigned char prev = 0;

    while (true) {
        advance(r);

        if (r->current == EOF) {
            break;
        }

        if (prev == '*' && r->current == '/') {
            advance(r);
            break;
        }

        prev = (unsigned char)r->current;
    }
}

// skip_comment ignores either kind of comment, failing when the current
// bytes aren't the start of one
void skip_comment(j0_maker* r) {
    if (r->current != '/') {
        fail(r, 1, "expected a slash to start comments");
    }
    advance(r);

    if (r->current == '/') {
        skip_line(r);
        return;
    }

    if (r->current == '*') {
        skip_multiline_comment(r);
        return;
    }

    fail(r, 1, "expected `//` or `/*` to start comments");
}

// seek_token skips whitespace/control bytes and comments, stopping either
// at the first byte of the next token, or at the end of the input
void seek_token(j0_maker* r) {
    while (true) {
        if (r->current != EOF && r->current <= ' ') {
            advance(r);
            continue;
        }

        if (r->current == '/') {
            skip_comment(r);
            continue;
        }

        break;
    }
}

// starts_with_bom checks if the bytes given start with a UTF-8 byte-order mark
bool starts_with_bom(const unsigned char* b, const size_t n) {
    return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}

// restart_state (re)initializes the state given to read from r and write
// to w; fields ibuf and icap must already be set, since the first 2 bytes
// are read right away, to fill both the current and the lookahead byte
void restart_state(j0_maker* s, FILE* w, FILE* r) {
    s->in = r;
    s->ilen = 0;
    s->ipos = 0;

    s->out = w;

    s->line = 1;
    s->pos = 1;

    s->current = EOF;
    s->next = EOF;

    s->current = read_byte(s);
    if (s->current == EOF) {
        return;
    }
    s->next = read_byte(s);

    // skip leading UTF-8 BOM (byte-order mark), if present
    if (starts_with_bom(s->ibuf, s->ilen)) {
        // a UTF-8 BOM has 3 bytes
        for (size_t i = 0; i < 3 && s->current != EOF; i++) {
            advance(s);
        }
    }
}

// write_bytes does as it says, minimizing the number of calls to fwrite;
// when writing fails, it quits the app, with a success code only if the
// output's end-of-file indicator is set
void write_bytes(j0_maker* w, const unsigned char* src, size_t len) {
    if (len > 0 && fwrite(src, len, 1, w->out) < 1) {
        if (feof(w->out)) {
            exit(0);
        }

        fail(w, 1, "failed to write more output");
    }
}

// write_byte emits a single byte; it's `static inline` because a plain
// `inline` definition provides no external definition in C99/C11, which
// breaks linking whenever the compiler chooses not to inline a call
static inline void write_byte(j0_maker* w, unsigned char b) {
    putc(b, w->out);
}

// debug is available to diagnose any bug found: it shows the printf-style
// formatted message given, along with the current line/position, then quits
void debug(j0_maker* s, const char* fmt, ...) {
    va_list args;
    va_start(args, fmt);

    if (s->in != stdin) {
        fclose(s->in);
    }

    write_byte(s, '\n');

    const unsigned long line = s->line;
    const unsigned long pos = s->pos;
    fprintf(stderr, "\x1b[46m\x1b[37mline %lu, pos %lu: ", line, pos);
    // a va_list can only be given to the v-variants of the printf funcs
    vfprintf(stderr, fmt, args);
    fprintf(stderr, "\x1b[0m\n");

    va_end(args);

    exit(10);
}

// fail quits this app with the exit code given, after ending the current
// output line and showing the plain (non-formatted) error message given,
// along with the current line/position in the input
void fail(j0_maker* s, int code, const char* msg) {
    if (s->in != stdin) {
        fclose(s->in);
    }

    write_byte(s, '\n');

    const unsigned long line = s->line;
    const unsigned long pos = s->pos;
    fprintf(stderr, "\x1b[31mline %lu, pos %lu: %s\x1b[0m\n", line, pos, msg);

    exit(code);
}

// demand_keyword checks if the input continues with all the bytes in the
// string given, consuming input bytes for as long as they match
bool demand_keyword(j0_maker* s, const char* rest) {
    for (; rest[0] != 0; rest++) {
        // an EOF never matches, since it can't equal any byte in a string
        if (s->current != rest[0]) {
            return false;
        }
        advance(s);
    }

    return true;
}

// handle_null copies a `null` keyword to the output
void handle_null(j0_maker* s) {
    if (!demand_keyword(s, "null")) {
        fail(s, 1, "expected `null` keyword");
    }
    write_bytes(s, (const unsigned char*)"null", 4);
}

// handle_true copies a `true` keyword to the output
void handle_true(j0_maker* s) {
    if (!demand_keyword(s, "true")) {
        fail(s, 1, "expected `true` keyword");
    }
    write_bytes(s, (const unsigned char*)"true", 4);
}

// handle_false copies a `false` keyword to the output
void handle_false(j0_maker* s) {
    if (!demand_keyword(s, "false")) {
        fail(s, 1, "expected `false` keyword");
    }
    write_bytes(s, (const unsigned char*)"false", 5);
}

// handle_capital_none turns a capitalized `None` keyword into a JSON null
void handle_capital_none(j0_maker* s) {
    if (!demand_keyword(s, "None")) {
        fail(s, 1, "expected `None` keyword");
    }
    write_bytes(s, (const unsigned char*)"null", 4);
}

// handle_capital_true turns a capitalized `True` keyword into a JSON true
void handle_capital_true(j0_maker* s) {
    if (!demand_keyword(s, "True")) {
        fail(s, 1, "expected `True` keyword");
    }
    write_bytes(s, (const unsigned char*)"true", 4);
}

// handle_capital_false turns a capitalized `False` keyword into a JSON false
void handle_capital_false(j0_maker* s) {
    if (!demand_keyword(s, "False")) {
        fail(s, 1, "expected `False` keyword");
    }
    write_bytes(s, (const unsigned char*)"false", 5);
}

// handle_digits copies a run of 1 or more decimal digits, failing when
// there isn't even 1 digit to copy
void handle_digits(j0_maker* s) {
    if (!isdigit(s->current)) {
        fail(s, 1, "expected/missing digits");
    }

    while (isdigit(s->current)) {
        write_byte(s, (unsigned char)s->current);
        advance(s);
    }
}

// handle_exponent copies the optional exponent part of a number, if there
// is one: a leading plus sign in the exponent is dropped, since it's
// redundant, while the digits are mandatory once an `e`/`E` is found
static void handle_exponent(j0_maker* s) {
    if (s->current != 'e' && s->current != 'E') {
        return;
    }

    write_byte(s, (unsigned char)s->current);
    advance(s);

    if (s->current == '+') {
        advance(s);
    } else if (s->current == '-') {
        write_byte(s, '-');
        advance(s);
    }

    handle_digits(s);
}

// handle_number copies an unsigned number which starts with a digit: its
// optional decimals are followed by an optional exponent, so numbers such
// as `1.5e3` are copied whole; a trailing dot without decimals gets a 0
// after it, since JSON requires digits after decimal dots
void handle_number(j0_maker* s) {
    handle_digits(s);

    if (s->current == '.') {
        write_byte(s, '.');
        advance(s);

        if (isdigit(s->current)) {
            handle_digits(s);
        } else {
            write_byte(s, '0');
        }
    }

    handle_exponent(s);
}

// handle_dot handles numbers which start with a decimal dot, emitting the
// leading 0 which JSON requires before the dot
void handle_dot(j0_maker* s) {
    write_byte(s, '0');
    write_byte(s, '.');
    advance(s);

    if (!isdigit(s->current)) {
        fail(s, 1, "expected/missing digits after decimal dot");
    }
    handle_digits(s);
    handle_exponent(s);
}

// handle_plus_number handles numbers with a leading plus sign, which is
// dropped, since JSON doesn't allow it
void handle_plus_number(j0_maker* s) {
    advance(s);

    if (s->current == '.') {
        handle_dot(s);
        return;
    }
    handle_number(s);
}

// handle_minus_number handles numbers with a leading minus sign
void handle_minus_number(j0_maker* s) {
    write_byte(s, '-');
    advance(s);

    if (s->current == '.') {
        handle_dot(s);
        return;
    }
    handle_number(s);
}

// handle_hex_digits reads and copies exactly as many hexadecimal digits as
// asked, failing if the input ends or any of the bytes isn't a hex digit
static void handle_hex_digits(j0_maker* s, size_t count) {
    for (size_t i = 0; i < count; i++) {
        advance(s);
        if (s->current == EOF) {
            fail(s, 1, "end of input before end of string");
        }
        if (!isxdigit(s->current)) {
            fail(s, 1, "invalid hexadecimal digit in string");
        }
        // emit the digit just read, not the escape letter before it
        write_byte(s, (unsigned char)s->current);
    }
}

// handle_string_escape handles the byte right after a backslash inside a
// string: c is that byte, which is also the current byte when called
void handle_string_escape(j0_maker* s, int c) {
    switch (c) {
    case '"':
    case '\\':
    case 'b':
    case 'f':
    case 'n':
    case 'r':
    case 't':
        // these escapes are valid JSON as they are
        write_byte(s, '\\');
        write_byte(s, (unsigned char)c);
        break;

    case 'u':
        write_bytes(s, (const unsigned char*)"\\u", 2);
        handle_hex_digits(s, 4);
        break;

    case 'x':
        // 2-hex-digit escapes become 4-hex-digit ones, padded with zeros
        write_bytes(s, (const unsigned char*)"\\u00", 4);
        handle_hex_digits(s, 2);
        break;

    case '\'':
        // single quotes need no escaping in double-quoted strings
        write_byte(s, '\'');
        break;

    default:
        // any other escaped byte is emitted without its backslash
        write_byte(s, (unsigned char)c);
        break;
    }
}

// handle_string turns either a single-quoted or a double-quoted string into
// a double-quoted one; the current byte is the opening quote when called
void handle_string(j0_maker* s) {
    const int quote = s->current;
    bool escaped = false;

    write_byte(s, '"');

    while (true) {
        advance(s);

        const int c = s->current;
        if (c == EOF) {
            fail(s, 1, "input ended before string was close-quoted");
        }

        if (escaped) {
            handle_string_escape(s, c);
            escaped = false;
            continue;
        }

        if (c == '\\') {
            escaped = true;
            continue;
        }

        if (c == quote) {
            write_byte(s, '"');
            advance(s);
            return;
        }

        if (c == '"') {
            // plain double quotes inside single-quoted strings must be
            // escaped, or they would end the output string too early
            write_bytes(s, (const unsigned char*)"\\\"", 2);
            continue;
        }

        write_byte(s, (unsigned char)c);
    }
}

// handle_array handles arrays, ignoring any extra/trailing commas in them;
// the current byte is the opening square bracket when called
void handle_array(j0_maker* s) {
    size_t items_before = 0;
    write_byte(s, '[');
    advance(s);

    while (true) {
        seek_token(s);
        if (s->current == EOF) {
            fail(s, 1, "unclosed array");
        }

        if (s->current == ',') {
            advance(s);
            continue;
        }

        if (s->current == ']') {
            write_byte(s, ']');
            advance(s);
            return;
        }

        if (items_before > 0) {
            write_byte(s, ',');
        }
        handle_token(s);
        items_before++;
    }
}

// handle_array_jsonl is a slight variation of func handle_array: this one is
// used to handle top-level arrays when running in JSON Lines mode, to emit
// line-feeds after each item, instead of commas between them
void handle_array_jsonl(j0_maker* s) {
    size_t items_before = 0;
    advance(s);

    while (true) {
        seek_token(s);
        if (s->current == EOF) {
            fail(s, 1, "unclosed array");
        }

        if (s->current == ',') {
            advance(s);
            continue;
        }

        // end the previous item's line, even when no more items follow
        if (items_before > 0) {
            write_byte(s, '\n');
        }

        if (s->current == ']') {
            advance(s);
            return;
        }

        handle_token(s);
        items_before++;
    }
}

// handle_unquoted_key double-quotes an identifier-like object key: its
// first byte was already checked by the caller, and it keeps going for as
// long as letters, digits, or underscores follow
void handle_unquoted_key(j0_maker* s) {
    write_byte(s, '"');

    while (true) {
        int c = s->current;
        if (c == EOF) {
            fail(s, 1, "input ended with an object key");
        }

        write_byte(s, (unsigned char)c);
        advance(s);

        c = s->current;
        if (!isalpha(c) && !isdigit(c) && c != '_') {
            break;
        }
    }

    write_byte(s, '"');
}

// handle_object handles objects, ignoring any extra/trailing commas in
// them: keys can be strings using either kind of quote, or identifiers;
// the current byte is the opening curly bracket when called
void handle_object(j0_maker* s) {
    size_t items_before = 0;
    write_byte(s, '{');
    advance(s);

    while (true) {
        seek_token(s);
        if (s->current == EOF) {
            fail(s, 1, "unclosed object");
        }

        if (s->current == ',') {
            advance(s);
            continue;
        }

        if (s->current == '}') {
            write_byte(s, '}');
            advance(s);
            return;
        }

        if (s->current == '"' || s->current == '\'') {
            if (items_before > 0) {
                write_byte(s, ',');
            }
            handle_string(s);
            items_before++;
        } else if (isalpha(s->current) || s->current == '_') {
            if (items_before > 0) {
                write_byte(s, ',');
            }
            handle_unquoted_key(s);
            items_before++;
        } else {
            fail(s, 1, "only strings or identifiers can be object keys");
        }

        seek_token(s);
        if (s->current == EOF) {
            fail(s, 1, "input ended after object-key and before value");
        }

        if (s->current != ':') {
            fail(s, 1, "a `:` must follow all object keys");
        }

        write_byte(s, ':');
        advance(s);

        seek_token(s);
        if (s->current == EOF) {
            fail(s, 1, "input ended after a `:` following an object-key");
        }

        handle_token(s);
    }
}

// dispatch ties leading bytes/chars in tokens to the funcs which handle them;
// all entries not explicitly set are null, which marks invalid tokens
void (*dispatch[256])(j0_maker*) = {
    ['0'] = handle_number,
    ['1'] = handle_number,
    ['2'] = handle_number,
    ['3'] = handle_number,
    ['4'] = handle_number,
    ['5'] = handle_number,
    ['6'] = handle_number,
    ['7'] = handle_number,
    ['8'] = handle_number,
    ['9'] = handle_number,
    ['n'] = handle_null,
    ['t'] = handle_true,
    ['f'] = handle_false,
    ['N'] = handle_capital_none,
    ['T'] = handle_capital_true,
    ['F'] = handle_capital_false,
    ['.'] = handle_dot,
    ['+'] = handle_plus_number,
    ['-'] = handle_minus_number,
    ['"'] = handle_string,
    ['\''] = handle_string,
    ['['] = handle_array,
    ['{'] = handle_object,
};

// handle_token handles the value which starts at the current byte, failing
// when that byte can't start any kind of value
void handle_token(j0_maker* s) {
    // EOF is negative, so it must never be used to index the dispatch table
    if (s->current == EOF) {
        fail(s, 1, "expected a token");
        return;
    }

    const unsigned char c = (unsigned char)s->current;
    void (*fn)(j0_maker*) = dispatch[c];
    if (fn != NULL) {
        fn(s);
    } else {
        fprintf(stderr, "%c\n", c);
        fail(s, 1, "invalid token");
    }
}

// handle_input converts the whole input given into a single top-level JSON
// value on stdout, or into JSON Lines when asked and that value is an array;
// anything but whitespace, comments, and an optional semicolon after the
// top-level value is an error
void handle_input(FILE* src, bool jsonl) {
    unsigned char ibuf[32 * 1024];

    j0_maker state;
    j0_maker* s = &state;
    s->ibuf = ibuf;
    s->icap = sizeof(ibuf);
    restart_state(s, stdout, src);

    // ignore leading whitespace/comment bytes, if present
    seek_token(s);

    if (s->current == EOF) {
        fail(s, 1, "empty input isn't valid JSON");
    }

    if (jsonl && s->current == '[') {
        handle_array_jsonl(s);
    } else {
        handle_token(s);
        write_byte(s, '\n');
    }

    // ignore trailing whitespace/comment bytes, if present
    seek_token(s);

    // ignore trailing semicolon, if present
    if (s->current == ';') {
        advance(s);
        // ignore trailing whitespace/comment bytes, if present
        seek_token(s);
    }

    // checking the parser's current byte is what tells if anything is left:
    // the stream's own end-of-file indicator is already set as soon as a
    // whole small input fits in the first chunk read, parsed or not
    if (s->current != EOF) {
        fail(s, 1, "unexpected trailing JSON data");
    }
}

// is_help_option checks if the string given is any of the help options
bool is_help_option(const char* s) {
    return (s[0] == '-' && s[1] != 0) && (
        strcmp(s, "-h") == 0 || strcmp(s, "--h") == 0 ||
        strcmp(s, "-help") == 0 || strcmp(s, "--help") == 0
    );
}

// is_jsonl_option checks if the string given is the JSON Lines option
bool is_jsonl_option(const char* s) {
    return (s[0] == '-' && s[1] != 0) && (
        strcmp(s, "-jsonl") == 0 || strcmp(s, "--jsonl") == 0
    );
}

// run returns the error code; the JSON Lines option is only recognized as
// the first argument, and at most 1 named input is allowed after it
int run(int argc, char** argv) {
    bool jsonl = false;
    if (argc > 1 && is_jsonl_option(argv[1])) {
        jsonl = true;
        argc--;
        argv++;
    }

    if (argc > 2) {
        const char* msg = "can't use more than 1 named input";
        fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg);
        return 1;
    }

    // use stdin when not given a filepath, or is `-`
    if (argc < 2 || argv[1][0] == 0 || strcmp(argv[1], "-") == 0) {
        handle_input(stdin, jsonl);
        return 0;
    }

    const char* path = argv[1];
    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", path);
        return 1;
    }

    handle_input(f, jsonl);
    fclose(f);

    return 0;
}

int main(int argc, char** argv) {
#ifdef _WIN32
    setmode(fileno(stdin), O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    setmode(fileno(stdout), O_BINARY);
    setmode(fileno(stderr), O_BINARY);
#endif

    if (argc > 1 && is_help_option(argv[1])) {
        puts(info);
        return 0;
    }

    return run(argc, argv) == 0 ? 0 : 1;
}