// File: j0.c
/*
The MIT License (MIT)

Copyright © 2024 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
j0 [options...] [file...]


Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.
Its output is always a single line, which ends with a line-feed.

Besides minimizing bytes, this tool also adapts almost-JSON input into
valid JSON, since it

    - ignores both rest-of-line and multi-line comments
    - ignores extra/trailing commas in arrays and objects
    - turns single-quoted strings/keys into double-quoted strings
    - double-quotes unquoted object keys
    - changes \x 2-hex-digit into \u 4-hex-digit string-escapes

All options available can either start with a single or a double-dash

    -h        show this help message
    -help     show this help message
    -jsonl    emit JSON Lines, when top-level value is an array
*/

/*
You can build this command-line app by running

cc -Wall -s -O2 -o ./j0 ./j0.c
*/

#include <ctype.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <windows.h>
#endif

// info is the message shown when this app is given any of its help options
const char* info = ""
"j0 [options...] [file...]\n"
"\n"
"\n"
"Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.\n"
"Its output is always a single line, which ends with a line-feed.\n"
"\n"
"Besides minimizing bytes, this tool also adapts almost-JSON input into\n"
"valid JSON, since it\n"
"\n"
" - ignores both rest-of-line and multi-line comments\n"
" - ignores extra/trailing commas in arrays and objects\n"
" - turns single-quoted strings/keys into double-quoted strings\n"
" - double-quotes unquoted object keys\n"
" - changes \\x 2-hex-digit into \\u 4-hex-digit string-escapes\n"
"\n"
"All options available can either start with a single or a double-dash\n"
"\n"
" -h show this help message\n"
" -help show this help message\n"
" -jsonl emit JSON Lines, when top-level value is an array\n"
"";

// j0_maker bundles the buffered input reader, the output stream, and the
// one-byte lookahead state shared by all the token-handling funcs
typedef struct j0_maker {
    FILE* in;
    unsigned char* ibuf;
    size_t ilen; // how many bytes are being used in the input buffer
    size_t icap; // the input buffer's capacity
    size_t ipos; // the current position in the input buffer

    FILE* out;

    size_t line; // the current line, used to show useful error messages
    size_t pos;  // the position in the current line, for error messages

    int current; // the byte being handled right now, or EOF
    int next;    // the byte right after the current one, or EOF
} j0_maker;

// advance_reader_pos helps func read_byte do its job: it moves the buffer
// position past the byte given, and keeps the line/pos counters up to date
void advance_reader_pos(j0_maker* r, unsigned char b) {
    r->ipos++;
    if (b == '\n') {
        r->line++;
        r->pos = 1;
    } else {
        r->pos++;
    }
}

// read_byte does as it says: check its return for the value EOF, before
// using it as the next byte
int read_byte(j0_maker* r) {
    if (r->ipos >= r->ilen) {
        // current chunk is all used up: need to read the next block
        r->ipos = 0;
        r->ilen = fread(r->ibuf, sizeof(unsigned char), r->icap, r->in);
        if (r->ilen == 0) {
            // reached the end of data
            return EOF;
        }
    }

    const unsigned char b = r->ibuf[r->ipos];
    advance_reader_pos(r, b);
    return b;
}

// advance is used in most of the code, instead of calling read_byte directly:
// it shifts the lookahead byte into the current one, and reads a new lookahead
void advance(j0_maker* r) {
    r->current = r->next;
    r->next = read_byte(r);
}

void fail(j0_maker* s, size_t code, const char* fmt, ...);

// skip_line ignores all bytes up to and including the next line-feed, or
// until the input ends
void skip_line(j0_maker* r) {
    while (true) {
        advance(r);
        if (r->current == EOF) {
            break;
        }

        if (r->current == '\n') {
            advance(r);
            break;
        }
    }
}

// skip_multiline_comment ignores all bytes up to and including the next
// star-slash pair, or until the input ends; it's called while the current
// byte is the star which opens the comment
void skip_multiline_comment(j0_maker* r) {
    unsigned char prev = 0;

    while (true) {
        advance(r);

        if (r->current == EOF) {
            break;
        }

        if (prev == '*' && r->current == '/') {
            advance(r);
            break;
        }

        prev = (unsigned char)r->current;
    }
}

// skip_comment ignores either kind of comment, failing when the current
// bytes don't start one
void skip_comment(j0_maker* r) {
    if (r->current != '/') {
        fail(r, 1, "expected a slash to start comments");
    }
    advance(r);

    if (r->current == '/') {
        skip_line(r);
        return;
    }

    if (r->current == '*') {
        skip_multiline_comment(r);
        return;
    }

    fail(r, 1, "expected `//` or `/*` to start comments");
}

// seek_token skips whitespace (any byte up to the space) and comments, so
// the current byte ends up being either the start of a token or EOF
void seek_token(j0_maker* r) {
    while (true) {
        if (r->current != EOF && r->current <= ' ') {
            advance(r);
            continue;
        }

        if (r->current == '/') {
            skip_comment(r);
            continue;
        }

        break;
    }
}

// starts_with_bom checks if the first n bytes given start with a UTF-8 BOM
bool starts_with_bom(const unsigned char* b, const size_t n) {
    return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}

// restart_state (re)initializes everything except the input buffer and its
// capacity, which the caller must have set beforehand, then primes the
// current/next bytes, skipping a leading UTF-8 BOM
void restart_state(j0_maker* s, FILE* w, FILE* r) {
    s->in = r;
    s->ilen = 0;
    s->ipos = 0;

    s->out = w;

    s->line = 1;
    s->pos = 1;

    s->current = EOF;
    s->next = EOF;

    s->current = read_byte(s);
    if (s->current == EOF) {
        return;
    }
    s->next = read_byte(s);

    // skip leading UTF-8 BOM (byte-order mark), if present
    if (starts_with_bom(s->ibuf, s->ilen)) {
        // a UTF-8 BOM has 3 bytes
        for (size_t i = 0; i < 3 && s->current != EOF; i++) {
            advance(s);
        }
    }
}

// write_bytes does as it says, minimizing the number of calls to fwrite
void write_bytes(j0_maker* w, const unsigned char* src, size_t len) {
    if (len > 0 && fwrite(src, len, 1, w->out) < 1) {
        if (feof(w->out)) {
            exit(0);
        }

        fail(w, 1, "failed to write more output");
    }
}

// write_byte emits a single byte; it's `static inline` because a plain
// `inline` definition provides no external definition in C99 and later,
// which makes builds fail to link when the compiler chooses not to inline
static inline void write_byte(j0_maker* w, unsigned char b) {
    putc(b, w->out);
}

// debug is available to diagnose any bug found: it shows the printf-style
// formatted message given on stderr, then quits this app with exit code 10
void debug(j0_maker* s, const char* fmt, ...) {
    va_list args;
    va_start(args, fmt);

    if (s->in != stdin) {
        fclose(s->in);
    }

    write_byte(s, '\n');

    const unsigned long line = s->line;
    const unsigned long pos = s->pos;
    fprintf(stderr, "\x1b[46m\x1b[37mline %lu, pos %lu: ", line, pos);
    // a va_list can only be given to the v-variants of the printf funcs
    vfprintf(stderr, fmt, args);
    fprintf(stderr, "\x1b[0m\n");

    va_end(args);

    exit(10);
}

// fail quits this app with the printf-style formatted error message given,
// using the exit code given: it never returns to its caller
void fail(j0_maker* s, size_t code, const char* fmt, ...) {
    va_list args;
    va_start(args, fmt);

    if (s->in != stdin) {
        fclose(s->in);
    }

    // end the output line emitted so far
    write_byte(s, '\n');

    const unsigned long line = s->line;
    const unsigned long pos = s->pos;
    fprintf(stderr, "\x1b[31mline %lu, pos %lu: ", line, pos);
    // a va_list can only be given to the v-variants of the printf funcs
    vfprintf(stderr, fmt, args);
    fprintf(stderr, "\x1b[0m\n");

    va_end(args);

    exit((int)code);
}

// demand_keyword checks if the input, starting from the current byte,
// matches all the bytes in the string given, advancing past each match;
// it returns false on the first mismatch, or if the input ends too early
bool demand_keyword(j0_maker* s, const char* rest) {
    for (; rest[0] != 0; rest++) {
        if (s->current == EOF || s->current != rest[0]) {
            return false;
        }
        advance(s);
    }

    return true;
}

void handle_null(j0_maker* s) {
    if (!demand_keyword(s, "null")) {
        fail(s, 1, "expected `null` keyword");
    }
    write_bytes(s, (const unsigned char*)"null", 4);
}

void handle_true(j0_maker* s) {
    if (!demand_keyword(s, "true")) {
        fail(s, 1, "expected `true` keyword");
    }
    write_bytes(s, (const unsigned char*)"true", 4);
}

void handle_false(j0_maker* s) {
    if (!demand_keyword(s, "false")) {
        fail(s, 1, "expected `false` keyword");
    }
    write_bytes(s, (const unsigned char*)"false", 5);
}

// handle_capital_none turns the keyword `None` into a JSON null
void handle_capital_none(j0_maker* s) {
    if (!demand_keyword(s, "None")) {
        fail(s, 1, "expected `None` keyword");
    }
    write_bytes(s, (const unsigned char*)"null", 4);
}

// handle_capital_true turns the keyword `True` into a JSON true
void handle_capital_true(j0_maker* s) {
    if (!demand_keyword(s, "True")) {
        fail(s, 1, "expected `True` keyword");
    }
    write_bytes(s, (const unsigned char*)"true", 4);
}

// handle_capital_false turns the keyword `False` into a JSON false
void handle_capital_false(j0_maker* s) {
    if (!demand_keyword(s, "False")) {
        fail(s, 1, "expected `False` keyword");
    }
    write_bytes(s, (const unsigned char*)"false", 5);
}

// handle_digits copies a run of 1 or more decimal digits, failing when the
// current byte isn't a digit
void handle_digits(j0_maker* s) {
    if (!isdigit(s->current)) {
        fail(s, 1, "expected/missing digits");
    }

    while (isdigit(s->current)) {
        write_byte(s, (unsigned char)s->current);
        advance(s);
    }
}

// handle_exponent copies the optional exponent part of a number, if the
// current byte starts one: a plus sign after the `e` is redundant, so it's
// not copied to the output
static void handle_exponent(j0_maker* s) {
    if (s->current != 'e' && s->current != 'E') {
        return;
    }

    write_byte(s, (unsigned char)s->current);
    advance(s);

    if (s->current == '+') {
        advance(s);
    } else if (s->current == '-') {
        write_byte(s, '-');
        advance(s);
    }

    handle_digits(s);
}

// handle_number copies an unsigned number which starts with a digit: its
// optional fraction gets a 0 when no digits follow the dot, and an optional
// exponent is allowed both with and without a fraction before it
void handle_number(j0_maker* s) {
    handle_digits(s);

    if (s->current == '.') {
        write_byte(s, '.');
        advance(s);

        if (isdigit(s->current)) {
            handle_digits(s);
        } else {
            write_byte(s, '0');
        }
    }

    handle_exponent(s);
}

// handle_dot handles numbers which start with a decimal dot, by emitting
// the leading 0 which JSON requires
void handle_dot(j0_maker* s) {
    write_byte(s, '0');
    write_byte(s, '.');
    advance(s);

    if (!isdigit(s->current)) {
        fail(s, 1, "expected/missing digits after decimal dot");
    }
    handle_digits(s);
    handle_exponent(s);
}

// handle_plus_number ignores the leading plus sign, which JSON disallows
void handle_plus_number(j0_maker* s) {
    advance(s);

    if (s->current == '.') {
        handle_dot(s);
        return;
    }
    handle_number(s);
}

void handle_minus_number(j0_maker* s) {
    write_byte(s, '-');
    advance(s);

    if (s->current == '.') {
        handle_dot(s);
        return;
    }
    handle_number(s);
}

// write_string_byte emits a byte from inside a string, escaping it when a
// double-quoted JSON string can't have it as is: that's the case for
// double-quotes and for control bytes (anything below the space)
static void write_string_byte(j0_maker* s, int c) {
    static const char hex[] = "0123456789abcdef";

    if (c == '"') {
        write_byte(s, '\\');
        write_byte(s, '"');
        return;
    }

    if (c >= 0 && c < 0x20) {
        write_byte(s, '\\');
        write_byte(s, 'u');
        write_byte(s, '0');
        write_byte(s, '0');
        write_byte(s, (unsigned char)hex[(c >> 4) & 0x0f]);
        write_byte(s, (unsigned char)hex[c & 0x0f]);
        return;
    }

    write_byte(s, (unsigned char)c);
}

// copy_hex_digits copies exactly as many hexadecimal digits as asked,
// starting right after the current byte, failing on anything else
static void copy_hex_digits(j0_maker* s, size_t count) {
    for (size_t i = 0; i < count; i++) {
        advance(s);
        if (s->current == EOF) {
            fail(s, 1, "end of input before end of string");
        }
        if (!isxdigit(s->current)) {
            fail(s, 1, "invalid hexadecimal digit in string");
        }
        write_byte(s, (unsigned char)s->current);
    }
}

// handle_string_escape emits the escape sequence whose backslash was just
// read: c is the current byte, which is the one right after the backslash
void handle_string_escape(j0_maker* s, int c) {
    switch (c) {
    case '"':
    case '\\':
    case 'b':
    case 'f':
    case 'n':
    case 'r':
    case 't':
        write_byte(s, '\\');
        write_byte(s, (unsigned char)c);
        break;

    case 'u':
        write_byte(s, '\\');
        write_byte(s, 'u');
        copy_hex_digits(s, 4);
        break;

    case 'x':
        // turn 2-hex-digit escapes into their 4-hex-digit equivalents
        write_byte(s, '\\');
        write_byte(s, 'u');
        write_byte(s, '0');
        write_byte(s, '0');
        copy_hex_digits(s, 2);
        break;

    case '\'':
        // single quotes need no escaping in double-quoted strings
        write_byte(s, '\'');
        break;

    default:
        // any other escaped byte stands for itself, without the backslash
        write_string_byte(s, c);
        break;
    }
}

// handle_string turns either a single-quoted or a double-quoted string
// into a double-quoted one: the current byte must be the opening quote
void handle_string(j0_maker* s) {
    const int quote = s->current;
    bool escaped = false;

    write_byte(s, '"');

    while (true) {
        advance(s);

        const int c = s->current;
        if (c == EOF) {
            fail(s, 1, "input ended before string was close-quoted");
        }

        if (escaped) {
            handle_string_escape(s, c);
            escaped = false;
            continue;
        }

        if (c == '\\') {
            escaped = true;
            continue;
        }

        if (c == quote) {
            write_byte(s, '"');
            advance(s);
            return;
        }

        // double-quotes inside single-quoted strings must end up escaped
        write_string_byte(s, c);
    }
}

void handle_token(j0_maker* s);

// handle_array copies an array, ignoring comments, whitespace, and any
// extra/trailing commas: the current byte must be the opening bracket
void handle_array(j0_maker* s) {
    size_t items_before = 0;
    write_byte(s, '[');
    advance(s);

    while (true) {
        seek_token(s);
        if (s->current == EOF) {
            fail(s, 1, "unclosed array");
        }

        if (s->current == ',') {
            advance(s);
            continue;
        }

        if (s->current == ']') {
            write_byte(s, ']');
            advance(s);
            return;
        }

        if (items_before > 0) {
            write_byte(s, ',');
        }
        handle_token(s);
        items_before++;
    }
}

// handle_array_jsonl
is a slight variation of func handle_array: this one is 562 // used to handle top-level arrays when running in JSON Lines mode, to emit 563 // line-feeds after each item, instead of commas between them 564 void handle_array_jsonl(j0_maker* s) { 565 size_t items_before = 0; 566 advance(s); 567 568 while (true) { 569 seek_token(s); 570 if (s->current == EOF) { 571 fail(s, 1, "unclosed array"); 572 } 573 574 if (s->current == ',') { 575 advance(s); 576 continue; 577 } 578 579 if (items_before > 0) { 580 write_byte(s, '\n'); 581 } 582 583 if (s->current == ']') { 584 advance(s); 585 return; 586 } 587 588 handle_token(s); 589 items_before++; 590 } 591 } 592 593 void handle_unquoted_key(j0_maker* s) { 594 write_byte(s, '"'); 595 596 while (true) { 597 int c = s->current; 598 if (c == EOF) { 599 fail(s, 1, "input ended with an object key"); 600 } 601 602 write_byte(s, c); 603 advance(s); 604 605 c = s->current; 606 if (!isalpha(c) && !isdigit(c) && c != '_') { 607 break; 608 } 609 } 610 611 write_byte(s, '"'); 612 } 613 614 void handle_object(j0_maker* s) { 615 size_t items_before = 0; 616 write_byte(s, '{'); 617 advance(s); 618 619 while (true) { 620 seek_token(s); 621 if (s->current == EOF) { 622 fail(s, 1, "unclosed object"); 623 } 624 625 if (s->current == ',') { 626 advance(s); 627 continue; 628 } 629 630 if (s->current == '}') { 631 write_byte(s, '}'); 632 advance(s); 633 return; 634 } 635 636 if (s->current == '"' || s->current == '\'') { 637 if (items_before > 0) { 638 write_byte(s, ','); 639 } 640 handle_string(s); 641 items_before++; 642 } else if (isalpha(s->current) || s->current == '_') { 643 if (items_before > 0) { 644 write_byte(s, ','); 645 } 646 handle_unquoted_key(s); 647 items_before++; 648 } else { 649 fail(s, 1, "only strings or identifiers can be object keys"); 650 } 651 652 seek_token(s); 653 if (s->current == EOF) { 654 fail(s, 1, "input ended after object-key and before value"); 655 } 656 657 if (s->current != ':') { 658 fail(s, 1, "a `:` must 
follow all object keys"); 659 } 660 661 write_byte(s, ':'); 662 advance(s); 663 664 seek_token(s); 665 if (s->current == EOF) { 666 fail(s, 1, "input ended after a `:` following an object-key"); 667 } 668 669 handle_token(s); 670 } 671 } 672 673 // dispatch ties leading bytes/chars in tokens to the funcs which handle them 674 void (*dispatch[256])() = {}; 698 699 void handle_token(j0_maker* s) { 700 void (*fn)(j0_maker*) = NULL; 701 702 // seek_token(s); 703 // if (s->current == EOF) { 704 // fail(s, 1, "expected a token"); 705 // } 706 707 fn = dispatch[s->current]; 708 if (fn != NULL) { 709 fn(s); 710 } else { 711 unsigned char c = (unsigned char)s->current; 712 fprintf(stderr, "%c\n", c); 713 fail(s, 1, "invalid token"); 714 } 715 } 716 717 void handle_array_jsonl(j0_maker* s); 718 719 void handle_input(FILE* src, bool jsonl) { 720 unsigned char ibuf[32 * 1024]; 721 722 j0_maker state; 723 j0_maker* s = &state; 724 s->ibuf = ibuf; 725 s->icap = sizeof(ibuf); 726 restart_state(s, stdout, src); 727 728 // ignore leading whitespace/comment bytes, if present 729 seek_token(s); 730 731 if (s->current == EOF) { 732 fail(s, 1, "empty input isn't valid JSON"); 733 } 734 735 if (jsonl && s->current == '[') { 736 handle_array_jsonl(s); 737 } else { 738 handle_token(s); 739 write_byte(s, '\n'); 740 } 741 742 // ignore trailing whitespace/comment bytes, if present 743 seek_token(s); 744 745 // ignore trailing semicolon, if present 746 if (s->current == ';') { 747 advance(s); 748 // ignore trailing whitespace/comment bytes, if present 749 seek_token(s); 750 } 751 752 if (!feof(src)) { 753 fail(s, 1, "unexpected trailing JSON data"); 754 } 755 } 756 757 bool is_help_option(const char* s) { 758 return (s[0] == '-' && s[1] != 0) && ( 759 strcmp(s, "-h") == 0 || strcmp(s, "--h") == 0 || 760 strcmp(s, "-help") == 0 || strcmp(s, "--help") == 0 761 ); 762 } 763 764 bool is_jsonl_option(const char* s) { 765 return (s[0] == '-' && s[1] != 0) && ( 766 strcmp(s, "-jsonl") == 0 || 
strcmp(s, "--jsonl") == 0 767 ); 768 } 769 770 // run returns the error code 771 int run(int argc, char** argv) { 772 bool jsonl = false; 773 if (argc > 1 && is_jsonl_option(argv[1])) { 774 jsonl = true; 775 argc--; 776 argv++; 777 } 778 779 if (argc > 2) { 780 const char* msg = "can't use more than 1 named input"; 781 fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg); 782 return 1; 783 } 784 785 // use stdin when not given a filepath, or is `-` 786 if (argc < 2 || argv[1][0] == 0 || strcmp(argv[1], "-") == 0) { 787 handle_input(stdin, jsonl); 788 return 0; 789 } 790 791 const char* path = argv[1]; 792 FILE* f = fopen(path, "rb"); 793 if (f == NULL) { 794 fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", path); 795 return 1; 796 } 797 798 handle_input(f, jsonl); 799 fclose(f); 800 801 return 0; 802 } 803 804 int main(int argc, char** argv) { 805 #ifdef _WIN32 806 setmode(fileno(stdin), O_BINARY); 807 // ensure output lines end in LF instead of CRLF on windows 808 setmode(fileno(stdout), O_BINARY); 809 setmode(fileno(stderr), O_BINARY); 810 #endif 811 812 if (argc > 1 && is_help_option(argv[1])) { 813 puts(info); 814 return 0; 815 } 816 817 dispatch['0'] = handle_number; 818 dispatch['1'] = handle_number; 819 dispatch['2'] = handle_number; 820 dispatch['3'] = handle_number; 821 dispatch['4'] = handle_number; 822 dispatch['5'] = handle_number; 823 dispatch['6'] = handle_number; 824 dispatch['7'] = handle_number; 825 dispatch['8'] = handle_number; 826 dispatch['9'] = handle_number; 827 dispatch['n'] = handle_null; 828 dispatch['t'] = handle_true; 829 dispatch['f'] = handle_false; 830 dispatch['N'] = handle_capital_none; 831 dispatch['T'] = handle_capital_true; 832 dispatch['F'] = handle_capital_false; 833 dispatch['.'] = handle_dot; 834 dispatch['+'] = handle_plus_number; 835 dispatch['-'] = handle_minus_number; 836 dispatch['"'] = handle_string; 837 dispatch['\''] = handle_string; 838 dispatch['['] = handle_array; 839 dispatch['{'] = handle_object; 840 841 
return run(argc, argv) == 0 ? 0 : 1; 842 }