// File: j0.c
/*
The MIT License (MIT)

Copyright © 2024 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
j0 [options...] [file...]


Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.
Its output is always a single line, which ends with a line-feed.

Besides minimizing bytes, this tool also adapts almost-JSON input into
valid JSON, since it

 - ignores both rest-of-line and multi-line comments
 - ignores extra/trailing commas in arrays and objects
 - turns single-quoted strings/keys into double-quoted strings
 - double-quotes unquoted object keys
 - changes \x 2-hex-digit into \u 4-hex-digit string-escapes

The only option available can either start with a single or a double-dash

 -h -help show this help message
*/

/*
You can build this command-line app by running

cc -Wall -s -O2 -o ./j0 ./j0.c
*/

#include <ctype.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <windows.h>
#endif

// info is the message shown when this app is given any of its help options
const char* info = ""
"j0 [options...] [file...]\n"
"\n"
"\n"
"Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.\n"
"Its output is always a single line, which ends with a line-feed.\n"
"\n"
"Besides minimizing bytes, this tool also adapts almost-JSON input into\n"
"valid JSON, since it\n"
"\n"
" - ignores both rest-of-line and multi-line comments\n"
" - ignores extra/trailing commas in arrays and objects\n"
" - turns single-quoted strings/keys into double-quoted strings\n"
" - double-quotes unquoted object keys\n"
" - changes \\x 2-hex-digit into \\u 4-hex-digit string-escapes\n"
"\n"
"The only option available can either start with a single or a double-dash\n"
"\n"
" -h -help show this help message\n"
"";

// j0_maker bundles the whole converter state: a chunked input reader with
// a 2-byte lookahead (current/next), a buffered output writer, and the
// line/position counters used when reporting errors
typedef struct j0_maker {
    FILE* in;
    unsigned char* ibuf;
    size_t ilen; // how many bytes are being used in the input buffer
    size_t icap; // the input buffer's capacity
    size_t ipos; // the current position in the input buffer

    FILE* out;
    unsigned char* obuf;
    size_t olen; // how many bytes are being used in the output buffer
    size_t ocap; // the output buffer's capacity
    size_t opos; // the current position in the output buffer

    size_t line; // the current line, used to show useful error messages
    size_t pos; // the position in the current line, for error messages

    int current; // the byte being handled, or EOF
    int next; // the byte right after the current one, or EOF
} j0_maker;

// advance_reader_pos moves the input-buffer index 1 byte forward, and keeps
// the line/position counters in sync with the byte just consumed
void advance_reader_pos(j0_maker* r, unsigned char b) {
    r->ipos++;

    if (b != '\n') {
        r->pos++;
        return;
    }

    // a line-feed starts a new line
    r->line++;
    r->pos = 1;
}

// read_byte does as it says: check its return for the value EOF, before
// using it as the next byte
int read_byte(j0_maker* r) {
    if (r->ipos >= r->ilen) {
        // the current chunk is over: try to load the next one
        r->ipos = 0;
        r->ilen = fread(r->ibuf, 1, r->icap, r->in);
        if (r->ilen == 0) {
            // reached the end of data
            return EOF;
        }
    }

    const unsigned char b = r->ibuf[r->ipos];
    advance_reader_pos(r, b);
    return b;
}

// advance shifts the 2-byte lookahead window 1 byte forward
void advance(j0_maker* r) {
    r->current = r->next;
    r->next = read_byte(r);
}

void fail(j0_maker* s, size_t code, const char* fmt, ...);

// skip_line ignores all bytes up to and including the next line-feed, or
// up to the end of the input, whichever comes first
void skip_line(j0_maker* r) {
    for (;;) {
        advance(r);

        if (r->current == EOF) {
            return;
        }

        if (r->current == '\n') {
            // leave the reader on the first byte of the next line
            advance(r);
            return;
        }
    }
}

// skip_multiline_comment ignores all bytes up to and including the next
// `*/` pair, or up to the end of the input, whichever comes first; it
// expects the current byte to be the `*` which started the comment
void skip_multiline_comment(j0_maker* r) {
    // starting from 0, instead of from the opening `*`, ensures `/*/` isn't
    // mistaken for a complete comment
    unsigned char before = 0;

    for (;;) {
        advance(r);

        if (r->current == EOF) {
            return;
        }

        if (before == '*' && r->current == '/') {
            advance(r);
            return;
        }

        before = (unsigned char)r->current;
    }
}

// skip_comment ignores either kind of comment, failing when the current
// bytes don't start one
void skip_comment(j0_maker* r) {
    if (r->current != '/') {
        fail(r, 1, "expected a slash to start comments");
    }
    advance(r);

    switch (r->current) {
    case '/':
        skip_line(r);
        return;

    case '*':
        skip_multiline_comment(r);
        return;

    default:
        fail(r, 1, "expected `//` or `/*` to start comments");
    }
}

// seek_token skips any mix of comments and whitespace-like bytes (anything
// up to byte-value 32), stopping at the first byte of the next token, or at
// the end of the input
void seek_token(j0_maker* r) {
    for (;;) {
        if (r->current == '/') {
            skip_comment(r);
        } else if (r->current != EOF && r->current <= 32) {
            advance(r);
        } else {
            return;
        }
    }
}

// starts_with_bom checks if the n bytes given start with a UTF-8 BOM
bool starts_with_bom(const unsigned char* b, const size_t n) {
    if (n < 3) {
        return false;
    }
    return b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf;
}

// restart_state readies the state given to convert input from r into w:
// it resets all counters, fills the 2-byte lookahead, and skips a leading
// UTF-8 BOM; the buffers and their capacities must already be set, since
// this func reads into the input buffer
void restart_state(j0_maker* s, FILE* w, FILE* r) {
    s->in = r;
    s->ilen = 0;
    s->ipos = 0;

    s->out = w;
    s->olen = 0;

    s->line = 1;
    s->pos = 1;

    s->current = EOF;
    s->next = EOF;

    s->current = read_byte(s);
    if (s->current == EOF) {
        return;
    }
    s->next = read_byte(s);

    // skip leading UTF-8 BOM (byte-order mark), if present: the first chunk
    // is still fully available in the input buffer at this point
    if (starts_with_bom(s->ibuf, s->ilen)) {
        // a UTF-8 BOM has 3 bytes
        for (size_t i = 0; i < 3 && s->current != EOF; i++) {
            advance(s);
        }
    }
}

// flush does as it says: it empties the buffer after ensuring its bytes end
// on their intended destination; when the output can't be written, the app
// quits right away with a success exit-code
void flush(j0_maker* w) {
    if (w->olen > 0 && fwrite(w->obuf, w->olen, 1, w->out) < 1) {
        exit(0);
    }
    w->olen = 0;
}

// write_bytes does as it says, minimizing the number of calls to fwrite;
// failed writes quit the app the same way func flush does, instead of
// silently losing output
void write_bytes(j0_maker* w, const unsigned char* src, size_t len) {
    if (w->olen + len < w->ocap) {
        // all bytes fit into buffer
        memcpy(w->obuf + w->olen, src, len);
        w->olen += len;
        return;
    }

    // ensure current buffer bytes go out, before crossing strides
    flush(w);

    // emit all chunks striding beyond/at the buffer's capacity
    while (len >= w->ocap) {
        if (fwrite(src, w->ocap, 1, w->out) < 1) {
            exit(0);
        }
        src += w->ocap;
        len -= w->ocap;
    }

    // now all, if any, remaining bytes will fit into the (empty) buffer
    memcpy(w->obuf, src, len);
    w->olen = len;
}

// write_byte does as it says
void write_byte(j0_maker* w, unsigned char b) {
    if (w->olen >= w->ocap) {
        flush(w);
    }
    w->obuf[w->olen] = b;
    w->olen++;
}

// debug emits all pending output, shows the printf-style message given on
// stderr, along with the current line/position, and quits the app with
// exit-code 10: it never returns
void debug(j0_maker* s, const char* fmt, ...) {
    va_list args;
    va_start(args, fmt);

    flush(s);
    fflush(s->out);

    if (s->in != stdin) {
        fclose(s->in);
    }

    write_byte(s, '\n');
    flush(s);
    fflush(stdout);

    fprintf(stderr, "\x1b[46m\x1b[37mline %zu, pos %zu: ", s->line, s->pos);
    // a va_list can only be handed to the v-variants of the printf funcs
    vfprintf(stderr, fmt, args);
    fprintf(stderr, "\x1b[0m\n");

    va_end(args);

    exit(10);
}

// fail ends the output with a line-feed, shows the printf-style message
// given on stderr, along with the current line/position, and quits the app
// with the exit-code given: it never returns
void fail(j0_maker* s, size_t code, const char* fmt, ...) {
    va_list args;
    va_start(args, fmt);

    if (s->in != stdin) {
        fclose(s->in);
    }

    write_byte(s, '\n');
    flush(s);
    fflush(stdout);

    fprintf(stderr, "\x1b[31mline %zu, pos %zu: ", s->line, s->pos);
    // a va_list can only be handed to the v-variants of the printf funcs
    vfprintf(stderr, fmt, args);
    fprintf(stderr, "\x1b[0m\n");

    va_end(args);

    exit((int)code);
}

// demand_keyword checks if the input continues with all the bytes of the
// string given, advancing past each matching byte: on a mismatch, or when
// the input ends early, the reader is left on the first non-matching byte
bool demand_keyword(j0_maker* s, const char* rest) {
    for (; *rest != 0; rest++) {
        // EOF never matches, since string bytes are compared as chars
        if (s->current != *rest) {
            return false;
        }
        advance(s);
    }
    return true;
}

// handle_null expects and emits the keyword `null`
void handle_null(j0_maker* s) {
    if (!demand_keyword(s, "null")) {
        fail(s, 1, "expected null keyword");
    }
    write_bytes(s, (const unsigned char*)"null", 4);
}

// handle_true expects and emits the keyword `true`
void handle_true(j0_maker* s) {
    if (!demand_keyword(s, "true")) {
        fail(s, 1, "expected `true` keyword");
    }
    write_bytes(s, (const unsigned char*)"true", 4);
}

// handle_false expects and emits the keyword `false`
void handle_false(j0_maker* s) {
    if (!demand_keyword(s, "false")) {
        fail(s, 1, "expected `false` keyword");
    }
    write_bytes(s, (const unsigned char*)"false", 5);
}

void handle_digits(j0_maker* s) { 370 if (!isdigit(s->current)) { 371 fail(s, 1, "expected/missing digits"); 372 } 373 374 while (isdigit(s->current)) { 375 write_byte(s, s->current); 376 flush(s); 377 fflush(stdout); 378 advance(s); 379 } 380 } 381 382 void handle_number(j0_maker* s) { 383 handle_digits(s); 384 385 if (s->current == '.') { 386 write_byte(s, '.'); 387 advance(s); 388 389 if (isdigit(s->current)) { 390 handle_digits(s); 391 } else { 392 write_byte(s, '0'); 393 } 394 return; 395 } 396 397 if (s->current == 'e' || s->current == 'E') { 398 if (s->current == '+') { 399 advance(s); 400 } else if (s->current == '-') { 401 write_byte(s, '-'); 402 advance(s); 403 } 404 handle_digits(s); 405 } 406 } 407 408 void handle_dot(j0_maker* s) { 409 write_byte(s, '0'); 410 write_byte(s, '.'); 411 advance(s); 412 413 if (!isdigit(s->current)) { 414 fail(s, 1, "expected/missing digits after decimal dot"); 415 } 416 handle_digits(s); 417 } 418 419 void handle_plus_number(j0_maker* s) { 420 advance(s); 421 422 if (s->current == '.') { 423 handle_dot(s); 424 return; 425 } 426 handle_number(s); 427 } 428 429 void handle_minus_number(j0_maker* s) { 430 write_byte(s, '-'); 431 advance(s); 432 433 if (s->current == '.') { 434 handle_dot(s); 435 return; 436 } 437 handle_number(s); 438 } 439 440 void handle_string_escape(j0_maker* s, int c) { 441 switch (c) { 442 case '"': 443 case '\\': 444 case 'b': 445 case 'f': 446 case 'n': 447 case 'r': 448 case 't': 449 write_byte(s, '\\'); 450 write_byte(s, c); 451 break; 452 453 case 'u': 454 write_byte(s, '\\'); 455 write_byte(s, 'u'); 456 for (size_t i = 0; i < 4; i++) { 457 advance(s); 458 if (s->current == EOF) { 459 fail(s, 1, "end of input before end of string"); 460 } 461 if (isdigit(s->current) || isalpha(s->current)) { 462 // write_byte(s, toupper(c)); 463 write_byte(s, c); 464 continue; 465 } 466 fail(s, 1, "invalid hexadecimal digit in string"); 467 } 468 break; 469 470 case 'x': 471 write_byte(s, '\\'); 472 write_byte(s, 
'u'); 473 write_byte(s, '0'); 474 write_byte(s, '0'); 475 for (size_t i = 0; i < 2; i++) { 476 advance(s); 477 if (s->current == EOF) { 478 fail(s, 1, "end of input before end of string"); 479 } 480 if (isdigit(s->current) || isalpha(s->current)) { 481 // write_byte(s, toupper(c)); 482 write_byte(s, c); 483 continue; 484 } 485 fail(s, 1, "invalid hexadecimal digit in string"); 486 } 487 break; 488 489 case '\'': 490 write_byte(s, '\''); 491 break; 492 493 default: 494 write_byte(s, s->current); 495 break; 496 } 497 } 498 499 void handle_string(j0_maker* s) { 500 const unsigned char quote = s->current; 501 bool escaped = false; 502 503 write_byte(s, '"'); 504 505 while (true) { 506 advance(s); 507 508 int c = s->current; 509 if (c == EOF) { 510 fail(s, 1, "input ended before string was close-quoted"); 511 } 512 513 if (escaped) { 514 handle_string_escape(s, c); 515 escaped = false; 516 continue; 517 } 518 519 switch (c) { 520 case '\\': 521 escaped = true; 522 break; 523 524 default: 525 if (c == quote) { 526 write_byte(s, '"'); 527 advance(s); 528 return; 529 } 530 531 write_byte(s, c); 532 break; 533 } 534 } 535 } 536 537 void handle_token(j0_maker* s); 538 539 void handle_array(j0_maker* s) { 540 size_t items_before = 0; 541 write_byte(s, '['); 542 advance(s); 543 544 while (true) { 545 seek_token(s); 546 if (s->current == EOF) { 547 fail(s, 1, "unclosed array"); 548 } 549 550 if (s->current == ',') { 551 advance(s); 552 continue; 553 } 554 555 if (s->current == ']') { 556 write_byte(s, ']'); 557 advance(s); 558 return; 559 } 560 561 if (items_before > 0) { 562 write_byte(s, ','); 563 } 564 handle_token(s); 565 items_before++; 566 } 567 } 568 569 void handle_unquoted_key(j0_maker* s) { 570 write_byte(s, '"'); 571 572 while (true) { 573 int c = s->current; 574 if (c == EOF) { 575 fail(s, 1, "input ended with an object key"); 576 } 577 578 write_byte(s, c); 579 advance(s); 580 581 c = s->current; 582 if (!isalpha(c) && !isdigit(c) && c != '_') { 583 break; 584 } 
585 } 586 587 write_byte(s, '"'); 588 } 589 590 void handle_object(j0_maker* s) { 591 size_t items_before = 0; 592 write_byte(s, '{'); 593 advance(s); 594 595 while (true) { 596 seek_token(s); 597 if (s->current == EOF) { 598 fail(s, 1, "unclosed object"); 599 } 600 601 if (s->current == ',') { 602 advance(s); 603 continue; 604 } 605 606 if (s->current == '}') { 607 write_byte(s, '}'); 608 advance(s); 609 return; 610 } 611 612 if (s->current == '"' || s->current == '\'') { 613 if (items_before > 0) { 614 write_byte(s, ','); 615 } 616 handle_string(s); 617 items_before++; 618 } else if (isalpha(s->current) || s->current == '_') { 619 if (items_before > 0) { 620 write_byte(s, ','); 621 } 622 handle_unquoted_key(s); 623 items_before++; 624 } else { 625 fail(s, 1, "only strings or identifiers can be object keys"); 626 } 627 628 seek_token(s); 629 if (s->current == EOF) { 630 fail(s, 1, "input ended after object-key and before value"); 631 } 632 633 if (s->current != ':') { 634 fail(s, 1, "a `:` must follow all object keys"); 635 } 636 637 write_byte(s, ':'); 638 advance(s); 639 640 seek_token(s); 641 if (s->current == EOF) { 642 fail(s, 1, "input ended after a `:` following an object-key"); 643 } 644 645 handle_token(s); 646 } 647 } 648 649 void (*dispatch[256])() = {}; 650 651 void handle_token(j0_maker* s) { 652 void (*fn)(j0_maker*) = NULL; 653 654 seek_token(s); 655 if (s->current == EOF) { 656 fail(s, 1, "expected a token"); 657 } 658 659 fn = dispatch[s->current]; 660 if (fn != NULL) { 661 fn(s); 662 } else { 663 unsigned char c = (unsigned char)s->current; 664 fprintf(stderr, "%c\n", c); 665 fail(s, 1, "invalid token"); 666 } 667 } 668 669 void handle_input(FILE* src) { 670 unsigned char ibuf[48 * 1024]; 671 unsigned char obuf[48 * 1024]; 672 673 j0_maker state; 674 j0_maker* s = &state; 675 s->ibuf = ibuf; 676 s->icap = sizeof(ibuf); 677 s->obuf = obuf; 678 s->ocap = sizeof(obuf); 679 restart_state(s, stdout, src); 680 681 if (s->current == EOF) { 682 fail(s, 
1, "empty input isn't valid JSON"); 683 } 684 685 handle_token(s); 686 seek_token(s); 687 if (!feof(src)) { 688 fail(s, 1, "unexpected trailing JSON data"); 689 } 690 write_byte(s, '\n'); 691 flush(s); 692 // fflush(stdout); 693 } 694 695 // run returns the error code 696 size_t run(int argc, char** argv) { 697 if (argc > 2) { 698 const char* msg = "can't use more than 1 named input"; 699 fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg); 700 return 1; 701 } 702 703 const char* fname = argv[1]; 704 705 // use stdin when not given a filepath, when the path is empty, or is `-` 706 if (argc <= 1 || fname[0] == 0 || (fname[0] == '-' && fname[1] == 0)) { 707 handle_input(stdin); 708 return 0; 709 } 710 711 FILE* f = fopen(fname, "rb"); 712 if (f == NULL) { 713 fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", fname); 714 return 1; 715 } 716 717 handle_input(f); 718 fclose(f); 719 720 return 0; 721 } 722 723 void init_dispatch() { 724 for (size_t i = 0; i < 256; i++) { 725 dispatch[i] = NULL; 726 } 727 728 for (size_t i = '0'; i <= '9'; i++) { 729 dispatch[i] = handle_number; 730 } 731 732 dispatch['n'] = handle_null; 733 dispatch['t'] = handle_true; 734 dispatch['f'] = handle_false; 735 dispatch['.'] = handle_dot; 736 dispatch['+'] = handle_plus_number; 737 dispatch['-'] = handle_minus_number; 738 dispatch['"'] = handle_string; 739 dispatch['\''] = handle_string; 740 dispatch['['] = handle_array; 741 dispatch['{'] = handle_object; 742 } 743 744 int main(int argc, char** argv) { 745 #ifdef _WIN32 746 setmode(fileno(stdin), O_BINARY); 747 // ensure output lines end in LF instead of CRLF on windows 748 setmode(fileno(stdout), O_BINARY); 749 setmode(fileno(stderr), O_BINARY); 750 #endif 751 752 // handle any of the help options, if given 753 if (argc > 1 && argv[1][0] == '-') { 754 const char* s = argv[1] + (argv[1][1] == '-' ? 
2 : 1); 755 if (strcmp(s, "h") == 0 || strcmp(s, "help") == 0) { 756 puts(info); 757 return 0; 758 } 759 } 760 761 // disable automatic stdio buffering, in favor of explicit buffering 762 setvbuf(stdin, NULL, _IONBF, 0); 763 setvbuf(stdout, NULL, _IONBF, 0); 764 setvbuf(stderr, NULL, _IONBF, 0); 765 766 init_dispatch(); 767 return run(argc, argv) == 0 ? 0 : 1; 768 }