File: j0.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2024 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 j0 [options...] [file...]
  27 
  28 
  29 Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.
  30 Its output is always a single line, which ends with a line-feed.
  31 
  32 Besides minimizing bytes, this tool also adapts almost-JSON input into
  33 valid JSON, since it
  34 
  35     - ignores both rest-of-line and multi-line comments
  36     - ignores extra/trailing commas in arrays and objects
  37     - turns single-quoted strings/keys into double-quoted strings
  38     - double-quotes unquoted object keys
  39     - changes \x 2-hex-digit into \u 4-hex-digit string-escapes
  40 
  41 The only option available can either start with a single or a double-dash
  42 
  43     -h   -help       show this help message
  44 */
  45 
  46 /*
  47 You can build this command-line app by running
  48     cc -Wall -s -O2 -o ./j0 ./j0.c
  49 */
  50 
  51 #include <ctype.h>
  52 #include <fcntl.h>
  53 #include <stdarg.h>
  54 #include <stdbool.h>
  55 #include <stdint.h>
  56 #include <stdio.h>
  57 #include <stdlib.h>
  58 #include <string.h>
  59 
  60 #ifdef _WIN32
  61 #include <windows.h>
  62 #endif
  63 
// info is the help message: main prints it to stdout with puts (which adds
// one more trailing line-feed) when the first command-line argument is any
// of the help options; its text mirrors the usage comment atop this file
const char* info = ""
"j0 [options...] [file...]\n"
"\n"
"\n"
"Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.\n"
"Its output is always a single line, which ends with a line-feed.\n"
"\n"
"Besides minimizing bytes, this tool also adapts almost-JSON input into\n"
"valid JSON, since it\n"
"\n"
"    - ignores both rest-of-line and multi-line comments\n"
"    - ignores extra/trailing commas in arrays and objects\n"
"    - turns single-quoted strings/keys into double-quoted strings\n"
"    - double-quotes unquoted object keys\n"
"    - changes \\x 2-hex-digit into \\u 4-hex-digit string-escapes\n"
"\n"
"The only option available can either start with a single or a double-dash\n"
"\n"
"    -h   -help       show this help message\n"
"";
  85 
// j0_maker holds all the state of a conversion: an explicitly-buffered input
// side, an explicitly-buffered output side, the counters used to locate
// errors, and a 2-slot window (current byte plus lookahead) over the input
typedef struct j0_maker {
    // the stream read_byte refills the input buffer from
    FILE* in;
    // the input buffer itself: its storage is provided by handle_input
    unsigned char* ibuf;
    size_t ilen; // how many bytes are being used in the input buffer
    size_t icap; // the input buffer's capacity
    size_t ipos; // the current position in the input buffer

    // the stream flush and write_bytes send buffered output to
    FILE* out;
    // the output buffer itself: its storage is provided by handle_input
    unsigned char* obuf;
    size_t olen; // how many bytes are being used in the output buffer
    size_t ocap; // the output buffer's capacity
    // NOTE: no function visible in this file reads or writes opos
    size_t opos; // the current position in the output buffer

    // both counters are updated for every byte read_byte hands out, so they
    // follow the bytes read so far, which includes the lookahead byte
    size_t line; // the current line, used to show useful error messages
    size_t pos;  // the position in the current line, for error messages

    // the byte being examined, or EOF when the input is over
    int current;
    // the byte right after the current one, or EOF: it becomes the current
    // byte on the next call to advance
    int next;
} j0_maker;
 105 
 106 void advance_reader_pos(j0_maker* r, unsigned char b) {
 107     r->ipos++;
 108     if (b == '\n') {
 109         r->line++;
 110         r->pos = 1;
 111     } else {
 112         r->pos++;
 113     }
 114 }
 115 
 116 // read_byte does as it says: check its return for the value EOF, before
 117 // using it as the next byte
 118 int read_byte(j0_maker* r) {
 119     if (r->ipos < r->ilen) {
 120         // inside current chunk
 121         const unsigned char b = r->ibuf[r->ipos];
 122         advance_reader_pos(r, b);
 123         return b;
 124     }
 125 
 126     // need to read the next block
 127     r->ipos = 0;
 128     r->ilen = fread(r->ibuf, sizeof(unsigned char), r->icap, r->in);
 129     if (r->ilen > 0) {
 130         const unsigned char b = r->ibuf[r->ipos];
 131         advance_reader_pos(r, b);
 132         return b;
 133     }
 134 
 135     // reached the end of data
 136     return EOF;
 137 }
 138 
 139 void advance(j0_maker* r) {
 140     r->current = r->next;
 141     r->next = read_byte(r);
 142 }
 143 
// forward declaration: fail is defined further down, after the output
// helpers it relies on, but the comment-skipping funcs below need it
void fail(j0_maker* s, size_t code, const char* fmt, ...);
 145 
 146 void skip_line(j0_maker* r) {
 147     while (true) {
 148         advance(r);
 149         if (r->current == EOF) {
 150             break;
 151         }
 152 
 153         if (r->current == '\n') {
 154             advance(r);
 155             break;
 156         }
 157     }
 158 }
 159 
 160 void skip_multiline_comment(j0_maker* r) {
 161     unsigned char prev = 0;
 162 
 163     while (true) {
 164         advance(r);
 165 
 166         if (r->current == EOF) {
 167             break;
 168         }
 169 
 170         if (prev == '*' && r->current == '/') {
 171             advance(r);
 172             break;
 173         }
 174 
 175         prev = (unsigned char)r->current;
 176     }
 177 }
 178 
 179 void skip_comment(j0_maker* r) {
 180     if (r->current != '/') {
 181         fail(r, 1, "expected a slash to start comments");
 182     }
 183     advance(r);
 184 
 185     if (r->current == '/') {
 186         skip_line(r);
 187         return;
 188     }
 189 
 190     if (r->current == '*') {
 191         skip_multiline_comment(r);
 192         return;
 193     }
 194 
 195     fail(r, 1, "expected `//` or `/*` to start comments");
 196 }
 197 
 198 void seek_token(j0_maker* r) {
 199     while (true) {
 200         if (r->current != EOF && r->current <= 32) {
 201             advance(r);
 202             continue;
 203         }
 204 
 205         if (r->current == '/') {
 206             skip_comment(r);
 207             continue;
 208         }
 209 
 210         break;
 211     }
 212 }
 213 
 214 bool starts_with_bom(const unsigned char* b, const size_t n) {
 215     return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
 216 }
 217 
 218 void restart_state(j0_maker* s, FILE* w, FILE* r) {
 219     s->in = r;
 220     s->ilen = 0;
 221     s->ipos = 0;
 222 
 223     s->out = w;
 224     s->olen = 0;
 225 
 226     s->line = 1;
 227     s->pos = 1;
 228 
 229     s->current = EOF;
 230     s->next = EOF;
 231 
 232     s->current = read_byte(s);
 233     if (s->current == EOF) {
 234         return;
 235     }
 236     s->next = read_byte(s);
 237 
 238     // skip leading UTF-8 BOM (byte-order mark), if present
 239     if (starts_with_bom(s->ibuf, s->ilen)) {
 240         // a UTF-8 BOM has 3 bytes
 241         for (size_t i = 0; i < 3 && s->current != EOF; i++) {
 242             advance(s);
 243         }
 244     }
 245 }
 246 
 247 // flush does as it says: it empties the buffer after ensuring its bytes end
 248 // on their intended destination
 249 void flush(j0_maker* w) {
 250     if (w->olen > 0 && fwrite(w->obuf, w->olen, 1, w->out) < 1) {
 251         exit(0);
 252     }
 253     w->olen = 0;
 254 }
 255 
 256 // write_bytes does as it says, minimizing the number of calls to fwrite
 257 void write_bytes(j0_maker* w, const unsigned char* src, size_t len) {
 258     if (w->olen + len < w->ocap) {
 259         // all bytes fit into buffer
 260         memcpy(w->obuf + w->olen, src, len);
 261         w->olen += len;
 262         return;
 263     }
 264 
 265     // ensure current buffer bytes go out, before crossing strides
 266     flush(w);
 267 
 268     // emit all chunks striding beyond/at the buffer's capacity
 269     for (; len >= w->ocap; src += w->ocap, len -= w->ocap) {
 270         if (fwrite(src, w->ocap, 1, w->out) < 1) {
 271             if (feof(w->out)) {
 272                 exit(0);
 273             }
 274             return;
 275         }
 276     }
 277 
 278     // now all, if any, remaining bytes will fit into the buffer
 279     memcpy(w->obuf, src, len);
 280     w->olen += len;
 281 }
 282 
 283 // write_byte does as it says
 284 void write_byte(j0_maker* w, unsigned char b) {
 285     if (w->olen >= w->ocap) {
 286         flush(w);
 287     }
 288     w->obuf[w->olen] = b;
 289     w->olen++;
 290 }
 291 
 292 void debug(j0_maker* s, const char* fmt, ...) {
 293     va_list args;
 294     va_start(args, fmt);
 295 
 296     flush(s);
 297     fflush(s->out);
 298 
 299     if (s->in != stdin) {
 300         fclose(s->in);
 301     }
 302 
 303     write_byte(s, '\n');
 304     flush(s);
 305     fflush(stdout);
 306 
 307     fprintf(stderr, "\x1b[46m\x1b[37mline %lu, pos %lu: ", s->line, s->pos);
 308     fprintf(stderr, fmt, args);
 309     fprintf(stderr, "\x1b[0m\n");
 310 
 311     va_end(args);
 312 
 313     exit(10);
 314 }
 315 
 316 void fail(j0_maker* s, size_t code, const char* fmt, ...) {
 317     va_list args;
 318     va_start(args, fmt);
 319 
 320     if (s->in != stdin) {
 321         fclose(s->in);
 322     }
 323 
 324     write_byte(s, '\n');
 325     flush(s);
 326     fflush(stdout);
 327 
 328     fprintf(stderr, "\x1b[31mline %lu, pos %lu: ", s->line, s->pos);
 329     fprintf(stderr, fmt, args);
 330     fprintf(stderr, "\x1b[0m\n");
 331 
 332     va_end(args);
 333 
 334     exit(code);
 335 }
 336 
 337 bool demand_keyword(j0_maker* s, char* rest) {
 338     for (; rest[0] != 0; rest++) {
 339         if (s->current == EOF || s->current != rest[0]) {
 340             return false;
 341         }
 342         advance(s);
 343     }
 344 
 345     return rest[0] == 0;
 346 }
 347 
 348 void handle_null(j0_maker* s) {
 349     if (!demand_keyword(s, "null")) {
 350         fail(s, 1, "expected null keyword");
 351     }
 352     write_bytes(s, (unsigned char*)"null", 4);
 353 }
 354 
 355 void handle_true(j0_maker* s) {
 356     if (!demand_keyword(s, "true")) {
 357         fail(s, 1, "expected `true` keyword");
 358     }
 359     write_bytes(s, (unsigned char*)"true", 4);
 360 }
 361 
 362 void handle_false(j0_maker* s) {
 363     if (!demand_keyword(s, "false")) {
 364         fail(s, 1, "expected `false` keyword");
 365     }
 366     write_bytes(s, (unsigned char*)"false", 5);
 367 }
 368 
 369 void handle_digits(j0_maker* s) {
 370     if (!isdigit(s->current)) {
 371         fail(s, 1, "expected/missing digits");
 372     }
 373 
 374     while (isdigit(s->current)) {
 375         write_byte(s, s->current);
 376         flush(s);
 377         fflush(stdout);
 378         advance(s);
 379     }
 380 }
 381 
 382 void handle_number(j0_maker* s) {
 383     handle_digits(s);
 384 
 385     if (s->current == '.') {
 386         write_byte(s, '.');
 387         advance(s);
 388 
 389         if (isdigit(s->current)) {
 390             handle_digits(s);
 391         } else {
 392             write_byte(s, '0');
 393         }
 394         return;
 395     }
 396 
 397     if (s->current == 'e' || s->current == 'E') {
 398         if (s->current == '+') {
 399             advance(s);
 400         } else if (s->current == '-') {
 401             write_byte(s, '-');
 402             advance(s);
 403         }
 404         handle_digits(s);
 405     }
 406 }
 407 
 408 void handle_dot(j0_maker* s) {
 409     write_byte(s, '0');
 410     write_byte(s, '.');
 411     advance(s);
 412 
 413     if (!isdigit(s->current)) {
 414         fail(s, 1, "expected/missing digits after decimal dot");
 415     }
 416     handle_digits(s);
 417 }
 418 
 419 void handle_plus_number(j0_maker* s) {
 420     advance(s);
 421 
 422     if (s->current == '.') {
 423         handle_dot(s);
 424         return;
 425     }
 426     handle_number(s);
 427 }
 428 
 429 void handle_minus_number(j0_maker* s) {
 430     write_byte(s, '-');
 431     advance(s);
 432 
 433     if (s->current == '.') {
 434         handle_dot(s);
 435         return;
 436     }
 437     handle_number(s);
 438 }
 439 
 440 void handle_string_escape(j0_maker* s, int c) {
 441     switch (c) {
 442         case '"':
 443         case '\\':
 444         case 'b':
 445         case 'f':
 446         case 'n':
 447         case 'r':
 448         case 't':
 449             write_byte(s, '\\');
 450             write_byte(s, c);
 451             break;
 452 
 453         case 'u':
 454             write_byte(s, '\\');
 455             write_byte(s, 'u');
 456             for (size_t i = 0; i < 4; i++) {
 457                 advance(s);
 458                 if (s->current == EOF) {
 459                     fail(s, 1, "end of input before end of string");
 460                 }
 461                 if (isdigit(s->current) || isalpha(s->current)) {
 462                     // write_byte(s, toupper(c));
 463                     write_byte(s, c);
 464                     continue;
 465                 }
 466                 fail(s, 1, "invalid hexadecimal digit in string");
 467             }
 468             break;
 469 
 470         case 'x':
 471             write_byte(s, '\\');
 472             write_byte(s, 'u');
 473             write_byte(s, '0');
 474             write_byte(s, '0');
 475             for (size_t i = 0; i < 2; i++) {
 476                 advance(s);
 477                 if (s->current == EOF) {
 478                     fail(s, 1, "end of input before end of string");
 479                 }
 480                 if (isdigit(s->current) || isalpha(s->current)) {
 481                     // write_byte(s, toupper(c));
 482                     write_byte(s, c);
 483                     continue;
 484                 }
 485                 fail(s, 1, "invalid hexadecimal digit in string");
 486             }
 487             break;
 488 
 489         case '\'':
 490             write_byte(s, '\'');
 491             break;
 492 
 493         default:
 494             write_byte(s, s->current);
 495             break;
 496     }
 497 }
 498 
 499 void handle_string(j0_maker* s) {
 500     const unsigned char quote = s->current;
 501     bool escaped = false;
 502 
 503     write_byte(s, '"');
 504 
 505     while (true) {
 506         advance(s);
 507 
 508         int c = s->current;
 509         if (c == EOF) {
 510             fail(s, 1, "input ended before string was close-quoted");
 511         }
 512 
 513         if (escaped) {
 514             handle_string_escape(s, c);
 515             escaped = false;
 516             continue;
 517         }
 518 
 519         switch (c) {
 520             case '\\':
 521                 escaped = true;
 522                 break;
 523 
 524             default:
 525                 if (c == quote) {
 526                     write_byte(s, '"');
 527                     advance(s);
 528                     return;
 529                 }
 530 
 531                 write_byte(s, c);
 532                 break;
 533         }
 534     }
 535 }
 536 
// forward declaration: arrays and objects are made of values, which func
// handle_token handles, and which can themselves be arrays and objects
void handle_token(j0_maker* s);
 538 
 539 void handle_array(j0_maker* s) {
 540     size_t items_before = 0;
 541     write_byte(s, '[');
 542     advance(s);
 543 
 544     while (true) {
 545         seek_token(s);
 546         if (s->current == EOF) {
 547             fail(s, 1, "unclosed array");
 548         }
 549 
 550         if (s->current == ',') {
 551             advance(s);
 552             continue;
 553         }
 554 
 555         if (s->current == ']') {
 556             write_byte(s, ']');
 557             advance(s);
 558             return;
 559         }
 560 
 561         if (items_before > 0) {
 562             write_byte(s, ',');
 563         }
 564         handle_token(s);
 565         items_before++;
 566     }
 567 }
 568 
 569 void handle_unquoted_key(j0_maker* s) {
 570     write_byte(s, '"');
 571 
 572     while (true) {
 573         int c = s->current;
 574         if (c == EOF) {
 575             fail(s, 1, "input ended with an object key");
 576         }
 577 
 578         write_byte(s, c);
 579         advance(s);
 580 
 581         c = s->current;
 582         if (!isalpha(c) && !isdigit(c) && c != '_') {
 583             break;
 584         }
 585     }
 586 
 587     write_byte(s, '"');
 588 }
 589 
 590 void handle_object(j0_maker* s) {
 591     size_t items_before = 0;
 592     write_byte(s, '{');
 593     advance(s);
 594 
 595     while (true) {
 596         seek_token(s);
 597         if (s->current == EOF) {
 598             fail(s, 1, "unclosed object");
 599         }
 600 
 601         if (s->current == ',') {
 602             advance(s);
 603             continue;
 604         }
 605 
 606         if (s->current == '}') {
 607             write_byte(s, '}');
 608             advance(s);
 609             return;
 610         }
 611 
 612         if (s->current == '"' || s->current == '\'') {
 613             if (items_before > 0) {
 614                 write_byte(s, ',');
 615             }
 616             handle_string(s);
 617             items_before++;
 618         } else if (isalpha(s->current) || s->current == '_') {
 619             if (items_before > 0) {
 620                 write_byte(s, ',');
 621             }
 622             handle_unquoted_key(s);
 623             items_before++;
 624         } else {
 625             fail(s, 1, "only strings or identifiers can be object keys");
 626         }
 627 
 628         seek_token(s);
 629         if (s->current == EOF) {
 630             fail(s, 1, "input ended after object-key and before value");
 631         }
 632 
 633         if (s->current != ':') {
 634             fail(s, 1, "a `:` must follow all object keys");
 635         }
 636 
 637         write_byte(s, ':');
 638         advance(s);
 639 
 640         seek_token(s);
 641         if (s->current == EOF) {
 642             fail(s, 1, "input ended after a `:` following an object-key");
 643         }
 644 
 645         handle_token(s);
 646     }
 647 }
 648 
 649 void (*dispatch[256])() = {};
 650 
 651 void handle_token(j0_maker* s) {
 652     void (*fn)(j0_maker*) = NULL;
 653 
 654     seek_token(s);
 655     if (s->current == EOF) {
 656         fail(s, 1, "expected a token");
 657     }
 658 
 659     fn = dispatch[s->current];
 660     if (fn != NULL) {
 661         fn(s);
 662     } else {
 663         unsigned char c = (unsigned char)s->current;
 664         fprintf(stderr, "%c\n", c);
 665         fail(s, 1, "invalid token");
 666     }
 667 }
 668 
 669 void handle_input(FILE* src) {
 670     unsigned char ibuf[48 * 1024];
 671     unsigned char obuf[48 * 1024];
 672 
 673     j0_maker state;
 674     j0_maker* s = &state;
 675     s->ibuf = ibuf;
 676     s->icap = sizeof(ibuf);
 677     s->obuf = obuf;
 678     s->ocap = sizeof(obuf);
 679     restart_state(s, stdout, src);
 680 
 681     if (s->current == EOF) {
 682         fail(s, 1, "empty input isn't valid JSON");
 683     }
 684 
 685     handle_token(s);
 686     seek_token(s);
 687     if (!feof(src)) {
 688         fail(s, 1, "unexpected trailing JSON data");
 689     }
 690     write_byte(s, '\n');
 691     flush(s);
 692     // fflush(stdout);
 693 }
 694 
 695 // run returns the error code
 696 size_t run(int argc, char** argv) {
 697     if (argc > 2) {
 698         const char* msg = "can't use more than 1 named input";
 699         fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg);
 700         return 1;
 701     }
 702 
 703     const char* fname = argv[1];
 704 
 705     // use stdin when not given a filepath, when the path is empty, or is `-`
 706     if (argc <= 1 || fname[0] == 0 || (fname[0] == '-' && fname[1] == 0)) {
 707         handle_input(stdin);
 708         return 0;
 709     }
 710 
 711     FILE* f = fopen(fname, "rb");
 712     if (f == NULL) {
 713         fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", fname);
 714         return 1;
 715     }
 716 
 717     handle_input(f);
 718     fclose(f);
 719 
 720     return 0;
 721 }
 722 
 723 void init_dispatch() {
 724     for (size_t i = 0; i < 256; i++) {
 725         dispatch[i] = NULL;
 726     }
 727 
 728     for (size_t i = '0'; i <= '9'; i++) {
 729         dispatch[i] = handle_number;
 730     }
 731 
 732     dispatch['n'] = handle_null;
 733     dispatch['t'] = handle_true;
 734     dispatch['f'] = handle_false;
 735     dispatch['.'] = handle_dot;
 736     dispatch['+'] = handle_plus_number;
 737     dispatch['-'] = handle_minus_number;
 738     dispatch['"'] = handle_string;
 739     dispatch['\''] = handle_string;
 740     dispatch['['] = handle_array;
 741     dispatch['{'] = handle_object;
 742 }
 743 
 744 int main(int argc, char** argv) {
 745 #ifdef _WIN32
 746     setmode(fileno(stdin), O_BINARY);
 747     // ensure output lines end in LF instead of CRLF on windows
 748     setmode(fileno(stdout), O_BINARY);
 749     setmode(fileno(stderr), O_BINARY);
 750 #endif
 751 
 752     // handle any of the help options, if given
 753     if (argc > 1 && argv[1][0] == '-') {
 754         const char* s = argv[1] + (argv[1][1] == '-' ? 2 : 1);
 755         if (strcmp(s, "h") == 0 || strcmp(s, "help") == 0) {
 756             puts(info);
 757             return 0;
 758         }
 759     }
 760 
 761     // disable automatic stdio buffering, in favor of explicit buffering
 762     setvbuf(stdin, NULL, _IONBF, 0);
 763     setvbuf(stdout, NULL, _IONBF, 0);
 764     setvbuf(stderr, NULL, _IONBF, 0);
 765 
 766     init_dispatch();
 767     return run(argc, argv) == 0 ? 0 : 1;
 768 }