File: nj.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O3 -march=native -mtune=native -flto -o ./nj ./nj.c
  29 
  30 Building with COMPACT_OUTPUT defined makes `nj` output many fewer bytes, at
  31 the cost of using arguably worse colors. You can do that by running
  32 
  33 cc -s -O3 -march=native -mtune=native -flto -D COMPACT_OUTPUT -o ./nj ./nj.c
  34 
  35 Building for macos always uses COMPACT_OUTPUT, as the default terminal app
  36 there still doesn't support rgb colors.
  37 */
  38 
  39 #include <ctype.h>
  40 #include <stdarg.h>
  41 #include <stdbool.h>
  42 #include <stdint.h>
  43 #include <stdio.h>
  44 #include <stdlib.h>
  45 #include <string.h>
  46 
  47 #ifdef _WIN32
  48 #include <fcntl.h>
  49 #include <windows.h>
  50 #endif
  51 
  52 #ifdef RED_ERRORS
  53 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  54 #ifdef __APPLE__
  55 #define ERROR_STYLE "\x1b[31m"
  56 #endif
  57 #define ERROR_LINE(MSG) (ERROR_STYLE MSG "\x1b[0m\n")
  58 #else
  59 #define ERROR_LINE(MSG) (MSG "\n")
  60 #endif
  61 
  62 #ifdef __APPLE__
  63 #define COMPACT_OUTPUT
  64 #endif
  65 
  66 #ifndef IBUF_SIZE
  67 #define IBUF_SIZE (32 * 1024)
  68 #endif
  69 
  70 #ifndef OBUF_SIZE
  71 #define OBUF_SIZE (8 * 1024)
  72 #endif
  73 
  74 #ifndef INDENTATION
  75 #define INDENTATION 2
  76 #endif
  77 
  78 // CONST_SLICE initializes a slice struct using the string-constant given
  79 #define CONST_SLICE(s, x) init_slice(s, (unsigned char*)x, sizeof(x) - 1)
  80 
  81 // EMIT_CONST emits string constants without their final null byte
  82 #define EMIT_CONST(w, x) write_bytes(w, (unsigned char*)x, sizeof(x) - 1)
  83 
  84 #define RESET_STYLE "\x1b[0m"
  85 
  86 #ifdef COMPACT_OUTPUT
  87 #define NULL_STYLE "\x1b[33m"
  88 #define BOOL_STYLE "\x1b[36m"
  89 #define NUMBER_STYLE "\x1b[32m"
  90 #define NEGATIVE_STYLE "\x1b[31m"
  91 #define KEY_STYLE "\x1b[35m"
  92 // #define SYNTAX_STYLE "\x1b[37m"
  93 #define SYNTAX_STYLE "\x1b[33m"
  94 // #define NULL_STYLE "\x1b[38;5;248m"
  95 // #define BOOL_STYLE "\x1b[38;5;74m"
  96 // #define NUMBER_STYLE "\x1b[38;5;29m"
  97 // #define NEGATIVE_STYLE "\x1b[38;5;1m"
  98 // #define KEY_STYLE "\x1b[38;5;99m"
  99 // #define SYNTAX_STYLE "\x1b[38;5;248m"
 100 #else
 101 #define NULL_STYLE "\x1b[38;2;168;168;168m"
 102 #define BOOL_STYLE "\x1b[38;2;95;175;215m"
 103 #define NUMBER_STYLE "\x1b[38;2;0;135;95m"
 104 #define NEGATIVE_STYLE "\x1b[38;2;204;0;0m"
 105 // #define KEY_STYLE "\x1b[38;2;135;95;255m"
 106 #define KEY_STYLE "\x1b[38;2;135;135;225m"
 107 #define SYNTAX_STYLE "\x1b[38;2;168;168;168m"
 108 #endif
 109 
 110 const char* info = ""
 111 "nj [options...] [file...]\n"
 112 "\n"
 113 "\n"
 114 "Nice Json converts/fixes JSON/pseudo-JSON input into ANSI-styled multi-line\n"
 115 "JSON which uses 2 spaces for each indentation level.\n"
 116 "\n"
 117 "Besides styling and indenting JSON, this tool also adapts almost-JSON input\n"
 118 "into valid JSON, since it\n"
 119 "\n"
 120 "    - ignores both rest-of-line and multi-line comments\n"
 121 "    - ignores extra/trailing commas in arrays and objects\n"
 122 "    - turns single-quoted strings/keys into double-quoted strings\n"
 123 "    - double-quotes unquoted object keys\n"
 124 "    - changes \\x 2-hex-digit into \\u 4-hex-digit string-escapes\n"
 125 "\n"
 126 "All options available can either start with a single or a double-dash\n"
 127 "\n"
 128 "    -h        show this help message\n"
 129 "    -help     show this help message\n"
 130 "";
 131 
// slice pairs a pointer to bytes with how many of those bytes to use
typedef struct slice {
    unsigned char* ptr; // where the bytes start
    size_t len;         // how many bytes the slice has
} slice;
 136 
 137 static inline void init_slice(slice* s, unsigned char* ptr, size_t len) {
 138     s->ptr = ptr;
 139     s->len = len;
 140 }
 141 
// nj_maker has all the state needed to turn an input stream into styled
// JSON output: buffered input/output, position info, and a 1-byte lookahead
typedef struct nj_maker {
    FILE* in;  // where input bytes are read from
    FILE* out; // where output bytes are written to

    unsigned char* ibuf; // the input buffer
    size_t ilen; // how many bytes are being used in the input buffer
    size_t icap; // the input buffer's capacity
    size_t ipos; // the current position in the input buffer

    size_t line; // the current line, used to show useful error messages
    size_t pos;  // the position in the current line, for error messages

    unsigned char* obuf; // the output buffer
    size_t ocap; // the output buffer's capacity
    size_t opos; // the current position in the output buffer

    ssize_t level; // the current indentation/nesting level

    int current; // the byte being handled, or EOF
    int next;    // the byte right after the current one, or EOF
} nj_maker;
 163 
 164 // advance_reader_pos helps func read_byte do its job
 165 static inline void advance_reader_pos(nj_maker* r, unsigned char b) {
 166     r->ipos++;
 167     if (b == '\n') {
 168         r->line++;
 169         r->pos = 1;
 170     } else {
 171         r->pos++;
 172     }
 173 }
 174 
 175 // read_byte does as it says: check its return for the value EOF, before
 176 // using it as the next byte
 177 int read_byte(nj_maker* r) {
 178     if (r->ipos < r->ilen) {
 179         // inside current chunk
 180         const unsigned char b = r->ibuf[r->ipos];
 181         advance_reader_pos(r, b);
 182         return b;
 183     }
 184 
 185     // need to read the next block
 186     r->ipos = 0;
 187     r->ilen = fread(r->ibuf, sizeof(unsigned char), r->icap, r->in);
 188     if (r->ilen > 0) {
 189         const unsigned char b = r->ibuf[r->ipos];
 190         advance_reader_pos(r, b);
 191         return b;
 192     }
 193 
 194     // reached the end of data
 195     return EOF;
 196 }
 197 
 198 // advance is used in most of the code, instead of calling read_byte directly
 199 static inline void advance(nj_maker* r) {
 200     r->current = r->next;
 201     r->next = read_byte(r);
 202 }
 203 
// fail is defined later in this file, but the comment-skipping funcs need it
void fail(nj_maker* m, int code, const char* msg);
 205 
 206 void skip_line(nj_maker* r) {
 207     while (true) {
 208         advance(r);
 209         const int lead = r->current;
 210 
 211         if (lead == EOF) {
 212             break;
 213         }
 214 
 215         if (lead == '\n') {
 216             advance(r);
 217             break;
 218         }
 219     }
 220 }
 221 
 222 void skip_multiline_comment(nj_maker* r) {
 223     unsigned char prev = 0;
 224 
 225     while (true) {
 226         advance(r);
 227         const int lead = r->current;
 228 
 229         if (lead == EOF) {
 230             break;
 231         }
 232 
 233         if (prev == '*' && lead == '/') {
 234             advance(r);
 235             break;
 236         }
 237 
 238         prev = (unsigned char)lead;
 239     }
 240 }
 241 
 242 void skip_comment(nj_maker* r) {
 243     int lead = r->current;
 244 
 245     if (lead == '#') {
 246         skip_line(r);
 247         return;
 248     }
 249 
 250     if (lead != '/') {
 251         fail(r, 1, "expected a slash to start comments");
 252     }
 253 
 254     advance(r);
 255     lead = r->current;
 256 
 257     if (lead == '/') {
 258         skip_line(r);
 259         return;
 260     }
 261 
 262     if (lead == '*') {
 263         skip_multiline_comment(r);
 264         return;
 265     }
 266 
 267     fail(r, 1, "expected `//` or `/*` to start comments");
 268 }
 269 
 270 static inline void seek_token(nj_maker* r) {
 271     while (true) {
 272         const int lead = r->current;
 273 
 274         if (lead != EOF && lead <= ' ') {
 275             advance(r);
 276             continue;
 277         }
 278 
 279         if (lead == '/' || lead == '#') {
 280             skip_comment(r);
 281             continue;
 282         }
 283 
 284         break;
 285     }
 286 }
 287 
 288 bool starts_with_bom(const unsigned char* b, const size_t n) {
 289     return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
 290 }
 291 
 292 void restart_state(nj_maker* m, FILE* w, FILE* r) {
 293     m->in = r;
 294     m->ilen = 0;
 295     m->ipos = 0;
 296 
 297     m->out = w;
 298     m->opos = 0;
 299 
 300     m->line = 1;
 301     m->pos = 1;
 302 
 303     m->current = EOF;
 304     m->next = EOF;
 305 
 306     m->current = read_byte(m);
 307     if (m->current == EOF) {
 308         return;
 309     }
 310     m->next = read_byte(m);
 311 
 312     m->level = 0;
 313 
 314     // skip leading UTF-8 BOM (byte-order mark), if present
 315     if (starts_with_bom(m->ibuf, m->ilen)) {
 316         // a UTF-8 BOM has 3 bytes
 317         for (size_t i = 0; i < 3 && m->current != EOF; i++) {
 318             advance(m);
 319         }
 320     }
 321 }
 322 
 323 void write_byte(nj_maker* m, unsigned char b) {
 324     if (m->opos < m->ocap) {
 325         m->obuf[m->opos++] = b;
 326         return;
 327     }
 328 
 329     fwrite(m->obuf, 1, m->ocap, m->out);
 330     m->obuf[0] = b;
 331     m->opos = 1;
 332 }
 333 
 334 // write_bytes does as it says, minimizing the number of calls to fwrite
 335 void write_bytes(nj_maker* m, const unsigned char* src, size_t len) {
 336     const size_t rem = m->ocap - m->opos;
 337     if (len < rem) {
 338         memcpy(m->obuf + m->opos, src, len);
 339         m->opos += len;
 340         return;
 341     }
 342 
 343     for (size_t i = 0; i < len; i++) {
 344         write_byte(m, src[i]);
 345     }
 346 }
 347 
 348 void write_spaces(nj_maker* m, ssize_t n) {
 349     const unsigned char spaces[32] = "                                ";
 350     while (n > sizeof(spaces)) {
 351         write_bytes(m, spaces, sizeof(spaces));
 352         n -= sizeof(spaces);
 353     }
 354     if (n > 0) {
 355         write_bytes(m, spaces, n);
 356     }
 357 }
 358 
 359 static inline void indent(nj_maker* m) {
 360     write_spaces(m, INDENTATION * m->level);
 361 }
 362 
 363 void flush(nj_maker* m) {
 364     if (m->opos > 0) {
 365         fwrite(m->obuf, 1, m->opos, m->out);
 366     }
 367     m->opos = 0;
 368     fflush(m->out);
 369 }
 370 
 371 // https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
 372 
 373 static inline bool check_2_byte_rune(int a, int b) {
 374     return (0xc2 <= a && a <= 0xdf) && (0x80 <= b && b <= 0xbf);
 375 }
 376 
 377 bool check_3_byte_rune(int a, int b, int c) {
 378     return (
 379         (a == 0xe0) &&
 380         (0xa0 <= b && b <= 0xbf) &&
 381         (0x80 <= c && c <= 0xbf)
 382     ) || (
 383         (0xe1 <= a && a <= 0xec) &&
 384         (0x80 <= b && b <= 0xbf) &&
 385         (0x80 <= c && c <= 0xbf)
 386     ) || (
 387         (a == 0xed) &&
 388         (0x80 <= b && b <= 0x9f) &&
 389         (0x80 <= c && c <= 0xbf)
 390     ) || (
 391         (a == 0xee || a == 0xef) &&
 392         (0x80 <= b && b <= 0xbf) &&
 393         (0x80 <= c && c <= 0xbf)
 394     );
 395 }
 396 
 397 bool check_4_byte_rune(int a, int b, int c, int d) {
 398     return (
 399         (a == 0xf0) &&
 400         (0x90 <= b && b <= 0xbf) &&
 401         (0x80 <= c && c <= 0xbf) &&
 402         (0x80 <= d && d <= 0xbf)
 403     ) || (
 404         (a == 0xf1 || a == 0xf3) &&
 405         (0x80 <= b && b <= 0xbf) &&
 406         (0x80 <= c && c <= 0xbf) &&
 407         (0x80 <= d && d <= 0xbf)
 408     ) || (
 409         (a == 0xf4) &&
 410         (0x80 <= b && b <= 0xbf) &&
 411         (0x80 <= c && c <= 0x8f) &&
 412         (0x80 <= d && d <= 0xbf)
 413     );
 414 }
 415 
 416 // write_replacement_char is the recommended action to handle invalid bytes
 417 void write_replacement_char(nj_maker* m) {
 418     write_byte(m, 0xef);
 419     write_byte(m, 0xbf);
 420     write_byte(m, 0xbd);
 421 }
 422 
// handle_invalid_rune is the single place which decides what happens when
// the input has invalid UTF-8 bytes
void handle_invalid_rune(nj_maker* m) {
    // invalid bytes are replaced, instead of being treated as fatal errors
    write_replacement_char(m);
}
 427 
 428 // write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8
 429 void write_rune(nj_maker* m, uint32_t rune) {
 430     if (rune < (1 << 7)) {
 431         write_byte(m, rune);
 432         return;
 433     }
 434 
 435     if (rune < (1 << (5 + 6))) {
 436         const int a = 0b11000000 | (rune >> 6);
 437         const int b = 0b10000000 | (rune & 0b00111111);
 438         if (check_2_byte_rune(a, b)) {
 439             write_byte(m, a);
 440             write_byte(m, b);
 441         } else {
 442             write_replacement_char(m);
 443         }
 444         return;
 445     }
 446 
 447     if (rune < (1 << (4 + 6 + 6))) {
 448         const int a = 0b11100000 | (rune >> 12);
 449         const int b = 0b10000000 | ((rune >> 6) & 0b00111111);
 450         const int c = 0b10000000 | (rune & 0b00111111);
 451         if (check_3_byte_rune(a, b, c)) {
 452             write_byte(m, a);
 453             write_byte(m, b);
 454             write_byte(m, c);
 455         } else {
 456             write_replacement_char(m);
 457         }
 458         return;
 459     }
 460 
 461     if (rune < (1 << (3 + 6 + 6 + 6))) {
 462         const int a = 0b11110000 | (rune >> 18);
 463         const int b = 0b10000000 | ((rune >> 12) & 0b00111111);
 464         const int c = 0b10000000 | ((rune >> 6) & 0b00111111);
 465         const int d = 0b10000000 | (rune & 0b00111111);
 466         if (check_4_byte_rune(a, b, c, d)) {
 467             write_byte(m, a);
 468             write_byte(m, b);
 469             write_byte(m, c);
 470             write_byte(m, d);
 471         } else {
 472             write_replacement_char(m);
 473         }
 474         return;
 475     }
 476 
 477     write_replacement_char(m);
 478 }
 479 
 480 void copy_utf8_rune(nj_maker* m) {
 481     const int a = m->current;
 482 
 483     if (a == EOF) {
 484         return;
 485     }
 486 
 487     // handle 1-byte runes
 488     if (a < 128) {
 489         write_byte(m, a);
 490         return;
 491     }
 492 
 493     advance(m);
 494     const int b = m->current;
 495 
 496     if (b == EOF) {
 497         handle_invalid_rune(m);
 498         return;
 499     }
 500 
 501     // handle 2-byte runes
 502     if (check_2_byte_rune(a, b)) {
 503         write_byte(m, a);
 504         write_byte(m, b);
 505         return;
 506     }
 507 
 508     advance(m);
 509     const int c = m->current;
 510 
 511     if (c == EOF) {
 512         handle_invalid_rune(m);
 513         return;
 514     }
 515 
 516     // handle 3-byte runes
 517     if (check_3_byte_rune(a, b, c)) {
 518         write_byte(m, a);
 519         write_byte(m, b);
 520         write_byte(m, c);
 521         return;
 522     }
 523 
 524     advance(m);
 525     const int d = m->current;
 526 
 527     if (d == EOF) {
 528         handle_invalid_rune(m);
 529         return;
 530     }
 531 
 532     // handle 4-byte runes
 533     if (check_4_byte_rune(a, b, c, d)) {
 534         write_byte(m, a);
 535         write_byte(m, b);
 536         write_byte(m, c);
 537         write_byte(m, d);
 538         return;
 539     }
 540 
 541     handle_invalid_rune(m);
 542 }
 543 
 544 // debug is available to diagnose any bug found
 545 void debug(nj_maker* m, const char* fmt, ...) {
 546     va_list args;
 547     va_start(args, fmt);
 548 
 549     if (m->in != stdin) {
 550         fclose(m->in);
 551     }
 552 
 553     write_byte(m, '\n');
 554 
 555     const unsigned long line = m->line;
 556     const unsigned long pos = m->pos;
 557     fprintf(stderr, "\x1b[46m\x1b[37mline %lu, pos %lu: ", line, pos);
 558     fprintf(stderr, fmt, args);
 559     fprintf(stderr, "\x1b[0m\n");
 560 
 561     va_end(args);
 562 
 563     exit(10);
 564 }
 565 
 566 // fail quits this app right after showing the error message given
 567 void fail(nj_maker* m, int code, const char* msg) {
 568     const unsigned long line = m->line;
 569     const unsigned long pos = m->pos;
 570 
 571     EMIT_CONST(m, "\x1b[0m");
 572     write_byte(m, '\n');
 573     flush(m);
 574     fprintf(stderr, ERROR_LINE("line %lu, pos %lu: %s"), line, pos, msg);
 575     exit(code);
 576 }
 577 
 578 bool demand_keyword(nj_maker* m, char* rest) {
 579     for (; rest[0] != 0; rest++) {
 580         const int lead = m->current;
 581         if (lead == EOF || lead != rest[0]) {
 582             return false;
 583         }
 584         advance(m);
 585     }
 586 
 587     return rest[0] == 0;
 588 }
 589 
 590 void handle_null(nj_maker* m) {
 591     if (!demand_keyword(m, "null")) {
 592         fail(m, 1, "expected `null` keyword");
 593     }
 594     EMIT_CONST(m, "null");
 595 }
 596 
 597 void handle_true(nj_maker* m) {
 598     if (!demand_keyword(m, "true")) {
 599         fail(m, 1, "expected `true` keyword");
 600     }
 601     EMIT_CONST(m, "true");
 602 }
 603 
 604 void handle_false(nj_maker* m) {
 605     if (!demand_keyword(m, "false")) {
 606         fail(m, 1, "expected `false` keyword");
 607     }
 608     EMIT_CONST(m, "false");
 609 }
 610 
 611 void handle_capital_none(nj_maker* m) {
 612     if (!demand_keyword(m, "None")) {
 613         fail(m, 1, "expected `None` keyword");
 614     }
 615     EMIT_CONST(m, "null");
 616 }
 617 
 618 void handle_capital_true(nj_maker* m) {
 619     if (!demand_keyword(m, "True")) {
 620         fail(m, 1, "expected `True` keyword");
 621     }
 622     EMIT_CONST(m, "true");
 623 }
 624 
 625 void handle_capital_false(nj_maker* m) {
 626     if (!demand_keyword(m, "False")) {
 627         fail(m, 1, "expected `False` keyword");
 628     }
 629     EMIT_CONST(m, "false");
 630 }
 631 
 632 void handle_digits(nj_maker* m) {
 633     if (!isdigit(m->current)) {
 634         fail(m, 1, "expected/missing digits");
 635     }
 636 
 637     while (isdigit(m->current)) {
 638         write_byte(m, m->current);
 639         advance(m);
 640     }
 641 }
 642 
 643 void handle_number(nj_maker* m) {
 644     handle_digits(m);
 645 
 646     const int lead = m->current;
 647 
 648     if (lead == '.') {
 649         write_byte(m, '.');
 650         advance(m);
 651 
 652         if (isdigit(m->current)) {
 653             handle_digits(m);
 654         } else {
 655             write_byte(m, '0');
 656         }
 657         return;
 658     }
 659 
 660     if (lead == 'e' || lead == 'E') {
 661         write_byte(m, lead);
 662         advance(m);
 663 
 664         if (m->current == '+') {
 665             advance(m);
 666         } else if (m->current == '-') {
 667             write_byte(m, '-');
 668             advance(m);
 669         }
 670 
 671         handle_digits(m);
 672     }
 673 }
 674 
 675 void handle_dot(nj_maker* m) {
 676     write_byte(m, '0');
 677     write_byte(m, '.');
 678     advance(m);
 679 
 680     if (!isdigit(m->current)) {
 681         fail(m, 1, "expected/missing digits after decimal dot");
 682     }
 683     handle_digits(m);
 684 }
 685 
 686 void handle_plus_number(nj_maker* m) {
 687     advance(m);
 688 
 689     if (m->current == '.') {
 690         handle_dot(m);
 691         return;
 692     }
 693     handle_number(m);
 694 }
 695 
 696 void handle_minus_number(nj_maker* m) {
 697     write_byte(m, '-');
 698     advance(m);
 699 
 700     if (m->current == '.') {
 701         handle_dot(m);
 702         return;
 703     }
 704     handle_number(m);
 705 }
 706 
 707 // decode_hex assumes valid hex digits, checked by func is_valid_hex
 708 uint32_t decode_hex(unsigned char hex) {
 709     if ('0' <= hex && hex <= '9') {
 710         return hex - '0';
 711     }
 712     if ('A' <= hex && hex <= 'F') {
 713         return hex - 'A' + 10;
 714     }
 715     if ('a' <= hex && hex <= 'f') {
 716         return hex - 'a' + 10;
 717     }
 718     return 0xffff;
 719 }
 720 
 721 static inline bool is_valid_hex(unsigned char b) {
 722     return false ||
 723         ('0' <= b && b <= '9') ||
 724         ('A' <= b && b <= 'F') ||
 725         ('a' <= b && b <= 'f');
 726 }
 727 
 728 // handle_low_char ensures characters whose ASCII codes are lower than spaces
 729 // are properly escaped for strings
 730 void handle_low_char(nj_maker* m, int c) {
 731     const char* hex = "0123456789ABCDEF";
 732 
 733     switch (c) {
 734     case '\t':
 735         write_byte(m, '\\');
 736         write_byte(m, 't');
 737         break;
 738     case '\n':
 739         write_byte(m, '\\');
 740         write_byte(m, 'n');
 741         break;
 742     case '\r':
 743         write_byte(m, '\\');
 744         write_byte(m, 'r');
 745         break;
 746     case '\b':
 747         write_byte(m, '\\');
 748         write_byte(m, 'b');
 749         break;
 750     case '\f':
 751         write_byte(m, '\\');
 752         write_byte(m, 'f');
 753         break;
 754     case '\v':
 755         write_byte(m, '\\');
 756         write_byte(m, 'v');
 757         break;
 758     default:
 759         write_byte(m, '\\');
 760         write_byte(m, 'u');
 761         write_byte(m, '0');
 762         write_byte(m, '0');
 763         write_byte(m, hex[c / 16]);
 764         write_byte(m, hex[c % 16]);
 765         break;
 766     }
 767 }
 768 
 769 void write_inner_string_hex_quad(nj_maker* m, const unsigned char quad[4]) {
 770     const uint32_t n = 0 +
 771         (decode_hex(quad[0]) << 12) +
 772         (decode_hex(quad[1]) << 8) +
 773         (decode_hex(quad[2]) << 4) +
 774         (decode_hex(quad[3]) << 0);
 775 
 776     switch (n) {
 777     case '"':
 778         write_byte(m, '\\');
 779         write_byte(m, '"');
 780         return;
 781     case '\\':
 782         write_byte(m, '\\');
 783         write_byte(m, '\\');
 784         return;
 785     }
 786 
 787     if (n >= ' ') {
 788         write_rune(m, n);
 789     } else {
 790         handle_low_char(m, n);
 791     }
 792 }
 793 
 794 void handle_hex_quad(nj_maker* m) {
 795     unsigned char quad[4];
 796     for (size_t i = 0; i < 4; i++) {
 797         advance(m);
 798         const int lead = m->current;
 799         if (lead == EOF) {
 800             fail(m, 1, "end of input before end of string");
 801         }
 802         if (is_valid_hex(lead)) {
 803             quad[i] = lead;
 804             continue;
 805         }
 806         fail(m, 1, "invalid hexadecimal digit in string");
 807     }
 808 
 809     write_inner_string_hex_quad(m, quad);
 810 }
 811 
 812 void handle_hex_pair(nj_maker* m) {
 813     unsigned char quad[4] = {'0', '0', '0', '0'};
 814     advance(m);
 815     const int a = m->current;
 816     advance(m);
 817     const int b = m->current;
 818     if (a == EOF || b == EOF) {
 819         fail(m, 1, "end of input before end of string");
 820     }
 821     if (!is_valid_hex(a) || !is_valid_hex(b)) {
 822         fail(m, 1, "invalid hexadecimal digit in string");
 823     }
 824 
 825     quad[2] = a;
 826     quad[3] = b;
 827     write_inner_string_hex_quad(m, quad);
 828 }
 829 
 830 void handle_string_escape(nj_maker* m, int c) {
 831     switch (c) {
 832     case '"':
 833     case '\\':
 834     case 'b':
 835     case 'f':
 836     case 'n':
 837     case 'r':
 838     case 't':
 839         write_byte(m, '\\');
 840         write_byte(m, c);
 841         break;
 842     case 'u':
 843         handle_hex_quad(m);
 844         break;
 845     case 'x':
 846         handle_hex_pair(m);
 847         break;
 848     case '\'':
 849         write_byte(m, '\'');
 850         break;
 851     default:
 852         write_byte(m, m->current);
 853         break;
 854     }
 855 }
 856 
 857 ssize_t handle_inner_string(nj_maker* m) {
 858     const unsigned char quote = m->current;
 859     bool escaped = false;
 860 
 861     for (size_t i = 0; true; i++) {
 862         advance(m);
 863 
 864         int c = m->current;
 865         if (c == EOF) {
 866             fail(m, 1, "input ended before string was close-quoted");
 867         }
 868 
 869         if (escaped) {
 870             handle_string_escape(m, c);
 871             escaped = false;
 872             continue;
 873         }
 874 
 875         switch (c) {
 876         case '\\':
 877             escaped = true;
 878             break;
 879         default:
 880             if (c == quote) {
 881                 advance(m);
 882                 return i;
 883             }
 884 
 885             // write_byte(m, c);
 886             if (c < ' ') {
 887                 handle_low_char(m, c);
 888             } else {
 889                 copy_utf8_rune(m);
 890             }
 891             break;
 892         }
 893     }
 894 }
 895 
 896 void handle_quoted_key(nj_maker* m) {
 897     if (m->current != m->next) {
 898         EMIT_CONST(m, "\"" KEY_STYLE);
 899         handle_inner_string(m);
 900         EMIT_CONST(m, SYNTAX_STYLE "\"");
 901     } else {
 902         write_byte(m, '"');
 903         handle_inner_string(m);
 904         write_byte(m, '"');
 905     }
 906 }
 907 
 908 void handle_string(nj_maker* m) {
 909     if (m->current != m->next) {
 910         EMIT_CONST(m, "\"" RESET_STYLE);
 911         handle_inner_string(m);
 912         EMIT_CONST(m, SYNTAX_STYLE "\"");
 913     } else {
 914         write_byte(m, '"');
 915         handle_inner_string(m);
 916         write_byte(m, '"');
 917     }
 918 }
 919 
// handle_token is defined later in this file, but the funcs handling arrays
// and objects need it to handle the values they contain
void handle_token(nj_maker* m, ssize_t lead_level);
 921 
 922 void handle_array(nj_maker* m) {
 923     size_t items = 0;
 924     const unsigned char end = m->current == '[' ? ']' : ')';
 925     m->level++;
 926     write_byte(m, '[');
 927     advance(m);
 928 
 929     while (true) {
 930         seek_token(m);
 931         const int lead = m->current;
 932 
 933         if (lead == EOF) {
 934             fail(m, 1, "unclosed array");
 935         }
 936 
 937         if (lead == ',') {
 938             advance(m);
 939             continue;
 940         }
 941 
 942         if (lead == end) {
 943             m->level--;
 944             if (items > 0) {
 945                 write_byte(m, '\n');
 946                 indent(m);
 947                 EMIT_CONST(m, SYNTAX_STYLE "]");
 948             } else {
 949                 write_byte(m, ']');
 950             }
 951             advance(m);
 952             return;
 953         }
 954 
 955         if (items > 0) {
 956             EMIT_CONST(m, SYNTAX_STYLE ",");
 957         }
 958         write_byte(m, '\n');
 959         if (feof(m->out)) {
 960             return;
 961         }
 962         handle_token(m, m->level);
 963         items++;
 964     }
 965 }
 966 
 967 void handle_unquoted_key(nj_maker* m) {
 968     EMIT_CONST(m, SYNTAX_STYLE "\"" KEY_STYLE);
 969 
 970     while (true) {
 971         int c = m->current;
 972         if (c == EOF) {
 973             fail(m, 1, "input ended with an object key");
 974         }
 975 
 976         write_byte(m, c);
 977         advance(m);
 978 
 979         c = m->current;
 980         if (!isalpha(c) && !isdigit(c) && c != '_') {
 981             break;
 982         }
 983     }
 984 
 985     EMIT_CONST(m, SYNTAX_STYLE "\"");
 986 }
 987 
 988 void handle_object(nj_maker* m) {
 989     size_t items = 0;
 990     m->level++;
 991     write_byte(m, '{');
 992     advance(m);
 993 
 994     while (true) {
 995         seek_token(m);
 996         int lead = m->current;
 997 
 998         if (lead == EOF) {
 999             fail(m, 1, "unclosed object");
1000         }
1001 
1002         if (lead == ',') {
1003             advance(m);
1004             continue;
1005         }
1006 
1007         if (lead == '}') {
1008             m->level--;
1009             if (items > 0) {
1010                 write_byte(m, '\n');
1011                 indent(m);
1012                 EMIT_CONST(m, SYNTAX_STYLE "}");
1013             } else {
1014                 write_byte(m, '}');
1015             }
1016             advance(m);
1017             return;
1018         }
1019 
1020         if (feof(m->out)) {
1021             return;
1022         }
1023 
1024         if (lead == '"' || lead == '\'') {
1025             if (items > 0) {
1026                 EMIT_CONST(m, SYNTAX_STYLE ",");
1027             }
1028             write_byte(m, '\n');
1029             indent(m);
1030             EMIT_CONST(m, SYNTAX_STYLE);
1031             handle_quoted_key(m);
1032         } else if (isalpha(lead) || lead == '_') {
1033             if (items > 0) {
1034                 EMIT_CONST(m, SYNTAX_STYLE ",");
1035             }
1036             write_byte(m, '\n');
1037             indent(m);
1038             handle_unquoted_key(m);
1039         } else {
1040             fail(m, 1, "only strings or identifiers can be object keys");
1041         }
1042 
1043         seek_token(m);
1044         lead = m->current;
1045 
1046         if (lead == EOF) {
1047             fail(m, 1, "input ended after object-key and before value");
1048         }
1049 
1050         if (lead != ':') {
1051             fail(m, 1, "a `:` must follow all object keys");
1052         }
1053 
1054         EMIT_CONST(m, ": ");
1055         advance(m);
1056 
1057         seek_token(m);
1058         if (m->current == EOF) {
1059             fail(m, 1, "input ended after a `:` following an object-key");
1060         }
1061 
1062         handle_token(m, 0);
1063         items++;
1064     }
1065 }
1066 
1067 // styles ties leading bytes/chars in tokens to their leading ANSI styles
1068 slice styles[256] = {};
1069 
1070 // dispatch ties leading bytes/chars in tokens to the funcs which handle them
1071 void (*dispatch[256])() = {};
1072 
1073 void handle_token(nj_maker* m, ssize_t lead_level) {
1074     const unsigned char b = m->current;
1075     write_spaces(m, INDENTATION * lead_level);
1076     write_bytes(m, styles[b].ptr, styles[b].len);
1077     dispatch[b](m);
1078 }
1079 
1080 // handle_invalid_token shows an error message and quits the app right after
1081 void handle_invalid_token(nj_maker* m) {
1082     char msg[64];
1083     unsigned char c = (unsigned char)m->current;
1084     sprintf(msg, "%c (%d): invalid token", c, c);
1085     fail(m, 1, msg);
1086 }
1087 
1088 void handle_input(FILE* src) {
1089     unsigned char ibuf[IBUF_SIZE];
1090     unsigned char obuf[OBUF_SIZE];
1091 
1092     nj_maker m;
1093     m.ibuf = ibuf;
1094     m.icap = sizeof(ibuf);
1095     m.obuf = obuf;
1096     m.ocap = sizeof(obuf);
1097     restart_state(&m, stdout, src);
1098 
1099     // ignore leading whitespace/comment bytes, if present
1100     seek_token(&m);
1101 
1102     if (m.current == EOF) {
1103         fail(&m, 1, "empty input isn't valid JSON");
1104     }
1105 
1106     handle_token(&m, 0);
1107     EMIT_CONST(&m, RESET_STYLE);
1108     write_byte(&m, '\n');
1109     flush(&m);
1110 
1111     // ignore trailing whitespace/comment bytes, if present
1112     seek_token(&m);
1113 
1114     // ignore trailing semicolon, if present
1115     if (m.current == ';') {
1116         advance(&m);
1117         // ignore trailing whitespace/comment bytes, if present
1118         seek_token(&m);
1119     }
1120 
1121     if (!feof(src) || m.current != EOF) {
1122         fail(&m, 1, "unexpected trailing JSON data");
1123     }
1124 }
1125 
// is_help_option checks if the string given is one of the help options,
// which can start with either a single or a double dash
bool is_help_option(const char* s) {
    static const char* const names[] = {"-h", "--h", "-help", "--help"};
    const size_t count = sizeof(names) / sizeof(names[0]);

    for (size_t i = 0; i < count; i++) {
        if (strcmp(s, names[i]) == 0) {
            return true;
        }
    }
    return false;
}
1134 
// run handles the command-line arguments given, which exclude the app's
// name, and returns the error code: a leading `--` is ignored, at most 1
// input is allowed, and the standard input is used when no filepath is
// given, or when it's either empty or a single dash
int run(int nargs, char** args) {
    if (nargs > 0 && strcmp(args[0], "--") == 0) {
        nargs--;
        args++;
    }

    if (nargs > 1) {
        const char* msg = "can't use more than 1 named input";
        fprintf(stderr, ERROR_LINE("%s"), msg);
        return 1;
    }

    // a missing filepath is handled the same way a single dash is
    const char* path = (nargs > 0) ? args[0] : "-";

    if (path[0] == '\0' || strcmp(path, "-") == 0) {
        handle_input(stdin);
        return 0;
    }

    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
        return 1;
    }

    handle_input(f);
    fclose(f);
    return 0;
}
1166 
1167 int main(int argc, char** argv) {
1168 #ifdef _WIN32
1169     setmode(fileno(stdin), O_BINARY);
1170     // ensure output lines end in LF instead of CRLF on windows
1171     setmode(fileno(stdout), O_BINARY);
1172     setmode(fileno(stderr), O_BINARY);
1173 #endif
1174 
1175     if (argc > 1 && is_help_option(argv[1])) {
1176         printf("%s", info);
1177         return 0;
1178     }
1179 
1180     memset(dispatch, 0, sizeof(dispatch));
1181     memset(styles, 0, sizeof(styles));
1182 
1183     // the dispatch table starts as all null function-pointers
1184     for (size_t i = 0; i < sizeof(dispatch) / sizeof(dispatch[0]); i++) {
1185         dispatch[i] = handle_invalid_token;
1186     }
1187 
1188     for (size_t i = '0'; i <= '9'; i++) {
1189         dispatch[i] = handle_number;
1190         CONST_SLICE(&styles[i], NUMBER_STYLE);
1191     }
1192 
1193     dispatch['n'] = handle_null;
1194     dispatch['t'] = handle_true;
1195     dispatch['f'] = handle_false;
1196     dispatch['N'] = handle_capital_none;
1197     dispatch['T'] = handle_capital_true;
1198     dispatch['F'] = handle_capital_false;
1199     dispatch['.'] = handle_dot;
1200     dispatch['+'] = handle_plus_number;
1201     dispatch['-'] = handle_minus_number;
1202     dispatch['"'] = handle_string;
1203     dispatch['\''] = handle_string;
1204     dispatch['['] = handle_array;
1205     dispatch['('] = handle_array;
1206     dispatch['{'] = handle_object;
1207 
1208     CONST_SLICE(&styles['n'], NULL_STYLE);
1209     CONST_SLICE(&styles['t'], BOOL_STYLE);
1210     CONST_SLICE(&styles['f'], BOOL_STYLE);
1211     CONST_SLICE(&styles['N'], NULL_STYLE);
1212     CONST_SLICE(&styles['T'], BOOL_STYLE);
1213     CONST_SLICE(&styles['F'], BOOL_STYLE);
1214     CONST_SLICE(&styles['.'], NUMBER_STYLE);
1215     CONST_SLICE(&styles['+'], NUMBER_STYLE);
1216     // CONST_SLICE(&styles['-'], NUMBER_STYLE);
1217     CONST_SLICE(&styles['-'], NEGATIVE_STYLE);
1218     CONST_SLICE(&styles['"'], SYNTAX_STYLE);
1219     CONST_SLICE(&styles['\''], SYNTAX_STYLE);
1220     CONST_SLICE(&styles['['], SYNTAX_STYLE);
1221     CONST_SLICE(&styles['('], SYNTAX_STYLE);
1222     CONST_SLICE(&styles['{'], SYNTAX_STYLE);
1223 
1224     // enable full/block-buffering for standard output
1225     setvbuf(stdout, NULL, _IOFBF, 0);
1226 
1227     return run(argc - 1, argv + 1) == 0 ? 0 : 1;
1228 }