File: nj.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O3 -march=native -mtune=native -flto -o ./nj ./nj.c
  29 
  30 Building with COMPACT_OUTPUT defined makes `nj` output many fewer bytes, at
  31 the cost of using arguably worse colors. You can do that by running
  32 
  33 cc -s -O3 -march=native -mtune=native -flto -D COMPACT_OUTPUT -o ./nj ./nj.c
  34 
Building for macOS always uses COMPACT_OUTPUT, as the default terminal app
there still doesn't support RGB colors.
  37 */
  38 
  39 #include <ctype.h>
  40 #include <stdarg.h>
  41 #include <stdbool.h>
  42 #include <stdint.h>
  43 #include <stdio.h>
  44 #include <stdlib.h>
  45 #include <string.h>
  46 
  47 #ifdef _WIN32
  48 #include <fcntl.h>
  49 #include <windows.h>
  50 #endif
  51 
  52 #ifdef RED_ERRORS
  53 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  54 #ifdef __APPLE__
  55 #define ERROR_STYLE "\x1b[31m"
  56 #endif
  57 #define ERROR_LINE(MSG) (ERROR_STYLE MSG "\x1b[0m\n")
  58 #else
  59 #define ERROR_LINE(MSG) (MSG "\n")
  60 #endif
  61 
// macOS builds always use the compact color palette
#ifdef __APPLE__
#define COMPACT_OUTPUT
#endif

// IBUF_SIZE is the size in bytes of the input buffer; it can be overridden
// at build time
#ifndef IBUF_SIZE
#define IBUF_SIZE (32 * 1024)
#endif

// OBUF_SIZE is the size in bytes of the output buffer; it can be overridden
// at build time
#ifndef OBUF_SIZE
#define OBUF_SIZE (8 * 1024)
#endif

// INDENTATION is how many spaces each nesting level adds to output lines
#ifndef INDENTATION
#define INDENTATION 2
#endif
  77 
// CONST_SLICE initializes a slice struct using the string-constant given;
// the length comes from sizeof, so x must be a string literal (or a char
// array), never a plain char pointer
#define CONST_SLICE(s, x) init_slice(s, (unsigned char*)x, sizeof(x) - 1)

// EMIT_CONST emits string constants without their final null byte; the
// length comes from sizeof, so x must be a string literal (or a char array)
#define EMIT_CONST(w, x) write_bytes(w, (unsigned char*)x, sizeof(x) - 1)

// RESET_STYLE is the ANSI escape sequence which clears all styles
#define RESET_STYLE "\x1b[0m"
  85 
// ANSI styles for each kind of token: compact builds use shorter escape
// sequences (basic/256-color codes), other builds use 24-bit RGB colors
#ifdef COMPACT_OUTPUT
// alternative: #define NULL_STYLE "\x1b[37m"
#define NULL_STYLE "\x1b[38;5;248m"
#define BOOL_STYLE "\x1b[36m"
#define NUMBER_STYLE "\x1b[32m"
#define NEGATIVE_STYLE "\x1b[31m"
#define KEY_STYLE "\x1b[35m"
// alternative: #define SYNTAX_STYLE "\x1b[37m"
#define SYNTAX_STYLE "\x1b[38;5;248m"
// alternative palette, using only 256-color codes
// #define NULL_STYLE "\x1b[38;5;248m"
// #define BOOL_STYLE "\x1b[38;5;74m"
// #define NUMBER_STYLE "\x1b[38;5;29m"
// #define NEGATIVE_STYLE "\x1b[38;5;1m"
// #define KEY_STYLE "\x1b[38;5;99m"
// #define SYNTAX_STYLE "\x1b[38;5;248m"
#else
#define NULL_STYLE "\x1b[38;2;168;168;168m"
#define BOOL_STYLE "\x1b[38;2;95;175;215m"
#define NUMBER_STYLE "\x1b[38;2;0;135;95m"
#define NEGATIVE_STYLE "\x1b[38;2;204;0;0m"
#define KEY_STYLE "\x1b[38;2;135;95;255m"
#define SYNTAX_STYLE "\x1b[38;2;168;168;168m"
#endif
 109 
// info is the help message func main shows when the first command-line
// argument is one of the help options
const char* info = ""
"nj [options...] [file...]\n"
"\n"
"\n"
"Nice Json converts/fixes JSON/pseudo-JSON input into ANSI-styled multi-line\n"
"JSON which uses 2 spaces for each indentation level.\n"
"\n"
"Besides styling and indenting JSON, this tool also adapts almost-JSON input\n"
"into valid JSON, since it\n"
"\n"
"    - ignores both rest-of-line and multi-line comments\n"
"    - ignores extra/trailing commas in arrays and objects\n"
"    - turns single-quoted strings/keys into double-quoted strings\n"
"    - double-quotes unquoted object keys\n"
"    - changes \\x 2-hex-digit into \\u 4-hex-digit string-escapes\n"
"\n"
"All options available can either start with a single or a double-dash\n"
"\n"
"    -h        show this help message\n"
"    -help     show this help message\n"
"";
 131 
 132 typedef struct slice {
 133     unsigned char* ptr;
 134     size_t len;
 135 } slice;
 136 
 137 static inline void init_slice(slice* s, unsigned char* ptr, size_t len) {
 138     s->ptr = ptr;
 139     s->len = len;
 140 }
 141 
// nj_maker has all the state needed to turn 1 input stream into styled JSON:
// a buffered reader with 1 byte of lookahead, a buffered writer, and the
// line/position counters used for error messages
typedef struct nj_maker {
    FILE* in;  // where input bytes are read from
    FILE* out; // where output bytes are written to

    unsigned char* ibuf;
    size_t ilen; // how many bytes are being used in the input buffer
    size_t icap; // the input buffer's capacity
    size_t ipos; // the current position in the input buffer

    size_t line; // the current line, used to show useful error messages
    size_t pos;  // the position in the current line, for error messages

    unsigned char* obuf;
    size_t ocap; // the output buffer's capacity
    size_t opos; // the current position in the output buffer

    ssize_t level; // the current indentation/nesting level

    int current; // the byte being handled, or EOF when input is over
    int next;    // the byte right after the current one, or EOF
} nj_maker;
 163 
 164 // advance_reader_pos helps func read_byte do its job
 165 static inline void advance_reader_pos(nj_maker* r, unsigned char b) {
 166     r->ipos++;
 167     if (b == '\n') {
 168         r->line++;
 169         r->pos = 1;
 170     } else {
 171         r->pos++;
 172     }
 173 }
 174 
 175 // read_byte does as it says: check its return for the value EOF, before
 176 // using it as the next byte
 177 int read_byte(nj_maker* r) {
 178     if (r->ipos < r->ilen) {
 179         // inside current chunk
 180         const unsigned char b = r->ibuf[r->ipos];
 181         advance_reader_pos(r, b);
 182         return b;
 183     }
 184 
 185     // need to read the next block
 186     r->ipos = 0;
 187     r->ilen = fread(r->ibuf, sizeof(unsigned char), r->icap, r->in);
 188     if (r->ilen > 0) {
 189         const unsigned char b = r->ibuf[r->ipos];
 190         advance_reader_pos(r, b);
 191         return b;
 192     }
 193 
 194     // reached the end of data
 195     return EOF;
 196 }
 197 
 198 // advance is used in most of the code, instead of calling read_byte directly
 199 static inline void advance(nj_maker* r) {
 200     r->current = r->next;
 201     r->next = read_byte(r);
 202 }
 203 
 204 void fail(nj_maker* m, int code, const char* msg);
 205 
 206 void skip_line(nj_maker* r) {
 207     while (true) {
 208         advance(r);
 209         const int lead = r->current;
 210 
 211         if (lead == EOF) {
 212             break;
 213         }
 214 
 215         if (lead == '\n') {
 216             advance(r);
 217             break;
 218         }
 219     }
 220 }
 221 
 222 void skip_multiline_comment(nj_maker* r) {
 223     unsigned char prev = 0;
 224 
 225     while (true) {
 226         advance(r);
 227         const int lead = r->current;
 228 
 229         if (lead == EOF) {
 230             break;
 231         }
 232 
 233         if (prev == '*' && lead == '/') {
 234             advance(r);
 235             break;
 236         }
 237 
 238         prev = (unsigned char)lead;
 239     }
 240 }
 241 
 242 void skip_comment(nj_maker* r) {
 243     int lead = r->current;
 244 
 245     if (lead == '#') {
 246         skip_line(r);
 247         return;
 248     }
 249 
 250     if (lead != '/') {
 251         fail(r, 1, "expected a slash to start comments");
 252     }
 253 
 254     advance(r);
 255     lead = r->current;
 256 
 257     if (lead == '/') {
 258         skip_line(r);
 259         return;
 260     }
 261 
 262     if (lead == '*') {
 263         skip_multiline_comment(r);
 264         return;
 265     }
 266 
 267     fail(r, 1, "expected `//` or `/*` to start comments");
 268 }
 269 
 270 static inline void seek_token(nj_maker* r) {
 271     while (true) {
 272         const int lead = r->current;
 273 
 274         if (lead != EOF && lead <= ' ') {
 275             advance(r);
 276             continue;
 277         }
 278 
 279         if (lead == '/' || lead == '#') {
 280             skip_comment(r);
 281             continue;
 282         }
 283 
 284         break;
 285     }
 286 }
 287 
 288 bool starts_with_bom(const unsigned char* b, const size_t n) {
 289     return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
 290 }
 291 
 292 void restart_state(nj_maker* m, FILE* w, FILE* r) {
 293     m->in = r;
 294     m->ilen = 0;
 295     m->ipos = 0;
 296 
 297     m->out = w;
 298     m->opos = 0;
 299 
 300     m->line = 1;
 301     m->pos = 1;
 302 
 303     m->current = EOF;
 304     m->next = EOF;
 305 
 306     m->current = read_byte(m);
 307     if (m->current == EOF) {
 308         return;
 309     }
 310     m->next = read_byte(m);
 311 
 312     m->level = 0;
 313 
 314     // skip leading UTF-8 BOM (byte-order mark), if present
 315     if (starts_with_bom(m->ibuf, m->ilen)) {
 316         // a UTF-8 BOM has 3 bytes
 317         for (size_t i = 0; i < 3 && m->current != EOF; i++) {
 318             advance(m);
 319         }
 320     }
 321 }
 322 
 323 void write_byte(nj_maker* m, unsigned char b) {
 324     if (m->opos < m->ocap) {
 325         m->obuf[m->opos++] = b;
 326         return;
 327     }
 328 
 329     fwrite(m->obuf, 1, m->ocap, m->out);
 330     m->obuf[0] = b;
 331     m->opos = 1;
 332 }
 333 
 334 // write_bytes does as it says, minimizing the number of calls to fwrite
 335 void write_bytes(nj_maker* m, const unsigned char* src, size_t len) {
 336     const size_t rem = m->ocap - m->opos;
 337     if (len < rem) {
 338         memcpy(m->obuf + m->opos, src, len);
 339         m->opos += len;
 340         return;
 341     }
 342 
 343     for (size_t i = 0; i < len; i++) {
 344         write_byte(m, src[i]);
 345     }
 346 }
 347 
 348 void write_spaces(nj_maker* m, ssize_t n) {
 349     const unsigned char spaces[32] = "                                ";
 350     while (n > sizeof(spaces)) {
 351         write_bytes(m, spaces, sizeof(spaces));
 352         n -= sizeof(spaces);
 353     }
 354     if (n > 0) {
 355         write_bytes(m, spaces, n);
 356     }
 357 }
 358 
 359 static inline void indent(nj_maker* m) {
 360     write_spaces(m, INDENTATION * m->level);
 361 }
 362 
 363 void flush(nj_maker* m) {
 364     if (m->opos > 0) {
 365         fwrite(m->obuf, 1, m->opos, m->out);
 366     }
 367     m->opos = 0;
 368     fflush(m->out);
 369 }
 370 
 371 // https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
 372 
 373 static inline bool check_2_byte_rune(int a, int b) {
 374     return (0xc2 <= a && a <= 0xdf) && (0x80 <= b && b <= 0xbf);
 375 }
 376 
 377 bool check_3_byte_rune(int a, int b, int c) {
 378     return (
 379         (a == 0xe0) &&
 380         (0xa0 <= b && b <= 0xbf) &&
 381         (0x80 <= c && c <= 0xbf)
 382     ) || (
 383         (0xe1 <= a && a <= 0xec) &&
 384         (0x80 <= b && b <= 0xbf) &&
 385         (0x80 <= c && c <= 0xbf)
 386     ) || (
 387         (a == 0xed) &&
 388         (0x80 <= b && b <= 0x9f) &&
 389         (0x80 <= c && c <= 0xbf)
 390     ) || (
 391         (a == 0xee || a == 0xef) &&
 392         (0x80 <= b && b <= 0xbf) &&
 393         (0x80 <= c && c <= 0xbf)
 394     );
 395 }
 396 
 397 bool check_4_byte_rune(int a, int b, int c, int d) {
 398     return (
 399         (a == 0xf0) &&
 400         (0x90 <= b && b <= 0xbf) &&
 401         (0x80 <= c && c <= 0xbf) &&
 402         (0x80 <= d && d <= 0xbf)
 403     ) || (
 404         (a == 0xf1 || a == 0xf3) &&
 405         (0x80 <= b && b <= 0xbf) &&
 406         (0x80 <= c && c <= 0xbf) &&
 407         (0x80 <= d && d <= 0xbf)
 408     ) || (
 409         (a == 0xf4) &&
 410         (0x80 <= b && b <= 0xbf) &&
 411         (0x80 <= c && c <= 0x8f) &&
 412         (0x80 <= d && d <= 0xbf)
 413     );
 414 }
 415 
 416 // write_replacement_char is the recommended action to handle invalid bytes
 417 void write_replacement_char(nj_maker* m) {
 418     write_byte(m, 0xef);
 419     write_byte(m, 0xbf);
 420     write_byte(m, 0xbd);
 421 }
 422 
 423 void handle_invalid_rune(nj_maker* m) {
 424     // fail(m, 1, "invalid unicode value");
 425     write_replacement_char(m);
 426 }
 427 
 428 // write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8
 429 void write_rune(nj_maker* m, uint32_t rune) {
 430     if (rune < (1 << 7)) {
 431         write_byte(m, rune);
 432         return;
 433     }
 434 
 435     if (rune < (1 << (5 + 6))) {
 436         const int a = 0b11000000 | (rune >> 6);
 437         const int b = 0b10000000 | (rune & 0b00111111);
 438         if (check_2_byte_rune(a, b)) {
 439             write_byte(m, a);
 440             write_byte(m, b);
 441         } else {
 442             write_replacement_char(m);
 443         }
 444         return;
 445     }
 446 
 447     if (rune < (1 << (4 + 6 + 6))) {
 448         const int a = 0b11100000 | (rune >> 12);
 449         const int b = 0b10000000 | ((rune >> 6) & 0b00111111);
 450         const int c = 0b10000000 | (rune & 0b00111111);
 451         if (check_3_byte_rune(a, b, c)) {
 452             write_byte(m, a);
 453             write_byte(m, b);
 454             write_byte(m, c);
 455         } else {
 456             write_replacement_char(m);
 457         }
 458         return;
 459     }
 460 
 461     if (rune < (1 << (3 + 6 + 6 + 6))) {
 462         const int a = 0b11110000 | (rune >> 18);
 463         const int b = 0b10000000 | ((rune >> 12) & 0b00111111);
 464         const int c = 0b10000000 | ((rune >> 6) & 0b00111111);
 465         const int d = 0b10000000 | (rune & 0b00111111);
 466         if (check_4_byte_rune(a, b, c, d)) {
 467             write_byte(m, a);
 468             write_byte(m, b);
 469             write_byte(m, c);
 470             write_byte(m, d);
 471         } else {
 472             write_replacement_char(m);
 473         }
 474         return;
 475     }
 476 
 477     write_replacement_char(m);
 478 }
 479 
 480 void copy_utf8_rune(nj_maker* m) {
 481     const int a = m->current;
 482 
 483     if (a == EOF) {
 484         return;
 485     }
 486 
 487     // handle 1-byte runes
 488     if (a < 128) {
 489         write_byte(m, a);
 490         return;
 491     }
 492 
 493     advance(m);
 494     const int b = m->current;
 495 
 496     if (b == EOF) {
 497         handle_invalid_rune(m);
 498         return;
 499     }
 500 
 501     // handle 2-byte runes
 502     if (check_2_byte_rune(a, b)) {
 503         write_byte(m, a);
 504         write_byte(m, b);
 505         return;
 506     }
 507 
 508     advance(m);
 509     const int c = m->current;
 510 
 511     if (c == EOF) {
 512         handle_invalid_rune(m);
 513         return;
 514     }
 515 
 516     // handle 3-byte runes
 517     if (check_3_byte_rune(a, b, c)) {
 518         write_byte(m, a);
 519         write_byte(m, b);
 520         write_byte(m, c);
 521         return;
 522     }
 523 
 524     advance(m);
 525     const int d = m->current;
 526 
 527     if (d == EOF) {
 528         handle_invalid_rune(m);
 529         return;
 530     }
 531 
 532     // handle 4-byte runes
 533     if (check_4_byte_rune(a, b, c, d)) {
 534         write_byte(m, a);
 535         write_byte(m, b);
 536         write_byte(m, c);
 537         write_byte(m, d);
 538         return;
 539     }
 540 
 541     handle_invalid_rune(m);
 542 }
 543 
 544 // debug is available to diagnose any bug found
 545 void debug(nj_maker* m, const char* fmt, ...) {
 546     va_list args;
 547     va_start(args, fmt);
 548 
 549     if (m->in != stdin) {
 550         fclose(m->in);
 551     }
 552 
 553     write_byte(m, '\n');
 554 
 555     const unsigned long line = m->line;
 556     const unsigned long pos = m->pos;
 557     fprintf(stderr, "\x1b[46m\x1b[37mline %lu, pos %lu: ", line, pos);
 558     fprintf(stderr, fmt, args);
 559     fprintf(stderr, "\x1b[0m\n");
 560 
 561     va_end(args);
 562 
 563     exit(10);
 564 }
 565 
 566 // fail quits this app right after showing the error message given
 567 void fail(nj_maker* m, int code, const char* msg) {
 568     const unsigned long line = m->line;
 569     const unsigned long pos = m->pos;
 570 
 571     write_byte(m, '\n');
 572     flush(m);
 573     fprintf(stderr, ERROR_LINE("line %lu, pos %lu: %s"), line, pos, msg);
 574     exit(code);
 575 }
 576 
 577 bool demand_keyword(nj_maker* m, char* rest) {
 578     for (; rest[0] != 0; rest++) {
 579         const int lead = m->current;
 580         if (lead == EOF || lead != rest[0]) {
 581             return false;
 582         }
 583         advance(m);
 584     }
 585 
 586     return rest[0] == 0;
 587 }
 588 
 589 void handle_null(nj_maker* m) {
 590     if (!demand_keyword(m, "null")) {
 591         fail(m, 1, "expected `null` keyword");
 592     }
 593     EMIT_CONST(m, "null");
 594 }
 595 
 596 void handle_true(nj_maker* m) {
 597     if (!demand_keyword(m, "true")) {
 598         fail(m, 1, "expected `true` keyword");
 599     }
 600     EMIT_CONST(m, "true");
 601 }
 602 
 603 void handle_false(nj_maker* m) {
 604     if (!demand_keyword(m, "false")) {
 605         fail(m, 1, "expected `false` keyword");
 606     }
 607     EMIT_CONST(m, "false");
 608 }
 609 
 610 void handle_capital_none(nj_maker* m) {
 611     if (!demand_keyword(m, "None")) {
 612         fail(m, 1, "expected `None` keyword");
 613     }
 614     EMIT_CONST(m, "null");
 615 }
 616 
 617 void handle_capital_true(nj_maker* m) {
 618     if (!demand_keyword(m, "True")) {
 619         fail(m, 1, "expected `True` keyword");
 620     }
 621     EMIT_CONST(m, "true");
 622 }
 623 
 624 void handle_capital_false(nj_maker* m) {
 625     if (!demand_keyword(m, "False")) {
 626         fail(m, 1, "expected `False` keyword");
 627     }
 628     EMIT_CONST(m, "false");
 629 }
 630 
 631 void handle_digits(nj_maker* m) {
 632     if (!isdigit(m->current)) {
 633         fail(m, 1, "expected/missing digits");
 634     }
 635 
 636     while (isdigit(m->current)) {
 637         write_byte(m, m->current);
 638         advance(m);
 639     }
 640 }
 641 
 642 void handle_number(nj_maker* m) {
 643     handle_digits(m);
 644 
 645     const int lead = m->current;
 646 
 647     if (lead == '.') {
 648         write_byte(m, '.');
 649         advance(m);
 650 
 651         if (isdigit(m->current)) {
 652             handle_digits(m);
 653         } else {
 654             write_byte(m, '0');
 655         }
 656         return;
 657     }
 658 
 659     if (lead == 'e' || lead == 'E') {
 660         write_byte(m, lead);
 661         advance(m);
 662 
 663         if (m->current == '+') {
 664             advance(m);
 665         } else if (m->current == '-') {
 666             write_byte(m, '-');
 667             advance(m);
 668         }
 669 
 670         handle_digits(m);
 671     }
 672 }
 673 
 674 void handle_dot(nj_maker* m) {
 675     write_byte(m, '0');
 676     write_byte(m, '.');
 677     advance(m);
 678 
 679     if (!isdigit(m->current)) {
 680         fail(m, 1, "expected/missing digits after decimal dot");
 681     }
 682     handle_digits(m);
 683 }
 684 
 685 void handle_plus_number(nj_maker* m) {
 686     advance(m);
 687 
 688     if (m->current == '.') {
 689         handle_dot(m);
 690         return;
 691     }
 692     handle_number(m);
 693 }
 694 
 695 void handle_minus_number(nj_maker* m) {
 696     write_byte(m, '-');
 697     advance(m);
 698 
 699     if (m->current == '.') {
 700         handle_dot(m);
 701         return;
 702     }
 703     handle_number(m);
 704 }
 705 
 706 // decode_hex assumes valid hex digits, checked by func is_valid_hex
 707 uint32_t decode_hex(unsigned char hex) {
 708     if ('0' <= hex && hex <= '9') {
 709         return hex - '0';
 710     }
 711     if ('A' <= hex && hex <= 'F') {
 712         return hex - 'A' + 10;
 713     }
 714     if ('a' <= hex && hex <= 'f') {
 715         return hex - 'a' + 10;
 716     }
 717     return 0xffff;
 718 }
 719 
 720 static inline bool is_valid_hex(unsigned char b) {
 721     return false ||
 722         ('0' <= b && b <= '9') ||
 723         ('A' <= b && b <= 'F') ||
 724         ('a' <= b && b <= 'f');
 725 }
 726 
 727 // handle_low_char ensures characters whose ASCII codes are lower than spaces
 728 // are properly escaped for strings
 729 void handle_low_char(nj_maker* m, int c) {
 730     const char* hex = "0123456789ABCDEF";
 731 
 732     switch (c) {
 733     case '\t':
 734         write_byte(m, '\\');
 735         write_byte(m, 't');
 736         break;
 737     case '\n':
 738         write_byte(m, '\\');
 739         write_byte(m, 'n');
 740         break;
 741     case '\r':
 742         write_byte(m, '\\');
 743         write_byte(m, 'r');
 744         break;
 745     case '\b':
 746         write_byte(m, '\\');
 747         write_byte(m, 'b');
 748         break;
 749     case '\f':
 750         write_byte(m, '\\');
 751         write_byte(m, 'f');
 752         break;
 753     case '\v':
 754         write_byte(m, '\\');
 755         write_byte(m, 'v');
 756         break;
 757     default:
 758         write_byte(m, '\\');
 759         write_byte(m, 'u');
 760         write_byte(m, '0');
 761         write_byte(m, '0');
 762         write_byte(m, hex[c / 16]);
 763         write_byte(m, hex[c % 16]);
 764         break;
 765     }
 766 }
 767 
 768 void write_inner_string_hex_quad(nj_maker* m, const unsigned char quad[4]) {
 769     const uint32_t n = 0 +
 770         (decode_hex(quad[0]) << 12) +
 771         (decode_hex(quad[1]) << 8) +
 772         (decode_hex(quad[2]) << 4) +
 773         (decode_hex(quad[3]) << 0);
 774 
 775     switch (n) {
 776     case '"':
 777         write_byte(m, '\\');
 778         write_byte(m, '"');
 779         return;
 780     case '\\':
 781         write_byte(m, '\\');
 782         write_byte(m, '\\');
 783         return;
 784     }
 785 
 786     if (n >= ' ') {
 787         write_rune(m, n);
 788     } else {
 789         handle_low_char(m, n);
 790     }
 791 }
 792 
 793 void handle_hex_quad(nj_maker* m) {
 794     unsigned char quad[4];
 795     for (size_t i = 0; i < 4; i++) {
 796         advance(m);
 797         const int lead = m->current;
 798         if (lead == EOF) {
 799             fail(m, 1, "end of input before end of string");
 800         }
 801         if (is_valid_hex(lead)) {
 802             quad[i] = lead;
 803             continue;
 804         }
 805         fail(m, 1, "invalid hexadecimal digit in string");
 806     }
 807 
 808     write_inner_string_hex_quad(m, quad);
 809 }
 810 
 811 void handle_hex_pair(nj_maker* m) {
 812     unsigned char quad[4] = {'0', '0', '0', '0'};
 813     advance(m);
 814     const int a = m->current;
 815     advance(m);
 816     const int b = m->current;
 817     if (a == EOF || b == EOF) {
 818         fail(m, 1, "end of input before end of string");
 819     }
 820     if (!is_valid_hex(a) || !is_valid_hex(b)) {
 821         fail(m, 1, "invalid hexadecimal digit in string");
 822     }
 823 
 824     quad[2] = a;
 825     quad[3] = b;
 826     write_inner_string_hex_quad(m, quad);
 827 }
 828 
 829 void handle_string_escape(nj_maker* m, int c) {
 830     switch (c) {
 831     case '"':
 832     case '\\':
 833     case 'b':
 834     case 'f':
 835     case 'n':
 836     case 'r':
 837     case 't':
 838         write_byte(m, '\\');
 839         write_byte(m, c);
 840         break;
 841     case 'u':
 842         handle_hex_quad(m);
 843         break;
 844     case 'x':
 845         handle_hex_pair(m);
 846         break;
 847     case '\'':
 848         write_byte(m, '\'');
 849         break;
 850     default:
 851         write_byte(m, m->current);
 852         break;
 853     }
 854 }
 855 
 856 ssize_t handle_inner_string(nj_maker* m) {
 857     const unsigned char quote = m->current;
 858     bool escaped = false;
 859 
 860     for (size_t i = 0; true; i++) {
 861         advance(m);
 862 
 863         int c = m->current;
 864         if (c == EOF) {
 865             fail(m, 1, "input ended before string was close-quoted");
 866         }
 867 
 868         if (escaped) {
 869             handle_string_escape(m, c);
 870             escaped = false;
 871             continue;
 872         }
 873 
 874         switch (c) {
 875         case '\\':
 876             escaped = true;
 877             break;
 878         default:
 879             if (c == quote) {
 880                 advance(m);
 881                 return i;
 882             }
 883 
 884             // write_byte(m, c);
 885             if (c < ' ') {
 886                 handle_low_char(m, c);
 887             } else {
 888                 copy_utf8_rune(m);
 889             }
 890             break;
 891         }
 892     }
 893 }
 894 
 895 void handle_quoted_key(nj_maker* m) {
 896     if (m->current != m->next) {
 897         EMIT_CONST(m, "\"" KEY_STYLE);
 898         handle_inner_string(m);
 899         EMIT_CONST(m, SYNTAX_STYLE "\"");
 900     } else {
 901         write_byte(m, '"');
 902         handle_inner_string(m);
 903         write_byte(m, '"');
 904     }
 905 }
 906 
 907 void handle_string(nj_maker* m) {
 908     if (m->current != m->next) {
 909         EMIT_CONST(m, "\"" RESET_STYLE);
 910         handle_inner_string(m);
 911         EMIT_CONST(m, SYNTAX_STYLE "\"");
 912     } else {
 913         write_byte(m, '"');
 914         handle_inner_string(m);
 915         write_byte(m, '"');
 916     }
 917 }
 918 
 919 void handle_token(nj_maker* m, ssize_t lead_level);
 920 
 921 void handle_array(nj_maker* m) {
 922     m->level++;
 923     write_byte(m, '[');
 924     advance(m);
 925 
 926     for (size_t i = 0; true; i++) {
 927         seek_token(m);
 928         const int lead = m->current;
 929 
 930         if (lead == EOF) {
 931             fail(m, 1, "unclosed array");
 932         }
 933 
 934         if (lead == ',') {
 935             advance(m);
 936             continue;
 937         }
 938 
 939         if (lead == ']') {
 940             m->level--;
 941             if (i > 0) {
 942                 write_byte(m, '\n');
 943                 indent(m);
 944                 EMIT_CONST(m, SYNTAX_STYLE "]");
 945             } else {
 946                 write_byte(m, ']');
 947             }
 948             advance(m);
 949             return;
 950         }
 951 
 952         if (i > 0) {
 953             EMIT_CONST(m, SYNTAX_STYLE ",");
 954         }
 955         write_byte(m, '\n');
 956         if (feof(m->out)) {
 957             return;
 958         }
 959         handle_token(m, m->level);
 960     }
 961 }
 962 
 963 void handle_unquoted_key(nj_maker* m) {
 964     EMIT_CONST(m, SYNTAX_STYLE "\"" KEY_STYLE);
 965 
 966     while (true) {
 967         int c = m->current;
 968         if (c == EOF) {
 969             fail(m, 1, "input ended with an object key");
 970         }
 971 
 972         write_byte(m, c);
 973         advance(m);
 974 
 975         c = m->current;
 976         if (!isalpha(c) && !isdigit(c) && c != '_') {
 977             break;
 978         }
 979     }
 980 
 981     EMIT_CONST(m, SYNTAX_STYLE "\"");
 982 }
 983 
 984 void handle_object(nj_maker* m) {
 985     m->level++;
 986     write_byte(m, '{');
 987     advance(m);
 988 
 989     for (size_t i = 0; true; i++) {
 990         seek_token(m);
 991         int lead = m->current;
 992 
 993         if (lead == EOF) {
 994             fail(m, 1, "unclosed object");
 995         }
 996 
 997         if (lead == ',') {
 998             advance(m);
 999             continue;
1000         }
1001 
1002         if (lead == '}') {
1003             m->level--;
1004             if (i > 0) {
1005                 write_byte(m, '\n');
1006                 indent(m);
1007                 EMIT_CONST(m, SYNTAX_STYLE "}");
1008             } else {
1009                 write_byte(m, '}');
1010             }
1011             advance(m);
1012             return;
1013         }
1014 
1015         if (feof(m->out)) {
1016             return;
1017         }
1018 
1019         if (lead == '"' || lead == '\'') {
1020             if (i > 0) {
1021                 EMIT_CONST(m, SYNTAX_STYLE ",");
1022             }
1023             write_byte(m, '\n');
1024             indent(m);
1025             EMIT_CONST(m, SYNTAX_STYLE);
1026             handle_quoted_key(m);
1027         } else if (isalpha(lead) || lead == '_') {
1028             if (i > 0) {
1029                 EMIT_CONST(m, SYNTAX_STYLE ",");
1030             }
1031             write_byte(m, '\n');
1032             indent(m);
1033             handle_unquoted_key(m);
1034         } else {
1035             fail(m, 1, "only strings or identifiers can be object keys");
1036         }
1037 
1038         seek_token(m);
1039         lead = m->current;
1040 
1041         if (lead == EOF) {
1042             fail(m, 1, "input ended after object-key and before value");
1043         }
1044 
1045         if (lead != ':') {
1046             fail(m, 1, "a `:` must follow all object keys");
1047         }
1048 
1049         EMIT_CONST(m, ": ");
1050         advance(m);
1051 
1052         seek_token(m);
1053         if (m->current == EOF) {
1054             fail(m, 1, "input ended after a `:` following an object-key");
1055         }
1056 
1057         handle_token(m, 0);
1058     }
1059 }
1060 
1061 // styles ties leading bytes/chars in tokens to their leading ANSI styles
1062 slice styles[256] = {};
1063 
1064 // dispatch ties leading bytes/chars in tokens to the funcs which handle them
1065 void (*dispatch[256])() = {};
1066 
1067 void handle_token(nj_maker* m, ssize_t lead_level) {
1068     const unsigned char b = m->current;
1069     write_spaces(m, INDENTATION * lead_level);
1070     write_bytes(m, styles[b].ptr, styles[b].len);
1071     dispatch[b](m);
1072 }
1073 
1074 // handle_invalid_token shows an error message and quits the app right after
1075 void handle_invalid_token(nj_maker* m) {
1076     char msg[64];
1077     unsigned char c = (unsigned char)m->current;
1078     sprintf(msg, "%c (%d): invalid token", c, c);
1079     fail(m, 1, msg);
1080 }
1081 
1082 void handle_input(FILE* src) {
1083     unsigned char ibuf[IBUF_SIZE];
1084     unsigned char obuf[OBUF_SIZE];
1085 
1086     nj_maker m;
1087     m.ibuf = ibuf;
1088     m.icap = sizeof(ibuf);
1089     m.obuf = obuf;
1090     m.ocap = sizeof(obuf);
1091     restart_state(&m, stdout, src);
1092 
1093     // ignore leading whitespace/comment bytes, if present
1094     seek_token(&m);
1095 
1096     if (m.current == EOF) {
1097         fail(&m, 1, "empty input isn't valid JSON");
1098     }
1099 
1100     handle_token(&m, 0);
1101     EMIT_CONST(&m, RESET_STYLE);
1102     write_byte(&m, '\n');
1103     flush(&m);
1104 
1105     // ignore trailing whitespace/comment bytes, if present
1106     seek_token(&m);
1107 
1108     // ignore trailing semicolon, if present
1109     if (m.current == ';') {
1110         advance(&m);
1111         // ignore trailing whitespace/comment bytes, if present
1112         seek_token(&m);
1113     }
1114 
1115     if (!feof(src) || m.current != EOF) {
1116         fail(&m, 1, "unexpected trailing JSON data");
1117     }
1118 }
1119 
// is_help_option checks if the string given is any of the help options,
// which can start with either a single or a double dash
bool is_help_option(const char* s) {
    static const char* const names[] = {"-h", "--h", "-help", "--help"};

    // options have a leading dash, and at least 1 more byte after it
    if (s[0] != '-' || s[1] == 0) {
        return false;
    }

    for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
        if (strcmp(s, names[i]) == 0) {
            return true;
        }
    }
    return false;
}
1128 
// run handles the command-line arguments given, which don't include the
// app's name, and returns the error code: at most 1 input is allowed, and
// standard input is used when there's no filepath, or when it's either
// empty or a single dash; a leading `--` argument is ignored
int run(int nargs, char** args) {
    const bool separator = (nargs > 0 && strcmp(args[0], "--") == 0);
    if (separator) {
        nargs--;
        args++;
    }

    if (nargs > 1) {
        const char* msg = "can't use more than 1 named input";
        fprintf(stderr, ERROR_LINE("%s"), msg);
        return 1;
    }

    // use stdin when not given a filepath
    const char* path = (nargs > 0) ? args[0] : "-";
    if (path[0] == 0 || strcmp(path, "-") == 0) {
        handle_input(stdin);
        return 0;
    }

    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
        return 1;
    }

    handle_input(f);
    fclose(f);

    return 0;
}
1160 
1161 int main(int argc, char** argv) {
1162 #ifdef _WIN32
1163     setmode(fileno(stdin), O_BINARY);
1164     // ensure output lines end in LF instead of CRLF on windows
1165     setmode(fileno(stdout), O_BINARY);
1166     setmode(fileno(stderr), O_BINARY);
1167 #endif
1168 
1169     if (argc > 1 && is_help_option(argv[1])) {
1170         printf("%s", info);
1171         return 0;
1172     }
1173 
1174     memset(dispatch, 0, sizeof(dispatch));
1175     memset(styles, 0, sizeof(styles));
1176 
1177     // the dispatch table starts as all null function-pointers
1178     for (size_t i = 0; i < sizeof(dispatch) / sizeof(dispatch[0]); i++) {
1179         dispatch[i] = handle_invalid_token;
1180     }
1181 
1182     for (size_t i = '0'; i <= '9'; i++) {
1183         dispatch[i] = handle_number;
1184         CONST_SLICE(&styles[i], NUMBER_STYLE);
1185     }
1186 
1187     dispatch['n'] = handle_null;
1188     dispatch['t'] = handle_true;
1189     dispatch['f'] = handle_false;
1190     dispatch['N'] = handle_capital_none;
1191     dispatch['T'] = handle_capital_true;
1192     dispatch['F'] = handle_capital_false;
1193     dispatch['.'] = handle_dot;
1194     dispatch['+'] = handle_plus_number;
1195     dispatch['-'] = handle_minus_number;
1196     dispatch['"'] = handle_string;
1197     dispatch['\''] = handle_string;
1198     dispatch['['] = handle_array;
1199     dispatch['{'] = handle_object;
1200 
1201     CONST_SLICE(&styles['n'], NULL_STYLE);
1202     CONST_SLICE(&styles['t'], BOOL_STYLE);
1203     CONST_SLICE(&styles['f'], BOOL_STYLE);
1204     CONST_SLICE(&styles['N'], NULL_STYLE);
1205     CONST_SLICE(&styles['T'], BOOL_STYLE);
1206     CONST_SLICE(&styles['F'], BOOL_STYLE);
1207     CONST_SLICE(&styles['.'], NUMBER_STYLE);
1208     CONST_SLICE(&styles['+'], NUMBER_STYLE);
1209     // CONST_SLICE(&styles['-'], NUMBER_STYLE);
1210     CONST_SLICE(&styles['-'], NEGATIVE_STYLE);
1211     CONST_SLICE(&styles['"'], SYNTAX_STYLE);
1212     CONST_SLICE(&styles['\''], SYNTAX_STYLE);
1213     CONST_SLICE(&styles['['], SYNTAX_STYLE);
1214     CONST_SLICE(&styles['{'], SYNTAX_STYLE);
1215 
1216     // enable full/block-buffering for standard output
1217     setvbuf(stdout, NULL, _IOFBF, 0);
1218 
1219     return run(argc - 1, argv + 1) == 0 ? 0 : 1;
1220 }