File: json0.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O3 -march=native -mtune=native -flto -o ./json0 ./json0.c
  29 */
  30 
  31 #include <ctype.h>
  32 #include <stdarg.h>
  33 #include <stdbool.h>
  34 #include <stdint.h>
  35 #include <stdio.h>
  36 #include <stdlib.h>
  37 #include <string.h>
  38 
  39 #ifdef _WIN32
  40 #include <fcntl.h>
  41 #include <windows.h>
  42 #endif
  43 
  44 #ifdef RED_ERRORS
  45 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  46 #ifdef __APPLE__
  47 #define ERROR_STYLE "\x1b[31m"
  48 #endif
  49 #define RESET_STYLE "\x1b[0m"
  50 #else
  51 #define ERROR_STYLE
  52 #define RESET_STYLE
  53 #endif
  54 
  55 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  56 
// IBUF_SIZE is the size in bytes of the input buffer: a different value can
// be chosen at build time by defining it ahead of this point
#ifndef IBUF_SIZE
#define IBUF_SIZE (32 * 1024)
#endif

// OBUF_SIZE is the size in bytes of the output buffer: a different value can
// be chosen at build time by defining it ahead of this point
#ifndef OBUF_SIZE
#define OBUF_SIZE (8 * 1024)
#endif
  64 
// info is the help message, which func main shows when the first
// command-line argument is one of the help options
const char* info = ""
"json0 [options...] [file...]\n"
"\n"
"\n"
"JSON-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.\n"
"Its output is always a single line, which ends with a line-feed.\n"
"\n"
"Besides minimizing bytes, this tool also adapts almost-JSON input into\n"
"valid JSON, since it\n"
"\n"
"    - ignores both rest-of-line and multi-line comments\n"
"    - ignores extra/trailing commas in arrays and objects\n"
"    - turns single-quoted strings/keys into double-quoted strings\n"
"    - double-quotes unquoted object keys\n"
"    - changes \\x 2-hex-digit into \\u 4-hex-digit string-escapes\n"
"\n"
"All options available can either start with a single or a double-dash\n"
"\n"
"    -h        show this help message\n"
"    -help     show this help message\n"
"    -jsonl    emit JSON Lines, when top-level value is an array\n"
"";
  87 
// j0_maker holds all the state needed to convert 1 input into JSON output:
// a buffered reader, a buffered writer, the reader's position for error
// messages, and the current byte along with a 1-byte lookahead
typedef struct j0_maker {
    FILE* in;  // where input bytes are read from
    FILE* out; // where output bytes are written to

    unsigned char* ibuf; // the input buffer, which this struct doesn't own
    size_t ilen; // how many bytes are being used in the input buffer
    size_t icap; // the input buffer's capacity
    size_t ipos; // the current position in the input buffer

    size_t line; // the current line, used to show useful error messages
    size_t pos;  // the position in the current line, for error messages

    unsigned char* obuf; // the output buffer, which this struct doesn't own
    size_t ocap; // the output buffer's capacity
    size_t opos; // the current position in the output buffer

    int current; // the byte being handled, or EOF when input is over
    int next;    // the byte right after the current one, or EOF
} j0_maker;
 107 
 108 // advance_reader_pos helps func read_byte do its job
 109 static inline void advance_reader_pos(j0_maker* r, unsigned char b) {
 110     r->ipos++;
 111     if (b == '\n') {
 112         r->line++;
 113         r->pos = 1;
 114     } else {
 115         r->pos++;
 116     }
 117 }
 118 
 119 // read_byte does as it says: check its return for the value EOF, before
 120 // using it as the next byte
 121 static inline int read_byte(j0_maker* r) {
 122     if (r->ipos < r->ilen) {
 123         // inside current chunk
 124         const unsigned char b = r->ibuf[r->ipos];
 125         advance_reader_pos(r, b);
 126         return b;
 127     }
 128 
 129     // need to read the next block
 130     r->ipos = 0;
 131     r->ilen = fread(r->ibuf, sizeof(unsigned char), r->icap, r->in);
 132     if (r->ilen > 0) {
 133         const unsigned char b = r->ibuf[r->ipos];
 134         advance_reader_pos(r, b);
 135         return b;
 136     }
 137 
 138     // reached the end of data
 139     return EOF;
 140 }
 141 
 142 // advance is used in most of the code, instead of calling read_byte directly
 143 static inline void advance(j0_maker* r) {
 144     r->current = r->next;
 145     r->next = read_byte(r);
 146 }
 147 
 148 void fail(j0_maker* m, int code, const char* msg);
 149 
 150 void skip_line(j0_maker* r) {
 151     while (true) {
 152         advance(r);
 153         const int lead = r->current;
 154 
 155         if (lead == EOF) {
 156             break;
 157         }
 158 
 159         if (lead == '\n') {
 160             advance(r);
 161             break;
 162         }
 163     }
 164 }
 165 
 166 void skip_multiline_comment(j0_maker* r) {
 167     unsigned char prev = 0;
 168 
 169     while (true) {
 170         advance(r);
 171         const int lead = r->current;
 172 
 173         if (lead == EOF) {
 174             break;
 175         }
 176 
 177         if (prev == '*' && lead == '/') {
 178             advance(r);
 179             break;
 180         }
 181 
 182         prev = (unsigned char)lead;
 183     }
 184 }
 185 
 186 void skip_comment(j0_maker* r) {
 187     int lead = r->current;
 188 
 189     if (lead == '#') {
 190         skip_line(r);
 191         return;
 192     }
 193 
 194     if (lead != '/') {
 195         fail(r, 1, "expected a slash to start comments");
 196     }
 197 
 198     advance(r);
 199     lead = r->current;
 200 
 201     if (lead == '/') {
 202         skip_line(r);
 203         return;
 204     }
 205 
 206     if (lead == '*') {
 207         skip_multiline_comment(r);
 208         return;
 209     }
 210 
 211     fail(r, 1, "expected `//` or `/*` to start comments");
 212 }
 213 
 214 static inline void seek_token(j0_maker* r) {
 215     while (true) {
 216         const int lead = r->current;
 217 
 218         if (lead != EOF && lead <= ' ') {
 219             advance(r);
 220             continue;
 221         }
 222 
 223         if (lead == '/' || lead == '#') {
 224             skip_comment(r);
 225             continue;
 226         }
 227 
 228         break;
 229     }
 230 }
 231 
 232 bool starts_with_bom(const unsigned char* b, const size_t n) {
 233     return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
 234 }
 235 
 236 void restart_state(j0_maker* m, FILE* w, FILE* r) {
 237     m->in = r;
 238     m->ilen = 0;
 239     m->ipos = 0;
 240 
 241     m->out = w;
 242     m->opos = 0;
 243 
 244     m->line = 1;
 245     m->pos = 1;
 246 
 247     m->current = EOF;
 248     m->next = EOF;
 249 
 250     m->current = read_byte(m);
 251     if (m->current == EOF) {
 252         return;
 253     }
 254     m->next = read_byte(m);
 255 
 256     // skip leading UTF-8 BOM (byte-order mark), if present
 257     if (starts_with_bom(m->ibuf, m->ilen)) {
 258         // a UTF-8 BOM has 3 bytes
 259         for (size_t i = 0; i < 3 && m->current != EOF; i++) {
 260             advance(m);
 261         }
 262     }
 263 }
 264 
 265 void write_byte(j0_maker* m, unsigned char b) {
 266     if (m->opos < m->ocap) {
 267         m->obuf[m->opos++] = b;
 268         return;
 269     }
 270 
 271     fwrite(m->obuf, 1, m->ocap, m->out);
 272     m->obuf[0] = b;
 273     m->opos = 1;
 274 }
 275 
 276 // write_bytes does as it says, minimizing the number of calls to fwrite
 277 void write_bytes(j0_maker* m, const unsigned char* src, size_t len) {
 278     const size_t rem = m->ocap - m->opos;
 279     if (len < rem) {
 280         memcpy(m->obuf + m->opos, src, len);
 281         m->opos += len;
 282         return;
 283     }
 284 
 285     for (size_t i = 0; i < len; i++) {
 286         write_byte(m, src[i]);
 287     }
 288 }
 289 
 290 void flush(j0_maker* m) {
 291     if (m->opos > 0) {
 292         fwrite(m->obuf, 1, m->opos, m->out);
 293     }
 294     m->opos = 0;
 295     fflush(m->out);
 296 }
 297 
 298 // https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
 299 
 300 static inline bool check_2_byte_rune(int a, int b) {
 301     return (0xc2 <= a && a <= 0xdf) && (0x80 <= b && b <= 0xbf);
 302 }
 303 
 304 bool check_3_byte_rune(int a, int b, int c) {
 305     return (
 306         (a == 0xe0) &&
 307         (0xa0 <= b && b <= 0xbf) &&
 308         (0x80 <= c && c <= 0xbf)
 309     ) || (
 310         (0xe1 <= a && a <= 0xec) &&
 311         (0x80 <= b && b <= 0xbf) &&
 312         (0x80 <= c && c <= 0xbf)
 313     ) || (
 314         (a == 0xed) &&
 315         (0x80 <= b && b <= 0x9f) &&
 316         (0x80 <= c && c <= 0xbf)
 317     ) || (
 318         (a == 0xee || a == 0xef) &&
 319         (0x80 <= b && b <= 0xbf) &&
 320         (0x80 <= c && c <= 0xbf)
 321     );
 322 }
 323 
 324 bool check_4_byte_rune(int a, int b, int c, int d) {
 325     return (
 326         (a == 0xf0) &&
 327         (0x90 <= b && b <= 0xbf) &&
 328         (0x80 <= c && c <= 0xbf) &&
 329         (0x80 <= d && d <= 0xbf)
 330     ) || (
 331         (a == 0xf1 || a == 0xf3) &&
 332         (0x80 <= b && b <= 0xbf) &&
 333         (0x80 <= c && c <= 0xbf) &&
 334         (0x80 <= d && d <= 0xbf)
 335     ) || (
 336         (a == 0xf4) &&
 337         (0x80 <= b && b <= 0xbf) &&
 338         (0x80 <= c && c <= 0x8f) &&
 339         (0x80 <= d && d <= 0xbf)
 340     );
 341 }
 342 
 343 // write_replacement_char is the recommended action to handle invalid bytes
 344 void write_replacement_char(j0_maker* m) {
 345     write_byte(m, 0xef);
 346     write_byte(m, 0xbf);
 347     write_byte(m, 0xbd);
 348 }
 349 
 350 void handle_invalid_rune(j0_maker* m) {
 351     // fail(m, 1, "invalid unicode value");
 352     write_replacement_char(m);
 353 }
 354 
 355 // write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8
 356 void write_rune(j0_maker* m, uint32_t rune) {
 357     if (rune < (1 << 7)) {
 358         write_byte(m, rune);
 359         return;
 360     }
 361 
 362     if (rune < (1 << (5 + 6))) {
 363         const int a = 0b11000000 | (rune >> 6);
 364         const int b = 0b10000000 | (rune & 0b00111111);
 365         if (check_2_byte_rune(a, b)) {
 366             write_byte(m, a);
 367             write_byte(m, b);
 368         } else {
 369             write_replacement_char(m);
 370         }
 371         return;
 372     }
 373 
 374     if (rune < (1 << (4 + 6 + 6))) {
 375         const int a = 0b11100000 | (rune >> 12);
 376         const int b = 0b10000000 | ((rune >> 6) & 0b00111111);
 377         const int c = 0b10000000 | (rune & 0b00111111);
 378         if (check_3_byte_rune(a, b, c)) {
 379             write_byte(m, a);
 380             write_byte(m, b);
 381             write_byte(m, c);
 382         } else {
 383             write_replacement_char(m);
 384         }
 385         return;
 386     }
 387 
 388     if (rune < (1 << (3 + 6 + 6 + 6))) {
 389         const int a = 0b11110000 | (rune >> 18);
 390         const int b = 0b10000000 | ((rune >> 12) & 0b00111111);
 391         const int c = 0b10000000 | ((rune >> 6) & 0b00111111);
 392         const int d = 0b10000000 | (rune & 0b00111111);
 393         if (check_4_byte_rune(a, b, c, d)) {
 394             write_byte(m, a);
 395             write_byte(m, b);
 396             write_byte(m, c);
 397             write_byte(m, d);
 398         } else {
 399             write_replacement_char(m);
 400         }
 401         return;
 402     }
 403 
 404     write_replacement_char(m);
 405 }
 406 
 407 void copy_utf8_rune(j0_maker* m) {
 408     const int a = m->current;
 409 
 410     if (a == EOF) {
 411         return;
 412     }
 413 
 414     // handle 1-byte runes
 415     if (a < 128) {
 416         write_byte(m, a);
 417         return;
 418     }
 419 
 420     advance(m);
 421     const int b = m->current;
 422 
 423     if (b == EOF) {
 424         handle_invalid_rune(m);
 425         return;
 426     }
 427 
 428     // handle 2-byte runes
 429     if (check_2_byte_rune(a, b)) {
 430         write_byte(m, a);
 431         write_byte(m, b);
 432         return;
 433     }
 434 
 435     advance(m);
 436     const int c = m->current;
 437 
 438     if (c == EOF) {
 439         handle_invalid_rune(m);
 440         return;
 441     }
 442 
 443     // handle 3-byte runes
 444     if (check_3_byte_rune(a, b, c)) {
 445         write_byte(m, a);
 446         write_byte(m, b);
 447         write_byte(m, c);
 448         return;
 449     }
 450 
 451     advance(m);
 452     const int d = m->current;
 453 
 454     if (d == EOF) {
 455         handle_invalid_rune(m);
 456         return;
 457     }
 458 
 459     // handle 4-byte runes
 460     if (check_4_byte_rune(a, b, c, d)) {
 461         write_byte(m, a);
 462         write_byte(m, b);
 463         write_byte(m, c);
 464         write_byte(m, d);
 465         return;
 466     }
 467 
 468     handle_invalid_rune(m);
 469 }
 470 
 471 // debug is available to diagnose any bug found
 472 void debug(j0_maker* m, const char* fmt, ...) {
 473     va_list args;
 474     va_start(args, fmt);
 475 
 476     if (m->in != stdin) {
 477         fclose(m->in);
 478     }
 479 
 480     write_byte(m, '\n');
 481 
 482     const unsigned long line = m->line;
 483     const unsigned long pos = m->pos;
 484     fprintf(stderr, "\x1b[46m\x1b[37mline %lu, pos %lu: ", line, pos);
 485     fprintf(stderr, fmt, args);
 486     fprintf(stderr, "\x1b[0m\n");
 487 
 488     va_end(args);
 489 
 490     exit(10);
 491 }
 492 
 493 // fail quits this app right after showing the error message given
 494 void fail(j0_maker* m, int code, const char* msg) {
 495     const unsigned long line = m->line;
 496     const unsigned long pos = m->pos;
 497 
 498     write_byte(m, '\n');
 499     flush(m);
 500     fprintf(stderr, ERROR_LINE("line %lu, pos %lu: %s"), line, pos, msg);
 501     exit(code);
 502 }
 503 
 504 bool demand_keyword(j0_maker* m, char* rest) {
 505     for (; rest[0] != 0; rest++) {
 506         const int lead = m->current;
 507         if (lead == EOF || lead != rest[0]) {
 508             return false;
 509         }
 510         advance(m);
 511     }
 512 
 513     return rest[0] == 0;
 514 }
 515 
 516 void handle_null(j0_maker* m) {
 517     if (!demand_keyword(m, "null")) {
 518         fail(m, 1, "expected `null` keyword");
 519     }
 520     write_bytes(m, (unsigned char*)"null", 4);
 521 }
 522 
 523 void handle_true(j0_maker* m) {
 524     if (!demand_keyword(m, "true")) {
 525         fail(m, 1, "expected `true` keyword");
 526     }
 527     write_bytes(m, (unsigned char*)"true", 4);
 528 }
 529 
 530 void handle_false(j0_maker* m) {
 531     if (!demand_keyword(m, "false")) {
 532         fail(m, 1, "expected `false` keyword");
 533     }
 534     write_bytes(m, (unsigned char*)"false", 5);
 535 }
 536 
 537 void handle_capital_none(j0_maker* m) {
 538     if (!demand_keyword(m, "None")) {
 539         fail(m, 1, "expected `None` keyword");
 540     }
 541     write_bytes(m, (unsigned char*)"null", 4);
 542 }
 543 
 544 void handle_capital_true(j0_maker* m) {
 545     if (!demand_keyword(m, "True")) {
 546         fail(m, 1, "expected `True` keyword");
 547     }
 548     write_bytes(m, (unsigned char*)"true", 4);
 549 }
 550 
 551 void handle_capital_false(j0_maker* m) {
 552     if (!demand_keyword(m, "False")) {
 553         fail(m, 1, "expected `False` keyword");
 554     }
 555     write_bytes(m, (unsigned char*)"false", 5);
 556 }
 557 
 558 void handle_digits(j0_maker* m) {
 559     if (!isdigit(m->current)) {
 560         fail(m, 1, "expected/missing digits");
 561     }
 562 
 563     while (isdigit(m->current)) {
 564         write_byte(m, m->current);
 565         advance(m);
 566     }
 567 }
 568 
 569 void handle_number(j0_maker* m) {
 570     handle_digits(m);
 571 
 572     const int lead = m->current;
 573 
 574     if (lead == '.') {
 575         write_byte(m, '.');
 576         advance(m);
 577 
 578         if (isdigit(m->current)) {
 579             handle_digits(m);
 580         } else {
 581             write_byte(m, '0');
 582         }
 583         return;
 584     }
 585 
 586     if (lead == 'e' || lead == 'E') {
 587         write_byte(m, lead);
 588         advance(m);
 589 
 590         if (m->current == '+') {
 591             advance(m);
 592         } else if (m->current == '-') {
 593             write_byte(m, '-');
 594             advance(m);
 595         }
 596 
 597         handle_digits(m);
 598     }
 599 }
 600 
 601 void handle_dot(j0_maker* m) {
 602     write_byte(m, '0');
 603     write_byte(m, '.');
 604     advance(m);
 605 
 606     if (!isdigit(m->current)) {
 607         fail(m, 1, "expected/missing digits after decimal dot");
 608     }
 609     handle_digits(m);
 610 }
 611 
 612 void handle_plus_number(j0_maker* m) {
 613     advance(m);
 614 
 615     if (m->current == '.') {
 616         handle_dot(m);
 617         return;
 618     }
 619     handle_number(m);
 620 }
 621 
 622 void handle_minus_number(j0_maker* m) {
 623     write_byte(m, '-');
 624     advance(m);
 625 
 626     if (m->current == '.') {
 627         handle_dot(m);
 628         return;
 629     }
 630     handle_number(m);
 631 }
 632 
 633 // decode_hex assumes valid hex digits, checked by func is_valid_hex
 634 uint32_t decode_hex(unsigned char hex) {
 635     if ('0' <= hex && hex <= '9') {
 636         return hex - '0';
 637     }
 638     if ('A' <= hex && hex <= 'F') {
 639         return hex - 'A' + 10;
 640     }
 641     if ('a' <= hex && hex <= 'f') {
 642         return hex - 'a' + 10;
 643     }
 644     return 0xffff;
 645 }
 646 
 647 static inline bool is_valid_hex(unsigned char b) {
 648     return false ||
 649         ('0' <= b && b <= '9') ||
 650         ('A' <= b && b <= 'F') ||
 651         ('a' <= b && b <= 'f');
 652 }
 653 
 654 // handle_low_char ensures characters whose ASCII codes are lower than spaces
 655 // are properly escaped for strings
 656 void handle_low_char(j0_maker* m, int c) {
 657     const char* hex = "0123456789ABCDEF";
 658 
 659     switch (c) {
 660     case '\t':
 661         write_byte(m, '\\');
 662         write_byte(m, 't');
 663         break;
 664     case '\n':
 665         write_byte(m, '\\');
 666         write_byte(m, 'n');
 667         break;
 668     case '\r':
 669         write_byte(m, '\\');
 670         write_byte(m, 'r');
 671         break;
 672     case '\b':
 673         write_byte(m, '\\');
 674         write_byte(m, 'b');
 675         break;
 676     case '\f':
 677         write_byte(m, '\\');
 678         write_byte(m, 'f');
 679         break;
 680     case '\v':
 681         write_byte(m, '\\');
 682         write_byte(m, 'v');
 683         break;
 684     default:
 685         write_byte(m, '\\');
 686         write_byte(m, 'u');
 687         write_byte(m, '0');
 688         write_byte(m, '0');
 689         write_byte(m, hex[c / 16]);
 690         write_byte(m, hex[c % 16]);
 691         break;
 692     }
 693 }
 694 
 695 void write_inner_string_hex_quad(j0_maker* m, const unsigned char quad[4]) {
 696     const uint32_t n = 0 +
 697         (decode_hex(quad[0]) << 12) +
 698         (decode_hex(quad[1]) << 8) +
 699         (decode_hex(quad[2]) << 4) +
 700         (decode_hex(quad[3]) << 0);
 701 
 702     switch (n) {
 703     case '"':
 704         write_byte(m, '\\');
 705         write_byte(m, '"');
 706         return;
 707     case '\\':
 708         write_byte(m, '\\');
 709         write_byte(m, '\\');
 710         return;
 711     }
 712 
 713     if (n >= ' ') {
 714         write_rune(m, n);
 715     } else {
 716         handle_low_char(m, n);
 717     }
 718 }
 719 
 720 void handle_hex_quad(j0_maker* m) {
 721     unsigned char quad[4];
 722     for (size_t i = 0; i < 4; i++) {
 723         advance(m);
 724         const int lead = m->current;
 725         if (lead == EOF) {
 726             fail(m, 1, "end of input before end of string");
 727         }
 728         if (is_valid_hex(lead)) {
 729             quad[i] = lead;
 730             continue;
 731         }
 732         fail(m, 1, "invalid hexadecimal digit in string");
 733     }
 734 
 735     write_inner_string_hex_quad(m, quad);
 736 }
 737 
 738 void handle_hex_pair(j0_maker* m) {
 739     unsigned char quad[4] = {'0', '0', '0', '0'};
 740     advance(m);
 741     const int a = m->current;
 742     advance(m);
 743     const int b = m->current;
 744     if (a == EOF || b == EOF) {
 745         fail(m, 1, "end of input before end of string");
 746     }
 747     if (!is_valid_hex(a) || !is_valid_hex(b)) {
 748         fail(m, 1, "invalid hexadecimal digit in string");
 749     }
 750 
 751     quad[2] = a;
 752     quad[3] = b;
 753     write_inner_string_hex_quad(m, quad);
 754 }
 755 
 756 void handle_string_escape(j0_maker* m, int c) {
 757     switch (c) {
 758     case '"':
 759     case '\\':
 760     case 'b':
 761     case 'f':
 762     case 'n':
 763     case 'r':
 764     case 't':
 765         write_byte(m, '\\');
 766         write_byte(m, c);
 767         break;
 768     case 'u':
 769         handle_hex_quad(m);
 770         break;
 771     case 'x':
 772         handle_hex_pair(m);
 773         break;
 774     case '\'':
 775         write_byte(m, '\'');
 776         break;
 777     default:
 778         write_byte(m, m->current);
 779         break;
 780     }
 781 }
 782 
 783 void handle_string(j0_maker* m) {
 784     const unsigned char quote = m->current;
 785     bool escaped = false;
 786 
 787     write_byte(m, '"');
 788 
 789     while (true) {
 790         advance(m);
 791 
 792         int c = m->current;
 793         if (c == EOF) {
 794             fail(m, 1, "input ended before string was close-quoted");
 795         }
 796 
 797         if (escaped) {
 798             handle_string_escape(m, c);
 799             escaped = false;
 800             continue;
 801         }
 802 
 803         switch (c) {
 804         case '\\':
 805             escaped = true;
 806             break;
 807         default:
 808             if (c == quote) {
 809                 write_byte(m, '"');
 810                 advance(m);
 811                 return;
 812             }
 813 
 814             // write_byte(m, c);
 815             if (c < ' ') {
 816                 handle_low_char(m, c);
 817             } else {
 818                 copy_utf8_rune(m);
 819             }
 820             break;
 821         }
 822     }
 823 }
 824 
 825 void handle_token(j0_maker* m);
 826 
 827 void handle_array(j0_maker* m) {
 828     write_byte(m, '[');
 829     advance(m);
 830 
 831     for (size_t i = 0; true; i++) {
 832         seek_token(m);
 833         const int lead = m->current;
 834 
 835         if (lead == EOF) {
 836             fail(m, 1, "unclosed array");
 837         }
 838 
 839         if (lead == ',') {
 840             advance(m);
 841             continue;
 842         }
 843 
 844         if (lead == ']') {
 845             write_byte(m, ']');
 846             advance(m);
 847             return;
 848         }
 849 
 850         if (i > 0) {
 851             write_byte(m, ',');
 852         }
 853         if (feof(m->out)) {
 854             return;
 855         }
 856         handle_token(m);
 857     }
 858 }
 859 
 860 // handle_array_jsonl is a slight variation of func handle_array: this one is
 861 // used to handle top-level arrays when running in JSON Lines mode, to emit
 862 // line-feeds after each item, instead of commas between them
 863 void handle_array_jsonl(j0_maker* m) {
 864     advance(m);
 865 
 866     for (size_t i = 0; true; i++) {
 867         seek_token(m);
 868         const int lead = m->current;
 869 
 870         if (lead == EOF) {
 871             fail(m, 1, "unclosed array");
 872         }
 873 
 874         if (lead == ',') {
 875             advance(m);
 876             continue;
 877         }
 878 
 879         if (i > 0) {
 880             write_byte(m, '\n');
 881         }
 882 
 883         if (lead == ']') {
 884             advance(m);
 885             return;
 886         }
 887 
 888         if (feof(m->out)) {
 889             return;
 890         }
 891         handle_token(m);
 892     }
 893 }
 894 
 895 void handle_unquoted_key(j0_maker* m) {
 896     write_byte(m, '"');
 897 
 898     while (true) {
 899         int c = m->current;
 900         if (c == EOF) {
 901             fail(m, 1, "input ended with an object key");
 902         }
 903 
 904         write_byte(m, c);
 905         advance(m);
 906 
 907         c = m->current;
 908         if (!isalpha(c) && !isdigit(c) && c != '_') {
 909             break;
 910         }
 911     }
 912 
 913     write_byte(m, '"');
 914 }
 915 
 916 void handle_object(j0_maker* m) {
 917     write_byte(m, '{');
 918     advance(m);
 919 
 920     for (size_t i = 0; true; i++) {
 921         seek_token(m);
 922         int lead = m->current;
 923 
 924         if (lead == EOF) {
 925             fail(m, 1, "unclosed object");
 926         }
 927 
 928         if (lead == ',') {
 929             advance(m);
 930             continue;
 931         }
 932 
 933         if (lead == '}') {
 934             write_byte(m, '}');
 935             advance(m);
 936             return;
 937         }
 938 
 939         if (feof(m->out)) {
 940             return;
 941         }
 942 
 943         if (lead == '"' || lead == '\'') {
 944             if (i > 0) {
 945                 write_byte(m, ',');
 946             }
 947             handle_string(m);
 948         } else if (isalpha(lead) || lead == '_') {
 949             if (i > 0) {
 950                 write_byte(m, ',');
 951             }
 952             handle_unquoted_key(m);
 953         } else {
 954             fail(m, 1, "only strings or identifiers can be object keys");
 955         }
 956 
 957         seek_token(m);
 958         lead = m->current;
 959 
 960         if (lead == EOF) {
 961             fail(m, 1, "input ended after object-key and before value");
 962         }
 963 
 964         if (lead != ':') {
 965             fail(m, 1, "a `:` must follow all object keys");
 966         }
 967 
 968         write_byte(m, ':');
 969         advance(m);
 970 
 971         seek_token(m);
 972         if (m->current == EOF) {
 973             fail(m, 1, "input ended after a `:` following an object-key");
 974         }
 975 
 976         handle_token(m);
 977     }
 978 }
 979 
 980 // dispatch ties leading bytes/chars in tokens to the funcs which handle them
 981 void (*dispatch[256])() = {
 982     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 983     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 984     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 985     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 986     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 987     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 988     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 989     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 990     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 991     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 992     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 993     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 994     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 995     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 996     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 997     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 998     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 999     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1000     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1001     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1002     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1003     NULL, NULL, NULL, NULL,
1004 };
1005 
1006 void handle_token(j0_maker* m) {
1007     dispatch[m->current](m);
1008 }
1009 
1010 // handle_invalid_token shows an error message and quits the app right after
1011 void handle_invalid_token(j0_maker* m) {
1012     char msg[64];
1013     unsigned char c = (unsigned char)m->current;
1014     sprintf(msg, "%c (%d): invalid token", c, c);
1015     fail(m, 1, msg);
1016 }
1017 
1018 void handle_array_jsonl(j0_maker* m);
1019 
1020 void handle_input(FILE* src, bool jsonl) {
1021     unsigned char ibuf[IBUF_SIZE];
1022     unsigned char obuf[OBUF_SIZE];
1023 
1024     j0_maker m;
1025     m.ibuf = ibuf;
1026     m.icap = sizeof(ibuf);
1027     m.obuf = obuf;
1028     m.ocap = sizeof(obuf);
1029     restart_state(&m, stdout, src);
1030 
1031     // ignore leading whitespace/comment bytes, if present
1032     seek_token(&m);
1033 
1034     if (m.current == EOF) {
1035         fail(&m, 1, "empty input isn't valid JSON");
1036     }
1037 
1038     if (jsonl && m.current == '[') {
1039         handle_array_jsonl(&m);
1040     } else {
1041         handle_token(&m);
1042         write_byte(&m, '\n');
1043     }
1044     flush(&m);
1045 
1046     // ignore trailing whitespace/comment bytes, if present
1047     seek_token(&m);
1048 
1049     // ignore trailing semicolon, if present
1050     if (m.current == ';') {
1051         advance(&m);
1052         // ignore trailing whitespace/comment bytes, if present
1053         seek_token(&m);
1054     }
1055 
1056     if (!feof(src) || m.current != EOF) {
1057         fail(&m, 1, "unexpected trailing JSON data");
1058     }
1059 }
1060 
// is_help_option checks if a command-line argument is one of the options to
// show the help message, which can start with either 1 or 2 dashes
bool is_help_option(const char* s) {
    static const char* const names[] = {"-h", "--h", "-help", "--help"};

    if (s[0] != '-' || s[1] == 0) {
        return false;
    }

    for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
        if (strcmp(s, names[i]) == 0) {
            return true;
        }
    }
    return false;
}
1069 
// is_jsonl_option checks if a command-line argument is one of the options to
// enable JSON Lines mode, which can start with either 1 or 2 dashes
bool is_jsonl_option(const char* s) {
    static const char* const names[] = {"-jl", "--jl", "-jsonl", "--jsonl"};

    if (s[0] != '-' || s[1] == 0) {
        return false;
    }

    for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
        if (strcmp(s, names[i]) == 0) {
            return true;
        }
    }
    return false;
}
1078 
// run handles the command-line arguments after the app's name, and returns
// the error code: a leading JSON Lines option and a leading `--` are both
// optional, and can be followed by up to 1 filepath. The standard input is
// used when there's no filepath, or when it's either empty or a single dash.
int run(int nargs, char** args) {
    const bool jsonl = nargs > 0 && is_jsonl_option(args[0]);
    if (jsonl) {
        nargs--;
        args++;
    }

    if (nargs > 0 && strcmp(args[0], "--") == 0) {
        nargs--;
        args++;
    }

    if (nargs > 1) {
        fprintf(stderr, ERROR_LINE("%s"), "can't use more than 1 named input");
        return 1;
    }

    // use stdin when not given a filepath
    const bool use_stdin =
        nargs == 0 || args[0][0] == 0 || strcmp(args[0], "-") == 0;
    if (use_stdin) {
        handle_input(stdin, jsonl);
        return 0;
    }

    FILE* f = fopen(args[0], "rb");
    if (f == NULL) {
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), args[0]);
        return 1;
    }

    handle_input(f, jsonl);
    fclose(f);

    return 0;
}
1117 
1118 int main(int argc, char** argv) {
1119 #ifdef _WIN32
1120     setmode(fileno(stdin), O_BINARY);
1121     // ensure output lines end in LF instead of CRLF on windows
1122     setmode(fileno(stdout), O_BINARY);
1123     setmode(fileno(stderr), O_BINARY);
1124 #endif
1125 
1126     if (argc > 1 && is_help_option(argv[1])) {
1127         printf("%s", info);
1128         return 0;
1129     }
1130 
1131     // the dispatch table starts as all null function-pointers
1132     for (size_t i = 0; i < sizeof(dispatch) / sizeof(dispatch[0]); i++) {
1133         dispatch[i] = handle_invalid_token;
1134     }
1135 
1136     for (size_t i = '0'; i <= '9'; i++) {
1137         dispatch[i] = handle_number;
1138     }
1139 
1140     dispatch['n'] = handle_null;
1141     dispatch['t'] = handle_true;
1142     dispatch['f'] = handle_false;
1143     dispatch['N'] = handle_capital_none;
1144     dispatch['T'] = handle_capital_true;
1145     dispatch['F'] = handle_capital_false;
1146     dispatch['.'] = handle_dot;
1147     dispatch['+'] = handle_plus_number;
1148     dispatch['-'] = handle_minus_number;
1149     dispatch['"'] = handle_string;
1150     dispatch['\''] = handle_string;
1151     dispatch['['] = handle_array;
1152     dispatch['{'] = handle_object;
1153 
1154     // enable full/block-buffering for standard output
1155     setvbuf(stdout, NULL, _IOFBF, 0);
1156 
1157     return run(argc - 1, argv + 1) == 0 ? 0 : 1;
1158 }