File: json0.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O3 -march=native -mtune=native -flto -o ./json0 ./json0.c
  29 
  30 To build a unit-testing app run
  31 
  32 cc -Wall -s -D TESTING -o ./json0_test ./json0.c
  33 */
  34 
  35 #include <ctype.h>
  36 #include <stdarg.h>
  37 #include <stdbool.h>
  38 #include <stdint.h>
  39 #include <stdio.h>
  40 #include <stdlib.h>
  41 #include <string.h>
  42 
  43 #ifdef _WIN32
  44 #include <fcntl.h>
  45 #include <windows.h>
  46 #endif
  47 
  48 #ifdef RED_ERRORS
  49 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  50 #ifdef __APPLE__
  51 #define ERROR_STYLE "\x1b[31m"
  52 #endif
  53 #define RESET_STYLE "\x1b[0m"
  54 #else
  55 #define ERROR_STYLE
  56 #define RESET_STYLE
  57 #endif
  58 
  59 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  60 
// IBUF_SIZE is the size in bytes of the input buffer declared in func
// handle_input: the #ifndef lets a build override it
#ifndef IBUF_SIZE
#define IBUF_SIZE (32 * 1024)
#endif

// OBUF_SIZE is the size in bytes of the output buffer declared in func
// handle_input: the #ifndef lets a build override it
#ifndef OBUF_SIZE
#define OBUF_SIZE (8 * 1024)
#endif
  68 
// info is the help text describing the app's usage and options; the code
// showing it isn't in this part of the file, presumably it's what the help
// options print
const char* info = ""
"json0 [options...] [file...]\n"
"\n"
"\n"
"JSON-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.\n"
"Its output is always a single line, which ends with a line-feed.\n"
"\n"
"Besides minimizing bytes, this tool also adapts almost-JSON input into\n"
"valid JSON, since it\n"
"\n"
"    - ignores both rest-of-line and multi-line comments\n"
"    - ignores extra/trailing commas in arrays and objects\n"
"    - turns single-quoted strings/keys into double-quoted strings\n"
"    - double-quotes unquoted object keys\n"
"    - changes \\x 2-hex-digit into \\u 4-hex-digit string-escapes\n"
"\n"
"All options available can either start with a single or a double-dash\n"
"\n"
"    -h        show this help message\n"
"    -help     show this help message\n"
"    -jsonl    emit JSON Lines, when top-level value is an array\n"
"";
  91 
// j0_maker holds all the state used to turn an input stream into JSON output:
// a buffered reader, a buffered writer, the line/position counters used for
// error messages, and a 1-byte lookahead
typedef struct j0_maker {
    FILE* in;  // the stream input chunks are read from
    FILE* out; // the stream the output buffer is written to

    unsigned char* ibuf;
    size_t ilen; // how many bytes are being used in the input buffer
    size_t icap; // the input buffer's capacity
    size_t ipos; // the current position in the input buffer

    size_t line; // the current line, used to show useful error messages
    size_t pos;  // the position in the current line, for error messages

    unsigned char* obuf;
    size_t ocap; // the output buffer's capacity
    size_t opos; // the current position in the output buffer

    int current; // the byte being handled, or EOF; updated by func advance
    int next;    // the byte read right after current, or EOF
} j0_maker;
 111 
 112 // advance_reader_pos helps func read_byte do its job
 113 static inline void advance_reader_pos(j0_maker* r, unsigned char b) {
 114     r->ipos++;
 115     if (b == '\n') {
 116         r->line++;
 117         r->pos = 1;
 118     } else {
 119         r->pos++;
 120     }
 121 }
 122 
 123 // read_byte does as it says: check its return for the value EOF, before
 124 // using it as the next byte
 125 static inline int read_byte(j0_maker* r) {
 126     if (r->ipos < r->ilen) {
 127         // inside current chunk
 128         const unsigned char b = r->ibuf[r->ipos];
 129         advance_reader_pos(r, b);
 130         return b;
 131     }
 132 
 133     // need to read the next block
 134     r->ipos = 0;
 135     r->ilen = fread(r->ibuf, sizeof(unsigned char), r->icap, r->in);
 136     if (r->ilen > 0) {
 137         const unsigned char b = r->ibuf[r->ipos];
 138         advance_reader_pos(r, b);
 139         return b;
 140     }
 141 
 142     // reached the end of data
 143     return EOF;
 144 }
 145 
 146 // advance is used in most of the code, instead of calling read_byte directly
 147 static inline void advance(j0_maker* r) {
 148     r->current = r->next;
 149     r->next = read_byte(r);
 150 }
 151 
 152 void fail(j0_maker* m, int code, const char* msg);
 153 
 154 void skip_line(j0_maker* r) {
 155     while (true) {
 156         advance(r);
 157         const int lead = r->current;
 158 
 159         if (lead == EOF) {
 160             break;
 161         }
 162 
 163         if (lead == '\n') {
 164             advance(r);
 165             break;
 166         }
 167     }
 168 }
 169 
 170 void skip_multiline_comment(j0_maker* r) {
 171     unsigned char prev = 0;
 172 
 173     while (true) {
 174         advance(r);
 175         const int lead = r->current;
 176 
 177         if (lead == EOF) {
 178             break;
 179         }
 180 
 181         if (prev == '*' && lead == '/') {
 182             advance(r);
 183             break;
 184         }
 185 
 186         prev = (unsigned char)lead;
 187     }
 188 }
 189 
 190 void skip_comment(j0_maker* r) {
 191     int lead = r->current;
 192 
 193     if (lead == '#') {
 194         skip_line(r);
 195         return;
 196     }
 197 
 198     if (lead != '/') {
 199         fail(r, 1, "expected a slash to start comments");
 200     }
 201 
 202     advance(r);
 203     lead = r->current;
 204 
 205     if (lead == '/') {
 206         skip_line(r);
 207         return;
 208     }
 209 
 210     if (lead == '*') {
 211         skip_multiline_comment(r);
 212         return;
 213     }
 214 
 215     fail(r, 1, "expected `//` or `/*` to start comments");
 216 }
 217 
 218 static inline void seek_token(j0_maker* r) {
 219     while (true) {
 220         const int lead = r->current;
 221 
 222         if (lead != EOF && lead <= ' ') {
 223             advance(r);
 224             continue;
 225         }
 226 
 227         if (lead == '/' || lead == '#') {
 228             skip_comment(r);
 229             continue;
 230         }
 231 
 232         break;
 233     }
 234 }
 235 
 236 bool starts_with_bom(const unsigned char* b, const size_t n) {
 237     return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
 238 }
 239 
 240 void restart_state(j0_maker* m, FILE* w, FILE* r) {
 241     m->in = r;
 242     m->ilen = 0;
 243     m->ipos = 0;
 244 
 245     m->out = w;
 246     m->opos = 0;
 247 
 248     m->line = 1;
 249     m->pos = 1;
 250 
 251     m->current = EOF;
 252     m->next = EOF;
 253 
 254     m->current = read_byte(m);
 255     if (m->current == EOF) {
 256         return;
 257     }
 258     m->next = read_byte(m);
 259 
 260     // skip leading UTF-8 BOM (byte-order mark), if present
 261     if (starts_with_bom(m->ibuf, m->ilen)) {
 262         // a UTF-8 BOM has 3 bytes
 263         for (size_t i = 0; i < 3 && m->current != EOF; i++) {
 264             advance(m);
 265         }
 266     }
 267 }
 268 
 269 void write_byte(j0_maker* m, unsigned char b) {
 270     if (m->opos < m->ocap) {
 271         m->obuf[m->opos++] = b;
 272         return;
 273     }
 274 
 275     fwrite(m->obuf, 1, m->ocap, m->out);
 276     m->obuf[0] = b;
 277     m->opos = 1;
 278 }
 279 
 280 // write_bytes does as it says, minimizing the number of calls to fwrite
 281 void write_bytes(j0_maker* m, const unsigned char* src, size_t len) {
 282     const size_t rem = m->ocap - m->opos;
 283     if (len < rem) {
 284         memcpy(m->obuf + m->opos, src, len);
 285         m->opos += len;
 286         return;
 287     }
 288 
 289     for (size_t i = 0; i < len; i++) {
 290         write_byte(m, src[i]);
 291     }
 292 }
 293 
 294 void flush(j0_maker* m) {
 295     if (m->opos > 0) {
 296         fwrite(m->obuf, 1, m->opos, m->out);
 297     }
 298     m->opos = 0;
 299     fflush(m->out);
 300 }
 301 
 302 // https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
 303 
 304 static inline bool check_2_byte_rune(int a, int b) {
 305     return (0xc2 <= a && a <= 0xdf) && (0x80 <= b && b <= 0xbf);
 306 }
 307 
 308 bool check_3_byte_rune(int a, int b, int c) {
 309     return (
 310         (a == 0xe0) &&
 311         (0xa0 <= b && b <= 0xbf) &&
 312         (0x80 <= c && c <= 0xbf)
 313     ) || (
 314         (0xe1 <= a && a <= 0xec) &&
 315         (0x80 <= b && b <= 0xbf) &&
 316         (0x80 <= c && c <= 0xbf)
 317     ) || (
 318         (a == 0xed) &&
 319         (0x80 <= b && b <= 0x9f) &&
 320         (0x80 <= c && c <= 0xbf)
 321     ) || (
 322         (a == 0xee || a == 0xef) &&
 323         (0x80 <= b && b <= 0xbf) &&
 324         (0x80 <= c && c <= 0xbf)
 325     );
 326 }
 327 
 328 bool check_4_byte_rune(int a, int b, int c, int d) {
 329     return (
 330         (a == 0xf0) &&
 331         (0x90 <= b && b <= 0xbf) &&
 332         (0x80 <= c && c <= 0xbf) &&
 333         (0x80 <= d && d <= 0xbf)
 334     ) || (
 335         (a == 0xf1 || a == 0xf3) &&
 336         (0x80 <= b && b <= 0xbf) &&
 337         (0x80 <= c && c <= 0xbf) &&
 338         (0x80 <= d && d <= 0xbf)
 339     ) || (
 340         (a == 0xf4) &&
 341         (0x80 <= b && b <= 0xbf) &&
 342         (0x80 <= c && c <= 0x8f) &&
 343         (0x80 <= d && d <= 0xbf)
 344     );
 345 }
 346 
 347 // write_replacement_char is the recommended action to handle invalid bytes
 348 void write_replacement_char(j0_maker* m) {
 349     write_byte(m, 0xef);
 350     write_byte(m, 0xbf);
 351     write_byte(m, 0xbd);
 352 }
 353 
 354 void handle_invalid_rune(j0_maker* m) {
 355     // fail(m, 1, "invalid unicode value");
 356     write_replacement_char(m);
 357 }
 358 
 359 // write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8
 360 void write_rune(j0_maker* m, uint32_t rune) {
 361     if (rune < (1 << 7)) {
 362         write_byte(m, rune);
 363         return;
 364     }
 365 
 366     if (rune < (1 << (5 + 6))) {
 367         const int a = 0b11000000 | (rune >> 6);
 368         const int b = 0b10000000 | (rune & 0b00111111);
 369         if (check_2_byte_rune(a, b)) {
 370             write_byte(m, a);
 371             write_byte(m, b);
 372         } else {
 373             write_replacement_char(m);
 374         }
 375         return;
 376     }
 377 
 378     if (rune < (1 << (4 + 6 + 6))) {
 379         const int a = 0b11100000 | (rune >> 12);
 380         const int b = 0b10000000 | ((rune >> 6) & 0b00111111);
 381         const int c = 0b10000000 | (rune & 0b00111111);
 382         if (check_3_byte_rune(a, b, c)) {
 383             write_byte(m, a);
 384             write_byte(m, b);
 385             write_byte(m, c);
 386         } else {
 387             write_replacement_char(m);
 388         }
 389         return;
 390     }
 391 
 392     if (rune < (1 << (3 + 6 + 6 + 6))) {
 393         const int a = 0b11110000 | (rune >> 18);
 394         const int b = 0b10000000 | ((rune >> 12) & 0b00111111);
 395         const int c = 0b10000000 | ((rune >> 6) & 0b00111111);
 396         const int d = 0b10000000 | (rune & 0b00111111);
 397         if (check_4_byte_rune(a, b, c, d)) {
 398             write_byte(m, a);
 399             write_byte(m, b);
 400             write_byte(m, c);
 401             write_byte(m, d);
 402         } else {
 403             write_replacement_char(m);
 404         }
 405         return;
 406     }
 407 
 408     write_replacement_char(m);
 409 }
 410 
 411 void copy_utf8_rune(j0_maker* m) {
 412     const int a = m->current;
 413 
 414     if (a == EOF) {
 415         return;
 416     }
 417 
 418     // handle 1-byte runes
 419     if (a < 128) {
 420         write_byte(m, a);
 421         return;
 422     }
 423 
 424     advance(m);
 425     const int b = m->current;
 426 
 427     if (b == EOF) {
 428         handle_invalid_rune(m);
 429         return;
 430     }
 431 
 432     // handle 2-byte runes
 433     if (check_2_byte_rune(a, b)) {
 434         write_byte(m, a);
 435         write_byte(m, b);
 436         return;
 437     }
 438 
 439     advance(m);
 440     const int c = m->current;
 441 
 442     if (c == EOF) {
 443         handle_invalid_rune(m);
 444         return;
 445     }
 446 
 447     // handle 3-byte runes
 448     if (check_3_byte_rune(a, b, c)) {
 449         write_byte(m, a);
 450         write_byte(m, b);
 451         write_byte(m, c);
 452         return;
 453     }
 454 
 455     advance(m);
 456     const int d = m->current;
 457 
 458     if (d == EOF) {
 459         handle_invalid_rune(m);
 460         return;
 461     }
 462 
 463     // handle 4-byte runes
 464     if (check_4_byte_rune(a, b, c, d)) {
 465         write_byte(m, a);
 466         write_byte(m, b);
 467         write_byte(m, c);
 468         write_byte(m, d);
 469         return;
 470     }
 471 
 472     handle_invalid_rune(m);
 473 }
 474 
 475 // debug is available to diagnose any bug found
 476 void debug(j0_maker* m, const char* fmt, ...) {
 477     va_list args;
 478     va_start(args, fmt);
 479 
 480     if (m->in != stdin) {
 481         fclose(m->in);
 482     }
 483 
 484     write_byte(m, '\n');
 485 
 486     const unsigned long line = m->line;
 487     const unsigned long pos = m->pos;
 488     fprintf(stderr, "\x1b[46m\x1b[37mline %lu, pos %lu: ", line, pos);
 489     fprintf(stderr, fmt, args);
 490     fprintf(stderr, "\x1b[0m\n");
 491 
 492     va_end(args);
 493 
 494     exit(10);
 495 }
 496 
 497 // fail quits this app right after showing the error message given
 498 void fail(j0_maker* m, int code, const char* msg) {
 499     const unsigned long line = m->line;
 500     const unsigned long pos = m->pos;
 501 
 502     write_byte(m, '\n');
 503     flush(m);
 504     fprintf(stderr, ERROR_LINE("line %lu, pos %lu: %s"), line, pos, msg);
 505     exit(code);
 506 }
 507 
 508 bool demand_keyword(j0_maker* m, char* rest) {
 509     for (; rest[0] != 0; rest++) {
 510         const int lead = m->current;
 511         if (lead == EOF || lead != rest[0]) {
 512             return false;
 513         }
 514         advance(m);
 515     }
 516 
 517     return rest[0] == 0;
 518 }
 519 
 520 void handle_null(j0_maker* m) {
 521     if (!demand_keyword(m, "null")) {
 522         fail(m, 1, "expected `null` keyword");
 523     }
 524     write_bytes(m, (unsigned char*)"null", 4);
 525 }
 526 
 527 void handle_true(j0_maker* m) {
 528     if (!demand_keyword(m, "true")) {
 529         fail(m, 1, "expected `true` keyword");
 530     }
 531     write_bytes(m, (unsigned char*)"true", 4);
 532 }
 533 
 534 void handle_false(j0_maker* m) {
 535     if (!demand_keyword(m, "false")) {
 536         fail(m, 1, "expected `false` keyword");
 537     }
 538     write_bytes(m, (unsigned char*)"false", 5);
 539 }
 540 
 541 void handle_capital_none(j0_maker* m) {
 542     if (!demand_keyword(m, "None")) {
 543         fail(m, 1, "expected `None` keyword");
 544     }
 545     write_bytes(m, (unsigned char*)"null", 4);
 546 }
 547 
 548 void handle_capital_true(j0_maker* m) {
 549     if (!demand_keyword(m, "True")) {
 550         fail(m, 1, "expected `True` keyword");
 551     }
 552     write_bytes(m, (unsigned char*)"true", 4);
 553 }
 554 
 555 void handle_capital_false(j0_maker* m) {
 556     if (!demand_keyword(m, "False")) {
 557         fail(m, 1, "expected `False` keyword");
 558     }
 559     write_bytes(m, (unsigned char*)"false", 5);
 560 }
 561 
 562 void handle_digits(j0_maker* m) {
 563     if (!isdigit(m->current)) {
 564         fail(m, 1, "expected/missing digits");
 565     }
 566 
 567     while (isdigit(m->current)) {
 568         write_byte(m, m->current);
 569         advance(m);
 570     }
 571 }
 572 
 573 void handle_number(j0_maker* m) {
 574     handle_digits(m);
 575 
 576     const int lead = m->current;
 577 
 578     if (lead == '.') {
 579         write_byte(m, '.');
 580         advance(m);
 581 
 582         if (isdigit(m->current)) {
 583             handle_digits(m);
 584         } else {
 585             write_byte(m, '0');
 586         }
 587         return;
 588     }
 589 
 590     if (lead == 'e' || lead == 'E') {
 591         write_byte(m, lead);
 592         advance(m);
 593 
 594         if (m->current == '+') {
 595             advance(m);
 596         } else if (m->current == '-') {
 597             write_byte(m, '-');
 598             advance(m);
 599         }
 600 
 601         handle_digits(m);
 602     }
 603 }
 604 
 605 void handle_dot(j0_maker* m) {
 606     write_byte(m, '0');
 607     write_byte(m, '.');
 608     advance(m);
 609 
 610     if (!isdigit(m->current)) {
 611         fail(m, 1, "expected/missing digits after decimal dot");
 612     }
 613     handle_digits(m);
 614 }
 615 
 616 void handle_plus_number(j0_maker* m) {
 617     advance(m);
 618 
 619     if (m->current == '.') {
 620         handle_dot(m);
 621         return;
 622     }
 623     handle_number(m);
 624 }
 625 
 626 void handle_minus_number(j0_maker* m) {
 627     write_byte(m, '-');
 628     advance(m);
 629 
 630     if (m->current == '.') {
 631         handle_dot(m);
 632         return;
 633     }
 634     handle_number(m);
 635 }
 636 
 637 // decode_hex assumes valid hex digits, checked by func is_valid_hex
 638 uint32_t decode_hex(unsigned char hex) {
 639     if ('0' <= hex && hex <= '9') {
 640         return hex - '0';
 641     }
 642     if ('A' <= hex && hex <= 'F') {
 643         return hex - 'A' + 10;
 644     }
 645     if ('a' <= hex && hex <= 'f') {
 646         return hex - 'a' + 10;
 647     }
 648     return 0xffff;
 649 }
 650 
 651 static inline bool is_valid_hex(unsigned char b) {
 652     return false ||
 653         ('0' <= b && b <= '9') ||
 654         ('A' <= b && b <= 'F') ||
 655         ('a' <= b && b <= 'f');
 656 }
 657 
 658 // handle_low_char ensures characters whose ASCII codes are lower than spaces
 659 // are properly escaped for strings
 660 void handle_low_char(j0_maker* m, int c) {
 661     const char* hex = "0123456789ABCDEF";
 662 
 663     switch (c) {
 664     case '\t':
 665         write_byte(m, '\\');
 666         write_byte(m, 't');
 667         break;
 668     case '\n':
 669         write_byte(m, '\\');
 670         write_byte(m, 'n');
 671         break;
 672     case '\r':
 673         write_byte(m, '\\');
 674         write_byte(m, 'r');
 675         break;
 676     case '\b':
 677         write_byte(m, '\\');
 678         write_byte(m, 'b');
 679         break;
 680     case '\f':
 681         write_byte(m, '\\');
 682         write_byte(m, 'f');
 683         break;
 684     case '\v':
 685         write_byte(m, '\\');
 686         write_byte(m, 'v');
 687         break;
 688     default:
 689         write_byte(m, '\\');
 690         write_byte(m, 'u');
 691         write_byte(m, '0');
 692         write_byte(m, '0');
 693         write_byte(m, hex[c / 16]);
 694         write_byte(m, hex[c % 16]);
 695         break;
 696     }
 697 }
 698 
 699 void write_inner_string_hex_quad(j0_maker* m, const unsigned char quad[4]) {
 700     const uint32_t n = 0 +
 701         (decode_hex(quad[0]) << 12) +
 702         (decode_hex(quad[1]) << 8) +
 703         (decode_hex(quad[2]) << 4) +
 704         (decode_hex(quad[3]) << 0);
 705 
 706     switch (n) {
 707     case '"':
 708         write_byte(m, '\\');
 709         write_byte(m, '"');
 710         return;
 711     case '\\':
 712         write_byte(m, '\\');
 713         write_byte(m, '\\');
 714         return;
 715     }
 716 
 717     if (n >= ' ') {
 718         write_rune(m, n);
 719     } else {
 720         handle_low_char(m, n);
 721     }
 722 }
 723 
 724 void handle_hex_quad(j0_maker* m) {
 725     unsigned char quad[4];
 726     for (size_t i = 0; i < 4; i++) {
 727         advance(m);
 728         const int lead = m->current;
 729         if (lead == EOF) {
 730             fail(m, 1, "end of input before end of string");
 731         }
 732         if (is_valid_hex(lead)) {
 733             quad[i] = lead;
 734             continue;
 735         }
 736         fail(m, 1, "invalid hexadecimal digit in string");
 737     }
 738 
 739     write_inner_string_hex_quad(m, quad);
 740 }
 741 
 742 void handle_hex_pair(j0_maker* m) {
 743     unsigned char quad[4] = {'0', '0', '0', '0'};
 744     advance(m);
 745     const int a = m->current;
 746     advance(m);
 747     const int b = m->current;
 748     if (a == EOF || b == EOF) {
 749         fail(m, 1, "end of input before end of string");
 750     }
 751     if (!is_valid_hex(a) || !is_valid_hex(b)) {
 752         fail(m, 1, "invalid hexadecimal digit in string");
 753     }
 754 
 755     quad[2] = a;
 756     quad[3] = b;
 757     write_inner_string_hex_quad(m, quad);
 758 }
 759 
 760 void handle_string_escape(j0_maker* m, int c) {
 761     switch (c) {
 762     case '"':
 763     case '\\':
 764     case 'b':
 765     case 'f':
 766     case 'n':
 767     case 'r':
 768     case 't':
 769         write_byte(m, '\\');
 770         write_byte(m, c);
 771         break;
 772     case 'u':
 773         handle_hex_quad(m);
 774         break;
 775     case 'x':
 776         handle_hex_pair(m);
 777         break;
 778     case '\'':
 779         write_byte(m, '\'');
 780         break;
 781     default:
 782         write_byte(m, m->current);
 783         break;
 784     }
 785 }
 786 
 787 void handle_string(j0_maker* m) {
 788     const unsigned char quote = m->current;
 789     bool escaped = false;
 790 
 791     write_byte(m, '"');
 792 
 793     while (true) {
 794         advance(m);
 795 
 796         int c = m->current;
 797         if (c == EOF) {
 798             fail(m, 1, "input ended before string was close-quoted");
 799         }
 800 
 801         if (escaped) {
 802             handle_string_escape(m, c);
 803             escaped = false;
 804             continue;
 805         }
 806 
 807         switch (c) {
 808         case '\\':
 809             escaped = true;
 810             break;
 811         default:
 812             if (c == quote) {
 813                 write_byte(m, '"');
 814                 advance(m);
 815                 return;
 816             }
 817 
 818             // write_byte(m, c);
 819             if (c < ' ') {
 820                 handle_low_char(m, c);
 821             } else {
 822                 copy_utf8_rune(m);
 823             }
 824             break;
 825         }
 826     }
 827 }
 828 
 829 void handle_token(j0_maker* m);
 830 
 831 void handle_array(j0_maker* m) {
 832     size_t items = 0;
 833     const unsigned char end = m->current == '[' ? ']' : ')';
 834     write_byte(m, '[');
 835     advance(m);
 836 
 837     while (true) {
 838         seek_token(m);
 839         const int lead = m->current;
 840 
 841         if (lead == EOF) {
 842             fail(m, 1, "unclosed array");
 843         }
 844 
 845         if (lead == ',') {
 846             advance(m);
 847             continue;
 848         }
 849 
 850         if (lead == end) {
 851             write_byte(m, ']');
 852             advance(m);
 853             return;
 854         }
 855 
 856         if (items > 0) {
 857             write_byte(m, ',');
 858         }
 859         if (feof(m->out)) {
 860             return;
 861         }
 862         handle_token(m);
 863         items++;
 864     }
 865 }
 866 
 867 // handle_array_jsonl is a slight variation of func handle_array: this one is
 868 // used to handle top-level arrays when running in JSON Lines mode, to emit
 869 // line-feeds after each item, instead of commas between them
 870 void handle_array_jsonl(j0_maker* m) {
 871     const unsigned char end = m->current == '[' ? ']' : ')';
 872     advance(m);
 873 
 874     while (true) {
 875         seek_token(m);
 876         const int lead = m->current;
 877 
 878         if (lead == EOF) {
 879             fail(m, 1, "unclosed array");
 880         }
 881 
 882         if (lead == ',') {
 883             advance(m);
 884             continue;
 885         }
 886 
 887         if (lead == end) {
 888             advance(m);
 889             return;
 890         }
 891 
 892         if (feof(m->out)) {
 893             return;
 894         }
 895 
 896         handle_token(m);
 897         write_byte(m, '\n');
 898     }
 899 }
 900 
 901 void handle_unquoted_key(j0_maker* m) {
 902     write_byte(m, '"');
 903 
 904     while (true) {
 905         int c = m->current;
 906         if (c == EOF) {
 907             fail(m, 1, "input ended with an object key");
 908         }
 909 
 910         write_byte(m, c);
 911         advance(m);
 912 
 913         c = m->current;
 914         if (!isalpha(c) && !isdigit(c) && c != '_') {
 915             break;
 916         }
 917     }
 918 
 919     write_byte(m, '"');
 920 }
 921 
 922 void handle_object(j0_maker* m) {
 923     size_t items = 0;
 924     write_byte(m, '{');
 925     advance(m);
 926 
 927     while (true) {
 928         seek_token(m);
 929         int lead = m->current;
 930 
 931         if (lead == EOF) {
 932             fail(m, 1, "unclosed object");
 933         }
 934 
 935         if (lead == ',') {
 936             advance(m);
 937             continue;
 938         }
 939 
 940         if (lead == '}') {
 941             write_byte(m, '}');
 942             advance(m);
 943             return;
 944         }
 945 
 946         if (feof(m->out)) {
 947             return;
 948         }
 949 
 950         if (lead == '"' || lead == '\'') {
 951             if (items > 0) {
 952                 write_byte(m, ',');
 953             }
 954             handle_string(m);
 955         } else if (isalpha(lead) || lead == '_') {
 956             if (items > 0) {
 957                 write_byte(m, ',');
 958             }
 959             handle_unquoted_key(m);
 960         } else {
 961             fail(m, 1, "only strings or identifiers can be object keys");
 962         }
 963 
 964         seek_token(m);
 965         lead = m->current;
 966 
 967         if (lead == EOF) {
 968             fail(m, 1, "input ended after object-key and before value");
 969         }
 970 
 971         if (lead != ':') {
 972             fail(m, 1, "a `:` must follow all object keys");
 973         }
 974 
 975         write_byte(m, ':');
 976         advance(m);
 977 
 978         seek_token(m);
 979         if (m->current == EOF) {
 980             fail(m, 1, "input ended after a `:` following an object-key");
 981         }
 982 
 983         handle_token(m);
 984         items++;
 985     }
 986 }
 987 
 988 // dispatch ties leading bytes/chars in tokens to the funcs which handle them
 989 void (*dispatch[256])() = {
 990     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 991     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 992     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 993     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 994     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 995     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 996     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 997     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 998     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 999     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1000     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1001     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1002     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1003     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1004     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1005     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1006     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1007     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1008     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1009     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1010     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1011     NULL, NULL, NULL, NULL,
1012 };
1013 
1014 void handle_token(j0_maker* m) {
1015     dispatch[m->current](m);
1016 }
1017 
1018 // handle_invalid_token shows an error message and quits the app right after
1019 void handle_invalid_token(j0_maker* m) {
1020     char msg[64];
1021     unsigned char c = (unsigned char)m->current;
1022     sprintf(msg, "%c (%d): invalid token", c, c);
1023     fail(m, 1, msg);
1024 }
1025 
1026 void handle_array_jsonl(j0_maker* m);
1027 
1028 void handle_input(FILE* out, FILE* src, bool jsonl) {
1029     unsigned char ibuf[IBUF_SIZE];
1030     unsigned char obuf[OBUF_SIZE];
1031 
1032     j0_maker m;
1033     m.ibuf = ibuf;
1034     m.icap = sizeof(ibuf);
1035     m.obuf = obuf;
1036     m.ocap = sizeof(obuf);
1037     restart_state(&m, out, src);
1038 
1039     // ignore leading whitespace/comment bytes, if present
1040     seek_token(&m);
1041 
1042     if (m.current == EOF) {
1043         fail(&m, 1, "empty input isn't valid JSON");
1044     }
1045 
1046     if (jsonl && m.current == '[') {
1047         handle_array_jsonl(&m);
1048     } else {
1049         handle_token(&m);
1050         write_byte(&m, '\n');
1051     }
1052     flush(&m);
1053 
1054     // ignore trailing whitespace/comment bytes, if present
1055     seek_token(&m);
1056 
1057     // ignore trailing semicolon, if present
1058     if (m.current == ';') {
1059         advance(&m);
1060         // ignore trailing whitespace/comment bytes, if present
1061         seek_token(&m);
1062     }
1063 
1064     if (!feof(src) || m.current != EOF) {
1065         fail(&m, 1, "unexpected trailing JSON data");
1066     }
1067 }
1068 
// is_help_option checks if the string given is one of the help options,
// which can start with either a single or a double dash
bool is_help_option(const char* s) {
    if (s[0] != '-') {
        return false;
    }

    // skip the leading dash, along with a 2nd dash, if present
    const char* name = s + 1;
    if (name[0] == '-') {
        name++;
    }

    return strcmp(name, "h") == 0 || strcmp(name, "help") == 0;
}
1077 
// is_jsonl_option checks if the string given is one of the options to emit
// JSON Lines, which can start with either a single or a double dash
bool is_jsonl_option(const char* s) {
    if (s[0] != '-') {
        return false;
    }

    // skip the leading dash, along with a 2nd dash, if present
    const char* name = s + 1;
    if (name[0] == '-') {
        name++;
    }

    return strcmp(name, "jl") == 0 || strcmp(name, "jsonl") == 0;
}
1086 
// run handles the command-line arguments given, which exclude the app's
// name, and returns the exit code for the app: an optional JSON Lines option
// can come first, followed by an optional `--`, followed by at most 1 file
// path; no path, an empty path, and a single dash all mean standard input
int run(int nargs, char** args) {
    bool jsonl = false;
    if (nargs > 0 && is_jsonl_option(args[0])) {
        jsonl = true;
        args++;
        nargs--;
    }

    if (nargs > 0 && strcmp(args[0], "--") == 0) {
        args++;
        nargs--;
    }

    if (nargs > 1) {
        fprintf(stderr, ERROR_LINE("%s"), "can't use more than 1 named input");
        return 1;
    }

    // use stdin when not given a filepath
    const char* path = (nargs > 0) ? args[0] : "";
    const bool use_stdin = (path[0] == 0) || (strcmp(path, "-") == 0);
    if (use_stdin) {
        handle_input(stdout, stdin, jsonl);
        return 0;
    }

    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
        return 1;
    }

    handle_input(stdout, f, jsonl);
    fclose(f);

    return 0;
}
1125 
1126 #ifdef TESTING
1127 bool run_test(const char* name, const char* input, const char* expected) {
1128     unsigned char result[OBUF_SIZE];
1129 
1130     fprintf(stdout, "running test named \"%s\"\n", name);
1131 
1132     FILE* in = fmemopen((void*)input, strlen(input), "rb");
1133     if (in == NULL) {
1134         fprintf(stdout, "fmemopen failed\n");
1135         return false;
1136     }
1137 
1138     memset(result, 0, sizeof(result));
1139     FILE* out = fmemopen((void*)result, sizeof(result), "wb");
1140     if (out == NULL) {
1141         fprintf(stdout, "fmemopen failed\n");
1142         return false;
1143     }
1144 
1145     handle_input(out, in, false);
1146 
1147     fclose(out);
1148     fclose(in);
1149 
1150     // remove trailing line-feed from the result
1151     for (ssize_t i = sizeof(result) - 1; i >= 0; i--) {
1152         if (result[i] == '\n') {
1153             result[i] = 0;
1154             break;
1155         }
1156     }
1157 
1158     const bool ok = strcmp((char*)result, expected) == 0;
1159     if (!ok) {
1160         fprintf(stdout, "  input:    %s\n", input);
1161         fprintf(stdout, "  expected: %s\n", expected);
1162         fprintf(stdout, "  result:   %s\n", result);
1163     }
1164     return ok;
1165 }
1166 
// test runs all the test cases in its table through run_test, showing a
// message on stdout for each case which fails: it returns 0 when all cases
// succeed, and 1 when any of them fail
int test() {
    typedef struct test_case {
        const char* name;
        const char* input;
        const char* expected;
    } test_case;

    // each entry has the test's name, its input, and its expected output
    static const test_case cases[] = {
        {"null", "null", "null"},
        {"false", "false", "false"},
        {"true", "true", "true"},
        {"None", "None", "null"},
        {"False", "False", "false"},
        {"True", "True", "true"},
        {"zero", "0", "0"},
        {"zero with decimals", "0.0000", "0.0000"},
        {"negative number", "-1230.324", "-1230.324"},
        {"leading plus", "+1230.324", "1230.324"},
        {"leading dot", ".123", "0.123"},
        {"leading negative dot", "-.123", "-0.123"},
        {"leading positive dot", "+.123", "0.123"},
        {"empty string", "\"\"", "\"\""},
        {"single-quoted string", "'abc def'", "\"abc def\""},
        {
            "string with double-quotes in it",
            "\"\\\"cats and dogs\\\" goes the saying\"",
            "\"\\\"cats and dogs\\\" goes the saying\"",
        },
        {
            "string with escaped hex-digit values in it",
            "\"\\x00\\u0000\\x09\"",
            "\"\\u0000\\u0000\\t\"",
        },
        {"empty array", "[]", "[]"},
        {"empty array, extra comma", "[ , ]", "[]"},
        {"empty object", "{}", "{}"},
        {"empty object, extra commas", "{,, , ,,}", "{}"},
        {"numeric array", "[,,1, 2, 3, ]", "[1,2,3]"},
        {"simple nested array", "[1, 2, 3, []]", "[1,2,3,[]]"},
        {
            "another simple nested array",
            "[1, 2, 3, [false,\"abc\"]]",
            "[1,2,3,[false,\"abc\"]]",
        },
        {
            "fancier nested array",
            "[1, 2, 3, [[  -.233,  false,] , , ,,, 'abc']]",
            "[1,2,3,[[-0.233,false],\"abc\"]]",
        },
        {
            "simple object, extra commas",
            "{,'abc'  : 123, , ,'def': 987,}",
            "{\"abc\":123,\"def\":987}",
        },
        {
            "simple object, extra commas, unquoted object keys",
            "{,abc  : 123, , ,def: 987,}",
            "{\"abc\":123,\"def\":987}",
        },
        {
            "numeric array with trailing single-line comment",
            "[1, 2, 3, ] // comments aren't valid JSON",
            "[1,2,3]",
        },
        {
            "numeric array with comments",
            "/* hi there */ [1, 2, /* 3 better be next */ 3, ]"
            " // I'll have the last word # you wish",
            "[1,2,3]",
        },
        {
            "self-compacting shebang",
            "#!/usr/bin/json0\n[, 1 , , 2 , , , 3]",
            "[1,2,3]",
        },
        {
            "pyon example 1",
            "[True,False,'abc\\x0adef',None,+12.45]",
            "[true,false,\"abc\\ndef\",null,12.45]",
        },
        {
            "pyon example 2",
            "[{'abc':123},'abc\\x0adef',None,+12.45]",
            "[{\"abc\":123},\"abc\\ndef\",null,12.45]",
        },
    };

    const size_t count = sizeof(cases) / sizeof(cases[0]);
    size_t failures = 0;

    // keep going after failures, so every failing case is reported
    for (const test_case* tc = cases; tc < cases + count; tc++) {
        if (run_test(tc->name, tc->input, tc->expected)) {
            continue;
        }
        fprintf(stdout, "\x1b[31mtest named \"%s\" failed\x1b[0m\n", tc->name);
        failures++;
    }

    return failures > 0 ? 1 : 0;
}
1264 #endif
1265 
1266 int main(int argc, char** argv) {
1267 #ifdef _WIN32
1268     setmode(fileno(stdin), O_BINARY);
1269     // ensure output lines end in LF instead of CRLF on windows
1270     setmode(fileno(stdout), O_BINARY);
1271     setmode(fileno(stderr), O_BINARY);
1272 #endif
1273 
1274 #ifndef TESTING
1275     if (argc > 1 && is_help_option(argv[1])) {
1276         printf("%s", info);
1277         return 0;
1278     }
1279 #endif
1280 
1281     // the dispatch table starts as all null function-pointers
1282     for (size_t i = 0; i < sizeof(dispatch) / sizeof(dispatch[0]); i++) {
1283         dispatch[i] = handle_invalid_token;
1284     }
1285 
1286     for (size_t i = '0'; i <= '9'; i++) {
1287         dispatch[i] = handle_number;
1288     }
1289 
1290     dispatch['n'] = handle_null;
1291     dispatch['t'] = handle_true;
1292     dispatch['f'] = handle_false;
1293     dispatch['N'] = handle_capital_none;
1294     dispatch['T'] = handle_capital_true;
1295     dispatch['F'] = handle_capital_false;
1296     dispatch['.'] = handle_dot;
1297     dispatch['+'] = handle_plus_number;
1298     dispatch['-'] = handle_minus_number;
1299     dispatch['"'] = handle_string;
1300     dispatch['\''] = handle_string;
1301     dispatch['['] = handle_array;
1302     dispatch['('] = handle_array;
1303     dispatch['{'] = handle_object;
1304 
1305     #ifdef TESTING
1306         return test();
1307     #else
1308         // enable full/block-buffering for standard output
1309         setvbuf(stdout, NULL, _IOFBF, 0);
1310 
1311         return run(argc - 1, argv + 1) == 0 ? 0 : 1;
1312     #endif
1313 }