File: jsonl.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O3 -march=native -mtune=native -flto -o ./jsonl ./jsonl.c
  29 */
  30 
  31 #include <ctype.h>
  32 #include <stdarg.h>
  33 #include <stdbool.h>
  34 #include <stdint.h>
  35 #include <stdio.h>
  36 #include <stdlib.h>
  37 #include <string.h>
  38 
  39 #ifdef _WIN32
  40 #include <fcntl.h>
  41 #include <windows.h>
  42 #endif
  43 
  44 #ifdef RED_ERRORS
  45 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  46 #ifdef __APPLE__
  47 #define ERROR_STYLE "\x1b[31m"
  48 #endif
  49 #define RESET_STYLE "\x1b[0m"
  50 #else
  51 #define ERROR_STYLE
  52 #define RESET_STYLE
  53 #endif
  54 
  55 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  56 
  57 #ifndef IBUF_SIZE
  58 #define IBUF_SIZE (32 * 1024)
  59 #endif
  60 
  61 #ifndef OBUF_SIZE
  62 #define OBUF_SIZE (8 * 1024)
  63 #endif
  64 
  65 // #define JSON0
  66 
  67 const char* info = ""
  68 "jsonl [options...] [files...]\n"
  69 "\n"
  70 "\n"
  71 "JSON Lines converts/fixes JSON/pseudo-JSON input into lines of text, each\n"
  72 "with valid JSON in it. Multiple lines are emitted when the top-level value\n"
  73 "is an array, while a single line is emitted for any other top-level type."
  74 "\n"
  75 "Besides splitting top-level items into a line-streamable format, this tool\n"
  76 "also adapts almost-JSON input into valid JSON, since it\n"
  77 "\n"
  78 "    - ignores both rest-of-line and multi-line comments\n"
  79 "    - ignores extra/trailing commas in arrays and objects\n"
  80 "    - turns single-quoted strings/keys into double-quoted strings\n"
  81 "    - double-quotes unquoted object keys\n"
  82 "    - changes \\x 2-hex-digit into \\u 4-hex-digit string-escapes\n"
  83 "\n"
  84 "All options available can either start with a single or a double-dash\n"
  85 "\n"
  86 "    -h        show this help message\n"
  87 "    -help     show this help message\n"
  88 "";
  89 
  90 typedef struct jl_maker {
  91     FILE* in;
  92     FILE* out;
  93 
  94     unsigned char* ibuf;
  95     size_t ilen; // how many bytes are being used in the input buffer
  96     size_t icap; // the input buffer's capacity
  97     size_t ipos; // the current position in the input buffer
  98 
  99     size_t line; // the current line, used to show useful error messages
 100     size_t pos;  // the position in the current line, for error messages
 101 
 102     unsigned char* obuf;
 103     size_t ocap; // the output buffer's capacity
 104     size_t opos; // the current position in the output buffer
 105 
 106     int current;
 107     int next;
 108 } jl_maker;
 109 
 110 // advance_reader_pos helps func read_byte do its job
 111 static inline void advance_reader_pos(jl_maker* r, unsigned char b) {
 112     r->ipos++;
 113     if (b == '\n') {
 114         r->line++;
 115         r->pos = 1;
 116     } else {
 117         r->pos++;
 118     }
 119 }
 120 
 121 // read_byte does as it says: check its return for the value EOF, before
 122 // using it as the next byte
 123 static inline int read_byte(jl_maker* r) {
 124     if (r->ipos < r->ilen) {
 125         // inside current chunk
 126         const unsigned char b = r->ibuf[r->ipos];
 127         advance_reader_pos(r, b);
 128         return b;
 129     }
 130 
 131     // need to read the next block
 132     r->ipos = 0;
 133     r->ilen = fread(r->ibuf, sizeof(unsigned char), r->icap, r->in);
 134     if (r->ilen > 0) {
 135         const unsigned char b = r->ibuf[r->ipos];
 136         advance_reader_pos(r, b);
 137         return b;
 138     }
 139 
 140     // reached the end of data
 141     return EOF;
 142 }
 143 
 144 // advance is used in most of the code, instead of calling read_byte directly
 145 static inline void advance(jl_maker* r) {
 146     r->current = r->next;
 147     r->next = read_byte(r);
 148 }
 149 
 150 void fail(jl_maker* m, int code, const char* msg);
 151 
 152 void skip_line(jl_maker* r) {
 153     while (true) {
 154         advance(r);
 155         const int lead = r->current;
 156 
 157         if (lead == EOF) {
 158             break;
 159         }
 160 
 161         if (lead == '\n') {
 162             advance(r);
 163             break;
 164         }
 165     }
 166 }
 167 
 168 void skip_multiline_comment(jl_maker* r) {
 169     unsigned char prev = 0;
 170 
 171     while (true) {
 172         advance(r);
 173         const int lead = r->current;
 174 
 175         if (lead == EOF) {
 176             break;
 177         }
 178 
 179         if (prev == '*' && lead == '/') {
 180             advance(r);
 181             break;
 182         }
 183 
 184         prev = (unsigned char)lead;
 185     }
 186 }
 187 
 188 void skip_comment(jl_maker* r) {
 189     int lead = r->current;
 190 
 191     if (lead == '#') {
 192         skip_line(r);
 193         return;
 194     }
 195 
 196     if (lead != '/') {
 197         fail(r, 1, "expected a slash to start comments");
 198     }
 199 
 200     advance(r);
 201     lead = r->current;
 202 
 203     if (lead == '/') {
 204         skip_line(r);
 205         return;
 206     }
 207 
 208     if (lead == '*') {
 209         skip_multiline_comment(r);
 210         return;
 211     }
 212 
 213     fail(r, 1, "expected `//` or `/*` to start comments");
 214 }
 215 
 216 static inline void seek_token(jl_maker* r) {
 217     while (true) {
 218         const int lead = r->current;
 219 
 220         if (lead != EOF && lead <= ' ') {
 221             advance(r);
 222             continue;
 223         }
 224 
 225         if (lead == '/' || lead == '#') {
 226             skip_comment(r);
 227             continue;
 228         }
 229 
 230         break;
 231     }
 232 }
 233 
 234 bool starts_with_bom(const unsigned char* b, const size_t n) {
 235     return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
 236 }
 237 
 238 void restart_state(jl_maker* m, FILE* w, FILE* r) {
 239     m->in = r;
 240     m->ilen = 0;
 241     m->ipos = 0;
 242 
 243     m->out = w;
 244     m->opos = 0;
 245 
 246     m->line = 1;
 247     m->pos = 1;
 248 
 249     m->current = EOF;
 250     m->next = EOF;
 251 
 252     m->current = read_byte(m);
 253     if (m->current == EOF) {
 254         return;
 255     }
 256     m->next = read_byte(m);
 257 
 258     // skip leading UTF-8 BOM (byte-order mark), if present
 259     if (starts_with_bom(m->ibuf, m->ilen)) {
 260         // a UTF-8 BOM has 3 bytes
 261         for (size_t i = 0; i < 3 && m->current != EOF; i++) {
 262             advance(m);
 263         }
 264     }
 265 }
 266 
 267 void write_byte(jl_maker* m, unsigned char b) {
 268     if (m->opos < m->ocap) {
 269         m->obuf[m->opos++] = b;
 270         return;
 271     }
 272 
 273     fwrite(m->obuf, 1, m->ocap, m->out);
 274     m->obuf[0] = b;
 275     m->opos = 1;
 276 }
 277 
 278 // write_bytes does as it says, minimizing the number of calls to fwrite
 279 void write_bytes(jl_maker* m, const unsigned char* src, size_t len) {
 280     const size_t rem = m->ocap - m->opos;
 281     if (len < rem) {
 282         memcpy(m->obuf + m->opos, src, len);
 283         m->opos += len;
 284         return;
 285     }
 286 
 287     for (size_t i = 0; i < len; i++) {
 288         write_byte(m, src[i]);
 289     }
 290 }
 291 
 292 void flush(jl_maker* m) {
 293     if (m->opos > 0) {
 294         fwrite(m->obuf, 1, m->opos, m->out);
 295     }
 296     m->opos = 0;
 297     fflush(m->out);
 298 }
 299 
 300 // https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
 301 
 302 static inline bool check_2_byte_rune(int a, int b) {
 303     return (0xc2 <= a && a <= 0xdf) && (0x80 <= b && b <= 0xbf);
 304 }
 305 
 306 bool check_3_byte_rune(int a, int b, int c) {
 307     return (
 308         (a == 0xe0) &&
 309         (0xa0 <= b && b <= 0xbf) &&
 310         (0x80 <= c && c <= 0xbf)
 311     ) || (
 312         (0xe1 <= a && a <= 0xec) &&
 313         (0x80 <= b && b <= 0xbf) &&
 314         (0x80 <= c && c <= 0xbf)
 315     ) || (
 316         (a == 0xed) &&
 317         (0x80 <= b && b <= 0x9f) &&
 318         (0x80 <= c && c <= 0xbf)
 319     ) || (
 320         (a == 0xee || a == 0xef) &&
 321         (0x80 <= b && b <= 0xbf) &&
 322         (0x80 <= c && c <= 0xbf)
 323     );
 324 }
 325 
 326 bool check_4_byte_rune(int a, int b, int c, int d) {
 327     return (
 328         (a == 0xf0) &&
 329         (0x90 <= b && b <= 0xbf) &&
 330         (0x80 <= c && c <= 0xbf) &&
 331         (0x80 <= d && d <= 0xbf)
 332     ) || (
 333         (a == 0xf1 || a == 0xf3) &&
 334         (0x80 <= b && b <= 0xbf) &&
 335         (0x80 <= c && c <= 0xbf) &&
 336         (0x80 <= d && d <= 0xbf)
 337     ) || (
 338         (a == 0xf4) &&
 339         (0x80 <= b && b <= 0xbf) &&
 340         (0x80 <= c && c <= 0x8f) &&
 341         (0x80 <= d && d <= 0xbf)
 342     );
 343 }
 344 
 345 // write_replacement_char is the recommended action to handle invalid bytes
 346 void write_replacement_char(jl_maker* m) {
 347     write_byte(m, 0xef);
 348     write_byte(m, 0xbf);
 349     write_byte(m, 0xbd);
 350 }
 351 
 352 void handle_invalid_rune(jl_maker* m) {
 353     // fail(m, 1, "invalid unicode value");
 354     write_replacement_char(m);
 355 }
 356 
 357 // write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8
 358 void write_rune(jl_maker* m, uint32_t rune) {
 359     if (rune < (1 << 7)) {
 360         write_byte(m, rune);
 361         return;
 362     }
 363 
 364     if (rune < (1 << (5 + 6))) {
 365         const int a = 0b11000000 | (rune >> 6);
 366         const int b = 0b10000000 | (rune & 0b00111111);
 367         if (check_2_byte_rune(a, b)) {
 368             write_byte(m, a);
 369             write_byte(m, b);
 370         } else {
 371             write_replacement_char(m);
 372         }
 373         return;
 374     }
 375 
 376     if (rune < (1 << (4 + 6 + 6))) {
 377         const int a = 0b11100000 | (rune >> 12);
 378         const int b = 0b10000000 | ((rune >> 6) & 0b00111111);
 379         const int c = 0b10000000 | (rune & 0b00111111);
 380         if (check_3_byte_rune(a, b, c)) {
 381             write_byte(m, a);
 382             write_byte(m, b);
 383             write_byte(m, c);
 384         } else {
 385             write_replacement_char(m);
 386         }
 387         return;
 388     }
 389 
 390     if (rune < (1 << (3 + 6 + 6 + 6))) {
 391         const int a = 0b11110000 | (rune >> 18);
 392         const int b = 0b10000000 | ((rune >> 12) & 0b00111111);
 393         const int c = 0b10000000 | ((rune >> 6) & 0b00111111);
 394         const int d = 0b10000000 | (rune & 0b00111111);
 395         if (check_4_byte_rune(a, b, c, d)) {
 396             write_byte(m, a);
 397             write_byte(m, b);
 398             write_byte(m, c);
 399             write_byte(m, d);
 400         } else {
 401             write_replacement_char(m);
 402         }
 403         return;
 404     }
 405 
 406     write_replacement_char(m);
 407 }
 408 
 409 void copy_utf8_rune(jl_maker* m) {
 410     const int a = m->current;
 411 
 412     if (a == EOF) {
 413         return;
 414     }
 415 
 416     // handle 1-byte runes
 417     if (a < 128) {
 418         write_byte(m, a);
 419         return;
 420     }
 421 
 422     advance(m);
 423     const int b = m->current;
 424 
 425     if (b == EOF) {
 426         handle_invalid_rune(m);
 427         return;
 428     }
 429 
 430     // handle 2-byte runes
 431     if (check_2_byte_rune(a, b)) {
 432         write_byte(m, a);
 433         write_byte(m, b);
 434         return;
 435     }
 436 
 437     advance(m);
 438     const int c = m->current;
 439 
 440     if (c == EOF) {
 441         handle_invalid_rune(m);
 442         return;
 443     }
 444 
 445     // handle 3-byte runes
 446     if (check_3_byte_rune(a, b, c)) {
 447         write_byte(m, a);
 448         write_byte(m, b);
 449         write_byte(m, c);
 450         return;
 451     }
 452 
 453     advance(m);
 454     const int d = m->current;
 455 
 456     if (d == EOF) {
 457         handle_invalid_rune(m);
 458         return;
 459     }
 460 
 461     // handle 4-byte runes
 462     if (check_4_byte_rune(a, b, c, d)) {
 463         write_byte(m, a);
 464         write_byte(m, b);
 465         write_byte(m, c);
 466         write_byte(m, d);
 467         return;
 468     }
 469 
 470     handle_invalid_rune(m);
 471 }
 472 
 473 // debug is available to diagnose any bug found
 474 void debug(jl_maker* m, const char* fmt, ...) {
 475     va_list args;
 476     va_start(args, fmt);
 477 
 478     if (m->in != stdin) {
 479         fclose(m->in);
 480     }
 481 
 482     write_byte(m, '\n');
 483 
 484     const unsigned long line = m->line;
 485     const unsigned long pos = m->pos;
 486     fprintf(stderr, "\x1b[46m\x1b[37mline %lu, pos %lu: ", line, pos);
 487     fprintf(stderr, fmt, args);
 488     fprintf(stderr, "\x1b[0m\n");
 489 
 490     va_end(args);
 491 
 492     exit(10);
 493 }
 494 
 495 // fail quits this app right after showing the error message given
 496 void fail(jl_maker* m, int code, const char* msg) {
 497     const unsigned long line = m->line;
 498     const unsigned long pos = m->pos;
 499 
 500     write_byte(m, '\n');
 501     flush(m);
 502     fprintf(stderr, ERROR_LINE("line %lu, pos %lu: %s"), line, pos, msg);
 503     exit(code);
 504 }
 505 
 506 bool demand_keyword(jl_maker* m, char* rest) {
 507     for (; rest[0] != 0; rest++) {
 508         const int lead = m->current;
 509         if (lead == EOF || lead != rest[0]) {
 510             return false;
 511         }
 512         advance(m);
 513     }
 514 
 515     return rest[0] == 0;
 516 }
 517 
 518 void handle_null(jl_maker* m) {
 519     if (!demand_keyword(m, "null")) {
 520         fail(m, 1, "expected `null` keyword");
 521     }
 522     write_bytes(m, (unsigned char*)"null", 4);
 523 }
 524 
 525 void handle_true(jl_maker* m) {
 526     if (!demand_keyword(m, "true")) {
 527         fail(m, 1, "expected `true` keyword");
 528     }
 529     write_bytes(m, (unsigned char*)"true", 4);
 530 }
 531 
 532 void handle_false(jl_maker* m) {
 533     if (!demand_keyword(m, "false")) {
 534         fail(m, 1, "expected `false` keyword");
 535     }
 536     write_bytes(m, (unsigned char*)"false", 5);
 537 }
 538 
 539 void handle_capital_none(jl_maker* m) {
 540     if (!demand_keyword(m, "None")) {
 541         fail(m, 1, "expected `None` keyword");
 542     }
 543     write_bytes(m, (unsigned char*)"null", 4);
 544 }
 545 
 546 void handle_capital_true(jl_maker* m) {
 547     if (!demand_keyword(m, "True")) {
 548         fail(m, 1, "expected `True` keyword");
 549     }
 550     write_bytes(m, (unsigned char*)"true", 4);
 551 }
 552 
 553 void handle_capital_false(jl_maker* m) {
 554     if (!demand_keyword(m, "False")) {
 555         fail(m, 1, "expected `False` keyword");
 556     }
 557     write_bytes(m, (unsigned char*)"false", 5);
 558 }
 559 
 560 void handle_digits(jl_maker* m) {
 561     if (!isdigit(m->current)) {
 562         fail(m, 1, "expected/missing digits");
 563     }
 564 
 565     while (isdigit(m->current)) {
 566         write_byte(m, m->current);
 567         advance(m);
 568     }
 569 }
 570 
 571 void handle_number(jl_maker* m) {
 572     handle_digits(m);
 573 
 574     const int lead = m->current;
 575 
 576     if (lead == '.') {
 577         write_byte(m, '.');
 578         advance(m);
 579 
 580         if (isdigit(m->current)) {
 581             handle_digits(m);
 582         } else {
 583             write_byte(m, '0');
 584         }
 585         return;
 586     }
 587 
 588     if (lead == 'e' || lead == 'E') {
 589         write_byte(m, lead);
 590         advance(m);
 591 
 592         if (m->current == '+') {
 593             advance(m);
 594         } else if (m->current == '-') {
 595             write_byte(m, '-');
 596             advance(m);
 597         }
 598 
 599         handle_digits(m);
 600     }
 601 }
 602 
 603 void handle_dot(jl_maker* m) {
 604     write_byte(m, '0');
 605     write_byte(m, '.');
 606     advance(m);
 607 
 608     if (!isdigit(m->current)) {
 609         fail(m, 1, "expected/missing digits after decimal dot");
 610     }
 611     handle_digits(m);
 612 }
 613 
 614 void handle_plus_number(jl_maker* m) {
 615     advance(m);
 616 
 617     if (m->current == '.') {
 618         handle_dot(m);
 619         return;
 620     }
 621     handle_number(m);
 622 }
 623 
 624 void handle_minus_number(jl_maker* m) {
 625     write_byte(m, '-');
 626     advance(m);
 627 
 628     if (m->current == '.') {
 629         handle_dot(m);
 630         return;
 631     }
 632     handle_number(m);
 633 }
 634 
 635 // decode_hex assumes valid hex digits, checked by func is_valid_hex
 636 uint32_t decode_hex(unsigned char hex) {
 637     if ('0' <= hex && hex <= '9') {
 638         return hex - '0';
 639     }
 640     if ('A' <= hex && hex <= 'F') {
 641         return hex - 'A' + 10;
 642     }
 643     if ('a' <= hex && hex <= 'f') {
 644         return hex - 'a' + 10;
 645     }
 646     return 0xffff;
 647 }
 648 
 649 static inline bool is_valid_hex(unsigned char b) {
 650     return false ||
 651         ('0' <= b && b <= '9') ||
 652         ('A' <= b && b <= 'F') ||
 653         ('a' <= b && b <= 'f');
 654 }
 655 
 656 // handle_low_char ensures characters whose ASCII codes are lower than spaces
 657 // are properly escaped for strings
 658 void handle_low_char(jl_maker* m, int c) {
 659     const char* hex = "0123456789ABCDEF";
 660 
 661     switch (c) {
 662     case '\t':
 663         write_byte(m, '\\');
 664         write_byte(m, 't');
 665         break;
 666     case '\n':
 667         write_byte(m, '\\');
 668         write_byte(m, 'n');
 669         break;
 670     case '\r':
 671         write_byte(m, '\\');
 672         write_byte(m, 'r');
 673         break;
 674     case '\b':
 675         write_byte(m, '\\');
 676         write_byte(m, 'b');
 677         break;
 678     case '\f':
 679         write_byte(m, '\\');
 680         write_byte(m, 'f');
 681         break;
 682     case '\v':
 683         write_byte(m, '\\');
 684         write_byte(m, 'v');
 685         break;
 686     default:
 687         write_byte(m, '\\');
 688         write_byte(m, 'u');
 689         write_byte(m, '0');
 690         write_byte(m, '0');
 691         write_byte(m, hex[c / 16]);
 692         write_byte(m, hex[c % 16]);
 693         break;
 694     }
 695 }
 696 
 697 void write_inner_string_hex_quad(jl_maker* m, const unsigned char quad[4]) {
 698     const uint32_t n = 0 +
 699         (decode_hex(quad[0]) << 12) +
 700         (decode_hex(quad[1]) << 8) +
 701         (decode_hex(quad[2]) << 4) +
 702         (decode_hex(quad[3]) << 0);
 703 
 704     switch (n) {
 705     case '"':
 706         write_byte(m, '\\');
 707         write_byte(m, '"');
 708         return;
 709     case '\\':
 710         write_byte(m, '\\');
 711         write_byte(m, '\\');
 712         return;
 713     }
 714 
 715     if (n >= ' ') {
 716         write_rune(m, n);
 717     } else {
 718         handle_low_char(m, n);
 719     }
 720 }
 721 
 722 void handle_hex_quad(jl_maker* m) {
 723     unsigned char quad[4];
 724     for (size_t i = 0; i < 4; i++) {
 725         advance(m);
 726         const int lead = m->current;
 727         if (lead == EOF) {
 728             fail(m, 1, "end of input before end of string");
 729         }
 730         if (is_valid_hex(lead)) {
 731             quad[i] = lead;
 732             continue;
 733         }
 734         fail(m, 1, "invalid hexadecimal digit in string");
 735     }
 736 
 737     write_inner_string_hex_quad(m, quad);
 738 }
 739 
 740 void handle_hex_pair(jl_maker* m) {
 741     unsigned char quad[4] = {'0', '0', '0', '0'};
 742     advance(m);
 743     const int a = m->current;
 744     advance(m);
 745     const int b = m->current;
 746     if (a == EOF || b == EOF) {
 747         fail(m, 1, "end of input before end of string");
 748     }
 749     if (!is_valid_hex(a) || !is_valid_hex(b)) {
 750         fail(m, 1, "invalid hexadecimal digit in string");
 751     }
 752 
 753     quad[2] = a;
 754     quad[3] = b;
 755     write_inner_string_hex_quad(m, quad);
 756 }
 757 
 758 void handle_string_escape(jl_maker* m, int c) {
 759     switch (c) {
 760     case '"':
 761     case '\\':
 762     case 'b':
 763     case 'f':
 764     case 'n':
 765     case 'r':
 766     case 't':
 767         write_byte(m, '\\');
 768         write_byte(m, c);
 769         break;
 770     case 'u':
 771         handle_hex_quad(m);
 772         break;
 773     case 'x':
 774         handle_hex_pair(m);
 775         break;
 776     case '\'':
 777         write_byte(m, '\'');
 778         break;
 779     default:
 780         write_byte(m, m->current);
 781         break;
 782     }
 783 }
 784 
 785 void handle_string(jl_maker* m) {
 786     const unsigned char quote = m->current;
 787     bool escaped = false;
 788 
 789     write_byte(m, '"');
 790 
 791     while (true) {
 792         advance(m);
 793 
 794         int c = m->current;
 795         if (c == EOF) {
 796             fail(m, 1, "input ended before string was close-quoted");
 797         }
 798 
 799         if (escaped) {
 800             handle_string_escape(m, c);
 801             escaped = false;
 802             continue;
 803         }
 804 
 805         switch (c) {
 806         case '\\':
 807             escaped = true;
 808             break;
 809         default:
 810             if (c == quote) {
 811                 write_byte(m, '"');
 812                 advance(m);
 813                 return;
 814             }
 815 
 816             // write_byte(m, c);
 817             if (c < ' ') {
 818                 handle_low_char(m, c);
 819             } else {
 820                 copy_utf8_rune(m);
 821             }
 822             break;
 823         }
 824     }
 825 }
 826 
 827 void handle_token(jl_maker* m);
 828 
 829 void handle_array(jl_maker* m) {
 830     write_byte(m, '[');
 831     advance(m);
 832 
 833     for (size_t i = 0; true; i++) {
 834         seek_token(m);
 835         const int lead = m->current;
 836 
 837         if (lead == EOF) {
 838             fail(m, 1, "unclosed array");
 839         }
 840 
 841         if (lead == ',') {
 842             advance(m);
 843             continue;
 844         }
 845 
 846         if (lead == ']') {
 847             write_byte(m, ']');
 848             advance(m);
 849             return;
 850         }
 851 
 852         if (i > 0) {
 853             write_byte(m, ',');
 854 #ifndef JSON0
 855             write_byte(m, ' ');
 856 #endif
 857         }
 858         if (feof(m->out)) {
 859             return;
 860         }
 861         handle_token(m);
 862     }
 863 }
 864 
 865 // handle_array_jsonl is a slight variation of func handle_array: this one is
 866 // used to handle top-level arrays when running in JSON Lines mode, to emit
 867 // line-feeds after each item, instead of commas between them
 868 void handle_array_jsonl(jl_maker* m) {
 869     advance(m);
 870 
 871     while (true) {
 872         seek_token(m);
 873         const int lead = m->current;
 874 
 875         if (lead == EOF) {
 876             fail(m, 1, "unclosed array");
 877         }
 878 
 879         if (lead == ',') {
 880             advance(m);
 881             continue;
 882         }
 883 
 884         if (lead == ']') {
 885             advance(m);
 886             return;
 887         }
 888 
 889         if (feof(m->out)) {
 890             return;
 891         }
 892 
 893         handle_token(m);
 894         write_byte(m, '\n');
 895     }
 896 }
 897 
 898 void handle_unquoted_key(jl_maker* m) {
 899     write_byte(m, '"');
 900 
 901     while (true) {
 902         int c = m->current;
 903         if (c == EOF) {
 904             fail(m, 1, "input ended with an object key");
 905         }
 906 
 907         write_byte(m, c);
 908         advance(m);
 909 
 910         c = m->current;
 911         if (!isalpha(c) && !isdigit(c) && c != '_') {
 912             break;
 913         }
 914     }
 915 
 916     write_byte(m, '"');
 917 }
 918 
 919 void handle_object(jl_maker* m) {
 920     write_byte(m, '{');
 921     advance(m);
 922 
 923     for (size_t i = 0; true; i++) {
 924         seek_token(m);
 925         int lead = m->current;
 926 
 927         if (lead == EOF) {
 928             fail(m, 1, "unclosed object");
 929         }
 930 
 931         if (lead == ',') {
 932             advance(m);
 933             continue;
 934         }
 935 
 936         if (lead == '}') {
 937             write_byte(m, '}');
 938             advance(m);
 939             return;
 940         }
 941 
 942         if (feof(m->out)) {
 943             return;
 944         }
 945 
 946         if (lead == '"' || lead == '\'') {
 947             if (i > 0) {
 948                 write_byte(m, ',');
 949 #ifndef JSON0
 950                 write_byte(m, ' ');
 951 #endif
 952             }
 953             handle_string(m);
 954         } else if (isalpha(lead) || lead == '_') {
 955             if (i > 0) {
 956                 write_byte(m, ',');
 957 #ifndef JSON0
 958                 write_byte(m, ' ');
 959 #endif
 960             }
 961             handle_unquoted_key(m);
 962         } else {
 963             fail(m, 1, "only strings or identifiers can be object keys");
 964         }
 965 
 966         seek_token(m);
 967         lead = m->current;
 968 
 969         if (lead == EOF) {
 970             fail(m, 1, "input ended after object-key and before value");
 971         }
 972 
 973         if (lead != ':') {
 974             fail(m, 1, "a `:` must follow all object keys");
 975         }
 976 
 977         write_byte(m, ':');
 978 #ifndef JSON0
 979         write_byte(m, ' ');
 980 #endif
 981         advance(m);
 982 
 983         seek_token(m);
 984         if (m->current == EOF) {
 985             fail(m, 1, "input ended after a `:` following an object-key");
 986         }
 987 
 988         handle_token(m);
 989     }
 990 }
 991 
 992 // dispatch ties leading bytes/chars in tokens to the funcs which handle them
 993 void (*dispatch[256])() = {
 994     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 995     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 996     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 997     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 998     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 999     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1000     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1001     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1002     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1003     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1004     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1005     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1006     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1007     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1008     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1009     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1010     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1011     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1012     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1013     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1014     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1015     NULL, NULL, NULL, NULL,
1016 };
1017 
1018 void handle_token(jl_maker* m) {
1019     dispatch[m->current](m);
1020 }
1021 
1022 // handle_invalid_token shows an error message and quits the app right after
1023 void handle_invalid_token(jl_maker* m) {
1024     char msg[64];
1025     unsigned char c = (unsigned char)m->current;
1026     sprintf(msg, "%c (%d): invalid token", c, c);
1027     fail(m, 1, msg);
1028 }
1029 
1030 void handle_input(FILE* w, FILE* src) {
1031     unsigned char ibuf[IBUF_SIZE];
1032     unsigned char obuf[OBUF_SIZE];
1033 
1034     jl_maker m;
1035     m.ibuf = ibuf;
1036     m.icap = sizeof(ibuf);
1037     m.obuf = obuf;
1038     m.ocap = sizeof(obuf);
1039     restart_state(&m, w, src);
1040 
1041     // ignore leading whitespace/comment bytes, if present
1042     seek_token(&m);
1043 
1044     if (m.current == EOF) {
1045         fail(&m, 1, "empty input isn't valid JSON");
1046     }
1047 
1048     if (m.current == '[') {
1049         handle_array_jsonl(&m);
1050         flush(&m);
1051     } else {
1052         handle_token(&m);
1053         write_byte(&m, '\n');
1054         flush(&m);
1055     }
1056 
1057     // ignore trailing whitespace/comment bytes, if present
1058     seek_token(&m);
1059 
1060     // ignore trailing semicolon, if present
1061     if (m.current == ';') {
1062         advance(&m);
1063         // ignore trailing whitespace/comment bytes, if present
1064         seek_token(&m);
1065     }
1066 
1067     if (!feof(src) || m.current != EOF) {
1068         fail(&m, 1, "unexpected trailing JSON data");
1069     }
1070 }
1071 
// is_help_option checks if the string given is one of the help options,
// which can start with either a single or a double dash
bool is_help_option(const char* s) {
    static const char* const names[] = {"-h", "--h", "-help", "--help"};

    for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
        if (strcmp(s, names[i]) == 0) {
            return true;
        }
    }
    return false;
}
1080 
// handle_file converts the file whose name is given into JSON Lines; it
// returns false only when the file can't be opened, after showing an error
// message about it
bool handle_file(FILE* w, const char* path) {
    FILE* f = fopen(path, "rb");

    if (f != NULL) {
        handle_input(w, f);
        fclose(f);
        return true;
    }

    fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
    return false;
}
1094 
// run handles all the inputs named by the args given, in order, returning
// the number of files which couldn't be opened: a single dash stands for
// standard input, which is also what's used when no names are given; a
// leading `--` arg is ignored, and no more inputs are started once writing
// to standard output has failed
int run(int nargs, char** args) {
    if (nargs > 0 && strcmp(args[0], "--") == 0) {
        nargs--;
        args++;
    }

    // use stdin when not given any filepaths
    if (nargs < 1) {
        handle_input(stdout, stdin);
        return 0;
    }

    int errors = 0;

    // output streams never have their end-of-file indicators set, so the
    // error indicator is what tells when to stop early
    for (int i = 0; i < nargs && !ferror(stdout); i++) {
        if (strcmp(args[i], "-") == 0) {
            handle_input(stdout, stdin);
            continue;
        }

        if (!handle_file(stdout, args[i])) {
            errors++;
        }
    }

    return errors;
}
1121 
1122 int main(int argc, char** argv) {
1123 #ifdef _WIN32
1124     setmode(fileno(stdin), O_BINARY);
1125     // ensure output lines end in LF instead of CRLF on windows
1126     setmode(fileno(stdout), O_BINARY);
1127     setmode(fileno(stderr), O_BINARY);
1128 #endif
1129 
1130     if (argc > 1 && is_help_option(argv[1])) {
1131         printf("%s", info);
1132         return 0;
1133     }
1134 
1135     // the dispatch table starts as all null function-pointers
1136     for (size_t i = 0; i < sizeof(dispatch) / sizeof(dispatch[0]); i++) {
1137         dispatch[i] = handle_invalid_token;
1138     }
1139 
1140     for (size_t i = '0'; i <= '9'; i++) {
1141         dispatch[i] = handle_number;
1142     }
1143 
1144     dispatch['n'] = handle_null;
1145     dispatch['t'] = handle_true;
1146     dispatch['f'] = handle_false;
1147     dispatch['N'] = handle_capital_none;
1148     dispatch['T'] = handle_capital_true;
1149     dispatch['F'] = handle_capital_false;
1150     dispatch['.'] = handle_dot;
1151     dispatch['+'] = handle_plus_number;
1152     dispatch['-'] = handle_minus_number;
1153     dispatch['"'] = handle_string;
1154     dispatch['\''] = handle_string;
1155     dispatch['['] = handle_array;
1156     dispatch['{'] = handle_object;
1157 
1158     return run(argc - 1, argv + 1) == 0 ? 0 : 1;
1159 }