// File: jsonl.c
/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O3 -march=native -mtune=native -flto -o ./jsonl ./jsonl.c
*/

#include <ctype.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

// pick exactly one error style, so the macro is never defined twice
#ifdef RED_ERRORS
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define RESET_STYLE "\x1b[0m"
#else
#define ERROR_STYLE
#define RESET_STYLE
#endif

#define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")

#ifndef IBUF_SIZE
#define IBUF_SIZE (32 * 1024)
#endif

#ifndef OBUF_SIZE
#define OBUF_SIZE (8 * 1024)
#endif

// #define JSON0

const char* info = ""
"jsonl [options...] [files...]\n"
"\n"
"\n"
"JSON Lines converts/fixes JSON/pseudo-JSON input into lines of text, each\n"
"with valid JSON in it. Multiple lines are emitted when the top-level value\n"
"is an array, while a single line is emitted for any other top-level type.\n"
"\n"
"Besides splitting top-level items into a line-streamable format, this tool\n"
"also adapts almost-JSON input into valid JSON, since it\n"
"\n"
" - ignores both rest-of-line and multi-line comments\n"
" - ignores extra/trailing commas in arrays and objects\n"
" - turns single-quoted strings/keys into double-quoted strings\n"
" - double-quotes unquoted object keys\n"
" - changes \\x 2-hex-digit into \\u 4-hex-digit string-escapes\n"
"\n"
"All options available can either start with a single or a double-dash\n"
"\n"
" -h show this help message\n"
" -help show this help message\n"
"";

// jl_maker holds all the state needed to convert one input stream: buffered
// input/output, the position used in error messages, and a 2-byte window
// (current/next) into the input
typedef struct jl_maker {
    FILE* in;
    FILE* out;

    unsigned char* ibuf;
    size_t ilen; // how many bytes are being used in the input buffer
    size_t icap; // the input buffer's capacity
    size_t ipos; // the current position in the input buffer

    size_t line; // the current line, used to show useful error messages
    size_t pos;  // the position in the current line, for error messages

    unsigned char* obuf;
    size_t ocap; // the output buffer's capacity
    size_t opos; // the current position in the output buffer

    int current; // the byte being handled, or EOF
    int next;    // the byte right after the current one, or EOF
} jl_maker;

// advance_reader_pos helps func read_byte do its job
static inline void advance_reader_pos(jl_maker* r, unsigned char b) {
    r->ipos++;
    if (b == '\n') {
        r->line++;
        r->pos = 1;
    } else {
        r->pos++;
    }
}

// read_byte does as it says: check its return for the value EOF, before
// using it as the next byte
static inline int read_byte(jl_maker* r) {
    if (r->ipos < r->ilen) {
        // inside current chunk
        const unsigned char b = r->ibuf[r->ipos];
        advance_reader_pos(r, b);
        return b;
    }

    // need to read the next block
    r->ipos = 0;
    r->ilen = fread(r->ibuf, sizeof(unsigned char), r->icap, r->in);
    if (r->ilen > 0) {
        const unsigned char b = r->ibuf[r->ipos];
        advance_reader_pos(r, b);
        return b;
    }

    // reached the end of data
    return EOF;
}

// advance is used in most of the code, instead of calling read_byte directly
static inline void advance(jl_maker* r) {
    r->current = r->next;
    r->next = read_byte(r);
}

void fail(jl_maker* m, int code, const char* msg);

// skip_line moves past the rest of the current line, line-feed included
void skip_line(jl_maker* r) {
    while (true) {
        advance(r);
        const int lead = r->current;

        if (lead == EOF) {
            break;
        }

        if (lead == '\n') {
            advance(r);
            break;
        }
    }
}

// skip_multiline_comment moves past the closing `*/`, or stops at the end
// of input; it's entered with the opening `*` as the current byte
void skip_multiline_comment(jl_maker* r) {
    unsigned char prev = 0;

    while (true) {
        advance(r);
        const int lead = r->current;

        if (lead == EOF) {
            break;
        }

        if (prev == '*' && lead == '/') {
            advance(r);
            break;
        }

        prev = (unsigned char)lead;
    }
}

// skip_comment handles `#...`, `//...`, and `/*...*/` comments, failing on
// anything else
void skip_comment(jl_maker* r) {
    int lead = r->current;

    if (lead == '#') {
        skip_line(r);
        return;
    }

    if (lead != '/') {
        fail(r, 1, "expected a slash to start comments");
    }

    advance(r);
    lead = r->current;

    if (lead == '/') {
        skip_line(r);
        return;
    }

    if (lead == '*') {
        skip_multiline_comment(r);
        return;
    }

    fail(r, 1, "expected `//` or `/*` to start comments");
}

// seek_token skips whitespace/control bytes and comments, stopping at the
// first byte of the next token, or at the end of input
static inline void seek_token(jl_maker* r) {
    while (true) {
        const int lead = r->current;

        if (lead != EOF && lead <= ' ') {
            advance(r);
            continue;
        }

        if (lead == '/' || lead == '#') {
            skip_comment(r);
            continue;
        }

        break;
    }
}

// starts_with_bom checks if the bytes given start with a UTF-8 BOM
bool starts_with_bom(const unsigned char* b, const size_t n) {
    return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}

// restart_state gets the converter ready for a new input: it resets all
// positions, primes the current/next bytes, and skips a leading UTF-8 BOM
void restart_state(jl_maker* m, FILE* w, FILE* r) {
    m->in = r;
    m->ilen = 0;
    m->ipos = 0;

    m->out = w;
    m->opos = 0;

    m->line = 1;
    m->pos = 1;

    m->current = EOF;
    m->next = EOF;

    m->current = read_byte(m);
    if (m->current == EOF) {
        return;
    }
    m->next = read_byte(m);

    // skip leading UTF-8 BOM (byte-order mark), if present
    if (starts_with_bom(m->ibuf, m->ilen)) {
        // a UTF-8 BOM has 3 bytes
        for (size_t i = 0; i < 3 && m->current != EOF; i++) {
            advance(m);
        }
    }
}

// write_byte appends a byte to the output buffer, emitting the buffer first
// when it's full
void write_byte(jl_maker* m, unsigned char b) {
    if (m->opos < m->ocap) {
        m->obuf[m->opos++] = b;
        return;
    }

    fwrite(m->obuf, 1, m->ocap, m->out);
    m->obuf[0] = b;
    m->opos = 1;
}

// write_bytes does as it says, minimizing the number of calls to fwrite
void write_bytes(jl_maker* m, const unsigned char* src, size_t len) {
    const size_t rem = m->ocap - m->opos;
    if (len < rem) {
        memcpy(m->obuf + m->opos, src, len);
        m->opos += len;
        return;
    }

    for (size_t i = 0; i < len; i++) {
        write_byte(m, src[i]);
    }
}

// flush emits any bytes still in the output buffer, then flushes the stream
void flush(jl_maker* m) {
    if (m->opos > 0) {
        fwrite(m->obuf, 1, m->opos, m->out);
    }
    m->opos = 0;
    fflush(m->out);
}

// https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/

// check_2_byte_rune checks bytes for a well-formed 2-byte UTF-8 sequence
static inline bool check_2_byte_rune(int a, int b) {
    return (0xc2 <= a && a <= 0xdf) && (0x80 <= b && b <= 0xbf);
}

// check_3_byte_rune checks bytes for a well-formed 3-byte UTF-8 sequence:
// this rejects both overlong encodings and surrogates
bool check_3_byte_rune(int a, int b, int c) {
    if (c < 0x80 || c > 0xbf) {
        return false;
    }

    if (a == 0xe0) {
        return 0xa0 <= b && b <= 0xbf;
    }
    if (a == 0xed) {
        return 0x80 <= b && b <= 0x9f;
    }
    if ((0xe1 <= a && a <= 0xec) || a == 0xee || a == 0xef) {
        return 0x80 <= b && b <= 0xbf;
    }
    return false;
}

// check_4_byte_rune checks bytes for a well-formed 4-byte UTF-8 sequence:
// lead bytes 0xf1, 0xf2, and 0xf3 allow any continuation as their 2nd byte,
// while 0xf0 and 0xf4 restrict it, to reject overlong encodings and values
// beyond U+10FFFF, respectively
bool check_4_byte_rune(int a, int b, int c, int d) {
    if (c < 0x80 || c > 0xbf || d < 0x80 || d > 0xbf) {
        return false;
    }

    if (a == 0xf0) {
        return 0x90 <= b && b <= 0xbf;
    }
    if (0xf1 <= a && a <= 0xf3) {
        return 0x80 <= b && b <= 0xbf;
    }
    if (a == 0xf4) {
        return 0x80 <= b && b <= 0x8f;
    }
    return false;
}

// write_replacement_char is the recommended action to handle invalid bytes
void write_replacement_char(jl_maker* m) {
    write_byte(m, 0xef);
    write_byte(m, 0xbf);
    write_byte(m, 0xbd);
}

// handle_invalid_rune emits U+FFFD instead of failing on invalid UTF-8
void handle_invalid_rune(jl_maker* m) {
    // fail(m, 1, "invalid unicode value");
    write_replacement_char(m);
}

// write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8
// and emits U+FFFD for values which can't be validly UTF-8 encoded
void write_rune(jl_maker* m, uint32_t rune) {
    if (rune < 0x80u) {
        write_byte(m, (unsigned char)rune);
        return;
    }

    if (rune < 0x800u) {
        const int a = (int)(0xc0u | (rune >> 6));
        const int b = (int)(0x80u | (rune & 0x3fu));
        if (check_2_byte_rune(a, b)) {
            write_byte(m, (unsigned char)a);
            write_byte(m, (unsigned char)b);
        } else {
            write_replacement_char(m);
        }
        return;
    }

    if (rune < 0x10000u) {
        const int a = (int)(0xe0u | (rune >> 12));
        const int b = (int)(0x80u | ((rune >> 6) & 0x3fu));
        const int c = (int)(0x80u | (rune & 0x3fu));
        if (check_3_byte_rune(a, b, c)) {
            write_byte(m, (unsigned char)a);
            write_byte(m, (unsigned char)b);
            write_byte(m, (unsigned char)c);
        } else {
            write_replacement_char(m);
        }
        return;
    }

    if (rune < 0x200000u) {
        const int a = (int)(0xf0u | (rune >> 18));
        const int b = (int)(0x80u | ((rune >> 12) & 0x3fu));
        const int c = (int)(0x80u | ((rune >> 6) & 0x3fu));
        const int d = (int)(0x80u | (rune & 0x3fu));
        if (check_4_byte_rune(a, b, c, d)) {
            write_byte(m, (unsigned char)a);
            write_byte(m, (unsigned char)b);
            write_byte(m, (unsigned char)c);
            write_byte(m, (unsigned char)d);
        } else {
            write_replacement_char(m);
        }
        return;
    }

    write_replacement_char(m);
}

// copy_utf8_rune copies the UTF-8 sequence starting at the current byte,
// emitting U+FFFD instead when it's invalid; when done, the current byte is
// the last one consumed. Continuation bytes are checked via the look-ahead
// byte before being consumed, so a byte following an invalid/truncated
// sequence (a closing quote, for example) is never swallowed.
void copy_utf8_rune(jl_maker* m) {
    const int lead = m->current;

    if (lead == EOF) {
        return;
    }

    // handle 1-byte runes
    if (lead < 0x80) {
        write_byte(m, (unsigned char)lead);
        return;
    }

    // the leading byte determines how many bytes the sequence must have
    size_t len = 0;
    if (0xc2 <= lead && lead <= 0xdf) {
        len = 2;
    } else if (0xe0 <= lead && lead <= 0xef) {
        len = 3;
    } else if (0xf0 <= lead && lead <= 0xf4) {
        len = 4;
    } else {
        // stray continuation byte, or a byte which can't start a sequence
        handle_invalid_rune(m);
        return;
    }

    int seq[4] = {lead, 0, 0, 0};
    for (size_t i = 1; i < len; i++) {
        const int peek = m->next;
        // EOF is negative, so it's rejected by this range-check too
        if (peek < 0x80 || peek > 0xbf) {
            handle_invalid_rune(m);
            return;
        }
        advance(m);
        seq[i] = peek;
    }

    bool valid = false;
    switch (len) {
    case 2:
        valid = check_2_byte_rune(seq[0], seq[1]);
        break;
    case 3:
        valid = check_3_byte_rune(seq[0], seq[1], seq[2]);
        break;
    default:
        valid = check_4_byte_rune(seq[0], seq[1], seq[2], seq[3]);
        break;
    }

    if (!valid) {
        handle_invalid_rune(m);
        return;
    }

    for (size_t i = 0; i < len; i++) {
        write_byte(m, (unsigned char)seq[i]);
    }
}

// debug is available to diagnose any bug found: it shows the printf-style
// message given, along with the current input position, then quits the app
void debug(jl_maker* m, const char* fmt, ...) {
    va_list args;
    va_start(args, fmt);

    if (m->in != stdin) {
        fclose(m->in);
    }

    write_byte(m, '\n');
    flush(m);

    const unsigned long line = m->line;
    const unsigned long pos = m->pos;
    fprintf(stderr, "\x1b[46m\x1b[37mline %lu, pos %lu: ", line, pos);
    // a va_list can only be given to the v-variants of the printf funcs
    vfprintf(stderr, fmt, args);
    fprintf(stderr, "\x1b[0m\n");

    va_end(args);

    exit(10);
}

// fail quits this app right after showing the error message given
void fail(jl_maker* m, int code, const char* msg) {
    const unsigned long line = m->line;
    const unsigned long pos = m->pos;

    write_byte(m, '\n');
    flush(m);
    fprintf(stderr, ERROR_LINE("line %lu, pos %lu: %s"), line, pos, msg);
    exit(code);
}

// demand_keyword consumes input as long as it matches the string given, and
// returns whether the whole string was matched
bool demand_keyword(jl_maker* m, const char* rest) {
    for (; rest[0] != 0; rest++) {
        const int lead = m->current;
        if (lead == EOF || lead != rest[0]) {
            return false;
        }
        advance(m);
    }

    return rest[0] == 0;
}

// handle_null copies a `null` keyword
void handle_null(jl_maker* m) {
    if (!demand_keyword(m, "null")) {
        fail(m, 1, "expected `null` keyword");
    }
    write_bytes(m, (const unsigned char*)"null", 4);
}

// handle_true copies a `true` keyword
void handle_true(jl_maker* m) {
    if (!demand_keyword(m, "true")) {
        fail(m, 1, "expected `true` keyword");
    }
    write_bytes(m, (const unsigned char*)"true", 4);
}

// handle_false copies a `false` keyword
void handle_false(jl_maker* m) {
    if (!demand_keyword(m, "false")) {
        fail(m, 1, "expected `false` keyword");
    }
    write_bytes(m, (const unsigned char*)"false", 5);
}

// handle_capital_none turns a `None` keyword into `null`
void handle_capital_none(jl_maker* m) {
    if (!demand_keyword(m, "None")) {
        fail(m, 1, "expected `None` keyword");
    }
    write_bytes(m, (const unsigned char*)"null", 4);
}

// handle_capital_true turns a `True` keyword into `true`
void handle_capital_true(jl_maker* m) {
    if (!demand_keyword(m, "True")) {
        fail(m, 1, "expected `True` keyword");
    }
    write_bytes(m, (const unsigned char*)"true", 4);
}

// handle_capital_false turns a `False` keyword into `false`
void handle_capital_false(jl_maker* m) {
    if (!demand_keyword(m, "False")) {
        fail(m, 1, "expected `False` keyword");
    }
    write_bytes(m, (const unsigned char*)"false", 5);
}

// handle_digits copies a run of 1 or more decimal digits, failing when
// there are none
void handle_digits(jl_maker* m) {
    if (!isdigit(m->current)) {
        fail(m, 1, "expected/missing digits");
    }

    while (isdigit(m->current)) {
        write_byte(m, (unsigned char)m->current);
        advance(m);
    }
}

// handle_exponent copies the optional exponent part of a number: a leading
// `+` is dropped from the exponent, as it's redundant
static void handle_exponent(jl_maker* m) {
    const int lead = m->current;
    if (lead != 'e' && lead != 'E') {
        return;
    }

    write_byte(m, (unsigned char)lead);
    advance(m);

    if (m->current == '+') {
        advance(m);
    } else if (m->current == '-') {
        write_byte(m, '-');
        advance(m);
    }

    handle_digits(m);
}

// handle_number copies an unsigned number: integer digits, then an optional
// fraction, then an optional exponent; redundant leading zeros are dropped,
// and a trailing decimal dot gets a 0 after it, as JSON requires
void handle_number(jl_maker* m) {
    // JSON doesn't allow leading zeros in multi-digit integer parts
    while (m->current == '0' && isdigit(m->next)) {
        advance(m);
    }

    handle_digits(m);

    if (m->current == '.') {
        write_byte(m, '.');
        advance(m);

        if (isdigit(m->current)) {
            handle_digits(m);
        } else {
            write_byte(m, '0');
        }
    }

    handle_exponent(m);
}

// handle_dot handles numbers starting with a decimal dot, emitting the
// leading 0 which JSON requires
void handle_dot(jl_maker* m) {
    write_byte(m, '0');
    write_byte(m, '.');
    advance(m);

    if (!isdigit(m->current)) {
        fail(m, 1, "expected/missing digits after decimal dot");
    }
    handle_digits(m);
    handle_exponent(m);
}

// handle_plus_number handles numbers starting with a plus sign, which isn't
// emitted, as JSON doesn't allow it
void handle_plus_number(jl_maker* m) {
    advance(m);

    if (m->current == '.') {
        handle_dot(m);
        return;
    }
    handle_number(m);
}

// handle_minus_number handles negative numbers
void handle_minus_number(jl_maker* m) {
    write_byte(m, '-');
    advance(m);

    if (m->current == '.') {
        handle_dot(m);
        return;
    }
    handle_number(m);
}

// decode_hex assumes valid hex digits, checked by func is_valid_hex
uint32_t decode_hex(unsigned char hex) {
    if ('0' <= hex && hex <= '9') {
        return hex - '0';
    }
    if ('A' <= hex && hex <= 'F') {
        return hex - 'A' + 10;
    }
    if ('a' <= hex && hex <= 'f') {
        return hex - 'a' + 10;
    }
    return 0xffff;
}

// is_valid_hex checks if the byte given is an ASCII hexadecimal digit
static inline bool is_valid_hex(unsigned char b) {
    return false ||
        ('0' <= b && b <= '9') ||
        ('A' <= b && b <= 'F') ||
        ('a' <= b && b <= 'f');
}

// handle_low_char ensures characters whose ASCII codes are lower than spaces
// are properly escaped for strings: only the short escapes JSON defines are
// used, which is why vertical tabs are emitted as \u000B
void handle_low_char(jl_maker* m, int c) {
    const char* hex = "0123456789ABCDEF";

    switch (c) {
    case '\t':
        write_byte(m, '\\');
        write_byte(m, 't');
        break;
    case '\n':
        write_byte(m, '\\');
        write_byte(m, 'n');
        break;
    case '\r':
        write_byte(m, '\\');
        write_byte(m, 'r');
        break;
    case '\b':
        write_byte(m, '\\');
        write_byte(m, 'b');
        break;
    case '\f':
        write_byte(m, '\\');
        write_byte(m, 'f');
        break;
    default:
        write_byte(m, '\\');
        write_byte(m, 'u');
        write_byte(m, '0');
        write_byte(m, '0');
        write_byte(m, (unsigned char)hex[c / 16]);
        write_byte(m, (unsigned char)hex[c % 16]);
        break;
    }
}

// write_inner_string_hex_quad emits the string content for the 4 hex digits
// of a \u escape: quotes, backslashes, and control characters stay escaped,
// surrogates are kept as the \u escapes they came from (they can't be UTF-8
// encoded on their own), and anything else is emitted as UTF-8
void write_inner_string_hex_quad(jl_maker* m, const unsigned char quad[4]) {
    const uint32_t n = 0 +
        (decode_hex(quad[0]) << 12) +
        (decode_hex(quad[1]) << 8) +
        (decode_hex(quad[2]) << 4) +
        (decode_hex(quad[3]) << 0);

    switch (n) {
    case '"':
        write_byte(m, '\\');
        write_byte(m, '"');
        return;
    case '\\':
        write_byte(m, '\\');
        write_byte(m, '\\');
        return;
    default:
        break;
    }

    if (0xd800 <= n && n <= 0xdfff) {
        write_byte(m, '\\');
        write_byte(m, 'u');
        write_bytes(m, quad, 4);
        return;
    }

    if (n >= ' ') {
        write_rune(m, n);
    } else {
        handle_low_char(m, (int)n);
    }
}

// handle_hex_quad handles the 4 hex digits following a \u in a string
void handle_hex_quad(jl_maker* m) {
    unsigned char quad[4];
    for (size_t i = 0; i < 4; i++) {
        advance(m);
        const int lead = m->current;
        if (lead == EOF) {
            fail(m, 1, "end of input before end of string");
        }
        if (!is_valid_hex((unsigned char)lead)) {
            fail(m, 1, "invalid hexadecimal digit in string");
        }
        quad[i] = (unsigned char)lead;
    }

    write_inner_string_hex_quad(m, quad);
}

// handle_hex_pair handles the 2 hex digits following a \x in a string
void handle_hex_pair(jl_maker* m) {
    unsigned char quad[4] = {'0', '0', '0', '0'};
    advance(m);
    const int a = m->current;
    advance(m);
    const int b = m->current;
    if (a == EOF || b == EOF) {
        fail(m, 1, "end of input before end of string");
    }
    if (!is_valid_hex((unsigned char)a) || !is_valid_hex((unsigned char)b)) {
        fail(m, 1, "invalid hexadecimal digit in string");
    }

    quad[2] = (unsigned char)a;
    quad[3] = (unsigned char)b;
    write_inner_string_hex_quad(m, quad);
}

// handle_string_escape handles the byte right after a backslash in a string;
// the byte given is also the current byte, and is never EOF
void handle_string_escape(jl_maker* m, int c) {
    switch (c) {
    case '"':
    case '\\':
    case 'b':
    case 'f':
    case 'n':
    case 'r':
    case 't':
        write_byte(m, '\\');
        write_byte(m, (unsigned char)c);
        break;
    case 'u':
        handle_hex_quad(m);
        break;
    case 'x':
        handle_hex_pair(m);
        break;
    case '\'':
        write_byte(m, '\'');
        break;
    default:
        // unknown escapes stand for the escaped symbol itself, which still
        // must end up as valid JSON string content
        if (c < ' ') {
            handle_low_char(m, c);
        } else {
            copy_utf8_rune(m);
        }
        break;
    }
}

// handle_string turns a single-quoted or double-quoted string into a valid
// double-quoted JSON string; it's entered with the opening quote as the
// current byte, and ends right after the closing quote
void handle_string(jl_maker* m) {
    const int quote = m->current;
    bool escaped = false;

    write_byte(m, '"');

    while (true) {
        advance(m);

        const int c = m->current;
        if (c == EOF) {
            fail(m, 1, "input ended before string was close-quoted");
        }

        if (escaped) {
            handle_string_escape(m, c);
            escaped = false;
            continue;
        }

        if (c == '\\') {
            escaped = true;
            continue;
        }

        if (c == quote) {
            write_byte(m, '"');
            advance(m);
            return;
        }

        if (c == '"') {
            // double quotes inside single-quoted strings need escaping
            write_byte(m, '\\');
            write_byte(m, '"');
            continue;
        }

        if (c < ' ') {
            handle_low_char(m, c);
        } else {
            copy_utf8_rune(m);
        }
    }
}

void handle_token(jl_maker* m);

// write_separator emits what goes between array items and between object
// key-value pairs
static void write_separator(jl_maker* m) {
    write_byte(m, ',');
#ifndef JSON0
    write_byte(m, ' ');
#endif
}

// handle_array handles (non-top-level) arrays, ignoring extra commas
void handle_array(jl_maker* m) {
    write_byte(m, '[');
    advance(m);

    // count only the items emitted: commas skipped don't count, so leading
    // commas can't cause a separator before the first item
    for (size_t items = 0; true; items++) {
        seek_token(m);
        while (m->current == ',') {
            advance(m);
            seek_token(m);
        }

        const int lead = m->current;

        if (lead == EOF) {
            fail(m, 1, "unclosed array");
        }

        if (lead == ']') {
            write_byte(m, ']');
            advance(m);
            return;
        }

        if (items > 0) {
            write_separator(m);
        }

        // NOTE(review): feof on an output stream; presumably meant to detect
        // a closed output, so confirm whether ferror was intended instead
        if (feof(m->out)) {
            return;
        }

        handle_token(m);
    }
}

// handle_array_jsonl is a slight variation of func handle_array: this one is
// used to handle top-level arrays when running in JSON Lines mode, to emit
// line-feeds after each item, instead of commas between them
void handle_array_jsonl(jl_maker* m) {
    advance(m);

    while (true) {
        seek_token(m);
        const int lead = m->current;

        if (lead == EOF) {
            fail(m, 1, "unclosed array");
        }

        if (lead == ',') {
            advance(m);
            continue;
        }

        if (lead == ']') {
            advance(m);
            return;
        }

        // NOTE(review): feof on an output stream; see func handle_array
        if (feof(m->out)) {
            return;
        }

        handle_token(m);
        write_byte(m, '\n');
    }
}

// handle_unquoted_key double-quotes an identifier-like object key: the
// current byte is always copied, followed by any letters, digits, and
// underscores right after it
void handle_unquoted_key(jl_maker* m) {
    write_byte(m, '"');

    while (true) {
        int c = m->current;
        if (c == EOF) {
            fail(m, 1, "input ended with an object key");
        }

        write_byte(m, (unsigned char)c);
        advance(m);

        c = m->current;
        if (!isalpha(c) && !isdigit(c) && c != '_') {
            break;
        }
    }

    write_byte(m, '"');
}

// handle_object handles objects, ignoring extra commas, and double-quoting
// keys which are either single-quoted or unquoted
void handle_object(jl_maker* m) {
    write_byte(m, '{');
    advance(m);

    // count only the pairs emitted: commas skipped don't count, so leading
    // commas can't cause a separator before the first pair
    for (size_t pairs = 0; true; pairs++) {
        seek_token(m);
        while (m->current == ',') {
            advance(m);
            seek_token(m);
        }

        int lead = m->current;

        if (lead == EOF) {
            fail(m, 1, "unclosed object");
        }

        if (lead == '}') {
            write_byte(m, '}');
            advance(m);
            return;
        }

        // NOTE(review): feof on an output stream; see func handle_array
        if (feof(m->out)) {
            return;
        }

        const bool quoted = (lead == '"' || lead == '\'');
        if (!quoted && !isalpha(lead) && lead != '_') {
            fail(m, 1, "only strings or identifiers can be object keys");
        }

        if (pairs > 0) {
            write_separator(m);
        }

        if (quoted) {
            handle_string(m);
        } else {
            handle_unquoted_key(m);
        }

        seek_token(m);
        lead = m->current;

        if (lead == EOF) {
            fail(m, 1, "input ended after object-key and before value");
        }

        if (lead != ':') {
            fail(m, 1, "a `:` must follow all object keys");
        }

        write_byte(m, ':');
#ifndef JSON0
        write_byte(m, ' ');
#endif
        advance(m);

        seek_token(m);
        if (m->current == EOF) {
            fail(m, 1, "input ended after a `:` following an object-key");
        }

        handle_token(m);
    }
}

// dispatch ties leading bytes/chars in tokens to the funcs which handle them:
// all its entries start as null, and func main sets them up
void (*dispatch[256])(jl_maker*) = {NULL};

// handle_invalid_token shows an error message and quits the app right after
void handle_invalid_token(jl_maker* m) {
    char msg[64];
    const unsigned char c = (unsigned char)m->current;
    snprintf(msg, sizeof(msg), "%c (%d): invalid token", c, c);
    fail(m, 1, msg);
}

// handle_token handles the value starting at the current byte: anything
// which can't index the dispatch table (EOF), or has no handler set up, is
// handled as an invalid token
void handle_token(jl_maker* m) {
    const int lead = m->current;
    if (lead < 0 || lead > 255 || dispatch[lead] == NULL) {
        handle_invalid_token(m);
        return;
    }
    dispatch[lead](m);
}

// handle_input converts a whole input stream, which must have exactly 1
// top-level value in it, optionally followed by a semicolon
void handle_input(FILE* w, FILE* src) {
    unsigned char ibuf[IBUF_SIZE];
    unsigned char obuf[OBUF_SIZE];

    jl_maker m;
    m.ibuf = ibuf;
    m.icap = sizeof(ibuf);
    m.obuf = obuf;
    m.ocap = sizeof(obuf);
    restart_state(&m, w, src);

    // ignore leading whitespace/comment bytes, if present
    seek_token(&m);

    if (m.current == EOF) {
        fail(&m, 1, "empty input isn't valid JSON");
    }

    if (m.current == '[') {
        handle_array_jsonl(&m);
        flush(&m);
    } else {
        handle_token(&m);
        write_byte(&m, '\n');
        flush(&m);
    }

    // ignore trailing whitespace/comment bytes, if present
    seek_token(&m);

    // ignore trailing semicolon, if present
    if (m.current == ';') {
        advance(&m);
        // ignore trailing whitespace/comment bytes, if present
        seek_token(&m);
    }

    if (!feof(src) || m.current != EOF) {
        fail(&m, 1, "unexpected trailing JSON data");
    }
}

// is_help_option checks if the string given is one of the help options
bool is_help_option(const char* s) {
    return (s[0] == '-' && s[1] != 0) && (
        strcmp(s, "-h") == 0 ||
        strcmp(s, "--h") == 0 ||
        strcmp(s, "-help") == 0 ||
        strcmp(s, "--help") == 0
    );
}

// handle_file handles data from the filename given; returns false only when
// the file can't be opened
bool handle_file(FILE* w, const char* path) {
    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
        return false;
    }

    handle_input(w, f);
    fclose(f);
    return true;
}

// run returns the number of errors; a single dash stands for standard input,
// which is also used when no filepaths are given
int run(int nargs, char** args) {
    if (nargs > 0 && strcmp(args[0], "--") == 0) {
        nargs--;
        args++;
    }

    int errors = 0;
    for (int i = 0; i < nargs && !feof(stdout); i++) {
        if (args[i][0] == '-' && args[i][1] == 0) {
            handle_input(stdout, stdin);
            continue;
        }

        if (!handle_file(stdout, args[i])) {
            errors++;
        }
    }

    // use stdin when not given any filepaths
    if (nargs < 1) {
        handle_input(stdout, stdin);
    }

    return errors;
}

int main(int argc, char** argv) {
#ifdef _WIN32
    setmode(fileno(stdin), O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    setmode(fileno(stdout), O_BINARY);
    setmode(fileno(stderr), O_BINARY);
#endif

    if (argc > 1 && is_help_option(argv[1])) {
        printf("%s", info);
        return 0;
    }

    // the dispatch table starts as all null function-pointers
    for (size_t i = 0; i < sizeof(dispatch) / sizeof(dispatch[0]); i++) {
        dispatch[i] = handle_invalid_token;
    }

    for (size_t i = '0'; i <= '9'; i++) {
        dispatch[i] = handle_number;
    }

    dispatch['n'] = handle_null;
    dispatch['t'] = handle_true;
    dispatch['f'] = handle_false;
    dispatch['N'] = handle_capital_none;
    dispatch['T'] = handle_capital_true;
    dispatch['F'] = handle_capital_false;
    dispatch['.'] = handle_dot;
    dispatch['+'] = handle_plus_number;
    dispatch['-'] = handle_minus_number;
    dispatch['"'] = handle_string;
    dispatch['\''] = handle_string;
    dispatch['['] = handle_array;
    dispatch['{'] = handle_object;

    return run(argc - 1, argv + 1) == 0 ? 0 : 1;
}