/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O3 -march=native -mtune=native -flto -o ./nj ./nj.c

Building with COMPACT_OUTPUT defined makes `nj` output many fewer bytes, at
the cost of using arguably worse colors. You can do that by running

cc -s -O3 -march=native -mtune=native -flto -D COMPACT_OUTPUT -o ./nj ./nj.c

Building for macos always uses COMPACT_OUTPUT, as the default terminal app
there still doesn't support rgb colors.
*/

#include <ctype.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h> // ssize_t isn't declared by any of the ISO C headers

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

// ERROR_LINE wraps a printf-style format string so error messages end with a
// line-feed, and are optionally styled in red when RED_ERRORS is defined
#ifdef RED_ERRORS
// each platform defines ERROR_STYLE exactly once, to avoid redefining macros
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define ERROR_LINE(MSG) (ERROR_STYLE MSG "\x1b[0m\n")
#else
#define ERROR_LINE(MSG) (MSG "\n")
#endif

#ifdef __APPLE__
#define COMPACT_OUTPUT
#endif

// IBUF_SIZE is the capacity of the input buffer, in bytes
#ifndef IBUF_SIZE
#define IBUF_SIZE (32 * 1024)
#endif

// OBUF_SIZE is the capacity of the output buffer, in bytes
#ifndef OBUF_SIZE
#define OBUF_SIZE (8 * 1024)
#endif

// INDENTATION is how many spaces are emitted for each nesting level
#ifndef INDENTATION
#define INDENTATION 2
#endif

// CONST_SLICE initializes a slice struct using the string-constant given
#define CONST_SLICE(s, x) init_slice((s), (unsigned char*)(x), sizeof(x) - 1)

// EMIT_CONST emits string constants without their final null byte
#define EMIT_CONST(w, x) \
    write_bytes((w), (const unsigned char*)(x), sizeof(x) - 1)

#define RESET_STYLE "\x1b[0m"

#ifdef COMPACT_OUTPUT
// #define NULL_STYLE "\x1b[37m"
#define NULL_STYLE "\x1b[38;5;248m"
#define BOOL_STYLE "\x1b[36m"
#define NUMBER_STYLE "\x1b[32m"
#define NEGATIVE_STYLE "\x1b[31m"
#define KEY_STYLE "\x1b[35m"
// #define SYNTAX_STYLE "\x1b[37m"
#define SYNTAX_STYLE "\x1b[38;5;248m"
// #define NULL_STYLE "\x1b[38;5;248m"
// #define BOOL_STYLE "\x1b[38;5;74m"
// #define NUMBER_STYLE "\x1b[38;5;29m"
// #define NEGATIVE_STYLE "\x1b[38;5;1m"
// #define KEY_STYLE "\x1b[38;5;99m"
// #define SYNTAX_STYLE "\x1b[38;5;248m"
#else
#define NULL_STYLE "\x1b[38;2;168;168;168m"
#define BOOL_STYLE "\x1b[38;2;95;175;215m"
#define NUMBER_STYLE "\x1b[38;2;0;135;95m"
#define NEGATIVE_STYLE "\x1b[38;2;204;0;0m"
#define KEY_STYLE "\x1b[38;2;135;95;255m"
#define SYNTAX_STYLE "\x1b[38;2;168;168;168m"
#endif

// info is the help message shown by the help options
const char* info = ""
"nj [options...] [file...]\n"
"\n"
"\n"
"Nice Json converts/fixes JSON/pseudo-JSON input into ANSI-styled multi-line\n"
"JSON which uses 2 spaces for each indentation level.\n"
"\n"
"Besides styling and indenting JSON, this tool also adapts almost-JSON input\n"
"into valid JSON, since it\n"
"\n"
" - ignores both rest-of-line and multi-line comments\n"
" - ignores extra/trailing commas in arrays and objects\n"
" - turns single-quoted strings/keys into double-quoted strings\n"
" - double-quotes unquoted object keys\n"
" - changes \\x 2-hex-digit into \\u 4-hex-digit string-escapes\n"
"\n"
"All options available can either start with a single or a double-dash\n"
"\n"
" -h show this help message\n"
" -help show this help message\n"
"";

// slice is a pointer to bytes, along with how many of them there are
typedef struct slice {
    unsigned char* ptr;
    size_t len;
} slice;

// init_slice sets both fields of the slice given
static inline void init_slice(slice* s, unsigned char* ptr, size_t len) {
    s->ptr = ptr;
    s->len = len;
}

// nj_maker has all the state needed to read, convert, and emit a JSON value
typedef struct nj_maker {
    FILE* in;
    FILE* out;

    unsigned char* ibuf;
    size_t ilen; // how many bytes are being used in the input buffer
    size_t icap; // the input buffer's capacity
    size_t ipos; // the current position in the input buffer

    size_t line; // the current line, used to show useful error messages
    size_t pos; // the position in the current line, for error messages

    unsigned char* obuf;
    size_t ocap; // the output buffer's capacity
    size_t opos; // the current position in the output buffer

    ssize_t level; // the current indentation/nesting level

    int current; // the byte being handled, or EOF
    int next; // the byte after the current one (1-byte lookahead), or EOF
} nj_maker;

// advance_reader_pos helps func read_byte do its job: it moves past the byte
// given, keeping the line/position counters used for error messages updated
static inline void advance_reader_pos(nj_maker* r, unsigned char b) {
    r->ipos++;
    if (b == '\n') {
        r->line++;
        r->pos = 1;
    } else {
        r->pos++;
    }
}

// read_byte does as it says: check its return for the value EOF, before
// using it as the next byte
int read_byte(nj_maker* r) {
    if (r->ipos >= r->ilen) {
        // the current chunk is all used up: try to load the next one
        r->ipos = 0;
        r->ilen = fread(r->ibuf, sizeof(unsigned char), r->icap, r->in);
        if (r->ilen == 0) {
            // reached the end of data
            return EOF;
        }
    }

    const unsigned char b = r->ibuf[r->ipos];
    advance_reader_pos(r, b);
    return b;
}

// advance is used in most of the code, instead of calling read_byte directly:
// it shifts the 1-byte lookahead into the current byte, then reads a new one
static inline void advance(nj_maker* r) {
    r->current = r->next;
    r->next = read_byte(r);
}

void fail(nj_maker* m, int code, const char* msg);

// skip_line ignores all bytes up to and including the next line-feed, or
// until the input ends
void skip_line(nj_maker* r) {
    while (true) {
        advance(r);
        const int lead = r->current;

        if (lead == EOF) {
            break;
        }

        if (lead == '\n') {
            advance(r);
            break;
        }
    }
}

// skip_multiline_comment ignores all bytes up to and including the next `*/`,
// or until the input ends; it expects the current byte to be the `*` which
// started the comment, which is why that byte can't also end the comment
void skip_multiline_comment(nj_maker* r) {
    unsigned char prev = 0;

    while (true) {
        advance(r);
        const int lead = r->current;

        if (lead == EOF) {
            break;
        }

        if (prev == '*' && lead == '/') {
            advance(r);
            break;
        }

        prev = (unsigned char)lead;
    }
}

// skip_comment ignores a whole comment, which can start with `#`, `//`, or
// `/*`: anything else is an error which quits the app
void skip_comment(nj_maker* r) {
    int lead = r->current;

    if (lead == '#') {
        skip_line(r);
        return;
    }

    if (lead != '/') {
        fail(r, 1, "expected a slash to start comments");
    }

    advance(r);
    lead = r->current;

    if (lead == '/') {
        skip_line(r);
        return;
    }

    if (lead == '*') {
        skip_multiline_comment(r);
        return;
    }

    fail(r, 1, "expected `//` or `/*` to start comments");
}

// seek_token skips whitespace/control bytes and comments, stopping at the
// first byte of the next token, or at the end of the input
static inline void seek_token(nj_maker* r) {
    while (true) {
        const int lead = r->current;

        if (lead != EOF && lead <= ' ') {
            advance(r);
            continue;
        }

        if (lead == '/' || lead == '#') {
            skip_comment(r);
            continue;
        }

        break;
    }
}

// starts_with_bom checks if the bytes given start with a UTF-8 byte-order mark
bool starts_with_bom(const unsigned char* b, const size_t n) {
    return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}

// restart_state prepares the state given to convert input from `r` into
// output for `w`: it also reads the first 2 bytes, and skips a leading UTF-8
// BOM, if present; the input/output buffers must already be set up
void restart_state(nj_maker* m, FILE* w, FILE* r) {
    m->in = r;
    m->ilen = 0;
    m->ipos = 0;

    m->out = w;
    m->opos = 0;

    m->line = 1;
    m->pos = 1;

    // the nesting level is set before any early return, so it's never left
    // uninitialized, even when the input turns out to be empty
    m->level = 0;

    m->current = EOF;
    m->next = EOF;

    m->current = read_byte(m);
    if (m->current == EOF) {
        return;
    }
    m->next = read_byte(m);

    // skip leading UTF-8 BOM (byte-order mark), if present
    if (starts_with_bom(m->ibuf, m->ilen)) {
        // a UTF-8 BOM has 3 bytes
        for (size_t i = 0; i < 3 && m->current != EOF; i++) {
            advance(m);
        }
    }
}

// write_byte adds a byte to the output buffer, emitting the buffer's content
// first when it's already full
void write_byte(nj_maker* m, unsigned char b) {
    if (m->opos < m->ocap) {
        m->obuf[m->opos++] = b;
        return;
    }

    fwrite(m->obuf, 1, m->ocap, m->out);
    m->obuf[0] = b;
    m->opos = 1;
}

// write_bytes does as it says, minimizing the number of calls to fwrite
void write_bytes(nj_maker* m, const unsigned char* src, size_t len) {
    if (len == 0) {
        // nothing to do: this also avoids giving null pointers to memcpy,
        // since empty slices can have those
        return;
    }

    const size_t rem = m->ocap - m->opos;
    if (len < rem) {
        memcpy(m->obuf + m->opos, src, len);
        m->opos += len;
        return;
    }

    for (size_t i = 0; i < len; i++) {
        write_byte(m, src[i]);
    }
}

// write_spaces emits the number of spaces given: non-positive counts emit
// nothing
void write_spaces(nj_maker* m, ssize_t n) {
    if (n <= 0) {
        return;
    }

    unsigned char spaces[32];
    memset(spaces, ' ', sizeof(spaces));

    // an unsigned counter avoids comparing signed and unsigned values
    size_t left = (size_t)n;
    while (left > sizeof(spaces)) {
        write_bytes(m, spaces, sizeof(spaces));
        left -= sizeof(spaces);
    }
    write_bytes(m, spaces, left);
}

// indent emits the leading spaces for the current nesting level
static inline void indent(nj_maker* m) {
    write_spaces(m, INDENTATION * m->level);
}

// flush emits all bytes still in the output buffer, and flushes the output
void flush(nj_maker* m) {
    if (m->opos > 0) {
        fwrite(m->obuf, 1, m->opos, m->out);
    }
    m->opos = 0;
    fflush(m->out);
}

// https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/

// check_2_byte_rune checks if the bytes given are a well-formed 2-byte
// UTF-8 sequence
static inline bool check_2_byte_rune(int a, int b) {
    return (0xc2 <= a && a <= 0xdf) && (0x80 <= b && b <= 0xbf);
}

// check_3_byte_rune checks if the bytes given are a well-formed 3-byte
// UTF-8 sequence, which excludes overlong encodings and surrogates
bool check_3_byte_rune(int a, int b, int c) {
    if (c < 0x80 || c > 0xbf) {
        return false;
    }

    if (a == 0xe0) {
        // lower 2nd bytes would be overlong encodings
        return 0xa0 <= b && b <= 0xbf;
    }
    if (a == 0xed) {
        // higher 2nd bytes would encode surrogates
        return 0x80 <= b && b <= 0x9f;
    }
    if (0xe1 <= a && a <= 0xef) {
        return 0x80 <= b && b <= 0xbf;
    }
    return false;
}

// check_4_byte_rune checks if the bytes given are a well-formed 4-byte
// UTF-8 sequence, which excludes overlong encodings and values past U+10FFFF
bool check_4_byte_rune(int a, int b, int c, int d) {
    if (c < 0x80 || c > 0xbf || d < 0x80 || d > 0xbf) {
        return false;
    }

    if (a == 0xf0) {
        // lower 2nd bytes would be overlong encodings
        return 0x90 <= b && b <= 0xbf;
    }
    if (0xf1 <= a && a <= 0xf3) {
        return 0x80 <= b && b <= 0xbf;
    }
    if (a == 0xf4) {
        // higher 2nd bytes would encode values past U+10FFFF
        return 0x80 <= b && b <= 0x8f;
    }
    return false;
}

// write_replacement_char is the recommended action to handle invalid bytes
void write_replacement_char(nj_maker* m) {
    write_byte(m, 0xef);
    write_byte(m, 0xbf);
    write_byte(m, 0xbd);
}

// handle_invalid_rune emits the unicode replacement char, instead of failing
void handle_invalid_rune(nj_maker* m) {
    // fail(m, 1, "invalid unicode value");
    write_replacement_char(m);
}

// write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8:
// values which can't be encoded are emitted as the replacement char instead
void write_rune(nj_maker* m, uint32_t rune) {
    if (rune < 0x80) {
        write_byte(m, (unsigned char)rune);
        return;
    }

    if (rune < 0x800) {
        const int a = 0xc0 | (int)(rune >> 6);
        const int b = 0x80 | (int)(rune & 0x3f);
        if (check_2_byte_rune(a, b)) {
            write_byte(m, a);
            write_byte(m, b);
        } else {
            write_replacement_char(m);
        }
        return;
    }

    if (rune < 0x10000) {
        const int a = 0xe0 | (int)(rune >> 12);
        const int b = 0x80 | (int)((rune >> 6) & 0x3f);
        const int c = 0x80 | (int)(rune & 0x3f);
        if (check_3_byte_rune(a, b, c)) {
            write_byte(m, a);
            write_byte(m, b);
            write_byte(m, c);
        } else {
            write_replacement_char(m);
        }
        return;
    }

    if (rune < 0x200000) {
        const int a = 0xf0 | (int)(rune >> 18);
        const int b = 0x80 | (int)((rune >> 12) & 0x3f);
        const int c = 0x80 | (int)((rune >> 6) & 0x3f);
        const int d = 0x80 | (int)(rune & 0x3f);
        if (check_4_byte_rune(a, b, c, d)) {
            write_byte(m, a);
            write_byte(m, b);
            write_byte(m, c);
            write_byte(m, d);
        } else {
            write_replacement_char(m);
        }
        return;
    }

    write_replacement_char(m);
}

// copy_utf8_rune copies the UTF-8 sequence starting at the current byte,
// emitting the replacement char instead when that sequence is invalid; when
// it's done, the current byte is the last one it used.
//
// Continuation bytes are checked using the 1-byte lookahead before being
// consumed, so invalid/truncated sequences can't swallow the bytes following
// them, such as the quote closing a string.
void copy_utf8_rune(nj_maker* m) {
    const int lead = m->current;

    if (lead == EOF) {
        return;
    }

    // handle 1-byte runes
    if (lead < 0x80) {
        write_byte(m, (unsigned char)lead);
        return;
    }

    // the leading byte determines how many continuation bytes must follow
    size_t trailing = 0;
    if (0xc2 <= lead && lead <= 0xdf) {
        trailing = 1;
    } else if (0xe0 <= lead && lead <= 0xef) {
        trailing = 2;
    } else if (0xf0 <= lead && lead <= 0xf4) {
        trailing = 3;
    } else {
        // stray continuation byte, or a byte which can never start a rune
        handle_invalid_rune(m);
        return;
    }

    int seq[4] = {lead, 0, 0, 0};
    for (size_t i = 1; i <= trailing; i++) {
        const int peek = m->next;
        if (peek == EOF || peek < 0x80 || peek > 0xbf) {
            // leave the offending byte for the caller to handle
            handle_invalid_rune(m);
            return;
        }
        advance(m);
        seq[i] = m->current;
    }

    bool valid = false;
    switch (trailing) {
    case 1:
        valid = check_2_byte_rune(seq[0], seq[1]);
        break;
    case 2:
        valid = check_3_byte_rune(seq[0], seq[1], seq[2]);
        break;
    default:
        valid = check_4_byte_rune(seq[0], seq[1], seq[2], seq[3]);
        break;
    }

    if (!valid) {
        handle_invalid_rune(m);
        return;
    }

    for (size_t i = 0; i <= trailing; i++) {
        write_byte(m, (unsigned char)seq[i]);
    }
}

// debug is available to diagnose any bug found: it shows the printf-style
// message given on stderr, along with the current input position, and quits
// the app with code 10
void debug(nj_maker* m, const char* fmt, ...) {
    va_list args;
    va_start(args, fmt);

    if (m->in != stdin) {
        fclose(m->in);
    }

    // emit all output so far, the same way func fail does
    write_byte(m, '\n');
    flush(m);

    const unsigned long line = m->line;
    const unsigned long pos = m->pos;
    fprintf(stderr, "\x1b[46m\x1b[37mline %lu, pos %lu: ", line, pos);
    // variadic arguments can only be forwarded using the v-variant of fprintf
    vfprintf(stderr, fmt, args);
    fprintf(stderr, "\x1b[0m\n");

    va_end(args);

    exit(10);
}

// fail quits this app right after showing the error message given: all
// output so far is emitted first, followed by a line-feed
void fail(nj_maker* m, int code, const char* msg) {
    const unsigned long line = m->line;
    const unsigned long pos = m->pos;

    write_byte(m, '\n');
    flush(m);
    fprintf(stderr, ERROR_LINE("line %lu, pos %lu: %s"), line, pos, msg);
    exit(code);
}

// demand_keyword checks if the input has the bytes given, starting from the
// current byte: bytes are consumed as long as they match
bool demand_keyword(nj_maker* m, const char* rest) {
    for (; rest[0] != 0; rest++) {
        // EOF never matches, since it's not a valid byte value
        if (m->current != rest[0]) {
            return false;
        }
        advance(m);
    }

    return true;
}

// handle_null handles the `null` keyword
void handle_null(nj_maker* m) {
    if (!demand_keyword(m, "null")) {
        fail(m, 1, "expected `null` keyword");
    }
    EMIT_CONST(m, "null");
}

// handle_true handles the `true` keyword
void handle_true(nj_maker* m) {
    if (!demand_keyword(m, "true")) {
        fail(m, 1, "expected `true` keyword");
    }
    EMIT_CONST(m, "true");
}

// handle_false handles the `false` keyword
void handle_false(nj_maker* m) {
    if (!demand_keyword(m, "false")) {
        fail(m, 1, "expected `false` keyword");
    }
    EMIT_CONST(m, "false");
}

// handle_capital_none turns the `None` keyword into a JSON null
void handle_capital_none(nj_maker* m) {
    if (!demand_keyword(m, "None")) {
        fail(m, 1, "expected `None` keyword");
    }
    EMIT_CONST(m, "null");
}

// handle_capital_true turns the `True` keyword into a JSON true
void handle_capital_true(nj_maker* m) {
    if (!demand_keyword(m, "True")) {
        fail(m, 1, "expected `True` keyword");
    }
    EMIT_CONST(m, "true");
}

// handle_capital_false turns the `False` keyword into a JSON false
void handle_capital_false(nj_maker* m) {
    if (!demand_keyword(m, "False")) {
        fail(m, 1, "expected `False` keyword");
    }
    EMIT_CONST(m, "false");
}

void handle_digits(nj_maker* m) { 632 if (!isdigit(m->current)) { 633 fail(m, 1, "expected/missing digits"); 634 } 635 636 while (isdigit(m->current)) { 637 write_byte(m, m->current); 638 advance(m); 639 } 640 } 641 642 void handle_number(nj_maker* m) { 643 handle_digits(m); 644 645 const int lead = m->current; 646 647 if (lead == '.') { 648 write_byte(m, '.'); 649 advance(m); 650 651 if (isdigit(m->current)) { 652 handle_digits(m); 653 } else { 654 write_byte(m, '0'); 655 } 656 return; 657 } 658 659 if (lead == 'e' || lead == 'E') { 660 write_byte(m, lead); 661 advance(m); 662 663 if (m->current == '+') { 664 advance(m); 665 } else if (m->current == '-') { 666 write_byte(m, '-'); 667 advance(m); 668 } 669 670 handle_digits(m); 671 } 672 } 673 674 void handle_dot(nj_maker* m) { 675 write_byte(m, '0'); 676 write_byte(m, '.'); 677 advance(m); 678 679 if (!isdigit(m->current)) { 680 fail(m, 1, "expected/missing digits after decimal dot"); 681 } 682 handle_digits(m); 683 } 684 685 void handle_plus_number(nj_maker* m) { 686 advance(m); 687 688 if (m->current == '.') { 689 handle_dot(m); 690 return; 691 } 692 handle_number(m); 693 } 694 695 void handle_minus_number(nj_maker* m) { 696 write_byte(m, '-'); 697 advance(m); 698 699 if (m->current == '.') { 700 handle_dot(m); 701 return; 702 } 703 handle_number(m); 704 } 705 706 // decode_hex assumes valid hex digits, checked by func is_valid_hex 707 uint32_t decode_hex(unsigned char hex) { 708 if ('0' <= hex && hex <= '9') { 709 return hex - '0'; 710 } 711 if ('A' <= hex && hex <= 'F') { 712 return hex - 'A' + 10; 713 } 714 if ('a' <= hex && hex <= 'f') { 715 return hex - 'a' + 10; 716 } 717 return 0xffff; 718 } 719 720 static inline bool is_valid_hex(unsigned char b) { 721 return false || 722 ('0' <= b && b <= '9') || 723 ('A' <= b && b <= 'F') || 724 ('a' <= b && b <= 'f'); 725 } 726 727 // handle_low_char ensures characters whose ASCII codes are lower than spaces 728 // are properly escaped for strings 729 void 
handle_low_char(nj_maker* m, int c) { 730 const char* hex = "0123456789ABCDEF"; 731 732 switch (c) { 733 case '\t': 734 write_byte(m, '\\'); 735 write_byte(m, 't'); 736 break; 737 case '\n': 738 write_byte(m, '\\'); 739 write_byte(m, 'n'); 740 break; 741 case '\r': 742 write_byte(m, '\\'); 743 write_byte(m, 'r'); 744 break; 745 case '\b': 746 write_byte(m, '\\'); 747 write_byte(m, 'b'); 748 break; 749 case '\f': 750 write_byte(m, '\\'); 751 write_byte(m, 'f'); 752 break; 753 case '\v': 754 write_byte(m, '\\'); 755 write_byte(m, 'v'); 756 break; 757 default: 758 write_byte(m, '\\'); 759 write_byte(m, 'u'); 760 write_byte(m, '0'); 761 write_byte(m, '0'); 762 write_byte(m, hex[c / 16]); 763 write_byte(m, hex[c % 16]); 764 break; 765 } 766 } 767 768 void write_inner_string_hex_quad(nj_maker* m, const unsigned char quad[4]) { 769 const uint32_t n = 0 + 770 (decode_hex(quad[0]) << 12) + 771 (decode_hex(quad[1]) << 8) + 772 (decode_hex(quad[2]) << 4) + 773 (decode_hex(quad[3]) << 0); 774 775 switch (n) { 776 case '"': 777 write_byte(m, '\\'); 778 write_byte(m, '"'); 779 return; 780 case '\\': 781 write_byte(m, '\\'); 782 write_byte(m, '\\'); 783 return; 784 } 785 786 if (n >= ' ') { 787 write_rune(m, n); 788 } else { 789 handle_low_char(m, n); 790 } 791 } 792 793 void handle_hex_quad(nj_maker* m) { 794 unsigned char quad[4]; 795 for (size_t i = 0; i < 4; i++) { 796 advance(m); 797 const int lead = m->current; 798 if (lead == EOF) { 799 fail(m, 1, "end of input before end of string"); 800 } 801 if (is_valid_hex(lead)) { 802 quad[i] = lead; 803 continue; 804 } 805 fail(m, 1, "invalid hexadecimal digit in string"); 806 } 807 808 write_inner_string_hex_quad(m, quad); 809 } 810 811 void handle_hex_pair(nj_maker* m) { 812 unsigned char quad[4] = {'0', '0', '0', '0'}; 813 advance(m); 814 const int a = m->current; 815 advance(m); 816 const int b = m->current; 817 if (a == EOF || b == EOF) { 818 fail(m, 1, "end of input before end of string"); 819 } 820 if (!is_valid_hex(a) || 
!is_valid_hex(b)) { 821 fail(m, 1, "invalid hexadecimal digit in string"); 822 } 823 824 quad[2] = a; 825 quad[3] = b; 826 write_inner_string_hex_quad(m, quad); 827 } 828 829 void handle_string_escape(nj_maker* m, int c) { 830 switch (c) { 831 case '"': 832 case '\\': 833 case 'b': 834 case 'f': 835 case 'n': 836 case 'r': 837 case 't': 838 write_byte(m, '\\'); 839 write_byte(m, c); 840 break; 841 case 'u': 842 handle_hex_quad(m); 843 break; 844 case 'x': 845 handle_hex_pair(m); 846 break; 847 case '\'': 848 write_byte(m, '\''); 849 break; 850 default: 851 write_byte(m, m->current); 852 break; 853 } 854 } 855 856 ssize_t handle_inner_string(nj_maker* m) { 857 const unsigned char quote = m->current; 858 bool escaped = false; 859 860 for (size_t i = 0; true; i++) { 861 advance(m); 862 863 int c = m->current; 864 if (c == EOF) { 865 fail(m, 1, "input ended before string was close-quoted"); 866 } 867 868 if (escaped) { 869 handle_string_escape(m, c); 870 escaped = false; 871 continue; 872 } 873 874 switch (c) { 875 case '\\': 876 escaped = true; 877 break; 878 default: 879 if (c == quote) { 880 advance(m); 881 return i; 882 } 883 884 // write_byte(m, c); 885 if (c < ' ') { 886 handle_low_char(m, c); 887 } else { 888 copy_utf8_rune(m); 889 } 890 break; 891 } 892 } 893 } 894 895 void handle_quoted_key(nj_maker* m) { 896 if (m->current != m->next) { 897 EMIT_CONST(m, "\"" KEY_STYLE); 898 handle_inner_string(m); 899 EMIT_CONST(m, SYNTAX_STYLE "\""); 900 } else { 901 write_byte(m, '"'); 902 handle_inner_string(m); 903 write_byte(m, '"'); 904 } 905 } 906 907 void handle_string(nj_maker* m) { 908 if (m->current != m->next) { 909 EMIT_CONST(m, "\"" RESET_STYLE); 910 handle_inner_string(m); 911 EMIT_CONST(m, SYNTAX_STYLE "\""); 912 } else { 913 write_byte(m, '"'); 914 handle_inner_string(m); 915 write_byte(m, '"'); 916 } 917 } 918 919 void handle_token(nj_maker* m, ssize_t lead_level); 920 921 void handle_array(nj_maker* m) { 922 m->level++; 923 write_byte(m, '['); 924 
advance(m); 925 926 for (size_t i = 0; true; i++) { 927 seek_token(m); 928 const int lead = m->current; 929 930 if (lead == EOF) { 931 fail(m, 1, "unclosed array"); 932 } 933 934 if (lead == ',') { 935 advance(m); 936 continue; 937 } 938 939 if (lead == ']') { 940 m->level--; 941 if (i > 0) { 942 write_byte(m, '\n'); 943 indent(m); 944 EMIT_CONST(m, SYNTAX_STYLE "]"); 945 } else { 946 write_byte(m, ']'); 947 } 948 advance(m); 949 return; 950 } 951 952 if (i > 0) { 953 EMIT_CONST(m, SYNTAX_STYLE ","); 954 } 955 write_byte(m, '\n'); 956 if (feof(m->out)) { 957 return; 958 } 959 handle_token(m, m->level); 960 } 961 } 962 963 void handle_unquoted_key(nj_maker* m) { 964 EMIT_CONST(m, SYNTAX_STYLE "\"" KEY_STYLE); 965 966 while (true) { 967 int c = m->current; 968 if (c == EOF) { 969 fail(m, 1, "input ended with an object key"); 970 } 971 972 write_byte(m, c); 973 advance(m); 974 975 c = m->current; 976 if (!isalpha(c) && !isdigit(c) && c != '_') { 977 break; 978 } 979 } 980 981 EMIT_CONST(m, SYNTAX_STYLE "\""); 982 } 983 984 void handle_object(nj_maker* m) { 985 m->level++; 986 write_byte(m, '{'); 987 advance(m); 988 989 for (size_t i = 0; true; i++) { 990 seek_token(m); 991 int lead = m->current; 992 993 if (lead == EOF) { 994 fail(m, 1, "unclosed object"); 995 } 996 997 if (lead == ',') { 998 advance(m); 999 continue; 1000 } 1001 1002 if (lead == '}') { 1003 m->level--; 1004 if (i > 0) { 1005 write_byte(m, '\n'); 1006 indent(m); 1007 EMIT_CONST(m, SYNTAX_STYLE "}"); 1008 } else { 1009 write_byte(m, '}'); 1010 } 1011 advance(m); 1012 return; 1013 } 1014 1015 if (feof(m->out)) { 1016 return; 1017 } 1018 1019 if (lead == '"' || lead == '\'') { 1020 if (i > 0) { 1021 EMIT_CONST(m, SYNTAX_STYLE ","); 1022 } 1023 write_byte(m, '\n'); 1024 indent(m); 1025 EMIT_CONST(m, SYNTAX_STYLE); 1026 handle_quoted_key(m); 1027 } else if (isalpha(lead) || lead == '_') { 1028 if (i > 0) { 1029 EMIT_CONST(m, SYNTAX_STYLE ","); 1030 } 1031 write_byte(m, '\n'); 1032 indent(m); 1033 
handle_unquoted_key(m); 1034 } else { 1035 fail(m, 1, "only strings or identifiers can be object keys"); 1036 } 1037 1038 seek_token(m); 1039 lead = m->current; 1040 1041 if (lead == EOF) { 1042 fail(m, 1, "input ended after object-key and before value"); 1043 } 1044 1045 if (lead != ':') { 1046 fail(m, 1, "a `:` must follow all object keys"); 1047 } 1048 1049 EMIT_CONST(m, ": "); 1050 advance(m); 1051 1052 seek_token(m); 1053 if (m->current == EOF) { 1054 fail(m, 1, "input ended after a `:` following an object-key"); 1055 } 1056 1057 handle_token(m, 0); 1058 } 1059 } 1060 1061 // styles ties leading bytes/chars in tokens to their leading ANSI styles 1062 slice styles[256] = {}; 1063 1064 // dispatch ties leading bytes/chars in tokens to the funcs which handle them 1065 void (*dispatch[256])() = {}; 1066 1067 void handle_token(nj_maker* m, ssize_t lead_level) { 1068 const unsigned char b = m->current; 1069 write_spaces(m, INDENTATION * lead_level); 1070 write_bytes(m, styles[b].ptr, styles[b].len); 1071 dispatch[b](m); 1072 } 1073 1074 // handle_invalid_token shows an error message and quits the app right after 1075 void handle_invalid_token(nj_maker* m) { 1076 char msg[64]; 1077 unsigned char c = (unsigned char)m->current; 1078 sprintf(msg, "%c (%d): invalid token", c, c); 1079 fail(m, 1, msg); 1080 } 1081 1082 void handle_input(FILE* src) { 1083 unsigned char ibuf[IBUF_SIZE]; 1084 unsigned char obuf[OBUF_SIZE]; 1085 1086 nj_maker m; 1087 m.ibuf = ibuf; 1088 m.icap = sizeof(ibuf); 1089 m.obuf = obuf; 1090 m.ocap = sizeof(obuf); 1091 restart_state(&m, stdout, src); 1092 1093 // ignore leading whitespace/comment bytes, if present 1094 seek_token(&m); 1095 1096 if (m.current == EOF) { 1097 fail(&m, 1, "empty input isn't valid JSON"); 1098 } 1099 1100 handle_token(&m, 0); 1101 EMIT_CONST(&m, RESET_STYLE); 1102 write_byte(&m, '\n'); 1103 flush(&m); 1104 1105 // ignore trailing whitespace/comment bytes, if present 1106 seek_token(&m); 1107 1108 // ignore trailing 
semicolon, if present 1109 if (m.current == ';') { 1110 advance(&m); 1111 // ignore trailing whitespace/comment bytes, if present 1112 seek_token(&m); 1113 } 1114 1115 if (!feof(src) || m.current != EOF) { 1116 fail(&m, 1, "unexpected trailing JSON data"); 1117 } 1118 } 1119 1120 bool is_help_option(const char* s) { 1121 return (s[0] == '-' && s[1] != 0) && ( 1122 strcmp(s, "-h") == 0 || 1123 strcmp(s, "--h") == 0 || 1124 strcmp(s, "-help") == 0 || 1125 strcmp(s, "--help") == 0 1126 ); 1127 } 1128 1129 // run returns the error code 1130 int run(int nargs, char** args) { 1131 if (nargs > 0 && strcmp(args[0], "--") == 0) { 1132 nargs--; 1133 args++; 1134 } 1135 1136 if (nargs > 1) { 1137 const char* msg = "can't use more than 1 named input"; 1138 fprintf(stderr, ERROR_LINE("%s"), msg); 1139 return 1; 1140 } 1141 1142 // use stdin when not given a filepath 1143 if (nargs == 0 || strcmp(args[0], "") == 0 || strcmp(args[0], "-") == 0) { 1144 handle_input(stdin); 1145 return 0; 1146 } 1147 1148 const char* path = args[0]; 1149 FILE* f = fopen(path, "rb"); 1150 if (f == NULL) { 1151 fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path); 1152 return 1; 1153 } 1154 1155 handle_input(f); 1156 fclose(f); 1157 1158 return 0; 1159 } 1160 1161 int main(int argc, char** argv) { 1162 #ifdef _WIN32 1163 setmode(fileno(stdin), O_BINARY); 1164 // ensure output lines end in LF instead of CRLF on windows 1165 setmode(fileno(stdout), O_BINARY); 1166 setmode(fileno(stderr), O_BINARY); 1167 #endif 1168 1169 if (argc > 1 && is_help_option(argv[1])) { 1170 printf("%s", info); 1171 return 0; 1172 } 1173 1174 memset(dispatch, 0, sizeof(dispatch)); 1175 memset(styles, 0, sizeof(styles)); 1176 1177 // the dispatch table starts as all null function-pointers 1178 for (size_t i = 0; i < sizeof(dispatch) / sizeof(dispatch[0]); i++) { 1179 dispatch[i] = handle_invalid_token; 1180 } 1181 1182 for (size_t i = '0'; i <= '9'; i++) { 1183 dispatch[i] = handle_number; 1184 
CONST_SLICE(&styles[i], NUMBER_STYLE); 1185 } 1186 1187 dispatch['n'] = handle_null; 1188 dispatch['t'] = handle_true; 1189 dispatch['f'] = handle_false; 1190 dispatch['N'] = handle_capital_none; 1191 dispatch['T'] = handle_capital_true; 1192 dispatch['F'] = handle_capital_false; 1193 dispatch['.'] = handle_dot; 1194 dispatch['+'] = handle_plus_number; 1195 dispatch['-'] = handle_minus_number; 1196 dispatch['"'] = handle_string; 1197 dispatch['\''] = handle_string; 1198 dispatch['['] = handle_array; 1199 dispatch['{'] = handle_object; 1200 1201 CONST_SLICE(&styles['n'], NULL_STYLE); 1202 CONST_SLICE(&styles['t'], BOOL_STYLE); 1203 CONST_SLICE(&styles['f'], BOOL_STYLE); 1204 CONST_SLICE(&styles['N'], NULL_STYLE); 1205 CONST_SLICE(&styles['T'], BOOL_STYLE); 1206 CONST_SLICE(&styles['F'], BOOL_STYLE); 1207 CONST_SLICE(&styles['.'], NUMBER_STYLE); 1208 CONST_SLICE(&styles['+'], NUMBER_STYLE); 1209 // CONST_SLICE(&styles['-'], NUMBER_STYLE); 1210 CONST_SLICE(&styles['-'], NEGATIVE_STYLE); 1211 CONST_SLICE(&styles['"'], SYNTAX_STYLE); 1212 CONST_SLICE(&styles['\''], SYNTAX_STYLE); 1213 CONST_SLICE(&styles['['], SYNTAX_STYLE); 1214 CONST_SLICE(&styles['{'], SYNTAX_STYLE); 1215 1216 // enable full/block-buffering for standard output 1217 setvbuf(stdout, NULL, _IOFBF, 0); 1218 1219 return run(argc - 1, argv + 1) == 0 ? 0 : 1; 1220 }