// File: nj.c

/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

    cc -Wall -s -O3 -march=native -mtune=native -flto -o ./nj ./nj.c

Building with COMPACT_OUTPUT defined makes `nj` output many fewer bytes, at
the cost of using arguably worse colors. You can do that by running

    cc -s -O3 -march=native -mtune=native -flto -D COMPACT_OUTPUT -o ./nj ./nj.c

Building for macos always uses COMPACT_OUTPUT, as the default terminal app
there still doesn't support rgb colors.
*/

#include <ctype.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// ssize_t is a POSIX type: it isn't guaranteed to come from the headers above
#include <sys/types.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

// ERROR_LINE turns a string-constant into a full error line, which is
// styled red only when building with RED_ERRORS defined; ERROR_STYLE is
// defined exactly once, to avoid an incompatible macro redefinition on macos
#ifdef RED_ERRORS
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define ERROR_LINE(MSG) (ERROR_STYLE MSG "\x1b[0m\n")
#else
#define ERROR_LINE(MSG) (MSG "\n")
#endif

#ifdef __APPLE__
#define COMPACT_OUTPUT
#endif

// IBUF_SIZE is the size of the input buffer, in bytes
#ifndef IBUF_SIZE
#define IBUF_SIZE (32 * 1024)
#endif

// OBUF_SIZE is the size of the output buffer, in bytes
#ifndef OBUF_SIZE
#define OBUF_SIZE (8 * 1024)
#endif

// INDENTATION is how many spaces each nesting level adds
#ifndef INDENTATION
#define INDENTATION 2
#endif

// CONST_SLICE initializes a slice struct using the string-constant given
#define CONST_SLICE(s, x) init_slice((s), (unsigned char*)(x), sizeof(x) - 1)

// EMIT_CONST emits string constants without their final null byte
#define EMIT_CONST(w, x) write_bytes((w), (unsigned char*)(x), sizeof(x) - 1)

#define RESET_STYLE "\x1b[0m"

#ifdef COMPACT_OUTPUT
#define NULL_STYLE "\x1b[33m"
#define BOOL_STYLE "\x1b[36m"
#define NUMBER_STYLE "\x1b[32m"
#define NEGATIVE_STYLE "\x1b[31m"
#define KEY_STYLE "\x1b[35m"
// #define SYNTAX_STYLE "\x1b[37m"
#define SYNTAX_STYLE "\x1b[33m"
// #define NULL_STYLE "\x1b[38;5;248m"
// #define BOOL_STYLE "\x1b[38;5;74m"
// #define NUMBER_STYLE "\x1b[38;5;29m"
// #define NEGATIVE_STYLE "\x1b[38;5;1m"
// #define KEY_STYLE "\x1b[38;5;99m"
// #define SYNTAX_STYLE "\x1b[38;5;248m"
#else
#define NULL_STYLE "\x1b[38;2;168;168;168m"
#define BOOL_STYLE "\x1b[38;2;95;175;215m"
#define NUMBER_STYLE "\x1b[38;2;0;135;95m"
#define NEGATIVE_STYLE "\x1b[38;2;204;0;0m"
// #define KEY_STYLE "\x1b[38;2;135;95;255m"
#define KEY_STYLE "\x1b[38;2;135;135;225m"
#define SYNTAX_STYLE "\x1b[38;2;168;168;168m"
#endif

// info is the help message shown by the help options
const char* info = ""
"nj [options...] [file...]\n"
"\n"
"\n"
"Nice Json converts/fixes JSON/pseudo-JSON input into ANSI-styled multi-line\n"
"JSON which uses 2 spaces for each indentation level.\n"
"\n"
"Besides styling and indenting JSON, this tool also adapts almost-JSON input\n"
"into valid JSON, since it\n"
"\n"
" - ignores both rest-of-line and multi-line comments\n"
" - ignores extra/trailing commas in arrays and objects\n"
" - turns single-quoted strings/keys into double-quoted strings\n"
" - double-quotes unquoted object keys\n"
" - changes \\x 2-hex-digit into \\u 4-hex-digit string-escapes\n"
"\n"
"All options available can either start with a single or a double-dash\n"
"\n"
" -h show this help message\n"
" -help show this help message\n"
"";

// slice is a pointer/length pair for a run of bytes it doesn't own
typedef struct slice {
    unsigned char* ptr;
    size_t len;
} slice;

// init_slice sets both fields of the slice given
static inline void init_slice(slice* s, unsigned char* ptr, size_t len) {
    s->ptr = ptr;
    s->len = len;
}

// nj_maker has all the state needed to read, check, and restyle JSON data
typedef struct nj_maker {
    FILE* in;
    FILE* out;

    unsigned char* ibuf;
    size_t ilen; // how many bytes are being used in the input buffer
    size_t icap; // the input buffer's capacity
    size_t ipos; // the current position in the input buffer

    size_t line; // the current line, used to show useful error messages
    size_t pos;  // the position in the current line, for error messages

    unsigned char* obuf;
    size_t ocap; // the output buffer's capacity
    size_t opos; // the current position in the output buffer

    ssize_t level; // the current indentation/nesting level

    int current; // the byte being handled, or EOF
    int next;    // the byte right after the current one, or EOF
} nj_maker;

// advance_reader_pos helps func read_byte do its job: it moves past the
// byte given in the input buffer, and keeps the line/pos counters updated
static inline void advance_reader_pos(nj_maker* r, unsigned char b) {
    r->ipos++;
    if (b == '\n') {
        r->line++;
        r->pos = 1;
    } else {
        r->pos++;
    }
}

// read_byte does as it says: check its
return for the value EOF, before 176 // using it as the next byte 177 int read_byte(nj_maker* r) { 178 if (r->ipos < r->ilen) { 179 // inside current chunk 180 const unsigned char b = r->ibuf[r->ipos]; 181 advance_reader_pos(r, b); 182 return b; 183 } 184 185 // need to read the next block 186 r->ipos = 0; 187 r->ilen = fread(r->ibuf, sizeof(unsigned char), r->icap, r->in); 188 if (r->ilen > 0) { 189 const unsigned char b = r->ibuf[r->ipos]; 190 advance_reader_pos(r, b); 191 return b; 192 } 193 194 // reached the end of data 195 return EOF; 196 } 197 198 // advance is used in most of the code, instead of calling read_byte directly 199 static inline void advance(nj_maker* r) { 200 r->current = r->next; 201 r->next = read_byte(r); 202 } 203 204 void fail(nj_maker* m, int code, const char* msg); 205 206 void skip_line(nj_maker* r) { 207 while (true) { 208 advance(r); 209 const int lead = r->current; 210 211 if (lead == EOF) { 212 break; 213 } 214 215 if (lead == '\n') { 216 advance(r); 217 break; 218 } 219 } 220 } 221 222 void skip_multiline_comment(nj_maker* r) { 223 unsigned char prev = 0; 224 225 while (true) { 226 advance(r); 227 const int lead = r->current; 228 229 if (lead == EOF) { 230 break; 231 } 232 233 if (prev == '*' && lead == '/') { 234 advance(r); 235 break; 236 } 237 238 prev = (unsigned char)lead; 239 } 240 } 241 242 void skip_comment(nj_maker* r) { 243 int lead = r->current; 244 245 if (lead == '#') { 246 skip_line(r); 247 return; 248 } 249 250 if (lead != '/') { 251 fail(r, 1, "expected a slash to start comments"); 252 } 253 254 advance(r); 255 lead = r->current; 256 257 if (lead == '/') { 258 skip_line(r); 259 return; 260 } 261 262 if (lead == '*') { 263 skip_multiline_comment(r); 264 return; 265 } 266 267 fail(r, 1, "expected `//` or `/*` to start comments"); 268 } 269 270 static inline void seek_token(nj_maker* r) { 271 while (true) { 272 const int lead = r->current; 273 274 if (lead != EOF && lead <= ' ') { 275 advance(r); 276 continue; 277 } 
278 279 if (lead == '/' || lead == '#') { 280 skip_comment(r); 281 continue; 282 } 283 284 break; 285 } 286 } 287 288 bool starts_with_bom(const unsigned char* b, const size_t n) { 289 return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf); 290 } 291 292 void restart_state(nj_maker* m, FILE* w, FILE* r) { 293 m->in = r; 294 m->ilen = 0; 295 m->ipos = 0; 296 297 m->out = w; 298 m->opos = 0; 299 300 m->line = 1; 301 m->pos = 1; 302 303 m->current = EOF; 304 m->next = EOF; 305 306 m->current = read_byte(m); 307 if (m->current == EOF) { 308 return; 309 } 310 m->next = read_byte(m); 311 312 m->level = 0; 313 314 // skip leading UTF-8 BOM (byte-order mark), if present 315 if (starts_with_bom(m->ibuf, m->ilen)) { 316 // a UTF-8 BOM has 3 bytes 317 for (size_t i = 0; i < 3 && m->current != EOF; i++) { 318 advance(m); 319 } 320 } 321 } 322 323 void write_byte(nj_maker* m, unsigned char b) { 324 if (m->opos < m->ocap) { 325 m->obuf[m->opos++] = b; 326 return; 327 } 328 329 fwrite(m->obuf, 1, m->ocap, m->out); 330 m->obuf[0] = b; 331 m->opos = 1; 332 } 333 334 // write_bytes does as it says, minimizing the number of calls to fwrite 335 void write_bytes(nj_maker* m, const unsigned char* src, size_t len) { 336 const size_t rem = m->ocap - m->opos; 337 if (len < rem) { 338 memcpy(m->obuf + m->opos, src, len); 339 m->opos += len; 340 return; 341 } 342 343 for (size_t i = 0; i < len; i++) { 344 write_byte(m, src[i]); 345 } 346 } 347 348 void write_spaces(nj_maker* m, ssize_t n) { 349 const unsigned char spaces[32] = " "; 350 while (n > sizeof(spaces)) { 351 write_bytes(m, spaces, sizeof(spaces)); 352 n -= sizeof(spaces); 353 } 354 if (n > 0) { 355 write_bytes(m, spaces, n); 356 } 357 } 358 359 static inline void indent(nj_maker* m) { 360 write_spaces(m, INDENTATION * m->level); 361 } 362 363 void flush(nj_maker* m) { 364 if (m->opos > 0) { 365 fwrite(m->obuf, 1, m->opos, m->out); 366 } 367 m->opos = 0; 368 fflush(m->out); 369 } 370 371 // 
https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/ 372 373 static inline bool check_2_byte_rune(int a, int b) { 374 return (0xc2 <= a && a <= 0xdf) && (0x80 <= b && b <= 0xbf); 375 } 376 377 bool check_3_byte_rune(int a, int b, int c) { 378 return ( 379 (a == 0xe0) && 380 (0xa0 <= b && b <= 0xbf) && 381 (0x80 <= c && c <= 0xbf) 382 ) || ( 383 (0xe1 <= a && a <= 0xec) && 384 (0x80 <= b && b <= 0xbf) && 385 (0x80 <= c && c <= 0xbf) 386 ) || ( 387 (a == 0xed) && 388 (0x80 <= b && b <= 0x9f) && 389 (0x80 <= c && c <= 0xbf) 390 ) || ( 391 (a == 0xee || a == 0xef) && 392 (0x80 <= b && b <= 0xbf) && 393 (0x80 <= c && c <= 0xbf) 394 ); 395 } 396 397 bool check_4_byte_rune(int a, int b, int c, int d) { 398 return ( 399 (a == 0xf0) && 400 (0x90 <= b && b <= 0xbf) && 401 (0x80 <= c && c <= 0xbf) && 402 (0x80 <= d && d <= 0xbf) 403 ) || ( 404 (a == 0xf1 || a == 0xf3) && 405 (0x80 <= b && b <= 0xbf) && 406 (0x80 <= c && c <= 0xbf) && 407 (0x80 <= d && d <= 0xbf) 408 ) || ( 409 (a == 0xf4) && 410 (0x80 <= b && b <= 0xbf) && 411 (0x80 <= c && c <= 0x8f) && 412 (0x80 <= d && d <= 0xbf) 413 ); 414 } 415 416 // write_replacement_char is the recommended action to handle invalid bytes 417 void write_replacement_char(nj_maker* m) { 418 write_byte(m, 0xef); 419 write_byte(m, 0xbf); 420 write_byte(m, 0xbd); 421 } 422 423 void handle_invalid_rune(nj_maker* m) { 424 // fail(m, 1, "invalid unicode value"); 425 write_replacement_char(m); 426 } 427 428 // write_rune is following the table at https://en.wikipedia.org/wiki/UTF-8 429 void write_rune(nj_maker* m, uint32_t rune) { 430 if (rune < (1 << 7)) { 431 write_byte(m, rune); 432 return; 433 } 434 435 if (rune < (1 << (5 + 6))) { 436 const int a = 0b11000000 | (rune >> 6); 437 const int b = 0b10000000 | (rune & 0b00111111); 438 if (check_2_byte_rune(a, b)) { 439 write_byte(m, a); 440 write_byte(m, b); 441 } else { 442 write_replacement_char(m); 443 } 444 return; 445 } 446 447 if (rune < (1 << (4 
+ 6 + 6))) { 448 const int a = 0b11100000 | (rune >> 12); 449 const int b = 0b10000000 | ((rune >> 6) & 0b00111111); 450 const int c = 0b10000000 | (rune & 0b00111111); 451 if (check_3_byte_rune(a, b, c)) { 452 write_byte(m, a); 453 write_byte(m, b); 454 write_byte(m, c); 455 } else { 456 write_replacement_char(m); 457 } 458 return; 459 } 460 461 if (rune < (1 << (3 + 6 + 6 + 6))) { 462 const int a = 0b11110000 | (rune >> 18); 463 const int b = 0b10000000 | ((rune >> 12) & 0b00111111); 464 const int c = 0b10000000 | ((rune >> 6) & 0b00111111); 465 const int d = 0b10000000 | (rune & 0b00111111); 466 if (check_4_byte_rune(a, b, c, d)) { 467 write_byte(m, a); 468 write_byte(m, b); 469 write_byte(m, c); 470 write_byte(m, d); 471 } else { 472 write_replacement_char(m); 473 } 474 return; 475 } 476 477 write_replacement_char(m); 478 } 479 480 void copy_utf8_rune(nj_maker* m) { 481 const int a = m->current; 482 483 if (a == EOF) { 484 return; 485 } 486 487 // handle 1-byte runes 488 if (a < 128) { 489 write_byte(m, a); 490 return; 491 } 492 493 advance(m); 494 const int b = m->current; 495 496 if (b == EOF) { 497 handle_invalid_rune(m); 498 return; 499 } 500 501 // handle 2-byte runes 502 if (check_2_byte_rune(a, b)) { 503 write_byte(m, a); 504 write_byte(m, b); 505 return; 506 } 507 508 advance(m); 509 const int c = m->current; 510 511 if (c == EOF) { 512 handle_invalid_rune(m); 513 return; 514 } 515 516 // handle 3-byte runes 517 if (check_3_byte_rune(a, b, c)) { 518 write_byte(m, a); 519 write_byte(m, b); 520 write_byte(m, c); 521 return; 522 } 523 524 advance(m); 525 const int d = m->current; 526 527 if (d == EOF) { 528 handle_invalid_rune(m); 529 return; 530 } 531 532 // handle 4-byte runes 533 if (check_4_byte_rune(a, b, c, d)) { 534 write_byte(m, a); 535 write_byte(m, b); 536 write_byte(m, c); 537 write_byte(m, d); 538 return; 539 } 540 541 handle_invalid_rune(m); 542 } 543 544 // debug is available to diagnose any bug found 545 void debug(nj_maker* m, const char* 
fmt, ...) { 546 va_list args; 547 va_start(args, fmt); 548 549 if (m->in != stdin) { 550 fclose(m->in); 551 } 552 553 write_byte(m, '\n'); 554 555 const unsigned long line = m->line; 556 const unsigned long pos = m->pos; 557 fprintf(stderr, "\x1b[46m\x1b[37mline %lu, pos %lu: ", line, pos); 558 fprintf(stderr, fmt, args); 559 fprintf(stderr, "\x1b[0m\n"); 560 561 va_end(args); 562 563 exit(10); 564 } 565 566 // fail quits this app right after showing the error message given 567 void fail(nj_maker* m, int code, const char* msg) { 568 const unsigned long line = m->line; 569 const unsigned long pos = m->pos; 570 571 EMIT_CONST(m, "\x1b[0m"); 572 write_byte(m, '\n'); 573 flush(m); 574 fprintf(stderr, ERROR_LINE("line %lu, pos %lu: %s"), line, pos, msg); 575 exit(code); 576 } 577 578 bool demand_keyword(nj_maker* m, char* rest) { 579 for (; rest[0] != 0; rest++) { 580 const int lead = m->current; 581 if (lead == EOF || lead != rest[0]) { 582 return false; 583 } 584 advance(m); 585 } 586 587 return rest[0] == 0; 588 } 589 590 void handle_null(nj_maker* m) { 591 if (!demand_keyword(m, "null")) { 592 fail(m, 1, "expected `null` keyword"); 593 } 594 EMIT_CONST(m, "null"); 595 } 596 597 void handle_true(nj_maker* m) { 598 if (!demand_keyword(m, "true")) { 599 fail(m, 1, "expected `true` keyword"); 600 } 601 EMIT_CONST(m, "true"); 602 } 603 604 void handle_false(nj_maker* m) { 605 if (!demand_keyword(m, "false")) { 606 fail(m, 1, "expected `false` keyword"); 607 } 608 EMIT_CONST(m, "false"); 609 } 610 611 void handle_capital_none(nj_maker* m) { 612 if (!demand_keyword(m, "None")) { 613 fail(m, 1, "expected `None` keyword"); 614 } 615 EMIT_CONST(m, "null"); 616 } 617 618 void handle_capital_true(nj_maker* m) { 619 if (!demand_keyword(m, "True")) { 620 fail(m, 1, "expected `True` keyword"); 621 } 622 EMIT_CONST(m, "true"); 623 } 624 625 void handle_capital_false(nj_maker* m) { 626 if (!demand_keyword(m, "False")) { 627 fail(m, 1, "expected `False` keyword"); 628 } 629 
EMIT_CONST(m, "false"); 630 } 631 632 void handle_digits(nj_maker* m) { 633 if (!isdigit(m->current)) { 634 fail(m, 1, "expected/missing digits"); 635 } 636 637 while (isdigit(m->current)) { 638 write_byte(m, m->current); 639 advance(m); 640 } 641 } 642 643 void handle_number(nj_maker* m) { 644 handle_digits(m); 645 646 const int lead = m->current; 647 648 if (lead == '.') { 649 write_byte(m, '.'); 650 advance(m); 651 652 if (isdigit(m->current)) { 653 handle_digits(m); 654 } else { 655 write_byte(m, '0'); 656 } 657 return; 658 } 659 660 if (lead == 'e' || lead == 'E') { 661 write_byte(m, lead); 662 advance(m); 663 664 if (m->current == '+') { 665 advance(m); 666 } else if (m->current == '-') { 667 write_byte(m, '-'); 668 advance(m); 669 } 670 671 handle_digits(m); 672 } 673 } 674 675 void handle_dot(nj_maker* m) { 676 write_byte(m, '0'); 677 write_byte(m, '.'); 678 advance(m); 679 680 if (!isdigit(m->current)) { 681 fail(m, 1, "expected/missing digits after decimal dot"); 682 } 683 handle_digits(m); 684 } 685 686 void handle_plus_number(nj_maker* m) { 687 advance(m); 688 689 if (m->current == '.') { 690 handle_dot(m); 691 return; 692 } 693 handle_number(m); 694 } 695 696 void handle_minus_number(nj_maker* m) { 697 write_byte(m, '-'); 698 advance(m); 699 700 if (m->current == '.') { 701 handle_dot(m); 702 return; 703 } 704 handle_number(m); 705 } 706 707 // decode_hex assumes valid hex digits, checked by func is_valid_hex 708 uint32_t decode_hex(unsigned char hex) { 709 if ('0' <= hex && hex <= '9') { 710 return hex - '0'; 711 } 712 if ('A' <= hex && hex <= 'F') { 713 return hex - 'A' + 10; 714 } 715 if ('a' <= hex && hex <= 'f') { 716 return hex - 'a' + 10; 717 } 718 return 0xffff; 719 } 720 721 static inline bool is_valid_hex(unsigned char b) { 722 return false || 723 ('0' <= b && b <= '9') || 724 ('A' <= b && b <= 'F') || 725 ('a' <= b && b <= 'f'); 726 } 727 728 // handle_low_char ensures characters whose ASCII codes are lower than spaces 729 // are properly 
escaped for strings 730 void handle_low_char(nj_maker* m, int c) { 731 const char* hex = "0123456789ABCDEF"; 732 733 switch (c) { 734 case '\t': 735 write_byte(m, '\\'); 736 write_byte(m, 't'); 737 break; 738 case '\n': 739 write_byte(m, '\\'); 740 write_byte(m, 'n'); 741 break; 742 case '\r': 743 write_byte(m, '\\'); 744 write_byte(m, 'r'); 745 break; 746 case '\b': 747 write_byte(m, '\\'); 748 write_byte(m, 'b'); 749 break; 750 case '\f': 751 write_byte(m, '\\'); 752 write_byte(m, 'f'); 753 break; 754 case '\v': 755 write_byte(m, '\\'); 756 write_byte(m, 'v'); 757 break; 758 default: 759 write_byte(m, '\\'); 760 write_byte(m, 'u'); 761 write_byte(m, '0'); 762 write_byte(m, '0'); 763 write_byte(m, hex[c / 16]); 764 write_byte(m, hex[c % 16]); 765 break; 766 } 767 } 768 769 void write_inner_string_hex_quad(nj_maker* m, const unsigned char quad[4]) { 770 const uint32_t n = 0 + 771 (decode_hex(quad[0]) << 12) + 772 (decode_hex(quad[1]) << 8) + 773 (decode_hex(quad[2]) << 4) + 774 (decode_hex(quad[3]) << 0); 775 776 switch (n) { 777 case '"': 778 write_byte(m, '\\'); 779 write_byte(m, '"'); 780 return; 781 case '\\': 782 write_byte(m, '\\'); 783 write_byte(m, '\\'); 784 return; 785 } 786 787 if (n >= ' ') { 788 write_rune(m, n); 789 } else { 790 handle_low_char(m, n); 791 } 792 } 793 794 void handle_hex_quad(nj_maker* m) { 795 unsigned char quad[4]; 796 for (size_t i = 0; i < 4; i++) { 797 advance(m); 798 const int lead = m->current; 799 if (lead == EOF) { 800 fail(m, 1, "end of input before end of string"); 801 } 802 if (is_valid_hex(lead)) { 803 quad[i] = lead; 804 continue; 805 } 806 fail(m, 1, "invalid hexadecimal digit in string"); 807 } 808 809 write_inner_string_hex_quad(m, quad); 810 } 811 812 void handle_hex_pair(nj_maker* m) { 813 unsigned char quad[4] = {'0', '0', '0', '0'}; 814 advance(m); 815 const int a = m->current; 816 advance(m); 817 const int b = m->current; 818 if (a == EOF || b == EOF) { 819 fail(m, 1, "end of input before end of string"); 820 } 
821 if (!is_valid_hex(a) || !is_valid_hex(b)) { 822 fail(m, 1, "invalid hexadecimal digit in string"); 823 } 824 825 quad[2] = a; 826 quad[3] = b; 827 write_inner_string_hex_quad(m, quad); 828 } 829 830 void handle_string_escape(nj_maker* m, int c) { 831 switch (c) { 832 case '"': 833 case '\\': 834 case 'b': 835 case 'f': 836 case 'n': 837 case 'r': 838 case 't': 839 write_byte(m, '\\'); 840 write_byte(m, c); 841 break; 842 case 'u': 843 handle_hex_quad(m); 844 break; 845 case 'x': 846 handle_hex_pair(m); 847 break; 848 case '\'': 849 write_byte(m, '\''); 850 break; 851 default: 852 write_byte(m, m->current); 853 break; 854 } 855 } 856 857 ssize_t handle_inner_string(nj_maker* m) { 858 const unsigned char quote = m->current; 859 bool escaped = false; 860 861 for (size_t i = 0; true; i++) { 862 advance(m); 863 864 int c = m->current; 865 if (c == EOF) { 866 fail(m, 1, "input ended before string was close-quoted"); 867 } 868 869 if (escaped) { 870 handle_string_escape(m, c); 871 escaped = false; 872 continue; 873 } 874 875 switch (c) { 876 case '\\': 877 escaped = true; 878 break; 879 default: 880 if (c == quote) { 881 advance(m); 882 return i; 883 } 884 885 // write_byte(m, c); 886 if (c < ' ') { 887 handle_low_char(m, c); 888 } else { 889 copy_utf8_rune(m); 890 } 891 break; 892 } 893 } 894 } 895 896 void handle_quoted_key(nj_maker* m) { 897 if (m->current != m->next) { 898 EMIT_CONST(m, "\"" KEY_STYLE); 899 handle_inner_string(m); 900 EMIT_CONST(m, SYNTAX_STYLE "\""); 901 } else { 902 write_byte(m, '"'); 903 handle_inner_string(m); 904 write_byte(m, '"'); 905 } 906 } 907 908 void handle_string(nj_maker* m) { 909 if (m->current != m->next) { 910 EMIT_CONST(m, "\"" RESET_STYLE); 911 handle_inner_string(m); 912 EMIT_CONST(m, SYNTAX_STYLE "\""); 913 } else { 914 write_byte(m, '"'); 915 handle_inner_string(m); 916 write_byte(m, '"'); 917 } 918 } 919 920 void handle_token(nj_maker* m, ssize_t lead_level); 921 922 void handle_array(nj_maker* m) { 923 size_t items = 0; 
924 const unsigned char end = m->current == '[' ? ']' : ')'; 925 m->level++; 926 write_byte(m, '['); 927 advance(m); 928 929 while (true) { 930 seek_token(m); 931 const int lead = m->current; 932 933 if (lead == EOF) { 934 fail(m, 1, "unclosed array"); 935 } 936 937 if (lead == ',') { 938 advance(m); 939 continue; 940 } 941 942 if (lead == end) { 943 m->level--; 944 if (items > 0) { 945 write_byte(m, '\n'); 946 indent(m); 947 EMIT_CONST(m, SYNTAX_STYLE "]"); 948 } else { 949 write_byte(m, ']'); 950 } 951 advance(m); 952 return; 953 } 954 955 if (items > 0) { 956 EMIT_CONST(m, SYNTAX_STYLE ","); 957 } 958 write_byte(m, '\n'); 959 if (feof(m->out)) { 960 return; 961 } 962 handle_token(m, m->level); 963 items++; 964 } 965 } 966 967 void handle_unquoted_key(nj_maker* m) { 968 EMIT_CONST(m, SYNTAX_STYLE "\"" KEY_STYLE); 969 970 while (true) { 971 int c = m->current; 972 if (c == EOF) { 973 fail(m, 1, "input ended with an object key"); 974 } 975 976 write_byte(m, c); 977 advance(m); 978 979 c = m->current; 980 if (!isalpha(c) && !isdigit(c) && c != '_') { 981 break; 982 } 983 } 984 985 EMIT_CONST(m, SYNTAX_STYLE "\""); 986 } 987 988 void handle_object(nj_maker* m) { 989 size_t items = 0; 990 m->level++; 991 write_byte(m, '{'); 992 advance(m); 993 994 while (true) { 995 seek_token(m); 996 int lead = m->current; 997 998 if (lead == EOF) { 999 fail(m, 1, "unclosed object"); 1000 } 1001 1002 if (lead == ',') { 1003 advance(m); 1004 continue; 1005 } 1006 1007 if (lead == '}') { 1008 m->level--; 1009 if (items > 0) { 1010 write_byte(m, '\n'); 1011 indent(m); 1012 EMIT_CONST(m, SYNTAX_STYLE "}"); 1013 } else { 1014 write_byte(m, '}'); 1015 } 1016 advance(m); 1017 return; 1018 } 1019 1020 if (feof(m->out)) { 1021 return; 1022 } 1023 1024 if (lead == '"' || lead == '\'') { 1025 if (items > 0) { 1026 EMIT_CONST(m, SYNTAX_STYLE ","); 1027 } 1028 write_byte(m, '\n'); 1029 indent(m); 1030 EMIT_CONST(m, SYNTAX_STYLE); 1031 handle_quoted_key(m); 1032 } else if (isalpha(lead) || lead == 
'_') { 1033 if (items > 0) { 1034 EMIT_CONST(m, SYNTAX_STYLE ","); 1035 } 1036 write_byte(m, '\n'); 1037 indent(m); 1038 handle_unquoted_key(m); 1039 } else { 1040 fail(m, 1, "only strings or identifiers can be object keys"); 1041 } 1042 1043 seek_token(m); 1044 lead = m->current; 1045 1046 if (lead == EOF) { 1047 fail(m, 1, "input ended after object-key and before value"); 1048 } 1049 1050 if (lead != ':') { 1051 fail(m, 1, "a `:` must follow all object keys"); 1052 } 1053 1054 EMIT_CONST(m, ": "); 1055 advance(m); 1056 1057 seek_token(m); 1058 if (m->current == EOF) { 1059 fail(m, 1, "input ended after a `:` following an object-key"); 1060 } 1061 1062 handle_token(m, 0); 1063 items++; 1064 } 1065 } 1066 1067 // styles ties leading bytes/chars in tokens to their leading ANSI styles 1068 slice styles[256] = {}; 1069 1070 // dispatch ties leading bytes/chars in tokens to the funcs which handle them 1071 void (*dispatch[256])() = {}; 1072 1073 void handle_token(nj_maker* m, ssize_t lead_level) { 1074 const unsigned char b = m->current; 1075 write_spaces(m, INDENTATION * lead_level); 1076 write_bytes(m, styles[b].ptr, styles[b].len); 1077 dispatch[b](m); 1078 } 1079 1080 // handle_invalid_token shows an error message and quits the app right after 1081 void handle_invalid_token(nj_maker* m) { 1082 char msg[64]; 1083 unsigned char c = (unsigned char)m->current; 1084 sprintf(msg, "%c (%d): invalid token", c, c); 1085 fail(m, 1, msg); 1086 } 1087 1088 void handle_input(FILE* src) { 1089 unsigned char ibuf[IBUF_SIZE]; 1090 unsigned char obuf[OBUF_SIZE]; 1091 1092 nj_maker m; 1093 m.ibuf = ibuf; 1094 m.icap = sizeof(ibuf); 1095 m.obuf = obuf; 1096 m.ocap = sizeof(obuf); 1097 restart_state(&m, stdout, src); 1098 1099 // ignore leading whitespace/comment bytes, if present 1100 seek_token(&m); 1101 1102 if (m.current == EOF) { 1103 fail(&m, 1, "empty input isn't valid JSON"); 1104 } 1105 1106 handle_token(&m, 0); 1107 EMIT_CONST(&m, RESET_STYLE); 1108 write_byte(&m, '\n'); 
1109 flush(&m); 1110 1111 // ignore trailing whitespace/comment bytes, if present 1112 seek_token(&m); 1113 1114 // ignore trailing semicolon, if present 1115 if (m.current == ';') { 1116 advance(&m); 1117 // ignore trailing whitespace/comment bytes, if present 1118 seek_token(&m); 1119 } 1120 1121 if (!feof(src) || m.current != EOF) { 1122 fail(&m, 1, "unexpected trailing JSON data"); 1123 } 1124 } 1125 1126 bool is_help_option(const char* s) { 1127 return (s[0] == '-' && s[1] != 0) && ( 1128 strcmp(s, "-h") == 0 || 1129 strcmp(s, "--h") == 0 || 1130 strcmp(s, "-help") == 0 || 1131 strcmp(s, "--help") == 0 1132 ); 1133 } 1134 1135 // run returns the error code 1136 int run(int nargs, char** args) { 1137 if (nargs > 0 && strcmp(args[0], "--") == 0) { 1138 nargs--; 1139 args++; 1140 } 1141 1142 if (nargs > 1) { 1143 const char* msg = "can't use more than 1 named input"; 1144 fprintf(stderr, ERROR_LINE("%s"), msg); 1145 return 1; 1146 } 1147 1148 // use stdin when not given a filepath 1149 if (nargs == 0 || strcmp(args[0], "") == 0 || strcmp(args[0], "-") == 0) { 1150 handle_input(stdin); 1151 return 0; 1152 } 1153 1154 const char* path = args[0]; 1155 FILE* f = fopen(path, "rb"); 1156 if (f == NULL) { 1157 fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path); 1158 return 1; 1159 } 1160 1161 handle_input(f); 1162 fclose(f); 1163 1164 return 0; 1165 } 1166 1167 int main(int argc, char** argv) { 1168 #ifdef _WIN32 1169 setmode(fileno(stdin), O_BINARY); 1170 // ensure output lines end in LF instead of CRLF on windows 1171 setmode(fileno(stdout), O_BINARY); 1172 setmode(fileno(stderr), O_BINARY); 1173 #endif 1174 1175 if (argc > 1 && is_help_option(argv[1])) { 1176 printf("%s", info); 1177 return 0; 1178 } 1179 1180 memset(dispatch, 0, sizeof(dispatch)); 1181 memset(styles, 0, sizeof(styles)); 1182 1183 // the dispatch table starts as all null function-pointers 1184 for (size_t i = 0; i < sizeof(dispatch) / sizeof(dispatch[0]); i++) { 1185 dispatch[i] = 
handle_invalid_token; 1186 } 1187 1188 for (size_t i = '0'; i <= '9'; i++) { 1189 dispatch[i] = handle_number; 1190 CONST_SLICE(&styles[i], NUMBER_STYLE); 1191 } 1192 1193 dispatch['n'] = handle_null; 1194 dispatch['t'] = handle_true; 1195 dispatch['f'] = handle_false; 1196 dispatch['N'] = handle_capital_none; 1197 dispatch['T'] = handle_capital_true; 1198 dispatch['F'] = handle_capital_false; 1199 dispatch['.'] = handle_dot; 1200 dispatch['+'] = handle_plus_number; 1201 dispatch['-'] = handle_minus_number; 1202 dispatch['"'] = handle_string; 1203 dispatch['\''] = handle_string; 1204 dispatch['['] = handle_array; 1205 dispatch['('] = handle_array; 1206 dispatch['{'] = handle_object; 1207 1208 CONST_SLICE(&styles['n'], NULL_STYLE); 1209 CONST_SLICE(&styles['t'], BOOL_STYLE); 1210 CONST_SLICE(&styles['f'], BOOL_STYLE); 1211 CONST_SLICE(&styles['N'], NULL_STYLE); 1212 CONST_SLICE(&styles['T'], BOOL_STYLE); 1213 CONST_SLICE(&styles['F'], BOOL_STYLE); 1214 CONST_SLICE(&styles['.'], NUMBER_STYLE); 1215 CONST_SLICE(&styles['+'], NUMBER_STYLE); 1216 // CONST_SLICE(&styles['-'], NUMBER_STYLE); 1217 CONST_SLICE(&styles['-'], NEGATIVE_STYLE); 1218 CONST_SLICE(&styles['"'], SYNTAX_STYLE); 1219 CONST_SLICE(&styles['\''], SYNTAX_STYLE); 1220 CONST_SLICE(&styles['['], SYNTAX_STYLE); 1221 CONST_SLICE(&styles['('], SYNTAX_STYLE); 1222 CONST_SLICE(&styles['{'], SYNTAX_STYLE); 1223 1224 // enable full/block-buffering for standard output 1225 setvbuf(stdout, NULL, _IOFBF, 0); 1226 1227 return run(argc - 1, argv + 1) == 0 ? 0 : 1; 1228 }