#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2025 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

'''json0.py: turn JSON/pseudo-JSON input into a single line of minimal JSON.'''

from io import BufferedReader, BytesIO
from sys import argv, exit, stderr, stdin, stdout


info = '''
json0 [filepath/URI...]

JSON-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.

Besides minimizing bytes, this tool also adapts almost-JSON input into valid
JSON, since it ignores comments and trailing commas, neither of which are
supported in JSON, but which are still commonly used.

It also turns single-quoted strings into proper double-quoted ones, as well
as change invalid 2-digit `\\x` hexadecimal escapes into JSON's 4-digit `\\u`
hexadecimal escapes. When backslashes in strings are followed by an invalid
escape letter, the backslash is ignored.

Output is always a single line of valid JSON, ending with a line-feed.
'''

# values keeping track of the input-position, shown in case of errors; these
# are module-level globals, updated by funcs `read` and `skip_byte`
pos = 1
linenum = 1


# note: using regexes doesn't seem to speed-up number/string-handling


def read(r, size: int) -> bytes:
    '''
    Read up to `size` bytes from reader `r`, keeping the global line/position
    counters in sync with all the bytes consumed. Returns the bytes read,
    which are empty when the input is over.
    '''

    global pos, linenum

    chunk = r.read(size)
    if not chunk:
        return chunk

    newlines = chunk.count(b'\n')
    if newlines == 0:
        pos += len(chunk)
    else:
        linenum += newlines
        # position restarts at 1 right after the last line-feed, then
        # advances once for each byte following it
        pos = len(chunk) - chunk.rfind(b'\n')
    return chunk


def skip_byte(r) -> None:
    'Consume a single byte (if any is left), updating the input position.'

    global pos, linenum

    chunk = r.read(1)
    if not chunk:
        return

    if chunk[0] == 10:  # ord('\n')
        pos = 1
        linenum += 1
    else:
        pos += 1


def peek_byte(r) -> int:
    'Return the next byte without consuming it, or -1 when input is over.'

    chunk = r.peek(1)
    if len(chunk) > 0:
        return chunk[0]
    return -1


def handle_array(w, r) -> None:
    '''
    Copy an array to writer `w`: besides the usual square brackets, arrays
    can also be surrounded by round parentheses. Comments and trailing commas
    are ignored. Raises ValueError on input which can't be fixed.
    '''

    seek_next = seek_next_token

    n = 0
    lead = peek_byte(r)
    end = 0
    if lead < 0:
        raise ValueError('unexpected end of input data, before "]"')
    if lead == 91:  # ord('[')
        end = 93  # ord(']')
    elif lead == 40:  # ord('(')
        end = 41  # ord(')')
    else:
        raise ValueError('expected "[" or "("')
    skip_byte(r)
    w.write(b'[')

    while True:
        # whitespace/comments may precede the next item/comma
        seek_next(r)
        b = peek_byte(r)
        if b < 0:
            raise ValueError('unexpected end of input data, before "]"')

        comma = b == 44  # ord(',')

        if comma:
            skip_byte(r)
            # whitespace/comments may follow the comma
            seek_next(r)
            b = peek_byte(r)
            if b < 0:
                raise ValueError('unexpected end of input data, before "]"')

        if b == end:
            skip_byte(r)
            w.write(b']')
            return

        if n > 0:
            if not comma:
                raise ValueError('missing a comma between array values')
            w.write(b',')

        # b can't be negative at this point: even null bytes are dispatched,
        # so they're reported as the invalid bytes they are
        handlers[b](w, r)
        n += 1


def handle_double_quoted_string(w, r) -> None:
    'Copy a double-quoted string, fixing its escapes along the way.'

    skip_byte(r)
    w.write(b'"')
    handle_inner_string(w, r, 34)  # ord('"')
    w.write(b'"')


def handle_dot(w, r) -> None:
    'Handle numbers starting with a decimal dot, which JSON does not allow.'

    skip_byte(r)
    # precede the leading decimal dot with a 0
    w.write(b'0.')

    # handle decimals, which in this case aren't optional, as a leading
    # dot is what led to this point
    if copy_digits(w, r) < 1:
        raise ValueError('expected numeric digits, but found none')


def handle_false(w, r) -> None:
    'Handle the JSON keyword `false`.'

    demand(r, b'false')
    w.write(b'false')


def handle_False(w, r) -> None:
    'Handle the python-style keyword `False`, emitting JSON `false`.'

    demand(r, b'False')
    w.write(b'false')


def handle_invalid(w, r) -> None:
    'Always raise a ValueError describing the unexpected byte coming next.'

    b = peek_byte(r)
    if b < 0:
        raise ValueError('unexpected end of input data')
    if 32 < b <= 126:
        msg = f'unexpected symbol {chr(b)}'
    else:
        msg = f'unexpected byte-value {b}'
    raise ValueError(msg)


def handle_negative(w, r) -> None:
    'Handle numbers starting with a minus sign.'

    skip_byte(r)
    w.write(b'-')

    if peek_byte(r) == 46:  # ord('.')
        skip_byte(r)
        w.write(b'0.')
        if copy_digits(w, r) < 1:
            raise ValueError('expected numeric digits, but found none')
    else:
        handle_number(w, r)


def handle_null(w, r) -> None:
    'Handle the JSON keyword `null`.'

    demand(r, b'null')
    w.write(b'null')


def handle_None(w, r) -> None:
    'Handle the python-style keyword `None`, emitting JSON `null`.'

    demand(r, b'None')
    w.write(b'null')


def handle_number(w, r) -> None:
    '''
    Copy an unsigned number: integer digits, optional decimals, and optional
    exponent. Trailing decimal dots are followed by a 0, and plus signs in
    exponents are dropped. Raises ValueError when expected digits are missing.
    '''

    # handle integer part
    if copy_digits(w, r) < 1:
        raise ValueError('expected numeric digits, but found none')

    # handle optional decimals
    b = peek_byte(r)
    if b == 46:  # ord('.')
        skip_byte(r)
        w.write(b'.')
        if copy_digits(w, r) < 1:
            # follow a trailing decimal dot with a 0
            w.write(b'0')
        # look again at what follows the decimals, or exponents right
        # after them would never be noticed
        b = peek_byte(r)

    # handle optional exponent
    if b == 101 or b == 69:  # ord('e'), ord('E')
        skip_byte(r)
        w.write(b'e' if b == 101 else b'E')
        b = peek_byte(r)
        if b == 43:  # ord('+')
            skip_byte(r)
        elif b == 45:  # ord('-')
            w.write(b'-')
            skip_byte(r)
        if copy_digits(w, r) < 1:
            raise ValueError('expected numeric digits, but found none')


def handle_object(w, r) -> None:
    '''
    Copy an object to writer `w`, ignoring comments and trailing commas. Keys
    can be either double-quoted or single-quoted strings. Raises ValueError
    on input which can't be fixed.
    '''

    seek_next = seek_next_token

    num_pairs = 0
    skip_byte(r)
    w.write(b'{')

    while True:
        # whitespace/comments may precede the next item/comma
        seek_next(r)
        b = peek_byte(r)
        if b < 0:
            raise ValueError('unexpected end of input data, before "}"')

        comma = b == 44  # ord(',')

        if comma:
            skip_byte(r)
            # whitespace/comments may follow the comma
            seek_next(r)
            b = peek_byte(r)
            if b < 0:
                raise ValueError('unexpected end of input data, before "}"')

        if b == 125:  # ord('}')
            skip_byte(r)
            w.write(b'}')
            return

        if num_pairs > 0:
            if not comma:
                raise ValueError('missing a comma between key-value pairs')
            w.write(b',')

        demand_string(w, r)
        # whitespace/comments may follow the key
        seek_next(r)
        demand(r, b':')
        w.write(b':')
        # whitespace/comments may follow the colon
        seek_next(r)
        b = peek_byte(r)
        if b < 0:
            raise ValueError('unexpected end of input data, before "}"')
        handlers[b](w, r)
        num_pairs += 1


def handle_positive(w, r) -> None:
    'Handle numbers starting with a plus sign, which JSON does not allow.'

    # do nothing with the leading plus sign, which isn't allowed in JSON
    skip_byte(r)

    if peek_byte(r) == 46:  # ord('.')
        skip_byte(r)
        w.write(b'0.')
        if copy_digits(w, r) < 1:
            raise ValueError('expected numeric digits, but found none')
    else:
        handle_number(w, r)


def handle_single_quoted_string(w, r) -> None:
    'Copy a single-quoted string as a proper double-quoted JSON string.'

    skip_byte(r)
    w.write(b'"')
    handle_inner_string(w, r, 39)  # ord('\'')
    w.write(b'"')


def demand_string(w, r) -> None:
    'Copy a string of either quoting style, or raise a ValueError.'

    quote = peek_byte(r)
    if quote < 0:
        msg = 'unexpected end of input, instead of a string quote'
        raise ValueError(msg)

    if quote == 34:  # ord('"')
        handle_double_quoted_string(w, r)
        return

    if quote == 39:  # ord('\'')
        handle_single_quoted_string(w, r)
        return

    if 32 < quote <= 126:  # ord(' '), ord('~')
        msg = f'expected ", or even \', but got "{chr(quote)}" instead'
    else:
        msg = f'expected ", or even \', but got byte "{quote}" instead'
    raise ValueError(msg)


def handle_inner_string(w, r, quote: int) -> None:
    '''
    Copy the inside of a string, up to (and consuming) its closing quote,
    whose byte-value is given: the surrounding quotes are emitted by callers.

    Invalid 2-digit `\\x` escapes are turned into 4-digit `\\u` ones, while
    backslashes followed by invalid escape letters are ignored. Unescaped
    double quotes (allowed in single-quoted strings) and raw control bytes
    are escaped, since JSON does not allow them in strings as they are.

    Raises ValueError when input ends too early, or on bad hex escapes.
    '''

    esc = False
    bad_hex_msg = 'invalid hexadecimal symbols'
    early_end_msg = 'input data ended while still in quoted string'

    def is_hex(x: int) -> bool:
        # 48 is ord('0'), 57 is ord('9'), 97 is ord('a'), 102 is ord('f')
        return 48 <= x <= 57 or 97 <= x <= 102

    def lower(x: int) -> int:
        # 65 is ord('A'), 90 is ord('Z')
        return x + 32 if 65 <= x <= 90 else x

    def write_literal(x: int) -> None:
        if x == 34:  # ord('"')
            w.write(b'\\"')
        elif x < 32:
            # raw control bytes aren't allowed in JSON strings
            w.write(b'\\u%04x' % x)
        else:
            w.write(bytes((x,)))

    while True:
        b = peek_byte(r)
        if b < 0:
            raise ValueError(early_end_msg)

        if esc:
            esc = False

            if b == 120:  # ord('x')
                skip_byte(r)
                chunk = read(r, 2)
                if len(chunk) != 2:
                    raise ValueError(early_end_msg)
                if not all(is_hex(lower(x)) for x in chunk):
                    raise ValueError(bad_hex_msg)
                w.write(b'\\u00')
                # file-like writers only accept bytes-like values, not ints
                w.write(chunk.lower())
                continue

            if b == 117:  # ord('u')
                skip_byte(r)
                chunk = read(r, 4)
                if len(chunk) != 4:
                    raise ValueError(early_end_msg)
                if not all(is_hex(lower(x)) for x in chunk):
                    raise ValueError(bad_hex_msg)
                w.write(b'\\u')
                w.write(chunk)
                continue

            # numbers for '\\', 'n', 't', 'r', 'b', and 'f'
            if b in (92, 110, 116, 114, 98, 102):
                w.write(b'\\')
                w.write(read(r, 1))
            else:
                # ignore the backslash: escaped double quotes are still
                # emitted escaped, as func write_literal always does that
                write_literal(read(r, 1)[0])
            continue

        if b == 92:  # ord('\\')
            esc = True
            skip_byte(r)
            continue

        if b == quote:
            skip_byte(r)
            return

        # emit normal string-byte
        write_literal(read(r, 1)[0])


def handle_true(w, r) -> None:
    'Handle the JSON keyword `true`.'

    demand(r, b'true')
    w.write(b'true')


def handle_True(w, r) -> None:
    'Handle the python-style keyword `True`, emitting JSON `true`.'

    demand(r, b'True')
    w.write(b'true')


# setup byte-handling lookup tuple
bh = [handle_invalid for i in range(256)]
for digit in b'0123456789':
    bh[digit] = handle_number
bh[ord('+')] = handle_positive
bh[ord('-')] = handle_negative
bh[ord('.')] = handle_dot
bh[ord('"')] = handle_double_quoted_string
bh[ord('\'')] = handle_single_quoted_string
bh[ord('F')] = handle_False
bh[ord('N')] = handle_None
bh[ord('T')] = handle_True
bh[ord('f')] = handle_false
bh[ord('n')] = handle_null
bh[ord('t')] = handle_true
bh[ord('[')] = handle_array
bh[ord('(')] = handle_array
bh[ord('{')] = handle_object

# handlers is the immutable byte-driven func-dispatch table
handlers = tuple(bh)


def copy_digits(w, r) -> int:
    'Returns how many digits were copied/handled.'

    copied = 0
    while True:
        chunk = r.peek(64)
        if len(chunk) == 0:
            return copied

        i = find_digits_end_index(chunk)
        if i >= 0:
            w.write(read(r, i))
            copied += i
            return copied
        else:
            # the whole chunk is digits, and more may follow it
            w.write(chunk)
            read(r, len(chunk))
            copied += len(chunk)


def seek_next_token(r) -> None:
    'Skip an arbitrarily-long mix of whitespace and comments.'

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            # input is over, and this func doesn't consider that an error
            return

        comment = False

        for i, b in enumerate(chunk):
            # skip tab, line-feed, vertical-tab, form-feed, carriage-return,
            # or space
            if b in (9, 10, 11, 12, 13, 32):
                continue

            if b == 47 or b == 35:  # ord('/'), ord('#')
                read(r, i)
                demand_comment(r)
                comment = True
                break

            # found start of next token
            read(r, i)
            return

        if not comment:
            # the whole chunk was whitespace
            read(r, len(chunk))


def skip_line(r) -> None:
    'Skip the rest of the current line, including its line-feed, if any.'

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            return

        i = chunk.find(b'\n')
        if i >= 0:
            read(r, i + 1)
            return

        read(r, len(chunk))


def skip_general_comment(r) -> None:
    '''
    Skip everything up to the end of the first `*/` found, raising a
    ValueError when the input ends before that.
    '''

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            raise ValueError('input data ended before an expected */')

        i = chunk.find(b'*')
        if i < 0:
            # no */ in this chunk, so skip it and try with the next one
            read(r, len(chunk))
            continue

        # skip right past the * just found, then check if a / follows it
        read(r, i + 1)
        if peek_byte(r) == 47:  # ord('/')
            # got */, the end of this comment
            skip_byte(r)
            return


def find_digits_end_index(chunk: bytes) -> int:
    '''
    Return the index of the first non-digit byte in the chunk given, or -1
    when all its bytes (if any) are digits.
    '''

    i = 0
    for b in chunk:
        if 48 <= b <= 57:  # ord('0'), ord('9')
            i += 1
        else:
            return i

    # all bytes (if any) were digits, so no end was found
    return -1


def demand(r, what: bytes) -> None:
    'Consume exactly the bytes given, or raise a ValueError.'

    lead = read(r, len(what))
    if not lead.startswith(what):
        # avoid decoding exceptions hiding the actual error message
        lead = str(lead, encoding='utf-8', errors='replace')
        what = str(what, encoding='utf-8', errors='replace')
        raise ValueError(f'expected {what}, but got {lead} instead')


def demand_comment(r) -> None:
    '''
    Skip a comment, which can start with `#`, `//`, or `/*`: the latter
    style can span multiple lines. Raises ValueError on anything else.
    '''

    b = peek_byte(r)
    if b < 0:
        raise ValueError('unexpected end of input data')
    if b == 35:  # ord('#')
        # handle single-line comment
        skip_line(r)
        return

    demand(r, b'/')
    b = peek_byte(r)
    if b < 0:
        raise ValueError('unexpected end of input data')

    if b == 47:  # ord('/')
        # handle single-line comment
        skip_line(r)
        return

    if b == 42:  # ord('*')
        # handle (potentially) multi-line comment: skip the opening *, so
        # it can't also count as the closing one, as in `/*/`
        skip_byte(r)
        skip_general_comment(r)
        return

    raise ValueError('expected * or another /, after a /')


def json0(w, src, end) -> None:
    '''
    Convert a whole JSON/pseudo-JSON value from binary input `src` into
    minimal JSON on writer `w`. Func `end` is called with the writer right
    after the value is emitted, before checking for trailing-data errors.

    Raises ValueError on empty(ish) input, on input which can't be fixed,
    and on unexpected trailing data.
    '''

    r = BufferedReader(src)

    # skip leading UTF-8 BOM (byte-order mark): peeking can give more bytes
    # than asked for, so an equality check would almost never succeed
    if r.peek(3).startswith(b'\xef\xbb\xbf'):
        read(r, 3)

    # skip leading whitespace/comments
    seek_next_token(r)

    # emit a single output line, ending with a line-feed
    b = peek_byte(r)
    if b >= 0:
        handlers[b](w, r)
    else:
        # treat empty(ish) input as invalid JSON
        raise ValueError('can\'t turn empty(ish) input into JSON')

    # deliberately run post-processing before checking for trailing-data
    # errors: for example, if post-proc func emits new line, errors will
    # show up on their separate line, which is nicer
    end(w)

    # ignore trailing whitespace/comment bytes, if present
    seek_next_token(r)

    # ignore trailing semicolon, if present
    b = peek_byte(r)
    if b == 59:  # ord(';')
        read(r, 1)
        # ignore trailing whitespace/comment bytes, if present
        seek_next_token(r)

    if len(r.peek(1)) > 0:
        raise ValueError('unexpected trailing bytes in JSON data')


def seems_url(s: str) -> bool:
    'Check if the string given starts with one of the supported protocols.'

    protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return s.startswith(protocols)


def handle_json(w, r) -> None:
    'Convert a single JSON value, ending the output with a line-feed.'

    def end(w) -> None:
        w.write(b'\n')
        w.flush()
    json0(w, r, end)


def handle_json_lines(w, r) -> None:
    '''
    Convert JSON Lines input into a single JSON array, one item per input
    line: empty(ish) lines are ignored, and so are lines which start with
    a `//` or `#` comment.
    '''

    global pos, linenum

    items = 0
    linenum = 0
    w.write(b'[')

    while True:
        # only a truly empty result means the input is over: stripping
        # lines before this check would stop at the first blank line
        line = r.readline()
        if not line:
            break

        pos = 1
        linenum += 1

        stripped = line.strip()
        if not stripped or stripped.startswith((b'//', b'#')):
            continue

        items += 1
        if items > 1:
            w.write(b',')

        # leave the line-feed out, so it isn't counted twice as a new line
        json0(w, BytesIO(line.rstrip()), lambda w: w.flush())

    w.write(b']\n')


def main() -> None:
    'Run the cmd-line app: exit codes are 0 on success, 1 on errors.'

    # handle standard help cmd-line options, quitting right away in that case
    if len(argv) > 1 and argv[1] in ('-h', '--h', '-help', '--help'):
        print(info.strip())
        exit(0)

    start_args = 1
    handle_input = handle_json
    if len(argv) > 1 and argv[1] in ('-jl', '--jl', '-jsonl', '--jsonl'):
        start_args = 2
        handle_input = handle_json_lines

    if len(argv) - 1 > start_args:
        print('multiple inputs not allowed', file=stderr)
        exit(1)

    w = stdout.buffer
    name = argv[start_args] if len(argv) > start_args else '-'

    try:
        if name == '-':
            handle_input(w, stdin.buffer)
        elif seems_url(name):
            from urllib.request import urlopen
            with urlopen(name) as inp:
                handle_input(w, inp)
        else:
            with open(name, mode='rb') as inp:
                handle_input(w, inp)
    except BrokenPipeError:
        # quit quietly, instead of showing a confusing error message
        stderr.close()
        exit(0)
    except KeyboardInterrupt:
        exit(2)
    except Exception as e:
        stdout.write('\n')
        print(f'line {linenum}, pos {pos} : {e}', file=stderr)
        exit(1)


if __name__ == '__main__':
    main()