#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2020-2025 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from io import BufferedReader, BytesIO
from sys import argv, exit, stderr, stdin, stdout


info = '''
j0 [filepath/URI...]

Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.

Besides minimizing bytes, this tool also adapts almost-JSON input into valid
JSON, since it ignores comments and trailing commas, neither of which are
supported in JSON, but which are still commonly used.

It also turns single-quoted strings into proper double-quoted ones, as well
as change invalid 2-digit `\\x` hexadecimal escapes into JSON's 4-digit `\\u`
hexadecimal escapes. When backslashes in strings are followed by an invalid
escape letter, the backslash is ignored.

Output is always a single line of valid JSON, ending with a line-feed.
'''

# handle standard help cmd-line options, quitting right away in that case
if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
    print(info.strip())
    exit(0)


# note: using regexes doesn't seem to speed-up number/string-handling


# input-position counters, updated by funcs `read` and `skip_byte`: these
# are given starting values here, so the funcs below never depend on later
# code having defined them first
pos = 1
linenum = 1


def read(r, size: int) -> bytes:
    '''
    Read up to `size` bytes from reader `r`, updating the global input
    position (`pos` and `linenum`) to account for the bytes consumed.
    Returns the bytes read, which are empty when the input is over.
    '''

    global pos, linenum

    chunk = r.read(size)
    if not chunk:
        return chunk

    # fast path: no line-feeds means only the column changes
    if not (10 in chunk):
        pos += len(chunk)
        return chunk

    for b in chunk:
        if b == 10:
            pos = 1
            linenum += 1
        else:
            pos += 1
    return chunk


def skip_byte(r) -> None:
    'Consume a single input byte, if available, updating the position.'

    global pos, linenum

    chunk = r.read(1)
    if not chunk:
        return

    if chunk[0] == 10:
        pos = 1
        linenum += 1
    else:
        pos += 1


def peek_byte(r) -> int:
    'Return the next input byte without consuming it, or -1 at the end.'

    chunk = r.peek(64)
    if len(chunk) > 0:
        return chunk[0]
    return -1


def handle_array(w, r) -> None:
    '''
    Copy a whole array from `r` into `w`, dropping whitespace, comments,
    and trailing commas. Raises ValueError when the input ends early, or
    when consecutive values have no comma between them.
    '''

    seek_next = seek_next_token

    n = 0
    skip_byte(r)
    w.write(b'[')

    while True:
        # whitespace/comments may precede the next item/comma
        seek_next(r)
        b = peek_byte(r)
        if b < 0:
            raise ValueError('unexpected end of input data, before "]"')

        comma = b == 44  # ord(',')

        if comma:
            skip_byte(r)
            # whitespace/comments may follow the comma
            seek_next(r)
            b = peek_byte(r)
            if b < 0:
                raise ValueError('unexpected end of input data, before "]"')

        if b == 93:  # ord(']')
            skip_byte(r)
            w.write(b']')
            return

        if n > 0:
            if not comma:
                raise ValueError('missing a comma between array values')
            w.write(b',')

        # b can't be negative at this point: even a 0 byte is dispatched,
        # so it's reported as the invalid byte it is
        handlers[b](w, r)
        n += 1


def handle_double_quoted_string(w, r) -> None:
    'Copy a double-quoted string, fixing its escapes along the way.'

    skip_byte(r)
    w.write(b'"')
    handle_inner_string(w, r, 34)  # ord('"')
    w.write(b'"')


def handle_dot(w, r) -> None:
    'Handle a number starting with a decimal dot, which JSON forbids.'

    skip_byte(r)
    # precede the leading decimal dot with a 0
    w.write(b'0.')

    # handle decimals, which in this case aren't optional, as a leading
    # dot is what led to this point
    if copy_digits(w, r) < 1:
        raise ValueError('expected numeric digits, but found none')

    copy_exponent(w, r)


def handle_false(w, r) -> None:
    demand(r, b'false')
    w.write(b'false')


def handle_False(w, r) -> None:
    # turn the python-style keyword into its JSON equivalent
    demand(r, b'False')
    w.write(b'false')


def handle_invalid(w, r) -> None:
    'Always raise a ValueError describing the unexpected next byte.'

    b = peek_byte(r)
    if b < 0:
        raise ValueError('unexpected end of input data')
    # raise ValueError(f'unexpected JSON byte-value {b}')
    if 32 < b <= 126:
        msg = f'unexpected symbol {chr(b)}'
    else:
        msg = f'unexpected byte-value {b}'
    raise ValueError(msg)


def handle_negative(w, r) -> None:
    'Handle a number starting with a minus sign.'

    skip_byte(r)
    w.write(b'-')

    if peek_byte(r) == 46:  # ord('.')
        handle_dot(w, r)
    else:
        handle_number(w, r)


def handle_null(w, r) -> None:
    demand(r, b'null')
    w.write(b'null')


def handle_None(w, r) -> None:
    # turn the python-style keyword into its JSON equivalent
    demand(r, b'None')
    w.write(b'null')


def handle_number(w, r) -> None:
    '''
    Copy an unsigned number: integer digits, optional decimals, and an
    optional exponent. Raises ValueError when expected digits are missing.
    '''

    # handle integer part
    if copy_digits(w, r) < 1:
        raise ValueError('expected numeric digits, but found none')

    # handle optional decimals
    if peek_byte(r) == 46:  # ord('.')
        skip_byte(r)
        w.write(b'.')
        if copy_digits(w, r) < 1:
            # follow a trailing decimal dot with a 0
            w.write(b'0')

    # the exponent check peeks the input again, since any decimals just
    # handled have moved the input past the byte checked above
    copy_exponent(w, r)


def copy_exponent(w, r) -> None:
    '''
    Copy an optional exponent, if one comes next in the input: a leading
    plus sign is dropped, as it's redundant. Raises ValueError when no
    digits follow the exponent letter/sign.
    '''

    b = peek_byte(r)
    if b != 101 and b != 69:  # ord('e'), ord('E')
        return

    skip_byte(r)
    w.write(b'e' if b == 101 else b'E')
    b = peek_byte(r)
    if b == 43:  # ord('+')
        skip_byte(r)
    elif b == 45:  # ord('-')
        w.write(b'-')
        skip_byte(r)
    if copy_digits(w, r) < 1:
        raise ValueError('expected numeric digits, but found none')


def handle_object(w, r) -> None:
    '''
    Copy a whole object from `r` into `w`, dropping whitespace, comments,
    and trailing commas. Keys must be quoted strings. Raises ValueError
    when the input ends early, or when pairs have no comma between them.
    '''

    seek_next = seek_next_token

    num_pairs = 0
    skip_byte(r)
    w.write(b'{')

    while True:
        # whitespace/comments may precede the next item/comma
        seek_next(r)
        b = peek_byte(r)
        if b < 0:
            raise ValueError('unexpected end of input data, before "}"')

        comma = b == 44  # ord(',')

        if comma:
            skip_byte(r)
            # whitespace/comments may follow the comma
            seek_next(r)
            b = peek_byte(r)
            if b < 0:
                raise ValueError('unexpected end of input data, before "}"')

        if b == 125:  # ord('}')
            skip_byte(r)
            w.write(b'}')
            return

        if num_pairs > 0:
            if not comma:
                raise ValueError('missing a comma between key-value pairs')
            w.write(b',')

        demand_string(w, r)
        # whitespace/comments may follow the key
        seek_next(r)
        demand(r, b':')
        w.write(b':')
        # whitespace/comments may follow the colon
        seek_next(r)
        b = peek_byte(r)
        if b < 0:
            raise ValueError('unexpected end of input data, before "}"')
        handlers[b](w, r)
        num_pairs += 1


def handle_positive(w, r) -> None:
    'Handle a number starting with a plus sign.'

    # do nothing with the leading plus sign, which isn't allowed in JSON
    skip_byte(r)

    if peek_byte(r) == 46:  # ord('.')
        handle_dot(w, r)
    else:
        handle_number(w, r)


def handle_single_quoted_string(w, r) -> None:
    'Copy a single-quoted string as a proper double-quoted one.'

    skip_byte(r)
    w.write(b'"')
    handle_inner_string(w, r, 39)  # ord('\'')
    w.write(b'"')


def demand_string(w, r) -> None:
    'Copy a quoted string, raising ValueError if none comes next.'

    quote = peek_byte(r)
    if quote < 0:
        msg = 'unexpected end of input, instead of a string quote'
        raise ValueError(msg)

    if quote == 34:  # ord('"')
        handle_double_quoted_string(w, r)
        return

    if quote == 39:  # ord('\'')
        handle_single_quoted_string(w, r)
        return

    if 32 < quote <= 126:  # ord(' '), ord('~')
        msg = f'expected ", or even \', but got "{chr(quote)}" instead'
    else:
        msg = f'expected ", or even \', but got byte "{quote}" instead'
    raise ValueError(msg)


def handle_inner_string(w, r, quote: int) -> None:
    '''
    Copy string contents up to the closing `quote` byte, which is consumed
    but not emitted, as callers emit the surrounding double-quotes.

    Escapes are normalized: 2-digit `\\x` ones become 4-digit `\\u` ones,
    backslashes before invalid escape letters are dropped, and both bare
    double-quotes and control bytes are escaped, since JSON strings can't
    have those unescaped. Raises ValueError on early end of input, or on
    invalid hexadecimal digits.
    '''

    esc = False
    bad_hex_msg = 'invalid hexadecimal symbols'
    early_end_msg = 'input data ended while still in quoted string'

    def is_hex(x: int) -> bool:
        # 48 is ord('0'), 57 is ord('9'), 97 is ord('a'), 102 is ord('f')
        return 48 <= x <= 57 or 97 <= x <= 102

    def lower(x: int) -> int:
        # 65 is ord('A'), 90 is ord('Z')
        return x + 32 if 65 <= x <= 90 else x

    while True:
        chunk = r.peek(1)
        if len(chunk) < 1:
            raise ValueError(early_end_msg)
        b = chunk[0]

        if esc:
            esc = False

            if b == 120:  # ord('x')
                skip_byte(r)
                chunk = read(r, 2)
                if len(chunk) != 2:
                    raise ValueError(early_end_msg)
                hi = lower(chunk[0])
                lo = lower(chunk[1])
                if not (is_hex(hi) and is_hex(lo)):
                    raise ValueError(bad_hex_msg)
                w.write(b'\\u00')
                # writers need bytes, not the ints which indexing gives
                w.write(bytes((hi, lo)))
                continue

            if b == 117:  # ord('u')
                skip_byte(r)
                chunk = read(r, 4)
                if len(chunk) != 4:
                    raise ValueError(early_end_msg)
                if not all(is_hex(lower(x)) for x in chunk):
                    raise ValueError(bad_hex_msg)
                w.write(b'\\u')
                w.write(chunk)
                continue

            # numbers for '"', '\\', 'n', 't', 'r', 'b', and 'f'
            if b in (34, 92, 110, 116, 114, 98, 102):
                w.write(b'\\')
                w.write(read(r, 1))
                continue

            # invalid escape: ignore the backslash, but still avoid
            # emitting unescaped control bytes
            if b < 32:
                skip_byte(r)
                w.write(b'\\u%04x' % b)
            else:
                w.write(read(r, 1))
            continue

        if b == 92:  # ord('\\')
            esc = True
            skip_byte(r)
            continue

        if b == quote:
            skip_byte(r)
            return

        if b == 34:  # ord('"')
            # bare double-quotes in single-quoted strings need escaping,
            # since output strings are always double-quoted
            skip_byte(r)
            w.write(b'\\"')
            continue

        if b < 32:
            # control bytes (tabs, line-feeds, ...) aren't valid in JSON
            # strings, and line-feeds would break single-line output
            skip_byte(r)
            w.write(b'\\u%04x' % b)
            continue

        # emit normal string-byte
        w.write(read(r, 1))


def handle_true(w, r) -> None:
    demand(r, b'true')
    w.write(b'true')


def handle_True(w, r) -> None:
    # turn the python-style keyword into its JSON equivalent
    demand(r, b'True')
    w.write(b'true')


# setup byte-handling lookup tuple
bh = [handle_invalid for i in range(256)]
bh[ord('0')] = handle_number
bh[ord('1')] = handle_number
bh[ord('2')] = handle_number
bh[ord('3')] = handle_number
bh[ord('4')] = handle_number
bh[ord('5')] = handle_number
bh[ord('6')] = handle_number
bh[ord('7')] = handle_number
bh[ord('8')] = handle_number
bh[ord('9')] = handle_number
bh[ord('+')] = handle_positive
bh[ord('-')] = handle_negative
bh[ord('.')] = handle_dot
bh[ord('"')] = handle_double_quoted_string
bh[ord('\'')] = handle_single_quoted_string
bh[ord('F')] = handle_False
bh[ord('N')] = handle_None
bh[ord('T')] = handle_True
bh[ord('f')] = handle_false
bh[ord('n')] = handle_null
bh[ord('t')] = handle_true
bh[ord('[')] = handle_array
bh[ord('{')] = handle_object

# handlers is the immutable byte-driven func-dispatch table
handlers = tuple(bh)


def copy_digits(w, r) -> int:
    'Returns how many digits were copied/handled.'

    copied = 0
    while True:
        chunk = r.peek(64)
        if len(chunk) == 0:
            return copied

        i = find_digits_end_index(chunk)
        if i >= 0:
            w.write(read(r, i))
            copied += i
            return copied
        else:
            # the whole chunk is digits, so keep going with the next one
            w.write(chunk)
            read(r, len(chunk))
            copied += len(chunk)


def seek_next_token(r) -> None:
    'Skip an arbitrarily-long mix of whitespace and comments.'

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            # input is over, and this func doesn't consider that an error
            return

        comment = False

        for i, b in enumerate(chunk):
            # skip tab, line-feed, vertical-tab, form-feed,
            # carriage-return, or space
            if b in (9, 10, 11, 12, 13, 32):
                continue

            if b == 47:  # ord('/')
                read(r, i)
                demand_comment(r)
                comment = True
                break

            # found start of next token
            read(r, i)
            return

        if not comment:
            # the whole chunk was whitespace
            read(r, len(chunk))


def skip_line(r) -> None:
    'Skip all input up to and including the next line-feed, if any.'

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            return

        i = chunk.find(b'\n')
        if i >= 0:
            read(r, i + 1)
            return

        read(r, len(chunk))


def skip_general_comment(r) -> None:
    '''
    Skip input up to and including the next `*/`. Raises ValueError when
    the input ends before that.
    '''

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            raise ValueError('input data ended before an expected */')

        i = chunk.find(b'*')
        if i < 0:
            # no */ in this chunk, so skip it and try with the next one
            read(r, len(chunk))
            continue

        # skip right past the * just found, then check if a / follows it
        read(r, i + 1)
        if peek_byte(r) == 47:  # ord('/')
            # got */, the end of this comment
            skip_byte(r)
            return


def find_digits_end_index(chunk: bytes) -> int:
    'Return the index of the first non-digit byte, or -1 if all are digits.'

    i = 0
    for b in chunk:
        if 48 <= b <= 57:
            i += 1
        else:
            return i

    # all bytes (if any) were digits, so no end was found
    return -1


def demand(r, what: bytes) -> None:
    'Consume exactly the bytes given, raising ValueError on a mismatch.'

    lead = read(r, len(what))
    if lead != what:
        # input bytes may not be valid UTF-8, and a decoding failure
        # would hide the actual error message
        got = str(lead, encoding='utf-8', errors='replace')
        expected = str(what, encoding='utf-8')
        raise ValueError(f'expected {expected}, but got {got} instead')


def demand_comment(r) -> None:
    '''
    Skip a whole comment, either a `//` single-line one, or a `/* ... */`
    general one. Raises ValueError if the input has neither kind next.
    '''

    demand(r, b'/')
    b = peek_byte(r)
    if b < 0:
        raise ValueError('unexpected end of input data')

    if b == 47:  # ord('/')
        # handle single-line comment
        skip_line(r)
        return

    if b == 42:  # ord('*')
        # skip the opening asterisk, so it's never mistaken for the start
        # of the closing */, which would wrongly end comments like /*/
        skip_byte(r)
        # handle (potentially) multi-line comment
        skip_general_comment(r)
        return

    raise ValueError('expected * or another /, after a /')


def json0(w, src, end) -> None:
    '''
    Convert a single JSON/pseudo-JSON value from binary source `src` into
    minimal JSON written to `w`, calling `end(w)` right after the value is
    emitted. Raises ValueError on invalid/empty input, or when anything
    besides whitespace, comments, and a semicolon follows the value.
    '''

    r = BufferedReader(src)

    # skip leading UTF-8 BOM (byte-order mark): peeking can give more
    # bytes than asked for, so only the start of the result is checked
    if r.peek(3).startswith(b'\xef\xbb\xbf'):
        read(r, 3)

    # skip leading whitespace/comments
    seek_next_token(r)

    # emit a single output line, ending with a line-feed
    b = peek_byte(r)
    if b >= 0:
        handlers[b](w, r)
    else:
        # w.write(b'null')
        # treat empty(ish) input as invalid JSON
        raise ValueError('can\'t turn empty(ish) input into JSON')

    # deliberately run post-processing before checking for trailing-data
    # errors: for example, if post-proc func emits new line, errors will
    # show up on their separate line, which is nicer
    end(w)

    # ignore trailing whitespace/comment bytes, if present
    seek_next_token(r)

    # ignore trailing semicolon, if present
    b = peek_byte(r)
    if b == 59:  # ord(';')
        read(r, 1)
        # ignore trailing whitespace/comment bytes, if present
        seek_next_token(r)

    if len(r.peek(1)) > 0:
        raise ValueError('unexpected trailing bytes in JSON data')


def seems_url(s: str) -> bool:
    'Check whether a name starts with one of the supported URI schemes.'

    protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return any(s.startswith(p) for p in protocols)


def handle_json(w, r) -> None:
    'Convert a whole input into a single line of JSON.'

    def end(w) -> None:
        w.write(b'\n')
        w.flush()
    json0(w, r, end)


def handle_json_lines(w, r) -> None:
    '''
    Convert JSON-Lines input into a single-line JSON array, with an item
    for each input line, ignoring blank lines and `//` comment lines.
    '''

    global pos, linenum

    items = 0
    linenum = 0
    w.write(b'[')

    while True:
        raw = r.readline()
        if not raw:
            # only a truly empty result means the input is over, since
            # blank lines still have their line-feed at this point
            break

        pos = 1
        linenum += 1

        # drop the line-ending, so it's not counted as yet another line
        # by the position-tracking funcs; leading spaces are kept, so
        # positions in error messages match the actual input
        line = raw.rstrip(b'\r\n')

        stripped = line.strip()
        if not stripped or stripped.startswith(b'//'):
            continue

        items += 1
        if items > 1:
            w.write(b',')

        json0(w, BytesIO(line), lambda w: w.flush())

    w.write(b']\n')
626 627 628 start_args = 1 629 handle_input = handle_json 630 if len(argv) > 1 and argv[1] in ('-jl', '--jl', '-jsonl', '--jsonl'): 631 start_args = 2 632 handle_input = handle_json_lines 633 634 if len(argv) - 1 > start_args: 635 print(f'\x1b[31mmultiple inputs not allowed\x1b[0m', file=stderr) 636 exit(1) 637 638 w = stdout.buffer 639 name = argv[start_args] if len(argv) > start_args else '-' 640 641 # values keeping track of the input-position, shown in case of errors 642 pos = 1 643 linenum = 1 644 645 try: 646 if name == '-': 647 handle_input(w, stdin.buffer) 648 elif seems_url(name): 649 from urllib.request import urlopen 650 with urlopen(name) as inp: 651 handle_input(w, inp) 652 else: 653 with open(name, mode='rb') as inp: 654 handle_input(w, inp) 655 except BrokenPipeError: 656 # quit quietly, instead of showing a confusing error message 657 stderr.close() 658 exit(0) 659 except KeyboardInterrupt: 660 exit(2) 661 except Exception as e: 662 stdout.write('\n') 663 print(f'\x1b[31mline {linenum}, pos {pos} : {e}\x1b[0m', file=stderr) 664 exit(1)