#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2020-2025 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from io import BufferedReader, BytesIO
from sys import argv, exit, stderr, stdin, stdout


info = '''
j0 [filepath/URI...]

Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.

Besides minimizing bytes, this tool also adapts almost-JSON input into valid
JSON, since it ignores comments and trailing commas, neither of which are
supported in JSON, but which are still commonly used.

It also turns single-quoted strings into proper double-quoted ones, as well
as change invalid 2-digit `\\x` hexadecimal escapes into JSON's 4-digit `\\u`
hexadecimal escapes. When backslashes in strings are followed by an invalid
escape letter, the backslash is ignored.

Output is always a single line of valid JSON, ending with a line-feed.
'''


# values keeping track of the input-position, shown in case of errors
pos = 1
linenum = 1


# note: using regexes doesn't seem to speed-up number/string-handling


def read(r, size: int) -> bytes:
    '''
    Read up to `size` bytes from reader `r`, updating the global
    line-number/position counters used for error messages.
    '''

    global pos, linenum

    chunk = r.read(size)
    if not chunk:
        return chunk

    newlines = chunk.count(b'\n')
    if newlines == 0:
        pos += len(chunk)
        return chunk

    # the position restarts at 1 right after the last line-feed, and
    # grows by 1 for each byte following it
    linenum += newlines
    pos = len(chunk) - chunk.rfind(b'\n')
    return chunk


def skip_byte(r) -> None:
    'Consume a single byte (if any is left), keeping position-info updated.'
    read(r, 1)


def peek_byte(r) -> int:
    'Return the next byte-value without consuming it, or -1 on end of input.'

    chunk = r.peek(64)
    if len(chunk) > 0:
        return chunk[0]
    return -1


def handle_array(w, r) -> None:
    '''
    Copy an array from `r` to `w`, dropping whitespace, comments, and
    trailing commas. Raises ValueError on missing commas or early end of
    input.
    '''

    n = 0
    skip_byte(r)
    w.write(b'[')

    while True:
        # whitespace/comments may precede the next item/comma
        seek_next_token(r)
        b = peek_byte(r)
        if b < 0:
            raise ValueError('unexpected end of input data, before "]"')

        comma = b == 44  # ord(',')

        if comma:
            skip_byte(r)
            # whitespace/comments may follow the comma
            seek_next_token(r)
            b = peek_byte(r)
            if b < 0:
                raise ValueError('unexpected end of input data, before "]"')

        if b == 93:  # ord(']')
            skip_byte(r)
            w.write(b']')
            return

        if n > 0:
            if not comma:
                raise ValueError('missing a comma between array values')
            w.write(b',')

        # b is a valid byte-value here, since end-of-input was ruled out
        # above: even a NUL byte is dispatched, so it's reported as invalid
        handlers[b](w, r)
        n += 1


def handle_double_quoted_string(w, r) -> None:
    'Copy a double-quoted string, fixing its escapes along the way.'

    skip_byte(r)
    w.write(b'"')
    handle_inner_string(w, r, 34)  # ord('"')
    w.write(b'"')


def copy_exponent(w, r) -> None:
    '''
    Copy an optional exponent (e/E, optional sign, digits), dropping any
    plus sign. Raises ValueError when the exponent has no digits.
    '''

    b = peek_byte(r)
    if b != 101 and b != 69:  # ord('e'), ord('E')
        return

    skip_byte(r)
    w.write(b'e' if b == 101 else b'E')

    sign = peek_byte(r)
    if sign == 43:  # ord('+')
        skip_byte(r)
    elif sign == 45:  # ord('-')
        w.write(b'-')
        skip_byte(r)

    if copy_digits(w, r) < 1:
        raise ValueError('expected numeric digits, but found none')


def handle_dot(w, r) -> None:
    'Copy a number starting with a decimal dot, which JSON does not allow.'

    skip_byte(r)
    # precede the leading decimal dot with a 0
    w.write(b'0.')

    # handle decimals, which in this case aren't optional, as a leading
    # dot is what led to this point
    if copy_digits(w, r) < 1:
        raise ValueError('expected numeric digits, but found none')

    copy_exponent(w, r)


def handle_false(w, r) -> None:
    'Copy the keyword `false`, raising ValueError if anything else is found.'

    demand(r, b'false')
    w.write(b'false')


def handle_invalid(w, r) -> None:
    'Always raise ValueError, describing the unexpected byte found.'

    b = peek_byte(r)
    if b < 0:
        raise ValueError('unexpected end of input data')
    if 32 < b <= 126:
        msg = f'unexpected symbol {chr(b)}'
    else:
        msg = f'unexpected byte-value {b}'
    raise ValueError(msg)


def handle_negative(w, r) -> None:
    'Copy a negative number, which may start with a decimal dot.'

    skip_byte(r)
    w.write(b'-')

    if peek_byte(r) == 46:  # ord('.')
        handle_dot(w, r)
    else:
        handle_number(w, r)


def handle_null(w, r) -> None:
    'Copy the keyword `null`, raising ValueError if anything else is found.'

    demand(r, b'null')
    w.write(b'null')


def handle_number(w, r) -> None:
    '''
    Copy an unsigned number: integer digits, optional decimals, and an
    optional exponent. Raises ValueError when required digits are missing.
    '''

    # handle integer part
    if copy_digits(w, r) < 1:
        raise ValueError('expected numeric digits, but found none')

    # handle optional decimals
    if peek_byte(r) == 46:  # ord('.')
        skip_byte(r)
        w.write(b'.')
        if copy_digits(w, r) < 1:
            # follow a trailing decimal dot with a 0
            w.write(b'0')

    # handle optional exponent: this peeks the input again, so exponents
    # following decimals are also recognized
    copy_exponent(w, r)


def handle_object(w, r) -> None:
    '''
    Copy an object from `r` to `w`, dropping whitespace, comments, and
    trailing commas. Keys must be (single or double) quoted strings.
    Raises ValueError on malformed input.
    '''

    num_pairs = 0
    skip_byte(r)
    w.write(b'{')

    while True:
        # whitespace/comments may precede the next item/comma
        seek_next_token(r)
        b = peek_byte(r)
        if b < 0:
            raise ValueError('unexpected end of input data, before "}"')

        comma = b == 44  # ord(',')

        if comma:
            skip_byte(r)
            # whitespace/comments may follow the comma
            seek_next_token(r)
            b = peek_byte(r)
            if b < 0:
                raise ValueError('unexpected end of input data, before "}"')

        if b == 125:  # ord('}')
            skip_byte(r)
            w.write(b'}')
            return

        if num_pairs > 0:
            if not comma:
                raise ValueError('missing a comma between key-value pairs')
            w.write(b',')

        demand_string(w, r)
        # whitespace/comments may follow the key
        seek_next_token(r)
        demand(r, b':')
        w.write(b':')
        # whitespace/comments may follow the colon
        seek_next_token(r)
        b = peek_byte(r)
        if b < 0:
            raise ValueError('unexpected end of input data, before "}"')
        handlers[b](w, r)
        num_pairs += 1


def handle_positive(w, r) -> None:
    'Copy a number with a leading plus sign, dropping the sign.'

    # do nothing with the leading plus sign, which isn't allowed in JSON
    skip_byte(r)

    if peek_byte(r) == 46:  # ord('.')
        handle_dot(w, r)
    else:
        handle_number(w, r)


def handle_single_quoted_string(w, r) -> None:
    'Copy a single-quoted string as a proper double-quoted one.'

    skip_byte(r)
    w.write(b'"')
    handle_inner_string(w, r, 39)  # ord('\'')
    w.write(b'"')


def demand_string(w, r) -> None:
    'Copy a quoted string, raising ValueError if the input has none next.'

    quote = peek_byte(r)
    if quote < 0:
        msg = 'unexpected end of input, instead of a string quote'
        raise ValueError(msg)

    if quote == 34:  # ord('"')
        handle_double_quoted_string(w, r)
        return

    if quote == 39:  # ord('\'')
        handle_single_quoted_string(w, r)
        return

    if 32 < quote <= 126:  # ord(' '), ord('~')
        msg = f'expected ", or even \', but got "{chr(quote)}" instead'
    else:
        msg = f'expected ", or even \', but got byte "{quote}" instead'
    raise ValueError(msg)


def handle_inner_string(w, r, quote: int) -> None:
    '''
    Copy the inside of a string up to its closing `quote` byte, which is
    consumed but not emitted: the caller emits both output quotes.

    Along the way, 2-digit `\\x` escapes become 4-digit `\\u` ones, the
    backslash of an invalid escape is dropped, and unescaped double quotes
    and control bytes are escaped, since JSON doesn't allow them as they
    are. Raises ValueError on bad hex digits, or if input ends early.
    '''

    bad_hex_msg = 'invalid hexadecimal symbols'
    early_end_msg = 'input data ended while still in quoted string'

    def is_hex(x: int) -> bool:
        # ranges are for 0-9, a-f, and A-F
        return 48 <= x <= 57 or 97 <= x <= 102 or 65 <= x <= 70

    def read_hex(count: int) -> bytes:
        digits = read(r, count)
        if len(digits) != count:
            raise ValueError(early_end_msg)
        if not all(is_hex(x) for x in digits):
            raise ValueError(bad_hex_msg)
        return digits

    def copy_plain_byte(b: int) -> None:
        chunk = read(r, 1)
        if b == 34:  # ord('"')
            # only reachable as the payload of single-quoted strings
            w.write(b'\\"')
        elif b < 32:
            # JSON strings can't have unescaped control bytes
            w.write(b'\\u%04x' % b)
        else:
            w.write(chunk)

    esc = False

    while True:
        b = peek_byte(r)
        if b < 0:
            raise ValueError(early_end_msg)

        if esc:
            esc = False

            if b == 120:  # ord('x')
                skip_byte(r)
                w.write(b'\\u00' + read_hex(2).lower())
                continue

            if b == 117:  # ord('u')
                skip_byte(r)
                w.write(b'\\u' + read_hex(4))
                continue

            # numbers for '"', '\\', 'n', 't', 'r', 'b', and 'f'
            if b in (34, 92, 110, 116, 114, 98, 102):
                w.write(b'\\')
                w.write(read(r, 1))
                continue

            # invalid escape: the backslash is dropped, keeping what
            # follows it
            copy_plain_byte(b)
            continue

        if b == 92:  # ord('\\')
            esc = True
            skip_byte(r)
            continue

        if b == quote:
            skip_byte(r)
            return

        # emit normal string-byte
        copy_plain_byte(b)


def handle_true(w, r) -> None:
    'Copy the keyword `true`, raising ValueError if anything else is found.'

    demand(r, b'true')
    w.write(b'true')


# setup byte-handling lookup table, where all bytes which can't start a
# value are handled as errors
bh = [handle_invalid] * 256
for digit in b'0123456789':
    bh[digit] = handle_number
bh[ord('+')] = handle_positive
bh[ord('-')] = handle_negative
bh[ord('.')] = handle_dot
bh[ord('"')] = handle_double_quoted_string
bh[ord('\'')] = handle_single_quoted_string
bh[ord('f')] = handle_false
bh[ord('n')] = handle_null
bh[ord('t')] = handle_true
bh[ord('[')] = handle_array
bh[ord('{')] = handle_object

# handlers is the immutable byte-driven func-dispatch table
handlers = tuple(bh)


def copy_digits(w, r) -> int:
    'Returns how many digits were copied/handled.'

    copied = 0
    while True:
        chunk = r.peek(64)
        if len(chunk) == 0:
            return copied

        i = find_digits_end_index(chunk)
        if i >= 0:
            w.write(read(r, i))
            copied += i
            return copied

        # the whole chunk is digits, and more may follow
        w.write(chunk)
        read(r, len(chunk))
        copied += len(chunk)


def seek_next_token(r) -> None:
    'Skip an arbitrarily-long mix of whitespace and comments.'

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            # input is over, and this func doesn't consider that an error
            return

        comment = False

        for i, b in enumerate(chunk):
            # skip tab, line-feed, vertical-tab, form-feed,
            # carriage-return, or space
            if b in (9, 10, 11, 12, 13, 32):
                continue

            if b == 47:  # ord('/')
                read(r, i)
                demand_comment(r)
                comment = True
                break

            # found start of next token
            read(r, i)
            return

        if not comment:
            read(r, len(chunk))


def skip_line(r) -> None:
    'Skip all bytes up to and including the next line-feed, if any.'

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            return

        i = chunk.find(b'\n')
        if i >= 0:
            read(r, i + 1)
            return

        read(r, len(chunk))


def skip_general_comment(r) -> None:
    '''
    Skip the rest of a /* ... */ comment, including its closing */. The
    reader is expected to be right after the opening /* symbols. Raises
    ValueError if the input ends before the comment does.
    '''

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            raise ValueError('input data ended before an expected */')

        i = chunk.find(b'*')
        if i < 0:
            # no */ in this chunk, so skip it and try with the next one
            read(r, len(chunk))
            continue

        # skip right past the * just found, then check if a / follows it
        read(r, i + 1)
        if peek_byte(r) == 47:  # ord('/')
            # got */, the end of this comment
            skip_byte(r)
            return


def find_digits_end_index(chunk: bytes) -> int:
    '''
    Return the index of the first non-digit byte, or -1 when there's no
    such byte, which includes empty chunks.
    '''

    for i, b in enumerate(chunk):
        if not (48 <= b <= 57):  # ord('0'), ord('9')
            return i

    # all bytes (if any) were digits, so no end was found
    return -1


def demand(r, what: bytes) -> None:
    'Consume exactly the bytes given, raising ValueError on a mismatch.'

    lead = read(r, len(what))
    if lead != what:
        # input bytes may not be valid UTF-8, which shouldn't hide the
        # real error behind a decoding one
        lead = str(lead, encoding='utf-8', errors='replace')
        what = str(what, encoding='utf-8', errors='replace')
        raise ValueError(f'expected {what}, but got {lead} instead')


def demand_comment(r) -> None:
    'Skip a // or /* */ comment, raising ValueError if there is none next.'

    demand(r, b'/')
    b = peek_byte(r)
    if b < 0:
        raise ValueError('unexpected end of input data')

    if b == 47:  # ord('/')
        # handle single-line comment
        skip_line(r)
        return

    if b == 42:  # ord('*')
        # handle (potentially) multi-line comment: the opening * is
        # skipped first, so it can't be mistaken for part of a closing */
        skip_byte(r)
        skip_general_comment(r)
        return

    raise ValueError('expected * or another /, after a /')


def json0(w, src, end) -> None:
    '''
    Convert a single JSON-like value from binary source `src` into
    minimal JSON written to `w`, calling `end(w)` right after the value
    is emitted. Raises ValueError on invalid, empty, or trailing data.
    '''

    r = BufferedReader(src)

    # skip leading UTF-8 BOM (byte-order mark): peeking can return more
    # bytes than asked for, so an exact comparison isn't reliable
    if r.peek(3).startswith(b'\xef\xbb\xbf'):
        read(r, 3)

    # skip leading whitespace/comments
    seek_next_token(r)

    # emit a single output line, ending with a line-feed
    b = peek_byte(r)
    if b >= 0:
        handlers[b](w, r)
    else:
        # treat empty(ish) input as invalid JSON
        raise ValueError('can\'t turn empty(ish) input into JSON')

    # deliberately run post-processing before checking for trailing-data
    # errors: for example, if post-proc func emits new line, errors will
    # show up on their separate line, which is nicer
    end(w)

    # ignore trailing whitespace/comment bytes, if present
    seek_next_token(r)

    # ignore trailing semicolon, if present
    b = peek_byte(r)
    if b == 59:  # ord(';')
        read(r, 1)
        # ignore trailing whitespace/comment bytes, if present
        seek_next_token(r)

    if len(r.peek(1)) > 0:
        raise ValueError('unexpected trailing bytes in JSON data')


def seems_url(s: str) -> bool:
    'Check if the string given starts with one of the supported protocols.'

    protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return s.startswith(protocols)


def handle_json(w, r) -> None:
    'Convert a whole JSON-like input into a single line of JSON.'

    def end(w) -> None:
        w.write(b'\n')
        w.flush()
    json0(w, r, end)


def handle_json_lines(w, r) -> None:
    '''
    Convert JSON-Lines input into a single-line JSON array, with an item
    for each input line which isn't empty(ish) or a // comment.
    '''

    global pos, linenum

    items = 0
    lines_read = 0
    w.write(b'[')

    while True:
        # check for end of input before stripping anything, so empty(ish)
        # lines are skipped, instead of ending the loop early
        line = r.readline()
        if not line:
            break

        # the line-number comes from a local counter, so lines aren't
        # counted twice when their trailing line-feeds are read later
        lines_read += 1
        pos = 1
        linenum = lines_read

        stripped = line.strip()
        if not stripped or stripped.startswith(b'//'):
            continue

        items += 1
        if items > 1:
            w.write(b',')

        # keep leading whitespace, so error positions match the input
        json0(w, BytesIO(line.rstrip()), lambda w: w.flush())

    w.write(b']\n')


def main() -> None:
    'Run the command-line app.'

    # handle standard help cmd-line options, quitting right away in that case
    if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
        print(info.strip())
        exit(0)

    start_args = 1
    handle_input = handle_json
    if len(argv) > 1 and argv[1] in ('-jl', '--jl', '-jsonl', '--jsonl'):
        start_args = 2
        handle_input = handle_json_lines

    if len(argv) - 1 > start_args:
        print('\x1b[31mmultiple inputs not allowed\x1b[0m', file=stderr)
        exit(1)

    w = stdout.buffer
    name = argv[start_args] if len(argv) > start_args else '-'

    try:
        if name == '-':
            handle_input(w, stdin.buffer)
        elif seems_url(name):
            from urllib.request import urlopen
            with urlopen(name) as inp:
                handle_input(w, inp)
        else:
            with open(name, mode='rb') as inp:
                handle_input(w, inp)
    except BrokenPipeError:
        # quit quietly, instead of showing a confusing error message
        stderr.close()
        exit(0)
    except KeyboardInterrupt:
        exit(2)
    except Exception as e:
        stdout.write('\n')
        print(f'\x1b[31mline {linenum}, pos {pos} : {e}\x1b[0m', file=stderr)
        exit(1)


if __name__ == '__main__':
    main()