#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2024 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from io import BufferedReader, BytesIO
from sys import argv, exit, stderr, stdin, stdout


info = '''
j0 [filepath/URI...]

Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.

Besides minimizing bytes, this tool also adapts almost-JSON input into valid
JSON, since it ignores comments and trailing commas, neither of which are
supported in JSON, but which are still commonly used.

It also turns single-quoted strings into proper double-quoted ones, as well
as change invalid 2-digit `\\x` hexadecimal escapes into JSON's 4-digit `\\u`
hexadecimal escapes. When backslashes in strings are followed by an invalid
escape letter, the backslash is ignored.

Output is always a single line of valid JSON, ending with a line-feed.
'''


# values keeping track of the input-position, shown in case of errors
pos = 1
linenum = 1


# note: using regexes doesn't seem to speed-up number/string-handling


def read(r, size: int) -> bytes:
    '''
    Read up to `size` bytes from reader `r`, updating the global line/column
    counters used for error messages. Returns the bytes read, which are
    empty when the input is over.
    '''

    global pos, linenum

    chunk = r.read(size)
    if not chunk:
        return chunk

    # fast path: no line-feeds means only the column advances
    if 10 not in chunk:
        pos += len(chunk)
        return chunk

    for b in chunk:
        if b == 10:  # ord('\n')
            pos = 1
            linenum += 1
        else:
            pos += 1

    return chunk


def skip_byte(r) -> None:
    'Consume a single byte (if any is left), updating the input position.'

    global pos, linenum

    chunk = r.read(1)
    if not chunk:
        return

    if chunk[0] == 10:  # ord('\n')
        pos = 1
        linenum += 1
    else:
        pos += 1


def peek_byte(r) -> int:
    'Return the next byte without consuming it, or -1 when input is over.'

    chunk = r.peek(64)
    if len(chunk) > 0:
        return chunk[0]
    return -1


def handle_array(w, r) -> None:
    '''
    Copy a whole array from `r` to `w`, starting at its opening bracket;
    trailing commas are dropped. Raises ValueError on missing commas, or
    when input ends before the closing bracket.
    '''

    seek_next = seek_next_token

    n = 0
    skip_byte(r)
    w.write(b'[')

    while True:
        # whitespace/comments may precede the next item/comma
        seek_next(r)
        b = peek_byte(r)
        if b < 0:
            raise ValueError('unexpected end of input data, before "]"')

        comma = b == 44  # ord(',')

        if comma:
            skip_byte(r)
            # whitespace/comments may follow the comma
            seek_next(r)
            b = peek_byte(r)
            if b < 0:
                raise ValueError('unexpected end of input data, before "]"')

        if b == 93:  # ord(']')
            skip_byte(r)
            w.write(b']')
            return

        if n > 0:
            if not comma:
                raise ValueError('missing a comma between array values')
            w.write(b',')

        # b can't be negative at this point; invalid leading bytes (even
        # a null byte) are reported by the dispatch table's fallback
        handlers[b](w, r)
        n += 1


def handle_double_quoted_string(w, r) -> None:
    'Copy a double-quoted string, starting at its opening quote.'

    skip_byte(r)
    w.write(b'"')
    handle_inner_string(w, r, 34)  # ord('"')
    w.write(b'"')


def handle_exponent(w, r) -> None:
    '''
    Copy the optional exponent of a number, dropping any redundant plus
    sign. Raises ValueError when the exponent has no digits.
    '''

    b = peek_byte(r)
    if b != 101 and b != 69:  # ord('e'), ord('E')
        return

    skip_byte(r)
    w.write(b'e' if b == 101 else b'E')
    b = peek_byte(r)
    if b == 43:  # ord('+')
        skip_byte(r)
    elif b == 45:  # ord('-')
        w.write(b'-')
        skip_byte(r)
    if copy_digits(w, r) < 1:
        raise ValueError('expected numeric digits, but found none')


def handle_dot(w, r) -> None:
    '''
    Copy a number starting with a decimal dot, emitting a 0 before the dot.
    Raises ValueError when no digits follow the dot.
    '''

    skip_byte(r)
    # precede the leading decimal dot with a 0
    w.write(b'0.')

    # handle decimals, which in this case aren't optional, as a leading
    # dot is what led to this point
    if copy_digits(w, r) < 1:
        raise ValueError('expected numeric digits, but found none')

    handle_exponent(w, r)


def handle_false(w, r) -> None:
    'Copy the keyword `false`, or raise ValueError when it is misspelled.'

    demand(r, b'false')
    w.write(b'false')


def handle_invalid(w, r) -> None:
    'Always raise ValueError, describing the byte no JSON value starts with.'

    b = peek_byte(r)
    if b < 0:
        raise ValueError('unexpected end of input data')
    if 32 < b <= 126:
        msg = f'unexpected symbol {chr(b)}'
    else:
        msg = f'unexpected byte-value {b}'
    raise ValueError(msg)


def handle_negative(w, r) -> None:
    'Copy a negative number, starting at its minus sign.'

    skip_byte(r)
    w.write(b'-')

    if peek_byte(r) == 46:  # ord('.')
        handle_dot(w, r)
    else:
        handle_number(w, r)


def handle_null(w, r) -> None:
    'Copy the keyword `null`, or raise ValueError when it is misspelled.'

    demand(r, b'null')
    w.write(b'null')


def handle_number(w, r) -> None:
    '''
    Copy an unsigned number starting with a digit: redundant leading zeros
    are dropped, and a trailing decimal dot is followed by a 0, so that
    the result is always a valid JSON number. Raises ValueError when
    expected digits are missing.
    '''

    # skip leading zeros, since JSON doesn't allow redundant ones
    zeros = 0
    while peek_byte(r) == 48:  # ord('0')
        skip_byte(r)
        zeros += 1

    # handle integer part
    if copy_digits(w, r) < 1:
        if zeros < 1:
            raise ValueError('expected numeric digits, but found none')
        # the integer part was made only of zeros
        w.write(b'0')

    # handle optional decimals
    if peek_byte(r) == 46:  # ord('.')
        skip_byte(r)
        w.write(b'.')
        if copy_digits(w, r) < 1:
            # follow a trailing decimal dot with a 0
            w.write(b'0')

    # handle optional exponent, which may follow decimals
    handle_exponent(w, r)


def handle_object(w, r) -> None:
    '''
    Copy a whole object from `r` to `w`, starting at its opening brace;
    trailing commas are dropped. Raises ValueError on missing commas/colons,
    non-string keys, or when input ends before the closing brace.
    '''

    seek_next = seek_next_token

    num_pairs = 0
    skip_byte(r)
    w.write(b'{')

    while True:
        # whitespace/comments may precede the next item/comma
        seek_next(r)
        b = peek_byte(r)
        if b < 0:
            raise ValueError('unexpected end of input data, before "}"')

        comma = b == 44  # ord(',')

        if comma:
            skip_byte(r)
            # whitespace/comments may follow the comma
            seek_next(r)
            b = peek_byte(r)
            if b < 0:
                raise ValueError('unexpected end of input data, before "}"')

        if b == 125:  # ord('}')
            skip_byte(r)
            w.write(b'}')
            return

        if num_pairs > 0:
            if not comma:
                raise ValueError('missing a comma between key-value pairs')
            w.write(b',')

        demand_string(w, r)
        # whitespace/comments may follow the key
        seek_next(r)
        demand(r, b':')
        w.write(b':')
        # whitespace/comments may follow the colon
        seek_next(r)
        b = peek_byte(r)
        if b < 0:
            raise ValueError('unexpected end of input data, before "}"')
        handlers[b](w, r)
        num_pairs += 1


def handle_positive(w, r) -> None:
    'Copy a number starting with a plus sign, which is dropped.'

    # do nothing with the leading plus sign: strictly-speaking, JSON numbers
    # can't start with a positive sign, and this tool's output is supposed
    # to be `JSON-0` (minimized) anyway
    skip_byte(r)

    if peek_byte(r) == 46:  # ord('.')
        handle_dot(w, r)
    else:
        handle_number(w, r)


def handle_single_quoted_string(w, r) -> None:
    'Copy a single-quoted string as a double-quoted one.'

    skip_byte(r)
    w.write(b'"')
    handle_inner_string(w, r, 39)  # ord('\'')
    w.write(b'"')


def demand_string(w, r) -> None:
    'Copy a quoted string, or raise ValueError if the next token is not one.'

    quote = peek_byte(r)
    if quote < 0:
        msg = 'unexpected end of input, instead of a string quote'
        raise ValueError(msg)

    if quote == 34:  # ord('"')
        handle_double_quoted_string(w, r)
        return

    if quote == 39:  # ord('\'')
        handle_single_quoted_string(w, r)
        return

    if 32 < quote <= 126:  # ord(' '), ord('~')
        msg = f'expected ", or even \', but got "{chr(quote)}" instead'
    else:
        msg = f'expected ", or even \', but got byte "{quote}" instead'
    raise ValueError(msg)


# short escapes JSON offers for some of the control bytes
short_escapes = {8: b'\\b', 9: b'\\t', 10: b'\\n', 12: b'\\f', 13: b'\\r'}

# all byte-values which are valid hexadecimal digits
hex_digits = frozenset(b'0123456789abcdefABCDEF')


def emit_string_byte(w, b: int) -> None:
    '''
    Emit a string-content byte, escaping it when it would otherwise make
    the double-quoted output string invalid JSON.
    '''

    if b == 34:  # ord('"'), which can show up unescaped in '...' strings
        w.write(b'\\"')
    elif b < 32:
        # JSON strings can't have raw control bytes in them
        w.write(short_escapes.get(b) or b'\\u%04x' % b)
    else:
        w.write(bytes((b,)))


def handle_inner_string(w, r, quote: int) -> None:
    '''
    Copy string contents up to (and consuming) the closing `quote` byte,
    which isn't emitted: 2-digit `\\x` escapes become 4-digit `\\u` ones,
    backslashes before invalid escape letters are dropped, and bytes which
    can't appear raw in JSON strings are escaped. Raises ValueError on
    invalid hexadecimal digits, or when input ends inside the string.
    '''

    bad_hex_msg = 'invalid hexadecimal symbols'
    early_end_msg = 'input data ended while still in quoted string'

    esc = False

    while True:
        b = peek_byte(r)
        if b < 0:
            raise ValueError(early_end_msg)
        skip_byte(r)

        if esc:
            esc = False

            if b == 120:  # ord('x')
                digits = read(r, 2)
                if len(digits) != 2:
                    raise ValueError(early_end_msg)
                if not all(d in hex_digits for d in digits):
                    raise ValueError(bad_hex_msg)
                w.write(b'\\u00' + digits.lower())
                continue

            if b == 117:  # ord('u')
                digits = read(r, 4)
                if len(digits) != 4:
                    raise ValueError(early_end_msg)
                if not all(d in hex_digits for d in digits):
                    raise ValueError(bad_hex_msg)
                w.write(b'\\u' + digits)
                continue

            # numbers for '"', '\\', 'n', 't', 'r', 'b', and 'f'
            if b in (34, 92, 110, 116, 114, 98, 102):
                w.write(bytes((92, b)))
                continue

            # invalid escape: ignore the backslash, keeping what follows it
            emit_string_byte(w, b)
            continue

        if b == 92:  # ord('\\')
            esc = True
            continue

        if b == quote:
            return

        # emit normal string-byte
        emit_string_byte(w, b)


def handle_true(w, r) -> None:
    'Copy the keyword `true`, or raise ValueError when it is misspelled.'

    demand(r, b'true')
    w.write(b'true')


# setup byte-handling lookup tuple
bh = [handle_invalid] * 256
for digit in b'0123456789':
    bh[digit] = handle_number
bh[ord('+')] = handle_positive
bh[ord('-')] = handle_negative
bh[ord('.')] = handle_dot
bh[ord('"')] = handle_double_quoted_string
bh[ord('\'')] = handle_single_quoted_string
bh[ord('f')] = handle_false
bh[ord('n')] = handle_null
bh[ord('t')] = handle_true
bh[ord('[')] = handle_array
bh[ord('{')] = handle_object

# handlers is the immutable byte-driven func-dispatch table
handlers = tuple(bh)


def copy_digits(w, r) -> int:
    'Returns how many digits were copied/handled.'

    copied = 0
    while True:
        # peek may give fewer or more bytes than asked for, which is fine,
        # since only the bytes checked are later read/consumed
        chunk = r.peek(64)
        if len(chunk) == 0:
            return copied

        i = find_digits_end_index(chunk)
        if i >= 0:
            w.write(read(r, i))
            copied += i
            return copied

        # the whole chunk is digits, and more may follow
        w.write(chunk)
        read(r, len(chunk))
        copied += len(chunk)


def seek_next_token(r) -> None:
    'Skip an arbitrarily-long mix of whitespace and comments.'

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            # input is over, and this func doesn't consider that an error
            return

        comment = False

        for i, b in enumerate(chunk):
            # skip tab, line-feed, vertical-tab, form-feed, carriage-return,
            # or space
            if b in (9, 10, 11, 12, 13, 32):
                continue

            if b == 47:  # ord('/')
                read(r, i)
                demand_comment(r)
                comment = True
                break

            # found start of next token
            read(r, i)
            return

        if not comment:
            read(r, len(chunk))


def skip_line(r) -> None:
    'Skip the rest of the current line, including its line-feed, if any.'

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            return

        i = chunk.find(b'\n')
        if i >= 0:
            read(r, i + 1)
            return

        read(r, len(chunk))


def skip_general_comment(r) -> None:
    '''
    Skip the rest of a general comment, including its closing `*/`: the
    opening `/*` is expected to be already consumed. Raises ValueError when
    input ends before the comment does.
    '''

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            raise ValueError('input data ended before an expected */')

        i = chunk.find(b'*')
        if i < 0:
            # no */ in this chunk, so skip it and try with the next one
            read(r, len(chunk))
            continue

        # skip right past the * just found, then check if a / follows it
        read(r, i + 1)
        if peek_byte(r) == 47:  # ord('/')
            # got */, the end of this comment
            skip_byte(r)
            return


def find_digits_end_index(chunk: bytes) -> int:
    '''
    Return the index of the first non-digit byte in the chunk given, or -1
    when all its bytes (if any) are digits.
    '''

    i = 0
    for b in chunk:
        if 48 <= b <= 57:  # ord('0'), ord('9')
            i += 1
        else:
            return i

    # all bytes (if any) were digits, so no end was found
    return -1


def demand(r, what: bytes) -> None:
    'Consume exactly the bytes given, or raise ValueError on a mismatch.'

    lead = read(r, len(what))
    if lead != what:
        # the mismatching bytes may not be valid UTF-8 at all
        got = str(lead, encoding='utf-8', errors='replace')
        expected = str(what, encoding='utf-8', errors='replace')
        raise ValueError(f'expected {expected}, but got {got} instead')


def demand_comment(r) -> None:
    '''
    Skip a whole comment, either `//` style or `/* */` style, starting at
    its leading slash. Raises ValueError when what follows that slash
    doesn't make a comment.
    '''

    demand(r, b'/')
    b = peek_byte(r)
    if b < 0:
        raise ValueError('unexpected end of input data')

    if b == 47:  # ord('/')
        # handle single-line comment
        skip_line(r)
        return

    if b == 42:  # ord('*')
        # handle (potentially) multi-line comment: skipping the opening *
        # first ensures it's not mistaken for part of the closing */
        skip_byte(r)
        skip_general_comment(r)
        return

    raise ValueError('expected * or another /, after a /')


def json0(w, src, end) -> None:
    '''
    Convert a single JSON/pseudo-JSON value from binary source `src` into
    minimal JSON written to `w`, calling `end(w)` right after the value.
    Raises ValueError on empty(ish)/invalid input, or on trailing data.
    '''

    r = BufferedReader(src)

    # skip leading UTF-8 BOM (byte-order mark): peek can return more bytes
    # than asked for, so only the first 3 bytes are compared
    if r.peek(3)[:3] == b'\xef\xbb\xbf':
        read(r, 3)

    # skip leading whitespace/comments
    seek_next_token(r)

    # emit a single output line, ending with a line-feed
    b = peek_byte(r)
    if b >= 0:
        handlers[b](w, r)
    else:
        # treat empty(ish) input as invalid JSON
        raise ValueError('can\'t turn empty(ish) input into JSON')
    end(w)

    # check against trailing non-whitespace/non-comment bytes
    seek_next_token(r)
    if len(r.peek(1)) > 0:
        raise ValueError('unexpected trailing bytes in JSON data')


def seems_url(s: str) -> bool:
    'Check whether the string given starts with a supported URI protocol.'

    protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return s.startswith(protocols)


def handle_json(w, r) -> None:
    'Convert a single JSON value into a line of minimal JSON.'

    json0(w, r, lambda w: w.write(b'\n'))


def handle_json_lines(w, r) -> None:
    '''
    Convert JSON Lines input (one value per line) into a single-line JSON
    array; empty(ish) lines and lines starting with `//` are ignored.
    '''

    global pos, linenum

    items = 0
    lines_read = 0
    w.write(b'[')

    while True:
        line = r.readline()
        if not line:
            # only a truly empty result means the input is over: blank
            # lines still have their line-feeds in them
            break

        lines_read += 1
        pos = 1
        linenum = lines_read

        stripped = line.strip()
        if not stripped or stripped.startswith(b'//'):
            continue

        items += 1
        if items > 1:
            w.write(b',')

        # leave the trailing line-feed out, so it isn't counted again as
        # a new line by the position-tracking reads
        json0(w, BytesIO(line.rstrip()), lambda w: None)

    w.write(b']\n')


def main() -> None:
    'Run the command-line app, using its cmd-line arguments.'

    # handle standard help cmd-line options, quitting right away in that case
    if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
        print(info.strip())
        exit(0)

    start_args = 1
    handle_input = handle_json
    if len(argv) > 1 and argv[1] in ('-jl', '--jl', '-jsonl', '--jsonl'):
        start_args = 2
        handle_input = handle_json_lines

    if len(argv) - 1 > start_args:
        print('\x1b[31mmultiple inputs not allowed\x1b[0m', file=stderr)
        exit(1)

    w = stdout.buffer
    name = argv[start_args] if len(argv) > start_args else '-'

    try:
        if name == '-':
            handle_input(w, stdin.buffer)
        elif seems_url(name):
            from urllib.request import urlopen
            with urlopen(name) as inp:
                handle_input(w, inp)
        else:
            with open(name, mode='rb') as inp:
                handle_input(w, inp)
    except BrokenPipeError:
        # quit quietly, instead of showing a confusing error message
        stderr.close()
    except KeyboardInterrupt:
        exit(2)
    except Exception as e:
        stdout.write('\n')
        stdout.flush()
        print(f'\x1b[31mline {linenum}, pos {pos} : {e}\x1b[0m', file=stderr)
        exit(1)


if __name__ == '__main__':
    main()