#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2024 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from io import BufferedReader, BytesIO
from sys import argv, exit, stderr, stdin, stdout


info = '''
j0 [filepath/URI...]

Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.

Besides minimizing bytes, this tool also adapts almost-JSON input into valid
JSON, since it ignores comments and trailing commas, neither of which are
supported in JSON, but which are still commonly used.

It also turns single-quoted strings into proper double-quoted ones, as well
as change invalid 2-digit `\\x` hexadecimal escapes into JSON's 4-digit `\\u`
hexadecimal escapes. When backslashes in strings are followed by an invalid
escape letter, the backslash is ignored.

Output is always a single line of valid JSON, ending with a line-feed.
'''


# values keeping track of the input-position, shown in case of errors
pos = 1
linenum = 1


# note: using regexes doesn't seem to speed-up number/string-handling


def read(r, size: int) -> bytes:
    '''
    Read up to `size` bytes from `r`, keeping the global position counters
    (`pos` and `linenum`) in sync with the bytes consumed.
    '''

    global pos, linenum

    chunk = r.read(size)
    if not chunk:
        return chunk

    newlines = chunk.count(b'\n')
    if newlines == 0:
        pos += len(chunk)
    else:
        linenum += newlines
        # the position restarts at 1 right after the last line-feed
        pos = len(chunk) - chunk.rfind(b'\n')
    return chunk


def skip_byte(r) -> None:
    'Consume (and discard) a single byte, if any is left.'
    read(r, 1)


def peek_byte(r) -> int:
    'Return the next byte without consuming it, or -1 when input is over.'
    chunk = r.peek(64)
    if len(chunk) > 0:
        return chunk[0]
    return -1


def handle_array(w, r) -> None:
    '''
    Copy an array, dropping whitespace, comments, and trailing commas.
    Raises ValueError on early end of input, or on missing commas.
    '''

    seek_next = seek_next_token

    n = 0
    skip_byte(r)
    w.write(b'[')

    while True:
        # whitespace/comments may precede the next item/comma
        seek_next(r)
        b = peek_byte(r)
        if b < 0:
            raise ValueError('unexpected end of input data, before "]"')

        comma = b == 44  # ord(',')

        if comma:
            skip_byte(r)
            # whitespace/comments may follow the comma
            seek_next(r)
            b = peek_byte(r)
            if b < 0:
                raise ValueError('unexpected end of input data, before "]"')

        if b == 93:  # ord(']')
            skip_byte(r)
            w.write(b']')
            return

        if n > 0:
            if not comma:
                raise ValueError('missing a comma between array values')
            w.write(b',')

        # b can't be negative here, and even a NUL byte must be dispatched,
        # so it's reported as the invalid byte it is
        handlers[b](w, r)
        n += 1


def handle_double_quoted_string(w, r) -> None:
    'Copy a double-quoted string, fixing its escapes along the way.'
    skip_byte(r)
    w.write(b'"')
    handle_inner_string(w, r, 34)  # ord('"')
    w.write(b'"')


def handle_dot(w, r) -> None:
    'Copy a number starting with a decimal dot, such as `.5`.'
    skip_byte(r)
    # precede the leading decimal dot with a 0
    w.write(b'0.')

    # handle decimals, which in this case aren't optional, as a leading
    # dot is what led to this point
    if copy_digits(w, r) < 1:
        raise ValueError('expected numeric digits, but found none')

    copy_exponent(w, r)


def handle_false(w, r) -> None:
    'Copy the keyword `false`.'
    demand(r, b'false')
    w.write(b'false')


def handle_invalid(w, r) -> None:
    'Always raise a ValueError describing the unexpected byte (or EOF).'
    b = peek_byte(r)
    if b < 0:
        raise ValueError('unexpected end of input data')
    if 32 < b <= 126:
        msg = f'unexpected symbol {chr(b)}'
    else:
        msg = f'unexpected byte-value {b}'
    raise ValueError(msg)


def handle_negative(w, r) -> None:
    'Copy a number starting with a minus sign.'
    skip_byte(r)
    w.write(b'-')

    if peek_byte(r) == 46:  # ord('.')
        handle_dot(w, r)
    else:
        handle_number(w, r)


def handle_null(w, r) -> None:
    'Copy the keyword `null`.'
    demand(r, b'null')
    w.write(b'null')


def handle_number(w, r) -> None:
    '''
    Copy an unsigned number: integer digits, optional decimals, and an
    optional exponent. Raises ValueError when required digits are missing.
    '''

    # handle integer part
    if copy_digits(w, r) < 1:
        raise ValueError('expected numeric digits, but found none')

    # handle optional decimals
    b = peek_byte(r)
    if b == 46:  # ord('.')
        skip_byte(r)
        w.write(b'.')
        if copy_digits(w, r) < 1:
            # follow a trailing decimal dot with a 0
            w.write(b'0')

    # exponents are valid JSON, so numbers like 1e5 must be accepted
    copy_exponent(w, r)


def copy_exponent(w, r) -> None:
    '''
    Copy an optional exponent (`e`/`E`, optional sign, digits), if one comes
    next. A redundant plus sign is dropped, since output is minimized.
    Raises ValueError when the exponent has no digits.
    '''

    if peek_byte(r) not in (69, 101):  # ord('E'), ord('e')
        return
    w.write(read(r, 1))

    sign = peek_byte(r)
    if sign == 45:  # ord('-')
        w.write(read(r, 1))
    elif sign == 43:  # ord('+')
        skip_byte(r)

    if copy_digits(w, r) < 1:
        raise ValueError('expected numeric digits, but found none')


def handle_object(w, r) -> None:
    '''
    Copy an object, dropping whitespace, comments, and trailing commas.
    Raises ValueError on early end of input, missing commas/colons, or
    keys which aren't quoted strings.
    '''

    seek_next = seek_next_token

    num_pairs = 0
    skip_byte(r)
    w.write(b'{')

    while True:
        # whitespace/comments may precede the next item/comma
        seek_next(r)
        b = peek_byte(r)
        if b < 0:
            raise ValueError('unexpected end of input data, before "}"')

        comma = b == 44  # ord(',')

        if comma:
            skip_byte(r)
            # whitespace/comments may follow the comma
            seek_next(r)
            b = peek_byte(r)
            if b < 0:
                raise ValueError('unexpected end of input data, before "}"')

        if b == 125:  # ord('}')
            skip_byte(r)
            w.write(b'}')
            return

        if num_pairs > 0:
            if not comma:
                raise ValueError('missing a comma between key-value pairs')
            w.write(b',')

        demand_string(w, r)
        # whitespace/comments may follow the key
        seek_next(r)
        demand(r, b':')
        w.write(b':')
        # whitespace/comments may follow the colon
        seek_next(r)
        b = peek_byte(r)
        if b < 0:
            raise ValueError('unexpected end of input data, before "}"')
        # even a NUL byte must be dispatched, so it's reported as invalid
        handlers[b](w, r)
        num_pairs += 1


def handle_positive(w, r) -> None:
    'Copy a number starting with a plus sign, dropping the sign.'

    # do nothing with the leading plus sign: strictly-speaking, JSON numbers
    # can't start with a positive sign, and this tool's output is supposed
    # to be `JSON-0` (minimized) anyway
    skip_byte(r)

    if peek_byte(r) == 46:  # ord('.')
        handle_dot(w, r)
    else:
        handle_number(w, r)


def handle_single_quoted_string(w, r) -> None:
    'Copy a single-quoted string as a double-quoted one.'
    skip_byte(r)
    w.write(b'"')
    handle_inner_string(w, r, 39)  # ord('\'')
    w.write(b'"')


def demand_string(w, r) -> None:
    'Copy a quoted string, raising a ValueError if something else is next.'

    quote = peek_byte(r)
    if quote < 0:
        msg = 'unexpected end of input, instead of a string quote'
        raise ValueError(msg)

    if quote == 34:  # ord('"')
        handle_double_quoted_string(w, r)
        return

    if quote == 39:  # ord('\'')
        handle_single_quoted_string(w, r)
        return

    if 32 < quote <= 126:  # ord(' '), ord('~')
        msg = f'expected ", or even \', but got {chr(quote)} instead'
    else:
        msg = f'expected ", or even \', but got byte {quote} instead'
    raise ValueError(msg)


def handle_inner_string(w, r, quote: int) -> None:
    '''
    Copy the inside of a string up to (and consuming) its closing `quote`
    byte, which isn't emitted. The opening quote must be already consumed.

    Escapes are turned into valid JSON ones: `\\xNN` becomes `\\u00nn`,
    `\\v` becomes `\\u000b`, and backslashes before any other non-JSON
    escape letter are dropped. Raw double-quotes (possible in single-quoted
    strings) and raw control bytes are escaped, as JSON requires.

    Raises ValueError when input ends early, or on bad hexadecimal digits.
    '''

    esc = False
    bad_hex_msg = 'invalid hexadecimal symbols'
    early_end_msg = 'input data ended while still in quoted string'

    def is_hex(x: int) -> bool:
        # 48 is ord('0'), 57 is ord('9'), 97 is ord('a'), 102 is ord('f')
        return 48 <= x <= 57 or 97 <= x <= 102

    def lower(x: int) -> int:
        # 65 is ord('A'), 90 is ord('Z')
        return x + 32 if 65 <= x <= 90 else x

    def emit_plain(x: int) -> None:
        # emit a literal string byte, escaping it when JSON demands so
        if x == 34:  # ord('"')
            w.write(b'\\"')
        elif x < 32:
            w.write(b'\\u%04x' % x)
        else:
            w.write(bytes((x,)))

    while True:
        chunk = r.peek(1)
        if len(chunk) < 1:
            raise ValueError(early_end_msg)
        b = chunk[0]

        if esc:
            esc = False

            if b == 120:  # ord('x')
                skip_byte(r)
                chunk = read(r, 2)
                if len(chunk) != 2:
                    raise ValueError(early_end_msg)
                digits = bytes(lower(x) for x in chunk)
                if not all(is_hex(x) for x in digits):
                    raise ValueError(bad_hex_msg)
                w.write(b'\\u00')
                w.write(digits)
                continue

            if b == 117:  # ord('u')
                skip_byte(r)
                chunk = read(r, 4)
                if len(chunk) != 4:
                    raise ValueError(early_end_msg)
                if not all(is_hex(lower(x)) for x in chunk):
                    raise ValueError(bad_hex_msg)
                # the escape prefix was skipped, so it must be put back
                w.write(b'\\u')
                w.write(chunk)
                continue

            if b == 118:  # ord('v')
                # JSON has no \v escape, so use the equivalent \u one
                skip_byte(r)
                w.write(b'\\u000b')
                continue

            # these numbers stand for 'b', 'f', 'n', 'r', 't', '"', and '\\'
            if b in (98, 102, 110, 114, 116, 34, 92):
                w.write(b'\\')
                w.write(read(r, 1))
                continue

            # invalid escape: ignore the backslash, keeping what follows it
            emit_plain(read(r, 1)[0])
            continue

        if b == 92:  # ord('\\')
            esc = True
            skip_byte(r)
            continue

        if b == quote:
            skip_byte(r)
            return

        # emit normal string-byte
        emit_plain(read(r, 1)[0])


def handle_true(w, r) -> None:
    'Copy the keyword `true`.'
    demand(r, b'true')
    w.write(b'true')


# setup byte-handling lookup tuple
byte2handler = [handle_invalid] * 256
for digit in b'0123456789':
    byte2handler[digit] = handle_number
byte2handler[ord('+')] = handle_positive
byte2handler[ord('-')] = handle_negative
byte2handler[ord('.')] = handle_dot
byte2handler[ord('"')] = handle_double_quoted_string
byte2handler[ord('\'')] = handle_single_quoted_string
byte2handler[ord('f')] = handle_false
byte2handler[ord('n')] = handle_null
byte2handler[ord('t')] = handle_true
byte2handler[ord('[')] = handle_array
byte2handler[ord('{')] = handle_object

# handlers is the immutable byte-driven func-dispatch table
handlers = tuple(byte2handler)


def copy_digits(w, r) -> int:
    'Returns how many digits were copied/handled.'

    copied = 0
    while True:
        chunk = r.peek(64)
        if len(chunk) == 0:
            return copied

        i = find_digits_end_index(chunk)
        if i >= 0:
            w.write(read(r, i))
            copied += i
            return copied

        # the whole chunk is digits, so more of them may follow it
        w.write(chunk)
        read(r, len(chunk))
        copied += len(chunk)


def seek_next_token(r) -> None:
    'Skip an arbitrarily-long mix of whitespace and comments.'

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            # input is over, and this func doesn't consider that an error
            return

        for i, b in enumerate(chunk):
            # skip tab, line-feed, vertical-tab, carriage-return, or space
            if b in (9, 10, 11, 13, 32):
                continue

            if b == 47:  # ord('/')
                read(r, i)
                demand_comment(r)
                # more whitespace/comments may follow the comment
                break

            # found start of next token
            read(r, i)
            return
        else:
            # the whole chunk was whitespace
            read(r, len(chunk))


def skip_line(r) -> None:
    'Skip all bytes up to (and including) the next line-feed, if any.'

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            return

        i = chunk.find(b'\n')
        if i >= 0:
            read(r, i + 1)
            return

        read(r, len(chunk))


def skip_general_comment(r) -> None:
    '''
    Skip all bytes up to (and including) the next `*/`. The comment opener
    must be already fully consumed. Raises ValueError if input ends first.
    '''

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            raise ValueError('input data ended before an expected */')

        i = chunk.find(b'*')
        if i < 0:
            # no */ in this chunk, so skip it and try with the next one
            read(r, len(chunk))
            continue

        # skip right past the * just found, then check if a / follows it
        read(r, i + 1)
        if peek_byte(r) == 47:  # ord('/')
            # got */, the end of this comment
            skip_byte(r)
            return


def find_digits_end_index(chunk: bytes) -> int:
    'Return the index of the first non-digit byte, or -1 if there is none.'

    i = 0
    for b in chunk:
        if 48 <= b <= 57:
            i += 1
        else:
            return i

    # all bytes (if any) were digits, so no end was found
    return -1


def demand(r, what: bytes) -> None:
    'Consume exactly the bytes given, raising a ValueError on mismatches.'

    lead = read(r, len(what))
    if not lead.startswith(what):
        # the offending bytes may not be valid UTF-8, and failing to decode
        # them would hide the actual error
        lead = str(lead, encoding='utf-8', errors='replace')
        what = str(what, encoding='utf-8', errors='replace')
        raise ValueError(f'expected {what}, but got {lead} instead')


def demand_comment(r) -> None:
    'Skip a whole `//` or `/* */` comment, raising a ValueError otherwise.'

    demand(r, b'/')
    b = peek_byte(r)
    if b < 0:
        raise ValueError('unexpected end of input data')

    if b == 47:  # ord('/')
        # handle single-line comment
        skip_line(r)
        return

    if b == 42:  # ord('*')
        # handle (potentially) multi-line comment: the opening * is skipped
        # first, so it can't be mistaken for part of a closing */
        skip_byte(r)
        skip_general_comment(r)
        return

    raise ValueError('expected * or another /, after a /')


def json0(w, src, end) -> None:
    '''
    Convert a single JSON/pseudo-JSON value read from the binary source
    `src`, writing the minimized result to `w`, then calling `end(w)`.

    Raises ValueError on empty(ish) input, on invalid input, or when
    anything besides whitespace/comments follows the value.
    '''

    r = BufferedReader(src)

    # skip leading UTF-8 BOM (byte-order mark): peeking can give more bytes
    # than asked for, so checking for equality wouldn't work
    if r.peek(3).startswith(b'\xef\xbb\xbf'):
        read(r, 3)

    # skip leading whitespace/comments
    seek_next_token(r)

    # emit a single output line, ending with a line-feed
    b = peek_byte(r)
    if b >= 0:
        handlers[b](w, r)
    else:
        # treat empty(ish) input as invalid JSON
        raise ValueError('can\'t turn empty(ish) input into JSON')
    end(w)

    # check against trailing non-whitespace/non-comment bytes
    seek_next_token(r)
    if len(r.peek(1)) > 0:
        raise ValueError('unexpected trailing bytes in JSON data')


def seems_url(s: str) -> bool:
    'Check if the string given starts with a supported URI protocol.'
    protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return s.startswith(protocols)


def handle_json(w, r) -> None:
    'Convert a single JSON value into a line of minimized JSON.'
    json0(w, r, lambda w: w.write(b'\n'))


def handle_json_lines(w, r) -> None:
    '''
    Convert JSON Lines input into a single-line JSON array, with an item
    for each input line which isn't empty(ish) or a `//` comment.
    '''

    global pos, linenum

    items = 0
    linenum = 0
    w.write(b'[')

    while True:
        line = r.readline()
        if not line:
            # only a truly empty result means the input is over: blank
            # lines still have their line-feed at this point
            break

        linenum += 1

        stripped = line.strip()
        if not stripped or stripped.startswith(b'//'):
            continue

        # account for the leading whitespace being ignored
        pos = 1 + len(line) - len(line.lstrip())

        items += 1
        if items > 1:
            w.write(b',')

        # the line-feed is kept out, so lines aren't counted twice
        json0(w, BytesIO(stripped), lambda w: None)

    w.write(b']\n')


def main() -> None:
    'Run the command-line app, using its arguments and standard I/O.'

    # handle standard help cmd-line options, quitting right away in that case
    if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
        print(info.strip(), file=stderr)
        exit(0)

    start_args = 1
    handle_input = handle_json
    if len(argv) > 1 and argv[1] in ('-jl', '--jl', '-jsonl', '--jsonl'):
        start_args = 2
        handle_input = handle_json_lines

    if len(argv) - 1 > start_args:
        print('\x1b[31mmultiple inputs not allowed\x1b[0m', file=stderr)
        exit(1)

    w = stdout.buffer
    name = argv[start_args] if len(argv) > start_args else '-'

    try:
        if name == '-':
            handle_input(w, stdin.buffer)
        elif seems_url(name):
            from urllib.request import urlopen
            with urlopen(name) as inp:
                handle_input(w, inp)
        else:
            with open(name, mode='rb') as inp:
                handle_input(w, inp)
    except BrokenPipeError:
        # quit quietly, instead of showing a confusing error message
        stderr.close()
    except KeyboardInterrupt:
        exit(2)
    except Exception as e:
        stdout.flush()
        print(f'\x1b[31mline {linenum}, pos {pos} : {e}\x1b[0m', file=stderr)
        exit(1)


# only run the app when used as a script, so importing this has no effects
if __name__ == '__main__':
    main()