#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2024 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


# j0 [filepath/URI...]
#
# Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.
#
# Besides minimizing bytes, this tool also adapts almost-JSON input into
# valid JSON, since it ignores comments and trailing commas, neither of
# which are supported in JSON, but which are still commonly used.
#
# Output is always a single line, which ends with a line-feed.


from io import BufferedReader
from re import compile as compile_re
from sys import argv, exit, stderr, stdin, stdout
from urllib.request import urlopen


# info is the help message shown when asked to
info = '''
j0 [filepath/URI...]

Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.

Besides minimizing bytes, this tool also adapts almost-JSON input into
valid JSON, since it ignores comments and trailing commas, neither of
which are supported in JSON, but which are still commonly used.

Output is always a single line, which ends with a line-feed.
'''.strip()


# string_special_re finds the bytes which need attention inside quoted
# strings: both kinds of quotes, and the backslash which starts escapes
string_special_re = compile_re(b'["\'\\\\]')


def peek_byte(r) -> int:
    '''Return the next byte from reader r without consuming it, or -1.'''

    chunk = r.peek(64)
    if len(chunk) > 0:
        return chunk[0]
    return -1


def handle_array(w, r) -> None:
    '''
    Copy a whole array, minimized, from reader r to writer w: commas are
    re-emitted only between values, so trailing commas are dropped.
    Raises ValueError on early end of input, or on missing commas.
    '''

    # looking up global vars is slower in older versions of python
    seek_next = seek_next_token
    handle_next = handlers
    peek_byte_l = peek_byte

    n = 0
    r.read(1)
    w.write(b'[')

    while True:
        # whitespace/comments may precede the next item/comma
        seek_next(r)
        b = peek_byte_l(r)
        if b < 0:
            raise ValueError('unexpected end of input data, before "]"')

        # 44 is ord(',')
        comma = b == 44

        if comma:
            r.read(1)
            # whitespace/comments may follow the comma
            seek_next(r)
            b = peek_byte_l(r)
            if b < 0:
                raise ValueError('unexpected end of input data, before "]"')

        # 93 is ord(']')
        if b == 93:
            r.read(1)
            w.write(b']')
            return

        if n > 0:
            if not comma:
                raise ValueError('missing a comma between array values')
            w.write(b',')

        # b can't be negative here, so every byte-value gets a handler,
        # including the invalid ones, which raise a descriptive error
        handle_next[b](w, r)
        n += 1


def handle_double_quoted_string(w, r) -> None:
    '''Copy a double-quoted string from reader r to writer w.'''

    r.read(1)
    w.write(b'"')
    handle_inner_string(w, r, b'"')
    w.write(b'"')


def handle_dot(w, r) -> None:
    '''
    Handle numbers starting with a decimal dot, which JSON doesn't allow,
    by emitting a 0 before the dot. Raises ValueError when no digits follow.
    '''

    r.read(1)
    # precede the leading decimal dot with a 0
    w.write(b'0.')

    # handle decimals, which in this case aren't optional, as a leading
    # dot is what led to this point
    if copy_digits(w, r) < 1:
        raise ValueError('expected numeric digits, but found none')

    handle_exponent(w, r)


def handle_exponent(w, r) -> None:
    '''
    Copy the optional exponent part of a number, if there is one: a plus
    sign in the exponent is redundant, so it's never emitted. Raises
    ValueError when no digits follow the exponent letter/sign.
    '''

    # 69 is ord('E'), 101 is ord('e')
    if peek_byte(r) not in (69, 101):
        return

    # keep the exponent letter exactly as given
    w.write(r.read(1))

    sign = peek_byte(r)
    # 45 is ord('-'), 43 is ord('+')
    if sign == 45:
        r.read(1)
        w.write(b'-')
    elif sign == 43:
        r.read(1)

    if copy_digits(w, r) < 1:
        raise ValueError('expected numeric digits, but found none')


def handle_false(w, r) -> None:
    '''Demand the keyword false from reader r, and emit it to writer w.'''

    demand(r, b'false')
    w.write(b'false')


def handle_invalid(w, r) -> None:
    '''Always raise a ValueError describing the unexpected next byte.'''

    b = peek_byte(r)
    if b < 0:
        raise ValueError('unexpected end of input data')
    # 32 is ord(' '), 126 is ord('~')
    if 32 < b <= 126:
        msg = f'unexpected symbol {chr(b)}'
    else:
        msg = f'unexpected byte-value {b}'
    raise ValueError(msg)


def handle_negative(w, r) -> None:
    '''Copy a negative number, which may start with a dot after its sign.'''

    r.read(1)
    w.write(b'-')

    # 46 is ord('.')
    if peek_byte(r) == 46:
        handle_dot(w, r)
    else:
        handle_number(w, r)


def handle_null(w, r) -> None:
    '''Demand the keyword null from reader r, and emit it to writer w.'''

    demand(r, b'null')
    w.write(b'null')


def handle_number(w, r) -> None:
    '''
    Copy an unsigned number starting with a digit: a trailing decimal dot
    is followed by a 0, since JSON requires digits after a dot. Raises
    ValueError when expected digits are missing.
    '''

    # handle integer part
    if copy_digits(w, r) < 1:
        raise ValueError('expected numeric digits, but found none')

    # handle optional decimals
    b = peek_byte(r)
    # 46 is ord('.')
    if b == 46:
        r.read(1)
        w.write(b'.')
        if copy_digits(w, r) < 1:
            # follow a trailing decimal dot with a 0
            w.write(b'0')

    # handle optional exponent
    handle_exponent(w, r)


def handle_object(w, r) -> None:
    '''
    Copy a whole object, minimized, from reader r to writer w: commas are
    re-emitted only between key-value pairs, so trailing commas are
    dropped. Raises ValueError on early end of input, on missing commas,
    on non-string keys, or on missing colons.
    '''

    # looking up global vars is slower in older versions of python
    seek_next = seek_next_token
    demand_l = demand
    handle_key = demand_string
    handle_next = handlers
    peek_byte_l = peek_byte

    num_pairs = 0
    r.read(1)
    w.write(b'{')

    while True:
        # whitespace/comments may precede the next item/comma
        seek_next(r)
        b = peek_byte_l(r)
        if b < 0:
            raise ValueError('unexpected end of input data, before "}"')

        # 44 is ord(',')
        comma = b == 44

        if comma:
            r.read(1)
            # whitespace/comments may follow the comma
            seek_next(r)
            b = peek_byte_l(r)
            if b < 0:
                raise ValueError('unexpected end of input data, before "}"')

        # 125 is ord('}')
        if b == 125:
            r.read(1)
            w.write(b'}')
            return

        if num_pairs > 0:
            if not comma:
                raise ValueError('missing a comma between key-value pairs')
            w.write(b',')

        handle_key(w, r)
        # whitespace/comments may follow the key
        seek_next(r)
        demand_l(r, b':')
        w.write(b':')
        # whitespace/comments may follow the colon
        seek_next(r)
        b = peek_byte_l(r)
        if b < 0:
            # a key and its colon must always be followed by a value
            raise ValueError('unexpected end of input data, before "}"')
        handle_next[b](w, r)
        num_pairs += 1


def handle_positive(w, r) -> None:
    '''Copy a number which starts with a plus sign, ignoring that sign.'''

    # do nothing with the leading plus sign: strictly-speaking, JSON numbers
    # can't start with a positive sign, and this tool's output is supposed
    # to be `JSON-0` (minimized) anyway
    r.read(1)

    # 46 is ord('.')
    if peek_byte(r) == 46:
        handle_dot(w, r)
    else:
        handle_number(w, r)


def handle_single_quoted_string(w, r) -> None:
    '''Copy a single-quoted string as a proper double-quoted JSON string.'''

    r.read(1)
    w.write(b'"')
    handle_inner_string(w, r, b'\'')
    w.write(b'"')


def demand_string(w, r) -> None:
    '''Handle keys for func handle_object.'''

    quote = peek_byte(r)
    if quote < 0:
        msg = 'unexpected end of input, instead of a string quote'
        raise ValueError(msg)

    # 34 is ord('"')
    if quote == 34:
        handle_double_quoted_string(w, r)
        return

    # 39 is ord('\'')
    if quote == 39:
        handle_single_quoted_string(w, r)
        return

    # 32 is ord(' '), 126 is ord('~')
    if 32 < quote <= 126:
        msg = f'expected ", or even \', but got {chr(quote)} instead'
    else:
        msg = f'expected ", or even \', but got byte {quote} instead'
    raise ValueError(msg)


def copy_escaped_byte(w, b: int) -> None:
    '''Emit the escape sequence for byte b, which followed a backslash.'''

    # 39 is ord('\'')
    if b == 39:
        # \' isn't a valid JSON escape, so emit a plain single quote
        w.write(b'\'')
    else:
        # 92 is ord('\\')
        w.write(bytes((92, b)))


def copy_string_chunk(w, chunk: bytes, quote: int, escaped: bool):
    '''
    Help func handle_inner_string do its job: copy string bytes from the
    non-empty chunk given to writer w, stopping at the closing quote.

    Param quote is the byte-value of the closing quote to look for, while
    param escaped tells if the previous chunk ended with a lone backslash.

    Returns a pair with the index of the closing quote (negative if the
    chunk has none), and whether the chunk ended with a lone backslash.
    '''

    search = string_special_re.search
    n = len(chunk)
    # start is where the pending run of plain bytes begins
    start = 0

    if escaped:
        copy_escaped_byte(w, chunk[0])
        start = 1
    # pos is where to (re)start searching for special bytes
    pos = start

    while True:
        m = search(chunk, pos)
        if m is None:
            w.write(chunk[start:])
            return -1, False

        i = m.start()
        b = chunk[i]

        if b == quote:
            w.write(chunk[start:i])
            return i, False

        # 92 is ord('\\')
        if b == 92:
            w.write(chunk[start:i])
            if i + 1 >= n:
                # the escaped byte is in the next chunk
                return -1, True
            copy_escaped_byte(w, chunk[i + 1])
            start = pos = i + 2
            continue

        # 34 is ord('"')
        if b == 34:
            # unescaped double quotes inside single-quoted strings must
            # be escaped to keep the output valid JSON
            w.write(chunk[start:i])
            w.write(b'\\"')
            start = pos = i + 1
            continue

        # single quotes inside double-quoted strings are just plain bytes
        pos = i + 1


def handle_inner_string_slow(w, r, quote: bytes) -> None:
    '''
    Experimental byte-by-byte alternative to func handle_inner_string,
    with the same contract: the closing quote is read, but never emitted.
    '''

    esc = False
    q = quote[0]

    while True:
        chunk = r.read(1)
        if len(chunk) < 1:
            raise ValueError('input data ended while still in quoted string')
        b = chunk[0]

        if esc:
            esc = False
            copy_escaped_byte(w, b)
            continue
        # 92 is ord('\\')
        if b == 92:
            esc = True
            continue
        if b == q:
            return
        # 34 is ord('"')
        if b == 34:
            # can only happen inside single-quoted strings
            w.write(b'\\"')
            continue
        w.write(chunk)


def handle_inner_string(w, r, quote: bytes) -> None:
    '''
    Copy the inside of a quoted string from reader r to writer w, up to
    its closing quote, which is read/discarded, but never emitted, since
    it may not be the strictly-JSON double quote.

    Escape sequences are tracked even across chunk boundaries, so quotes
    following escaped backslashes correctly end strings. Raises ValueError
    if the input ends before the closing quote.
    '''

    q = quote[0]
    escaped = False

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            raise ValueError('input data ended while still in quoted string')

        i, escaped = copy_string_chunk(w, chunk, q, escaped)
        if i >= 0:
            # skip all string bytes handled, along with the closing quote
            r.read(i + 1)
            return
        r.read(len(chunk))


def handle_true(w, r) -> None:
    '''Demand the keyword true from reader r, and emit it to writer w.'''

    demand(r, b'true')
    w.write(b'true')


# setup byte-handling lookup tuple
byte2handler = [handle_invalid for i in range(256)]
byte2handler[ord('0')] = handle_number
byte2handler[ord('1')] = handle_number
byte2handler[ord('2')] = handle_number
byte2handler[ord('3')] = handle_number
byte2handler[ord('4')] = handle_number
byte2handler[ord('5')] = handle_number
byte2handler[ord('6')] = handle_number
byte2handler[ord('7')] = handle_number
byte2handler[ord('8')] = handle_number
byte2handler[ord('9')] = handle_number
byte2handler[ord('+')] = handle_positive
byte2handler[ord('-')] = handle_negative
byte2handler[ord('.')] = handle_dot
byte2handler[ord('"')] = handle_double_quoted_string
byte2handler[ord('\'')] = handle_single_quoted_string
byte2handler[ord('f')] = handle_false
byte2handler[ord('n')] = handle_null
byte2handler[ord('t')] = handle_true
byte2handler[ord('[')] = handle_array
byte2handler[ord('{')] = handle_object

# handlers is the immutable byte-driven func-dispatch table
handlers = tuple(byte2handler)


def copy_digits(w, r) -> int:
    '''
    Help the number-handling funcs do their job quickly: returns
    how many digits were copied/handled, so callers can check if
    any digits were found/copied.
    '''

    copied = 0
    while True:
        chunk = r.peek(64)
        if len(chunk) == 0:
            return copied

        i = find_digits_end_index(chunk)
        if i >= 0:
            w.write(r.read(i))
            copied += i
            return copied

        # the whole chunk is digits, and more may follow
        w.write(chunk)
        r.read(len(chunk))
        copied += len(chunk)


def seek_next_token(r) -> None:
    '''Skip an arbitrarily-long mix of whitespace and comments.'''

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            # input is over, and this func doesn't consider that an error
            return

        comment = False

        for i, b in enumerate(chunk):
            # skip tab, line-feed, vertical-tab, form-feed,
            # carriage-return, or space
            if b in (9, 10, 11, 12, 13, 32):
                continue

            # 47 is ord('/')
            if b == 47:
                r.read(i)
                demand_comment(r)
                comment = True
                break

            # found start of next token
            r.read(i)
            return

        if not comment:
            # the whole chunk is whitespace, and more may follow
            r.read(len(chunk))


def skip_line(r) -> None:
    '''Help func demand_comment do its job.'''

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            return

        i = chunk.find(b'\n')
        if i >= 0:
            r.read(i + 1)
            return

        r.read(len(chunk))


def skip_general_comment(r) -> None:
    '''
    Help func demand_comment do its job: skip everything up to the next
    */, including it. Raises ValueError if the input ends before that.
    '''

    # looking up global vars is slower in older versions of python
    peek_byte_l = peek_byte

    while True:
        chunk = r.peek(1024)
        if len(chunk) == 0:
            raise ValueError('input data ended before an expected */')

        i = chunk.find(b'*')
        if i < 0:
            # no */ in this chunk, so skip it and try with the next one
            r.read(len(chunk))
            continue

        # skip right past the * just found, then check if a / follows it
        r.read(i + 1)
        # 47 is ord('/')
        if peek_byte_l(r) == 47:
            # got */, the end of this comment
            r.read(1)
            return


def find_digits_end_index(chunk: bytes) -> int:
    '''Help the digit-handling funcs do their job quickly.'''

    i = 0
    for b in chunk:
        # 48 is ord('0'), 57 is ord('9')
        if 48 <= b <= 57:
            i += 1
        else:
            return i

    # all bytes (if any) were digits, so no end was found
    return -1


def find_string_end_index(chunk: bytes, quote: bytes) -> int:
    '''
    Find the index of the first unescaped quote in the chunk given, or a
    negative value if there's none. A quote counts as escaped only when
    an odd number of backslashes comes right before it. This func can't
    know about backslashes which came before the chunk.
    '''

    # start remembers where to (re)start searching in case of fake matches
    start = 0

    while True:
        i = chunk.find(quote, start)
        if i <= 0:
            # either no end was found in this chunk, or it's right at the
            # start, at index 0: handling the latter avoids mistakenly
            # trying to check if a backslash is before it
            return i

        # find where the run of backslashes right before the quote starts
        j = i
        # 92 is ord('\\')
        while j > 0 and chunk[j - 1] == 92:
            j -= 1

        if (i - j) % 2 == 0:
            # backslashes before the quote, if any, escape each other
            return i

        # keep searching, starting right past the fake-match index
        start = i + 1


def demand(r, what: bytes) -> None:
    '''Read exactly the bytes given from reader r, or raise ValueError.'''

    lead = r.read(len(what))
    if lead != what:
        # input bytes may not be valid UTF-8, which shouldn't prevent
        # showing the intended error message
        lead = str(lead, encoding='utf-8', errors='replace')
        what = str(what, encoding='utf-8', errors='replace')
        raise ValueError(f'expected {what}, but got {lead} instead')


def demand_comment(r) -> None:
    '''
    Skip a whole comment, either a single-line one starting with //, or
    a general one between /* and */. Raises ValueError when the leading
    slash isn't followed by either a slash or an asterisk.
    '''

    demand(r, b'/')
    b = peek_byte(r)
    if b < 0:
        raise ValueError('unexpected end of input data')

    # 47 is ord('/')
    if b == 47:
        # handle single-line comment
        r.read(1)
        skip_line(r)
        return

    # 42 is ord('*')
    if b == 42:
        # handle (potentially) multi-line comment: skipping the opening
        # asterisk first prevents /*/ from counting as a whole comment
        r.read(1)
        skip_general_comment(r)
        return

    raise ValueError('expected * or another /, after a /')


def json0(w, src) -> None:
    '''
    Convert the JSON/pseudo-JSON bytes from src into a single line of
    minimal JSON, which is emitted to writer w. Raises ValueError when
    the input is empty(ish), invalid, or has trailing data.
    '''

    r = BufferedReader(src)

    # skip leading whitespace/comments
    seek_next_token(r)

    # emit a single output line, ending with a line-feed
    b = peek_byte(r)
    if b >= 0:
        handlers[b](w, r)
    else:
        # treat empty(ish) input as invalid JSON
        raise ValueError('can\'t turn empty(ish) input into JSON')
    w.write(b'\n')

    # check against trailing non-whitespace/non-comment bytes
    seek_next_token(r)
    if len(r.peek(1)) > 0:
        raise ValueError('unexpected trailing bytes in JSON data')


def seems_url(s: str) -> bool:
    '''Check if the string given starts like a supported type of URI.'''

    return s.startswith(('https://', 'http://', 'file://', 'ftp://', 'data:'))


def main() -> None:
    '''Run the cmd-line app, using up to 1 named input, or stdin.'''

    # handle standard help cmd-line options, quitting right away in that case
    if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
        print(info, file=stderr)
        exit(0)

    try:
        if len(argv) < 2:
            json0(stdout.buffer, stdin.buffer)
        elif len(argv) == 2:
            name = argv[1]
            if name == '-':
                json0(stdout.buffer, stdin.buffer)
            elif seems_url(name):
                with urlopen(name) as inp:
                    json0(stdout.buffer, inp)
            else:
                with open(name, 'rb') as inp:
                    json0(stdout.buffer, inp)
        else:
            raise ValueError('multiple inputs not allowed')
    except (BrokenPipeError, KeyboardInterrupt):
        # quit quietly, instead of showing a confusing error message
        stderr.flush()
        stderr.close()
    except Exception as e:
        print(f'\x1b[31m{e}\x1b[0m', file=stderr)
        exit(1)


if __name__ == '__main__':
    main()