File: j0.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2020-2025 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 from io import BufferedReader, BytesIO
  27 from sys import argv, exit, stderr, stdin, stdout
  28 
  29 
  30 info = '''
  31 j0 [filepath/URI...]
  32 
  33 Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.
  34 
  35 Besides minimizing bytes, this tool also adapts almost-JSON input into valid
  36 JSON, since it ignores comments and trailing commas, neither of which are
  37 supported in JSON, but which are still commonly used.
  38 
  39 It also turns single-quoted strings into proper double-quoted ones, as well
  40 as change invalid 2-digit `\\x` hexadecimal escapes into JSON's 4-digit `\\u`
  41 hexadecimal escapes. When backslashes in strings are followed by an invalid
  42 escape letter, the backslash is ignored.
  43 
  44 Output is always a single line of valid JSON, ending with a line-feed.
  45 '''
  46 
  47 # handle standard help cmd-line options, quitting right away in that case
  48 if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
  49     print(info.strip())
  50     exit(0)
  51 
  52 
  53 # note: using regexes doesn't seem to speed-up number/string-handling
  54 
  55 
  56 def read(r, size: int) -> bytes:
  57     global pos, linenum
  58 
  59     chunk = r.read(size)
  60     if not chunk:
  61         return chunk
  62 
  63     if not (10 in chunk):
  64         pos += len(chunk)
  65         return chunk
  66 
  67     for b in chunk:
  68         if b == 10:
  69             pos = 1
  70             linenum += 1
  71         else:
  72             pos += 1
  73     return chunk
  74 
  75 
  76 def skip_byte(r) -> None:
  77     global pos, linenum
  78 
  79     chunk = r.read(1)
  80     if not chunk:
  81         return
  82 
  83     if chunk[0] == 10:
  84         pos = 1
  85         linenum += 1
  86     else:
  87         pos += 1
  88 
  89 
  90 def peek_byte(r) -> int:
  91     chunk = r.peek(64)
  92     if len(chunk) > 0:
  93         return chunk[0]
  94     return -1
  95 
  96 
  97 def handle_array(w, r) -> None:
  98     seek_next = seek_next_token
  99 
 100     n = 0
 101     skip_byte(r)
 102     w.write(b'[')
 103 
 104     while True:
 105         # whitespace/comments may precede the next item/comma
 106         seek_next(r)
 107         b = peek_byte(r)
 108         if b < 0:
 109             raise ValueError('unexpected end of input data, before "]"')
 110 
 111         comma = b == 44 # ord(',')
 112 
 113         if comma:
 114             skip_byte(r)
 115             # whitespace/comments may follow the comma
 116             seek_next(r)
 117             b = peek_byte(r)
 118             if b < 0:
 119                 raise ValueError('unexpected end of input data, before "]"')
 120 
 121         if b == 93: # ord(']')
 122             skip_byte(r)
 123             w.write(b']')
 124             return
 125 
 126         if n > 0:
 127             if not comma:
 128                 raise ValueError('missing a comma between array values')
 129             w.write(b',')
 130 
 131         b = peek_byte(r)
 132         if b > 0:
 133             handlers[b](w, r)
 134             n += 1
 135 
 136 
 137 def handle_double_quoted_string(w, r) -> None:
 138     skip_byte(r)
 139     w.write(b'"')
 140     handle_inner_string(w, r, 34) # ord('"')
 141     w.write(b'"')
 142 
 143 
 144 def handle_dot(w, r) -> None:
 145     skip_byte(r)
 146     # precede the leading decimal dot with a 0
 147     w.write(b'0.')
 148 
 149     # handle decimals, which in this case aren't optional, as a leading
 150     # dot is what led to this point
 151     if copy_digits(w, r) < 1:
 152         raise ValueError('expected numeric digits, but found none')
 153 
 154 
 155 def handle_false(w, r) -> None:
 156     demand(r, b'false')
 157     w.write(b'false')
 158 
 159 
 160 def handle_invalid(w, r) -> None:
 161     b = peek_byte(r)
 162     if b < 0:
 163         raise ValueError('unexpected end of input data')
 164     # raise ValueError(f'unexpected JSON byte-value {b}')
 165     if 32 < b <= 126:
 166         msg = f'unexpected symbol {chr(b)}'
 167     else:
 168         msg = f'unexpected byte-value {b}'
 169     raise ValueError(msg)
 170 
 171 
 172 def handle_negative(w, r) -> None:
 173     skip_byte(r)
 174     w.write(b'-')
 175 
 176     if peek_byte(r) == 46: # ord('.')
 177         skip_byte(r)
 178         w.write(b'0.')
 179         if copy_digits(w, r) < 1:
 180             raise ValueError('expected numeric digits, but found none')
 181     else:
 182         handle_number(w, r)
 183 
 184 
 185 def handle_null(w, r) -> None:
 186     demand(r, b'null')
 187     w.write(b'null')
 188 
 189 
 190 def handle_number(w, r) -> None:
 191     # handle integer part
 192     if copy_digits(w, r) < 1:
 193         raise ValueError('expected numeric digits, but found none')
 194 
 195     # handle optional decimals
 196     b = peek_byte(r)
 197     if b == 46: # ord('.')
 198         skip_byte(r)
 199         w.write(b'.')
 200         if copy_digits(w, r) < 1:
 201             # follow a trailing decimal dot with a 0
 202             w.write(b'0')
 203 
 204     # handle optional exponent
 205     if b == 101 or b == 69: # ord('e'), ord('E')
 206         skip_byte(r)
 207         w.write(b'e' if b == 101 else b'E')
 208         b = peek_byte(r)
 209         if b == 43: # ord('+')
 210             skip_byte(r)
 211         elif b == 45: # ord('-')
 212             w.write(b'-')
 213             skip_byte(r)
 214         if copy_digits(w, r) < 1:
 215             raise ValueError('expected numeric digits, but found none')
 216 
 217 
 218 def handle_object(w, r) -> None:
 219     seek_next = seek_next_token
 220 
 221     num_pairs = 0
 222     skip_byte(r)
 223     w.write(b'{')
 224 
 225     while True:
 226         # whitespace/comments may precede the next item/comma
 227         seek_next(r)
 228         b = peek_byte(r)
 229         if b < 0:
 230             raise ValueError('unexpected end of input data, before "}"')
 231 
 232         comma = b == 44 # ord(',')
 233 
 234         if comma:
 235             skip_byte(r)
 236             # whitespace/comments may follow the comma
 237             seek_next(r)
 238             b = peek_byte(r)
 239             if b < 0:
 240                 raise ValueError('unexpected end of input data, before "}"')
 241 
 242         if b == 125: # ord('}')
 243             skip_byte(r)
 244             w.write(b'}')
 245             return
 246 
 247         if num_pairs > 0:
 248             if not comma:
 249                 raise ValueError('missing a comma between key-value pairs')
 250             w.write(b',')
 251 
 252         demand_string(w, r)
 253         # whitespace/comments may follow the key
 254         seek_next(r)
 255         demand(r, b':')
 256         w.write(b':')
 257         # whitespace/comments may follow the colon
 258         seek_next(r)
 259         b = peek_byte(r)
 260         if b > 0:
 261             handlers[b](w, r)
 262             num_pairs += 1
 263 
 264 
 265 def handle_positive(w, r) -> None:
 266     # do nothing with the leading plus sign, which isn't allowed in JSON
 267     skip_byte(r)
 268 
 269     if peek_byte(r) == 46: # ord('.')
 270         skip_byte(r)
 271         w.write(b'0.')
 272         if copy_digits(w, r) < 1:
 273             raise ValueError('expected numeric digits, but found none')
 274     else:
 275         handle_number(w, r)
 276 
 277 
 278 def handle_single_quoted_string(w, r) -> None:
 279     skip_byte(r)
 280     w.write(b'"')
 281     handle_inner_string(w, r, 39) # ord('\'')
 282     w.write(b'"')
 283 
 284 
 285 def demand_string(w, r) -> None:
 286     quote = peek_byte(r)
 287     if quote < 0:
 288         msg = 'unexpected end of input, instead of a string quote'
 289         raise ValueError(msg)
 290 
 291     if quote == 34: # ord('"')
 292         handle_double_quoted_string(w, r)
 293         return
 294 
 295     if quote == 39: # ord('\'')
 296         handle_single_quoted_string(w, r)
 297         return
 298 
 299     if 32 < quote <= 126: # ord(' '), ord('~')
 300         msg = f'expected ", or even \', but got "{chr(quote)}" instead'
 301     else:
 302         msg = f'expected ", or even \', but got byte "{quote}" instead'
 303     raise ValueError(msg)
 304 
 305 
 306 def handle_inner_string(w, r, quote: int) -> None:
 307     esc = False
 308     bad_hex_msg = 'invalid hexadecimal symbols'
 309     early_end_msg = 'input data ended while still in quoted string'
 310 
 311     def is_hex(x: int) -> bool:
 312         # 48 is ord('0'), 57 is ord('9'), 97 is ord('a'), 102 is ord('f')
 313         return 48 <= x <= 57 or 97 <= x <= 102
 314 
 315     def lower(x: int) -> bool:
 316         # 65 is ord('A'), 90 is ord('Z')
 317         return x + 32 if 65 <= x <= 90 else x
 318 
 319     while True:
 320         chunk = r.peek(1)
 321         if len(chunk) < 1:
 322             raise ValueError(early_end_msg)
 323         b = chunk[0]
 324 
 325         if esc:
 326             esc = False
 327 
 328             if b == 120: # ord('x')
 329                 skip_byte(r)
 330                 chunk = read(r, 2)
 331                 if len(chunk) != 2:
 332                     raise ValueError(early_end_msg)
 333                 a = lower(chunk[0])
 334                 b = lower(chunk[1])
 335                 w.write(b'\\u00')
 336                 if not (is_hex(a) and is_hex(b)):
 337                     raise ValueError(bad_hex_msg)
 338                 w.write(a)
 339                 w.write(b)
 340                 continue
 341 
 342             if b == 117: # ord('u')
 343                 skip_byte(r)
 344                 chunk = read(r, 4)
 345                 if len(chunk) != 4:
 346                     raise ValueError(early_end_msg)
 347                 a = lower(chunk[0])
 348                 b = lower(chunk[1])
 349                 c = lower(chunk[2])
 350                 d = lower(chunk[3])
 351                 w.write(b'\\u')
 352                 if not (is_hex(a) and is_hex(b) and is_hex(c) and is_hex(d)):
 353                     raise ValueError(bad_hex_msg)
 354                 w.write(chunk)
 355                 continue
 356 
 357             # numbers for '"', '\\', 'n', 't', 'r', 'b', and 'f'
 358             if b in (34, 92, 110, 116, 114, 98, 102):
 359                 w.write(b'\\')
 360 
 361             w.write(read(r, 1))
 362             continue
 363 
 364         if b == 92: # ord('\\')
 365             esc = True
 366             skip_byte(r)
 367             continue
 368 
 369         if b == quote:
 370             skip_byte(r)
 371             return
 372 
 373         # emit normal string-byte
 374         w.write(read(r, 1))
 375 
 376 
 377 def handle_true(w, r) -> None:
 378     demand(r, b'true')
 379     w.write(b'true')
 380 
 381 
 382 # setup byte-handling lookup tuple
 383 bh = [handle_invalid for i in range(256)]
 384 bh[ord('0')] = handle_number
 385 bh[ord('1')] = handle_number
 386 bh[ord('2')] = handle_number
 387 bh[ord('3')] = handle_number
 388 bh[ord('4')] = handle_number
 389 bh[ord('5')] = handle_number
 390 bh[ord('6')] = handle_number
 391 bh[ord('7')] = handle_number
 392 bh[ord('8')] = handle_number
 393 bh[ord('9')] = handle_number
 394 bh[ord('+')] = handle_positive
 395 bh[ord('-')] = handle_negative
 396 bh[ord('.')] = handle_dot
 397 bh[ord('"')] = handle_double_quoted_string
 398 bh[ord('\'')] = handle_single_quoted_string
 399 bh[ord('f')] = handle_false
 400 bh[ord('n')] = handle_null
 401 bh[ord('t')] = handle_true
 402 bh[ord('[')] = handle_array
 403 bh[ord('{')] = handle_object
 404 
 405 # handlers is the immutable byte-driven func-dispatch table
 406 handlers = tuple(bh)
 407 
 408 
 409 def copy_digits(w, r) -> int:
 410     'Returns how many digits were copied/handled.'
 411 
 412     copied = 0
 413     while True:
 414         chunk = r.peek(64)
 415         if len(chunk) == 0:
 416             return copied
 417 
 418         i = find_digits_end_index(chunk)
 419         if i >= 0:
 420             w.write(read(r, i))
 421             copied += i
 422             return copied
 423         else:
 424             w.write(chunk)
 425             read(r, len(chunk))
 426             copied += len(chunk)
 427 
 428 
 429 def seek_next_token(r) -> None:
 430     'Skip an arbitrarily-long mix of whitespace and comments.'
 431 
 432     while True:
 433         chunk = r.peek(1024)
 434         if len(chunk) == 0:
 435             # input is over, and this func doesn't consider that an error
 436             return
 437 
 438         comment = False
 439 
 440         for i, b in enumerate(chunk):
 441             # skip space, tab, line-feed, carriage-return, or form-feed
 442             if b in (9, 10, 11, 13, 32):
 443                 continue
 444 
 445             if b == 47: # ord('/')
 446                 read(r, i)
 447                 demand_comment(r)
 448                 comment = True
 449                 break
 450 
 451             # found start of next token
 452             read(r, i)
 453             return
 454 
 455         if not comment:
 456             read(r, len(chunk))
 457 
 458 
 459 def skip_line(r) -> None:
 460     while True:
 461         chunk = r.peek(1024)
 462         if len(chunk) == 0:
 463             return
 464 
 465         i = chunk.find(b'\n')
 466         if i >= 0:
 467             read(r, i + 1)
 468             return
 469 
 470         read(r, len(chunk))
 471 
 472 
 473 def skip_general_comment(r) -> None:
 474     while True:
 475         chunk = r.peek(1024)
 476         if len(chunk) == 0:
 477             raise ValueError(f'input data ended before an expected */')
 478 
 479         i = chunk.find(b'*')
 480         if i < 0:
 481             # no */ in this chunk, so skip it and try with the next one
 482             read(r, len(chunk))
 483             continue
 484 
 485         # skip right past the * just found, then check if a / follows it
 486         read(r, i + 1)
 487         if peek_byte(r) == 47: # ord('/')
 488             # got */, the end of this comment
 489             skip_byte(r)
 490             return
 491 
 492 
 493 def find_digits_end_index(chunk: bytes) -> int:
 494     i = 0
 495     for b in chunk:
 496         if 48 <= b <= 57:
 497             i += 1
 498         else:
 499             return i
 500 
 501     # all bytes (if any) were digits, so no end was found
 502     return -1
 503 
 504 
 505 def demand(r, what: bytes) -> None:
 506     lead = read(r, len(what))
 507     if not lead.startswith(what):
 508         lead = str(lead, encoding='utf-8')
 509         what = str(what, encoding='utf-8')
 510         raise ValueError(f'expected {what}, but got {lead} instead')
 511 
 512 
 513 def demand_comment(r) -> None:
 514     demand(r, b'/')
 515     b = peek_byte(r)
 516     if b < 0:
 517         raise ValueError('unexpected end of input data')
 518 
 519     if b == 47: # ord('/')
 520         # handle single-line comment
 521         skip_line(r)
 522         return
 523 
 524     if b == 42: # ord('*')
 525         # handle (potentially) multi-line comment
 526         skip_general_comment(r)
 527         return
 528 
 529     raise ValueError('expected * or another /, after a /')
 530 
 531 
 532 def json0(w, src, end) -> None:
 533     r = BufferedReader(src)
 534 
 535     # skip leading UTF-8 BOM (byte-order mark)
 536     if r.peek(3) == b'\xef\xbb\xbf':
 537         read(r, 3)
 538 
 539     # skip leading whitespace/comments
 540     seek_next_token(r)
 541 
 542     # emit a single output line, ending with a line-feed
 543     b = peek_byte(r)
 544     if b >= 0:
 545         handlers[b](w, r)
 546     else:
 547         # w.write(b'null')
 548         # treat empty(ish) input as invalid JSON
 549         raise ValueError('can\'t turn empty(ish) input into JSON')
 550 
 551     # deliberately run post-processing before checking for trailing-data
 552     # errors: for example, if post-proc func emits new line, errors will
 553     # show up on their separate line, which is nicer
 554     end(w)
 555 
 556     # ignore trailing whitespace/comment bytes, if present
 557     seek_next_token(r)
 558 
 559     # ignore trailing semicolon, if present
 560     b = peek_byte(r)
 561     if b == 59: # ord(';')
 562         read(r, 1)
 563         # ignore trailing whitespace/comment bytes, if present
 564         seek_next_token(r)
 565 
 566     if len(r.peek(1)) > 0:
 567         raise ValueError('unexpected trailing bytes in JSON data')
 568 
 569 
 570 def seems_url(s: str) -> bool:
 571     protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
 572     return any(s.startswith(p) for p in protocols)
 573 
 574 
 575 def handle_json(w, r) -> None:
 576     def end(w) -> None:
 577         w.write(b'\n')
 578         w.flush()
 579     json0(w, r, end)
 580 
 581 
 582 def handle_json_lines(w, r) -> None:
 583     global pos, linenum
 584 
 585     items = 0
 586     linenum = 0
 587     w.write(b'[')
 588 
 589     while True:
 590         line = r.readline().lstrip()
 591         if not line:
 592             break
 593 
 594         pos = 1
 595         linenum += 1
 596 
 597         stripped = line.strip()
 598         if not stripped or stripped.startswith(b'//'):
 599             continue
 600 
 601         items += 1
 602         if items > 1:
 603             w.write(b',')
 604 
 605         json0(w, BytesIO(line), lambda w: w.flush())
 606 
 607     w.write(b']\n')
 608 
 609 
 610 start_args = 1
 611 handle_input = handle_json
 612 if len(argv) > 1 and argv[1] in ('-jl', '--jl', '-jsonl', '--jsonl'):
 613     start_args = 2
 614     handle_input = handle_json_lines
 615 
 616 if len(argv) - 1 > start_args:
 617     print(f'\x1b[31mmultiple inputs not allowed\x1b[0m', file=stderr)
 618     exit(1)
 619 
 620 w = stdout.buffer
 621 name = argv[start_args] if len(argv) > start_args else '-'
 622 
 623 # values keeping track of the input-position, shown in case of errors
 624 pos = 1
 625 linenum = 1
 626 
 627 try:
 628     if name == '-':
 629         handle_input(w, stdin.buffer)
 630     elif seems_url(name):
 631         from urllib.request import urlopen
 632         with urlopen(name) as inp:
 633             handle_input(w, inp)
 634     else:
 635         with open(name, mode='rb') as inp:
 636             handle_input(w, inp)
 637 except BrokenPipeError:
 638     # quit quietly, instead of showing a confusing error message
 639     stderr.close()
 640     exit(0)
 641 except KeyboardInterrupt:
 642     exit(2)
 643 except Exception as e:
 644     stdout.write('\n')
 645     print(f'\x1b[31mline {linenum}, pos {pos} : {e}\x1b[0m', file=stderr)
 646     exit(1)