File: json0.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2025 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 from io import BufferedReader, BytesIO
  27 from sys import argv, exit, stderr, stdin, stdout
  28 
  29 
  30 info = '''
  31 json0 [filepath/URI...]
  32 
  33 JSON-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.
  34 
  35 Besides minimizing bytes, this tool also adapts almost-JSON input into valid
  36 JSON, since it ignores comments and trailing commas, neither of which are
  37 supported in JSON, but which are still commonly used.
  38 
  39 It also turns single-quoted strings into proper double-quoted ones, as well
  40 as change invalid 2-digit `\\x` hexadecimal escapes into JSON's 4-digit `\\u`
  41 hexadecimal escapes. When backslashes in strings are followed by an invalid
  42 escape letter, the backslash is ignored.
  43 
  44 Output is always a single line of valid JSON, ending with a line-feed.
  45 '''
  46 
  47 # handle standard help cmd-line options, quitting right away in that case
  48 if len(argv) > 1 and argv[1] in ('-h', '--h', '-help', '--help'):
  49     print(info.strip())
  50     exit(0)
  51 
  52 
  53 # note: using regexes doesn't seem to speed-up number/string-handling
  54 
  55 
  56 def read(r, size: int) -> bytes:
  57     global pos, linenum
  58 
  59     chunk = r.read(size)
  60     if not chunk:
  61         return chunk
  62 
  63     if not (10 in chunk):
  64         pos += len(chunk)
  65         return chunk
  66 
  67     for b in chunk:
  68         if b == 10:
  69             pos = 1
  70             linenum += 1
  71         else:
  72             pos += 1
  73     return chunk
  74 
  75 
  76 def skip_byte(r) -> None:
  77     global pos, linenum
  78 
  79     chunk = r.read(1)
  80     if not chunk:
  81         return
  82 
  83     if chunk[0] == 10:
  84         pos = 1
  85         linenum += 1
  86     else:
  87         pos += 1
  88 
  89 
  90 def peek_byte(r) -> int:
  91     chunk = r.peek(64)
  92     if len(chunk) > 0:
  93         return chunk[0]
  94     return -1
  95 
  96 
  97 def handle_array(w, r) -> None:
  98     seek_next = seek_next_token
  99 
 100     n = 0
 101     lead = peek_byte(r)
 102     end = 0
 103     if lead < 0:
 104         raise ValueError('unexpected end of input data, before "]"')
 105     if lead == 91: # ord('[')
 106         end = 93 # ord(']')
 107     elif lead == 40: # ord('(')
 108         end = 41 # ord(')')
 109     else:
 110         raise ValueError('expected "[" or "("')
 111     skip_byte(r)
 112     w.write(b'[')
 113 
 114     while True:
 115         # whitespace/comments may precede the next item/comma
 116         seek_next(r)
 117         b = peek_byte(r)
 118         if b < 0:
 119             raise ValueError('unexpected end of input data, before "]"')
 120 
 121         comma = b == 44 # ord(',')
 122 
 123         if comma:
 124             skip_byte(r)
 125             # whitespace/comments may follow the comma
 126             seek_next(r)
 127             b = peek_byte(r)
 128             if b < 0:
 129                 raise ValueError('unexpected end of input data, before "]"')
 130 
 131         if b == end:
 132             skip_byte(r)
 133             w.write(b']')
 134             return
 135 
 136         if n > 0:
 137             if not comma:
 138                 raise ValueError('missing a comma between array values')
 139             w.write(b',')
 140 
 141         b = peek_byte(r)
 142         if b > 0:
 143             handlers[b](w, r)
 144             n += 1
 145 
 146 
 147 def handle_double_quoted_string(w, r) -> None:
 148     skip_byte(r)
 149     w.write(b'"')
 150     handle_inner_string(w, r, 34) # ord('"')
 151     w.write(b'"')
 152 
 153 
 154 def handle_dot(w, r) -> None:
 155     skip_byte(r)
 156     # precede the leading decimal dot with a 0
 157     w.write(b'0.')
 158 
 159     # handle decimals, which in this case aren't optional, as a leading
 160     # dot is what led to this point
 161     if copy_digits(w, r) < 1:
 162         raise ValueError('expected numeric digits, but found none')
 163 
 164 
 165 def handle_false(w, r) -> None:
 166     demand(r, b'false')
 167     w.write(b'false')
 168 
 169 
 170 def handle_False(w, r) -> None:
 171     demand(r, b'False')
 172     w.write(b'false')
 173 
 174 
 175 def handle_invalid(w, r) -> None:
 176     b = peek_byte(r)
 177     if b < 0:
 178         raise ValueError('unexpected end of input data')
 179     # raise ValueError(f'unexpected JSON byte-value {b}')
 180     if 32 < b <= 126:
 181         msg = f'unexpected symbol {chr(b)}'
 182     else:
 183         msg = f'unexpected byte-value {b}'
 184     raise ValueError(msg)
 185 
 186 
 187 def handle_negative(w, r) -> None:
 188     skip_byte(r)
 189     w.write(b'-')
 190 
 191     if peek_byte(r) == 46: # ord('.')
 192         skip_byte(r)
 193         w.write(b'0.')
 194         if copy_digits(w, r) < 1:
 195             raise ValueError('expected numeric digits, but found none')
 196     else:
 197         handle_number(w, r)
 198 
 199 
 200 def handle_null(w, r) -> None:
 201     demand(r, b'null')
 202     w.write(b'null')
 203 
 204 
 205 def handle_None(w, r) -> None:
 206     demand(r, b'None')
 207     w.write(b'null')
 208 
 209 
 210 def handle_number(w, r) -> None:
 211     # handle integer part
 212     if copy_digits(w, r) < 1:
 213         raise ValueError('expected numeric digits, but found none')
 214 
 215     # handle optional decimals
 216     b = peek_byte(r)
 217     if b == 46: # ord('.')
 218         skip_byte(r)
 219         w.write(b'.')
 220         if copy_digits(w, r) < 1:
 221             # follow a trailing decimal dot with a 0
 222             w.write(b'0')
 223 
 224     # handle optional exponent
 225     if b == 101 or b == 69: # ord('e'), ord('E')
 226         skip_byte(r)
 227         w.write(b'e' if b == 101 else b'E')
 228         b = peek_byte(r)
 229         if b == 43: # ord('+')
 230             skip_byte(r)
 231         elif b == 45: # ord('-')
 232             w.write(b'-')
 233             skip_byte(r)
 234         if copy_digits(w, r) < 1:
 235             raise ValueError('expected numeric digits, but found none')
 236 
 237 
 238 def handle_object(w, r) -> None:
 239     seek_next = seek_next_token
 240 
 241     num_pairs = 0
 242     skip_byte(r)
 243     w.write(b'{')
 244 
 245     while True:
 246         # whitespace/comments may precede the next item/comma
 247         seek_next(r)
 248         b = peek_byte(r)
 249         if b < 0:
 250             raise ValueError('unexpected end of input data, before "}"')
 251 
 252         comma = b == 44 # ord(',')
 253 
 254         if comma:
 255             skip_byte(r)
 256             # whitespace/comments may follow the comma
 257             seek_next(r)
 258             b = peek_byte(r)
 259             if b < 0:
 260                 raise ValueError('unexpected end of input data, before "}"')
 261 
 262         if b == 125: # ord('}')
 263             skip_byte(r)
 264             w.write(b'}')
 265             return
 266 
 267         if num_pairs > 0:
 268             if not comma:
 269                 raise ValueError('missing a comma between key-value pairs')
 270             w.write(b',')
 271 
 272         demand_string(w, r)
 273         # whitespace/comments may follow the key
 274         seek_next(r)
 275         demand(r, b':')
 276         w.write(b':')
 277         # whitespace/comments may follow the colon
 278         seek_next(r)
 279         b = peek_byte(r)
 280         if b > 0:
 281             handlers[b](w, r)
 282             num_pairs += 1
 283 
 284 
 285 def handle_positive(w, r) -> None:
 286     # do nothing with the leading plus sign, which isn't allowed in JSON
 287     skip_byte(r)
 288 
 289     if peek_byte(r) == 46: # ord('.')
 290         skip_byte(r)
 291         w.write(b'0.')
 292         if copy_digits(w, r) < 1:
 293             raise ValueError('expected numeric digits, but found none')
 294     else:
 295         handle_number(w, r)
 296 
 297 
 298 def handle_single_quoted_string(w, r) -> None:
 299     skip_byte(r)
 300     w.write(b'"')
 301     handle_inner_string(w, r, 39) # ord('\'')
 302     w.write(b'"')
 303 
 304 
 305 def demand_string(w, r) -> None:
 306     quote = peek_byte(r)
 307     if quote < 0:
 308         msg = 'unexpected end of input, instead of a string quote'
 309         raise ValueError(msg)
 310 
 311     if quote == 34: # ord('"')
 312         handle_double_quoted_string(w, r)
 313         return
 314 
 315     if quote == 39: # ord('\'')
 316         handle_single_quoted_string(w, r)
 317         return
 318 
 319     if 32 < quote <= 126: # ord(' '), ord('~')
 320         msg = f'expected ", or even \', but got "{chr(quote)}" instead'
 321     else:
 322         msg = f'expected ", or even \', but got byte "{quote}" instead'
 323     raise ValueError(msg)
 324 
 325 
 326 def handle_inner_string(w, r, quote: int) -> None:
 327     esc = False
 328     bad_hex_msg = 'invalid hexadecimal symbols'
 329     early_end_msg = 'input data ended while still in quoted string'
 330 
 331     def is_hex(x: int) -> bool:
 332         # 48 is ord('0'), 57 is ord('9'), 97 is ord('a'), 102 is ord('f')
 333         return 48 <= x <= 57 or 97 <= x <= 102
 334 
 335     def lower(x: int) -> bool:
 336         # 65 is ord('A'), 90 is ord('Z')
 337         return x + 32 if 65 <= x <= 90 else x
 338 
 339     while True:
 340         chunk = r.peek(1)
 341         if len(chunk) < 1:
 342             raise ValueError(early_end_msg)
 343         b = chunk[0]
 344 
 345         if esc:
 346             esc = False
 347 
 348             if b == 120: # ord('x')
 349                 skip_byte(r)
 350                 chunk = read(r, 2)
 351                 if len(chunk) != 2:
 352                     raise ValueError(early_end_msg)
 353                 a = lower(chunk[0])
 354                 b = lower(chunk[1])
 355                 w.write(b'\\u00')
 356                 if not (is_hex(a) and is_hex(b)):
 357                     raise ValueError(bad_hex_msg)
 358                 w.write(a)
 359                 w.write(b)
 360                 continue
 361 
 362             if b == 117: # ord('u')
 363                 skip_byte(r)
 364                 chunk = read(r, 4)
 365                 if len(chunk) != 4:
 366                     raise ValueError(early_end_msg)
 367                 a = lower(chunk[0])
 368                 b = lower(chunk[1])
 369                 c = lower(chunk[2])
 370                 d = lower(chunk[3])
 371                 w.write(b'\\u')
 372                 if not (is_hex(a) and is_hex(b) and is_hex(c) and is_hex(d)):
 373                     raise ValueError(bad_hex_msg)
 374                 w.write(chunk)
 375                 continue
 376 
 377             # numbers for '"', '\\', 'n', 't', 'r', 'b', and 'f'
 378             if b in (34, 92, 110, 116, 114, 98, 102):
 379                 w.write(b'\\')
 380 
 381             w.write(read(r, 1))
 382             continue
 383 
 384         if b == 92: # ord('\\')
 385             esc = True
 386             skip_byte(r)
 387             continue
 388 
 389         if b == quote:
 390             skip_byte(r)
 391             return
 392 
 393         # emit normal string-byte
 394         w.write(read(r, 1))
 395 
 396 
 397 def handle_true(w, r) -> None:
 398     demand(r, b'true')
 399     w.write(b'true')
 400 
 401 
 402 def handle_True(w, r) -> None:
 403     demand(r, b'True')
 404     w.write(b'true')
 405 
 406 
 407 # setup byte-handling lookup tuple
 408 bh = [handle_invalid for i in range(256)]
 409 bh[ord('0')] = handle_number
 410 bh[ord('1')] = handle_number
 411 bh[ord('2')] = handle_number
 412 bh[ord('3')] = handle_number
 413 bh[ord('4')] = handle_number
 414 bh[ord('5')] = handle_number
 415 bh[ord('6')] = handle_number
 416 bh[ord('7')] = handle_number
 417 bh[ord('8')] = handle_number
 418 bh[ord('9')] = handle_number
 419 bh[ord('+')] = handle_positive
 420 bh[ord('-')] = handle_negative
 421 bh[ord('.')] = handle_dot
 422 bh[ord('"')] = handle_double_quoted_string
 423 bh[ord('\'')] = handle_single_quoted_string
 424 bh[ord('F')] = handle_False
 425 bh[ord('N')] = handle_None
 426 bh[ord('T')] = handle_True
 427 bh[ord('f')] = handle_false
 428 bh[ord('n')] = handle_null
 429 bh[ord('t')] = handle_true
 430 bh[ord('[')] = handle_array
 431 bh[ord('(')] = handle_array
 432 bh[ord('{')] = handle_object
 433 
 434 # handlers is the immutable byte-driven func-dispatch table
 435 handlers = tuple(bh)
 436 
 437 
 438 def copy_digits(w, r) -> int:
 439     'Returns how many digits were copied/handled.'
 440 
 441     copied = 0
 442     while True:
 443         chunk = r.peek(64)
 444         if len(chunk) == 0:
 445             return copied
 446 
 447         i = find_digits_end_index(chunk)
 448         if i >= 0:
 449             w.write(read(r, i))
 450             copied += i
 451             return copied
 452         else:
 453             w.write(chunk)
 454             read(r, len(chunk))
 455             copied += len(chunk)
 456 
 457 
 458 def seek_next_token(r) -> None:
 459     'Skip an arbitrarily-long mix of whitespace and comments.'
 460 
 461     while True:
 462         chunk = r.peek(1024)
 463         if len(chunk) == 0:
 464             # input is over, and this func doesn't consider that an error
 465             return
 466 
 467         comment = False
 468 
 469         for i, b in enumerate(chunk):
 470             # skip space, tab, line-feed, carriage-return, or form-feed
 471             if b in (9, 10, 11, 13, 32):
 472                 continue
 473 
 474             if b == 47 or b == 35: # ord('/'), ord('#')
 475                 read(r, i)
 476                 demand_comment(r)
 477                 comment = True
 478                 break
 479 
 480             # found start of next token
 481             read(r, i)
 482             return
 483 
 484         if not comment:
 485             read(r, len(chunk))
 486 
 487 
 488 def skip_line(r) -> None:
 489     while True:
 490         chunk = r.peek(1024)
 491         if len(chunk) == 0:
 492             return
 493 
 494         i = chunk.find(b'\n')
 495         if i >= 0:
 496             read(r, i + 1)
 497             return
 498 
 499         read(r, len(chunk))
 500 
 501 
 502 def skip_general_comment(r) -> None:
 503     while True:
 504         chunk = r.peek(1024)
 505         if len(chunk) == 0:
 506             raise ValueError(f'input data ended before an expected */')
 507 
 508         i = chunk.find(b'*')
 509         if i < 0:
 510             # no */ in this chunk, so skip it and try with the next one
 511             read(r, len(chunk))
 512             continue
 513 
 514         # skip right past the * just found, then check if a / follows it
 515         read(r, i + 1)
 516         if peek_byte(r) == 47: # ord('/')
 517             # got */, the end of this comment
 518             skip_byte(r)
 519             return
 520 
 521 
 522 def find_digits_end_index(chunk: bytes) -> int:
 523     i = 0
 524     for b in chunk:
 525         if 48 <= b <= 57:
 526             i += 1
 527         else:
 528             return i
 529 
 530     # all bytes (if any) were digits, so no end was found
 531     return -1
 532 
 533 
 534 def demand(r, what: bytes) -> None:
 535     lead = read(r, len(what))
 536     if not lead.startswith(what):
 537         lead = str(lead, encoding='utf-8')
 538         what = str(what, encoding='utf-8')
 539         raise ValueError(f'expected {what}, but got {lead} instead')
 540 
 541 
 542 def demand_comment(r) -> None:
 543     b = peek_byte(r)
 544     if b < 0:
 545         raise ValueError('unexpected end of input data')
 546     if b == 35: # ord('#')
 547         # handle single-line comment
 548         skip_line(r)
 549         return
 550 
 551     demand(r, b'/')
 552     b = peek_byte(r)
 553     if b < 0:
 554         raise ValueError('unexpected end of input data')
 555 
 556     if b == 47: # ord('/')
 557         # handle single-line comment
 558         skip_line(r)
 559         return
 560 
 561     if b == 42: # ord('*')
 562         # handle (potentially) multi-line comment
 563         skip_general_comment(r)
 564         return
 565 
 566     raise ValueError('expected * or another /, after a /')
 567 
 568 
 569 def json0(w, src, end) -> None:
 570     r = BufferedReader(src)
 571 
 572     # skip leading UTF-8 BOM (byte-order mark)
 573     if r.peek(3) == b'\xef\xbb\xbf':
 574         read(r, 3)
 575 
 576     # skip leading whitespace/comments
 577     seek_next_token(r)
 578 
 579     # emit a single output line, ending with a line-feed
 580     b = peek_byte(r)
 581     if b >= 0:
 582         handlers[b](w, r)
 583     else:
 584         # w.write(b'null')
 585         # treat empty(ish) input as invalid JSON
 586         raise ValueError('can\'t turn empty(ish) input into JSON')
 587 
 588     # deliberately run post-processing before checking for trailing-data
 589     # errors: for example, if post-proc func emits new line, errors will
 590     # show up on their separate line, which is nicer
 591     end(w)
 592 
 593     # ignore trailing whitespace/comment bytes, if present
 594     seek_next_token(r)
 595 
 596     # ignore trailing semicolon, if present
 597     b = peek_byte(r)
 598     if b == 59: # ord(';')
 599         read(r, 1)
 600         # ignore trailing whitespace/comment bytes, if present
 601         seek_next_token(r)
 602 
 603     if len(r.peek(1)) > 0:
 604         raise ValueError('unexpected trailing bytes in JSON data')
 605 
 606 
 607 def seems_url(s: str) -> bool:
 608     protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
 609     return any(s.startswith(p) for p in protocols)
 610 
 611 
 612 def handle_json(w, r) -> None:
 613     def end(w) -> None:
 614         w.write(b'\n')
 615         w.flush()
 616     json0(w, r, end)
 617 
 618 
 619 def handle_json_lines(w, r) -> None:
 620     global pos, linenum
 621 
 622     items = 0
 623     linenum = 0
 624     w.write(b'[')
 625 
 626     while True:
 627         line = r.readline().lstrip()
 628         if not line:
 629             break
 630 
 631         pos = 1
 632         linenum += 1
 633 
 634         stripped = line.strip()
 635         if not stripped or stripped.startswith(b'//'):
 636             continue
 637 
 638         items += 1
 639         if items > 1:
 640             w.write(b',')
 641 
 642         json0(w, BytesIO(line), lambda w: w.flush())
 643 
 644     w.write(b']\n')
 645 
 646 
 647 start_args = 1
 648 handle_input = handle_json
 649 if len(argv) > 1 and argv[1] in ('-jl', '--jl', '-jsonl', '--jsonl'):
 650     start_args = 2
 651     handle_input = handle_json_lines
 652 
 653 if len(argv) - 1 > start_args:
 654     print(f'multiple inputs not allowed', file=stderr)
 655     exit(1)
 656 
 657 w = stdout.buffer
 658 name = argv[start_args] if len(argv) > start_args else '-'
 659 
 660 # values keeping track of the input-position, shown in case of errors
 661 pos = 1
 662 linenum = 1
 663 
 664 try:
 665     if name == '-':
 666         handle_input(w, stdin.buffer)
 667     elif seems_url(name):
 668         from urllib.request import urlopen
 669         with urlopen(name) as inp:
 670             handle_input(w, inp)
 671     else:
 672         with open(name, mode='rb') as inp:
 673             handle_input(w, inp)
 674 except BrokenPipeError:
 675     # quit quietly, instead of showing a confusing error message
 676     stderr.close()
 677     exit(0)
 678 except KeyboardInterrupt:
 679     exit(2)
 680 except Exception as e:
 681     stdout.write('\n')
 682     print(f'line {linenum}, pos {pos} : {e}', file=stderr)
 683     exit(1)