File: j0.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2020-2025 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 from io import BufferedReader, BytesIO
  27 from sys import argv, exit, stderr, stdin, stdout
  28 
  29 
  30 info = '''
  31 j0 [filepath/URI...]
  32 
  33 Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.
  34 
  35 Besides minimizing bytes, this tool also adapts almost-JSON input into valid
  36 JSON, since it ignores comments and trailing commas, neither of which are
  37 supported in JSON, but which are still commonly used.
  38 
  39 It also turns single-quoted strings into proper double-quoted ones, as well
  40 as change invalid 2-digit `\\x` hexadecimal escapes into JSON's 4-digit `\\u`
  41 hexadecimal escapes. When backslashes in strings are followed by an invalid
  42 escape letter, the backslash is ignored.
  43 
  44 Output is always a single line of valid JSON, ending with a line-feed.
  45 '''
  46 
  47 # handle standard help cmd-line options, quitting right away in that case
  48 if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
  49     print(info.strip())
  50     exit(0)
  51 
  52 
  53 # note: using regexes doesn't seem to speed-up number/string-handling
  54 
  55 
  56 def read(r, size: int) -> bytes:
  57     global pos, linenum
  58 
  59     chunk = r.read(size)
  60     if not chunk:
  61         return chunk
  62 
  63     if not (10 in chunk):
  64         pos += len(chunk)
  65         return chunk
  66 
  67     for b in chunk:
  68         if b == 10:
  69             pos = 1
  70             linenum += 1
  71         else:
  72             pos += 1
  73     return chunk
  74 
  75 
  76 def skip_byte(r) -> None:
  77     global pos, linenum
  78 
  79     chunk = r.read(1)
  80     if not chunk:
  81         return
  82 
  83     if chunk[0] == 10:
  84         pos = 1
  85         linenum += 1
  86     else:
  87         pos += 1
  88 
  89 
  90 def peek_byte(r) -> int:
  91     chunk = r.peek(64)
  92     if len(chunk) > 0:
  93         return chunk[0]
  94     return -1
  95 
  96 
  97 def handle_array(w, r) -> None:
  98     seek_next = seek_next_token
  99 
 100     n = 0
 101     skip_byte(r)
 102     w.write(b'[')
 103 
 104     while True:
 105         # whitespace/comments may precede the next item/comma
 106         seek_next(r)
 107         b = peek_byte(r)
 108         if b < 0:
 109             raise ValueError('unexpected end of input data, before "]"')
 110 
 111         comma = b == 44 # ord(',')
 112 
 113         if comma:
 114             skip_byte(r)
 115             # whitespace/comments may follow the comma
 116             seek_next(r)
 117             b = peek_byte(r)
 118             if b < 0:
 119                 raise ValueError('unexpected end of input data, before "]"')
 120 
 121         if b == 93: # ord(']')
 122             skip_byte(r)
 123             w.write(b']')
 124             return
 125 
 126         if n > 0:
 127             if not comma:
 128                 raise ValueError('missing a comma between array values')
 129             w.write(b',')
 130 
 131         b = peek_byte(r)
 132         if b > 0:
 133             handlers[b](w, r)
 134             n += 1
 135 
 136 
 137 def handle_double_quoted_string(w, r) -> None:
 138     skip_byte(r)
 139     w.write(b'"')
 140     handle_inner_string(w, r, 34) # ord('"')
 141     w.write(b'"')
 142 
 143 
 144 def handle_dot(w, r) -> None:
 145     skip_byte(r)
 146     # precede the leading decimal dot with a 0
 147     w.write(b'0.')
 148 
 149     # handle decimals, which in this case aren't optional, as a leading
 150     # dot is what led to this point
 151     if copy_digits(w, r) < 1:
 152         raise ValueError('expected numeric digits, but found none')
 153 
 154 
 155 def handle_false(w, r) -> None:
 156     demand(r, b'false')
 157     w.write(b'false')
 158 
 159 
 160 def handle_False(w, r) -> None:
 161     demand(r, b'False')
 162     w.write(b'false')
 163 
 164 
 165 def handle_invalid(w, r) -> None:
 166     b = peek_byte(r)
 167     if b < 0:
 168         raise ValueError('unexpected end of input data')
 169     # raise ValueError(f'unexpected JSON byte-value {b}')
 170     if 32 < b <= 126:
 171         msg = f'unexpected symbol {chr(b)}'
 172     else:
 173         msg = f'unexpected byte-value {b}'
 174     raise ValueError(msg)
 175 
 176 
 177 def handle_negative(w, r) -> None:
 178     skip_byte(r)
 179     w.write(b'-')
 180 
 181     if peek_byte(r) == 46: # ord('.')
 182         skip_byte(r)
 183         w.write(b'0.')
 184         if copy_digits(w, r) < 1:
 185             raise ValueError('expected numeric digits, but found none')
 186     else:
 187         handle_number(w, r)
 188 
 189 
 190 def handle_null(w, r) -> None:
 191     demand(r, b'null')
 192     w.write(b'null')
 193 
 194 
 195 def handle_None(w, r) -> None:
 196     demand(r, b'None')
 197     w.write(b'null')
 198 
 199 
 200 def handle_number(w, r) -> None:
 201     # handle integer part
 202     if copy_digits(w, r) < 1:
 203         raise ValueError('expected numeric digits, but found none')
 204 
 205     # handle optional decimals
 206     b = peek_byte(r)
 207     if b == 46: # ord('.')
 208         skip_byte(r)
 209         w.write(b'.')
 210         if copy_digits(w, r) < 1:
 211             # follow a trailing decimal dot with a 0
 212             w.write(b'0')
 213 
 214     # handle optional exponent
 215     if b == 101 or b == 69: # ord('e'), ord('E')
 216         skip_byte(r)
 217         w.write(b'e' if b == 101 else b'E')
 218         b = peek_byte(r)
 219         if b == 43: # ord('+')
 220             skip_byte(r)
 221         elif b == 45: # ord('-')
 222             w.write(b'-')
 223             skip_byte(r)
 224         if copy_digits(w, r) < 1:
 225             raise ValueError('expected numeric digits, but found none')
 226 
 227 
 228 def handle_object(w, r) -> None:
 229     seek_next = seek_next_token
 230 
 231     num_pairs = 0
 232     skip_byte(r)
 233     w.write(b'{')
 234 
 235     while True:
 236         # whitespace/comments may precede the next item/comma
 237         seek_next(r)
 238         b = peek_byte(r)
 239         if b < 0:
 240             raise ValueError('unexpected end of input data, before "}"')
 241 
 242         comma = b == 44 # ord(',')
 243 
 244         if comma:
 245             skip_byte(r)
 246             # whitespace/comments may follow the comma
 247             seek_next(r)
 248             b = peek_byte(r)
 249             if b < 0:
 250                 raise ValueError('unexpected end of input data, before "}"')
 251 
 252         if b == 125: # ord('}')
 253             skip_byte(r)
 254             w.write(b'}')
 255             return
 256 
 257         if num_pairs > 0:
 258             if not comma:
 259                 raise ValueError('missing a comma between key-value pairs')
 260             w.write(b',')
 261 
 262         demand_string(w, r)
 263         # whitespace/comments may follow the key
 264         seek_next(r)
 265         demand(r, b':')
 266         w.write(b':')
 267         # whitespace/comments may follow the colon
 268         seek_next(r)
 269         b = peek_byte(r)
 270         if b > 0:
 271             handlers[b](w, r)
 272             num_pairs += 1
 273 
 274 
 275 def handle_positive(w, r) -> None:
 276     # do nothing with the leading plus sign, which isn't allowed in JSON
 277     skip_byte(r)
 278 
 279     if peek_byte(r) == 46: # ord('.')
 280         skip_byte(r)
 281         w.write(b'0.')
 282         if copy_digits(w, r) < 1:
 283             raise ValueError('expected numeric digits, but found none')
 284     else:
 285         handle_number(w, r)
 286 
 287 
 288 def handle_single_quoted_string(w, r) -> None:
 289     skip_byte(r)
 290     w.write(b'"')
 291     handle_inner_string(w, r, 39) # ord('\'')
 292     w.write(b'"')
 293 
 294 
 295 def demand_string(w, r) -> None:
 296     quote = peek_byte(r)
 297     if quote < 0:
 298         msg = 'unexpected end of input, instead of a string quote'
 299         raise ValueError(msg)
 300 
 301     if quote == 34: # ord('"')
 302         handle_double_quoted_string(w, r)
 303         return
 304 
 305     if quote == 39: # ord('\'')
 306         handle_single_quoted_string(w, r)
 307         return
 308 
 309     if 32 < quote <= 126: # ord(' '), ord('~')
 310         msg = f'expected ", or even \', but got "{chr(quote)}" instead'
 311     else:
 312         msg = f'expected ", or even \', but got byte "{quote}" instead'
 313     raise ValueError(msg)
 314 
 315 
 316 def handle_inner_string(w, r, quote: int) -> None:
 317     esc = False
 318     bad_hex_msg = 'invalid hexadecimal symbols'
 319     early_end_msg = 'input data ended while still in quoted string'
 320 
 321     def is_hex(x: int) -> bool:
 322         # 48 is ord('0'), 57 is ord('9'), 97 is ord('a'), 102 is ord('f')
 323         return 48 <= x <= 57 or 97 <= x <= 102
 324 
 325     def lower(x: int) -> bool:
 326         # 65 is ord('A'), 90 is ord('Z')
 327         return x + 32 if 65 <= x <= 90 else x
 328 
 329     while True:
 330         chunk = r.peek(1)
 331         if len(chunk) < 1:
 332             raise ValueError(early_end_msg)
 333         b = chunk[0]
 334 
 335         if esc:
 336             esc = False
 337 
 338             if b == 120: # ord('x')
 339                 skip_byte(r)
 340                 chunk = read(r, 2)
 341                 if len(chunk) != 2:
 342                     raise ValueError(early_end_msg)
 343                 a = lower(chunk[0])
 344                 b = lower(chunk[1])
 345                 w.write(b'\\u00')
 346                 if not (is_hex(a) and is_hex(b)):
 347                     raise ValueError(bad_hex_msg)
 348                 w.write(a)
 349                 w.write(b)
 350                 continue
 351 
 352             if b == 117: # ord('u')
 353                 skip_byte(r)
 354                 chunk = read(r, 4)
 355                 if len(chunk) != 4:
 356                     raise ValueError(early_end_msg)
 357                 a = lower(chunk[0])
 358                 b = lower(chunk[1])
 359                 c = lower(chunk[2])
 360                 d = lower(chunk[3])
 361                 w.write(b'\\u')
 362                 if not (is_hex(a) and is_hex(b) and is_hex(c) and is_hex(d)):
 363                     raise ValueError(bad_hex_msg)
 364                 w.write(chunk)
 365                 continue
 366 
 367             # numbers for '"', '\\', 'n', 't', 'r', 'b', and 'f'
 368             if b in (34, 92, 110, 116, 114, 98, 102):
 369                 w.write(b'\\')
 370 
 371             w.write(read(r, 1))
 372             continue
 373 
 374         if b == 92: # ord('\\')
 375             esc = True
 376             skip_byte(r)
 377             continue
 378 
 379         if b == quote:
 380             skip_byte(r)
 381             return
 382 
 383         # emit normal string-byte
 384         w.write(read(r, 1))
 385 
 386 
 387 def handle_true(w, r) -> None:
 388     demand(r, b'true')
 389     w.write(b'true')
 390 
 391 
 392 def handle_True(w, r) -> None:
 393     demand(r, b'True')
 394     w.write(b'true')
 395 
 396 
 397 # setup byte-handling lookup tuple
 398 bh = [handle_invalid for i in range(256)]
 399 bh[ord('0')] = handle_number
 400 bh[ord('1')] = handle_number
 401 bh[ord('2')] = handle_number
 402 bh[ord('3')] = handle_number
 403 bh[ord('4')] = handle_number
 404 bh[ord('5')] = handle_number
 405 bh[ord('6')] = handle_number
 406 bh[ord('7')] = handle_number
 407 bh[ord('8')] = handle_number
 408 bh[ord('9')] = handle_number
 409 bh[ord('+')] = handle_positive
 410 bh[ord('-')] = handle_negative
 411 bh[ord('.')] = handle_dot
 412 bh[ord('"')] = handle_double_quoted_string
 413 bh[ord('\'')] = handle_single_quoted_string
 414 bh[ord('F')] = handle_False
 415 bh[ord('N')] = handle_None
 416 bh[ord('T')] = handle_True
 417 bh[ord('f')] = handle_false
 418 bh[ord('n')] = handle_null
 419 bh[ord('t')] = handle_true
 420 bh[ord('[')] = handle_array
 421 bh[ord('{')] = handle_object
 422 
 423 # handlers is the immutable byte-driven func-dispatch table
 424 handlers = tuple(bh)
 425 
 426 
 427 def copy_digits(w, r) -> int:
 428     'Returns how many digits were copied/handled.'
 429 
 430     copied = 0
 431     while True:
 432         chunk = r.peek(64)
 433         if len(chunk) == 0:
 434             return copied
 435 
 436         i = find_digits_end_index(chunk)
 437         if i >= 0:
 438             w.write(read(r, i))
 439             copied += i
 440             return copied
 441         else:
 442             w.write(chunk)
 443             read(r, len(chunk))
 444             copied += len(chunk)
 445 
 446 
 447 def seek_next_token(r) -> None:
 448     'Skip an arbitrarily-long mix of whitespace and comments.'
 449 
 450     while True:
 451         chunk = r.peek(1024)
 452         if len(chunk) == 0:
 453             # input is over, and this func doesn't consider that an error
 454             return
 455 
 456         comment = False
 457 
 458         for i, b in enumerate(chunk):
 459             # skip space, tab, line-feed, carriage-return, or form-feed
 460             if b in (9, 10, 11, 13, 32):
 461                 continue
 462 
 463             if b == 47: # ord('/')
 464                 read(r, i)
 465                 demand_comment(r)
 466                 comment = True
 467                 break
 468 
 469             # found start of next token
 470             read(r, i)
 471             return
 472 
 473         if not comment:
 474             read(r, len(chunk))
 475 
 476 
 477 def skip_line(r) -> None:
 478     while True:
 479         chunk = r.peek(1024)
 480         if len(chunk) == 0:
 481             return
 482 
 483         i = chunk.find(b'\n')
 484         if i >= 0:
 485             read(r, i + 1)
 486             return
 487 
 488         read(r, len(chunk))
 489 
 490 
 491 def skip_general_comment(r) -> None:
 492     while True:
 493         chunk = r.peek(1024)
 494         if len(chunk) == 0:
 495             raise ValueError(f'input data ended before an expected */')
 496 
 497         i = chunk.find(b'*')
 498         if i < 0:
 499             # no */ in this chunk, so skip it and try with the next one
 500             read(r, len(chunk))
 501             continue
 502 
 503         # skip right past the * just found, then check if a / follows it
 504         read(r, i + 1)
 505         if peek_byte(r) == 47: # ord('/')
 506             # got */, the end of this comment
 507             skip_byte(r)
 508             return
 509 
 510 
 511 def find_digits_end_index(chunk: bytes) -> int:
 512     i = 0
 513     for b in chunk:
 514         if 48 <= b <= 57:
 515             i += 1
 516         else:
 517             return i
 518 
 519     # all bytes (if any) were digits, so no end was found
 520     return -1
 521 
 522 
 523 def demand(r, what: bytes) -> None:
 524     lead = read(r, len(what))
 525     if not lead.startswith(what):
 526         lead = str(lead, encoding='utf-8')
 527         what = str(what, encoding='utf-8')
 528         raise ValueError(f'expected {what}, but got {lead} instead')
 529 
 530 
 531 def demand_comment(r) -> None:
 532     demand(r, b'/')
 533     b = peek_byte(r)
 534     if b < 0:
 535         raise ValueError('unexpected end of input data')
 536 
 537     if b == 47: # ord('/')
 538         # handle single-line comment
 539         skip_line(r)
 540         return
 541 
 542     if b == 42: # ord('*')
 543         # handle (potentially) multi-line comment
 544         skip_general_comment(r)
 545         return
 546 
 547     raise ValueError('expected * or another /, after a /')
 548 
 549 
 550 def json0(w, src, end) -> None:
 551     r = BufferedReader(src)
 552 
 553     # skip leading UTF-8 BOM (byte-order mark)
 554     if r.peek(3) == b'\xef\xbb\xbf':
 555         read(r, 3)
 556 
 557     # skip leading whitespace/comments
 558     seek_next_token(r)
 559 
 560     # emit a single output line, ending with a line-feed
 561     b = peek_byte(r)
 562     if b >= 0:
 563         handlers[b](w, r)
 564     else:
 565         # w.write(b'null')
 566         # treat empty(ish) input as invalid JSON
 567         raise ValueError('can\'t turn empty(ish) input into JSON')
 568 
 569     # deliberately run post-processing before checking for trailing-data
 570     # errors: for example, if post-proc func emits new line, errors will
 571     # show up on their separate line, which is nicer
 572     end(w)
 573 
 574     # ignore trailing whitespace/comment bytes, if present
 575     seek_next_token(r)
 576 
 577     # ignore trailing semicolon, if present
 578     b = peek_byte(r)
 579     if b == 59: # ord(';')
 580         read(r, 1)
 581         # ignore trailing whitespace/comment bytes, if present
 582         seek_next_token(r)
 583 
 584     if len(r.peek(1)) > 0:
 585         raise ValueError('unexpected trailing bytes in JSON data')
 586 
 587 
 588 def seems_url(s: str) -> bool:
 589     protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
 590     return any(s.startswith(p) for p in protocols)
 591 
 592 
 593 def handle_json(w, r) -> None:
 594     def end(w) -> None:
 595         w.write(b'\n')
 596         w.flush()
 597     json0(w, r, end)
 598 
 599 
 600 def handle_json_lines(w, r) -> None:
 601     global pos, linenum
 602 
 603     items = 0
 604     linenum = 0
 605     w.write(b'[')
 606 
 607     while True:
 608         line = r.readline().lstrip()
 609         if not line:
 610             break
 611 
 612         pos = 1
 613         linenum += 1
 614 
 615         stripped = line.strip()
 616         if not stripped or stripped.startswith(b'//'):
 617             continue
 618 
 619         items += 1
 620         if items > 1:
 621             w.write(b',')
 622 
 623         json0(w, BytesIO(line), lambda w: w.flush())
 624 
 625     w.write(b']\n')
 626 
 627 
 628 start_args = 1
 629 handle_input = handle_json
 630 if len(argv) > 1 and argv[1] in ('-jl', '--jl', '-jsonl', '--jsonl'):
 631     start_args = 2
 632     handle_input = handle_json_lines
 633 
 634 if len(argv) - 1 > start_args:
 635     print(f'\x1b[31mmultiple inputs not allowed\x1b[0m', file=stderr)
 636     exit(1)
 637 
 638 w = stdout.buffer
 639 name = argv[start_args] if len(argv) > start_args else '-'
 640 
 641 # values keeping track of the input-position, shown in case of errors
 642 pos = 1
 643 linenum = 1
 644 
 645 try:
 646     if name == '-':
 647         handle_input(w, stdin.buffer)
 648     elif seems_url(name):
 649         from urllib.request import urlopen
 650         with urlopen(name) as inp:
 651             handle_input(w, inp)
 652     else:
 653         with open(name, mode='rb') as inp:
 654             handle_input(w, inp)
 655 except BrokenPipeError:
 656     # quit quietly, instead of showing a confusing error message
 657     stderr.close()
 658     exit(0)
 659 except KeyboardInterrupt:
 660     exit(2)
 661 except Exception as e:
 662     stdout.write('\n')
 663     print(f'\x1b[31mline {linenum}, pos {pos} : {e}\x1b[0m', file=stderr)
 664     exit(1)