File: j0.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2024 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 from io import BufferedReader, BytesIO
  27 from sys import argv, exit, stderr, stdin, stdout
  28 
  29 
# info is the help message, shown (stripped of its surrounding empty lines)
# when the script is given one of the help options
info = '''
j0 [filepath/URI...]

Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.

Besides minimizing bytes, this tool also adapts almost-JSON input into valid
JSON, since it ignores comments and trailing commas, neither of which are
supported in JSON, but which are still commonly used.

It also turns single-quoted strings into proper double-quoted ones, as well
as change invalid 2-digit `\\x` hexadecimal escapes into JSON's 4-digit `\\u`
hexadecimal escapes. When backslashes in strings are followed by an invalid
escape letter, the backslash is ignored.

Output is always a single line of valid JSON, ending with a line-feed.
'''

# handle standard help cmd-line options, quitting right away in that case;
# this only applies when the help option is the only cmd-line argument
if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
    print(info.strip())
    exit(0)
  51 
  52 
  53 # note: using regexes doesn't seem to speed-up number/string-handling
  54 
  55 
  56 def read(r, size: int) -> bytes:
  57     global pos, linenum
  58 
  59     chunk = r.read(size)
  60     if not chunk:
  61         return chunk
  62 
  63     if not (10 in chunk):
  64         pos += len(chunk)
  65         return chunk
  66 
  67     for b in chunk:
  68         if b == 10:
  69             pos = 1
  70             linenum += 1
  71         else:
  72             pos += 1
  73 
  74     return chunk
  75 
  76 
  77 def skip_byte(r) -> None:
  78     global pos, linenum
  79 
  80     chunk = r.read(1)
  81     if not chunk:
  82         return
  83 
  84     if chunk[0] == 10:
  85         pos = 1
  86         linenum += 1
  87     else:
  88         pos += 1
  89 
  90 
  91 def peek_byte(r) -> int:
  92     chunk = r.peek(64)
  93     if len(chunk) > 0:
  94         return chunk[0]
  95     return -1
  96 
  97 
  98 def handle_array(w, r) -> None:
  99     seek_next = seek_next_token
 100 
 101     n = 0
 102     skip_byte(r)
 103     w.write(b'[')
 104 
 105     while True:
 106         # whitespace/comments may precede the next item/comma
 107         seek_next(r)
 108         b = peek_byte(r)
 109         if b < 0:
 110             raise ValueError('unexpected end of input data, before "]"')
 111 
 112         comma = b == 44 # ord(',')
 113 
 114         if comma:
 115             skip_byte(r)
 116             # whitespace/comments may follow the comma
 117             seek_next(r)
 118             b = peek_byte(r)
 119             if b < 0:
 120                 raise ValueError('unexpected end of input data, before "]"')
 121 
 122         if b == 93: # ord(']')
 123             skip_byte(r)
 124             w.write(b']')
 125             return
 126 
 127         if n > 0:
 128             if not comma:
 129                 raise ValueError('missing a comma between array values')
 130             w.write(b',')
 131 
 132         b = peek_byte(r)
 133         if b > 0:
 134             handlers[b](w, r)
 135             n += 1
 136 
 137 
 138 def handle_double_quoted_string(w, r) -> None:
 139     skip_byte(r)
 140     w.write(b'"')
 141     handle_inner_string(w, r, 34) # ord('"')
 142     w.write(b'"')
 143 
 144 
 145 def handle_dot(w, r) -> None:
 146     skip_byte(r)
 147     # precede the leading decimal dot with a 0
 148     w.write(b'0.')
 149 
 150     # handle decimals, which in this case aren't optional, as a leading
 151     # dot is what led to this point
 152     if copy_digits(w, r) < 1:
 153         raise ValueError('expected numeric digits, but found none')
 154 
 155 
 156 def handle_false(w, r) -> None:
 157     demand(r, b'false')
 158     w.write(b'false')
 159 
 160 
 161 def handle_invalid(w, r) -> None:
 162     b = peek_byte(r)
 163     if b < 0:
 164         raise ValueError('unexpected end of input data')
 165     # raise ValueError(f'unexpected JSON byte-value {b}')
 166     if 32 < b <= 126:
 167         msg = f'unexpected symbol {chr(b)}'
 168     else:
 169         msg = f'unexpected byte-value {b}'
 170     raise ValueError(msg)
 171 
 172 
 173 def handle_negative(w, r) -> None:
 174     skip_byte(r)
 175     w.write(b'-')
 176 
 177     if peek_byte(r) == 46: # ord('.')
 178         skip_byte(r)
 179         w.write(b'0.')
 180         if copy_digits(w, r) < 1:
 181             raise ValueError('expected numeric digits, but found none')
 182     else:
 183         handle_number(w, r)
 184 
 185 
 186 def handle_null(w, r) -> None:
 187     demand(r, b'null')
 188     w.write(b'null')
 189 
 190 
 191 def handle_number(w, r) -> None:
 192     # handle integer part
 193     if copy_digits(w, r) < 1:
 194         raise ValueError('expected numeric digits, but found none')
 195 
 196     # handle optional decimals
 197     b = peek_byte(r)
 198     if b == 46: # ord('.')
 199         skip_byte(r)
 200         w.write(b'.')
 201         if copy_digits(w, r) < 1:
 202             # follow a trailing decimal dot with a 0
 203             w.write(b'0')
 204 
 205     # handle optional exponent
 206     if b == 101 or b == 69: # ord('e'), ord('E')
 207         skip_byte(r)
 208         w.write(b'e' if b == 101 else b'E')
 209         b = peek_byte(r)
 210         if b == 43: # ord('+')
 211             skip_byte(r)
 212         elif b == 45: # ord('-')
 213             w.write(b'-')
 214             skip_byte(r)
 215         if copy_digits(w, r) < 1:
 216             raise ValueError('expected numeric digits, but found none')
 217 
 218 
 219 def handle_object(w, r) -> None:
 220     seek_next = seek_next_token
 221 
 222     num_pairs = 0
 223     skip_byte(r)
 224     w.write(b'{')
 225 
 226     while True:
 227         # whitespace/comments may precede the next item/comma
 228         seek_next(r)
 229         b = peek_byte(r)
 230         if b < 0:
 231             raise ValueError('unexpected end of input data, before "}"')
 232 
 233         comma = b == 44 # ord(',')
 234 
 235         if comma:
 236             skip_byte(r)
 237             # whitespace/comments may follow the comma
 238             seek_next(r)
 239             b = peek_byte(r)
 240             if b < 0:
 241                 raise ValueError('unexpected end of input data, before "}"')
 242 
 243         if b == 125: # ord('}')
 244             skip_byte(r)
 245             w.write(b'}')
 246             return
 247 
 248         if num_pairs > 0:
 249             if not comma:
 250                 raise ValueError('missing a comma between key-value pairs')
 251             w.write(b',')
 252 
 253         demand_string(w, r)
 254         # whitespace/comments may follow the key
 255         seek_next(r)
 256         demand(r, b':')
 257         w.write(b':')
 258         # whitespace/comments may follow the colon
 259         seek_next(r)
 260         b = peek_byte(r)
 261         if b > 0:
 262             handlers[b](w, r)
 263             num_pairs += 1
 264 
 265 
 266 def handle_positive(w, r) -> None:
 267     # do nothing with the leading plus sign: strictly-speaking, JSON numbers
 268     # can't start with a positive sign, and this tool's output is supposed
 269     # to be `JSON-0` (minimized) anyway
 270     skip_byte(r)
 271 
 272     if peek_byte(r) == 46: # ord('.')
 273         skip_byte(r)
 274         w.write(b'0.')
 275         if copy_digits(w, r) < 1:
 276             raise ValueError('expected numeric digits, but found none')
 277     else:
 278         handle_number(w, r)
 279 
 280 
 281 def handle_single_quoted_string(w, r) -> None:
 282     skip_byte(r)
 283     w.write(b'"')
 284     handle_inner_string(w, r, 39) # ord('\'')
 285     w.write(b'"')
 286 
 287 
 288 def demand_string(w, r) -> None:
 289     quote = peek_byte(r)
 290     if quote < 0:
 291         msg = 'unexpected end of input, instead of a string quote'
 292         raise ValueError(msg)
 293 
 294     if quote == 34: # ord('"')
 295         handle_double_quoted_string(w, r)
 296         return
 297 
 298     if quote == 39: # ord('\'')
 299         handle_single_quoted_string(w, r)
 300         return
 301 
 302     if 32 < quote <= 126: # ord(' '), ord('~')
 303         msg = f'expected ", or even \', but got "{chr(quote)}" instead'
 304     else:
 305         msg = f'expected ", or even \', but got byte "{quote}" instead'
 306     raise ValueError(msg)
 307 
 308 
 309 def handle_inner_string(w, r, quote: int) -> None:
 310     esc = False
 311     bad_hex_msg = 'invalid hexadecimal symbols'
 312     early_end_msg = 'input data ended while still in quoted string'
 313 
 314     def is_hex(x: int) -> bool:
 315         # 48 is ord('0'), 57 is ord('9'), 97 is ord('a'), 102 is ord('f')
 316         return 48 <= x <= 57 or 97 <= x <= 102
 317 
 318     def lower(x: int) -> bool:
 319         # 65 is ord('A'), 90 is ord('Z')
 320         return x + 32 if 65 <= x <= 90 else x
 321 
 322     while True:
 323         chunk = r.peek(1)
 324         if len(chunk) < 1:
 325             raise ValueError(early_end_msg)
 326         b = chunk[0]
 327 
 328         if esc:
 329             esc = False
 330 
 331             if b == 120: # ord('x')
 332                 skip_byte(r)
 333                 chunk = read(r, 2)
 334                 if len(chunk) != 2:
 335                     raise ValueError(early_end_msg)
 336                 a = lower(chunk[0])
 337                 b = lower(chunk[1])
 338                 w.write(b'\\u00')
 339                 if not (is_hex(a) and is_hex(b)):
 340                     raise ValueError(bad_hex_msg)
 341                 w.write(a)
 342                 w.write(b)
 343                 continue
 344 
 345             if b == 117: # ord('u')
 346                 skip_byte(r)
 347                 chunk = read(r, 4)
 348                 if len(chunk) != 4:
 349                     raise ValueError(early_end_msg)
 350                 a = lower(chunk[0])
 351                 b = lower(chunk[1])
 352                 c = lower(chunk[2])
 353                 d = lower(chunk[3])
 354                 w.write(b'\\u')
 355                 if not (is_hex(a) and is_hex(b) and is_hex(c) and is_hex(d)):
 356                     raise ValueError(bad_hex_msg)
 357                 w.write(chunk)
 358                 continue
 359 
 360             # numbers for '"', '\\', 'n', 't', 'r', 'b', and 'f'
 361             if b in (34, 92, 110, 116, 114, 98, 102):
 362                 w.write(b'\\')
 363 
 364             w.write(read(r, 1))
 365             continue
 366 
 367         if b == 92: # ord('\\')
 368             esc = True
 369             skip_byte(r)
 370             continue
 371 
 372         if b == quote:
 373             skip_byte(r)
 374             return
 375 
 376         # emit normal string-byte
 377         w.write(read(r, 1))
 378 
 379 
 380 def handle_true(w, r) -> None:
 381     demand(r, b'true')
 382     w.write(b'true')
 383 
 384 
 385 # setup byte-handling lookup tuple
 386 bh = [handle_invalid for i in range(256)]
 387 bh[ord('0')] = handle_number
 388 bh[ord('1')] = handle_number
 389 bh[ord('2')] = handle_number
 390 bh[ord('3')] = handle_number
 391 bh[ord('4')] = handle_number
 392 bh[ord('5')] = handle_number
 393 bh[ord('6')] = handle_number
 394 bh[ord('7')] = handle_number
 395 bh[ord('8')] = handle_number
 396 bh[ord('9')] = handle_number
 397 bh[ord('+')] = handle_positive
 398 bh[ord('-')] = handle_negative
 399 bh[ord('.')] = handle_dot
 400 bh[ord('"')] = handle_double_quoted_string
 401 bh[ord('\'')] = handle_single_quoted_string
 402 bh[ord('f')] = handle_false
 403 bh[ord('n')] = handle_null
 404 bh[ord('t')] = handle_true
 405 bh[ord('[')] = handle_array
 406 bh[ord('{')] = handle_object
 407 
 408 # handlers is the immutable byte-driven func-dispatch table
 409 handlers = tuple(bh)
 410 
 411 
 412 def copy_digits(w, r) -> int:
 413     'Returns how many digits were copied/handled.'
 414 
 415     copied = 0
 416     while True:
 417         chunk = r.peek(64)
 418         if len(chunk) == 0:
 419             return copied
 420 
 421         i = find_digits_end_index(chunk)
 422         if i >= 0:
 423             w.write(read(r, i))
 424             copied += i
 425             return copied
 426         else:
 427             w.write(chunk)
 428             read(r, len(chunk))
 429             copied += len(chunk)
 430 
 431 
 432 def seek_next_token(r) -> None:
 433     'Skip an arbitrarily-long mix of whitespace and comments.'
 434 
 435     while True:
 436         chunk = r.peek(1024)
 437         if len(chunk) == 0:
 438             # input is over, and this func doesn't consider that an error
 439             return
 440 
 441         comment = False
 442 
 443         for i, b in enumerate(chunk):
 444             # skip space, tab, line-feed, carriage-return, or form-feed
 445             if b in (9, 10, 11, 13, 32):
 446                 continue
 447 
 448             if b == 47: # ord('/')
 449                 read(r, i)
 450                 demand_comment(r)
 451                 comment = True
 452                 break
 453 
 454             # found start of next token
 455             read(r, i)
 456             return
 457 
 458         if not comment:
 459             read(r, len(chunk))
 460 
 461 
 462 def skip_line(r) -> None:
 463     while True:
 464         chunk = r.peek(1024)
 465         if len(chunk) == 0:
 466             return
 467 
 468         i = chunk.find(b'\n')
 469         if i >= 0:
 470             read(r, i + 1)
 471             return
 472 
 473         read(r, len(chunk))
 474 
 475 
 476 def skip_general_comment(r) -> None:
 477     while True:
 478         chunk = r.peek(1024)
 479         if len(chunk) == 0:
 480             raise ValueError(f'input data ended before an expected */')
 481 
 482         i = chunk.find(b'*')
 483         if i < 0:
 484             # no */ in this chunk, so skip it and try with the next one
 485             read(r, len(chunk))
 486             continue
 487 
 488         # skip right past the * just found, then check if a / follows it
 489         read(r, i + 1)
 490         if peek_byte(r) == 47: # ord('/')
 491             # got */, the end of this comment
 492             skip_byte(r)
 493             return
 494 
 495 
 496 def find_digits_end_index(chunk: bytes) -> int:
 497     i = 0
 498     for b in chunk:
 499         if 48 <= b <= 57:
 500             i += 1
 501         else:
 502             return i
 503 
 504     # all bytes (if any) were digits, so no end was found
 505     return -1
 506 
 507 
 508 def demand(r, what: bytes) -> None:
 509     lead = read(r, len(what))
 510     if not lead.startswith(what):
 511         lead = str(lead, encoding='utf-8')
 512         what = str(what, encoding='utf-8')
 513         raise ValueError(f'expected {what}, but got {lead} instead')
 514 
 515 
 516 def demand_comment(r) -> None:
 517     demand(r, b'/')
 518     b = peek_byte(r)
 519     if b < 0:
 520         raise ValueError('unexpected end of input data')
 521 
 522     if b == 47: # ord('/')
 523         # handle single-line comment
 524         skip_line(r)
 525         return
 526 
 527     if b == 42: # ord('*')
 528         # handle (potentially) multi-line comment
 529         skip_general_comment(r)
 530         return
 531 
 532     raise ValueError('expected * or another /, after a /')
 533 
 534 
 535 def json0(w, src, end) -> None:
 536     r = BufferedReader(src)
 537 
 538     # skip leading UTF-8 BOM (byte-order mark)
 539     if r.peek(3) == b'\xef\xbb\xbf':
 540         read(r, 3)
 541 
 542     # skip leading whitespace/comments
 543     seek_next_token(r)
 544 
 545     # emit a single output line, ending with a line-feed
 546     b = peek_byte(r)
 547     if b >= 0:
 548         handlers[b](w, r)
 549     else:
 550         # treat empty(ish) input as invalid JSON
 551         raise ValueError('can\'t turn empty(ish) input into JSON')
 552     end(w)
 553 
 554     # check against trailing non-whitespace/non-comment bytes
 555     seek_next_token(r)
 556     if len(r.peek(1)) > 0:
 557         raise ValueError('unexpected trailing bytes in JSON data')
 558 
 559 
 560 def seems_url(s: str) -> bool:
 561     protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
 562     return any(s.startswith(p) for p in protocols)
 563 
 564 
 565 def handle_json(w, r) -> None:
 566     json0(w, r, lambda w: w.write(b'\n'))
 567 
 568 
 569 def handle_json_lines(w, r) -> None:
 570     global pos, linenum
 571 
 572     items = 0
 573     linenum = 0
 574     w.write(b'[')
 575 
 576     while True:
 577         line = r.readline().lstrip()
 578         if not line:
 579             break
 580 
 581         pos = 1
 582         linenum += 1
 583 
 584         stripped = line.strip()
 585         if not stripped or stripped.startswith(b'//'):
 586             continue
 587 
 588         items += 1
 589         if items > 1:
 590             w.write(b',')
 591 
 592         json0(w, BytesIO(line), lambda w: None)
 593 
 594     w.write(b']\n')
 595 
 596 
# start_args is the index of the first cmd-line argument which isn't an
# option, while handle_input is the func chosen to convert the input
start_args = 1
handle_input = handle_json
# a leading JSON-Lines option changes how the input is handled
if len(argv) > 1 and argv[1] in ('-jl', '--jl', '-jsonl', '--jsonl'):
    start_args = 2
    handle_input = handle_json_lines

# at most 1 input name is allowed after the options
if len(argv) - 1 > start_args:
    print(f'\x1b[31mmultiple inputs not allowed\x1b[0m', file=stderr)
    exit(1)

# output is written as bytes; a missing input name, or a dash, means the
# input comes from the standard input
w = stdout.buffer
name = argv[start_args] if len(argv) > start_args else '-'

# values keeping track of the input-position, shown in case of errors
pos = 1
linenum = 1

try:
    if name == '-':
        handle_input(w, stdin.buffer)
    elif seems_url(name):
        # only import the URL-handling func when it's actually needed
        from urllib.request import urlopen
        with urlopen(name) as inp:
            handle_input(w, inp)
    else:
        with open(name, mode='rb') as inp:
            handle_input(w, inp)
except BrokenPipeError:
    # quit quietly, instead of showing a confusing error message
    stderr.close()
except KeyboardInterrupt:
    exit(2)
except Exception as e:
    # end the output line, which may have been partially written, before
    # showing the error message along with the input position
    stdout.write('\n')
    stdout.flush()
    print(f'\x1b[31mline {linenum}, pos {pos} : {e}\x1b[0m', file=stderr)
    exit(1)