File: j0.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2024 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 from io import BufferedReader, BytesIO
  27 from sys import argv, exit, stderr, stdin, stdout
  28 
  29 
  30 info = '''
  31 j0 [filepath/URI...]
  32 
  33 Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.
  34 
  35 Besides minimizing bytes, this tool also adapts almost-JSON input into valid
  36 JSON, since it ignores comments and trailing commas, neither of which are
  37 supported in JSON, but which are still commonly used.
  38 
  39 It also turns single-quoted strings into proper double-quoted ones, as well
  40 as change invalid 2-digit `\\x` hexadecimal escapes into JSON's 4-digit `\\u`
  41 hexadecimal escapes. When backslashes in strings are followed by an invalid
  42 escape letter, the backslash is ignored.
  43 
  44 Output is always a single line of valid JSON, ending with a line-feed.
  45 '''
  46 
  47 # handle standard help cmd-line options, quitting right away in that case
  48 if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
  49     print(info.strip(), file=stderr)
  50     exit(0)
  51 
  52 
  53 # note: using regexes doesn't seem to speed-up number/string-handling
  54 
  55 
  56 def read(r, size: int) -> bytes:
  57     global pos, linenum
  58 
  59     chunk = r.read(size)
  60     if not chunk:
  61         return chunk
  62 
  63     if not (10 in chunk):
  64         pos += len(chunk)
  65         return chunk
  66 
  67     for b in chunk:
  68         if b == 10:
  69             pos = 1
  70             linenum += 1
  71         else:
  72             pos += 1
  73 
  74     return chunk
  75 
  76 
  77 def skip_byte(r) -> None:
  78     global pos, linenum
  79 
  80     chunk = r.read(1)
  81     if not chunk:
  82         return
  83 
  84     if chunk[0] == 10:
  85         pos = 1
  86         linenum += 1
  87     else:
  88         pos += 1
  89 
  90 
  91 def peek_byte(r) -> int:
  92     chunk = r.peek(64)
  93     if len(chunk) > 0:
  94         return chunk[0]
  95     return -1
  96 
  97 
  98 def handle_array(w, r) -> None:
  99     seek_next = seek_next_token
 100 
 101     n = 0
 102     skip_byte(r)
 103     w.write(b'[')
 104 
 105     while True:
 106         # whitespace/comments may precede the next item/comma
 107         seek_next(r)
 108         b = peek_byte(r)
 109         if b < 0:
 110             raise ValueError('unexpected end of input data, before "]"')
 111 
 112         comma = b == 44 # ord(',')
 113 
 114         if comma:
 115             skip_byte(r)
 116             # whitespace/comments may follow the comma
 117             seek_next(r)
 118             b = peek_byte(r)
 119             if b < 0:
 120                 raise ValueError('unexpected end of input data, before "]"')
 121 
 122         if b == 93: # ord(']')
 123             skip_byte(r)
 124             w.write(b']')
 125             return
 126 
 127         if n > 0:
 128             if not comma:
 129                 raise ValueError('missing a comma between array values')
 130             w.write(b',')
 131 
 132         b = peek_byte(r)
 133         if b > 0:
 134             handlers[b](w, r)
 135             n += 1
 136 
 137 
 138 def handle_double_quoted_string(w, r) -> None:
 139     skip_byte(r)
 140     w.write(b'"')
 141     handle_inner_string(w, r, 34) # ord('"')
 142     w.write(b'"')
 143 
 144 
 145 def handle_dot(w, r) -> None:
 146     skip_byte(r)
 147     # precede the leading decimal dot with a 0
 148     w.write(b'0.')
 149 
 150     # handle decimals, which in this case aren't optional, as a leading
 151     # dot is what led to this point
 152     if copy_digits(w, r) < 1:
 153         raise ValueError('expected numeric digits, but found none')
 154 
 155 
 156 def handle_false(w, r) -> None:
 157     demand(r, b'false')
 158     w.write(b'false')
 159 
 160 
 161 def handle_invalid(w, r) -> None:
 162     b = peek_byte(r)
 163     if b < 0:
 164         raise ValueError('unexpected end of input data')
 165     # raise ValueError(f'unexpected JSON byte-value {b}')
 166     if 32 < b <= 126:
 167         msg = f'unexpected symbol {chr(b)}'
 168     else:
 169         msg = f'unexpected byte-value {b}'
 170     raise ValueError(msg)
 171 
 172 
 173 def handle_negative(w, r) -> None:
 174     skip_byte(r)
 175     w.write(b'-')
 176 
 177     if peek_byte(r) == 46: # ord('.')
 178         skip_byte(r)
 179         w.write(b'0.')
 180         if copy_digits(w, r) < 1:
 181             raise ValueError('expected numeric digits, but found none')
 182     else:
 183         handle_number(w, r)
 184 
 185 
 186 def handle_null(w, r) -> None:
 187     demand(r, b'null')
 188     w.write(b'null')
 189 
 190 
 191 def handle_number(w, r) -> None:
 192     # handle integer part
 193     if copy_digits(w, r) < 1:
 194         raise ValueError('expected numeric digits, but found none')
 195 
 196     # handle optional decimals
 197     b = peek_byte(r)
 198     if b == 46: # ord('.')
 199         skip_byte(r)
 200         w.write(b'.')
 201         if copy_digits(w, r) < 1:
 202             # follow a trailing decimal dot with a 0
 203             w.write(b'0')
 204 
 205 
 206 def handle_object(w, r) -> None:
 207     seek_next = seek_next_token
 208 
 209     num_pairs = 0
 210     skip_byte(r)
 211     w.write(b'{')
 212 
 213     while True:
 214         # whitespace/comments may precede the next item/comma
 215         seek_next(r)
 216         b = peek_byte(r)
 217         if b < 0:
 218             raise ValueError('unexpected end of input data, before "}"')
 219 
 220         comma = b == 44 # ord(',')
 221 
 222         if comma:
 223             skip_byte(r)
 224             # whitespace/comments may follow the comma
 225             seek_next(r)
 226             b = peek_byte(r)
 227             if b < 0:
 228                 raise ValueError('unexpected end of input data, before "}"')
 229 
 230         if b == 125: # ord('}')
 231             skip_byte(r)
 232             w.write(b'}')
 233             return
 234 
 235         if num_pairs > 0:
 236             if not comma:
 237                 raise ValueError('missing a comma between key-value pairs')
 238             w.write(b',')
 239 
 240         demand_string(w, r)
 241         # whitespace/comments may follow the key
 242         seek_next(r)
 243         demand(r, b':')
 244         w.write(b':')
 245         # whitespace/comments may follow the colon
 246         seek_next(r)
 247         b = peek_byte(r)
 248         if b > 0:
 249             handlers[b](w, r)
 250             num_pairs += 1
 251 
 252 
 253 def handle_positive(w, r) -> None:
 254     # do nothing with the leading plus sign: strictly-speaking, JSON numbers
 255     # can't start with a positive sign, and this tool's output is supposed
 256     # to be `JSON-0` (minimized) anyway
 257     skip_byte(r)
 258 
 259     if peek_byte(r) == 46: # ord('.')
 260         skip_byte(r)
 261         w.write(b'0.')
 262         if copy_digits(w, r) < 1:
 263             raise ValueError('expected numeric digits, but found none')
 264     else:
 265         handle_number(w, r)
 266 
 267 
 268 def handle_single_quoted_string(w, r) -> None:
 269     skip_byte(r)
 270     w.write(b'"')
 271     handle_inner_string(w, r, 39) # ord('\'')
 272     w.write(b'"')
 273 
 274 
 275 def demand_string(w, r) -> None:
 276     quote = peek_byte(r)
 277     if quote < 0:
 278         msg = 'unexpected end of input, instead of a string quote'
 279         raise ValueError(msg)
 280 
 281     if quote == 34: # ord('"')
 282         handle_double_quoted_string(w, r)
 283         return
 284 
 285     if quote == 39: # ord('\'')
 286         handle_single_quoted_string(w, r)
 287         return
 288 
 289     if 32 < quote <= 126: # ord(' '), ord('~')
 290         msg = f'expected ", or even \', but got {chr(quote)} instead'
 291     else:
 292         msg = f'expected ", or even \', but got byte {quote} instead'
 293     raise ValueError(msg)
 294 
 295 
 296 def handle_inner_string(w, r, quote: int) -> None:
 297     esc = False
 298     bad_hex_msg = 'invalid hexadecimal symbols'
 299     early_end_msg = 'input data ended while still in quoted string'
 300 
 301     def is_hex(x: int) -> bool:
 302         # 48 is ord('0'), 57 is ord('9'), 97 is ord('a'), 102 is ord('f')
 303         return 48 <= x <= 57 or 97 <= x <= 102
 304 
 305     def lower(x: int) -> bool:
 306         # 65 is ord('A'), 90 is ord('Z')
 307         return x + 32 if 65 <= x <= 90 else x
 308 
 309     while True:
 310         chunk = r.peek(1)
 311         if len(chunk) < 1:
 312             raise ValueError(early_end_msg)
 313         b = chunk[0]
 314 
 315         if esc:
 316             esc = False
 317 
 318             if b == 120: # ord('x')
 319                 skip_byte(r)
 320                 chunk = read(r, 2)
 321                 if len(chunk) != 2:
 322                     raise ValueError(early_end_msg)
 323                 a = lower(chunk[0])
 324                 b = lower(chunk[1])
 325                 w.write(b'\\u00')
 326                 if not (is_hex(a) and is_hex(b)):
 327                     raise ValueError(bad_hex_msg)
 328                 w.write(a)
 329                 w.write(b)
 330                 continue
 331 
 332             if b == 117: # ord('u')
 333                 skip_byte(r)
 334                 chunk = read(r, 4)
 335                 if len(chunk) != 4:
 336                     raise ValueError(early_end_msg)
 337                 a = lower(chunk[0])
 338                 b = lower(chunk[1])
 339                 c = lower(chunk[2])
 340                 d = lower(chunk[3])
 341                 if not (is_hex(a) and is_hex(b) and is_hex(c) and is_hex(d)):
 342                     raise ValueError(bad_hex_msg)
 343                 w.write(chunk)
 344                 continue
 345 
 346             # these numbers stand for 't', 'n', 'r', 'v', 'u', '"', and '\\'
 347             if b in (116, 110, 114, 118, 117, 34, 92):
 348                 w.write(b'\\')
 349 
 350             w.write(read(r, 1))
 351             continue
 352 
 353         if b == 92: # ord('\\')
 354             esc = True
 355             skip_byte(r)
 356             continue
 357 
 358         if b == quote:
 359             skip_byte(r)
 360             return
 361 
 362         # emit normal string-byte
 363         w.write(read(r, 1))
 364 
 365 
 366 def handle_true(w, r) -> None:
 367     demand(r, b'true')
 368     w.write(b'true')
 369 
 370 
 371 # setup byte-handling lookup tuple
 372 byte2handler = [handle_invalid for i in range(256)]
 373 byte2handler[ord('0')] = handle_number
 374 byte2handler[ord('1')] = handle_number
 375 byte2handler[ord('2')] = handle_number
 376 byte2handler[ord('3')] = handle_number
 377 byte2handler[ord('4')] = handle_number
 378 byte2handler[ord('5')] = handle_number
 379 byte2handler[ord('6')] = handle_number
 380 byte2handler[ord('7')] = handle_number
 381 byte2handler[ord('8')] = handle_number
 382 byte2handler[ord('9')] = handle_number
 383 byte2handler[ord('+')] = handle_positive
 384 byte2handler[ord('-')] = handle_negative
 385 byte2handler[ord('.')] = handle_dot
 386 byte2handler[ord('"')] = handle_double_quoted_string
 387 byte2handler[ord('\'')] = handle_single_quoted_string
 388 byte2handler[ord('f')] = handle_false
 389 byte2handler[ord('n')] = handle_null
 390 byte2handler[ord('t')] = handle_true
 391 byte2handler[ord('[')] = handle_array
 392 byte2handler[ord('{')] = handle_object
 393 
 394 # handlers is the immutable byte-driven func-dispatch table
 395 handlers = tuple(byte2handler)
 396 
 397 
 398 def copy_digits(w, r) -> int:
 399     'Returns how many digits were copied/handled.'
 400 
 401     copied = 0
 402     while True:
 403         chunk = r.peek(64)
 404         if len(chunk) == 0:
 405             return copied
 406 
 407         i = find_digits_end_index(chunk)
 408         if i >= 0:
 409             w.write(read(r, i))
 410             copied += i
 411             return copied
 412         else:
 413             w.write(chunk)
 414             read(r, len(chunk))
 415             copied += len(chunk)
 416 
 417 
 418 def seek_next_token(r) -> None:
 419     'Skip an arbitrarily-long mix of whitespace and comments.'
 420 
 421     while True:
 422         chunk = r.peek(1024)
 423         if len(chunk) == 0:
 424             # input is over, and this func doesn't consider that an error
 425             return
 426 
 427         comment = False
 428 
 429         for i, b in enumerate(chunk):
 430             # skip space, tab, line-feed, carriage-return, or form-feed
 431             if b in (9, 10, 11, 13, 32):
 432                 continue
 433 
 434             if b == 47: # ord('/')
 435                 read(r, i)
 436                 demand_comment(r)
 437                 comment = True
 438                 break
 439 
 440             # found start of next token
 441             read(r, i)
 442             return
 443 
 444         if not comment:
 445             read(r, len(chunk))
 446 
 447 
 448 def skip_line(r) -> None:
 449     while True:
 450         chunk = r.peek(1024)
 451         if len(chunk) == 0:
 452             return
 453 
 454         i = chunk.find(b'\n')
 455         if i >= 0:
 456             read(r, i + 1)
 457             return
 458 
 459         read(r, len(chunk))
 460 
 461 
 462 def skip_general_comment(r) -> None:
 463     while True:
 464         chunk = r.peek(1024)
 465         if len(chunk) == 0:
 466             raise ValueError(f'input data ended before an expected */')
 467 
 468         i = chunk.find(b'*')
 469         if i < 0:
 470             # no */ in this chunk, so skip it and try with the next one
 471             read(r, len(chunk))
 472             continue
 473 
 474         # skip right past the * just found, then check if a / follows it
 475         read(r, i + 1)
 476         if peek_byte(r) == 47: # ord('/')
 477             # got */, the end of this comment
 478             skip_byte(r)
 479             return
 480 
 481 
 482 def find_digits_end_index(chunk: bytes) -> int:
 483     i = 0
 484     for b in chunk:
 485         if 48 <= b <= 57:
 486             i += 1
 487         else:
 488             return i
 489 
 490     # all bytes (if any) were digits, so no end was found
 491     return -1
 492 
 493 
 494 def demand(r, what: bytes) -> None:
 495     lead = read(r, len(what))
 496     if not lead.startswith(what):
 497         lead = str(lead, encoding='utf-8')
 498         what = str(what, encoding='utf-8')
 499         raise ValueError(f'expected {what}, but got {lead} instead')
 500 
 501 
 502 def demand_comment(r) -> None:
 503     demand(r, b'/')
 504     b = peek_byte(r)
 505     if b < 0:
 506         raise ValueError('unexpected end of input data')
 507 
 508     if b == 47: # ord('/')
 509         # handle single-line comment
 510         skip_line(r)
 511         return
 512 
 513     if b == 42: # ord('*')
 514         # handle (potentially) multi-line comment
 515         skip_general_comment(r)
 516         return
 517 
 518     raise ValueError('expected * or another /, after a /')
 519 
 520 
 521 def json0(w, src, end) -> None:
 522     r = BufferedReader(src)
 523 
 524     # skip leading UTF-8 BOM (byte-order mark)
 525     if r.peek(3) == b'\xef\xbb\xbf':
 526         read(r, 3)
 527 
 528     # skip leading whitespace/comments
 529     seek_next_token(r)
 530 
 531     # emit a single output line, ending with a line-feed
 532     b = peek_byte(r)
 533     if b >= 0:
 534         handlers[b](w, r)
 535     else:
 536         # treat empty(ish) input as invalid JSON
 537         raise ValueError('can\'t turn empty(ish) input into JSON')
 538     end(w)
 539 
 540     # check against trailing non-whitespace/non-comment bytes
 541     seek_next_token(r)
 542     if len(r.peek(1)) > 0:
 543         raise ValueError('unexpected trailing bytes in JSON data')
 544 
 545 
 546 def seems_url(s: str) -> bool:
 547     protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
 548     return any(s.startswith(p) for p in protocols)
 549 
 550 
 551 def handle_json(w, r) -> None:
 552     json0(w, r, lambda w: w.write(b'\n'))
 553 
 554 
 555 def handle_json_lines(w, r) -> None:
 556     global pos, linenum
 557 
 558     items = 0
 559     linenum = 0
 560     w.write(b'[')
 561 
 562     while True:
 563         line = r.readline().lstrip()
 564         if not line:
 565             break
 566 
 567         pos = 1
 568         linenum += 1
 569 
 570         stripped = line.strip()
 571         if not stripped or stripped.startswith(b'//'):
 572             continue
 573 
 574         items += 1
 575         if items > 1:
 576             w.write(b',')
 577 
 578         json0(w, BytesIO(line), lambda w: None)
 579 
 580     w.write(b']\n')
 581 
 582 
 583 start_args = 1
 584 handle_input = handle_json
 585 if len(argv) > 1 and argv[1] in ('-jl', '--jl', '-jsonl', '--jsonl'):
 586     start_args = 2
 587     handle_input = handle_json_lines
 588 
 589 if len(argv) - 1 > start_args:
 590     print(f'\x1b[31mmultiple inputs not allowed\x1b[0m', file=stderr)
 591     exit(1)
 592 
 593 w = stdout.buffer
 594 name = argv[start_args] if len(argv) > start_args else '-'
 595 
 596 # values keeping track of the input-position, shown in case of errors
 597 pos = 1
 598 linenum = 1
 599 
 600 try:
 601     if name == '-':
 602         handle_input(w, stdin.buffer)
 603     elif seems_url(name):
 604         from urllib.request import urlopen
 605         with urlopen(name) as inp:
 606             handle_input(w, inp)
 607     else:
 608         with open(name, mode='rb') as inp:
 609             handle_input(w, inp)
 610 except BrokenPipeError:
 611     # quit quietly, instead of showing a confusing error message
 612     stderr.close()
 613 except KeyboardInterrupt:
 614     exit(2)
 615 except Exception as e:
 616     stdout.flush()
 617     print(f'\x1b[31mline {linenum}, pos {pos} : {e}\x1b[0m', file=stderr)
 618     exit(1)