File: j0.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2024 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 # j0 [filepath/URI...]
  27 #
  28 # Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.
  29 #
  30 # Besides minimizing bytes, this tool also adapts almost-JSON input into
  31 # valid JSON, since it ignores comments and trailing commas, neither of
  32 # which are supported in JSON, but which are still commonly used.
  33 #
  34 # Output is always a single line, which ends with a line-feed.
  35 
  36 
  37 from io import BufferedReader
  38 from sys import argv, exit, stderr, stdin, stdout
  39 from urllib.request import urlopen
  40 
  41 
  42 # info is the help message shown when asked to
  43 info = '''
  44 j0 [filepath/URI...]
  45 
  46 Json-0 converts/fixes JSON/pseudo-JSON input into minimal JSON output.
  47 
  48 Besides minimizing bytes, this tool also adapts almost-JSON input into
  49 valid JSON, since it ignores comments and trailing commas, neither of
  50 which are supported in JSON, but which are still commonly used.
  51 
  52 Output is always a single line, which ends with a line-feed.
  53 '''.strip()
  54 
  55 # handle standard help cmd-line options, quitting right away in that case
  56 if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
  57     print(info, file=stderr)
  58     exit(0)
  59 
  60 
  61 # using regexes doesn't seem to speed-up number/string-handling
  62 # not_digit_re = compile_re(b'[^0-9]')
  63 # string_double_quote_re = compile_re(b'^"|[^\\\\]"')
  64 # string_single_quote_re = compile_re(b'^\'|[^\\\\]\'')
  65 
  66 
  67 def peek_byte(r) -> int:
  68     chunk = r.peek(64)
  69     if len(chunk) > 0:
  70         return chunk[0]
  71     return -1
  72 
  73 
  74 def handle_array(w, r) -> None:
  75     # looking up global vars is slower in older versions of python
  76     seek_next = seek_next_token
  77     handle_next = handlers
  78     peek_byte_l = peek_byte
  79 
  80     n = 0
  81     r.read(1)
  82     w.write(b'[')
  83 
  84     while True:
  85         # whitespace/comments may precede the next item/comma
  86         seek_next(r)
  87         b = peek_byte_l(r)
  88         if b < 0:
  89             raise ValueError('unexpected end of input data, before "]"')
  90 
  91         # 44 is ord(',')
  92         comma = b == 44
  93 
  94         if comma:
  95             r.read(1)
  96             # whitespace/comments may follow the comma
  97             seek_next(r)
  98             b = peek_byte_l(r)
  99             if b < 0:
 100                 raise ValueError('unexpected end of input data, before "]"')
 101 
 102         # 93 is ord(']')
 103         if b == 93:
 104             r.read(1)
 105             w.write(b']')
 106             return
 107 
 108         if n > 0:
 109             if not comma:
 110                 raise ValueError('missing a comma between array values')
 111             w.write(b',')
 112 
 113         b = peek_byte_l(r)
 114         if b > 0:
 115             handle_next[b](w, r)
 116             n += 1
 117 
 118 
 119 def handle_double_quoted_string(w, r) -> None:
 120     r.read(1)
 121     w.write(b'"')
 122     handle_inner_string(w, r, b'"')
 123     w.write(b'"')
 124 
 125 
 126 def handle_dot(w, r) -> None:
 127     r.read(1)
 128     # precede the leading decimal dot with a 0
 129     w.write(b'0.')
 130 
 131     # handle decimals, which in this case aren't optional, as a leading
 132     # dot is what led to this point
 133     if copy_digits(w, r) < 1:
 134         raise ValueError('expected numeric digits, but found none')
 135 
 136 
 137 def handle_false(w, r) -> None:
 138     demand(r, b'false')
 139     w.write(b'false')
 140 
 141 
 142 def handle_invalid(w, r) -> None:
 143     b = peek_byte(r)
 144     if b < 0:
 145         raise ValueError('unexpected end of input data')
 146     # raise ValueError(f'unexpected JSON byte-value {b}')
 147     if 32 < b <= 126:
 148         msg = f'unexpected symbol {chr(b)}'
 149     else:
 150         msg = f'unexpected byte-value {b}'
 151     raise ValueError(msg)
 152 
 153 
 154 def handle_negative(w, r) -> None:
 155     r.read(1)
 156     w.write(b'-')
 157 
 158     # 46 is ord('.')
 159     if peek_byte(r) == 46:
 160         r.read(1)
 161         w.write(b'0.')
 162         if copy_digits(w, r) < 1:
 163             raise ValueError('expected numeric digits, but found none')
 164     else:
 165         handle_number(w, r)
 166 
 167 
 168 def handle_null(w, r) -> None:
 169     demand(r, b'null')
 170     w.write(b'null')
 171 
 172 
 173 def handle_number(w, r) -> None:
 174     # handle integer part
 175     if copy_digits(w, r) < 1:
 176         raise ValueError('expected numeric digits, but found none')
 177 
 178     # handle optional decimals
 179     b = peek_byte(r)
 180     # 46 is ord('.')
 181     if b == 46:
 182         r.read(1)
 183         w.write(b'.')
 184         if copy_digits(w, r) < 1:
 185             # follow a trailing decimal dot with a 0
 186             w.write(b'0')
 187 
 188 
 189 def handle_object(w, r) -> None:
 190     # looking up global vars is slower in older versions of python
 191     seek_next = seek_next_token
 192     demand_l = demand
 193     handle_key = demand_string
 194     handle_next = handlers
 195     peek_byte_l = peek_byte
 196 
 197     num_pairs = 0
 198     r.read(1)
 199     w.write(b'{')
 200 
 201     while True:
 202         # whitespace/comments may precede the next item/comma
 203         seek_next(r)
 204         b = peek_byte_l(r)
 205         if b < 0:
 206             raise ValueError('unexpected end of input data, before "}"')
 207 
 208         # 44 is ord(',')
 209         comma = b == 44
 210 
 211         if comma:
 212             r.read(1)
 213             # whitespace/comments may follow the comma
 214             seek_next(r)
 215             b = peek_byte_l(r)
 216             if b < 0:
 217                 raise ValueError('unexpected end of input data, before "}"')
 218 
 219         # 125 is ord('}')
 220         if b == 125:
 221             r.read(1)
 222             w.write(b'}')
 223             return
 224 
 225         if num_pairs > 0:
 226             if not comma:
 227                 raise ValueError('missing a comma between key-value pairs')
 228             w.write(b',')
 229 
 230         handle_key(w, r)
 231         # whitespace/comments may follow the key
 232         seek_next(r)
 233         demand_l(r, b':')
 234         w.write(b':')
 235         # whitespace/comments may follow the colon
 236         seek_next(r)
 237         b = peek_byte_l(r)
 238         if b > 0:
 239             handle_next[b](w, r)
 240             num_pairs += 1
 241 
 242 
 243 def handle_positive(w, r) -> None:
 244     # do nothing with the leading plus sign: strictly-speaking, JSON numbers
 245     # can't start with a positive sign, and this tool's output is supposed
 246     # to be `JSON-0` (minimized) anyway
 247     r.read(1)
 248 
 249     # 46 is ord('.')
 250     if peek_byte(r) == 46:
 251         r.read(1)
 252         w.write(b'0.')
 253         if copy_digits(w, r) < 1:
 254             raise ValueError('expected numeric digits, but found none')
 255     else:
 256         handle_number(w, r)
 257 
 258 
 259 def handle_single_quoted_string(w, r) -> None:
 260     r.read(1)
 261     w.write(b'"')
 262     handle_inner_string(w, r, b'\'')
 263     w.write(b'"')
 264 
 265 
 266 def demand_string(w, r) -> None:
 267     '''Handle keys for func handle_object.'''
 268 
 269     quote = peek_byte(r)
 270     if quote < 0:
 271         msg = 'unexpected end of input, instead of a string quote'
 272         raise ValueError(msg)
 273 
 274     # 34 is ord('"')
 275     if quote == 34:
 276         handle_double_quoted_string(w, r)
 277         return
 278 
 279     # 39 is ord('\'')
 280     if quote == 39:
 281         handle_single_quoted_string(w, r)
 282         return
 283 
 284     # 32 is ord(' '), 126 is ord('~')
 285     if 32 < quote <= 126:
 286         msg = f'expected ", or even \', but got {chr(quote)} instead'
 287     else:
 288         msg = f'expected ", or even \', but got byte {quote} instead'
 289     raise ValueError(msg)
 290 
 291 
 292 def handle_inner_string_slow(w, r, quote: bytes) -> None:
 293     '''Experimental func with the aim to validate inner-string bytes.'''
 294 
 295     esc = False
 296     q = quote[0]
 297 
 298     while True:
 299         chunk = r.peek(1)
 300         if len(chunk) < 1:
 301             raise ValueError('input data ended while still in quoted string')
 302         b = chunk[0]
 303 
 304         if esc:
 305             esc = False
 306             w.write(r.read(1))
 307             continue
 308         # 92 is ord('\\')
 309         if b == 92:
 310             esc = True
 311             r.read(1)
 312             continue
 313         if b == q:
 314             r.read(1)
 315             w.write(b'"')
 316             return
 317         w.write(r.read(1))
 318 
 319 
 320 def handle_inner_string(w, r, quote: bytes) -> None:
 321     while True:
 322         chunk = r.peek(1024)
 323         if len(chunk) == 0:
 324             raise ValueError('input data ended while still in quoted string')
 325 
 326         i = find_string_end_index(chunk, quote)
 327         if i >= 0:
 328             w.write(r.read(i))
 329             # read/discard closing quote separately; the quote may not
 330             # always be the strictly-JSON `"`, so it's never emitted
 331             # together with the inner-string part
 332             r.read(1)
 333             return
 334         else:
 335             w.write(chunk)
 336             r.read(len(chunk))
 337 
 338 
 339 def handle_true(w, r) -> None:
 340     demand(r, b'true')
 341     w.write(b'true')
 342 
 343 
 344 # setup byte-handling lookup tuple
 345 byte2handler = [handle_invalid for i in range(256)]
 346 byte2handler[ord('0')] = handle_number
 347 byte2handler[ord('1')] = handle_number
 348 byte2handler[ord('2')] = handle_number
 349 byte2handler[ord('3')] = handle_number
 350 byte2handler[ord('4')] = handle_number
 351 byte2handler[ord('5')] = handle_number
 352 byte2handler[ord('6')] = handle_number
 353 byte2handler[ord('7')] = handle_number
 354 byte2handler[ord('8')] = handle_number
 355 byte2handler[ord('9')] = handle_number
 356 byte2handler[ord('+')] = handle_positive
 357 byte2handler[ord('-')] = handle_negative
 358 byte2handler[ord('.')] = handle_dot
 359 byte2handler[ord('"')] = handle_double_quoted_string
 360 byte2handler[ord('\'')] = handle_single_quoted_string
 361 byte2handler[ord('f')] = handle_false
 362 byte2handler[ord('n')] = handle_null
 363 byte2handler[ord('t')] = handle_true
 364 byte2handler[ord('[')] = handle_array
 365 byte2handler[ord('{')] = handle_object
 366 
 367 # handlers is the immutable byte-driven func-dispatch table
 368 handlers = tuple(byte2handler)
 369 
 370 
 371 def copy_digits(w, r) -> int:
 372     '''
 373     Help the number-handling funcs do their job quickly: returns
 374     how many digits were copied/handled, so callers can check if
 375     any digits were found/copied.
 376     '''
 377 
 378     copied = 0
 379     while True:
 380         chunk = r.peek(64)
 381         if len(chunk) == 0:
 382             return copied
 383 
 384         i = find_digits_end_index(chunk)
 385         if i >= 0:
 386             w.write(r.read(i))
 387             copied += i
 388             return copied
 389         else:
 390             w.write(chunk)
 391             r.read(len(chunk))
 392             copied += len(chunk)
 393 
 394 
 395 def seek_next_token(r) -> None:
 396     '''Skip an arbitrarily-long mix of whitespace and comments.'''
 397 
 398     while True:
 399         chunk = r.peek(1024)
 400         if len(chunk) == 0:
 401             # input is over, and this func doesn't consider that an error
 402             return
 403 
 404         comment = False
 405 
 406         for i, b in enumerate(chunk):
 407             # skip space, tab, line-feed, carriage-return, or form-feed
 408             if b in (9, 10, 11, 13, 32):
 409                 continue
 410 
 411             # 47 is ord('/')
 412             if b == 47:
 413                 r.read(i)
 414                 demand_comment(r)
 415                 comment = True
 416                 break
 417 
 418             # found start of next token
 419             r.read(i)
 420             return
 421 
 422         if not comment:
 423             r.read(len(chunk))
 424 
 425 
 426 def skip_line(r) -> None:
 427     '''Help func demand_comment do its job.'''
 428 
 429     while True:
 430         chunk = r.peek(1024)
 431         if len(chunk) == 0:
 432             return
 433 
 434         i = chunk.find(b'\n')
 435         if i >= 0:
 436             r.read(i + 1)
 437             return
 438 
 439         r.read(len(chunk))
 440 
 441 
 442 def skip_general_comment(r) -> None:
 443     '''Help func demand_comment do its job.'''
 444 
 445     # looking up global vars is slower in older versions of python
 446     peek_byte_l = peek_byte
 447 
 448     while True:
 449         chunk = r.peek(1024)
 450         if len(chunk) == 0:
 451             raise ValueError(f'input data ended before an expected */')
 452 
 453         i = chunk.find(b'*')
 454         if i < 0:
 455             # no */ in this chunk, so skip it and try with the next one
 456             r.read(len(chunk))
 457             continue
 458 
 459         # skip right past the * just found, then check if a / follows it
 460         r.read(i + 1)
 461         # 47 is ord('/')
 462         if peek_byte_l(r) == 47:
 463             # got */, the end of this comment
 464             r.read(1)
 465             return
 466 
 467 
 468 def find_digits_end_index(chunk: bytes) -> int:
 469     '''Help the digit-handling funcs do their job quickly.'''
 470 
 471     i = 0
 472     for b in chunk:
 473         if 48 <= b <= 57:
 474             i += 1
 475         else:
 476             return i
 477 
 478     # all bytes (if any) were digits, so no end was found
 479     return -1
 480 
 481 
 482 def find_string_end_index(chunk: bytes, quote: bytes) -> int:
 483     '''Help func demand_string do its job quickly.'''
 484 
 485     # start remembers where to (re)start searching in case of fake matches
 486     start = 0
 487 
 488     while True:
 489         i = chunk.find(quote, start)
 490         if i <= 0:
 491             # either no end was found in this chunk, or it's right at the
 492             # start, at index 0: handling the latter avoids mistakenly
 493             # trying to check if a backslash is before it
 494             return i
 495 
 496         # 92 is ord('\\')
 497         if chunk[i - 1] != 92:
 498             # reject match if a there's a backslash before it
 499             return i
 500 
 501         # keep searching, starting right past the fake-match index
 502         start = i + 1
 503 
 504 
 505 def demand(r, what: bytes) -> None:
 506     lead = r.read(len(what))
 507     if not lead.startswith(what):
 508         lead = str(lead, encoding='utf-8')
 509         what = str(what, encoding='utf-8')
 510         raise ValueError(f'expected {what}, but got {lead} instead')
 511 
 512 
 513 def demand_comment(r) -> None:
 514     demand(r, b'/')
 515     b = peek_byte(r)
 516     if b < 0:
 517         raise ValueError('unexpected end of input data')
 518 
 519     # 47 is ord('/')
 520     if b == 47:
 521         # handle single-line comment
 522         skip_line(r)
 523         return
 524 
 525     # 42 is ord('*')
 526     if b == 42:
 527         # handle (potentially) multi-line comment
 528         skip_general_comment(r)
 529         return
 530 
 531     raise ValueError('expected * or another /, after a /')
 532 
 533 
 534 def json0(w, src) -> None:
 535     r = BufferedReader(src)
 536 
 537     # skip leading whitespace/comments
 538     seek_next_token(r)
 539 
 540     # emit a single output line, ending with a line-feed
 541     b = peek_byte(r)
 542     if b >= 0:
 543         handlers[b](w, r)
 544     else:
 545         # treat empty(ish) input as invalid JSON
 546         raise ValueError('can\'t turn empty(ish) input into JSON')
 547     w.write(b'\n')
 548 
 549     # check against trailing non-whitespace/non-comment bytes
 550     seek_next_token(r)
 551     if len(r.peek(1)) > 0:
 552         raise ValueError('unexpected trailing bytes in JSON data')
 553 
 554 
 555 def seems_url(s: str) -> bool:
 556     for prot in ('https://', 'http://', 'file://', 'ftp://', 'data:'):
 557         if s.startswith(prot):
 558             return True
 559     return False
 560 
 561 
 562 try:
 563     if len(argv) < 2:
 564         json0(stdout.buffer, stdin.buffer)
 565     elif len(argv) == 2:
 566         name = argv[1]
 567         if name == '-':
 568             json0(stdout.buffer, stdin.buffer)
 569         elif seems_url(name):
 570             with urlopen(name) as inp:
 571                 json0(stdout.buffer, inp)
 572         else:
 573             with open(name, 'rb') as inp:
 574                 json0(stdout.buffer, inp)
 575     else:
 576         raise ValueError('multiple inputs not allowed')
 577 except (BrokenPipeError, KeyboardInterrupt):
 578     # quit quietly, instead of showing a confusing error message
 579     stderr.flush()
 580     stderr.close()
 581 except Exception as e:
 582     print(f'\x1b[31m{e}\x1b[0m', file=stderr)
 583     exit(1)