File: detsv.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2024 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 from json import dumps
  27 from math import isinf, isnan
  28 from sys import argv, exit, stderr, stdin, stdout
  29 from typing import Any
  30 
  31 
  32 info = '''
  33 detsv [options...] [filepath/URI...]
  34 
  35 
  36 DE-TSV turns a TSV-format (Tab-Separated Values) table into other common
  37 data formats, like JSON (JavaScript Object Notation), or `JSONS` (JSON
  38 Strings).
  39 
  40 The output is JSONS by default, since keeping the original input-data as
  41 strings can't accidentally mangle values.
  42 
  43 All (optional) leading options start with either single or double-dash,
  44 and most of them change the style/color used. Some of the options are,
  45 shown in their single-dash form:
  46 
  47     -h          show this help message
  48     -help       show this help message
  49 
  50     -j          convert into JSON, parsing numbers, and using null for empty
  51     -json       convert into JSON, parsing numbers, and using null for empty
  52 
  53     -jsons      convert into JSON Strings, where values are strings or null
  54     -json-s     convert into JSON Strings, where values are strings or null
  55 '''
  56 
  57 # handle standard help cmd-line options, quitting right away in that case
  58 if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
  59     print(info.strip())
  60     exit(0)
  61 
  62 
  63 def narrow_value(s: str) -> Any:
  64     # empty strings become null values
  65     if s == '':
  66         return None
  67 
  68     # try parsing valid floating-point values
  69     try:
  70         f = float(s)
  71         if not (isnan(f) or isinf(f)):
  72             return f
  73     except Exception:
  74         pass
  75 
  76     # can't narrow/parse the string, so keep it as is
  77     return s
  78 
  79 
  80 def to_json(w, src, narrow = narrow_value) -> None:
  81     try:
  82         line = next(src)
  83         line = line.rstrip('\r\n').rstrip('\n').split('\t')
  84         header = line
  85     except Exception:
  86         # not even a header line
  87         w.write('[]\n')
  88         return
  89 
  90     # prepare row-holder dictionary
  91     kv = {}
  92     for s in header:
  93         kv[s] = None
  94 
  95     n = 0
  96     for line in src:
  97         row = line.rstrip('\r\n').rstrip('\n').split('\t')
  98         w.write(',\n  ' if n > 0 else '[\n  ')
  99 
 100         # don't forget to reset values from previous row
 101         for s in header:
 102             kv[s] = None
 103         # update values up to what the current row gives
 104         for j, s in enumerate(row):
 105             kv[header[j]] = narrow(s)
 106 
 107         w.write(dumps(kv))
 108         n += 1
 109 
 110     # don't forget to close top-level array
 111     w.write('[]\n' if n == 0 else '\n]\n')
 112 
 113 
 114 def to_jsons(w, src) -> None:
 115     return to_json(w, src, lambda s: s)
 116 
 117 
 118 def to_tsv(w, src) -> None:
 119     for line in src:
 120         row = line.rstrip('\r\n').rstrip('\n').split('\t')
 121         for i, s in enumerate(row):
 122             if i > 0:
 123                 w.write('\t')
 124             w.write(s)
 125         w.write('\n')
 126 
 127 
 128 def seems_url(s: str) -> bool:
 129     protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
 130     return any(s.startswith(p) for p in protocols)
 131 
 132 
 133 def handle_input(w, r, outfmt: str) -> None:
 134     fmt2handler = {
 135         'json': to_json,
 136         'jsons': to_jsons,
 137     }
 138 
 139     if outfmt in fmt2handler:
 140         fmt2handler[outfmt](w, r)
 141     else:
 142         raise ValueError(f'unsupported output format {outfmt}')
 143 
 144 
 145 # fmt_names normalizes output-format names, and even has `self-aliases`,
 146 # so a membership check also checks if a name is supported
 147 fmt_names = {
 148     'j': 'json',
 149     'json': 'json',
 150 
 151     'jsons': 'jsons',
 152     'jsonstr': 'jsons',
 153     'jsonstrings': 'jsons',
 154     'json-s': 'jsons',
 155     'json-str': 'jsons',
 156     'json-strings': 'jsons',
 157 }
 158 
 159 args = argv[1:]
 160 # default output format is JSONS, since it can't accidentally mangle data
 161 out_format = fmt_names['jsons']
 162 
 163 # handle leading output-format option, if present
 164 if len(args) > 0 and args[0].startswith('-'):
 165     s = args[0].lstrip('-').lower()
 166     if s in fmt_names:
 167         out_format = fmt_names[s]
 168         # skip leading arg, since it's clearly not a filepath
 169         args = args[1:]
 170 
 171 try:
 172     if len(args) == 0:
 173         path = '-'
 174     elif len(args) == 1:
 175         path = args[0]
 176     else:
 177         raise ValueError('multiple inputs not allowed')
 178 
 179     if path == '-':
 180         handle_input(stdout, stdin, out_format)
 181     elif seems_url(path):
 182         from io import TextIOWrapper
 183         from urllib.request import urlopen
 184         with urlopen(path) as inp:
 185             # can't use byte-sources directly during string-processing
 186             with TextIOWrapper(inp, encoding='utf-8') as txt:
 187                 handle_input(stdout, txt.readlines(), out_format)
 188     else:
 189         with open(path, encoding='utf-8') as inp:
 190             handle_input(stdout, inp, out_format)
 191 except BrokenPipeError:
 192     # quit quietly, instead of showing a confusing error message
 193     stderr.close()
 194 except KeyboardInterrupt:
 195     exit(2)
 196 except Exception as e:
 197     print(f'\x1b[31m{e}\x1b[0m', file=stderr)
 198     exit(1)