File: detsv.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2024 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 from io import TextIOWrapper
  27 from json import dumps
  28 from math import isinf, isnan
  29 from sys import argv, exit, stderr, stdin, stdout
  30 from typing import Any
  31 
  32 
  33 info = '''
  34 detsv [options...] [filepath/URI...]
  35 
  36 
  37 DE-TSV turns a TSV-format (Tab-Separated Values) table into other common
  38 data formats, like JSON (JavaScript Object Notation), or `JSONS` (JSON
  39 Strings).
  40 
  41 The output is JSONS by default, since keeping the original input-data as
  42 strings can't accidentally mangle values.
  43 
  44 All (optional) leading options start with either single or double-dash,
  45 and most of them change the style/color used. Some of the options are,
  46 shown in their single-dash form:
  47 
  48     -h          show this help message
  49     -help       show this help message
  50 
  51     -j          convert into JSON, parsing numbers, and using null for empty
  52     -json       convert into JSON, parsing numbers, and using null for empty
  53 
  54     -jsons      convert into JSON Strings, where values are strings or null
  55     -json-s     convert into JSON Strings, where values are strings or null
  56 '''
  57 
  58 # handle standard help cmd-line options, quitting right away in that case
  59 if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
  60     print(info.strip(), file=stderr)
  61     exit(0)
  62 
  63 
  64 def narrow_value(s: str) -> Any:
  65     # empty strings become null values
  66     if s == '':
  67         return None
  68 
  69     # try parsing valid floating-point values
  70     try:
  71         f = float(s)
  72         if not (isnan(f) or isinf(f)):
  73             return f
  74     except Exception:
  75         pass
  76 
  77     # can't narrow/parse the string, so keep it as is
  78     return s
  79 
  80 
  81 def to_json(w, src, narrow = narrow_value) -> None:
  82     try:
  83         line = next(src)
  84         line = line.rstrip('\r\n').rstrip('\n').split('\t')
  85         header = line
  86     except Exception:
  87         # not even a header line
  88         w.write('[]\n')
  89         return
  90 
  91     # prepare row-holder dictionary
  92     kv = {}
  93     for s in header:
  94         kv[s] = None
  95 
  96     n = 0
  97     for line in src:
  98         row = line.rstrip('\r\n').rstrip('\n').split('\t')
  99         w.write(',\n  ' if n > 0 else '[\n  ')
 100 
 101         # don't forget to reset values from previous row
 102         for s in header:
 103             kv[s] = None
 104         # update values up to what the current row gives
 105         for j, s in enumerate(row):
 106             kv[header[j]] = narrow(s)
 107 
 108         w.write(dumps(kv))
 109         n += 1
 110 
 111     # don't forget to close top-level array
 112     w.write('[]\n' if n == 0 else '\n]\n')
 113 
 114 
 115 def to_jsons(w, src) -> None:
 116     return to_json(w, src, lambda s: s)
 117 
 118 
 119 def to_tsv(w, src) -> None:
 120     for line in src:
 121         row = line.rstrip('\r\n').rstrip('\n').split('\t')
 122         for i, s in enumerate(row):
 123             if i > 0:
 124                 w.write('\t')
 125             w.write(s)
 126         w.write('\n')
 127 
 128 
 129 def seems_url(s: str) -> bool:
 130     protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
 131     return any(s.startswith(p) for p in protocols)
 132 
 133 
 134 def handle_input(w, r, outfmt: str) -> None:
 135     fmt2handler = {
 136         'json': to_json,
 137         'jsons': to_jsons,
 138     }
 139 
 140     if outfmt in fmt2handler:
 141         fmt2handler[outfmt](w, r)
 142     else:
 143         raise ValueError(f'unsupported output format {outfmt}')
 144 
 145 
 146 # fmt_names normalizes output-format names, and even has `self-aliases`,
 147 # so a membership check also checks if a name is supported
 148 fmt_names = {
 149     'j': 'json',
 150     'json': 'json',
 151 
 152     'jsons': 'jsons',
 153     'jsonstr': 'jsons',
 154     'jsonstrings': 'jsons',
 155     'json-s': 'jsons',
 156     'json-str': 'jsons',
 157     'json-strings': 'jsons',
 158 }
 159 
 160 args = argv[1:]
 161 # default output format is JSONS, since it can't accidentally mangle data
 162 out_format = fmt_names['jsons']
 163 
 164 # handle leading output-format option, if present
 165 if len(args) > 0 and args[0].startswith('-'):
 166     s = args[0].lstrip('-').lower()
 167     if s in fmt_names:
 168         out_format = fmt_names[s]
 169         # skip leading arg, since it's clearly not a filepath
 170         args = args[1:]
 171 
 172 try:
 173     if len(args) == 0:
 174         path = '-'
 175     elif len(args) == 1:
 176         path = args[0]
 177     else:
 178         raise ValueError('multiple inputs not allowed')
 179 
 180     if path == '-':
 181         handle_input(stdout, stdin, out_format)
 182     elif seems_url(path):
 183         from urllib.request import urlopen
 184         with urlopen(path) as inp:
 185             # can't use byte-sources directly during string-processing
 186             with TextIOWrapper(inp, encoding='utf-8') as txt:
 187                 handle_input(stdout, txt.readlines(), out_format)
 188     else:
 189         with open(path, encoding='utf-8') as inp:
 190             handle_input(stdout, inp, out_format)
 191 except BrokenPipeError:
 192     # quit quietly, instead of showing a confusing error message
 193     stderr.close()
 194 except KeyboardInterrupt:
 195     exit(2)
 196 except Exception as e:
 197     print(f'\x1b[31m{e}\x1b[0m', file=stderr)
 198     exit(1)