File: decsv.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2024 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 # decsv [options...] [filepath/URI...]
  27 #
  28 # DE-CSV turns a CSV-format (comma-separated values) table into other common
  29 # data formats, like TSV (tab-separated values) and JSON (javascript object
  30 # notation).
  31 #
# All (optional) leading options start with either single or double-dash,
# and most of them pick the output format, which is TSV by default. Some of
# the options are, shown in their single-dash form:
  35 #
  36 #     -h          show this help message
  37 #     -help       show this help message
  38 #
  39 #     -j          convert into JSON, parsing numbers, and using null for empty
  40 #     -json       convert into JSON, parsing numbers, and using null for empty
  41 #
  42 #     -jsons      convert into JSON Strings, where values can only be strings
  43 #     -json-s     convert into JSON Strings, where values can only be strings
  44 #
  45 #     -t          convert into TSV (tab-separated values) format
  46 #     -tab        convert into TSV (tab-separated values) format
  47 #     -tabs       convert into TSV (tab-separated values) format
  48 #     -tsv        convert into TSV (tab-separated values) format
  49 
  50 
  51 from csv import reader
  52 from io import TextIOWrapper
  53 from json import dumps
  54 from math import isinf, isnan
  55 from sys import argv, exit, stderr, stdin, stdout
  56 from typing import Any
  57 from urllib.request import urlopen
  58 
  59 
  60 # info is the help message shown when asked to
  61 info = '''
  62 decsv [options...] [filepath/URI...]
  63 
  64 DE-CSV turns a CSV-format (comma-separated values) table into other common
  65 data formats, like TSV (tab-separated values) and JSON (javascript object
  66 notation).
  67 
  68 All (optional) leading options start with either single or double-dash,
  69 and most of them change the style/color used. Some of the options are,
  70 shown in their single-dash form:
  71 
  72     -h          show this help message
  73     -help       show this help message
  74 
  75     -j          convert into JSON, parsing numbers, and using null for empty
  76     -json       convert into JSON, parsing numbers, and using null for empty
  77 
  78     -jsons      convert into JSON Strings, where values can only be strings
  79     -json-s     convert into JSON Strings, where values can only be strings
  80 
  81     -t          convert into TSV (tab-separated values) format
  82     -tab        convert into TSV (tab-separated values) format
  83     -tabs       convert into TSV (tab-separated values) format
  84     -tsv        convert into TSV (tab-separated values) format
  85 '''.strip()
  86 
  87 # handle standard help cmd-line options, quitting right away in that case
  88 if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
  89     print(info, file=stderr)
  90     exit(0)
  91 
  92 
  93 def narrow_value(s: str) -> Any:
  94     # empty strings become null values
  95     if s == '':
  96         return None
  97 
  98     # try parsing valid floating-point values
  99     try:
 100         f = float(s)
 101         if not (isnan(f) or isinf(f)):
 102             return f
 103     except:
 104         pass
 105 
 106     # can't narrow/parse the string, so keep it as is
 107     return s
 108 
 109 
 110 def to_json(w, rows: reader, narrow = narrow_value) -> None:
 111     try:
 112         header = next(rows)
 113     except:
 114         # not even a header line
 115         w.write('[]\n')
 116         return
 117 
 118     # prepare row-holder dictionary
 119     kv = {}
 120     for s in header:
 121         kv[s] = None
 122 
 123     n = 0
 124     for row in rows:
 125         w.write(',\n  ' if n > 0 else '[\n  ')
 126 
 127         # don't forget to reset values from previous row
 128         for s in header:
 129             kv[s] = None
 130         # update values up to what the current row gives
 131         for j, s in enumerate(row):
 132             kv[header[j]] = narrow(s)
 133 
 134         w.write(dumps(kv))
 135         n += 1
 136 
 137     # don't forget to close top-level array
 138     w.write('[]\n' if n == 0 else '\n]\n')
 139 
 140 
 141 def to_jsons(w, rows: reader) -> None:
 142     return to_json(w, rows, lambda s: s)
 143 
 144 
 145 def to_tsv(w, rows: reader) -> None:
 146     for row in rows:
 147         for i, s in enumerate(row):
 148             if i > 0:
 149                 w.write('\t')
 150             w.write(s)
 151         w.write('\n')
 152 
 153 
 154 def seems_url(s: str) -> bool:
 155     for prot in ('https://', 'http://', 'file://', 'ftp://', 'data:'):
 156         if s.startswith(prot):
 157             return True
 158     return False
 159 
 160 
 161 def handle_input(w, r, outfmt: str) -> None:
 162     fmt2handler = {
 163         'json': to_json,
 164         'jsons': to_jsons,
 165         'tsv': to_tsv,
 166     }
 167 
 168     if outfmt in fmt2handler:
 169         row_reader = reader(r, delimiter=',')
 170         fmt2handler[outfmt](w, row_reader)
 171     else:
 172         raise ValueError(f'unsupported output format {outfmt}')
 173 
 174 
 175 # fmt_names normalizes output-format names, and even has `self-aliases`,
 176 # so a membership check also checks if a name is supported
 177 fmt_names = {
 178     'j': 'json',
 179     'json': 'json',
 180 
 181     'jsons': 'jsons',
 182     'jsonstr': 'jsons',
 183     'jsonstrings': 'jsons',
 184     'json-s': 'jsons',
 185     'json-str': 'jsons',
 186     'json-strings': 'jsons',
 187 
 188     't': 'tsv',
 189     'tab': 'tsv',
 190     'tabs': 'tsv',
 191     'tsv': 'tsv',
 192 }
 193 
 194 args = argv[1:]
 195 # default output format is TSV
 196 out_format = fmt_names['tsv']
 197 
 198 # handle leading output-format option, if present
 199 if len(args) > 0 and args[0].startswith('-'):
 200     s = args[0].lstrip('-').lower()
 201     if s in fmt_names:
 202         out_format = fmt_names[s]
 203         # skip leading arg, since it's clearly not a filepath
 204         args = args[1:]
 205 
 206 try:
 207     if len(args) == 0:
 208         path = '-'
 209     elif len(args) == 1:
 210         path = args[0]
 211     else:
 212         raise ValueError('multiple inputs not allowed')
 213 
 214     stdout.reconfigure(newline='\n', encoding='utf-8')
 215 
 216     if path == '-':
 217         handle_input(stdout, stdin, out_format)
 218     elif seems_url(path):
 219         with urlopen(path) as inp:
 220             # CSV-reader from the stdlib can't use byte-sources directly
 221             with TextIOWrapper(inp, encoding='utf-8') as txt:
 222                 handle_input(stdout, txt.readlines(), out_format)
 223     else:
 224         with open(path) as inp:
 225             handle_input(stdout, inp, out_format)
 226 except (BrokenPipeError, KeyboardInterrupt):
 227     # quit quietly, instead of showing a confusing error message
 228     stderr.flush()
 229     stderr.close()
 230 except Exception as e:
 231     print(f'\x1b[31m{e}\x1b[0m', file=stderr)
 232     exit(1)