File: decsv.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2024 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 from csv import reader
  27 from io import TextIOWrapper
  28 from json import dumps
  29 from math import isinf, isnan
  30 from sys import argv, exit, stderr, stdin, stdout
  31 from typing import Any
  32 
  33 
  34 info = '''
  35 decsv [options...] [filepath/URI...]
  36 
  37 
  38 DE-CSV turns a CSV-format (Comma-Separated Values) table into other common
  39 data formats, like TSV (Tab-Separated Values) and JSON (JavaScript Object
  40 Notation).
  41 
  42 All (optional) leading options start with either single or double-dash,
  43 and most of them change the style/color used. Some of the options are,
  44 shown in their single-dash form:
  45 
  46     -h          show this help message
  47     -help       show this help message
  48 
  49     -j          convert into JSON, parsing numbers, and using null for empty
  50     -json       convert into JSON, parsing numbers, and using null for empty
  51 
  52     -jsons      convert into JSON Strings, where values are strings or null
  53     -json-s     convert into JSON Strings, where values are strings or null
  54 
  55     -t          convert into TSV (tab-separated values) format
  56     -tab        convert into TSV (tab-separated values) format
  57     -tabs       convert into TSV (tab-separated values) format
  58     -tsv        convert into TSV (tab-separated values) format
  59 '''
  60 
  61 # handle standard help cmd-line options, quitting right away in that case
  62 if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
  63     print(info.strip(), file=stderr)
  64     exit(0)
  65 
  66 
  67 def narrow_value(s: str) -> Any:
  68     # empty strings become null values
  69     if s == '':
  70         return None
  71 
  72     # try parsing valid floating-point values
  73     try:
  74         f = float(s)
  75         if not (isnan(f) or isinf(f)):
  76             return f
  77     except Exception:
  78         pass
  79 
  80     # can't narrow/parse the string, so keep it as is
  81     return s
  82 
  83 
  84 def to_json(w, rows: reader, narrow = narrow_value) -> None:
  85     try:
  86         header = next(rows)
  87     except Exception:
  88         # not even a header line
  89         w.write('[]\n')
  90         return
  91 
  92     # prepare row-holder dictionary
  93     kv = {}
  94     for s in header:
  95         kv[s] = None
  96 
  97     n = 0
  98     for row in rows:
  99         w.write(',\n  ' if n > 0 else '[\n  ')
 100 
 101         # don't forget to reset values from previous row
 102         for s in header:
 103             kv[s] = None
 104         # update values up to what the current row gives
 105         for j, s in enumerate(row):
 106             kv[header[j]] = narrow(s)
 107 
 108         w.write(dumps(kv))
 109         n += 1
 110 
 111     # don't forget to close top-level array
 112     w.write('[]\n' if n == 0 else '\n]\n')
 113 
 114 
 115 def to_jsons(w, rows: reader) -> None:
 116     return to_json(w, rows, lambda s: s)
 117 
 118 
 119 def to_tsv(w, rows: reader) -> None:
 120     for row in rows:
 121         for i, s in enumerate(row):
 122             if '\t' in s:
 123                 msg = 'can\'t convert CSV whose items have tabs to TSV'
 124                 raise Exception(msg)
 125             if i > 0:
 126                 w.write('\t')
 127             w.write(s)
 128         w.write('\n')
 129 
 130 
 131 def seems_url(s: str) -> bool:
 132     protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
 133     return any(s.startswith(p) for p in protocols)
 134 
 135 
 136 def handle_input(w, r, outfmt: str) -> None:
 137     fmt2handler = {
 138         'json': to_json,
 139         'jsons': to_jsons,
 140         'tsv': to_tsv,
 141     }
 142 
 143     if outfmt in fmt2handler:
 144         row_reader = reader(r, delimiter=',')
 145         fmt2handler[outfmt](w, row_reader)
 146     else:
 147         raise ValueError(f'unsupported output format {outfmt}')
 148 
 149 
 150 # fmt_names normalizes output-format names, and even has `self-aliases`,
 151 # so a membership check also checks if a name is supported
 152 fmt_names = {
 153     'j': 'json',
 154     'json': 'json',
 155 
 156     'jsons': 'jsons',
 157     'jsonstr': 'jsons',
 158     'jsonstrings': 'jsons',
 159     'json-s': 'jsons',
 160     'json-str': 'jsons',
 161     'json-strings': 'jsons',
 162 
 163     't': 'tsv',
 164     'tab': 'tsv',
 165     'tabs': 'tsv',
 166     'tsv': 'tsv',
 167 }
 168 
 169 args = argv[1:]
 170 # default output format is TSV
 171 out_format = fmt_names['tsv']
 172 
 173 # handle leading output-format option, if present
 174 if len(args) > 0 and args[0].startswith('-'):
 175     s = args[0].lstrip('-').lower()
 176     if s in fmt_names:
 177         out_format = fmt_names[s]
 178         # skip leading arg, since it's clearly not a filepath
 179         args = args[1:]
 180 
 181 try:
 182     if len(args) == 0:
 183         path = '-'
 184     elif len(args) == 1:
 185         path = args[0]
 186     else:
 187         raise ValueError('multiple inputs not allowed')
 188 
 189     if path == '-':
 190         handle_input(stdout, stdin, out_format)
 191     elif seems_url(path):
 192         from urllib.request import urlopen
 193         with urlopen(path) as inp:
 194             # CSV-reader from the stdlib can't use byte-sources directly
 195             with TextIOWrapper(inp, encoding='utf-8') as txt:
 196                 handle_input(stdout, txt.readlines(), out_format)
 197     else:
 198         with open(path, encoding='utf-8') as inp:
 199             handle_input(stdout, inp, out_format)
 200 except BrokenPipeError:
 201     # quit quietly, instead of showing a confusing error message
 202     stderr.close()
 203 except KeyboardInterrupt:
 204     exit(2)
 205 except Exception as e:
 206     print(f'\x1b[31m{e}\x1b[0m', file=stderr)
 207     exit(1)