File: decsv.py
   1 #!/usr/bin/python
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright (c) 2026 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the "Software"), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 from csv import reader
  27 from json import dumps
  28 from math import isinf, isnan
  29 from sys import argv, exit, stderr, stdin, stdout
  30 from typing import Any
  31 
  32 
  33 info = '''
  34 decsv [options...] [filepath/URI...]
  35 
  36 
  37 DE-CSV turns a CSV-format (Comma-Separated Values) table into other common
  38 data formats, like TSV (Tab-Separated Values) and JSON (JavaScript Object
  39 Notation).
  40 
  41 All (optional) leading options start with either single or double-dash,
  42 and most of them change the style/color used. Some of the options are,
  43 shown in their single-dash form:
  44 
  45     -h, -help    show this help message
  46 
  47     -j          convert into JSON, parsing numbers, and using null for empty
  48     -json       convert into JSON, parsing numbers, and using null for empty
  49 
  50     -jsons      convert into JSON Strings, where values are strings or null
  51     -json-s     convert into JSON Strings, where values are strings or null
  52 
  53     -t          convert into TSV (tab-separated values) format
  54     -tab        convert into TSV (tab-separated values) format
  55     -tabs       convert into TSV (tab-separated values) format
  56     -tsv        convert into TSV (tab-separated values) format
  57 '''
  58 
  59 # handle standard help cmd-line options, quitting right away in that case
  60 if len(argv) > 1 and argv[1] in ('-h', '--h', '-help', '--help'):
  61     print(info.strip())
  62     exit(0)
  63 
  64 
  65 def narrow_value(s: str) -> Any:
  66     # empty strings become null values
  67     if s == '':
  68         return None
  69 
  70     # try parsing valid floating-point values
  71     try:
  72         f = float(s)
  73         if not (isnan(f) or isinf(f)):
  74             return f
  75     except Exception:
  76         pass
  77 
  78     # can't narrow/parse the string, so keep it as is
  79     return s
  80 
  81 
  82 def to_json(w, rows: reader, narrow = narrow_value) -> None:
  83     try:
  84         header = next(rows)
  85     except Exception:
  86         # not even a header line
  87         w.write('[]\n')
  88         return
  89 
  90     # prepare row-holder dictionary
  91     kv = {}
  92     for s in header:
  93         kv[s] = None
  94 
  95     n = 0
  96     for row in rows:
  97         w.write(',\n  ' if n > 0 else '[\n  ')
  98 
  99         # don't forget to reset values from previous row
 100         for s in header:
 101             kv[s] = None
 102         # update values up to what the current row gives
 103         for j, s in enumerate(row):
 104             kv[header[j]] = narrow(s)
 105 
 106         w.write(dumps(kv))
 107         n += 1
 108 
 109     # don't forget to close top-level array
 110     w.write('[]\n' if n == 0 else '\n]\n')
 111 
 112 
 113 def to_jsonl(w, rows: reader, narrow = narrow_value) -> None:
 114     try:
 115         header = next(rows)
 116     except Exception:
 117         # not even a header line
 118         return
 119 
 120     # prepare row-holder dictionary
 121     kv = {}
 122     for s in header:
 123         kv[s] = None
 124 
 125     for row in rows:
 126         # don't forget to reset values from previous row
 127         for s in header:
 128             kv[s] = None
 129         # update values up to what the current row gives
 130         for j, s in enumerate(row):
 131             kv[header[j]] = narrow(s)
 132 
 133         w.write(dumps(kv))
 134         w.write('\n')
 135 
 136 
 137 def to_jsons(w, rows: reader) -> None:
 138     return to_json(w, rows, lambda s: s)
 139 
 140 
 141 def to_tsv(w, rows: reader) -> None:
 142     for row in rows:
 143         for i, s in enumerate(row):
 144             if '\t' in s:
 145                 msg = 'can\'t convert CSV whose items have tabs to TSV'
 146                 raise Exception(msg)
 147             if i > 0:
 148                 w.write('\t')
 149             w.write(s)
 150         w.write('\n')
 151 
 152 
 153 def seems_url(s: str) -> bool:
 154     protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
 155     return any(s.startswith(p) for p in protocols)
 156 
 157 
 158 def handle_input(w, r, outfmt: str) -> None:
 159     fmt2handler = {
 160         'json': to_json,
 161         'jsonl': to_jsonl,
 162         'jsons': to_jsons,
 163         'tsv': to_tsv,
 164     }
 165 
 166     if outfmt in fmt2handler:
 167         row_reader = reader(r, delimiter=',')
 168         fmt2handler[outfmt](w, row_reader)
 169     else:
 170         raise ValueError(f'unsupported output format {outfmt}')
 171 
 172 
 173 # fmt_names normalizes output-format names, and even has `self-aliases`,
 174 # so a membership check also checks if a name is supported
 175 fmt_names = {
 176     'j': 'json',
 177     'json': 'json',
 178 
 179     'jsonl': 'jsonl',
 180 
 181     'jsons': 'jsons',
 182     'jsonstr': 'jsons',
 183     'jsonstrings': 'jsons',
 184     'json-s': 'jsons',
 185     'json-str': 'jsons',
 186     'json-strings': 'jsons',
 187 
 188     't': 'tsv',
 189     'tab': 'tsv',
 190     'tabs': 'tsv',
 191     'tsv': 'tsv',
 192 }
 193 
 194 args = argv[1:]
 195 # default output format is TSV
 196 out_format = fmt_names['tsv']
 197 
 198 # handle leading output-format option, if present
 199 if len(args) > 0 and args[0].startswith('-'):
 200     s = args[0].lstrip('-').lower()
 201     if s in fmt_names:
 202         out_format = fmt_names[s]
 203         # skip leading arg, since it's clearly not a filepath
 204         args = args[1:]
 205 
 206 try:
 207     if len(args) == 0:
 208         path = '-'
 209     elif len(args) == 1:
 210         path = args[0]
 211     else:
 212         raise ValueError('multiple inputs not allowed')
 213 
 214     if path == '-':
 215         handle_input(stdout, stdin, out_format)
 216     elif seems_url(path):
 217         from io import TextIOWrapper
 218         from urllib.request import urlopen
 219         with urlopen(path) as inp:
 220             # CSV-reader from the stdlib can't use byte-sources directly
 221             with TextIOWrapper(inp, encoding='utf-8') as txt:
 222                 handle_input(stdout, txt.readlines(), out_format)
 223     else:
 224         with open(path, encoding='utf-8') as inp:
 225             handle_input(stdout, inp, out_format)
 226 except BrokenPipeError:
 227     # quit quietly, instead of showing a confusing error message
 228     stderr.close()
 229 except KeyboardInterrupt:
 230     exit(2)
 231 except Exception as e:
 232     print(f'\x1b[31m{e}\x1b[0m', file=stderr)
 233     exit(1)