File: decsv.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2020-2025 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 from csv import reader
  27 from json import dumps
  28 from math import isinf, isnan
  29 from sys import argv, exit, stderr, stdin, stdout
  30 from typing import Any
  31 
  32 
  33 info = '''
  34 decsv [options...] [filepath/URI...]
  35 
  36 
  37 DE-CSV turns a CSV-format (Comma-Separated Values) table into other common
  38 data formats, like TSV (Tab-Separated Values) and JSON (JavaScript Object
  39 Notation).
  40 
  41 All (optional) leading options start with either single or double-dash,
  42 and most of them change the style/color used. Some of the options are,
  43 shown in their single-dash form:
  44 
  45     -h          show this help message
  46     -help       show this help message
  47 
  48     -j          convert into JSON, parsing numbers, and using null for empty
  49     -json       convert into JSON, parsing numbers, and using null for empty
  50 
  51     -jsons      convert into JSON Strings, where values are strings or null
  52     -json-s     convert into JSON Strings, where values are strings or null
  53 
  54     -t          convert into TSV (tab-separated values) format
  55     -tab        convert into TSV (tab-separated values) format
  56     -tabs       convert into TSV (tab-separated values) format
  57     -tsv        convert into TSV (tab-separated values) format
  58 '''
  59 
  60 # handle standard help cmd-line options, quitting right away in that case
  61 if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
  62     print(info.strip())
  63     exit(0)
  64 
  65 
  66 def narrow_value(s: str) -> Any:
  67     # empty strings become null values
  68     if s == '':
  69         return None
  70 
  71     # try parsing valid floating-point values
  72     try:
  73         f = float(s)
  74         if not (isnan(f) or isinf(f)):
  75             return f
  76     except Exception:
  77         pass
  78 
  79     # can't narrow/parse the string, so keep it as is
  80     return s
  81 
  82 
  83 def to_json(w, rows: reader, narrow = narrow_value) -> None:
  84     try:
  85         header = next(rows)
  86     except Exception:
  87         # not even a header line
  88         w.write('[]\n')
  89         return
  90 
  91     # prepare row-holder dictionary
  92     kv = {}
  93     for s in header:
  94         kv[s] = None
  95 
  96     n = 0
  97     for row in rows:
  98         w.write(',\n  ' if n > 0 else '[\n  ')
  99 
 100         # don't forget to reset values from previous row
 101         for s in header:
 102             kv[s] = None
 103         # update values up to what the current row gives
 104         for j, s in enumerate(row):
 105             kv[header[j]] = narrow(s)
 106 
 107         w.write(dumps(kv))
 108         n += 1
 109 
 110     # don't forget to close top-level array
 111     w.write('[]\n' if n == 0 else '\n]\n')
 112 
 113 
 114 def to_jsonl(w, rows: reader, narrow = narrow_value) -> None:
 115     try:
 116         header = next(rows)
 117     except Exception:
 118         # not even a header line
 119         return
 120 
 121     # prepare row-holder dictionary
 122     kv = {}
 123     for s in header:
 124         kv[s] = None
 125 
 126     for row in rows:
 127         # don't forget to reset values from previous row
 128         for s in header:
 129             kv[s] = None
 130         # update values up to what the current row gives
 131         for j, s in enumerate(row):
 132             kv[header[j]] = narrow(s)
 133 
 134         w.write(dumps(kv))
 135         w.write('\n')
 136 
 137 
 138 def to_jsons(w, rows: reader) -> None:
 139     return to_json(w, rows, lambda s: s)
 140 
 141 
 142 def to_tsv(w, rows: reader) -> None:
 143     for row in rows:
 144         for i, s in enumerate(row):
 145             if '\t' in s:
 146                 msg = 'can\'t convert CSV whose items have tabs to TSV'
 147                 raise Exception(msg)
 148             if i > 0:
 149                 w.write('\t')
 150             w.write(s)
 151         w.write('\n')
 152 
 153 
 154 def seems_url(s: str) -> bool:
 155     protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
 156     return any(s.startswith(p) for p in protocols)
 157 
 158 
 159 def handle_input(w, r, outfmt: str) -> None:
 160     fmt2handler = {
 161         'json': to_json,
 162         'jsonl': to_jsonl,
 163         'jsons': to_jsons,
 164         'tsv': to_tsv,
 165     }
 166 
 167     if outfmt in fmt2handler:
 168         row_reader = reader(r, delimiter=',')
 169         fmt2handler[outfmt](w, row_reader)
 170     else:
 171         raise ValueError(f'unsupported output format {outfmt}')
 172 
 173 
 174 # fmt_names normalizes output-format names, and even has `self-aliases`,
 175 # so a membership check also checks if a name is supported
 176 fmt_names = {
 177     'j': 'json',
 178     'json': 'json',
 179 
 180     'jsonl': 'jsonl',
 181 
 182     'jsons': 'jsons',
 183     'jsonstr': 'jsons',
 184     'jsonstrings': 'jsons',
 185     'json-s': 'jsons',
 186     'json-str': 'jsons',
 187     'json-strings': 'jsons',
 188 
 189     't': 'tsv',
 190     'tab': 'tsv',
 191     'tabs': 'tsv',
 192     'tsv': 'tsv',
 193 }
 194 
 195 args = argv[1:]
 196 # default output format is TSV
 197 out_format = fmt_names['tsv']
 198 
 199 # handle leading output-format option, if present
 200 if len(args) > 0 and args[0].startswith('-'):
 201     s = args[0].lstrip('-').lower()
 202     if s in fmt_names:
 203         out_format = fmt_names[s]
 204         # skip leading arg, since it's clearly not a filepath
 205         args = args[1:]
 206 
 207 try:
 208     if len(args) == 0:
 209         path = '-'
 210     elif len(args) == 1:
 211         path = args[0]
 212     else:
 213         raise ValueError('multiple inputs not allowed')
 214 
 215     if path == '-':
 216         handle_input(stdout, stdin, out_format)
 217     elif seems_url(path):
 218         from io import TextIOWrapper
 219         from urllib.request import urlopen
 220         with urlopen(path) as inp:
 221             # CSV-reader from the stdlib can't use byte-sources directly
 222             with TextIOWrapper(inp, encoding='utf-8') as txt:
 223                 handle_input(stdout, txt.readlines(), out_format)
 224     else:
 225         with open(path, encoding='utf-8') as inp:
 226             handle_input(stdout, inp, out_format)
 227 except BrokenPipeError:
 228     # quit quietly, instead of showing a confusing error message
 229     stderr.close()
 230 except KeyboardInterrupt:
 231     exit(2)
 232 except Exception as e:
 233     print(f'\x1b[31m{e}\x1b[0m', file=stderr)
 234     exit(1)