#!/usr/bin/python

# The MIT License (MIT)
#
# Copyright (c) 2026 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from csv import reader
from json import dumps
from math import isinf, isnan
from sys import argv, exit, stderr, stdin, stdout
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional


info = '''
decsv [options...] [filepath/URI...]


DE-CSV turns a CSV-format (Comma-Separated Values) table into other common
data formats, like TSV (Tab-Separated Values) and JSON (JavaScript Object
Notation).

All (optional) leading options start with either single or double-dash,
and pick the output format, which is TSV by default. Some of the options
are, shown in their single-dash form:

    -h, -help    show this help message

    -j           convert into JSON, parsing numbers, and using null for empty
    -json        convert into JSON, parsing numbers, and using null for empty

    -jsonl       convert into JSON Lines, parsing numbers like option -json

    -jsons       convert into JSON Strings, where values are strings or null
    -json-s      convert into JSON Strings, where values are strings or null

    -t           convert into TSV (tab-separated values) format
    -tab         convert into TSV (tab-separated values) format
    -tabs        convert into TSV (tab-separated values) format
    -tsv         convert into TSV (tab-separated values) format
'''


def narrow_value(s: str) -> Any:
    """Turn a CSV item into the most specific JSON-friendly value.

    Empty strings become None, strings which parse as finite floating-point
    numbers become floats, and anything else is returned unchanged.
    """

    # empty strings become null values
    if s == '':
        return None

    # try parsing valid floating-point values
    try:
        f = float(s)
    except ValueError:
        # can't narrow/parse the string, so keep it as is
        return s

    # NaN and the infinities have no JSON representation: keep their text
    if isnan(f) or isinf(f):
        return s
    return f


def _records(
    rows: Iterable[List[str]], narrow: Callable[[str], Any],
) -> Iterator[Dict[str, Any]]:
    """Yield a dictionary for each data row, keyed by the header row's items.

    The first row is the header. Rows shorter than the header get None for
    their missing trailing items; rows longer than the header are an error.
    Nothing is yielded when there isn't even a header row.

    Raises:
        ValueError: when a data row has more items than the header.
    """

    rows = iter(rows)
    # a default value avoids catching exceptions, so that decoding/parsing
    # errors from the first line aren't mistaken for empty input
    header = next(rows, None)
    if header is None:
        # not even a header line
        return

    for i, row in enumerate(rows, start=1):
        if len(row) > len(header):
            msg = f'data row {i} has {len(row)} items, but the header ' + \
                f'has only {len(header)}'
            raise ValueError(msg)

        # start from all-null values, so short rows end with nulls
        kv = dict.fromkeys(header)
        # update values up to what the current row gives
        for k, s in zip(header, row):
            kv[k] = narrow(s)
        yield kv


def to_json(w, rows: Iterable[List[str]], narrow = narrow_value) -> None:
    """Write all data rows as a single JSON array of objects, one per line.

    Args:
        w: the text output, which only needs a `write` method.
        rows: the parsed CSV rows, starting with the header row.
        narrow: the func which turns each item into its output value.

    Raises:
        ValueError: when a data row has more items than the header.
    """

    n = 0
    for kv in _records(rows, narrow):
        w.write(',\n ' if n > 0 else '[\n ')
        w.write(dumps(kv))
        n += 1

    # don't forget to close top-level array
    w.write('[]\n' if n == 0 else '\n]\n')


def to_jsonl(w, rows: Iterable[List[str]], narrow = narrow_value) -> None:
    """Write each data row as a JSON object on its own line (JSON Lines).

    Args:
        w: the text output, which only needs a `write` method.
        rows: the parsed CSV rows, starting with the header row.
        narrow: the func which turns each item into its output value.

    Raises:
        ValueError: when a data row has more items than the header.
    """

    for kv in _records(rows, narrow):
        w.write(dumps(kv))
        w.write('\n')


def to_jsons(w, rows: Iterable[List[str]]) -> None:
    """Write rows like to_json does, except all given items stay strings."""
    return to_json(w, rows, lambda s: s)


def to_tsv(w, rows: Iterable[List[str]]) -> None:
    """Write all rows as lines of tab-separated items.

    Raises:
        ValueError: when an item has tabs or line-breaks, since TSV has no
            way to escape those.
    """

    for row in rows:
        for s in row:
            if '\t' in s:
                msg = 'can\'t convert CSV whose items have tabs to TSV'
                raise ValueError(msg)
            if '\n' in s or '\r' in s:
                msg = 'can\'t convert CSV whose items have line-breaks to TSV'
                raise ValueError(msg)
        w.write('\t'.join(row))
        w.write('\n')


def seems_url(s: str) -> bool:
    """Check if a string starts with any of the supported URI schemes."""
    protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return s.startswith(protocols)


# fmt2handler maps normalized output-format names to their converter funcs
fmt2handler = {
    'json': to_json,
    'jsonl': to_jsonl,
    'jsons': to_jsons,
    'tsv': to_tsv,
}


def handle_input(w, r, outfmt: str) -> None:
    """Convert CSV text from input r into the format given, writing it to w.

    Args:
        w: the text output, which only needs a `write` method.
        r: any iterable of CSV lines, such as a file opened in text mode.
        outfmt: a normalized output-format name, such as 'json' or 'tsv'.

    Raises:
        ValueError: when the output format isn't supported, or when the
            data given can't be converted.
    """

    if outfmt not in fmt2handler:
        raise ValueError(f'unsupported output format {outfmt}')
    fmt2handler[outfmt](w, reader(r, delimiter=','))


# fmt_names normalizes output-format names, and even has `self-aliases`,
# so a membership check also checks if a name is supported
fmt_names = {
    'j': 'json',
    'json': 'json',

    'jsonl': 'jsonl',

    'jsons': 'jsons',
    'jsonstr': 'jsons',
    'jsonstrings': 'jsons',
    'json-s': 'jsons',
    'json-str': 'jsons',
    'json-strings': 'jsons',

    't': 'tsv',
    'tab': 'tsv',
    'tabs': 'tsv',
    'tsv': 'tsv',
}


def main(args: Optional[List[str]] = None) -> int:
    """Run the app, returning its exit code.

    Args:
        args: the cmd-line arguments, without the app name; when not given,
            these come from the actual cmd-line.
    """

    args = list(argv[1:] if args is None else args)

    # handle standard help cmd-line options, quitting right away in that case
    if len(args) > 0 and args[0] in ('-h', '--h', '-help', '--help'):
        print(info.strip())
        return 0

    # default output format is TSV
    out_format = fmt_names['tsv']

    # handle leading output-format option, if present
    if len(args) > 0 and args[0].startswith('-'):
        s = args[0].lstrip('-').lower()
        if s in fmt_names:
            out_format = fmt_names[s]
            # skip leading arg, since it's clearly not a filepath
            args = args[1:]

    try:
        if len(args) == 0:
            path = '-'
        elif len(args) == 1:
            path = args[0]
        else:
            raise ValueError('multiple inputs not allowed')

        if path == '-':
            handle_input(stdout, stdin, out_format)
        elif seems_url(path):
            from io import TextIOWrapper
            from urllib.request import urlopen
            with urlopen(path) as inp:
                # CSV-reader from the stdlib can't use byte-sources directly;
                # disabling newline-translation is what the csv module asks
                # for, so line-breaks inside quoted items are kept as given
                with TextIOWrapper(inp, encoding='utf-8', newline='') as txt:
                    # stream lines, instead of loading them all at once
                    handle_input(stdout, txt, out_format)
        else:
            with open(path, encoding='utf-8', newline='') as inp:
                handle_input(stdout, inp, out_format)
    except BrokenPipeError:
        # quit quietly, instead of showing a confusing error message
        stderr.close()
    except KeyboardInterrupt:
        return 2
    except Exception as e:
        print(f'\x1b[31m{e}\x1b[0m', file=stderr)
        return 1
    return 0


# only run the app when used as a script, so importing this file has no
# side-effects
if __name__ == '__main__':
    exit(main())