#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2020-2025 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from csv import reader
from json import dumps
from math import isinf, isnan
from sys import argv, exit, stderr, stdin, stdout
from typing import Any, Callable, Dict, Iterable, Iterator, List, TextIO


info = '''
decsv [options...] [filepath/URI...]


DE-CSV turns a CSV-format (Comma-Separated Values) table into other common
data formats, like TSV (Tab-Separated Values) and JSON (JavaScript Object
Notation).

All (optional) leading options start with either single or double-dash,
and pick the output format, which is TSV by default. Some of the options
are, shown in their single-dash form:

    -h          show this help message
    -help       show this help message

    -j          convert into JSON, parsing numbers, and using null for empty
    -json       convert into JSON, parsing numbers, and using null for empty

    -jsonl      convert into JSON Lines, emitting one JSON object per line

    -jsons      convert into JSON Strings, where values are strings or null
    -json-s     convert into JSON Strings, where values are strings or null

    -t          convert into TSV (tab-separated values) format
    -tab        convert into TSV (tab-separated values) format
    -tabs       convert into TSV (tab-separated values) format
    -tsv        convert into TSV (tab-separated values) format
'''


def narrow_value(s: str) -> Any:
    """Turn a CSV item into the value to emit as JSON.

    Empty strings become None, strings which parse as finite floats become
    floats, and anything else is returned unchanged.
    """

    # empty strings become null values
    if s == '':
        return None

    # try parsing valid floating-point values
    try:
        f = float(s)
    except ValueError:
        # can't narrow/parse the string, so keep it as is
        return s

    # NaN and the infinities have no JSON representation, so the original
    # text is kept for those
    if isnan(f) or isinf(f):
        return s
    return f


def _iter_objects(
    rows: Iterable[List[str]], narrow: Callable[[str], Any],
) -> Iterator[Dict[str, Any]]:
    """Yield one dictionary per data row, keyed by the header row's items.

    Nothing is yielded when there isn't even a header row. Rows shorter than
    the header get None for their missing trailing items. Rows longer than
    the header raise a ValueError, since there's no key for the extra items.
    """

    it = iter(rows)
    header = next(it, None)
    if header is None:
        # not even a header line
        return

    for n, row in enumerate(it, start=1):
        if len(row) > len(header):
            msg = f'data row {n} has {len(row)} items, ' + \
                f'but the header only has {len(header)}'
            raise ValueError(msg)

        # start each row with all keys set to null, then update values up
        # to what the current row gives
        kv = dict.fromkeys(header)
        for k, s in zip(header, row):
            kv[k] = narrow(s)
        yield kv


def to_json(w, rows: Iterator[List[str]], narrow = narrow_value) -> None:
    """Write all data rows as a single JSON array of objects.

    The first row gives the keys for all objects. An input without any data
    rows results in an empty array. Each item is transformed using the
    `narrow` func given, before being emitted.
    """

    n = 0
    for kv in _iter_objects(rows, narrow):
        w.write(',\n ' if n > 0 else '[\n ')
        w.write(dumps(kv))
        n += 1

    # don't forget to close top-level array
    w.write('[]\n' if n == 0 else '\n]\n')


def to_jsonl(w, rows: Iterator[List[str]], narrow = narrow_value) -> None:
    """Write each data row as a JSON object on its own line (JSON Lines).

    The first row gives the keys for all objects. Nothing is written when
    there are no data rows. Each item is transformed using the `narrow` func
    given, before being emitted.
    """

    for kv in _iter_objects(rows, narrow):
        w.write(dumps(kv))
        w.write('\n')


def to_jsons(w, rows: Iterator[List[str]]) -> None:
    """Write all data rows as a JSON array, keeping all items as strings."""
    return to_json(w, rows, lambda s: s)


def to_tsv(w, rows: Iterable[List[str]]) -> None:
    """Write all rows as TSV lines, including the header row.

    A ValueError is raised when any item has tabs or line breaks in it,
    since plain TSV has no way to escape either of those.
    """

    for row in rows:
        # check the whole row before emitting any part of it
        for s in row:
            if '\t' in s:
                msg = 'can\'t convert CSV whose items have tabs to TSV'
                raise ValueError(msg)
            if '\n' in s or '\r' in s:
                msg = 'can\'t convert CSV whose items have line breaks to TSV'
                raise ValueError(msg)
        w.write('\t'.join(row))
        w.write('\n')


def seems_url(s: str) -> bool:
    """Check if a string starts with one of the supported URI schemes."""
    protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return s.startswith(protocols)


def handle_input(w, r: Iterable[str], outfmt: str) -> None:
    """Convert the CSV lines from `r` into the format named, writing to `w`.

    The format name must be one of the normalized names, which are the
    values in `fmt_names`: a ValueError is raised otherwise.
    """

    fmt2handler = {
        'json': to_json,
        'jsonl': to_jsonl,
        'jsons': to_jsons,
        'tsv': to_tsv,
    }

    handler = fmt2handler.get(outfmt)
    if handler is None:
        raise ValueError(f'unsupported output format {outfmt}')
    handler(w, reader(r, delimiter=','))


# fmt_names normalizes output-format names, and even has `self-aliases`,
# so a membership check also checks if a name is supported
fmt_names = {
    'j': 'json',
    'json': 'json',

    'jsonl': 'jsonl',

    'jsons': 'jsons',
    'jsonstr': 'jsons',
    'jsonstrings': 'jsons',
    'json-s': 'jsons',
    'json-str': 'jsons',
    'json-strings': 'jsons',

    't': 'tsv',
    'tab': 'tsv',
    'tabs': 'tsv',
    'tsv': 'tsv',
}


def main(args: List[str]) -> int:
    """Run the app using the cmd-line args given, which exclude the app name.

    The result is the exit code for the app: 0 on success, 1 on errors, and
    2 when interrupted from the keyboard.
    """

    # handle standard help cmd-line options, quitting right away in that case
    if len(args) == 1 and args[0] in ('-h', '--h', '-help', '--help'):
        print(info.strip())
        return 0

    # default output format is TSV
    out_format = fmt_names['tsv']

    # handle leading output-format option, if present
    if len(args) > 0 and args[0].startswith('-'):
        s = args[0].lstrip('-').lower()
        if s in fmt_names:
            out_format = fmt_names[s]
            # skip leading arg, since it's clearly not a filepath
            args = args[1:]

    try:
        if len(args) == 0:
            path = '-'
        elif len(args) == 1:
            path = args[0]
        else:
            raise ValueError('multiple inputs not allowed')

        if path == '-':
            handle_input(stdout, stdin, out_format)
        elif seems_url(path):
            from io import TextIOWrapper
            from urllib.request import urlopen
            with urlopen(path) as inp:
                # CSV-reader from the stdlib can't use byte-sources directly;
                # it also needs line-endings left untranslated (newline='')
                # to handle line breaks inside quoted items correctly
                with TextIOWrapper(inp, encoding='utf-8', newline='') as txt:
                    handle_input(stdout, txt, out_format)
        else:
            # newline='' is what the stdlib's csv docs ask for
            with open(path, encoding='utf-8', newline='') as inp:
                handle_input(stdout, inp, out_format)
    except BrokenPipeError:
        # quit quietly, instead of showing a confusing error message
        stderr.close()
    except KeyboardInterrupt:
        return 2
    except Exception as e:
        print(f'\x1b[31m{e}\x1b[0m', file=stderr)
        return 1
    return 0


if __name__ == '__main__':
    exit(main(argv[1:]))