#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2024 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


# decsv [options...] [filepath/URI...]
#
# DE-CSV turns a CSV-format (comma-separated values) table into other common
# data formats, like TSV (tab-separated values) and JSON (javascript object
# notation).
#
# All (optional) leading options start with either single or double-dash,
# and select the output format. Some of the options are, shown in their
# single-dash form:
#
#   -h         show this help message
#   -help      show this help message
#
#   -j         convert into JSON, parsing numbers, and using null for empty
#   -json      convert into JSON, parsing numbers, and using null for empty
#
#   -jsons     convert into JSON Strings, where values can only be strings
#   -json-s    convert into JSON Strings, where values can only be strings
#
#   -t         convert into TSV (tab-separated values) format
#   -tab       convert into TSV (tab-separated values) format
#   -tabs      convert into TSV (tab-separated values) format
#   -tsv       convert into TSV (tab-separated values) format


from csv import reader
from io import TextIOWrapper
from json import dumps
from math import isinf, isnan
from sys import argv, exit, stderr, stdin, stdout
from typing import Any, Callable, Iterable, List, TextIO
from urllib.request import urlopen


# info is the help message shown when asked to
info = '''
decsv [options...] [filepath/URI...]

DE-CSV turns a CSV-format (comma-separated values) table into other common
data formats, like TSV (tab-separated values) and JSON (javascript object
notation).

All (optional) leading options start with either single or double-dash,
and select the output format. Some of the options are, shown in their
single-dash form:

    -h         show this help message
    -help      show this help message

    -j         convert into JSON, parsing numbers, and using null for empty
    -json      convert into JSON, parsing numbers, and using null for empty

    -jsons     convert into JSON Strings, where values can only be strings
    -json-s    convert into JSON Strings, where values can only be strings

    -t         convert into TSV (tab-separated values) format
    -tab       convert into TSV (tab-separated values) format
    -tabs      convert into TSV (tab-separated values) format
    -tsv       convert into TSV (tab-separated values) format
'''.strip()


def narrow_value(s: str) -> Any:
    """Turn a CSV field into the most specific JSON-friendly value.

    Returns None for the empty string, a float when the text parses as a
    finite floating-point number, and the unchanged string otherwise.
    """

    # empty strings become null values
    if s == '':
        return None

    # try parsing valid floating-point values
    try:
        f = float(s)
    except ValueError:
        # can't narrow/parse the string, so keep it as is
        return s

    # NaN and the infinities have no JSON representation, so text such as
    # `nan` or `inf` stays a string
    if isnan(f) or isinf(f):
        return s
    return f


def to_json(
    w: TextIO,
    rows: Iterable[List[str]],
    narrow: Callable[[str], Any] = narrow_value,
) -> None:
    """Write rows as a JSON array of objects, keyed by the header row.

    w is the text destination; rows yields lists of strings, the first
    being the header; narrow converts each field before serialization.

    Fields missing from a short row are emitted as null, while fields
    beyond the header's width have no key to go under and are ignored.
    Input with no data rows (or no header at all) results in `[]`.
    """

    rows = iter(rows)
    header = next(rows, None)
    if header is None:
        # not even a header line
        w.write('[]\n')
        return

    n = 0
    for row in rows:
        w.write(',\n ' if n > 0 else '[\n ')

        # start each object with every header key set to null, so short
        # rows still show all keys, in header order
        kv = dict.fromkeys(header)
        # zip stops at the shorter sequence, which avoids indexing past
        # the header when a row has extra trailing fields
        for key, s in zip(header, row):
            kv[key] = narrow(s)

        w.write(dumps(kv))
        n += 1

    # don't forget to close top-level array
    w.write('[]\n' if n == 0 else '\n]\n')


def to_jsons(w: TextIO, rows: Iterable[List[str]]) -> None:
    """Write rows as a JSON array of objects, keeping fields as strings.

    Works like to_json, except fields aren't narrowed; fields missing from
    short rows are still emitted as null.
    """

    return to_json(w, rows, lambda s: s)


def to_tsv(w: TextIO, rows: Iterable[List[str]]) -> None:
    """Write rows as lines of tab-separated values.

    Fields are written verbatim: any tab or newline inside a field ends up
    as is in the output.
    """

    for row in rows:
        w.write('\t'.join(row))
        w.write('\n')


def seems_url(s: str) -> bool:
    """Check whether the string given starts with a supported URI scheme."""

    prefixes = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return s.startswith(prefixes)


# _fmt2handler maps normalized output-format names to their converters
_fmt2handler = {
    'json': to_json,
    'jsons': to_jsons,
    'tsv': to_tsv,
}


def handle_input(w: TextIO, r: Iterable[str], outfmt: str) -> None:
    """Convert CSV lines from r into the format named, writing into w.

    r is anything the stdlib CSV reader accepts, such as an open text file
    or a list of lines; outfmt is one of `json`, `jsons`, or `tsv`.

    Raises ValueError when the output format isn't supported.
    """

    handler = _fmt2handler.get(outfmt)
    if handler is None:
        raise ValueError(f'unsupported output format {outfmt}')
    handler(w, reader(r, delimiter=','))


# fmt_names normalizes output-format names, and even has `self-aliases`,
# so a membership check also checks if a name is supported
fmt_names = {
    'j': 'json',
    'json': 'json',

    'jsons': 'jsons',
    'jsonstr': 'jsons',
    'jsonstrings': 'jsons',
    'json-s': 'jsons',
    'json-str': 'jsons',
    'json-strings': 'jsons',

    't': 'tsv',
    'tab': 'tsv',
    'tabs': 'tsv',
    'tsv': 'tsv',
}


def main() -> None:
    """Run the command-line app, using the arguments in sys.argv."""

    # handle standard help cmd-line options, quitting right away in that case
    if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
        print(info, file=stderr)
        exit(0)

    args = argv[1:]
    # default output format is TSV
    out_format = fmt_names['tsv']

    # handle leading output-format option, if present
    if len(args) > 0 and args[0].startswith('-'):
        s = args[0].lstrip('-').lower()
        if s in fmt_names:
            out_format = fmt_names[s]
        # skip leading arg, since it's clearly not a filepath; this also
        # covers a lone `-`, which then falls back to standard input below
        args = args[1:]

    try:
        if len(args) == 0:
            path = '-'
        elif len(args) == 1:
            path = args[0]
        else:
            raise ValueError('multiple inputs not allowed')

        stdout.reconfigure(newline='\n', encoding='utf-8')

        if path == '-':
            handle_input(stdout, stdin, out_format)
        elif seems_url(path):
            with urlopen(path) as inp:
                # CSV-reader from the stdlib can't use byte-sources directly;
                # newline='' lets the CSV reader see line-endings untouched
                with TextIOWrapper(inp, encoding='utf-8', newline='') as txt:
                    handle_input(stdout, txt.readlines(), out_format)
        else:
            # newline='' is what the csv module asks for, so newlines inside
            # quoted fields aren't translated
            with open(path, newline='') as inp:
                handle_input(stdout, inp, out_format)
    except (BrokenPipeError, KeyboardInterrupt):
        # quit quietly, instead of showing a confusing error message
        stderr.flush()
        stderr.close()
    except Exception as e:
        print(f'\x1b[31m{e}\x1b[0m', file=stderr)
        exit(1)


if __name__ == '__main__':
    main()