#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2024 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""Turn a CSV table into TSV or JSON; the `info` string is the cmd-line help."""


from csv import reader
from io import TextIOWrapper
from json import dumps
from math import isinf, isnan
from sys import argv, exit, stderr, stdin, stdout
from typing import Any, Callable, Iterable, List


info = '''
decsv [options...] [filepath/URI...]


DE-CSV turns a CSV-format (Comma-Separated Values) table into other common
data formats, like TSV (Tab-Separated Values) and JSON (JavaScript Object
Notation).

When no filepath/URI is given, or when it's a single dash, the input is
read from the standard input.

All (optional) leading options start with either single or double-dash,
and most of them pick the output format (TSV by default). Some of the
options are, shown in their single-dash form:

    -h          show this help message
    -help       show this help message

    -j          convert into JSON, parsing numbers, and using null for empty
    -json       convert into JSON, parsing numbers, and using null for empty

    -jsons      convert into JSON Strings, where values are strings or null
    -json-s     convert into JSON Strings, where values are strings or null

    -t          convert into TSV (tab-separated values) format
    -tab        convert into TSV (tab-separated values) format
    -tabs       convert into TSV (tab-separated values) format
    -tsv        convert into TSV (tab-separated values) format
'''


def narrow_value(s: str) -> Any:
    """Turn a CSV item into the JSON-friendly value it seems to stand for.

    Empty strings become None, strings which parse as finite floating-point
    numbers become floats, and anything else is given back unchanged.
    """

    # empty strings become null values
    if s == '':
        return None

    # try parsing valid floating-point values: NaN and the infinities are
    # kept as strings, since JSON has no way to represent them
    try:
        f = float(s)
    except ValueError:
        # can't narrow/parse the string, so keep it as is
        return s
    return s if isnan(f) or isinf(f) else f


def to_json(
    w,
    rows: Iterable[List[str]],
    narrow: Callable[[str], Any] = narrow_value,
) -> None:
    """Write a table as a JSON array of objects, one object per data row.

    The first row is the header, and gives all objects their keys. Rows with
    fewer items than the header get null for their missing trailing values.

    Args:
        w: the text destination, which only needs a `write` method.
        rows: the table rows, header first, each row being a list of strings.
        narrow: the func turning each item into the value to put in the JSON.

    Raises:
        ValueError: when a data row has more items than the header does.
    """

    it = iter(rows)
    header = next(it, None)
    if header is None:
        # not even a header line
        w.write('[]\n')
        return

    n = 0
    for row in it:
        # extra items have no key to go with: fail with a clear message,
        # and before emitting anything for the offending row
        if len(row) > len(header):
            msg = f'data row {n + 1} has {len(row)} items, ' + \
                f'but the header only has {len(header)}'
            raise ValueError(msg)

        w.write(',\n ' if n > 0 else '[\n ')

        # start from all-null values, so short rows end up null-padded
        kv = dict.fromkeys(header)
        # update values up to what the current row gives
        for name, s in zip(header, row):
            kv[name] = narrow(s)

        w.write(dumps(kv))
        n += 1

    # don't forget to close top-level array
    w.write('[]\n' if n == 0 else '\n]\n')


def to_jsons(w, rows: Iterable[List[str]]) -> None:
    """Write a table as JSON like `to_json`, keeping all items as strings.

    Values missing from rows shorter than the header are still null.
    """
    return to_json(w, rows, lambda s: s)


def to_tsv(w, rows: Iterable[List[str]]) -> None:
    """Write a table as TSV (tab-separated values) lines.

    Args:
        w: the text destination, which only needs a `write` method.
        rows: the table rows, each being a list of strings.

    Raises:
        ValueError: when an item has a tab or a line-break in it, since TSV
            has no way to escape either of those.
    """

    for row in rows:
        # check whole rows before writing any part of them, so failures
        # never leave half-written lines behind
        for s in row:
            if '\t' in s:
                msg = 'can\'t convert CSV whose items have tabs to TSV'
                raise ValueError(msg)
            if '\n' in s or '\r' in s:
                msg = 'can\'t convert CSV whose items have line-breaks to TSV'
                raise ValueError(msg)

        w.write('\t'.join(row))
        w.write('\n')


def seems_url(s: str) -> bool:
    """Check if a string starts with any of the URI schemes supported."""
    protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return s.startswith(protocols)


def handle_input(w, r: Iterable[str], outfmt: str) -> None:
    """Parse CSV lines and write them out in the output format given.

    Args:
        w: the text destination, which only needs a `write` method.
        r: the source of CSV text lines, such as an open text file.
        outfmt: a normalized output-format name: `json`, `jsons`, or `tsv`.

    Raises:
        ValueError: when the output format isn't supported.
    """

    fmt2handler = {
        'json': to_json,
        'jsons': to_jsons,
        'tsv': to_tsv,
    }

    if outfmt not in fmt2handler:
        raise ValueError(f'unsupported output format {outfmt}')
    fmt2handler[outfmt](w, reader(r, delimiter=','))


# fmt_names normalizes output-format names, and even has `self-aliases`,
# so a membership check also checks if a name is supported
fmt_names = {
    'j': 'json',
    'json': 'json',

    'jsons': 'jsons',
    'jsonstr': 'jsons',
    'jsonstrings': 'jsons',
    'json-s': 'jsons',
    'json-str': 'jsons',
    'json-strings': 'jsons',

    't': 'tsv',
    'tab': 'tsv',
    'tabs': 'tsv',
    'tsv': 'tsv',
}


def main(args: List[str]) -> int:
    """Run the app using the cmd-line args given, and return its exit code.

    Args:
        args: the cmd-line arguments, without the leading program name.

    Returns:
        0 on success, 1 on errors, and 2 when interrupted from the keyboard.
    """

    # handle standard help cmd-line options, quitting right away in that case
    if len(args) == 1 and args[0] in ('-h', '--h', '-help', '--help'):
        print(info.strip(), file=stderr)
        return 0

    # default output format is TSV
    out_format = fmt_names['tsv']

    # handle leading output-format option, if present
    if args and args[0].startswith('-'):
        s = args[0].lstrip('-').lower()
        if s in fmt_names:
            out_format = fmt_names[s]
            # skip leading arg, since it's clearly not a filepath
            args = args[1:]

    try:
        if len(args) > 1:
            raise ValueError('multiple inputs not allowed')
        path = args[0] if args else '-'

        if path == '-':
            handle_input(stdout, stdin, out_format)
        elif seems_url(path):
            # only pay for this import when actually needed
            from urllib.request import urlopen
            with urlopen(path) as inp:
                # CSV-reader from the stdlib can't use byte-sources directly;
                # it also wants line-endings left alone, so it can handle
                # quoted items with line-breaks in them
                with TextIOWrapper(inp, encoding='utf-8', newline='') as txt:
                    handle_input(stdout, txt, out_format)
        else:
            # the CSV-reader wants line-endings left alone: see above
            with open(path, encoding='utf-8', newline='') as inp:
                handle_input(stdout, inp, out_format)
    except BrokenPipeError:
        # quit quietly, instead of showing a confusing error message
        stderr.close()
    except KeyboardInterrupt:
        return 2
    except Exception as e:
        print(f'\x1b[31m{e}\x1b[0m', file=stderr)
        return 1
    return 0


if __name__ == '__main__':
    exit(main(argv[1:]))