#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2024 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from csv import reader
from json import dumps
from math import isinf, isnan
from sys import argv, exit, stderr, stdin, stdout
from typing import Any, Callable, Iterable, Iterator, List, TextIO


info = '''
decsv [options...] [filepath/URI...]


DE-CSV turns a CSV-format (Comma-Separated Values) table into other common
data formats, like TSV (Tab-Separated Values) and JSON (JavaScript Object
Notation).

All (optional) leading options start with either single or double-dash,
and select the output format (TSV by default). Some of the options are,
shown in their single-dash form:

    -h          show this help message
    -help       show this help message

    -j          convert into JSON, parsing numbers, and using null for empty
    -json       convert into JSON, parsing numbers, and using null for empty

    -jsons      convert into JSON Strings, where values are strings or null
    -json-s     convert into JSON Strings, where values are strings or null

    -t          convert into TSV (tab-separated values) format
    -tab        convert into TSV (tab-separated values) format
    -tabs       convert into TSV (tab-separated values) format
    -tsv        convert into TSV (tab-separated values) format
'''


def narrow_value(s: str) -> Any:
    """Turn a CSV item into the most specific JSON-friendly value.

    Empty strings become None, strings which parse as finite floating-point
    numbers become floats, and anything else is returned unchanged.
    """

    # empty strings become null values
    if s == '':
        return None

    # try parsing valid floating-point values: NaN and the infinities are
    # kept as strings, since JSON has no way to represent them as numbers
    try:
        f = float(s)
    except ValueError:
        # can't narrow/parse the string, so keep it as is
        return s
    return s if isnan(f) or isinf(f) else f


def to_json(w: TextIO, rows: Iterator[List[str]],
            narrow: Callable[[str], Any] = narrow_value) -> None:
    """Write the rows given as a JSON array of objects, one per data row.

    The first row is the header, and gives the keys for all objects. Rows
    with fewer items than the header get null for their missing trailing
    values. Each item is converted by calling the `narrow` func given.

    Raises ValueError when a data row has more items than the header.
    """

    # only an exhausted input means there's no header: any other error
    # (bad CSV syntax, text-decoding errors) must reach the caller
    try:
        header = next(rows)
    except StopIteration:
        # not even a header line
        w.write('[]\n')
        return

    n = 0
    for row in rows:
        # items beyond the header have no key to be stored under
        if len(row) > len(header):
            msg = f'data row {n + 1} has {len(row)} items, ' + \
                f'but the header only has {len(header)}'
            raise ValueError(msg)

        w.write(',\n ' if n > 0 else '[\n ')

        # start each row with all values set to null, then update
        # values up to what the current row gives
        kv = dict.fromkeys(header)
        for k, s in zip(header, row):
            kv[k] = narrow(s)

        w.write(dumps(kv))
        n += 1

    # don't forget to close top-level array
    w.write('[]\n' if n == 0 else '\n]\n')


def to_jsons(w: TextIO, rows: Iterator[List[str]]) -> None:
    """Write the rows given as JSON, keeping all items given as strings."""
    return to_json(w, rows, lambda s: s)


def to_tsv(w: TextIO, rows: Iterable[List[str]]) -> None:
    """Write the rows given as lines of TSV (tab-separated values).

    Raises ValueError when any item has tabs or line-breaks, since those
    can't be part of TSV items.
    """

    for row in rows:
        for i, s in enumerate(row):
            if '\t' in s:
                msg = 'can\'t convert CSV whose items have tabs to TSV'
                raise ValueError(msg)
            if '\n' in s or '\r' in s:
                msg = 'can\'t convert CSV whose items have line-breaks to TSV'
                raise ValueError(msg)
            if i > 0:
                w.write('\t')
            w.write(s)
        w.write('\n')


def seems_url(s: str) -> bool:
    """Check if the string given starts like a supported URI."""
    protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return s.startswith(protocols)


def handle_input(w: TextIO, r: Iterable[str], outfmt: str) -> None:
    """Convert CSV lines from `r` into the output format named, writing to `w`.

    The format name must be one of the normalized names: `json`, `jsons`,
    or `tsv`. Raises ValueError for any other name.
    """

    fmt2handler = {
        'json': to_json,
        'jsons': to_jsons,
        'tsv': to_tsv,
    }

    handler = fmt2handler.get(outfmt)
    if handler is None:
        raise ValueError(f'unsupported output format {outfmt}')
    handler(w, reader(r, delimiter=','))


# fmt_names normalizes output-format names, and even has `self-aliases`,
# so a membership check also checks if a name is supported
fmt_names = {
    'j': 'json',
    'json': 'json',

    'jsons': 'jsons',
    'jsonstr': 'jsons',
    'jsonstrings': 'jsons',
    'json-s': 'jsons',
    'json-str': 'jsons',
    'json-strings': 'jsons',

    't': 'tsv',
    'tab': 'tsv',
    'tabs': 'tsv',
    'tsv': 'tsv',
}


def main() -> None:
    """Run the command-line app, using the arguments in `argv`."""

    # handle standard help cmd-line options, quitting right away in that case
    if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
        print(info.strip())
        exit(0)

    args = argv[1:]
    # default output format is TSV
    out_format = fmt_names['tsv']

    # handle leading output-format option, if present: a leading arg which
    # isn't a supported format name is kept, and handled as the input path
    if len(args) > 0 and args[0].startswith('-'):
        name = args[0].lstrip('-').lower()
        if name in fmt_names:
            out_format = fmt_names[name]
            # skip leading arg, since it's clearly not a filepath
            args = args[1:]

    try:
        if len(args) == 0:
            path = '-'
        elif len(args) == 1:
            path = args[0]
        else:
            raise ValueError('multiple inputs not allowed')

        if path == '-':
            handle_input(stdout, stdin, out_format)
        elif seems_url(path):
            from io import TextIOWrapper
            from urllib.request import urlopen
            with urlopen(path) as inp:
                # CSV-reader from the stdlib can't use byte-sources directly;
                # the text wrapper is read line by line, instead of loading
                # the whole response before converting anything
                with TextIOWrapper(inp, encoding='utf-8') as txt:
                    handle_input(stdout, txt, out_format)
        else:
            with open(path, encoding='utf-8') as inp:
                handle_input(stdout, inp, out_format)
    except BrokenPipeError:
        # quit quietly, instead of showing a confusing error message
        stderr.close()
    except KeyboardInterrupt:
        exit(2)
    except Exception as e:
        print(f'\x1b[31m{e}\x1b[0m', file=stderr)
        exit(1)


# only run the app when used as a script, so importing this file is safe
if __name__ == '__main__':
    main()