#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2024 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from io import TextIOWrapper
from json import dumps
from math import isinf, isnan
from sys import argv, exit, stderr, stdin, stdout
from typing import Any, Callable, Iterable, List, TextIO


info = '''
detsv [options...] [filepath/URI...]


DE-TSV turns a TSV-format (Tab-Separated Values) table into other common
data formats, like JSON (JavaScript Object Notation), or `JSONS` (JSON
Strings).

The output is JSONS by default, since keeping the original input-data as
strings can't accidentally mangle values.

All (optional) leading options start with either single or double-dash,
and select the output format. Some of the options are, shown in their
single-dash form:

    -h          show this help message
    -help       show this help message

    -j          convert into JSON, parsing numbers, and using null for empty
    -json       convert into JSON, parsing numbers, and using null for empty

    -jsons      convert into JSON Strings, where values are strings or null
    -json-s     convert into JSON Strings, where values are strings or null
'''


def _split_row(line: str) -> List[str]:
    """Drop the trailing line-ending from a TSV line and split it on tabs."""
    # rstrip treats its argument as a set of characters, so a single call
    # handles both `\n` and `\r\n` endings
    return line.rstrip('\r\n').split('\t')


def narrow_value(s: str) -> Any:
    """Turn a TSV field into the most specific JSON-friendly value.

    Empty strings become None, strings which parse as finite numbers become
    floats, and anything else is returned unchanged.
    """

    # empty strings become null values
    if s == '':
        return None

    # try parsing valid floating-point values: NaN and the infinities are
    # kept as strings, since JSON has no way to represent them
    try:
        f = float(s)
    except ValueError:
        # can't narrow/parse the string, so keep it as is
        return s
    return s if isnan(f) or isinf(f) else f


def to_json(
    w: TextIO, src: Iterable[str], narrow: Callable[[str], Any] = narrow_value,
) -> None:
    """Write the TSV lines from `src` to `w` as a JSON array of objects.

    The first line is the header, whose items are the keys of all objects;
    rows with fewer items than the header get nulls for the missing ones.
    Each field is transformed by calling `narrow` before being emitted.

    `src` can be any iterable of lines, such as a file object or a list.

    Raises ValueError when a row has more items than the header.
    """

    # explicitly get an iterator, so sequences (lists) work too, instead of
    # only things which already are iterators, like file objects
    lines = iter(src)

    try:
        header = _split_row(next(lines))
    except StopIteration:
        # not even a header line
        w.write('[]\n')
        return

    n = 0
    for line in lines:
        row = _split_row(line)
        if len(row) > len(header):
            msg = f'data row {n + 1} has {len(row)} fields, ' + \
                f'but the header only has {len(header)}'
            raise ValueError(msg)

        # start each row with all keys set to null, then update values up
        # to what the current row gives
        kv = dict.fromkeys(header)
        for key, s in zip(header, row):
            kv[key] = narrow(s)

        w.write(',\n ' if n > 0 else '[\n ')
        w.write(dumps(kv))
        n += 1

    # don't forget to close top-level array
    w.write('[]\n' if n == 0 else '\n]\n')


def to_jsons(w: TextIO, src: Iterable[str]) -> None:
    """Write the TSV lines from `src` to `w` as JSON, keeping fields as strings."""
    return to_json(w, src, lambda s: s)


def to_tsv(w: TextIO, src: Iterable[str]) -> None:
    """Write the TSV lines from `src` to `w`, normalizing line-endings to `\\n`."""
    for line in src:
        w.write('\t'.join(_split_row(line)))
        w.write('\n')


def seems_url(s: str) -> bool:
    """Check whether a string starts with one of the supported URI schemes."""
    protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return s.startswith(protocols)


def handle_input(w: TextIO, r: Iterable[str], outfmt: str) -> None:
    """Convert the TSV lines from `r` into the format named `outfmt`.

    Raises ValueError when the (normalized) format name isn't supported.
    """

    fmt2handler = {
        'json': to_json,
        'jsons': to_jsons,
    }

    handler = fmt2handler.get(outfmt)
    if handler is None:
        raise ValueError(f'unsupported output format {outfmt}')
    handler(w, r)


# fmt_names normalizes output-format names, and even has `self-aliases`,
# so a membership check also checks if a name is supported
fmt_names = {
    'j': 'json',
    'json': 'json',

    'jsons': 'jsons',
    'jsonstr': 'jsons',
    'jsonstrings': 'jsons',
    'json-s': 'jsons',
    'json-str': 'jsons',
    'json-strings': 'jsons',
}


def main() -> None:
    """Run the command-line app, using the arguments given to the script."""

    # handle standard help cmd-line options, quitting right away in that case
    if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
        print(info.strip(), file=stderr)
        exit(0)

    args = argv[1:]
    # default output format is JSONS, since it can't accidentally mangle data
    out_format = fmt_names['jsons']

    # handle leading output-format option, if present
    if len(args) > 0 and args[0].startswith('-'):
        s = args[0].lstrip('-').lower()
        if s in fmt_names:
            out_format = fmt_names[s]
            # skip leading arg, since it's clearly not a filepath
            args = args[1:]

    try:
        if len(args) == 0:
            path = '-'
        elif len(args) == 1:
            path = args[0]
        else:
            raise ValueError('multiple inputs not allowed')

        if path == '-':
            handle_input(stdout, stdin, out_format)
        elif seems_url(path):
            from urllib.request import urlopen
            with urlopen(path) as inp:
                # can't use byte-sources directly during string-processing
                with TextIOWrapper(inp, encoding='utf-8') as txt:
                    # stream lines as they arrive, instead of loading all
                    # of them first
                    handle_input(stdout, txt, out_format)
        else:
            with open(path, encoding='utf-8') as inp:
                handle_input(stdout, inp, out_format)
    except BrokenPipeError:
        # quit quietly, instead of showing a confusing error message
        stderr.close()
    except KeyboardInterrupt:
        exit(2)
    except Exception as e:
        print(f'\x1b[31m{e}\x1b[0m', file=stderr)
        exit(1)


if __name__ == '__main__':
    main()