#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2024 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


# htmlify [options...] [filepaths/URIs...]
#
# Render data/files into self-contained HTML. Supported input formats are
#
#     - plain-text UTF-8
#     - CSV
#     - TSV
#     - JSON
#
# All (optional) leading options start with either single or double-dash,
# and most of them change the style/color used. Some of the options are,
# shown in their single-dash form:
#
#     -h        show this help message
#     -help     show this help message
#
#     -csv      read standard-input as a CSV (comma-separated values) table
#     -json     read standard-input as JSON
#     -tab      read standard-input as a TSV (tab-separated values) table
#     -tabs     read standard-input as a TSV (tab-separated values) table
#     -text     read standard-input as a plain-text prose
#     -title    use the next argument as the title in the HTML output
#     -tsv      read standard-input as a TSV (tab-separated values) table


from csv import reader
from html import escape
from io import StringIO, TextIOWrapper
from json import load
from math import isinf, isnan
from re import compile
from sys import argv, exit, stderr, stdin, stdout
from typing import List, Tuple
from urllib.parse import urlparse, urlunparse
from urllib.request import urlopen


# info is the help message shown when asked to
info = '''
htmlify [options...] [filepaths/URIs...]

Render data/files into self-contained HTML. Supported input formats are

    - plain-text UTF-8
    - CSV
    - TSV
    - JSON

All (optional) leading options start with either single or double-dash,
and most of them change the style/color used. Some of the options are,
shown in their single-dash form:

    -h        show this help message
    -help     show this help message

    -csv      read standard-input as a CSV (comma-separated values) table
    -json     read standard-input as JSON
    -tab      read standard-input as a TSV (tab-separated values) table
    -tabs     read standard-input as a TSV (tab-separated values) table
    -text     read standard-input as a plain-text prose
    -title    use the next argument as the title in the HTML output
    -tsv      read standard-input as a TSV (tab-separated values) table
'''.strip()

# links is used in func handle_normal_text_line to handle hyperlinks; it's
# a raw string, so the pattern has no invalid string-escape sequences
links = compile(r'(https?|ftps?)://[a-zA-Z0-9_%.?/&=-]+')

# option_kinds maps (dash-less, lowercased) option names to the data-type
# they select as the fallback for inputs whose type can't be detected
option_kinds = {
    'csv': 'csv',
    'json': 'json',
    'tab': 'tsv',
    'table': 'tsv',
    'tabs': 'tsv',
    'tsv': 'tsv',
    'text': 'text',
    'plaintext': 'text',
    'plain-text': 'text',
    'prose': 'text',
}

# content_kinds maps (lowercased, parameter-less) MIME types to data-types
content_kinds = {
    'text/plain': 'text',
    'text/csv': 'csv',
    'text/tsv': 'tsv',
    'text/tab-separated-values': 'tsv',
    'text/json': 'json',
    'application/json': 'json',
}


def fail(msg, code: int = 1) -> None:
    '''Show the error message given, and quit the app right away.'''
    print(f'\x1b[31m{msg}\x1b[0m', file=stderr)
    exit(code)


def parse_options(items: List[str]) -> Tuple[str, str, List[str]]:
    '''
    Split cmd-line arguments into the leading options and everything else.

    The first item is assumed to be the app name, and is ignored. Option
    names are compared whole: a lone `-` (which stands for standard input)
    is never mistaken for an option, and stops option-parsing right away.
    A lone `--` ends the options explicitly, and isn't part of the result.

    The result is the tuple (title, fallback type, remaining arguments).
    When option `title` has no value following it, the app quits by
    calling func fail.
    '''

    title = ''
    fallback_type = 'text'
    pos = 1

    while pos < len(items) and items[pos].startswith('-'):
        arg = items[pos]
        if arg == '--':
            pos += 1
            break

        name = arg.lstrip('-').lower()
        if name in option_kinds:
            fallback_type = option_kinds[name]
            pos += 1
            continue
        if name == 'title':
            if pos + 1 >= len(items):
                fail('missing actual title in cmd-line arguments', 1)
            title = items[pos + 1]
            pos += 2
            continue
        # anything else, a lone dash included, starts the actual inputs
        break

    return title, fallback_type, items[pos:]


def is_base_64(n: int) -> bool:
    '''Help build base64-byte-checker lookup tables.'''

    if ord('0') <= n <= ord('9'):
        return True
    if ord('A') <= n <= ord('Z'):
        return True
    if ord('a') <= n <= ord('z'):
        return True
    return n in (ord('+'), ord('/'), ord('='))


# valid_base64 helps func seems_supported_data_uri do its job quickly
valid_base64 = tuple(is_base_64(n) for n in range(256))


def guess_type(path: str, fallback_type: str) -> str:
    '''Pick a data-type from a path's extension, or use the fallback.'''

    lower = path.lower()
    if lower.endswith('.csv'):
        return 'csv'
    if lower.endswith('.json'):
        return 'json'
    if lower.endswith('.tsv'):
        return 'tsv'
    if lower.endswith('.txt'):
        return 'text'
    return fallback_type


def handle_data(src, kind: str) -> None:
    '''
    Render data according to the data-type given.

    Raises ValueError when the data-type isn't one of the supported ones.
    '''

    disp = {
        'csv': handle_csv,
        'json': handle_json,
        'text': handle_text,
        'tsv': handle_tsv,
    }

    if kind in disp:
        return disp[kind](src)
    raise ValueError(f'unsupported format {kind}')


def handle_json(src) -> None:
    '''Dispatched func handler to render JSON data'''
    print('<div>', end='')
    emit_value(load(src))
    print('</div>')


def emit_value(v) -> None:
    '''Handle recursive behavior for func handle_json'''

    if v is None:
        print('<span class="u">null</span>', end='')
    elif isinstance(v, bool):
        # booleans are checked before integers, since bool is an int subtype
        if v:
            print('<span class="b">true</span>', end='')
        else:
            print('<span class="b">false</span>', end='')
    elif isinstance(v, int):
        emit_number(v)
    elif isinstance(v, float):
        if isnan(v) or isinf(v):
            raise ValueError(f'JSON doesn\'t support {v} values')
        emit_number(v)
    elif isinstance(v, str):
        if v.startswith(('https://', 'http://')):
            print(anchorize(v), end='')
        else:
            # NOTE(review): the first replace is a no-op as written;
            # presumably its replacement was meant to be a non-breaking
            # space, to keep runs of spaces visible - confirm
            s = escape(v).replace(' ', ' ').replace('\n', '<br>\n')
            print(s, end='')
    elif isinstance(v, list):
        print('<ul>', end='')
        for e in v:
            print('<li>', end='')
            emit_value(e)
            print('</li>', end='')
        print('</ul>', end='')
    elif isinstance(v, dict):
        print('<ul>', end='')
        for k, e in v.items():
            emit_kv(k, e)
        print('</ul>', end='')
    else:
        raise ValueError(f'unsupported JSON type {type(v)}')


def emit_number(v) -> None:
    '''Handle valid numbers for func emit_value.'''
    print(f'<span class="{detect_number_class(v)}">{v:,}</span>', end='')


def detect_number_class(v) -> str:
    '''Pick the CSS class for a number: positive, negative, or zero.'''

    if v > 0:
        return 'p'
    elif v < 0:
        return 'n'
    else:
        return 'z'


def emit_kv(k: str, v) -> None:
    '''Handle key-value pairs for objects used in func emit_value.'''

    if isinstance(v, (list, dict)):
        # handle composite values, by showing them nested/indented
        print(f'<li><span class="k">{escape(k)}</span><div>', end='')
        emit_value(v)
        print('</div></li>', end='')
    else:
        # handle simple values, by showing them beside their keys
        print(f'<li><span class="k">{escape(k)}</span> ', end='')
        emit_value(v)
        print('</li>', end='')


def handle_text(src) -> None:
    '''Dispatched func handler to render plain-text prose'''

    # prev is the previous line handled, after right-trimming it
    prev = ''
    # buf is a reusable string-buffer for func handle_normal_text_line
    buf = StringIO()

    for line in src:
        # right-trimming also gets rid of any kind of line-terminator
        line = line.rstrip()
        if prev == '' and line == '':
            # keep skipping empty(ish) lines in runs of such lines
            continue

        if line == '' and prev != '':
            print('</p>')
        if prev == '':
            print('<p>')
        prev = line

        http = line.startswith(('https://', 'http://'))
        if http and ' ' not in line:
            print(anchorize(line), end='')
            print('<br>')
            continue

        if seems_supported_data_uri(line):
            handle_data_uri(line)
            print('<br>')
            continue

        handle_normal_text_line(line, buf)

    # don't forget to close last paragraph: checking the last line handled,
    # instead of the loop variable, keeps empty inputs from failing
    if prev != '':
        print('</p>')


def handle_normal_text_line(line: str, buf: StringIO) -> None:
    '''
    Handle prose lines for func handle_text.

    The line is emitted HTML-escaped and followed by a line-break tag, with
    all hyperlinks detected in it turned into anchor tags. The buffer given
    is emptied before being used.
    '''

    # get rid of previous buffer content
    buf.truncate(0)
    buf.seek(0)

    # end keeps track of where the latest hyperlink detected ends
    end = 0

    for m in links.finditer(line):
        # don't forget the part before the current match
        buf.write(escape(line[end:m.start()]))
        end = m.end()
        # replace matched hyperlink with an html anchor tag for it; links
        # can have ampersands in them, so they need escaping too
        href = escape(m.group(0))
        buf.write(f'<a href="{href}">{href}</a>')

    # don't forget the last part of the line, or the whole line when there
    # were no hyperlinks in it
    buf.write(escape(line[end:]))
    buf.write('<br>')
    print(buf.getvalue())


def seems_supported_data_uri(s: str) -> bool:
    '''
    Check if a string seems a base64-encoded image/audio/video data-URI.

    Only the payload after the base64 marker is checked symbol-by-symbol:
    symbols beyond the lookup table can't be valid, instead of being errors.
    '''

    if not s.startswith(('data:image/', 'data:audio/', 'data:video/')):
        return False

    base64_index = s.find(';base64,')
    if base64_index < 0:
        return False

    # check all payload bytes
    payload = s[base64_index + len(';base64,'):]
    limit = len(valid_base64)
    return all(ord(c) < limit and valid_base64[ord(c)] for c in payload)


def handle_data_uri(s: str) -> bool:
    '''
    Emit the media tag for the data-URI given, if its type is supported.

    The result is whether anything was emitted. The URI is escaped, since
    func seems_supported_data_uri doesn't check what comes before the
    payload, so quotes there could end the attribute value early.
    '''

    if s.startswith('data:image/'):
        print(f'<img src="{escape(s)}">', end='')
        return True
    if s.startswith('data:audio/'):
        # unlike img, audio/video tags need closing, or whatever follows
        # them becomes their fallback content
        print(f'<audio controls src="{escape(s)}"></audio>', end='')
        return True
    if s.startswith('data:video/'):
        print(f'<video controls src="{escape(s)}"></video>', end='')
        return True
    return False


def anchorize(href: str) -> str:
    '''Turn a URI into an escaped anchor tag.'''
    rel = 'rel="noopener noreferrer"'
    # the attribute value needs escaping too, as URIs can have ampersands
    # and even double-quotes in them
    target = escape(urlunparse(urlparse(href)))
    return f'<a {rel} href="{target}">{escape(href)}</a>'


def handle_table_rows(src) -> None:
    '''
    Common logic used by funcs handle_csv and handle_tsv

    The first non-empty row is the header: tables with enough data rows
    also get the header repeated as a footer, after the table body.
    '''

    # footer_threshold is the least number of data rows needing a footer
    footer_threshold = 5

    header = None
    rows = 0

    for items in src:
        if len(items) == 0:
            continue

        if header is None:
            print('<table>')
            print('<thead>')
            header = items
            handle_header(header)
            print('</thead>')
            continue

        if rows == 0:
            print('<tbody>')
        rows += 1
        handle_row(items)

    if header is None:
        # no rows at all means no table at all
        return

    if rows > 0:
        print('</tbody>')
    if rows >= footer_threshold:
        print('<tfoot>')
        handle_header(header)
        print('</tfoot>')
    print('</table>')


def handle_row(items: List[str]) -> None:
    '''Simplify control-flow of func handle_table_rows'''

    print('<tr>', end='')

    for s in items:
        if s == '':
            print('<td class="e">empty</td>')
            continue

        if s.startswith(('https://', 'http://')):
            print(f'<td>{anchorize(s)}</td>', end='')
            continue
        if seems_supported_data_uri(s):
            print('<td>', end='')
            handle_data_uri(s)
            print('</td>', end='')
            continue

        padded = s.startswith(' ') or s.endswith(' ')

        # only the parsing is guarded, so output errors aren't swallowed
        try:
            f = float(s)
        except ValueError:
            f = None

        if f is None:
            if padded:
                print(f'<td class="w">{escape(s)}</td>', end='')
            else:
                print(f'<td>{escape(s)}</td>', end='')
        elif isnan(f) or isinf(f):
            print(f'<td>{escape(s)}</td>', end='')
        else:
            c = detect_number_class(f)
            if padded:
                print(f'<td class="{c} w">{f:,}</td>', end='')
            else:
                print(f'<td class="{c}">{f:,}</td>', end='')

    print('</tr>')


def handle_header(items: List[str]) -> None:
    '''Handle inner contents of both header and footer table rows.'''

    for s in items:
        if s == '':
            print('<td class="e">empty</td>')
            continue

        if s.startswith(' ') or s.endswith(' '):
            print(f'<th class="w">{escape(s)}</th>')
        else:
            print(f'<th>{escape(s)}</th>')


def handle_csv(src) -> None:
    '''Dispatched func handler to render CSV tables'''
    return handle_table_rows(reader(src, delimiter=','))


def handle_tsv(src) -> None:
    '''Dispatched func handler to render TSV tables'''

    def row_iter():
        for line in src:
            line = line.rstrip('\r\n')
            if line == '':
                continue
            yield line.split('\t')
    return handle_table_rows(row_iter())


def seems_url(s: str) -> bool:
    '''Check if a string starts like one of the URI types supported.'''
    return s.startswith(('https://', 'http://', 'file://', 'ftp://', 'data:'))


def handle_uri(uri: str, fallback_type: str) -> None:
    '''
    Load and render the data a URI refers to.

    The data-type comes from the content-type header, when it's one of the
    types in content_kinds: otherwise it's guessed from the URI's path,
    using the fallback type given as the last resort.
    '''

    with urlopen(uri) as inp:
        # all response types have attribute `headers`, unlike `getheader`
        headers = inp.headers

        # avoid the implicit text/plain default when the header is missing
        ctype = ''
        if isinstance(headers.get('Content-Type'), str):
            ctype = headers.get_content_type()

        # try to detect response encoding, if given
        enc = headers.get_content_charset() or 'utf-8'

        kind = content_kinds.get(ctype)
        if kind is None:
            kind = guess_type(urlparse(uri).path, fallback_type)

        # CSV-reader from the stdlib can't use byte-sources
        # directly: just adapt reader no matter the output
        with TextIOWrapper(inp, encoding=enc) as txt:
            handle_data(txt, kind)


# style is the `inner` CSS used inside the style tag, and handles all
# visual styles for all supported input types
style = '''
        body { margin: auto; line-height: 1.5rem; }
        a { color: steelblue; text-decoration: none; }
        p { display: block; margin: auto; width: 80ch; }
        table { margin: 2rem auto; border-collapse: collapse; }
        thead>* { position: sticky; top: 0; background-color: white; }
        tfoot th { user-select: none; }
        th, td {
            padding: 0.1rem 1ch; min-width: 4ch;
            border-bottom: solid thin transparent;
        }
        tr:nth-child(5n) td { border-bottom: solid thin #ccc; }
        audio, video { display: block; margin: auto; min-width: 80ch; }
        img { display: block; margin: auto; }
        .u { color: #aaa; }
        .b { color: royalblue; }
        .k { color: blueviolet; }
        .n { text-align: right; color: rgb(194, 78, 57); }
        .z { text-align: right; color: #0083e4; }
        .p { text-align: right; color: rgb(19, 87, 19); }
        .e {
            min-width: 2ch; background-color: #f4f4f4; color: gray;
            text-align: center; font-style: italic;
        }
        .w { min-width: 2ch; background-color: yellow; }
'''.strip('\n')


def main() -> None:
    '''Run the app, using the cmd-line arguments and the standard streams.'''

    # handle standard help cmd-line options, quitting right away in that case
    if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
        print(info, file=stderr)
        exit(0)

    title, fallback_type, args = parse_options(argv)

    try:
        if args.count('-') > 1:
            msg = 'reading from `-` (standard input) more than once not allowed'
            raise ValueError(msg)

        stdout.reconfigure(newline='\n', encoding='utf-8')

        print('<!DOCTYPE html>')
        print('<html lang="en">')
        print('<head>')
        print(' <meta charset="UTF-8">')
        print(' <link rel="icon" href="data:,">')
        cattr = 'content="width=device-width, initial-scale=1.0"'
        print(f' <meta name="viewport" {cattr}>')
        if title != '':
            print(f' <title>{escape(title)}</title>')
        print(' <style>')
        print(style)
        print(' </style>')
        print('</head>')
        print('<body>')

        for path in args:
            if path == '-':
                handle_data(stdin, fallback_type)
                continue

            if seems_url(path):
                handle_uri(path, fallback_type)
                continue

            # inputs are documented as UTF-8, whatever the platform default
            with open(path, 'r', encoding='utf-8') as inp:
                handle_data(inp, guess_type(path, fallback_type))

        if len(args) == 0:
            handle_data(stdin, fallback_type)

        print('</body>')
        print('</html>')
    except BrokenPipeError:
        # quit quietly, instead of showing a confusing error message
        stderr.flush()
        stderr.close()
    except KeyboardInterrupt:
        # quit quietly, instead of showing a confusing error message
        stderr.flush()
        stderr.close()
        exit(2)
    except Exception as e:
        fail(e, 1)


if __name__ == '__main__':
    main()