File: htmlify.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2024 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 # htmlify [options...] [filepaths/URIs...]
  27 #
  28 # Render data/files into self-contained HTML. Supported input formats are
  29 #
  30 #     - plain-text UTF-8
  31 #     - CSV
  32 #     - TSV
  33 #     - JSON
  34 #
  35 # All (optional) leading options start with either single or double-dash,
  36 # and most of them change the style/color used. Some of the options are,
  37 # shown in their single-dash form:
  38 #
  39 #     -h          show this help message
  40 #     -help       show this help message
  41 #
  42 #     -csv        read standard-input as a CSV (comma-separated values) table
  43 #     -json       read standard-input as JSON
  44 #     -tab        read standard-input as a TSV (tab-separated values) table
  45 #     -tabs       read standard-input as a TSV (tab-separated values) table
  46 #     -text       read standard-input as a plain-text prose
  47 #     -title      use the next argument as the title in the HTML output
  48 #     -tsv        read standard-input as a TSV (tab-separated values) table
  49 
  50 
  51 from csv import reader
  52 from html import escape
  53 from io import StringIO, TextIOWrapper
  54 from json import load
  55 from math import isinf, isnan
  56 from re import compile
  57 from sys import argv, exit, stderr, stdin, stdout
  58 from typing import List
  59 from urllib.parse import urlparse, urlunparse
  60 from urllib.request import urlopen
  61 
  62 
  63 # info is the help message shown when asked to
  64 info = '''
  65 htmlify [options...] [filepaths/URIs...]
  66 
  67 Render data/files into self-contained HTML. Supported input formats are
  68 
  69     - plain-text UTF-8
  70     - CSV
  71     - TSV
  72     - JSON
  73 
  74 All (optional) leading options start with either single or double-dash,
  75 and most of them change the style/color used. Some of the options are,
  76 shown in their single-dash form:
  77 
  78     -h          show this help message
  79     -help       show this help message
  80 
  81     -csv        read standard-input as a CSV (comma-separated values) table
  82     -json       read standard-input as JSON
  83     -tab        read standard-input as a TSV (tab-separated values) table
  84     -tabs       read standard-input as a TSV (tab-separated values) table
  85     -text       read standard-input as a plain-text prose
  86     -title      use the next argument as the title in the HTML output
  87     -tsv        read standard-input as a TSV (tab-separated values) table
  88 '''.strip()
  89 
  90 # handle standard help cmd-line options, quitting right away in that case
  91 if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
  92     print(info, file=stderr)
  93     exit(0)
  94 
  95 # links is used in func handle_normal_text_line to handle hyperlinks
  96 links = compile('''(https?|ftps?)\://[a-zA-Z0-9_%.?/&=-]+''')
  97 
  98 
  99 def fail(msg, code: int = 1) -> None:
 100     '''Show the error message given, and quit the app right away.'''
 101     print(f'\x1b[31m{msg}\x1b[0m', file=stderr)
 102     exit(code)
 103 
 104 
 105 title = ''
 106 fallback_type = 'text'
 107 start_args = 1
 108 while start_args < len(argv) and argv[start_args].startswith('-'):
 109     l = argv[start_args].lstrip('-').lower()
 110     if l in ('csv'):
 111         fallback_type = 'csv'
 112         start_args += 1
 113         continue
 114     if l in ('json'):
 115         fallback_type = 'json'
 116         start_args += 1
 117         continue
 118     if l in ('tab', 'table', 'tabs', 'tsv'):
 119         fallback_type = 'tsv'
 120         start_args += 1
 121         continue
 122     if l in ('text', 'plaintext', 'plain-text', 'prose'):
 123         fallback_type = 'text'
 124         start_args += 1
 125         continue
 126     if l in ('title'):
 127         if start_args + 1 >= len(argv):
 128             fail('missing actual title in cmd-line arguments', 1)
 129         title = argv[start_args + 1]
 130         start_args += 2
 131         continue
 132     break
 133 args = argv[start_args:]
 134 
 135 
 136 def is_base_64(n: int) -> bool:
 137     '''Help build base64-byte-checker lookup tables.'''
 138 
 139     if ord('0') <= n <= ord('9'):
 140         return True
 141     if ord('A') <= n <= ord('Z'):
 142         return True
 143     if ord('a') <= n <= ord('z'):
 144         return True
 145     return n in (ord('+'), ord('/'), ord('='))
 146 
 147 
 148 # valid_base64 helps func seems_supported_data_uri do its job quickly
 149 valid_base64 = tuple(is_base_64(n) for n in range(256))
 150 
 151 
 152 def guess_type(path: str, fallback_type: str) -> str:
 153     l = path.lower()
 154     if l.endswith('.csv'):
 155         return 'csv'
 156     if l.endswith('.json'):
 157         return 'json'
 158     if l.endswith('.tsv'):
 159         return 'tsv'
 160     if l.endswith('.txt'):
 161         return 'text'
 162     return fallback_type
 163 
 164 
 165 def handle_data(src, kind: str) -> None:
 166     '''Render data according to the data-type given.'''
 167 
 168     disp = {
 169         'csv': handle_csv,
 170         'json': handle_json,
 171         'text': handle_text,
 172         'tsv': handle_tsv,
 173     }
 174 
 175     if kind in disp:
 176         return disp[kind](src)
 177     raise ValueError(f'unsupported format {kind}')
 178 
 179 
 180 def handle_json(src) -> None:
 181     '''Dispatched func handler to render JSON data'''
 182     print('<div>', end='')
 183     emit_value(load(src))
 184     print('</div>')
 185 
 186 
 187 def emit_value(v) -> None:
 188     '''Handle recursive behavior for func handle_json'''
 189 
 190     if v is None:
 191         print('<span class="u">null</span>', end='')
 192     elif isinstance(v, bool):
 193         if v:
 194             print('<span class="b">true</span>', end='')
 195         else:
 196             print('<span class="b">false</span>', end='')
 197     elif isinstance(v, int):
 198         emit_number(v)
 199     elif isinstance(v, float):
 200         if isnan(v) or isinf(v):
 201             raise ValueError(f'JSON doesn\'t support {v} values')
 202         emit_number(v)
 203     elif isinstance(v, str):
 204         if v.startswith('https://') or v.startswith('http://'):
 205             print(anchorize(v), end='')
 206         else:
 207             s = escape(v).replace(' ', '&nbsp;').replace('\n', '<br>\n')
 208             print(s, end='')
 209     elif isinstance(v, list):
 210         print('<ul>', end='')
 211         for e in v:
 212             print('<li>', end='')
 213             emit_value(e)
 214             print('</li>', end='')
 215         print('</ul>', end='')
 216     elif isinstance(v, dict):
 217         print('<ul>', end='')
 218         for k, e in v.items():
 219             emit_kv(k, e)
 220         print('</ul>', end='')
 221     else:
 222         raise ValueError(f'unsupported JSON type {type(v)}')
 223 
 224 def emit_number(v) -> None:
 225     '''Handle valid numbers for func emit_value.'''
 226     print(f'<span class="{detect_number_class(v)}">{v:,}</span>', end='')
 227 
 228 
 229 def detect_number_class(v) -> str:
 230     if v > 0:
 231         return 'p'
 232     elif v < 0:
 233         return 'n'
 234     else:
 235         return 'z'
 236 
 237 
 238 def emit_kv(k: str, v) -> None:
 239     '''Handle key-value pairs for objects used in func emit_value.'''
 240 
 241     if isinstance(v, list) or isinstance(v, dict):
 242         # handle composite values, by showing them nested/indented
 243         print(f'<li><span class="k">{escape(k)}</span><div>', end='')
 244         emit_value(v)
 245         print('</div></li>', end='')
 246     else:
 247         # handle simple values, by showing them beside their keys
 248         print(f'<li><span class="k">{escape(k)}</span>&nbsp;', end='')
 249         emit_value(v)
 250         print('</li>', end='')
 251 
 252 
 253 def handle_text(src) -> None:
 254     '''Dispatched func handler to render plain-text prose'''
 255 
 256     prev = ''
 257     # buf is a reusable string-buffer for func handle_normal_text_line
 258     buf = StringIO()
 259 
 260     for line in src:
 261         line = line.rstrip('\r\n').rstrip('\n').rstrip()
 262         if prev == '' and line == '':
 263             # keep skipping empty(ish) lines in runs of such lines
 264             continue
 265 
 266         if line == '' and prev != '':
 267             print('</p>')
 268         if prev == '':
 269             print('<p>')
 270         prev = line
 271 
 272         http = line.startswith('https://') or line.startswith('http://')
 273         if http and not (' ' in line):
 274             print(anchorize(line), end='')
 275             print('<br>')
 276             continue
 277 
 278         if seems_supported_data_uri(line):
 279             handle_data_uri(line)
 280             print('<br>')
 281             continue
 282 
 283         handle_normal_text_line(line, buf)
 284 
 285     # don't forget to close last paragraph
 286     if line != '':
 287         print('</p>')
 288 
 289 
 290 def handle_normal_text_line(line: str, buf: StringIO) -> None:
 291     '''Handle prose lines for func handle_text.'''
 292 
 293     # get rid of previous buffer content
 294     buf.truncate(0)
 295     buf.seek(0)
 296 
 297     # j keeps track of end of detected hyperlinks, and is used outside
 298     # the regex-match loop to detect trailing parts in lines
 299     j = 0
 300 
 301     # matches is to keep track of whether any matches occurred
 302     matches = 0
 303 
 304     for m in links.finditer(line):
 305         matches += 1
 306         # remember previous index-end, used to emit the part before
 307         # the current match
 308         start = j
 309 
 310         i = m.start()
 311         j = m.end()
 312         # remember part before match
 313         buf.write(escape(line[start:i]))
 314         # replace matched hyperlink with an html anchor tag for it
 315         href = line[i:j]
 316         buf.write(f'<a href="{href}">{href}</a>')
 317 
 318     if matches == 0:
 319         # avoid emptying lines with no matches
 320         print(f'{escape(line)}<br>')
 321         return
 322 
 323     # no need to copy the line when it's not changing anyway
 324     if j > 0:
 325         # don't forget the last part of the line, or the whole line
 326         buf.write(escape(line[j:]))
 327 
 328     buf.write('<br>')
 329     print(buf.getvalue())
 330 
 331 
 332 def seems_supported_data_uri(s: str) -> bool:
 333     if not s.startswith('data:'):
 334         return False
 335 
 336     if s.startswith('data:image/'):
 337         pass
 338     elif s.startswith('data:audio/'):
 339         pass
 340     elif s.startswith('data:video/'):
 341         pass
 342     else:
 343         return False
 344 
 345     base64_index = s.find(';base64,')
 346     if base64_index < 0:
 347         return False
 348 
 349     # check all payload bytes
 350     start = base64_index + len(';base64,')
 351     for i, c in enumerate(s):
 352         if i >= start and not valid_base64[ord(c)]:
 353             return False
 354     return True
 355 
 356 
 357 def handle_data_uri(s: str) -> bool:
 358     if s.startswith('data:image/'):
 359         print('<img src="', end='')
 360         print(s, end='')
 361         print('">', end='')
 362         return True
 363     elif s.startswith('data:audio/'):
 364         print('<audio controls src="', end='')
 365         print(s, end='')
 366         print('">', end='')
 367         return True
 368     elif s.startswith('data:video/'):
 369         print('<video controls src="', end='')
 370         print(s, end='')
 371         print('">', end='')
 372         return True
 373     else:
 374         return False
 375 
 376 
 377 def anchorize(href: str) -> str:
 378     '''Turn a URI into an escaped anchor tag.'''
 379     rel = 'rel="noopener noreferrer"'
 380     return f'<a {rel} href="{urlunparse(urlparse(href))}">{escape(href)}</a>'
 381 
 382 
 383 def handle_table_rows(src) -> None:
 384     '''Common logic used by funcs handle_csv and handle_tsv'''
 385 
 386     n = 0
 387     header = []
 388 
 389     for items in src:
 390         if len(items) == 0:
 391             continue
 392 
 393         if n == 0:
 394             print('<table>')
 395             print('<thead>')
 396             header = items
 397             handle_header(header)
 398             print('</thead>')
 399             n += 1
 400             continue
 401 
 402         if n == 1:
 403             print('<tbody>')
 404         n += 1
 405 
 406         handle_row(items)
 407 
 408     if n >= 5:
 409         print('<tfoot>')
 410         handle_header(items)
 411         print('</tfoot>')
 412 
 413     if n > 1:
 414         print('</tbody>')
 415     if n > 0:
 416         print('</table>')
 417 
 418 
 419 def handle_row(items: List[str]) -> None:
 420         '''Simplify control-flow of func handle_table_rows'''
 421 
 422         print('<tr>', end='')
 423 
 424         for s in items:
 425             if s == '':
 426                 print('<td class="e">empty</td>')
 427                 continue
 428 
 429             if s.startswith('https://') or s.startswith('http://'):
 430                 print(f'<td>{anchorize(s)}</td>', end='')
 431                 continue
 432             if seems_supported_data_uri(s):
 433                 print('<td>', end='')
 434                 handle_data_uri(s)
 435                 print('</td>', end='')
 436                 continue
 437 
 438             padded = s.startswith(' ') or s.endswith(' ')
 439 
 440             try:
 441                 f = float(s)
 442                 if isnan(f) or isinf(f):
 443                     print(f'<td>{escape(s)}</td>', end='')
 444                 else:
 445                     c = detect_number_class(f)
 446                     if padded:
 447                         print(f'<td class="{c} w">{f:,}</td>', end='')
 448                     else:
 449                         print(f'<td class="{c}">{f:,}</td>', end='')
 450             except:
 451                 if padded:
 452                     print(f'<td class="w">{escape(s)}</td>', end='')
 453                 else:
 454                     print(f'<td>{escape(s)}</td>', end='')
 455 
 456         print('</tr>')
 457 
 458 
 459 def handle_header(items: List[str]) -> None:
 460     '''Handle inner contents of both header and footer table rows.'''
 461 
 462     for s in items:
 463         if s == '':
 464             print('<td class="e">empty</td>')
 465             continue
 466 
 467         if s.startswith(' ') or s.endswith(' '):
 468             print(f'<th class="w">{s}</th>')
 469         else:
 470             print(f'<th>{s}</th>')
 471 
 472 
 473 def handle_csv(src) -> None:
 474     '''Dispatched func handler to render CSV tables'''
 475 
 476     def row_iter():
 477         row_reader = reader(src, delimiter=',')
 478         for row in row_reader:
 479             yield row
 480     return handle_table_rows(row_iter())
 481 
 482 
 483 def handle_tsv(src) -> None:
 484     '''Dispatched func handler to render TSV tables'''
 485 
 486     def row_iter():
 487         for line in src:
 488             line = line.rstrip('\r\n').rstrip('\n')
 489             if line == '':
 490                 continue
 491             yield line.split('\t')
 492     return handle_table_rows(row_iter())
 493 
 494 
 495 def seems_url(s: str) -> bool:
 496     for prot in ('https://', 'http://', 'file://', 'ftp://', 'data:'):
 497         if s.startswith(prot):
 498             return True
 499     return False
 500 
 501 
# style is the `inner` CSS used inside the style tag, and handles all
# visual styles for all supported input types; its short class names are
# the ones emitted by the various rendering funcs, and stand for
#
#     .u  null values             .b  booleans
#     .k  object keys             .e  empty table cells
#     .n  negative numbers        .z  zeros
#     .p  positive numbers        .w  cells starting/ending with spaces
#
# only leading/trailing line-feeds are stripped, to keep all lines indented
style = '''
        body { margin: auto; line-height: 1.5rem; }
        a { color: steelblue; text-decoration: none; }
        p { display: block; margin: auto; width: 80ch; }
        table { margin: 2rem auto; border-collapse: collapse; }
        thead>* { position: sticky; top: 0; background-color: white; }
        tfoot th { user-select: none; }
        th, td {
            padding: 0.1rem 1ch; min-width: 4ch;
            border-bottom: solid thin transparent;
        }
        tr:nth-child(5n) td { border-bottom: solid thin #ccc; }
        audio, video { display: block; margin: auto; min-width: 80ch; }
        img { display: block; margin: auto; }
        .u { color: #aaa; }
        .b { color: royalblue; }
        .k { color: blueviolet; }
        .n { text-align: right; color: rgb(194, 78, 57); }
        .z { text-align: right; color: #0083e4; }
        .p { text-align: right; color: rgb(19, 87, 19); }
        .e {
            min-width: 2ch; background-color: #f4f4f4; color: gray;
            text-align: center; font-style: italic;
        }
        .w { min-width: 2ch; background-color: yellow; }
'''.strip('\n')
 530 
 531 try:
 532     if args.count('-') > 1:
 533         msg = 'reading from `-` (standard input) more than once not allowed'
 534         raise ValueError(msg)
 535 
 536     stdout.reconfigure(newline='\n', encoding='utf-8')
 537 
 538     print('<!DOCTYPE html>')
 539     print('<html lang="en">')
 540     print('<head>')
 541     print('    <meta charset="UTF-8">')
 542     print('    <link rel="icon" href="data:,">')
 543     cattr = 'content="width=device-width, initial-scale=1.0"'
 544     print(f'    <meta name="viewport" {cattr}>')
 545     if title != '':
 546         print(f'    <title>{escape(title)}</title>')
 547     print('    <style>')
 548     print(style)
 549     print('    </style>')
 550     print('</head>')
 551     print('<body>')
 552 
 553     for path in args:
 554         if path == '-':
 555             handle_data(stdin, fallback_type)
 556             continue
 557 
 558         if seems_url(path):
 559             with urlopen(path) as inp:
 560                 ctype = inp.getheader('Content-Type')
 561                 if not isinstance(ctype, str):
 562                     ctype = ''
 563 
 564                 # try to detect response encoding, if given
 565                 enc = 'utf-8'
 566                 i = ctype.find('charset=')
 567                 if i >= 0:
 568                     enc = ctype[i + len('charset='):]
 569 
 570                 # CSV-reader from the stdlib can't use byte-sources
 571                 # directly: just adapt reader no matter the output
 572                 with TextIOWrapper(inp, encoding=enc) as txt:
 573                     if ctype.startswith('text/plain'):
 574                         handle_data(txt, 'text')
 575                     elif ctype.startswith('text/csv'):
 576                         handle_data(txt, 'csv')
 577                     elif ctype.startswith('text/tsv'):
 578                         handle_data(txt, 'tsv')
 579                     elif ctype.startswith('text/json'):
 580                         handle_data(txt, 'json')
 581                     elif ctype.startswith('application/json'):
 582                         handle_data(txt, 'json')
 583                     else:
 584                         handle_data(txt, fallback_type)
 585             continue
 586 
 587         with open(path, 'r') as inp:
 588             handle_data(inp, guess_type(path, fallback_type))
 589 
 590     if len(args) == 0:
 591         handle_data(stdin, fallback_type)
 592 
 593     print('</body>')
 594     print('</html>')
 595 except BrokenPipeError:
 596     # quit quietly, instead of showing a confusing error message
 597     stderr.flush()
 598     stderr.close()
 599 except KeyboardInterrupt:
 600     # quit quietly, instead of showing a confusing error message
 601     stderr.flush()
 602     stderr.close()
 603     exit(2)
 604 except Exception as e:
 605     fail(e, 1)