File: htmlify.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2020-2025 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 from html import escape
  27 from io import StringIO, TextIOWrapper
  28 from re import compile
  29 from sys import argv, exit, stderr, stdin
  30 from urllib.parse import urlparse, urlunparse
  31 
  32 
  33 info = '''
  34 htmlify [options...] [filepaths/URIs...]
  35 
  36 
  37 Render plain-text prose into self-contained HTML. Lines which are just a
  38 valid data-URI are turned into pictures, audio, or even video elements.
  39 
  40 All HTTP(s) URIs are autodetected and rendered as hyperlinks, even when
  41 lines have multiple URIs in them.
  42 
  43 If a title isn't given from the cmd-line options, the first line is used
  44 as the title.
  45 
  46 All (optional) leading options start with either single or double-dash,
  47 and most of them change the style/color used. Some of the options are,
  48 shown in their single-dash form:
  49 
  50     -h          show this help message
  51     -help       show this help message
  52 
  53     -title      use the next argument as the title in the HTML output
  54 '''
  55 
  56 # handle standard help cmd-line options, quitting right away in that case
  57 if len(argv) > 1 and argv[1] in ('-h', '--h', '-help', '--help'):
  58     print(info.strip())
  59     exit(0)
  60 
  61 # links is used in func handle_normal_text_line to handle hyperlinks
  62 links = compile('''(https?|ftps?)://[a-zA-Z0-9_%.,?/&=#-]+''')
  63 
  64 # style is the `inner` CSS used inside the style tag, and handles all
  65 # visual styles for all supported input types
  66 style = '''
  67         body {
  68             margin: 1rem auto 2rem auto;
  69             padding: 0.25rem;
  70             font-size: 1.1rem;
  71             line-height: 1.8rem;
  72             font-family: sans-serif;
  73 
  74             max-width: 95vw;
  75             /* width: max-content; */
  76             width: fit-content;
  77 
  78             box-sizing: border-box;
  79             display: block;
  80         }
  81 
  82         a {
  83             color: steelblue;
  84             text-decoration: none;
  85         }
  86 
  87         p {
  88             display: block;
  89             margin: auto;
  90             max-width: 80ch;
  91         }
  92 
  93         img {
  94             margin: none;
  95         }
  96 
  97         audio {
  98             width: 60ch;
  99         }
 100 
 101         table {
 102             margin: 2rem auto;
 103             border-collapse: collapse;
 104         }
 105 
 106         thead>* {
 107             position: sticky;
 108             top: 0;
 109             background-color: white;
 110         }
 111 
 112         tfoot th {
 113             user-select: none;
 114         }
 115 
 116         th, td {
 117             padding: 0.1rem 1ch;
 118             min-width: 4ch;
 119             border-bottom: solid thin transparent;
 120         }
 121 
 122         tr:nth-child(5n) td {
 123             border-bottom: solid thin #ccc;
 124         }
 125 
 126         .monospace {
 127             font-family: monospace;
 128         }
 129 '''.strip('\n')
 130 
 131 
 132 def fail(msg, code: int = 1) -> None:
 133     'Show the error message given, and quit the app right away.'
 134     print(f'\x1b[31m{msg}\x1b[0m', file=stderr)
 135     exit(code)
 136 
 137 
 138 title = ''
 139 monospace = False
 140 start_args = 1
 141 while start_args < len(argv) and argv[start_args].startswith('-'):
 142     if argv[start_args] in ('-title', '--title'):
 143         if start_args + 1 >= len(argv):
 144             fail('missing actual title in cmd-line arguments', 1)
 145         title = escape(argv[start_args + 1])
 146         start_args += 2
 147         continue
 148     if argv[start_args] in ('-mono', '--mono', '-monospace', '--monospace'):
 149         monospace = True
 150         start_args += 1
 151         continue
 152     if argv[start_args] == '--':
 153         start_args += 1
 154         break
 155     break
 156 args = argv[start_args:]
 157 
 158 
 159 def is_base64(n: int) -> bool:
 160     'Help build base64-byte-checker lookup tables.'
 161 
 162     if ord('0') <= n <= ord('9'):
 163         return True
 164     if ord('A') <= n <= ord('Z'):
 165         return True
 166     if ord('a') <= n <= ord('z'):
 167         return True
 168     return n in (ord('+'), ord('/'), ord('='))
 169 
 170 
 171 # valid_base64 helps func seems_supported_data_uri do its job quickly
 172 valid_base64 = tuple(is_base64(n) for n in range(256))
 173 
 174 
 175 def start_page(title: str, mono: bool) -> None:
 176     print('<!DOCTYPE html>')
 177     print('<html lang="en">')
 178     print('<head>')
 179     print('    <meta charset="UTF-8">')
 180     print('    <link rel="icon" href="data:,">')
 181     cattr = 'content="width=device-width, initial-scale=1.0"'
 182     print(f'    <meta name="viewport" {cattr}>')
 183     if title:
 184         print(f'    <title>{escape(title)}</title>')
 185     print('    <style>')
 186     print(style)
 187     print('    </style>')
 188     print('</head>')
 189     print('<body class="monospace">' if mono else '<body>')
 190 
 191 
 192 def shorten(s: str, maxchars: int) -> str:
 193     return s if len(s) <= maxchars else s[:maxchars]
 194 
 195 
 196 def handle_text(src, title: str, first: bool, mono: bool) -> None:
 197     'Render plain-text prose.'
 198 
 199     prev = ''
 200     # buf is a reusable string-buffer for func handle_normal_text_line
 201     buf = StringIO()
 202     num_lines = 0
 203 
 204     for i, line in enumerate(src):
 205         line = line.rstrip('\r\n').rstrip('\n').rstrip()
 206         if not (prev or line):
 207             # keep skipping empty(ish) lines in runs of such lines
 208             continue
 209 
 210         num_lines += 1
 211         if first and num_lines == 1:
 212             if title:
 213                 start_page(shorten(title, 100), mono)
 214             else:
 215                 start_page(shorten(line, 100), mono)
 216                 continue
 217 
 218         if (not line) and prev:
 219             print('</p>')
 220         if not prev:
 221             print('<p>')
 222         prev = line
 223 
 224         if seems_supported_data_uri(line):
 225             handle_data_uri(line)
 226             print('<br>')
 227         else:
 228             handle_normal_text_line(line, buf)
 229 
 230     # don't forget to close last paragraph
 231     if line:
 232         print('</p>')
 233 
 234 
 235 def handle_normal_text_line(line: str, buf: StringIO) -> None:
 236     'Handle prose lines for func handle_text.'
 237 
 238     # get rid of previous buffer content
 239     buf.truncate(0)
 240     buf.seek(0)
 241 
 242     # j keeps track of end of detected hyperlinks, and is used outside
 243     # the regex-match loop to detect trailing parts in lines
 244     j = 0
 245 
 246     # matches is to keep track of whether any matches occurred
 247     matches = 0
 248 
 249     for m in links.finditer(line):
 250         matches += 1
 251         # remember previous index-end, used to emit the part before
 252         # the current match
 253         start = j
 254 
 255         i = m.start()
 256         j = m.end()
 257         # remember part before match
 258         buf.write(escape(line[start:i]))
 259         # replace matched hyperlink with an html anchor tag for it
 260         href = line[i:j]
 261         buf.write(f'<a href="{href}">{href}</a>')
 262 
 263     if matches == 0:
 264         # avoid emptying lines with no matches
 265         print(f'{escape(line)}<br>')
 266         return
 267 
 268     # no need to copy the line when it's not changing anyway
 269     if j > 0:
 270         # don't forget the last part of the line, or the whole line
 271         buf.write(escape(line[j:]))
 272 
 273     buf.write('<br>')
 274     print(buf.getvalue())
 275 
 276 
 277 data_uri_starts = {
 278     'data:image/': '<img src="',
 279     'data:audio/': '<audio controls src="',
 280     'data:video/': '<video controls src="',
 281 }
 282 
 283 
 284 def seems_supported_data_uri(s: str) -> bool:
 285     if not any(s.startswith(e) for e in data_uri_starts.keys()):
 286         return False
 287 
 288     base64_index = s.find(';base64,')
 289     if base64_index < 0:
 290         return False
 291 
 292     # check all payload bytes
 293     start = base64_index + len(';base64,')
 294     for i, c in enumerate(s):
 295         if i >= start and (not valid_base64[ord(c)]):
 296             return False
 297     return True
 298 
 299 
 300 def handle_data_uri(s: str) -> bool:
 301     for kind, start in data_uri_starts.items():
 302         if s.startswith(kind):
 303             print(start, end='')
 304             print(s, end='')
 305             print('">', end='')
 306             return True
 307     return False
 308 
 309 
 310 def anchorize(href: str) -> str:
 311     rel = 'rel="noopener noreferrer"'
 312     return f'<a {rel} href="{urlunparse(urlparse(href))}">{escape(href)}</a>'
 313 
 314 
 315 def seems_url(s: str) -> bool:
 316     protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
 317     return any(s.startswith(p) for p in protocols)
 318 
 319 
 320 if args.count('-') > 1:
 321     fail('reading from `-` (standard input) more than once not allowed', 1)
 322 
 323 if any(seems_url(e) for e in args):
 324     from urllib.request import urlopen
 325 
 326 try:
 327     for i, path in enumerate(args):
 328         is_first_input = i == 0
 329 
 330         if path == '-':
 331             handle_text(stdin, title, is_first_input, monospace)
 332             continue
 333 
 334         if seems_url(path):
 335             with urlopen(path) as inp:
 336                 ctype = inp.getheader('Content-Type')
 337                 if not isinstance(ctype, str):
 338                     ctype = ''
 339 
 340                 # try to detect response encoding, if given
 341                 enc = 'utf-8'
 342                 i = ctype.find('charset=')
 343                 if i >= 0:
 344                     enc = ctype[i + len('charset='):]
 345 
 346                 with TextIOWrapper(inp, encoding=enc) as txt:
 347                     handle_text(txt, title, is_first_input, monospace)
 348             continue
 349 
 350         with open(path, encoding='utf-8') as inp:
 351             handle_text(inp, title, is_first_input, monospace)
 352 
 353     if len(args) == 0:
 354         handle_text(stdin, title, True, monospace)
 355 
 356     print('</body>')
 357     print('</html>')
 358 except BrokenPipeError:
 359     # quit quietly, instead of showing a confusing error message
 360     exit(0)
 361 except KeyboardInterrupt:
 362     exit(2)
 363 except Exception as e:
 364     fail(e, 1)