File: htmlify.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2026 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 from html import escape
  27 from io import StringIO, TextIOWrapper
  28 from re import compile
  29 from sys import argv, exit, stderr, stdin
  30 from urllib.parse import urlparse, urlunparse
  31 
  32 
  33 info = '''
  34 htmlify [options...] [filepaths/URIs...]
  35 
  36 
  37 Render plain-text prose into self-contained HTML. Lines which are just a
  38 valid data-URI are turned into pictures, audio, or even video elements.
  39 
  40 All HTTP(s) URIs are autodetected and rendered as hyperlinks, even when
  41 lines have multiple URIs in them.
  42 
  43 If a title isn't given from the cmd-line options, the first line is used
  44 as the title.
  45 
  46 All (optional) leading options start with either single or double-dash,
  47 and most of them change the style/color used. Some of the options are,
  48 shown in their single-dash form:
  49 
  50     -h, -help            show this help message
  51     -mono, -monospace    use a monospace font for text
  52     -t, -title           use the next argument as the webpage title
  53 '''
  54 
  55 # handle standard help cmd-line options, quitting right away in that case
  56 if len(argv) > 1 and argv[1] in ('-h', '--h', '-help', '--help'):
  57     print(info.strip())
  58     exit(0)
  59 
  60 # links is used in func handle_normal_text_line to handle hyperlinks
  61 links = compile('''(https?|ftps?)://[a-zA-Z0-9_%.,?/&=#-]+''')
  62 
  63 # style is the `inner` CSS used inside the style tag, and handles all
  64 # visual styles for all supported input types
  65 style = '''
  66         body {
  67             margin: 1rem auto 2rem auto;
  68             padding: 0.25rem;
  69             font-size: 1.1rem;
  70             line-height: 1.8rem;
  71             font-family: sans-serif;
  72 
  73             max-width: 95vw;
  74             /* width: max-content; */
  75             width: fit-content;
  76 
  77             box-sizing: border-box;
  78             display: block;
  79         }
  80 
  81         a {
  82             color: steelblue;
  83             text-decoration: none;
  84         }
  85 
  86         p {
  87             display: block;
  88             margin: auto;
  89             max-width: 80ch;
  90         }
  91 
  92         img {
  93             margin: none;
  94         }
  95 
  96         audio {
  97             width: 60ch;
  98         }
  99 
 100         table {
 101             margin: 2rem auto;
 102             border-collapse: collapse;
 103         }
 104 
 105         thead>* {
 106             position: sticky;
 107             top: 0;
 108             background-color: white;
 109         }
 110 
 111         tfoot th {
 112             user-select: none;
 113         }
 114 
 115         th, td {
 116             padding: 0.1rem 1ch;
 117             min-width: 4ch;
 118             border-bottom: solid thin transparent;
 119         }
 120 
 121         tr:nth-child(5n) td {
 122             border-bottom: solid thin #ccc;
 123         }
 124 
 125         .monospace {
 126             font-family: monospace;
 127         }
 128 '''.strip('\n')
 129 
 130 
 131 # fail shows the error message given, and quits the app right away
 132 def fail(msg, code: int = 1) -> None:
 133     print(str(msg), file=stderr)
 134     exit(code)
 135 
 136 
 137 title = ''
 138 monospace = False
 139 start_args = 1
 140 while start_args < len(argv) and argv[start_args].startswith('-'):
 141     if argv[start_args] in ('-t', '--t', '-title', '--title'):
 142         if start_args + 1 >= len(argv):
 143             fail('missing actual title in cmd-line arguments', 1)
 144         # title = escape(argv[start_args + 1])
 145         title = argv[start_args + 1]
 146         start_args += 2
 147         continue
 148     if argv[start_args] in ('-mono', '--mono', '-monospace', '--monospace'):
 149         monospace = True
 150         start_args += 1
 151         continue
 152     if argv[start_args] == '--':
 153         start_args += 1
 154         break
 155     break
 156 args = argv[start_args:]
 157 
 158 
 159 def start_page(title: str, mono: bool) -> None:
 160     print('<!DOCTYPE html>')
 161     print('<html lang="en">')
 162     print('<head>')
 163     print('    <meta charset="UTF-8">')
 164     print('    <link rel="icon" href="data:,">')
 165     cattr = 'content="width=device-width, initial-scale=1.0"'
 166     print(f'    <meta name="viewport" {cattr}>')
 167     if title:
 168         print(f'    <title>{escape(title)}</title>')
 169     print('    <style>')
 170     print(style)
 171     print('    </style>')
 172     print('</head>')
 173     print('<body class="monospace">' if mono else '<body>')
 174 
 175 
 176 def shorten(s: str, maxchars: int) -> str:
 177     return s if len(s) <= maxchars else s[:maxchars]
 178 
 179 
 180 # handle_text renders plain-text prose
 181 def handle_text(src, title: str, first: bool, mono: bool) -> None:
 182     prev = ''
 183     # buf is a reusable string-buffer for func handle_normal_text_line
 184     buf = StringIO()
 185     num_lines = 0
 186 
 187     for i, line in enumerate(src):
 188         line = line.rstrip('\r\n').rstrip('\n').rstrip()
 189         if not (prev or line):
 190             # keep skipping empty(ish) lines in runs of such lines
 191             continue
 192 
 193         num_lines += 1
 194         if first and num_lines == 1:
 195             if title:
 196                 start_page(shorten(title, 100), mono)
 197             else:
 198                 start_page(shorten(line, 100), mono)
 199                 continue
 200 
 201         if (not line) and prev:
 202             print('</p>')
 203         if not prev:
 204             print('<p>')
 205         prev = line
 206 
 207         if seems_supported_data_uri(line):
 208             handle_data_uri(line)
 209             print('<br>')
 210         else:
 211             handle_normal_text_line(line, buf)
 212 
 213     # don't forget to close last paragraph
 214     if line:
 215         print('</p>')
 216 
 217 
 218 def handle_normal_text_line(line: str, buf: StringIO) -> None:
 219     # get rid of previous buffer content
 220     buf.truncate(0)
 221     buf.seek(0)
 222 
 223     # j keeps track of end of detected hyperlinks, and is used outside
 224     # the regex-match loop to detect trailing parts in lines
 225     j = 0
 226 
 227     # matches is to keep track of whether any matches occurred
 228     matches = 0
 229 
 230     for m in links.finditer(line):
 231         matches += 1
 232         # remember previous index-end, used to emit the part before
 233         # the current match
 234         start = j
 235 
 236         i = m.start()
 237         j = m.end()
 238         # remember part before match
 239         buf.write(escape(line[start:i]))
 240         # replace matched hyperlink with an html anchor tag for it
 241         href = line[i:j]
 242         buf.write(f'<a href="{href}">{href}</a>')
 243 
 244     if matches == 0:
 245         # avoid emptying lines with no matches
 246         print(f'{escape(line)}<br>')
 247         return
 248 
 249     # no need to copy the line when it's not changing anyway
 250     if j > 0:
 251         # don't forget the last part of the line, or the whole line
 252         buf.write(escape(line[j:]))
 253 
 254     buf.write('<br>')
 255     print(buf.getvalue())
 256 
 257 
 258 def is_base64(n: int) -> bool:
 259     if ord('0') <= n <= ord('9'):
 260         return True
 261     if ord('A') <= n <= ord('Z'):
 262         return True
 263     if ord('a') <= n <= ord('z'):
 264         return True
 265     return n in (ord('+'), ord('/'), ord('='))
 266 
 267 
 268 valid_base64 = tuple(is_base64(n) for n in range(256))
 269 
 270 data_uri_wraps = {
 271     'data:image/': ('<img src="', '">'),
 272     'data:audio/': ('<audio controls src="', '"></audio>'),
 273     'data:video/': ('<video controls src="', '"></video>'),
 274 }
 275 
 276 
 277 def seems_supported_data_uri(s: str) -> bool:
 278     if not any(s.startswith(e) for e in data_uri_starts.keys()):
 279         return False
 280 
 281     base64_index = s.find(';base64,')
 282     if base64_index < 0:
 283         return False
 284 
 285     # check all payload bytes
 286     start = base64_index + len(';base64,')
 287     for i, c in enumerate(s):
 288         if i >= start and (not valid_base64[ord(c)]):
 289             return False
 290     return True
 291 
 292 
 293 def handle_data_uri(s: str) -> bool:
 294     for kind, (start, end) in data_uri_starts.items():
 295         if s.startswith(kind):
 296             print(start, end='')
 297             print(s, end='')
 298             print(end, end='')
 299             return True
 300     return False
 301 
 302 
 303 def anchorize(href: str) -> str:
 304     rel = 'rel="noopener noreferrer"'
 305     return f'<a {rel} href="{urlunparse(urlparse(href))}">{escape(href)}</a>'
 306 
 307 
 308 def seems_url(s: str) -> bool:
 309     protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
 310     return any(s.startswith(p) for p in protocols)
 311 
 312 
 313 if args.count('-') > 1:
 314     fail('reading from `-` (standard input) more than once not allowed', 1)
 315 
 316 if any(seems_url(e) for e in args):
 317     from urllib.request import urlopen
 318 
 319 try:
 320     for i, path in enumerate(args):
 321         is_first_input = i == 0
 322 
 323         if path == '-':
 324             handle_text(stdin, title, is_first_input, monospace)
 325             continue
 326 
 327         if seems_url(path):
 328             with urlopen(path) as inp:
 329                 with TextIOWrapper(inp, encoding='utf-8') as txt:
 330                     handle_text(txt, title, is_first_input, monospace)
 331             continue
 332 
 333         with open(path, encoding='utf-8') as inp:
 334             handle_text(inp, title, is_first_input, monospace)
 335 
 336     if len(args) == 0:
 337         handle_text(stdin, title, True, monospace)
 338 
 339     print('</body>')
 340     print('</html>')
 341 except BrokenPipeError:
 342     # quit quietly, instead of showing a confusing error message
 343     exit(0)
 344 except KeyboardInterrupt:
 345     exit(2)
 346 except Exception as e:
 347     fail(e, 1)