File: htmlify.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2025 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 from html import escape
  27 from io import StringIO, TextIOWrapper
  28 from re import compile
  29 from sys import argv, exit, stderr, stdin
  30 from urllib.parse import urlparse, urlunparse
  31 
  32 
# info is the help message; it's only shown when the very first cmd-line
# argument is one of the help options, and it's printed without its
# leading/trailing empty lines
info = '''
htmlify [options...] [filepaths/URIs...]


Render plain-text prose into self-contained HTML. Lines which are just a
valid data-URI are turned into pictures, audio, or even video elements.

All HTTP(s) URIs are autodetected and rendered as hyperlinks, even when
lines have multiple URIs in them.

If a title isn't given from the cmd-line options, the first line is used
as the title.

All (optional) leading options start with either single or double-dash,
and most of them change the style/color used. Some of the options are,
shown in their single-dash form:

    -h, -help            show this help message
    -mono, -monospace    use a monospace font for text
    -t, -title           use the next argument as the webpage title
'''

# handle standard help cmd-line options, quitting right away in that case;
# this check runs at import time, before any other option is looked at
if len(argv) > 1 and argv[1] in ('-h', '--h', '-help', '--help'):
    print(info.strip())
    exit(0)
  59 
  60 # links is used in func handle_normal_text_line to handle hyperlinks
  61 links = compile('''(https?|ftps?)://[a-zA-Z0-9_%.,?/&=#-]+''')
  62 
  63 # style is the `inner` CSS used inside the style tag, and handles all
  64 # visual styles for all supported input types
  65 style = '''
  66         body {
  67             margin: 1rem auto 2rem auto;
  68             padding: 0.25rem;
  69             font-size: 1.1rem;
  70             line-height: 1.8rem;
  71             font-family: sans-serif;
  72 
  73             max-width: 95vw;
  74             /* width: max-content; */
  75             width: fit-content;
  76 
  77             box-sizing: border-box;
  78             display: block;
  79         }
  80 
  81         a {
  82             color: steelblue;
  83             text-decoration: none;
  84         }
  85 
  86         p {
  87             display: block;
  88             margin: auto;
  89             max-width: 80ch;
  90         }
  91 
  92         img {
  93             margin: none;
  94         }
  95 
  96         audio {
  97             width: 60ch;
  98         }
  99 
 100         table {
 101             margin: 2rem auto;
 102             border-collapse: collapse;
 103         }
 104 
 105         thead>* {
 106             position: sticky;
 107             top: 0;
 108             background-color: white;
 109         }
 110 
 111         tfoot th {
 112             user-select: none;
 113         }
 114 
 115         th, td {
 116             padding: 0.1rem 1ch;
 117             min-width: 4ch;
 118             border-bottom: solid thin transparent;
 119         }
 120 
 121         tr:nth-child(5n) td {
 122             border-bottom: solid thin #ccc;
 123         }
 124 
 125         .monospace {
 126             font-family: monospace;
 127         }
 128 '''.strip('\n')
 129 
 130 
 131 def fail(msg, code: int = 1) -> None:
 132     'Show the error message given, and quit the app right away.'
 133     print(str(msg), file=stderr)
 134     exit(code)
 135 
 136 
 137 title = ''
 138 monospace = False
 139 start_args = 1
 140 while start_args < len(argv) and argv[start_args].startswith('-'):
 141     if argv[start_args] in ('-t', '--t', '-title', '--title'):
 142         if start_args + 1 >= len(argv):
 143             fail('missing actual title in cmd-line arguments', 1)
 144         title = escape(argv[start_args + 1])
 145         start_args += 2
 146         continue
 147     if argv[start_args] in ('-mono', '--mono', '-monospace', '--monospace'):
 148         monospace = True
 149         start_args += 1
 150         continue
 151     if argv[start_args] == '--':
 152         start_args += 1
 153         break
 154     break
 155 args = argv[start_args:]
 156 
 157 
 158 def is_base64(n: int) -> bool:
 159     if ord('0') <= n <= ord('9'):
 160         return True
 161     if ord('A') <= n <= ord('Z'):
 162         return True
 163     if ord('a') <= n <= ord('z'):
 164         return True
 165     return n in (ord('+'), ord('/'), ord('='))
 166 
 167 
 168 # valid_base64 helps func seems_supported_data_uri do its job quickly
 169 valid_base64 = tuple(is_base64(n) for n in range(256))
 170 
 171 
 172 def start_page(title: str, mono: bool) -> None:
 173     print('<!DOCTYPE html>')
 174     print('<html lang="en">')
 175     print('<head>')
 176     print('    <meta charset="UTF-8">')
 177     print('    <link rel="icon" href="data:,">')
 178     cattr = 'content="width=device-width, initial-scale=1.0"'
 179     print(f'    <meta name="viewport" {cattr}>')
 180     if title:
 181         print(f'    <title>{escape(title)}</title>')
 182     print('    <style>')
 183     print(style)
 184     print('    </style>')
 185     print('</head>')
 186     print('<body class="monospace">' if mono else '<body>')
 187 
 188 
 189 def shorten(s: str, maxchars: int) -> str:
 190     return s if len(s) <= maxchars else s[:maxchars]
 191 
 192 
 193 def handle_text(src, title: str, first: bool, mono: bool) -> None:
 194     'Render plain-text prose.'
 195 
 196     prev = ''
 197     # buf is a reusable string-buffer for func handle_normal_text_line
 198     buf = StringIO()
 199     num_lines = 0
 200 
 201     for i, line in enumerate(src):
 202         line = line.rstrip('\r\n').rstrip('\n').rstrip()
 203         if not (prev or line):
 204             # keep skipping empty(ish) lines in runs of such lines
 205             continue
 206 
 207         num_lines += 1
 208         if first and num_lines == 1:
 209             if title:
 210                 start_page(shorten(title, 100), mono)
 211             else:
 212                 start_page(shorten(line, 100), mono)
 213                 continue
 214 
 215         if (not line) and prev:
 216             print('</p>')
 217         if not prev:
 218             print('<p>')
 219         prev = line
 220 
 221         if seems_supported_data_uri(line):
 222             handle_data_uri(line)
 223             print('<br>')
 224         else:
 225             handle_normal_text_line(line, buf)
 226 
 227     # don't forget to close last paragraph
 228     if line:
 229         print('</p>')
 230 
 231 
 232 def handle_normal_text_line(line: str, buf: StringIO) -> None:
 233     # get rid of previous buffer content
 234     buf.truncate(0)
 235     buf.seek(0)
 236 
 237     # j keeps track of end of detected hyperlinks, and is used outside
 238     # the regex-match loop to detect trailing parts in lines
 239     j = 0
 240 
 241     # matches is to keep track of whether any matches occurred
 242     matches = 0
 243 
 244     for m in links.finditer(line):
 245         matches += 1
 246         # remember previous index-end, used to emit the part before
 247         # the current match
 248         start = j
 249 
 250         i = m.start()
 251         j = m.end()
 252         # remember part before match
 253         buf.write(escape(line[start:i]))
 254         # replace matched hyperlink with an html anchor tag for it
 255         href = line[i:j]
 256         buf.write(f'<a href="{href}">{href}</a>')
 257 
 258     if matches == 0:
 259         # avoid emptying lines with no matches
 260         print(f'{escape(line)}<br>')
 261         return
 262 
 263     # no need to copy the line when it's not changing anyway
 264     if j > 0:
 265         # don't forget the last part of the line, or the whole line
 266         buf.write(escape(line[j:]))
 267 
 268     buf.write('<br>')
 269     print(buf.getvalue())
 270 
 271 
# data_uri_starts maps the starts of all supported data-URIs to the opening
# parts of the HTML tags used to embed them: func handle_data_uri emits the
# opening part, followed by the URI, followed by the end of the tag; func
# seems_supported_data_uri uses the keys to tell which lines may be data-URIs
data_uri_starts = {
    'data:image/': '<img src="',
    'data:audio/': '<audio controls src="',
    'data:video/': '<video controls src="',
}
 277 
 278 
 279 def seems_supported_data_uri(s: str) -> bool:
 280     if not any(s.startswith(e) for e in data_uri_starts.keys()):
 281         return False
 282 
 283     base64_index = s.find(';base64,')
 284     if base64_index < 0:
 285         return False
 286 
 287     # check all payload bytes
 288     start = base64_index + len(';base64,')
 289     for i, c in enumerate(s):
 290         if i >= start and (not valid_base64[ord(c)]):
 291             return False
 292     return True
 293 
 294 
 295 def handle_data_uri(s: str) -> bool:
 296     for kind, start in data_uri_starts.items():
 297         if s.startswith(kind):
 298             print(start, end='')
 299             print(s, end='')
 300             print('">', end='')
 301             return True
 302     return False
 303 
 304 
 305 def anchorize(href: str) -> str:
 306     rel = 'rel="noopener noreferrer"'
 307     return f'<a {rel} href="{urlunparse(urlparse(href))}">{escape(href)}</a>'
 308 
 309 
 310 def seems_url(s: str) -> bool:
 311     protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
 312     return any(s.startswith(p) for p in protocols)
 313 
 314 
 315 if args.count('-') > 1:
 316     fail('reading from `-` (standard input) more than once not allowed', 1)
 317 
 318 if any(seems_url(e) for e in args):
 319     from urllib.request import urlopen
 320 
 321 try:
 322     for i, path in enumerate(args):
 323         is_first_input = i == 0
 324 
 325         if path == '-':
 326             handle_text(stdin, title, is_first_input, monospace)
 327             continue
 328 
 329         if seems_url(path):
 330             with urlopen(path) as inp:
 331                 ctype = inp.getheader('Content-Type')
 332                 if not isinstance(ctype, str):
 333                     ctype = ''
 334 
 335                 # try to detect response encoding, if given
 336                 enc = 'utf-8'
 337                 i = ctype.find('charset=')
 338                 if i >= 0:
 339                     enc = ctype[i + len('charset='):]
 340 
 341                 with TextIOWrapper(inp, encoding=enc) as txt:
 342                     handle_text(txt, title, is_first_input, monospace)
 343             continue
 344 
 345         with open(path, encoding='utf-8') as inp:
 346             handle_text(inp, title, is_first_input, monospace)
 347 
 348     if len(args) == 0:
 349         handle_text(stdin, title, True, monospace)
 350 
 351     print('</body>')
 352     print('</html>')
 353 except BrokenPipeError:
 354     # quit quietly, instead of showing a confusing error message
 355     exit(0)
 356 except KeyboardInterrupt:
 357     exit(2)
 358 except Exception as e:
 359     fail(e, 1)