htmlify

     File: htmlify.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2020-2025 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 from html import escape
  27 from io import StringIO, TextIOWrapper
  28 from re import compile
  29 from sys import argv, exit, stderr, stdin
  30 from urllib.parse import urlparse, urlunparse
  31 
  32 
  33 info = '''
  34 htmlify [options...] [filepaths/URIs...]
  35 
  36 
  37 Render plain-text prose into self-contained HTML. Lines which are just a
  38 valid data-URI are turned into pictures, audio, or even video elements.
  39 
  40 All HTTP(s) URIs are autodetected and rendered as hyperlinks, even when
  41 lines have multiple URIs in them.
  42 
  43 If a title isn't given from the cmd-line options, the first line is used
  44 as the title.
  45 
  46 All (optional) leading options start with either single or double-dash,
  47 and most of them change the style/color used. Some of the options are,
  48 shown in their single-dash form:
  49 
  50     -h          show this help message
  51     -help       show this help message
  52 
  53     -title      use the next argument as the title in the HTML output
  54 '''
  55 
  56 # handle standard help cmd-line options, quitting right away in that case
  57 if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
  58     print(info.strip())
  59     exit(0)
  60 
  61 # links is used in func handle_normal_text_line to handle hyperlinks
  62 links = compile('''(https?|ftps?)\://[a-zA-Z0-9_%.,?/&=#-]+''')
  63 
# style is the `inner` CSS used inside the style tag, and handles all
# visual styles for all supported input types; only its leading/trailing
# line-feeds are stripped, which keeps the indentation of its first line,
# as func start_page prints it as-is between the style tags
style = '''
        body {
            margin: 1rem auto 2rem auto;
            padding: 0.25rem;
            font-size: 1.1rem;
            line-height: 1.8rem;
            font-family: Arial, Helvetica, sans-serif;

            max-width: 95vw;
            /* width: max-content; */
            width: fit-content;

            box-sizing: border-box;
            display: block;
        }

        a {
            color: steelblue;
            text-decoration: none;
        }

        p {
            display: block;
            margin: auto;
            width: 80ch;
        }

        audio {
            width: 60ch;
        }

        table {
            margin: 2rem auto;
            border-collapse: collapse;
        }

        thead>* {
            position: sticky;
            top: 0;
            background-color: white;
        }

        tfoot th {
            user-select: none;
        }

        th, td {
            padding: 0.1rem 1ch;
            min-width: 4ch;
            border-bottom: solid thin transparent;
        }

        tr:nth-child(5n) td {
            border-bottom: solid thin #ccc;
        }

        .monospace {
            font-family: monospace;
        }
'''.strip('\n')
 126 
 127 
 128 def fail(msg, code: int = 1) -> None:
 129     'Show the error message given, and quit the app right away.'
 130     print(f'\x1b[31m{msg}\x1b[0m', file=stderr)
 131     exit(code)
 132 
 133 
 134 title = ''
 135 start_args = 1
 136 while start_args < len(argv) and argv[start_args].startswith('-'):
 137     if argv[start_args] in ('-title', '--title'):
 138         if start_args + 1 >= len(argv):
 139             fail('missing actual title in cmd-line arguments', 1)
 140         title = escape(argv[start_args + 1])
 141         start_args += 2
 142         continue
 143     break
 144 args = argv[start_args:]
 145 
 146 
 147 def is_base64(n: int) -> bool:
 148     'Help build base64-byte-checker lookup tables.'
 149 
 150     if ord('0') <= n <= ord('9'):
 151         return True
 152     if ord('A') <= n <= ord('Z'):
 153         return True
 154     if ord('a') <= n <= ord('z'):
 155         return True
 156     return n in (ord('+'), ord('/'), ord('='))
 157 
 158 
 159 # valid_base64 helps func seems_supported_data_uri do its job quickly
 160 valid_base64 = tuple(is_base64(n) for n in range(256))
 161 
 162 
 163 def start_page(title: str) -> None:
 164     print('<!DOCTYPE html>')
 165     print('<html lang="en">')
 166     print('<head>')
 167     print('    <meta charset="UTF-8">')
 168     print('    <link rel="icon" href="data:,">')
 169     cattr = 'content="width=device-width, initial-scale=1.0"'
 170     print(f'    <meta name="viewport" {cattr}>')
 171     if title:
 172         print(f'    <title>{escape(title)}</title>')
 173     print('    <style>')
 174     print(style)
 175     print('    </style>')
 176     print('</head>')
 177     print('<body>')
 178 
 179 
 180 def shorten(s: str, maxchars: int) -> str:
 181     return s if len(s) <= maxchars else s[:maxchars]
 182 
 183 
 184 def handle_text(src, title: str, first: bool) -> None:
 185     'Render plain-text prose.'
 186 
 187     prev = ''
 188     # buf is a reusable string-buffer for func handle_normal_text_line
 189     buf = StringIO()
 190     num_lines = 0
 191 
 192     for i, line in enumerate(src):
 193         line = line.rstrip('\r\n').rstrip('\n').rstrip()
 194         if not (prev or line):
 195             # keep skipping empty(ish) lines in runs of such lines
 196             continue
 197 
 198         num_lines += 1
 199         if first and num_lines == 1:
 200             if title:
 201                 start_page(shorten(title, 100))
 202             else:
 203                 start_page(shorten(line, 100))
 204                 continue
 205 
 206         if (not line) and prev:
 207             print('</p>')
 208         if not prev:
 209             print('<p>')
 210         prev = line
 211 
 212         if seems_supported_data_uri(line):
 213             handle_data_uri(line)
 214             print('<br>')
 215         else:
 216             handle_normal_text_line(line, buf)
 217 
 218     # don't forget to close last paragraph
 219     if line:
 220         print('</p>')
 221 
 222 
 223 def handle_normal_text_line(line: str, buf: StringIO) -> None:
 224     'Handle prose lines for func handle_text.'
 225 
 226     # get rid of previous buffer content
 227     buf.truncate(0)
 228     buf.seek(0)
 229 
 230     # j keeps track of end of detected hyperlinks, and is used outside
 231     # the regex-match loop to detect trailing parts in lines
 232     j = 0
 233 
 234     # matches is to keep track of whether any matches occurred
 235     matches = 0
 236 
 237     for m in links.finditer(line):
 238         matches += 1
 239         # remember previous index-end, used to emit the part before
 240         # the current match
 241         start = j
 242 
 243         i = m.start()
 244         j = m.end()
 245         # remember part before match
 246         buf.write(escape(line[start:i]))
 247         # replace matched hyperlink with an html anchor tag for it
 248         href = line[i:j]
 249         buf.write(f'<a href="{href}">{href}</a>')
 250 
 251     if matches == 0:
 252         # avoid emptying lines with no matches
 253         print(f'{escape(line)}<br>')
 254         return
 255 
 256     # no need to copy the line when it's not changing anyway
 257     if j > 0:
 258         # don't forget the last part of the line, or the whole line
 259         buf.write(escape(line[j:]))
 260 
 261     buf.write('<br>')
 262     print(buf.getvalue())
 263 
 264 
 265 data_uri_starts = {
 266     'data:image/': '<img src="',
 267     'data:audio/': '<audio controls src="',
 268     'data:video/': '<video controls src="',
 269 }
 270 
 271 
 272 def seems_supported_data_uri(s: str) -> bool:
 273     if not any(s.startswith(e) for e in data_uri_starts.keys()):
 274         return False
 275 
 276     base64_index = s.find(';base64,')
 277     if base64_index < 0:
 278         return False
 279 
 280     # check all payload bytes
 281     start = base64_index + len(';base64,')
 282     for i, c in enumerate(s):
 283         if i >= start and (not valid_base64[ord(c)]):
 284             return False
 285     return True
 286 
 287 
 288 def handle_data_uri(s: str) -> bool:
 289     for kind, start in data_uri_starts.items():
 290         if s.starts_with(kind):
 291             print(start, end='')
 292             print(s, end='')
 293             print('">', end='')
 294             return True
 295     return False
 296 
 297 
 298 def anchorize(href: str) -> str:
 299     rel = 'rel="noopener noreferrer"'
 300     return f'<a {rel} href="{urlunparse(urlparse(href))}">{escape(href)}</a>'
 301 
 302 
 303 def seems_url(s: str) -> bool:
 304     protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
 305     return any(s.startswith(p) for p in protocols)
 306 
 307 
 308 if args.count('-') > 1:
 309     fail('reading from `-` (standard input) more than once not allowed', 1)
 310 
 311 if any(seems_url(e) for e in args):
 312     from urllib.request import urlopen
 313 
 314 try:
 315     for i, path in enumerate(args):
 316         if path == '-':
 317             handle_text(stdin, title, i == 0)
 318             continue
 319 
 320         if seems_url(path):
 321             with urlopen(path) as inp:
 322                 ctype = inp.getheader('Content-Type')
 323                 if not isinstance(ctype, str):
 324                     ctype = ''
 325 
 326                 # try to detect response encoding, if given
 327                 enc = 'utf-8'
 328                 i = ctype.find('charset=')
 329                 if i >= 0:
 330                     enc = ctype[i + len('charset='):]
 331 
 332                 with TextIOWrapper(inp, encoding=enc) as txt:
 333                     handle_text(txt, title, i == 0)
 334             continue
 335 
 336         with open(path, encoding='utf-8') as inp:
 337             handle_text(inp, title, i == 0)
 338 
 339     if len(args) == 0:
 340         handle_text(stdin, title, True)
 341 
 342     print('</body>')
 343     print('</html>')
 344 except BrokenPipeError:
 345     # quit quietly, instead of showing a confusing error message
 346     stderr.close()
 347     exit(0)
 348 except KeyboardInterrupt:
 349     exit(2)
 350 except Exception as e:
 351     fail(e, 1)