File: htmlify.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2024 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 from html import escape
  27 from io import StringIO, TextIOWrapper
  28 from re import compile
  29 from sys import argv, exit, stderr, stdin
  30 from urllib.parse import urlparse, urlunparse
  31 
  32 
  33 info = '''
  34 htmlify [options...] [filepaths/URIs...]
  35 
  36 
  37 Render plain-text prose into self-contained HTML. Lines which are just a
  38 valid data-URI are turned into pictures, audio, or even video elements.
  39 
  40 All HTTP(s) URIs are autodetected and rendered as hyperlinks, even when
  41 lines have multiple URIs in them.
  42 
  43 If a title isn't given from the cmd-line options, the first line is used
  44 as the title.
  45 
  46 All (optional) leading options start with either single or double-dash,
  47 and most of them change the style/color used. Some of the options are,
  48 shown in their single-dash form:
  49 
  50     -h          show this help message
  51     -help       show this help message
  52 
  53     -title      use the next argument as the title in the HTML output
  54 '''
  55 
  56 # handle standard help cmd-line options, quitting right away in that case
  57 if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
  58     print(info.strip(), file=stderr)
  59     exit(0)
  60 
  61 # links is used in func handle_normal_text_line to handle hyperlinks
  62 links = compile('''(https?|ftps?)\://[a-zA-Z0-9_%.?/&=-]+''')
  63 
# style is the `inner` CSS used inside the style tag, and handles all
# visual styles for all supported input types; func start_page prints it
# as is between the opening and closing style tags, which is why its lines
# are already indented, and why only its leading/trailing line-feeds are
# stripped
style = '''
        body {
            margin: 1rem auto 2rem auto;
            padding: 0.25rem;
            font-size: 1.1rem;
            line-height: 1.8rem;
            font-family: Arial, Helvetica, sans-serif;

            max-width: 95vw;
            /* width: max-content; */
            width: fit-content;

            box-sizing: border-box;
            display: block;
        }

        a {
            color: steelblue;
            text-decoration: none;
        }

        p {
            display: block;
            margin: auto;
            width: 80ch;
        }

        audio {
            width: 60ch;
        }

        table {
            margin: 2rem auto;
            border-collapse: collapse;
        }

        thead>* {
            position: sticky;
            top: 0;
            background-color: white;
        }

        tfoot th {
            user-select: none;
        }

        th, td {
            padding: 0.1rem 1ch;
            min-width: 4ch;
            border-bottom: solid thin transparent;
        }

        tr:nth-child(5n) td {
            border-bottom: solid thin #ccc;
        }

        .monospace {
            font-family: monospace;
        }
'''.strip('\n')
 126 
 127 
 128 def fail(msg, code: int = 1) -> None:
 129     'Show the error message given, and quit the app right away.'
 130     print(f'\x1b[31m{msg}\x1b[0m', file=stderr)
 131     exit(code)
 132 
 133 
 134 title = ''
 135 start_args = 1
 136 while start_args < len(argv) and argv[start_args].startswith('-'):
 137     if argv[start_args] in ('-title', '--title'):
 138         if start_args + 1 >= len(argv):
 139             fail('missing actual title in cmd-line arguments', 1)
 140         title = html_escape(argv[start_args + 1])
 141         start_args += 2
 142         continue
 143     break
 144 args = argv[start_args:]
 145 
 146 
 147 def is_base64(n: int) -> bool:
 148     'Help build base64-byte-checker lookup tables.'
 149 
 150     if ord('0') <= n <= ord('9'):
 151         return True
 152     if ord('A') <= n <= ord('Z'):
 153         return True
 154     if ord('a') <= n <= ord('z'):
 155         return True
 156     return n in (ord('+'), ord('/'), ord('='))
 157 
 158 
 159 # valid_base64 helps func seems_supported_data_uri do its job quickly
 160 valid_base64 = tuple(is_base64(n) for n in range(256))
 161 
 162 
 163 def start_page(title: str) -> None:
 164     print('<!DOCTYPE html>')
 165     print('<html lang="en">')
 166     print('<head>')
 167     print('    <meta charset="UTF-8">')
 168     print('    <link rel="icon" href="data:,">')
 169     cattr = 'content="width=device-width, initial-scale=1.0"'
 170     print(f'    <meta name="viewport" {cattr}>')
 171     if title:
 172         print(f'    <title>{escape(title)}</title>')
 173     print('    <style>')
 174     print(style)
 175     print('    </style>')
 176     print('</head>')
 177     print('<body>')
 178 
 179 
 180 def html_escape(s: str) -> str:
 181     'Safely escape generic plain-text.'
 182 
 183     s = s.replace('&', '&amp;')
 184     s = s.replace('<', '&lt;')
 185     s = s.replace('>', '&gt;')
 186     return s
 187 
 188 
 189 def shorten(s: str, maxchars: int) -> str:
 190     return s if len(s) <= maxchars else s[:maxchars]
 191 
 192 
 193 def handle_text(src, title: str, first: bool) -> None:
 194     'Render plain-text prose.'
 195 
 196     prev = ''
 197     # buf is a reusable string-buffer for func handle_normal_text_line
 198     buf = StringIO()
 199 
 200     for i, line in enumerate(src):
 201         line = line.rstrip('\r\n').rstrip('\n').rstrip()
 202         if not (prev or line):
 203             # keep skipping empty(ish) lines in runs of such lines
 204             continue
 205 
 206         line = html_escape(line)
 207 
 208         if first and i == 0:
 209             title = title if title else line
 210             start_page(shorten(title, 100))
 211 
 212         if (not line) and prev:
 213             print('</p>')
 214         if not prev:
 215             print('<p>')
 216         prev = line
 217 
 218         if seems_supported_data_uri(line):
 219             handle_data_uri(line)
 220             print('<br>')
 221         else:
 222             handle_normal_text_line(line, buf)
 223 
 224     # don't forget to close last paragraph
 225     if line:
 226         print('</p>')
 227 
 228 
 229 def handle_normal_text_line(line: str, buf: StringIO) -> None:
 230     'Handle prose lines for func handle_text.'
 231 
 232     # get rid of previous buffer content
 233     buf.truncate(0)
 234     buf.seek(0)
 235 
 236     # j keeps track of end of detected hyperlinks, and is used outside
 237     # the regex-match loop to detect trailing parts in lines
 238     j = 0
 239 
 240     # matches is to keep track of whether any matches occurred
 241     matches = 0
 242 
 243     for m in links.finditer(line):
 244         matches += 1
 245         # remember previous index-end, used to emit the part before
 246         # the current match
 247         start = j
 248 
 249         i = m.start()
 250         j = m.end()
 251         # remember part before match
 252         buf.write(escape(line[start:i]))
 253         # replace matched hyperlink with an html anchor tag for it
 254         href = line[i:j]
 255         buf.write(f'<a href="{href}">{href}</a>')
 256 
 257     if matches == 0:
 258         # avoid emptying lines with no matches
 259         print(f'{escape(line)}<br>')
 260         return
 261 
 262     # no need to copy the line when it's not changing anyway
 263     if j > 0:
 264         # don't forget the last part of the line, or the whole line
 265         buf.write(escape(line[j:]))
 266 
 267     buf.write('<br>')
 268     print(buf.getvalue())
 269 
 270 
 271 def seems_supported_data_uri(s: str) -> bool:
 272     supported = ('data:image/','data:audio/', 'data:video/')
 273     if not any(s.startswith(e) for e in supported):
 274         return False
 275 
 276     base64_index = s.find(';base64,')
 277     if base64_index < 0:
 278         return False
 279 
 280     # check all payload bytes
 281     start = base64_index + len(';base64,')
 282     for i, c in enumerate(s):
 283         if i >= start and (not valid_base64[ord(c)]):
 284             return False
 285     return True
 286 
 287 
 288 def handle_data_uri(s: str) -> bool:
 289     if s.startswith('data:image/'):
 290         print('<img src="', end='')
 291         print(s, end='')
 292         print('">', end='')
 293         return True
 294 
 295     if s.startswith('data:audio/'):
 296         print('<audio controls src="', end='')
 297         print(s, end='')
 298         print('">', end='')
 299         return True
 300 
 301     if s.startswith('data:video/'):
 302         print('<video controls src="', end='')
 303         print(s, end='')
 304         print('">', end='')
 305         return True
 306 
 307     return False
 308 
 309 
 310 def anchorize(href: str) -> str:
 311     'Turn a URI into an escaped anchor tag.'
 312     rel = 'rel="noopener noreferrer"'
 313     return f'<a {rel} href="{urlunparse(urlparse(href))}">{escape(href)}</a>'
 314 
 315 
 316 def seems_url(s: str) -> bool:
 317     protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
 318     return any(s.startswith(p) for p in protocols)
 319 
 320 
 321 if args.count('-') > 1:
 322     fail('reading from `-` (standard input) more than once not allowed', 1)
 323 
 324 if any(seems_url(e) for e in args):
 325     from urllib.request import urlopen
 326 
 327 try:
 328     for i, path in enumerate(args):
 329         if path == '-':
 330             handle_text(stdin, title, i == 0)
 331             continue
 332 
 333         if seems_url(path):
 334             with urlopen(path) as inp:
 335                 ctype = inp.getheader('Content-Type')
 336                 if not isinstance(ctype, str):
 337                     ctype = ''
 338 
 339                 # try to detect response encoding, if given
 340                 enc = 'utf-8'
 341                 i = ctype.find('charset=')
 342                 if i >= 0:
 343                     enc = ctype[i + len('charset='):]
 344 
 345                 with TextIOWrapper(inp, encoding=enc) as txt:
 346                     handle_text(txt, title, i == 0)
 347             continue
 348 
 349         with open(path, encoding='utf-8') as inp:
 350             handle_text(inp, title, i == 0)
 351 
 352     if len(args) == 0:
 353         handle_text(stdin, title, True)
 354 
 355     print('</body>')
 356     print('</html>')
 357 except BrokenPipeError:
 358     # quit quietly, instead of showing a confusing error message
 359     stderr.close()
 360 except KeyboardInterrupt:
 361     exit(2)
 362 except Exception as e:
 363     fail(e, 1)