#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2020-2025 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from html import escape
from io import StringIO, TextIOWrapper
from re import compile
from sys import argv, exit, stderr, stdin
from urllib.parse import urlparse, urlunparse


info = '''
htmlify [options...] [filepaths/URIs...]


Render plain-text prose into self-contained HTML. Lines which are just a
valid data-URI are turned into pictures, audio, or even video elements.

All HTTP(s) URIs are autodetected and rendered as hyperlinks, even when
lines have multiple URIs in them.

If a title isn't given from the cmd-line options, the first line is used
as the title.

All (optional) leading options start with either single or double-dash,
and most of them change the style/color used. Some of the options are,
shown in their single-dash form:

    -h          show this help message
    -help       show this help message

    -title      use the next argument as the title in the HTML output
'''

# handle standard help cmd-line options, quitting right away in that case
if len(argv) > 1 and argv[1] in ('-h', '--h', '-help', '--help'):
    print(info.strip())
    exit(0)

# links is used in func handle_normal_text_line to handle hyperlinks
links = compile('''(https?|ftps?)://[a-zA-Z0-9_%.,?/&=#-]+''')

# style is the `inner` CSS used inside the style tag, and handles all
# visual styles for all supported input types
style = '''
    body {
        margin: 1rem auto 2rem auto;
        padding: 0.25rem;
        font-size: 1.1rem;
        line-height: 1.8rem;
        font-family: sans-serif;

        max-width: 95vw;
        /* width: max-content; */
        width: fit-content;

        box-sizing: border-box;
        display: block;
    }

    a {
        color: steelblue;
        text-decoration: none;
    }

    p {
        display: block;
        margin: auto;
        max-width: 80ch;
    }

    img {
        margin: none;
    }

    audio {
        width: 60ch;
    }

    table {
        margin: 2rem auto;
        border-collapse: collapse;
    }

    thead>* {
        position: sticky;
        top: 0;
        background-color: white;
    }

    tfoot th {
        user-select: none;
    }

    th, td {
        padding: 0.1rem 1ch;
        min-width: 4ch;
        border-bottom: solid thin transparent;
    }

    tr:nth-child(5n) td {
        border-bottom: solid thin #ccc;
    }

    .monospace {
        font-family: monospace;
    }
'''.strip('\n')


def fail(msg, code: int = 1) -> None:
    'Show the error message given (in red, on stderr), and quit right away.'

    print(f'\x1b[31m{msg}\x1b[0m', file=stderr)
    exit(code)


# leading cmd-line options: title is kept as raw (unescaped) text, since
# func start_page is the single place where it's HTML-escaped; escaping it
# here as well would double-escape it (`&` would end up as `&amp;amp;`)
title = ''
monospace = False
start_args = 1
while start_args < len(argv) and argv[start_args].startswith('-'):
    if argv[start_args] in ('-title', '--title'):
        if start_args + 1 >= len(argv):
            fail('missing actual title in cmd-line arguments', 1)
        title = argv[start_args + 1]
        start_args += 2
        continue
    if argv[start_args] in ('-mono', '--mono', '-monospace', '--monospace'):
        monospace = True
        start_args += 1
        continue
    if argv[start_args] == '--':
        # explicit end of options: everything after it is an input
        start_args += 1
        break
    # anything else dash-led (including a lone `-`) is treated as an input
    break
args = argv[start_args:]


def is_base64(n: int) -> bool:
    'Help build base64-byte-checker lookup tables.'

    if ord('0') <= n <= ord('9'):
        return True
    if ord('A') <= n <= ord('Z'):
        return True
    if ord('a') <= n <= ord('z'):
        return True
    return n in (ord('+'), ord('/'), ord('='))


# valid_base64 helps func seems_supported_data_uri do its job quickly
valid_base64 = tuple(is_base64(n) for n in range(256))


def start_page(title: str, mono: bool) -> None:
    '''
    Emit everything from the doctype up to (and including) the opening body
    tag. The title is given as raw text and is HTML-escaped here; an empty
    title results in no title tag at all. When mono is true, the body gets
    the `monospace` CSS class.
    '''

    print('<!DOCTYPE html>')
    print('<html lang="en">')
    print('<head>')
    print('    <meta charset="UTF-8">')
    print('    <link rel="icon" href="data:,">')
    cattr = 'content="width=device-width, initial-scale=1.0"'
    print(f'    <meta name="viewport" {cattr}>')
    if title:
        print(f'    <title>{escape(title)}</title>')
    print('    <style>')
    print(style)
    print('    </style>')
    print('</head>')
    print('<body class="monospace">' if mono else '<body>')


def shorten(s: str, maxchars: int) -> str:
    'Limit strings to the max number of characters given.'
    return s if len(s) <= maxchars else s[:maxchars]


def handle_text(src, title: str, first: bool, mono: bool) -> None:
    '''
    Render plain-text prose lines from src (any iterable of strings) as
    HTML paragraphs on standard output.

    When first is true, this call is also responsible for starting the
    page: either with the title given or, when that's empty, by using up
    the first non-empty line as the title. Runs of empty(ish) lines act
    as paragraph separators.
    '''

    # prev is the previous line rendered: it's non-empty exactly when a
    # paragraph is currently open
    prev = ''
    # buf is a reusable string-buffer for func handle_normal_text_line
    buf = StringIO()
    num_lines = 0

    for line in src:
        # a plain rstrip also gets rid of any kind of line-ending
        line = line.rstrip()
        if not (prev or line):
            # keep skipping empty(ish) lines in runs of such lines
            continue

        num_lines += 1
        if first and num_lines == 1:
            if title:
                start_page(shorten(title, 100), mono)
            else:
                # the first line is only used as the title, not as content
                start_page(shorten(line, 100), mono)
                continue

        if (not line) and prev:
            print('</p>')
        if not prev:
            print('<p>')
        prev = line

        if seems_supported_data_uri(line):
            handle_data_uri(line)
            print('<br>')
        else:
            handle_normal_text_line(line, buf)

    # an empty first input must still result in a proper page start
    if first and num_lines == 0:
        start_page(shorten(title, 100), mono)

    # don't forget to close last paragraph: checking prev (instead of the
    # loop variable) also works with empty inputs, and avoids a stray
    # closing tag when the only line was used up as the title
    if prev:
        print('</p>')


def handle_normal_text_line(line: str, buf: StringIO) -> None:
    '''
    Handle prose lines for func handle_text: the line is HTML-escaped,
    all hyperlinks detected in it are turned into anchor tags, and the
    result is printed followed by a line-break tag. The buffer given is
    only scratch space, and has its previous content discarded.
    '''

    # get rid of previous buffer content
    buf.truncate(0)
    buf.seek(0)

    # end keeps track of where the latest detected hyperlink ended, and
    # is used after the loop to emit the trailing part of the line
    end = 0

    for m in links.finditer(line):
        # emit the part between the previous match and the current one
        buf.write(escape(line[end:m.start()]))
        # hyperlinks can have ampersands in them, so they need escaping
        # both as attribute values and as visible text
        href = escape(m.group(0))
        buf.write(f'<a href="{href}">{href}</a>')
        end = m.end()

    # don't forget the last part of the line, or the whole line when
    # there were no hyperlinks in it
    buf.write(escape(line[end:]))
    buf.write('<br>')
    print(buf.getvalue())


# data_uri_starts maps each supported data-URI prefix to the start of the
# HTML tag used to embed it
data_uri_starts = {
    'data:image/': '<img src="',
    'data:audio/': '<audio controls src="',
    'data:video/': '<video controls src="',
}

# data_uri_ends maps each supported data-URI prefix to what ends its tag:
# unlike img, audio and video aren't void elements, and leaving them open
# would turn all later output into their (invisible) fallback content
data_uri_ends = {
    'data:image/': '">',
    'data:audio/': '"></audio>',
    'data:video/': '"></video>',
}


def seems_supported_data_uri(s: str) -> bool:
    '''
    Check if a whole line seems to be a base64-encoded data-URI for a
    picture, a sound, or a video. Only the payload after `;base64,` is
    checked, one character at a time.
    '''

    if not s.startswith(tuple(data_uri_starts)):
        return False

    base64_index = s.find(';base64,')
    if base64_index < 0:
        return False

    # check all payload symbols: the lookup table only covers the first
    # 256 code points, so anything beyond those can't be valid base64
    payload = s[base64_index + len(';base64,'):]
    limit = len(valid_base64)
    return all(ord(c) < limit and valid_base64[ord(c)] for c in payload)


def handle_data_uri(s: str) -> bool:
    '''
    Emit the HTML element (with no line-feed after it) embedding the
    data-URI given, returning whether the URI was of a supported kind.
    '''

    for kind, start in data_uri_starts.items():
        if s.startswith(kind):
            # the MIME-type part of the URI isn't validated anywhere, so
            # escaping prevents it from breaking out of the src attribute
            print(start, end='')
            print(escape(s), end='')
            print(data_uri_ends[kind], end='')
            return True
    return False


def anchorize(href: str) -> str:
    'Make an HTML anchor tag which links to the (normalized) URI given.'

    rel = 'rel="noopener noreferrer"'
    target = escape(urlunparse(urlparse(href)))
    return f'<a {rel} href="{target}">{escape(href)}</a>'


def seems_url(s: str) -> bool:
    'Check if a string starts with any of the supported URI protocols.'

    protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return s.startswith(protocols)


if args.count('-') > 1:
    fail('reading from `-` (standard input) more than once not allowed', 1)

# only pay for importing the URI-loading machinery when it's needed
if any(seems_url(e) for e in args):
    from urllib.request import urlopen

try:
    for i, path in enumerate(args):
        # only the first input is responsible for starting the page
        is_first_input = i == 0

        if path == '-':
            handle_text(stdin, title, is_first_input, monospace)
            continue

        if seems_url(path):
            with urlopen(path) as inp:
                # try to detect response encoding, if given: the headers
                # are available for all kinds of URIs supported, unlike
                # method getheader, which only HTTP(s) responses have;
                # this also handles quoted values and trailing parameters
                enc = inp.headers.get_content_charset() or 'utf-8'

                with TextIOWrapper(inp, encoding=enc) as txt:
                    handle_text(txt, title, is_first_input, monospace)
            continue

        with open(path, encoding='utf-8') as inp:
            handle_text(inp, title, is_first_input, monospace)

    # no inputs given means using standard input
    if not args:
        handle_text(stdin, title, True, monospace)

    print('</body>')
    print('</html>')
except BrokenPipeError:
    # quit quietly, instead of showing a confusing error message
    exit(0)
except KeyboardInterrupt:
    exit(2)
except Exception as e:
    fail(e, 1)