#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2026 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from html import escape
from io import StringIO, TextIOWrapper
from re import compile
from sys import argv, exit, stderr, stdin
from urllib.parse import urlparse, urlunparse


# info is the help message shown by the help cmd-line options
info = '''
htmlify [options...] [filepaths/URIs...]


Render plain-text prose into self-contained HTML. Lines which are just a
valid data-URI are turned into pictures, audio, or even video elements.

All HTTP(s) URIs are autodetected and rendered as hyperlinks, even when
lines have multiple URIs in them.

If a title isn't given from the cmd-line options, the first line is used
as the title.

All (optional) leading options start with either single or double-dash,
and most of them change the style/color used. Some of the options are,
shown in their single-dash form:

    -h, -help             show this help message
    -mono, -monospace     use a monospace font for text
    -t, -title            use the next argument as the webpage title
'''

# handle standard help cmd-line options, quitting right away in that case
if len(argv) > 1 and argv[1] in ('-h', '--h', '-help', '--help'):
    print(info.strip())
    exit(0)

# links is used in func handle_normal_text_line to handle hyperlinks
links = compile('''(https?|ftps?)://[a-zA-Z0-9_%.,?/&=#-]+''')

# style is the `inner` CSS used inside the style tag, and handles all
# visual styles for all supported input types
style = '''
    body {
        margin: 1rem auto 2rem auto;
        padding: 0.25rem;
        font-size: 1.1rem;
        line-height: 1.8rem;
        font-family: sans-serif;

        max-width: 95vw;
        /* width: max-content; */
        width: fit-content;

        box-sizing: border-box;
        display: block;
    }

    a {
        color: steelblue;
        text-decoration: none;
    }

    p {
        display: block;
        margin: auto;
        max-width: 80ch;
    }

    img {
        margin: none;
    }

    audio {
        width: 60ch;
    }

    table {
        margin: 2rem auto;
        border-collapse: collapse;
    }

    thead>* {
        position: sticky;
        top: 0;
        background-color: white;
    }

    tfoot th {
        user-select: none;
    }

    th, td {
        padding: 0.1rem 1ch;
        min-width: 4ch;
        border-bottom: solid thin transparent;
    }

    tr:nth-child(5n) td {
        border-bottom: solid thin #ccc;
    }

    .monospace {
        font-family: monospace;
    }
'''.strip('\n')


# fail shows the error message given on standard error, and quits the app
# right away, using the exit code given
def fail(msg, code: int = 1) -> None:
    print(str(msg), file=stderr)
    exit(code)


# handle the leading cmd-line options: the loop stops at the first argument
# which isn't a recognized option, and everything from there on is an input
title = ''
monospace = False
start_args = 1
while start_args < len(argv) and argv[start_args].startswith('-'):
    if argv[start_args] in ('-t', '--t', '-title', '--title'):
        if start_args + 1 >= len(argv):
            fail('missing actual title in cmd-line arguments', 1)
        # the title is kept raw here: func start_page escapes it on output
        title = argv[start_args + 1]
        start_args += 2
        continue
    if argv[start_args] in ('-mono', '--mono', '-monospace', '--monospace'):
        monospace = True
        start_args += 1
        continue
    if argv[start_args] == '--':
        # an explicit end-of-options marker isn't an input itself
        start_args += 1
        break
    # anything else, including a lone `-` for standard input, is an input
    break
args = argv[start_args:]


# start_page prints everything up to, and including, the opening body tag;
# the title is HTML-escaped, and its tag is omitted when the title is empty;
# the body gets the `monospace` CSS class when mono is true
def start_page(title: str, mono: bool) -> None:
    print('<!DOCTYPE html>')
    print('<html lang="en">')
    print('<head>')
    print(' <meta charset="UTF-8">')
    print(' <link rel="icon" href="data:,">')
    cattr = 'content="width=device-width, initial-scale=1.0"'
    print(f' <meta name="viewport" {cattr}>')
    if title:
        print(f' <title>{escape(title)}</title>')
    print(' <style>')
    print(style)
    print(' </style>')
    print('</head>')
    print('<body class="monospace">' if mono else '<body>')


# shorten returns the string given, cut down to its first few characters
# when it's longer than the max number of characters given
def shorten(s: str, maxchars: int) -> str:
    return s[:maxchars]


# handle_text renders plain-text prose from the iterable of lines given;
# runs of non-empty lines become paragraphs, and runs of empty(ish) lines
# count as a single paragraph separator
#
# when `first` is true, this func also starts the page: the title comes from
# the `title` argument, or from the first non-empty line when that's empty,
# in which case that line isn't rendered again as body text; `mono` is
# handed over to func start_page
def handle_text(src, title: str, first: bool, mono: bool) -> None:
    # prev is the previous line rendered, and is non-empty only while a
    # paragraph is still open
    prev = ''
    # buf is a reusable string-buffer for func handle_normal_text_line
    buf = StringIO()
    num_lines = 0

    for line in src:
        # ignore line-endings, along with any other trailing whitespace
        line = line.rstrip()
        if not (prev or line):
            # keep skipping empty(ish) lines in runs of such lines
            continue

        num_lines += 1
        if first and num_lines == 1:
            if title:
                start_page(shorten(title, 100), mono)
            else:
                start_page(shorten(line, 100), mono)
                # only a line used as the title is kept out of the body
                continue

        if (not line) and prev:
            print('</p>')
        if not prev:
            print('<p>')
        prev = line

        if seems_supported_data_uri(line):
            handle_data_uri(line)
            print('<br>')
        else:
            handle_normal_text_line(line, buf)

    if first and num_lines == 0:
        # even an empty first input must start the page, since the closing
        # tags are emitted no matter what
        start_page(shorten(title, 100), mono)

    # don't forget to close last paragraph: checking `prev` instead of the
    # loop variable works on empty inputs, and avoids emitting a stray tag
    # when the only line was used as the title
    if prev:
        print('</p>')
# handle_normal_text_line prints the line given as HTML-escaped text, ending
# it with a line-break tag; all hyperlinks matched by the `links` regex are
# turned into anchor tags; buf is a scratch buffer, whose previous content
# is discarded
def handle_normal_text_line(line: str, buf: StringIO) -> None:
    # get rid of previous buffer content
    buf.truncate(0)
    buf.seek(0)

    # end keeps track of where the latest hyperlink ended, so the text
    # between hyperlinks, and after the last one, is emitted as well
    end = 0

    for m in links.finditer(line):
        # emit the part before the match
        buf.write(escape(line[end:m.start()]))
        # replace matched hyperlink with an html anchor tag for it: hrefs
        # are escaped too, since the regex lets ampersands through
        href = escape(m.group(0))
        buf.write(f'<a href="{href}">{href}</a>')
        end = m.end()

    # don't forget the last part of the line, or the whole line when there
    # were no matches at all
    buf.write(escape(line[end:]))
    buf.write('<br>')
    print(buf.getvalue())


# is_base64 checks if the code point given is for a base64 symbol, padding
# symbol included
def is_base64(n: int) -> bool:
    if ord('0') <= n <= ord('9'):
        return True
    if ord('A') <= n <= ord('Z'):
        return True
    if ord('a') <= n <= ord('z'):
        return True
    return n in (ord('+'), ord('/'), ord('='))


# valid_base64 is a lookup table for the first 256 code points
valid_base64 = tuple(is_base64(n) for n in range(256))

# data_uri_wraps pairs each supported data-URI prefix with the HTML which
# goes right before and right after the URI itself
data_uri_wraps = {
    'data:image/': ('<img src="', '">'),
    'data:audio/': ('<audio controls src="', '"></audio>'),
    'data:video/': ('<video controls src="', '"></video>'),
}


# seems_supported_data_uri checks if the string given starts like one of the
# supported data-URI types, has a base64 marker, and has only base64 symbols
# after that marker; an empty payload is accepted
def seems_supported_data_uri(s: str) -> bool:
    if not s.startswith(tuple(data_uri_wraps)):
        return False

    base64_index = s.find(';base64,')
    if base64_index < 0:
        return False

    # check all payload symbols: code points beyond the lookup table can't
    # be base64, and must not be used to index it
    start = base64_index + len(';base64,')
    for i in range(start, len(s)):
        n = ord(s[i])
        if n >= len(valid_base64) or not valid_base64[n]:
            return False
    return True


# handle_data_uri prints the media element for the data-URI given, without
# a trailing newline, returning whether the URI type was a supported one;
# nothing is printed for unsupported types
def handle_data_uri(s: str) -> bool:
    for kind, (start, end) in data_uri_wraps.items():
        if s.startswith(kind):
            # the part before the base64 marker is never validated, so the
            # URI is escaped to keep it inside the attribute's quotes
            print(start, escape(s), end, sep='', end='')
            return True
    return False


# anchorize returns an anchor tag for the hyperlink given, which is also
# used as the escaped text of the link
def anchorize(href: str) -> str:
    rel = 'rel="noopener noreferrer"'
    target = escape(urlunparse(urlparse(href)))
    return f'<a {rel} href="{target}">{escape(href)}</a>'


# seems_url checks if the string given starts with any of the protocols
# which are loaded via urlopen, instead of being opened as local files
def seems_url(s: str) -> bool:
    protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return s.startswith(protocols)


if args.count('-') > 1:
    fail('reading from `-` (standard input) more than once not allowed', 1)

# only import the URI-loading func when it's actually needed
if any(seems_url(e) for e in args):
    from urllib.request import urlopen

try:
    for i, path in enumerate(args):
        # only the first input starts the page, and can provide its title
        is_first_input = i == 0

        if path == '-':
            handle_text(stdin, title, is_first_input, monospace)
            continue

        if seems_url(path):
            with urlopen(path) as inp:
                with TextIOWrapper(inp, encoding='utf-8') as txt:
                    handle_text(txt, title, is_first_input, monospace)
            continue

        with open(path, encoding='utf-8') as inp:
            handle_text(inp, title, is_first_input, monospace)

    # read from standard input when no inputs are given
    if not args:
        handle_text(stdin, title, True, monospace)

    print('</body>')
    print('</html>')
except BrokenPipeError:
    # quit quietly, instead of showing a confusing error message: stdout is
    # pointed to the null device, so flushing leftover output on exit can't
    # fail the same way again
    import os
    import sys
    os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno())
    exit(0)
except KeyboardInterrupt:
    exit(2)
except Exception as e:
    fail(e, 1)