#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2025 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from html import escape
from io import StringIO, TextIOWrapper
from re import compile
from sys import argv, exit, stderr, stdin
from urllib.parse import urlparse, urlunparse


info = '''
htmlify [options...] [filepaths/URIs...]


Render plain-text prose into self-contained HTML. Lines which are just a
valid data-URI are turned into pictures, audio, or even video elements.

All HTTP(s) URIs are autodetected and rendered as hyperlinks, even when
lines have multiple URIs in them.

If a title isn't given from the cmd-line options, the first line is used
as the title.

All (optional) leading options start with either single or double-dash,
and most of them change the style/color used. Some of the options are,
shown in their single-dash form:

    -h, -help              show this help message
    -mono, -monospace      use a monospace font for text
    -t, -title             use the next argument as the webpage title
'''

# links is used in func handle_normal_text_line to handle hyperlinks
links = compile('''(https?|ftps?)://[a-zA-Z0-9_%.,?/&=#-]+''')

# style is the `inner` CSS used inside the style tag, and handles all
# visual styles for all supported input types
style = '''
        body {
            margin: 1rem auto 2rem auto;
            padding: 0.25rem;
            font-size: 1.1rem;
            line-height: 1.8rem;
            font-family: sans-serif;

            max-width: 95vw;
            /* width: max-content; */
            width: fit-content;

            box-sizing: border-box;
            display: block;
        }

        a {
            color: steelblue;
            text-decoration: none;
        }

        p {
            display: block;
            margin: auto;
            max-width: 80ch;
        }

        img {
            margin: none;
        }

        audio {
            width: 60ch;
        }

        table {
            margin: 2rem auto;
            border-collapse: collapse;
        }

        thead>* {
            position: sticky;
            top: 0;
            background-color: white;
        }

        tfoot th {
            user-select: none;
        }

        th, td {
            padding: 0.1rem 1ch;
            min-width: 4ch;
            border-bottom: solid thin transparent;
        }

        tr:nth-child(5n) td {
            border-bottom: solid thin #ccc;
        }

        .monospace {
            font-family: monospace;
        }
'''.strip('\n')


def fail(msg, code: int = 1) -> None:
    'Show the error message given, and quit the app right away.'
    print(str(msg), file=stderr)
    exit(code)


def parse_options(cli_args) -> tuple:
    '''
    Split a full argument list (program name included) into its leading
    options and the remaining inputs.

    Returns a `(title, monospace, inputs)` tuple: the raw (unescaped) title
    or an empty string, whether a monospace font was asked for, and the list
    of file paths / URIs left after the options. Quits via func fail when a
    title option has no value following it.
    '''

    title = ''
    monospace = False
    i = 1

    while i < len(cli_args) and cli_args[i].startswith('-'):
        opt = cli_args[i]

        if opt in ('-t', '--t', '-title', '--title'):
            if i + 1 >= len(cli_args):
                fail('missing actual title in cmd-line arguments', 1)
            # keep the title raw: func start_page is what escapes it, and
            # escaping it here as well would show `&amp;` for each `&`
            title = cli_args[i + 1]
            i += 2
            continue

        if opt in ('-mono', '--mono', '-monospace', '--monospace'):
            monospace = True
            i += 1
            continue

        # a double-dash explicitly ends the options and is itself skipped;
        # anything else unrecognized (a lone `-` included) is an input
        if opt == '--':
            i += 1
        break

    return title, monospace, list(cli_args[i:])


def is_base64(n: int) -> bool:
    'Check if the code point given is a base64 digit or the padding symbol.'
    if ord('0') <= n <= ord('9'):
        return True
    if ord('A') <= n <= ord('Z'):
        return True
    if ord('a') <= n <= ord('z'):
        return True
    return n in (ord('+'), ord('/'), ord('='))


# valid_base64 helps func seems_supported_data_uri do its job quickly
valid_base64 = tuple(is_base64(n) for n in range(256))


def start_page(title: str, mono: bool) -> None:
    '''
    Emit everything from the doctype up to the opening body tag. The title
    is HTML-escaped here, and its tag is left out when the title is empty.
    '''

    print('<!DOCTYPE html>')
    print('<html lang="en">')
    print('<head>')
    print('    <meta charset="UTF-8">')
    print('    <link rel="icon" href="data:,">')
    cattr = 'content="width=device-width, initial-scale=1.0"'
    print(f'    <meta name="viewport" {cattr}>')
    if title:
        print(f'    <title>{escape(title)}</title>')
    print('    <style>')
    print(style)
    print('    </style>')
    print('</head>')
    print('<body class="monospace">' if mono else '<body>')


def shorten(s: str, maxchars: int) -> str:
    'Limit a string to its first few characters, when it is too long.'
    return s if len(s) <= maxchars else s[:maxchars]


def handle_text(src, title: str, first: bool, mono: bool) -> None:
    '''
    Render plain-text prose.

    Lines come from iterating `src`. Runs of non-empty lines become
    paragraphs, and runs of empty(ish) lines collapse into a single break.
    When `first` is set, the page header is emitted before any content: its
    title is either the one given or, when that is empty, the first
    non-empty line, which is then left out of the page body.
    '''

    # prev is the previous line handled: a non-empty value also means a
    # paragraph is currently open
    prev = ''
    # buf is a reusable string-buffer for func handle_normal_text_line
    buf = StringIO()
    num_lines = 0

    for line in src:
        line = line.rstrip()
        if not (prev or line):
            # keep skipping empty(ish) lines in runs of such lines
            continue

        num_lines += 1
        if first and num_lines == 1:
            if title:
                start_page(shorten(title, 100), mono)
            else:
                # the line is used up as the title, so no paragraph starts
                start_page(shorten(line, 100), mono)
                continue

        if (not line) and prev:
            print('</p>')
        if not prev:
            print('<p>')
        prev = line

        if seems_supported_data_uri(line):
            handle_data_uri(line)
            print('<br>')
        else:
            handle_normal_text_line(line, buf)

    # an empty first input still needs its page header
    if first and num_lines == 0:
        start_page(shorten(title, 100), mono)

    # don't forget to close last paragraph: checking prev, instead of the
    # loop variable, also works for empty inputs and for inputs whose only
    # line became the title
    if prev:
        print('</p>')


def handle_normal_text_line(line: str, buf: StringIO) -> None:
    '''
    Print a line as escaped HTML ending in a line-break tag, turning all
    hyperlinks detected in it into anchor tags. The buffer given is just
    reusable scratch space: its previous content is discarded.
    '''

    # get rid of previous buffer content
    buf.truncate(0)
    buf.seek(0)

    # end is where the latest hyperlink match stopped, so text between
    # matches, and after the last one, is emitted exactly once
    end = 0

    for m in links.finditer(line):
        buf.write(escape(line[end:m.start()]))
        # links can have ampersands in them, so these need escaping too
        href = escape(m.group(0))
        buf.write(f'<a href="{href}">{href}</a>')
        end = m.end()

    # the trailing part of the line, or all of it when nothing matched
    buf.write(escape(line[end:]))
    buf.write('<br>')
    print(buf.getvalue())


# data_uri_starts maps each supported data-URI prefix to the start of the
# HTML tag used to show it
data_uri_starts = {
    'data:image/': '<img src="',
    'data:audio/': '<audio controls src="',
    'data:video/': '<video controls src="',
}


def seems_supported_data_uri(s: str) -> bool:
    '''
    Check if a whole line looks like a base64-encoded data-URI for one of
    the supported media types. Only the payload after the base64 marker is
    checked, character by character.
    '''

    if not s.startswith(tuple(data_uri_starts)):
        return False

    marker = ';base64,'
    base64_index = s.find(marker)
    if base64_index < 0:
        return False

    # check all payload symbols: the bounds check is needed because the
    # lookup table only covers the first 256 code points
    for k in range(base64_index + len(marker), len(s)):
        n = ord(s[k])
        if n >= len(valid_base64) or not valid_base64[n]:
            return False
    return True


def handle_data_uri(s: str) -> bool:
    '''
    Print the media tag for the data-URI given, without a trailing line
    feed. Returns whether the URI had a supported prefix: nothing is
    printed when it doesn't.
    '''

    for kind, start in data_uri_starts.items():
        if s.startswith(kind):
            # only the payload is validated, so the part before it may
            # still have quotes, which must not end the attribute early
            print(f'{start}{escape(s)}">', end='')
            return True
    return False


def anchorize(href: str) -> str:
    'Make an HTML anchor tag out of the URI given.'
    rel = 'rel="noopener noreferrer"'
    target = escape(urlunparse(urlparse(href)))
    return f'<a {rel} href="{target}">{escape(href)}</a>'


def seems_url(s: str) -> bool:
    'Check if a string starts with one of the supported URI protocols.'
    protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return s.startswith(protocols)


def handle_url(uri: str, title: str, first: bool, mono: bool) -> None:
    '''
    Fetch the URI given and render its content as plain-text prose. The
    text is decoded using the charset from the response headers, when
    available, falling back to UTF-8.
    '''

    # only load the networking machinery when it's actually needed
    from urllib.request import urlopen

    with urlopen(uri) as inp:
        # the headers attribute is looked up directly, since responses for
        # non-HTTP URIs have no getheader method
        enc = inp.headers.get_content_charset() or 'utf-8'
        with TextIOWrapper(inp, encoding=enc) as txt:
            handle_text(txt, title, first, mono)


def main() -> None:
    'Run the app using the cmd-line arguments.'

    # handle standard help cmd-line options, quitting right away in that case
    if len(argv) > 1 and argv[1] in ('-h', '--h', '-help', '--help'):
        print(info.strip())
        exit(0)

    title, monospace, args = parse_options(argv)

    if args.count('-') > 1:
        fail('reading from `-` (standard input) more than once not allowed', 1)

    try:
        for i, path in enumerate(args):
            is_first_input = i == 0

            if path == '-':
                handle_text(stdin, title, is_first_input, monospace)
            elif seems_url(path):
                handle_url(path, title, is_first_input, monospace)
            else:
                with open(path, encoding='utf-8') as inp:
                    handle_text(inp, title, is_first_input, monospace)

        # no inputs means reading from standard input
        if not args:
            handle_text(stdin, title, True, monospace)

        print('</body>')
        print('</html>')
    except BrokenPipeError:
        # quit quietly, instead of showing a confusing error message
        exit(0)
    except KeyboardInterrupt:
        exit(2)
    except Exception as e:
        fail(e, 1)


if __name__ == '__main__':
    main()