#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2024 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from html import escape
from io import StringIO, TextIOWrapper
from re import compile
from sys import argv, exit, stderr, stdin
from urllib.parse import urlparse, urlunparse


info = '''
htmlify [options...] [filepaths/URIs...]


Render plain-text prose into self-contained HTML. Lines which are just a
valid data-URI are turned into pictures, audio, or even video elements.

All HTTP(s) URIs are autodetected and rendered as hyperlinks, even when
lines have multiple URIs in them.

If a title isn't given from the cmd-line options, the first line is used
as the title.

All (optional) leading options start with either single or double-dash,
and most of them change the style/color used. Some of the options are,
shown in their single-dash form:

    -h          show this help message
    -help       show this help message

    -title      use the next argument as the title in the HTML output
'''

# links is used in func handle_normal_text_line to handle hyperlinks; it's
# a raw string, so the regex has no invalid string-escape sequences in it
links = compile(r'(https?|ftps?)://[a-zA-Z0-9_%.,?/&=-]+')

# style is the `inner` CSS used inside the style tag, and handles all
# visual styles for all supported input types
style = '''
    body {
        margin: 1rem auto 2rem auto;
        padding: 0.25rem;
        font-size: 1.1rem;
        line-height: 1.8rem;
        font-family: Arial, Helvetica, sans-serif;

        max-width: 95vw;
        /* width: max-content; */
        width: fit-content;

        box-sizing: border-box;
        display: block;
    }

    a {
        color: steelblue;
        text-decoration: none;
    }

    p {
        display: block;
        margin: auto;
        width: 80ch;
    }

    audio {
        width: 60ch;
    }

    table {
        margin: 2rem auto;
        border-collapse: collapse;
    }

    thead>* {
        position: sticky;
        top: 0;
        background-color: white;
    }

    tfoot th {
        user-select: none;
    }

    th, td {
        padding: 0.1rem 1ch;
        min-width: 4ch;
        border-bottom: solid thin transparent;
    }

    tr:nth-child(5n) td {
        border-bottom: solid thin #ccc;
    }

    .monospace {
        font-family: monospace;
    }
'''.strip('\n')


def fail(msg, code: int = 1) -> None:
    'Show the error message given, and quit the app right away.'
    print(f'\x1b[31m{msg}\x1b[0m', file=stderr)
    exit(code)


def parse_options(cli) -> tuple:
    '''
    Split the cmd-line into the title and the remaining filepaths/URIs.

    The first item of `cli` is the program name and is ignored. Leading
    `-title`/`--title` options consume the argument after them; the first
    item which isn't such an option ends option-parsing. The title is kept
    unescaped, since func start_page is what escapes it for the output.
    '''

    title = ''
    pos = 1
    while pos < len(cli) and cli[pos].startswith('-'):
        if cli[pos] not in ('-title', '--title'):
            # anything else, a lone dash included, is an input name
            break
        if pos + 1 >= len(cli):
            fail('missing actual title in cmd-line arguments', 1)
        title = cli[pos + 1]
        pos += 2
    return title, cli[pos:]


def is_base64(n: int) -> bool:
    'Help build base64-byte-checker lookup tables.'

    if ord('0') <= n <= ord('9'):
        return True
    if ord('A') <= n <= ord('Z'):
        return True
    if ord('a') <= n <= ord('z'):
        return True
    return n in (ord('+'), ord('/'), ord('='))


# valid_base64 helps func seems_supported_data_uri do its job quickly
valid_base64 = tuple(is_base64(n) for n in range(256))


def start_page(title: str) -> None:
    '''
    Emit everything from the doctype up to the opening body tag.

    The title is given unescaped, and is escaped here; an empty title
    results in no title tag at all.
    '''

    print('<!DOCTYPE html>')
    print('<html lang="en">')
    print('<head>')
    print('    <meta charset="UTF-8">')
    print('    <link rel="icon" href="data:,">')
    cattr = 'content="width=device-width, initial-scale=1.0"'
    print(f'    <meta name="viewport" {cattr}>')
    if title:
        print(f'    <title>{escape(title)}</title>')
    print('    <style>')
    print(style)
    print('    </style>')
    print('</head>')
    print('<body>')


def shorten(s: str, maxchars: int) -> str:
    'Limit strings to the max number of characters given.'
    return s if len(s) <= maxchars else s[:maxchars]


def handle_text(src, title: str, first: bool) -> None:
    '''
    Render plain-text prose.

    Parameter `src` is any iterable of text lines. When `first` is true,
    the start of the page is emitted before any content: the title comes
    from `title`, or from the first non-empty line when `title` is empty,
    in which case that line isn't rendered again as part of the body.
    '''

    # prev is the previous line handled: it's non-empty exactly when a
    # paragraph is still open
    prev = ''
    # buf is a reusable string-buffer for func handle_normal_text_line
    buf = StringIO()
    num_lines = 0

    for line in src:
        line = line.rstrip()
        if not (prev or line):
            # keep skipping empty(ish) lines in runs of such lines
            continue

        num_lines += 1
        if first and num_lines == 1:
            if title:
                start_page(shorten(title, 100))
            else:
                start_page(shorten(line, 100))
                # the line was used up as the title
                continue

        if (not line) and prev:
            print('</p>')
        if not prev:
            print('<p>')
        prev = line

        if seems_supported_data_uri(line):
            handle_data_uri(line)
            print('<br>')
        else:
            handle_normal_text_line(line, buf)

    # don't forget to close last paragraph: checking `prev` also works for
    # empty inputs, and for inputs whose only line became the title
    if prev:
        print('</p>')

    # empty inputs still need the start of the page
    if first and num_lines == 0:
        start_page(shorten(title, 100))


def handle_normal_text_line(line: str, buf: StringIO) -> None:
    '''
    Handle prose lines for func handle_text.

    The line is HTML-escaped, all its hyperlinks are turned into anchor
    tags, and the result is printed followed by a line-break tag. The
    buffer given is only scratch space, and is emptied before being used.
    '''

    # get rid of previous buffer content
    buf.truncate(0)
    buf.seek(0)

    # end keeps track of where the previous hyperlink match ended
    end = 0

    for m in links.finditer(line):
        # emit the part between the previous match and the current one
        buf.write(escape(line[end:m.start()]))
        # hyperlinks can have ampersands in them, so they need escaping
        # both as attribute values and as text
        href = escape(m.group(0))
        buf.write(f'<a href="{href}">{href}</a>')
        end = m.end()

    # don't forget the last part of the line, or even the whole line, when
    # no hyperlinks were found
    buf.write(escape(line[end:]))
    buf.write('<br>')
    print(buf.getvalue())


# data_uri_starts turns supported data-URI prefixes into the starts of the
# HTML tags used to show their payloads
data_uri_starts = {
    'data:image/': '<img src="',
    'data:audio/': '<audio controls src="',
    'data:video/': '<video controls src="',
}


def seems_supported_data_uri(s: str) -> bool:
    '''
    Check if a line is a base64-encoded image/audio/video data-URI.

    Everything after the first `;base64,` must be made only of symbols
    from the base64 alphabet, padding included.
    '''

    if not s.startswith(tuple(data_uri_starts)):
        return False

    marker = ';base64,'
    found = s.find(marker)
    if found < 0:
        return False

    # check the whole payload: code points beyond the lookup table can't
    # be base64, and must not be used to index it
    size = len(valid_base64)
    payload = s[found + len(marker):]
    return all(ord(c) < size and valid_base64[ord(c)] for c in payload)


def handle_data_uri(s: str) -> bool:
    '''
    Emit the media tag for the data-URI given, without a trailing newline.

    The result is whether the data-URI is of a supported kind: nothing is
    emitted when that's not the case.
    '''

    for kind, start in data_uri_starts.items():
        if s.startswith(kind):
            # the MIME-type part isn't validated, so escape the whole
            # thing to keep it from ending the attribute early
            print(f'{start}{escape(s)}">', end='')
            return True
    return False


def anchorize(href: str) -> str:
    'Make an HTML anchor tag out of the hyperlink given.'
    rel = 'rel="noopener noreferrer"'
    target = escape(urlunparse(urlparse(href)))
    return f'<a {rel} href="{target}">{escape(href)}</a>'


def seems_url(s: str) -> bool:
    'Check if a string starts like any of the supported URI types.'
    protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return s.startswith(protocols)


def response_encoding(resp) -> str:
    '''
    Find the text encoding a response declares, defaulting to UTF-8.

    The response's `headers` are expected to be an email-style message
    object, like the ones from the results of func urlopen.
    '''

    headers = getattr(resp, 'headers', None)
    if headers is None:
        return 'utf-8'
    return headers.get_content_charset() or 'utf-8'


def main() -> None:
    'Run the app, using the cmd-line arguments and/or standard input.'

    # handle standard help cmd-line options, quitting right away in that case
    if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
        print(info.strip())
        exit(0)

    title, args = parse_options(argv)

    if args.count('-') > 1:
        msg = 'reading from `-` (standard input) more than once not allowed'
        fail(msg, 1)

    if any(seems_url(e) for e in args):
        # only load the networking code when it's actually needed
        from urllib.request import urlopen

    try:
        for i, path in enumerate(args):
            # only the first input emits the start of the page
            first = i == 0

            if path == '-':
                handle_text(stdin, title, first)
                continue

            if seems_url(path):
                with urlopen(path) as inp:
                    enc = response_encoding(inp)
                    with TextIOWrapper(inp, encoding=enc) as txt:
                        handle_text(txt, title, first)
                continue

            with open(path, encoding='utf-8') as inp:
                handle_text(inp, title, first)

        if len(args) == 0:
            handle_text(stdin, title, True)

        print('</body>')
        print('</html>')
    except BrokenPipeError:
        # quit quietly, instead of showing a confusing error message
        stderr.close()
        exit(0)
    except KeyboardInterrupt:
        exit(2)
    except Exception as e:
        fail(e, 1)


if __name__ == '__main__':
    main()