#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2024 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from html import escape
from io import StringIO, TextIOWrapper
from re import compile
from sys import argv, exit, stderr, stdin
from urllib.parse import urlparse, urlunparse


info = '''
htmlify [options...] [filepaths/URIs...]


Render plain-text prose into self-contained HTML. Lines which are just a
valid data-URI are turned into pictures, audio, or even video elements.

All HTTP(s) URIs are autodetected and rendered as hyperlinks, even when
lines have multiple URIs in them.

If a title isn't given from the cmd-line options, the first line is used
as the title.

All (optional) leading options start with either single or double-dash,
and most of them change the style/color used. Some of the options are,
shown in their single-dash form:

    -h          show this help message
    -help       show this help message

    -title      use the next argument as the title in the HTML output
'''

# links is used in func handle_normal_text_line to handle hyperlinks; it's
# a raw string, so the regex engine sees the pattern exactly as written
links = compile(r'(https?|ftps?)://[a-zA-Z0-9_%.?/&=-]+')

# style is the `inner` CSS used inside the style tag, and handles all
# visual styles for all supported input types
style = '''
    body {
        margin: 1rem auto 2rem auto;
        padding: 0.25rem;
        font-size: 1.1rem;
        line-height: 1.8rem;
        font-family: Arial, Helvetica, sans-serif;

        max-width: 95vw;
        /* width: max-content; */
        width: fit-content;

        box-sizing: border-box;
        display: block;
    }

    a {
        color: steelblue;
        text-decoration: none;
    }

    p {
        display: block;
        margin: auto;
        width: 80ch;
    }

    audio {
        width: 60ch;
    }

    table {
        margin: 2rem auto;
        border-collapse: collapse;
    }

    thead>* {
        position: sticky;
        top: 0;
        background-color: white;
    }

    tfoot th {
        user-select: none;
    }

    th, td {
        padding: 0.1rem 1ch;
        min-width: 4ch;
        border-bottom: solid thin transparent;
    }

    tr:nth-child(5n) td {
        border-bottom: solid thin #ccc;
    }

    .monospace {
        font-family: monospace;
    }
'''.strip('\n')


def fail(msg, code: int = 1) -> None:
    'Show the error message given (in red, on stderr), and quit right away.'

    print(f'\x1b[31m{msg}\x1b[0m', file=stderr)
    exit(code)


def parse_cmdline(arguments) -> tuple:
    '''
    Split a full argv-style list into a (title, paths) pair.

    Only the leading `-title`/`--title` options are consumed: parsing stops
    at the first argument which isn't one of those, so a lone `-` (standard
    input) and everything after it end up among the paths. The title is
    returned unescaped, since func start_page escapes it when emitting it.
    A trailing title option with no value after it quits the app via func
    fail.
    '''

    title = ''
    pos = 1
    while pos < len(arguments) and arguments[pos].startswith('-'):
        if arguments[pos] not in ('-title', '--title'):
            break
        if pos + 1 >= len(arguments):
            fail('missing actual title in cmd-line arguments', 1)
        title = arguments[pos + 1]
        pos += 2
    return title, list(arguments[pos:])


def is_base64(n: int) -> bool:
    'Help build base64-byte-checker lookup tables.'

    if ord('0') <= n <= ord('9'):
        return True
    if ord('A') <= n <= ord('Z'):
        return True
    if ord('a') <= n <= ord('z'):
        return True
    return n in (ord('+'), ord('/'), ord('='))


# valid_base64 helps func seems_supported_data_uri do its job quickly
valid_base64 = tuple(is_base64(n) for n in range(256))


def start_page(title: str) -> None:
    '''
    Emit everything from the doctype up to the opening body tag.

    The title is plain text: it's escaped here, and the title tag is left
    out when the title is empty.
    '''

    print('<!DOCTYPE html>')
    print('<html lang="en">')
    print('<head>')
    print('    <meta charset="UTF-8">')
    print('    <link rel="icon" href="data:,">')
    cattr = 'content="width=device-width, initial-scale=1.0"'
    print(f'    <meta name="viewport" {cattr}>')
    if title:
        print(f'    <title>{escape(title)}</title>')
    print('    <style>')
    print(style)
    print('    </style>')
    print('</head>')
    print('<body>')


def html_escape(s: str) -> str:
    'Safely escape generic plain-text: handles ampersands and angle brackets.'

    # quote=False limits escaping to the 3 symbols which matter in text
    # nodes, leaving single and double quotes as they are
    return escape(s, quote=False)


def shorten(s: str, maxchars: int) -> str:
    'Limit strings to the max number of characters given.'

    return s if len(s) <= maxchars else s[:maxchars]


def handle_text(src, title: str, first: bool) -> None:
    '''
    Render plain-text prose, turning runs of non-empty lines into paragraphs.

    src is any iterable of text lines; title is the plain-text title to use,
    the first non-empty line being the fallback when it's empty; first is
    whether this input is the one which must also emit the start of the
    page, which happens even when the input turns out to be empty.

    Lines are kept unescaped in here: the funcs emitting them escape each
    piece exactly once.
    '''

    prev = ''
    # buf is a reusable string-buffer for func handle_normal_text_line
    buf = StringIO()
    # started is whether the start of the page is already taken care of
    started = not first

    for line in src:
        line = line.rstrip()
        if not (prev or line):
            # keep skipping empty(ish) lines in runs of such lines
            continue

        if not started:
            # leading empty lines are skipped above, so the line reaching
            # this point is the first one with actual content
            start_page(shorten(title if title else line, 100))
            started = True

        if (not line) and prev:
            print('</p>')
        if not prev:
            print('<p>')
        prev = line

        if seems_supported_data_uri(line):
            handle_data_uri(line)
            print('<br>')
        else:
            handle_normal_text_line(line, buf)

    if not started:
        # empty inputs still need a proper start of the page
        start_page(shorten(title, 100))

    # don't forget to close last paragraph
    if prev:
        print('</p>')


def handle_normal_text_line(line: str, buf: StringIO) -> None:
    '''
    Handle prose lines for func handle_text.

    The line given is plain (unescaped) text: hyperlinks are detected on it
    as is, and each piece is escaped while being emitted. The buffer given
    is emptied first, and is only there to be reused across calls.
    '''

    # get rid of previous buffer content
    buf.seek(0)
    buf.truncate(0)

    # end keeps track of where the latest hyperlink ended, and is used
    # after the loop to emit the trailing part of the line, which is the
    # whole line when no hyperlinks were found
    end = 0

    for m in links.finditer(line):
        # emit the part before the current match
        buf.write(escape(line[end:m.start()]))
        # replace matched hyperlink with an html anchor tag for it
        href = escape(m.group(0))
        buf.write(f'<a href="{href}">{href}</a>')
        end = m.end()

    buf.write(escape(line[end:]))
    buf.write('<br>')
    print(buf.getvalue())


def seems_supported_data_uri(s: str) -> bool:
    '''
    Check if a whole line is a base64 data-URI for a picture, audio, or video.

    Only the payload after the `;base64,` marker is checked, which means
    the part before it can still have arbitrary symbols in it.
    '''

    if not s.startswith(('data:image/', 'data:audio/', 'data:video/')):
        return False

    base64_index = s.find(';base64,')
    if base64_index < 0:
        return False

    # check the whole payload: the lookup table only covers code points up
    # to 255, so anything beyond that can't be valid either
    payload = s[base64_index + len(';base64,'):]
    limit = len(valid_base64)
    return all(ord(c) < limit and valid_base64[ord(c)] for c in payload)


def handle_data_uri(s: str) -> bool:
    '''
    Emit the media element for the data-URI given, without ending the line.

    The result is whether the URI is of a supported kind, which is the same
    as whether anything was emitted. The URI is escaped, since the part
    before its payload may have quotes in it. Unlike pictures, audio and
    video elements need explicit end tags.
    '''

    src = escape(s)

    if s.startswith('data:image/'):
        print(f'<img src="{src}">', end='')
        return True

    if s.startswith('data:audio/'):
        print(f'<audio controls src="{src}"></audio>', end='')
        return True

    if s.startswith('data:video/'):
        print(f'<video controls src="{src}"></video>', end='')
        return True

    return False


def anchorize(href: str) -> str:
    'Turn a URI into an escaped anchor tag.'

    rel = 'rel="noopener noreferrer"'
    target = escape(urlunparse(urlparse(href)))
    return f'<a {rel} href="{target}">{escape(href)}</a>'


def seems_url(s: str) -> bool:
    'Check if a string starts with any of the protocols func urlopen handles.'

    protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return s.startswith(protocols)


def main() -> None:
    'Handle the cmd-line arguments, then render all inputs as a single page.'

    # handle standard help cmd-line options, quitting right away in that case
    if len(argv) == 2 and argv[1] in ('-h', '--h', '-help', '--help'):
        print(info.strip(), file=stderr)
        exit(0)

    title, args = parse_cmdline(argv)

    if args.count('-') > 1:
        msg = 'reading from `-` (standard input) more than once not allowed'
        fail(msg, 1)

    # avoid loading the networking modules unless they're actually needed
    if any(seems_url(e) for e in args):
        from urllib.request import urlopen

    try:
        for i, path in enumerate(args):
            first = i == 0

            if path == '-':
                handle_text(stdin, title, first)
                continue

            if seems_url(path):
                with urlopen(path) as inp:
                    # try to detect response encoding, if given: all kinds
                    # of responses have a headers object, unlike method
                    # getheader, which only HTTP(s) ones have
                    enc = inp.headers.get_content_charset() or 'utf-8'
                    with TextIOWrapper(inp, encoding=enc) as txt:
                        handle_text(txt, title, first)
                continue

            with open(path, encoding='utf-8') as inp:
                handle_text(inp, title, first)

        if not args:
            handle_text(stdin, title, True)

        print('</body>')
        print('</html>')
    except BrokenPipeError:
        # quit quietly, instead of showing a confusing error message
        stderr.close()
    except KeyboardInterrupt:
        exit(2)
    except Exception as e:
        fail(e, 1)


if __name__ == '__main__':
    main()