#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2024 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from base64 import b64encode
from re import compile as compile_re
from sys import argv, exit, stderr, stdin, stdout


info = '''
datauri [options...] [filenames...]


Encode bytes as data-URIs, auto-detecting the file/data type using the first
few bytes from each data/file stream. When given multiple inputs, the output
will be multiple lines, one for each file given.

Empty files/inputs result in empty lines. A simple dash (-) stands for the
standard-input, which is also used automatically when not given any files.

Data-URIs are base64-encoded text representations of arbitrary data, which
include their payload's MIME-type, and which are directly useable/shareable
in web-browsers as links, despite not looking like normal links/URIs.

Some web-browsers limit the size of handled data-URIs to tens of kilobytes.

Options

    -h, -help, --h, --help              show this help message
    -f, -fallback, --f, --fallback      change the fallback MIME type
'''

# a leading help-option arg means show the help message and quit; running
# without any args isn't a request for help, since that reads from stdin
if len(argv) > 1 and argv[1] in ('-h', '--h', '-help', '--help'):
    print(info.strip())
    exit(0)


# hdr_dispatch groups format-description-groups by their first byte, thus
# shortening total lookups for some data header: notice how the `ftyp` data
# formats aren't handled here, since these can include any byte in parts of
# their first few bytes
hdr_dispatch = {
    0x00: [
        (b'\x00\x00\x01\xba', 'video/mpeg'),
        (b'\x00\x00\x01\xb3', 'video/mpeg'),
        (b'\x00\x00\x01\x00', 'image/x-icon'),
        (b'\x00\x00\x02\x00', 'image/vnd.microsoft.icon'),
    ],
    0x1a: [(b'\x1a\x45\xdf\xa3', 'video/webm')],  # matches general MKV format
    0x23: [
        (b'#! ', 'text/plain; charset=UTF-8'),
        (b'#!/', 'text/plain; charset=UTF-8'),
    ],
    0x25: [(b'%PDF', 'application/pdf'), (b'%!PS', 'application/postscript')],
    0x2e: [(b'.snd', 'audio/basic')],
    0x47: [(b'GIF87a', 'image/gif'), (b'GIF89a', 'image/gif')],
    0x49: [
        # some MP3s start with an ID3 meta-data section
        (b'ID3\x02', 'audio/mpeg'),
        (b'ID3\x03', 'audio/mpeg'),
        (b'ID3\x04', 'audio/mpeg'),
        (b'II*\x00', 'image/tiff'),
    ],
    0x4d: [(b'MM\x00*', 'image/tiff'), (b'MThd', 'audio/midi')],
    0x4f: [(b'OggS', 'audio/ogg')],
    0x63: [(b'caff\x00\x01\x00\x00', 'audio/x-caf')],
    0x66: [(b'fLaC', 'audio/x-flac')],
    0x89: [(b'\x89PNG\x0d\x0a\x1a\x0a', 'image/png')],
    0xff: [
        (b'\xff\xd8\xff', 'image/jpeg'),
        # handle common ways MP3 data start
        (b'\xff\xf3\x48\xc4\x00', 'audio/mpeg'),
        (b'\xff\xfb', 'audio/mpeg'),
    ],
}


# riff_types helps func match_riff: keys are the 4 bytes at offsets 8..11
riff_types = {
    b'WEBP': 'image/webp',
    b'WAVE': 'audio/x-wav',
    b'AVI ': 'video/avi',
}

# form_types helps func match_form: keys are the 4 bytes at offsets 8..11
form_types = {
    b'AIFF': 'audio/aiff',
    b'AIFC': 'audio/aiff',
}

# ftyp_types helps func match_ftyp auto-detect MPEG-4-like formats: markers
# are checked as prefixes of the bytes at offset 8, so markers shorter than
# 4 bytes still match
ftyp_types = (
    (b'M4A ', 'audio/aac'),
    (b'M4A\x00', 'audio/aac'),
    (b'dash', 'audio/aac'),
    (b'isom', 'video/mp4'),
    # (b'isom', 'audio/aac'),
    (b'MSNV', 'video/mp4'),
    (b'qt ', 'video/quicktime'),
    (b'heic', 'image/heic'),
    (b'avif', 'image/avif'),
)

# xmlish_heuristics helps func guess_mime auto-detect HTML, SVG, and XML
xmlish_heuristics = (
    (b'<html>', 'text/html'),
    (b'<html ', 'text/html'),
    (b'<head>', 'text/html'),
    (b'<body>', 'text/html'),
    (b'<!DOCTYPE html', 'text/html'),
    (b'<svg>', 'image/svg+xml'),
    (b'<svg ', 'image/svg+xml'),
    (b'<?xml', 'application/xml'),
)

# json_heuristics helps func guess_mime auto-detect JSON via regexes:
# it's not perfect, but it seems effective-enough in practice
json_heuristics = (
    compile_re(rb'^\s*\{\s*"'),
    compile_re(rb'^\s*\{\s*\['),
    compile_re(rb'^\s*\[\s*"'),
    compile_re(rb'^\s*\[\s*\{'),
    compile_re(rb'^\s*\[\s*\['),
)


def exact_match(header: bytes, maybe: bytes) -> bool:
    '''
    Check if the header given starts with all the bytes in `maybe`: headers
    shorter than `maybe` never match, as there aren't enough bytes to tell.
    '''

    return header.startswith(maybe)


def match_riff(header: bytes) -> str:
    '''
    Handle a few special cases for func guess_mime: return the MIME type
    for the RIFF-based formats recognized, or an empty string otherwise.
    '''

    if len(header) < 12 or not header.startswith(b'RIFF'):
        return ''
    # bytes 4..7 aren't checked: only bytes 8..11 pick the specific format
    return riff_types.get(header[8:12], '')


def match_form(header: bytes) -> str:
    '''
    Handle a few special cases for func guess_mime: return the MIME type
    for the FORM-based (AIFF) formats, or an empty string otherwise.
    '''

    if len(header) < 12 or not header.startswith(b'FORM'):
        return ''
    return form_types.get(header[8:12], '')


def match_ftyp(header: bytes) -> str:
    '''
    Handle a few special cases for func guess_mime: return the MIME type
    for the `ftyp` formats listed in ftyp_types, or an empty string.
    '''

    # first 4 bytes can be anything, next 4 bytes must be ASCII 'ftyp'
    if len(header) < 12 or not header.startswith(b'ftyp', 4, 8):
        return ''

    # next 4 bytes after the ASCII 'ftyp' declare the data-format
    for marker, mime in ftyp_types:
        if header.startswith(marker, 8, 12):
            return mime

    # unrecognized MPEG-4-style data-format
    return ''


def guess_mime(header: bytes, fallback: str) -> str:
    '''
    Try to auto-detect common MIME-types, given the first few input bytes.

    The result is the MIME type detected, or the fallback given when the
    header is empty or when nothing matches it.
    '''

    # no bytes, no match
    if len(header) == 0:
        return fallback

    # check the RIFF formats, AIFF audio, and MPEG-4-like formats
    for f in (match_riff, match_form, match_ftyp):
        m = f(header)
        if m != '':
            return m

    # maybe it's a bitmap picture, which almost always has 40 on 15th byte:
    # check that byte directly, since searching for the first 40 byte-value
    # in a range would miss pictures with an earlier 40 in bytes 8..13
    if header.startswith(b'BM') and header[14:15] == b'\x28':
        return 'image/x-bmp'

    # check general lookup-table
    for marker, mime in hdr_dispatch.get(header[0], ()):
        if exact_match(header, marker):
            return mime

    if header.find(b'<!DOCTYPE html', 0, 64) >= 0:
        return 'text/html'

    # try HTML, SVG, and even XML
    if header.find(b'<', 0, 8) >= 0:
        for marker, mime in xmlish_heuristics:
            if header.find(marker, 0, 64) >= 0:
                return mime

    # try some common cases for JSON
    for pattern in json_heuristics:
        if pattern.match(header):
            return 'application/json'

    # nothing matched
    return fallback


def seems_url(s: str) -> bool:
    'Check if a string starts with one of the URI schemes supported.'

    protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return s.startswith(protocols)


def handle_input(w, r, path: str, mime_fallback: str) -> None:
    '''
    Emit a whole input as a single-line data-URI.

    Parameter `w` is the binary output, which needs methods `write` and
    `flush`; parameter `r` is the binary input, which needs method `read`;
    `path` is only used for error messages; `mime_fallback` is the MIME type
    to use when auto-detection fails. Fallbacks `no`, `nomime`, `no-mime`,
    `none`, and `not` emit the base64 payload without the data-URI prefix.

    Empty inputs result in an empty line. An exception is raised when the
    MIME type can't be auto-detected and the fallback is empty.
    '''

    chunk_size = 3 * 1024 * 1024
    chunk = r.read(chunk_size)

    # empty data-streams result in empty lines
    if not chunk:
        w.write(b'\n')
        w.flush()
        return

    mime = guess_mime(chunk, mime_fallback)
    if not mime:
        raise Exception(f'{path}: can\'t guess MIME-type')

    if mime not in ('no', 'nomime', 'no-mime', 'none', 'not'):
        w.write(b'data:')
        w.write(mime.encode('utf-8'))
        w.write(b';base64,')

    # base64 turns each 3-byte group into 4 symbols, so only whole groups
    # are encoded while there may be more data: any 1..2 trailing bytes are
    # carried over to the next chunk, which keeps padding out of the middle
    # of the output, even when reads return fewer bytes than asked for
    leftover = b''
    while chunk:
        if leftover:
            chunk = leftover + chunk
        extra = len(chunk) % 3
        if extra:
            leftover = chunk[-extra:]
            chunk = chunk[:-extra]
        else:
            leftover = b''
        w.write(b64encode(chunk))
        chunk = r.read(chunk_size)

    # only the very end of the payload can need padding
    if leftover:
        w.write(b64encode(leftover))

    w.write(b'\n')
    w.flush()
# fallback_aliases maps the shortcut names accepted after the fallback option
# into full MIME types: names mapped to empty strings mean failing inputs
# whose type can't be auto-detected
fallback_aliases = {
    # 'text/json': 'application/json',

    # 'xbmp': 'image/x-bmp',
    # 'xflac': 'audio/x-flac',
    # 'xicon': 'image/x-icon',
    # 'xm4v': 'video/x-m4v',
    # 'xsqlite3': 'application/x-sqlite3',
    # 'xwav': 'audio/x-wav',
    # 'xwave': 'audio/x-wav',
    # 'x-bmp': 'image/x-bmp',
    # 'x-flac': 'audio/x-flac',
    # 'x-icon': 'image/x-icon',
    # 'x-m4v': 'video/x-m4v',
    # 'x-sqlite3': 'application/x-sqlite3',
    # 'x-wav': 'audio/x-wav',

    'b': 'application/octet-stream',
    'j': 'application/json',
    't': 'text/plain',
    'u': 'text/plain; charset=UTF-8',

    'e': '',
    'err': '',
    'error': '',
    'f': '',
    'fail': '',

    'aac': 'audio/aac',
    'aif': 'audio/aiff',
    'bin': 'application/octet-stream',
    'binary': 'application/octet-stream',
    'gzip': 'application/gzip',
    'midi': 'audio/midi',
    'mpeg': 'video/mpeg',
    'octet': 'application/octet-stream',
    'octetstream': 'application/octet-stream',
    'octet-stream': 'application/octet-stream',
    'plain': 'text/plain',
    'sqlite': 'application/x-sqlite3',
    'svg+xml': 'image/svg+xml',
    'tif': 'image/tiff',
    'utf8': 'text/plain; charset=UTF-8',
    'utf-8': 'text/plain; charset=UTF-8',
    'wave': 'audio/x-wav',
    'zstd': 'application/zstd',

    'aiff': 'audio/aiff',
    'au': 'audio/basic',
    'avi': 'video/avi',
    'avif': 'image/avif',
    'bmp': 'image/x-bmp',
    'caf': 'audio/x-caf',
    'cur': 'image/vnd.microsoft.icon',
    'css': 'text/css',
    'csv': 'text/csv',
    'djvu': 'image/x-djvu',
    'elf': 'application/x-elf',
    'exe': 'application/vnd.microsoft.portable-executable',
    'flac': 'audio/x-flac',
    'gif': 'image/gif',
    'gz': 'application/gzip',
    'heic': 'image/heic',
    'htm': 'text/html',
    'html': 'text/html',
    'ico': 'image/x-icon',
    'iso': 'application/octet-stream',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'js': 'application/javascript',
    'json': 'application/json',
    'm4a': 'audio/aac',
    'm4v': 'video/x-m4v',
    'mid': 'audio/midi',
    'mov': 'video/quicktime',
    'mp4': 'video/mp4',
    'mp3': 'audio/mpeg',
    'mpg': 'video/mpeg',
    'ogg': 'audio/ogg',
    'opus': 'audio/opus',
    'pdf': 'application/pdf',
    'png': 'image/png',
    'ps': 'application/postscript',
    'psd': 'image/vnd.adobe.photoshop',
    'rtf': 'application/rtf',
    'sqlite3': 'application/x-sqlite3',
    'svg': 'image/svg+xml',
    'text': 'text/plain',
    'tiff': 'image/tiff',
    'tsv': 'text/tsv',
    'wasm': 'application/wasm',
    'wav': 'audio/x-wav',
    'webp': 'image/webp',
    'webm': 'video/webm',
    'xml': 'application/xml',
    'zip': 'application/zip',
    'zst': 'application/zstd',
}


def iter_inputs(args):
    '''
    Turn command-line args (without the script name) into the inputs to
    handle, lazily yielding pairs with an input name and the fallback MIME
    type in effect for it.

    The fallback starts empty, and each fallback option changes it for all
    inputs after it; values are looked up in fallback_aliases and are used
    as given when not found there. A dash stands for the standard input,
    which is also yielded when the args name no inputs at all.

    A ValueError is raised right away when the standard input is named more
    than once; a fallback option missing its value also raises ValueError,
    but only after all inputs before it were yielded.
    '''

    if args.count('-') > 1:
        msg = 'reading from `-` (standard input) more than once not allowed'
        raise ValueError(msg)

    inputs = 0
    mime_fallback = ''
    # mime_fallback = 'application/octet-stream'
    change_fallback = False

    for arg in args:
        # the arg right after a fallback option is always taken as its value
        if change_fallback:
            mime_fallback = fallback_aliases.get(arg, arg)
            change_fallback = False
            continue

        if arg in ('-f', '--f', '-fallback', '--fallback'):
            change_fallback = True
            continue

        inputs += 1
        yield arg, mime_fallback

    if change_fallback:
        raise ValueError('forgot new fallback MIME-type')

    if inputs == 0:
        yield '-', mime_fallback


def main() -> None:
    '''
    Run the command-line app: each input named in the args is emitted to the
    standard output as its own line. The process exits with code 1 after
    showing an error message, with code 2 when interrupted, and quietly with
    code 0 when the output pipe is closed early.
    '''

    try:
        for path, mime_fallback in iter_inputs(argv[1:]):
            if path == '-':
                path = '<stdin>'
                handle_input(stdout.buffer, stdin.buffer, path, mime_fallback)
                continue

            if seems_url(path):
                # only load the networking code when it's actually needed
                from urllib.request import urlopen
                with urlopen(path) as inp:
                    handle_input(stdout.buffer, inp, path, mime_fallback)
                continue

            with open(path, mode='rb') as inp:
                handle_input(stdout.buffer, inp, path, mime_fallback)
    except BrokenPipeError:
        # quit quietly, instead of showing a confusing error message
        stderr.close()
        exit(0)
    except KeyboardInterrupt:
        # stderr.close()
        exit(2)
    except Exception as e:
        print(f'\x1b[31m{e}\x1b[0m', file=stderr)
        exit(1)


# only run the app when used as a script, so importing this file as a module
# neither reads the standard input nor quits the process
if __name__ == '__main__':
    main()