#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2020-2025 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from base64 import b64encode
from re import compile as compile_re
from sys import argv, exit, stderr, stdin, stdout


info = '''
datauri [options...] [filenames...]


Encode bytes as data-URIs, auto-detecting the file/data type using the first
few bytes from each data/file stream. When given multiple inputs, the output
will be multiple lines, one for each file given.

Empty files/inputs result in empty lines. A simple dash (-) stands for the
standard-input, which is also used automatically when not given any files.

Data-URIs are base64-encoded text representations of arbitrary data, which
include their payload's MIME-type, and which are directly useable/shareable
in web-browsers as links, despite not looking like normal links/URIs.

Some web-browsers limit the size of handled data-URIs to tens of kilobytes.

Options

-h, -help, --h, --help show this help message
-f, -fallback, --f, --fallback change the fallback MIME type
'''

# a leading help-option arg means show the help message and quit; running
# without any args doesn't show the help, as it reads standard input instead
if len(argv) > 1 and argv[1] in ('-h', '--h', '-help', '--help'):
    print(info.strip())
    exit(0)


# hdr_dispatch groups format-description-groups by their first byte, thus
# shortening total lookups for some data header: notice how the `ftyp` data
# formats aren't handled here, since these can include any byte in parts of
# their first few bytes
hdr_dispatch = {
    0x00: [
        (b'\x00\x00\x01\xba', 'video/mpeg'),
        (b'\x00\x00\x01\xb3', 'video/mpeg'),
        (b'\x00\x00\x01\x00', 'image/x-icon'),
        (b'\x00\x00\x02\x00', 'image/vnd.microsoft.icon'),
        (b'\x00asm', 'application/wasm'),
    ],
    0x1a: [(b'\x1a\x45\xdf\xa3', 'video/webm')],  # matches general MKV format
    0x1f: [(b'\x1f\x8b\x08', 'application/gzip')],
    0x23: [
        (b'#! ', 'text/plain; charset=UTF-8'),
        (b'#!/', 'text/plain; charset=UTF-8'),
    ],
    0x25: [
        (b'%PDF', 'application/pdf'),
        (b'%!PS', 'application/postscript'),
    ],
    0x2e: [(b'.snd', 'audio/basic')],
    0x47: [
        (b'GIF87a', 'image/gif'),
        (b'GIF89a', 'image/gif'),
    ],
    0x49: [
        # some MP3s start with an ID3 meta-data section
        (b'ID3\x02', 'audio/mpeg'),
        (b'ID3\x03', 'audio/mpeg'),
        (b'ID3\x04', 'audio/mpeg'),
        (b'II*\x00', 'image/tiff'),
    ],
    0x4d: [
        (b'MM\x00*', 'image/tiff'),
        (b'MThd', 'audio/midi'),
    ],
    0x4f: [(b'OggS', 'audio/ogg')],
    0x50: [(b'PK\x03\x04', 'application/zip')],
    0x63: [(b'caff\x00\x01\x00\x00', 'audio/x-caf')],
    0x66: [(b'fLaC', 'audio/x-flac')],
    0x7b: [(b'{\\rtf', 'application/rtf')],
    0x7f: [(b'\x7fELF', 'application/x-elf')],
    0x89: [(b'\x89PNG\x0d\x0a\x1a\x0a', 'image/png')],
    0xff: [
        (b'\xff\xd8\xff', 'image/jpeg'),
        # handle common ways MP3 data start
        (b'\xff\xf3\x48\xc4\x00', 'audio/mpeg'),
        (b'\xff\xfb', 'audio/mpeg'),
    ],
}


# ftyp_types helps func match_ftyp auto-detect MPEG-4-like formats
ftyp_types = (
    (b'M4A ', 'audio/aac'),
    (b'M4A\x00', 'audio/aac'),
    (b'mp42', 'video/x-m4v'),
    (b'dash', 'audio/aac'),
    (b'isom', 'video/mp4'),
    # (b'isom', 'audio/aac'),
    (b'MSNV', 'video/mp4'),
    (b'qt ', 'video/quicktime'),
    (b'heic', 'image/heic'),
    (b'avif', 'image/avif'),
)

# xmlish_heuristics helps func guess_mime auto-detect HTML, SVG, and XML
xmlish_heuristics = (
    (b'<html>', 'text/html'),
    (b'<html ', 'text/html'),
    (b'<head>', 'text/html'),
    (b'<body>', 'text/html'),
    (b'<!DOCTYPE html', 'text/html'),
    (b'<svg>', 'image/svg+xml'),
    (b'<svg ', 'image/svg+xml'),
    (b'<?xml', 'application/xml'),
)

# json_heuristics helps func guess_mime auto-detect JSON via regexes:
# it's not perfect, but it seems effective-enough in practice
json_heuristics = (
    compile_re(rb'^\s*\{\s*"'),
    compile_re(rb'^\s*\{\s*\['),
    compile_re(rb'^\s*\[\s*"'),
    compile_re(rb'^\s*\[\s*\{'),
    compile_re(rb'^\s*\[\s*\['),
)


def exact_match(header: bytes, maybe: bytes) -> bool:
    'Check if the header starts with all the bytes of the marker given.'

    # headers shorter than the marker can't match, which startswith handles
    return header.startswith(maybe)


def match_riff(header: bytes) -> str:
    '''
    Handle a few special cases for func guess_mime: the result is the MIME
    type for RIFF-style WEBP, WAVE, and AVI data, or an empty string when
    the header given doesn't match any of those.
    '''

    if len(header) < 12 or not header.startswith(b'RIFF'):
        return ''

    # bytes 4..7 aren't checked: the format-specific tag is right after them
    if header.startswith(b'WEBP', 8):
        return 'image/webp'
    if header.startswith(b'WAVE', 8):
        return 'audio/x-wav'
    if header.startswith(b'AVI ', 8):
        return 'video/avi'
    return ''


def match_form(header: bytes) -> str:
    '''
    Handle a few special cases for func guess_mime: the result is the MIME
    type for FORM-style AIFF/AIFC data, or an empty string when the header
    given doesn't match any of those.
    '''

    if len(header) < 12 or not header.startswith(b'FORM'):
        return ''

    # bytes 4..7 aren't checked: the format-specific tag is right after them
    if header.startswith((b'AIFF', b'AIFC'), 8):
        return 'audio/aiff'
    return ''


def match_ftyp(header: bytes) -> str:
    '''
    Handle a few special cases for func guess_mime: the result is the MIME
    type for the `ftyp` formats listed in ftyp_types, or an empty string
    when the header given doesn't match any of those.
    '''

    # first 4 bytes can be anything, next 4 bytes must be ASCII 'ftyp'
    if len(header) < 12 or not header.startswith(b'ftyp', 4):
        return ''

    # next 4 bytes after the ASCII 'ftyp' declare the data-format: markers
    # must both start at byte 8 and fit entirely before byte 12
    for marker, mime in ftyp_types:
        if header.startswith(marker, 8, 12):
            return mime

    # unrecognized MPEG-4-style data-format
    return ''


def guess_mime(header: bytes, fallback: str) -> str:
    '''
    Try to auto-detect common MIME-types, given the first few input bytes.
    The result is the fallback value given when the header is empty, or when
    nothing matches it.
    '''

    # no bytes, no match
    if not header:
        return fallback

    # check the RIFF formats, AIFF audio, and MPEG-4-like formats
    for match in (match_riff, match_form, match_ftyp):
        mime = match(header)
        if mime:
            return mime

    # maybe it's a bitmap picture, which almost always has 40 on 15th byte:
    # that byte is checked directly, so other bytes with the same value
    # right before it can't hide it; slicing avoids errors on short headers
    if header.startswith(b'BM') and header[14:15] == b'\x28':
        return 'image/x-bmp'

    # check general lookup-table
    for marker, mime in hdr_dispatch.get(header[0], ()):
        if exact_match(header, marker):
            return mime

    # only the first 64 bytes are searched for any of the markup hints
    start = header[:64]

    # doctype declarations are valid in any letter case
    if b'<!doctype html' in start.lower():
        return 'text/html'

    # try HTML, SVG, and even XML
    if b'<' in header[:8]:
        for marker, mime in xmlish_heuristics:
            if marker in start:
                return mime

    # try some common cases for JSON
    if any(pattern.match(header) for pattern in json_heuristics):
        return 'application/json'

    # nothing matched
    return fallback


def seems_url(s: str) -> bool:
    'Check if a string starts with any of the URI protocols supported.'

    protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return s.startswith(protocols)


def _read_full(r, size: int) -> bytes:
    '''
    Keep reading from the binary reader given until getting the number of
    bytes asked, or until the data end: results are only shorter than asked
    when the input is over, and are only empty when no data are left.
    '''

    chunk = r.read(size)
    if not chunk or len(chunk) >= size:
        return chunk

    # a single read can come up short on pipes and network responses
    parts = [chunk]
    got = len(chunk)
    while got < size:
        more = r.read(size - got)
        if not more:
            break
        parts.append(more)
        got += len(more)
    return b''.join(parts)


def handle_input(w, r, path: str, mime_fallback: str) -> None:
    '''
    Emit a whole input as a single-line base64-encoded data-URI.

    Parameter w is the binary writer for the output, r is the binary reader
    to encode, path is only used for error messages, and mime_fallback is
    the MIME type to use when auto-detection fails.

    Empty inputs result in empty lines. When the MIME type ends up being
    any of `no`, `nomime`, `no-mime`, `none`, or `not`, the output is just
    the base64 payload, without the leading data-URI declaration.

    A ValueError is raised when the MIME type can't be auto-detected and
    the fallback given is empty.
    '''

    # chunk-size must be a multiple of 3, so chunks before the last one
    # never end with base64 padding in the middle of the output
    chunk_size = 3 * 1024 * 1024
    chunk = _read_full(r, chunk_size)

    # empty data-streams result in empty lines
    if not chunk:
        w.write(b'\n')
        w.flush()
        return

    mime = guess_mime(chunk, mime_fallback)
    if not mime:
        raise ValueError(f'{path}: can\'t guess MIME-type')

    if mime not in ('no', 'nomime', 'no-mime', 'none', 'not'):
        w.write(b'data:')
        w.write(bytes(mime, encoding='utf-8'))
        w.write(b';base64,')

    # only the last chunk can be shorter than the chunk-size, since short
    # reads are topped up before encoding anything
    while chunk:
        w.write(b64encode(chunk))
        chunk = _read_full(r, chunk_size)

    w.write(b'\n')
    w.flush()

# fallback_aliases maps the shortcut names which can follow the fallback
# option into the full MIME types they stand for: values not found in this
# table are used as given
fallback_aliases = {
    # single-letter shortcuts
    'b': 'application/octet-stream',
    'j': 'application/json',
    't': 'text/plain',
    'u': 'text/plain; charset=UTF-8',

    # empty fallbacks make inputs of unknown type fail with an error
    'e': '',
    'err': '',
    'error': '',
    'f': '',
    'fail': '',

    # alternative names and spellings
    'aac': 'audio/aac',
    'aif': 'audio/aiff',
    'bin': 'application/octet-stream',
    'binary': 'application/octet-stream',
    'gzip': 'application/gzip',
    'midi': 'audio/midi',
    'mpeg': 'video/mpeg',
    'octet': 'application/octet-stream',
    'octetstream': 'application/octet-stream',
    'octet-stream': 'application/octet-stream',
    'plain': 'text/plain',
    'sqlite': 'application/x-sqlite3',
    'svg+xml': 'image/svg+xml',
    'tif': 'image/tiff',
    'utf8': 'text/plain; charset=UTF-8',
    'utf-8': 'text/plain; charset=UTF-8',
    'wave': 'audio/x-wav',
    'zstd': 'application/zstd',

    # names which look like common filename extensions
    'aiff': 'audio/aiff',
    'au': 'audio/basic',
    'avi': 'video/avi',
    'avif': 'image/avif',
    'bmp': 'image/x-bmp',
    'caf': 'audio/x-caf',
    'cur': 'image/vnd.microsoft.icon',
    'css': 'text/css',
    'csv': 'text/csv',
    'djvu': 'image/x-djvu',
    'elf': 'application/x-elf',
    'exe': 'application/vnd.microsoft.portable-executable',
    'flac': 'audio/x-flac',
    'gif': 'image/gif',
    'gz': 'application/gzip',
    'heic': 'image/heic',
    'htm': 'text/html',
    'html': 'text/html',
    'ico': 'image/x-icon',
    'iso': 'application/octet-stream',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'js': 'application/javascript',
    'json': 'application/json',
    'm4a': 'audio/aac',
    'm4v': 'video/x-m4v',
    'mid': 'audio/midi',
    'mov': 'video/quicktime',
    'mp4': 'video/mp4',
    'mp3': 'audio/mpeg',
    'mpg': 'video/mpeg',
    'ogg': 'audio/ogg',
    'opus': 'audio/opus',
    'pdf': 'application/pdf',
    'png': 'image/png',
    'ps': 'application/postscript',
    'psd': 'image/vnd.adobe.photoshop',
    'rtf': 'application/rtf',
    'sqlite3': 'application/x-sqlite3',
    'svg': 'image/svg+xml',
    'text': 'text/plain',
    'tiff': 'image/tiff',
    'tsv': 'text/tsv',
    'wasm': 'application/wasm',
    'wav': 'audio/x-wav',
    'webp': 'image/webp',
    'webm': 'video/webm',
    'xml': 'application/xml',
    'zip': 'application/zip',
    'zst': 'application/zstd',
}

try:
    # standard input can only be read once
    if argv.count('-') > 1:
        raise ValueError(
            'reading from `-` (standard input) more than once not allowed'
        )

    # the URL-related import is only paid for when actually needed
    if any(map(seems_url, argv)):
        from urllib.request import urlopen

    inputs = 0
    mime_fallback = ''
    change_fallback = False

    for arg in argv[1:]:
        if change_fallback:
            # the value right after the fallback option is the new fallback,
            # which only affects the inputs coming after it
            mime_fallback = fallback_aliases.get(arg, arg)
            change_fallback = False
        elif arg in ('-f', '--f', '-fallback', '--fallback'):
            change_fallback = True
        else:
            # anything else names an input: a dash, a URI, or a filename
            inputs += 1
            if arg == '-':
                handle_input(
                    stdout.buffer, stdin.buffer, '<stdin>', mime_fallback,
                )
            elif seems_url(arg):
                with urlopen(arg) as inp:
                    handle_input(stdout.buffer, inp, arg, mime_fallback)
            else:
                with open(arg, mode='rb') as inp:
                    handle_input(stdout.buffer, inp, arg, mime_fallback)

    # a trailing fallback option is missing its value
    if change_fallback:
        raise ValueError('forgot new fallback MIME-type')

    # standard input is the default when no inputs are named
    if inputs == 0:
        handle_input(stdout.buffer, stdin.buffer, '<stdin>', mime_fallback)
except BrokenPipeError:
    # quit quietly, instead of showing a confusing error message
    stderr.close()
    exit(0)
except KeyboardInterrupt:
    exit(2)
except Exception as e:
    # show the error message in red, then fail
    print(f'\x1b[31m{e}\x1b[0m', file=stderr)
    exit(1)