datauri

     File: datauri.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2020-2025 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 from base64 import b64encode
  27 from re import compile as compile_re
  28 from sys import argv, exit, stderr, stdin, stdout
  29 
  30 
  31 info = '''
  32 datauri [options...] [filenames...]
  33 
  34 
  35 Encode bytes as data-URIs, auto-detecting the file/data type using the first
  36 few bytes from each data/file stream. When given multiple inputs, the output
  37 will be multiple lines, one for each file given.
  38 
  39 Empty files/inputs result in empty lines. A simple dash (-) stands for the
  40 standard-input, which is also used automatically when not given any files.
  41 
  42 Data-URIs are base64-encoded text representations of arbitrary data, which
  43 include their payload's MIME-type, and which are directly useable/shareable
  44 in web-browsers as links, despite not looking like normal links/URIs.
  45 
  46 Some web-browsers limit the size of handled data-URIs to tens of kilobytes.
  47 
  48 Options
  49 
  50     -h, -help, --h, --help              show this help message
  51     -f, -fallback, --f, --fallback      change the fallback MIME type
  52 '''
  53 
  54 # no args or a leading help-option arg means show the help message and quit
  55 if len(argv) > 1 and argv[1] in ('-h', '--h', '-help', '--help'):
  56     print(info.strip())
  57     exit(0)
  58 
  59 
  60 # hdr_dispatch groups format-description-groups by their first byte, thus
  61 # shortening total lookups for some data header: notice how the `ftyp` data
  62 # formats aren't handled here, since these can include any byte in parts of
  63 # their first few bytes
  64 hdr_dispatch = {
  65     0x00: [
  66         (b'\x00\x00\x01\xba', 'video/mpeg'),
  67         (b'\x00\x00\x01\xb3', 'video/mpeg'),
  68         (b'\x00\x00\x01\x00', 'image/x-icon'),
  69         (b'\x00\x00\x02\x00', 'image/vnd.microsoft.icon'),
  70         (b'\x00asm', 'application/wasm'),
  71     ],
  72     0x1a: [(b'\x1a\x45\xdf\xa3', 'video/webm')], # matches general MKV format
  73     0x1f: [(b'\x1f\x8b\x08', 'application/gzip')],
  74     0x23: [
  75         (b'#! ', 'text/plain; charset=UTF-8'),
  76         (b'#!/', 'text/plain; charset=UTF-8'),
  77     ],
  78     0x25: [
  79         (b'%PDF', 'application/pdf'),
  80         (b'%!PS', 'application/postscript'),
  81     ],
  82     0x2e: [(b'.snd', 'audio/basic')],
  83     0x47: [
  84         (b'GIF87a', 'image/gif'),
  85         (b'GIF89a', 'image/gif'),
  86     ],
  87     0x49: [
  88         # some MP3s start with an ID3 meta-data section
  89         (b'ID3\x02', 'audio/mpeg'),
  90         (b'ID3\x03', 'audio/mpeg'),
  91         (b'ID3\x04', 'audio/mpeg'),
  92         (b'II*\x00', 'image/tiff'),
  93     ],
  94     0x4d: [
  95         (b'MM\x00*', 'image/tiff'),
  96         (b'MThd', 'audio/midi'),
  97     ],
  98     0x4f: [(b'OggS', 'audio/ogg')],
  99     0x50: [(b'PK\x03\x04', 'application/zip')],
 100     0x63: [(b'caff\x00\x01\x00\x00', 'audio/x-caf')],
 101     0x66: [(b'fLaC', 'audio/x-flac')],
 102     0x7b: [(b'{\\rtf', 'application/rtf')],
 103     0x7f: [(b'\x7fELF', 'application/x-elf')],
 104     0x89: [(b'\x89PNG\x0d\x0a\x1a\x0a', 'image/png')],
 105     0xff: [
 106         (b'\xff\xd8\xff', 'image/jpeg'),
 107         # handle common ways MP3 data start
 108         (b'\xff\xf3\x48\xc4\x00', 'audio/mpeg'),
 109         (b'\xff\xfb', 'audio/mpeg'),
 110     ],
 111 }
 112 
 113 
# ftyp_types helps func match_ftyp auto-detect MPEG-4-like formats: each entry
# pairs the marker expected right after the ASCII `ftyp` tag (bytes 8..11 of
# the data) with the MIME type to report; entries are tried in order, and the
# first matching one wins
ftyp_types = (
    (b'M4A ', 'audio/aac'),
    (b'M4A\x00', 'audio/aac'),
    (b'mp42', 'video/x-m4v'),
    (b'dash', 'audio/aac'),
    (b'isom', 'video/mp4'),
    # disabled alternative mapping for the same marker
    # (b'isom', 'audio/aac'),
    (b'MSNV', 'video/mp4'),
    (b'qt  ', 'video/quicktime'),
    (b'heic', 'image/heic'),
    (b'avif', 'image/avif'),
)
 127 
# xmlish_heuristics helps func guess_mime auto-detect HTML, SVG, and XML:
# each marker is searched for among the first 64 bytes of the data, in the
# order given here, and the first marker found decides the MIME type; this
# is why the generic XML marker comes last, after the more specific ones
xmlish_heuristics = (
    (b'<html>', 'text/html'),
    (b'<html ', 'text/html'),
    (b'<head>', 'text/html'),
    (b'<body>', 'text/html'),
    (b'<!DOCTYPE html', 'text/html'),
    (b'<svg>', 'image/svg+xml'),
    (b'<svg ', 'image/svg+xml'),
    (b'<?xml', 'application/xml'),
)
 139 
 140 # json_heuristics helps func guess_mime auto-detect JSON via regexes:
 141 # it's not perfect, but it seems effective-enough in practice
 142 json_heuristics = (
 143     compile_re(b'''^\\s*\\{\\s*"'''),
 144     compile_re(b'''^\\s*\\{\\s*\\['''),
 145     compile_re(b'''^\\s*\\[\\s*"'''),
 146     compile_re(b'''^\\s*\\[\\s*\\{'''),
 147     compile_re(b'''^\\s*\\[\\s*\\['''),
 148 )
 149 
 150 
 151 def exact_match(header: bytes, maybe: bytes) -> bool:
 152     if len(header) < len(maybe):
 153         # not enough bytes to tell if input data match
 154         return False
 155     return all(x == y for x, y in zip(header, maybe))
 156 
 157 
 158 def match_riff(header: bytes) -> str:
 159     'Handle a few special cases for func guess_mime.'
 160 
 161     if len(header) < 12 or not header.startswith(b'RIFF'):
 162         return ''
 163 
 164     if header.find(b'WEBP', 8, 12) == 8:
 165         return 'image/webp'
 166     if header.find(b'WAVE', 8, 12) == 8:
 167         return 'audio/x-wav'
 168     if header.find(b'AVI ', 8, 12) == 8:
 169         return 'video/avi'
 170     return ''
 171 
 172 
 173 def match_form(header: bytes) -> str:
 174     'Handle a few special cases for func guess_mime.'
 175 
 176     if len(header) < 12 or not header.startswith(b'FORM'):
 177         return ''
 178 
 179     if header.find(b'AIFF', 8, 12) == 8:
 180         return 'audio/aiff'
 181     if header.find(b'AIFC', 8, 12) == 8:
 182         return 'audio/aiff'
 183     return ''
 184 
 185 
 186 def match_ftyp(header: bytes) -> str:
 187     'Handle a few special cases for func guess_mime.'
 188 
 189     # first 4 bytes can be anything, next 4 bytes must be ASCII 'ftyp'
 190     if len(header) < 12 or header.find(b'ftyp', 4, 8) != 4:
 191         return ''
 192 
 193     # next 4 bytes after the ASCII 'ftyp' declare the data-format
 194     for marker, mime in ftyp_types:
 195         if header.find(marker, 8, 12) == 8:
 196             return mime
 197 
 198     # unrecognized MPEG-4-style data-format
 199     return ''
 200 
 201 
 202 def guess_mime(header: bytes, fallback: str) -> str:
 203     'Try to auto-detect common MIME-types, given the first few input bytes.'
 204 
 205     # no bytes, no match
 206     if len(header) == 0:
 207         return fallback
 208 
 209     # check the RIFF formats, AIFF audio, and MPEG-4-like formats
 210     for f in (match_riff, match_form, match_ftyp):
 211         m = f(header)
 212         if m != '':
 213             return m
 214 
 215     # maybe it's a bitmap picture, which almost always has 40 on 15th byte
 216     if header.startswith(b'BM') and header.find(b'\x28', 8, 16) == 14:
 217         return 'image/x-bmp'
 218 
 219     # check general lookup-table
 220     if header[0] in hdr_dispatch:
 221         for maybe in hdr_dispatch[header[0]]:
 222             if exact_match(header, maybe[0]):
 223                 return maybe[1]
 224 
 225     if header.find(b'<!DOCTYPE html', 0, 64) >= 0:
 226         return 'text/html'
 227 
 228     # try HTML, SVG, and even XML
 229     if header.find(b'<', 0, 8) >= 0:
 230         for marker, mime in xmlish_heuristics:
 231             if header.find(marker, 0, 64) >= 0:
 232                 return mime
 233 
 234     # try some common cases for JSON
 235     for pattern in json_heuristics:
 236         if pattern.match(header):
 237             return 'application/json'
 238 
 239     # nothing matched
 240     return fallback
 241 
 242 
 243 def seems_url(s: str) -> bool:
 244     protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
 245     return any(s.startswith(p) for p in protocols)
 246 
 247 
 248 def handle_input(w, r, path: str, mime_fallback: str) -> None:
 249     chunk_size = 3 * 1024 * 1024
 250     chunk = r.read(chunk_size)
 251 
 252     # empty data-streams result in empty lines
 253     if not chunk:
 254         w.write(b'\n')
 255         w.flush()
 256         return
 257 
 258     mime = guess_mime(chunk, mime_fallback)
 259     if not mime:
 260         raise Exception(f'{path}: can\'t guess MIME-type')
 261 
 262     if not (mime in ('no', 'nomime', 'no-mime', 'none', 'not')):
 263         w.write(b'data:')
 264         w.write(bytes(mime, encoding='utf-8'))
 265         w.write(b';base64,')
 266 
 267     while True:
 268         w.write(b64encode(chunk))
 269         prev_size = len(chunk)
 270         chunk = r.read(chunk_size)
 271         if not chunk:
 272             break
 273         if prev_size % 3 != 0:
 274             raise ValueError('latest chunk-size isn\'t a multiple of 3')
 275 
 276     w.write(b'\n')
 277     w.flush()
 278 
 279 
 280 fallback_aliases = {
 281     # 'text/json': 'application/json',
 282 
 283     # 'xbmp': 'image/x-bmp',
 284     # 'xflac': 'audio/x-flac',
 285     # 'xicon': 'image/x-icon',
 286     # 'xm4v': 'video/x-m4v',
 287     # 'xsqlite3': 'application/x-sqlite3',
 288     # 'xwav': 'audio/x-wav',
 289     # 'xwave': 'audio/x-wav',
 290     # 'x-bmp': 'image/x-bmp',
 291     # 'x-flac': 'audio/x-flac',
 292     # 'x-icon': 'image/x-icon',
 293     # 'x-m4v': 'video/x-m4v',
 294     # 'x-sqlite3': 'application/x-sqlite3',
 295     # 'x-wav': 'audio/x-wav',
 296 
 297     'b': 'application/octet-stream',
 298     'j': 'application/json',
 299     't': 'text/plain',
 300     'u': 'text/plain; charset=UTF-8',
 301 
 302     'e': '',
 303     'err': '',
 304     'error': '',
 305     'f': '',
 306     'fail': '',
 307 
 308     'aac': 'audio/aac',
 309     'aif': 'audio/aiff',
 310     'bin': 'application/octet-stream',
 311     'binary': 'application/octet-stream',
 312     'gzip': 'application/gzip',
 313     'midi': 'audio/midi',
 314     'mpeg': 'video/mpeg',
 315     'octet': 'application/octet-stream',
 316     'octetstream': 'application/octet-stream',
 317     'octet-stream': 'application/octet-stream',
 318     'plain': 'text/plain',
 319     'sqlite': 'application/x-sqlite3',
 320     'svg+xml': 'image/svg+xml',
 321     'tif': 'image/tiff',
 322     'utf8': 'text/plain; charset=UTF-8',
 323     'utf-8': 'text/plain; charset=UTF-8',
 324     'wave': 'audio/x-wav',
 325     'zstd': 'application/zstd',
 326 
 327     'aiff': 'audio/aiff',
 328     'au': 'audio/basic',
 329     'avi': 'video/avi',
 330     'avif': 'image/avif',
 331     'bmp': 'image/x-bmp',
 332     'caf': 'audio/x-caf',
 333     'cur': 'image/vnd.microsoft.icon',
 334     'css': 'text/css',
 335     'csv': 'text/csv',
 336     'djvu': 'image/x-djvu',
 337     'elf': 'application/x-elf',
 338     'exe': 'application/vnd.microsoft.portable-executable',
 339     'flac': 'audio/x-flac',
 340     'gif': 'image/gif',
 341     'gz': 'application/gzip',
 342     'heic': 'image/heic',
 343     'htm': 'text/html',
 344     'html': 'text/html',
 345     'ico': 'image/x-icon',
 346     'iso': 'application/octet-stream',
 347     'jpg': 'image/jpeg',
 348     'jpeg': 'image/jpeg',
 349     'js': 'application/javascript',
 350     'json': 'application/json',
 351     'm4a': 'audio/aac',
 352     'm4v': 'video/x-m4v',
 353     'mid': 'audio/midi',
 354     'mov': 'video/quicktime',
 355     'mp4': 'video/mp4',
 356     'mp3': 'audio/mpeg',
 357     'mpg': 'video/mpeg',
 358     'ogg': 'audio/ogg',
 359     'opus': 'audio/opus',
 360     'pdf': 'application/pdf',
 361     'png': 'image/png',
 362     'ps': 'application/postscript',
 363     'psd': 'image/vnd.adobe.photoshop',
 364     'rtf': 'application/rtf',
 365     'sqlite3': 'application/x-sqlite3',
 366     'svg': 'image/svg+xml',
 367     'text': 'text/plain',
 368     'tiff': 'image/tiff',
 369     'tsv': 'text/tsv',
 370     'wasm': 'application/wasm',
 371     'wav': 'audio/x-wav',
 372     'webp': 'image/webp',
 373     'webm': 'video/webm',
 374     'xml': 'application/xml',
 375     'zip': 'application/zip',
 376     'zst': 'application/zstd',
 377 }
 378 
 379 try:
 380     if argv.count('-') > 1:
 381         msg = 'reading from `-` (standard input) more than once not allowed'
 382         raise ValueError(msg)
 383 
 384     if any(seems_url(e) for e in argv):
 385         from urllib.request import urlopen
 386 
 387     inputs = 0
 388     mime_fallback = ''
 389     # mime_fallback = 'application/octet-stream'
 390     change_fallback = False
 391 
 392     for arg in argv[1:]:
 393         if change_fallback:
 394             if arg in fallback_aliases:
 395                 arg = fallback_aliases[arg]
 396             mime_fallback = arg
 397             change_fallback = False
 398             continue
 399 
 400         if arg in ('-f', '--f', '-fallback', '--fallback'):
 401             change_fallback = True
 402             continue
 403 
 404         path = arg
 405         inputs += 1
 406 
 407         if path == '-':
 408             path = '<stdin>'
 409             handle_input(stdout.buffer, stdin.buffer, path, mime_fallback)
 410             continue
 411 
 412         if seems_url(path):
 413             with urlopen(path) as inp:
 414                 handle_input(stdout.buffer, inp, path, mime_fallback)
 415             continue
 416 
 417         with open(path, mode='rb') as inp:
 418             handle_input(stdout.buffer, inp, path, mime_fallback)
 419 
 420     if change_fallback:
 421         raise ValueError('forgot new fallback MIME-type')
 422 
 423     if inputs == 0:
 424         path = '<stdin>'
 425         handle_input(stdout.buffer, stdin.buffer, path, mime_fallback)
 426 except BrokenPipeError:
 427     # quit quietly, instead of showing a confusing error message
 428     stderr.close()
 429     exit(0)
 430 except KeyboardInterrupt:
 431     exit(2)
 432 except Exception as e:
 433     print(f'\x1b[31m{e}\x1b[0m', file=stderr)
 434     exit(1)