File: datauri.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2024 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 from base64 import b64encode
  27 from re import compile as compile_re
  28 from sys import argv, exit, stderr, stdin, stdout
  29 
  30 
# info is the help message: its stripped text is printed when the first
# command-line argument is one of the help options
info = '''
datauri [options...] [filenames...]


Encode bytes as data-URIs, auto-detecting the file/data type using the first
few bytes from each data/file stream. When given multiple inputs, the output
will be multiple lines, one for each file given.

Empty files/inputs result in empty lines. A simple dash (-) stands for the
standard-input, which is also used automatically when not given any files.

Data-URIs are base64-encoded text representations of arbitrary data, which
include their payload's MIME-type, and which are directly useable/shareable
in web-browsers as links, despite not looking like normal links/URIs.

Some web-browsers limit the size of handled data-URIs to tens of kilobytes.

Options

    -h, -help, --h, --help              show this help message
    -f, -fallback, --f, --fallback      change the fallback MIME type
'''
  53 
# a leading help-option arg means show the help message and quit; running
# without any args reads from the standard input instead
  55 if len(argv) > 1 and argv[1] in ('-h', '--h', '-help', '--help'):
  56     print(info.strip())
  57     exit(0)
  58 
  59 
  60 # hdr_dispatch groups format-description-groups by their first byte, thus
  61 # shortening total lookups for some data header: notice how the `ftyp` data
  62 # formats aren't handled here, since these can include any byte in parts of
  63 # their first few bytes
  64 hdr_dispatch = {
  65     0x00: [
  66         (b'\x00\x00\x01\xba', 'video/mpeg'),
  67         (b'\x00\x00\x01\xb3', 'video/mpeg'),
  68         (b'\x00\x00\x01\x00', 'image/x-icon'),
  69         (b'\x00\x00\x02\x00', 'image/vnd.microsoft.icon'),
  70     ],
  71     0x1a: [(b'\x1a\x45\xdf\xa3', 'video/webm')], # matches general MKV format
  72     0x23: [
  73         (b'#! ', 'text/plain; charset=UTF-8'),
  74         (b'#!/', 'text/plain; charset=UTF-8'),
  75     ],
  76     0x25: [(b'%PDF', 'application/pdf'), (b'%!PS', 'application/postscript')],
  77     0x2e: [(b'.snd', 'audio/basic')],
  78     0x47: [(b'GIF87a', 'image/gif'), (b'GIF89a', 'image/gif')],
  79     0x49: [
  80         # some MP3s start with an ID3 meta-data section
  81         (b'ID3\x02', 'audio/mpeg'),
  82         (b'ID3\x03', 'audio/mpeg'),
  83         (b'ID3\x04', 'audio/mpeg'),
  84         (b'II*\x00', 'image/tiff'),
  85     ],
  86     0x4d: [(b'MM\x00*', 'image/tiff'), (b'MThd', 'audio/midi')],
  87     0x4f: [(b'OggS', 'audio/ogg')],
  88     0x63: [(b'caff\x00\x01\x00\x00', 'audio/x-caf')],
  89     0x66: [(b'fLaC', 'audio/x-flac')],
  90     0x89: [(b'\x89PNG\x0d\x0a\x1a\x0a', 'image/png')],
  91     0xff: [
  92         (b'\xff\xd8\xff', 'image/jpeg'),
  93         # handle common ways MP3 data start
  94         (b'\xff\xf3\x48\xc4\x00', 'audio/mpeg'),
  95         (b'\xff\xfb', 'audio/mpeg'),
  96     ],
  97 }
  98 
  99 
 100 # ftyp_types helps func match_ftyp auto-detect MPEG-4-like formats
 101 ftyp_types = (
 102     (b'M4A ', 'audio/aac'),
 103     (b'M4A\x00', 'audio/aac'),
 104     (b'dash', 'audio/aac'),
 105     (b'isom', 'video/mp4'),
 106     # (b'isom', 'audio/aac'),
 107     (b'MSNV', 'video/mp4'),
 108     (b'qt  ', 'video/quicktime'),
 109     (b'heic', 'image/heic'),
 110     (b'avif', 'image/avif'),
 111 )
 112 
 113 # xmlish_heuristics helps func guess_mime auto-detect HTML, SVG, and XML
 114 xmlish_heuristics = (
 115     (b'<html>', 'text/html'),
 116     (b'<html ', 'text/html'),
 117     (b'<head>', 'text/html'),
 118     (b'<body>', 'text/html'),
 119     (b'<!DOCTYPE html', 'text/html'),
 120     (b'<svg>', 'image/svg+xml'),
 121     (b'<svg ', 'image/svg+xml'),
 122     (b'<?xml', 'application/xml'),
 123 )
 124 
 125 # json_heuristics helps func guess_mime auto-detect JSON via regexes:
 126 # it's not perfect, but it seems effective-enough in practice
 127 json_heuristics = (
 128     compile_re(b'''^\\s*\\{\\s*"'''),
 129     compile_re(b'''^\\s*\\{\\s*\\['''),
 130     compile_re(b'''^\\s*\\[\\s*"'''),
 131     compile_re(b'''^\\s*\\[\\s*\\{'''),
 132     compile_re(b'''^\\s*\\[\\s*\\['''),
 133 )
 134 
 135 
 136 def exact_match(header: bytes, maybe: bytes) -> bool:
 137     if len(header) < len(maybe):
 138         # not enough bytes to tell if input data match
 139         return False
 140     return all(x == y for x, y in zip(header, maybe))
 141 
 142 
 143 def match_riff(header: bytes) -> str:
 144     'Handle a few special cases for func guess_mime.'
 145 
 146     if len(header) < 12 or not header.startswith(b'RIFF'):
 147         return ''
 148 
 149     if header.find(b'WEBP', 8, 12) == 8:
 150         return 'image/webp'
 151     if header.find(b'WAVE', 8, 12) == 8:
 152         return 'audio/x-wav'
 153     if header.find(b'AVI ', 8, 12) == 8:
 154         return 'video/avi'
 155     return ''
 156 
 157 
 158 def match_form(header: bytes) -> str:
 159     'Handle a few special cases for func guess_mime.'
 160 
 161     if len(header) < 12 or not header.startswith(b'FORM'):
 162         return ''
 163 
 164     if header.find(b'AIFF', 8, 12) == 8:
 165         return 'audio/aiff'
 166     if header.find(b'AIFC', 8, 12) == 8:
 167         return 'audio/aiff'
 168     return ''
 169 
 170 
 171 def match_ftyp(header: bytes) -> str:
 172     'Handle a few special cases for func guess_mime.'
 173 
 174     # first 4 bytes can be anything, next 4 bytes must be ASCII 'ftyp'
 175     if len(header) < 12 or header.find(b'ftyp', 4, 8) != 4:
 176         return ''
 177 
 178     # next 4 bytes after the ASCII 'ftyp' declare the data-format
 179     for marker, mime in ftyp_types:
 180         if header.find(marker, 8, 12) == 8:
 181             return mime
 182 
 183     # unrecognized MPEG-4-style data-format
 184     return ''
 185 
 186 
 187 def guess_mime(header: bytes, fallback: str) -> str:
 188     'Try to auto-detect common MIME-types, given the first few input bytes.'
 189 
 190     # no bytes, no match
 191     if len(header) == 0:
 192         return fallback
 193 
 194     # check the RIFF formats, AIFF audio, and MPEG-4-like formats
 195     for f in (match_riff, match_form, match_ftyp):
 196         m = f(header)
 197         if m != '':
 198             return m
 199 
 200     # maybe it's a bitmap picture, which almost always has 40 on 15th byte
 201     if header.startswith(b'BM') and header.find(b'\x28', 8, 16) == 14:
 202         return 'image/x-bmp'
 203 
 204     # check general lookup-table
 205     if header[0] in hdr_dispatch:
 206         for maybe in hdr_dispatch[header[0]]:
 207             if exact_match(header, maybe[0]):
 208                 return maybe[1]
 209 
 210     if header.find(b'<!DOCTYPE html', 0, 64) >= 0:
 211         return 'text/html'
 212 
 213     # try HTML, SVG, and even XML
 214     if header.find(b'<', 0, 8) >= 0:
 215         for marker, mime in xmlish_heuristics:
 216             if header.find(marker, 0, 64) >= 0:
 217                 return mime
 218 
 219     # try some common cases for JSON
 220     for pattern in json_heuristics:
 221         if pattern.match(header):
 222             return 'application/json'
 223 
 224     # nothing matched
 225     return fallback
 226 
 227 
 228 def seems_url(s: str) -> bool:
 229     protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
 230     return any(s.startswith(p) for p in protocols)
 231 
 232 
 233 def handle_input(w, r, path: str, mime_fallback: str) -> None:
 234     chunk_size = 3 * 1024 * 1024
 235     chunk = r.read(chunk_size)
 236 
 237     # empty data-streams result in empty lines
 238     if not chunk:
 239         w.write(b'\n')
 240         w.flush()
 241         return
 242 
 243     mime = guess_mime(chunk, mime_fallback)
 244     if not mime:
 245         raise Exception(f'{path}: can\'t guess MIME-type')
 246 
 247     if not (mime in ('no', 'nomime', 'no-mime', 'none', 'not')):
 248         w.write(b'data:')
 249         w.write(bytes(mime, encoding='utf-8'))
 250         w.write(b';base64,')
 251 
 252     while True:
 253         w.write(b64encode(chunk))
 254         prev_size = len(chunk)
 255         chunk = r.read(chunk_size)
 256         if not chunk:
 257             break
 258         if prev_size % 3 != 0:
 259             raise ValueError('latest chunk-size isn\'t a multiple of 3')
 260 
 261     w.write(b'\n')
 262     w.flush()
 263 
 264 
# fallback_aliases maps the shortcut names which can follow the fallback
# option to the MIME-types they stand for: values not found in this table
# are used as given
fallback_aliases = {
    # aliases currently disabled
    # 'text/json': 'application/json',

    # 'xbmp': 'image/x-bmp',
    # 'xflac': 'audio/x-flac',
    # 'xicon': 'image/x-icon',
    # 'xm4v': 'video/x-m4v',
    # 'xsqlite3': 'application/x-sqlite3',
    # 'xwav': 'audio/x-wav',
    # 'xwave': 'audio/x-wav',
    # 'x-bmp': 'image/x-bmp',
    # 'x-flac': 'audio/x-flac',
    # 'x-icon': 'image/x-icon',
    # 'x-m4v': 'video/x-m4v',
    # 'x-sqlite3': 'application/x-sqlite3',
    # 'x-wav': 'audio/x-wav',

    # single-letter shortcuts for the most general types
    'b': 'application/octet-stream',
    'j': 'application/json',
    't': 'text/plain',
    'u': 'text/plain; charset=UTF-8',

    # these result in an empty fallback, which makes func handle_input fail
    # with an error when it can't auto-detect a MIME-type
    'e': '',
    'err': '',
    'error': '',
    'f': '',
    'fail': '',

    # alternative names and spellings
    'aac': 'audio/aac',
    'aif': 'audio/aiff',
    'bin': 'application/octet-stream',
    'binary': 'application/octet-stream',
    'gzip': 'application/gzip',
    'midi': 'audio/midi',
    'mpeg': 'video/mpeg',
    'octet': 'application/octet-stream',
    'octetstream': 'application/octet-stream',
    'octet-stream': 'application/octet-stream',
    'plain': 'text/plain',
    'sqlite': 'application/x-sqlite3',
    'svg+xml': 'image/svg+xml',
    'tif': 'image/tiff',
    'utf8': 'text/plain; charset=UTF-8',
    'utf-8': 'text/plain; charset=UTF-8',
    'wave': 'audio/x-wav',
    'zstd': 'application/zstd',

    # names which look like common filename extensions
    'aiff': 'audio/aiff',
    'au': 'audio/basic',
    'avi': 'video/avi',
    'avif': 'image/avif',
    'bmp': 'image/x-bmp',
    'caf': 'audio/x-caf',
    'cur': 'image/vnd.microsoft.icon',
    'css': 'text/css',
    'csv': 'text/csv',
    'djvu': 'image/x-djvu',
    'elf': 'application/x-elf',
    'exe': 'application/vnd.microsoft.portable-executable',
    'flac': 'audio/x-flac',
    'gif': 'image/gif',
    'gz': 'application/gzip',
    'heic': 'image/heic',
    'htm': 'text/html',
    'html': 'text/html',
    'ico': 'image/x-icon',
    'iso': 'application/octet-stream',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'js': 'application/javascript',
    'json': 'application/json',
    'm4a': 'audio/aac',
    'm4v': 'video/x-m4v',
    'mid': 'audio/midi',
    'mov': 'video/quicktime',
    'mp4': 'video/mp4',
    'mp3': 'audio/mpeg',
    'mpg': 'video/mpeg',
    'ogg': 'audio/ogg',
    'opus': 'audio/opus',
    'pdf': 'application/pdf',
    'png': 'image/png',
    'ps': 'application/postscript',
    'psd': 'image/vnd.adobe.photoshop',
    'rtf': 'application/rtf',
    'sqlite3': 'application/x-sqlite3',
    'svg': 'image/svg+xml',
    'text': 'text/plain',
    'tiff': 'image/tiff',
    # NOTE(review): the IANA-registered name seems to be
    # `text/tab-separated-values` -- confirm `text/tsv` is what's intended
    'tsv': 'text/tsv',
    'wasm': 'application/wasm',
    'wav': 'audio/x-wav',
    'webp': 'image/webp',
    'webm': 'video/webm',
    'xml': 'application/xml',
    'zip': 'application/zip',
    'zst': 'application/zstd',
}
 363 
 364 try:
 365     if argv.count('-') > 1:
 366         msg = 'reading from `-` (standard input) more than once not allowed'
 367         raise ValueError(msg)
 368 
 369     if any(seems_url(e) for e in argv):
 370         from urllib.request import urlopen
 371 
 372     inputs = 0
 373     mime_fallback = ''
 374     # mime_fallback = 'application/octet-stream'
 375     change_fallback = False
 376 
 377     for arg in argv[1:]:
 378         if change_fallback:
 379             if arg in fallback_aliases:
 380                 arg = fallback_aliases[arg]
 381             mime_fallback = arg
 382             change_fallback = False
 383             continue
 384 
 385         if arg in ('-f', '--f', '-fallback', '--fallback'):
 386             change_fallback = True
 387             continue
 388 
 389         path = arg
 390         inputs += 1
 391 
 392         if path == '-':
 393             path = '<stdin>'
 394             handle_input(stdout.buffer, stdin.buffer, path, mime_fallback)
 395             continue
 396 
 397         if seems_url(path):
 398             with urlopen(path) as inp:
 399                 handle_input(stdout.buffer, inp, path, mime_fallback)
 400             continue
 401 
 402         with open(path, mode='rb') as inp:
 403             handle_input(stdout.buffer, inp, path, mime_fallback)
 404 
 405     if change_fallback:
 406         raise ValueError('forgot new fallback MIME-type')
 407 
 408     if inputs == 0:
 409         path = '<stdin>'
 410         handle_input(stdout.buffer, stdin.buffer, path, mime_fallback)
 411 except BrokenPipeError:
 412     # quit quietly, instead of showing a confusing error message
 413     stderr.close()
 414     exit(0)
 415 except KeyboardInterrupt:
 416     # stderr.close()
 417     exit(2)
 418 except Exception as e:
 419     print(f'\x1b[31m{e}\x1b[0m', file=stderr)
 420     exit(1)