#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2020-2025 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from base64 import b64encode
from re import compile as compile_re
from sys import argv, exit, stderr, stdin, stdout


# info is the help message shown by the help options
info = '''
datauri [options...] [filenames...]


Encode bytes as data-URIs, auto-detecting the file/data type using the first
few bytes from each data/file stream. When given multiple inputs, the output
will be multiple lines, one for each file given.

Empty files/inputs result in empty lines. A simple dash (-) stands for the
standard-input, which is also used automatically when not given any files.

Data-URIs are base64-encoded text representations of arbitrary data, which
include their payload's MIME-type, and which are directly useable/shareable
in web-browsers as links, despite not looking like normal links/URIs.

Some web-browsers limit the size of handled data-URIs to tens of kilobytes.


Options

    -h, -help, --h, --help              show this help message
    -f, -fallback, --f, --fallback      change the fallback MIME type
'''

# a leading help-option arg means show the help message and quit; running
# without any args doesn't show the help, since that reads standard input
if len(argv) > 1 and argv[1] in ('-h', '--h', '-help', '--help'):
    print(info.strip())
    exit(0)


# hdr_dispatch groups format-description-groups by their first byte, thus
# shortening total lookups for some data header: notice how the `ftyp` data
# formats aren't handled here, since these can include any byte in parts of
# their first few bytes
hdr_dispatch = {
    0x00: [
        (b'\x00\x00\x01\xba', 'video/mpeg'),
        (b'\x00\x00\x01\xb3', 'video/mpeg'),
        (b'\x00\x00\x01\x00', 'image/x-icon'),
        (b'\x00\x00\x02\x00', 'image/vnd.microsoft.icon'),
    ],
    0x1a: [(b'\x1a\x45\xdf\xa3', 'video/webm')],  # matches general MKV format
    0x23: [
        (b'#! ', 'text/plain; charset=UTF-8'),
        (b'#!/', 'text/plain; charset=UTF-8'),
    ],
    0x25: [(b'%PDF', 'application/pdf'), (b'%!PS', 'application/postscript')],
    0x2e: [(b'.snd', 'audio/basic')],
    0x47: [(b'GIF87a', 'image/gif'), (b'GIF89a', 'image/gif')],
    0x49: [
        # some MP3s start with an ID3 meta-data section
        (b'ID3\x02', 'audio/mpeg'),
        (b'ID3\x03', 'audio/mpeg'),
        (b'ID3\x04', 'audio/mpeg'),
        (b'II*\x00', 'image/tiff'),
    ],
    0x4d: [(b'MM\x00*', 'image/tiff'), (b'MThd', 'audio/midi')],
    0x4f: [(b'OggS', 'audio/ogg')],
    0x63: [(b'caff\x00\x01\x00\x00', 'audio/x-caf')],
    0x66: [(b'fLaC', 'audio/x-flac')],
    0x89: [(b'\x89PNG\x0d\x0a\x1a\x0a', 'image/png')],
    0xff: [
        (b'\xff\xd8\xff', 'image/jpeg'),
        # handle common ways MP3 data start
        (b'\xff\xf3\x48\xc4\x00', 'audio/mpeg'),
        (b'\xff\xfb', 'audio/mpeg'),
    ],
}

# ftyp_types helps func match_ftyp auto-detect MPEG-4-like formats; each
# marker is looked for right after the ASCII `ftyp` tag, starting at byte 8
ftyp_types = (
    (b'M4A ', 'audio/aac'),
    (b'M4A\x00', 'audio/aac'),
    (b'dash', 'audio/aac'),
    (b'isom', 'video/mp4'),
    # (b'isom', 'audio/aac'),
    (b'MSNV', 'video/mp4'),
    (b'qt ', 'video/quicktime'),
    (b'heic', 'image/heic'),
    (b'avif', 'image/avif'),
)

# xmlish_heuristics helps func guess_mime auto-detect HTML, SVG, and XML
# NOTE(review): the markers below were mangled in the reviewed copy of this
# file (everything between angle brackets had been stripped); the entries
# were rebuilt from what was left and from how guess_mime uses them, so
# confirm them against the upstream script
xmlish_heuristics = (
    (b'<html>', 'text/html'),
    (b'<head>', 'text/html'),
    (b'<body>', 'text/html'),
    (b'<svg>', 'image/svg+xml'),
    (b'<svg ', 'image/svg+xml'),
    (b'<?xml', 'application/xml'),
)

# json_heuristics helps func guess_mime auto-detect JSON by matching how
# the data start
# NOTE(review): the original patterns were lost along with the markers
# above; these are a reconstruction, to be confirmed against upstream
json_heuristics = (
    compile_re(rb'^\s*\{\s*"'),
    compile_re(rb'^\s*\{\s*\}'),
    compile_re(rb'^\s*\[\s*"'),
    compile_re(rb'^\s*\[\s*\{'),
    compile_re(rb'^\s*\[\s*\['),
    compile_re(rb'^\s*\[\s*\]'),
    compile_re(rb'^\s*\[\s*-?[0-9]'),
)


def exact_match(header: bytes, maybe: bytes) -> bool:
    '''
    Check if the header starts with all the bytes given: headers shorter
    than the bytes to match never match, as there aren't enough bytes to
    tell.
    '''

    return header.startswith(maybe)


def match_riff(header: bytes) -> str:
    '''
    Handle the RIFF-container special cases for func guess_mime: the result
    is the MIME type detected, or an empty string when nothing matches.
    '''

    if len(header) < 12 or not header.startswith(b'RIFF'):
        return ''

    # bytes 8..11 name the specific format stored in the RIFF container
    kinds = {
        b'WEBP': 'image/webp',
        b'WAVE': 'audio/x-wav',
        b'AVI ': 'video/avi',
    }
    return kinds.get(header[8:12], '')


def match_form(header: bytes) -> str:
    '''
    Handle the FORM-container (AIFF audio) special cases for func guess_mime:
    the result is the MIME type detected, or an empty string when nothing
    matches.
    '''

    if len(header) < 12 or not header.startswith(b'FORM'):
        return ''

    if header[8:12] in (b'AIFF', b'AIFC'):
        return 'audio/aiff'
    return ''


def match_ftyp(header: bytes) -> str:
    '''
    Handle the MPEG-4-like special cases for func guess_mime: the result is
    the MIME type detected, or an empty string when nothing matches.
    '''

    # first 4 bytes can be anything, next 4 bytes must be ASCII 'ftyp'
    if len(header) < 12 or header[4:8] != b'ftyp':
        return ''

    # next 4 bytes after the ASCII 'ftyp' declare the data-format: markers
    # are matched as prefixes of those 4 bytes, so shorter markers work too
    brand = header[8:12]
    for marker, mime in ftyp_types:
        if brand.startswith(marker):
            return mime

    # unrecognized MPEG-4-style data-format
    return ''


def guess_mime(header: bytes, fallback: str) -> str:
    '''
    Try to auto-detect common MIME-types, given the first few input bytes:
    the fallback given is the result when nothing matches, or when there
    are no bytes to check.
    '''

    # no bytes, no match
    if len(header) == 0:
        return fallback

    # check the RIFF formats, AIFF audio, and MPEG-4-like formats
    for f in (match_riff, match_form, match_ftyp):
        m = f(header)
        if m != '':
            return m

    # maybe it's a bitmap picture, which almost always has 40 on 15th byte;
    # the byte is checked directly, so a 40 among the bytes right before it
    # can't prevent the match
    if header.startswith(b'BM') and header[14:15] == b'\x28':
        return 'image/x-bmp'

    # check general lookup-table
    if header[0] in hdr_dispatch:
        for maybe in hdr_dispatch[header[0]]:
            if exact_match(header, maybe[0]):
                return maybe[1]

    # NOTE(review): this check was mangled in the reviewed copy of this file;
    # only its `text/html` result survived, so the doctype test is rebuilt
    # as a case-insensitive search near the start of the data
    if header[:64].lower().find(b'<!doctype html') >= 0:
        return 'text/html'

    # try HTML, SVG, and even XML
    if header.find(b'<', 0, 8) >= 0:
        for marker, mime in xmlish_heuristics:
            if header.find(marker, 0, 64) >= 0:
                return mime

    # try some common cases for JSON
    for pattern in json_heuristics:
        if pattern.match(header):
            return 'application/json'

    # nothing matched
    return fallback


def seems_url(s: str) -> bool:
    'Check if a string starts with one of the URI protocols supported.'

    protocols = ('https://', 'http://', 'file://', 'ftp://', 'data:')
    return s.startswith(protocols)


def handle_input(w, r, path: str, mime_fallback: str) -> None:
    '''
    Emit a whole input as a single data-URI line.

    Parameter w is the binary output to write/flush, r is the binary input
    to read from, path is only used for the error message, and mime_fallback
    is the MIME type to use when auto-detection fails.

    Empty inputs result in empty lines. An exception is raised when the MIME
    type can't be guessed and the fallback is empty. Fallbacks such as `none`
    emit just the base64 payload, without the leading `data:` part.
    '''

    chunk_size = 3 * 1024 * 1024
    chunk = r.read(chunk_size)

    # empty data-streams result in empty lines
    if not chunk:
        w.write(b'\n')
        w.flush()
        return

    mime = guess_mime(chunk, mime_fallback)
    if not mime:
        raise Exception(f'{path}: can\'t guess MIME-type')

    if mime not in ('no', 'nomime', 'no-mime', 'none', 'not'):
        w.write(b'data:')
        w.write(mime.encode('utf-8'))
        w.write(b';base64,')

    # base64 payloads can only be concatenated when each encoded piece has
    # a multiple of 3 bytes: readers can return fewer bytes than asked, so
    # any trailing bytes are carried over to the next round, instead of
    # failing after having already emitted padded output
    carry = b''
    while chunk:
        data = carry + chunk
        cut = len(data) - len(data) % 3
        if cut > 0:
            w.write(b64encode(data[:cut]))
        carry = data[cut:]
        chunk = r.read(chunk_size)

    # only the very end of the payload can be padded
    if carry:
        w.write(b64encode(carry))

    w.write(b'\n')
    w.flush()


# fallback_aliases lets the fallback option use short names, instead of
# full MIME types: aliases for the empty string mean failing when the MIME
# type can't be auto-detected
fallback_aliases = {
    # 'text/json': 'application/json',

    # 'xbmp': 'image/x-bmp',
    # 'xflac': 'audio/x-flac',
    # 'xicon': 'image/x-icon',
    # 'xm4v': 'video/x-m4v',
    # 'xsqlite3': 'application/x-sqlite3',
    # 'xwav': 'audio/x-wav',
    # 'xwave': 'audio/x-wav',
    # 'x-bmp': 'image/x-bmp',
    # 'x-flac': 'audio/x-flac',
    # 'x-icon': 'image/x-icon',
    # 'x-m4v': 'video/x-m4v',
    # 'x-sqlite3': 'application/x-sqlite3',
    # 'x-wav': 'audio/x-wav',

    'b': 'application/octet-stream',
    'j': 'application/json',
    't': 'text/plain',
    'u': 'text/plain; charset=UTF-8',

    'e': '',
    'err': '',
    'error': '',
    'f': '',
    'fail': '',

    'aac': 'audio/aac',
    'aif': 'audio/aiff',
    'bin': 'application/octet-stream',
    'binary': 'application/octet-stream',
    'gzip': 'application/gzip',
    'midi': 'audio/midi',
    'mpeg': 'video/mpeg',
    'octet': 'application/octet-stream',
    'octetstream': 'application/octet-stream',
    'octet-stream': 'application/octet-stream',
    'plain': 'text/plain',
    'sqlite': 'application/x-sqlite3',
    'svg+xml': 'image/svg+xml',
    'tif': 'image/tiff',
    'utf8': 'text/plain; charset=UTF-8',
    'utf-8': 'text/plain; charset=UTF-8',
    'wave': 'audio/x-wav',
    'zstd': 'application/zstd',

    'aiff': 'audio/aiff',
    'au': 'audio/basic',
    'avi': 'video/avi',
    'avif': 'image/avif',
    'bmp': 'image/x-bmp',
    'caf': 'audio/x-caf',
    'cur': 'image/vnd.microsoft.icon',
    'css': 'text/css',
    'csv': 'text/csv',
    'djvu': 'image/x-djvu',
    'elf': 'application/x-elf',
    'exe': 'application/vnd.microsoft.portable-executable',
    'flac': 'audio/x-flac',
    'gif': 'image/gif',
    'gz': 'application/gzip',
    'heic': 'image/heic',
    'htm': 'text/html',
    'html': 'text/html',
    'ico': 'image/x-icon',
    'iso': 'application/octet-stream',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'js': 'application/javascript',
    'json': 'application/json',
    'm4a': 'audio/aac',
    'm4v': 'video/x-m4v',
    'mid': 'audio/midi',
    'mov': 'video/quicktime',
    'mp4': 'video/mp4',
    'mp3': 'audio/mpeg',
    'mpg': 'video/mpeg',
    'ogg': 'audio/ogg',
    'opus': 'audio/opus',
    'pdf': 'application/pdf',
    'png': 'image/png',
    'ps': 'application/postscript',
    'psd': 'image/vnd.adobe.photoshop',
    'rtf': 'application/rtf',
    'sqlite3': 'application/x-sqlite3',
    'svg': 'image/svg+xml',
    'text': 'text/plain',
    'tiff': 'image/tiff',
    'tsv': 'text/tsv',
    'wasm': 'application/wasm',
    'wav': 'audio/x-wav',
    'webp': 'image/webp',
    'webm': 'video/webm',
    'xml': 'application/xml',
    'zip': 'application/zip',
    'zst': 'application/zstd',
}


def main() -> None:
    '''
    Run the command-line app: each input named in the args becomes a line
    of output, and standard input is used when no inputs are named. The
    exit code is 0 on success or broken pipe, 1 on errors, and 2 when
    interrupted from the keyboard.
    '''

    try:
        if argv.count('-') > 1:
            msg = 'reading from `-` (standard input) more than once not allowed'
            raise ValueError(msg)

        # only pay for loading the URL-related code when it's needed
        if any(seems_url(e) for e in argv):
            from urllib.request import urlopen

        inputs = 0
        # an empty fallback means failing when the MIME type can't be guessed
        mime_fallback = ''
        change_fallback = False

        for arg in argv[1:]:
            # the arg right after a fallback option is the new fallback,
            # either as an alias or as a full MIME type
            if change_fallback:
                mime_fallback = fallback_aliases.get(arg, arg)
                change_fallback = False
                continue

            if arg in ('-f', '--f', '-fallback', '--fallback'):
                change_fallback = True
                continue

            path = arg
            inputs += 1

            if path == '-':
                # NOTE(review): the name used for error messages was an empty
                # string in the reviewed copy of this file, seemingly lost the
                # same way other angle-bracketed text was; confirm upstream
                path = '<stdin>'
                handle_input(stdout.buffer, stdin.buffer, path, mime_fallback)
                continue

            if seems_url(path):
                with urlopen(path) as inp:
                    handle_input(stdout.buffer, inp, path, mime_fallback)
                continue

            with open(path, mode='rb') as inp:
                handle_input(stdout.buffer, inp, path, mime_fallback)

        if change_fallback:
            raise ValueError('forgot new fallback MIME-type')

        # standard input is the default when no inputs are named
        if inputs == 0:
            path = '<stdin>'
            handle_input(stdout.buffer, stdin.buffer, path, mime_fallback)
    except BrokenPipeError:
        # quit quietly, instead of showing a confusing error message
        stderr.close()
        exit(0)
    except KeyboardInterrupt:
        exit(2)
    except Exception as e:
        print(f'\x1b[31m{e}\x1b[0m', file=stderr)
        exit(1)


if __name__ == '__main__':
    main()