#!/usr/bin/python3
# The MIT License (MIT)
#
# Copyright © 2020-2025 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from base64 import b64encode
from re import compile as compile_re
from sys import argv, exit, stderr, stdin, stdout
# info is the help message: it's printed, after stripping its leading/trailing
# whitespace, when the first command-line arg is one of the help options
info = '''
datauri [options...] [filenames...]
Encode bytes as data-URIs, auto-detecting the file/data type using the first
few bytes from each data/file stream. When given multiple inputs, the output
will be multiple lines, one for each file given.
Empty files/inputs result in empty lines. A simple dash (-) stands for the
standard-input, which is also used automatically when not given any files.
Data-URIs are base64-encoded text representations of arbitrary data, which
include their payload's MIME-type, and which are directly useable/shareable
in web-browsers as links, despite not looking like normal links/URIs.
Some web-browsers limit the size of handled data-URIs to tens of kilobytes.
Options
-h, -help, --h, --help show this help message
-f, -fallback, --f, --fallback change the fallback MIME type
'''
# a leading help-option arg means show the help message and quit right away;
# having no args at all doesn't trigger this: as the help message itself says,
# the standard input is used when no files are given
if argv[1:] and argv[1] in {'--help', '-help', '--h', '-h'}:
    print(info.strip())
    raise SystemExit(0)
# hdr_dispatch groups (signature, MIME type) pairs by the first byte of each
# signature, so looking up some data header only involves the pairs sharing
# its first byte: the `ftyp` data formats are deliberately left out of this
# table, since parts of their first few bytes can hold any value
hdr_dispatch = {}
for _signature, _mime in (
    # leading-zero headers
    (b'\x00\x00\x01\xba', 'video/mpeg'),
    (b'\x00\x00\x01\xb3', 'video/mpeg'),
    (b'\x00\x00\x01\x00', 'image/x-icon'),
    (b'\x00\x00\x02\x00', 'image/vnd.microsoft.icon'),
    # this signature matches the general MKV format
    (b'\x1a\x45\xdf\xa3', 'video/webm'),
    # shebang lines
    (b'#! ', 'text/plain; charset=UTF-8'),
    (b'#!/', 'text/plain; charset=UTF-8'),
    (b'%PDF', 'application/pdf'),
    (b'%!PS', 'application/postscript'),
    (b'.snd', 'audio/basic'),
    (b'GIF87a', 'image/gif'),
    (b'GIF89a', 'image/gif'),
    # MP3 data can begin with an ID3 meta-data section
    (b'ID3\x02', 'audio/mpeg'),
    (b'ID3\x03', 'audio/mpeg'),
    (b'ID3\x04', 'audio/mpeg'),
    (b'II*\x00', 'image/tiff'),
    (b'MM\x00*', 'image/tiff'),
    (b'MThd', 'audio/midi'),
    (b'OggS', 'audio/ogg'),
    (b'caff\x00\x01\x00\x00', 'audio/x-caf'),
    (b'fLaC', 'audio/x-flac'),
    (b'\x89PNG\x0d\x0a\x1a\x0a', 'image/png'),
    (b'\xff\xd8\xff', 'image/jpeg'),
    # common ways for MP3 data to begin
    (b'\xff\xf3\x48\xc4\x00', 'audio/mpeg'),
    (b'\xff\xfb', 'audio/mpeg'),
):
    # indexing bytes gives an int, which is the key type of the table; pairs
    # sharing a first byte keep the order they're declared in
    hdr_dispatch.setdefault(_signature[0], []).append((_signature, _mime))
# don't leave the loop variables around as module-level names
del _signature, _mime
# ftyp_types helps func match_ftyp auto-detect MPEG-4-like formats: each pair
# has a 4-byte `ftyp` brand, followed by the MIME type used for it
ftyp_types = (
    (b'M4A ', 'audio/aac'),
    (b'M4A\x00', 'audio/aac'),
    (b'dash', 'audio/aac'),
    # `isom` could also be treated as audio/aac: video/mp4 is the chosen guess
    (b'isom', 'video/mp4'),
    (b'MSNV', 'video/mp4'),
    # the QuickTime brand is `qt` followed by 2 spaces, since all brands are
    # exactly 4 bytes long
    (b'qt  ', 'video/quicktime'),
    (b'heic', 'image/heic'),
    (b'avif', 'image/avif'),
)
# xmlish_heuristics helps func guess_mime auto-detect HTML, SVG, and XML
xmlish_heuristics = (
(b'', 'text/html'),
(b'', 'text/html'),
(b'
', 'text/html'),
(b'', 'image/svg+xml'),
(b'