#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2020-2025 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from base64 import b64encode
from re import compile as compile_re
from sys import argv, exit, stderr, stdin, stdout


# info is the help message: it's shown, stripped of its leading/trailing
# whitespace, when the first command-line argument is one of the help options
info = '''
datauri [options...] [filenames...]


Encode bytes as data-URIs, auto-detecting the file/data type using the first
few bytes from each data/file stream. When given multiple inputs, the output
will be multiple lines, one for each file given.

Empty files/inputs result in empty lines. A simple dash (-) stands for the
standard-input, which is also used automatically when not given any files.

Data-URIs are base64-encoded text representations of arbitrary data, which
include their payload's MIME-type, and which are directly useable/shareable
in web-browsers as links, despite not looking like normal links/URIs.

Some web-browsers limit the size of handled data-URIs to tens of kilobytes.

Options

    -h, -help, --h, --help              show this help message
    -f, -fallback, --f, --fallback      change the fallback MIME type
'''

# a leading help-option arg means show the help message and quit right away;
# running with no args at all doesn't show the help message, since standard
# input is read in that case instead
if len(argv) > 1 and argv[1] in ('-h', '--h', '-help', '--help'):
    print(info.strip())
    exit(0)


# hdr_dispatch groups format-description-groups by their first byte, thus
# shortening total lookups for some data header: each key is the first byte
# of the data, and each value is a list of (leading-bytes, MIME-type) pairs
# which func guess_mime tries in order via func exact_match, the first match
# winning; notice how the `ftyp` data formats aren't handled here, since
# these can include any byte in parts of their first few bytes
hdr_dispatch = {
    0x00: [
        (b'\x00\x00\x01\xba', 'video/mpeg'),
        (b'\x00\x00\x01\xb3', 'video/mpeg'),
        (b'\x00\x00\x01\x00', 'image/x-icon'),
        (b'\x00\x00\x02\x00', 'image/vnd.microsoft.icon'),
        (b'\x00asm', 'application/wasm'),
    ],
    0x1a: [(b'\x1a\x45\xdf\xa3', 'video/webm')], # matches general MKV format
    0x1f: [(b'\x1f\x8b\x08', 'application/gzip')],
    0x23: [
        # data starting with `#!` followed by a space or a slash are handled
        # as UTF-8 plain-text
        (b'#! ', 'text/plain; charset=UTF-8'),
        (b'#!/', 'text/plain; charset=UTF-8'),
    ],
    0x25: [
        (b'%PDF', 'application/pdf'),
        (b'%!PS', 'application/postscript'),
    ],
    0x2e: [(b'.snd', 'audio/basic')],
    0x47: [
        (b'GIF87a', 'image/gif'),
        (b'GIF89a', 'image/gif'),
    ],
    0x49: [
        # some MP3s start with an ID3 meta-data section
        (b'ID3\x02', 'audio/mpeg'),
        (b'ID3\x03', 'audio/mpeg'),
        (b'ID3\x04', 'audio/mpeg'),
        (b'II*\x00', 'image/tiff'),
    ],
    0x4d: [
        (b'MM\x00*', 'image/tiff'),
        (b'MThd', 'audio/midi'),
    ],
    0x4f: [(b'OggS', 'audio/ogg')],
    0x50: [(b'PK\x03\x04', 'application/zip')],
    0x63: [(b'caff\x00\x01\x00\x00', 'audio/x-caf')],
    0x66: [(b'fLaC', 'audio/x-flac')],
    0x7b: [(b'{\\rtf', 'application/rtf')],
    0x7f: [(b'\x7fELF', 'application/x-elf')],
    0x89: [(b'\x89PNG\x0d\x0a\x1a\x0a', 'image/png')],
    0xff: [
        (b'\xff\xd8\xff', 'image/jpeg'),
        # handle common ways MP3 data start
        (b'\xff\xf3\x48\xc4\x00', 'audio/mpeg'),
        (b'\xff\xfb', 'audio/mpeg'),
    ],
}


# ftyp_types helps func match_ftyp auto-detect MPEG-4-like formats: each pair
# has the 4 bytes to look for right after the ASCII `ftyp` marker (which means
# bytes 8 to 11 of the data), along with the MIME type to use when these are
# found; pairs are checked in order, the first match winning
ftyp_types = (
    (b'M4A ', 'audio/aac'),
    (b'M4A\x00', 'audio/aac'),
    (b'mp42', 'video/x-m4v'),
    (b'dash', 'audio/aac'),
    (b'isom', 'video/mp4'),
    # disabled alternative for the `isom` marker
    # (b'isom', 'audio/aac'),
    (b'MSNV', 'video/mp4'),
    (b'qt  ', 'video/quicktime'),
    (b'heic', 'image/heic'),
    (b'avif', 'image/avif'),
)

# xmlish_heuristics helps func guess_mime auto-detect HTML, SVG, and XML:
# each pair has a marker to search for among the first 64 bytes of the data,
# along with the MIME type to use when it's found; pairs are checked in
# order, the first match winning, so the generic `<?xml` marker comes last,
# only applying when none of the more specific markers are found
xmlish_heuristics = (
    (b'<html>', 'text/html'),
    (b'<html ', 'text/html'),
    (b'<head>', 'text/html'),
    (b'<body>', 'text/html'),
    (b'<!DOCTYPE html', 'text/html'),
    (b'<svg>', 'image/svg+xml'),
    (b'<svg ', 'image/svg+xml'),
    (b'<?xml', 'application/xml'),
)

# json_heuristics helps func guess_mime auto-detect JSON via regexes: each
# one allows leading whitespace, then expects the 2 opening tokens from a
# common kind of JSON data; it's not perfect, but it seems effective-enough
# in practice
json_heuristics = tuple(
    compile_re(src) for src in (
        rb'^\s*\{\s*"',     # an object, followed by a double-quote
        rb'^\s*\{\s*\[',    # an object, followed by an opening bracket
        rb'^\s*\[\s*"',     # an array, whose first item is a string
        rb'^\s*\[\s*\{',    # an array, whose first item is an object
        rb'^\s*\[\s*\[',    # an array, whose first item is an array
    )
)


def exact_match(header: bytes, maybe: bytes) -> bool:
    'Check whether the data given start with all the bytes expected.'

    # data shorter than the bytes expected can't start with them, so the
    # prefix-check also covers the not-enough-bytes case
    return header.startswith(maybe)


def match_riff(header: bytes) -> str:
    'Detect the RIFF-based formats (WebP, WAV, AVI) for func guess_mime.'

    # the data must start with ASCII 'RIFF', and have at least 12 bytes
    if not (len(header) >= 12 and header.startswith(b'RIFF')):
        return ''

    # bytes 8 to 11 tell which of the supported formats the data are; an
    # empty string is the result for any unsupported/unknown format
    kinds = {
        b'WEBP': 'image/webp',
        b'WAVE': 'audio/x-wav',
        b'AVI ': 'video/avi',
    }
    return kinds.get(bytes(header[8:12]), '')


def match_form(header: bytes) -> str:
    'Detect AIFF audio, including its AIFC variant, for func guess_mime.'

    # the data must start with ASCII 'FORM', and have at least 12 bytes
    if not (len(header) >= 12 and header.startswith(b'FORM')):
        return ''

    # bytes 8 to 11 tell which format the data are: both supported kinds
    # result in the same MIME type
    if bytes(header[8:12]) in (b'AIFF', b'AIFC'):
        return 'audio/aiff'
    return ''


def match_ftyp(header: bytes) -> str:
    'Detect the MPEG-4-like formats from ftyp_types for func guess_mime.'

    # first 4 bytes can be anything, next 4 bytes must be ASCII 'ftyp'
    if len(header) < 12 or header[4:8] != b'ftyp':
        return ''

    # next 4 bytes after the ASCII 'ftyp' declare the data-format: an empty
    # string is the result for unrecognized MPEG-4-style data-formats
    found = (
        mime for marker, mime in ftyp_types
        if header.startswith(marker, 8, 12)
    )
    return next(found, '')


def guess_mime(header: bytes, fallback: str) -> str:
    '''
    Try to auto-detect common MIME-types, given the first few input bytes.

    Checks are tried in order, the first match winning: RIFF-based formats,
    AIFF audio, MPEG-4-like formats, bitmap pictures, the hdr_dispatch
    lookup-table, HTML/SVG/XML markers, and finally a few JSON patterns.

    The fallback given is returned as is, both when the header is empty and
    when nothing matches.
    '''

    # no bytes, no match
    if not header:
        return fallback

    # check the RIFF formats, AIFF audio, and MPEG-4-like formats
    for matcher in (match_riff, match_form, match_ftyp):
        mime = matcher(header)
        if mime:
            return mime

    # maybe it's a bitmap picture, which almost always has 40 on 15th byte:
    # that byte is checked directly, since looking for the first 40 across
    # a range of bytes fails when any of the bytes before it is also a 40
    if header.startswith(b'BM') and header[14:15] == b'\x28':
        return 'image/x-bmp'

    # check general lookup-table, where the first byte picks the candidates
    for magic, mime in hdr_dispatch.get(header[0], ()):
        if exact_match(header, magic):
            return mime

    # an HTML doctype fully among the first 64 bytes is enough
    if header.find(b'<!DOCTYPE html', 0, 64) >= 0:
        return 'text/html'

    # try HTML, SVG, and even XML, but only when there's a `<` among the
    # first 8 bytes
    if header.find(b'<', 0, 8) >= 0:
        for marker, mime in xmlish_heuristics:
            if header.find(marker, 0, 64) >= 0:
                return mime

    # try some common cases for JSON
    if any(pattern.match(header) for pattern in json_heuristics):
        return 'application/json'

    # nothing matched
    return fallback


def seems_url(s: str) -> bool:
    'Check whether a string starts like one of the URI kinds supported.'

    # a single call checks all the prefixes in the tuple given
    return s.startswith(('https://', 'http://', 'file://', 'ftp://', 'data:'))


def handle_input(w, r, path: str, mime_fallback: str) -> None:
    '''
    Emit all bytes from an input as a single data-URI line.

    Parameters

        w               binary writer where the output line goes
        r               binary reader to get all the input bytes from
        path            name of the input, only used for error messages
        mime_fallback   MIME type to use when auto-detection fails

    Empty inputs result in empty lines. The MIME type is guessed from the
    first chunk of bytes read: a ValueError is raised when both the guess
    and the fallback are empty. Special fallback values, such as `none`,
    result in just the base64 payload, without its leading `data:` part.
    '''

    chunk_size = 3 * 1024 * 1024
    chunk = r.read(chunk_size)

    # empty data-streams result in empty lines
    if not chunk:
        w.write(b'\n')
        w.flush()
        return

    mime = guess_mime(chunk, mime_fallback)
    if not mime:
        raise ValueError(f'{path}: can\'t guess MIME-type')

    if mime not in ('no', 'nomime', 'no-mime', 'none', 'not'):
        w.write(b'data:')
        w.write(mime.encode('utf-8'))
        w.write(b';base64,')

    while True:
        # base64 turns 3-byte groups into 4 symbols, padding only incomplete
        # groups: since padding is only valid at the very end, just the
        # whole groups are emitted for now, any leftover bytes being kept
        # for later; this way even readers giving fewer bytes than asked
        # for, such as some pipes and network connections, can't result in
        # invalid output
        whole = len(chunk) - len(chunk) % 3
        w.write(b64encode(chunk[:whole]))
        leftover = chunk[whole:]

        more = r.read(chunk_size)
        if not more:
            # end of input: only now can padding show up, if needed
            w.write(b64encode(leftover))
            break
        chunk = leftover + more

    w.write(b'\n')
    w.flush()


# fallback_aliases maps the shortcut names accepted by the fallback option to
# the MIME types they stand for: fallback values not found here are used as
# given; aliases standing for the empty string mean there's no fallback at
# all, which makes func handle_input fail with an error when the MIME-type
# auto-detection doesn't find anything
fallback_aliases = {
    # disabled aliases
    # 'text/json': 'application/json',

    # 'xbmp': 'image/x-bmp',
    # 'xflac': 'audio/x-flac',
    # 'xicon': 'image/x-icon',
    # 'xm4v': 'video/x-m4v',
    # 'xsqlite3': 'application/x-sqlite3',
    # 'xwav': 'audio/x-wav',
    # 'xwave': 'audio/x-wav',
    # 'x-bmp': 'image/x-bmp',
    # 'x-flac': 'audio/x-flac',
    # 'x-icon': 'image/x-icon',
    # 'x-m4v': 'video/x-m4v',
    # 'x-sqlite3': 'application/x-sqlite3',
    # 'x-wav': 'audio/x-wav',

    # single-letter shortcuts
    'b': 'application/octet-stream',
    'j': 'application/json',
    't': 'text/plain',
    'u': 'text/plain; charset=UTF-8',

    # aliases for the empty fallback, which turns undetected data into errors
    'e': '',
    'err': '',
    'error': '',
    'f': '',
    'fail': '',

    'aac': 'audio/aac',
    'aif': 'audio/aiff',
    'bin': 'application/octet-stream',
    'binary': 'application/octet-stream',
    'gzip': 'application/gzip',
    'midi': 'audio/midi',
    'mpeg': 'video/mpeg',
    'octet': 'application/octet-stream',
    'octetstream': 'application/octet-stream',
    'octet-stream': 'application/octet-stream',
    'plain': 'text/plain',
    'sqlite': 'application/x-sqlite3',
    'svg+xml': 'image/svg+xml',
    'tif': 'image/tiff',
    'utf8': 'text/plain; charset=UTF-8',
    'utf-8': 'text/plain; charset=UTF-8',
    'wave': 'audio/x-wav',
    'zstd': 'application/zstd',

    'aiff': 'audio/aiff',
    'au': 'audio/basic',
    'avi': 'video/avi',
    'avif': 'image/avif',
    'bmp': 'image/x-bmp',
    'caf': 'audio/x-caf',
    'cur': 'image/vnd.microsoft.icon',
    'css': 'text/css',
    'csv': 'text/csv',
    'djvu': 'image/x-djvu',
    'elf': 'application/x-elf',
    'exe': 'application/vnd.microsoft.portable-executable',
    'flac': 'audio/x-flac',
    'gif': 'image/gif',
    'gz': 'application/gzip',
    'heic': 'image/heic',
    'htm': 'text/html',
    'html': 'text/html',
    'ico': 'image/x-icon',
    'iso': 'application/octet-stream',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'js': 'application/javascript',
    'json': 'application/json',
    'm4a': 'audio/aac',
    'm4v': 'video/x-m4v',
    'mid': 'audio/midi',
    'mov': 'video/quicktime',
    'mp4': 'video/mp4',
    'mp3': 'audio/mpeg',
    'mpg': 'video/mpeg',
    'ogg': 'audio/ogg',
    'opus': 'audio/opus',
    'pdf': 'application/pdf',
    'png': 'image/png',
    'ps': 'application/postscript',
    'psd': 'image/vnd.adobe.photoshop',
    'rtf': 'application/rtf',
    'sqlite3': 'application/x-sqlite3',
    'svg': 'image/svg+xml',
    'text': 'text/plain',
    'tiff': 'image/tiff',
    # NOTE(review): the IANA-registered type for TSV data seems to be
    # text/tab-separated-values; confirm text/tsv is what's intended
    'tsv': 'text/tsv',
    'wasm': 'application/wasm',
    'wav': 'audio/x-wav',
    'webp': 'image/webp',
    'webm': 'video/webm',
    'xml': 'application/xml',
    'zip': 'application/zip',
    'zst': 'application/zstd',
}

# the script's main logic: handle all args in the order given, each one
# being either a fallback option, the value following a fallback option,
# or the name of an input to turn into a data-URI line right away
try:
    # standard input can only be read once
    if argv.count('-') > 1:
        raise ValueError(
            'reading from `-` (standard input) more than once not allowed'
        )

    # only load the URI-related functionality when it's actually needed
    if any(map(seems_url, argv)):
        from urllib.request import urlopen

    inputs = 0
    mime_fallback = ''
    # change_fallback remembers if the next arg is a new fallback value
    change_fallback = False

    for arg in argv[1:]:
        if change_fallback:
            # aliases are expanded, anything else is used as given; the new
            # fallback only affects the inputs coming after it
            mime_fallback = fallback_aliases.get(arg, arg)
            change_fallback = False
        elif arg in ('-f', '--f', '-fallback', '--fallback'):
            change_fallback = True
        elif arg == '-':
            inputs += 1
            handle_input(stdout.buffer, stdin.buffer, '<stdin>', mime_fallback)
        elif seems_url(arg):
            inputs += 1
            with urlopen(arg) as inp:
                handle_input(stdout.buffer, inp, arg, mime_fallback)
        else:
            inputs += 1
            with open(arg, mode='rb') as inp:
                handle_input(stdout.buffer, inp, arg, mime_fallback)

    # a trailing fallback option is missing its value
    if change_fallback:
        raise ValueError('forgot new fallback MIME-type')

    # use standard input when not given any inputs
    if inputs == 0:
        handle_input(stdout.buffer, stdin.buffer, '<stdin>', mime_fallback)
except BrokenPipeError:
    # quit quietly, instead of showing a confusing error message
    stderr.close()
    exit(0)
except KeyboardInterrupt:
    exit(2)
except Exception as e:
    # show the error message in red
    print(f'\x1b[31m{e}\x1b[0m', file=stderr)
    exit(1)