/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O3 -march=native -mtune=native -flto -o ./datauri ./datauri.c
*/

// NOTE(review): the header names on the include lines were missing from the
// reviewed text; the four below are the standard headers which declare what
// this file uses (bool, uint8_t/uint32_t/uint64_t, FILE and friends, strcmp)
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#ifdef _WIN32
// NOTE(review): only one include line was visible here; O_BINARY comes from
// fcntl.h, while setmode is declared by io.h: confirm against the original
#include <fcntl.h>
#include <io.h>
#endif

// error messages are optionally colored red: the apple terminal gets the
// plain 8-color escape instead of the 24-bit one; each style is defined
// exactly once, so no build ever sees a macro redefinition
#ifdef RED_ERRORS
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define RESET_STYLE "\x1b[0m"
#else
#define ERROR_STYLE
#define RESET_STYLE
#endif

// ERROR_LINE turns a string literal into a styled line, via literal pasting
#define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")

// IBUF_SIZE is the size of the input buffer, and can be overridden at
// build time
#ifndef IBUF_SIZE
#define IBUF_SIZE (32 * 1024)
#endif

// OBUF_SIZE is the size of the output buffer, and can be overridden at
// build time
#ifndef OBUF_SIZE
#define OBUF_SIZE (8 * 1024)
#endif

// info is the help message
const char* info = ""
"datauri [options...] [filenames...]\n"
"\n"
"\n"
"Encode bytes as data-URIs, auto-detecting the file/data type using the first\n"
"few bytes from each data/file stream. When given multiple inputs, the output\n"
"will be multiple lines, one for each file given.\n"
"\n"
"Empty files/inputs result in empty lines. A simple dash (-) stands for the\n"
"standard-input, which is also used automatically when not given any files.\n"
"\n"
"Data-URIs are base64-encoded text representations of arbitrary data, which\n"
"include their payload's MIME-type, and which are directly useable/shareable\n"
"in web-browsers as links, despite not looking like normal links/URIs.\n"
"\n"
"Some web-browsers limit the size of handled data-URIs to tens of kilobytes.\n"
"\n"
"\n"
"Options\n"
"\n"
" -h, -help, --h, --help show this help message\n"
" -f, -fallback, --f, --fallback change the fallback MIME type\n"
"";

// stdin_name is the name shown in error messages about the standard input;
// NOTE(review): the reviewed text had an empty string here, which makes those
// messages end without naming anything: the angle-bracketed name is a
// restoration to be confirmed against the original
const char* stdin_name = "<stdin>";

#define octet "application/octet-stream"

// fallback_mime_type is the MIME type used when auto-detection fails
const char* fallback_mime_type = octet;

// bufwriter is, as the name implies, a buffered-writer: when it's aimed at
// stdout, it considerably speeds up this app, as intended
typedef struct bufwriter {
    // buf is the buffer proper
    unsigned char* buf;

    // len is how many bytes of the buffer are currently being used
    size_t len;

    // cap is the capacity of the buffer, or the most bytes it can hold
    size_t cap;

    // out is the destination of all that's written into the buffer
    FILE* out;
} bufwriter;

// init_bufwriter aims an empty buffered-writer at the output given, using
// the caller-owned array given as its storage
void init_bufwriter(bufwriter* w, FILE* out, unsigned char* b, size_t cap) {
    w->buf = b;
    w->len = 0;
    w->cap = cap;
    w->out = out;
}

// write_byte queues a byte, emitting the whole buffer first when it's full;
// results from the underlying writes aren't checked here
static inline void write_byte(bufwriter* w, unsigned char b) {
    if (w->len >= w->cap) {
        if (w->len > 0) {
            fwrite(w->buf, 1, w->len, w->out);
        }
        w->len = 0;

        // a zero-capacity writer has no storage to queue into
        if (w->cap == 0) {
            fputc(b, w->out);
            return;
        }
    }

    w->buf[w->len++] = b;
}

// write_string queues all bytes of the string given, except for its
// null terminator
void write_string(bufwriter* w, const char* s) {
    for (; *s != 0; s++) {
        write_byte(w, (unsigned char)*s);
    }
}

// flush emits all pending bytes, then flushes the output itself
void flush(bufwriter* w) {
    if (w->len > 0) {
        fwrite(w->buf, 1, w->len, w->out);
    }
    w->len = 0;
    fflush(w->out);
}

// can be anything: ensure this value differs from all other literal bytes
// in the generic-headers table: failing that, its value could cause subtle
// type-misdetection bugs; the value is chosen to be `obviously` findable
// in the source, which also implies a constant beyond the ascii range, as
// ascii char-constants are also used in the tables; it's an enum constant
// because the tables below are initialized at file scope, where standard C
// requires constant expressions, which const-qualified objects aren't
enum { cba = 0xfd }; // 253

#define aiff "audio/aiff"
#define au "audio/basic"
#define avi "video/avi"
#define avif "image/avif"
#define bmp "image/x-bmp"
#define caf "audio/x-caf"
#define cur "image/vnd.microsoft.icon"
#define css "text/css"
#define csv "text/csv"
#define djvu "image/x-djvu"
#define elf "application/x-elf"
#define exe "application/vnd.microsoft.portable-executable"
#define flac "audio/x-flac"
#define gif "image/gif"
#define gz "application/gzip"
#define heic "image/heic"
#define htm "text/html"
#define html "text/html"
#define ico "image/x-icon"
#define iso "application/octet-stream"
#define jpeg "image/jpeg"
#define js "application/javascript"
#define json "application/json"
#define m4a "audio/aac"
#define m4v "video/x-m4v"
#define midi "audio/midi"
#define mov "video/quicktime"
#define mp4 "video/mp4"
#define mp3 "audio/mpeg"
#define mpg "video/mpeg"
#define ogg "audio/ogg"
#define opus "audio/opus"
#define pdf "application/pdf"
#define png "image/png"
#define ps "application/postscript"
#define psd "image/vnd.adobe.photoshop"
#define rtf "application/rtf"
#define sqlite3 "application/x-sqlite3"
#define svg "image/svg+xml"
#define text "text/plain"
#define tiff "image/tiff"
#define tsv "text/tsv"
#define utf8 "text/plain; charset=UTF-8"
#define wasm "application/wasm"
#define wav "audio/x-wav"
#define webp "image/webp"
#define webm "video/webm"
#define xml "application/xml"
#define zip "application/zip"
#define zst "application/zstd"

// format_descriptor ties a file-header pattern to its data-format type;
// the 15-byte header-limit nicely aligns with the 1-byte length before it;
// tables of these end with an entry whose header_length is 0
typedef struct format_descriptor {
    unsigned char header_length;
    unsigned char header_bytes[15];
    const char* mime;
} format_descriptor;

// starts_as tries to match header data to the pattern given: this includes
// allowing `any byte` when the pattern indicates so, using a value reserved
// for that purpose; x is the data and y is the pattern, each followed by
// its length
bool starts_as(const uint8_t* x, size_t xlen, const uint8_t* y, size_t ylen) {
    // when header data aren't enough for a pattern, there's no match
    if (xlen < ylen) {
        return false;
    }

    for (size_t i = 0; i < ylen; i++) {
        // the `can be anything` value always matches
        if (y[i] != cba && x[i] != y[i]) {
            return false;
        }
    }
    return true;
}

// not confident enough to actually use this, and replace all table entries
#define start_format_descriptor(...) \
    sizeof((unsigned char[]){ __VA_ARGS__ }) / sizeof(unsigned char), \
    { __VA_ARGS__ }

// format markers with leading wildcards, which should be checked before the
// normal ones: this is to prevent mismatches with the latter types, even
// though you can make probabilistic arguments which suggest these mismatches
// should be very unlikely in practice; the 24-byte `ftypdash` pattern isn't
// here, since it can't fit the 15-byte limit of these entries
format_descriptor special_headers[] = {
    {12, {cba, cba, cba, cba, 'f', 't', 'y', 'p', 'M', '4', 'A', ' '}, m4a},
    {12, {cba, cba, cba, cba, 'f', 't', 'y', 'p', 'M', '4', 'A', 000}, m4a},
    {12, {cba, cba, cba, cba, 'f', 't', 'y', 'p', 'M', 'S', 'N', 'V'}, mp4},
    {12, {cba, cba, cba, cba, 'f', 't', 'y', 'p', 'i', 's', 'o', 'm'}, mp4},
    {12, {cba, cba, cba, cba, 'f', 't', 'y', 'p', 'm', 'p', '4', '2'}, m4v},
    {12, {cba, cba, cba, cba, 'f', 't', 'y', 'p', 'q', 't', ' ', ' '}, mov},
    {12, {cba, cba, cba, cba, 'f', 't', 'y', 'p', 'h', 'e', 'i', 'c'}, heic},
    {12, {cba, cba, cba, cba, 'f', 't', 'y', 'p', 'a', 'v', 'i', 'f'}, avif},
    {0},
};

// patterns for data starting with byte 0
format_descriptor hdr_dispatch_0[] = {
    {4, {000, 000, 001, 0xBA}, mpg},
    {4, {000, 000, 001, 0xB3}, mpg},
    {4, {000, 000, 001, 000}, ico},
    {4, {000, 000, 002, 000}, cur},
    {4, {000, 'a', 's', 'm'}, wasm},
    {0},
};

// patterns for data starting with byte 26
format_descriptor hdr_dispatch_26[] = {
    {4, {0x1A, 0x45, 0xDF, 0xA3}, webm},
    {0},
};

// patterns for data starting with byte 31: only the first 3 bytes of the
// data are checked for gzip
format_descriptor hdr_dispatch_31[] = {
    {3, {0x1F, 0x8B, 0x08}, gz},
    {0},
};
format_descriptor hdr_dispatch_35[] = { {3, "#! ", text}, {3, "#!/", text}, {0}, }; format_descriptor hdr_dispatch_37[] = { {4, "%PDF", pdf}, {4, "%!PS", ps}, {0}, }; format_descriptor hdr_dispatch_40[] = { {4, {0x28, 0xB5, 0x2F, 0xFD}, zst}, {0}, }; format_descriptor hdr_dispatch_46[] = { {4, ".snd", au}, {0}, }; format_descriptor hdr_dispatch_56[] = { {4, "8BPS", psd}, {0}, }; format_descriptor hdr_dispatch_60[] = { {14, " 0; i++) { const unsigned char* hb = special_headers[i].header_bytes; const size_t hl = special_headers[i].header_length; if (starts_as(buf, len, hb, hl)) { return special_headers[i].mime; } } // the m4a-dash header exceeds the 15-byte limit of the lookup tables const uint8_t header1[24] = { cba, cba, cba, cba, 'f', 't', 'y', 'p', 'd', 'a', 's', 'h', 000, 000, 000, 000, 'i', 's', 'o', '6', 'm', 'p', '4', '1', }; if (starts_as(buf, len, header1, sizeof(header1))) { return m4a; } // the sqlite3 header exceeds the 15-byte limit of the lookup tables const uint8_t header2[16] = "SQLite format 3\x00"; if (starts_as(buf, len, header2, sizeof(header2))) { return sqlite3; } format_descriptor* guesses = hdr_dispatch[buf[0]]; if (guesses == NULL) { return fallback_mime_type; } for (size_t i = 0; guesses[i].header_length > 0; i++) { const unsigned char* hb = guesses[i].header_bytes; const size_t hl = guesses[i].header_length; if (starts_as(buf, len, hb, hl)) { return guesses[i].mime; } } return fallback_mime_type; } bool is_mime_disabled(const char* mime) { return (mime != NULL) && (mime[0] == 'n') && ( strcmp(mime, "no") == 0 || strcmp(mime, "nomime") == 0 || strcmp(mime, "no-mime") == 0 || strcmp(mime, "none") == 0 || strcmp(mime, "not") == 0 ); } // start_data_uri starts the output by declaring the data-URI to be an // auto-detected MIME-type; the return value is the auto-detection success bool start_data_uri(bufwriter* w, const unsigned char* buf, size_t len) { const char* mime = guess_mime(buf, len); if (is_mime_disabled(mime)) { return true; } if 
(mime == NULL || mime[0] == 0) { return false; } write_string(w, "data:"); for (size_t i = 0; mime[i] != 0; i++) { write_byte(w, mime[i]); } write_string(w, ";base64,"); return true; } const unsigned char base64_lookup[64] = "" "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" ""; static inline uint32_t combine_triple(const unsigned char data[4]) { return (data[0] << 16) | (data[1] << 8) | (data[2] << 0); } static inline void emit_triple(bufwriter* w, uint32_t v) { write_byte(w, base64_lookup[0x3f & (v >> 18)]); write_byte(w, base64_lookup[0x3f & (v >> 12)]); write_byte(w, base64_lookup[0x3f & (v >> 6)]); write_byte(w, base64_lookup[0x3f & (v >> 0)]); } void emit_couple(bufwriter* w, uint32_t v) { write_byte(w, base64_lookup[0x3f & (v >> 18)]); write_byte(w, base64_lookup[0x3f & (v >> 12)]); write_byte(w, base64_lookup[0x3f & (v >> 6)]); write_byte(w, '='); } void emit_single(bufwriter* w, uint32_t v) { write_byte(w, base64_lookup[0x3f & (v >> 18)]); write_byte(w, base64_lookup[0x3f & (v >> 12)]); write_byte(w, '='); write_byte(w, '='); } bool handle_reader(bufwriter* w, FILE* src, const char* path) { unsigned char buf[IBUF_SIZE]; uint64_t bytes = 0; // triple holds groups of 3 bytes at once, which is required by base64, // except for the last few bytes of input, which are padded with equals; // the 4th item is never used, but having it aligns things to 32 bits unsigned char triple[4]; triple[0] = 0; triple[1] = 0; triple[2] = 0; triple[3] = 0; while (!feof(w->out)) { const size_t n = fread(&buf, sizeof(buf[0]), sizeof(buf), src); if (n < 1) { // assume input is over when no bytes were read break; } if (bytes == 0 && !start_data_uri(w, buf, n)) { write_byte(w, '\n'); flush(w); const char* msg = "can't auto-detect MIME type for"; fprintf(stderr, ERROR_LINE("%s %s"), msg, path); return false; } size_t where = bytes % 3; for (size_t i = 0; i < n; i++, bytes++) { triple[where++] = buf[i]; if (where == 3) { emit_triple(w, combine_triple(triple)); where 
= 0; } } } // empty inputs result in empty outputs if (bytes == 0) { return true; } // don't forget unemitted trailing bytes, if any: these need special // handling, as they include `=` signs; if the input bytes were a // multiple of 3, there won't be any trailing bytes switch (bytes % 3) { case 1: triple[1] = 0; triple[2] = 0; emit_single(w, combine_triple(triple)); break; case 2: triple[2] = 0; emit_couple(w, combine_triple(triple)); break; } // end with a line-feed, so multiple input streams are each encoded in // their own line if (bytes > 0) { write_byte(w, '\n'); flush(w); } return true; } // handle_file handles data from the filename given; returns false only when // an error happened bool handle_file(bufwriter* w, const char* path) { // a `-` filename stands for the standard input if (path[0] == '-' && path[1] == 0) { return handle_reader(w, stdin, stdin_name); } FILE* f = fopen(path, "rb"); if (f == NULL) { fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path); return false; } const bool ok = handle_reader(w, f, path); fclose(f); return ok; } // is_help_option simplifies control-flow for func run bool is_help_option(const char* s) { return s[0] == '-' && ( strcmp(s, "-h") == 0 || strcmp(s, "-help") == 0 || strcmp(s, "--h") == 0 || strcmp(s, "--help") == 0 ); } // is_fallback_option simplifies control-flow for func run bool is_fallback_option(const char* s) { return s[0] == '-' && ( strcmp(s, "-f") == 0 || strcmp(s, "-fallback") == 0 || strcmp(s, "--f") == 0 || strcmp(s, "--fallback") == 0 ); } const char* fallback_aliases[192] = { // tiny shortcuts "b", octet, "j", json, "t", text, "u", utf8, // failure fallbacks "e", "", "err", "", "error", "", "f", "", "fail", "", // common mistakes "text/json", json, "aif", aiff, "aiff", aiff, "au", au, "avi", avi, "avif", avif, "bmp", bmp, "caf", caf, "cur", cur, "css", css, "csv", csv, "djvu", djvu, "elf", elf, "exe", exe, "flac", flac, "gif", gif, "gz", gz, "heic", heic, "html", html, "ico", ico, "iso", 
iso, "jpg", jpeg, "jpeg", jpeg, "js", js, "json", json, "m4a", m4a, "m4v", m4v, "midi", midi, "mov", mov, "mp4", mp4, "mp3", mp3, "mpeg", mpg, "ogg", ogg, "opus", opus, "pdf", pdf, "png", png, "ps", ps, "psd", psd, "rtf", rtf, "sqlite3", sqlite3, "svg", svg, "text", text, "tiff", tiff, "tsv", tsv, "wasm", wasm, "wav", wav, "webp", webp, "webm", webm, "xml", xml, "zip", zip, "zst", zst, // longer shortcuts "aac", m4a, "aif", aiff, "bin", octet, "binary", octet, "bits", octet, "gzip", gz, "htm", htm, "mid", midi, "mpg", mpg, "octet", octet, "octets", octet, "octetstream", octet, "octet-stream", octet, "plain", text, "sqlite", sqlite3, "svg+xml", svg, "tif", tiff, "utf8", utf8, "utf-8", utf8, "xbmp", bmp, "xcaf", caf, "xflac", flac, "xicon", ico, "xm4v", m4v, "xsqlite3", sqlite3, "xwav", wav, "xwave", wav, "x-bmp", bmp, "x-caf", caf, "x-flac", flac, "x-icon", ico, "x-m4v", m4v, "x-sqlite3", sqlite3, "x-wav", wav, "wave", wav, "zstd", zst, }; const char* resolve_alias(const char* name) { const size_t n = sizeof(fallback_aliases) / sizeof(fallback_aliases[0]); for (size_t i = 0; i < n; i += 2) { if (strcmp(name, fallback_aliases[i]) == 0) { return fallback_aliases[i + 1]; } } return name; } // run returns the number of errors int run(int argc, char** argv, FILE* w) { unsigned char outbuf[OBUF_SIZE]; bufwriter bw; init_bufwriter(&bw, w, outbuf, sizeof(outbuf)); size_t files = 0; size_t errors = 0; bool change_fallback = false; // handle all filenames given for (size_t i = 1; i < argc && !feof(w); i++) { if (change_fallback) { fallback_mime_type = resolve_alias(argv[i]); change_fallback = false; continue; } if (is_fallback_option(argv[i])) { change_fallback = true; continue; } if (!handle_file(&bw, argv[i])) { errors++; } files++; } if (change_fallback) { flush(&bw); fprintf(stderr, ERROR_LINE("forgot new fallback MIME-type")); errors++; return errors; } // no filenames means use stdin as the only input if (files == 0) { if (!handle_reader(&bw, stdin, stdin_name)) { 
errors++; } } flush(&bw); return errors; } int main(int argc, char** argv) { #ifdef _WIN32 setmode(fileno(stdin), O_BINARY); // ensure output lines end in LF instead of CRLF on windows setmode(fileno(stdout), O_BINARY); setmode(fileno(stderr), O_BINARY); #endif if (argc > 1 && is_help_option(argv[1])) { printf("%s", info); return 0; } // fill entries in the type-detection dispatch table hdr_dispatch[0] = hdr_dispatch_0; // 0 hdr_dispatch[26] = hdr_dispatch_26; // 26 hdr_dispatch[31] = hdr_dispatch_31; // 31 hdr_dispatch[35] = hdr_dispatch_35; // 35 # hdr_dispatch[37] = hdr_dispatch_37; // 37 % hdr_dispatch[40] = hdr_dispatch_40; // 40 ( hdr_dispatch[46] = hdr_dispatch_46; // 46 . hdr_dispatch[56] = hdr_dispatch_56; // 56 8 hdr_dispatch[60] = hdr_dispatch_60; // 60 < hdr_dispatch[65] = hdr_dispatch_65; // 65 A hdr_dispatch[66] = hdr_dispatch_66; // 66 B hdr_dispatch[70] = hdr_dispatch_70; // 70 F hdr_dispatch[71] = hdr_dispatch_71; // 71 G hdr_dispatch[73] = hdr_dispatch_73; // 73 I hdr_dispatch[77] = hdr_dispatch_77; // 77 M hdr_dispatch[79] = hdr_dispatch_79; // 79 O hdr_dispatch[80] = hdr_dispatch_80; // 80 P hdr_dispatch[82] = hdr_dispatch_82; // 82 R hdr_dispatch[83] = hdr_dispatch_83; // 83 S hdr_dispatch[99] = hdr_dispatch_99; // 99 c hdr_dispatch[102] = hdr_dispatch_102; // 102 f hdr_dispatch[123] = hdr_dispatch_123; // 123 { hdr_dispatch[127] = hdr_dispatch_127; // 127 hdr_dispatch[137] = hdr_dispatch_137; // 137 hdr_dispatch[255] = hdr_dispatch_255; // 255 return run(argc, argv, stdout) == 0 ? 0 : 1; }