/*
The MIT License (MIT)

Copyright © 2024 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O2 -o ./datauri ./datauri.c
*/

// NOTE(review): the reviewed copy had bare `#include` directives with the
// header names missing; the list below covers every standard name this file
// visibly uses -- confirm it against a pristine copy
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#ifdef _WIN32
// presumably what declares setmode and O_BINARY, which main uses -- confirm
#include <fcntl.h>
#include <io.h>
#endif

// info is the help message shown by the help options
const char* info = ""
"datauri [options...] [filenames...]\n"
"\n"
"\n"
"Encode bytes as data-URIs, auto-detecting the file/data type using the first\n"
"few bytes from each data/file stream. When given multiple inputs, the output\n"
"will be multiple lines, one for each file given.\n"
"\n"
"Empty files/inputs result in empty lines. A simple dash (-) stands for the\n"
"standard-input, which is also used automatically when not given any files.\n"
"\n"
"Data-URIs are base64-encoded text representations of arbitrary data, which\n"
"include their payload's MIME-type, and which are directly useable/shareable\n"
"in web-browsers as links, despite not looking like normal links/URIs.\n"
"\n"
"Some web-browsers limit the size of handled data-URIs to tens of kilobytes.\n"
"\n"
"\n"
"Options\n"
"\n"
" -h, -help, --h, --help show this help message\n"
" -f, -fallback, --f, --fallback change the fallback MIME type\n"
"";

// stdin_name is the name used in error messages about the standard input
// NOTE(review): angle-bracketed text seems to have been stripped from the
// reviewed copy, so this may have read "<stdin>" originally -- confirm
const char* stdin_name = "";

// fallback_mime_type is the MIME type used when auto-detection finds no match
const char* fallback_mime_type = "application/octet-stream";

// bufwriter is, as the name implies, a buffered-writer: when it's aimed at
// stdout, it considerably speeds up this app, as intended
typedef struct bufwriter {
    // buf is the buffer proper
    unsigned char* buf;

    // len is how many bytes of the buffer are currently being used
    size_t len;

    // cap is the capacity of the buffer, or the most bytes it can hold
    size_t cap;

    // out is the destination of all that's written into the buffer
    FILE* out;

    // done signals when/if no more output is accepted at the destination
    bool done;
} bufwriter;

// init_bufwriter is the constructor for type bufwriter: the buffer memory is
// provided by the caller, and starts out empty
void init_bufwriter(bufwriter* w, FILE* dst, unsigned char* buf, size_t cap) {
    w->buf = buf;
    w->len = 0;
    w->cap = cap;
    w->out = dst;
    w->done = false;
}

// flush does as it says: it empties the buffer after ensuring its bytes end
// on their intended destination; a failed write sets the `done` flag
void flush(bufwriter* w) {
    if (w->len > 0 && fwrite(w->buf, w->len, 1, w->out) < 1) {
        w->done = true;
    }
    w->len = 0;
}

// write_bytes does as it says, minimizing the number of calls to fwrite
void write_bytes(bufwriter* w, const unsigned char* src, size_t len) {
    if (w->len + len < w->cap) {
        // all bytes fit into buffer
        memcpy(w->buf + w->len, src, len);
        w->len += len;
        return;
    }

    // ensure current buffer bytes go out, before crossing strides
    flush(w);

    // emit all chunks striding beyond/at the buffer's capacity: these go
    // straight to the destination, without being copied into the buffer
    for (; len >= w->cap; src += w->cap, len -= w->cap) {
        if (fwrite(src, w->cap, 1, w->out) < 1) {
            w->done = true;
            return;
        }
    }

    // now all, if any, remaining bytes will fit into the (just emptied) buffer
    memcpy(w->buf, src, len);
    w->len += len;
}

// write_byte does as it says, flushing first when the buffer is full
void write_byte(bufwriter* w, unsigned char b) {
    if (w->len >= w->cap) {
        flush(w);
    }
    w->buf[w->len] = b;
    w->len++;
}

// EMIT_CONST abstracts a common use-case of the bufwriter, which is
// emitting string constants without their final null byte; `x` must be a
// string literal (or char array), since its size comes from `sizeof`
#define EMIT_CONST(w, x) \
    write_bytes((w), (const unsigned char*)(x), sizeof(x) - 1)

// can be anything: ensure this value differs from all other literal bytes
// in the generic-headers table: failing that, its value could cause subtle
// type-misdetection bugs; the value is chosen to be `obviously` findable
// in the source, which also implies a constant beyond the ascii range, as
// ascii char-constants are also used in the tables
//
// it's an enum constant, rather than a const object, because the header
// tables have static storage, and ISO C only allows constant expressions
// in their initializers
enum { cba = 0xfd }; // 253

#define aiff "audio/aiff"
#define au "audio/basic"
#define avi "video/avi"
#define avif "image/avif"
#define bmp "image/x-bmp"
#define caf "audio/x-caf"
#define cur "image/vnd.microsoft.icon"
#define css "text/css"
#define csv "text/csv"
#define djvu "image/x-djvu"
#define elf "application/x-elf"
#define exe "application/vnd.microsoft.portable-executable"
#define flac "audio/x-flac"
#define gif "image/gif"
#define gz "application/gzip"
#define heic "image/heic"
#define htm "text/html"
#define html "text/html"
#define ico "image/x-icon"
#define iso "application/octet-stream"
#define jpg "image/jpeg"
#define jpeg "image/jpeg"
#define js "application/javascript"
#define json "application/json"
#define m4a "audio/aac"
#define m4v "video/x-m4v"
#define mid "audio/midi"
#define mov "video/quicktime"
#define mp4 "video/mp4"
#define mp3 "audio/mpeg"
#define mpg "video/mpeg"
#define ogg "audio/ogg"
#define opus "audio/opus"
#define pdf "application/pdf"
#define png "image/png"
#define ps "application/postscript"
#define psd "image/vnd.adobe.photoshop"
#define rtf "application/rtf"
#define sqlite3 "application/x-sqlite3"
#define svg "image/svg+xml"
#define text "text/plain"
#define tiff "image/tiff"
#define tsv "text/tsv"
#define wasm "application/wasm"
#define wav "audio/x-wav"
#define webp "image/webp"
#define webm "video/webm"
#define xml "application/xml"
#define zip "application/zip"
#define zst "application/zstd"

// format_descriptor ties a file-header pattern to its data-format type
typedef struct format_descriptor {
    // header_length is how many of the header bytes make up the pattern
    unsigned char header_length;

    // header_bytes is the pattern: `cba` entries stand for `any byte`
    unsigned char header_bytes[24];

    // mime is the MIME type the pattern stands for
    const char* mime;
} format_descriptor;

// starts_as tries to match header data to the pattern given: this includes
// allowing `any byte` when the pattern indicates so, using a value reserved
// for that purpose
//
// x and xlen are the data to check; y and ylen are the pattern; the result
// is true only when the first ylen bytes of the data fit the pattern
bool starts_as(const unsigned char* x, size_t xlen,
               const unsigned char* y, size_t ylen) {
    // when header data aren't enough for a pattern, there's no match
    if (xlen < ylen) {
        return false;
    }

    // only the pattern's own bytes are compared: going on for the whole
    // data length would check bytes past the pattern, and even read
    // beyond the array holding it
    for (size_t i = 0; i < ylen; i++) {
        if (y[i] == cba) {
            // `can be anything` value always matches
            continue;
        }

        if (x[i] != y[i]) {
            return false;
        }
    }

    return true;
}

// wrapper func to make func `starts_as` harder to miscall; it's `static`
// because a plain `inline` definition doesn't provide the external
// definition a non-inlined call would need at link time
static inline bool match_header(const unsigned char* d, size_t len,
                                const format_descriptor* to) {
    return starts_as(d, len, to->header_bytes, to->header_length);
}

// not confident enough to actually use this, and replace all table entries
// NOTE(review): only the head of this macro lies within this part of the
// file; verify that its replacement list follows intact
#define start_format_descriptor(...) 
\ sizeof((unsigned char[]){ __VA_ARGS__ }) / sizeof(unsigned char), \ { __VA_ARGS__ } // format markers with leading wildcards, which should be checked before the // normal ones: this is to prevent mismatches with the latter types, even // though you can make probabilistic arguments which suggest these mismatches // should be very unlikely in practice format_descriptor special_headers[] = { {12, {cba, cba, cba, cba, 'f', 't', 'y', 'p', 'M', '4', 'A', ' '}, m4a}, {12, {cba, cba, cba, cba, 'f', 't', 'y', 'p', 'M', '4', 'A', 000}, m4a}, {12, {cba, cba, cba, cba, 'f', 't', 'y', 'p', 'M', 'S', 'N', 'V'}, mp4}, {12, {cba, cba, cba, cba, 'f', 't', 'y', 'p', 'i', 's', 'o', 'm'}, mp4}, {12, {cba, cba, cba, cba, 'f', 't', 'y', 'p', 'm', 'p', '4', '2'}, m4v}, {12, {cba, cba, cba, cba, 'f', 't', 'y', 'p', 'q', 't', ' ', ' '}, mov}, {12, {cba, cba, cba, cba, 'f', 't', 'y', 'p', 'h', 'e', 'i', 'c'}, heic}, {12, {cba, cba, cba, cba, 'f', 't', 'y', 'p', 'a', 'v', 'i', 'f'}, avif}, { 24, { cba, cba, cba, cba, 'f', 't', 'y', 'p', 'd', 'a', 's', 'h', 000, 000, 000, 000, 'i', 's', 'o', '6', 'm', 'p', '4', '1', }, m4a, }, {0}, }; format_descriptor hdr_dispatch_0[] = { {4, {000, 000, 001, 0xBA}, mpg}, {4, {000, 000, 001, 0xB3}, mpg}, {4, {000, 000, 001, 000}, ico}, {4, {000, 000, 002, 000}, cur}, {4, {000, 'a', 's', 'm'}, wasm}, {0}, }; format_descriptor hdr_dispatch_26[] = { {4, {0x1A, 0x45, 0xDF, 0xA3}, webm}, {0}, }; format_descriptor hdr_dispatch_31[] = { // {4, {0x1F, 0x8B, 0x08, 0x08}, gz}, {3, {0x1F, 0x8B, 0x08}, gz}, {0}, }; format_descriptor hdr_dispatch_35[] = { {3, "#! 
", text}, {3, "#!/", text}, {0}, }; format_descriptor hdr_dispatch_37[] = { {4, "%PDF", pdf}, {4, "%!PS", ps}, {0}, }; format_descriptor hdr_dispatch_40[] = { {4, {0x28, 0xB5, 0x2F, 0xFD}, zst}, {0}, }; format_descriptor hdr_dispatch_46[] = { {4, ".snd", au}, {0}, }; format_descriptor hdr_dispatch_56[] = { {4, "8BPS", psd}, {0}, }; format_descriptor hdr_dispatch_60[] = { {14, " 0; i++) { if (match_header(buf, len, &special_headers[i])) { return special_headers[i].mime; } } format_descriptor* guesses = hdr_dispatch[buf[0]]; if (guesses == NULL) { return fallback_mime_type; } for (size_t i = 0; guesses[i].header_length > 0; i++) { if (match_header(buf, len, &guesses[i])) { return guesses[i].mime; } } return fallback_mime_type; } bool is_mime_disabled(const char* mime) { return (mime[0] == 'n') && ( strcmp(mime, "no") == 0 || strcmp(mime, "nomime") == 0 || strcmp(mime, "no-mime") == 0 || strcmp(mime, "none") == 0 || strcmp(mime, "not") ); } // start_data_uri starts the output by declaring the data-URI to be an // auto-detected MIME-type; the return value is the auto-detection success bool start_data_uri(bufwriter* w, unsigned char* buf, size_t len) { const char* mime = guess_mime(buf, len); if (is_mime_disabled(mime)) { return true; } if (mime == NULL || mime[0] == 0) { return false; } EMIT_CONST(w, "data:"); for (size_t i = 0; mime[i] != 0; i++) { write_byte(w, mime[i]); } EMIT_CONST(w, ";base64,"); return true; } const unsigned char base64_lookup[] = "" "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" ""; inline uint32_t combine_triple(unsigned char data[4]) { return (data[0] << 16) | (data[1] << 8) | data[2]; } inline void emit_triple(bufwriter* w, uint32_t v) { write_byte(w, base64_lookup[0x3f & (v >> 18)]); write_byte(w, base64_lookup[0x3f & (v >> 12)]); write_byte(w, base64_lookup[0x3f & (v >> 6)]); write_byte(w, base64_lookup[0x3f & v]); } inline void emit_couple(bufwriter* w, uint32_t v) { write_byte(w, base64_lookup[0x3f & (v >> 18)]); 
write_byte(w, base64_lookup[0x3f & (v >> 12)]); write_byte(w, base64_lookup[0x3f & (v >> 6)]); write_byte(w, '='); } inline void emit_single(bufwriter* w, uint32_t v) { write_byte(w, base64_lookup[0x3f & (v >> 18)]); write_byte(w, base64_lookup[0x3f & (v >> 12)]); write_byte(w, '='); write_byte(w, '='); } bool handle_reader(bufwriter* w, FILE* src, const char* path) { // size of the input-buffer must be a multiple of 3 unsigned char buf[48 * 1024]; size_t chunks = 0; size_t where = 0; unsigned char triple[4]; triple[0] = 0; triple[1] = 0; triple[2] = 0; triple[3] = 0; while (!w->done) { const size_t n = fread(&buf, sizeof(unsigned char), sizeof(buf), src); if (n < 1) { // assume input is over when no bytes were read break; } if (chunks == 0) { char* fmt = "\x1b[31mcan't auto-detect MIME type for %s\x1b[0m\n"; if (!start_data_uri(w, buf, n)) { write_byte(w, '\n'); flush(w); fprintf(stderr, fmt, path); return false; } } chunks++; for (size_t i = 0; i < n; i++) { triple[where] = buf[i]; if (where < 2) { where++; } else { where = 0; emit_triple(w, combine_triple(triple)); } } } // empty inputs result in empty outputs if (chunks == 0) { return true; } // don't forget unemitted trailing bytes, if any: these need special // handling, as they include `=` signs; if the input bytes were a // multiple of 3, there won't be any trailing bytes switch (where) { case 1: triple[1] = 0; triple[2] = 0; emit_single(w, combine_triple(triple)); break; case 2: triple[2] = 0; emit_couple(w, combine_triple(triple)); break; } // end with a line-feed, so multiple input streams are each encoded in // their own line if (chunks > 0) { write_byte(w, '\n'); } flush(w); return true; } // handle_file handles data from the filename given; returns false only when // an error happened bool handle_file(bufwriter* w, const char* path) { // a `-` filename stands for the standard input if (path[0] == '-' && path[1] == 0) { return handle_reader(w, stdin, stdin_name); } FILE* f = fopen(path, "rb"); if (f == 
NULL) { fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", path); return false; } const bool ok = handle_reader(w, f, path); fclose(f); return ok; } // is_help_option simplifies control-flow for func run bool is_help_option(char* s) { return s[0] == '-' && ( strcmp(s, "-h") == 0 || strcmp(s, "-help") == 0 || strcmp(s, "--h") == 0 || strcmp(s, "--help") == 0 ); } // is_fallback_option simplifies control-flow for func run bool is_fallback_option(char* s) { return s[0] == '-' && ( strcmp(s, "-f") == 0 || strcmp(s, "-fallback") == 0 || strcmp(s, "--f") == 0 || strcmp(s, "--fallback") == 0 ); } const char* fallback_aliases[] = { // "text/json", "application/json", // "xbmp", "image/x-bmp", // "xflac", "audio/x-flac", // "xicon", "image/x-icon", // "xm4v", "video/x-m4v", // "xsqlite3", "application/x-sqlite3", // "xwav", "audio/x-wav", // "xwave", "audio/x-wav", // "x-bmp", "image/x-bmp", // "x-flac", "audio/x-flac", // "x-icon", "image/x-icon", // "x-m4v", "video/x-m4v", // "x-sqlite3", "application/x-sqlite3", // "x-wav", "audio/x-wav", "b", "application/octet-stream", "j", "application/json", "t", "text/plain", "u", "text/plain; charset=UTF-8", "e", "", "err", "", "error", "", "f", "", "fail", "", "aac", "audio/aac", "aif", "audio/aiff", "bin", "application/octet-stream", "binary", "application/octet-stream", "gzip", "application/gzip", "midi", "audio/midi", "mpeg", "video/mpeg", "octet", "application/octet-stream", "octetstream", "application/octet-stream", "octet-stream", "application/octet-stream", "plain", "text/plain", "sqlite", "application/x-sqlite3", "svg+xml", "image/svg+xml", "tif", "image/tiff", "utf8", "text/plain; charset=UTF-8", "utf-8", "text/plain; charset=UTF-8", "wave", "audio/x-wav", "zstd", "application/zstd", "aiff", "audio/aiff", "au", "audio/basic", "avi", "video/avi", "avif", "image/avif", "bmp", "image/x-bmp", "caf", "audio/x-caf", "cur", "image/vnd.microsoft.icon", "css", "text/css", "csv", "text/csv", "djvu", "image/x-djvu", "elf", 
"application/x-elf", "exe", "application/vnd.microsoft.portable-executable", "flac", "audio/x-flac", "gif", "image/gif", "gz", "application/gzip", "heic", "image/heic", "htm", "text/html", "html", "text/html", "ico", "image/x-icon", "iso", "application/octet-stream", "jpg", "image/jpeg", "jpeg", "image/jpeg", "js", "application/javascript", "json", "application/json", "m4a", "audio/aac", "m4v", "video/x-m4v", "mid", "audio/midi", "mov", "video/quicktime", "mp4", "video/mp4", "mp3", "audio/mpeg", "mpg", "video/mpeg", "ogg", "audio/ogg", "opus", "audio/opus", "pdf", "application/pdf", "png", "image/png", "ps", "application/postscript", "psd", "image/vnd.adobe.photoshop", "rtf", "application/rtf", "sqlite3", "application/x-sqlite3", "svg", "image/svg+xml", "text", "text/plain", "tiff", "image/tiff", "tsv", "text/tsv", "wasm", "application/wasm", "wav", "audio/x-wav", "webp", "image/webp", "webm", "video/webm", "xml", "application/xml", "zip", "application/zip", "zst", "application/zstd", }; const char* adapt_fallback(char* name) { for (size_t i = 0; i < sizeof(fallback_aliases) / sizeof(char*); i += 2) { if (strcmp(name, fallback_aliases[i]) == 0) { return fallback_aliases[i + 1]; } } return name; } // run returns the number of errors size_t run(int argc, char** argv) { // handle special cmd-line options for (size_t i = 1; i < argc; i++) { if (is_help_option(argv[i])) { // help option is handled right away, also quitting the app puts(info); return 0; } } bufwriter w; unsigned char buf[48 * 1024]; init_bufwriter(&w, stdout, buf, sizeof(buf)); size_t files = 0; size_t errors = 0; bool change_fallback = false; // handle all filenames given for (size_t i = 1; i < argc && !w.done; i++) { if (change_fallback) { fallback_mime_type = adapt_fallback(argv[i]); change_fallback = false; continue; } if (is_fallback_option(argv[i])) { change_fallback = true; continue; } if (!handle_file(&w, argv[i])) { errors++; } files++; } if (change_fallback) { fprintf(stderr, "\x1b[31mforgot 
new fallback MIME-type\x1b[0m\n"); errors++; } // no filenames means use stdin as the only input if (files == 0) { if (!handle_reader(&w, stdin, stdin_name)) { errors++; } } return errors; } int main(int argc, char** argv) { #ifdef _WIN32 setmode(fileno(stdin), O_BINARY); // ensure output lines end in LF instead of CRLF on windows setmode(fileno(stdout), O_BINARY); setmode(fileno(stderr), O_BINARY); #endif // disable automatic stdio buffering, in favor of explicit buffering setvbuf(stdin, NULL, _IONBF, 0); setvbuf(stdout, NULL, _IONBF, 0); setvbuf(stderr, NULL, _IONBF, 0); // fill entries in the type-detect dispatch table hdr_dispatch[0] = hdr_dispatch_0; // 0 hdr_dispatch[26] = hdr_dispatch_26; // 26 hdr_dispatch[31] = hdr_dispatch_31; // 31 hdr_dispatch[35] = hdr_dispatch_35; // 35 # hdr_dispatch[37] = hdr_dispatch_37; // 37 % hdr_dispatch[40] = hdr_dispatch_40; // 40 ( hdr_dispatch[46] = hdr_dispatch_46; // 46 . hdr_dispatch[56] = hdr_dispatch_56; // 56 8 hdr_dispatch[60] = hdr_dispatch_60; // 60 < hdr_dispatch[65] = hdr_dispatch_65; // 65 A hdr_dispatch[66] = hdr_dispatch_66; // 66 B hdr_dispatch[70] = hdr_dispatch_70; // 70 F hdr_dispatch[71] = hdr_dispatch_71; // 71 G hdr_dispatch[73] = hdr_dispatch_73; // 73 I hdr_dispatch[77] = hdr_dispatch_77; // 77 M hdr_dispatch[79] = hdr_dispatch_79; // 79 O hdr_dispatch[80] = hdr_dispatch_80; // 80 P hdr_dispatch[82] = hdr_dispatch_82; // 82 R hdr_dispatch[83] = hdr_dispatch_83; // 83 S hdr_dispatch[99] = hdr_dispatch_99; // 99 c hdr_dispatch[102] = hdr_dispatch_102; // 102 f hdr_dispatch[123] = hdr_dispatch_123; // 123 { hdr_dispatch[127] = hdr_dispatch_127; // 127 hdr_dispatch[137] = hdr_dispatch_137; // 137 hdr_dispatch[255] = hdr_dispatch_255; // 255 return run(argc, argv) == 0 ? 0 : 1; }