File: vulgarize.c 1 /* 2 The MIT License (MIT) 3 4 Copyright © 2020-2025 pacman64 5 6 Permission is hereby granted, free of charge, to any person obtaining a copy of 7 this software and associated documentation files (the “Software”), to deal 8 in the Software without restriction, including without limitation the rights to 9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 10 of the Software, and to permit persons to whom the Software is furnished to do 11 so, subject to the following conditions: 12 13 The above copyright notice and this permission notice shall be included in all 14 copies or substantial portions of the Software. 15 16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 SOFTWARE. 23 */ 24 25 /* 26 You can build this command-line app by running 27 28 cc -Wall -s -O3 -march=native -mtune=native -flto -o ./vulgarize ./vulgarize.c 29 */ 30 31 #include <stdbool.h> 32 #include <stdio.h> 33 #include <stdlib.h> 34 #include <string.h> 35 36 #ifdef _WIN32 37 #include <fcntl.h> 38 #include <windows.h> 39 #endif 40 41 #ifdef RED_ERRORS 42 #define ERROR_STYLE "\x1b[38;2;204;0;0m" 43 #ifdef __APPLE__ 44 #define ERROR_STYLE "\x1b[31m" 45 #endif 46 #define RESET_STYLE "\x1b[0m" 47 #else 48 #define ERROR_STYLE 49 #define RESET_STYLE 50 #endif 51 52 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n") 53 54 const char* info = "" 55 "vulgarize [filenames...]\n" 56 "\n" 57 "Turn files with ISO-8859-1 text into UTF8 text. The name `-` stands for the\n" 58 "standard input. When no names are given, the standard input is used by\n" 59 "default.\n" 60 ""; 61 62 // slice is a growable region of bytes in memory 63 typedef struct slice { 64 // ptr is the starting place of the region 65 unsigned char* ptr; 66 67 // len is how many bytes are currently being used 68 size_t len; 69 70 // cap is how many bytes the memory region has available 71 size_t cap; 72 } slice; 73 74 /* 75 tbp = 'range(0, 256)' | iconv -f iso-8859-1 -t utf8 | 76 tbp '(f"{str(e)}, " if e >= 128 else f"{str(e)}, 0, " for e in d)' 77 */ 78 const unsigned char iso2utf8[512] = { 79 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9, 0, 10, 0, 80 11, 0, 12, 0, 13, 0, 14, 0, 15, 0, 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 81 21, 0, 22, 0, 23, 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 82 31, 0, 32, 0, 33, 0, 34, 0, 35, 0, 36, 0, 37, 0, 38, 0, 39, 0, 40, 0, 83 41, 0, 42, 0, 43, 0, 44, 0, 45, 0, 46, 0, 47, 0, 48, 0, 49, 0, 50, 0, 84 51, 0, 52, 0, 53, 0, 54, 0, 55, 0, 56, 0, 57, 0, 58, 0, 59, 0, 60, 0, 85 61, 0, 62, 0, 63, 0, 64, 0, 65, 0, 66, 0, 67, 0, 68, 0, 69, 0, 70, 0, 86 71, 0, 72, 0, 73, 0, 74, 0, 75, 0, 76, 0, 77, 0, 78, 0, 79, 0, 80, 0, 87 81, 0, 82, 0, 83, 0, 84, 0, 85, 0, 86, 0, 87, 0, 88, 0, 89, 0, 90, 0, 88 91, 0, 92, 0, 93, 0, 94, 0, 95, 0, 96, 0, 97, 0, 98, 0, 99, 0, 100, 0, 89 101, 0, 102, 0, 103, 0, 104, 0, 105, 0, 106, 0, 107, 0, 108, 0, 109, 0, 90 110, 0, 111, 0, 112, 0, 113, 0, 114, 0, 115, 0, 116, 0, 117, 0, 118, 0, 91 119, 0, 120, 0, 121, 0, 122, 0, 123, 0, 124, 0, 125, 0, 126, 0, 127, 0, 92 194, 128, 194, 129, 194, 130, 194, 131, 194, 132, 194, 133, 194, 134, 93 194, 135, 194, 136, 194, 137, 194, 138, 194, 139, 194, 140, 194, 141, 94 194, 142, 194, 143, 194, 144, 194, 145, 194, 146, 194, 147, 194, 148, 95 194, 149, 194, 150, 194, 151, 194, 152, 194, 153, 194, 154, 194, 155, 96 194, 156, 194, 157, 194, 158, 194, 159, 194, 160, 194, 161, 194, 162, 97 194, 163, 194, 164, 194, 165, 194, 166, 194, 167, 194, 168, 194, 169, 98 194, 170, 194, 171, 194, 172, 194, 173, 194, 174, 194, 175, 194, 176, 99 194, 177, 194, 178, 194, 179, 194, 180, 194, 181, 194, 182, 194, 183, 100 194, 184, 194, 185, 194, 186, 194, 187, 194, 188, 194, 189, 194, 190, 101 194, 191, 195, 128, 195, 129, 195, 130, 195, 131, 195, 132, 195, 133, 102 195, 134, 195, 135, 195, 136, 195, 137, 195, 138, 195, 139, 195, 140, 103 195, 141, 195, 142, 195, 143, 195, 144, 195, 145, 195, 146, 195, 147, 104 195, 148, 195, 149, 195, 150, 195, 151, 195, 152, 195, 153, 195, 154, 105 195, 155, 195, 156, 195, 157, 195, 158, 195, 159, 195, 160, 195, 161, 106 195, 162, 195, 163, 195, 164, 195, 165, 195, 166, 195, 167, 195, 168, 107 195, 169, 195, 170, 195, 171, 195, 172, 195, 173, 195, 174, 195, 175, 108 195, 176, 195, 177, 195, 178, 195, 179, 195, 180, 195, 181, 195, 182, 109 195, 183, 195, 184, 195, 185, 195, 186, 195, 187, 195, 188, 195, 189, 110 195, 190, 195, 191, 111 }; 112 113 // handle_reader skips leading UTF-8 BOMs (byte-order marks), and turns all 114 // CR-LF pairs into single LF bytes 115 bool handle_reader(FILE* w, FILE* r, slice* line, slice* out) { 116 while (!feof(w)) { 117 ssize_t len = getline((char**)&line->ptr, &line->cap, r); 118 if (line->ptr == NULL) { 119 fprintf(stderr, ERROR_LINE("out of memory")); 120 return false; 121 } 122 123 if (len < 0) { 124 break; 125 } 126 127 // ensure there's room for up to twice the current line's byte-count 128 if (out->cap < 2 * line->cap) { 129 size_t new_cap = 2 * line->cap; 130 void* new_ptr = realloc(out->ptr, new_cap); 131 if (new_ptr == NULL) { 132 fprintf(stderr, ERROR_LINE("out of memory")); 133 return false; 134 } 135 out->ptr = new_ptr; 136 out->cap = new_cap; 137 } 138 139 if (out->ptr == NULL) { 140 fprintf(stderr, ERROR_LINE("out of memory")); 141 return false; 142 } 143 144 size_t olen = 0; 145 unsigned char* dest = out->ptr; 146 147 for (size_t i = 0; i < len; i++) { 148 const unsigned char b = line->ptr[i]; 149 const size_t j = 2 * b; 150 dest[olen + 0] = iso2utf8[j + 0]; 151 dest[olen + 1] = iso2utf8[j + 1]; 152 olen += 1 + (b >> 7); 153 } 154 155 fwrite(out->ptr, olen, 1, w); 156 fflush(w); 157 } 158 159 return true; 160 } 161 162 // handle_file handles data from the filename given; returns false only when 163 // the file can't be opened 164 bool handle_file(FILE* w, const char* path, slice* line, slice* out) { 165 FILE* f = fopen(path, "rb"); 166 if (f == NULL) { 167 fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path); 168 return false; 169 } 170 171 const bool ok = handle_reader(w, f, line, out); 172 fclose(f); 173 return ok; 174 } 175 176 // run returns the number of errors 177 int run(int argc, char** argv, FILE* w) { 178 size_t dashes = 0; 179 for (int i = 1; i < argc; i++) { 180 if (argv[i][0] == '-' && argv[i][1] == 0) { 181 dashes++; 182 } 183 } 184 185 if (dashes > 1) { 186 const char* m = "can't use the standard input (dash) more than once"; 187 fprintf(stderr, ERROR_LINE("%s"), m); 188 return 1; 189 } 190 191 slice line; 192 line.len = 0; 193 line.cap = 32 * 1024; 194 line.ptr = malloc(line.cap); 195 196 if (line.ptr == NULL) { 197 fprintf(stderr, ERROR_LINE("out of memory")); 198 return 1; 199 } 200 201 // out is the destination for decoded bytes from lines: since all bytes 202 // except trailing line-feeds could expand each into 2 bytes, twice the 203 // space is needed just in case 204 slice out; 205 out.len = 0; 206 out.cap = 2 * line.cap; 207 out.ptr = malloc(out.cap); 208 209 if (out.ptr == NULL) { 210 free(line.ptr); 211 fprintf(stderr, ERROR_LINE("out of memory")); 212 return 1; 213 } 214 215 size_t errors = 0; 216 for (int i = 1; i < argc && !feof(w) && line.ptr != NULL; i++) { 217 if (argv[i][0] == '-' && argv[i][1] == 0) { 218 if (!handle_reader(w, stdin, &line, &out)) { 219 errors++; 220 } 221 continue; 222 } 223 224 if (!handle_file(w, argv[i], &line, &out)) { 225 errors++; 226 } 227 } 228 229 // use stdin when not given any filepaths 230 if (argc <= 1) { 231 if (!handle_reader(w, stdin, &line, &out)) { 232 errors++; 233 } 234 } 235 236 free(line.ptr); 237 free(out.ptr); 238 return errors; 239 } 240 241 int main(int argc, char** argv) { 242 #ifdef _WIN32 243 setmode(fileno(stdin), O_BINARY); 244 // ensure output lines end in LF instead of CRLF on windows 245 setmode(fileno(stdout), O_BINARY); 246 setmode(fileno(stderr), O_BINARY); 247 #endif 248 249 if (argc > 1) { 250 if ( 251 strcmp(argv[1], "-h") == 0 || 252 strcmp(argv[1], "-help") == 0 || 253 strcmp(argv[1], "--h") == 0 || 254 strcmp(argv[1], "--help") == 0 255 ) { 256 fprintf(stdout, "%s", info); 257 return 0; 258 } 259 } 260 261 return run(argc, argv, stdout) == 0 ? 0 : 1; 262 }