// File: vulgarize.c
/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O3 -march=native -mtune=native -flto -o ./vulgarize ./vulgarize.c
*/

// getline and ssize_t come from POSIX, not from ISO C: ask for them
// explicitly, so strict standard modes (such as -std=c11) declare them too
#if !defined(_WIN32) && !defined(_POSIX_C_SOURCE)
#define _POSIX_C_SOURCE 200809L
#endif

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

// ERROR_STYLE and RESET_STYLE are string literals pasted around error
// messages: they're ANSI color codes when RED_ERRORS is defined, and empty
// otherwise; each macro is defined exactly once per configuration, so the
// apple-specific color never clashes with the general one
#ifdef RED_ERRORS
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define RESET_STYLE "\x1b[0m"
#else
#define ERROR_STYLE
#define RESET_STYLE
#endif

// ERROR_LINE turns a string literal into a styled line ending with a line-feed
#define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")

// BAD_ALLOC is the exit code used when memory can't be allocated
#define BAD_ALLOC 2

// info is the help message shown by the help options
const char* info = ""
"vulgarize [filenames...]\n"
"\n"
"Turn files with ISO-8859-1 text into UTF8 text. The name `-` stands for the\n"
"standard input. When no names are given, the standard input is used by\n"
"default.\n"
"";

// slice is a growable region of bytes in memory
typedef struct slice {
    // ptr is the starting place of the region
    unsigned char* ptr;

    // cap is how many bytes the memory region has available
    size_t cap;
} slice;

/*
tbp = 'range(0, 256)' | iconv -f iso-8859-1 -t utf8 |
    tbp '(f"{str(e)}, " if e >= 128 else f"{str(e)}, 0, " for e in d)'
*/
// iso2utf8 holds 2 bytes for each of the 256 possible input bytes: entry
// pairs for bytes below 128 are the byte itself followed by a 0 filler, while
// pairs for bytes from 128 up are the 2-byte UTF-8 sequence for that byte
const unsigned char iso2utf8[512] = {
    0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9, 0, 10, 0,
    11, 0, 12, 0, 13, 0, 14, 0, 15, 0, 16, 0, 17, 0, 18, 0, 19, 0, 20, 0,
    21, 0, 22, 0, 23, 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0,
    31, 0, 32, 0, 33, 0, 34, 0, 35, 0, 36, 0, 37, 0, 38, 0, 39, 0, 40, 0,
    41, 0, 42, 0, 43, 0, 44, 0, 45, 0, 46, 0, 47, 0, 48, 0, 49, 0, 50, 0,
    51, 0, 52, 0, 53, 0, 54, 0, 55, 0, 56, 0, 57, 0, 58, 0, 59, 0, 60, 0,
    61, 0, 62, 0, 63, 0, 64, 0, 65, 0, 66, 0, 67, 0, 68, 0, 69, 0, 70, 0,
    71, 0, 72, 0, 73, 0, 74, 0, 75, 0, 76, 0, 77, 0, 78, 0, 79, 0, 80, 0,
    81, 0, 82, 0, 83, 0, 84, 0, 85, 0, 86, 0, 87, 0, 88, 0, 89, 0, 90, 0,
    91, 0, 92, 0, 93, 0, 94, 0, 95, 0, 96, 0, 97, 0, 98, 0, 99, 0, 100, 0,
    101, 0, 102, 0, 103, 0, 104, 0, 105, 0, 106, 0, 107, 0, 108, 0, 109, 0,
    110, 0, 111, 0, 112, 0, 113, 0, 114, 0, 115, 0, 116, 0, 117, 0, 118, 0,
    119, 0, 120, 0, 121, 0, 122, 0, 123, 0, 124, 0, 125, 0, 126, 0, 127, 0,
    194, 128, 194, 129, 194, 130, 194, 131, 194, 132, 194, 133, 194, 134,
    194, 135, 194, 136, 194, 137, 194, 138, 194, 139, 194, 140, 194, 141,
    194, 142, 194, 143, 194, 144, 194, 145, 194, 146, 194, 147, 194, 148,
    194, 149, 194, 150, 194, 151, 194, 152, 194, 153, 194, 154, 194, 155,
    194, 156, 194, 157, 194, 158, 194, 159, 194, 160, 194, 161, 194, 162,
    194, 163, 194, 164, 194, 165, 194, 166, 194, 167, 194, 168, 194, 169,
    194, 170, 194, 171, 194, 172, 194, 173, 194, 174, 194, 175, 194, 176,
    194, 177, 194, 178, 194, 179, 194, 180, 194, 181, 194, 182, 194, 183,
    194, 184, 194, 185, 194, 186, 194, 187, 194, 188, 194, 189, 194, 190,
    194, 191, 195, 128, 195, 129, 195, 130, 195, 131, 195, 132, 195, 133,
    195, 134, 195, 135, 195, 136, 195, 137, 195, 138, 195, 139, 195, 140,
    195, 141, 195, 142, 195, 143, 195, 144, 195, 145, 195, 146, 195, 147,
    195, 148, 195, 149, 195, 150, 195, 151, 195, 152, 195, 153, 195, 154,
    195, 155, 195, 156, 195, 157, 195, 158, 195, 159, 195, 160, 195, 161,
    195, 162, 195, 163, 195, 164, 195, 165, 195, 166, 195, 167, 195, 168,
    195, 169, 195, 170, 195, 171, 195, 172, 195, 173, 195, 174, 195, 175,
    195, 176, 195, 177, 195, 178, 195, 179, 195, 180, 195, 181, 195, 182,
    195, 183, 195, 184, 195, 185, 195, 186, 195, 187, 195, 188, 195, 189,
    195, 190, 195, 191,
};

// die_out_of_memory reports an allocation failure on stderr, then quits the
// app with exit code BAD_ALLOC; when mid_output is true, an empty line is
// emitted first, as the lines-handling code has always done
static void die_out_of_memory(bool mid_output) {
    if (mid_output) {
        fprintf(stderr, "\n");
    }
    fprintf(stderr, ERROR_LINE("out of memory"));
    exit(BAD_ALLOC);
}

// handle_reader converts all lines read from r into UTF-8, writing results
// to w; both buffers given are reused across lines/calls, and are grown as
// needed: it stops when the input ends (or fails), or when a write to w fails,
// and quits the whole app with exit code BAD_ALLOC when out of memory
void handle_reader(FILE* w, FILE* r, slice* line, slice* out) {
    // only read operations set end-of-file indicators, so checking feof on
    // the output would never stop this loop: a failed write sets the error
    // indicator instead, which is what's checked here
    while (!ferror(w)) {
        // getline wants a char**, so use a properly-typed temporary, instead
        // of casting the address of the unsigned-char pointer
        char* buf = (char*)line->ptr;
        errno = 0;
        const ssize_t len = getline(&buf, &line->cap, r);
        line->ptr = (unsigned char*)buf;

        if (len < 0) {
            // getline signals failed allocations by returning -1 with errno
            // set to ENOMEM, without nulling the buffer it was given
            if (line->ptr == NULL || errno == ENOMEM) {
                die_out_of_memory(true);
            }
            // end of input, or a read error
            break;
        }

        // doubling the line capacity must not wrap around
        if (line->cap > SIZE_MAX / 2) {
            die_out_of_memory(true);
        }

        // ensure there's room for up to twice the current line's byte-count
        const size_t needed = 2 * line->cap;
        if (out->ptr == NULL || out->cap < needed) {
            void* grown = realloc(out->ptr, needed);
            if (grown == NULL) {
                die_out_of_memory(true);
            }
            out->ptr = grown;
            out->cap = needed;
        }

        const size_t n = (size_t)len;
        unsigned char* dest = out->ptr;
        size_t olen = 0;

        // always copy both table bytes, but advance by 1 for bytes below 128
        // and by 2 otherwise: the 0 filler after an ASCII byte is either
        // overwritten by the next byte's output, or left out of the write
        for (size_t i = 0; i < n; i++) {
            const unsigned char b = line->ptr[i];
            const size_t j = 2 * (size_t)b;
            dest[olen + 0] = iso2utf8[j + 0];
            dest[olen + 1] = iso2utf8[j + 1];
            olen += 1 + (b >> 7);
        }

        // output is flushed after every line; presumably so that live/piped
        // uses see each line right away
        fwrite(out->ptr, 1, olen, w);
        fflush(w);
    }
}

// handle_file handles data from the filename given; returns false only when
// the file can't be opened
bool handle_file(FILE* w, const char* path, slice* line, slice* out) {
    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
        return false;
    }

    handle_reader(w, f, line, out);
    fclose(f);
    return true;
}

// run returns the number of errors
int run(int argc, char** argv, FILE* w) {
    // the standard input can only be read once, so refuse multiple dashes
    size_t dashes = 0;
    for (int i = 1; i < argc; i++) {
        if (argv[i][0] == '-' && argv[i][1] == 0) {
            dashes++;
        }
    }

    if (dashes > 1) {
        const char* m = "can't use the standard input (dash) more than once";
        fprintf(stderr, ERROR_LINE("%s"), m);
        return 1;
    }

    slice line;
    line.cap = 32 * 1024;
    line.ptr = malloc(line.cap);

    if (line.ptr == NULL) {
        die_out_of_memory(false);
    }

    // out is the destination for decoded bytes from lines: since all bytes
    // except trailing line-feeds could expand each into 2 bytes, twice the
    // space is needed just in case
    slice out;
    out.cap = 2 * line.cap;
    out.ptr = malloc(out.cap);

    if (out.ptr == NULL) {
        free(line.ptr);
        die_out_of_memory(false);
    }

    // stop going through the names given as soon as the output fails: the
    // error indicator is checked, since feof is never set by writes
    int errors = 0;
    for (int i = 1; i < argc && !ferror(w); i++) {
        if (argv[i][0] == '-' && argv[i][1] == 0) {
            handle_reader(w, stdin, &line, &out);
            continue;
        }

        if (!handle_file(w, argv[i], &line, &out)) {
            errors++;
        }
    }

    // use stdin when not given any filepaths
    if (argc <= 1) {
        handle_reader(w, stdin, &line, &out);
    }

    free(line.ptr);
    free(out.ptr);
    return errors;
}

// main shows the help message when the first argument is one of the help
// options, and converts the inputs named otherwise; the exit code is 0 when
// no errors happened, and 1 otherwise
int main(int argc, char** argv) {
#ifdef _WIN32
    setmode(fileno(stdin), O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    setmode(fileno(stdout), O_BINARY);
    setmode(fileno(stderr), O_BINARY);
#endif

    if (argc > 1) {
        if (
            strcmp(argv[1], "-h") == 0 ||
            strcmp(argv[1], "-help") == 0 ||
            strcmp(argv[1], "--h") == 0 ||
            strcmp(argv[1], "--help") == 0
        ) {
            fprintf(stdout, "%s", info);
            return 0;
        }
    }

    return run(argc, argv, stdout) == 0 ? 0 : 1;
}