// File: fixlines.c
/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O3 -march=native -mtune=native -flto -o ./fixlines ./fixlines.c
*/

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

// ERROR_STYLE and RESET_STYLE are string-literal fragments which wrap error
// messages in ANSI escape sequences when RED_ERRORS is defined, and expand
// to nothing otherwise; Apple builds use the basic red escape instead of
// the 24-bit color one
#ifdef RED_ERRORS
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define RESET_STYLE "\x1b[0m"
#else
#define ERROR_STYLE
#define RESET_STYLE
#endif

// ERROR_LINE turns a string literal into a (possibly styled) format string
// ending with a line-feed
#define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")

// BAD_ALLOC is the exit code used when memory can't be allocated
#define BAD_ALLOC 2

// IBUF_SIZE is the size in bytes of the chunks read by handle_reader_faster;
// it can be overridden at build time
#ifndef IBUF_SIZE
#define IBUF_SIZE (32 * 1024)
#endif

// info is the help message
const char* info = ""
"fixlines [options...] [filepaths...]\n"
"\n"
"This tool fixes lines in UTF-8 text, ignoring leading UTF-8 BOMs, trailing\n"
"carriage-returns on all lines, and ensures no lines across inputs are\n"
"accidentally joined, since all lines it outputs end with line-feeds,\n"
"even when the original files don't.\n"
"\n"
"The only option available is to show this help message, using any of\n"
"`-h`, `--h`, `-help`, or `--help`, without the quotes.\n"
"";

// slice is a growable region of bytes in memory
typedef struct slice {
    // ptr is the starting place of the region
    unsigned char* ptr;

    // cap is how many bytes the memory region has available
    size_t cap;
} slice;

// starts_with_bom checks if the first n bytes at b begin with the 3-byte
// UTF-8 BOM (byte-order mark)
bool starts_with_bom(const unsigned char* b, const size_t n) {
    return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}

// find_cr returns the index of the first carriage-return among the first
// len bytes at s, or -1 when there isn't one
static inline int64_t find_cr(const unsigned char* s, size_t len) {
    const unsigned char* p = memchr(s, '\r', len);
    return (p != NULL) ? (int64_t)(p - s) : -1;
}

// handle_reader_faster copies all bytes from r to w in chunks of up to
// IBUF_SIZE bytes, skipping a UTF-8 BOM at the start of the first chunk,
// turning CRLF byte-pairs into single LFs (even when a pair is split across
// chunks), and ensuring the output ends with a line-feed, unless the input
// is empty; a carriage-return which is the very last byte of the input is
// replaced by the final line-feed
void handle_reader_faster(FILE* w, FILE* r) {
    unsigned char buf[IBUF_SIZE];
    // last is the last byte of the latest chunk read: starting it as a
    // line-feed means empty inputs don't get an extra line-feed emitted
    unsigned char last = '\n';

    // the end-of-file indicator is never set by writing, so what's checked
    // on the output is its error indicator, to stop early when writes fail
    for (size_t i = 0; !ferror(w); i++) {
        size_t len = fread(buf, sizeof(buf[0]), sizeof(buf), r);
        if (len < 1) {
            break;
        }

        unsigned char* ptr = buf;
        // the previous chunk ended with a carriage-return which was held
        // back: emit it now, unless it turns out to be part of a CRLF pair
        if (last == '\r' && ptr[0] != '\n') {
            fputc('\r', w);
        }
        last = ptr[len - 1];

        // get rid of leading UTF-8 BOM (byte-order mark) if 1st line has it
        if (i == 0 && starts_with_bom(ptr, len)) {
            ptr += 3;
            len -= 3;
        }

        // handle all carriage-returns in the current chunk
        while (len > 0) {
            const int64_t found = find_cr(ptr, len);
            if (found < 0) {
                break;
            }
            const size_t j = (size_t)found;

            // if carriage return is the last byte in chunk, remember that
            // to check the start of the next chunk
            if (j == len - 1) {
                fwrite(ptr, 1, len - 1, w);
                len = 0;
                break;
            }

            // if it's a CRLF byte-pair, just emit the LF from that; the
            // check above ensures index j + 1 is still inside the chunk
            if (ptr[j + 1] == '\n') {
                ptr[j] = '\n';
                fwrite(ptr, 1, j + 1, w);
                ptr += j + 2;
                len -= j + 2;
                continue;
            }

            // emit lone CRs inside chunks, as only CRLF byte-pairs are fixed
            fwrite(ptr, 1, j + 1, w);
            ptr += j + 1;
            len -= j + 1;
        }

        // don't forget trailing part after last carriage-return in chunk
        if (len > 0) {
            fwrite(ptr, 1, len, w);
        }
    }

    // handle edge-cases for the last chunk
    if (last != '\n') {
        fputc('\n', w);
    }

    fflush(w);
}

// handle_reader copies all lines from r to w, fixing them along the way;
// when live_lines is false, all the work is delegated to
// handle_reader_faster, and line isn't used; otherwise lines are read
// one at a time into the (growable) buffer in line, and the output is
// flushed after each line; the app quits with exit code BAD_ALLOC if the
// line-buffer pointer is null after trying to read a line
void handle_reader(FILE* w, FILE* r, slice* line, bool live_lines) {
    if (!live_lines) {
        handle_reader_faster(w, r);
        return;
    }

    // last is the last byte of the latest line read: starting it as a
    // line-feed means empty inputs don't get an extra line-feed emitted
    unsigned char last = '\n';

    // the end-of-file indicator is never set by writing, so what's checked
    // on the output is its error indicator, to stop early when writes fail
    for (size_t i = 0; !ferror(w); i++) {
        const ssize_t got = getline((char**)&line->ptr, &line->cap, r);
        if (line->ptr == NULL) {
            fprintf(stderr, "\n");
            fprintf(stderr, ERROR_LINE("out of memory"));
            exit(BAD_ALLOC);
        }

        if (got < 0) {
            break;
        }

        unsigned char* ptr = line->ptr;
        size_t len = (size_t)got;
        last = ptr[len - 1];

        // get rid of leading UTF-8 BOM (byte-order mark) if 1st line has it
        if (i == 0 && starts_with_bom(ptr, len)) {
            ptr += 3;
            len -= 3;
        }

        // turn trailing carriage-returns into line-feeds
        if (len >= 1 && ptr[len - 1] == '\r') {
            ptr[len - 1] = '\n';
            last = '\n';
        }

        // get rid of carriage-returns preceding line-feeds
        if (len >= 2 && ptr[len - 2] == '\r' && ptr[len - 1] == '\n') {
            ptr[len - 2] = '\n';
            len--;
        }

        fwrite(ptr, 1, len, w);
        fflush(w);
    }

    // handle edge-cases for the last line
    if (last != '\n') {
        fputc('\n', w);
    }

    fflush(w);
}

// handle_file handles data from the filename given; returns false only when
// the file can't be opened
bool handle_file(FILE* w, const char* path, slice* line, bool live_lines) {
    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
        return false;
    }

    handle_reader(w, f, line, live_lines);
    fclose(f);
    return true;
}

// run handles all the inputs named in args, where a single dash means the
// standard input, which is also what's used when no names are given; it
// returns the number of errors, which is how many files couldn't be opened,
// or 1 when the dash is used more than once, in which case nothing is read
int run(char** args, size_t nargs, FILE* w, bool live_lines) {
    size_t dashes = 0;
    for (size_t i = 0; i < nargs; i++) {
        if (args[i][0] == '-' && args[i][1] == 0) {
            dashes++;
        }
    }

    if (dashes > 1) {
        const char* m = "can't use the standard input (dash) more than once";
        fprintf(stderr, ERROR_LINE("%s"), m);
        return 1;
    }

    slice line;
    line.ptr = NULL;
    line.cap = 0;

    // the line buffer is only needed when reading line by line
    if (live_lines) {
        line.cap = 32 * 1024;
        line.ptr = malloc(line.cap);
        if (line.ptr == NULL) {
            fprintf(stderr, ERROR_LINE("out of memory"));
            exit(BAD_ALLOC);
        }
    }

    size_t errors = 0;
    // stop early when writing to the output has failed; such failures
    // aren't counted as errors
    for (size_t i = 0; i < nargs && !ferror(w); i++) {
        if (args[i][0] == '-' && args[i][1] == 0) {
            handle_reader(w, stdin, &line, live_lines);
            continue;
        }

        if (!handle_file(w, args[i], &line, live_lines)) {
            errors++;
        }
    }

    // use stdin when not given any filepaths
    if (nargs < 1) {
        handle_reader(w, stdin, &line, live_lines);
    }

    free(line.ptr);
    return (int)errors;
}

// main shows the help message when the first argument asks for it, and
// otherwise handles all inputs named, after skipping a leading `--`;
// the exit code is 0 on success, and 1 when any error happened
int main(int argc, char** argv) {
#ifdef _WIN32
    setmode(fileno(stdin), O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    setmode(fileno(stdout), O_BINARY);
    setmode(fileno(stderr), O_BINARY);
#endif

    if (argc > 1) {
        if (
            strcmp(argv[1], "-h") == 0 ||
            strcmp(argv[1], "-help") == 0 ||
            strcmp(argv[1], "--h") == 0 ||
            strcmp(argv[1], "--help") == 0
        ) {
            fprintf(stdout, "%s", info);
            return 0;
        }
    }

    size_t nargs = argc - 1;
    char** args = argv + 1;
    if (nargs > 0 && strcmp(args[0], "--") == 0) {
        nargs--;
        args++;
    }

    // lines are handled `live` (read and flushed one at a time) whenever
    // the current offset of the standard output isn't reported as 0, which
    // includes the case where lseek fails
    // NOTE(review): presumably meant to detect pipes/terminals - confirm
    const bool live_lines = lseek(fileno(stdout), 0, SEEK_CUR) != 0;
    if (!live_lines) {
        // ask for the standard output to be fully buffered
        setvbuf(stdout, NULL, _IOFBF, 0);
    }
    return run(args, nargs, stdout, live_lines) == 0 ? 0 : 1;
}