// File: fixlines.c
/*
The MIT License (MIT)

Copyright (c) 2026 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

cc -Wall -s -O2 -march=native -mtune=native -flto -o ./fixlines ./fixlines.c
*/

// ask for the POSIX.1-2008 declarations this file relies on (getline and
// fileno among them), even when compiling in a strict ISO C mode, such as
// when using -std=c11
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#ifdef _WIN32
#include <fcntl.h>
#include <windows.h>
#endif

// ERROR_STYLE and RESET_STYLE are the ANSI-style sequences put around error
// messages: both are empty unless RED_ERRORS is defined when compiling; each
// macro is defined exactly once per configuration, to avoid redefinitions
#ifdef RED_ERRORS
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define RESET_STYLE "\x1b[0m"
#else
#define ERROR_STYLE
#define RESET_STYLE
#endif

// ERROR_LINE turns a string literal into a styled line ending in a line-feed
#define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")

// BAD_ALLOC is the exit code used when running out of memory
#define BAD_ALLOC 2

// IBUF_SIZE is the size in bytes of the chunks read by handle_reader_faster
#ifndef IBUF_SIZE
#define IBUF_SIZE (32 * 1024)
#endif

// info is the help message shown by main
const char* info = ""
"fixlines [options...] [filepaths...]\n"
"\n"
"This tool fixes lines in UTF-8 text, ignoring leading UTF-8 BOMs, trailing\n"
"carriage-returns on all lines, and ensures no lines across inputs are\n"
"accidentally joined, since all lines it outputs end with line-feeds,\n"
"even when the original files don't.\n"
"\n"
"The only option available is to show this help message, using any of\n"
"`-h`, `--h`, `-help`, or `--help`, without the quotes.\n"
"";

// slice is a growable region of bytes in memory
typedef struct slice {
    // ptr is the starting place of the region
    unsigned char* ptr;

    // cap is how many bytes the memory region has available
    size_t cap;
} slice;

// starts_with_bom checks if the first n bytes at b begin with the 3-byte
// UTF-8 BOM (byte-order mark)
bool starts_with_bom(const unsigned char* b, const size_t n) {
    return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}

// find_cr returns the index of the first carriage-return among the first
// len bytes at s, or -1 when there isn't any
static inline int64_t find_cr(const unsigned char* s, size_t len) {
    const unsigned char* cr = memchr(s, '\r', len);
    if (cr == NULL) {
        return -1;
    }
    return (int64_t)(cr - s);
}

// handle_reader_faster copies all bytes from r to w chunk by chunk, skipping
// a leading UTF-8 BOM, turning CRLF byte-pairs into single line-feeds, and
// ensuring the output from a non-empty input ends with a line-feed; it
// flushes w when done, and stops reading early if writing to w has failed
void handle_reader_faster(FILE* w, FILE* r) {
    unsigned char buf[IBUF_SIZE];
    // last is the final byte of the latest chunk read: starting it as a
    // line-feed means empty inputs result in no output at all
    unsigned char last = '\n';

    // the end-of-file indicator is never set on a stream which is only
    // written to, so the error indicator is what can reveal a failed output
    for (size_t i = 0; !ferror(w); i++) {
        size_t len = fread(buf, sizeof(buf[0]), sizeof(buf), r);
        if (len < 1) {
            break;
        }

        unsigned char* ptr = buf;
        // a carriage-return ending the previous chunk was held back: emit it
        // now, unless it turned out to be the start of a CRLF byte-pair
        if (last == '\r' && ptr[0] != '\n') {
            fputc('\r', w);
        }
        last = ptr[len - 1];

        // get rid of leading UTF-8 BOM (byte-order mark) if 1st line has it
        if (i == 0 && starts_with_bom(ptr, len)) {
            ptr += 3;
            len -= 3;
        }

        // handle all carriage-returns in the current chunk
        while (len > 0) {
            const int64_t found = find_cr(ptr, len);
            if (found < 0) {
                break;
            }
            const size_t j = (size_t)found;

            // if carriage return is the last byte in chunk, hold it back,
            // so the start of the next chunk can decide what to do with it
            if (j == len - 1) {
                fwrite(ptr, 1, len - 1, w);
                len = 0;
                break;
            }

            // if it's a CRLF byte-pair, just emit the LF from that; the
            // carriage-return isn't the last byte, so index j + 1 is valid
            if (ptr[j + 1] == '\n') {
                ptr[j] = '\n';
                fwrite(ptr, 1, j + 1, w);
                ptr += j + 2;
                len -= j + 2;
                continue;
            }

            // emit lone CRs inside chunks, as only CRLF byte-pairs are fixed
            fwrite(ptr, 1, j + 1, w);
            ptr += j + 1;
            len -= j + 1;
        }

        // don't forget trailing part after last carriage-return in chunk
        if (len > 0) {
            fwrite(ptr, 1, len, w);
        }
    }

    // handle edge-cases for the last chunk: this both ends an unterminated
    // last line, and replaces a held-back carriage-return ending the input
    if (last != '\n') {
        fputc('\n', w);
    }

    fflush(w);
}

// handle_reader copies all lines from r to w, skipping a leading UTF-8 BOM,
// fixing carriage-returns at the end of lines, and ensuring the last line
// ends with a line-feed; when live_lines is false, all the work is done by
// handle_reader_faster instead, and the line buffer isn't used; running out
// of memory quits the whole app with exit code BAD_ALLOC
void handle_reader(FILE* w, FILE* r, slice* line, bool live_lines) {
    if (!live_lines) {
        handle_reader_faster(w, r);
        return;
    }

    // last is the final byte of the latest line read: starting it as a
    // line-feed means empty inputs result in no output at all
    unsigned char last = '\n';

    // the end-of-file indicator is never set on a stream which is only
    // written to, so the error indicator is what can reveal a failed output
    for (size_t i = 0; !ferror(w); i++) {
        ssize_t len = getline((char**)&line->ptr, &line->cap, r);
        if (line->ptr == NULL) {
            fprintf(stderr, "\n");
            fprintf(stderr, ERROR_LINE("out of memory"));
            exit(BAD_ALLOC);
        }

        if (len < 0) {
            break;
        }

        unsigned char* ptr = line->ptr;
        last = line->ptr[len - 1];

        // get rid of leading UTF-8 BOM (byte-order mark) if 1st line has it
        if (i == 0 && starts_with_bom(ptr, len)) {
            ptr += 3;
            len -= 3;
        }

        // turn trailing carriage-returns into line-feeds
        if (len >= 1 && ptr[len - 1] == '\r') {
            ptr[len - 1] = '\n';
            last = '\n';
        }

        // get rid of carriage-returns preceding line-feeds
        if (len >= 2 && ptr[len - 2] == '\r' && ptr[len - 1] == '\n') {
            ptr[len - 2] = '\n';
            len--;
        }

        fwrite(ptr, 1, len, w);
    }

    // handle edge-cases for the last line
    if (last != '\n') {
        fputc('\n', w);
    }
}

// handle_file handles data from the filename given; returns false only when
// the file can't be opened
bool handle_file(FILE* w, const char* path, slice* line, bool live_lines) {
    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
        return false;
    }

    handle_reader(w, f, line, live_lines);
    fclose(f);
    return true;
}

// run handles all the inputs named in args, where a single dash stands for
// the standard input, which is also what's used when no names are given;
// it returns the number of errors, which counts the files it couldn't open
int run(char** args, size_t nargs, FILE* w, bool live_lines) {
    size_t dashes = 0;
    for (size_t i = 0; i < nargs; i++) {
        if (strcmp(args[i], "-") == 0) {
            dashes++;
        }
    }

    if (dashes > 1) {
        const char* m = "can't use the standard input (dash) more than once";
        fprintf(stderr, ERROR_LINE("%s"), m);
        return 1;
    }

    slice line;
    line.ptr = NULL;
    line.cap = 0;

    // the line buffer is only needed when handling input line by line
    if (live_lines) {
        line.cap = 32 * 1024;
        line.ptr = malloc(line.cap);
        if (line.ptr == NULL) {
            fprintf(stderr, ERROR_LINE("out of memory"));
            exit(BAD_ALLOC);
        }
    }

    int errors = 0;
    // stop going through the inputs as soon as writing to the output fails;
    // NOTE(review): a failed output still doesn't count as an error here
    for (size_t i = 0; i < nargs && !ferror(w); i++) {
        if (strcmp(args[i], "-") == 0) {
            handle_reader(w, stdin, &line, live_lines);
            continue;
        }

        if (!handle_file(w, args[i], &line, live_lines)) {
            errors++;
        }
    }

    // use stdin when not given any filepaths
    if (nargs == 0) {
        handle_reader(w, stdin, &line, live_lines);
    }

    free(line.ptr);
    return errors;
}

// main handles the leading command-line options, picks how to buffer the
// standard output, and runs the app; its exit code is 0 on success, and 1
// when any error happened
int main(int argc, char** argv) {
#ifdef _WIN32
    setmode(fileno(stdin), O_BINARY);
    // ensure output lines end in LF instead of CRLF on windows
    setmode(fileno(stdout), O_BINARY);
    setmode(fileno(stderr), O_BINARY);
#endif

    if (argc > 1) {
        if (
            strcmp(argv[1], "-h") == 0 ||
            strcmp(argv[1], "-help") == 0 ||
            strcmp(argv[1], "--h") == 0 ||
            strcmp(argv[1], "--help") == 0
        ) {
            fprintf(stdout, "%s", info);
            return 0;
        }
    }

    size_t nargs = argc - 1;
    char** args = argv + 1;
    bool buffered = false;

    // a leading option can force chunk-by-chunk output; note the help
    // message doesn't mention this option
    if (nargs > 0) {
        if (
            strcmp(args[0], "-b") == 0 ||
            strcmp(args[0], "--b") == 0 ||
            strcmp(args[0], "-buffered") == 0 ||
            strcmp(args[0], "--buffered") == 0
        ) {
            buffered = true;
            nargs--;
            args++;
        }
    }

    // a double-dash explicitly ends the options
    if (nargs > 0 && strcmp(args[0], "--") == 0) {
        nargs--;
        args++;
    }

    // lseek gives -1 for outputs which can't seek, so this check succeeds for
    // those, as well as for outputs whose current offset isn't 0;
    // NOTE(review): presumably only the former is intended, such as pipes
    // and terminals - confirm before changing the check
    const int fd = fileno(stdout);
    const bool live_lines = !buffered && lseek(fd, 0, SEEK_CUR) != 0;
    if (live_lines) {
        setvbuf(stdout, NULL, _IOLBF, 0);
    } else {
        setvbuf(stdout, NULL, _IOFBF, 0);
    }
    return run(args, nargs, stdout, live_lines) == 0 ? 0 : 1;
}