File: fixlines.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright (c) 2026 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the "Software"), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O2 -march=native -mtune=native -flto -o ./fixlines ./fixlines.c
  29 */
  30 
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
  37 
  38 #ifdef _WIN32
  39 #include <fcntl.h>
  40 #include <windows.h>
  41 #endif
  42 
  43 #ifdef RED_ERRORS
  44 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  45 #ifdef __APPLE__
  46 #define ERROR_STYLE "\x1b[31m"
  47 #endif
  48 #define RESET_STYLE "\x1b[0m"
  49 #else
  50 #define ERROR_STYLE
  51 #define RESET_STYLE
  52 #endif
  53 
  54 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  55 
  56 #define BAD_ALLOC 2
  57 
  58 #ifndef IBUF_SIZE
  59 #define IBUF_SIZE (32 * 1024)
  60 #endif
  61 
  62 const char* info = ""
  63 "fixlines [options...] [filepaths...]\n"
  64 "\n"
  65 "This tool fixes lines in UTF-8 text, ignoring leading UTF-8 BOMs, trailing\n"
  66 "carriage-returns on all lines, and ensures no lines across inputs are\n"
  67 "accidentally joined, since all lines it outputs end with line-feeds,\n"
  68 "even when the original files don't.\n"
  69 "\n"
  70 "The only option available is to show this help message, using any of\n"
  71 "`-h`, `--h`, `-help`, or `--help`, without the quotes.\n"
  72 "";
  73 
  74 // slice is a growable region of bytes in memory
  75 typedef struct slice {
  76     // ptr is the starting place of the region
  77     unsigned char* ptr;
  78 
  79     // cap is how many bytes the memory region has available
  80     size_t cap;
  81 } slice;
  82 
  83 bool starts_with_bom(const unsigned char* b, const size_t n) {
  84     return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
  85 }
  86 
  87 static inline int64_t find_cr(const unsigned char* s, size_t len) {
  88     for (size_t i = 0; i < len; i++) {
  89         if (s[i] == '\r') {
  90             return i;
  91         }
  92     }
  93     return -1;
  94 }
  95 
  96 void handle_reader_faster(FILE* w, FILE* r) {
  97     unsigned char buf[IBUF_SIZE];
  98     unsigned char last = '\n';
  99 
 100     for (size_t i = 0; !feof(w); i++) {
 101         size_t len = fread(buf, sizeof(buf[0]), sizeof(buf), r);
 102         if (len < 1) {
 103             break;
 104         }
 105 
 106         unsigned char* ptr = buf;
 107         if (last == '\r' && ptr[0] != '\n') {
 108             fputc('\r', w);
 109         }
 110         last = ptr[len - 1];
 111 
 112         // get rid of leading UTF-8 BOM (byte-order mark) if 1st line has it
 113         if (i == 0 && starts_with_bom(ptr, len)) {
 114             ptr += 3;
 115             len -= 3;
 116         }
 117 
 118         // handle all carriage-returns in the current chunk
 119         while (len > 0) {
 120             const int64_t j = find_cr(ptr, len);
 121             if (j < 0) {
 122                 break;
 123             }
 124 
 125             // if carriage return is the last byte in chunk, remember that
 126             // to check the start of the next chunk
 127             if (j == len - 1) {
 128                 fwrite(ptr, 1, len - 1, w);
 129                 len = 0;
 130                 break;
 131             }
 132 
 133             // if it's a CRLF byte-pair, just emit the LF from that
 134             if (j + 1 < len && ptr[j + 1] == '\n') {
 135                 ptr[j] = '\n';
 136                 fwrite(ptr, 1, j + 1, w);
 137                 ptr += j + 2;
 138                 len -= j + 2;
 139                 continue;
 140             }
 141 
 142             // emit lone CRs inside chunks, as only CRLF byte-pairs are fixed
 143             fwrite(ptr, 1, j + 1, w);
 144             ptr += j + 1;
 145             len -= j + 1;
 146         }
 147 
 148         // don't forget trailing part after last carriage-return in chunk
 149         if (len > 0) {
 150             fwrite(ptr, 1, len, w);
 151         }
 152     }
 153 
 154     // handle edge-cases for the last chunk
 155     if (last != '\n') {
 156         fputc('\n', w);
 157     }
 158 
 159     fflush(w);
 160 }
 161 
 162 void handle_reader(FILE* w, FILE* r, slice* line, bool live_lines) {
 163     if (!live_lines) {
 164         handle_reader_faster(w, r);
 165         return;
 166     }
 167 
 168     unsigned char last = '\n';
 169 
 170     for (size_t i = 0; !feof(w); i++) {
 171         ssize_t len = getline((char**)&line->ptr, &line->cap, r);
 172         if (line->ptr == NULL) {
 173             fprintf(stderr, "\n");
 174             fprintf(stderr, ERROR_LINE("out of memory"));
 175             exit(BAD_ALLOC);
 176         }
 177 
 178         if (len < 0) {
 179             break;
 180         }
 181 
 182         unsigned char* ptr = line->ptr;
 183         last = line->ptr[len - 1];
 184 
 185         // get rid of leading UTF-8 BOM (byte-order mark) if 1st line has it
 186         if (i == 0 && starts_with_bom(ptr, len)) {
 187             ptr += 3;
 188             len -= 3;
 189         }
 190 
 191         // turn trailing carriage-returns into line-feeds
 192         if (len >= 1 && ptr[len - 1] == '\r') {
 193             ptr[len - 1] = '\n';
 194             last = '\n';
 195         }
 196 
 197         // get rid of carriage-returns preceding line-feeds
 198         if (len >= 2 && ptr[len - 2] == '\r' && ptr[len - 1] == '\n') {
 199             ptr[len - 2] = '\n';
 200             len--;
 201         }
 202 
 203         fwrite(ptr, 1, len, w);
 204     }
 205 
 206     // handle edge-cases for the last line
 207     if (last != '\n') {
 208         fputc('\n', w);
 209     }
 210 }
 211 
 212 // handle_file handles data from the filename given; returns false only when
 213 // the file can't be opened
 214 bool handle_file(FILE* w, const char* path, slice* line, bool live_lines) {
 215     FILE* f = fopen(path, "rb");
 216     if (f == NULL) {
 217         fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
 218         return false;
 219     }
 220 
 221     handle_reader(w, f, line, live_lines);
 222     fclose(f);
 223     return true;
 224 }
 225 
 226 // run returns the number of errors
 227 int run(char** args, size_t nargs, FILE* w, bool live_lines) {
 228     size_t dashes = 0;
 229     for (size_t i = 0; i < nargs; i++) {
 230         if (strcmp(args[i], "-") == 0) {
 231             dashes++;
 232         }
 233     }
 234 
 235     if (dashes > 1) {
 236         const char* m = "can't use the standard input (dash) more than once";
 237         fprintf(stderr, ERROR_LINE("%s"), m);
 238         return 1;
 239     }
 240 
 241     slice line;
 242     line.ptr = NULL;
 243     line.cap = 0;
 244 
 245     if (live_lines) {
 246         line.cap = 32 * 1024;
 247         line.ptr = malloc(line.cap);
 248         if (line.ptr == NULL) {
 249             fprintf(stderr, ERROR_LINE("out of memory"));
 250             exit(BAD_ALLOC);
 251         }
 252     }
 253 
 254     size_t errors = 0;
 255     for (size_t i = 0; i < nargs && !feof(w); i++) {
 256         if (strcmp(args[i], "-") == 0) {
 257             handle_reader(w, stdin, &line, live_lines);
 258             continue;
 259         }
 260 
 261         if (!handle_file(w, args[i], &line, live_lines)) {
 262             errors++;
 263         }
 264     }
 265 
 266     // use stdin when not given any filepaths
 267     if (nargs == 0) {
 268         handle_reader(w, stdin, &line, live_lines);
 269     }
 270 
 271     free(line.ptr);
 272     return errors;
 273 }
 274 
 275 int main(int argc, char** argv) {
 276 #ifdef _WIN32
 277     setmode(fileno(stdin), O_BINARY);
 278     // ensure output lines end in LF instead of CRLF on windows
 279     setmode(fileno(stdout), O_BINARY);
 280     setmode(fileno(stderr), O_BINARY);
 281 #endif
 282 
 283     if (argc > 1) {
 284         if (
 285             strcmp(argv[1], "-h") == 0 ||
 286             strcmp(argv[1], "-help") == 0 ||
 287             strcmp(argv[1], "--h") == 0 ||
 288             strcmp(argv[1], "--help") == 0
 289         ) {
 290             fprintf(stdout, "%s", info);
 291             return 0;
 292         }
 293     }
 294 
 295     size_t nargs = argc - 1;
 296     char** args = argv + 1;
 297     bool buffered = false;
 298 
 299     if (nargs > 0) {
 300         if (
 301             strcmp(args[0], "-b") == 0 ||
 302             strcmp(args[0], "--b") == 0 ||
 303             strcmp(args[0], "-buffered") == 0 ||
 304             strcmp(args[0], "--buffered") == 0
 305         ) {
 306             buffered = true;
 307             nargs--;
 308             args++;
 309         }
 310     }
 311 
 312     if (nargs > 0 && strcmp(args[0], "--") == 0) {
 313         nargs--;
 314         args++;
 315     }
 316 
 317     const int fd = fileno(stdout);
 318     const bool live_lines = !buffered && lseek(fd, 0, SEEK_CUR) != 0;
 319     if (live_lines) {
 320         setvbuf(stdout, NULL, _IOLBF, 0);
 321     } else {
 322         setvbuf(stdout, NULL, _IOFBF, 0);
 323     }
 324     return run(args, nargs, stdout, live_lines) == 0 ? 0 : 1;
 325 }