File: fixlines.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O3 -march=native -mtune=native -flto -o ./fixlines ./fixlines.c
  29 */
  30 
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
  36 
  37 #ifdef _WIN32
  38 #include <fcntl.h>
  39 #include <windows.h>
  40 #endif
  41 
  42 #ifdef RED_ERRORS
  43 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  44 #ifdef __APPLE__
  45 #define ERROR_STYLE "\x1b[31m"
  46 #endif
  47 #define RESET_STYLE "\x1b[0m"
  48 #else
  49 #define ERROR_STYLE
  50 #define RESET_STYLE
  51 #endif
  52 
  53 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  54 
// BAD_ALLOC is the exit code the app quits with when it can't get memory
#define BAD_ALLOC 2

// IBUF_SIZE is the size in bytes of the input buffer used when reading
// chunk by chunk; a different value can be chosen when building, since
// the default below only applies when the name isn't already defined
#ifndef IBUF_SIZE
#define IBUF_SIZE (32 * 1024)
#endif
  60 
  61 const char* info = ""
  62 "fixlines [options...] [filepaths...]\n"
  63 "\n"
  64 "This tool fixes lines in UTF-8 text, ignoring leading UTF-8 BOMs, trailing\n"
  65 "carriage-returns on all lines, and ensures no lines across inputs are\n"
  66 "accidentally joined, since all lines it outputs end with line-feeds,\n"
  67 "even when the original files don't.\n"
  68 "\n"
  69 "The only option available is to show this help message, using any of\n"
  70 "`-h`, `--h`, `-help`, or `--help`, without the quotes.\n"
  71 "";
  72 
  73 // slice is a growable region of bytes in memory
  74 typedef struct slice {
  75     // ptr is the starting place of the region
  76     unsigned char* ptr;
  77 
  78     // cap is how many bytes the memory region has available
  79     size_t cap;
  80 } slice;
  81 
  82 bool starts_with_bom(const unsigned char* b, const size_t n) {
  83     return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
  84 }
  85 
  86 static inline int64_t find_cr(const unsigned char* s, size_t len) {
  87     for (size_t i = 0; i < len; i++) {
  88         if (s[i] == '\r') {
  89             return i;
  90         }
  91     }
  92     return -1;
  93 }
  94 
  95 void handle_reader_faster(FILE* w, FILE* r) {
  96     unsigned char buf[IBUF_SIZE];
  97     unsigned char last = '\n';
  98 
  99     for (size_t i = 0; !feof(w); i++) {
 100         size_t len = fread(buf, sizeof(buf[0]), sizeof(buf), r);
 101         if (len < 1) {
 102             break;
 103         }
 104 
 105         unsigned char* ptr = buf;
 106         if (last == '\r' && ptr[0] != '\n') {
 107             fputc('\r', w);
 108         }
 109         last = ptr[len - 1];
 110 
 111         // get rid of leading UTF-8 BOM (byte-order mark) if 1st line has it
 112         if (i == 0 && starts_with_bom(ptr, len)) {
 113             ptr += 3;
 114             len -= 3;
 115         }
 116 
 117         // handle all carriage-returns in the current chunk
 118         while (len > 0) {
 119             const int64_t j = find_cr(ptr, len);
 120             if (j < 0) {
 121                 break;
 122             }
 123 
 124             // if carriage return is the last byte in chunk, remember that
 125             // to check the start of the next chunk
 126             if (j == len - 1) {
 127                 fwrite(ptr, 1, len - 1, w);
 128                 len = 0;
 129                 break;
 130             }
 131 
 132             // if it's a CRLF byte-pair, just emit the LF from that
 133             if (j + 1 < len && ptr[j + 1] == '\n') {
 134                 ptr[j] = '\n';
 135                 fwrite(ptr, 1, j + 1, w);
 136                 ptr += j + 2;
 137                 len -= j + 2;
 138                 continue;
 139             }
 140 
 141             // emit lone CRs inside chunks, as only CRLF byte-pairs are fixed
 142             fwrite(ptr, 1, j + 1, w);
 143             ptr += j + 1;
 144             len -= j + 1;
 145         }
 146 
 147         // don't forget trailing part after last carriage-return in chunk
 148         if (len > 0) {
 149             fwrite(ptr, 1, len, w);
 150         }
 151     }
 152 
 153     // handle edge-cases for the last chunk
 154     if (last != '\n') {
 155         fputc('\n', w);
 156     }
 157 
 158     fflush(w);
 159 }
 160 
 161 void handle_reader(FILE* w, FILE* r, slice* line, bool live_lines) {
 162     if (!live_lines) {
 163         handle_reader_faster(w, r);
 164         return;
 165     }
 166 
 167     unsigned char last = '\n';
 168 
 169     for (size_t i = 0; !feof(w); i++) {
 170         ssize_t len = getline((char**)&line->ptr, &line->cap, r);
 171         if (line->ptr == NULL) {
 172             fprintf(stderr, "\n");
 173             fprintf(stderr, ERROR_LINE("out of memory"));
 174             exit(BAD_ALLOC);
 175         }
 176 
 177         if (len < 0) {
 178             break;
 179         }
 180 
 181         unsigned char* ptr = line->ptr;
 182         last = line->ptr[len - 1];
 183 
 184         // get rid of leading UTF-8 BOM (byte-order mark) if 1st line has it
 185         if (i == 0 && starts_with_bom(ptr, len)) {
 186             ptr += 3;
 187             len -= 3;
 188         }
 189 
 190         // turn trailing carriage-returns into line-feeds
 191         if (len >= 1 && ptr[len - 1] == '\r') {
 192             ptr[len - 1] = '\n';
 193             last = '\n';
 194         }
 195 
 196         // get rid of carriage-returns preceding line-feeds
 197         if (len >= 2 && ptr[len - 2] == '\r' && ptr[len - 1] == '\n') {
 198             ptr[len - 2] = '\n';
 199             len--;
 200         }
 201 
 202         fwrite(ptr, 1, len, w);
 203         fflush(w);
 204     }
 205 
 206     // handle edge-cases for the last line
 207     if (last != '\n') {
 208         fputc('\n', w);
 209     }
 210 
 211     fflush(w);
 212 }
 213 
 214 // handle_file handles data from the filename given; returns false only when
 215 // the file can't be opened
 216 bool handle_file(FILE* w, const char* path, slice* line, bool live_lines) {
 217     FILE* f = fopen(path, "rb");
 218     if (f == NULL) {
 219         fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
 220         return false;
 221     }
 222 
 223     handle_reader(w, f, line, live_lines);
 224     fclose(f);
 225     return true;
 226 }
 227 
 228 // run returns the number of errors
 229 int run(char** args, size_t nargs, FILE* w, bool live_lines) {
 230     size_t dashes = 0;
 231     for (size_t i = 0; i < nargs; i++) {
 232         if (args[i][0] == '-' && args[i][1] == 0) {
 233             dashes++;
 234         }
 235     }
 236 
 237     if (dashes > 1) {
 238         const char* m = "can't use the standard input (dash) more than once";
 239         fprintf(stderr, ERROR_LINE("%s"), m);
 240         return 1;
 241     }
 242 
 243     slice line;
 244     line.ptr = NULL;
 245     line.cap = 0;
 246 
 247     if (live_lines) {
 248         line.cap = 32 * 1024;
 249         line.ptr = malloc(line.cap);
 250         if (line.ptr == NULL) {
 251             fprintf(stderr, ERROR_LINE("out of memory"));
 252             exit(BAD_ALLOC);
 253         }
 254     }
 255 
 256     size_t errors = 0;
 257     for (size_t i = 0; i < nargs && !feof(w); i++) {
 258         if (args[i][0] == '-' && args[i][1] == 0) {
 259             handle_reader(w, stdin, &line, live_lines);
 260             continue;
 261         }
 262 
 263         if (!handle_file(w, args[i], &line, live_lines)) {
 264             errors++;
 265         }
 266     }
 267 
 268     // use stdin when not given any filepaths
 269     if (nargs < 1) {
 270         handle_reader(w, stdin, &line, live_lines);
 271     }
 272 
 273     free(line.ptr);
 274     return errors;
 275 }
 276 
 277 int main(int argc, char** argv) {
 278 #ifdef _WIN32
 279     setmode(fileno(stdin), O_BINARY);
 280     // ensure output lines end in LF instead of CRLF on windows
 281     setmode(fileno(stdout), O_BINARY);
 282     setmode(fileno(stderr), O_BINARY);
 283 #endif
 284 
 285     if (argc > 1) {
 286         if (
 287             strcmp(argv[1], "-h") == 0 ||
 288             strcmp(argv[1], "-help") == 0 ||
 289             strcmp(argv[1], "--h") == 0 ||
 290             strcmp(argv[1], "--help") == 0
 291         ) {
 292             fprintf(stdout, "%s", info);
 293             return 0;
 294         }
 295     }
 296 
 297     size_t nargs = argc - 1;
 298     char** args = argv + 1;
 299     if (nargs > 0 && strcmp(args[0], "--") == 0) {
 300         nargs--;
 301         args++;
 302     }
 303 
 304     const bool live_lines = lseek(fileno(stdout), 0, SEEK_CUR) != 0;
 305     if (!live_lines) {
 306         setvbuf(stdout, NULL, _IOFBF, 0);
 307     }
 308     return run(args, nargs, stdout, live_lines) == 0 ? 0 : 1;
 309 }