File: squeeze.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O3 -march=native -mtune=native -flto -o ./squeeze ./squeeze.c
  29 */
  30 
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
  35 
  36 #ifdef _WIN32
  37 #include <fcntl.h>
  38 #include <windows.h>
  39 #endif
  40 
  41 #ifdef RED_ERRORS
  42 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  43 #ifdef __APPLE__
  44 #define ERROR_STYLE "\x1b[31m"
  45 #endif
  46 #define RESET_STYLE "\x1b[0m"
  47 #else
  48 #define ERROR_STYLE
  49 #define RESET_STYLE
  50 #endif
  51 
  52 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  53 
  54 const char* info = ""
  55 "squeeze [filenames...]\n"
  56 "\n"
  57 "Ignore leading/trailing spaces (and carriage-returns) on lines, also turning\n"
  58 "all runs of multiple consecutive spaces into single spaces. Spaces around\n"
  59 "tabs are ignored as well.\n"
  60 "";
  61 
  62 // bufwriter is, as the name implies, a buffered-writer: when it's aimed at
  63 // stdout, it considerably speeds up this app, as intended
  64 typedef struct bufwriter {
  65     // buf is the buffer proper
  66     unsigned char* buf;
  67 
  68     // len is how many bytes of the buffer are currently being used
  69     size_t len;
  70 
  71     // cap is the capacity of the buffer, or the most bytes it can hold
  72     size_t cap;
  73 
  74     // out is the destination of all that's written into the buffer
  75     FILE* out;
  76 } bufwriter;
  77 
  78 void init_bufwriter(bufwriter* w, FILE* out, unsigned char* b, size_t cap) {
  79     w->buf = b;
  80     w->len = 0;
  81     w->cap = cap;
  82     w->out = out;
  83 }
  84 
  85 void write_byte(bufwriter* w, unsigned char b) {
  86     if (w->len < w->cap) {
  87         w->buf[w->len++] = b;
  88         return;
  89     }
  90 
  91     fwrite(w->buf, w->cap, 1, w->out);
  92     w->buf[0] = b;
  93     w->len = 1;
  94 }
  95 
  96 void flush(bufwriter* w) {
  97     if (w->len > 0) {
  98         fwrite(w->buf, w->len, 1, w->out);
  99     }
 100     w->len = 0;
 101     fflush(w->out);
 102 }
 103 
 104 // slice is a growable region of bytes in memory
 105 typedef struct slice {
 106     // ptr is the starting place of the region
 107     unsigned char* ptr;
 108 
 109     // len is how many bytes are currently being used
 110     size_t len;
 111 
 112     // cap is how many bytes the memory region has available
 113     size_t cap;
 114 } slice;
 115 
 116 bool starts_with_bom(const unsigned char* b, const size_t n) {
 117     return (n >= 3 && b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
 118 }
 119 
 120 void write_squeezed_line(bufwriter* w, const unsigned char* p, size_t len) {
 121     // ignore trailing CRLF byte-pairs, or trailing line-feed bytes
 122     if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') {
 123         len -= 2;
 124     } else if (len >= 1 && p[len - 1] == '\n') {
 125         len--;
 126     }
 127 
 128     // ignore leading spaces
 129     while (len > 0 && p[0] == ' ') {
 130         p++;
 131         len--;
 132     }
 133 
 134     // ignore trailing spaces
 135     // while (len > 0 && p[len - 1] == ' ') {
 136     //     len--;
 137     // }
 138 
 139     // trailing spaces won't be emitted, since there's nothing to follow
 140     // them, which in turn prevents their normal single-space replacement
 141     // from being emitted
 142 
 143     bool space = false;
 144 
 145     for (size_t i = 0; i < len; i++) {
 146         unsigned char b = p[i];
 147 
 148         if (b == ' ') {
 149             space = true;
 150             continue;
 151         }
 152 
 153         if (b == '\t') {
 154             space = false;
 155             write_byte(w, '\t');
 156 
 157             // ignore spaces right after tabs
 158             while (i + 1 < len && p[i + 1] == ' ') {
 159                 i++;
 160             }
 161 
 162             continue;
 163         }
 164 
 165         if (space) {
 166             write_byte(w, ' ');
 167             space = false;
 168         }
 169         write_byte(w, b);
 170     }
 171 
 172     write_byte(w, '\n');
 173     flush(w);
 174 }
 175 
 176 // handle_reader skips leading UTF-8 BOMs (byte-order marks), and turns all
 177 // CR-LF pairs into single LF bytes
 178 bool handle_reader(bufwriter* w, FILE* r, slice* line) {
 179     for (size_t i = 0; !feof(w->out); i++) {
 180         ssize_t len = getline((char**)&line->ptr, &line->cap, r);
 181         if (len < 0) {
 182             break;
 183         }
 184 
 185         if (line->ptr == NULL) {
 186             fprintf(stderr, ERROR_LINE("out of memory"));
 187             return false;
 188         }
 189 
 190         // line->len = len;
 191 
 192         // get rid of leading UTF-8 BOM (byte-order mark) if 1st line has it
 193         if (i == 0 && starts_with_bom(line->ptr, len)) {
 194             write_squeezed_line(w, line->ptr + 3, len - 3);
 195             continue;
 196         }
 197 
 198         write_squeezed_line(w, line->ptr, len);
 199     }
 200 
 201     flush(w);
 202     return true;
 203 }
 204 
 205 // handle_file handles data from the filename given; returns false only when
 206 // the file can't be opened
 207 bool handle_file(bufwriter* w, const char* path, slice* line) {
 208     FILE* f = fopen(path, "rb");
 209     if (f == NULL) {
 210         fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
 211         return false;
 212     }
 213 
 214     const bool ok = handle_reader(w, f, line);
 215     fclose(f);
 216     return ok;
 217 }
 218 
 219 // run returns the number of errors
 220 int run(int argc, char** argv, FILE* w) {
 221     unsigned char outbuf[8 * 1024];
 222     bufwriter bw;
 223 
 224     size_t dashes = 0;
 225     for (int i = 1; i < argc; i++) {
 226         if (argv[i][0] == '-' && argv[i][1] == 0) {
 227             dashes++;
 228         }
 229     }
 230 
 231     if (dashes > 1) {
 232         const char* m = "can't use the standard input (dash) more than once";
 233         fprintf(stderr, ERROR_LINE("%s"), m);
 234         return 1;
 235     }
 236 
 237     slice line;
 238     line.len = 0;
 239     line.cap = 32 * 1024;
 240     line.ptr = malloc(line.cap);
 241 
 242     if (line.ptr == NULL) {
 243         fprintf(stderr, ERROR_LINE("out of memory"));
 244         return 1;
 245     }
 246 
 247     init_bufwriter(&bw, w, outbuf, sizeof(outbuf));
 248 
 249     size_t errors = 0;
 250     for (int i = 1; i < argc && !feof(stdout) && line.ptr != NULL; i++) {
 251         if (argv[i][0] == '-' && argv[i][1] == 0) {
 252             if (!handle_reader(&bw, stdin, &line)) {
 253                 errors++;
 254             }
 255             continue;
 256         }
 257 
 258         if (!handle_file(&bw, argv[i], &line)) {
 259             errors++;
 260         }
 261     }
 262 
 263     // use stdin when not given any filepaths
 264     if (argc <= 1) {
 265         if (!handle_reader(&bw, stdin, &line)) {
 266             errors++;
 267         }
 268     }
 269 
 270     free(line.ptr);
 271     flush(&bw);
 272     return errors;
 273 }
 274 
 275 int main(int argc, char** argv) {
 276 #ifdef _WIN32
 277     setmode(fileno(stdin), O_BINARY);
 278     // ensure output lines end in LF instead of CRLF on windows
 279     setmode(fileno(stdout), O_BINARY);
 280     setmode(fileno(stderr), O_BINARY);
 281 #endif
 282 
 283     if (argc > 1) {
 284         if (
 285             strcmp(argv[1], "-h") == 0 ||
 286             strcmp(argv[1], "-help") == 0 ||
 287             strcmp(argv[1], "--h") == 0 ||
 288             strcmp(argv[1], "--help") == 0
 289         ) {
 290             fprintf(stdout, "%s", info);
 291             return 0;
 292         }
 293     }
 294 
 295     return run(argc, argv, stdout) == 0 ? 0 : 1;
 296 }